In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
# Load model from Weeks 3/4 

df = pd.read_csv("course_lead_scoring.csv")

cat_features = list(df.dtypes[df.dtypes == 'object'].index)
print(cat_features)

num_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(num_features)


for col in cat_features:
    df[col] = df[col].fillna('NA')

for col in num_features:
    df[col] = df[col].fillna(0)


df.isnull().sum()



['lead_source', 'industry', 'employment_status', 'location']
['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']


lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [3]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [4]:
# # Check and remove 'converted' from numerical features list
# if 'converted' in num_features:
#     num_features.remove('converted')
    
# # Check and remove 'converted' from categorical features list if it was mistakenly there
# if 'converted' in cat_features:
#     cat_features.remove('converted')

# # Initiate the model
all_features = cat_features +  num_features

In [5]:
# Define functions that train / run the model 

def train(df_train, y_train, C=1.0):
    """Trains the Logistic Regression model."""
    dicts = df_train[cat_features + num_features].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    # Using solver='liblinear' is important for C < 1.0 (though here C=1.0)
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000) 
    model.fit(X_train, y_train)

    return dv, model

def predict(df, dv, model):
    """Generates probability predictions on the given DataFrame."""
    dicts = df[cat_features + num_features].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

In [6]:
C = 1.0 
n_splits = 5

In [7]:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

for train_idx, val_idx in kfold.split(df_full_train): 
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.converted.values
    y_val = df_val.converted.values

    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))


C=1.0 0.876 +- 0.032


In [8]:
scores

[0.8688304920717635,
 0.9216514191883649,
 0.8433810305731487,
 0.8422322540473225,
 0.9026466491255224]

In [9]:
dv, model = train(df_full_train, df_full_train.converted.values, C=1.0)
y_pred = predict(df_test, dv, model)

y_test = df_test.converted.values
auc = roc_auc_score(y_test, y_pred)
auc

0.8674501788451712

In [10]:
import pickle 

In [11]:
# output_file = open(f'model_C=%s.bin') % C
output_file = f'model_C={C}.bin'
output_file

'model_C=1.0.bin'

In [12]:
f_out = open(output_file, 'wb')
# pickle.dump(model, f_out)
pickle.dump((dv, model), f_out)

f_out.close() 

In [None]:
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)  # do stuff

        # autoclosed outside of the block above
# ______________________________________________________

In [2]:
import pickle

In [4]:
input_file = 'model_C=1.0.bin'

with open(input_file, 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [7]:
dv, model

(DictVectorizer(sparse=False),
 LogisticRegression(max_iter=1000, solver='liblinear'))

In [19]:
customer = {'lead_source': 'paid_ads',
            'industry': 'education',
            'number_of_courses_viewed': 5,
            'annual_income': 52254.0,
            'employment_status': 'employed',
            'location': 'south_america',
            'interaction_count': 5,
            'lead_score': 0.49,
            'converted': 1}

In [None]:
X = dv.transform([customer])

In [28]:
model.predict_proba(X)[0,1].round(4)

np.float64(0.951)