In [3]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [7]:
# Location of the lead-scoring CSV. The default is the original author's local
# path; set the LEAD_SCORING_CSV environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    'LEAD_SCORING_CSV',
    r'C:\Users\GIA DAT\ML Zoomcamp\4. Evaluation\course_lead_scoring.csv',
)

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


## Data Preparation

In [28]:
# Missing values per column. The counts are all zero only when this cell runs
# after the imputation cell below; the raw preview above shows blank
# `industry` / `employment_status` entries.
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [16]:
# Column dtypes: used to decide which columns are treated as categorical
# (object) and which as numerical (int64 / float64) in the next cell.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [18]:
# Columns with dtype `object`; imputed with the string 'NA'.
caterogical_features = [
    "lead_source",
    "industry",
    "employment_status",
    "location",
]

# Columns with numeric dtype; imputed with 0. `converted` is the target and is
# deliberately not listed here.
numerical_features = [
    "number_of_courses_viewed",
    "annual_income",
    "interaction_count",
    "lead_score",
]

In [26]:
# Impute missing values in place: the placeholder string 'NA' for categorical
# columns and 0 for numerical columns.
df[caterogical_features] = df[caterogical_features].fillna('NA')
df[numerical_features] = df[numerical_features].fillna(0)

In [30]:
# 60 / 20 / 20 split: first hold out 20% for test, then take 25% of the
# remaining 80% (i.e. 20% of the whole) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Fresh 0..n-1 indices for the three working frames. df_full_train is left
# untouched and still carries the `converted` column.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Move the target out of each frame: `pop` returns the column and removes it
# from the frame in one step, so the target cannot leak in as a feature.
y_train = df_train.pop('converted').to_numpy()
y_val = df_val.pop('converted').to_numpy()
y_test = df_test.pop('converted').to_numpy()

## Question 1

In [35]:
from sklearn.metrics import roc_auc_score

In [63]:
# Full feature set used by the models below: categorical first, then numerical.
features = caterogical_features + numerical_features

# Feature-only views of each split, re-indexed from 0.
X_train, X_val, X_test = (
    frame[features].reset_index(drop=True)
    for frame in (df_train, df_val, df_test)
)

In [92]:
# Numerical columns to evaluate, in the order they are reported.
list1 = ['lead_score', 'number_of_courses_viewed', 'interaction_count', 'annual_income']

for ele in list1:
    # ROC AUC depends only on how the scores rank the samples, so the raw
    # feature value can be used as the score directly. No per-feature model is
    # fitted, and the globals X_train / dv / model / y_pred are not overwritten.
    auc = roc_auc_score(y_train, df_train[ele])

    # An AUC below 0.5 means the feature is negatively related to the target;
    # negate the feature so the reported value measures ranking strength.
    if auc < 0.5:
        auc = roc_auc_score(y_train, -df_train[ele])

    print("%s | %.3f" % (ele, auc))

lead_score | 0.614
number_of_courses_viewed | 0.764
interaction_count | 0.738
annual_income | 0.552


number_of_courses_viewed has the highest AUC

## Question 2

In [99]:
# One-hot encode via DictVectorizer: turn both splits into lists of
# {column: value} records first, then fit the vectorizer on train only.
train_dict = df_train[features].to_dict(orient='records')
val_dict = df_val[features].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

# Probability of the positive class on validation, and its ROC AUC.
y_pred = model.predict_proba(X_val)[:, 1]
round(roc_auc_score(y_val, y_pred), 3)

0.817

## Question 3

In [132]:
scores = []
thresholds = np.linspace(0, 1, 101)

# The ground-truth masks do not depend on the threshold, so build them once
# instead of on every loop iteration.
actual_positive = (y_val == 1)
actual_negative = (y_val == 0)

for t in thresholds:
    predict_positive = (y_pred >= t)
    predict_negative = (y_pred < t)

    # Confusion-matrix counts at this threshold.
    tp = (predict_positive & actual_positive).sum()
    tn = (predict_negative & actual_negative).sum()
    fp = (predict_positive & actual_negative).sum()
    fn = (predict_negative & actual_positive).sum()

    scores.append((t, tp, fp, fn, tn))

columns = ['threshold', 'tp', 'fp', 'fn', 'tn']
df_scores = pd.DataFrame(scores, columns=columns)

# precision is 0/0 = NaN at thresholds where nothing is predicted positive;
# those rows sort to the end below and cannot be picked as the intersection.
df_scores['precision'] = df_scores.tp / (df_scores.tp + df_scores.fp)
df_scores['recall'] = df_scores.tp / (df_scores.tp + df_scores.fn)

# Signed gap between the curves; the sign tells which curve is on top.
df_scores['diff'] = df_scores['precision'] - df_scores['recall']

# Rank thresholds by |precision - recall|: the smallest gap is where the two
# curves cross. Only the `diff` column is made absolute for this view; the
# stored frame keeps the signed value.
df_scores.assign(diff=df_scores['diff'].abs()).sort_values(by='diff')

Unnamed: 0,threshold,tp,fp,fn,tn,precision,recall,diff
64,0.64,134,38,37,84,0.779070,0.783626,0.004556
65,0.65,130,37,41,85,0.778443,0.760234,0.018209
63,0.63,139,43,32,79,0.763736,0.812865,0.049129
66,0.66,124,35,47,87,0.779874,0.725146,0.054728
62,0.62,143,45,28,77,0.760638,0.836257,0.075619
...,...,...,...,...,...,...,...,...
96,0.96,3,0,168,122,1.000000,0.017544,0.982456
97,0.97,1,0,170,122,1.000000,0.005848,0.994152
98,0.98,0,0,171,122,,0.000000,
99,0.99,0,0,171,122,,0.000000,


Precision and Recall curves intersect at threshold 0.64

## Question 4

In [140]:
# F1 = 2*TP / (2*TP + FP + FN). This is algebraically the same as
# 2*P*R / (P + R), but it is computed from the raw counts so that thresholds
# with no predicted positives (precision = 0/0 = NaN) get F1 = 0 instead of NaN.
df_scores["F1"] = 2 * df_scores['tp'] / (2 * df_scores['tp'] + df_scores['fp'] + df_scores['fn'])

In [152]:
# Row with the highest F1: sort descending and keep only the first row.
ranked_by_f1 = df_scores.sort_values(by="F1", ascending=False)
ranked_by_f1.iloc[:1]

Unnamed: 0,threshold,tp,fp,fn,tn,precision,recall,diff,F1
57,0.57,156,57,15,65,0.732394,0.912281,-0.179886,0.8125


F1 reaches its maximum (0.8125) at threshold 0.57.

## Question 5

In [158]:
from sklearn.model_selection import KFold

In [206]:
def train(df_train, y_train, C = 1):
    """Fit a one-hot encoder and a logistic regression on the given frame.

    Only the columns named in the module-level `features` list are used.

    Parameters
    ----------
    df_train : DataFrame holding at least the `features` columns.
    y_train : target values aligned with the rows of `df_train`.
    C : inverse regularisation strength passed to LogisticRegression.

    Returns
    -------
    (DictVectorizer, LogisticRegression) : the fitted encoder and model.
    """
    records = df_train[features].to_dict(orient = 'records')

    vectorizer = DictVectorizer(sparse = False)
    design_matrix = vectorizer.fit_transform(records)

    classifier = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    classifier.fit(design_matrix, y_train)

    return vectorizer, classifier

In [174]:
def predict(df, dv, model):
    """Return the positive-class probability for every row of `df`.

    Only the columns named in the module-level `features` list are used.

    Parameters
    ----------
    df : DataFrame holding at least the `features` columns.
    dv : fitted vectorizer exposing `transform`.
    model : fitted classifier exposing `predict_proba`.

    Returns
    -------
    Column 1 of `model.predict_proba(...)`, one value per row of `df`.
    """
    records = df[features].to_dict(orient = 'records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [208]:
n_splits = 5

scores = []
kfold = KFold(n_splits = n_splits, shuffle = True, random_state = 1)

# Fold-local names are used on purpose: reusing df_train / df_val / y_train /
# y_val / y_pred / dv / model here would overwrite the hold-out split and the
# model from the earlier cells, so re-running those cells afterwards would
# silently operate on the last fold's data.
for train_idx, val_idx in kfold.split(df_full_train):
    fold_train = df_full_train.iloc[train_idx]
    fold_val = df_full_train.iloc[val_idx]

    # df_full_train still carries the target column; train() and predict()
    # select only `features`, so it is not used as an input.
    fold_y_train = fold_train.converted.to_numpy()
    fold_y_val = fold_val.converted.to_numpy()

    fold_dv, fold_model = train(fold_train, fold_y_train)
    fold_pred = predict(fold_val, fold_dv, fold_model)

    scores.append(roc_auc_score(fold_y_val, fold_pred))

# Mean and standard deviation of the validation AUC across the folds.
print('%.3f +- %.3f' % ( np.mean(scores), np.std(scores)))

0.822 +- 0.036


The standard deviation of the AUC across the 5 folds is 0.036.

In [210]:
scores  # validation ROC AUC of each of the 5 cross-validation folds, in fold order

[0.8060745924216483,
 0.8713738368910783,
 0.7754320118852139,
 0.8018368617683685,
 0.8558272713202291]

## Question 6

In [212]:
n_splits = 5

# With an integer random_state every call to kfold.split() yields the same
# folds, so one splitter is shared by all values of C and each C is evaluated
# on identical train/validation partitions.
kfold = KFold(n_splits = n_splits, shuffle = True, random_state = 1)

for C in [0.000001, 0.001, 1]:
    scores = []

    # Fold-local names keep the hold-out split and model from the earlier
    # cells (df_train, df_val, y_train, y_val, y_pred, dv, model) intact.
    for train_idx, val_idx in kfold.split(df_full_train):
        fold_train = df_full_train.iloc[train_idx]
        fold_val = df_full_train.iloc[val_idx]

        fold_y_train = fold_train.converted.to_numpy()
        fold_y_val = fold_val.converted.to_numpy()

        fold_dv, fold_model = train(fold_train, fold_y_train, C)
        fold_pred = predict(fold_val, fold_dv, fold_model)

        scores.append(roc_auc_score(fold_y_val, fold_pred))

    # One line per C: mean and standard deviation of the fold AUCs.
    print('%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

1e-06 0.560 +- 0.024
0.001 0.867 +- 0.029
1 0.822 +- 0.036


C = 0.001 gives the best mean AUC (0.867) across the 5 folds.