# HOMEWORK 3

## Imports

In [332]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

## Dataset

In [333]:
# Course lead-scoring dataset, read directly from its public GitHub URL
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
df = pd.read_csv(DATA_URL)

## Data Preparation

In [334]:
# Feature columns grouped by type; the target column 'converted' is in neither list
categorical = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [335]:
# Replace missing values: the string 'NA' for categorical columns, 0 for numerical ones
for cols, fill_value in ((categorical, 'NA'), (numerical, 0)):
    df[cols] = df[cols].fillna(fill_value)

In [336]:
# Preview the frame after filling missing values
# (the notebook display shows only the first and last rows)
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


## Question 1

In [337]:
# Most frequent value of 'industry' (missing entries were filled with 'NA' above,
# so 'NA' competes as a regular category)
df['industry'].mode()

0    retail
Name: industry, dtype: object

Retail is most frequently observed in 'industry'.

## Question 2

In [338]:
# Absolute correlation of every numerical feature with lead_score
df[numerical].corrwith(df['lead_score']).abs()

number_of_courses_viewed    0.004879
annual_income               0.015610
interaction_count           0.009888
lead_score                  1.000000
dtype: float64

In [339]:
# Absolute correlation of every numerical feature with interaction_count
df[numerical].corrwith(df['interaction_count']).abs()

number_of_courses_viewed    0.023565
annual_income               0.027036
interaction_count           1.000000
lead_score                  0.009888
dtype: float64

Of the pairs checked above, 'annual_income' and 'interaction_count' have the largest absolute correlation (≈ 0.027).

## Split the Data

In [340]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80% for validation
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Reset indices so each split is numbered from 0 (df_full_train keeps its original index)
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [341]:
# Extract the target from each split. pop() returns the 'converted' column and
# removes it from the frame, so the label cannot leak into the feature matrices.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

## Question 3

In [342]:
# Mutual information between the target and 'industry', rounded to 2 decimals.
# NOTE(review): this is computed on df_full_train (train + validation rows);
# switch to df_train / y_train if the score should use the training split only.
round(mutual_info_score(df_full_train['converted'], df_full_train['industry']), 2)

0.01

In [343]:
# Mutual information between the target and 'location', rounded to 2 decimals
round(mutual_info_score(df_full_train['converted'], df_full_train['location']), 2)

0.0

In [344]:
# Mutual information between the target and 'lead_source', rounded to 2 decimals
round(mutual_info_score(df_full_train['converted'], df_full_train['lead_source']), 2)

0.03

In [345]:
# Mutual information between the target and 'employment_status', rounded to 2 decimals
round(mutual_info_score(df_full_train['converted'], df_full_train['employment_status']), 2)

0.01

'lead_source' has the biggest mutual information score (0.03).

## Question 4

In [346]:
# One-hot encode the categorical columns via DictVectorizer
feature_cols = categorical + numerical
dv = DictVectorizer(sparse=False)

# Fit the vectorizer on the training records only, then reuse it for validation
train_dict = df_train[feature_cols].to_dict(orient='records')
val_dict = df_val[feature_cols].to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [347]:
# Fit the baseline logistic regression on the encoded training matrix
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [348]:
# w0: intercept (bias term) of the fitted baseline model
model.intercept_[0]

np.float64(-0.0691472802783609)

In [349]:
# w: feature weights of the fitted baseline model, rounded to 3 decimals for display
model.coef_[0].round(3)

array([-0.   , -0.015,  0.034,  0.003,  0.012, -0.103, -0.025,  0.049,
       -0.02 , -0.013, -0.003, -0.009, -0.032, -0.016,  0.311,  0.051,
        0.02 , -0.012, -0.012, -0.115,  0.08 , -0.03 ,  0.004, -0.011,
       -0.011, -0.006,  0.008,  0.006, -0.033, -0.025,  0.454])

In [350]:
# Validation accuracy: predict the positive class when its probability is >= 0.5
y_pred = model.predict_proba(X_val)[:, 1]
churn_decision = y_pred >= 0.5  # True = predicted to convert
acc = np.mean(churn_decision == y_val)
acc.round(2)

np.float64(0.7)

The model's validation accuracy is 0.70 (0.6997 before rounding); 0.74 is the closest answer option.

## Question 5

### Remove 'Industry'

In [351]:
# Feature-elimination run: same splits, minus the 'industry' column
df_train_no_ind = df_train.drop(columns=['industry'])
df_val_no_ind = df_val.drop(columns=['industry'])

categorical_no_ind = [col for col in categorical if col != 'industry']

In [352]:
# Encode the reduced feature set with its own vectorizer (fit on training records only)
dv_no_ind = DictVectorizer(sparse=False)
cols_no_ind = categorical_no_ind + numerical

train_no_ind_dict = df_train_no_ind[cols_no_ind].to_dict(orient='records')
val_no_ind_dict = df_val_no_ind[cols_no_ind].to_dict(orient='records')

X_train_no_ind = dv_no_ind.fit_transform(train_no_ind_dict)
X_val_no_ind = dv_no_ind.transform(val_no_ind_dict)

In [353]:
# Same hyperparameters as the baseline, trained without 'industry'
model_no_ind = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model_no_ind.fit(X_train_no_ind, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [354]:
# Validation accuracy of the model trained without 'industry' (0.5 probability threshold)
y_pred_no_ind = model_no_ind.predict_proba(X_val_no_ind)[:, 1]
churn_decision_no_ind = y_pred_no_ind >= 0.5
acc_no_ind = np.mean(churn_decision_no_ind == y_val)
acc_no_ind

np.float64(0.6996587030716723)

In [355]:
# Absolute difference from the base model's validation accuracy.
# abs() keeps this comparable with the other feature-elimination runs:
# the question is which removal changes accuracy the least, regardless of sign.
abs(acc_no_ind - acc)

np.float64(0.0)

### Remove 'Employment Status'

In [356]:
# Feature-elimination run: same splits, minus the 'employment_status' column
df_train_no_es = df_train.drop(columns=['employment_status'])
df_val_no_es = df_val.drop(columns=['employment_status'])

categorical_no_es = [col for col in categorical if col != 'employment_status']

In [357]:
# Encode the reduced feature set with its own vectorizer (fit on training records only)
dv_no_es = DictVectorizer(sparse=False)
cols_no_es = categorical_no_es + numerical

train_no_es_dict = df_train_no_es[cols_no_es].to_dict(orient='records')
val_no_es_dict = df_val_no_es[cols_no_es].to_dict(orient='records')

X_train_no_es = dv_no_es.fit_transform(train_no_es_dict)
X_val_no_es = dv_no_es.transform(val_no_es_dict)

In [358]:
# Same hyperparameters as the baseline, trained without 'employment_status'
model_no_es = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model_no_es.fit(X_train_no_es, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [359]:
# Validation accuracy of the model trained without 'employment_status' (0.5 probability threshold)
y_pred_no_es = model_no_es.predict_proba(X_val_no_es)[:, 1]
churn_decision_no_es = y_pred_no_es >= 0.5
acc_no_es = np.mean(churn_decision_no_es == y_val)
acc_no_es

np.float64(0.6962457337883959)

In [367]:
# Absolute gap between the base model's validation accuracy and this run's
abs(acc - acc_no_es)

np.float64(0.0034129692832763903)

### Remove 'Lead_Score'

In [361]:
# Create data sets without lead score status
df_train_no_ls = df_train.copy()
df_val_no_ls = df_val.copy()

del df_train_no_ls['lead_source']
del df_val_no_ls['lead_source']

categorical_no_ls = ['industry', 'employment_status', 'location']

In [362]:
# One-hot encoding for categorical variables 
dv_no_ls = DictVectorizer(sparse=False)

train_no_ls_dict = df_train_no_ls[categorical_no_ls + numerical].to_dict(orient='records')
X_train_no_ls = dv_no_ls.fit_transform(train_no_ls_dict)

val_no_ls_dict = df_val_no_ls[categorical_no_ls + numerical].to_dict(orient='records')
X_val_no_ls = dv_no_ls.transform(val_no_ls_dict)

In [363]:
# Same hyperparameters as the baseline, trained on the X_train_no_ls matrix
model_no_ls = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model_no_ls.fit(X_train_no_ls, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [364]:
# Validation accuracy of model_no_ls (0.5 probability threshold)
y_pred_no_ls = model_no_ls.predict_proba(X_val_no_ls)[:, 1]
churn_decision_no_ls = y_pred_no_ls >= 0.5
acc_no_ls = np.mean(churn_decision_no_ls == y_val)
acc_no_ls

np.float64(0.7030716723549488)

In [366]:
# Absolute gap between the base model's validation accuracy and this run's
abs(acc - acc_no_ls)

np.float64(0.0034129692832765013)

'industry' has the smallest difference (0.0): removing it leaves the validation accuracy unchanged.

## Question 6

In [369]:
# Sweep the regularization strength C and record validation accuracy for each value.
# Loop-local names (model_r, y_pred_r, acc_r) are used so the sweep does not
# overwrite the baseline `model`, `y_pred`, `churn_decision` and `acc`; the
# Question 5 difference cells read `acc` and would otherwise compare against the
# last model of this loop if they were re-run afterwards.
accuracy_by_C = {}

for r in [0.01, 0.1, 1, 10, 100]:
    print(r)

    # Train the model with regularized logistic regression
    model_r = LogisticRegression(solver='liblinear', C=r, max_iter=1000, random_state=42)
    model_r.fit(X_train, y_train)

    # Check accuracy on validation set
    y_pred_r = model_r.predict_proba(X_val)[:, 1]
    acc_r = ((y_pred_r >= 0.5) == y_val).mean()
    accuracy_by_C[r] = acc_r
    print(acc_r)
    print()

# Best C: highest accuracy after rounding to 3 decimals (so float noise does not
# decide the winner); ties are resolved in favour of the smallest C.
best_C = min(accuracy_by_C, key=lambda c: (-round(accuracy_by_C[c], 3), c))

0.01
0.6996587030716723

0.1
0.6996587030716723

1
0.6996587030716723

10
0.6996587030716723

100
0.6996587030716723



All values of C give the same validation accuracy (≈ 0.700), so the smallest one, C = 0.01, is selected.