In [245]:
import pandas as pd
import numpy as np

df = pd.read_csv('course_lead_scoring.csv')

In [246]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [247]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [248]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [249]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')
    df[c] = df[c].fillna('NA')

In [250]:
numerical = (df.dtypes == 'float64') | (df.dtypes == 'int64')
numerical_columns = list(df.dtypes[numerical].index)
for c in numerical_columns:
    df[c] = df[c].fillna(0.0)

In [251]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

Question 1
What is the most frequent observation (mode) for the column industry?

In [252]:
df.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

In [253]:
df[numerical_columns].corrwith(df.converted).abs()

number_of_courses_viewed    0.435914
annual_income               0.053131
interaction_count           0.374573
lead_score                  0.193673
converted                   1.000000
dtype: float64

Split the data

    Split your data in train/val/test sets with 60%/20%/20% distribution.
    Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
    Make sure that the target value converted is not in your dataframe.

Question 3

    Calculate the mutual information score between converted and other categorical variables in the dataset. Use the training set only.
    Round the scores to 2 decimals using round(score, 2).


In [254]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state=42)
# 20% out of 80% from the full_train df
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state=42)

len(df_full_train), len(df_train), len(df_val), len(df_test)

(1169, 876, 293, 293)

In [255]:
# Target variables
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [256]:
def mutual_info_converted_score(series):
    return mutual_info_score(series, df_full_train.converted)

mi = df_full_train[categorical_columns].apply(mutual_info_converted_score).sort_values(ascending= False)
round(mi, 2)

lead_source          0.03
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

Question 4

    Now let's train a logistic regression.
    Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
    Fit the model on the training dataset.
        To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
        model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    Calculate the accuracy on the validation dataset and round it to 2 decimal digits.


In [257]:
from sklearn.feature_extraction import DictVectorizer
numerical_columns.remove('converted')
train_dicts = df_train[categorical_columns + numerical_columns].to_dict(orient= 'records')
train_dicts[0]

{'lead_source': 'paid_ads',
 'industry': 'retail',
 'employment_status': 'student',
 'location': 'middle_east',
 'number_of_courses_viewed': 0,
 'annual_income': 58472.0,
 'interaction_count': 5,
 'lead_score': 0.03}

In [258]:
dv = DictVectorizer(sparse= False)
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical_columns + numerical_columns].to_dict(orient= 'records')
X_val = dv.transform(val_dicts)

In [259]:
X_train[0].round(3)

array([5.8472e+04, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 5.0000e+00,
       3.0000e-02, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00])

In [260]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [261]:
model.predict_proba(X_train)[:,1]

array([0.57914343, 0.87283491, 0.58816107, 0.51231211, 0.63666475,
       0.75521068, 0.67660732, 0.80463891, 0.31931236, 0.52336705,
       0.42762898, 0.75635962, 0.36365335, 0.479759  , 0.66891575,
       0.79118098, 0.84805492, 0.75295599, 0.74242974, 0.44250074,
       0.59299541, 0.71532757, 0.89505926, 0.40505602, 0.41274906,
       0.49397089, 0.76167979, 0.54183228, 0.91299586, 0.84329958,
       0.43502339, 0.8413228 , 0.45134426, 0.73331191, 0.69094239,
       0.63501139, 0.78633979, 0.70340975, 0.31374645, 0.66994889,
       0.81195513, 0.54070748, 0.52703678, 0.66105373, 0.89692378,
       0.93952744, 0.6910381 , 0.57569838, 0.43601078, 0.58839132,
       0.40836197, 0.62054107, 0.59360954, 0.80782455, 0.55881104,
       0.89447257, 0.85676682, 0.3290991 , 0.88454471, 0.61158866,
       0.55737895, 0.80158813, 0.29433961, 0.76846368, 0.26685382,
       0.61036624, 0.70840796, 0.76184472, 0.88772862, 0.60724473,
       0.74046877, 0.3967234 , 0.94279447, 0.89334272, 0.77167

In [262]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([0.61192163, 0.79982617, 0.53021344, 0.47131479, 0.57066131,
       0.44227168, 0.87127669, 0.84883115, 0.83290037, 0.61497801,
       0.54968026, 0.78153088, 0.69039786, 0.77017122, 0.5265944 ,
       0.91706425, 0.53170635, 0.42123048, 0.30146455, 0.84881583,
       0.79488653, 0.73670375, 0.44527211, 0.64838383, 0.4176882 ,
       0.75393418, 0.90166116, 0.33903049, 0.43181431, 0.9680681 ,
       0.92018714, 0.37487988, 0.652301  , 0.90650057, 0.75164116,
       0.64202121, 0.82250075, 0.83375553, 0.659116  , 0.30978853,
       0.78942264, 0.35546366, 0.96517758, 0.63389304, 0.51274195,
       0.53230533, 0.82287785, 0.744074  , 0.73452313, 0.68955217,
       0.46964443, 0.84539252, 0.55635243, 0.92637871, 0.65258021,
       0.61526273, 0.63816995, 0.28304018, 0.48049824, 0.57890618,
       0.35497342, 0.62175051, 0.38960778, 0.61156056, 0.85304278,
       0.75430136, 0.89185954, 0.71946459, 0.95387623, 0.89209517,
       0.75277087, 0.33850139, 0.61376593, 0.51622275, 0.64088

In [263]:
converted_decision = (y_pred >= 0.5)

In [264]:
(y_val == converted_decision).mean()

0.6996587030716723

Question 5

    Let's find the least useful feature using the feature elimination technique.
    Train a model using the same features and parameters as in Q4 (without rounding).
    Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
    For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?

    'industry'
    'employment_status'
    'lead_score'


In [265]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'annual_income': -0.0,
 'employment_status=NA': -0.015,
 'employment_status=employed': 0.034,
 'employment_status=self_employed': 0.003,
 'employment_status=student': 0.012,
 'employment_status=unemployed': -0.103,
 'industry=NA': -0.025,
 'industry=education': 0.049,
 'industry=finance': -0.02,
 'industry=healthcare': -0.013,
 'industry=manufacturing': -0.003,
 'industry=other': -0.009,
 'industry=retail': -0.032,
 'industry=technology': -0.016,
 'interaction_count': 0.311,
 'lead_score': 0.051,
 'lead_source=NA': 0.02,
 'lead_source=events': -0.012,
 'lead_source=organic_search': -0.012,
 'lead_source=paid_ads': -0.115,
 'lead_source=referral': 0.08,
 'lead_source=social_media': -0.03,
 'location=NA': 0.004,
 'location=africa': -0.011,
 'location=asia': -0.011,
 'location=australia': -0.006,
 'location=europe': 0.008,
 'location=middle_east': 0.006,
 'location=north_america': -0.033,
 'location=south_america': -0.025,
 'number_of_courses_viewed': 0.454}

'industry'
'employment_status'
'lead_score'

In [266]:
df_train.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score'],
      dtype='object')

In [267]:
small_industry = ['lead_source', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score']
small_employment = ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
        'location', 'interaction_count', 'lead_score']
small_lead = ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count']

INDUSTRY DF

In [275]:
dicts_train_small = df_train[small_industry].to_dict(orient= 'records')
dicts_val_small = df_val[small_industry].to_dict(orient= 'records')
dv_small = DictVectorizer(sparse= False)
dv_small.fit(dicts_train_small)
dv_small.fit(dicts_val_small)

In [276]:
X_train_small = dv_small.transform(dicts_train_small)
X_val_small = dv_small.transform(dicts_val_small)
model_small = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_small.fit(X_train_small, y_train)

6996587030716723

In [277]:
y_pred = model_small.predict_proba(X_val_small)[:,1]
converted_decision = (y_pred >= 0.5)
(y_val == converted_decision).mean()

0.6996587030716723

'employment_status'

In [273]:
dicts_train_small = df_train[small_employment].to_dict(orient= 'records')
dicts_val_small = df_val[small_employment].to_dict(orient= 'records')
dv_small = DictVectorizer(sparse= False)
dv_small.fit(dicts_train_small)
dv_small.fit(dicts_val_small)
X_train_small = dv_small.transform(dicts_train_small)
X_val_small = dv_small.transform(dicts_val_small)
model_small = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_small.fit(X_train_small, y_train)
y_pred = model_small.predict_proba(X_val_small)[:,1]
converted_decision = (y_pred >= 0.5)
(y_val == converted_decision).mean()

0.6962457337883959

'lead_score'

In [278]:
dicts_train_small = df_train[small_lead].to_dict(orient= 'records')
dicts_val_small = df_val[small_lead].to_dict(orient= 'records')
dv_small = DictVectorizer(sparse= False)
dv_small.fit(dicts_train_small)
dv_small.fit(dicts_val_small)
X_train_small = dv_small.transform(dicts_train_small)
X_val_small = dv_small.transform(dicts_val_small)
model_small = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_small.fit(X_train_small, y_train)
y_pred = model_small.predict_proba(X_val_small)[:,1]
converted_decision = (y_pred >= 0.5)
(y_val == converted_decision).mean()

0.7064846416382252