In [66]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [67]:
#load the raw dataset; the path is relative to the notebook's working directory
df = pd.read_csv('data/dataset.csv')

In [68]:
#normalise text: column names and every string-valued cell are lowercased
#and have their spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

#columns stored with dtype 'object' are the ones treated as strings here
string_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

for col in string_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

In [69]:
#overview of the frame: dtype and non-null count for each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
prospect_id                                      9240 non-null object
lead_number                                      9240 non-null int64
lead_origin                                      9240 non-null object
lead_source                                      9204 non-null object
do_not_email                                     9240 non-null object
do_not_call                                      9240 non-null object
converted                                        9240 non-null int64
totalvisits                                      9103 non-null float64
total_time_spent_on_website                      9240 non-null int64
page_views_per_visit                             9103 non-null float64
last_activity                                    9137 non-null object
country                                          6779 non-null object
specialization                                   7802 

In [70]:
#number of missing values per column
df.isnull().sum()

prospect_id                                         0
lead_number                                         0
lead_origin                                         0
lead_source                                        36
do_not_email                                        0
do_not_call                                         0
converted                                           0
totalvisits                                       137
total_time_spent_on_website                         0
page_views_per_visit                              137
last_activity                                     103
country                                          2461
specialization                                   1438
how_did_you_hear_about_x_education               2207
what_is_your_current_occupation                  2690
what_matters_most_to_you_in_choosing_a_course    2709
search                                              0
magazine                                            0
newspaper_article           

Many columns have missing values. Below, I use different imputation methods to replace<br>
the missing values, depending on the column.

In [71]:
#numeric columns with gaps (e.g. totalvisits and page_views_per_visit have 137
#missing values each) are imputed with their column mean.
#The result is assigned back to the column instead of calling
#df[col].fillna(..., inplace=True): the chained in-place form mutates a
#temporary object, which raises a FutureWarning on pandas 2.x and silently
#does nothing once Copy-on-Write is active.
#NOTE(review): the means are computed on the full dataset, i.e. before the
#train/validation/test split further down, so test rows influence the fill
#value - confirm this is acceptable.
mean_imputed_columns = [
    'totalvisits',
    'page_views_per_visit',
    'asymmetrique_activity_score',
    'asymmetrique_profile_score',
]

for col in mean_imputed_columns:
    df[col] = df[col].fillna(df[col].mean())


In [72]:
#feature names grouped by type: one list for categorical columns and one for
#numerical columns (the target 'converted' is in neither list)
#NOTE(review): 'prospect_id' and 'lead_number' look like record identifiers
#rather than features - confirm whether they should be used for modelling
categorical = [
    'prospect_id', 'lead_origin', 'lead_source',
    'do_not_email', 'do_not_call',
    'last_activity', 'country', 'specialization',
    'how_did_you_hear_about_x_education',
    'what_is_your_current_occupation',
    'what_matters_most_to_you_in_choosing_a_course',
    'search', 'magazine', 'newspaper_article', 'x_education_forums',
    'newspaper', 'digital_advertisement', 'through_recommendations',
    'receive_more_updates_about_our_courses',
    'tags', 'lead_quality',
    'update_me_on_supply_chain_content', 'get_updates_on_dm_content',
    'lead_profile', 'city',
    'asymmetrique_activity_index', 'asymmetrique_profile_index',
    'i_agree_to_pay_the_amount_through_cheque',
    'a_free_copy_of_mastering_the_interview',
    'last_notable_activity',
]

numerical = [
    'lead_number',
    'totalvisits', 'total_time_spent_on_website', 'page_views_per_visit',
    'asymmetrique_activity_score', 'asymmetrique_profile_score',
]

In [73]:
#check if any of the numerical variables still have NaN values:
#print the number of missing values for each numerical column, one per line
#(the former 'n = 0' / 'n =+ 1' counter was dead code: the loop variable
#overwrote it, and '=+' assigns +1 instead of incrementing)
for col in numerical:
    print(df[col].isnull().sum())

0
0
0
0
0
0


In [74]:
#for 'lead_source', use imputation method 'most common class':
#value_counts() sorts by frequency, so its first index entry is the most
#frequent value. The filled column is assigned back rather than using a
#chained fillna(..., inplace=True), which does not update df under
#pandas Copy-on-Write.
most_common_lead_source = df['lead_source'].value_counts().index[0]
df['lead_source'] = df['lead_source'].fillna(most_common_lead_source)

In [75]:
#for 'last_activity', use imputation method 'most common class':
#value_counts() sorts by frequency, so its first index entry is the most
#frequent value. The filled column is assigned back rather than using a
#chained fillna(..., inplace=True), which does not update df under
#pandas Copy-on-Write.
most_common_last_activity = df['last_activity'].value_counts().index[0]
df['last_activity'] = df['last_activity'].fillna(most_common_last_activity)

In [76]:
#for 'country' and the other categorical columns listed below, use the
#imputation method 'unknown class': missing values become their own
#category, "unknown".
#The columns are filled in one loop and assigned back; the previous
#per-column fillna(..., inplace=True) calls were chained in-place mutations,
#which do not update df under pandas Copy-on-Write.
unknown_imputed_columns = [
    'country',
    'specialization',
    'how_did_you_hear_about_x_education',
    'what_is_your_current_occupation',
    'what_matters_most_to_you_in_choosing_a_course',
    'lead_quality',
    'tags',
    'lead_profile',
    'city',
    'asymmetrique_activity_index',
    'asymmetrique_profile_index',
]

for col in unknown_imputed_columns:
    df[col] = df[col].fillna("unknown")

In [77]:
#re-count missing values per column to confirm the imputation left no gaps
df.isnull().sum()

prospect_id                                      0
lead_number                                      0
lead_origin                                      0
lead_source                                      0
do_not_email                                     0
do_not_call                                      0
converted                                        0
totalvisits                                      0
total_time_spent_on_website                      0
page_views_per_visit                             0
last_activity                                    0
country                                          0
specialization                                   0
how_did_you_hear_about_x_education               0
what_is_your_current_occupation                  0
what_matters_most_to_you_in_choosing_a_course    0
search                                           0
magazine                                         0
newspaper_article                                0
x_education_forums             

In [78]:
#splitting the dataset in different subsets
from sklearn.model_selection import train_test_split

In [79]:
#shuffle df and hold out 20% of the rows as the test set (df_test);
#the remaining 80% (df_train_full) is split again into train/validation below
#a fixed random_state makes the shuffle, and therefore the split, reproducible
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [80]:
#split df_train_full again: 33% of it becomes the validation set
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.33,
    random_state=11,
)

#separate the target from the features: pop() returns the 'converted'
#column and removes it from the frame, .values turns it into a 1-D NumPy array
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values

In [81]:
#distribution of the target in the full training portion
#about 62% of the leads did not convert (4560 of 7392), about 38% did
df_train_full['converted'].value_counts()

0    4560
1    2832
Name: converted, dtype: int64

In [82]:
#conversion rate = mean of the 0/1 target
#at roughly 38% positives the dataset is moderately imbalanced
global_mean = df_train_full['converted'].mean()
round(global_mean, 5)

0.38312

<b>Mutual Information (MI)</b>
* tells us how much information we learn about one variable if we know the value of another variable
* we use it to measure the mutual dependency between 2 variables
* higher mutual information means higher dependency
* `mutual_info_score` treats both inputs as discrete labels, so we apply it to the categorical variables only; numerical variables need a different measure (e.g. correlation)

In [84]:
#rank all categorical variables by their mutual information with the target,
#from most to least informative
#NOTE(review): in the recorded output prospect_id ranks first with MI ~0.6656,
#which equals the entropy of 'converted' at a 38.3% conversion rate; that is
#what an identifier that is (presumably) unique per row produces, so it is not
#a usable signal. 'tags' is the strongest remaining variable.
from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    """Return the mutual information between `series` and the target.

    The target is the `converted` column of `df_train_full`, so `series`
    is expected to be a column of that same frame.
    """
    target = df_train_full.converted
    return mutual_info_score(series, target)

mi_by_feature = df_train_full[categorical].apply(calculate_mi)
df_mi = mi_by_feature.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
prospect_id,0.6655694
tags,0.3769913
lead_quality,0.1861223
lead_profile,0.1138871
what_is_your_current_occupation,0.0923814
last_activity,0.08462909
last_notable_activity,0.07251663
lead_source,0.0612376
what_matters_most_to_you_in_choosing_a_course,0.05569131
lead_origin,0.05559875
