# Data Preparation

In [12]:
import pandas as pd
# Load the lead-scoring dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('course_lead_scoring.csv')

In [13]:
# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [14]:
# Inspect column dtypes to decide which features are categorical and which are numerical.
data.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [15]:
# Feature columns grouped by kind. The target column `converted` is
# deliberately left out of both lists.
categorical = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [16]:
# Count missing values per categorical column before imputing.
data[categorical].isnull().sum()

lead_source          128
industry             134
employment_status    100
location              63
dtype: int64

In [17]:
# Count missing values per numerical column before imputing.
data[numerical].isnull().sum()

number_of_courses_viewed      0
annual_income               181
interaction_count             0
lead_score                    0
dtype: int64

In [18]:
# Category frequencies for lead_source before imputation (value_counts omits NaN by default).
data['lead_source'].value_counts()

lead_source
organic_search    282
social_media      278
paid_ads          264
referral          260
events            250
Name: count, dtype: int64

In [19]:
# Impute missing categorical values with the literal string 'NA' so that
# missingness becomes its own category.
data[categorical] = data[categorical].fillna('NA')

In [20]:
# Re-check lead_source: the former missing values now show up as an explicit 'NA' category.
data['lead_source'].value_counts()

lead_source
organic_search    282
social_media      278
paid_ads          264
referral          260
events            250
NA                128
Name: count, dtype: int64

In [21]:
# Category frequencies for industry after imputation.
data['industry'].value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [23]:
# Category frequencies for employment_status after imputation.
data['employment_status'].value_counts()

employment_status
self_employed    352
student          348
unemployed       334
employed         328
NA               100
Name: count, dtype: int64

In [24]:
# Category frequencies for location after imputation.
data['location'].value_counts()

location
north_america    225
europe           216
middle_east      198
asia             195
south_america    192
africa           188
australia        185
NA                63
Name: count, dtype: int64

In [25]:
# Impute missing numerical values with the number 0, not the string '0':
# a string fill value would turn the affected float column into a mixed
# object column, so it would no longer be a genuinely numeric feature.
data[numerical] = data[numerical].fillna(0)

In [26]:
# Verify that no missing values remain in the numerical columns.
data[numerical].isnull().sum()

number_of_courses_viewed    0
annual_income               0
interaction_count           0
lead_score                  0
dtype: int64

# Q1

In [27]:
# Most frequent value of industry. This runs after imputation, so 'NA' competes as a category.
data['industry'].mode()

0    retail
Name: industry, dtype: object

## Answer
* retail
---

# Q2

In [28]:
# Pairwise correlation matrix of the numerical features (DataFrame.corr defaults to Pearson).
data[numerical].corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


## Answer
* `interaction_count` & `annual_income` have the largest pairwise correlation: 0.027036
---

# Set up validation framework

In [30]:
from sklearn.model_selection import train_test_split

In [32]:

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining
# 80% (i.e. 20% of the whole) for validation. Fixed seeds keep it reproducible.
df_full_train, df_test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [33]:
# Row counts of the train / validation / test splits.
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [35]:
# After the split the rows still carry their original index labels from `data`.
df_train.head(2)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1077,paid_ads,retail,0,58472.0,student,middle_east,5,0.03,0
463,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77,1


In [36]:
# Give every split a fresh 0..n-1 index; drop=True discards the old labels
# instead of keeping them as a new column.
df_train, df_val, df_test = (
    frame.reset_index(drop=True)
    for frame in (df_train, df_val, df_test)
)

In [37]:
# Confirm the index now restarts at 0.
df_train.head(2)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,retail,0,58472.0,student,middle_east,5,0.03,0
1,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77,1


In [38]:

# Separate the target from the features: pop() removes the `converted`
# column from each split and returns it, so the label arrays are kept
# while the feature frames no longer contain the target.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

# Q3

In [39]:
from sklearn.metrics import mutual_info_score

In [40]:

def mutual_info_churn_score(series):
    """Return the mutual information between one feature column and the training target.

    Despite the "churn" in the name, the target used here is the global
    ``y_train``, which holds the ``converted`` labels of the training split.

    Parameters
    ----------
    series : array-like
        Values of a single feature, aligned row-for-row with ``y_train``.

    Returns
    -------
    float
        The score computed by ``sklearn.metrics.mutual_info_score``.
    """
    return mutual_info_score(labels_true=series, labels_pred=y_train)

In [43]:
# Score every categorical feature against the training target (one call per
# column), then rank the features from most to least informative.
mi = df_train[categorical].apply(mutual_info_churn_score, axis=0)
(
    mi
    .sort_values(ascending=False)
    .round(2)
)

lead_source          0.04
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

## Answer
* `lead_source` has the highest mutual information score: 0.04
---