In [22]:
import pandas as pd
import numpy as np

In [23]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [24]:
df.head().T

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [25]:
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [26]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [27]:
df.columns= df.columns.str.lower().str.replace(' ', '_')
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for column in categorical_columns:
    df[column] = df[column].str.lower().str.replace(' ', '_')
    #df[column].fillna('NA', inplace=True)
    df.fillna({column:'NA'}, inplace=True)

df.head().T

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [28]:
df.isna().sum()

lead_source                   0
industry                      0
number_of_courses_viewed      0
annual_income               181
employment_status             0
location                      0
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [29]:
df.annual_income.fillna(0, inplace=True)
df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.annual_income.fillna(0, inplace=True)


lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [30]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train,test_size=0.25, random_state=42) #20% of the orignal df

In [31]:
df_train= df_train.reset_index(drop= True)
df_val= df_val.reset_index(drop= True)
df_test= df_test.reset_index(drop= True)

In [32]:
y_train=df_train.converted.values
y_val=df_val.converted.values
y_test=df_test.converted.values

In [33]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [34]:
df_full_train.converted.value_counts(normalize=True)

converted
1    0.607357
0    0.392643
Name: proportion, dtype: float64

In [35]:
df_full_train.industry.value_counts(normalize=True)

industry
finance          0.142857
retail           0.142002
other            0.132592
healthcare       0.128315
education        0.122327
manufacturing    0.119760
technology       0.115483
NA               0.096664
Name: proportion, dtype: float64

In [36]:
df_train.industry.value_counts(normalize=True)

industry
retail           0.142694
finance          0.141553
healthcare       0.133562
other            0.133562
manufacturing    0.124429
education        0.121005
technology       0.114155
NA               0.089041
Name: proportion, dtype: float64

In [37]:
numerical_columns = list(df.dtypes[df.dtypes != 'object'].index)
numerical_columns

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [38]:
df_full_train[numerical_columns].corrwith(df_full_train.converted)

number_of_courses_viewed    0.442068
annual_income               0.029612
interaction_count           0.378482
lead_score                  0.225641
converted                   1.000000
dtype: float64

In [39]:
df[numerical_columns].corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [40]:
from sklearn.metrics import mutual_info_score

In [42]:
mutual_info_score(df_full_train.converted, df_full_train.industry)

0.011684562750165564

In [43]:
mutual_info_score(df_full_train.converted, df_full_train.location)

0.0022530354195563346

In [45]:
mutual_info_score(df_full_train.converted, df_full_train.lead_source)

0.025665373935054955

In [47]:
mutual_info_score(df_full_train.converted, df_full_train.employment_status)

0.013258496589914293