In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')

In [3]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [8]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [9]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [10]:
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [11]:
numerical =['number_of_courses_viewed', 'annual_income','interaction_count', 'lead_score', 'converted']

In [12]:
categorical = ['lead_source', 'industry','employment_status', 'location',]

In [13]:
df[numerical] = df[numerical].fillna(0.0)

In [16]:
df[categorical] = df[categorical].fillna('NA')

In [18]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [20]:
df['industry'].mode()[0]

'retail'

In [21]:
from sklearn.model_selection import train_test_split

In [27]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [28]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [29]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [30]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [31]:
corr = df[numerical].corr()

In [32]:
pairs = [
     ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

In [33]:
pair_corrs = {}
max_corr = 0
best_pair = None

for p in pairs:
    value = corr[p[0]][p[1]]
    pair_corrs[p] = value
    if abs(value) > abs(max_corr):
        max_corr = value
        best_pair = p


In [34]:
best_pair

('annual_income', 'interaction_count')

In [35]:
from sklearn.metrics import mutual_info_score

In [36]:
def mutual_info_converted_score(series):
    return mutual_info_score(series, df_full_train.converted)

In [37]:
mi = df_full_train[categorical].apply(mutual_info_converted_score)
mi.sort_values(ascending=False)

lead_source          0.025665
employment_status    0.013258
industry             0.011685
location             0.002253
dtype: float64

In [38]:
from sklearn.feature_extraction import DictVectorizer

In [45]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

train_cols = [c for c in categorical + numerical if c in df_train.columns]
val_cols = [c for c in categorical + numerical if c in df_val.columns]

train_dict = df_train[train_cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[val_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)


In [46]:
from sklearn.linear_model import LogisticRegression

In [47]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [57]:
val_accuracy =round(model.score(X_val, y_val),2)
val_accuracy

0.7

In [70]:
accuracy_diff = {}
elim_feats = ['industry', 'employment_status', 'lead_score']

for feature in elim_feats:
    if feature not in df_train.columns:
        continue  

    
    train_subset = [f for f in categorical + numerical if f != feature and f in df_train.columns]
    val_subset = [f for f in categorical + numerical if f != feature and f in df_val.columns]
    
    X_train_sub = dv.fit_transform(df_train[train_subset].to_dict(orient='records'))
    X_val_sub = dv.transform(df_val[val_subset].to_dict(orient='records'))

In [72]:
model_sub = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_sub.fit(X_train_sub, y_train)
acc_sub = model_sub.score(X_val_sub, y_val)


accuracy_diff[feature] =  val_accuracy - acc_sub
print(f"Feature: {feature}, Accuracy without feature: {acc_sub:.2f}, Difference: {accuracy_diff[feature]:.2f}")


Feature: lead_score, Accuracy without feature: 0.71, Difference: -0.01


In [75]:
least_useful_feature = min(accuracy_diff, key=accuracy_diff.get)
print("\nLeast useful feature:", least_useful_feature)
print("Smallest accuracy difference:", round(accuracy_diff[least_useful_feature], 2))


Least useful feature: lead_score
Smallest accuracy difference: -0.01


In [76]:
dv = DictVectorizer(sparse=False)

train_cols = [c for c in categorical + numerical if c in df_train.columns]
val_cols = [c for c in categorical + numerical if c in df_val.columns]

train_dict = df_train[train_cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[val_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [77]:
C_values = [0.01, 0.1, 1, 10, 100]


In [78]:
val_accuracies = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    acc = round(model.score(X_val, y_val), 3)
    val_accuracies[C] = acc
    print(f"C={C}, Validation Accuracy={acc}")

C=0.01, Validation Accuracy=0.7
C=0.1, Validation Accuracy=0.7
C=1, Validation Accuracy=0.7
C=10, Validation Accuracy=0.7
C=100, Validation Accuracy=0.7


In [80]:
best_C = max(val_accuracies, key=val_accuracies.get)
best_C

0.01