In [90]:
import pandas as pd
import numpy as np

In [91]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [92]:
df.dtypes[df.dtypes == 'object']
strings = list(df.dtypes[df.dtypes == 'object'].index)
strings

['lead_source', 'industry', 'employment_status', 'location']

In [93]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].fillna('NA')

numerical = list(df.dtypes[df.dtypes != 'object'].index)
for n in numerical:
    df[n] = df[n].fillna('0.0')

df.isnull().sum()


lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [94]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [95]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income                object
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [96]:
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
categorical = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]

In [97]:
df['industry'].mode()[0]

'retail'

In [98]:
df['annual_income'] = pd.to_numeric(df['annual_income'], errors='coerce')
df.annual_income = df.annual_income.fillna(0.0)

In [99]:
df_num = df.select_dtypes(include=['number'])
corr = df_num.corr()
corr


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [100]:
pairs = {
    'interaction_count & lead_score': corr.loc['interaction_count', 'lead_score'],
    'number_of_courses_viewed & lead_score': corr.loc['number_of_courses_viewed', 'lead_score'],
    'number_of_courses_viewed & interaction_count': corr.loc['number_of_courses_viewed', 'interaction_count'],
    'annual_income & interaction_count': corr.loc['annual_income', 'interaction_count'],
}

pd.DataFrame.from_dict(pairs, orient='index', columns=['correlation'])

Unnamed: 0,correlation
interaction_count & lead_score,0.009888
number_of_courses_viewed & lead_score,-0.004879
number_of_courses_viewed & interaction_count,-0.023565
annual_income & interaction_count,0.027036


In [101]:
from sklearn.model_selection import train_test_split

In [102]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [103]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [104]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [105]:
from sklearn.metrics import mutual_info_score

In [106]:
def mutual_info_converted_score(series):
    return mutual_info_score(series, df_full_train.converted)

In [107]:
mi = df_full_train[categorical].apply(mutual_info_converted_score)
mi.sort_values(ascending=False)
round(mi, 2)

lead_source          0.03
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [108]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [109]:
features = numerical + categorical

In [110]:
for col in numerical:
    df_train[col] = pd.to_numeric(df_train[col], errors='coerce').fillna(0.0)
    df_val[col]   = pd.to_numeric(df_val[col],   errors='coerce').fillna(0.0)
    df_test[col]  = pd.to_numeric(df_test[col],  errors='coerce').fillna(0.0)

In [111]:
dv = DictVectorizer(sparse=False)

In [112]:
train_dicts = df_train[features].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[features].to_dict(orient='records')
X_val = dv.transform(val_dicts)

test_dicts = df_test[features].to_dict(orient='records')
X_test = dv.transform(test_dicts)

In [113]:
y_train = y_train.astype(int)
y_val = y_val.astype(int)
y_test = y_test.astype(int)

In [114]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [115]:
y_val_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_val_pred)

print("Validation accuracy (rounded):", round(acc, 2))

Validation accuracy (rounded): 0.7


In [116]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


target = 'converted'

numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
categorical = ['lead_source', 'industry', 'employment_status', 'location']
features = numerical + categorical

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

y_train = df_train[target].values
y_val = df_val[target].values



def train_and_eval(df_train, df_val, y_train, y_val, feature_list):
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(df_train[feature_list].to_dict(orient='records'))
    X_val = dv.transform(df_val[feature_list].to_dict(orient='records'))

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return accuracy_score(y_val, y_pred)

acc_full = train_and_eval(df_train, df_val, y_train, y_val, features)
print(f"Baseline accuracy (all features): {acc_full:.4f}")

feature_diffs = {}

for f in ['industry', 'employment_status', 'lead_score']:
    reduced = [col for col in features if col != f]
    acc_no_f = train_and_eval(df_train, df_val, y_train, y_val, reduced)
    diff = acc_full - acc_no_f
    feature_diffs[f] = diff
    print(f"Without {f:20s} → acc = {acc_no_f:.4f}  | diff = {diff:.4f}")

least_useful = min(feature_diffs, key=lambda x: abs(feature_diffs[x]))
print("\nLeast useful feature:", least_useful)

Baseline accuracy (all features): 0.6997
Without industry             → acc = 0.6997  | diff = 0.0000
Without employment_status    → acc = 0.6962  | diff = 0.0034
Without lead_score           → acc = 0.7065  | diff = -0.0068

Least useful feature: industry


In [118]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
categorical = ['lead_source', 'industry', 'employment_status', 'location']
features = numerical + categorical


dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(df_train[features].to_dict(orient='records'))
X_val = dv.transform(df_val[features].to_dict(orient='records'))


y_train = df_train.converted.values
y_val = df_val.converted.values

C_values = [0.01, 0.1, 1, 10, 100]

results = {}

for c in C_values:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    results[c] = acc
    print(f"C={c:<6} -> Accuracy: {acc:.3f}")

best_C = max(results, key=results.get)
print(f"\nBest C: {best_C} with accuracy {results[best_C]:.3f}")

C=0.01   -> Accuracy: 0.700
C=0.1    -> Accuracy: 0.700
C=1      -> Accuracy: 0.700
C=10     -> Accuracy: 0.700
C=100    -> Accuracy: 0.700

Best C: 0.01 with accuracy 0.700
