In [1]:
import requests
import numpy as np
import pandas as pd

# Download the lead-scoring dataset and cache it locally as data.csv.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the dataset.
response.raise_for_status()
with open("data.csv", "wb") as f:
    f.write(response.content)

In [2]:
# Load the downloaded CSV into a DataFrame and preview its first rows.
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [3]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
count,1334,1328,1462.0,1281.0,1362,1399,1462.0,1462.0,1462.0
unique,5,7,,,4,7,,,
top,organic_search,retail,,,self_employed,north_america,,,
freq,282,203,,,352,225,,,
mean,,,2.031464,59886.273224,,,2.976744,0.506108,0.619015
std,,,1.449717,15070.140389,,,1.681564,0.288465,0.485795
min,,,0.0,13929.0,,,0.0,0.0,0.0
25%,,,1.0,49698.0,,,2.0,0.2625,0.0
50%,,,2.0,60148.0,,,3.0,0.51,1.0
75%,,,3.0,69639.0,,,4.0,0.75,1.0


So the most frequent (`top`) value in the `industry` column is `retail`.

In [4]:
# Normalise column labels: lower-case them, then swap spaces for underscores.
lowered_labels = df.columns.str.lower()
df.columns = lowered_labels.str.replace(' ', '_')

# Columns stored with the generic object dtype are treated as text.
categorical_columns = [label for label, dtype in df.dtypes.items() if dtype == 'object']

# Apply the same lower-case / underscore normalisation to the text values.
for column in categorical_columns:
    lowered_values = df[column].str.lower()
    df[column] = lowered_values.str.replace(' ', '_')

In [5]:
# Preview the frame again after the name/value normalisation.
df.head()


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [6]:
# Print the 2x2 correlation matrix for each feature pair of interest.
column_pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]
for first, second in column_pairs:
    print(df[[first, second]].corr())


                   interaction_count  lead_score
interaction_count           1.000000    0.009888
lead_score                  0.009888    1.000000
                          number_of_courses_viewed  lead_score
number_of_courses_viewed                  1.000000   -0.004879
lead_score                               -0.004879    1.000000
                          number_of_courses_viewed  interaction_count
number_of_courses_viewed                  1.000000          -0.023565
interaction_count                        -0.023565           1.000000
                   annual_income  interaction_count
annual_income           1.000000           0.048618
interaction_count       0.048618           1.000000


So the pair with the largest correlation is `annual_income` and `interaction_count` (≈ 0.049).

In [7]:
from sklearn.model_selection import train_test_split

# Separate the binary `converted` target from the feature columns.
y = df["converted"]
X = df.drop("converted", axis=1)

# Hold out 40% of the rows, then halve that hold-out into validation and test
# parts, giving a 60/20/20 split overall.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(*(part.shape for part in (X_train, X_val, X_test)))


(877, 8) (292, 8) (293, 8)


In [8]:
from sklearn.metrics import mutual_info_score
# Mutual information between the raw interaction count and the conversion
# target, computed on the training split.
mutual_info_score(X_train['interaction_count'], y_train)


0.07936454209010488

In [9]:
# Partition the training columns by dtype: text-like columns are categorical,
# anything numeric is numerical.
categorical_dtypes = ["object", "category", "bool"]
categorical_cols = list(X_train.select_dtypes(include=categorical_dtypes).columns)
numerical_cols = list(X_train.select_dtypes(include=["number"]).columns)

In [10]:
# Work on copies so the raw splits stay untouched.
X_train_prep = X_train.copy()
X_val_prep = X_val.copy()

# Medians are taken from the training split only and reused for validation.
X_train_prep[numerical_cols] = X_train_prep[numerical_cols].apply(pd.to_numeric, errors="coerce")
train_medians = X_train_prep[numerical_cols].median()

for split in (X_train_prep, X_val_prep):
    # Missing categories become their own explicit "Missing" level.
    split[categorical_cols] = split[categorical_cols].astype("object").fillna("Missing")
    # Non-numeric entries are coerced to NaN, then filled with the training medians.
    split[numerical_cols] = (
        split[numerical_cols]
        .apply(pd.to_numeric, errors="coerce")
        .fillna(train_medians)
    )


In [11]:
def mutual_info_churn_score(series):
    """Return the mutual information between `series` and the training target.

    The target is read from the module-level `y_train` at call time, so
    `series` must be aligned with whatever `y_train` currently holds.
    """
    score = mutual_info_score(series, y_train)
    return score

In [14]:
# Score every categorical column against the target, strongest first.
categorical_frame = X_train_prep[categorical_cols]
mi = categorical_frame.apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

lead_source          0.028192
employment_status    0.018339
industry             0.015059
location             0.003457
dtype: float64

In [15]:
from sklearn.feature_extraction import DictVectorizer
# One-hot encode the categorical columns and pass the numerical ones through.
dv = DictVectorizer(sparse=False)

# Encode the imputed frames, not the raw splits: the raw splits still contain
# the missing values that the preparation step filled in.
train_dict = X_train_prep[categorical_cols + numerical_cols].to_dict(orient='records')
# Store the matrices under new names so the X_train / X_val DataFrames survive
# and this cell can be re-run safely.
X_train_vec = dv.fit_transform(train_dict)

# The validation rows reuse the vocabulary learned from the training rows.
val_dict = X_val_prep[categorical_cols + numerical_cols].to_dict(orient='records')
X_val_vec = dv.transform(val_dict)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Separate the binary `converted` target from the feature columns.
y = df['converted']
X = df.drop(columns=['converted'])

# 60/20/20 train/validation/test split.
# NOTE(review): this split is stratified on the target, unlike the earlier
# split in this notebook, so the two select different rows - confirm which
# one is intended.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Group the feature columns by dtype; categorical columns come first.
num_cols = X_train.select_dtypes(include=['number']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
feat_cols = cat_cols + num_cols

# Impute on copies so the raw splits stay untouched.
X_train_p = X_train.copy()
X_val_p = X_val.copy()

# Medians come from the training split only and are reused for validation.
X_train_p[num_cols] = X_train_p[num_cols].apply(pd.to_numeric, errors='coerce')
medians = X_train_p[num_cols].median()

for frame in (X_train_p, X_val_p):
    # Missing categories become their own explicit 'Missing' level.
    frame[cat_cols] = frame[cat_cols].astype('object').fillna('Missing')
    # Non-numeric entries are coerced to NaN, then filled with the training medians.
    frame[num_cols] = frame[num_cols].apply(pd.to_numeric, errors='coerce').fillna(medians)

# One-hot encode: fit the vocabulary on the training rows, reuse it for validation.
dv = DictVectorizer(sparse=False)
train_records = X_train_p[feat_cols].to_dict(orient='records')
val_records = X_val_p[feat_cols].to_dict(orient='records')
X_train_vec = dv.fit_transform(train_records)
X_val_vec = dv.transform(val_records)

# Integer label arrays for scikit-learn.
y_train_bin = y_train.astype(int).to_numpy()
y_val_bin = y_val.astype(int).to_numpy()

# Baseline logistic regression, scored by validation accuracy.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_vec, y_train_bin)

val_pred = model.predict(X_val_vec)
acc = accuracy_score(y_val_bin, val_pred)
print(round(acc, 3))


0.695


In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Full feature list: categorical columns first, then numerical ones.
feat_cols = [*cat_cols, *num_cols]

def vec_fit(X_df, cols, dv=None):
    """Vectorize the `cols` columns of `X_df` with a DictVectorizer.

    Parameters
    ----------
    X_df : DataFrame holding the rows to encode.
    cols : list of column names to include.
    dv : optional vectorizer that has already been fitted.

    Returns
    -------
    When `dv` is None, a new dense DictVectorizer is fitted on the rows and
    the pair ``(matrix, vectorizer)`` is returned. When `dv` is given, only
    the transformed matrix is returned.
    """
    records = X_df[cols].to_dict(orient='records')
    if dv is not None:
        return dv.transform(records)
    fresh_dv = DictVectorizer(sparse=False)
    return fresh_dv.fit_transform(records), fresh_dv

# Baseline: encode all features, fit the model, and score it on validation.
Xtr_all, dv_all = vec_fit(X_train_p, feat_cols)
Xva_all = vec_fit(X_val_p, feat_cols, dv_all)

base_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
base_model.fit(Xtr_all, y_train_bin)
base_acc = accuracy_score(y_val_bin, base_model.predict(Xva_all))


def _val_accuracy(cols):
    """Return validation accuracy of a model trained on the `cols` columns only.

    The vectorizer and model are local to this function, so the notebook-level
    `dv` (paired with `model`) is not replaced by a vectorizer fitted on a
    reduced feature set.
    """
    train_matrix, vectorizer = vec_fit(X_train_p, cols)
    val_matrix = vec_fit(X_val_p, cols, vectorizer)
    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(train_matrix, y_train_bin)
    return accuracy_score(y_val_bin, clf.predict(val_matrix))


# Leave-one-feature-out ablation: how much accuracy is lost without each feature.
candidates = ['industry', 'employment_status', 'lead_score']
diffs = {}

for feature in candidates:
    remaining = [c for c in feat_cols if c != feature]
    diffs[feature] = base_acc - _val_accuracy(remaining)

print("Base accuracy:", base_acc)
print("Differences (base - without feature):", diffs)
# min() compares the signed differences; ties go to the earliest candidate.
print("Smallest difference feature:", min(diffs, key=diffs.get))


Base accuracy: 0.6952054794520548
Differences (base - without feature): {'industry': 0.003424657534246589, 'employment_status': 0.003424657534246589, 'lead_score': 0.0}
Smallest difference feature: lead_score


In [24]:
# Sweep the regularisation parameter C and record validation accuracy for each value.
Cs = [0.01, 0.1, 1, 10, 100]
results = {}

for C in Cs:
    m = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42).fit(
        Xtr_all, y_train_bin
    )
    results[C] = accuracy_score(y_val_bin, m.predict(Xva_all))

for C, acc in results.items():
    print(f"{C} {round(acc, 3)}")

# On ties max() keeps the first key it sees, i.e. the smallest C in the sweep.
best_C = max(results, key=lambda candidate: results[candidate])
print("Best C:", best_C, "with acc:", round(results[best_C], 3))


0.01 0.723
0.1 0.695
1 0.695
10 0.695
100 0.695
Best C: 0.01 with acc: 0.723
