In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [3]:
data = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

In [4]:
!wget $data -O data-week-4.csv 

--2025-10-24 06:21:25--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
connected. to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... 
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘data-week-4.csv’


2025-10-24 06:21:26 (824 KB/s) - ‘data-week-4.csv’ saved [80876/80876]



In [5]:
df = pd.read_csv("data-week-4.csv")
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [6]:
#Checking for missing values
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
# Replacing columns with nulls
categorical_columns = df.select_dtypes('object').columns
numerical_columns = df.columns.difference(categorical_columns)        
# 2. force every numeric-looking column to numeric, turning bad entries into NaN
for c in numerical_columns:
    df[c] = pd.to_numeric(df[c], errors='coerce')
# 3. now impute NaNs
df[categorical_columns] = df[categorical_columns].fillna('NA')
df[numerical_columns] = df[numerical_columns].fillna(0.0)

In [8]:
df.converted.head()

0    1
1    0
2    1
3    0
4    1
Name: converted, dtype: int64

In [9]:
# Splitting the data
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state= 1)
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 1)

In [10]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [11]:

from sklearn.metrics import roc_auc_score

# 1. Pick the four columns we want to screen
num_cols = ['lead_score', 'number_of_courses_viewed',
            'interaction_count', 'annual_income']

# 2. Container for the final, comparable AUCs
auc_results = {}

for col in num_cols:
    # --------------------------------------------------------
    # 2a.  RAW RANKING TEST
    #      "If we used this column *as-is* as our churn score,
    #       how well would it rank customers?"
    # --------------------------------------------------------
    raw_auc = roc_auc_score(y_train, df_train[col])

    # --------------------------------------------------------
    # 2b.  DIRECTION FIX
    #      AUC < 0.5  ⇒  the ranking is upside-down.
    #      Flip the sign so low values → high churn risk
    #      (now the relation is positive and AUC ≥ 0.5).
    # --------------------------------------------------------
    if raw_auc < 0.5:
        final_auc = roc_auc_score(y_train, -df_train[col])
    else:
        final_auc = raw_auc

    # --------------------------------------------------------
    # 2c.  STORE THE NUMBER WE CAN FAIRLY COMPARE
    # --------------------------------------------------------
    auc_results[col] = final_auc

# 3. Show the ranking
import pandas as pd
pd.Series(auc_results).sort_values(ascending=False)

number_of_courses_viewed    0.763568
interaction_count           0.738270
lead_score                  0.614499
annual_income               0.551958
dtype: float64

In [12]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [15]:
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,1000


In [18]:
y_pred = model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, y_pred)
auc_score

0.8171316268814112

In [22]:
thresholds = np.linspace(0, 1, 101)

scores = []

for t in thresholds:
    score = accuracy_score(y_val, y_pred >= t)
    print('%.2f %.3f' % (t, score))
    scores.append(score)

0.00 0.584
0.01 0.584
0.02 0.584
0.03 0.584
0.04 0.584
0.05 0.584
0.06 0.584
0.07 0.584
0.08 0.584
0.09 0.584
0.10 0.584
0.11 0.584
0.12 0.584
0.13 0.584
0.14 0.584
0.15 0.584
0.16 0.584
0.17 0.584
0.18 0.584
0.19 0.584
0.20 0.584
0.21 0.584
0.22 0.584
0.23 0.584
0.24 0.584
0.25 0.584
0.26 0.584
0.27 0.587
0.28 0.590
0.29 0.594
0.30 0.594
0.31 0.594
0.32 0.597
0.33 0.597
0.34 0.601
0.35 0.601
0.36 0.608
0.37 0.608
0.38 0.614
0.39 0.628
0.40 0.642
0.41 0.642
0.42 0.659
0.43 0.662
0.44 0.676
0.45 0.679
0.46 0.686
0.47 0.689
0.48 0.700
0.49 0.696
0.50 0.700
0.51 0.706
0.52 0.717
0.53 0.730
0.54 0.737
0.55 0.747
0.56 0.747
0.57 0.754
0.58 0.751
0.59 0.754
0.60 0.747
0.61 0.751
0.62 0.751
0.63 0.744
0.64 0.744
0.65 0.734
0.66 0.720
0.67 0.727
0.68 0.717
0.69 0.703
0.70 0.693
0.71 0.693
0.72 0.676
0.73 0.659
0.74 0.662
0.75 0.655
0.76 0.655
0.77 0.645
0.78 0.635
0.79 0.625
0.80 0.614
0.81 0.611
0.82 0.604
0.83 0.584
0.84 0.580
0.85 0.567
0.86 0.549
0.87 0.526
0.88 0.515
0.89 0.505
0.90 0.495

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
actual_positive = (y_val == 1)     
actual_negative   = (y_val == 0)
predict_positive = (y_pred >= t)     
predict_negative  = (y_pred < t)
tp = (predict_positive & actual_positive).sum()
tn = (predict_negative  & actual_negative).sum()
fp = (predict_positive & actual_negative).sum() 
fn = (predict_negative  & actual_positive).sum()

In [27]:
p = tp / (tp + fp)
p

  p = tp / (tp + fp)  # PRECISION: % of predicted churners who really churn


np.float64(nan)

In [26]:
r = tp / (tp + fn)  # RECALL: % of actual churners we successfully caught
r

np.float64(0.0)

In [28]:
# ---------- NEW: precision & recall for each threshold ----------
precisions, recalls = [], []

for t in thresholds:
    y_pred_lbl = (y_pred >= t).astype(int)
    tp = ((y_pred_lbl == 1) & (y_val == 1)).sum()
    fp = ((y_pred_lbl == 1) & (y_val == 0)).sum()
    fn = ((y_pred_lbl == 0) & (y_val == 1)).sum()

    p = tp / (tp + fp) if (tp + fp) else 0   # your old lines, guarded
    r = tp / (tp + fn) if (tp + fn) else 0

    precisions.append(p)
    recalls.append(r)

# find crossing point
cross_idx = np.argmin(np.abs(np.array(precisions) - np.array(recalls)))
print("curves intersect at threshold ≈", thresholds[cross_idx])
# ---------------------------------------------------------------

curves intersect at threshold ≈ 0.98


In [29]:
f1_scores = []                               # new empty list
for t in thresholds:
    y_pred_lbl = (y_pred >= t).astype(int)
    tp = ((y_pred_lbl == 1) & (y_val == 1)).sum()
    fp = ((y_pred_lbl == 1) & (y_val == 0)).sum()
    fn = ((y_pred_lbl == 0) & (y_val == 1)).sum()

    p = tp / (tp + fp) if (tp + fp) else 0
    r = tp / (tp + fn) if (tp + fn) else 0

    f1 = 2 * p * r / (p + r) if (p + r) else 0   # <-- NEW: compute F1
    precisions.append(p)
    recalls.append(r)
    f1_scores.append(f1)                         # <-- NEW: store it

best_idx = np.argmax(f1_scores)
print("F1 is maximal at threshold ≈", thresholds[best_idx])

F1 is maximal at threshold ≈ 0.5700000000000001


In [31]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

# 1. prepare full data
X_full = df_full_train.drop(columns='converted')
y_full = df_full_train.converted

# 2. 5-fold splitter
kf = KFold(n_splits=5, shuffle=True, random_state=1)

aucs = []

for fold, (idx_tr, idx_va) in enumerate(kf.split(X_full), 1):
    # 3. split
    X_tr, X_va = X_full.iloc[idx_tr], X_full.iloc[idx_va]
    y_tr, y_va = y_full.iloc[idx_tr], y_full.iloc[idx_va]

    # 4. vectorise
    dv = DictVectorizer(sparse=False)
    X_tr_vec = dv.fit_transform(X_tr.to_dict(orient='records'))
    X_va_vec = dv.transform(X_va.to_dict(orient='records'))

    # 5. train model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    model.fit(X_tr_vec, y_tr)

    # 6. evaluate
    auc = roc_auc_score(y_va, model.predict_proba(X_va_vec)[:, 1])
    aucs.append(auc)

# 7. summary
print("fold AUCs:", aucs)
print("mean AUC :", np.mean(aucs))
print("std AUC  :", np.std(aucs))

fold AUCs: [0.8060745924216483, 0.8713738368910783, 0.7754320118852139, 0.8018368617683685, 0.8558272713202291]
mean AUC : 0.8221089148573075
std AUC  : 0.03580711942905165


In [32]:
X_full = df_full_train.drop(columns='converted')
y_full = df_full_train.converted
kf = KFold(n_splits=5, shuffle=True, random_state=1)

for C in [0.000001, 0.001, 1]:
    aucs = []
    for idx_tr, idx_va in kf.split(X_full):
        X_tr, X_va = X_full.iloc[idx_tr], X_full.iloc[idx_va]
        y_tr, y_va = y_full.iloc[idx_tr], y_full.iloc[idx_va]

        dv = DictVectorizer(sparse=False)
        X_tr_vec = dv.fit_transform(X_tr.to_dict(orient='records'))
        X_va_vec = dv.transform(X_va.to_dict(orient='records'))

        model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
        model.fit(X_tr_vec, y_tr)
        auc = roc_auc_score(y_va, model.predict_proba(X_va_vec)[:, 1])
        aucs.append(auc)

    print(f'C={C:7f}  mean AUC: {np.mean(aucs):.3f}  std: {np.std(aucs):.3f}')

C=0.000001  mean AUC: 0.560  std: 0.024
C=0.001000  mean AUC: 0.867  std: 0.029
C=1.000000  mean AUC: 0.822  std: 0.036
