In [127]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score

In [128]:
df = pd.read_csv("/workspaces/ml-zoomcamp-hw/03-Classification/course_lead_scoring.csv")


In [129]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [130]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [131]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [132]:
categorical_fill = ['lead_source', 'industry', 'employment_status', 'location']
numerical_fill = ['annual_income']

In [133]:
df[categorical_fill] = df[categorical_fill].fillna('NA')
df[numerical_fill] = df[numerical_fill].fillna(0)

In [134]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [135]:
df['industry'].value_counts()
#Retail is the most frequent observation

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [136]:
corr_matrix = df.select_dtypes(include=['number']).corr()
corr_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [137]:
interaction_count_lead_score = corr_matrix.loc['interaction_count', 'lead_score']
number_of_courses_viewed_lead_score = corr_matrix.loc['number_of_courses_viewed', 'lead_score']
number_of_courses_viewed_interaction_count = corr_matrix.loc['number_of_courses_viewed', 'interaction_count']
annual_income_interaction_count = corr_matrix.loc['annual_income', 'interaction_count']


In [138]:
print("interaction_count and lead_score correlation:", interaction_count_lead_score)
print("number_of_courses_viewed and lead_score correlation:", number_of_courses_viewed_lead_score)
print("number_of_courses_viewed and interaction_count correlation:", number_of_courses_viewed_interaction_count)
print("annual_income and interaction_count correlation:", annual_income_interaction_count)

interaction_count and lead_score correlation: 0.009888182496913131
number_of_courses_viewed and lead_score correlation: -0.004878998354681276
number_of_courses_viewed and interaction_count correlation: -0.023565222882888037
annual_income and interaction_count correlation: 0.02703647240481443


In [139]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [140]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


In [141]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [142]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [143]:
def apply_mutual_score(value):
    return mutual_info_score(value, df_full_train.converted)

In [144]:
categorical_mi = ['industry', 'location', 'lead_source', 'employment_status']
mi = df_full_train[categorical_mi].apply(apply_mutual_score)
mi
#lead source

industry             0.008173
location             0.001212
lead_source          0.024562
employment_status    0.012690
dtype: float64

In [145]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [146]:
df_train_dummies = pd.get_dummies(df_train.copy(), drop_first=False)
df_val_dummies = pd.get_dummies(df_val.copy(), drop_first=False)

df_val_dummies = df_val_dummies.reindex(columns=df_train_dummies.columns, fill_value=0)

In [147]:
model.fit(df_train_dummies, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [148]:
y_predict = model.predict_proba(df_val_dummies)[:,1]


In [149]:
churn_decision = (y_predict >= 0.5)

In [150]:
percent_correct_predictions = (churn_decision == y_val).mean()


In [151]:
features = ['industry', 'employment_status', 'lead_score']
accuracy_scores = {}
accuracy_diffs = {}


In [152]:
for f in features:
    df_train_loop = df_train.copy().drop(columns=[f])
    df_val_loop = df_val.copy().drop(columns=[f])
    df_train_loop = pd.get_dummies(df_train_loop)
    df_val_loop = pd.get_dummies(df_val_loop)
    
    
    model_loop = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_loop.fit(df_train_loop, y_train)
    
    y_pred_loop = model_loop.predict_proba(df_val_loop)[:,1]
    churn_decision_loop = (y_pred_loop >= 0.5)
    accuracy = (churn_decision_loop == y_val).mean()
    accuracy_scores[f] = accuracy
    
    diff = percent_correct_predictions - accuracy
    accuracy_diffs[f] = diff
    

In [153]:
for f, diff in accuracy_diffs.items():
    print(f"{f}: {diff:.4f}")

industry: -0.0102
employment_status: 0.0000
lead_score: 0.0000


#Could be something with my code, but my accuracy results for employment_status and lead_score are the exact same. 

In [154]:
import sklearn.metrics as skm  # safer than from ... import ...

In [155]:


df_train_c001 = df_train.copy(deep=True)
df_val_c001   = df_val.copy(deep=True)
X_train_c001  = pd.get_dummies(df_train_c001, drop_first=False)
X_val_c001    = pd.get_dummies(df_val_c001, drop_first=False).reindex(columns=X_train_c001.columns, fill_value=0)

# --- Dataset 2 ---
df_train_c01 = df_train.copy(deep=True)
df_val_c01   = df_val.copy(deep=True)
X_train_c01  = pd.get_dummies(df_train_c01, drop_first=False)
X_val_c01    = pd.get_dummies(df_val_c01, drop_first=False).reindex(columns=X_train_c01.columns, fill_value=0)

# --- Dataset 3 ---
df_train_c1 = df_train.copy(deep=True)
df_val_c1   = df_val.copy(deep=True)
X_train_c1  = pd.get_dummies(df_train_c1, drop_first=False)
X_val_c1    = pd.get_dummies(df_val_c1, drop_first=False).reindex(columns=X_train_c1.columns, fill_value=0)

# --- Dataset 4 ---
df_train_c10 = df_train.copy(deep=True)
df_val_c10   = df_val.copy(deep=True)
X_train_c10  = pd.get_dummies(df_train_c10, drop_first=False)
X_val_c10    = pd.get_dummies(df_val_c10, drop_first=False).reindex(columns=X_train_c10.columns, fill_value=0)

# --- Dataset 5 ---
df_train_c100 = df_train.copy(deep=True)
df_val_c100   = df_val.copy(deep=True)
X_train_c100  = pd.get_dummies(df_train_c100, drop_first=False)
X_val_c100    = pd.get_dummies(df_val_c100, drop_first=False).reindex(columns=X_train_c100.columns, fill_value=0)

# --- Train models ---
model_c001 = LogisticRegression(solver='liblinear', C=0.01, max_iter=1000, random_state=42)
model_c001.fit(X_train_c001, y_train)
acc_c001 = skm.accuracy_score(y_val, (model_c001.predict_proba(X_val_c001)[:,1] >= 0.5))

model_c01 = LogisticRegression(solver='liblinear', C=0.1, max_iter=1000, random_state=42)
model_c01.fit(X_train_c01, y_train)
acc_c01 = skm.accuracy_score(y_val, (model_c01.predict_proba(X_val_c01)[:,1] >= 0.5))

model_c1 = LogisticRegression(solver='liblinear', C=1, max_iter=1000, random_state=42)
model_c1.fit(X_train_c1, y_train)
acc_c1 = skm.accuracy_score(y_val, (model_c1.predict_proba(X_val_c1)[:,1] >= 0.5))

model_c10 = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model_c10.fit(X_train_c10, y_train)
acc_c10 = skm.accuracy_score(y_val, (model_c10.predict_proba(X_val_c10)[:,1] >= 0.5))

model_c100 = LogisticRegression(solver='liblinear', C=100, max_iter=1000, random_state=42)
model_c100.fit(X_train_c100, y_train)
acc_c100 = skm.accuracy_score(y_val, (model_c100.predict_proba(X_val_c100)[:,1] >= 0.5))

# --- Collect results ---
models = {
    0.01: model_c001,
    0.1:  model_c01,
    1:    model_c1,
    10:   model_c10,
    100:  model_c100
}

accuracies = {
    0.01: acc_c001,
    0.1:  acc_c01,
    1:    acc_c1,
    10:   acc_c10,
    100:  acc_c100
}

# --- Print summary ---
for c in [0.01, 0.1, 1, 10, 100]:
    print(f"C={c}: Accuracy={accuracies[c]:.10f}")

best_c = max(accuracies, key=accuracies.get)
print(f"Best model C={best_c} with accuracy {accuracies[best_c]:.10f}")

C=0.01: Accuracy=0.7303754266
C=0.1: Accuracy=0.7303754266
C=1: Accuracy=0.7303754266
C=10: Accuracy=0.7303754266
C=100: Accuracy=0.7303754266
Best model C=0.01 with accuracy 0.7303754266
