<a href="https://colab.research.google.com/github/Gon-Frecces/ML-zoomcamp/blob/main/course_lead_scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
# Colab-only step: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read the lead-scoring CSV from the mounted Drive and display how many rows it has.
csv_path = '/content/drive/MyDrive/course_lead_scoring.csv'
df = pd.read_csv(csv_path)
df.shape[0]

1462

In [5]:
# Count the missing values in every column.
missing_summary = df.isna().sum()
missing_summary

Unnamed: 0,0
lead_source,128
industry,134
number_of_courses_viewed,0
annual_income,181
employment_status,100
location,63
interaction_count,0
lead_score,0
converted,0


In [6]:
# Impute missing values in place: numeric columns get 0.0, every other column
# gets the literal string 'NA'.
# The check is "is this column numeric?" rather than `dtype == 'object'`, so that
# text columns stored under pandas' dedicated string dtype are still filled with
# 'NA' instead of falling through to the numeric branch.
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(0.0)
    else:
        df[col] = df[col].fillna('NA')

In [7]:
# Preview the first five rows after imputation.
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [8]:
# Most frequent value of `industry` (missing entries were already replaced by 'NA' above).
industry_col = df['industry']
industry_col.mode()

Unnamed: 0,industry
0,retail


In [9]:
# Keep only the numeric columns (the `converted` target is numeric, so it stays in).
numerical_df = df.select_dtypes(include='number')
numerical_df

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
0,1,79450.0,4,0.94,1
1,1,46992.0,1,0.80,0
2,5,78796.0,3,0.69,1
3,2,83843.0,1,0.87,0
4,3,85012.0,3,0.62,1
...,...,...,...,...,...
1457,1,0.0,4,0.53,1
1458,3,65259.0,2,0.24,1
1459,1,45688.0,3,0.02,1
1460,5,71016.0,0,0.25,1


In [10]:
# Pairwise Pearson correlation between all numeric columns.
corr_matrix = numerical_df.corr(method='pearson')
corr_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [11]:
# Look up the correlation of four specific feature pairs in the matrix.
feature_pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]
c1, c2, c3, c4 = (corr_matrix.loc[row, column] for row, column in feature_pairs)

c1, c2, c3, c4


(np.float64(0.009888182496913131),
 np.float64(-0.004878998354681276),
 np.float64(-0.023565222882888037),
 np.float64(0.02703647240481443))

In [12]:
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 20% for test, then 25% of the remaining 80% for validation.
split_seed = 42
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=split_seed)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=split_seed)

tuple(len(part) for part in (df_train, df_val, df_test))

(876, 293, 293)

In [13]:
# Separate the `converted` target from the features, then apply the same
# 60/20/20 split (same sizes and seed) to features and target together.
y = df['converted']
X = df.drop(columns='converted')

X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)


In [14]:
# Names of the columns holding text / categorical data.
categorical_cols = df.select_dtypes(['object', 'category']).columns

print(categorical_cols)

Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')


In [15]:
from sklearn.metrics import mutual_info_score

# Mutual information between the target and each categorical feature.
# Scored on the training split only (df_train), so that feature-importance
# conclusions are not informed by validation or test rows.
mi_target = df_train['converted']

m1 = round(mutual_info_score(mi_target, df_train['lead_source']), 2)
print('y vs lead_source ', m1)
m2 = round(mutual_info_score(mi_target, df_train['industry']), 2)
print('y vs industry ', m2)
m3 = round(mutual_info_score(mi_target, df_train['employment_status']), 2)
print('y vs employment_status ',m3)
m4 = round(mutual_info_score(mi_target, df_train['location']), 2)
print('y vs location', m4)

y vs lead_source  0.03
y vs industry  0.01
y vs employment_status  0.01
y vs location 0.0


In [16]:
from sklearn.feature_extraction import DictVectorizer

# One {column: value} dict per training row, restricted to the categorical features.
categorical_features = ['lead_source', 'industry', 'employment_status', 'location']
train_dicts = X_train[categorical_features].to_dict(orient='records')

In [17]:
# Vectorizer that one-hot encodes string-valued dict entries; sparse output is the default.
dv = DictVectorizer(sparse=True)

In [18]:
# Fit the vectorizer on the training dicts and encode them.
# NOTE: this rebinds X_train from the raw feature DataFrame to the encoded matrix.
X_train = dv.fit_transform(train_dicts)


In [19]:
# One {column: value} dict per validation row, restricted to the categorical features.
val_categoricals = X_val[['lead_source', 'industry', 'employment_status', 'location']]
val_dicts = val_categoricals.to_dict(orient='records')

In [20]:
# Encode the validation dicts with the vectorizer fitted on the training data.
# NOTE: this rebinds X_val from the raw feature DataFrame to the encoded matrix.
X_val = dv.transform(val_dicts)


In [21]:
# One {column: value} dict per test row, restricted to the categorical features.
test_categoricals = X_test[['lead_source', 'industry', 'employment_status', 'location']]
test_dicts = test_categoricals.to_dict(orient='records')

In [22]:
# Encode the test dicts with the vectorizer fitted on the training data.
# NOTE: this rebinds X_test from the raw feature DataFrame to the encoded matrix.
X_test= dv.transform(test_dicts)


In [23]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier trained on the encoded categorical features.
logreg_params = {'solver': 'liblinear', 'C': 1.0, 'max_iter': 1000, 'random_state': 42}
model = LogisticRegression(**logreg_params)
model.fit(X_train, y_train)

In [24]:
from sklearn.metrics import accuracy_score

# Score the fitted model on the validation split and report accuracy to 2 decimals.
y_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)

print("Validation Accuracy:", round(val_accuracy, 2))

Validation Accuracy: 0.61


### Model 1: trained without the 'lead_source' feature

In [25]:
# Feature-elimination experiment: retrain without 'lead_source' and compare the
# validation accuracy against a baseline trained on all four categorical features.
X = df.drop(columns=['converted'])
y = df['converted']

# Re-create the 60/20/20 split: X_train / X_val were rebound to encoded matrices earlier.
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=42)

all_features = ['lead_source', 'industry', 'employment_status', 'location']
dropped_feature = 'lead_source'
kept_features = [name for name in all_features if name != dropped_feature]

# Baseline accuracy is recomputed here, unrounded, instead of hard-coding 0.61,
# so the reported difference stays exact and cannot go stale if the data or split changes.
baseline_dv = DictVectorizer()
baseline_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
baseline_model.fit(baseline_dv.fit_transform(X_train[all_features].to_dict(orient='records')), y_train)
baseline_pred = baseline_model.predict(baseline_dv.transform(X_val[all_features].to_dict(orient='records')))
baseline_accuracy = accuracy_score(y_val, baseline_pred)

# One-hot encode the remaining features (vectorizer fitted on training rows only).
train_dicts = X_train[kept_features].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
val_dicts = X_val[kept_features].to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Logistic regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Prediction and accuracy
y_pred = model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)

print("Validation Accuracy:", round(val_accuracy, 2))
print("Difference in accuracy: ", round((baseline_accuracy - val_accuracy), 2))

Validation Accuracy: 0.62
Difference in accuracy:  -0.01


### Model 2: trained without the 'employment_status' feature

In [26]:
# Feature-elimination experiment: retrain without 'employment_status' and compare the
# validation accuracy against a baseline trained on all four categorical features.
X = df.drop(columns=['converted'])
y = df['converted']

# Re-create the 60/20/20 split: X_train / X_val were rebound to encoded matrices earlier.
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=42)

all_features = ['lead_source', 'industry', 'employment_status', 'location']
dropped_feature = 'employment_status'
kept_features = [name for name in all_features if name != dropped_feature]

# Baseline accuracy is recomputed here, unrounded, instead of hard-coding 0.61,
# so the reported difference stays exact and cannot go stale if the data or split changes.
baseline_dv = DictVectorizer()
baseline_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
baseline_model.fit(baseline_dv.fit_transform(X_train[all_features].to_dict(orient='records')), y_train)
baseline_pred = baseline_model.predict(baseline_dv.transform(X_val[all_features].to_dict(orient='records')))
baseline_accuracy = accuracy_score(y_val, baseline_pred)

# One-hot encode the remaining features (vectorizer fitted on training rows only).
train_dicts = X_train[kept_features].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
val_dicts = X_val[kept_features].to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Logistic regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Prediction and accuracy
y_pred = model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)

print("Validation Accuracy:", round(val_accuracy, 2))
print("Difference in accuracy: ", round((baseline_accuracy - val_accuracy), 2))


Validation Accuracy: 0.55
Difference in accuracy:  0.06


### Model 3: trained without the 'industry' feature

In [27]:
# Feature-elimination experiment: retrain without 'industry' and compare the
# validation accuracy against a baseline trained on all four categorical features.
X = df.drop(columns=['converted'])
y = df['converted']

# Re-create the 60/20/20 split: X_train / X_val were rebound to encoded matrices earlier.
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=42)

all_features = ['lead_source', 'industry', 'employment_status', 'location']
dropped_feature = 'industry'
kept_features = [name for name in all_features if name != dropped_feature]

# Baseline accuracy is recomputed here, unrounded, instead of hard-coding 0.61,
# so the reported difference stays exact and cannot go stale if the data or split changes.
baseline_dv = DictVectorizer()
baseline_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
baseline_model.fit(baseline_dv.fit_transform(X_train[all_features].to_dict(orient='records')), y_train)
baseline_pred = baseline_model.predict(baseline_dv.transform(X_val[all_features].to_dict(orient='records')))
baseline_accuracy = accuracy_score(y_val, baseline_pred)

# One-hot encode the remaining features (vectorizer fitted on training rows only).
train_dicts = X_train[kept_features].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
val_dicts = X_val[kept_features].to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Logistic regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Prediction and accuracy
y_pred = model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)

print("Validation Accuracy:", round(val_accuracy, 2))
print("Difference in accuracy: ", round((baseline_accuracy - val_accuracy), 2))


Validation Accuracy: 0.6
Difference in accuracy:  0.01


### Finding the best value of the regularization parameter C


In [28]:
# Rebuild the 60/20/20 split and the one-hot encoding over all four categorical
# features, so the next cell starts from freshly encoded train/validation matrices.
y = df['converted']
X = df.drop(columns='converted')

X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

feature_cols = ['lead_source', 'industry', 'employment_status', 'location']
train_dicts = X_train[feature_cols].to_dict(orient='records')
val_dicts = X_val[feature_cols].to_dict(orient='records')

# Fit the vectorizer on training rows only, then reuse it for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)



In [29]:
# Candidate regularization strengths to compare on the validation split.
C_values = [0.01, 0.1, 1, 10, 100]

# Collected as (C, validation accuracy rounded to 3 decimals).
results = []

for c in C_values:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    rounded_acc = round(acc, 3)
    results.append((c, rounded_acc))
    print(f"C={c}: Validation Accuracy={rounded_acc}")

# max() keeps the first maximal entry, so ties resolve to the earliest (smallest) C.
best_c, best_acc = max(results, key=lambda pair: pair[1])


C=0.01: Validation Accuracy=0.56
C=0.1: Validation Accuracy=0.601
C=1: Validation Accuracy=0.608
C=10: Validation Accuracy=0.604
C=100: Validation Accuracy=0.604


In [34]:
from sklearn.metrics import roc_auc_score
import numpy as np

# Numeric features to score individually against the target.
num_vars = ['lead_score', 'number_of_courses_viewed', 'interaction_count', 'annual_income']

auc_scores = {}

# Labels are read from df_train itself, so labels and feature values always come
# from the same rows regardless of how the global y_train was last reassigned.
train_target = df_train['converted']

for col in num_vars:
    # Named `feature_auc`, not `auc`: the notebook also imports sklearn.metrics.auc,
    # and a global float called `auc` would shadow that function.
    feature_auc = roc_auc_score(train_target, df_train[col])

    # An AUC below 0.5 means the feature ranks in the opposite direction; negate it.
    if feature_auc < 0.5:
        feature_auc = roc_auc_score(train_target, -df_train[col])

    auc_scores[col] = feature_auc
    print(f"{col}: {feature_auc:.3f}")

best_var = max(auc_scores, key=auc_scores.get)
print("Highest AUC feature:", best_var)


lead_score: 0.630
number_of_courses_viewed: 0.755
interaction_count: 0.720
annual_income: 0.529
Highest AUC feature: number_of_courses_viewed


In [31]:
# Unfitted logistic-regression estimator. It is bound to a name so it can be reused;
# a bare constructor call would build the object only to discard it.
cv_estimator = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
cv_estimator


In [37]:
from sklearn.model_selection import KFold
from sklearn.metrics import auc, roc_curve

# 5-fold splitter with shuffling and a fixed seed. It is bound to a name so it can be
# reused; a bare constructor call would build the object only to discard it.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
kfold


KFold(n_splits=5, random_state=1, shuffle=True)