In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# URL of the raw course lead-scoring CSV hosted on GitHub.
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
# Download the file into the current working directory; IPython substitutes
# the Python variable `data` for $data before handing the line to the shell.
!wget $data

--2025-10-20 15:11:15--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-20 15:11:15 (1018 KB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



Check whether any of the features contain missing values.

In [5]:
# Load the downloaded CSV and report the number of missing values per column.
df = pd.read_csv('course_lead_scoring.csv')
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [6]:
# Show each column's dtype; `object` columns are the ones later treated as
# categorical, numeric columns as numerical.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

For categorical features, replace missing values with 'NA'.

For numerical features, replace missing values with 0.0.

In [48]:
# Group the columns by dtype: `object` columns are treated as categorical,
# every numeric column (this still includes the `converted` target) as numerical.
categorical_features = df.select_dtypes(include=['object']).columns
numerical_features = df.select_dtypes(include=['number']).columns

# Impute missing values: the placeholder 'NA' for categorical columns and
# 0.0 for numerical ones.
df[categorical_features] = df[categorical_features].fillna('NA')
df[numerical_features] = df[numerical_features].fillna(0.0)

Split the data into 3 parts: train/validation/test with 60%/20%/20% distribution.

In [49]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, then take 25% of the remaining 80%
# (i.e. 20% of the full data) for validation, leaving 60% for training.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Discard the shuffled original row labels so each split is indexed 0..n-1.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Extract the desired target for classification

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

# Remove the target from the feature frames so it cannot be used as a feature.
del df_train['converted']
del df_val['converted']
del df_test['converted']

# The target is not a numerical feature either. Wrapping in pd.Index and
# passing errors='ignore' keeps this cell re-runnable: a second run (when
# 'converted' is already gone, or the name already holds a plain list) no
# longer raises KeyError/AttributeError. The result is still a pandas Index.
numerical_features = pd.Index(numerical_features).drop(['converted'], errors='ignore')


Question 1. ROC AUC feature importance

In [31]:
from sklearn.metrics import roc_auc_score

# Use each numerical feature on its own as the prediction score and measure
# how well it ranks converted vs. non-converted leads in the training set.
for numerical in numerical_features:
    score = roc_auc_score(y_train, df_train[numerical])
    if score < 0.5:
        # The feature is negatively related to the target, so score its
        # negation instead. This is done on a temporary Series: df_train
        # itself is left untouched, otherwise the model trained later would
        # see a negated column that df_val/df_test do not have.
        score = roc_auc_score(y_train, -df_train[numerical])
    print("the AUC score for the ",numerical,"is: ",score)


the AUC score for the  number_of_courses_viewed is:  0.7635680590007088
the AUC score for the  annual_income is:  0.5519578313253012
the AUC score for the  interaction_count is:  0.738270176293409
the AUC score for the  lead_score is:  0.6144993577250176


number_of_courses_viewed has the highest AUC.

In [50]:
# Turn the column collections into plain Python lists so they can be joined
# with `+` when selecting columns. list() accepts both a pandas Index and a
# list, so re-running this cell is harmless, whereas .tolist() raises
# AttributeError once the names already hold lists.
numerical_features = list(numerical_features)
categorical_features = list(categorical_features)

Question 2. Model AUC 

In [56]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Columns fed to the model: all categorical features followed by all numerical ones.
feature_columns = categorical_features + numerical_features

# Turn each row into a {column: value} dict and vectorize it. The vectorizer
# is fitted on the training rows only and then reused for validation.
dv = DictVectorizer(sparse=False)
train_dicts = df_train[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Fit the classifier on the training matrix.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

# Second column of predict_proba, used as the ranking score for ROC AUC.
y_pred = model.predict_proba(X_val)[:, 1]

# The AUC of this model on the validation dataset.
roc_auc = roc_auc_score(y_val, y_pred)
print("the AUC score on the validation dataset is: ", round(roc_auc,3))

the AUC score on the validation dataset is:  0.817
