In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_recall_curve, f1_score
import matplotlib.pyplot as plt

In [3]:
# Load the data
df = pd.read_csv('course_lead_scoring.csv')

In [4]:
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nColumn data types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum())

Dataset shape: (1462, 9)

First few rows:
    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  

Column data types:
lead_source                  ob

In [5]:
# Data preparation
# Handle missing values
# Categorical features: replace with 'NA'
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna('NA')

In [6]:
# Numerical features: replace with 0.0
numerical_cols = df.select_dtypes(include=['number']).columns.drop('converted')
for col in numerical_cols:
    df[col] = df[col].fillna(0.0)

print("\nMissing values after handling:")
print(df.isnull().sum())


Missing values after handling:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [14]:
# Split the data
# First split: 60% train, 40% temp
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Second split: 20% validation, 20% test from the remaining
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [15]:
# Check datasets sizes after splitting
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [16]:
# Reset indexes
df_train = df_train.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)

In [18]:
print(f"\nTrain size: {len(df_train)} ({len(df_train)/len(df)*100:.1f}%)")
print(f"Validation size: {len(df_val)} ({len(df_val)/len(df)*100:.1f}%)")
print(f"Test size: {len(df_test)} ({len(df_test)/len(df)*100:.1f}%)")

y_train = df_train['converted'].values
y_val = df_val['converted'].values
y_test = df_test['converted'].values


Train size: 876 (59.9%)
Validation size: 293 (20.0%)
Test size: 293 (20.0%)


In [19]:
# Question 1: ROC AUC feature importance
numerical_features = ['lead_score', 'number_of_courses_viewed', 
                      'interaction_count', 'annual_income']

auc_scores = {}
for feature in numerical_features:
    score = df_train[feature].values
    auc = roc_auc_score(y_train, score)
    
    # If AUC < 0.5, invert the variable
    if auc < 0.5:
        auc = roc_auc_score(y_train, -score)
        print(f"{feature}: {auc:.4f} (inverted)")
    else:
        print(f"{feature}: {auc:.4f}")
    
    auc_scores[feature] = auc

best_feature = max(auc_scores, key=auc_scores.get)
print(f"\nHighest AUC feature: {best_feature} with AUC = {auc_scores[best_feature]:.4f}")

lead_score: 0.6145
number_of_courses_viewed: 0.7636
interaction_count: 0.7383
annual_income: 0.5520

Highest AUC feature: number_of_courses_viewed with AUC = 0.7636


In [20]:
# Question 2: Training the model

# Prepare data for training
def prepare_data(df):
    # Drop the target column and convert to dictionary
    df_features = df.drop('converted', axis=1)
    df_dict = df_features.to_dict(orient='records')
    return df_dict

train_dict = prepare_data(df_train)
val_dict = prepare_data(df_val)

# One-hot encoding
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

print(f"Training matrix shape: {X_train.shape}")
print(f"Validation matrix shape: {X_val.shape}")
print(f"Number of features after encoding: {len(dv.get_feature_names_out())}")

# Train logistic regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=1)
model.fit(X_train, y_train)

# Evaluate on validation set
y_pred_val = model.predict_proba(X_val)[:, 1]
auc_val = roc_auc_score(y_val, y_pred_val)
print(f"\nValidation AUC: {auc_val:.3f}")
print(f"Rounded to 3 digits: {round(auc_val, 3)}")

Training matrix shape: (876, 31)
Validation matrix shape: (293, 31)
Number of features after encoding: 31

Validation AUC: 0.817
Rounded to 3 digits: 0.817


In [44]:
# Question 3: Precision and Recall
thresholds = np.arange(0.0, 1.01, 0.01)
precisions = []
recalls = []

for threshold in thresholds:
    y_pred_binary = (y_pred_val >= threshold).astype(int)
    
    tp = ((y_pred_binary == 1) & (y_val == 1)).sum()
    fp = ((y_pred_binary == 1) & (y_val == 0)).sum()
    fn = ((y_pred_binary == 0) & (y_val == 1)).sum()
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    
    precisions.append(precision)
    recalls.append(recall)

# Find intersection point
differences = [abs(p - r) for p, r in zip(precisions, recalls)]
min_diff_idx = np.argmin(differences)
intersection_threshold = thresholds[min_diff_idx]

print(f"Precision and Recall intersect at threshold: {intersection_threshold:.3f}")
print(f"At this threshold: Precision = {precisions[min_diff_idx]:.3f}, Recall = {recalls[min_diff_idx]:.3f}")

Precision and Recall intersect at threshold: 0.980
At this threshold: Precision = 0.000, Recall = 0.000


In [46]:
# Question 4: F1 Score
f1_scores = []
for p, r in zip(precisions, recalls):
    if p + r > 0:
        f1 = 2 * (p * r) / (p + r)
    else:
        f1 = 0
    f1_scores.append(f1)

max_f1_idx = np.argmax(f1_scores)
max_f1_threshold = thresholds[max_f1_idx]
max_f1_value = f1_scores[max_f1_idx]

print(f"Maximum F1 score: {max_f1_value:.3f}")
print(f"At threshold: {max_f1_threshold:.2f}")

Maximum F1 score: 0.812
At threshold: 0.57


In [47]:
# Question 5: 5-Fold CV
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
cv_scores = []

full_train_dict = prepare_data(df_full_train)
y_full_train = df_full_train['converted'].values

for fold, (train_idx, val_idx) in enumerate(kfold.split(df_full_train), 1):
    # Split data
    df_train_fold = df_full_train.iloc[train_idx]
    df_val_fold = df_full_train.iloc[val_idx]
    
    y_train_fold = df_train_fold['converted'].values
    y_val_fold = df_val_fold['converted'].values
    
    # Prepare dictionaries
    train_dict_fold = prepare_data(df_train_fold)
    val_dict_fold = prepare_data(df_val_fold)
    
    # Transform
    dv_fold = DictVectorizer(sparse=False)
    X_train_fold = dv_fold.fit_transform(train_dict_fold)
    X_val_fold = dv_fold.transform(val_dict_fold)
    
    # Train and evaluate
    model_fold = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    model_fold.fit(X_train_fold, y_train_fold)
    
    y_pred_fold = model_fold.predict_proba(X_val_fold)[:, 1]
    auc_fold = roc_auc_score(y_val_fold, y_pred_fold)
    cv_scores.append(auc_fold)
    
    print(f"Fold {fold}: AUC = {auc_fold:.4f}")

mean_auc = np.mean(cv_scores)
std_auc = np.std(cv_scores)

print(f"\nMean AUC: {mean_auc:.4f}")
print(f"Standard Deviation: {std_auc:.4f}")

Fold 1: AUC = 0.8061
Fold 2: AUC = 0.8714
Fold 3: AUC = 0.7754
Fold 4: AUC = 0.8018
Fold 5: AUC = 0.8558

Mean AUC: 0.8221
Standard Deviation: 0.0358


In [48]:
# Question 6: Hyperparameter Tuning
C_values = [0.000001, 0.001, 1]
results = []

for C in C_values:
    cv_scores_c = []
    
    for fold, (train_idx, val_idx) in enumerate(kfold.split(df_full_train), 1):
        # Split data
        df_train_fold = df_full_train.iloc[train_idx]
        df_val_fold = df_full_train.iloc[val_idx]
        
        y_train_fold = df_train_fold['converted'].values
        y_val_fold = df_val_fold['converted'].values
        
        # Prepare dictionaries
        train_dict_fold = prepare_data(df_train_fold)
        val_dict_fold = prepare_data(df_val_fold)
        
        # Transform
        dv_fold = DictVectorizer(sparse=False)
        X_train_fold = dv_fold.fit_transform(train_dict_fold)
        X_val_fold = dv_fold.transform(val_dict_fold)
        
        # Train and evaluate
        model_fold = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
        model_fold.fit(X_train_fold, y_train_fold)
        
        y_pred_fold = model_fold.predict_proba(X_val_fold)[:, 1]
        auc_fold = roc_auc_score(y_val_fold, y_pred_fold)
        cv_scores_c.append(auc_fold)
    
    mean_score = np.mean(cv_scores_c)
    std_score = np.std(cv_scores_c)
    
    results.append({
        'C': C,
        'mean': round(mean_score, 3),
        'std': round(std_score, 3)
    })
    
    print(f"C = {C}: mean = {mean_score:.3f}, std = {std_score:.3f}")

# Find best C
results_df = pd.DataFrame(results)
max_mean = results_df['mean'].max()
best_results = results_df[results_df['mean'] == max_mean]

if len(best_results) > 1:
    # If tie, select lowest std
    min_std = best_results['std'].min()
    best_results = best_results[best_results['std'] == min_std]
    
    if len(best_results) > 1:
        # If still tie, select smallest C
        best_c = best_results['C'].min()
    else:
        best_c = best_results['C'].values[0]
else:
    best_c = best_results['C'].values[0]

print(f"\nBest C: {best_c}")
print(f"With mean score: {results_df[results_df['C']==best_c]['mean'].values[0]}")
print(f"And std: {results_df[results_df['C']==best_c]['std'].values[0]}")

C = 1e-06: mean = 0.560, std = 0.024
C = 0.001: mean = 0.867, std = 0.029
C = 1: mean = 0.822, std = 0.036

Best C: 0.001
With mean score: 0.867
And std: 0.029
