In [197]:
import numpy as np
import pandas as pd  
import matplotlib.pyplot as plt

# In this dataset our desired target for classification task will be converted variable - has the client signed up to the platform or not.

df = pd.read_csv("course_lead_scoring.csv")

# print(df.head(5))
# print(df.dtypes)

df.head().T


Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [198]:
# Data preparation
    # Check if the missing values are presented in the features.
df.isnull().sum()
df.info() 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [199]:
# If there are missing values:
    # For categorical features, replace them with 'NA'
    # For numerical features, replace with with 0.0

cat_features = list(df.dtypes[df.dtypes == 'object'].index)
print (cat_features)

num_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(num_features)

for col in cat_features: 
    df[col] = df[col].fillna('NA')

for col in num_features:
    df[col] = df[col].fillna(0)


df.isnull().sum() 

['lead_source', 'industry', 'employment_status', 'location']
['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']


lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [200]:
# Question 1
    # What is the most frequent observation (mode) for the column industry?

most_freq = df['industry'].mode()[0]
print(f"Mode for 'industry': {most_freq}")

# Question 2
# Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.
corr_matrix = df[num_features].corr()

# Check the correlation values for the specified pairs (absolute value indicates strength)
corr_interaction_leadscore = corr_matrix.loc['interaction_count', 'lead_score']
corr_courses_leadscore = corr_matrix.loc['number_of_courses_viewed', 'lead_score']
corr_courses_interaction = corr_matrix.loc['number_of_courses_viewed', 'interaction_count']
corr_annualincome_interaction = corr_matrix.loc['annual_income', 'interaction_count']

# Create a dictionary to hold the absolute correlations
abs_correlations = {
    'interaction_count and lead_score': (corr_interaction_leadscore),
    'number_of_courses_viewed and lead_score': (corr_courses_leadscore),
    'number_of_courses_viewed and interaction_count': (corr_courses_interaction),
    'annual_income and interaction_count': (corr_annualincome_interaction)
}

print(abs_correlations)

# # Find the pair with the maximum absolute correlation
biggest_correlation_pair = max(abs_correlations, key=abs_correlations.get)
print(f"Pair with the biggest correlation: {biggest_correlation_pair}")

# What are the two features that have the biggest correlation?
    # interaction_count and lead_score
    # number_of_courses_viewed and lead_score
    # number_of_courses_viewed and interaction_count
                # annual_income and interaction_count  *******answer 
    # Only consider the pairs above when answering this question.

Mode for 'industry': retail
{'interaction_count and lead_score': np.float64(0.009888182496913131), 'number_of_courses_viewed and lead_score': np.float64(-0.004878998354681276), 'number_of_courses_viewed and interaction_count': np.float64(-0.023565222882888037), 'annual_income and interaction_count': np.float64(0.02703647240481443)}
Pair with the biggest correlation: annual_income and interaction_count


In [None]:
from sklearn.model_selection import train_test_split 

 # Split the data
    # Split your data in train/val/test sets with 60%/20%/20% distribution.
    # Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
    # Make sure that the target value converted is not in your dataframe.

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state=42)


print(len(df))
print(len(df_full_train))
print(len(df_train))
print(len(df_test))   
print(len(df_val))
                                    
# Separate target variables
y_train = df_train["converted"].values
y_val = df_val["converted"].values
y_test = df_test["converted"].values

# Drop the target variable from feature dataframes
df_train = df_train.drop(columns=["converted"])
df_val = df_val.drop(columns=["converted"])
df_test = df_test.drop(columns=["converted"])         



1462
1169
876
293
293


In [202]:
# Question 3
    # Calculate the mutual information score between converted and other categorical variables in the dataset. Use the training set only.
    # Round the scores to 2 decimals using round(score, 2).
# Which of these variables has the biggest mutual information score?
    # industry
    # location
          # lead_source  ** largest at 0.04 
    # employment_status

from sklearn.metrics import mutual_info_score

mi_industry = mutual_info_score(y_train, df_train.industry)
mi_location = mutual_info_score(y_train, df_train.location)
mi_lead_source = mutual_info_score(y_train, df_train.lead_source)
mi_employment_status = mutual_info_score(y_train, df_train.employment_status)

# The first time I did this I just used df_full_train on everything above and got a very similar answer (0.03 - lead_source) .. would this have been incorrect? 

print("industry MI = ", round(mi_industry,2))
print("location MI = ", round(mi_location,2))
print("lead_source MI = ", round(mi_lead_source,2))
print("employment_status MI = ", round(mi_employment_status,2))


# or ...


def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.converted)

mi = df_full_train[cat_features].apply(mutual_info_churn_score).round(2)
mi.sort_values(ascending=False)


industry MI =  0.01
location MI =  0.0
lead_source MI =  0.03
employment_status MI =  0.01


lead_source          0.02
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [None]:
# Question 4
# Now let's train a logistic regression.
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Check and remove 'converted' from numerical features list
if 'converted' in num_features:
    num_features.remove('converted')
    
# Check and remove 'converted' from categorical features list if it was mistakenly there
if 'converted' in cat_features:
    cat_features.remove('converted')

# Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
all_features = cat_features +  num_features

train_dicts = df_train[all_features].to_dict(orient='records') # For making dict row-wise we use orient = 'records
val_dicts = df_val[all_features].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Fit the model on the training dataset.
# To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
# model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
                # y_pred = model.predict_proba(X_val)[:,1]
                # churn_decision = (y_pred >= 0.5)

model = LogisticRegression(solver = 'liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
accuracy = (y_val == y_pred).mean()
acc_round = round(accuracy, 4)
original_accuracy = accuracy 

w0 = model.intercept_[0]
print(w0)

w = model.coef_[0]
w.round(3)

dict(zip(dv.get_feature_names_out(), w.round(3)))

# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
# What accuracy did you get?
    # 0.64
     # 0.74  ** this one is the closest but I get 0.7304 
    # 0.84
    # 0.94

print(acc_round)

-0.08556616376377484
0.7304


In [204]:
# Question 5
# Let's find the least useful feature using the feature elimination technique.
# Train a model using the same features and parameters as in Q4 (without rounding).
# Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
    # Which of following feature has the smallest difference?
    # 'industry'
            # 'employment_status'  - Answer 
            # 'lead_score'  this is also 0.0 as the smallest difference... is this correct? 
    # Note: The difference doesn't have to be positive.

# Question 5: Feature Elimination (Finding the least useful feature)
features_to_check = ['industry', 'employment_status', 'lead_score']
accuracy_differences = {}

# Use the one-hot encoded feature names for exclusion
full_features = set(dv.get_feature_names_out())

for feature_to_exclude in features_to_check:
    
    # Identify which columns to keep (all features excluding the current one)
    # This logic is simplified; for OHE, this would need to exclude ALL columns 

    current_features_df = [f for f in all_features if f != feature_to_exclude]
    
    # Re-vectorize on the reduced feature set
    dv_subset = DictVectorizer(sparse=False)
    X_train_subset = dv_subset.fit_transform(df_train[current_features_df].to_dict(orient='records'))
    X_val_subset = dv_subset.transform(df_val[current_features_df].to_dict(orient='records'))
    
    # Train a new model with the subset
    model_subset = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_subset.fit(X_train_subset, y_train)
    
    # Calculate new validation accuracy
    y_pred_subset = model_subset.predict(X_val_subset)
    accuracy_subset = (y_val == y_pred_subset).mean()
    
    # Calculate the difference (original accuracy - new accuracy)
    diff = original_accuracy - accuracy_subset
    accuracy_differences[feature_to_exclude] = diff


# Find the feature with the smallest change (smallest absolute difference)
# The "least useful" feature is the one whose removal causes the smallest absolute drop in performance.
least_useful_feature = min(accuracy_differences, key=lambda k: abs(accuracy_differences[k]))

print(f"Accuracy differences: {accuracy_differences}")
print(f"Feature with the smallest change (least useful): {least_useful_feature}")

Accuracy differences: {'industry': np.float64(-0.010238907849829393), 'employment_status': np.float64(0.0), 'lead_score': np.float64(0.0)}
Feature with the smallest change (least useful): employment_status


In [212]:
# Question 6
# Now let's train a regularized logistic regression.
# Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
# Train models using all the features as in Q4.


parameter_c = [0.01, 0.1, 1, 10, 100]
accuracy_scores = {}

for c in parameter_c:
    model = LogisticRegression(solver='liblinear', C = c , max_iter=20000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = (y_val == y_pred).mean()
    accuracy_scores[c] = round(accuracy,4)

# Print all results to find the best C
print("Accuracy scores for each C:", accuracy_scores)

# Find the C value with the highest accuracy
best_c = max(accuracy_scores, key=accuracy_scores.get)

# Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
# Which of these C leads to the best accuracy on the validation set?
        # 0.01 - this should be the closest one 
# 0.1
# 1
# 10
# 100
# Note: If there are multiple options, select the smallest C.

Accuracy scores for each C: {0.01: np.float64(0.7304), 0.1: np.float64(0.7304), 1: np.float64(0.7304), 10: np.float64(0.7304), 100: np.float64(0.7304)}


In [None]:
# LLM notes / code for experimentation below 


parameter_c = [0.01, 0.1, 1, 10, 100]
accuracy_scores = {}
threshold = 0.5 # The default threshold we are making explicit

# Use the X_train and X_val from Question 4 (all features)

for c in parameter_c:
    # 1. Define the model with the current C value and high max_iter
    model = LogisticRegression(solver='liblinear', C = c , max_iter=20000, random_state=42)
    
    # 2. Fit the model on the training data
    model.fit(X_train, y_train) 
    
    # --- EXPLICIT THRESHOLDING METHOD ---
    # 3. Get the probability estimates for the positive class (index 1)
    y_proba = model.predict_proba(X_val)[:, 1]
    
    # 4. Apply the 0.5 threshold and convert the boolean results to integers (0s and 1s)
    y_pred = (y_proba >= threshold).astype(int)
    
    # 5. Calculate the UNROUNDED accuracy
    accuracy = (y_val == y_pred).mean()
    
    # 6. Store the result
    accuracy_scores[c] = accuracy 

# Print all results with full precision to clearly see the differences
print("Full Accuracy Scores for each C:")
for c, acc in accuracy_scores.items():
    # Use high precision printing to reveal subtle differences
    print(f"C={c:<5}: {acc:.8f}")


print(y_proba) #added to check values 

# Find the C value with the highest accuracy
best_c = max(accuracy_scores, key=accuracy_scores.get)
print(f"\nThe C value that yields the highest accuracy is: {best_c}")

Full Accuracy Scores for each C:
C=0.01 : 0.73037543
C=0.1  : 0.73037543
C=1    : 0.73037543
C=10   : 0.73037543
C=100  : 0.73037543
[0.57225234 0.94760944 0.70570537 0.36663941 0.41823678 0.85255382
 0.74743733 0.48268898 0.75544304 0.71296911 0.59343865 0.33730063
 0.53399183 0.64826513 0.62104479 0.5161717  0.80676091 0.77287496
 0.65057282 0.54980744 0.87676395 0.58514127 0.40470678 0.63606997
 0.63949595 0.44136745 0.44509255 0.34487497 0.72547584 0.72741909
 0.29958754 0.67105431 0.82811361 0.96043549 0.93631193 0.93637751
 0.50570363 0.61381657 0.64192505 0.71477489 0.90748216 0.4870827
 0.83938187 0.60022485 0.62711865 0.59993718 0.7352204  0.52521649
 0.53371387 0.74175564 0.48976788 0.89816649 0.93749382 0.90816271
 0.61654057 0.4356769  0.40061734 0.58189244 0.36149332 0.45652569
 0.87265849 0.80960247 0.65846375 0.68398226 0.80925654 0.57398685
 0.84387498 0.52784078 0.77842587 0.70672505 0.60522308 0.72735917
 0.41979148 0.82703618 0.71495659 0.60481247 0.85282389 0.516357