In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
import warnings
warnings.filterwarnings('ignore')

!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv -O lead_scoring.csv
df = pd.read_csv('lead_scoring.csv')

print("dataset shape:", df.shape)
df.head()

--2025-10-12 04:59:17--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘lead_scoring.csv’


2025-10-12 04:59:17 (8.98 MB/s) - ‘lead_scoring.csv’ saved [80876/80876]

dataset shape: (1462, 9)


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [2]:
# Data preparation: Check for missing values
print("Missing values before processing:")
print(df.isnull().sum())

categorical_features = ['industry', 'location', 'lead_source', 'employment_status']
numerical_features = ['annual_income', 'number_of_courses_viewed', 'interaction_count', 'lead_score']

for col in categorical_features:
    df[col] = df[col].fillna('NA')

for col in numerical_features:
    df[col] = df[col].fillna(0.0)

print("\nMissing values after processing:")
print(df.isnull().sum())
df.tail()

Missing values before processing:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

Missing values after processing:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1
1461,organic_search,finance,3,92855.0,student,north_america,3,0.41,1


In [3]:
# Q1 - Most frequent observation for industry
industry_mode = df['industry'].mode()[0]
print("Most frequent observation for 'industry':", industry_mode)

Most frequent observation for 'industry': retail


In [4]:
# Q2 - Correlation matrix
correlation_matrix = df[numerical_features].corr()
print("Correlation Matrix:")
print(correlation_matrix)

corr_pairs = []
for i in range(len(numerical_features)):
    for j in range(i+1, len(numerical_features)):
        feat1, feat2 = numerical_features[i], numerical_features[j]
        corr_value = correlation_matrix.loc[feat1, feat2]
        corr_pairs.append(((feat1, feat2), abs(corr_value)))

corr_pairs_sorted = sorted(corr_pairs, key=lambda x: x[1], reverse=True)

print("\nTop correlated pairs:")
for (feat1, feat2), corr_val in corr_pairs_sorted[:3]:
    print(f"{feat1} - {feat2}: {corr_val:.4f}")

Correlation Matrix:
                          annual_income  number_of_courses_viewed  \
annual_income                  1.000000                  0.009770   
number_of_courses_viewed       0.009770                  1.000000   
interaction_count              0.027036                 -0.023565   
lead_score                     0.015610                 -0.004879   

                          interaction_count  lead_score  
annual_income                      0.027036    0.015610  
number_of_courses_viewed          -0.023565   -0.004879  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  

Top correlated pairs:
annual_income - interaction_count: 0.0270
number_of_courses_viewed - interaction_count: 0.0236
annual_income - lead_score: 0.0156


In [5]:
# Split the data
X = df.drop('converted', axis=1)
y = df['converted']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [6]:
# Q3 - Mutual information
X_train_categorical = X_train[categorical_features]
X_train_encoded = pd.get_dummies(X_train_categorical, prefix=categorical_features)

mi_scores = mutual_info_classif(X_train_encoded, y_train, random_state=42)
mi_results = pd.DataFrame({
    'feature': X_train_encoded.columns,
    'mi_score': mi_scores
})

original_features_mi = {}
for feature in categorical_features:
    feature_cols = [col for col in mi_results['feature'] if col.startswith(feature + '_')]
    if feature_cols:
        max_mi = mi_results[mi_results['feature'].isin(feature_cols)]['mi_score'].max()
        original_features_mi[feature] = max_mi

print("Mutual Information Scores:")
for feature, score in original_features_mi.items():
    print(f"{feature}: {round(score, 2)}")

max_mi_feature = max(original_features_mi, key=original_features_mi.get)
print(f"\nFeature with highest MI score: {max_mi_feature}")

Mutual Information Scores:
industry: 0.02
location: 0.02
lead_source: 0.01
employment_status: 0.03

Feature with highest MI score: employment_status


In [7]:
# Q4 - Logistic Regression with one-hot encoding
def prepare_features(X):
    X_encoded = pd.get_dummies(X, columns=categorical_features, prefix=categorical_features)
    return X_encoded

X_train_encoded = prepare_features(X_train)
X_val_encoded = prepare_features(X_val)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

y_val_pred = model.predict(X_val_encoded)
accuracy = accuracy_score(y_val, y_val_pred)

print(f"Validation Accuracy: {round(accuracy, 2)}")

Validation Accuracy: 0.74


In [8]:
# Q5 - Feature elimination
features_to_test = ['industry', 'employment_status', 'lead_score']

accuracy_differences = {}

for feature in features_to_test:

    if feature in categorical_features:
        cols_to_keep = [col for col in X_train_encoded.columns if not col.startswith(feature + '_')]
        X_train_reduced = X_train_encoded[cols_to_keep]
        X_val_reduced = X_val_encoded[cols_to_keep]
    else:

        X_train_reduced = X_train_encoded.drop(feature, axis=1)
        X_val_reduced = X_val_encoded.drop(feature, axis=1)

    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    y_val_pred_reduced = model_reduced.predict(X_val_reduced)
    accuracy_reduced = accuracy_score(y_val, y_val_pred_reduced)

    difference = accuracy - accuracy_reduced
    accuracy_differences[feature] = difference

    print(f"Without {feature}: Accuracy = {accuracy_reduced:.4f}, Difference = {difference:.4f}")

smallest_diff_feature = min(accuracy_differences, key=lambda x: abs(accuracy_differences[x]))
print(f"\nFeature with smallest difference: {smallest_diff_feature}")

Without industry: Accuracy = 0.7432, Difference = 0.0000
Without employment_status: Accuracy = 0.7466, Difference = -0.0034
Without lead_score: Accuracy = 0.7432, Difference = 0.0000

Feature with smallest difference: industry


In [9]:
# Q6 - Regularized Logistic Regression
C_values = [0.01, 0.1, 1, 10, 100]
best_accuracy = 0
best_C = None

print("Regularized Logistic Regression Results:")
for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_encoded, y_train)

    y_val_pred = model.predict(X_val_encoded)
    accuracy = accuracy_score(y_val, y_val_pred)

    print(f"C = {C}: Accuracy = {accuracy:.4f} (rounded: {round(accuracy, 3)})")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_C = C
    elif accuracy == best_accuracy and C < best_C:
        best_C = C

print(f"\nBest C: {best_C} with accuracy: {best_accuracy:.4f}")

Regularized Logistic Regression Results:
C = 0.01: Accuracy = 0.7432 (rounded: 0.743)
C = 0.1: Accuracy = 0.7432 (rounded: 0.743)
C = 1: Accuracy = 0.7432 (rounded: 0.743)
C = 10: Accuracy = 0.7432 (rounded: 0.743)
C = 100: Accuracy = 0.7432 (rounded: 0.743)

Best C: 0.01 with accuracy: 0.7432
