In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
# The CSV is read straight from its raw GitHub URL into a DataFrame.
path = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
df = pd.read_csv(path)

# Report the frame's dimensions, then preview its leading rows.
print("Dataset shape:", df.shape)
print("\nFirst few rows:", df.head(), sep="\n")

Dataset shape: (1462, 9)

First few rows:
    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  


In [3]:
# Data Preparation - Handle missing values
cat_cols = ['lead_source', 'industry', 'employment_status', 'location']
num_cols = ['annual_income']

# Categorical gaps become the literal label 'NA'; numeric gaps become 0.0.
# Each group of columns is filled in a single vectorized assignment.
df[cat_cols] = df[cat_cols].fillna('NA')
df[num_cols] = df[num_cols].fillna(0.0)

# Per-column count of values that are still missing after the fill.
print("Missing values after handling:")
print(df.isna().sum())

Missing values after handling:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [4]:
# Question 1: Most frequent observation for industry
industry_column = df['industry']

# Frequency table (descending) and the first entry of the column's mode.
industry_counts = industry_column.value_counts()
industry_mode = industry_column.mode().iloc[0]

print("Industry value counts:")
print(industry_counts.head())
print(f"\nQuestion 1 Answer: Most frequent industry is '{industry_mode}'")

Industry value counts:
industry
retail        203
finance       200
other         198
healthcare    187
education     187
Name: count, dtype: int64

Question 1 Answer: Most frequent industry is 'retail'


In [5]:
# Question 2: Correlation matrix for numerical features
numerical_features = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]
correlation_matrix = df[numerical_features].corr()

print("Correlation Matrix:")
print(correlation_matrix)

# Every unordered feature pair (upper triangle only, so the diagonal is skipped),
# stored together with the magnitude of its correlation.
corr_pairs = [
    ((left, right), abs(correlation_matrix.loc[left, right]))
    for position, left in enumerate(numerical_features)
    for right in numerical_features[position + 1:]
]

# Strongest relationships first
corr_pairs_sorted = sorted(corr_pairs, key=lambda entry: entry[1], reverse=True)

print("\nTop correlated pairs:")
for (left, right), strength in corr_pairs_sorted[:3]:
    print(f"{left} & {right}: {strength:.4f}")

# Restrict the comparison to the candidate pairs listed in the question
print("\nSpecific pairs from question:")
pairs_to_check = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

# Running maximum; the strict '>' keeps the earliest candidate when values tie.
max_corr, max_pair = 0, None
for candidate in pairs_to_check:
    left, right = candidate
    strength = abs(correlation_matrix.loc[left, right])
    print(f"{left} & {right}: {strength:.4f}")
    if strength > max_corr:
        max_corr, max_pair = strength, candidate

print(f"\nQuestion 2 Answer: {max_pair[0]} and {max_pair[1]}")

Correlation Matrix:
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  

Top correlated pairs:
annual_income & interaction_count: 0.0270
number_of_courses_viewed & interaction_count: 0.0236
annual_income & lead_score: 0.0156

Specific pairs from question:
interaction_count & lead_score: 0.0099
number_of_courses_viewed & lead_score: 0.0049
number_of_courses_viewed & interaction_count: 0.0236
annual_income & interaction_count: 0.0270

Question 2 Answer: annual_income and interaction_count

In [6]:
# Split the data
# The target is the 'converted' column; every other column is a feature.
y = df['converted']
X = df.drop(columns='converted')

# First cut holds out 40% of the rows; the second cut halves that hold-out
# into validation and test. Both cuts use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Train set: {len(X_train)} samples")
print(f"Val set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

Train set: 877 samples
Val set: 292 samples
Test set: 293 samples


In [7]:
# Question 3: Mutual information with categorical variables
categorical_features = ['industry', 'location', 'lead_source', 'employment_status']

# Unrounded mutual information between the target and each categorical
# feature, computed on the training split only.
mi_scores_raw = {
    feature: mutual_info_score(y_train, X_train[feature])
    for feature in categorical_features
}

# Two-decimal copy used purely for display.
mi_scores = {feature: round(score, 2) for feature, score in mi_scores_raw.items()}

print("Mutual Information Scores:")
for feature, score in mi_scores.items():
    print(f"{feature}: {score}")

# Rank on the unrounded values: rounding first can turn distinct scores into
# artificial ties, in which case dict insertion order (not the data) would
# decide the winner.
max_mi_feature = max(mi_scores_raw, key=mi_scores_raw.get)
print(f"\nQuestion 3 Answer: {max_mi_feature}")

Mutual Information Scores:
industry: 0.02
location: 0.0
lead_source: 0.03
employment_status: 0.02

Question 3 Answer: lead_source


In [8]:
# Prepare data for logistic regression (one-hot encoding)
categorical_features = ['lead_source', 'industry', 'employment_status', 'location']

# Learn the dummy-column schema from the training split ONLY, so that nothing
# about the validation data influences the feature space the model is fit on.
X_train_encoded = pd.get_dummies(X_train, columns=categorical_features, prefix=categorical_features)

# Encode validation independently, then align it to the training schema:
# categories never seen in training are dropped, and training categories that
# are absent from validation are added as all-zero columns. This also
# guarantees identical column order, so there is no dimension mismatch.
X_val_encoded = pd.get_dummies(
    X_val, columns=categorical_features, prefix=categorical_features
).reindex(columns=X_train_encoded.columns, fill_value=0)

# Stacked train+val frames, retained under their original names in case
# other cells rely on them; they no longer drive the encoding itself.
X_combined = pd.concat([X_train, X_val])
y_combined = pd.concat([y_train, y_val])
X_encoded = pd.concat([X_train_encoded, X_val_encoded])

print(f"Encoded train features: {X_train_encoded.shape}")
print(f"Encoded val features: {X_val_encoded.shape}")

Encoded train features: (877, 31)
Encoded val features: (292, 31)


In [9]:
# Question 4: Logistic Regression
# Fit on the encoded training split, then measure accuracy on validation.
logreg_params = dict(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model = LogisticRegression(**logreg_params)
model.fit(X_train_encoded, y_train)

y_val_pred = model.predict(X_val_encoded)
accuracy = accuracy_score(y_val, y_val_pred)
accuracy_rounded = round(accuracy, 2)

# Show the precise value alongside the two-decimal answer.
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Rounded to 2 decimals: {accuracy_rounded}")

print(f"\nQuestion 4 Answer: {accuracy_rounded}")

Validation Accuracy: 0.7432
Rounded to 2 decimals: 0.74

Question 4 Answer: 0.74


In [10]:
# Question 5: Feature elimination
# Baseline: validation accuracy of the already-fitted full model.
original_accuracy = accuracy_score(y_val, model.predict(X_val_encoded))
print(f"Original accuracy: {original_accuracy:.4f}")

features_to_test = ['industry', 'employment_status', 'lead_score']
accuracy_differences = {}

for feature in features_to_test:
    if feature in X_train_encoded.columns:
        # The feature survives encoding under its own name (e.g. a numerical
        # column), so it is dropped directly.
        features_to_drop = [feature]
    else:
        # Categorical feature: drop every one-hot column generated from it.
        features_to_drop = [col for col in X_train_encoded.columns if col.startswith(feature + '_')]

    # Fail loudly on an unknown name. Dropping nothing would silently retrain
    # the full model and report a misleading difference of exactly 0.
    if not features_to_drop:
        raise KeyError(f"No encoded columns found for feature '{feature}'")

    X_train_reduced = X_train_encoded.drop(columns=features_to_drop)
    X_val_reduced = X_val_encoded.drop(columns=features_to_drop)

    # Same hyperparameters as the baseline so only the feature set differs.
    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    reduced_accuracy = accuracy_score(y_val, model_reduced.predict(X_val_reduced))
    # Positive difference: removing the feature hurt accuracy; negative: it helped.
    difference = original_accuracy - reduced_accuracy

    accuracy_differences[feature] = difference
    print(f"Without {feature}: accuracy = {reduced_accuracy:.4f}, difference = {difference:.4f}")

# Smallest absolute change wins; on ties the feature listed first in
# features_to_test is reported.
min_diff_feature = min(accuracy_differences, key=lambda x: abs(accuracy_differences[x]))
print(f"\nQuestion 5 Answer: {min_diff_feature}")

Original accuracy: 0.7432
Without industry: accuracy = 0.7432, difference = 0.0000
Without employment_status: accuracy = 0.7466, difference = -0.0034
Without lead_score: accuracy = 0.7432, difference = 0.0000

Question 5 Answer: industry


In [11]:
# Question 6: Regularized Logistic Regression
C_values = [0.01, 0.1, 1, 10, 100]
best_C, best_accuracy = None, 0

print("Regularized Logistic Regression Results:")
for C in C_values:
    # Refit from scratch for each regularization strength.
    model_reg = LogisticRegression(
        solver='liblinear', C=C, max_iter=1000, random_state=42
    ).fit(X_train_encoded, y_train)

    y_val_pred_reg = model_reg.predict(X_val_encoded)
    accuracy_reg = accuracy_score(y_val, y_val_pred_reg)
    accuracy_rounded_3 = round(accuracy_reg, 3)

    print(f"C = {C}: Accuracy = {accuracy_reg:.4f} (Rounded: {accuracy_rounded_3})")

    # Only a strictly better score replaces the incumbent, so when several
    # values tie the earliest C in C_values is kept.
    if not accuracy_reg > best_accuracy:
        continue
    best_accuracy, best_C = accuracy_reg, C

print(f"\nQuestion 6 Answer: C = {best_C}")

Regularized Logistic Regression Results:
C = 0.01: Accuracy = 0.7432 (Rounded: 0.743)
C = 0.1: Accuracy = 0.7432 (Rounded: 0.743)
C = 1: Accuracy = 0.7432 (Rounded: 0.743)
C = 10: Accuracy = 0.7432 (Rounded: 0.743)
C = 100: Accuracy = 0.7432 (Rounded: 0.743)

Question 6 Answer: C = 0.01
