In [8]:
import pandas as pd
import numpy as np

In [33]:
# Load the dataset
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
df = pd.read_csv(url)

# Quick look at the data
print(df.head())
print(df.shape)
print(df.columns)

    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  
(1462, 9)
Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
    

In [34]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [35]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [36]:
# Check for missing values
print(df.isnull().sum())

# Identify categorical and numerical columns
categorical = df.select_dtypes(include='object').columns.tolist()
numerical = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Replace missing values
for col in categorical:
    df[col] = df[col].fillna('NA')
for col in numerical:
    df[col] = df[col].fillna(0.0)

# Confirm no missing values remain
print(df.isnull().sum())


lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [38]:
# QUESTION 1

print(df['industry'].value_counts())

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64


In [None]:
# QUESTION 2

# List the relevant numerical features
numerical = ['interaction_count', 'lead_score', 'number_of_courses_viewed', 'annual_income']

# Compute the correlation matrix
corr_matrix = df[numerical].corr()
print(corr_matrix)

                          interaction_count  lead_score  \
interaction_count                  1.000000    0.009888   
lead_score                         0.009888    1.000000   
number_of_courses_viewed          -0.023565   -0.004879   
annual_income                      0.027036    0.015610   

                          number_of_courses_viewed  annual_income  
interaction_count                        -0.023565       0.027036  
lead_score                               -0.004879       0.015610  
number_of_courses_viewed                  1.000000       0.009770  
annual_income                             0.009770       1.000000  


In [None]:
# Split the data

from sklearn.model_selection import train_test_split

# Separate features and target
y = df['converted'].values
X = df.drop(columns=['converted'])

# First split: train+val and test (80% train+val, 20% test)
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Second split: train and val (25% of train+val for val, so 0.25 * 0.8 = 0.2)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=42)

# Check shapes
print(X_train.shape, X_val.shape, X_test.shape)


(876, 8) (293, 8) (293, 8)


In [43]:
# QUESTION 3

# Step 1: Prepare the Data

# Assume X_train and y_train are already defined from your previous split
categorical = ['industry', 'location', 'lead_source', 'employment_status']


In [45]:
# Step 2: Calculate Mutual Information

from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import mutual_info_classif

# Select the categorical columns
categorical = ['industry', 'location', 'lead_source', 'employment_status']

# Fill missing values with 'NA'
X_train_cat = X_train[categorical].fillna('NA')

# Encode as integers
encoder = OrdinalEncoder()
X_train_cat_encoded = encoder.fit_transform(X_train_cat)

# Calculate mutual information scores
mi_scores = mutual_info_classif(X_train_cat_encoded, y_train, discrete_features=True)

# Print rounded scores for each variable
for col, score in zip(categorical, mi_scores):
    print(f'{col:20s}: {round(score, 2)}')


industry            : 0.01
location            : 0.0
lead_source         : 0.04
employment_status   : 0.01


In [46]:
# QUESTION 4
# Step 1: One-Hot Encode Categorical Variables
from sklearn.feature_extraction import DictVectorizer

# Identify categorical and numerical columns
categorical = X_train.select_dtypes(include='object').columns.tolist()
numerical = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Prepare data as a list of dictionaries (one per row)
train_dicts = X_train[categorical + numerical].to_dict(orient='records')
val_dicts = X_val[categorical + numerical].to_dict(orient='records')

# Initialize and fit the vectorizer
dv = DictVectorizer(sparse=False)
X_train_encoded = dv.fit_transform(train_dicts)
X_val_encoded = dv.transform(val_dicts)


In [47]:
# Step 2: Train Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize the model with specified parameters
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Fit the model on the training data
model.fit(X_train_encoded, y_train)

# Predict on the validation set
y_pred = model.predict(X_val_encoded)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation accuracy: {round(accuracy, 2)}')


Validation accuracy: 0.7


In [49]:
# QUESTION 5
# 1. Train the Baseline Model

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

features = X_train.columns.tolist()
train_dicts = X_train[features].to_dict(orient='records')
val_dicts = X_val[features].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train_encoded = dv.fit_transform(train_dicts)
X_val_encoded = dv.transform(val_dicts)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)
y_pred = model.predict(X_val_encoded)
baseline_accuracy = accuracy_score(y_val, y_pred)
print(f'Baseline accuracy: {baseline_accuracy:.4f}')


Baseline accuracy: 0.6997


In [50]:
# 2. Feature Elimination
features_to_test = ['industry', 'employment_status', 'lead_score']
differences = {}

for feature in features_to_test:
    reduced_features = [f for f in features if f != feature]
    train_dicts = X_train[reduced_features].to_dict(orient='records')
    val_dicts = X_val[reduced_features].to_dict(orient='records')
    X_train_encoded = dv.fit_transform(train_dicts)
    X_val_encoded = dv.transform(val_dicts)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_val_encoded)
    acc = accuracy_score(y_val, y_pred)
    diff = baseline_accuracy - acc
    differences[feature] = diff
    print(f'{feature:20s}: {diff:.4f}')


industry            : 0.0000
employment_status   : 0.0034
lead_score          : -0.0068


In [51]:
# QUESTION 6

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

C_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_val_encoded)
    acc = accuracy_score(y_val, y_pred)
    accuracies[C] = round(acc, 3)
    print(f'C={C}: Validation accuracy = {round(acc, 3)}')


C=0.01: Validation accuracy = 0.696
C=0.1: Validation accuracy = 0.7
C=1: Validation accuracy = 0.706
C=10: Validation accuracy = 0.706
C=100: Validation accuracy = 0.706
