In [139]:
import numpy as np
import pandas as pd  
import matplotlib.pyplot as plt

# In this dataset our desired target for classification task will be converted variable - has the client signed up to the platform or not.

df = pd.read_csv("course_lead_scoring.csv")

# print(df.head(5))
# print(df.dtypes)

df.head().T


Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [140]:
# Data preparation
    # Check if the missing values are presented in the features.
df.isnull().sum()
df.info() 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [141]:
# If there are missing values:
    # For categorical features, replace them with 'NA'
    # For numerical features, replace with with 0.0

cat_features = list(df.dtypes[df.dtypes == 'object'].index)
print (cat_features)

num_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(num_features)

for col in cat_features: 
    df[col] = df[col].fillna('NA')

for col in num_features:
    df[col] = df[col].fillna(0)


df.isnull().sum() 

['lead_source', 'industry', 'employment_status', 'location']
['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']


lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [142]:
# Question 1
    # What is the most frequent observation (mode) for the column industry?

most_freq = df['industry'].mode()[0]
print(f"Mode for 'industry': {most_freq}")

# Question 2
# Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.
corr_matrix = df[num_features].corr()

# Check the correlation values for the specified pairs (absolute value indicates strength)
corr_interaction_leadscore = corr_matrix.loc['interaction_count', 'lead_score']
corr_courses_leadscore = corr_matrix.loc['number_of_courses_viewed', 'lead_score']
corr_courses_interaction = corr_matrix.loc['number_of_courses_viewed', 'interaction_count']
corr_annualincome_interaction = corr_matrix.loc['annual_income', 'interaction_count']

# Create a dictionary to hold the absolute correlations
abs_correlations = {
    'interaction_count and lead_score': (corr_interaction_leadscore),
    'number_of_courses_viewed and lead_score': (corr_courses_leadscore),
    'number_of_courses_viewed and interaction_count': (corr_courses_interaction),
    'annual_income and interaction_count': (corr_annualincome_interaction)
}

print(abs_correlations)

# # Find the pair with the maximum absolute correlation
biggest_correlation_pair = max(abs_correlations, key=abs_correlations.get)
print(f"Pair with the biggest correlation: {biggest_correlation_pair}")

# What are the two features that have the biggest correlation?
    # interaction_count and lead_score
    # number_of_courses_viewed and lead_score
    # number_of_courses_viewed and interaction_count
                # annual_income and interaction_count  *******answer 
    # Only consider the pairs above when answering this question.

Mode for 'industry': retail
{'interaction_count and lead_score': np.float64(0.009888182496913131), 'number_of_courses_viewed and lead_score': np.float64(-0.004878998354681276), 'number_of_courses_viewed and interaction_count': np.float64(-0.023565222882888037), 'annual_income and interaction_count': np.float64(0.02703647240481443)}
Pair with the biggest correlation: annual_income and interaction_count


In [143]:
from sklearn.model_selection import train_test_split 

 # Split the data
    # Split your data in train/val/test sets with 60%/20%/20% distribution.
    # Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
    # Make sure that the target value converted is not in your dataframe.

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state=42)
# split the df_train which was 80% of the original df as 0.25 * 0.80 = .20) 

print(len(df))
print(len(df_full_train))
print(len(df_train))
print(len(df_test))   
print(len(df_val))
                                    
# Separate target variables
y_train = df_train["converted"].values
y_val = df_val["converted"].values
y_test = df_test["converted"].values

# Drop the target variable from feature dataframes
df_train = df_train.drop(columns=["converted"])
df_val = df_val.drop(columns=["converted"])
df_test = df_test.drop(columns=["converted"])         



1462
1169
876
293
293


In [144]:
# Question 3
    # Calculate the mutual information score between converted and other categorical variables in the dataset. Use the training set only.
    # Round the scores to 2 decimals using round(score, 2).
# Which of these variables has the biggest mutual information score?
    # industry
    # location
          # lead_source  ** largest at 0.04 
    # employment_status

from sklearn.metrics import mutual_info_score

mi_industry = mutual_info_score(y_train, df_train.industry)
mi_location = mutual_info_score(y_train, df_train.location)
mi_lead_source = mutual_info_score(y_train, df_train.lead_source)
mi_employment_status = mutual_info_score(y_train, df_train.employment_status)

# The first time I did this I just used df_full_train on everything above and got a very similar answer (0.03 - lead_source) .. would this have been incorrect? 

print("industry MI = ", round(mi_industry,2))
print("location MI = ", round(mi_location,2))
print("lead_source MI = ", round(mi_lead_source,2))
print("employment_status MI = ", round(mi_employment_status,2))


# or ...


def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.converted)

mi = df_full_train[cat_features].apply(mutual_info_churn_score).round(2)
mi.sort_values(ascending=False)


industry MI =  0.01
location MI =  0.0
lead_source MI =  0.03
employment_status MI =  0.01


lead_source          0.02
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [146]:
# Question 4
# Now let's train a logistic regression.
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Check and remove 'converted' from numerical features list
if 'converted' in num_features:
    num_features.remove('converted')
    
# Check and remove 'converted' from categorical features list if it was mistakenly there
if 'converted' in cat_features:
    cat_features.remove('converted')

# Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
all_features = cat_features +  num_features

train_dicts = df_train[all_features].to_dict(orient='records') # For making dict row-wise we use orient = 'records
val_dicts = df_val[all_features].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Fit the model on the training dataset.
# To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
# model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

model = LogisticRegression(solver = 'liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

accuracy = (y_val == y_pred).mean()

acc_round = round(accuracy, 4)


# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
# What accuracy did you get?
    # 0.64
     # 0.74  ** this one is the closest but I get 0.7304 
    # 0.84
    # 0.94

print(acc_round)




0.7304


In [147]:
model.coef_[0].round(3)

array([-0.   , -0.024,  0.044, -0.01 ,  0.001, -0.096, -0.028,  0.038,
       -0.005, -0.034,  0.001, -0.018, -0.033, -0.006,  0.297,  0.048,
        0.008, -0.012, -0.012, -0.111,  0.073, -0.031, -0.001, -0.009,
       -0.011, -0.02 , -0.006, -0.009, -0.027, -0.003,  0.45 ])

In [None]:
# Question 5
# Let's find the least useful feature using the feature elimination technique.
# Train a model using the same features and parameters as in Q4 (without rounding).
# Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
    # Which of following feature has the smallest difference?
    # 'industry'
    # 'employment_status'
    # 'lead_score'
    # Note: The difference doesn't have to be positive.



# Question 5: Feature Elimination (Finding the least useful feature)

features_to_check = ['industry', 'employment_status', 'lead_score']
accuracy_differences = {}

# Use the one-hot encoded feature names for exclusion
full_features = set(dv.get_feature_names_out())

for feature_to_exclude in features_to_check:
    
    # Identify which columns to keep (all features excluding the current one)
    # This logic is simplified; for OHE, this would need to exclude ALL columns 
    # generated by the categorical feature, but for the *generic* approach:
    current_features_df = [f for f in all_features if f != feature_to_exclude]
    
    # Re-vectorize on the reduced feature set
    dv_subset = DictVectorizer(sparse=False)
    X_train_subset = dv_subset.fit_transform(df_train[current_features_df].to_dict(orient='records'))
    X_val_subset = dv_subset.transform(df_val[current_features_df].to_dict(orient='records'))
    
    # Train a new model with the subset
    model_subset = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_subset.fit(X_train_subset, y_train)
    
    # Calculate new validation accuracy
    y_pred_subset = model_subset.predict(X_val_subset)
    accuracy_subset = (y_val == y_pred_subset).mean()
    
    # Calculate the difference (original accuracy - new accuracy)
    diff = original_accuracy - accuracy_subset
    accuracy_differences[feature_to_exclude] = diff

# Find the feature with the smallest change (smallest absolute difference)
# The "least useful" feature is the one whose removal causes the smallest absolute drop in performance.
least_useful_feature = min(accuracy_differences, key=lambda k: abs(accuracy_differences[k]))

print(f"Accuracy differences: {accuracy_differences}")
print(f"Feature with the smallest change (least useful): {least_useful_feature}")

In [None]:

# Question 6
# Now let's train a regularized logistic regression.
# Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
# Train models using all the features as in Q4.
# Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
# Which of these C leads to the best accuracy on the validation set?

# 0.01
# 0.1
# 1
# 10
# 100
# Note: If there are multiple options, select the smallest C.