In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the lead-scoring dataset straight from GitHub and show, as the cell
# output, how many missing values each column contains.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(DATA_URL)
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [3]:
# Impute missing values in place: numeric columns get 0.0, every other
# (categorical/text) column gets the 'NA' placeholder.
# The column kind is decided with pandas' own dtype helper instead of comparing
# the dtype against 'object', so text columns are still treated as categorical
# when pandas infers a dedicated string dtype rather than object.
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):   # numerical
        df[col] = df[col].fillna(0.0)
    else:                                        # categorical
        df[col] = df[col].fillna('NA')


In [4]:
# Most frequent value of `industry`; mode() returns a Series of all modes,
# label 0 is the first of them.
industry_modes = df['industry'].mode()
industry_modes[0]


'retail'

In [5]:
# Name of the label column; fail early if the loaded frame does not have it.
target = "converted"
assert target in df, f"Expected target column named '{target}'"

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import mutual_info_score

In [7]:
# Numerical predictors: every column that is neither object nor category,
# with the target itself left out.
num_cols = [
    name
    for name in df.select_dtypes(exclude=["object", "category"]).columns.tolist()
    if name != target
]

# Pairwise correlation between the numerical predictors (DataFrame.corr defaults).
corr = df[num_cols].corr()

print("=== Correlation matrix (numerical features) ===")
print(corr)
print()



=== Correlation matrix (numerical features) ===
                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  



In [8]:
# Q2: restrict the comparison to the candidate pairs listed in the question.
pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]

# Look each pair up in the correlation matrix, reporting it as we go.
pair_corrs = {}
print("Correlations for the candidate pairs:")
for first, second in pairs:
    coefficient = corr.loc[first, second]
    pair_corrs[(first, second)] = coefficient
    print((first, second), ":", coefficient)

# "Biggest" is judged by magnitude, so a strong negative correlation counts too.
best_pair = max(pair_corrs, key=lambda candidate: abs(pair_corrs[candidate]))
print("\nQ2 answer (which pair has biggest correlation among provided options):", best_pair)
print()


Correlations for the candidate pairs:
('interaction_count', 'lead_score') : 0.009888182496913131
('number_of_courses_viewed', 'lead_score') : -0.004878998354681276
('number_of_courses_viewed', 'interaction_count') : -0.023565222882888037
('annual_income', 'interaction_count') : 0.02703647240481443

Q2 answer (which pair has biggest correlation among provided options): ('annual_income', 'interaction_count')



In [9]:
train_val_test_seed = 42

# 60/20/20 split: hold out 40 % first, then halve that hold-out into
# validation and test. No stratification is applied (the default).
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=train_val_test_seed)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=train_val_test_seed)

# Separate the feature frame from the target array for each partition.
X_train, y_train = df_train.drop(columns=[target]), df_train[target].values
X_val, y_val = df_val.drop(columns=[target]), df_val[target].values
X_test, y_test = df_test.drop(columns=[target]), df_test[target].values

In [10]:
# Q3: mutual information between the target and each categorical feature,
# computed on the training partition only.
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# Keep the full-precision scores for ranking; the 2-decimal values in
# `mi_scores` are for display only. Ranking on rounded values would make
# near-equal features tie and let column order decide the winner.
mi_raw = {col: mutual_info_score(y_train, X_train[col]) for col in cat_cols}
mi_scores = {col: round(mi, 2) for col, mi in mi_raw.items()}

print("=== Q3: Mutual information (training set) — categorical features ===")
for col, score in mi_scores.items():
    print(col, score)

best_mi_col = max(mi_raw, key=mi_raw.get)
print("\nQ3 answer (biggest mutual information):", best_mi_col)
print()


=== Q3: Mutual information (training set) — categorical features ===
lead_source 0.03
industry 0.02
employment_status 0.02
location 0.0

Q3 answer (biggest mutual information): lead_source



In [11]:
def build_preprocessor(X):
    """Build an unfitted ColumnTransformer tailored to the columns of ``X``.

    Columns of dtype object/category are one-hot encoded (with
    ``handle_unknown="ignore"`` and dense output); all remaining columns are
    passed through a StandardScaler. Columns not assigned to either group are
    dropped.

    Returns:
        A ``(preprocessor, feature_names)`` tuple, where ``feature_names``
        lists the numeric columns followed by the categorical ones.
    """
    categorical_dtypes = ["object", "category"]
    numeric_features = X.select_dtypes(exclude=categorical_dtypes).columns.tolist()
    categorical_features = X.select_dtypes(include=categorical_dtypes).columns.tolist()

    steps = [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features),
    ]
    preprocessor = ColumnTransformer(transformers=steps, remainder="drop")

    return preprocessor, numeric_features + categorical_features

# Q4: fit the preprocessing on the training partition only, then reuse the
# fitted transformer on the validation partition.
preprocessor, used_features = build_preprocessor(X_train)
X_train_trans = preprocessor.fit_transform(X_train)
X_val_trans = preprocessor.transform(X_val)

# Baseline classifier trained on the transformed training data.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_trans, y_train)

# Score on the validation partition.
y_val_pred = model.predict(X_val_trans)
acc_val = accuracy_score(y_val, y_val_pred)

print("=== Q4: Logistic Regression validation accuracy ===")
print("Validation accuracy (unrounded):", acc_val)
print("Validation accuracy (rounded 2 decimals):", round(acc_val, 2))
print()

=== Q4: Logistic Regression validation accuracy ===
Validation accuracy (unrounded): 0.8561643835616438
Validation accuracy (rounded 2 decimals): 0.86



In [12]:
# Q5: leave-one-feature-out check. For each candidate feature, retrain the
# same model without it and compare validation accuracy with the full model.
baseline_acc = acc_val  # original accuracy with all features
features_to_test = ['industry', 'employment_status', 'lead_score']
diffs = {}

for feat in features_to_test:
    # create training & validation sets without this feature
    Xtr = X_train.drop(columns=[feat])
    Xv = X_val.drop(columns=[feat])

    # Rebuild and refit the preprocessing on the reduced frame so its column
    # lists match the remaining features.
    preproc_tmp, _ = build_preprocessor(Xtr)
    Xtr_t = preproc_tmp.fit_transform(Xtr)
    Xv_t = preproc_tmp.transform(Xv)

    model_tmp = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_tmp.fit(Xtr_t, y_train)
    yv_pred = model_tmp.predict(Xv_t)
    acc_no_feat = accuracy_score(y_val, yv_pred)

    # Signed difference: a negative value means accuracy went UP without the feature.
    dif = baseline_acc - acc_no_feat
    diffs[feat] = dif
    print(f"Dropping {feat}: val accuracy = {acc_no_feat:.6f}, diff = {dif:.6f}")

# Rank by magnitude. With the signed value, a feature whose removal improves
# accuracy (negative diff) would be reported as the "smallest difference"
# even though removing it changes the score the most.
smallest_diff_feature = min(diffs, key=lambda name: abs(diffs[name]))
print("\n=== Q5 answer (feature with smallest difference) ===")
print("Smallest difference feature:", smallest_diff_feature)
print()



Dropping industry: val accuracy = 0.856164, diff = 0.000000
Dropping employment_status: val accuracy = 0.842466, diff = 0.013699
Dropping lead_score: val accuracy = 0.849315, diff = 0.006849

=== Q5 answer (feature with smallest difference) ===
Smallest difference feature: industry



In [13]:
# Q6: sweep the inverse regularization strength C, reusing the already
# transformed train/validation matrices, and record validation accuracy
# rounded to 3 decimals.
C_values = [0.01, 0.1, 1, 10, 100]
accs_by_C = {}
for C in C_values:
    model_C = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_C.fit(X_train_trans, y_train)
    yv_pred_C = model_C.predict(X_val_trans)
    accuracy_C = accuracy_score(y_val, yv_pred_C)
    accs_by_C[C] = round(accuracy_C, 3)

print("=== Q6: Validation accuracies by C (rounded to 3 decimals) ===")
for C, a in accs_by_C.items():
    print("C =", C, "->", a)

# Several C values can share the top (rounded) accuracy; prefer the smallest,
# i.e. the most strongly regularized, of them.
max_acc = max(accs_by_C.values())
best_C_candidates = [C for C in accs_by_C if accs_by_C[C] == max_acc]
best_C = min(best_C_candidates)
print("\nQ6 answer (best C):", best_C, "with accuracy", accs_by_C[best_C])

=== Q6: Validation accuracies by C (rounded to 3 decimals) ===
C = 0.01 -> 0.849
C = 0.1 -> 0.856
C = 1 -> 0.856
C = 10 -> 0.856
C = 100 -> 0.856

Q6 answer (best C): 0.1 with accuracy 0.856
