In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Download the lead-scoring dataset used throughout the notebook.
df = pd.read_csv(
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
)

# Column groups are derived from the dtypes of the freshly loaded frame.
# Note: numerical_features still contains the target column "converted" here.
numerical_features = list(df.select_dtypes(include="number").columns)
categorical_features = list(df.select_dtypes(include="object").columns)

# Impute missing values per column: 0 for numeric columns, the literal
# string "NA" for object (categorical) columns.
df = df.fillna(
    {
        **dict.fromkeys(numerical_features, 0),
        **dict.fromkeys(categorical_features, "NA"),
    }
)

In [None]:
# Question 1
# Most frequent value(s) of the `industry` column, computed after imputation,
# so the "NA" placeholder competes like any other category.
df["industry"].mode(dropna=True)

In [None]:
# Question 2
# Pairwise correlation between the numeric predictors; the target column
# "converted" is excluded from the matrix.
numeric_df = df.select_dtypes(include="number")
numeric_df = numeric_df.drop(columns="converted")
numeric_df.corr()

In [None]:
# Splitting the data
# 20% is held out for test; 25% of the remaining 80% becomes validation,
# which yields a 60/20/20 train/validation/test split.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Predictor columns: categorical first, then numerical, minus the target.
features = [*categorical_features, *numerical_features]
features.remove("converted")

# Feature frames for the training, validation and test sets (index reset so
# rows are numbered 0..n-1 within each split).
X_train, X_val, X_test = (
    part[features].reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Matching target vectors, in the same row order as the feature frames.
y_train, y_val, y_test = (
    part["converted"].reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [None]:
# Question 3
def mutual_info_churn_score(series):
    """Return the mutual information between `series` and the training target.

    The target is read from the notebook-level `y_train` (the "converted"
    column of the training split), so `series` must have the same length and
    row order as `y_train`.
    """
    return mutual_info_score(labels_true=series, labels_pred=y_train)


# Score every object-dtype (categorical) column of the training features.
X_train.select_dtypes(include=["object"]).apply(mutual_info_churn_score, axis=0)

In [None]:
# Question 4
# Baseline model: logistic regression on all features, with the feature
# dicts turned into a dense matrix by DictVectorizer.
dv = DictVectorizer(sparse=False)

# Store the encoded matrices under their own names. Overwriting X_train /
# X_val / X_test with arrays would make this cell fail when re-run (an ndarray
# has no .to_dict) and would break earlier cells that expect DataFrames.
X_train_encoded = dv.fit_transform(X_train.to_dict(orient="records"))
X_val_encoded = dv.transform(X_val.to_dict(orient="records"))
X_test_encoded = dv.transform(X_test.to_dict(orient="records"))

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1_000, random_state=42)
model.fit(X_train_encoded, y_train)

# Probability of the positive class, thresholded at 0.5 to get hard labels.
y_pred = model.predict_proba(X_val_encoded)[:, 1]
baseline_accuracy = accuracy_score(y_val, y_pred >= 0.5)
print(baseline_accuracy)

In [None]:
# Question 5
# Leave-one-feature-out study: retrain the baseline configuration with each
# feature removed in turn and record how far validation accuracy moves from
# `baseline_accuracy` (absolute difference).
accuracy_differences = {}

for feature in features:
    kept_features = [name for name in features if name != feature]

    # A fresh vectorizer per iteration, fitted on the training split only.
    # df_train / df_val are in the same row order as y_train / y_val.
    vectorizer = DictVectorizer(sparse=False)
    train_matrix = vectorizer.fit_transform(df_train[kept_features].to_dict(orient="records"))
    val_matrix = vectorizer.transform(df_val[kept_features].to_dict(orient="records"))

    classifier = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    classifier.fit(train_matrix, y_train)

    # Same 0.5 probability threshold as the baseline model.
    val_proba = classifier.predict_proba(val_matrix)[:, 1]
    reduced_accuracy = accuracy_score(y_val, val_proba >= 0.5)

    delta = abs(baseline_accuracy - reduced_accuracy)
    accuracy_differences[feature] = delta

    print(f"Without {feature:<25} | Accuracy: {reduced_accuracy:.4f} | Δ Accuracy: {delta:.4f}")

# Identify least useful feature: the one whose removal changes accuracy the
# least (the first such feature wins on ties).
least_useful_feat, _ = min(accuracy_differences.items(), key=lambda item: item[1])
print(f"\nThe least useful feature is: '{least_useful_feat}'")


In [None]:
# Question 6
# Sweep the inverse regularization strength C and report validation accuracy
# for each value.
C_values = [0.01, 0.1, 1, 10, 100]

# Rebuild the raw feature frames from the splits so this cell does not depend
# on whatever X_train / X_val currently hold.
X_train = df_train[features].reset_index(drop=True)
X_val = df_val[features].reset_index(drop=True)

# Encode once; the matrices are shared by every model in the sweep.
dv = DictVectorizer(sparse=False)
X_train_encoded = dv.fit_transform(X_train.to_dict(orient="records"))
X_val_encoded = dv.transform(X_val.to_dict(orient="records"))

for C in C_values:
    model = LogisticRegression(
        solver="liblinear",
        C=C,
        max_iter=1000,
        random_state=42,
    ).fit(X_train_encoded, y_train)

    # Positive-class probability, thresholded at 0.5.
    y_pred = model.predict_proba(X_val_encoded)[:, 1]
    acc = accuracy_score(y_val, y_pred >= 0.5)

    print(f"{C:<10}\t{acc:.3f}")