In [1]:
!pip install pandas scikit-learn numpy



In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv -O course_lead_scoring.csv


--2025-10-13 08:34:30--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-13 08:34:30 (4.03 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif

# --- Data Preparation ---
# Column roles used by the cells below.
target = "converted"
num_cols = ["number_of_courses_viewed", "annual_income", "interaction_count", "lead_score"]
cat_cols = ["lead_source", "industry", "employment_status", "location"]

df = pd.read_csv("course_lead_scoring.csv")

# Categorical columns: missing values become an explicit "NA" level.
df[cat_cols] = df[cat_cols].astype("object").fillna("NA")
# Numeric columns: values that cannot be parsed, or are missing, become 0.0.
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
# Target: coerced to numeric, gaps filled with 0, stored as int.
df[target] = pd.to_numeric(df[target], errors="coerce").fillna(0).astype(int)

# --- Q1 ---
# mode() can return several tied values; the first one is reported.
industry_modes = df["industry"].mode(dropna=False)
q1 = industry_modes.iloc[0]
print("Q1:", q1)

Q1: retail


In [4]:
# --- Q2 ---
# Absolute pairwise correlations between the numeric features.
abs_corr = df[num_cols].corr().abs()

# Feature pairs to compare.
candidate_pairs = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]

# Linear scan for the strongest pair; on a tie the earliest pair is kept.
best = candidate_pairs[0]
for left, right in candidate_pairs[1:]:
    if abs_corr.at[left, right] > abs_corr.at[best[0], best[1]]:
        best = (left, right)
print("Q2:", best)

Q2: ('annual_income', 'interaction_count')


In [5]:
# --- Split ---
# 60/20/20 train/validation/test split, stratified on the target at both stages.
y = df[target]
X = df.drop(columns=[target])
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# --- Q3 ---
# Mutual information between each categorical feature and the target,
# computed on the training split only. Categories are mapped to integer
# codes because mutual_info_classif needs numeric input.
X_cat = X_train[cat_cols].astype("category")
X_codes = X_cat.apply(lambda column: column.cat.codes)
mi = mutual_info_classif(
    X_codes.to_numpy(), y_train, discrete_features=True, random_state=42
)
mi_series = pd.Series(dict(zip(cat_cols, mi))).sort_values(ascending=False)
print("Q3:", mi_series)

Q3: lead_source          0.027161
industry             0.013788
employment_status    0.007115
location             0.001347
dtype: float64


In [11]:
# --- Q4 ---
# Baseline model: numeric columns as-is plus one-hot encoded categoricals.
# The encoder is fitted on the training split only; categories unseen at fit
# time are ignored when transforming the validation split.
oh = OneHotEncoder(handle_unknown="ignore")
train_dummies = oh.fit_transform(X_train[cat_cols]).toarray()
val_dummies = oh.transform(X_val[cat_cols]).toarray()

Xtr = np.concatenate([X_train[num_cols].to_numpy(), train_dummies], axis=1)
Xv = np.concatenate([X_val[num_cols].to_numpy(), val_dummies], axis=1)

m = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42).fit(Xtr, y_train)
val_pred = m.predict(Xv)
acc = accuracy_score(y_val, val_pred)
print("Q4:", acc)

Q4: 0.6815068493150684


In [16]:
# --- Q5 ---
def _val_accuracy(num_features, cat_features):
    """Train the C=1.0 logistic regression on a feature subset and score it.

    Parameters
    ----------
    num_features : list of str
        Numeric columns passed through unchanged.
    cat_features : list of str
        Categorical columns, one-hot encoded with an encoder fitted on the
        training split only.

    Returns
    -------
    float
        Accuracy on the validation split.
    """
    encoder = OneHotEncoder(handle_unknown="ignore")
    train_matrix = np.hstack(
        [X_train[num_features].values, encoder.fit_transform(X_train[cat_features]).toarray()]
    )
    val_matrix = np.hstack(
        [X_val[num_features].values, encoder.transform(X_val[cat_features]).toarray()]
    )
    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(train_matrix, y_train)
    return accuracy_score(y_val, model.predict(val_matrix))


# The baseline is recomputed here instead of being read from the notebook-level
# `acc`: that name is reassigned by other cells, so relying on it makes the
# deltas depend on the order in which cells were last executed.
base_acc = _val_accuracy(num_cols, cat_cols)

drops = ["industry", "employment_status", "lead_score"]
for col in drops:
    # Remove the column from whichever group it belongs to.
    keep_cat = [c for c in cat_cols if c != col]
    keep_num = [c for c in num_cols if c != col]
    acc2 = _val_accuracy(keep_num, keep_cat)
    # Positive Δ: accuracy fell when the feature was dropped.
    print(f"Q5 drop {col}: Δ={base_acc-acc2:+.5f}")

Q5 drop industry: Δ=-0.00685
Q5 drop employment_status: Δ=+0.00000
Q5 drop lead_score: Δ=+0.00685


In [13]:
# --- Q6 ---
# Sweep the inverse regularization strength C on the full feature matrices
# (Xtr / Xv) and report validation accuracy for each value.
# The loop uses sweep-specific names so it does not overwrite the notebook-level
# `m` and `acc`; reusing those names would silently change the baseline that
# `base_acc = acc` picks up if that cell is re-run after this one.
for C in [0.01, 0.1, 1, 10, 100]:
    sweep_model = LogisticRegression(solver="liblinear", C=C, max_iter=1000, random_state=42)
    sweep_model.fit(Xtr, y_train)
    sweep_acc = accuracy_score(y_val, sweep_model.predict(Xv))
    print(f"Q6 C={C}: acc={sweep_acc:.3f}")

Q6 C=0.01: acc=0.688
Q6 C=0.1: acc=0.682
Q6 C=1: acc=0.682
Q6 C=10: acc=0.682
Q6 C=100: acc=0.682
