In [1]:
import polars as pl
# Course lead-scoring dataset, read straight from its public CSV URL.
DATASET_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
data = pl.read_csv(DATASET_URL)

In [2]:
# Preview the first rows of the raw frame to check column names and dtypes.
data.head()

lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
str,str,i64,i64,str,str,i64,f64,i64
"""paid_ads""",,1,79450,"""unemployed""","""south_america""",4,0.94,1
"""social_media""","""retail""",1,46992,"""employed""","""south_america""",1,0.8,0
"""events""","""healthcare""",5,78796,"""unemployed""","""australia""",3,0.69,1
"""paid_ads""","""retail""",2,83843,,"""australia""",1,0.87,0
"""referral""","""education""",3,85012,"""self_employed""","""europe""",3,0.62,1


In [3]:
# Alias the raw frame; the steps below build new objects rather than mutating it.
df = data

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

# One-row frame holding the null tally of every column
null_counts = df.null_count()
print("\nNull values per column:")
print(null_counts)

# Scan each column once and keep its missing-value count for reuse below
missing_by_column = {name: df[name].null_count() for name in df.columns}

# Report only the columns that actually contain nulls
columns_with_nulls = [name for name, n_missing in missing_by_column.items() if n_missing > 0]
print(f"\nColumns with missing values: {columns_with_nulls}")
for name in columns_with_nulls:
    print(f"  {name}: {missing_by_column[name]} missing values")


# Partition the columns by dtype: the listed int/float types vs. strings
numeric_dtypes = [pl.Int64, pl.Float64, pl.Int32, pl.Float32]
numerical_cols = [name for name, dtype in zip(df.columns, df.dtypes) if dtype in numeric_dtypes]
categorical_cols = [name for name, dtype in zip(df.columns, df.dtypes) if dtype == pl.Utf8]

print(f"\nNumerical columns: {numerical_cols}")
print(f"Categorical columns: {categorical_cols}")

# Build one expression per column: string nulls become 'NA', numeric nulls
# become 0.0, and every other column is passed through unchanged.
fill_expressions = []

for col in df.columns:
    expression = pl.col(col)
    n_missing = missing_by_column[col]
    if n_missing > 0 and col in categorical_cols:
        print(f"Filling {n_missing} missing values in '{col}' with 'NA'")
        expression = expression.fill_null('NA').alias(col)
    elif n_missing > 0 and col in numerical_cols:
        print(f"Filling {n_missing} missing values in '{col}' with 0.0")
        expression = expression.fill_null(0.0).alias(col)
    fill_expressions.append(expression)

# Materialise the cleaned frame with the original column order
df_prepared = df.select(fill_expressions)

Dataset shape: (1462, 9)

First few rows:
shape: (5, 9)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ lead_sour ┆ industry  ┆ number_of ┆ annual_in ┆ … ┆ location  ┆ interacti ┆ lead_scor ┆ converte │
│ ce        ┆ ---       ┆ _courses_ ┆ come      ┆   ┆ ---       ┆ on_count  ┆ e         ┆ d        │
│ ---       ┆ str       ┆ viewed    ┆ ---       ┆   ┆ str       ┆ ---       ┆ ---       ┆ ---      │
│ str       ┆           ┆ ---       ┆ i64       ┆   ┆           ┆ i64       ┆ f64       ┆ i64      │
│           ┆           ┆ i64       ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ paid_ads  ┆ null      ┆ 1         ┆ 79450     ┆ … ┆ south_ame ┆ 4         ┆ 0.94      ┆ 1        │
│           ┆           ┆           ┆           ┆   ┆ rica      ┆           ┆           ┆          │
│ social_me ┆ retail    ┆ 1        

In [4]:
# Summary statistics of the null-filled frame; null_count should be 0 for every column.
df_prepared.describe()

statistic,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
str,str,str,f64,f64,str,str,f64,f64,f64
"""count""","""1462""","""1462""",1462.0,1462.0,"""1462""","""1462""",1462.0,1462.0,1462.0
"""null_count""","""0""","""0""",0.0,0.0,"""0""","""0""",0.0,0.0,0.0
"""mean""",,,2.031464,52472.172367,,,2.976744,0.506108,0.619015
"""std""",,,1.449717,24254.34703,,,1.681564,0.288465,0.485795
"""min""","""NA""","""NA""",0.0,0.0,"""NA""","""NA""",0.0,0.0,0.0
"""25%""",,,1.0,44084.0,,,2.0,0.26,0.0
"""50%""",,,2.0,57474.0,,,3.0,0.51,1.0
"""75%""",,,3.0,68243.0,,,4.0,0.75,1.0
"""max""","""social_media""","""technology""",9.0,109899.0,"""unemployed""","""south_america""",11.0,1.0,1.0


In [5]:
# Most frequent industry value. This runs on the null-filled frame, so the
# 'NA' placeholder competes as a regular category.
df_prepared['industry'].mode()

industry
str
"""retail"""


In [6]:
# Every numeric column except the target is a candidate feature.
numerical_features = list(filter(lambda name: name != 'converted', numerical_cols))
print(f"\nNumerical features for correlation: {numerical_features}")


Numerical features for correlation: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


In [7]:
# Keep only the numeric feature columns (target excluded) for the correlation step.
df_numerical = df_prepared.select(numerical_features)

In [8]:
# Full correlation matrix as a dict of dicts: correlations[a][b] holds the
# value of pl.corr(a, b) evaluated on the numeric feature frame.
correlations = {
    row_name: {
        col_name: df_numerical.select(pl.corr(row_name, col_name)).item()
        for col_name in numerical_features
    }
    for row_name in numerical_features
}

In [9]:
# Show the nested correlation dict (outer key = first feature, inner key = second).
correlations

{'number_of_courses_viewed': {'number_of_courses_viewed': 1.0,
  'annual_income': 0.009770285756444626,
  'interaction_count': -0.023565222882887944,
  'lead_score': -0.004878998354681265},
 'annual_income': {'number_of_courses_viewed': 0.009770285756444633,
  'annual_income': 1.0,
  'interaction_count': 0.027036472404814337,
  'lead_score': 0.015609546050138964},
 'interaction_count': {'number_of_courses_viewed': -0.02356522288288794,
  'annual_income': 0.02703647240481433,
  'interaction_count': 1.0,
  'lead_score': 0.009888182496913107},
 'lead_score': {'number_of_courses_viewed': -0.004878998354681265,
  'annual_income': 0.01560954605013897,
  'interaction_count': 0.009888182496913112,
  'lead_score': 1.0}}

In [10]:
# Print the matrix one row per feature: a 30-char label followed by the
# row's coefficients at 4 decimals. The trailing "" argument reproduces the
# space that follows the last value on each line.
for row_name in numerical_features:
    cells = [f"{correlations[row_name][col_name]:7.4f}" for col_name in numerical_features]
    print(f"{row_name:30s}", *cells, "")

number_of_courses_viewed        1.0000  0.0098 -0.0236 -0.0049 
annual_income                   0.0098  1.0000  0.0270  0.0156 
interaction_count              -0.0236  0.0270  1.0000  0.0099 
lead_score                     -0.0049  0.0156  0.0099  1.0000 


In [12]:
# Feature pairs to compare, used as (row, column) keys into `correlations`.
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

# Look up and report the signed coefficient of every candidate pair.
pair_correlations = {}
for feat1, feat2 in pairs:
    corr_value = correlations[feat1][feat2]
    pair_correlations[(feat1, feat2)] = corr_value
    print(f"{feat1} <-> {feat2}: {corr_value:.4f}")

# Find the pair with the strongest correlation. Strength is the magnitude of
# the coefficient, so rank on abs(): ranking on the signed value would let a
# weak positive correlation beat a strong negative one. The signed value is
# still what gets printed.
max_pair = max(pair_correlations.items(), key=lambda item: abs(item[1]))
print(f"   {max_pair[0][0]} and {max_pair[0][1]}: {max_pair[1]:.4f}")

interaction_count <-> lead_score: 0.0099
number_of_courses_viewed <-> lead_score: -0.0049
number_of_courses_viewed <-> interaction_count: -0.0236
annual_income <-> interaction_count: 0.0270
   annual_income and interaction_count: 0.0270


In [14]:
### Question 3

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

# This cell relies on `df_prepared` and `categorical_cols` from the
# missing-value preparation cell above; run that cell first.

# ============================================================
# Split the data (from Q2)
# ============================================================
print("\n" + "="*60)
print("Splitting the data (60%/20%/20%)")
print("="*60)

# Convert to pandas for sklearn
df_pandas = df_prepared.to_pandas()

# Separate features and target
X = df_pandas.drop('converted', axis=1)
y = df_pandas['converted']

print(f"\nOriginal dataset: {len(X)} rows")
print(f"Target distribution:\n{y.value_counts()}")

# First split: 60% train, 40% temp (which will be split into val and test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Second split: split the 40% into 20% val and 20% test (50-50 split of the 40%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(f"\nTrain set: {len(X_train)} rows ({len(X_train)/len(X)*100:.1f}%)")
print(f"Val set:   {len(X_val)} rows ({len(X_val)/len(X)*100:.1f}%)")
print(f"Test set:  {len(X_test)} rows ({len(X_test)/len(X)*100:.1f}%)")


categorical_features = [col for col in categorical_cols if col != 'converted']
print(f"\nCategorical features: {categorical_features}")

# Integer-encode each categorical column; encoders are fitted on the
# training split only and kept in `encoders` keyed by column name.
print("\nEncoding categorical features...")
X_train_encoded = X_train.copy()

encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train[col].astype(str))
    encoders[col] = le

X_train_categorical = X_train_encoded[categorical_features]

# Mutual information between each encoded categorical feature and the target,
# computed on the training split.
mi_scores = mutual_info_classif(
    X_train_categorical,
    y_train,
    discrete_features=True,  # All features are categorical (encoded)
    random_state=42
)

# Keep the unrounded scores for ranking; the 2-decimal copies in `mi_dict`
# are for display only. Ranking on rounded values would turn near-ties
# (e.g. two features that both print as 0.02) into a pick by dict order.
mi_raw = dict(zip(categorical_features, mi_scores))
mi_dict = {feature: round(score, 2) for feature, score in mi_raw.items()}

for feature in sorted(mi_dict.keys()):
    print(f"{feature:25s}: {mi_dict[feature]:.2f}")

question_vars = ['industry', 'location', 'lead_source', 'employment_status']
question_scores = {var: mi_dict[var] for var in question_vars if var in mi_dict}

for var in question_vars:
    print(f"{var:25s}: {question_scores[var]:.2f}")

# Find the variable with the biggest MI score (decided on the raw score,
# reported with its rounded value as a (name, rounded_score) tuple).
best_var = max(question_scores, key=lambda var: mi_raw[var])
max_var = (best_var, question_scores[best_var])
print(f"\n🎯 ANSWER: The variable with the biggest mutual information score is:")
print(f"   '{max_var[0]}' with a score of {max_var[1]:.2f}")


Splitting the data (60%/20%/20%)

Original dataset: 1462 rows
Target distribution:
converted
1    905
0    557
Name: count, dtype: int64

Train set: 877 rows (60.0%)
Val set:   292 rows (20.0%)
Test set:  293 rows (20.0%)

QUESTION 3: Mutual Information Score

Categorical features: ['lead_source', 'industry', 'employment_status', 'location']

Encoding categorical features...
employment_status        : 0.02
industry                 : 0.02
lead_source              : 0.03
location                 : 0.00
industry                 : 0.02
location                 : 0.00
lead_source              : 0.03
employment_status        : 0.02

🎯 ANSWER: The variable with the biggest mutual information score is:
   'lead_source' with a score of 0.03


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import numpy as np


# Pandas copy of the prepared frame for scikit-learn
df_pandas = df_prepared.to_pandas()

# Target column vs. everything else
y = df_pandas['converted']
X = df_pandas.drop(columns=['converted'])

# 60% train / 40% holdout, then the holdout is halved into val and test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Train set: {len(X_train)} rows")
print(f"Val set:   {len(X_val)} rows")
print(f"Test set:  {len(X_test)} rows")



# Feature lists by kind, with the target left out
categorical_features = [name for name in categorical_cols if name != 'converted']
numerical_features = [name for name in numerical_cols if name != 'converted']

print(f"\nCategorical features: {categorical_features}")
print(f"Numerical features: {numerical_features}")

# One-hot encoder learned from the training split only; categories first
# seen in val/test are encoded as all zeros (handle_unknown='ignore').
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(X_train[categorical_features])

X_train_cat_encoded = ohe.transform(X_train[categorical_features])
X_val_cat_encoded = ohe.transform(X_val[categorical_features])
X_test_cat_encoded = ohe.transform(X_test[categorical_features])

print(f"Original categorical features: {len(categorical_features)}")
print(f"After one-hot encoding: {X_train_cat_encoded.shape[1]} features")

# Numeric columns are used as-is (no scaling is applied in this cell)
X_train_num = X_train[numerical_features].to_numpy()
X_val_num = X_val[numerical_features].to_numpy()
X_test_num = X_test[numerical_features].to_numpy()

print(f"Numerical features: {X_train_num.shape[1]}")

# Design matrices: numeric block on the left, one-hot block on the right
X_train_final = np.hstack([X_train_num, X_train_cat_encoded])
X_val_final = np.hstack([X_val_num, X_val_cat_encoded])
X_test_final = np.hstack([X_test_num, X_test_cat_encoded])

print(f"Final feature count: {X_train_final.shape[1]}")
print(f"  - Numerical: {X_train_num.shape[1]}")
print(f"  - One-hot encoded categorical: {X_train_cat_encoded.shape[1]}")

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

print("Fitting model on training data...")
model.fit(X_train_final, y_train)

# Score the fitted model on the validation split
y_val_pred = model.predict(X_val_final)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_accuracy_rounded = round(val_accuracy, 2)

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Rounded to 2 decimals: {val_accuracy_rounded}")

print(f"ANSWER: The validation accuracy is: {val_accuracy_rounded}")

# Training accuracy next to validation accuracy, for comparison
train_accuracy = accuracy_score(y_train, model.predict(X_train_final))
print(f"Training set accuracy: {train_accuracy:.4f}")
print(f"Validation set accuracy: {val_accuracy:.4f}")

Train set: 877 rows
Val set:   292 rows
Test set:  293 rows

Categorical features: ['lead_source', 'industry', 'employment_status', 'location']
Numerical features: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
Original categorical features: 4
After one-hot encoding: 27 features
Numerical features: 4
Final feature count: 31
  - Numerical: 4
  - One-hot encoded categorical: 27
Fitting model on training data...
Validation Accuracy: 0.7432
Rounded to 2 decimals: 0.74
ANSWER: The validation accuracy is: 0.74
Training set accuracy: 0.7469
Validation set accuracy: 0.7432


In [17]:
# Rebuild the two feature lists (target excluded) and their concatenation,
# categorical names first.
categorical_features = list(filter(lambda name: name != 'converted', categorical_cols))
numerical_features = list(filter(lambda name: name != 'converted', numerical_cols))
all_features = [*categorical_features, *numerical_features]

print(f"\nAll features: {all_features}")

def train_and_evaluate(cat_features, num_features):
    """Fit a logistic regression on the chosen columns and score it on validation.

    Reads the module-level splits ``X_train``, ``X_val``, ``y_train`` and
    ``y_val``, so those must already exist when this is called.

    Args:
        cat_features: Names of categorical columns to one-hot encode. The
            encoder is fitted on ``X_train`` only. May be empty.
        num_features: Names of numeric columns to use as-is. May be empty.

    Returns:
        Accuracy of the fitted model on the validation split.
    """
    # Fit the encoder up front so both splits are transformed identically.
    encoder = None
    if len(cat_features) > 0:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoder.fit(X_train[cat_features])

    def build_matrix(frame):
        # A missing feature group contributes a zero-width block so the
        # horizontal stack below always receives two 2-D arrays.
        n_rows = len(frame)
        if len(num_features) > 0:
            numeric_part = frame[num_features].values
        else:
            numeric_part = np.empty((n_rows, 0))
        if encoder is not None:
            categorical_part = encoder.transform(frame[cat_features])
        else:
            categorical_part = np.empty((n_rows, 0))
        return np.hstack([numeric_part, categorical_part])

    classifier = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    classifier.fit(build_matrix(X_train), y_train)

    return accuracy_score(y_val, classifier.predict(build_matrix(X_val)))



# Reference point: validation accuracy with every feature included.
baseline_accuracy = train_and_evaluate(categorical_features, numerical_features)
print(f"Baseline accuracy (all features): {baseline_accuracy:.6f}")

# Leave-one-feature-out: retrain without each feature and record how the
# validation accuracy moves relative to the baseline.
feature_importance = {}

for feature in all_features:
    # Exclude this feature
    if feature in categorical_features:
        cat_feats = [f for f in categorical_features if f != feature]
        num_feats = numerical_features
    else:
        cat_feats = categorical_features
        num_feats = [f for f in numerical_features if f != feature]

    # Train model without this feature
    accuracy_without = train_and_evaluate(cat_feats, num_feats)

    # difference = baseline - accuracy_without: positive means accuracy
    # dropped without the feature, negative means accuracy went up.
    difference = baseline_accuracy - accuracy_without
    feature_importance[feature] = {
        'accuracy_without': accuracy_without,
        'difference': difference
    }

    print(f"Without '{feature}': accuracy = {accuracy_without:.6f}, diff = {difference:.6f}")


print("\nFeatures sorted by importance (difference):")
sorted_features = sorted(feature_importance.items(), key=lambda x: x[1]['difference'])

for feature, info in sorted_features:
    print(f"{feature:30s}: diff = {info['difference']:.6f}")


question_features = ['industry', 'employment_status', 'lead_score']
for feature in question_features:
    if feature in feature_importance:
        diff = feature_importance[feature]['difference']
        print(f"{feature:30s}: {diff:.6f}")

# Find the feature whose removal changes accuracy the least. "Smallest
# impact" is about magnitude, so compare abs(difference): taking the signed
# minimum would instead select the feature whose removal raises accuracy the
# most, which contradicts the message printed below. On ties, the feature
# that appears first in `feature_importance` wins.
smallest_diff_feature = min(
    [(f, info['difference']) for f, info in feature_importance.items()
     if f in question_features],
    key=lambda pair: abs(pair[1])
)

print(f"\n ANSWER: The feature with the SMALLEST difference is:")
print(f"   '{smallest_diff_feature[0]}' with difference = {smallest_diff_feature[1]:.6f}")
print(f"\nThis means '{smallest_diff_feature[0]}' is the LEAST USEFUL feature")
print("(removing it has the smallest impact on accuracy)")


All features: ['lead_source', 'industry', 'employment_status', 'location', 'number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
Baseline accuracy (all features): 0.743151
Without 'lead_source': accuracy = 0.729452, diff = 0.013699
Without 'industry': accuracy = 0.743151, diff = 0.000000
Without 'employment_status': accuracy = 0.746575, diff = -0.003425
Without 'location': accuracy = 0.743151, diff = 0.000000
Without 'number_of_courses_viewed': accuracy = 0.678082, diff = 0.065068
Without 'annual_income': accuracy = 0.856164, diff = -0.113014
Without 'interaction_count': accuracy = 0.674658, diff = 0.068493
Without 'lead_score': accuracy = 0.743151, diff = 0.000000

Features sorted by importance (difference):
annual_income                 : diff = -0.113014
employment_status             : diff = -0.003425
industry                      : diff = 0.000000
location                      : diff = 0.000000
lead_score                    : diff = 0.000000
lead_source 

In [18]:
# Rebuild the pandas frame and the same 60/20/20 split used earlier
df_pandas = df_prepared.to_pandas()

y = df_pandas['converted']
X = df_pandas.drop(columns=['converted'])

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Data split complete!")
print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")


categorical_features = [name for name in categorical_cols if name != 'converted']
numerical_features = [name for name in numerical_cols if name != 'converted']

print(f"\nCategorical features: {len(categorical_features)}")
print(f"Numerical features: {len(numerical_features)}")

# One-hot encode the categorical columns (encoder learned on train only)
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(X_train[categorical_features])
X_train_cat_encoded = ohe.transform(X_train[categorical_features])
X_val_cat_encoded = ohe.transform(X_val[categorical_features])

# Numeric columns, taken as-is
X_train_num = X_train[numerical_features].to_numpy()
X_val_num = X_val[numerical_features].to_numpy()

# Design matrices: numeric block followed by the one-hot block
X_train_final = np.hstack([X_train_num, X_train_cat_encoded])
X_val_final = np.hstack([X_val_num, X_val_cat_encoded])

print(f"Final feature count: {X_train_final.shape[1]}")

# Candidate regularization strengths, in ascending order
C_values = [0.01, 0.1, 1, 10, 100]

print(f"\nTrying C values: {C_values}")
print("\nNote: Smaller C = stronger regularization")
print("      Larger C = weaker regularization\n")

# Validation accuracy (raw and rounded to 3 decimals) keyed by C
results = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_final, y_train)

    y_val_pred = model.predict(X_val_final)

    accuracy = accuracy_score(y_val, y_val_pred)
    accuracy_rounded = round(accuracy, 3)

    results[C] = dict(accuracy=accuracy, accuracy_rounded=accuracy_rounded)

    print(f"C = {C:6.2f} -> Validation Accuracy = {accuracy:.6f} (rounded: {accuracy_rounded})")

# Best first. sorted() is stable even with reverse=True, so C values with
# equal accuracy keep their insertion order and the smallest tied C ranks first.
sorted_results = sorted(results.items(), key=lambda entry: entry[1]['accuracy'], reverse=True)

print("\nC values ranked by validation accuracy:")
for rank, (c_value, scores) in enumerate(sorted_results, 1):
    if rank == 1:
        marker = "🎯"
    else:
        marker = "  "
    print(f"{marker} {rank}. C = {c_value:6.2f} -> Accuracy = {scores['accuracy_rounded']}")

best_C, best_scores = sorted_results[0]
best_accuracy = best_scores['accuracy_rounded']

print(f"\n🎯 ANSWER: The best C value is: {best_C}")
print(f"   Validation accuracy: {best_accuracy}")

# Closing notes on how to read the C parameter
print("\n" + "-"*60)
print("Understanding the results:")
print("-"*60)
print("C parameter controls regularization strength:")
print("  - Small C (e.g., 0.01) = Strong regularization = Simpler model")
print("  - Large C (e.g., 100) = Weak regularization = More complex model")

Data split complete!
Train: 877, Val: 292, Test: 293

Categorical features: 4
Numerical features: 4
Final feature count: 31

Trying C values: [0.01, 0.1, 1, 10, 100]

Note: Smaller C = stronger regularization
      Larger C = weaker regularization

C =   0.01 -> Validation Accuracy = 0.743151 (rounded: 0.743)
C =   0.10 -> Validation Accuracy = 0.743151 (rounded: 0.743)
C =   1.00 -> Validation Accuracy = 0.743151 (rounded: 0.743)
C =  10.00 -> Validation Accuracy = 0.743151 (rounded: 0.743)
C = 100.00 -> Validation Accuracy = 0.743151 (rounded: 0.743)

C values ranked by validation accuracy:
🎯 1. C =   0.01 -> Accuracy = 0.743
   2. C =   0.10 -> Accuracy = 0.743
   3. C =   1.00 -> Accuracy = 0.743
   4. C =  10.00 -> Accuracy = 0.743
   5. C = 100.00 -> Accuracy = 0.743

🎯 ANSWER: The best C value is: 0.01
   Validation accuracy: 0.743

------------------------------------------------------------
Understanding the results:
------------------------------------------------------------