<a href="https://colab.research.google.com/github/Nadir-Git/machine-learning-zoomcamp-homework/blob/main/03-Classification/ML2025_HW3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Solution 1**

In [12]:
import pandas as pd

# Source CSV for the lead-scoring homework dataset.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

# Read the data and, in the same step, turn missing 'industry' values into the
# literal label 'NA' so they are counted as a category of their own.
df = pd.read_csv(url).assign(
    industry=lambda frame: frame['industry'].fillna('NA')
)

# mode() returns every value tied for most frequent; take the first one.
industry_modes = df['industry'].mode()
most_frequent = industry_modes.iloc[0]
print("Most frequent observation in 'industry' column:", most_frequent)

Most frequent observation in 'industry' column: retail


# **Solution 2**

In [13]:
# Dtypes treated as "numerical" throughout this cell.
numeric_dtypes = ['float64', 'int64']

# Replace missing values with 0.0 in every numerical column of df.
numeric_column_names = df.select_dtypes(include=numeric_dtypes).columns
for column_name in numeric_column_names:
    df[column_name] = df[column_name].fillna(0.0)

numerical_features = df.select_dtypes(include=numeric_dtypes)

# Pairwise correlation between all numerical columns.
corr_matrix = numerical_features.corr()
print("Correlation matrix:\n", corr_matrix)

# Candidate pairs to compare against each other.
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

# Keep the pair whose correlation has the largest magnitude; the signed value
# is what gets stored and reported. A later pair must be strictly larger to
# replace an earlier one.
max_pair, max_corr = None, 0
for first_feature, second_feature in pairs:
    corr_value = corr_matrix.at[first_feature, second_feature]
    if abs(corr_value) > abs(max_corr):
        max_pair, max_corr = (first_feature, second_feature), corr_value

print("\nPair with the biggest correlation:", max_pair, "with correlation:", max_corr)

Correlation matrix:
                           number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  

Pair with the biggest correlation: ('annual_income', 'interaction_count') with correlation: 0.02703647240481443


# **Solution 3**

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif

# Target vector and feature frame.
y = df['converted']
X = df.drop(columns=['converted'])

# Two-stage split: 20% held out for test, then 25% of the remaining 80%
# (i.e. 20% of the whole) for validation, leaving 60% for training.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

categorical_cols = X_train.select_dtypes(include=['object']).columns

# Integer-encode each categorical column so it can be passed as a discrete
# feature. NOTE(review): pd.factorize encodes missing values as -1 by default,
# so NaN is treated as its own category here - confirm that is intended.
X_train_encoded = X_train[categorical_cols].apply(lambda col: pd.factorize(col)[0])

mi_scores = mutual_info_classif(X_train_encoded, y_train, discrete_features=True, random_state=42)

# Keep the raw scores for ranking; round only for display. Ranking on the
# rounded values would turn distinct scores (e.g. 0.014 vs 0.006) into a tie
# that is silently resolved by column order.
mi_scores_raw = {col: float(score) for col, score in zip(categorical_cols, mi_scores)}
mi_scores_rounded = {col: round(score, 2) for col, score in mi_scores_raw.items()}

print("Mutual information scores:", mi_scores_rounded)

max_mi_feature = max(mi_scores_raw, key=mi_scores_raw.get)
print("Categorical variable with the biggest MI score:", max_mi_feature)

Mutual information scores: {'lead_source': np.float64(0.04), 'industry': np.float64(0.01), 'employment_status': np.float64(0.01), 'location': np.float64(0.0)}
Categorical variable with the biggest MI score: lead_source


# **Solution 4**

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

import numpy as np

# Split the training columns into text-valued (object) and everything else.
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(exclude=['object']).columns

# One-hot encode the categorical columns. The encoder is fitted on the
# training split only; categories unseen at fit time are ignored on transform.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_ohe = ohe.fit_transform(X_train[categorical_cols])
X_val_ohe = ohe.transform(X_val[categorical_cols])


def _stack_features(frame, encoded):
    """Place the numerical columns of `frame` to the left of the one-hot block."""
    return np.hstack([frame[numerical_cols].values, encoded])


X_train_final = _stack_features(X_train, X_train_ohe)
X_val_final = _stack_features(X_val, X_val_ohe)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_final, y_train)

# Score on the validation split and report accuracy to two decimals.
y_val_pred = model.predict(X_val_final)
accuracy = round(accuracy_score(y_val, y_val_pred), 2)
print("Validation accuracy:", accuracy)

Validation accuracy: 0.7


# **Solution 5**

In [16]:
# Column labels in the same order as the stacked matrices: numerical columns
# first, followed by the encoder's one-hot output names.
ohe_features = ohe.get_feature_names_out(categorical_cols)
all_features = [*numerical_cols, *ohe_features]

# Baseline: the full-feature model, scored (unrounded) on the validation split.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_final, y_train)
baseline_predictions = model.predict(X_val_final)
original_acc = accuracy_score(y_val, baseline_predictions)

def feature_drop_accuracy(feature_name):
    """Measure how validation accuracy changes when one original feature is removed.

    A numerical feature is removed by dropping its single column; a categorical
    feature is removed by dropping every one-hot column named
    ``<feature_name>_<category>``. A fresh model with the same hyperparameters
    as the baseline is trained on the reduced matrix.

    Parameters
    ----------
    feature_name : str
        Name of a column in ``numerical_cols`` or ``categorical_cols``.

    Returns
    -------
    float
        ``original_acc`` minus the accuracy without the feature. Positive means
        accuracy fell when the feature was removed.

    Raises
    ------
    ValueError
        If ``feature_name`` is neither a numerical nor a categorical column.
        Without this check an unknown name would drop nothing and report a
        difference of 0.0, making it look like the least useful feature.
    """
    # all_features lists the numerical columns first, then the one-hot columns.
    n_numeric = len(numerical_cols)

    if feature_name in numerical_cols:
        idx_to_keep = [i for i, f in enumerate(all_features) if f != feature_name]
    elif feature_name in categorical_cols:
        prefix = feature_name + "_"
        # Only one-hot columns (index >= n_numeric) are candidates for removal,
        # so a numerical column that happens to share the prefix is never dropped.
        # NOTE(review): two categorical columns where one name is a prefix of the
        # other (e.g. 'lead' and 'lead_source') would still collide here.
        idx_to_keep = [
            i for i, f in enumerate(all_features)
            if i < n_numeric or not f.startswith(prefix)
        ]
    else:
        raise ValueError(f"Unknown feature: {feature_name!r}")

    X_train_mod = X_train_final[:, idx_to_keep]
    X_val_mod = X_val_final[:, idx_to_keep]

    model_mod = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_mod.fit(X_train_mod, y_train)
    return original_acc - accuracy_score(y_val, model_mod.predict(X_val_mod))

features_to_check = ['industry', 'employment_status', 'lead_score']

# Accuracy change caused by removing each candidate feature in turn.
differences = {}
for feat in features_to_check:
    differences[feat] = feature_drop_accuracy(feat)

print("Accuracy differences:", differences)

# The least useful feature is the one whose removal moves accuracy the least
# in absolute terms; on a tie the first one listed wins.
least_useful_feature = min(differences.items(), key=lambda item: abs(item[1]))[0]
print("Least useful feature:", least_useful_feature)

Accuracy differences: {'industry': 0.0, 'employment_status': 0.0034129692832763903, 'lead_score': -0.0068259385665528916}
Least useful feature: industry


# **Solution 6**

In [17]:
# Values of the regularization parameter C to compare, in ascending order.
C_values = [0.01, 0.1, 1, 10, 100]

# Validation accuracy (rounded to 3 decimals) keyed by C.
results = {}
for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_final, y_train)
    val_predictions = model.predict(X_val_final)
    acc = round(accuracy_score(y_val, val_predictions), 3)
    results[C] = acc

print("Validation accuracies for different C values:", results)

# max() returns the first key with the top score, and the dict preserves the
# order of C_values, so a tie is resolved in favour of the earliest C listed.
best_C = max(results.items(), key=lambda item: item[1])[0]
print("Best C value:", best_C)

Validation accuracies for different C values: {0.01: 0.7, 0.1: 0.7, 1: 0.7, 10: 0.7, 100: 0.7}
Best C value: 0.01
