In [67]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [68]:
file_path = "bank.csv"
df = pd.read_csv(file_path)

In [69]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [70]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [71]:
le = LabelEncoder()
categorical_columns = df.columns[df.dtypes == 'object']

for col in categorical_columns:
  df[col] = le.fit_transform(df[col])

X = df.drop('deposit', axis=1)
y = df['deposit']

In [72]:
scaler = StandardScaler().set_output(transform="pandas")
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# print the count of positive and negative values
print(y_train.value_counts())
print(y_test.value_counts())

deposit
0    4707
1    4222
Name: count, dtype: int64
deposit
0    1166
1    1067
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import StratifiedKFold

models = [
  LogisticRegression(),
]
scores = {}
scores2 = []

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_splits = skf.split(X, y)
for model in models:
  train_accuracies = []
  train_errors = []
  test_accuracies = []
  test_errors = []

  for train_index, test_index in fold_splits:

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    m_train, m_test = X_train.shape[0], X_test.shape[0]
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    train_accuracies.append(accuracy_score(y_train, y_pred_train))
    train_errors.append(1/(2*m_train) * np.sum((y_pred_train - y_train)**2))
    test_accuracies.append(accuracy_score(y_test, y_pred_test))
    test_errors.append(1/(2*m_test) * np.sum((y_pred_test - y_test)**2))
  
  scores[model] = {
    'train_accuracy': np.mean(train_accuracies),
    'train_error': np.mean(train_errors),
    'test_accuracy': np.mean(test_accuracies),
    'test_error': np.mean(test_errors)
  }

print(scores)

# y_pred = model.predict(X_train_scaled)
# print('Accuracy:', accuracy_score(y_train, y_pred))
# print('Confusion Matrix:\n', confusion_matrix(y_train, y_pred))
# print('Classification Report:\n', classification_report(y_train, y_pred))

{LogisticRegression(): {'train_accuracy': 0.7967508584453772, 'train_error': 0.10162457077731138, 'test_accuracy': 0.7959136736586414, 'test_error': 0.10204316317067927}}
0.7967508584453772 0.10162457077731138 0.7959136736586414 0.10204316317067927


In [74]:
"""
Step 2
Test the optimized models from step 1 with the CV set and choose the
model with the min CV error (or other performance metric with dev data)
"""
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold



print("Logistic Regression Parameters: ", LogisticRegression().get_params())
model = GridSearchCV(
  estimator=LogisticRegression(random_state=42, class_weight='balanced'),
  cv=10,
  scoring="roc_auc", # what score should it be
  param_grid={
    'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.5, 1, 10, 11, 15, 20, 100, 1000],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2']
  },
  return_train_score=True,
)
model.fit(X_train_scaled, y_train)
print('Best Parameters:', model.best_params_)
print('Best Score:', model.best_score_)

Logistic Regression Parameters:  {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


ValueError: Found input variables with inconsistent numbers of samples: [8929, 10046]

In [None]:
"""
Step 3
Retrain the best model from step 2 with both train and CV sets
starting from the parameters got at step 2. Test the retrained model with test
set and compute test data perf. metric (the real model performance !!!):
"""
# 0.8100297862947545 -> 2
# 0.8100297862947545 

from sklearn.metrics import f1_score


best_model_lr = model.best_estimator_
print(best_model_lr.get_params()["C"])
print(model.best_params_["C"])
best_model_lr.fit(X_train_scaled, y_train)
y_pred = best_model_lr.predict(X_test_scaled)
print('F1 Score:', f1_score(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))

0.1
0.1
F1 Score: 0.8185693983893889
Accuracy: 0.8284818629646216
Confusion Matrix:
 [[986 189]
 [194 864]]
Classification Report:
               precision    recall  f1-score   support

       False       0.84      0.84      0.84      1175
        True       0.82      0.82      0.82      1058

    accuracy                           0.83      2233
   macro avg       0.83      0.83      0.83      2233
weighted avg       0.83      0.83      0.83      2233

