In [118]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold

In [119]:
file_path = "bank.csv"
df = pd.read_csv(file_path)

In [120]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [121]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [122]:
le = LabelEncoder()
categorical_columns = df.columns[df.dtypes == 'object']

for col in categorical_columns:
  df[col] = le.fit_transform(df[col])

X = df.drop('deposit', axis=1)
y = df['deposit']

In [123]:
scaler = StandardScaler().set_output(transform="pandas")
X = scaler.fit_transform(X)
selector = VarianceThreshold().set_output(transform="pandas")
print(X.columns)
print(selector.fit_transform(X).columns)

# DOUBT: What is this for now? Can I delete this train_test_split?
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(y_train.value_counts())
print(y_test.value_counts())

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'],
      dtype='object')
Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'],
      dtype='object')
deposit
0    4707
1    4222
Name: count, dtype: int64
deposit
0    1166
1    1067
Name: count, dtype: int64


In [124]:
"""
Step 1 & 2 Merged

Step 1: Optimize parameters  (to minimize some cost function J) using the
same training set for all models. Compute some perf. metrics with the
training data (i.e. error, accuracy)

Step 2: Test the optimized models from step 1 with the CV set and choose the
model with the min CV error (or other performance metric with dev data)
"""
models = [
  LogisticRegression(),
]
scores = {}
scores2 = []

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_splits = skf.split(X, y)
for model in models:
  train_accuracies = []
  train_errors = []
  test_accuracies = []
  test_errors = []

  for train_index, test_index in fold_splits:

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    m_train, m_test = X_train.shape[0], X_test.shape[0]
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    train_accuracies.append(accuracy_score(y_train, y_pred_train))
    train_errors.append(1/(2*m_train) * np.sum((y_pred_train - y_train)**2))
    test_accuracies.append(accuracy_score(y_test, y_pred_test))
    test_errors.append(1/(2*m_test) * np.sum((y_pred_test - y_test)**2))

  # DOUBT: Is mean square error being well calculated??
  # DOUBT: Where should we play with the hyperparameters??
  
  scores[model] = {
    'train_accuracy': np.mean(train_accuracies),
    'train_error': np.mean(train_errors),
    'test_accuracy': np.mean(test_accuracies),
    'test_error': np.mean(test_errors)
  }

# DOUBT: Select the best model based on the test_accuracy or test_error ??
best_model = max(scores, key=lambda x: scores[x]['test_accuracy'])
print('Best Model:', best_model, scores[best_model])

# y_pred = model.predict(X_train_scaled)
# print('Accuracy:', accuracy_score(y_train, y_pred))
# print('Confusion Matrix:\n', confusion_matrix(y_train, y_pred))
# print('Classification Report:\n', classification_report(y_train, y_pred))

Best Model: LogisticRegression() {'train_accuracy': 0.7967508584453772, 'train_error': 0.10162457077731138, 'test_accuracy': 0.7959136736586414, 'test_error': 0.10204316317067927}


In [125]:
"""
Step 3
Retrain the best model from step 2 with both train and CV sets
starting from the parameters got at step 2. Test the retrained model with test
set and compute test data perf. metric (the real model performance !!!):
"""

# DOUBT: Do we use the first split now?
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

best_model.fit(X_train, y_train)
print("Logistic Regression Parameters: ", LogisticRegression().get_params())
model = GridSearchCV(
  estimator=best_model,
  cv=10,
  scoring="f1", # what score should it be
  param_grid={
    'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.5, 1, 10, 11, 15, 20, 100, 1000],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2']
  },
  return_train_score=True,
)
print('Best Parameters:', model)
print("accuracy score: ", accuracy_score(y_test, best_model.predict(X_test)))

Logistic Regression Parameters:  {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Best Parameters: GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.5, 1, 10,
                               11, 15, 20, 100, 1000],
                         'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'saga']},
             return_train_score=True, scoring='f1')
accuracy score:  0.7899686520376176
