In [617]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder


In [618]:
file_path = "bank.csv"
df = pd.read_csv(file_path)

In [619]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [620]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [621]:
df['deposit'] = LabelEncoder().fit_transform(df['deposit'])
categorical_columns = df.columns[df.dtypes == 'object']
numerical_columns = df.columns[df.dtypes != 'object'].drop('deposit')
print("numerical_columns:", numerical_columns)

# encoder = LabelEncoder()
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

df[numerical_columns] = StandardScaler().fit_transform(df[numerical_columns])

X = df.drop('deposit', axis=1)
y = df['deposit']

numerical_columns: Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], dtype='object')


In [622]:
numerical_columns = X.select_dtypes(include=[np.number]).columns
categorical_columns = X.select_dtypes(exclude=[np.number]).columns

for column in X.columns:
  print(f"{column}: {len(X[column].unique())}", type(X[column]))

age: 76 <class 'pandas.core.series.Series'>
balance: 3805 <class 'pandas.core.series.Series'>
day: 31 <class 'pandas.core.series.Series'>
duration: 1428 <class 'pandas.core.series.Series'>
campaign: 36 <class 'pandas.core.series.Series'>
pdays: 472 <class 'pandas.core.series.Series'>
previous: 34 <class 'pandas.core.series.Series'>
job_blue-collar: 2 <class 'pandas.core.series.Series'>
job_entrepreneur: 2 <class 'pandas.core.series.Series'>
job_housemaid: 2 <class 'pandas.core.series.Series'>
job_management: 2 <class 'pandas.core.series.Series'>
job_retired: 2 <class 'pandas.core.series.Series'>
job_self-employed: 2 <class 'pandas.core.series.Series'>
job_services: 2 <class 'pandas.core.series.Series'>
job_student: 2 <class 'pandas.core.series.Series'>
job_technician: 2 <class 'pandas.core.series.Series'>
job_unemployed: 2 <class 'pandas.core.series.Series'>
job_unknown: 2 <class 'pandas.core.series.Series'>
marital_married: 2 <class 'pandas.core.series.Series'>
marital_single: 2 <clas

In [623]:
# Step 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(y_train.unique())

[1 0]


In [624]:
# Step 2
lr = LogisticRegression(class_weight='balanced')
param_grid = {
  "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
  'solver': ['liblinear', 'saga']
}
cv = 10
scoring = 'roc_auc'

grid_search = GridSearchCV(lr, param_grid, cv=10, scoring=scoring)
grid_search.fit(X_train, y_train)
tuned_lr = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)
print(tuned_lr)


# model = GridSearchCV(
#   estimator=best_model,
#   cv=10,
#   scoring="f1", # DOUBT: what score should we try to maximize in this kind of problem?
#   param_grid={
#     'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.5, 1, 10, 11, 15, 20, 100, 1000],
#     'solver': ['liblinear', 'saga'],
#     'penalty': ['l1', 'l2']
#   },
#   return_train_score=True,
# )


Best parameters found:  {'C': 1, 'solver': 'saga'}
LogisticRegression(C=1, class_weight='balanced', solver='saga')


In [625]:
# Step 3
print(tuned_lr)
tuned_lr.fit(X_train, y_train)
y_pred_test = tuned_lr.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred_test))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred_test))
print("Classification Report: ", classification_report(y_test, y_pred_test))

tuned_lr = LogisticRegression()
print(tuned_lr)
tuned_lr.fit(X_train, y_train)
y_pred_test = tuned_lr.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred_test))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred_test))
print("Classification Report: ", classification_report(y_test, y_pred_test))

LogisticRegression(C=1, class_weight='balanced', solver='saga')
Accuracy:  0.8087774294670846
Confusion Matrix:  [[957 209]
 [218 849]]
Classification Report:                precision    recall  f1-score   support

           0       0.81      0.82      0.82      1166
           1       0.80      0.80      0.80      1067

    accuracy                           0.81      2233
   macro avg       0.81      0.81      0.81      2233
weighted avg       0.81      0.81      0.81      2233

LogisticRegression()
Accuracy:  0.8078817733990148
Confusion Matrix:  [[971 195]
 [234 833]]
Classification Report:                precision    recall  f1-score   support

           0       0.81      0.83      0.82      1166
           1       0.81      0.78      0.80      1067

    accuracy                           0.81      2233
   macro avg       0.81      0.81      0.81      2233
weighted avg       0.81      0.81      0.81      2233

