In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [13]:
file_path = "bank.csv"
df = pd.read_csv(file_path)

In [14]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [15]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [16]:
df['deposit'] = LabelEncoder().fit_transform(df['deposit'])

categorical_columns = df.columns[df.dtypes == 'object']
binary_category_columns = [col for col in categorical_columns if df[col].nunique() == 2]
n_category_columns = [col for col in categorical_columns if df[col].nunique() > 2]
numerical_columns = df.columns[df.dtypes != 'object'].drop('deposit')

df = pd.get_dummies(df, columns=binary_category_columns, drop_first=True)
df = pd.get_dummies(df, columns=n_category_columns)

df[numerical_columns] = StandardScaler().fit_transform(df[numerical_columns])

X = df.drop('deposit', axis=1)
y = df['deposit']

In [17]:
numerical_columns = X.select_dtypes(include=[np.number]).columns
categorical_columns = X.select_dtypes(exclude=[np.number]).columns

for column in X.columns:
  print(f"{column}: {len(X[column].unique())}", X[column].dtype)

print(f"{y.name}: {len(y.unique())}", y.dtype)

age: 76 float64
balance: 3805 float64
day: 31 float64
duration: 1428 float64
campaign: 36 float64
pdays: 472 float64
previous: 34 float64
default_yes: 2 bool
housing_yes: 2 bool
loan_yes: 2 bool
job_admin.: 2 bool
job_blue-collar: 2 bool
job_entrepreneur: 2 bool
job_housemaid: 2 bool
job_management: 2 bool
job_retired: 2 bool
job_self-employed: 2 bool
job_services: 2 bool
job_student: 2 bool
job_technician: 2 bool
job_unemployed: 2 bool
job_unknown: 2 bool
marital_divorced: 2 bool
marital_married: 2 bool
marital_single: 2 bool
education_primary: 2 bool
education_secondary: 2 bool
education_tertiary: 2 bool
education_unknown: 2 bool
contact_cellular: 2 bool
contact_telephone: 2 bool
contact_unknown: 2 bool
month_apr: 2 bool
month_aug: 2 bool
month_dec: 2 bool
month_feb: 2 bool
month_jan: 2 bool
month_jul: 2 bool
month_jun: 2 bool
month_mar: 2 bool
month_may: 2 bool
month_nov: 2 bool
month_oct: 2 bool
month_sep: 2 bool
poutcome_failure: 2 bool
poutcome_other: 2 bool
poutcome_success: 2 b

In [18]:
# Step 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(y_train.unique())

[0 1]


In [None]:
# Step 2
model_attributes = [
  # [model, param_grid, cv]
  [
    KNeighborsClassifier(),
    {'n_neighbors': list(range(1, 20)),'weights': ['uniform', 'distance'],'p': [1, 2],'metric' : ['minkowski','euclidean','manhattan']},
    10
  ],[
    LogisticRegression(class_weight='balanced'),
    {"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'solver': ['liblinear', 'saga']},
    10
  ],[
    RandomForestClassifier(),
    {'n_estimators': [100, 200, 400], 'max_depth': [10, 15, 20, 30, None], 'min_samples_leaf': [1, 2, 4]},
    10
  ],[
    SVC(),
    {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
    5
  ]
]

for model, param_grid, cv in model_attributes:
  print(f"Model: {model.__class__.__name__}")
  grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy')
  grid_search.fit(X_train, y_train)
  tuned_model = grid_search.best_estimator_
  print(f"Best parameters found: {grid_search.best_params_}; score: {grid_search.best_score_}")
  print(f"Tuned Model: {tuned_model}")

  tuned_model.fit(X_train, y_train)
  y_pred_test = tuned_model.predict(X_test)
  print(f"Accuracy: {accuracy_score(y_test, y_pred_test)}")
  print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_test)}")
  print(f"Classification Report:\n{classification_report(y_test, y_pred_test)}\n\n")

Model: KNeighborsClassifier
Best parameters found: {'metric': 'minkowski', 'n_neighbors': 16, 'p': 2, 'weights': 'distance'}; score: 0.8180058903579912
Tuned Model: KNeighborsClassifier(n_neighbors=16, weights='distance')
Accuracy: 0.8266905508284819
Confusion Matrix:
[[1004  171]
 [ 216  842]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.84      1175
           1       0.83      0.80      0.81      1058

    accuracy                           0.83      2233
   macro avg       0.83      0.83      0.83      2233
weighted avg       0.83      0.83      0.83      2233



Model: LogisticRegression
Best parameters found: {'C': 0.1, 'solver': 'saga'}; score: 0.8278635777020071
Tuned Model: LogisticRegression(C=0.1, class_weight='balanced', solver='saga')
Accuracy: 0.8280340349305867
Confusion Matrix:
[[982 193]
 [191 867]]
Classification Report:
              precision    recall  f1-score   support

           0       0