In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

In [70]:
file_path = "bank.csv"
df = pd.read_csv(file_path)

In [71]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [72]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [73]:
df['deposit'] = LabelEncoder().fit_transform(df['deposit'])

categorical_columns = df.columns[df.dtypes == 'object']
binary_category_columns = [col for col in categorical_columns if df[col].nunique() == 2]
n_category_columns = [col for col in categorical_columns if df[col].nunique() > 2]
numerical_columns = df.columns[df.dtypes != 'object'].drop('deposit')

df = pd.get_dummies(df, columns=binary_category_columns, drop_first=True)
df = pd.get_dummies(df, columns=n_category_columns)

df[numerical_columns] = StandardScaler().fit_transform(df[numerical_columns])

X = df.drop('deposit', axis=1)
y = df['deposit']

In [74]:
numerical_columns = X.select_dtypes(include=[np.number]).columns
categorical_columns = X.select_dtypes(exclude=[np.number]).columns

for column in X.columns:
  print(f"{column}: {len(X[column].unique())}", X[column].dtype)

print(f"{y.name}: {len(y.unique())}", y.dtype)

age: 76 float64
balance: 3805 float64
day: 31 float64
duration: 1428 float64
campaign: 36 float64
pdays: 472 float64
previous: 34 float64
default_yes: 2 bool
housing_yes: 2 bool
loan_yes: 2 bool
job_admin.: 2 bool
job_blue-collar: 2 bool
job_entrepreneur: 2 bool
job_housemaid: 2 bool
job_management: 2 bool
job_retired: 2 bool
job_self-employed: 2 bool
job_services: 2 bool
job_student: 2 bool
job_technician: 2 bool
job_unemployed: 2 bool
job_unknown: 2 bool
marital_divorced: 2 bool
marital_married: 2 bool
marital_single: 2 bool
education_primary: 2 bool
education_secondary: 2 bool
education_tertiary: 2 bool
education_unknown: 2 bool
contact_cellular: 2 bool
contact_telephone: 2 bool
contact_unknown: 2 bool
month_apr: 2 bool
month_aug: 2 bool
month_dec: 2 bool
month_feb: 2 bool
month_jan: 2 bool
month_jul: 2 bool
month_jun: 2 bool
month_mar: 2 bool
month_may: 2 bool
month_nov: 2 bool
month_oct: 2 bool
month_sep: 2 bool
poutcome_failure: 2 bool
poutcome_other: 2 bool
poutcome_success: 2 b

In [75]:
# Step 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(y_train.unique())

[0 1]


In [None]:
# Step 2
svm = SVC()
param_grid = {
  'C': [0.1, 1, 10, 100, 1000],  
  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}
cv = 5
scoring = 'accuracy'

grid_search = GridSearchCV(svm, param_grid, cv=cv, scoring=scoring) # Grid Search takes 5 minutes - very computationally expensive
grid_search.fit(X_train, y_train)
tuned_svm = grid_search.best_estimator_
print(f"Best parameters found: {grid_search.best_params_}; score: {grid_search.best_score_}")
print("Tuned Model: ", tuned_svm)

Best parameters found: {'C': 100, 'gamma': 0.01}; score: 0.8512707300165306
Tuned Model:  SVC(C=100, gamma=0.01)


In [77]:
# Step 3
tuned_svm.fit(X_train, y_train)
y_pred_test = tuned_svm.predict(X_test)

print(tuned_svm)
print("Accuracy: ", accuracy_score(y_test, y_pred_test))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred_test))
print("Classification Report: ", classification_report(y_test, y_pred_test))

# Default Logistic Regression
default_svm = SVC()
default_svm.fit(X_train, y_train)
y_pred_test = default_svm.predict(X_test)

print(default_svm)
print("Accuracy: ", accuracy_score(y_test, y_pred_test))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred_test))
print("Classification Report: ", classification_report(y_test, y_pred_test))

SVC(C=100, gamma=0.01)
Accuracy:  0.8607254814151366
Confusion Matrix:  [[985 190]
 [121 937]]
Classification Report:                precision    recall  f1-score   support

           0       0.89      0.84      0.86      1175
           1       0.83      0.89      0.86      1058

    accuracy                           0.86      2233
   macro avg       0.86      0.86      0.86      2233
weighted avg       0.86      0.86      0.86      2233

SVC()
Accuracy:  0.8557993730407524
Confusion Matrix:  [[977 198]
 [124 934]]
Classification Report:                precision    recall  f1-score   support

           0       0.89      0.83      0.86      1175
           1       0.83      0.88      0.85      1058

    accuracy                           0.86      2233
   macro avg       0.86      0.86      0.86      2233
weighted avg       0.86      0.86      0.86      2233

