# Bank Marketing Predictor

1. Mubbara Majid (SP23-BAI-027)
2. Noor Fatima (SP23-BAI-046)

# Data Loading and Preprocessing

In [12]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  f1_score,classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, roc_auc_score, roc_curve

In [13]:
# Load the bank marketing data; the file uses ';' as its field separator.
bank_data = pd.read_csv('bank.csv', sep=';')

In [14]:
# Preview the first rows to confirm the columns were parsed as expected.
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [15]:
# Show column dtypes and non-null counts.
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [16]:
# Class distribution of the target column 'y'.
bank_data['y'].value_counts()

y
no     4000
yes     521
Name: count, dtype: int64

In [17]:
# Summary statistics for the numeric columns.
bank_data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


Label Encoding

In [18]:
# Replace every string-valued column (including the target 'y') with integer
# codes. Each fitted encoder is kept in `label_encoders`, keyed by column name,
# so the original labels can be recovered later.
category_columns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome', 'y',
]

label_encoders = {}
for col in category_columns:
    encoder = LabelEncoder()
    bank_data[col] = encoder.fit_transform(bank_data[col])
    label_encoders[col] = encoder

In [19]:
# Separate the encoded target column from the feature matrix.
Y = bank_data['y']
X = bank_data.drop(columns=['y'])

X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,10,1,0,0,1787,0,0,0,19,10,79,1,-1,0,3
1,33,7,1,1,0,4789,1,1,0,11,8,220,1,339,4,0
2,35,4,2,2,0,1350,1,0,0,16,0,185,1,330,1,0
3,30,4,1,2,0,1476,1,1,2,3,6,199,4,-1,0,3
4,59,1,1,1,0,0,1,0,2,5,8,226,1,-1,0,3


Data Splitting

In [20]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class ratio
# the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

Data Normalization

In [21]:
# Standardize the continuous features.
# The scaler is fitted on the training split ONLY and then applied unchanged to
# the test split. Re-fitting on the test split would scale it with its own
# mean/std, so train and test features would live on different scales and the
# evaluation would use statistics that are unavailable at prediction time.
scaler = StandardScaler()
numeric_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Work on explicit copies so the column assignment below does not write into a
# slice of the original frame.
X_train = X_train.copy()
X_test = X_test.copy()

X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

Handling Imbalanced Classes

In [22]:
# Oversample the minority class in the training split only, so the test split
# keeps its original class distribution.
# NOTE(review): X_train still contains the integer-coded categorical columns
# produced by LabelEncoder; plain SMOTE interpolates every feature, so
# synthetic rows can carry non-integer category codes — consider SMOTENC.
# NOTE(review): the later cross-validated searches are fitted on this
# already-resampled data, so synthetic rows can land in validation folds and
# may inflate the CV scores — confirm whether that is acceptable.
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

# Logistic Regression

In [None]:
# Baseline logistic regression with default hyperparameters; max_iter is raised
# to 1000 and the seed is fixed.
logistic_model = LogisticRegression(random_state=42, max_iter=1000)

In [None]:
# Fit the baseline model on the resampled training data.
logistic_model.fit(X_train, y_train)

In [None]:
# Hard class predictions and positive-class probabilities on the held-out test set.
# predict_proba (not predict_log_proba) is used so the stored scores are real
# probabilities in [0, 1], matching how the tuned models are scored.
logistic_predict = logistic_model.predict(X_test)
logistic_predict_probab = logistic_model.predict_proba(X_test)[:, 1]

In [None]:
# F1 of the baseline model on the test set, computed from hard predictions.
f1 = f1_score(y_test, logistic_predict)
print("F1-Score:", f1)

In [None]:
# ROC AUC of the baseline model on the test set, computed from the
# positive-class scores rather than the hard predictions.
auc = roc_auc_score(y_test, logistic_predict_probab)
print("AUC-ROC:", auc)

Tuning Hyperparameters

In [None]:
# Search space for logistic regression: regularization strength (C), penalty
# type, and solver. Every combination is tried (5 x 2 x 2 = 20 candidates).
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

In [None]:
# 5-fold stratified cross-validation with shuffling and a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

GridSearchCV and RandomizedSearchCV

In [None]:
# Run an exhaustive and a randomized search over the same grid. Both use the
# same estimator settings, metric (F1), folds and parallelism; scikit-learn
# clones the estimator internally, so one shared template is enough.
# With n_iter=20 the randomized search covers all 20 grid combinations.
logistic_template = LogisticRegression(max_iter=1000, random_state=42)
search_options = dict(scoring='f1', cv=cv, n_jobs=-1)

grid_logistic = GridSearchCV(logistic_template, param_grid, **search_options)
grid_logistic.fit(X_train, y_train)

random_logistic = RandomizedSearchCV(
    logistic_template,
    param_grid,
    n_iter=20,
    random_state=42,
    **search_options,
)
random_logistic.fit(X_train, y_train)


In [None]:
# (Re)build the exhaustive search over the logistic-regression grid, this time
# with verbose progress output. Candidates are ranked by F1 on the stratified
# folds, and all CPU cores are used (n_jobs=-1).
grid_logistic = GridSearchCV(estimator=LogisticRegression(max_iter=1000, random_state=42),
                           param_grid=param_grid,
                           scoring='f1',
                           cv=cv,
                           verbose=1,
                           n_jobs=-1)

In [None]:
# Run the grid search on the resampled training data.
grid_logistic.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated F1.
grid_logistic.best_params_

In [None]:
# Keep the best logistic-regression estimator found by the grid search.
# NOTE: the name `best_model` is reassigned later for the XGBoost search.
best_model = grid_logistic.best_estimator_

In [None]:
# Test-set predictions of the tuned logistic regression: hard labels and the
# predicted probability of the positive class (column 1).
y_pred_tuned = best_model.predict(X_test)
y_pred_proba_tuned = best_model.predict_proba(X_test)[:, 1]

In [None]:
# Score the tuned logistic regression on the test set: F1 from the hard labels,
# ROC AUC from the positive-class probabilities.
f1_tuned = f1_score(y_test, y_pred_tuned)
auc_tuned = roc_auc_score(y_test, y_pred_proba_tuned)

print("Tuned F1-Score:", f1_tuned)
print("Tuned AUC-ROC:", auc_tuned)

In [None]:
# Per-class precision, recall and F1 for the tuned logistic regression.
print(classification_report(y_test, y_pred_tuned))

Checking importance of features

In [None]:
# Pair each feature name with its fitted logistic-regression coefficient and
# list them from most positive to most negative.
# NOTE(review): only the numeric columns were standardized and the categorical
# columns are integer codes, so coefficient magnitudes are not directly
# comparable across features.
feature_weights = {
    'Feature': X.columns,
    'Coefficient': best_model.coef_[0],
}
coefficients = pd.DataFrame(feature_weights)
coefficients = coefficients.sort_values(by='Coefficient', ascending=False)

print(coefficients)

XGBoost Algorithm

In [None]:
# Baseline XGBoost classifier with default hyperparameters, fitted on the
# resampled training data.
# `use_label_encoder` is intentionally not passed: it is a deprecated/removed
# XGBClassifier argument that recent XGBoost versions report as an unused
# parameter, and the target is already integer-encoded (0/1).
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

In [None]:
# Test-set predictions of the baseline XGBoost model: hard labels and the
# predicted probability of the positive class (column 1).
# NOTE(review): despite the "_tuned" suffix these come from `xgb_model`, which
# was fitted with default hyperparameters, and no later cell shown here reads
# them — consider renaming and scoring the baseline.
xgb_predict_tuned = xgb_model.predict(X_test)
xgb_predict_probab_tuned = xgb_model.predict_proba(X_test)[:, 1]

Hyperparameter Tuning

In [None]:
# Search space for XGBoost: number of trees, tree depth and learning rate
# (3 x 3 x 3 = 27 candidates).
# NOTE: this rebinds `param_grid`, replacing the logistic-regression grid.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

In [None]:
# 5-fold stratified cross-validation with shuffling and a fixed seed (same
# configuration as the splitter used for the logistic-regression search).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Exhaustive search over the XGBoost grid. Candidates are ranked by F1 on the
# stratified folds; verbose=1 prints progress and n_jobs=-1 uses all CPU cores.
# `use_label_encoder` is intentionally not passed: it is a deprecated/removed
# XGBClassifier argument that recent XGBoost versions report as an unused
# parameter for every fitted candidate.
grid_xgb = GridSearchCV(estimator=XGBClassifier(random_state=42, eval_metric="logloss"),
                        param_grid=param_grid,
                        scoring='f1',
                        cv=cv,
                        verbose=1,
                        n_jobs=-1)

In [None]:
# Run the XGBoost grid search on the resampled training data.
grid_xgb.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated F1.
print(grid_xgb.best_params_)

In [None]:
# Keep the best XGBoost estimator found by the grid search.
# NOTE: this rebinds `best_model`, which previously held the tuned logistic
# regression.
best_model = grid_xgb.best_estimator_

Evaluating Tuned Model

In [None]:
# Test-set predictions of the tuned XGBoost model: hard labels and the
# predicted probability of the positive class (column 1).
y_predict_tuned = best_model.predict(X_test)
y_predict_probab_tuned = best_model.predict_proba(X_test)[:, 1]

In [None]:
# Score the tuned XGBoost model on the test set: F1 from the hard labels,
# ROC AUC from the positive-class probabilities.
f1_tuned = f1_score(y_test, y_predict_tuned)
auc_tuned = roc_auc_score(y_test, y_predict_probab_tuned)

print(f'f1 tuned: {f1_tuned}')
print(f'AUC-ROC Tuned: {auc_tuned}')

In [None]:
# Confusion matrix and per-class precision/recall/F1 for the tuned XGBoost
# model on the test set.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_predict_tuned))
print(classification_report(y_test, y_predict_tuned))