# Objective 

The objective of the Bank Marketing dataset is to predict whether a client will subscribe to a term deposit with a bank or not, based on various demographic, social, and economic factors. 


# Import libraries & Data Overview

In [None]:
import pandas as pd 
import numpy as np 

import plotly.express as px 
import seaborn as sns
import matplotlib.pyplot as plt
            
import random 
import os 
import math 
import joblib

from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import load_diabetes
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Configurations

In [None]:
class Configurations:
    SEED = 42
    NUM_FOLDS = 5
    SHUFFLE = True
    MODELS = {
        'LogisticRegression' : LogisticRegression(),
        'KNN' : KNeighborsClassifier(n_neighbors=5),
        'DecisionTree' : DecisionTreeClassifier(), 

    }

    df_path = '/content/bank.csv'
    TARGET_NAME = 'target'

    FOLD_TYPE = 'STRATIFIED'
    SAVE_MODEL = True

    LOAD_MODEL = './'

# Exploratory Data Analysis 

In [None]:
df = pd.read_csv(Configurations.df_path)
df = df.rename(columns = {'y' : 'target'})
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,target
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [None]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


In [None]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
target       0
dtype: int64

In [1]:
def plot_bar(df, col_name = None, title = None):
    temp = df.groupby([col_name, 'target']).count().reset_index()
    temp = temp.pivot(index = col_name, columns = 'target', values = 'age').reset_index()
    y = temp[col_name].tolist()
    fig = px.bar(temp, x = col_name, y = ['no', 'yes'], title = title)
    return fig 

In [None]:
def plot_icile(df, col_name = None, title = None):
    temp = df.groupby([col_name, 'target']).count().reset_index()
    temp = temp.rename(columns = {'age' : 'counts'})
    fig = px.icicle(temp, path=[px.Constant(col_name), col_name, 'target'], values='counts', title = title)
    return fig 

In [None]:
def plot_sunburst(df, col_name = None, title = None):
    temp = df.groupby([col_name, 'target']).count().reset_index()
    temp = temp.rename(columns = {'age' : 'counts'})
    fig = px.sunburst(temp, path=[px.Constant(col_name), col_name, 'target'], values='counts', title = title)
    return fig 

In [None]:
plot_bar(df, col_name = 'marital', title = 'Counts of Marital')

In [None]:
plot_icile(df, col_name = 'education', title = 'Counts of education')

In [None]:
plot_icile(df, col_name = 'job', title = 'Counts of job')

In [None]:
plot_bar(df, col_name = 'default', title = 'Counts of default')

In [None]:
plot_bar(df, col_name = 'housing', title = 'Counts of housing')

In [None]:
plot_bar(df, col_name = 'loan', title = 'Counts of loan')

In [None]:
plot_sunburst(df, col_name = 'month', title = 'Counts of month')

# Data Preparation

In [None]:
df = pd.get_dummies(df, columns = ['job', 'default', 'education', 'housing', 'marital', 'contact', 'month', 'loan', 'poutcome'])
df.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,target,job_admin.,job_blue-collar,...,month_may,month_nov,month_oct,month_sep,loan_no,loan_yes,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,1787,19,79,1,-1,0,no,0,0,...,0,0,1,0,1,0,0,0,0,1
1,33,4789,11,220,1,339,4,no,0,0,...,1,0,0,0,0,1,1,0,0,0
2,35,1350,16,185,1,330,1,no,0,0,...,0,0,0,0,1,0,1,0,0,0
3,30,1476,3,199,4,-1,0,no,0,0,...,0,0,0,0,0,1,0,0,0,1
4,59,0,5,226,1,-1,0,no,0,1,...,1,0,0,0,1,0,0,0,0,1


In [None]:
df.isnull().sum()

age                    0
balance                0
day                    0
duration               0
campaign               0
pdays                  0
previous               0
target                 0
job_admin.             0
job_blue-collar        0
job_entrepreneur       0
job_housemaid          0
job_management         0
job_retired            0
job_self-employed      0
job_services           0
job_student            0
job_technician         0
job_unemployed         0
job_unknown            0
default_no             0
default_yes            0
education_primary      0
education_secondary    0
education_tertiary     0
education_unknown      0
housing_no             0
housing_yes            0
marital_divorced       0
marital_married        0
marital_single         0
contact_cellular       0
contact_telephone      0
contact_unknown        0
month_apr              0
month_aug              0
month_dec              0
month_feb              0
month_jan              0
month_jul              0


# Data Partitioning - StratifiedKFold Strategy 

In [None]:
def create_folds(df, num_folds = Configurations.NUM_FOLDS):
    df["kfold"] = -1
    df = df.sample(frac=1).reset_index(drop=True) 
    y = df.target.values
    kf = StratifiedKFold(n_splits=5)
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)): 
        df.loc[v_, 'kfold'] = f
    return df

In [None]:
df = create_folds(df)
df['target'][df['target'] == 'no'] = 0;
df['target'][df['target'] == 'yes'] = 1;
df['target'] = df['target'].astype(int)
df.to_csv('Folds-bank.csv', index = False)
df.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,target,job_admin.,job_blue-collar,...,month_nov,month_oct,month_sep,loan_no,loan_yes,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,kfold
0,38,4425,30,162,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,35,725,20,219,7,133,1,0,1,0,...,1,0,0,1,0,1,0,0,0,0
2,37,33,20,106,2,167,1,0,0,1,...,1,0,0,1,0,1,0,0,0,0
3,33,210,20,201,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,53,751,16,343,4,-1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [None]:
df['temp-count'] = 1
df.groupby(['kfold', 'target']).sum()['temp-count']

kfold  target
0      0         800
       1         105
1      0         800
       1         104
2      0         800
       1         104
3      0         800
       1         104
4      0         800
       1         104
Name: temp-count, dtype: int64

# Model Building

In [None]:
def run_folds(df, model_name, model, fold = 0):

    trainset = df[df['kfold'] != fold]
    validset = df[df['kfold'] == fold]

    X_train, y_train = trainset.drop(['target', 'kfold'], axis = 1), trainset['target']
    X_valid, y_valid = validset.drop(['target', 'kfold'], axis = 1), validset['target']

    model.fit(X_train, y_train);
    y_train_pred = model.predict(X_train);
    y_valid_pred = model.predict(X_valid);

    train_acc_score = accuracy_score(y_train, y_train_pred)
    valid_acc_score = accuracy_score(y_valid, y_valid_pred)
    train_auc_score = roc_auc_score(y_train, y_train_pred)
    valid_auc_score = roc_auc_score(y_valid, y_valid_pred)

    print(f'MODEL : {model_name} FOLD : {fold} TRAIN_ACC : {train_acc_score} VALID_ACC : {valid_acc_score} TRAIN_AUC : {train_auc_score} VALID_AUC : {valid_auc_score}')

    if Configurations.SAVE_MODEL == True:
        joblib.dump(model, 'bank fold '+ str(fold) + ' ' +model_name+'.pkl')

    return train_acc_score, valid_acc_score, train_auc_score, valid_auc_score

In [None]:
temp1 = {}
temp2 = {}
for model_name, model in Configurations.MODELS.items():
    train_acc_scores = []
    valid_acc_scores = []
    train_auc_scores = []
    valid_auc_scores = []
    for i in range(Configurations.NUM_FOLDS):
        train_acc_score, valid_acc_score, train_auc_score, valid_auc_score = run_folds(df, model_name, model, fold = i)
        train_acc_scores.append(train_acc_score)
        valid_acc_scores.append(valid_acc_score)
        train_auc_scores.append(train_auc_score)
        valid_auc_scores.append(valid_auc_score)
    print('=============================================')
    temp1[model_name + '_train_acc'] = train_acc_scores
    temp2[model_name + '_valid_acc'] = valid_acc_scores
    temp1[model_name + '_train_auc_roc'] = train_auc_scores
    temp2[model_name + '_valid_auc_roc'] = valid_auc_scores
temp_df1 = pd.DataFrame(temp1)
temp_df2 = pd.DataFrame(temp2)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



MODEL : LogisticRegression FOLD : 0 TRAIN_ACC : 0.8918694690265486 VALID_ACC : 0.8972375690607735 TRAIN_AUC : 0.5969711538461538 VALID_AUC : 0.627470238095238



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



MODEL : LogisticRegression FOLD : 1 TRAIN_ACC : 0.8932817251866187 VALID_ACC : 0.8805309734513275 TRAIN_AUC : 0.6039088729016786 VALID_AUC : 0.5727884615384616



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



MODEL : LogisticRegression FOLD : 2 TRAIN_ACC : 0.8905170030411944 VALID_ACC : 0.8882743362831859 TRAIN_AUC : 0.5856617206235012 VALID_AUC : 0.5646153846153846



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



MODEL : LogisticRegression FOLD : 3 TRAIN_ACC : 0.8943876140447885 VALID_ACC : 0.8871681415929203 TRAIN_AUC : 0.6055766636690647 VALID_AUC : 0.5765384615384616



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



MODEL : LogisticRegression FOLD : 4 TRAIN_ACC : 0.8927287807575338 VALID_ACC : 0.8960176991150443 TRAIN_AUC : 0.6004680005995203 VALID_AUC : 0.6233653846153846
MODEL : KNN FOLD : 0 TRAIN_ACC : 0.9073561946902655 VALID_ACC : 0.876243093922652 TRAIN_AUC : 0.6642788461538461 VALID_AUC : 0.5866369047619047
MODEL : KNN FOLD : 1 TRAIN_ACC : 0.9071053359137407 VALID_ACC : 0.8606194690265486 TRAIN_AUC : 0.6742888189448442 VALID_AUC : 0.5489903846153845
MODEL : KNN FOLD : 2 TRAIN_ACC : 0.9048935581974011 VALID_ACC : 0.8783185840707964 TRAIN_AUC : 0.6657392835731415 VALID_AUC : 0.5799038461538462
MODEL : KNN FOLD : 3 TRAIN_ACC : 0.9043406137683163 VALID_ACC : 0.8816371681415929 TRAIN_AUC : 0.6476993405275779 VALID_AUC : 0.5943269230769231
MODEL : KNN FOLD : 4 TRAIN_ACC : 0.8985346972629251 VALID_ACC : 0.8816371681415929 TRAIN_AUC : 0.6360757643884892 VALID_AUC : 0.5859615384615384
MODEL : DecisionTree FOLD : 0 TRAIN_ACC : 1.0 VALID_ACC : 0.8651933701657458 TRAIN_AUC : 1.0 VALID_AUC : 0.675535714

# Performance Evaluation

In [None]:
temp_df1

Unnamed: 0,LogisticRegression_train_acc,LogisticRegression_train_auc_roc,KNN_train_acc,KNN_train_auc_roc,DecisionTree_train_acc,DecisionTree_train_auc_roc
0,0.891869,0.596971,0.907356,0.664279,1.0,1.0
1,0.893282,0.603909,0.907105,0.674289,1.0,1.0
2,0.890517,0.585662,0.904894,0.665739,1.0,1.0
3,0.894388,0.605577,0.904341,0.647699,1.0,1.0
4,0.892729,0.600468,0.898535,0.636076,1.0,1.0


In [None]:
temp_df2

Unnamed: 0,LogisticRegression_valid_acc,LogisticRegression_valid_auc_roc,KNN_valid_acc,KNN_valid_auc_roc,DecisionTree_valid_acc,DecisionTree_valid_auc_roc
0,0.897238,0.62747,0.876243,0.586637,0.865193,0.675536
1,0.880531,0.572788,0.860619,0.54899,0.872788,0.660433
2,0.888274,0.564615,0.878319,0.579904,0.870575,0.705192
3,0.887168,0.576538,0.881637,0.594327,0.863938,0.680529
4,0.896018,0.623365,0.881637,0.585962,0.873894,0.681971


# Model Comparison & Selection

From the below analysis of 5_CV_Score of each model tells that Logistic Regression has the least overfitting and also the AUC_ROC_SCORE for that is having no overfitting. Followed by KNN is the second best model. 

In [None]:
print(temp_df1.mean())
print("=============")
print(temp_df2.mean())

LogisticRegression_train_acc        0.892557
LogisticRegression_train_auc_roc    0.598517
KNN_train_acc                       0.904446
KNN_train_auc_roc                   0.657616
DecisionTree_train_acc              1.000000
DecisionTree_train_auc_roc          1.000000
dtype: float64
LogisticRegression_valid_acc        0.889846
LogisticRegression_valid_auc_roc    0.592956
KNN_valid_acc                       0.875691
KNN_valid_auc_roc                   0.579164
DecisionTree_valid_acc              0.869278
DecisionTree_valid_auc_roc          0.680732
dtype: float64
