In [26]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier,\
GradientBoostingClassifier,StackingClassifier,VotingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, cohen_kappa_score, confusion_matrix,accuracy_score
from sklearn.linear_model import LogisticRegression
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised by the
# estimators fitted further down (e.g. deprecation or convergence notices);
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

### Import the data

In [4]:
# Read the customer table from the workbook. `sheet_name=1` selects the
# second worksheet, because pandas numbers sheets from 0.
data = pd.read_excel(
    'Bank_Personal_Loan_Modelling-1 (1).xlsx',
    sheet_name=1,
)
data

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0


In [5]:
# Drop the row identifier so it is not used as a feature.
# `errors='ignore'` keeps this cell idempotent: running it a second time
# (when 'ID' is already gone) no longer raises a KeyError as `del` would.
data = data.drop(columns=['ID'], errors='ignore')

In [6]:
# 'Personal Loan' is the target; every other remaining column is a predictor.
y = data['Personal Loan']
x = data.drop('Personal Loan', axis=1)

In [7]:
# Hold out the rows not used for training (train_size=0.8), stratified on the
# target so both splits keep the same class ratio; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    stratify=y,
    random_state=2,
)

In [8]:
# Sanity check: shapes of train features/target followed by test features/target.
print(*(split.shape for split in (x_train, y_train, x_test, y_test)))

(4000, 12) (4000,) (1000, 12) (1000,)


In [9]:
# Class distribution of the training target before any resampling.
y_train.value_counts()

Personal Loan
0    3616
1     384
Name: count, dtype: int64

In [10]:
# Balance the data
# sampling_strategy=0.6 asks SMOTE to oversample the minority class until it
# reaches 60% of the majority-class count.
# random_state pins the synthetic samples so the resampled training set, and
# every metric computed from it below, is the same on each Restart & Run All.
# NOTE(review): the name `os` would shadow the standard-library module of the
# same name if that module is ever imported in this notebook.
os = SMOTE(sampling_strategy=0.6, random_state=2)

In [11]:
# Oversample the training split only; the test split is not passed to SMOTE.
x_train_os, y_train_os = os.fit_resample(X=x_train, y=y_train)

In [12]:
# Class distribution of the training target after SMOTE oversampling.
y_train_os.value_counts()

Personal Loan
0    3616
1    2169
Name: count, dtype: int64

## Ensemble Techniques

### Bagging Algorithms

In [27]:
def model_validation(model,trainx,trainy,testx,testy):
    """Fit ``model`` on the training data and print its test-set metrics.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        Fitted in place; the same object is used for prediction.
    trainx, trainy : training features and labels passed to ``fit``.
    testx, testy : held-out features passed to ``predict`` and the true
        labels the predictions are scored against.

    Returns
    -------
    None. The confusion matrix, classification report, Cohen's kappa and
    accuracy are written to stdout, in that order.
    """
    model.fit(trainx, trainy)
    predictions = model.predict(testx)

    # (heading, scorer) pairs; each scorer is called as scorer(y_true, y_pred)
    # and printed immediately, so the output order follows this table.
    reports = (
        ('Confusion Matrix:\n', confusion_matrix),
        ('Classification Report\n', classification_report),
        ('Cohen Kappa Score\n', cohen_kappa_score),
        ('Accuracy Score \n', accuracy_score),
    )
    for heading, scorer in reports:
        print(heading, scorer(testy, predictions))

In [28]:
# Logistic Regression with default hyper-parameters, fitted on the
# oversampled training split and scored on the untouched test split.
# NOTE(review): the features are not scaled anywhere in this notebook and all
# warnings are silenced at import time, so a non-convergence warning from the
# solver would go unnoticed here — confirm the fit actually converges.
model_validation(
    LogisticRegression(),
    x_train_os,
    y_train_os,
    x_test,
    y_test,
)

Confusion Matrix:
 [[800 104]
 [ 18  78]]
Classification Report
               precision    recall  f1-score   support

           0       0.98      0.88      0.93       904
           1       0.43      0.81      0.56        96

    accuracy                           0.88      1000
   macro avg       0.70      0.85      0.75      1000
weighted avg       0.93      0.88      0.89      1000

Cohen Kappa Score
 0.49805806069383185
Accuracy Score 
 0.878


In [29]:
# Random forest
# random_state fixes the bootstrap samples and per-split feature subsets, so
# the reported metrics are reproducible; it reuses the seed of the train/test split.
model_validation(
    RandomForestClassifier(n_estimators=200, max_depth=5, random_state=2),
    x_train_os,
    y_train_os,
    x_test,
    y_test,
)

Confusion Matrix:
 [[867  37]
 [  4  92]]
Classification Report
               precision    recall  f1-score   support

           0       1.00      0.96      0.98       904
           1       0.71      0.96      0.82        96

    accuracy                           0.96      1000
   macro avg       0.85      0.96      0.90      1000
weighted avg       0.97      0.96      0.96      1000

Cohen Kappa Score
 0.7952375244716129
Accuracy Score 
 0.959


In [30]:
# Bagging with naive bayes
from sklearn.naive_bayes import GaussianNB
# The wrapped estimator is passed positionally on purpose: scikit-learn 1.2
# renamed the keyword `base_estimator` to `estimator` and 1.4 removed the old
# name, so either keyword fails on some version while the first positional
# argument works on all of them.
# random_state fixes the bootstrap draws so the metrics are reproducible.
model_validation(
    BaggingClassifier(GaussianNB(), n_estimators=200, random_state=2),
    x_train_os,
    y_train_os,
    x_test,
    y_test,
)

Confusion Matrix:
 [[831  73]
 [ 16  80]]
Classification Report
               precision    recall  f1-score   support

           0       0.98      0.92      0.95       904
           1       0.52      0.83      0.64        96

    accuracy                           0.91      1000
   macro avg       0.75      0.88      0.80      1000
weighted avg       0.94      0.91      0.92      1000

Cohen Kappa Score
 0.5947619567988927
Accuracy Score 
 0.911


### Boosting

In [31]:
# AdaBoost
# random_state seeds the boosted base learners so repeated runs of this cell
# report the same metrics; it reuses the seed of the train/test split.
model_validation(
    AdaBoostClassifier(n_estimators=200, random_state=2),
    x_train_os,
    y_train_os,
    x_test,
    y_test,
)

Confusion Matrix:
 [[864  40]
 [  5  91]]
Classification Report
               precision    recall  f1-score   support

           0       0.99      0.96      0.97       904
           1       0.69      0.95      0.80        96

    accuracy                           0.95      1000
   macro avg       0.84      0.95      0.89      1000
weighted avg       0.97      0.95      0.96      1000

Cohen Kappa Score
 0.777059965914946
Accuracy Score 
 0.955


In [32]:
# Gradient Boosting
# random_state seeds the individual trees so repeated runs of this cell
# report the same metrics; it reuses the seed of the train/test split.
model_validation(
    GradientBoostingClassifier(n_estimators=200, max_depth=5, random_state=2),
    x_train_os,
    y_train_os,
    x_test,
    y_test,
)

Confusion Matrix:
 [[898   6]
 [  4  92]]
Classification Report
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       904
           1       0.94      0.96      0.95        96

    accuracy                           0.99      1000
   macro avg       0.97      0.98      0.97      1000
weighted avg       0.99      0.99      0.99      1000

Cohen Kappa Score
 0.9429171613846014
Accuracy Score 
 0.99


In [33]:
# XGBoost
from xgboost import XGBClassifier

In [34]:
# XGBoost with 150 boosting rounds and gamma=2, fitted on the oversampled
# training split and scored on the untouched test split.
model_validation(
    XGBClassifier(n_estimators=150, gamma=2),
    x_train_os,
    y_train_os,
    x_test,
    y_test,
)

Confusion Matrix:
 [[891  13]
 [  5  91]]
Classification Report
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       904
           1       0.88      0.95      0.91        96

    accuracy                           0.98      1000
   macro avg       0.93      0.97      0.95      1000
weighted avg       0.98      0.98      0.98      1000

Cohen Kappa Score
 0.9000177746178457
Accuracy Score 
 0.982


### Voting

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [36]:
# (name, estimator) pairs combined by the voting ensemble.
# The decision tree is seeded because scikit-learn permutes features at every
# split, so ties between equally good splits could otherwise resolve
# differently from run to run.
base_estimators = [
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(max_depth=5, random_state=2)),
    ('KNN', KNeighborsClassifier(n_neighbors=7)),
]

In [37]:
# Voting ensemble over `base_estimators`; the voting mode is left at the
# constructor default.
model_validation(
    VotingClassifier(estimators=base_estimators),
    x_train_os,
    y_train_os,
    x_test,
    y_test,
)

Confusion Matrix:
 [[863  41]
 [ 18  78]]
Classification Report
               precision    recall  f1-score   support

           0       0.98      0.95      0.97       904
           1       0.66      0.81      0.73        96

    accuracy                           0.94      1000
   macro avg       0.82      0.88      0.85      1000
weighted avg       0.95      0.94      0.94      1000

Cohen Kappa Score
 0.6929514134643406
Accuracy Score 
 0.941


### Stacking

In [38]:
# (name, estimator) pairs used as the first level of the stacking ensemble.
# Note this rebinds `base_estimators`, replacing the list built for voting.
# The tree-based learners are seeded so their fits, and therefore the stacked
# metrics, are reproducible across runs.
base_estimators = [
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(max_depth=5, random_state=2)),
    ('KNN', KNeighborsClassifier(n_neighbors=7)),
    ('Random Forest', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=2)),
]

In [39]:
# Stacking ensemble: the learners in `base_estimators` feed a shallow decision
# tree that makes the final prediction. The final tree is seeded so repeated
# runs report the same metrics. Parentheses already allow the call to span
# lines, so the backslash continuations are not needed.
model_validation(
    StackingClassifier(
        estimators=base_estimators,
        final_estimator=DecisionTreeClassifier(max_depth=3, random_state=2),
    ),
    x_train_os,
    y_train_os,
    x_test,
    y_test,
)

Confusion Matrix:
 [[890  14]
 [  6  90]]
Classification Report
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       904
           1       0.87      0.94      0.90        96

    accuracy                           0.98      1000
   macro avg       0.93      0.96      0.94      1000
weighted avg       0.98      0.98      0.98      1000

Cohen Kappa Score
 0.888908638464273
Accuracy Score 
 0.98
