# In this notebook I will go through an approach to address class imbalance using the Synthetic Minority Oversampling Technique

In [34]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import math 
from matplotlib.figure import Figure
pd.set_option('display.max_colwidth', None)
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [35]:
file=pd.read_excel('Bank_Personal_Loan_Modelling.xlsx', sheet_name='Data')

In [36]:
details=pd.read_excel('Bank_Personal_Loan_Modelling.xlsx')

In [37]:
details

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,Data Description:,,,,,,,,
5,,,,,,,,,,
6,,ID,Customer ID,,,,,,,
7,,Age,Customer's age in completed years,,,,,,,
8,,Experience,#years of professional experience,,,,,,,
9,,Income,Annual income of the customer ($000),,,,,,,


In [38]:
y=file['Personal Loan']

In [39]:
X=file.drop('Personal Loan', axis='columns')

In [40]:
X

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,1,0


## First approach: 
### Reduce sample of banking customers without personal loan and complete machine learnign model

In [41]:
file_0=file[file['Personal Loan']==0]
file_1=file[file['Personal Loan']==1]

In [42]:
len(file_0)

4520

In [43]:
len(file_1)

480

#### For the SMOTE approach I will bring in the library imblearn

In [47]:
!pip install imbalanced-learn
#import imblearn
from imblearn.over_sampling import SMOTE



In [50]:
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm =smote.fit_resample(X,y)
y_sm.value_counts()

0    4520
1    4520
Name: Personal Loan, dtype: int64

In [52]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

!pip install category_encoders
%matplotlib inline



In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.20, random_state=2)

In [54]:

print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (7232, 13) (7232,)
Test (1808, 13) (1808,)


In [55]:
print(X_train)

        ID  Age  Experience  Income  ZIP Code  Family     CCAvg  Education  \
2380  2381   40          16      50     92606       2  0.600000          3   
6403  1494   49          22     180     92617       2  4.222170          3   
4055  4056   42          18      65     93460       3  2.100000          3   
385    386   35           9      40     93943       3  0.900000          1   
3880  3881   48          24      25     90024       4  0.500000          2   
...    ...  ...         ...     ...       ...     ...       ...        ...   
1099  1100   30           6      52     92717       3  0.700000          2   
2514  2515   41          16      25     92182       2  0.100000          2   
6637  2785   52          27     179     95984       2  8.101766          3   
2575  2576   42          16      41     90401       3  0.500000          3   
7336   399   42          17     173     95060       3  5.910111          1   

      Mortgage  Securities Account  CD Account  Online  CreditC

In [56]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [57]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [58]:


model = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=1000, subsample=0.5)

In [None]:
model.fit(X_train, y_train)

In [None]:
X_train

In [None]:
predictions=model.predict(X_test)


In [None]:
X_test.info()

In [None]:
classification_report(y_test,predictions)

In [None]:

confusion_matrix(y_test,predictions)

In [85]:
accuracy_score(y_test,predictions)

0.9928097345132744

In [86]:
print('The Precision Score- What proportion of positive identifications was actually correct?: ' + str(100*precision_score(y_test,predictions))+'%')
print('The Recall Score - What proportion of actual positives was identified correctly?: ' + str(100*recall_score(y_test,predictions))+'%')

The Precision Score- What proportion of positive identifications was actually correct?: 98.53273137697516%
The Recall Score - What proportion of actual positives was identified correctly?: 100.0%


In [87]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_train, y_train, cv=15)
print('Cross-Validation Accuracy Scores', scores.mean())
recall = cross_val_score(model, X_train, y_train, cv=15, scoring='recall')
print('Recall', recall.mean())
precision = cross_val_score(model, X_train, y_train, cv=15, scoring='precision')
print('Precision', precision.mean())

Cross-Validation Accuracy Scores 0.9955748563181361
Recall 1.0
Precision 0.9908033015738985


In [None]:
# example of grid searching key hyperparameters for GradientBoostingClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# define models and parameters
model = GradientBoostingClassifier()
n_estimators = [10, 100, 1000]
learning_rate = [0.001, 0.01, 0.1]
subsample = [0.5, 0.7, 1.0]
max_depth = [3, 7, 9]
# define grid search
grid = dict(learning_rate=learning_rate, n_estimators=n_estimators, subsample=subsample, max_depth=max_depth)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='recall',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Accuracy: 98.45%
# Precision 94.33%
# Recall: 90.48%
