In [44]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import warnings
warnings.filterwarnings('ignore')

In [82]:
# Import the data
df = pd.read_excel('data/Bank_Personal_Loan_Modelling.xlsx',sheet_name=1)


# Print the first five rows
df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


# Splitting the Data

In [83]:
# Split the predictor and target variables
y = df['Personal Loan']
X = df.drop('Personal Loan',axis=1)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

In [84]:
# Instantiate StandardScaler
scaler = StandardScaler()

# Transform the training and test sets
scaled_data_train = scaler.fit_transform(X_train)
scaled_data_test = scaler.transform(X_test)

# Convert into a DataFrame
scaled_df_train = pd.DataFrame(scaled_data_train, columns=X_train.columns)
scaled_df_train.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,1.659352,1.289669,1.222972,-0.518388,1.121134,0.540359,0.154859,1.33346,-0.556494,-0.33727,-0.249646,0.818312,1.543187
1,0.458065,0.505425,0.61324,-0.474899,0.43274,-0.33382,-0.53065,1.33346,-0.556494,-0.33727,-0.249646,-1.222027,-0.64801
2,-1.407734,-0.975925,-0.867538,0.351391,0.509278,-0.33382,0.440488,-1.049421,-0.556494,-0.33727,-0.249646,0.818312,-0.64801
3,-1.148072,0.418287,0.264821,-1.279445,0.384848,-0.33382,-0.53065,0.14202,0.421254,-0.33727,-0.249646,-1.222027,-0.64801
4,-0.535213,-0.017404,0.003507,-0.322688,0.824829,-1.207998,0.269111,-1.049421,-0.556494,2.964983,4.005678,0.818312,-0.64801


In [85]:
# Instantiate KNeighborsClassifier
clf = KNeighborsClassifier()

# Fit the classifier
clf.fit(scaled_data_train,y_train)
# Print the accuracy on test set
clf.score(scaled_data_test,y_test)

0.9504

# First Pipeline

In [86]:
# Build a pipeline with StandardScaler and KNeighborsClassifier
scaled_pipeline_1 = Pipeline([('ss',StandardScaler()),
                                    ('knn',KNeighborsClassifier())])
# Fit the training data to pipeline

scaled_pipeline_1.fit(X_train,y_train)
# Print the accuracy on test set
scaled_pipeline_1.score(X_test,y_test)

0.9504

# Second Pipeline

In [87]:
# Build a pipeline with StandardScaler and RandomForestClassifier
scaled_pipeline_2 = Pipeline([('ss', StandardScaler()), 
                              ('RF', RandomForestClassifier(random_state=123))])

In [88]:
# Define the grid
grid = [{'RF__max_depth': [4, 5, 6], 
         'RF__min_samples_split': [2, 5, 10], 
         'RF__min_samples_leaf': [1, 3, 5]}]

In [89]:
# Define a grid search
gridsearch = GridSearchCV(estimator=scaled_pipeline_2, 
                          param_grid=grid, 
                          scoring='accuracy', 
                          cv=5)

In [90]:
# Fit the training data
gridsearch.fit(X_train, y_train)

# Print the accuracy on test set
gridsearch.score(X_test, y_test)

0.9808

In [20]:
y_pred = gridsearch.predict(X_test)
#accuracy_score, recall_score, precision_score, f1_score
print("Accuracy: {}".format(accuracy_score(y_test,y_pred)) )
print("Recall: {}".format(recall_score(y_test,y_pred)) )
print("Precision: {}".format(precision_score(y_test,y_pred)) )
print("F1 : {}".format(f1_score(y_test,y_pred)) )

Accuracy: 0.9808
Recall: 0.8283582089552238
Precision: 0.9910714285714286
F1 : 0.9024390243902439


In [105]:
confusion_matrix(y_test,y_pred)

array([[1115,    1],
       [  23,  111]])

## Second Pipeline with Oversampling

In [94]:
# now we will try and oversample the target variable
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(sampling_strategy=1)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

In [95]:
y_train_ros.size

6808

In [96]:
y_train.value_counts()

0    3404
1     346
Name: Personal Loan, dtype: int64

In [97]:
y_train_ros.value_counts()

1    3404
0    3404
Name: Personal Loan, dtype: int64

In [98]:
# Build a pipeline with StandardScaler and RandomForestClassifier
overfitted_pipeline = Pipeline([('ss', StandardScaler()), 
                              ('RF', RandomForestClassifier(random_state=123))])
# Define the grid
grid = [{'RF__max_depth': [4, 5, 6], 
         'RF__min_samples_split': [2, 5, 10], 
         'RF__min_samples_leaf': [1, 3, 5]}]
# Define a grid search
overfitted_gridsearch = GridSearchCV(estimator=overfitted_pipeline, 
                          param_grid=grid, 
                          scoring='accuracy', 
                          cv=5)
# Fit the training data
overfitted_gridsearch.fit(X_train_ros, y_train_ros)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('RF',
                                        RandomForestClassifier(random_state=123))]),
             param_grid=[{'RF__max_depth': [4, 5, 6],
                          'RF__min_samples_leaf': [1, 3, 5],
                          'RF__min_samples_split': [2, 5, 10]}],
             scoring='accuracy')

In [100]:
# Print the accuracy on test set
overfitted_gridsearch.score(X_test, y_test)
y_pred2 = overfitted_gridsearch.predict(X_test)
#accuracy_score, recall_score, precision_score, f1_score
print("Accuracy: {}".format(accuracy_score(y_test,y_pred2)) )
print("Recall: {}".format(recall_score(y_test,y_pred2)) )
print("Precision: {}".format(precision_score(y_test,y_pred2)) )
print("F1 : {}".format(f1_score(y_test,y_pred2)) )

Accuracy: 0.9784
Recall: 0.9776119402985075
Precision: 0.8451612903225807
F1 : 0.9065743944636678


# Second Pipeline with ROS, Best Model

In [101]:
best_model = overfitted_gridsearch.best_estimator_

In [102]:
best_model.fit(X_train_ros,y_train_ros)

Pipeline(steps=[('ss', StandardScaler()),
                ('RF',
                 RandomForestClassifier(max_depth=6, min_samples_leaf=3,
                                        random_state=123))])

In [103]:
y_pred_ros = best_model.predict(X_test)
print("Accuracy: {}".format(accuracy_score(y_test,y_pred)) )
print("Recall: {}".format(recall_score(y_test,y_pred)) )
print("Precision: {}".format(precision_score(y_test,y_pred)) )
print("F1 : {}".format(f1_score(y_test,y_pred)) )

Accuracy: 0.9808
Recall: 0.8283582089552238
Precision: 0.9910714285714286
F1 : 0.9024390243902439


In [104]:
confusion_matrix(y_test,y_pred_ros)

array([[1092,   24],
       [   3,  131]])

In [75]:
3371 / (3371 + 87)

0.974840948525159