In [1]:
import numpy as np
import pandas as pd
import scipy 
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns

#Feature selection
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#Estimators
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

#Model Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

#cross-validation
from sklearn.model_selection import train_test_split

In [2]:
rawData = pd.read_csv('credit_edited.csv', header=0, dtype={'SEX': 'category',
                                           'EDUCATION': 'category',
                                           'MARRIAGE': 'category',
                                           'PAY_0': 'category',
                                           'PAY_2': 'category',
                                           'PAY_3': 'category',
                                           'PAY_4': 'category',
                                           'PAY_5': 'category',
                                           'PAY_6': 'category', 
                                            'DEFAULT': 'object'})

In [3]:
credit = rawData.drop(["Unnamed: 0", "BILL_CHANGE","BILL_CHANGE_CAT", 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6'], axis=1)

In [4]:
credit.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,2,2682,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,0,29239,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,0,46990,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,0,8617,2000,36681,10000,9000,689,679,0


In [5]:
credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 19 columns):
LIMIT_BAL    30000 non-null int64
SEX          30000 non-null category
EDUCATION    30000 non-null category
MARRIAGE     30000 non-null category
AGE          30000 non-null int64
PAY_0        30000 non-null category
PAY_2        30000 non-null category
PAY_3        30000 non-null category
PAY_4        30000 non-null category
PAY_5        30000 non-null category
PAY_6        30000 non-null category
BILL_AMT1    30000 non-null int64
PAY_AMT1     30000 non-null int64
PAY_AMT2     30000 non-null int64
PAY_AMT3     30000 non-null int64
PAY_AMT4     30000 non-null int64
PAY_AMT5     30000 non-null int64
PAY_AMT6     30000 non-null int64
DEFAULT      30000 non-null object
dtypes: category(9), int64(9), object(1)
memory usage: 2.5+ MB


In [6]:
y = credit['DEFAULT'].values
X = credit.drop('DEFAULT', axis=1)

In [7]:
X = pd.get_dummies(X)
X.head()

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,SEX_1,...,PAY_6_-1,PAY_6_-2,PAY_6_0,PAY_6_2,PAY_6_3,PAY_6_4,PAY_6_5,PAY_6_6,PAY_6_7,PAY_6_8
0,20000,24,3913,0,689,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,120000,26,2682,0,1000,1000,1000,0,2000,0,...,0,0,0,1,0,0,0,0,0,0
2,90000,34,29239,1518,1500,1000,1000,1000,5000,0,...,0,0,1,0,0,0,0,0,0,0
3,50000,37,46990,2000,2019,1200,1100,1069,1000,0,...,0,0,1,0,0,0,0,0,0,0
4,50000,57,8617,2000,36681,10000,9000,689,679,1,...,0,0,1,0,0,0,0,0,0,0


In [8]:
# Test_Train_Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, stratify=y, random_state=1)

In [9]:
# Decision Tree 1

dt = DecisionTreeClassifier(random_state=1)

dt.fit(X_train, y_train)
print(np.mean(cross_val_score(dt, X_train, y_train, cv = 5)))

y_pred = dt.predict(X_test)

acc2 = accuracy_score(y_test, y_pred)

print(cross_val_score(dt, X_train, y_train))

print("Test set accuracy: {:.2f}".format(acc2))

print(classification_report(y_test, y_pred))



0.7230671397070193




[0.72546667 0.724      0.72506667]
Test set accuracy: 0.72
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      5841
           1       0.38      0.41      0.39      1659

   micro avg       0.72      0.72      0.72      7500
   macro avg       0.60      0.61      0.61      7500
weighted avg       0.73      0.72      0.73      7500



In [10]:
# Decision Tree 2

dt2 = DecisionTreeClassifier(max_depth=3, random_state=1)
dt2.fit(X_train, y_train)
print(np.mean(cross_val_score(dt2, X_train, y_train, cv = 5)))

y_pred = dt2.predict(X_test)

acc2 = accuracy_score(y_test, y_pred)

print(cross_val_score(dt2, X_train, y_train))

print("Test set accuracy: {:.2f}".format(acc2))

print(classification_report(y_test, y_pred))

0.818445000263402




[0.8188     0.81373333 0.8228    ]
Test set accuracy: 0.82
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      5841
           1       0.69      0.30      0.42      1659

   micro avg       0.82      0.82      0.82      7500
   macro avg       0.76      0.63      0.65      7500
weighted avg       0.80      0.82      0.79      7500



In [11]:
# K-nearest Neighbors classifier

Knn = KNeighborsClassifier()
Knn.fit(X_train, y_train)
print(np.mean(cross_val_score(Knn, X_train, y_train, cv= 5)))

y_pred = Knn.predict(X_test)
acc3 = accuracy_score(y_test, y_pred)

print("Test set accuracy: {:.2f}".format(acc3))

print(classification_report(y_test, y_pred))

0.7530220445432011
Test set accuracy: 0.75
              precision    recall  f1-score   support

           0       0.79      0.92      0.85      5841
           1       0.36      0.15      0.22      1659

   micro avg       0.75      0.75      0.75      7500
   macro avg       0.58      0.54      0.53      7500
weighted avg       0.70      0.75      0.71      7500



In [12]:
Knn = KNeighborsClassifier(n_neighbors=6)
Knn.fit(X_train, y_train)
print(np.mean(cross_val_score(Knn, X_train, y_train, cv= 5)))

y_pred = Knn.predict(X_test)
acc3 = accuracy_score(y_test, y_pred)

print("Test set accuracy: {:.2f}".format(acc3))

print(classification_report(y_test, y_pred))

0.7695110230211205
Test set accuracy: 0.77
              precision    recall  f1-score   support

           0       0.79      0.97      0.87      5841
           1       0.40      0.08      0.14      1659

   micro avg       0.77      0.77      0.77      7500
   macro avg       0.60      0.52      0.50      7500
weighted avg       0.70      0.77      0.71      7500



In [13]:
# Random Forest Classifier

RFC1 = RandomForestClassifier()
RFC1.fit(X_train, y_train)
print(np.mean(cross_val_score(RFC1, X_train, y_train, cv=3)))

y_pred = RFC1.predict(X_test)
acc4 = accuracy_score(y_test, y_pred)
print("Test set accuracy: {:.2f}".format(acc4))

print(classification_report(y_test, y_pred))



0.8072444444444445
Test set accuracy: 0.81
              precision    recall  f1-score   support

           0       0.83      0.95      0.88      5841
           1       0.62      0.32      0.42      1659

   micro avg       0.81      0.81      0.81      7500
   macro avg       0.73      0.63      0.65      7500
weighted avg       0.78      0.81      0.78      7500



In [14]:
# Random forest classifier 2
RFC2 = RandomForestClassifier(n_estimators = 50, max_depth = 3)
RFC2.fit(X_train, y_train)
print(np.mean(cross_val_score(RFC2, X_train, y_train, cv=3)))

y_pred = RFC2.predict(X_test)
acc5 = accuracy_score(y_test, y_pred)
print("Test set accuracy: {:.2f}".format(acc5))

print(classification_report(y_test, y_pred))

0.7972888888888888
Test set accuracy: 0.80
              precision    recall  f1-score   support

           0       0.80      0.98      0.88      5841
           1       0.73      0.14      0.24      1659

   micro avg       0.80      0.80      0.80      7500
   macro avg       0.76      0.56      0.56      7500
weighted avg       0.79      0.80      0.74      7500



In [15]:
#Random Forest with Recursive Feature Elimination

RFC3 = RandomForestClassifier(n_estimators = 50, max_depth = 3)

rfe = RFE(RFC3, n_features_to_select=3)

rfe.fit(X_train, y_train)

print(np.mean(cross_val_score(rfe, X_train, y_train, cv=3)))

y_pred = rfe.predict(X_test)
acc6 = accuracy_score(y_test, y_pred)
print("Test set accuracy: {:.2f}".format(acc6))

print(classification_report(y_test, y_pred))

print(rfe.support_)
print(rfe.ranking_)

0.8149777777777779
Test set accuracy: 0.81
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      5841
           1       0.62      0.37      0.46      1659

   micro avg       0.81      0.81      0.81      7500
   macro avg       0.73      0.65      0.67      7500
weighted avg       0.79      0.81      0.79      7500

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True False False False False False False False False False
 False  True False False False False False False False False False False
  True False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False]
[ 8 24 11  5  6 12 16 15 13 28 30 32 31 47 34 60 44 35 62 39 67 42 19 48
  4  9  1  7 50 68 69 75 59 61 5

In [24]:
#RFE Knn
Knn = KNeighborsClassifier(n_neighbors=6)

rfe = RFE(Knn, n_features_to_select=3)

rfe.fit(X_train, y_train)

print(np.mean(cross_val_score(rfe, X_train, y_train, cv=3)))

y_pred = rfe.predict(X_test)
acc6 = accuracy_score(y_test, y_pred)
print("Test set accuracy: {:.2f}".format(acc6))

print(classification_report(y_test, y_pred))

0.7933333333333333
Test set accuracy: 0.80
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      5841
           1       0.58      0.26      0.36      1659

   micro avg       0.80      0.80      0.80      7500
   macro avg       0.70      0.60      0.62      7500
weighted avg       0.77      0.80      0.76      7500



In [28]:
# Applying PCA on k-nearest neighbors

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

pca = PCA(n_components = 3)

X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

explained_variance = pca.explained_variance_ratio_

Knn.fit(X_train, y_train)

print(np.mean(cross_val_score(rfe, X_train, y_train, cv=3)))

y_pred = Knn.predict(X_test)

acc8 = accuracy_score(y_test, y_pred)
print("Test set accuracy: {:.2f}".format(acc8))

print(classification_report(y_test, y_pred))

print(explained_variance)

0.7942222222222223
Test set accuracy: 0.80
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      5841
           1       0.59      0.25      0.35      1659

   micro avg       0.80      0.80      0.80      7500
   macro avg       0.70      0.60      0.62      7500
weighted avg       0.77      0.80      0.76      7500

[0.33333333 0.33333333 0.33333333]


In [30]:
# Applying PCA on Random Forest

RFC2.fit(X_train, y_train)

y_pred = RFC2.predict(X_test)

acc8 = accuracy_score(y_test, y_pred)

print(np.mean(cross_val_score(RFC2, X_train, y_train, cv=3)))

print("Test set accuracy: {:.2f}".format(acc8))

print(classification_report(y_test, y_pred))

0.798
Test set accuracy: 0.81
              precision    recall  f1-score   support

           0       0.82      0.96      0.89      5841
           1       0.66      0.27      0.38      1659

   micro avg       0.81      0.81      0.81      7500
   macro avg       0.74      0.61      0.63      7500
weighted avg       0.79      0.81      0.77      7500

