In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from joblib import dump, load
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.metrics import (confusion_matrix,accuracy_score,jaccard_score,f1_score,log_loss,classification_report)
from sklearn.model_selection import train_test_split
# Output directory: the spreadsheets and fitted-model files written below all go here.
dest = '/content/'
# Raw data set for the analysis (absolute path; adjust when running in another environment).
BaseDat = pd.read_csv(filepath_or_buffer="/content/default-1.csv")


In [None]:
# `head` is a method: it must be called to get the first rows. Without the
# parentheses the bound-method repr (which embeds the whole frame) was printed.
print(BaseDat.head())

Drop irrelevant variables and create dummy variables from the categorical variables.

In [None]:
print(BaseDat.columns)
#- categorical Vars change -#
# Show the raw integer codes before they are recoded.
for col in ('SEX', 'EDUCATION', 'MARRIAGE'):
    print(BaseDat[col].value_counts())

# Integer code -> label, one mapping per categorical column. The rarely used
# codes are pooled into a single "Others" level per variable.
Label_Maps = {
    'SEX': {1: 'Male', 2: 'FeMale'},
    'EDUCATION': {1: 'Grad_School', 2: 'University', 3: 'High_School',
                  0: 'Educ_Others', 4: 'Educ_Others', 5: 'Educ_Others', 6: 'Educ_Others'},
    'MARRIAGE': {1: 'Married', 2: 'Single',
                 0: 'Marriage_Others', 3: 'Marriage_Others'},
}
for col, mapping in Label_Maps.items():
    BaseDat[col] = BaseDat[col].replace(mapping)

# One indicator column per label, appended in the order SEX, EDUCATION, MARRIAGE.
# dtype=int keeps the indicators numeric: pandas >= 2.0 returns bool columns by
# default, and a bool/int mix reaches statsmodels/numpy as an object array.
Dummies = [pd.get_dummies(BaseDat[col], dtype=int) for col in Label_Maps]
BaseDat = pd.concat([BaseDat, *Dummies], axis=1)

#- add age Square -#
BaseDat['Age_Sqr'] = BaseDat['AGE'] ** 2
#- drop irrelevant -#
# The original categorical columns are now represented by their dummies.
BaseDat = BaseDat.drop(['ID','SEX','EDUCATION','MARRIAGE'],axis=1)
print(BaseDat.columns)

## Step 1 – descriptive statistics

In [None]:
#- means of features by default -#
Tab = BaseDat.groupby(['default']).mean()
Tab.to_excel(dest + 'Desc-Stat.xlsx')
#- correlation matrix -#
BaseDat.corr().to_excel(dest + 'CorrMatrix.xlsx')
#- simple ols regression -#
# One level of each dummy group (Male, Educ_Others, Marriage_Others) is left out
# to serve as the reference category.
X = BaseDat.drop(['default','Male','Educ_Others','Marriage_Others'],axis=1)
# statsmodels' OLS does not add an intercept on its own. Without one, the dropped
# reference categories have no baseline level and the fit is forced through the
# origin, so add the constant explicitly. The float cast guarantees a purely
# numeric design matrix even if the dummy columns are bool.
X = sm.add_constant(X.astype(float))
Ols = sm.OLS(BaseDat['default'], X).fit()
print(Ols.summary())

Credit card default is relatively rare, so later (in Step 4) we will use SMOTE to resample the classes equally.

In [None]:
#- target and features as numpy arrays -#
# The target stays 2-D (a single column); the model cells flatten it with ravel().
y = np.asarray(BaseDat[['default']])
X = np.asarray(BaseDat.drop(columns=['default']))
#- standardize every feature column -#
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# the test rows contribute to the scaling statistics - confirm this is acceptable.
X = preprocessing.StandardScaler().fit_transform(X)
print(X.shape)

Splitting the data into train/test sets

In [None]:
# Seeded split: test_size=0.7 puts 70% of the rows in the test set and leaves
# 30% for training.
# NOTE(review): a 30/70 train/test ratio is unusual - confirm 0.7 is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.7,
)
print(*[part.shape for part in (X_train, X_test, y_train, y_test)])

## Step 2 – fitting 7 different ML algorithms

**1. K Nearest Neighbor (KNN)**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit one KNN classifier per neighbourhood size k = 1..5 and store each one as
# '<dest>KNN_<k>.joblib' so it can be reloaded by name later.
for n in (1, 2, 3, 4, 5):
    KNN_Model = KNeighborsClassifier(n_jobs=-1, n_neighbors=n)
    KNN_Model.fit(X_train, np.ravel(y_train))
    FName = f'{dest}KNN_{n}.joblib'
    dump(KNN_Model, FName)

**2. Random Forest Classifier (RFC)**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with a fixed seed, fitted on the training split using all cores.
RFC_Model = RandomForestClassifier(n_jobs=-1, random_state=0)
RFC_Model.fit(X_train, np.ravel(y_train))
# Persist the fitted model so the comparison step can reload it by name.
FName = f'{dest}RFC.joblib'
dump(RFC_Model, FName)

**3. AdaBoost Classifier**

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with a fixed seed, fitted on the training split.
Adaboost_Model = AdaBoostClassifier(random_state=0)
Adaboost_Model.fit(X_train, np.ravel(y_train))
# Persist the fitted model so the comparison step can reload it by name.
FName = f'{dest}Adaboost.joblib'
dump(Adaboost_Model, FName)

**4. Support Vector Machine (SVM)**

In [None]:
from sklearn import svm
# Support vector classifier with an RBF kernel, fitted on the training split.
SVM_Model = svm.SVC(kernel='rbf')
SVM_Model.fit(X_train, np.ravel(y_train))
# Persist the fitted model so the comparison step can reload it by name.
FName = f'{dest}SVM_rbf.joblib'
dump(SVM_Model, FName)

**5. Gaussian naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, fitted on the training split.
Gauss_Model = GaussianNB()
Gauss_Model.fit(X_train, np.ravel(y_train))
# Persist the fitted model so the comparison step can reload it by name.
FName = f'{dest}GaussianNB.joblib'
dump(Gauss_Model, FName)

**6. Multi-layer Perceptron classifier (MLPC)**

In [None]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with a fixed seed, fitted on the training split.
MLPC_Model = MLPClassifier(random_state=0)
MLPC_Model.fit(X_train, np.ravel(y_train))
# Persist the fitted model so the comparison step can reload it by name.
FName = f'{dest}MLPC.joblib'
dump(MLPC_Model, FName)

**7. Logistic Regression CV (LR)**

In [None]:
from sklearn.linear_model import LogisticRegressionCV
# Cross-validated logistic regression with a fixed seed, fitted on the training split.
LR_Model = LogisticRegressionCV(random_state=0)
LR_Model.fit(X_train, np.ravel(y_train))
# Persist the fitted model so the comparison step can reload it by name.
FName = f'{dest}LR.joblib'
dump(LR_Model, FName)

## Step 3 – Reuse of the fitted Models to predict the test data

In [None]:
# Reload every model saved in Step 2, score it on the held-out test split and
# collect accuracy plus the confusion-matrix cells in one table.
Mod_List=['LR','MLPC','GaussianNB','SVM_rbf','Adaboost','RFC','KNN_1','KNN_2','KNN_3','KNN_4','KNN_5']
# Rows are gathered in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
Score_Rows = []
for mod in Mod_List:
    # These files were written by the cells above (joblib files are pickle-based,
    # so only load ones you created yourself).
    Test_Mod = load(dest + mod + '.joblib')
    y_pred = Test_Mod.predict(X_test)
    Accu = accuracy_score(y_test, y_pred)
    # Rows are the true class, columns the predicted class: [[TN, FP], [FN, TP]].
    Conf_Mat = confusion_matrix(y_test, y_pred).tolist()
    (TN, FP), (FN, TP) = Conf_Mat
    Score_Rows.append({'Model': mod, 'Accuracy': Accu,
                       'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN})

Compare_Table = pd.DataFrame(Score_Rows, columns=['Model','Accuracy','TP','FP','FN','TN'])
Compare_Table = Compare_Table.sort_values(by='Accuracy', ascending=False)
print(Compare_Table)

The three most accurate models are SVM (82.12%), AdaBoost (81.98%) and RFC (81.5%).

## Step 4 – Oversample the data with SMOTE and fit the best 3 algorithms from the previous step

In [None]:
y = BaseDat.loc[:, BaseDat.columns == 'default']
X = BaseDat.drop(['default'],axis=1)
# The sampler is named `smote` rather than `os` so that the standard-library
# module name `os` is not shadowed for the rest of the notebook.
# NOTE(review): SMOTE runs on the full data set here, before the train/test split
# in the next cell, so synthetic rows also land in the test set. Consider
# splitting first and resampling only the training part.
smote = SMOTE(random_state=0)
X_os, y_os = smote.fit_resample(X, y)
#- Check the changes of our data distribution and design -#
# The share of defaults is printed as a real percentage; the original printed
# a 0-1 fraction under the "Percent" label.
print("Original data: Length=", len(X),
      "Percent default=", "{:.2%}".format(y['default'].eq(1).mean()))
print('OverSampled data: Length=', len(X_os),
      'Percent default=', "{:.2%}".format(y_os['default'].eq(1).mean()))

In [None]:
#- oversampled features/target as numpy arrays -#
y_os = np.asarray(y_os)
X_os = np.asarray(X_os)
#- standardize every feature column -#
# NOTE(review): as in the first pass, the scaler is fitted before the split.
X_os = preprocessing.StandardScaler().fit_transform(X_os)
#- seeded train/test split; test_size=0.7 puts 70% of the rows in the test set -#
X_train_os, X_test_os, y_train_os, y_test_os = train_test_split(
    X_os,
    y_os,
    random_state=0,
    test_size=0.7,
)
print(*[part.shape for part in (X_train_os, X_test_os, y_train_os, y_test_os)])

In [None]:
from sklearn import svm
# RBF-kernel support vector classifier, refitted on the oversampled training split.
SVM_Model_os = svm.SVC(kernel='rbf')
SVM_Model_os.fit(X_train_os, np.ravel(y_train_os))
# Persist the fitted model so the comparison step can reload it by name.
FName = f'{dest}SVM_rbf_os.joblib'
dump(SVM_Model_os, FName)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with a fixed seed, refitted on the oversampled training split.
Adaboost_Model_os = AdaBoostClassifier(random_state=0)
Adaboost_Model_os.fit(X_train_os, np.ravel(y_train_os))
# Persist the fitted model so the comparison step can reload it by name.
FName = f'{dest}Adaboost_os.joblib'
dump(Adaboost_Model_os, FName)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with a fixed seed, refitted on the oversampled training split.
RFC_Model_os = RandomForestClassifier(n_jobs=-1, random_state=0)
RFC_Model_os.fit(X_train_os, np.ravel(y_train_os))
# Persist the fitted model so the comparison step can reload it by name.
FName = f'{dest}RFC_os.joblib'
dump(RFC_Model_os, FName)

## Step 5 – test prediction accuracy after resampling

In [None]:
# Reload the three models fitted on the oversampled data, score them on the
# oversampled test split and collect accuracy plus the confusion-matrix cells.
Mod_List=['SVM_rbf_os','Adaboost_os','RFC_os']
# Rows are gathered in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
Score_Rows = []
for mod in Mod_List:
    # These files were written by the cells above (joblib files are pickle-based,
    # so only load ones you created yourself).
    Test_Mod = load(dest + mod + '.joblib')
    y_pred = Test_Mod.predict(X_test_os)
    Accu = accuracy_score(y_test_os, y_pred)
    # Rows are the true class, columns the predicted class: [[TN, FP], [FN, TP]].
    Conf_Mat = confusion_matrix(y_test_os, y_pred).tolist()
    (TN, FP), (FN, TP) = Conf_Mat
    Score_Rows.append({'Model': mod, 'Accuracy': Accu,
                       'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN})

Compare_Table = pd.DataFrame(Score_Rows, columns=['Model','Accuracy','TP','FP','FN','TN'])
Compare_Table = Compare_Table.sort_values(by='Accuracy', ascending=False)
print(Compare_Table)

The accuracy of all 3 models improved with the resampled data. The best algorithm is RFC (86.13%).

## Step 6 – Find the best hyperparameters for the most accurate algorithm from the previous step

In [None]:
from pprint import pprint
#- list the parameters of a forest built the same way as in the earlier steps -#
# Only random_state and n_jobs are set; every other value shown is the library default.
Default_RFC = RandomForestClassifier(n_jobs=-1, random_state=0)
pprint(Default_RFC.get_params())

create a dictionary with lists of hyper parameters to test

In [None]:
# Candidate hyperparameter values for the random-forest grid search; every
# combination of the lists below is evaluated.
# 'sqrt' replaces 'auto': for classifiers 'auto' meant sqrt(n_features), and the
# 'auto' option was removed in scikit-learn 1.3, so 'sqrt' selects the same
# behaviour on old and new versions.
# NOTE(review): warm_start presumably has no effect on the freshly cloned
# estimators that GridSearchCV fits - consider dropping it to halve the grid.
Param_grid = {'bootstrap': [False,True],
 'max_features': ['sqrt','log2'],
 'criterion': ['gini','entropy'],
 'n_estimators': [500,1000],
 'warm_start': [False,True]}

Run the grid search to find the best hyperparameters among the options listed in the dictionary above.

In [None]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over Param_grid for a seeded random forest, fitted on the
# oversampled training split; both the forest and the search use all cores.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
    param_grid=Param_grid,
    n_jobs=-1,
)
grid_search.fit(X_train_os, np.ravel(y_train_os))
# Last expression of the cell: shows the winning parameter combination.
grid_search.best_params_

test the best hyper parameters on the data

Note: only `n_estimators` and `bootstrap` changed compared with the defaults.

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
# is its replacement and is also available in older releases.
from sklearn.metrics import ConfusionMatrixDisplay
# Refit the forest with the hyperparameters selected by the grid search.
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto' option
# was removed in scikit-learn 1.3.
RFC_Model_BestTest = RandomForestClassifier(bootstrap=False,
                                            criterion='gini',
                                            max_features='sqrt',
                                            n_estimators=1000,
                                            warm_start=False,
                                            random_state=0, n_jobs=-1).fit(X_train_os, y_train_os.ravel())
FName = dest + 'RFC_BestTest.joblib'
dump(RFC_Model_BestTest, FName)
# Score the tuned model on the oversampled test split.
y_pred = RFC_Model_BestTest.predict(X_test_os)
print(accuracy_score(y_test_os, y_pred))
# The plot reuses y_pred instead of predicting the test split a second time.
ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test_os, y_pred),
                       display_labels=RFC_Model_BestTest.classes_).plot()
plt.show()

After changing the `bootstrap` and `n_estimators` hyperparameters, the prediction accuracy changed slightly, to 86.52%.