# 3. Model Implementation 

#### Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import matplotlib.pyplot as plt 
import seaborn as sns

file_path=r'C:\Users\Marco\Desktop\TU Dublin\Programming for Big Data - H6018\2nd Assignment\data'
file_name='/credit_card_default.xls'

df = pd.read_excel(file_path+file_name, encoding = "ISO-8859-1")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

#### Data Manipulation & Preprocessing

In [2]:
#checking duplicates for ID and drop it
df.ID.duplicated().sum()
df.drop(['ID'], axis=1, inplace=True)
#rename columns
df = df.rename(columns = {'default payment next month' : 'def_next_month', 
                          'PAY_0' : 'PAY_1'})
df.def_next_month.value_counts()
#Correct data inconsistencies 
# MARRIAGE = 0 is deleted
df = df.drop(df[df['MARRIAGE']==0].index)
# EDUCATION = 0, 5 and 6 are deleted
df = df.drop(df[df['EDUCATION']==0].index)
df = df.drop(df[df['EDUCATION']==5].index)
df = df.drop(df[df['EDUCATION']==6].index)
#Fixing PAY variables
for att in ['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']:
  # categories -2,-1 are grouped into a single class -1: pay duty   
  filter = (df[att] == -2) | (df[att] == -1) 
  df.loc[filter, att] = -1
  df[att] = df[att].astype('int64')
  filter = (df[att] >= 0)
  df.loc[filter, att] = df.loc[filter, att] + 1

In [3]:
# Set 'category' type to categorical attributes
for att in ['SEX', 'EDUCATION', 'MARRIAGE']:
  df[att] = df[att].astype('category')

# one-hot encoding
df_encoded=pd.concat([pd.get_dummies(df['SEX'], prefix='SEX'),
                pd.get_dummies(df['EDUCATION'], prefix='EDUCATION'), 
                pd.get_dummies(df['MARRIAGE'], prefix='MARRIAGE'),
                df],axis=1)
# drop original columns
df_encoded.drop(['EDUCATION'],axis=1, inplace=True)
df_encoded.drop(['SEX'],axis=1, inplace=True)
df_encoded.drop(['MARRIAGE'],axis=1, inplace=True)

# drop response variable and the hot encoded variables
df_drop_var = df_encoded.drop(['def_next_month', 'SEX_1','SEX_2','EDUCATION_1','EDUCATION_2','EDUCATION_3','EDUCATION_4',
                               'MARRIAGE_1','MARRIAGE_2','MARRIAGE_3'],axis=1)

df_scaled = df_drop_var/df_drop_var.std()

#concatenating the encoded variables with the scaled variables
df_encoded_only = df_encoded.iloc[:,:9]
df_prep = pd.concat([df_encoded_only, df_scaled, df.def_next_month],axis=1)

### Train, Validation and Test Split

Finally, we split the data into train, validation and test sets. 

In [4]:
# Splitting Train, test dataset
from sklearn.model_selection import train_test_split

X_train, X_NOT_train, y_train, y_NOT_train = train_test_split(df_prep.drop(['def_next_month'],axis=1),
                                                                        df_prep.def_next_month,test_size=0.3,
                                                                        random_state=101)

# split 30% groups into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_NOT_train,y_NOT_train,test_size=0.5,
                                                                    random_state=101)

Check all the shapes to make sure that everything has worked out okay.

In [5]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(y_val.shape)
print(y_test.shape)

(20720, 29)
(4440, 29)
(4441, 29)
(4440,)
(4441,)


### Models

The following models will be explored:

* Random Forest

* KNN
* Decision Tree
* Decision Tree with Resampled data

### Random Forest

We can start applying Random Forest with gridSearch for hyperparameter tuning, we can observe that model can be improved in prediciting Default Group. Especially the precision results. This is probably due to the unbalanced dataset.

In [108]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix,classification_report

In [109]:
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'macro'),
           'recall': make_scorer(recall_score, average = 'macro'),
           'f1': make_scorer(f1_score, average = 'macro')}

In [110]:
forest = RandomForestClassifier()

grid_values = {'n_estimators': [10, 30, 50, 100],
               'max_features': ['sqrt', 0.25, 0.5, 0.75, 1.0],
               'max_depth' : [4,5,6,7,8],
              }


grid_search_rfc = GridSearchCV(forest, param_grid = grid_values, scoring = scoring, refit='f1',)
grid_search_rfc.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['sqrt', 0.25, 0.5, 0.75, 1.0],
                         'n_estimators': [10, 30, 50, 100]},
             refit='f1',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score, average=macro),
                      'precision': make_scorer(precision_score, average=macro),
                      'recall': make_scorer(recall_score, average=macro)})

In [111]:
grid_search_rfc.best_params_

{'max_depth': 8, 'max_features': 1.0, 'n_estimators': 100}

In [112]:
forest = RandomForestClassifier(max_depth=8, max_features=1.0, n_estimators=100)

In [113]:
forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, max_features=1.0)

In [115]:
predRF = forest.predict(X_val)

In [116]:
print('Random Forest')
print(classification_report(predRF,y_test))
print('\n')

Random Forest
              precision    recall  f1-score   support

           0       0.81      0.69      0.75      3156
           1       0.19      0.31      0.24       734

    accuracy                           0.62      3890
   macro avg       0.50      0.50      0.49      3890
weighted avg       0.69      0.62      0.65      3890





### KNN

Now, we can try to apply a KNN model using repeated stratified cross validation in order to preserve the percentage of samples for each class.

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [17]:
k_range = range(2, 10)
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    k_scores.append(scores.mean())

In [18]:
k_scores

[0.7811293436293438,
 0.7698841698841699,
 0.7907818532818532,
 0.788465250965251,
 0.7951254826254825,
 0.7942084942084942,
 0.8001447876447877,
 0.7974903474903476]

We can test the validity of this model with the training and validation set, and Cohen's Kappa. We will select 6 neighbors - though it doesn't seem to matter too much. 

Commonly, KNN is fitted with the training set and used to predict the validation set. Then a new KNN is fitted to the validation set, and the two models are compared to see how generaliseable the KNN is. 

In [19]:
knn1 = KNeighborsClassifier(n_neighbors=6)
knn1.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=6)

In [21]:
preds = knn1.predict(X_val)
print(confusion_matrix(preds, y_val))

[[3309  704]
 [ 174  253]]


In [22]:
knn2 = KNeighborsClassifier(n_neighbors = 6)
knn2.fit(X_val, y_val)

KNeighborsClassifier(n_neighbors=6)

In [23]:
preds_VAL = knn2.predict(X_val)
print(confusion_matrix(preds_VAL, y_val))

[[3403  678]
 [  80  279]]


Just by comparing the two confusion matrices, we can see that the numbers are similar. There is a small increase in accuracy in the model trained on the validation set, but it is low enough to say that the clustering is generaliseable. 

We can also check the predictions with Cohen's Kappa. We get 47% agreement, which is low score.

In [25]:
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(preds_VAL, preds)

0.4699763198287036

### Decision Tree

Finally we can apply a decision tree model, we see that precision, recal and F1 are higher than in the Random Forest with gridsearch. 

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
tree1 = DecisionTreeClassifier()
tree1.fit(X_train, y_train)

DecisionTreeClassifier()

In [28]:
pred1 = tree1.predict(X_val)

In [103]:
print('Decision Tree')
print(classification_report(pred1,y_val))
print('\n')

Decision Tree
              precision    recall  f1-score   support

           0       0.80      0.83      0.81      3320
           1       0.43      0.36      0.39      1120

    accuracy                           0.72      4440
   macro avg       0.61      0.60      0.60      4440
weighted avg       0.70      0.72      0.71      4440





### Decision Tree with Resampling

As we have got always poor results in prediciting the minority class, we can try to resample the data.
We can try to increase it up to 30%.
As we can see, we got the best results for precision, recall and F1.

In [31]:
# resampling dataset
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [99]:
oversamp = SMOTE(sampling_strategy = 0.35)
undersamp = RandomUnderSampler(sampling_strategy = 0.45)
pipeline = Pipeline(steps = [('over', oversamp),('under', undersamp)])

X, y = pipeline.fit_resample(df_prep.drop(['def_next_month'],axis=1),df_prep.def_next_month)

In [100]:
print(y.sum()/len(y))

0.310350146537097


In [101]:
# resampled dataset
resampled_df = pd.concat([X, y],axis=1)

# Splitting Train, test dataset

X_train, X_NOT_train, y_train, y_NOT_train = train_test_split(resampled_df.drop(['def_next_month'],axis=1),
                                                                        resampled_df.def_next_month,test_size=0.3,
                                                                        random_state=101)

# split 30% groups into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_NOT_train,y_NOT_train,test_size=0.5,
                                                                    random_state=101)

In [105]:
tree1 = DecisionTreeClassifier()
tree1.fit(X_train, y_train)
pred1 = tree1.predict(X_val)
print('Decision Tree - Resampled')
print(classification_report(pred1,y_val))
print('\n')

Decision Tree - Resampled
              precision    recall  f1-score   support

           0       0.78      0.79      0.79      2651
           1       0.54      0.52      0.53      1239

    accuracy                           0.71      3890
   macro avg       0.66      0.66      0.66      3890
weighted avg       0.70      0.71      0.70      3890





### Conclusions
We tried to classify the credit card default using Random Forest, KNN and decision tree models.
We had the best results when we used the resampled data. This is probably the path to follow: performing more modeling on the resampled data.

As we saw in the data exploratory part, there were many predictors correlated among each other. A further experiment might be trying to drop them and keep only th ones with low correlation.
Lastly, we saw there were outliers. We could try to rerun the models without the outliers, or replacing them with the 5th and 95th percentile.