## Credit Card Default - Random Forest - Logan

In [52]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree

import matplotlib.pyplot as plt
import seaborn as sns

##### Explore data and try to find some strong predictors

In [53]:
data = pd.read_csv("UCI_Credit_Card.csv")
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


Check if there is anomalies in the data -- No missing values

In [54]:
data.isnull().sum()

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64

##### 75% Training Set, 25% Test Set

In [55]:
train_df, val_df = train_test_split(data, test_size=0.25, random_state=2019, shuffle=True)
train_df.head(10)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
19472,19473,60000.0,2,2,2,24,0,0,0,0,...,56273.0,28316.0,30763.0,2600.0,2900.0,2034.0,2000.0,3000.0,1000.0,0
22999,23000,110000.0,2,2,2,42,0,0,0,0,...,79421.0,78286.0,65875.0,3951.0,4276.0,2666.0,2811.0,2500.0,465.0,0
16147,16148,140000.0,2,3,1,34,0,0,0,0,...,137490.0,106322.0,104583.0,6300.0,7000.0,5421.0,4000.0,4000.0,4200.0,0
22329,22330,110000.0,2,1,2,27,0,0,0,0,...,34820.0,36005.0,25976.0,1772.0,1900.0,1815.0,2000.0,1500.0,2000.0,0
6203,6204,200000.0,2,1,2,29,0,0,0,0,...,3270.0,774.0,0.0,3000.0,1000.0,0.0,700.0,0.0,0.0,0
14682,14683,50000.0,1,1,2,25,0,0,2,2,...,49296.0,47220.0,49584.0,4100.0,400.0,3000.0,2.0,4005.0,6.0,0
29258,29259,70000.0,1,2,2,24,1,2,0,0,...,46918.0,53153.0,52174.0,0.0,20000.0,1807.0,7042.0,0.0,1925.0,0
22251,22252,60000.0,2,3,2,26,-1,2,-1,-1,...,6444.0,3783.0,3219.0,0.0,6472.0,6444.0,3219.0,0.0,0.0,0
11069,11070,80000.0,2,1,2,24,0,0,0,0,...,42663.0,43136.0,43676.0,3000.0,7139.0,6000.0,1694.0,2000.0,3000.0,0
25499,25500,90000.0,2,1,1,29,0,0,0,0,...,35670.0,41995.0,35912.0,10400.0,3400.0,36139.0,618.0,3200.0,8500.0,0


In [56]:
#Ratio of default in training data
print( float(train_df["default.payment.next.month"].sum()) / len(train_df["default.payment.next.month"]) )

#Ratio of default in validation data
print( float(val_df["default.payment.next.month"].sum()) / len(val_df["default.payment.next.month"]) )

0.224088888889
0.212533333333


### Decision Tree

In [57]:
target = 'default.payment.next.month'
predictors = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 
              'BILL_AMT1','BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
clf_dt = DecisionTreeClassifier()
clf_dt.fit(train_df[predictors], train_df[target].values)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [58]:
clf_dt.predict(val_df[predictors])

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

In [63]:
print len(val_df[clf_dt.predict(val_df[predictors]) == val_df['default.payment.next.month']])
print len(val_df)
print len(val_df[clf_dt.predict(val_df[predictors]) == val_df['default.payment.next.month']])/float(len(val_df))

5448
7500
0.7264


# 72.64%

### Random Forest

In [66]:
clf_rf = RandomForestClassifier(n_estimators = 2)
clf_rf.fit(train_df[predictors], train_df[target].values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [69]:
clf_rf.predict(val_df[predictors])

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [71]:
print len(val_df[clf_rf.predict(val_df[predictors]) == val_df['default.payment.next.month']])
print len(val_df)
print len(val_df[clf_rf.predict(val_df[predictors]) == val_df['default.payment.next.month']])/float(len(val_df))

5928
7500
0.7904


# 79.04%