In [2]:
# Import modules
import numpy as np
import pandas as pd
import scipy
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns

# helpers
%matplotlib inline

In [3]:
# SKLearn modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_digits
from sklearn.tree import plot_tree
from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import export_graphviz

In [4]:
# Import the data set
df = pd.read_csv('credit1.csv')
credit = df
credit.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,0
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,0
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,1
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,1
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,1


In [5]:
# set up array for models that will be compared - age_label is the dependant variable
# Added GradientBoostingClassifier and AdaBoostClassifier as suggest by the POA
algos_Class = []
algos_Class.append(('Random Forest Classifier', RandomForestClassifier()))
algos_Class.append(('Decision Tree Classifier', DecisionTreeClassifier()))
algos_Class.append(('Gradient Boosting Classifier', GradientBoostingClassifier()))
algos_Class.append(('Ada Boost Classifier', AdaBoostClassifier()))
algos_Class.append(('K Nearest Neighbors', KNeighborsClassifier(n_neighbors=3)))

# Cross Validation
## CREDIT_BAL is dependent variable
## Using all features

In [5]:
# Discretize amount
discr_limit = ([9999, 50000, 100000, 200000, 300000, 400000, 500000, 1000000])
credit['limit_label'] = pd.cut(credit['LIMIT_BAL'], discr_limit, labels = False)

In [6]:
# Set LIMIT_BAL as the dependent variable
y=credit['limit_label']
y.head(5)

0    0
1    2
2    1
3    0
4    0
Name: limit_label, dtype: int64

In [11]:
# Use all features for independent variables
X = credit.iloc[:,2:25]
X.head(5)

Unnamed: 0,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,2,2,1,24,2,2,-1,-1,-2,-2,...,0,0,0,0,689,0,0,0,0,0
1,2,2,2,26,-1,2,0,0,0,2,...,3272,3455,3261,0,1000,1000,1000,0,2000,0
2,2,2,2,34,0,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,1
3,2,2,1,37,0,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,1
4,1,2,1,57,-1,0,-1,0,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,1


In [12]:
results = []
names = []
for name, model in algos_Class:
    result = cross_val_score(model, X, y, cv=3, scoring='accuracy')
    names.append(name)
    results.append(result)

In [13]:
for i in range(len(names)):
   print(names[i],results[i].mean())
# All models are poor, with the ability to predict, at best, ~50%
# Random Forest Classifier is the best performer at just over 50%
# This indicates none of the models are useful to predict how much credit to offer

Random Forest Classifier 0.5133000000000001
Decision Tree Classifier 0.4056
Gradient Boosting Classifier 0.5188333333333334
Ada Boost Classifier 0.4794333333333333
K Nearest Neighbors 0.4443


# LIMIT_BAL as the dependent variable
## Random Forest Classifier - Depth = 3

In [14]:
# define features and independant variable
# Define x - all features
X = credit.iloc[:,2:25]

# Define y
discr_limit = ([9999, 50000, 100000, 200000, 300000, 400000, 500000, 1000000])
credit['limit_label'] = pd.cut(credit['LIMIT_BAL'], discr_limit, labels = False)
y=credit['limit_label']

In [15]:
# Train/Test/Split
X_train, X_test, y_train, y_test = train_test_split (X,y,
                                                    test_size = .30, random_state = 321)

In [16]:
# Select RandomForestClassifier as model
algo=RandomForestClassifier()

In [17]:
# Modeling - depth = 3
rfc_all = RandomForestClassifier(max_depth = 3,
           #   min_samples_split = 10, 
              criterion = "gini")
rfc_fit_store = rfc_all.fit(X_train, y_train)

In [18]:
# Predictions
rfc_preds = rfc_fit_store.predict(X_test)

In [19]:
# Print out results
print(classification_report(y_test, rfc_preds,
                           target_names=['9,999-49,999','50,000-99,999','100,000-199,999',
                                         '200,000-299,999', '300,000-399,999',
                                         '400,000-499,999', '500,000-1,000,000'],
                           zero_division=0))

# Poor predictor, right at 50%, the same as tossing a coin

                   precision    recall  f1-score   support

     9,999-49,999       0.56      0.87      0.68      2301
    50,000-99,999       0.61      0.32      0.42      1468
  100,000-199,999       0.39      0.70      0.50      2364
  200,000-299,999       0.43      0.10      0.16      1567
  300,000-399,999       0.18      0.00      0.01       764
  400,000-499,999       0.00      0.00      0.00       472
500,000-1,000,000       0.00      0.00      0.00        64

         accuracy                           0.48      9000
        macro avg       0.31      0.28      0.25      9000
     weighted avg       0.43      0.48      0.40      9000



#  Cross Validation
## DEFAULT is dependent variable
#### Using all features

In [20]:
# Use all features for independent variables
X = credit.iloc[:,1:24]
X.head(5)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000,2,2,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [21]:
# Set LIMIT_BAL as the dependent variable
y=credit['DEFAULT']

In [22]:
# Train/Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = .30, random_state = 321)

In [23]:
# set up array for models that will be compared - age_label is the dependant variable
# Added GradientBoostingClassifier and AdaBoostClassifier as suggest by the POA
algos_Class = []
algos_Class.append(('Random Forest Classifier', RandomForestClassifier()))
algos_Class.append(('Decision Tree Classifier', DecisionTreeClassifier()))
algos_Class.append(('Gradient Boosting Classifier', GradientBoostingClassifier()))
algos_Class.append(('Ada Boost Classifier', AdaBoostClassifier()))
algos_Class.append(('K Nearest Neighbors', KNeighborsClassifier(n_neighbors=3)))

In [24]:
results = []
names = []
for name, model in algos_Class:
        result = cross_val_score(model, X, y, cv=5, scoring='accuracy')
        names.append(name)
        results.append(result)

In [25]:
for i in range(len(names)):
   print(names[i],results[i].mean())
# Much better results.  GradientBoosting Classifier seems to be the best option

Random Forest Classifier 0.8150999999999999
Decision Tree Classifier 0.7244
Gradient Boosting Classifier 0.8205333333333333
Ada Boost Classifier 0.8175000000000001
K Nearest Neighbors 0.7368333333333333


## Gradient Boosting Classifier
###  Using DEFAULT as the Dependent Variable

In [29]:
# Use all features for independent variables
X = credit.iloc[:,1:24]
X.head(5)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000,2,2,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [30]:
# Set LIMIT_BAL as the dependent variable
y=credit['DEFAULT']

In [31]:
# Train/Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = .30, random_state = 321)

In [32]:
# gbc modeling - depth = 3
gbc3 = GradientBoostingClassifier(max_depth = 3)
gbc_fit_3 = gbc3.fit(X_train, y_train)

In [34]:
# gbc_fit_3 predictions
gbc_3_preds = gbc_fit_3.predict(X_test)
print(classification_report(y_test, gbc_3_preds,
                           target_names=['Default', 'Not Default'], 
                           zero_division=0))

              precision    recall  f1-score   support

     Default       0.65      0.38      0.48      1948
 Not Default       0.85      0.94      0.89      7052

    accuracy                           0.82      9000
   macro avg       0.75      0.66      0.68      9000
weighted avg       0.80      0.82      0.80      9000



#### Use only Customer information features

In [35]:
# Use only Customer Demographic features
X = credit.iloc[:,1:6]
y=credit['DEFAULT']
X.head(5)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE
0,20000,2,2,1,24
1,120000,2,2,2,26
2,90000,2,2,2,34
3,50000,2,2,1,37
4,50000,1,2,1,57


In [38]:
# gbc modeling - depth = 3 - Customer Demographic features
gbcdemo = GradientBoostingClassifier(max_depth = 3)
gbc_fit_demo = gbcdemo.fit(X_train, y_train)

In [39]:
# gbc_fit_3 predictions
gbc_demo_preds = gbc_fit_demo.predict(X_test)
print(classification_report(y_test, gbc_demo_preds,
                           target_names=['Default', 'Not Default'], 
                           zero_division=0))

              precision    recall  f1-score   support

     Default       0.65      0.38      0.48      1948
 Not Default       0.85      0.94      0.89      7052

    accuracy                           0.82      9000
   macro avg       0.75      0.66      0.68      9000
weighted avg       0.80      0.82      0.80      9000



### Use only Pay_ Features

In [41]:
# Use only PAY_ features
X = credit.iloc[:,6:12]
y=credit['DEFAULT']
X.head(5)

Unnamed: 0,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6
0,2,2,-1,-1,-2,-2
1,-1,2,0,0,0,2
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,-1,0,-1,0,0,0


In [44]:
# gbc modeling - depth = 3 - Customer PAY_ features
gbcpay = GradientBoostingClassifier(max_depth = 3)
gbc_fit_pay = gbcpay.fit(X_train, y_train)

In [45]:
# gbc_fit_3 predictions
gbc_pay_preds = gbc_fit_pay.predict(X_test)
print(classification_report(y_test, gbc_pay_preds,
                           target_names=['Default', 'Not Default'], 
                           zero_division=0))

              precision    recall  f1-score   support

     Default       0.65      0.38      0.48      1948
 Not Default       0.85      0.94      0.89      7052

    accuracy                           0.82      9000
   macro avg       0.75      0.66      0.69      9000
weighted avg       0.80      0.82      0.80      9000



# Random Forest Classifier
## Run model through different depths, 1-20

In [46]:
# Use only Customer Demographic features
X = credit.iloc[:,2:24]
y=credit['DEFAULT']
X.head(5)

Unnamed: 0,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,2,2,1,24,2,2,-1,-1,-2,-2,...,689,0,0,0,0,689,0,0,0,0
1,2,2,2,26,-1,2,0,0,0,2,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,2,2,2,34,0,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,2,2,1,37,0,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,1,2,1,57,-1,0,-1,0,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [47]:
# Train/Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = .30, random_state = 321)
train_scores, test_scores =list(), list()

In [67]:
# Execute the model up to a depth of 20
values = [i for i in range(1,21)]

for i in values:
    model = RandomForestClassifier(max_depth = i)
    model.fit(X_train, y_train)
    train_yhat = model.predict(X_train)
    train_acc = accuracy_score(y_train, train_yhat)
    train_scores.append(train_acc)
    test_yhat = model.predict(X_test)
    test_acc=accuracy_score(y_test, test_yhat)
    test_scores.append(test_acc)
    print('>%d, train: %.3f, test: %.3f' % (i, train_acc, test_acc))
# Begins overfitting at a depth of around 8-9.  Will use max_depth = 8

>1, train: 0.784, test: 0.785
>2, train: 0.805, test: 0.804
>3, train: 0.810, test: 0.807
>4, train: 0.814, test: 0.810
>5, train: 0.818, test: 0.811
>6, train: 0.827, test: 0.817
>7, train: 0.833, test: 0.820
>8, train: 0.839, test: 0.820
>9, train: 0.848, test: 0.820
>10, train: 0.858, test: 0.820
>11, train: 0.871, test: 0.818
>12, train: 0.878, test: 0.819
>13, train: 0.889, test: 0.819
>14, train: 0.896, test: 0.818
>15, train: 0.904, test: 0.820
>16, train: 0.912, test: 0.820
>17, train: 0.917, test: 0.820
>18, train: 0.926, test: 0.818
>19, train: 0.936, test: 0.817
>20, train: 0.943, test: 0.819


## Random Forest Classifier
### Additional testing
### max_depth = 8, per above

#### Use all features

In [20]:
# Use all available features for independent variables
X = credit.iloc[:,1:24]

# Set DEFAULT as the dependent variable
y=credit['DEFAULT']

X.head(5)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000,2,2,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [21]:
# Train/Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = .30, random_state = 321)

In [22]:
# rfc modeling - depth = 8 - all features
rfcpay = RandomForestClassifier(max_depth = 8)
rfc_fit_pay = rfcpay.fit(X_train, y_train)

In [23]:
# rfc_fit_3 predictions
rfc_pay_preds = rfc_fit_pay.predict(X_test)
print(classification_report(y_test, rfc_pay_preds,
                           target_names=['Default', 'Not Default'], 
                           zero_division=0))

# Overall accuracy - 82%  
#    Correctly ID rate - Default 68%, Not default 84%
#    False postive rate - Default 65%, Not default 5%
#    Pretty good model

              precision    recall  f1-score   support

     Default       0.68      0.35      0.46      1989
 Not Default       0.84      0.95      0.89      7011

    accuracy                           0.82      9000
   macro avg       0.76      0.65      0.67      9000
weighted avg       0.80      0.82      0.80      9000



#### PAY_ features

In [11]:
# Use PAY_ features for independent variables
X = credit.iloc[:,6:12]
X.head(5)

Unnamed: 0,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6
0,2,2,-1,-1,-2,-2
1,-1,2,0,0,0,2
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,-1,0,-1,0,0,0


In [12]:
# Set DEFAULT as the dependent variable
y=credit['DEFAULT']

In [13]:
# Train/Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = .30, random_state = 321)

In [14]:
# rfc modeling - depth = 8 - Customer PAY_ features
rfcpay = RandomForestClassifier(max_depth = 8)
rfc_fit_pay = rfcpay.fit(X_train, y_train)

In [15]:
# rfc_fit_3 predictions
rfc_pay_preds = rfc_fit_pay.predict(X_test)
print(classification_report(y_test, rfc_pay_preds,
                           target_names=['Default', 'Not Default'], 
                           zero_division=0))
# Essentially the same as when using all features

              precision    recall  f1-score   support

     Default       0.68      0.36      0.47      1989
 Not Default       0.84      0.95      0.89      7011

    accuracy                           0.82      9000
   macro avg       0.76      0.65      0.68      9000
weighted avg       0.80      0.82      0.80      9000



#### Demographic Features

In [24]:
# Use Customer demographics features for independent variables
X = credit.iloc[:,1:6]
X.head(5)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE
0,20000,2,2,1,24
1,120000,2,2,2,26
2,90000,2,2,2,34
3,50000,2,2,1,37
4,50000,1,2,1,57


In [25]:
# Set DEFAULT as the dependent variable
y=credit['DEFAULT']

In [26]:
# Train/Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = .30, random_state = 321)

In [27]:
# rfc modeling - depth = 8 
rfcpay = RandomForestClassifier(max_depth = 8)
rfc_fit_pay = rfcpay.fit(X_train, y_train)

In [28]:
# predictions
rfc_pay_preds = rfc_fit_pay.predict(X_test)
print(classification_report(y_test, rfc_pay_preds,
                           target_names=['Default', 'Not Default'], 
                           zero_division=0))

# Not as good of results as with all features and just the PAY_ features

              precision    recall  f1-score   support

     Default       0.31      0.00      0.00      1989
 Not Default       0.78      1.00      0.88      7011

    accuracy                           0.78      9000
   macro avg       0.55      0.50      0.44      9000
weighted avg       0.68      0.78      0.68      9000



#### Bill Amount features

In [29]:
# Use Customer demographics features for independent variables
X = credit.iloc[:,12:18]

# Set DEFAULT as the dependent variable
y=credit['DEFAULT']

X.head(5)

Unnamed: 0,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6
0,3913,3102,689,0,0,0
1,2682,1725,2682,3272,3455,3261
2,29239,14027,13559,14331,14948,15549
3,46990,48233,49291,28314,28959,29547
4,8617,5670,35835,20940,19146,19131


In [30]:
# Train/Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = .30, random_state = 321)

In [31]:
# rfc modeling - depth = 8 
rfcpay = RandomForestClassifier(max_depth = 8)
rfc_fit_pay = rfcpay.fit(X_train, y_train)

In [32]:
# rfc_fit_3 predictions
rfc_pay_preds = rfc_fit_pay.predict(X_test)
print(classification_report(y_test, rfc_pay_preds,
                           target_names=['Default', 'Not Default'], 
                           zero_division=0))

# Not as good of results as with all features and just PAY_ features

              precision    recall  f1-score   support

     Default       0.00      0.00      0.00      1989
 Not Default       0.78      1.00      0.88      7011

    accuracy                           0.78      9000
   macro avg       0.39      0.50      0.44      9000
weighted avg       0.61      0.78      0.68      9000



#### PAY_AMT Features

In [33]:
# Use Customer demographics features for independent variables
X = credit.iloc[:,18:24]

# Set DEFAULT as the dependent variable
y=credit['DEFAULT']

X.head(5)

Unnamed: 0,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,0,689,0,0,0,0
1,0,1000,1000,1000,0,2000
2,1518,1500,1000,1000,1000,5000
3,2000,2019,1200,1100,1069,1000
4,2000,36681,10000,9000,689,679


In [43]:
# Train/Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = .30, random_state = 321)

In [44]:
# rfc modeling - depth = 8 - Customer PAY_ features
rfcpay = RandomForestClassifier(max_depth = 8)
rfc_fit_pay = rfcpay.fit(X_train, y_train)

In [45]:
# rfc_fit_3 predictions
rfc_pay_preds = rfc_fit_pay.predict(X_test)
print(classification_report(y_test, rfc_pay_preds,
                           target_names=['Default', 'Not Default'], 
                           zero_division=0))

# Not as good of results as with the PAY_ features

              precision    recall  f1-score   support

     Default       0.59      0.02      0.03      1989
 Not Default       0.78      1.00      0.88      7011

    accuracy                           0.78      9000
   macro avg       0.68      0.51      0.45      9000
weighted avg       0.74      0.78      0.69      9000



## Best option is to use Random Forest Classifier
### max_depth =8