In [10]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

import pandas as pd 
pd.set_option('display.max_columns', None)

In [21]:
df = pd.read_csv('../../cleaned_accepted_2007_to_2015Q4.csv.gz')
df.drop(columns='Unnamed: 0', inplace=True)

In [22]:
df = df[df['issue_year'] < 2016]

#df = df[df['term'] != ' 60 months']

df = df[df['grade'].isin(['1','2'])]

In [23]:
df.shape

(355041, 33)

In [24]:
X = df[['grade', 'purpose', 'annual_inc', 'dti', 'term',
       'emp_length', 'funded_amnt', 'inq_fi', 'inq_last_12m', 'installment',
       'int_rate', 'loan_amnt', 'open_acc', 'percent_bc_gt_75',
       'pub_rec', 'pub_rec_bankruptcies', 'tax_liens', 'total_acc',
       'delinq_amnt', 'delinq_2yrs', 'mths_since_last_delinq',
       'mths_since_last_major_derog', 'cr_yeargap', 'avg_fico_score',
       'home_ownership_cat', 'application_type_cat', 'purpose_cat',
       'sub_grade_cat', 'home_ownership', 'annual_inc']].copy()
    
y = df['loan_status']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27, shuffle=True)

In [25]:
gbm_cd = GradientBoostingClassifier()
gbm_cd.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [26]:
print('The train accuracy is %.3f' %(gbm_cd.score(X_train, y_train)))
print('The train accuracy is %.3f' %(gbm_cd.score(X_test, y_test)))

predictions = gbm_cd.predict(X_test)
cm = confusion_matrix(y_test, predictions)
print(cm)

The train accuracy is 0.911
The train accuracy is 0.911
[[    0  6351]
 [    0 64658]]


## Hyperparameter Tuning

In [9]:
# gbm_cd_tuned = GradientBoostingClassifier()
# parameters = {
#     "n_estimators":[5,25,125,625,1300],
#     "max_depth":[1,2,3,4,7],
#     "learning_rate":[0.01,0.1,1,10]
# }

In [13]:
# cv = GridSearchCV(gbm_cd_tuned, parameters, cv=5, n_jobs=7)
# cv.fit(X_train, y_train)


GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
         

In [15]:
# print('Tuned hyperparameters : (best parameters) ', cv.best_params_)
# print('accuracy: ', cv.best_score_)

Tuned hyperparameters : (best parameters)  {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 625}
accuracy:  0.803248267296841


## Run with the short list of features

In [18]:
X_cut = df[['issue_year', 'loan_amnt', 'funded_amnt', 'installment', 'sub_grade_cat', 'int_rate',
            'percent_bc_gt_75', 'avg_fico_score', 'mths_since_last_delinq', 'mths_since_last_major_derog', 
            'cr_yeargap', 'dti', 'emp_length', 'inq_last_12m', 'open_acc', 'total_acc', 'delinq_amnt', 'term']]

y = df['loan_status']
    
X_cut_train, X_cut_test, y_cut_train, y_cut_test = train_test_split(X_cut, y, test_size=0.2, random_state=27, shuffle=True)

In [27]:
gbm_cd_cut = GradientBoostingClassifier(learning_rate=0.1, max_depth = 3, n_estimators=500)
gbm_cd_cut.fit(X_cut_train, y_cut_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [28]:
print('The train accuracy is %.3f' %(gbm_cd_cut.score(X_cut_train, y_cut_train)))
print('The train accuracy is %.3f' %(gbm_cd_cut.score(X_cut_test, y_cut_test)))

predictions = gbm_cd.predict(X_test)
cm = confusion_matrix(y_test, predictions)
print(cm)

The train accuracy is 0.804
The train accuracy is 0.802
[[    0  6351]
 [    0 64658]]
