In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression

In [2]:
# Load dataset
df = pd.read_csv('../data/SBA_cleaned_data.csv')

df_copy = df.copy()

df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891424 entries, 0 to 891423
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Unnamed: 0    891424 non-null  int64 
 1   NAICS         891424 non-null  int64 
 2   ApprovalFY    891424 non-null  int64 
 3   Term          891424 non-null  int64 
 4   NewExist      891424 non-null  int64 
 5   HasFranchise  891424 non-null  int64 
 6   UrbanRural    891424 non-null  int64 
 7   RevLineCr     891424 non-null  int64 
 8   Default       891424 non-null  int64 
 9   GrAppv        891424 non-null  int64 
 10  Industry      692257 non-null  object
dtypes: int64(10), object(1)
memory usage: 74.8+ MB


## Build Model

In [3]:
# Establish target and feature fields
use_cols = ['NAICS', 'ApprovalFY', 'Term', 'NewExist','HasFranchise','UrbanRural','RevLineCr','GrAppv']
y = df_copy.Default
X = df_copy[use_cols]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=520)

In [None]:
from sklearn.ensemble import RandomForestClassifier
#initialize the model
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)

In [20]:
y_rfpred = rfc.predict(X_test)
print(classification_report(y_test,y_rfpred,digits=3))

              precision    recall  f1-score   support

           0      0.953     0.970     0.961    183215
           1      0.847     0.780     0.812     39641

    accuracy                          0.936    222856
   macro avg      0.900     0.875     0.887    222856
weighted avg      0.934     0.936     0.935    222856



In [21]:
# use cross validation to improve
from sklearn.model_selection import GridSearchCV
params = {
    'n_estimators': [50, 100, 120],
    'max_depth': [15, 20, 25],
    'criterion': ["gini", "entropy"]
}
grid_search = GridSearchCV(RandomForestClassifier(), 
                           param_grid=params, 
                           refit=True,
                           cv=5, n_jobs=-1, verbose=1, scoring = "balanced_accuracy").fit(X_train,y_train)

print(f'bt best hyperparams      : {grid_search.best_params_}')
print(f'bt best mean cv accuracy : {grid_search.best_score_:.5f}')

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 39.8min finished


bt best hyperparams      : {'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 120}
bt best mean cv accuracy : 0.87986


In [15]:
rfc_improved = RandomForestClassifier(criterion = 'entropy', max_depth = 20, n_estimators = 100).fit(X_train,y_train)
y_rfcim_pred = rfc_improved.predict(X_test)
print(classification_report(y_test,y_rfcim_pred,digits=3))

              precision    recall  f1-score   support

           0      0.956     0.971     0.963    183215
           1      0.855     0.792     0.822     39641

    accuracy                          0.939    222856
   macro avg      0.905     0.882     0.893    222856
weighted avg      0.938     0.939     0.938    222856



In [23]:
rfc_improveb = RandomForestClassifier(criterion = 'entropy', max_depth = 20, n_estimators = 120).fit(X_train,y_train)
y_rfcimb_pred = rfc_improveb.predict(X_test)

              precision    recall  f1-score   support

           0      0.956     0.971     0.963    183215
           1      0.854     0.791     0.821     39641

    accuracy                          0.939    222856
   macro avg      0.905     0.881     0.892    222856
weighted avg      0.937     0.939     0.938    222856



In [27]:
from sklearn import metrics
metrics.balanced_accuracy_score(y_test,y_rfcimb_pred)

0.8809388127257463

In [28]:
metrics.accuracy_score(y_test,y_rfcimb_pred)

0.9388169939333022

In [37]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
gbc.fit(X_train,y_train)

GradientBoostingClassifier(n_estimators=10)

In [38]:
y_gbpred = gbc.predict(X_val)
print(classification_report(y_val,y_gbpred,digits=3))

              precision    recall  f1-score   support

           0      0.874     0.980     0.924     89705
           1      0.871     0.483     0.621     24530

    accuracy                          0.874    114235
   macro avg      0.872     0.732     0.773    114235
weighted avg      0.873     0.874     0.859    114235

