In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics 
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions

sns.set_style('darkgrid')
%matplotlib inline

In [2]:
# "DisbursementDate" for duplication
sba = pd.read_csv('../data/SBAnational.csv', usecols=['City','Bank','NAICS','ApprovalFY','Term','NewExist','FranchiseCode','UrbanRural','RevLineCr','DisbursementDate','MIS_Status','ChgOffPrinGr','GrAppv','SBA_Appv'])
df = sba.copy()
df.City = df.City.str.upper()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# drop na (relatively NOT large data loss)
df.dropna(inplace=True)
# drop duplication
df.drop_duplicates(subset=None,keep='first',inplace=True)

In [4]:
# keep first 2 digits of NAICS
df.NAICS = pd.to_numeric(df.NAICS.astype(str).str[:2])

In [5]:
# RevLineCr = 0, 1
df.RevLineCr.replace(['N', '0', 'Y', 'T'],[0, 0, 1, 1], inplace=True)
df = df[(df.RevLineCr == 0) | (df.RevLineCr == 1)]
df.RevLineCr = pd.to_numeric(df.RevLineCr)

In [6]:
# Franchise Code = 0, 1
df.FranchiseCode[df.FranchiseCode <= 1] = 0
df.FranchiseCode[df.FranchiseCode > 1] = 1
df = df.rename(columns={"FranchiseCode":"HasFranchise"})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.FranchiseCode[df.FranchiseCode <= 1] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.FranchiseCode[df.FranchiseCode > 1] = 1


In [7]:
# New Exist = 0, 1 (Delet NewExist = 0.0)
df.NewExist = df.NewExist.astype(int)
df = df[(df.NewExist == 1) | (df.NewExist == 2)]
df.NewExist[df.NewExist == 2] = 1
df.NewExist[df.NewExist == 1] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.NewExist[df.NewExist == 1] = 0


In [8]:
df.MIS_Status.replace(['P I F', 'CHGOFF'],[0, 1], inplace=True)
df = df.rename(columns={"MIS_Status":"Default"})
df.Default = pd.to_numeric(df.Default)

In [9]:
df.ChgOffPrinGr = df.ChgOffPrinGr.apply(lambda x: x.strip('$'))
df.ChgOffPrinGr = df.ChgOffPrinGr.apply(lambda x : x.replace(',',''))
df.ChgOffPrinGr = pd.to_numeric(df.ChgOffPrinGr)
df.GrAppv = df.GrAppv.apply(lambda x: x.strip('$'))
df.GrAppv = df.GrAppv.apply(lambda x : x.replace(',',''))
df.GrAppv = pd.to_numeric(df.GrAppv)
df.SBA_Appv = df.SBA_Appv.apply(lambda x: x.strip('$'))
df.SBA_Appv = df.SBA_Appv.apply(lambda x : x.replace(',',''))
df.SBA_Appv = pd.to_numeric(df.SBA_Appv)
df.ApprovalFY[df.ApprovalFY == "1976A"] = df.ApprovalFY[df.ApprovalFY == "1976A"].apply(lambda x: x.strip('A'))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.ApprovalFY[df.ApprovalFY == "1976A"] = df.ApprovalFY[df.ApprovalFY == "1976A"].apply(lambda x: x.strip('A'))


In [10]:
df.dtypes

City                 object
Bank                 object
NAICS                 int64
ApprovalFY           object
Term                  int64
NewExist              int32
HasFranchise          int64
UrbanRural            int64
RevLineCr             int64
DisbursementDate     object
Default               int64
ChgOffPrinGr        float64
GrAppv              float64
SBA_Appv            float64
dtype: object

In [17]:
df = df.astype({"ChgOffPrinGr":'int', "GrAppv":'int', "SBA_Appv":'int', "ApprovalFY":'int'}) 
df.shape

(887035, 14)

In [18]:
# Create Features & Target variable
feature_cols = ['NAICS', 'ApprovalFY', 'Term', 'NewExist','HasFranchise','UrbanRural','RevLineCr','GrAppv','SBA_Appv']
X = df[feature_cols]
y = df.Default

In [19]:
# Split Train Data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [20]:
# Create Decision Tree Model
dt = DecisionTreeClassifier().fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9226979617604618


In [27]:
# Improve Model
from sklearn.model_selection import GridSearchCV
params = {
    'max_depth': [5, 10, 20, 50, 100],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}
grid_search = GridSearchCV(DecisionTreeClassifier(), 
                           param_grid=params, 
                           refit=True,
                           cv=5, n_jobs=-1, verbose=1, scoring = "accuracy").fit(X_train,y_train)

print(f'bt best hyperparams      : {grid_search.best_params_}')
print(f'bt best mean cv accuracy : {grid_search.best_score_:.5f}')

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [30]:
dt_imp = DecisionTreeClassifier(criterion = 'entropy', max_depth = 20, min_samples_leaf = 50).fit(X_train,y_train)
y_pred_imp = dt_imp.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_imp))

Accuracy: 0.943226911976912


In [114]:
from sklearn.tree import plot_tree
# fig,ax = plt.subplots(1,1,figsize=(24,12))
# plot_tree(dt_imp,ax=ax,fontsize=7,feature_names=X.columns,filled=True);