# Split Data Sampling

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict

## Load Data

In [20]:
data_clean = "data_clean.csv"
data_results = "data_results.csv"
sampling = [1, 0.9, 0.7, 0.5, 0.3]
splits = [0.9, 0.8, 0.7, 0.6, 0.5]
rand_split = 44

num_cols = ["Term","NoEmp","SecuredSBA","GrDisburs","GrApprov","ApprovSBA"]
target = "Default"

In [3]:
df = pd.read_csv(data_clean, low_memory=False)
df.shape

(894609, 19)

## Load Metrics

In [5]:
metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
    "AUC": roc_auc_score
}

## Load Models

In [6]:
models = [
    ("lr", LogisticRegression(C=1, penalty="l2", random_state=rand_split, max_iter=1000)),
    ("knn", KNeighborsClassifier(n_neighbors=10, weights="uniform", metric="manhattan")),
    ("dtc", DecisionTreeClassifier(criterion="entropy", max_depth=20, random_state=rand_split)),
    ("rfc", RandomForestClassifier(criterion="entropy", max_depth=20, random_state=rand_split)),
    ("xgb", XGBClassifier(learning_rate=0.1, max_depth=10, n_estimators=300, random_state=rand_split))
]

In [7]:
lr_cols = list(df.drop(columns=[target]).columns)
knn_cols = ["State","DifState","Sector","AppYear","Term","Secured","Urban","RevLine","LowDoc","SecuredSBA"]
dtc_cols = ["State","BankState","DifState","AppYear","Term","NoEmp","GrDisburs","ApprovSBA","SecuredSBA"]
rfc_cols = ["State","BankState","Sector","AppYear","Term","GrDisburs","GrApprov","ApprovSBA","SecuredSBA"]
xgb_cols = list(df.drop(columns=[target]).columns)

## Split Data

In [8]:
def ZScore(X_train, X_test):
    scaler = StandardScaler()

    X_train_std = X_train.copy()
    X_test_std = X_test.copy()

    X_train_std[num_cols] = scaler.fit_transform(X_train_std[num_cols])
    X_test_std[num_cols] = scaler.fit_transform(X_test_std[num_cols])

    return X_train_std, X_test_std

In [9]:
def MinMax(X_train, X_test):
    scaler = MinMaxScaler()

    X_train_nrm = X_train.copy()
    X_test_nrm = X_test.copy()

    X_train_nrm[num_cols] = scaler.fit_transform(X_train_nrm[num_cols])
    X_test_nrm[num_cols] = scaler.fit_transform(X_test_nrm[num_cols])

    return X_train_nrm, X_test_nrm

In [18]:
cols = ["Model","Sample","Split","Metric","Value"]
results = pd.DataFrame(columns=cols)

for samp in sampling:
    df_ = df.sample(frac=samp, random_state=rand_split)
    X = df_.drop(columns=[target])
    y = df_[target]

    for split in splits:
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            train_size=split,
                                                            random_state=rand_split)

        print(f"{samp} - {split}: ({X_train.shape[0]}, {X_test.shape[0]})...")

        X_train_std, X_test_std = ZScore(X_train, X_test)
        X_train_nrm, X_test_nrm = MinMax(X_train, X_test)

        y_lr = models[0][1].fit(X_train_std[lr_cols], y_train).predict(X_test_std[lr_cols])
        mod_met = []
        for metric_name, metric_func in metrics.items():
            mod_met.append(["LR", samp, split, metric_name, metric_func(y_test, y_lr)])
        results = pd.concat([results, pd.DataFrame(mod_met, columns=cols)], ignore_index=True)

        y_knn = models[1][1].fit(X_train_nrm[knn_cols], y_train).predict(X_test_nrm[knn_cols])
        mod_met = []
        for metric_name, metric_func in metrics.items():
            mod_met.append(["KNN", samp, split, metric_name, metric_func(y_test, y_knn)])
        results = pd.concat([results, pd.DataFrame(mod_met, columns=cols)], ignore_index=True)

        y_dtc = models[2][1].fit(X_train[dtc_cols], y_train).predict(X_test[dtc_cols])
        mod_met = []
        for metric_name, metric_func in metrics.items():
            mod_met.append(["DTC", samp, split, metric_name, metric_func(y_test, y_dtc)])
        results = pd.concat([results, pd.DataFrame(mod_met, columns=cols)], ignore_index=True)

        y_rfc = models[3][1].fit(X_train[rfc_cols], y_train).predict(X_test[rfc_cols])
        mod_met = []
        for metric_name, metric_func in metrics.items():
            mod_met.append(["RFC", samp, split, metric_name, metric_func(y_test, y_rfc)])
        results = pd.concat([results, pd.DataFrame(mod_met, columns=cols)], ignore_index=True)

        y_xgb = models[4][1].fit(X_train[xgb_cols], y_train).predict(X_test[xgb_cols])
        mod_met = []
        for metric_name, metric_func in metrics.items():
            mod_met.append(["XGB", samp, split, metric_name, metric_func(y_test, y_xgb)])
        results = pd.concat([results, pd.DataFrame(mod_met, columns=cols)], ignore_index=True)

1 - 0.9: (805148, 89461)...
1 - 0.8: (715687, 178922)...
1 - 0.7: (626226, 268383)...
1 - 0.6: (536765, 357844)...
1 - 0.5: (447304, 447305)...
0.9 - 0.9: (724633, 80515)...
0.9 - 0.8: (644118, 161030)...
0.9 - 0.7: (563603, 241545)...
0.9 - 0.6: (483088, 322060)...
0.9 - 0.5: (402574, 402574)...
0.7 - 0.9: (563603, 62623)...
0.7 - 0.8: (500980, 125246)...
0.7 - 0.7: (438358, 187868)...
0.7 - 0.6: (375735, 250491)...
0.7 - 0.5: (313113, 313113)...
0.5 - 0.9: (402573, 44731)...
0.5 - 0.8: (357843, 89461)...
0.5 - 0.7: (313112, 134192)...
0.5 - 0.6: (268382, 178922)...
0.5 - 0.5: (223652, 223652)...
0.3 - 0.9: (241544, 26839)...
0.3 - 0.8: (214706, 53677)...
0.3 - 0.7: (187868, 80515)...
0.3 - 0.6: (161029, 107354)...
0.3 - 0.5: (134191, 134192)...


In [21]:
results.to_csv(data_results, index=False)
# results = pd.read_csv(data_results, low_memory=False)
results

Unnamed: 0,Model,Sample,Split,Metric,Value
0,LR,1,0.9,Accuracy,0.853567
1,LR,1,0.9,Precision,0.698886
2,LR,1,0.9,Recall,0.308133
3,LR,1,0.9,F1-Score,0.427698
4,LR,1,0.9,AUC,0.639734
...,...,...,...,...,...
620,XGB,0.3,0.5,Accuracy,0.950049
621,XGB,0.3,0.5,Precision,0.879026
622,XGB,0.3,0.5,Recall,0.835777
623,XGB,0.3,0.5,F1-Score,0.856856


In [25]:
results.pivot_table(columns=["Model", "Sample", "Split"],
                    index=["Metric"],
                    values=["Value"],
                    aggfunc="mean") #.reset_index()

Unnamed: 0_level_0,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Model,DTC,DTC,DTC,DTC,DTC,DTC,DTC,DTC,DTC,DTC,...,XGB,XGB,XGB,XGB,XGB,XGB,XGB,XGB,XGB,XGB
Sample,0.3,0.3,0.3,0.3,0.3,0.5,0.5,0.5,0.5,0.5,...,0.9,0.9,0.9,0.9,0.9,1.0,1.0,1.0,1.0,1.0
Split,0.5,0.6,0.7,0.8,0.9,0.5,0.6,0.7,0.8,0.9,...,0.5,0.6,0.7,0.8,0.9,0.5,0.6,0.7,0.8,0.9
Metric,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
AUC,0.872218,0.873525,0.879841,0.881191,0.886037,0.878406,0.885364,0.885652,0.885702,0.884058,...,0.907375,0.910878,0.910579,0.91178,0.911905,0.90991,0.910221,0.910097,0.911688,0.913611
Accuracy,0.92706,0.928182,0.930858,0.930585,0.933008,0.930562,0.932591,0.934102,0.934944,0.933894,...,0.952416,0.953574,0.953739,0.954176,0.954108,0.953175,0.953382,0.953425,0.954053,0.954595
F1-Score,0.794205,0.797361,0.805138,0.805044,0.811134,0.80208,0.810012,0.812605,0.813962,0.811932,...,0.860738,0.864879,0.865208,0.866223,0.866813,0.863766,0.864541,0.864651,0.86673,0.869263
Precision,0.801723,0.806604,0.809816,0.805634,0.808925,0.806376,0.80776,0.814382,0.818199,0.816972,...,0.884769,0.885689,0.887519,0.886884,0.887904,0.885438,0.886615,0.887286,0.888287,0.889357
Recall,0.786827,0.788328,0.800515,0.804455,0.813356,0.79783,0.812277,0.810835,0.809769,0.806953,...,0.837979,0.845024,0.84399,0.846503,0.846701,0.84313,0.843541,0.843142,0.846195,0.850057


## Graphic Results