# Data Preprocessing Plan

In [1]:
import pandas as pd
import numpy as np
import joblib
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn import set_config

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [4]:
# Seed passed to the hold-out split and to both resamplers further down.
random_seed = 44
# Raw inputs: the full SBA file and the three hold-out splits written next to it.
data_raw = "../../data/raw/sba_national.csv"
data_train = "../../data/raw/sba_train.csv"
data_val = "../../data/raw/sba_val.csv"
data_test = "../../data/raw/sba_test.csv"
# Cleaned / resampled outputs written by the "Test the Pipeline" section.
data_clean = "../../data/clean/data_clean.csv"
clean_train = "../../data/clean/clean_train.csv"
train_subsam = "../../data/clean/train_subsam.csv"
train_smote = "../../data/clean/train_smote.csv"
# Where the fitted preprocessing pipeline is serialized with joblib.
prepro_pipe = "../../models/preprocessing.joblib"

# Columns removed by DropColumns as the first pipeline step.
drop_cols = ["LoanNr_ChkDgt","Name","City","Zip","Bank","ApprovalFY",
             "CreateJob","RetainedJob","FranchiseCode","ChgOffDate",
             "DisbursementDate","BalanceGross","ChgOffPrinGr"]
# Currency-formatted string columns that CurrencyToInt strips of "$" and "," and casts to int.
currency_cols = ["DisbursementGross","GrAppv","SBA_Appv"]
# Lookup used by CategorizeNAICS: key = first two characters of the NAICS code
# (as text), value = sector label. "0" is the bucket for codes stored as 0.
naics_sector = {"11":"Agriculture, forestry, fishing and hunting",
                "21":"Mining, quarrying, and oil and gas extraction",
                "22":"Utilities",
                "23":"Construction",
                "31":"Manufacturing",
                "32":"Manufacturing",
                "33":"Manufacturing",
                "42":"Wholesale trade",
                "44":"Retail trade",
                "45":"Retail trade",
                "48":"Transportation and warehousing",
                "49":"Transportation and warehousing",
                "51":"Information",
                "52":"Finance and insurance",
                "53":"Real estate and rental and leasing",
                "54":"Professional, scientific, and technical services",
                "55":"Management of companies and enterprises",
                "56":"Administrative and support and waste management and remediation services",
                "61":"Educational services",
                "62":"Health care and social assistance",
                "71":"Arts, entertainment, and recreation",
                "72":"Accommodation and food services",
                "81":"Other services (except public administration)",
                "92":"Public administration",
                "0":"[Unallocated sector]"}
# Raw target column; ConvertTarget recodes it and renames it to "Default".
target_col = "MIS_Status"
# Names for the columns CreateFeatures adds. The order is positional:
# [0] State != BankState flag, [1] Term >= 240 flag, [2] SBA_Appv / GrAppv in percent.
new_cols = ["DifState","Secured","SecuredSBA"]
# Old name -> new name mapping applied by RenameColumns as the last transform step.
rename_cols = {"NAICS":"Sector", 
               "RevLineCr":"RevLine",
               "DisbursementGross":"GrDisburs", 
               "GrAppv":"GrApprov",
               "SBA_Appv":"ApprovSBA"}

# Columns whose missing rows are dropped by DropNaNs (only when the column is present).
drop_nans = ["Default"]
# Columns filled with their most frequent value by ModeImputer.
mode_cols = ["State","BankState"]
# Columns whose missing values ClassImputer predicts with a random forest.
class_cols = ["NewExist", "RevLine", "LowDoc"]
# Column selection handed to ClassImputer: slot 0 ("_") is a placeholder for the
# column currently being imputed, the remaining entries are the predictor columns.
pred_cols = ["_", "AppYear","AppMonth","Term","NoEmp",
             "UrbanRural","GrDisburs","GrApprov","ApprovSBA"]

# Columns integer-coded by LabelTransformer.
cat_nom_cols = ["State","BankState","Sector"]
# Single column name (a string, not a list) passed to OneHotTransformer.
cat_num_cols = "UrbanRural"
# Single column name (a string, not a list) passed to OrdinalTransformer.
cat_ord_cols = "AppYear"
# Final column order enforced by SortColumns; names missing from the frame are skipped.
sort_cols = ["State","BankState","DifState","Sector","AppYear","AppMonth",
             "Term","NoEmp","Secured","NewExist","Urban","Rural","RevLine",
             "LowDoc","GrDisburs","GrApprov","ApprovSBA","SecuredSBA","Default"]

# Feature groupings by measurement type. Only `target` is referenced by the
# cells visible in this notebook; the others are presumably consumed elsewhere.
num_dis = ["Term","NoEmp","SecuredSBA"]
num_con = ["GrDisburs","GrApprov","ApprovSBA"]
cat_nom = ["State","BankState","Sector"]
cat_num = []
cat_ord = ["AppYear","AppMonth"]
binary = ["DifState","Secured","NewExist","Urban","Rural","RevLine","LowDoc"]
target = ["Default"]

## Import the Data

In [5]:
# Load the full raw dataset; low_memory=False makes pandas read the file in one
# pass instead of inferring dtypes chunk by chunk.
df = pd.read_csv(data_raw, low_memory=False)
print(df.shape)

(899164, 27)


## Hold-Out Method

In [4]:
# 70 % train; the remaining 30 % is halved into validation and test (15 % each).
# NOTE(review): neither call passes stratify=, so the MIS_Status class ratio in
# each split is left to chance — confirm this is intended.
df_train, df_temp = train_test_split(df, train_size=0.7, random_state=random_seed)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=random_seed)

In [6]:
# Persist the three raw splits; index=False drops the original row labels.
df_train.to_csv(data_train, index=False)
df_val.to_csv(data_val, index=False)
df_test.to_csv(data_test, index=False)

In [5]:
# Report (rows, columns) of each split as a quick sanity check on the proportions.
print(f"Train: {df_train.shape}")
print(f"Val:   {df_val.shape}")
print(f"Test:  {df_test.shape}")

Train: (629414, 27)
Val:   (134875, 27)
Test:  (134875, 27)


## Create the Pipeline

### Feature Transform

In [6]:
class DropColumns(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns.

    Parameters
    ----------
    variables : list of str
        Column labels to remove from the incoming frame.
    """

    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a new frame without the configured columns.

        ``DataFrame.drop`` hands back a new object, so the caller's frame is
        left as it was.
        """
        return X.drop(columns=self.variables)

In [7]:
class CurrencyToInt(BaseEstimator, TransformerMixin):
    """Convert currency-formatted text columns to integers.

    Parameters
    ----------
    variables : list of str
        String columns from which "$" and "," are stripped before casting.
    """

    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with each configured column as rounded ints."""
        out = X.copy()
        for column in self.variables:
            # Remove the dollar sign and thousands separators, keep the digits.
            digits_only = out[column].str.replace(r"[\$,]", "", regex=True)
            # Go through float first so values with a decimal part parse,
            # then round to the nearest whole unit.
            out[column] = digits_only.astype(float).round().astype(int)
        return out

In [8]:
class ApprovalDate(BaseEstimator, TransformerMixin):
    """Replace a date column by separate year and month columns.

    The new column names are built from the first three characters of
    ``variable`` plus ``"Year"`` / ``"Month"`` (e.g. "ApprovalDate" gives
    "AppYear" and "AppMonth").

    Parameters
    ----------
    variable : str
        Name of the column holding dash-separated dates whose third part is
        a two-digit year.
    """

    def __init__(self, variable):
        self.variable = variable

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def __format_date(self, app_date):
        """Expand the two-digit year of one date string to four digits.

        Years greater than 14 are placed in the 1900s, all others in the 2000s.
        """
        date = app_date.split("-")
        century = "19" if int(date[2]) > 14 else "20"
        year = century + date[2]
        return f"{date[0]}-{date[1]}-{year}"

    def transform(self, X, y=None):
        """Return a new frame with the date column swapped for year and month."""
        source = self.variable
        stem = source[0:3]

        parsed = pd.to_datetime(X[source].apply(self.__format_date))
        derived = {
            stem + "Year": parsed.dt.year,
            stem + "Month": parsed.dt.month,
        }
        # Drop the original column first so the derived ones are appended at the end.
        return X.drop(columns=source).assign(**derived)

In [9]:
class CategorizeNAICS(BaseEstimator, TransformerMixin):
    """Map an industry code column to its sector label.

    Parameters
    ----------
    variable : str
        Column holding the industry code.
    sector : dict
        Maps the first two characters of the code (as text) to a sector name.
    """

    def __init__(self, variable, sector):
        self.variable = variable
        self.sector = sector

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with codes replaced by sector names.

        Raises
        ------
        KeyError
            If a two-character prefix is not a key of ``sector``.
        """
        out = X.copy()
        column = self.variable

        prefixes = out[column].astype(str).str[0:2]
        # Plain dict indexing on purpose: an unknown prefix must fail loudly.
        out[column] = prefixes.map(self.sector.__getitem__)
        return out

In [10]:
class ConvertNewExist(BaseEstimator, TransformerMixin):
    """Recode a 0/1/2 coded column into a nullable 0/1 flag.

    Mapping applied to the original values: 0 -> missing, 1 -> 0, 2 -> 1.
    Any other value is passed through unchanged.

    Parameters
    ----------
    variable : str
        Name of the column to recode.
    """

    def __init__(self, variable):
        self.variable = variable

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the column recoded as ``Int64``."""
        out = X.copy()
        column = self.variable
        codes = out[column]

        # Every condition is evaluated against the original codes, so a value
        # produced by one replacement is never picked up by a later one.
        recoded = codes.mask(codes == 0.0).mask(codes == 1, 0).mask(codes == 2, 1)
        out[column] = recoded.astype("Int64")
        return out

In [11]:
class ConvertRevLineCr(BaseEstimator, TransformerMixin):
    """Recode a revolving-line-of-credit indicator into a nullable 0/1 flag.

    "Y", "T" and "1" become 1; "N" and "0" become 0; every other value
    (including missing ones and non-string values) becomes missing.

    Parameters
    ----------
    variable : str
        Name of the column to recode.
    """

    # Recognised text codes and the flag each one stands for.
    _CODES = {"Y": 1, "T": 1, "1": 1, "N": 0, "0": 0}

    def __init__(self, variable):
        self.variable = variable

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the column recoded as ``Int64``."""
        X = X.copy()
        var = self.variable

        # A dictionary lookup leaves unknown values as real NaN. The earlier
        # chain of np.where calls mixed strings with the column's own values;
        # on a non-object column (for instance a batch in which every value is
        # missing) NumPy turned NaN into the text "nan" and the Int64 cast failed.
        X[var] = X[var].map(self._CODES).astype("Int64")
        return X

In [12]:
class ConvertLowDoc(BaseEstimator, TransformerMixin):
    """Recode a Y/N indicator column into a nullable 0/1 flag.

    "Y" and "1" become 1; "N" and "0" become 0; every other value (including
    missing ones and non-string values) becomes missing.

    Parameters
    ----------
    variable : str
        Name of the column to recode.
    """

    # Recognised text codes and the flag each one stands for.
    _CODES = {"Y": 1, "1": 1, "N": 0, "0": 0}

    def __init__(self, variable):
        self.variable = variable

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the column recoded as ``Int64``."""
        X = X.copy()
        var = self.variable

        # A dictionary lookup leaves unknown values as real NaN. The earlier
        # chain of np.where calls mixed strings with the column's own values;
        # on a non-object column (for instance a batch in which every value is
        # missing) NumPy turned NaN into the text "nan" and the Int64 cast failed.
        X[var] = X[var].map(self._CODES).astype("Int64")
        return X

In [13]:
class ConvertTarget(BaseEstimator, TransformerMixin):
    """Recode the loan-status column to 0/1 and rename it to "Default".

    "CHGOFF" becomes 1 and "P I F" becomes 0. If the column is absent (for
    example on unlabeled data) the frame is returned as an unchanged copy.

    Parameters
    ----------
    variable : str
        Name of the raw target column.
    """

    def __init__(self, variable):
        self.variable = variable

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the target recoded and renamed."""
        out = X.copy()
        source = self.variable

        if source not in out.columns:
            return out

        status = out[source]
        # Values that match neither label are carried over as they are, so the
        # Int64 cast below still fails on anything unexpected.
        out[source] = np.where(status == "CHGOFF", 1,
                               np.where(status == "P I F", 0, status))
        out[source] = out[source].astype("Int64")
        return out.rename(columns={source: "Default"})

In [14]:
class CreateFeatures(BaseEstimator, TransformerMixin):
    """Add three derived columns to the frame.

    Parameters
    ----------
    variables : sequence of str
        Names for the new columns, read by position:
        ``[0]`` 1 when "State" differs from "BankState", else 0;
        ``[1]`` 1 when "Term" is at least 240, else 0;
        ``[2]`` "SBA_Appv" as a rounded integer percentage of "GrAppv".
    """

    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the three derived columns added."""
        out = X.copy()
        names = self.variables

        out[names[0]] = (out["State"] != out["BankState"]).astype(int)
        out[names[1]] = (out["Term"] >= 240).astype(int)
        guaranteed_pct = ((out["SBA_Appv"] / out["GrAppv"]) * 100).round()
        out[names[2]] = guaranteed_pct.astype(int)
        return out

In [15]:
class RenameColumns(BaseEstimator, TransformerMixin):
    """Stateless transformer that renames columns.

    Parameters
    ----------
    variables : dict
        Maps current column names to their new names.
    """

    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a renamed copy of ``X``; the caller's frame is untouched."""
        return X.rename(columns=self.variables)

In [16]:
# Stage 1: clean and recode the raw columns. The step order matters:
# - currency_int must precede create_features, which divides SBA_Appv by GrAppv
#   and therefore needs them numeric;
# - every step addresses columns by their raw names (NAICS, RevLineCr, SBA_Appv,
#   GrAppv, ...), so rename_columns has to stay last.
feature_transform = Pipeline([
    ("drop_columns", DropColumns(drop_cols)),
    ("currency_int", CurrencyToInt(currency_cols)),
    ("approval_date", ApprovalDate("ApprovalDate")),
    ("categorize_naics", CategorizeNAICS("NAICS", naics_sector)),
    ("convert_newexist", ConvertNewExist("NewExist")),
    ("convert_revlinecr", ConvertRevLineCr("RevLineCr")),
    ("convert_lowdoc", ConvertLowDoc("LowDoc")),
    ("convert_target", ConvertTarget(target_col)),
    ("create_features", CreateFeatures(new_cols)),
    ("rename_columns", RenameColumns(rename_cols))
    ])

In [17]:
# Try stage 1 on its own. Pipeline.fit returns the pipeline itself, so
# feature_transform_fit and feature_transform are the same object.
# NOTE(review): this fits on the full dataset `df`, not on `df_train`.
feature_transform_fit = feature_transform.fit(df)
df1 = feature_transform_fit.transform(df)

In [18]:
# Shape after stage 1, followed by three random rows as a spot check.
print(df1.shape)
df1.sample(3)

(899164, 18)


Unnamed: 0,State,BankState,Sector,Term,NoEmp,NewExist,UrbanRural,RevLine,LowDoc,GrDisburs,Default,GrApprov,ApprovSBA,AppYear,AppMonth,DifState,Secured,SecuredSBA
408369,ND,ND,Finance and insurance,60,5,0,1,0,0,30000,0,30000,30000,2010,3,0,0,100
624072,HI,HI,[Unallocated sector],77,1,0,0,0,0,100000,1,100000,90000,1994,2,0,0,90
587228,OR,OR,[Unallocated sector],48,7,0,0,0,0,75000,0,75000,67500,1993,7,0,0,90


### Missing Values

In [19]:
class ModeImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with each column's most frequent value.

    The modes are learned in ``fit`` and re-used by ``transform``, so data
    transformed later (validation, test, a single new record) is filled with
    the values seen at fit time. Previously the mode was recomputed inside
    ``transform`` from whatever batch was passed in, which made the result
    depend on that batch and failed when a column of the batch was entirely
    missing.

    Parameters
    ----------
    variables : list of str
        Columns to impute.

    Attributes
    ----------
    modes_ : dict
        Column name -> fill value, set by ``fit``. Columns that contained no
        observed value at fit time are absent and are left untouched.
    """

    def __init__(self, variables):
        self.variables = variables

    def _learn_modes(self, X):
        """Return {column: most frequent non-missing value} for ``X``."""
        modes = {}
        for var in self.variables:
            candidates = X[var].mode(dropna=True)
            if not candidates.empty:
                # Series.mode returns ties in sorted order; taking the first
                # keeps the "smallest of the most frequent" rule that
                # SimpleImputer(strategy="most_frequent") applies.
                modes[var] = candidates.iloc[0]
        return modes

    def fit(self, X, y=None):
        """Learn the fill value of every configured column from ``X``."""
        self.modes_ = self._learn_modes(X)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values filled."""
        X = X.copy()
        # Instances that were never fitted (or were unpickled from a version
        # without modes_) fall back to the modes of the batch itself.
        modes = getattr(self, "modes_", None)
        if modes is None:
            modes = self._learn_modes(X)
        for var in self.variables:
            if var in modes:
                X[var] = X[var].fillna(modes[var])
        return X

In [20]:
class ClassImputer(BaseEstimator, TransformerMixin):
    """Impute missing class labels with a random-forest prediction.

    For every configured column that has missing values, a
    ``RandomForestClassifier`` is trained on the rows where the column is
    known and used to predict the rows where it is missing.

    NOTE(review): the classifier is trained inside ``transform`` on the batch
    being transformed, so a batch in which a column has no observed value at
    all cannot be imputed.

    Parameters
    ----------
    variables : list of str
        Columns to impute.
    df_subset : list of str
        Column selection for the model. Element 0 is a placeholder for the
        column being imputed; elements 1.. are the predictor columns. The
        list is only read, never modified.
    random_state : int or None, default=0
        Seed for the classifier so repeated runs impute identically. Pass
        ``None`` for an unseeded forest.
    """

    def __init__(self, variables, df_subset, random_state=0):
        self.variables = variables
        self.df_subset = df_subset
        self.random_state = random_state

    def fit(self, X, y=None):
        """Nothing is learned here; see the class note on where training happens."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the configured columns imputed as ints."""
        X = X.copy()
        # Instances unpickled from a version without the attribute stay unseeded.
        seed = getattr(self, "random_state", None)

        for var in self.variables:
            missing = X[var].isnull()
            if missing.any():
                # Build the predictor list locally rather than writing the
                # column name into df_subset[0]: that list is shared with the
                # caller (a module-level constant in this notebook).
                predictors = [col for col in self.df_subset[1:] if col != var]

                known = X.loc[~missing]
                rf_classifier = RandomForestClassifier(random_state=seed)
                rf_classifier.fit(known[predictors], known[var])

                X.loc[missing, var] = rf_classifier.predict(X.loc[missing, predictors])

            # Cast on every path so the output dtype does not depend on
            # whether this particular batch happened to contain gaps.
            X[var] = X[var].astype(int)
        return X

In [21]:
class DropDuplicates(BaseEstimator, TransformerMixin):
    """Remove fully duplicated rows, but only from labeled data.

    Parameters
    ----------
    variable : sequence of str
        Its first element is the column whose presence switches the
        de-duplication on.
    """

    def __init__(self, variable):
        self.variable = variable

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` without duplicate rows when the marker column exists.

        When the marker column is absent the input frame is handed back as is.
        """
        marker = self.variable[0]
        if marker not in X.columns:
            return X
        return X.drop_duplicates()

In [22]:
# Stage 2: handle missing values. It runs on the output of stage 1, because
# class_cols / pred_cols use the renamed columns (e.g. "RevLine", "GrDisburs").
# drop_duplicates only acts when the "Default" column is present.
missing_values = Pipeline([
    ("mode_imputer", ModeImputer(mode_cols)),
    ("class_imputer", ClassImputer(class_cols, pred_cols)),
    ("drop_duplicates", DropDuplicates(target))
    ])

In [23]:
# Try stage 2 on the stage-1 output.
missing_values_fit = missing_values.fit(df1)
df2 = missing_values_fit.transform(df1)

In [24]:
# Shape and per-column null counts after stage 2. No step in stage 2 fills
# "Default"; rows where it is missing are dropped later by DropNaNs.
print(df2.shape)
print(df2.isnull().sum())

(896548, 18)
State            0
BankState        0
Sector           0
Term             0
NoEmp            0
NewExist         0
UrbanRural       0
RevLine          0
LowDoc           0
GrDisburs        0
Default       1939
GrApprov         0
ApprovSBA        0
AppYear          0
AppMonth         0
DifState         0
Secured          0
SecuredSBA       0
dtype: int64


### Encode Features

In [25]:
class LabelTransformer(BaseEstimator, TransformerMixin):
    """Integer-encode several categorical columns, one LabelEncoder per column.

    Parameters
    ----------
    variables : list of str
        Columns to encode.

    Attributes
    ----------
    encoders : dict
        Column name -> fitted ``LabelEncoder``; rebuilt on every ``fit``.
    """

    def __init__(self, variables):
        self.variables = variables
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit a fresh encoder for every configured column.

        The mapping is rebuilt from scratch on each call. Previously encoders
        were only ever added, so re-fitting after ``variables`` had changed
        kept encoders for columns that were no longer configured and
        ``transform`` still tried to encode them.
        """
        self.encoders = {var: LabelEncoder().fit(X[var]) for var in self.variables}
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with each fitted column replaced by its codes."""
        X = X.copy()
        for var, encoder in self.encoders.items():
            X[var] = encoder.transform(X[var])
        return X

In [26]:
class OneHotTransformer(BaseEstimator, TransformerMixin):
    """Split a 0/1/2 coded column into "Urban" and "Rural" indicator columns.

    Code 1 sets "Urban", code 2 sets "Rural"; any other code leaves both
    indicators at 0. The source column is removed.

    Parameters
    ----------
    variable : str
        Name of the coded source column.
    """

    def __init__(self, variable):
        self.variable = variable

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the source column replaced by two flags."""
        X = X.copy()
        # Read the column named by the constructor argument; it used to be
        # ignored in favour of a hard-coded "UrbanRural".
        source = self.variable

        # np.where with integer literals already yields an integer array, so
        # no further cast is required.
        X["Urban"] = np.where(X[source] == 1, 1, 0)
        X["Rural"] = np.where(X[source] == 2, 1, 0)
        return X.drop(columns=source)

In [27]:
class OrdinalTransformer(BaseEstimator, TransformerMixin):
    """Ordinal-encode a single column with scikit-learn's ``OrdinalEncoder``.

    Parameters
    ----------
    variable : str
        Name of the column to encode.
    """

    def __init__(self, variable):
        self.variable = variable
        self.encoder = OrdinalEncoder()

    def fit(self, X, y=None):
        """Fit the encoder on the configured column.

        With its default ``categories="auto"`` the encoder derives the
        category list from the data during ``fit`` and stores it in
        ``categories_``. Assigning ``categories_`` by hand beforehand had no
        effect (``fit`` overwrote it) and the ``sorted()`` call it needed
        could raise on values that cannot be ordered, so that assignment has
        been removed.
        """
        self.encoder.fit(X[[self.variable]])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the column replaced by integer codes."""
        X = X.copy()
        var = self.variable

        # The encoder returns floats; the codes are whole numbers.
        X[var] = self.encoder.transform(X[[var]])
        X[var] = X[var].astype(int)
        return X

In [28]:
class DropNaNs(BaseEstimator, TransformerMixin):
    """Drop rows with a missing value in selected columns and cast them to int.

    Parameters
    ----------
    variables : list of str
        Columns to check. Names that are not in the frame are skipped.
    """

    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the incomplete rows."""
        out = X.copy()

        for column in self.variables:
            if column not in out.columns:
                continue
            # The cast is only possible once the missing rows are gone.
            out = out.dropna(subset=[column]).astype({column: int})
        return out

In [29]:
class SortColumns(BaseEstimator, TransformerMixin):
    """Put the columns into a fixed order.

    Parameters
    ----------
    variables : list of str
        Desired column order. Names missing from the frame are skipped, and
        frame columns that are not listed are dropped from the result.
    """

    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return a new frame holding the listed columns in the listed order."""
        present = set(X.columns)
        ordered = [name for name in self.variables if name in present]
        return X[ordered].copy()

In [30]:
# Stage 3: encode categoricals, drop rows without a target, fix the column order.
# sort_columns runs last so that it sees the "Urban"/"Rural" columns created by
# the one-hot step.
encode_features = Pipeline([
    ("label_encoder", LabelTransformer(cat_nom_cols)),
    ("one-hot_encoder", OneHotTransformer(cat_num_cols)),
    ("ordinal_encoder", OrdinalTransformer(cat_ord_cols)),
    ("drop_nans", DropNaNs(drop_nans)),
    ("sort_columns", SortColumns(sort_cols))
    ])

In [31]:
# Try stage 3 on the stage-2 output.
encode_features_fit = encode_features.fit(df2)
df3 = encode_features_fit.transform(df2)

In [32]:
# Shape after stage 3, followed by three random rows as a spot check.
print(df3.shape)
df3.sample(3)

(894609, 19)


Unnamed: 0,State,BankState,DifState,Sector,AppYear,AppMonth,Term,NoEmp,Secured,NewExist,Urban,Rural,RevLine,LowDoc,GrDisburs,GrApprov,ApprovSBA,SecuredSBA,Default
123854,14,49,1,4,42,7,72,4,0,1,1,0,0,0,15000,15000,7500,50,1
546037,4,5,0,20,28,11,252,3,1,0,0,0,0,0,170000,170000,136799,80,0
623580,37,40,0,0,39,3,294,5,1,1,1,0,0,0,825000,825000,618750,75,0


### Preprocessing

In [33]:
# Full preprocessing pipeline: the three stages chained in order. The stage
# objects are the same instances defined above, not copies.
preprocessing = Pipeline([
    ("feature_transform", feature_transform),
    ("missing_values", missing_values),
    ("encode_features", encode_features)
    ])

In [34]:
# Fit the complete pipeline and run it end to end on the same data.
# NOTE(review): the fit uses the full dataset `df`, so the encoders also see
# the validation and test rows. Fitting on `df_train` would avoid that, but
# LabelEncoder / OrdinalEncoder would then raise on categories that occur only
# outside the training split — confirm which behaviour is wanted.
preprocessing_fit = preprocessing.fit(df)
df_ = preprocessing_fit.transform(df)

In [35]:
# Column dtypes and non-null counts of the fully preprocessed frame.
df_.info()

<class 'pandas.core.frame.DataFrame'>
Index: 894610 entries, 0 to 899163
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   State       894610 non-null  int64
 1   BankState   894610 non-null  int64
 2   DifState    894610 non-null  int64
 3   Sector      894610 non-null  int64
 4   AppYear     894610 non-null  int64
 5   AppMonth    894610 non-null  int32
 6   Term        894610 non-null  int64
 7   NoEmp       894610 non-null  int64
 8   Secured     894610 non-null  int64
 9   NewExist    894610 non-null  int64
 10  Urban       894610 non-null  int64
 11  Rural       894610 non-null  int64
 12  RevLine     894610 non-null  int64
 13  LowDoc      894610 non-null  int64
 14  GrDisburs   894610 non-null  int64
 15  GrApprov    894610 non-null  int64
 16  ApprovSBA   894610 non-null  int64
 17  SecuredSBA  894610 non-null  int64
 18  Default     894610 non-null  int64
dtypes: int32(1), int64(18)
memory usage: 133.1 MB


In [36]:
# Three random rows of the fully preprocessed frame.
df_.sample(3)

Unnamed: 0,State,BankState,DifState,Sector,AppYear,AppMonth,Term,NoEmp,Secured,NewExist,Urban,Rural,RevLine,LowDoc,GrDisburs,GrApprov,ApprovSBA,SecuredSBA,Default
658429,4,5,0,13,39,8,120,6,0,0,1,0,0,0,331000,331000,248250,75,0
43838,34,37,0,7,41,6,84,10,0,0,1,0,0,0,50000,50000,25000,50,0
209367,6,7,0,20,34,11,294,5,1,0,0,0,0,0,50000,50000,40000,80,0


In [37]:
# Simulate unlabeled input by removing the target column; the target-specific
# steps (ConvertTarget, DropDuplicates, DropNaNs) check for the column and skip.
test = df.drop(columns=["MIS_Status"])

# NOTE(review): this rebinds `df_test`, which earlier held the hold-out test
# split — consider a distinct name.
df_test = preprocessing_fit.transform(test)

In [38]:
# Column dtypes and non-null counts of the preprocessed unlabeled frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 896534 entries, 0 to 899163
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   State       896534 non-null  int64
 1   BankState   896534 non-null  int64
 2   DifState    896534 non-null  int64
 3   Sector      896534 non-null  int64
 4   AppYear     896534 non-null  int64
 5   AppMonth    896534 non-null  int32
 6   Term        896534 non-null  int64
 7   NoEmp       896534 non-null  int64
 8   Secured     896534 non-null  int64
 9   NewExist    896534 non-null  int64
 10  Urban       896534 non-null  int64
 11  Rural       896534 non-null  int64
 12  RevLine     896534 non-null  int64
 13  LowDoc      896534 non-null  int64
 14  GrDisburs   896534 non-null  int64
 15  GrApprov    896534 non-null  int64
 16  ApprovSBA   896534 non-null  int64
 17  SecuredSBA  896534 non-null  int64
dtypes: int32(1), int64(17)
memory usage: 126.5 MB


In [39]:
# Three random rows of the preprocessed unlabeled frame.
df_test.sample(3)

Unnamed: 0,State,BankState,DifState,Sector,AppYear,AppMonth,Term,NoEmp,Secured,NewExist,Urban,Rural,RevLine,LowDoc,GrDisburs,GrApprov,ApprovSBA,SecuredSBA
541781,24,30,1,19,28,11,6,230,0,0,0,0,0,0,97910,97910,83224,85
554901,13,16,0,1,38,6,240,17,1,0,1,0,0,0,227000,227000,227000,100
742708,26,29,0,1,40,8,183,5,0,0,0,1,0,0,785000,785000,588750,75


## Display the Pipeline

In [40]:
# Render estimators as an HTML diagram instead of their text repr, then show
# the nested pipeline.
set_config(display="diagram")
display(preprocessing)

## Save the Pipeline

In [41]:
# Persist the fitted pipeline, then load it back to confirm the file can be read.
joblib.dump(preprocessing_fit, prepro_pipe)

# The load is the operation that can fail, so it belongs inside the try block
# (the block used to wrap only a bare variable reference, which cannot raise).
try:
    prepro_pipeline = joblib.load(prepro_pipe)
except Exception as e:
    print("Error:", str(e))
    # Re-raise: later cells depend on prepro_pipeline and must not run without it.
    raise
else:
    dt_now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"{dt_now}: Pipeline is loaded...")

2023-08-31 01:15:13: Pipeline is loaded...


## Test the Pipeline

### Original & Train Datasets

**sba_national.csv -> data_clean.csv**

In [42]:
# Re-read the raw file and run it through the pipeline loaded from disk.
sba_raw = pd.read_csv(data_raw, low_memory=False)
sba_clean = prepro_pipeline.transform(sba_raw)

In [43]:
# Write the cleaned full dataset, then summarise and spot-check it.
sba_clean.to_csv(data_clean, index=False)
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
sba_clean.info()
sba_clean.sample(3)

<class 'pandas.core.frame.DataFrame'>
Index: 894610 entries, 0 to 899163
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   State       894610 non-null  int64
 1   BankState   894610 non-null  int64
 2   DifState    894610 non-null  int64
 3   Sector      894610 non-null  int64
 4   AppYear     894610 non-null  int64
 5   AppMonth    894610 non-null  int32
 6   Term        894610 non-null  int64
 7   NoEmp       894610 non-null  int64
 8   Secured     894610 non-null  int64
 9   NewExist    894610 non-null  int64
 10  Urban       894610 non-null  int64
 11  Rural       894610 non-null  int64
 12  RevLine     894610 non-null  int64
 13  LowDoc      894610 non-null  int64
 14  GrDisburs   894610 non-null  int64
 15  GrApprov    894610 non-null  int64
 16  ApprovSBA   894610 non-null  int64
 17  SecuredSBA  894610 non-null  int64
 18  Default     894610 non-null  int64
dtypes: int32(1), int64(18)
memory usage: 133.1 MB
None

Unnamed: 0,State,BankState,DifState,Sector,AppYear,AppMonth,Term,NoEmp,Secured,NewExist,Urban,Rural,RevLine,LowDoc,GrDisburs,GrApprov,ApprovSBA,SecuredSBA,Default
103607,19,43,1,17,39,4,31,2,0,1,1,0,1,0,5000,5000,2500,50,1
310392,38,41,0,7,41,1,12,1,0,1,1,0,1,0,200000,200000,100000,50,0
584443,43,17,1,20,26,7,84,6,0,0,0,0,0,0,125000,125000,106250,85,0


**sba_train.csv -> clean_train.csv**

In [44]:
# Read the saved training split and run it through the pipeline loaded from disk.
sba_train = pd.read_csv(data_train, low_memory=False)
sba_clean_train = prepro_pipeline.transform(sba_train)

In [45]:
# Write the cleaned training split, then summarise and spot-check it.
sba_clean_train.to_csv(clean_train, index=False)
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
sba_clean_train.info()
sba_clean_train.sample(3)

<class 'pandas.core.frame.DataFrame'>
Index: 626670 entries, 0 to 629413
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   State       626670 non-null  int64
 1   BankState   626670 non-null  int64
 2   DifState    626670 non-null  int64
 3   Sector      626670 non-null  int64
 4   AppYear     626670 non-null  int64
 5   AppMonth    626670 non-null  int32
 6   Term        626670 non-null  int64
 7   NoEmp       626670 non-null  int64
 8   Secured     626670 non-null  int64
 9   NewExist    626670 non-null  int64
 10  Urban       626670 non-null  int64
 11  Rural       626670 non-null  int64
 12  RevLine     626670 non-null  int64
 13  LowDoc      626670 non-null  int64
 14  GrDisburs   626670 non-null  int64
 15  GrApprov    626670 non-null  int64
 16  ApprovSBA   626670 non-null  int64
 17  SecuredSBA  626670 non-null  int64
 18  Default     626670 non-null  int64
dtypes: int32(1), int64(18)
memory usage: 93.2 MB
None

Unnamed: 0,State,BankState,DifState,Sector,AppYear,AppMonth,Term,NoEmp,Secured,NewExist,Urban,Rural,RevLine,LowDoc,GrDisburs,GrApprov,ApprovSBA,SecuredSBA,Default
215345,36,40,1,12,39,2,17,1,0,1,1,0,0,0,20000,20000,17000,85,1
459166,46,51,0,7,29,4,85,2,0,0,0,0,0,1,30000,30000,24000,80,0
309690,4,5,0,16,35,9,240,110,1,0,1,0,0,0,742000,742000,742000,100,0


### Resampling Datasets

In [46]:
# Read the training split again (same path as above) and show the class counts
# of the raw target before any resampling.
sba_train = pd.read_csv(data_train, low_memory=False)
print(sba_train.shape)
print(sba_train["MIS_Status"].value_counts())

(629414, 27)
MIS_Status
P I F     518128
CHGOFF    109856
Name: count, dtype: int64


**SubSampling: sba_train.csv -> train_subsam.csv**

In [47]:
class SubSampling(BaseEstimator, TransformerMixin):
    """Balance the classes by randomly under-sampling with imbalanced-learn.

    Parameters
    ----------
    target : str
        Name of the label column inside the frame.
    rand_seed : int
        Seed handed to ``RandomUnderSampler``.
    """

    def __init__(self, target, rand_seed):
        self.target = target
        self.rand_seed = rand_seed

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return the resampled frame with the label column appended last."""
        labels = X[self.target]
        features = X.drop(columns=self.target)

        sampler = RandomUnderSampler(sampling_strategy="auto",
                                     random_state=self.rand_seed)
        features_res, labels_res = sampler.fit_resample(features, labels)

        balanced = pd.DataFrame(features_res)
        balanced[self.target] = labels_res
        return balanced

In [48]:
# Preprocessing stages followed by the under-sampler. The three stage objects
# are the same instances used by `preprocessing`, so fitting this pipeline
# re-fits them in place.
under_sampling = Pipeline([
    ("feature_transform", feature_transform),
    ("missing_values", missing_values),
    ("encode_features", encode_features),
    ("under_sampler", SubSampling(target[0], random_seed))
    ])

In [49]:
# Fit on the full dataset, then under-sample the training split only.
# NOTE(review): fitting runs every preprocessing step over `df` again,
# including the random-forest imputation — confirm the refit is needed.
under_sampling_fit = under_sampling.fit(df)
df_under = under_sampling_fit.transform(sba_train)

In [50]:
# Write the under-sampled training set and confirm the classes are balanced.
df_under.to_csv(train_subsam, index=False)
print(df_under.shape)
print(df_under["Default"].value_counts())
df_under.sample(3)

(219654, 19)
Default
0    109827
1    109827
Name: count, dtype: int64


Unnamed: 0,State,BankState,DifState,Sector,AppYear,AppMonth,Term,NoEmp,Secured,NewExist,Urban,Rural,RevLine,LowDoc,GrDisburs,GrApprov,ApprovSBA,SecuredSBA,Default
415602,13,16,0,0,37,7,84,12,0,1,1,0,0,1,140000,140000,119000,85,0
129023,12,15,0,20,27,10,96,5,0,0,0,0,0,1,91000,91000,81900,90,0
149932,4,47,1,20,23,3,240,25,1,1,0,0,0,0,500000,500000,425000,85,0


**SMOTE: sba_train.csv -> train_smote.csv**

In [51]:
class SmoteSampling(BaseEstimator, TransformerMixin):
    """Balance the classes by over-sampling with imbalanced-learn's SMOTE.

    Parameters
    ----------
    target : str
        Name of the label column inside the frame.
    rand_seed : int
        Seed handed to ``SMOTE``.
    """

    def __init__(self, target, rand_seed):
        self.target = target
        self.rand_seed = rand_seed

    def fit(self, X, y=None):
        """Nothing is learned; present only for the scikit-learn API."""
        return self

    def transform(self, X, y=None):
        """Return the resampled frame with the label column appended last."""
        labels = X[self.target]
        features = X.drop(columns=self.target)

        sampler = SMOTE(sampling_strategy="auto", random_state=self.rand_seed)
        features_res, labels_res = sampler.fit_resample(features, labels)

        oversampled = pd.DataFrame(features_res)
        oversampled[self.target] = labels_res
        return oversampled

In [52]:
# Load the saved preprocessing pipeline from disk again for the SMOTE variant.
prepro_pipeline = joblib.load(prepro_pipe)

In [53]:
# Loaded preprocessing pipeline followed by the SMOTE over-sampler.
smote_sampling = Pipeline([
    ("preprocessing", prepro_pipeline),
    ("smote_sampler", SmoteSampling(target[0], random_seed))
    ])

In [54]:
# Fit on the full dataset, then over-sample the training split only.
# NOTE(review): the fit re-fits the pipeline that was just loaded from disk —
# confirm the refit is needed.
smote_sampling_fit = smote_sampling.fit(df)
df_smote = smote_sampling_fit.transform(sba_train)

In [55]:
# Write the SMOTE-resampled training set and confirm the classes are balanced.
df_smote.to_csv(train_smote, index=False)
print(df_smote.shape)
print(df_smote["Default"].value_counts())
df_smote.sample(3)

(1033686, 19)
Default
0    516843
1    516843
Name: count, dtype: int64


Unnamed: 0,State,BankState,DifState,Sector,AppYear,AppMonth,Term,NoEmp,Secured,NewExist,Urban,Rural,RevLine,LowDoc,GrDisburs,GrApprov,ApprovSBA,SecuredSBA,Default
957784,27,5,1,8,39,1,61,3,0,1,1,0,0,0,10000,10000,8500,85,1
148873,34,37,0,4,33,11,11,75,0,0,1,0,0,0,375000,375000,281250,75,0
991332,42,46,0,16,40,8,33,7,0,0,1,0,0,0,500000,500000,375000,75,1
