In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt

In [2]:
# Load the cleaned loan data; the first CSV column becomes the row index.
df = pd.read_csv('.csv/cleaned_data.csv', index_col=0)
# Separate copy of the loaded frame, independent of later edits to `df`.
model_data = df.copy()
# Display every column of wide frames. The full option name is spelled out:
# pandas resolves abbreviated keys by pattern matching, which can become
# ambiguous (and raise) if a similarly named option is ever added.
pd.set_option('display.max_columns', None)

In [3]:
# Independent working copy for the random-forest preprocessing; the cells
# below modify it while the loaded frame `df` stays untouched.
randomforest_data = df.copy(deep=True)

In [4]:
# Treat positive infinity as a missing value (negative infinity is left as is).
randomforest_data = randomforest_data.replace(to_replace=np.inf, value=np.nan)

In [5]:
# Encode the raw flag/category columns as small integers.

# NewExist: codes 1 and 2 are kept; every other value (including missing)
# becomes 0, handled the same way as UrbanRural and RevLineCr below instead of
# relying on a NaN dictionary key and leaving unmapped codes as NaN.
randomforest_data['NewExist_Encoded'] = randomforest_data['NewExist'].map({1.0: 1, 2.0: 2}).fillna(0).astype('int64')
randomforest_data['UrbanRural_Encoded'] = randomforest_data['UrbanRural'].map({1.0: 1, 2.0: 2, 0.0: 0}).fillna(0).astype('int64')

# Target: CHGOFF -> 0, PIF -> 1.
# NOTE(review): any other value (including a missing label) maps to NaN here,
# and the blanket fillna(0) after one-hot encoding would turn it into 0
# (the CHGOFF code) - confirm the cleaned data has no unlabeled rows.
randomforest_data['MIS_Status_Encoded'] = randomforest_data['MIS_Status'].map({'CHGOFF': 0, 'PIF': 1})

# RevLineCr: N -> 1, Y -> 2, anything else -> 0.
randomforest_data['RevLineCr_Encoded'] = randomforest_data['RevLineCr'].map({'N': 1, 'Y': 2}).fillna(0).astype('int64')
# LowDoc: N -> 0, Y -> 1; other values stay NaN at this point.
randomforest_data['LowDoc_Encoded'] = randomforest_data['LowDoc'].map({'N': 0, 'Y': 1})

# These two columns are overwritten with their own encoding, so this cell is
# not re-runnable: a second run maps the already-numeric values to NaN and the
# int64 cast raises. The cast also raises if a value is neither 'No' nor 'Yes'.
randomforest_data['FranchiseCode_Encoded'] = randomforest_data['FranchiseCode_Encoded'].map({'No': 0, 'Yes': 1}).astype('int64')
randomforest_data['RealEstate_Backed'] = randomforest_data['RealEstate_Backed'].map({'No': 0, 'Yes': 1}).astype('int64')

# 1 when the count is strictly positive, else 0 (missing counts compare False
# and therefore become 0). Vectorised instead of a per-element Python lambda.
randomforest_data['CreateJob_Encoded'] = randomforest_data['CreateJob'].gt(0).astype('int64')
randomforest_data['RetainedJob_Encoded'] = randomforest_data['RetainedJob'].gt(0).astype('int64')

In [6]:
# Fill missing EmployeeLoanRatio values with the row's GrAppv, then round to
# whole numbers and store as int64.
# Done column-wise rather than with a row-by-row DataFrame.apply: the values
# are the same (both round half to even) without one Python call per row.
# Plain column assignment is used so the int64 dtype actually replaces the
# column; `.loc[:, col] = ...` may write into the existing float column instead.
# Raises if a value is still missing or non-finite after the fill, as before.
employee_loan_ratio_filled = randomforest_data['EmployeeLoanRatio'].fillna(randomforest_data['GrAppv'])
randomforest_data['EmployeeLoanRatio'] = employee_loan_ratio_filled.round().astype('int64')

In [7]:
# Columns removed from the modelling frame. The list includes the raw columns
# that received `_Encoded` counterparts earlier (NewExist, RetainedJob, LowDoc,
# UrbanRural, RevLineCr, MIS_Status). A KeyError is raised if any listed
# column is absent.
columns_to_drop = [
    'LoanNr_ChkDgt', 'Name', 'City', 'Bank', 'BankState', 'TermDays',
    'ApprovalDate', 'ApprovalFY', 'Zip', 'DisbursementDate',
    'DisbursementGross', 'NewExist', 'RetainedJob', 'LowDoc', 'UrbanRural',
    'RevLineCr', 'ChgOffDate',
    'BalanceGross', 'MIS_Status', 'ChgOffPrinGr', 'SBA_Appv', 'Industry',
    'Recession',
    'ApprovalDateYear', 'ChgOffDateYear', 'ApprovalDateMonth',
    'DisbursementDateYear',
    'LoanDateEnd',
]
randomforest_data = randomforest_data.drop(columns=columns_to_drop)

#### One-Hot Encode categories

In [8]:
# Expand State, Region and StateRisk into one indicator column per category.
dummy_source_columns = ['State', 'Region', 'StateRisk']
one_hot_frame = pd.get_dummies(randomforest_data, columns=dummy_source_columns)
# Every remaining missing value in any column becomes 0, then the whole frame
# is cast to int64 (fractional values are truncated by the cast).
# NOTE(review): the fill also applies to MIS_Status_Encoded - confirm no
# target labels are missing at this point.
randomforest_data = one_hot_frame.fillna(0).astype('int64')

#### Random Forest

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Split the data into features and target
X_rf = randomforest_data.drop(columns=['MIS_Status_Encoded'])
y_rf = randomforest_data['MIS_Status_Encoded']

# Hold out 20% of the rows as a test set, stratified on the target so both
# splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X_rf, y_rf, test_size=0.2, random_state=42, stratify=y_rf)

# Columns that are standardised before reaching the forest.
numerical_features = ['GrAppv', 'CreateJob', 'NAICS', 'Term', 'NoEmp', 'EmployeeLoanRatio']
# Columns passed through unchanged. 'EmployeeLoanRatio' is intentionally NOT
# repeated here: listing it in both groups fed the same feature to the model
# twice (once scaled, once raw).
categorical_features = ['NAICS_class_code', 'FranchiseCode_Encoded', 'RealEstate_Backed', 'NewExist_Encoded',
                        'UrbanRural_Encoded', 'RevLineCr_Encoded',
                        'LowDoc_Encoded', 'CreateJob_Encoded', 'RetainedJob_Encoded',
                        'State_AK', 'State_AL', 'State_AR', 'State_AZ', 'State_CA', 'State_CO',
                        'State_CT', 'State_DC', 'State_DE', 'State_FL', 'State_GA', 'State_HI',
                        'State_IA', 'State_ID', 'State_IL', 'State_IN', 'State_KS', 'State_KY',
                        'State_LA', 'State_MA', 'State_MD', 'State_ME', 'State_MI', 'State_MN',
                        'State_MO', 'State_MS', 'State_MT', 'State_NC', 'State_ND', 'State_NE',
                        'State_NH', 'State_NJ', 'State_NM', 'State_NV', 'State_NY', 'State_OH',
                        'State_OK', 'State_OR', 'State_PA', 'State_RI', 'State_SC', 'State_SD',
                        'State_TN', 'State_TX', 'State_UT', 'State_VA', 'State_VT', 'State_WA',
                        'State_WI', 'State_WV', 'State_WY', 'Region_Eastern', 'Region_Northern',
                        'Region_Southern', 'Region_Western', 'StateRisk_High', 'StateRisk_Low',
                        'StateRisk_Medium']

# Guard against a column being routed through both transformers again.
duplicated_features = set(numerical_features) & set(categorical_features)
assert not duplicated_features, f"features listed in both groups: {sorted(duplicated_features)}"

# Scale the numerical columns and pass the encoded columns through as they
# are; columns named in neither list are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', 'passthrough', categorical_features)
    ])

# Build Pipeline: preprocessing followed by the random forest.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('random', RandomForestClassifier()),
])

# Search space for the 'random' (forest) step.
# - criterion: 'log_loss' is omitted because scikit-learn documents it as the
#   same Shannon-information criterion as 'entropy'; searching both only
#   repeats the same fit.
# - ccp_alpha: the previous value of 1.8 is larger than the impurity of any
#   node in a two-class gini/entropy tree, so pruning collapsed every tree to
#   its root and the forest predicted a constant. 0.0 disables pruning; the
#   small positive value tries light pruning. Tune further as needed.
params = {"random__random_state": [42],
          "random__n_jobs": [7],
          "random__criterion": ['gini', 'entropy'],
          "random__n_estimators": [900],
          "random__class_weight": ['balanced_subsample'],
          "random__ccp_alpha": [0.0, 0.0001],
}

# 5-fold cross-validated search. The space is smaller than the default
# n_iter=10, so every combination is evaluated; error_score='raise' makes a
# failing fit stop the search instead of being scored as NaN.
rf_random_search = RandomizedSearchCV(model_pipeline, param_distributions=params, cv=5, random_state=42, error_score='raise')
rf_random_search.fit(X_train, y_train)

# Best parameters and score
print("Best Params", rf_random_search.best_params_)
print("Best Score:", rf_random_search.best_score_)

