In [2]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt

In [3]:
# Load the cleaned loan data; the first CSV column becomes the row index.
df = pd.read_csv('.csv/cleaned_data.csv', index_col=0)

# Work on a copy so the frame read from disk (`df`) stays untouched.
xgb_data = df.copy()

# Show every column when a frame is displayed. The option name is written out
# in full: pandas also accepts partial names, but that shorthand can start
# failing if a similarly named option is added in a later release.
pd.set_option('display.max_columns', None)

In [4]:
# Treat infinities of either sign as missing values. Only +inf used to be
# replaced, so a -inf would have survived into the rounding and scaling cells.
xgb_data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [5]:
# --- Integer encodings of the raw flag / category columns -------------------
# NewExist, UrbanRural and RevLineCr share one convention: known codes keep a
# positive integer, anything missing or unrecognised becomes 0.
xgb_data['NewExist_Encoded'] = xgb_data['NewExist'].map({1.0: 1, 2.0: 2}).fillna(0).astype('int64')
xgb_data['UrbanRural_Encoded'] = xgb_data['UrbanRural'].map({1.0: 1, 2.0: 2, 0.0: 0}).fillna(0).astype('int64')
xgb_data['RevLineCr_Encoded'] = xgb_data['RevLineCr'].map({'N': 1, 'Y': 2}).fillna(0).astype('int64')

# Binary label and LowDoc flag; values outside the mapping are left as NaN.
xgb_data['MIS_Status_Encoded'] = xgb_data['MIS_Status'].map({'CHGOFF': 0, 'PIF': 1})
xgb_data['LowDoc_Encoded'] = xgb_data['LowDoc'].map({'N': 0, 'Y': 1})

# These two columns are overwritten under their own name. Reading the source
# values from the untouched `df` keeps the cell safe to run more than once;
# mapping the already-encoded 0/1 values again would produce NaN and make the
# int64 cast raise.
yes_no_codes = {'No': 0, 'Yes': 1}
for flag_column in ('FranchiseCode_Encoded', 'RealEstate_Backed'):
    xgb_data[flag_column] = df[flag_column].map(yes_no_codes).astype('int64')

# 1 when at least one job was created / retained, else 0 (NaN compares False,
# so missing counts become 0, as before).
xgb_data['CreateJob_Encoded'] = xgb_data['CreateJob'].gt(0).astype('int64')
xgb_data['RetainedJob_Encoded'] = xgb_data['RetainedJob'].gt(0).astype('int64')

# Geographic columns are stored with pandas' category dtype.
for category_column in ('State', 'StateRisk', 'Region'):
    xgb_data[category_column] = xgb_data[category_column].astype('category')

In [5]:
# xgb_data['NewExist_Encoded'] = xgb_data['NewExist'].map({1.0 : 1, 2.0: 2}, na_action='ignore')
# xgb_data['UrbanRural_Encoded'] = xgb_data['UrbanRural'].map({1.0 : 1, 2.0: 2, 0.0: 0}, na_action='ignore')
# xgb_data['MIS_Status_Encoded'] = xgb_data['MIS_Status'].map({'CHGOFF': 0, 'PIF': 1})
# xgb_data['RevLineCr_Encoded'] = xgb_data['RevLineCr'].map({'N': 1, 'Y': 2}, na_action='ignore')
# xgb_data['LowDoc_Encoded'] = xgb_data['LowDoc'].map({'N': 0, 'Y': 1})
# xgb_data['FranchiseCode_Encoded'] = xgb_data['FranchiseCode_Encoded'].map({'No': 0, 'Yes': 1}).astype('int64')
# xgb_data['RealEstate_Backed'] = xgb_data['RealEstate_Backed'].map({'No': 0, 'Yes': 1}).astype('int64')
# xgb_data['CreateJob_Encoded'] = xgb_data['CreateJob'].apply(lambda x: 1 if x > 0 else 0)
# xgb_data['RetainedJob_Encoded'] = xgb_data['RetainedJob'].apply(lambda x: 1 if x > 0 else 0)
# xgb_data['State'] = xgb_data['State'].astype('category')
# xgb_data['StateRisk'] = xgb_data['StateRisk'].astype('category')
# xgb_data['Region'] = xgb_data['Region'].astype('category')

In [6]:
# Fill missing EmployeeLoanRatio values with the row's GrAppv, then round to a
# whole number and store as int64.
# This is a vectorised replacement for the former row-by-row `apply`: both
# Python's round() and Series.round() round halves to even, so the values are
# the same. Assigning the column directly (rather than through
# `.loc[:, col] = ...`, which may write into the existing float column in
# place) lets the int64 dtype take effect.
xgb_data['EmployeeLoanRatio'] = (
    xgb_data['EmployeeLoanRatio']
    .fillna(xgb_data['GrAppv'])
    .round()
    .astype('int64')
)

In [7]:
# Columns removed from the modelling frame. Several of them (NewExist, LowDoc,
# UrbanRural, RevLineCr, RetainedJob, MIS_Status) already have an encoded
# counterpart created in an earlier cell.
columns_to_drop = [
    # identifiers and free-text descriptors
    'LoanNr_ChkDgt', 'Name', 'City', 'Bank', 'BankState', 'Zip', 'Industry',
    # dates and date-derived fields
    'TermDays', 'ApprovalDate', 'ApprovalFY', 'DisbursementDate', 'ChgOffDate',
    'ApprovalDateYear', 'ChgOffDateYear', 'ApprovalDateMonth',
    'DisbursementDateYear', 'LoanDateEnd',
    # amounts and balances
    'DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'SBA_Appv',
    # raw columns superseded by their *_Encoded versions
    'NewExist', 'RetainedJob', 'LowDoc', 'UrbanRural', 'RevLineCr', 'MIS_Status',
]

# In-place drop: running this cell a second time raises KeyError because the
# columns are already gone.
xgb_data.drop(columns=columns_to_drop, inplace=True)

In [8]:
# # Create dummies
# dummy_cols = pd.get_dummies(xgb_data[['Region', 'StateRisk']])  

# # Convert dummies to int
# dummy_cols = dummy_cols.astype(int)  

# # Merge back to original DataFrame
# xgb_data = pd.concat([xgb_data, dummy_cols], axis=1)  


In [9]:
# xgb_data.drop(['Region', 'StateRisk'], axis=1, inplace=True)


In [8]:
from sklearn.preprocessing import StandardScaler

# Columns that get standardised.
# NOTE(review): NAICS and FranchiseCode look like identifier codes rather than
# magnitudes - confirm they are meant to be scaled as numbers.
numerical_features = [
    'GrAppv',
    'CreateJob',
    'Term',
    'NoEmp',
    'NAICS',
    'EmployeeLoanRatio',
    'FranchiseCode',
]

# Columns regarded as categorical; the list is only declared in this cell.
categorical_features = [
    'FranchiseCode_Encoded',
    'RealEstate_Backed',
    'NAICS_class_code',
    'Recession',
    'NewExist_Encoded',
    'UrbanRural_Encoded',
    'RevLineCr_Encoded',
    'LowDoc_Encoded',
    'CreateJob_Encoded',
    'Region',
    'StateRisk',
    'State',
]

# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed later, so its mean/variance include test rows.
scaler = StandardScaler()
scaler.fit(xgb_data[numerical_features])
scaled_numerics = scaler.transform(xgb_data[numerical_features])

# Overwrite the original columns with their standardised values.
xgb_data[numerical_features] = scaled_numerics

In [9]:
from sklearn.preprocessing import PolynomialFeatures

# Columns kept out of the polynomial expansion: the label (MIS_Status_Encoded)
# plus the three columns that were cast to the category dtype earlier.
target_columns = ['MIS_Status_Encoded', 'State', 'Region', 'StateRisk']

# Both halves get a fresh positional index so that they line up row-for-row
# when they are concatenated again below.
y = xgb_data[target_columns].reset_index(drop=True)
X = xgb_data.drop(target_columns, axis=1).reset_index(drop=True)

# Degree-2 expansion of the remaining columns.
polyfit = PolynomialFeatures(degree=2)
X_poly = polyfit.fit_transform(X)

# Wrap the expanded array in a DataFrame labelled with the generated names.
expanded_names = polyfit.get_feature_names_out(X.columns)
X_poly_df = pd.DataFrame(data=X_poly, columns=expanded_names)

# Expanded features side by side with the held-out columns.
xgb_data_polyfit_df = pd.concat((X_poly_df, y), axis=1)


In [10]:
from sklearn.model_selection import train_test_split

# Label: the encoded loan status. Features: every other column of xgb_data.
y = xgb_data['MIS_Status_Encoded']
X = xgb_data.drop('MIS_Status_Encoded', axis=1)

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
   # "xgb__booster": ['dart', 'gbtree'],
    # "xgb__device" : ['cuda'],
    # 'xgb__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.8],  # Step size shrinkage
    #'xgb__max_depth': [3, 5, 7, 12],  # Maximum depth of trees
    # 'xgb__min_child_weight': [3, 5, 7],  # Minimum sum of weights required in a child
    # 'xgb__gamma': [0, 0.1, 0.2],  # Minimum loss reduction for further partitioning
    # 'xgb__subsample': [0.6, 0.8, 1.0],  # Fraction of samples per tree
    # 'xgb__reg_alpha': [0, 0.01, 0.1, 1],  # L1 regularization
    # 'xgb__scale_pos_weight': [2, 5, 10],  # Balance classes (useful for imbalanced datasets)
    # 'xgb__objective': ['binary:logistic'],  # Binary classification objective
    # 'xgb__eval_metric': ['logloss'],  # Evaluation metric
    # 'xgb__random_state': [42]  # For reproducibility

In [None]:
import xgboost as xgb

# Fixed XGBoost settings (set by hand in this cell, not produced by a search).
xgb_settings = dict(
    booster='gbtree',
    enable_categorical=True,
    device='cuda',
    objective='binary:logistic',
    eval_metric='logloss',
    subsample=0.8,
    gamma=4,
    colsample_bytree=0.7,
    max_depth=25,
    reg_lambda=0.1,
    reg_alpha=10,
    n_estimators=800,
    learning_rate=0.27777,
)

model = xgb.XGBClassifier(**xgb_settings)
model.fit(X_train, y_train)

# Score on the training data itself.
train_score = model.score(X_train, y_train)
print("Training Score:", round(train_score, 2))

# Dump the estimator's full parameter set as a one-row CSV.
best_xgb_params = model.get_params()
best_xgb_params_df = pd.DataFrame([best_xgb_params])
best_xgb_params_df.to_csv("best_xgb_params.csv", index=False)

print("Best parameters saved to CSV successfully!")



Training Score: 0.95
Best parameters saved to CSV successfully!


In [15]:
from sklearn.metrics import classification_report

# Predict labels for the hold-out rows with the fitted XGBoost model.
xgb_y_pred = model.predict(X_test)

# Text report comparing the predictions with the true hold-out labels.
xgb_classification_report = classification_report(y_true=y_test, y_pred=xgb_y_pred)
print(xgb_classification_report)

              precision    recall  f1-score   support

           0       0.85      0.81      0.83     31564
           1       0.96      0.97      0.96    147922

    accuracy                           0.94    179486
   macro avg       0.91      0.89      0.90    179486
weighted avg       0.94      0.94      0.94    179486



- LightGBM

In [15]:
import lightgbm as lgb

# Define model.
# Row/feature sampling is configured through the wrapper's own parameter names
# (subsample, subsample_freq, colsample_bytree). The previous version passed
# the LightGBM aliases bagging_fraction=0.8 / bagging_freq=1 /
# feature_fraction=0.8 next to a contradictory subsample=1.0; the values below
# are the ones those alias settings specified.
lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    n_estimators=800,
    learning_rate=0.2777,
    subsample=0.8,         # fraction of rows sampled (alias: bagging_fraction)
    subsample_freq=1,      # resample rows every iteration (alias: bagging_freq)
    colsample_bytree=0.8,  # fraction of features per tree (alias: feature_fraction)
    min_child_weight=0.1,
    min_split_gain=0.1,
    importance_type='gain',
    n_jobs=7,
)

lgb_model.fit(X_train, y_train)

# Score on the training data itself.
lgb_train_score = lgb_model.score(X_train, y_train)

print("Training Score:", round(lgb_train_score, 2))

# Store model parameters in a one-row DataFrame (these are the settings chosen
# above, not the outcome of a search).
best_lgb_params = lgb_model.get_params()
best_lgb_params_df = pd.DataFrame([best_lgb_params])

# Save to CSV
best_lgb_params_df.to_csv("best_lgb_params.csv", index=False)

print("Best parameters saved to CSV successfully!")

[LightGBM] [Info] Number of positive: 591687, number of negative: 126255
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031956 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1734
[LightGBM] [Info] Number of data points in the train set: 717942, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.824143 -> initscore=1.544674
[LightGBM] [Info] Start training from score 1.544674
Training Score: 0.96
Best parameters saved to CSV successfully!


In [16]:
from sklearn.metrics import classification_report

# Predict labels for the hold-out rows with the fitted LightGBM model.
lgb_y_pred = lgb_model.predict(X_test)

# Text report comparing the predictions with the true hold-out labels.
lgb_classification_report = classification_report(y_true=y_test, y_pred=lgb_y_pred)
print(lgb_classification_report)

              precision    recall  f1-score   support

           0       0.85      0.82      0.84     31564
           1       0.96      0.97      0.97    147922

    accuracy                           0.94    179486
   macro avg       0.91      0.90      0.90    179486
weighted avg       0.94      0.94      0.94    179486



- CatBoost

In [12]:
import catboost as cb

# CatBoost classifier with hand-set hyperparameters.
cb_model = cb.CatBoostClassifier(
    iterations=20,
    max_depth=10,
    learning_rate=0.2777,
    l2_leaf_reg=0.1,
)

# The categorical columns are passed to fit() by position: look up the integer
# location of every category-typed column of X_train.
category_column_names = X_train.select_dtypes(include=['category']).columns
categorical_features_indices = [
    X_train.columns.get_loc(column_name) for column_name in category_column_names
]
cb_model.fit(X_train, y_train, cat_features=categorical_features_indices)

# Score on the training data itself.
cb_train_score = cb_model.score(X_train, y_train)
print("Training Score:", round(cb_train_score, 2))

# Dump the parameters returned by get_params() as a one-row CSV.
best_cb_params = cb_model.get_params()
best_cb_params_df = pd.DataFrame([best_cb_params])
best_cb_params_df.to_csv("best_cb_params.csv", index=False)

print("Best parameters saved to CSV successfully!")

0:	learn: 0.4238619	total: 257ms	remaining: 4.88s
1:	learn: 0.3292202	total: 452ms	remaining: 4.07s
2:	learn: 0.2836503	total: 644ms	remaining: 3.65s
3:	learn: 0.2579911	total: 817ms	remaining: 3.27s
4:	learn: 0.2388670	total: 956ms	remaining: 2.87s
5:	learn: 0.2215477	total: 1.13s	remaining: 2.63s
6:	learn: 0.2150707	total: 1.3s	remaining: 2.42s
7:	learn: 0.2083435	total: 1.45s	remaining: 2.18s
8:	learn: 0.2047586	total: 1.63s	remaining: 1.99s
9:	learn: 0.2010778	total: 1.81s	remaining: 1.81s
10:	learn: 0.1956705	total: 2s	remaining: 1.63s
11:	learn: 0.1909566	total: 2.18s	remaining: 1.45s
12:	learn: 0.1880429	total: 2.35s	remaining: 1.27s
13:	learn: 0.1861644	total: 2.55s	remaining: 1.09s
14:	learn: 0.1842904	total: 2.73s	remaining: 911ms
15:	learn: 0.1813587	total: 2.88s	remaining: 720ms
16:	learn: 0.1800970	total: 3.07s	remaining: 541ms
17:	learn: 0.1776264	total: 3.23s	remaining: 359ms
18:	learn: 0.1758857	total: 3.4s	remaining: 179ms
19:	learn: 0.1750090	total: 3.58s	remaining: 0

In [13]:
from sklearn.metrics import classification_report

# Predict labels for the hold-out rows with the fitted CatBoost model.
cb_y_pred = cb_model.predict(X_test)

# Text report comparing the predictions with the true hold-out labels.
cb_classification_report = classification_report(y_true=y_test, y_pred=cb_y_pred)
print(cb_classification_report)

              precision    recall  f1-score   support

           0       0.85      0.77      0.81     31564
           1       0.95      0.97      0.96    147922

    accuracy                           0.93    179486
   macro avg       0.90      0.87      0.88    179486
weighted avg       0.93      0.93      0.93    179486



In [None]:
# from sklearn.model_selection import RandomizedSearchCV

# model = xgb.XGBClassifier(booster='gbtree', enable_categorical=True,
#                           device='cuda',
#                           objective='binary:logistic')

# # Define search space for hyperparameters
# param_dist = {
#     "n_estimators": np.arange(100, 501, 100),  # 100 to 500, step 100
#     "max_depth": np.arange(3, 16, 2),  # 3 to 15, step 2
#     "learning_rate": np.linspace(0.01, 0.3, 5),  # 5 values between 0.01 and 0.3
#     "gamma": np.linspace(0, 5, 4),  # 4 values between 0 and 5
#     "reg_lambda": np.logspace(-3, 2, 4),  # Regularization term λ
#     "reg_alpha": np.logspace(-3, 2, 4),  # Regularization term α
# }

# # Perform randomized search
# random_search = RandomizedSearchCV(
#     model, param_distributions=param_dist, 
#     n_iter=5,  # Number of random parameter combinations to try
#     scoring="accuracy", 
#     cv=4,  # 4-fold cross-validation
#     verbose=1, 
#     n_jobs=-1,  # Use all CPU cores
#     random_state=42
# )

# # Fit RandomizedSearchCV
# random_search.fit(X_train, y_train)

# # Get best parameters
# best_xgb_params = random_search.best_params_
# best_xgb_params_df = pd.DataFrame([best_xgb_params])

# # Save best parameters to CSV
# best_xgb_params_df.to_csv("best_xgb_params.csv", index=False)

# # Print results
# print("Best Parameters:", best_xgb_params)
# print("Best Score:", round(random_search.best_score_, 2))
# print("Best parameters saved to CSV successfully!")

In [None]:
# from sklearn.metrics import classification_report

# t_model = random_search.best_estimator_

# # Make predictions
# xgb_y_pred = t_model.predict(X_test)

# # Calculate the accuracy
# xgb_classification_report = classification_report(y_test, xgb_y_pred)

In [None]:
# print(xgb_classification_report)

In [None]:
# from sklearn.model_selection import cross_val_score

# # Evaluate model using cross-validation
# xgb_roc_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
# print(f'ROC-AUC: {xgb_roc_scores.mean()}')

In [None]:
# xgb_data['NewExist_Encoded'] = xgb_data['NewExist'].map({1.0 : 1, 2.0: 2, np.nan: 0})
# xgb_data['UrbanRural_Encoded'] = xgb_data['UrbanRural'].map({1.0 : 1, 2.0: 2, 0.0: 0}).fillna(0).astype('int64')
# xgb_data['MIS_Status_Encoded'] = xgb_data['MIS_Status'].map({'CHGOFF': 0, 'PIF': 1})
# xgb_data['RevLineCr_Encoded'] = xgb_data['RevLineCr'].map({'N': 1, 'Y': 2}).fillna(0).astype('int64')
# xgb_data['LowDoc_Encoded'] = xgb_data['LowDoc'].map({'N': 0, 'Y': 1})
# xgb_data['FranchiseCode_Encoded'] = xgb_data['FranchiseCode_Encoded'].map({'No': 0, 'Yes': 1}).astype('int64')
# xgb_data['RealEstate_Backed'] = xgb_data['RealEstate_Backed'].map({'No': 0, 'Yes': 1}).astype('int64')
# xgb_data['CreateJob_Encoded'] = xgb_data['CreateJob'].apply(lambda x: 1 if x > 0 else 0)
# xgb_data['RetainedJob_Encoded'] = xgb_data['RetainedJob'].apply(lambda x: 1 if x > 0 else 0)

#### One-Hot Encode categories

In [None]:
# df_multi = xgb_data[['NAICS', 'Term', 'NoEmp',
#        'CreateJob', 'RetainedJob', 'FranchiseCode', 'GrAppv',
#        'NAICS_class_code', 'FranchiseCode_Encoded', 'RealEstate_Backed',
#        'TermDays', 'Recession', 'EmployeeLoanRatio', 'NewExist_Encoded',
#        'UrbanRural_Encoded', 'MIS_Status_Encoded', 'RevLineCr_Encoded',
#        'LowDoc_Encoded', 'CreateJob_Encoded', 'RetainedJob_Encoded',
#        'Region_Northern', 'Region_Southern', 'Region_Western', 'StateRisk_Low',
#        'StateRisk_Medium', 'State_AL', 'State_AR', 'State_AZ', 'State_CA',
#        'State_CO', 'State_CT', 'State_DC', 'State_DE', 'State_FL', 'State_GA',
#        'State_HI', 'State_IA', 'State_ID', 'State_IL', 'State_IN', 'State_KS',
#        'State_KY', 'State_LA', 'State_MA', 'State_MD', 'State_ME', 'State_MI',
#        'State_MN', 'State_MO', 'State_MS', 'State_MT', 'State_NC', 'State_ND',
#        'State_NE', 'State_NH', 'State_NJ', 'State_NM', 'State_NV', 'State_NY',
#        'State_OH', 'State_OK', 'State_OR', 'State_PA', 'State_RI', 'State_SC',
#        'State_SD', 'State_TN', 'State_TX', 'State_UT', 'State_VA', 'State_VT',
#        'State_WA', 'State_WI', 'State_WV', 'State_WY']]