In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt

In [2]:
# Load the cleaned dataset; the first CSV column is used as the index.
df = pd.read_csv('.csv/cleaned_data.csv', index_col=0)

# Work on a copy so `df` stays as the untouched, as-loaded frame.
xgb_data = df.copy()

# Show every column when a frame is displayed. The full option name is used
# because abbreviated names are resolved by pattern matching and can become
# ambiguous if pandas adds similarly named options.
pd.set_option('display.max_columns', None)

In [3]:
# Treat both positive and negative infinity as missing values so that no
# infinite value reaches the scaler or the model later on.
xgb_data = xgb_data.replace([np.inf, -np.inf], np.nan)

In [4]:
# Encode raw categorical / flag columns as integers for modelling.

# NewExist and UrbanRural: known codes are kept, every other value
# (including missing) becomes 0, and the result is always int64.
xgb_data['NewExist_Encoded'] = xgb_data['NewExist'].map({1.0: 1, 2.0: 2}).fillna(0).astype('int64')
xgb_data['UrbanRural_Encoded'] = xgb_data['UrbanRural'].map({1.0: 1, 2.0: 2, 0.0: 0}).fillna(0).astype('int64')

# Target: CHGOFF -> 0, PIF -> 1. Any other value is left as NaN.
# NOTE(review): assumes MIS_Status has no missing/other values in the cleaned data - confirm.
xgb_data['MIS_Status_Encoded'] = xgb_data['MIS_Status'].map({'CHGOFF': 0, 'PIF': 1})

# RevLineCr: N -> 1, Y -> 2, anything else (including missing) -> 0.
xgb_data['RevLineCr_Encoded'] = xgb_data['RevLineCr'].map({'N': 1, 'Y': 2}).fillna(0).astype('int64')

# LowDoc: N -> 0, Y -> 1. Unmapped values stay NaN.
xgb_data['LowDoc_Encoded'] = xgb_data['LowDoc'].map({'N': 0, 'Y': 1})

# These two columns are encoded under their own name. Reading the source
# values from the as-loaded `df` (rather than from `xgb_data`) keeps the cell
# re-runnable: mapping the already-encoded 0/1 values again would give NaN
# and make `astype('int64')` fail.
xgb_data['FranchiseCode_Encoded'] = df['FranchiseCode_Encoded'].map({'No': 0, 'Yes': 1}).astype('int64')
xgb_data['RealEstate_Backed'] = df['RealEstate_Backed'].map({'No': 0, 'Yes': 1}).astype('int64')

# Binary indicators: 1 when the count is strictly positive, else 0
# (missing values compare as False and therefore become 0).
xgb_data['CreateJob_Encoded'] = (xgb_data['CreateJob'] > 0).astype('int64')
xgb_data['RetainedJob_Encoded'] = (xgb_data['RetainedJob'] > 0).astype('int64')

# Mark the remaining string columns as pandas categoricals.
xgb_data['State'] = xgb_data['State'].astype('category')
xgb_data['StateRisk'] = xgb_data['StateRisk'].astype('category')
xgb_data['Region'] = xgb_data['Region'].astype('category')

In [5]:
# Disabled alternative to the encoding cell: this variant uses
# na_action='ignore' and does not fill unmapped values with 0, so missing
# values would stay NaN in the *_Encoded columns.
# xgb_data['NewExist_Encoded'] = xgb_data['NewExist'].map({1.0 : 1, 2.0: 2}, na_action='ignore')
# xgb_data['UrbanRural_Encoded'] = xgb_data['UrbanRural'].map({1.0 : 1, 2.0: 2, 0.0: 0}, na_action='ignore')
# xgb_data['MIS_Status_Encoded'] = xgb_data['MIS_Status'].map({'CHGOFF': 0, 'PIF': 1})
# xgb_data['RevLineCr_Encoded'] = xgb_data['RevLineCr'].map({'N': 1, 'Y': 2}, na_action='ignore')
# xgb_data['LowDoc_Encoded'] = xgb_data['LowDoc'].map({'N': 0, 'Y': 1})
# xgb_data['FranchiseCode_Encoded'] = xgb_data['FranchiseCode_Encoded'].map({'No': 0, 'Yes': 1}).astype('int64')
# xgb_data['RealEstate_Backed'] = xgb_data['RealEstate_Backed'].map({'No': 0, 'Yes': 1}).astype('int64')
# xgb_data['CreateJob_Encoded'] = xgb_data['CreateJob'].apply(lambda x: 1 if x > 0 else 0)
# xgb_data['RetainedJob_Encoded'] = xgb_data['RetainedJob'].apply(lambda x: 1 if x > 0 else 0)
# xgb_data['State'] = xgb_data['State'].astype('category')
# xgb_data['StateRisk'] = xgb_data['StateRisk'].astype('category')
# xgb_data['Region'] = xgb_data['Region'].astype('category')

In [5]:
# Fill missing EmployeeLoanRatio values with the row's GrAppv, round to the
# nearest integer and store the column as int64.
#
# The column is assigned with plain `[...]` instead of `.loc[:, ...]`:
# on recent pandas versions `.loc[:, col] = values` writes into the existing
# column in place, which keeps its old float dtype and silently discards the
# int64 cast. The vectorised fillna/round also avoids a row-by-row apply.
employee_loan_ratio = xgb_data['EmployeeLoanRatio'].fillna(xgb_data['GrAppv'])
xgb_data['EmployeeLoanRatio'] = employee_loan_ratio.round().astype('int64')

In [None]:
# Columns removed before modelling. The list includes the raw columns that
# the encoding cell already turned into *_Encoded columns (NewExist, LowDoc,
# UrbanRural, RevLineCr, MIS_Status) and also RetainedJob_Encoded itself.
columns_to_drop = [
    'LoanNr_ChkDgt', 'Name', 'City', 'Bank', 'BankState', 'TermDays',
    'ApprovalDate', 'ApprovalFY', 'Zip', 'DisbursementDate', 'DisbursementGross',
    'NewExist', 'RetainedJob', 'LowDoc', 'UrbanRural', 'RevLineCr', 'ChgOffDate',
    'BalanceGross', 'MIS_Status', 'ChgOffPrinGr', 'SBA_Appv', 'Industry',
    'RetainedJob_Encoded', 'ApprovalDateYear', 'ChgOffDateYear',
    'ApprovalDateMonth', 'DisbursementDateYear', 'LoanDateEnd',
]

# Dropped in place; raises KeyError if any listed column is absent
# (for example when this cell is run a second time).
xgb_data.drop(columns=columns_to_drop, inplace=True)

In [7]:
# One-hot encode State, Region and StateRisk as integer indicator columns
# and append them to the frame. The source columns are kept here.
categorical_cols = ['State', 'Region', 'StateRisk']
dummy_cols = pd.get_dummies(xgb_data[categorical_cols]).astype(int)
xgb_data = pd.concat([xgb_data, dummy_cols], axis=1)


In [8]:
# Inspect the column names after the one-hot indicator columns were appended.
xgb_data.columns

Index(['State', 'NAICS', 'Term', 'NoEmp', 'CreateJob', 'FranchiseCode',
       'GrAppv', 'NAICS_class_code', 'FranchiseCode_Encoded',
       'RealEstate_Backed', 'Region', 'Recession', 'EmployeeLoanRatio',
       'StateRisk', 'NewExist_Encoded', 'UrbanRural_Encoded',
       'MIS_Status_Encoded', 'RevLineCr_Encoded', 'LowDoc_Encoded',
       'CreateJob_Encoded', 'RetainedJob_Encoded', 'State_AK', 'State_AL',
       'State_AR', 'State_AZ', 'State_CA', 'State_CO', 'State_CT', 'State_DC',
       'State_DE', 'State_FL', 'State_GA', 'State_HI', 'State_IA', 'State_ID',
       'State_IL', 'State_IN', 'State_KS', 'State_KY', 'State_LA', 'State_MA',
       'State_MD', 'State_ME', 'State_MI', 'State_MN', 'State_MO', 'State_MS',
       'State_MT', 'State_NC', 'State_ND', 'State_NE', 'State_NH', 'State_NJ',
       'State_NM', 'State_NV', 'State_NY', 'State_OH', 'State_OK', 'State_OR',
       'State_PA', 'State_RI', 'State_SC', 'State_SD', 'State_TN', 'State_TX',
       'State_UT', 'State_VA', 'Stat

In [None]:
# Remove the original categorical columns now that their one-hot
# indicator columns are part of the frame.
xgb_data.drop(columns=['State', 'Region', 'StateRisk'], inplace=True)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns standardised to zero mean / unit variance.
numerical_features = [
    'GrAppv', 'CreateJob', 'Term', 'NoEmp', 'NAICS',
    'EmployeeLoanRatio', 'FranchiseCode',
]

# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed later in the notebook, so test rows contribute
# to the fitted mean/variance - confirm this is intended.
scaler = StandardScaler().fit(xgb_data[numerical_features])
scaled_numerics = scaler.transform(xgb_data[numerical_features])

# Overwrite the raw values with their standardised counterparts.
xgb_data[numerical_features] = scaled_numerics

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Columns held out of the polynomial expansion: the encoded target plus all
# one-hot indicator columns (State_*, Region_*, StateRisk_*).
state_codes = [
    'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI',
    'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN',
    'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH',
    'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA',
    'WI', 'WV', 'WY',
]
target_columns = (
    ['MIS_Status_Encoded']
    + [f'State_{code}' for code in state_codes]
    + ['Region_Eastern', 'Region_Northern', 'Region_Southern', 'Region_Western']
    + ['StateRisk_High', 'StateRisk_Low', 'StateRisk_Medium']
)

# Held-out columns (y) and the columns to expand (X), both re-indexed from 0
# so they line up positionally when concatenated below.
y = xgb_data[target_columns].reset_index(drop=True)
X = xgb_data.drop(columns=target_columns).reset_index(drop=True)

# Degree-2 expansion: bias term, original columns, squares and pairwise products.
polyfit = PolynomialFeatures(degree=2)
X_poly = polyfit.fit_transform(X)

# Wrap the expanded array in a DataFrame carrying the generated feature names.
X_poly_df = pd.DataFrame(X_poly, columns=polyfit.get_feature_names_out(X.columns))

# Expanded features followed by the held-out columns.
# NOTE(review): the train/test split cell rebinds X and y from xgb_data, so
# xgb_data_polyfit_df is not what gets split there - confirm whether the
# expanded frame is meant to be used.
xgb_data_polyfit_df = pd.concat([X_poly_df, y], axis=1)


In [10]:
from sklearn.model_selection import train_test_split

# Target is the encoded loan status; every other column is a feature.
target_name = 'MIS_Status_Encoded'
y = xgb_data[target_name]
X = xgb_data.drop(columns=[target_name])

# 80/20 split, stratified on the target so both parts keep the same class
# proportions; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
import xgboost as xgb

# Gradient-boosted tree classifier with fixed, hand-set hyperparameters.
# `random_state` is pinned because `subsample` and `colsample_bytree` sample
# rows/columns at random; without a seed the fitted model, the printed score
# and the saved parameters are not reproducible between runs.
# NOTE(review): device='cuda' requests GPU training - confirm a CUDA-enabled
# xgboost build and a GPU are available wherever this notebook is run.
model = xgb.XGBClassifier(
    booster='gbtree',
    enable_categorical=True,
    device='cuda',
    objective='binary:logistic',
    eval_metric='logloss',
    subsample=0.8,
    gamma=4,
    colsample_bytree=0.7,
    max_depth=25,
    reg_lambda=0.1,
    reg_alpha=10,
    n_estimators=800,
    learning_rate=0.27777,
    random_state=42,
)

model.fit(X_train, y_train)

# Score on the training data only; this is not a held-out estimate.
train_score = model.score(X_train, y_train)

print("Training Score:", round(train_score, 2))

# Store the model's hyperparameters as a one-row DataFrame.
best_xgb_params = model.get_params()
best_xgb_params_df = pd.DataFrame([best_xgb_params])

# Save to CSV in the current working directory.
best_xgb_params_df.to_csv("best_xgb_params.csv", index=False)

print("Best parameters saved to CSV successfully!")



Training Score: 0.95
Best parameters saved to CSV successfully!


In [15]:
from sklearn.metrics import classification_report

# Predict labels for the held-out test rows.
xgb_y_pred = model.predict(X_test)

# Build the text classification report comparing predictions with the
# true test labels, then print it.
xgb_classification_report = classification_report(y_true=y_test, y_pred=xgb_y_pred)
print(xgb_classification_report)

NameError: name 'model' is not defined

In [None]:
# Disabled: randomized hyperparameter search over XGBClassifier settings.
# from sklearn.model_selection import RandomizedSearchCV

# model = xgb.XGBClassifier(booster='gbtree', enable_categorical=True,
#                           device='cuda',
#                           objective='binary:logistic')

# # Define search space for hyperparameters
# param_dist = {
#     "n_estimators": np.arange(100, 501, 100),  # 100 to 500, step 100
#     "max_depth": np.arange(3, 16, 2),  # 3 to 15, step 2
#     "learning_rate": np.linspace(0.01, 0.3, 5),  # 5 values between 0.01 and 0.3
#     "gamma": np.linspace(0, 5, 4),  # 4 values between 0 and 5
#     "reg_lambda": np.logspace(-3, 2, 4),  # Regularization term λ
#     "reg_alpha": np.logspace(-3, 2, 4),  # Regularization term α
# }

# # Perform randomized search
# random_search = RandomizedSearchCV(
#     model, param_distributions=param_dist, 
#     n_iter=5,  # Number of random parameter combinations to try
#     scoring="accuracy", 
#     cv=4,  # 4-fold cross-validation
#     verbose=1, 
#     n_jobs=-1,  # Use all CPU cores
#     random_state=42
# )

# # Fit RandomizedSearchCV
# random_search.fit(X_train, y_train)

# # Get best parameters
# best_xgb_params = random_search.best_params_
# best_xgb_params_df = pd.DataFrame([best_xgb_params])

# # Save best parameters to CSV
# best_xgb_params_df.to_csv("best_xgb_params.csv", index=False)

# # Print results
# print("Best Parameters:", best_xgb_params)
# print("Best Score:", round(random_search.best_score_, 2))
# print("Best parameters saved to CSV successfully!")

In [None]:
# Disabled: evaluation of the best estimator from the randomized search.
# from sklearn.metrics import classification_report

# t_model = random_search.best_estimator_

# # Make predictions
# xgb_y_pred = t_model.predict(X_test)

# # Build the classification report
# xgb_classification_report = classification_report(y_test, xgb_y_pred)

In [None]:
# print(xgb_classification_report)