In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt

In [2]:
# Load the data
df = pd.read_csv('.csv/cleaned_data.csv', index_col = 0)
model_data = df.copy()
pd.set_option('display.max_column', None)

In [3]:
model_data.replace(np.inf, np.nan, inplace=True)

In [4]:
# Convert categorical columns to string first, then to category
model_data['NewExist_Encoded'] = model_data['NewExist'].map({1.0: 1, 2.0: 0}, na_action='ignore') \
                                                      .fillna('missing') \
                                                      .astype('str') \
                                                      .astype('category')

model_data['UrbanRural_Encoded'] = model_data['UrbanRural'].map({1.0: 1, 2.0: 0}, na_action='ignore') \
                                                          .fillna('missing') \
                                                          .astype('str') \
                                                          .astype('category')

model_data['MIS_Status_Encoded'] = model_data['MIS_Status'].map({'CHGOFF': 0, 'PIF': 1}) \
                                                          .fillna('missing') \
                                                          .astype('str') \
                                                          .astype('category')

model_data['RevLineCr_Encoded'] = model_data['RevLineCr'].map({'N': 0, 'Y': 1}, na_action='ignore') \
                                                      .fillna('missing') \
                                                      .astype('str') \
                                                      .astype('category')

model_data['LowDoc_Encoded'] = model_data['LowDoc'].map({'N': 0, 'Y': 1}) \
                                                  .fillna('missing') \
                                                  .astype('str') \
                                                  .astype('category')

model_data['FranchiseCode_Encoded'] = model_data['FranchiseCode_Encoded'].map({'No': 0, 'Yes': 1}) \
                                                                    .fillna('missing') \
                                                                    .astype('str') \
                                                                    .astype('category')

model_data['RealEstate_Backed'] = model_data['RealEstate_Backed'].map({'No': 0, 'Yes': 1}) \
                                                              .fillna('missing') \
                                                              .astype('str') \
                                                              .astype('category')

model_data['CreateJob_Encoded'] = model_data['CreateJob'].apply(lambda x: 1 if x > 0 else 0) \
                                                      .astype('str') \
                                                      .astype('category')

model_data['RetainedJob_Encoded'] = model_data['RetainedJob'].apply(lambda x: 1 if x > 0 else 0) \
                                                        .astype('str') \
                                                        .astype('category')

model_data['State'] = model_data['State'].fillna('missing').astype('str').astype('category')

model_data['StateRisk'] = model_data['StateRisk'].fillna('missing').astype('str').astype('category')

model_data['Region'] = model_data['Region'].fillna('missing').astype('str').astype('category')

model_data['NAICS_class_code'] = model_data['NAICS_class_code'].fillna('missing').astype('str').astype('category')


In [5]:
model_data.loc[:, 'EmployeeLoanRatio'] = model_data.apply(
    lambda row: round(row['GrAppv']) if pd.isna(row['EmployeeLoanRatio']) else round(row['EmployeeLoanRatio']),
    axis=1
).astype('int64')

In [6]:
model_data.drop(labels=['LoanNr_ChkDgt', 'Name', 'City', 'Bank', 'BankState', 'TermDays', 'ApprovalDate', 'ApprovalFY', 'Zip', 'DisbursementDate', 
                      'DisbursementGross','NewExist', 'RetainedJob', 'LowDoc' ,'UrbanRural', 'RevLineCr', 'ChgOffDate',
                      'BalanceGross', 'MIS_Status', 'ChgOffPrinGr', 'SBA_Appv', 'Industry', 'Recession',
                       'ApprovalDateYear', 'ChgOffDateYear', 'ApprovalDateMonth', 'DisbursementDateYear',
                       'LoanDateEnd'], axis=1, inplace=True )

In [7]:
model_data.head()

Unnamed: 0,State,NAICS,Term,NoEmp,CreateJob,FranchiseCode,GrAppv,NAICS_class_code,FranchiseCode_Encoded,RealEstate_Backed,Region,EmployeeLoanRatio,StateRisk,NewExist_Encoded,UrbanRural_Encoded,MIS_Status_Encoded,RevLineCr_Encoded,LowDoc_Encoded,CreateJob_Encoded,RetainedJob_Encoded
0,IN,451120,84,4,0,1,60000,45,0,0,Eastern,15000.0,Medium,0.0,missing,1,0.0,1,0,0
1,IN,722410,60,2,0,1,40000,72,0,0,Eastern,20000.0,Medium,0.0,missing,1,0.0,1,0,0
2,IN,621210,180,7,0,1,287000,62,0,0,Eastern,41000.0,Medium,1.0,missing,1,0.0,0,0,0
3,OK,0,60,2,0,1,35000,81,0,0,Eastern,17500.0,Low,1.0,missing,1,0.0,1,0,0
4,FL,0,240,14,7,1,229000,81,0,1,Eastern,16357.0,High,1.0,missing,1,0.0,0,1,1


In [13]:
model_data.dtypes

State                    category
NAICS                       int64
Term                        int64
NoEmp                       int64
CreateJob                   int64
FranchiseCode               int64
GrAppv                      int64
NAICS_class_code         category
FranchiseCode_Encoded    category
RealEstate_Backed        category
Region                   category
EmployeeLoanRatio         float64
StateRisk                category
NewExist_Encoded         category
UrbanRural_Encoded       category
MIS_Status_Encoded       category
RevLineCr_Encoded        category
LowDoc_Encoded           category
CreateJob_Encoded        category
RetainedJob_Encoded      category
dtype: object

In [8]:
from sklearn.model_selection import train_test_split

# Split the data into features and target
X = model_data.drop(columns=['MIS_Status_Encoded'])
y = model_data['MIS_Status_Encoded']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [10]:
import catboost as cb

# Define model
cb_model = cb.CatBoostClassifier(#task_type="GPU",
                                random_state=42,
                                scale_pos_weight= 1.0,
                                random_strength= 10.0,
                                learning_rate= 0.15,
                                l2_leaf_reg= 5.0,
                                iterations= 800,
                                depth= 10,
                                border_count= 128,
                                bagging_temperature= 0.0,
                                verbose=100,  # Log every 100 iterations
                            )

categorical_features_indices = [X_train.columns.get_loc(col) for col in X_train.select_dtypes(include=['category']).columns]

cb_model.fit(X_train, y_train, cat_features=categorical_features_indices)

cb_train_score = cb_model.score(X_train, y_train)

print("Training Score:", round(cb_train_score, 2))

# Store model parameters in a DataFrame
best_cb_params = cb_model.get_params()  # Get model's hyperparameters
best_cb_params_df = pd.DataFrame([best_cb_params])

# Save to CSV
best_cb_params_df.to_csv("best_cb_params.csv", index=False)

print("Best parameters saved to CSV successfully!")

0:	learn: 0.5519333	total: 961ms	remaining: 12m 47s
100:	learn: 0.1812159	total: 57.9s	remaining: 6m 40s
200:	learn: 0.1490036	total: 2m 7s	remaining: 6m 20s
300:	learn: 0.1400814	total: 3m 17s	remaining: 5m 28s
400:	learn: 0.1341215	total: 4m 29s	remaining: 4m 28s
500:	learn: 0.1294995	total: 5m 40s	remaining: 3m 23s
600:	learn: 0.1256027	total: 6m 55s	remaining: 2m 17s
700:	learn: 0.1220517	total: 8m 12s	remaining: 1m 9s
799:	learn: 0.1187871	total: 9m 30s	remaining: 0us
Training Score: 0.95
Best parameters saved to CSV successfully!


In [11]:
from sklearn.metrics import classification_report

# Make predictions
cb_y_pred = cb_model.predict(X_test)

# Calculate the accuracy
cb_classification_report = classification_report(y_test, cb_y_pred)

print(cb_classification_report)

              precision    recall  f1-score   support

           0       0.86      0.82      0.84     31564
           1       0.96      0.97      0.97    147922

    accuracy                           0.94    179486
   macro avg       0.91      0.90      0.90    179486
weighted avg       0.94      0.94      0.94    179486



In [19]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


cat_boost_params = {
    'scale_pos_weight': 1.0,
    'random_strength': 10.0,
    'learning_rate': 0.15,
    'l2_leaf_reg': 5.0,
    'iterations': 800,
    'depth': 10,
    'border_count': 128,
    'bagging_temperature': 0.0
}

cat_boost_best = CatBoostClassifier(
    random_state=42,
    verbose=100,  # Log every 100 iterations
    **cat_boost_params  # Apply your optimal parameters
)

# Detect categorical features
categorical_features = X.select_dtypes(include=['category']).columns.tolist()

# Get the indices of these columns
categorical_features_indices = [X.columns.get_loc(col) for col in categorical_features]

# Create data pools
train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
test_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)

cat_boost_best.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50)

0:	learn: 0.5519333	test: 0.5517333	best: 0.5517333 (0)	total: 1.74s	remaining: 23m 10s
100:	learn: 0.1812159	test: 0.1829520	best: 0.1829520 (100)	total: 1m 20s	remaining: 9m 14s
200:	learn: 0.1490036	test: 0.1542117	best: 0.1542117 (200)	total: 2m 57s	remaining: 8m 47s
300:	learn: 0.1400814	test: 0.1499902	best: 0.1499902 (300)	total: 4m 25s	remaining: 7m 20s
400:	learn: 0.1341215	test: 0.1482708	best: 0.1482708 (400)	total: 6m 3s	remaining: 6m 1s
500:	learn: 0.1294995	test: 0.1474230	best: 0.1474230 (500)	total: 7m 28s	remaining: 4m 27s
600:	learn: 0.1256027	test: 0.1469276	best: 0.1469276 (600)	total: 9m 16s	remaining: 3m 4s
700:	learn: 0.1220517	test: 0.1467059	best: 0.1467059 (700)	total: 10m 52s	remaining: 1m 32s
799:	learn: 0.1187871	test: 0.1465542	best: 0.1465280 (774)	total: 12m 11s	remaining: 0us

bestTest = 0.1465280468
bestIteration = 774

Shrink model to first 775 iterations.


<catboost.core.CatBoostClassifier at 0x12d7b6060>

In [None]:
from sklearn.metrics import classification_report

# Make predictions
cb_y_pred = cat_boost_best.predict(X_test)

# Calculate the accuracy
cb_classification_report = classification_report(y_test, cb_y_pred)

print(cb_classification_report)

In [21]:
from catboost import cv

cv_params = cat_boost_params.copy()  # Copy best parameters
cv_params.update({'random_state': 42, 'verbose': 100, 'loss_function': 'Logloss'})

cv_data = cv(
    params=cv_params,
    pool=train_pool,
    fold_count=5,  # 5-Fold Cross-Validation
    early_stopping_rounds=50,
    stratified=True,  # Ensure stratified folds
    partition_random_seed=42
)

# Print the mean ROC-AUC score from cross-validation
print(f"Mean ROC-AUC Score: {cv_data['test-AUC-mean'].max()}")


Training on fold [0/5]
0:	learn: 0.5641634	test: 0.5642853	best: 0.5642853 (0)	total: 1.39s	remaining: 18m 27s
100:	learn: 0.1801657	test: 0.1801100	best: 0.1801100 (100)	total: 1m 2s	remaining: 7m 15s
200:	learn: 0.1482670	test: 0.1521812	best: 0.1521812 (200)	total: 2m 36s	remaining: 7m 46s
300:	learn: 0.1385271	test: 0.1479435	best: 0.1479432 (299)	total: 3m 39s	remaining: 6m 3s
400:	learn: 0.1317433	test: 0.1460771	best: 0.1460771 (400)	total: 4m 56s	remaining: 4m 55s
500:	learn: 0.1267114	test: 0.1454815	best: 0.1454815 (500)	total: 6m 29s	remaining: 3m 52s
600:	learn: 0.1221319	test: 0.1450893	best: 0.1450720 (588)	total: 7m 41s	remaining: 2m 32s
700:	learn: 0.1182474	test: 0.1448789	best: 0.1448552 (692)	total: 9m 24s	remaining: 1m 19s
799:	learn: 0.1147736	test: 0.1446922	best: 0.1446922 (799)	total: 10m 54s	remaining: 0us

bestTest = 0.1446922125
bestIteration = 799

Training on fold [1/5]
0:	learn: 0.5642798	test: 0.5642907	best: 0.5642907 (0)	total: 1.18s	remaining: 15m 40s


KeyError: 'test-AUC-mean'

In [22]:
# Print available columns to check keys
print(cv_data.columns)

# Access the correct metric name
best_logloss = cv_data['test-Logloss-mean'].min()
print(f"Best Mean Logloss: {best_logloss}")


Index(['iterations', 'test-Logloss-mean', 'test-Logloss-std',
       'train-Logloss-mean', 'train-Logloss-std'],
      dtype='object')
Best Mean Logloss: 0.14599035796803722
