# **HYPERPARAMETER TUNING**

**LOAD DATA**

In [None]:
# Import libraries
import os
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from google.colab import drive
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from scipy.stats.mstats import winsorize
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import recall_score, precision_score, accuracy_score, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

In [None]:
# Mount Google Drive at /content/drive; every data and model path used in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the top-level entries of MyDrive to confirm the mount succeeded and that
# the folders referenced later (e.g. split_P2_data) are present.
os.listdir('/content/drive/MyDrive')

['Colab Notebooks',
 'split_P2_data',
 'LC_loans_granting_model_dataset.csv',
 'processed_loan_data.csv',
 'processed_loan_data.parquet',
 'LCDataDictionary.xlsx',
 'NOTE2.ipynb',
 'loan_2019_2020.csv',
 'loan_2019_2020_filtered.csv',
 'loan_2019_2020_final.csv',
 'Portfolio - Bertie Dickinson.ipynb',
 'd1_cleaned.csv',
 'categorical_variable_plots.png',
 'categorical_default_non_default_rate_plots.png',
 'Default_Rate_Of_Categorical_variable.png',
 'numerical_density_plots_by_default.png',
 'Categorical_Default_non_Default_Count.png',
 'Numerical_density_plots_by_default.png',
 'FE_data.csv',
 'Feature_Engineering_data.csv',
 'processed_dataset.xls',
 'Categorical_Default_non_Default_Rate.png',
 'PHASE_2_data.csv',
 'project_data',
 'Phase_2_data_splits',
 'Phase_2_data_preprocessed',
 'Phase_2_data_preprocessed_csv',
 'Phase_2_models',
 'sd_30.csv',
 'P1_data.csv',
 'Innitial Data Cleaning.ipynb',
 'P2_data.csv',
 'First.ipynb']

In [None]:
# Locations of the pre-split train / test CSV files on the mounted Drive
train_path = '/content/drive/MyDrive/split_P2_data/full_train_set.csv'
test_path  = '/content/drive/MyDrive/split_P2_data/test_set.csv'

# Read both splits in one pass (train first, then test)
full_train, test_set = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Sanity check: (rows, columns) of each split
print(f"Full train shape: {full_train.shape}")
print(f"Test set shape: {test_set.shape}")

Full train shape: (255194, 38)
Test set shape: (114845, 38)


In [None]:
# Training split: everything except 'default' is a feature, 'default' is the target
X_train_df, y_train = full_train.drop(columns='default'), full_train['default']

# Test split: same separation
X_test_df, y_test = test_set.drop(columns='default'), test_set['default']

# Report the resulting shapes so a mismatch is visible immediately
print(f"X_train shape: {X_train_df.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test_df.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (255194, 37)
y_train shape: (255194,)
X_test shape: (114845, 37)
y_test shape: (114845,)


_

## **1. LOGISTIC REGRESSION**

In [None]:
# ---------------------------
# 1. Set up logistic regression and hyperparameter grid
# ---------------------------
# random_state is pinned because the liblinear solver shuffles the data
# internally; without a seed, repeated runs can give slightly different
# coefficients and CV scores. 42 matches the seed used by the other models
# in this notebook.
logreg = LogisticRegression(max_iter=1000, random_state=42)

param_grid = {
    'C': [0.01, 0.1, 1, 10],          # inverse regularisation strength
    'penalty': ['l1', 'l2'],
    'class_weight': ['balanced'],     # re-weight classes inversely to their frequency
    'solver': ['liblinear']           # supports both l1 and l2
}

# ---------------------------
# 2. Grid search with cross-validation
# ---------------------------
# 4 values of C x 2 penalties = 8 candidates, each scored by 5-fold CV ROC-AUC.
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=-1,
    return_train_score=True   # kept so train vs. validation AUC can be compared below
)

# Fit on training data
grid_search.fit(X_train_df, y_train)

# ---------------------------
# 3. Best parameters and score
# ---------------------------
print("✅ Best Parameters:", grid_search.best_params_)
print("✅ Best ROC AUC Score (CV):", grid_search.best_score_)

# ---------------------------
# 4. View tuning results as a DataFrame
# ---------------------------
cv_results_df = pd.DataFrame(grid_search.cv_results_)
display(
    cv_results_df[['params', 'mean_test_score', 'std_test_score', 'mean_train_score']]
    .sort_values(by='mean_test_score', ascending=False)
)

# ---------------------------
# 5. Save best model
# ---------------------------
# best_estimator_ is the winning configuration refit on the whole training set.
best_model = grid_search.best_estimator_

# NOTE(review): this cell writes to Phase_3_models while the other model cells in
# this notebook write to Phase_2_models — confirm which folder is intended.
save_path = "/content/drive/MyDrive/Phase_3_models/"
os.makedirs(save_path, exist_ok=True)
model_filename = os.path.join(save_path, 'best_logistic_regression_balanced.pkl')

joblib.dump(best_model, model_filename)
print(f"✅ Best model saved to: {model_filename}")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
✅ Best Parameters: {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}
✅ Best ROC AUC Score (CV): 0.7082674152770324


Unnamed: 0,params,mean_test_score,std_test_score,mean_train_score
1,"{'C': 0.01, 'class_weight': 'balanced', 'penal...",0.708267,0.00158,0.708542
2,"{'C': 0.1, 'class_weight': 'balanced', 'penalt...",0.708266,0.001575,0.708545
3,"{'C': 0.1, 'class_weight': 'balanced', 'penalt...",0.708263,0.001571,0.708542
4,"{'C': 1, 'class_weight': 'balanced', 'penalty'...",0.708262,0.001569,0.708541
5,"{'C': 1, 'class_weight': 'balanced', 'penalty'...",0.708262,0.001569,0.708541
7,"{'C': 10, 'class_weight': 'balanced', 'penalty...",0.708262,0.001569,0.708541
6,"{'C': 10, 'class_weight': 'balanced', 'penalty...",0.708261,0.001569,0.708541
0,"{'C': 0.01, 'class_weight': 'balanced', 'penal...",0.708129,0.001626,0.708413


✅ Best model saved to: /content/drive/MyDrive/Phase_3_models/best_logistic_regression_balanced.pkl


_

## **2. RANDOM FOREST**

In [None]:
# imbalanced-learn's Pipeline is required for the sampler steps below:
# scikit-learn's Pipeline rejects intermediate steps that lack `transform`
# (samplers only expose `fit_resample`). The imblearn version resamples during
# `fit` only, so each validation fold is scored on its original distribution.
from imblearn.pipeline import Pipeline as ImbPipeline

# Class-balancing strategies to compare. 'None' is the unbalanced baseline and
# 'class_weight' is handled through the classifier rather than a sampler.
sampling_methods = {
    'None': None,
    'class_weight': 'balanced',
    'over': RandomOverSampler(random_state=42),
    'under': RandomUnderSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42),
    'SMOTE+ENN': SMOTEENN(random_state=42)
}

# Stratified 5-fold CV (shuffled, seeded) shared by every strategy so that the
# scores are comparable across methods.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Random-forest settings held constant across strategies; only the balancing
# method varies between runs.
rf_base_params = dict(
    n_estimators=500,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

# One record per strategy: name, mean and std of the fold ROC-AUC scores
results = []

for method_name, sampler in sampling_methods.items():
    if method_name == 'class_weight':
        # Balance through per-class weights inside the forest itself.
        # This check must come first: the dict value here is the string
        # 'balanced', not a sampler object.
        estimator = RandomForestClassifier(class_weight='balanced', **rf_base_params)
    elif sampler is not None:
        # Resample the training folds, then fit the forest on the resampled data
        estimator = ImbPipeline([
            ('sampler', sampler),
            ('classifier', RandomForestClassifier(**rf_base_params))
        ])
    else:  # No sampling
        estimator = RandomForestClassifier(**rf_base_params)

    scores = cross_val_score(estimator, X_train_df, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)

    results.append({'sampling': method_name, 'roc_auc_mean': scores.mean(), 'roc_auc_std': scores.std()})
    print(f"Evaluated: {method_name}, mean ROC-AUC: {scores.mean():.4f}")

# Convert results to DataFrame, best strategy first
results_df = pd.DataFrame(results).sort_values(by='roc_auc_mean', ascending=False).reset_index(drop=True)
results_df

Evaluated: None, mean ROC-AUC: 0.7092
Evaluated: class_weight, mean ROC-AUC: 0.7068
Evaluated: over, mean ROC-AUC: 0.7073
Evaluated: under, mean ROC-AUC: 0.7084
Evaluated: SMOTE, mean ROC-AUC: 0.6964
Evaluated: SMOTE+ENN, mean ROC-AUC: 0.6988


Unnamed: 0,sampling,roc_auc_mean,roc_auc_std
0,,0.709248,0.002466
1,under,0.708382,0.002524
2,over,0.70725,0.00269
3,class_weight,0.706765,0.002889
4,SMOTE+ENN,0.698812,0.002927
5,SMOTE,0.696439,0.002472


In [None]:
# === Step 1: base estimator ===
# Only the seed and parallelism are fixed here; every tuned hyperparameter is
# supplied by the grid below.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# === Step 2: search space (2 x 2 x 2 x 2 = 16 candidates) ===
param_grid_rf = dict(
    n_estimators=[100, 300],
    max_depth=[10, 20],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 4],
    max_features=['sqrt'],        # single value: not varied in this search
    class_weight=['balanced'],    # single value: imbalance handled by class weights
)

# === Step 3: exhaustive search, 5-fold CV, ranked by ROC-AUC ===
grid_search = GridSearchCV(
    rf,
    param_grid_rf,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=3,
    return_train_score=True,
)

# === Step 4: run the search ===
grid_search.fit(X_train_df, y_train)

# === Step 5: report the winner and the full ranking ===
print("\n✅ Best Parameters:", grid_search.best_params_)
print("✅ Best ROC AUC Score (CV):", grid_search.best_score_)

# Tabulate every candidate; train score is shown next to the validation score
# so the gap between the two is visible.
cv_results_df = pd.DataFrame(grid_search.cv_results_)
display(
    cv_results_df
    .sort_values('mean_test_score', ascending=False)
    [['params', 'mean_test_score', 'std_test_score', 'mean_train_score']]
)

# === Step 6: persist the refit best estimator ===
best_rf_model = grid_search.best_estimator_

save_path = "/content/drive/MyDrive/Phase_2_models/"
os.makedirs(save_path, exist_ok=True)
rf_model_filename = os.path.join(save_path, 'best_random_forest_auc.pkl')

joblib.dump(best_rf_model, rf_model_filename)
print(f"\n✅ Best Random Forest model saved to: {rf_model_filename}")


Fitting 5 folds for each of 16 candidates, totalling 80 fits





✅ Best Parameters: {'class_weight': 'balanced', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
✅ Best ROC AUC Score (CV): 0.7234286969664726


Unnamed: 0,params,mean_test_score,std_test_score,mean_train_score
15,"{'class_weight': 'balanced', 'max_depth': 20, ...",0.723429,0.000845,0.969635
13,"{'class_weight': 'balanced', 'max_depth': 20, ...",0.723228,0.000979,0.974093
11,"{'class_weight': 'balanced', 'max_depth': 20, ...",0.72168,0.000855,0.98397
14,"{'class_weight': 'balanced', 'max_depth': 20, ...",0.720913,0.000831,0.967796
12,"{'class_weight': 'balanced', 'max_depth': 20, ...",0.720872,0.000906,0.972359
10,"{'class_weight': 'balanced', 'max_depth': 20, ...",0.718631,0.000977,0.982593
7,"{'class_weight': 'balanced', 'max_depth': 10, ...",0.717611,0.000631,0.747618
1,"{'class_weight': 'balanced', 'max_depth': 10, ...",0.717574,0.000718,0.749329
3,"{'class_weight': 'balanced', 'max_depth': 10, ...",0.717534,0.000622,0.748178
5,"{'class_weight': 'balanced', 'max_depth': 10, ...",0.717518,0.000635,0.747699



✅ Best Random Forest model saved to: /content/drive/MyDrive/Phase_2_models/best_random_forest_auc.pkl


_

## **3. XGBOOST**

In [None]:
# Compute scale_pos_weight = (# negatives) / (# positives) so that errors on the
# positive (default == 1) class are up-weighted in proportion to its rarity.
# Vectorised .sum() avoids iterating the Series element by element in Python.
neg = int((y_train == 0).sum())
pos = int((y_train == 1).sum())
scale = neg / pos

# Define model
# `use_label_encoder` is intentionally not passed: the installed XGBoost ignores
# it and only emits a "Parameters: { "use_label_encoder" } are not used" warning.
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    scale_pos_weight=scale,
    n_jobs=-1,
    random_state=42
)

# Parameter grid (reduced)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8],
    'min_child_weight': [1, 5],
    'gamma': [0, 1]
}

# Randomized Search with 3-fold CV: 16 sampled settings instead of the full grid
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=16,  # number of parameter settings to try
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=3,
    random_state=42,          # fixes which 16 settings are sampled
    return_train_score=True
)

# Fit Randomized Search
random_search.fit(X_train_df, y_train)

# Display best results
print("\n✅ Best Parameters from Randomized Search:", random_search.best_params_)
print("✅ Best ROC AUC Score (CV) from Randomized Search:", random_search.best_score_)

# Save CV results
cv_results_df = pd.DataFrame(random_search.cv_results_)
display(
    cv_results_df[['params', 'mean_test_score', 'std_test_score', 'mean_train_score']]
    .sort_values(by='mean_test_score', ascending=False)
)

# Get the best estimator (refit on the full training data by the search)
best_xgb_model = random_search.best_estimator_


Fitting 3 folds for each of 16 candidates, totalling 48 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



✅ Best Parameters from Randomized Search: {'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
✅ Best ROC AUC Score (CV) from Randomized Search: 0.7328464216417618


Unnamed: 0,params,mean_test_score,std_test_score,mean_train_score
3,"{'subsample': 0.8, 'n_estimators': 300, 'min_c...",0.732846,0.00095,0.755676
1,"{'subsample': 0.6, 'n_estimators': 300, 'min_c...",0.732377,0.00073,0.755174
0,"{'subsample': 0.8, 'n_estimators': 200, 'min_c...",0.731682,0.000801,0.746737
2,"{'subsample': 0.6, 'n_estimators': 300, 'min_c...",0.731389,0.000784,0.741175
5,"{'subsample': 0.8, 'n_estimators': 200, 'min_c...",0.729426,0.000545,0.736964
10,"{'subsample': 0.6, 'n_estimators': 100, 'min_c...",0.725289,0.000685,0.729563
15,"{'subsample': 0.6, 'n_estimators': 100, 'min_c...",0.725289,0.000685,0.729563
8,"{'subsample': 0.6, 'n_estimators': 300, 'min_c...",0.718313,0.000222,0.721647
11,"{'subsample': 0.6, 'n_estimators': 300, 'min_c...",0.718262,0.000237,0.721801
13,"{'subsample': 0.8, 'n_estimators': 300, 'min_c...",0.718041,0.00011,0.721468


In [None]:
# Hold out a stratified 20% of the training data to monitor early stopping.
# No validation split is created anywhere earlier in this notebook, so it is
# built here. The test set is deliberately not used for this purpose, so that
# it remains an unbiased final check.
X_fit_df, X_val_df, y_fit, y_val = train_test_split(
    X_train_df, y_train,
    test_size=0.2,
    stratify=y_train,     # keep the default / non-default ratio in both parts
    random_state=42
)

# Set early stopping: stop once validation AUC has not improved for 10 rounds
best_xgb_model.set_params(early_stopping_rounds=10)

# Refit on the fitting portion only; the held-out rows must not be seen during
# training, otherwise the early-stopping signal would be optimistic.
best_xgb_model.fit(
    X_fit_df, y_fit,
    eval_set=[(X_val_df, y_val)],
    verbose=False
)

# Save the trained model
save_path = "/content/drive/MyDrive/Phase_2_models/"
os.makedirs(save_path, exist_ok=True)
xgb_model_filename = os.path.join(save_path, 'best_xgboost_auc_early_stopping.pkl')
joblib.dump(best_xgb_model, xgb_model_filename)

print(f"\n✅ Best XGBoost model saved to: {xgb_model_filename}")


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()



✅ Best XGBoost model saved to: /content/drive/MyDrive/Phase_2_models/best_xgboost_auc_early_stopping.pkl


_

## **4. LIGHTGBM**

In [None]:
# Class-imbalance weight: number of negative (0) labels per positive (1) label
neg = int((y_train == 0).sum())
pos = int((y_train == 1).sum())
scale_pos_weight = neg / pos

# Base LightGBM classifier; tuned hyperparameters are supplied later by the search
lgbm = LGBMClassifier(
    n_jobs=-1,
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    metric='auc',
    objective='binary',
)

print("✅ LGBMClassifier model instantiated with scale_pos_weight.")

✅ LGBMClassifier model instantiated with scale_pos_weight.


In [None]:
# Define parameter grid for LightGBM (reduced)
# Two LightGBM-specific points:
#  * `min_split_gain` is used for the minimum loss reduction needed to split;
#    `gamma` is the XGBoost name and LightGBM does not recognise it, so it would
#    have no effect on the candidates.
#  * `subsample` only takes effect when `subsample_freq` > 0 (bagging is off
#    when the frequency is 0), so the frequency is pinned to 1.
param_grid_lgbm = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 0.8],
    'subsample_freq': [1],
    'colsample_bytree': [0.6, 0.8],
    'min_child_weight': [1, 5],
    'min_split_gain': [0, 1]
}

# Print the parameter grid
print("✅ LightGBM Parameter Grid:")
print(param_grid_lgbm)

✅ LightGBM Parameter Grid:
{'n_estimators': [100, 200, 300], 'max_depth': [3, 4], 'learning_rate': [0.01, 0.1], 'subsample': [0.6, 0.8], 'colsample_bytree': [0.6, 0.8], 'min_child_weight': [1, 5], 'gamma': [0, 1]}


In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# 3-fold stratified splitter, shuffled with a fixed seed
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Randomized search: 16 sampled settings from the LightGBM grid, ranked by ROC-AUC
random_search_lgbm = RandomizedSearchCV(
    lgbm,
    param_grid_lgbm,
    n_iter=16,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=3,
    random_state=42,
    return_train_score=True,
)

# Run the search on the training data
random_search_lgbm.fit(X_train_df, y_train)

# Report the winning configuration and its cross-validated score
print("\n✅ Best Parameters from Randomized Search:", random_search_lgbm.best_params_)
print("✅ Best ROC AUC Score (CV) from Randomized Search:", random_search_lgbm.best_score_)

# Tabulate every sampled candidate, best validation score first
cv_results_df_lgbm = pd.DataFrame(random_search_lgbm.cv_results_)
display(
    cv_results_df_lgbm
    .sort_values('mean_test_score', ascending=False)
    [['params', 'mean_test_score', 'std_test_score', 'mean_train_score']]
)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Number of positive: 68657, number of negative: 233269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.106380 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2416
[LightGBM] [Info] Number of data points in the train set: 301926, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.227397 -> initscore=-1.223069
[LightGBM] [Info] Start training from score -1.223069

✅ Best Parameters from Randomized Search: {'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
✅ Best ROC AUC Score (CV) from Randomized Search: 0.732606959440066


Unnamed: 0,params,mean_test_score,std_test_score,mean_train_score
3,"{'subsample': 0.8, 'n_estimators': 300, 'min_c...",0.732607,0.000867,0.755573
1,"{'subsample': 0.6, 'n_estimators': 300, 'min_c...",0.732592,0.000771,0.755584
0,"{'subsample': 0.8, 'n_estimators': 200, 'min_c...",0.731175,0.000709,0.746905
2,"{'subsample': 0.6, 'n_estimators': 300, 'min_c...",0.73098,0.000609,0.740844
5,"{'subsample': 0.8, 'n_estimators': 200, 'min_c...",0.729166,0.000478,0.736897
10,"{'subsample': 0.6, 'n_estimators': 100, 'min_c...",0.725232,0.00053,0.729955
15,"{'subsample': 0.6, 'n_estimators': 100, 'min_c...",0.725232,0.00053,0.729955
13,"{'subsample': 0.8, 'n_estimators': 300, 'min_c...",0.718484,0.000125,0.722444
8,"{'subsample': 0.6, 'n_estimators': 300, 'min_c...",0.718484,0.000125,0.722444
11,"{'subsample': 0.6, 'n_estimators': 300, 'min_c...",0.71847,0.000264,0.722551


In [None]:
from lightgbm.callback import early_stopping

# Access the best LightGBM model from the Randomized Search object
best_lgbm_model = random_search_lgbm.best_estimator_

# Hold out a stratified 20% of the training data to monitor early stopping.
# No validation split is created anywhere earlier in this notebook, so it is
# built here. The test set is deliberately not used for this purpose, so that
# it remains an unbiased final check.
X_fit_df, X_val_df, y_fit, y_val = train_test_split(
    X_train_df, y_train,
    test_size=0.2,
    stratify=y_train,     # keep the default / non-default ratio in both parts
    random_state=42
)

# Refit on the fitting portion only and stop once the validation metric has not
# improved for 10 consecutive rounds.
best_lgbm_model.fit(
    X_fit_df, y_fit,
    eval_set=[(X_val_df, y_val)],
    callbacks=[early_stopping(stopping_rounds=10, verbose=False)]
)

# Save the trained model
save_path = "/content/drive/MyDrive/Phase_2_models/"
os.makedirs(save_path, exist_ok=True)
lgbm_model_filename = os.path.join(save_path, 'best_lightgbm_auc_early_stopping.pkl')
joblib.dump(best_lgbm_model, lgbm_model_filename)

print(f"\n✅ Best LightGBM model saved to: {lgbm_model_filename}")

[LightGBM] [Info] Number of positive: 68657, number of negative: 233269
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054123 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2416
[LightGBM] [Info] Number of data points in the train set: 301926, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.227397 -> initscore=-1.223069
[LightGBM] [Info] Start training from score -1.223069

✅ Best LightGBM model saved to: /content/drive/MyDrive/Phase_2_models/best_lightgbm_auc_early_stopping.pkl


_