# Parameter Tuning and Complex Model Design

In [1]:
# Redo the original bagging model for comparison purposes

In [2]:
import pickle

# Load the data from the pickle file
with open('train_test_data.pkl', 'rb') as f:
    X_train_full, X_test_full, y_train_full, y_test_full = pickle.load(f)

# Randomly sample just 1/10 of the data records to speed up the computation time
X_train = X_train_full.sample(frac=0.01, random_state=101)
y_train = y_train_full.loc[X_train.index]

# Test set remains unchanged
X_test = X_test_full
y_test = y_test_full

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier

In [4]:
# Excluding the unncessary columns from the input features for the models by dropping them now
columns_to_exclude = ['ID', 'Customer_ID', 'Month', 'SSN', 'Type_of_Loan', 'Name', 'Annual_Income']  # List of columns to exclude
# Notice how we are also excluding monthly_inhand_salary because that variable shows signs of multicollinearity with annual_income
X_train = X_train.drop(columns=columns_to_exclude)
X_test = X_test.drop(columns=columns_to_exclude)

In [5]:
# Ensemble Methods - Bagging
bagging = BaggingClassifier()
bagging.fit(X_train, y_train)
y_pred_bagging = bagging.predict(X_test)
bagging_score = accuracy_score(y_test, y_pred_bagging)
print("Bagging Classifier Accuracy:", bagging_score)

Bagging Classifier Accuracy: 0.6477448254556688


In [6]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the target variable
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Instantiate XGBoost classifier
xgb_classifier = xgb.XGBClassifier()

# Train XGBoost classifier using the encoded target variables
xgb_classifier.fit(X_train, y_train_encoded)

# Predict
y_pred_xgb = xgb_classifier.predict(X_test)

# Evaluate accuracy
accuracy_xgb = accuracy_score(y_test_encoded, y_pred_xgb)
print("XG Boost Classifier Accuracy:", accuracy_xgb)

XG Boost Classifier Accuracy: 0.6666666666666666


In [7]:
# Print the classification reports of each of the models

from sklearn.metrics import classification_report

# Transform predicted labels back to original classes just for XG Boost
y_pred_xgb_original = label_encoder.inverse_transform(y_pred_xgb)

print("XG Boost:")
print(classification_report(y_test, y_pred_xgb_original))

print("Bagging:")
print(classification_report(y_test, y_pred_bagging))

XG Boost:
              precision    recall  f1-score   support

        Good       0.50      0.80      0.61      2100
        Poor       0.66      0.71      0.68      4018
    Standard       0.78      0.60      0.68      6830

    accuracy                           0.67     12948
   macro avg       0.65      0.70      0.66     12948
weighted avg       0.70      0.67      0.67     12948

Bagging:
              precision    recall  f1-score   support

        Good       0.46      0.78      0.58      2100
        Poor       0.64      0.74      0.69      4018
    Standard       0.79      0.55      0.65      6830

    accuracy                           0.65     12948
   macro avg       0.63      0.69      0.64     12948
weighted avg       0.69      0.65      0.65     12948



# Parameter Tuning

In [8]:
import numpy as np

# Get feature importances from base estimators
importances = []

for estimator in bagging.estimators_:
    if hasattr(estimator, 'feature_importances_'):
        importances.append(estimator.feature_importances_)

# Aggregate feature importances across all base estimators
importances = np.mean(importances, axis=0)
print("Feature importances:", importances)

Feature importances: [0.03474182 0.03800521 0.02724009 0.02377532 0.03914683 0.01473854
 0.066294   0.02427995 0.05597283 0.04281516 0.21629789 0.18426778
 0.04236843 0.04487706 0.00232976 0.05667367 0.03797162 0.01089858
 0.         0.00097073 0.00094526 0.00278298 0.0003224  0.00230311
 0.00578368 0.00302646 0.00185707 0.00080761 0.00408106 0.00189306
 0.00325648 0.00295902 0.00631654]


In [9]:
# Get feature names
feature_names = X_train.columns  # Assuming X_train is a DataFrame
feature_names

Index(['Age', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'x0_Architect', 'x0_Developer', 'x0_Doctor', 'x0_Engineer',
       'x0_Entrepreneur', 'x0_Journalist', 'x0_Lawyer', 'x0_Manager',
       'x0_Mechanic', 'x0_Media_Manager', 'x0_Musician', 'x0_Scientist',
       'x0_Teacher', 'x0_Writer'],
      dtype='object')

In [10]:
import pandas as pd
df_importance = pd.DataFrame({'Column': feature_names, 'Feature Importance': importances}, columns = ['Column', 'Feature Importance'])
# Sort feature importances in descending order
sorted_importance_df = df_importance.sort_values(by='Feature Importance', ascending=False)
sorted_importance_df

Unnamed: 0,Column,Feature Importance
10,Credit_Mix,0.216298
11,Outstanding_Debt,0.184268
6,Delay_from_due_date,0.066294
15,Total_EMI_per_month,0.056674
8,Changed_Credit_Limit,0.055973
13,Credit_History_Age,0.044877
9,Num_Credit_Inquiries,0.042815
12,Credit_Utilization_Ratio,0.042368
4,Interest_Rate,0.039147
1,Monthly_Inhand_Salary,0.038005


In [11]:
# Originally had this set to a threshold but realized if I am going to continue to rerun it and keep changing the 
# amounto f data included then I need to make this part static so that my application will still work
dropped_features = ['Num_Bank_Accounts', 'Num_Credit_Card', 'Num_of_Loan', 'Num_Credit_Inquiries', 'Payment_of_Min_Amount', 'Payment_Behaviour', 'Monthly_Balance', 'x0_Architect', 'x0_Developer', 'x0_Doctor', 'x0_Engineer', 'x0_Entrepreneur', 'x0_Journalist', 'x0_Lawyer', 'x0_Manager', 'x0_Mechanic', 'x0_Media_Manager', 'x0_Musician', 'x0_Scientist', 'x0_Teacher', 'x0_Writer']


In [12]:
# Assuming X_train is a DataFrame and dropped_features contains the features to be dropped
X_train = X_train.drop(columns=dropped_features)
X_train.columns

Index(['Age', 'Monthly_Inhand_Salary', 'Interest_Rate', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Credit_Mix',
       'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Total_EMI_per_month', 'Amount_invested_monthly'],
      dtype='object')

In [13]:
# Drop the same features from X_test
X_test = X_test.drop(columns=dropped_features)

# Now rerun the model
bagging.fit(X_train, y_train)
y_pred_bagging = bagging.predict(X_test)
bagging_score = accuracy_score(y_test, y_pred_bagging)
print("Bagging Classifier Accuracy after feature selection:", bagging_score)

print("Bagging:")
print(classification_report(y_test, y_pred_bagging))

Bagging Classifier Accuracy after feature selection: 0.6539233858510967
Bagging:
              precision    recall  f1-score   support

        Good       0.47      0.78      0.59      2100
        Poor       0.64      0.74      0.69      4018
    Standard       0.80      0.56      0.66      6830

    accuracy                           0.65     12948
   macro avg       0.64      0.70      0.65     12948
weighted avg       0.70      0.65      0.66     12948



In [14]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the target variable
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Instantiate XGBoost classifier
xgb_classifier = xgb.XGBClassifier()

# Train XGBoost classifier using the encoded target variables
xgb_classifier.fit(X_train, y_train_encoded)

# Predict
y_pred_xgb = xgb_classifier.predict(X_test)

# Evaluate accuracy
accuracy_xgb = accuracy_score(y_test_encoded, y_pred_xgb)
print("XGBoost Classifier Accuracy after feature selection:", accuracy_xgb)

from sklearn.metrics import classification_report

# Transform predicted labels back to original classes just for XG Boost
y_pred_xgb_original = label_encoder.inverse_transform(y_pred_xgb)

print("XG Boost:")
print(classification_report(y_test, y_pred_xgb_original))

XGBoost Classifier Accuracy after feature selection: 0.6582483781278962
XG Boost:
              precision    recall  f1-score   support

        Good       0.48      0.77      0.59      2100
        Poor       0.65      0.71      0.68      4018
    Standard       0.78      0.59      0.67      6830

    accuracy                           0.66     12948
   macro avg       0.64      0.69      0.65     12948
weighted avg       0.69      0.66      0.66     12948



# Parameter Tuning Section

The Bagging model has been chosen as the final model because of its simplicity, low computation time, and high accuracy value. In this section, we will attempt to approve upon its accuracy by using parameter tuning, and other techniques. 

In [15]:
# Access the default parameters
default_params = bagging.get_params()

# This will help decide what hyperparameters to use in the gridsearch and what values to try based on what was used in the original model
print("Default parameters:")
for param, value in default_params.items():
    print(param, ":", value)

Default parameters:
base_estimator : deprecated
bootstrap : True
bootstrap_features : False
estimator : None
max_features : 1.0
max_samples : 1.0
n_estimators : 10
n_jobs : None
oob_score : False
random_state : None
verbose : 0
warm_start : False


In [16]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters grid
param_grid = {
    'n_estimators': [200, 500, 1000],  # Number of base estimators
    'max_samples': [0.7, 1.0, 1.3],  # Number of samples to draw from X to train each base estimator
    'max_features': [0.5, 0.7, 1.0]   # Number of features to draw from X to train each base estimator
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=bagging, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)

# Get the best estimator
best_bagging = grid_search.best_estimator_

# Predict with the best estimator
y_pred_best_bagging = best_bagging.predict(X_test)

# Evaluate accuracy
best_bagging_score = accuracy_score(y_test, y_pred_best_bagging)
print("Tuned Bagging Classifier Accuracy:", best_bagging_score)

print("Bagging:")
print(classification_report(y_test, y_pred_bagging))

45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\lisal\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\lisal\anaconda3\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\lisal\anaconda3\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\lisal\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamete

Best Parameters: {'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 500}
Tuned Bagging Classifier Accuracy: 0.6742354031510658
Bagging:
              precision    recall  f1-score   support

        Good       0.47      0.78      0.59      2100
        Poor       0.64      0.74      0.69      4018
    Standard       0.80      0.56      0.66      6830

    accuracy                           0.65     12948
   macro avg       0.64      0.70      0.65     12948
weighted avg       0.70      0.65      0.66     12948



In [17]:
# Access the default parameters
default_params = xgb_classifier.get_params()

# Print the default parameters
print("Default parameters:")
for param, value in default_params.items():
    print(param, ":", value)


Default parameters:
objective : multi:softprob
use_label_encoder : None
base_score : None
booster : None
callbacks : None
colsample_bylevel : None
colsample_bynode : None
colsample_bytree : None
early_stopping_rounds : None
enable_categorical : False
eval_metric : None
feature_types : None
gamma : None
gpu_id : None
grow_policy : None
importance_type : None
interaction_constraints : None
learning_rate : None
max_bin : None
max_cat_threshold : None
max_cat_to_onehot : None
max_delta_step : None
max_depth : None
max_leaves : None
min_child_weight : None
missing : nan
monotone_constraints : None
n_estimators : 100
n_jobs : None
num_parallel_tree : None
predictor : None
random_state : None
reg_alpha : None
reg_lambda : None
sampling_method : None
scale_pos_weight : None
subsample : None
tree_method : None
validate_parameters : None
verbosity : None


In [18]:
from sklearn.model_selection import GridSearchCV

# Define a grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]
}

# Instantiate XGBoost classifier
xgb_classifier = xgb.XGBClassifier()

# Initialize GridSearch with the defined parameter grid and cross-validation
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=3, scoring='accuracy')

# Perform GridSearch to find the best hyperparameters
grid_search.fit(X_train, y_train_encoded)

# Get the best hyperparameters found by GridSearch
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Predict using the model with the best hyperparameters
best_xgb_classifier = grid_search.best_estimator_
y_pred_xgb_tuned = best_xgb_classifier.predict(X_test)

# Evaluate accuracy
accuracy_xgb_tuned = accuracy_score(y_test_encoded, y_pred_xgb_tuned)
print("Tuned XGBoost Classifier Accuracy:", accuracy_xgb_tuned)

# Transform predicted labels back to original classes just for XG Boost
y_pred_xgb_original = label_encoder.inverse_transform(y_pred_xgb)

print("XG Boost:")
print(classification_report(y_test, y_pred_xgb_original))


Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Tuned XGBoost Classifier Accuracy: 0.6679023787457522
XG Boost:
              precision    recall  f1-score   support

        Good       0.48      0.77      0.59      2100
        Poor       0.65      0.71      0.68      4018
    Standard       0.78      0.59      0.67      6830

    accuracy                           0.66     12948
   macro avg       0.64      0.69      0.65     12948
weighted avg       0.69      0.66      0.66     12948



In [19]:
import joblib
joblib.dump(best_bagging, 'best_bagging_model.pkl')

['best_bagging_model.pkl']

# Final Model Design