# Parameter Tuning and Complex Model Design

In [1]:
# Redo the original bagging model for comparison purposes

In [2]:
import pickle

# Load the data from the pickle file
with open('train_test_data.pkl', 'rb') as f:
    X_train_full, X_test_full, y_train_full, y_test_full = pickle.load(f)

# Randomly sample half of the data records
X_train = X_train_full.sample(frac=0.5, random_state=42)
y_train = y_train_full.loc[X_train.index]

# Test set remains unchanged
X_test = X_test_full
y_test = y_test_full

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier

In [4]:
# Excluding the unncessary columns from the input features for the models by dropping them now
columns_to_exclude = ['ID', 'Customer_ID', 'Month', 'SSN', 'Type_of_Loan', 'Name']  # List of columns to exclude
X_train = X_train.drop(columns=columns_to_exclude)
X_test = X_test.drop(columns=columns_to_exclude)

In [5]:
# Ensemble Methods - Bagging
bagging = BaggingClassifier()
bagging.fit(X_train, y_train)
y_pred_bagging = bagging.predict(X_test)
bagging_score = accuracy_score(y_test, y_pred_bagging)
print("Bagging Classifier Accuracy:", bagging_score)

Bagging Classifier Accuracy: 0.7557924003707136


# Parameter Tuning

In [6]:
import numpy as np

# Get feature importances from base estimators
importances = []

for estimator in bagging.estimators_:
    if hasattr(estimator, 'feature_importances_'):
        importances.append(estimator.feature_importances_)

# Aggregate feature importances across all base estimators
importances = np.mean(importances, axis=0)
print("Feature importances:", importances)

Feature importances: [3.88640476e-02 3.25373305e-02 3.38686333e-02 1.99899181e-02
 2.80303298e-02 6.90990634e-02 1.43620049e-02 4.87702505e-02
 3.52910206e-02 6.31087715e-02 2.69108118e-02 1.16119889e-01
 1.80523650e-01 5.50448508e-02 5.93599777e-02 5.53911262e-03
 4.68399443e-02 5.69409679e-02 2.04020587e-02 1.12810518e-05
 3.97498350e-03 3.53875688e-03 4.16166832e-03 3.08080161e-03
 3.37289461e-03 3.02300553e-03 3.86668760e-03 3.91518549e-03
 3.37289491e-03 3.18058696e-03 3.24150526e-03 3.42817313e-03
 3.18395532e-03 3.04498638e-03]


In [7]:
# Get feature names
feature_names = X_train.columns  # Assuming X_train is a DataFrame
feature_names

Index(['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'x0_Architect', 'x0_Developer', 'x0_Doctor', 'x0_Engineer',
       'x0_Entrepreneur', 'x0_Journalist', 'x0_Lawyer', 'x0_Manager',
       'x0_Mechanic', 'x0_Media_Manager', 'x0_Musician', 'x0_Scientist',
       'x0_Teacher', 'x0_Writer'],
      dtype='object')

In [8]:
import pandas as pd
df_importance = pd.DataFrame({'Column': feature_names, 'Feature Importance': importances}, columns = ['Column', 'Feature Importance'])
# Sort feature importances in descending order
sorted_importance_df = df_importance.sort_values(by='Feature Importance', ascending=False)
sorted_importance_df

Unnamed: 0,Column,Feature Importance
12,Outstanding_Debt,0.180524
11,Credit_Mix,0.11612
5,Interest_Rate,0.069099
9,Changed_Credit_Limit,0.063109
14,Credit_History_Age,0.05936
17,Amount_invested_monthly,0.056941
13,Credit_Utilization_Ratio,0.055045
7,Delay_from_due_date,0.04877
16,Total_EMI_per_month,0.04684
0,Age,0.038864


In [9]:
# Set the threshold cutoff to 0.01
importance_threshold = 0.01

# Assuming df_importance is a DataFrame with 'Symptom' as a column
low_importance = df_importance[df_importance['Feature Importance'] <= importance_threshold]['Column']

dropped_features = list(low_importance)

dropped_features

['Payment_of_Min_Amount',
 'Monthly_Balance',
 'x0_Architect',
 'x0_Developer',
 'x0_Doctor',
 'x0_Engineer',
 'x0_Entrepreneur',
 'x0_Journalist',
 'x0_Lawyer',
 'x0_Manager',
 'x0_Mechanic',
 'x0_Media_Manager',
 'x0_Musician',
 'x0_Scientist',
 'x0_Teacher',
 'x0_Writer']

In [10]:
# Assuming X_train is a DataFrame and dropped_features contains the features to be dropped
X_train = X_train.drop(columns=dropped_features)
X_train.columns

Index(['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour'],
      dtype='object')

In [11]:
# select less rows for the models so that it doesn't take an hour to run

In [12]:
# Drop the same features from X_test
X_test = X_test.drop(columns=dropped_features)

# Now rerun the model
bagging.fit(X_train, y_train)
y_pred_bagging = bagging.predict(X_test)
bagging_score = accuracy_score(y_test, y_pred_bagging)
print("Bagging Classifier Accuracy after feature selection:", bagging_score)


Bagging Classifier Accuracy after feature selection: 0.7606580166821131


# Parameter Tuning Section

The Bagging model has been chosen as the final model because of its simplicity, low computation time, and high accuracy value. In this section, we will attempt to approve upon its accuracy by using parameter tuning, and other techniques. 

In [13]:
# Access the default parameters
default_params = bagging.get_params()

# This will help decide what hyperparameters to use in the gridsearch and what values to try based on what was used in the original model
print("Default parameters:")
for param, value in default_params.items():
    print(param, ":", value)

Default parameters:
base_estimator : deprecated
bootstrap : True
bootstrap_features : False
estimator : None
max_features : 1.0
max_samples : 1.0
n_estimators : 10
n_jobs : None
oob_score : False
random_state : None
verbose : 0
warm_start : False


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters grid
param_grid = {
    'n_estimators': [200, 500, 1000],  # Number of base estimators
    'max_samples': [0.7, 1.0, 1.3],  # Number of samples to draw from X to train each base estimator
    'max_features': [0.5, 0.7, 1.0]   # Number of features to draw from X to train each base estimator
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=bagging, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)

# Get the best estimator
best_bagging = grid_search.best_estimator_

# Predict with the best estimator
y_pred_best_bagging = best_bagging.predict(X_test)

# Evaluate accuracy
best_bagging_score = accuracy_score(y_test, y_pred_best_bagging)
print("Tuned Bagging Classifier Accuracy:", best_bagging_score)


In [None]:
import joblib
joblib.dump(best_bagging, 'best_bagging_model.pkl')

# Final Model Design