### Now that we have performed EDA and finished feature engineering, we now train and test our model.

In [1]:
# Retrieve the dataset stored from 'feature_engineering.ipynb'
%store -r df_sample

In [20]:
# Imports
import pandas as pd
import polars as pl
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import shap
import plotly.express as px
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE

In [3]:
df_sample.tail(4)

Unnamed: 0,customer_id,age,dependents,annual_income,loan_amount,loan_term,interest_rate,loan_to_value_ratio,credit_score,debt_to_income_ratio,...,credit_score_income_interaction,loan_amount_interest_rate_interaction,log_debt_to_income_ratio,dti_bracket_Moderate,dti_bracket_High,dti_bracket_Very High,credit_score_bucket_Fair,credit_score_bucket_Good,credit_score_bucket_Very Good,credit_score_bucket_Excellent
10564103,110093584,-1.030666,3,0.913511,-0.121201,-1.558319,16.17,-0.626402,659,-0.599836,...,602.003886,-1.959816,-0.91588,False,False,False,True,False,False,False
13909598,144944137,0.87878,4,-0.319053,0.936372,-0.241441,5.88,1.62518,527,0.388164,...,-168.140952,5.505867,0.327982,False,True,False,False,False,False,False
19278490,52004965,-1.598339,6,0.621393,-0.396575,1.075437,5.06,1.313422,344,1.093877,...,213.759247,-2.006672,0.739018,False,False,True,False,False,False,False
14598909,152119315,-1.598339,2,0.831615,-1.584405,-0.121725,18.25,1.244143,792,-1.587835,...,658.63894,-28.915399,,False,False,False,False,False,True,False


In [4]:
# Split the data into features (X) and target (y)
X = df_sample.drop(columns=['default'])  # 'default' is the target variable
y = df_sample['default']

# Convert categorical variables to numeric using one-hot encoding (if not already done)
X_encoded = pd.get_dummies(X, drop_first=True)

# Ensure that all columns are numeric
assert X_encoded.select_dtypes(include=['object']).empty, "There are still object columns!"

In [5]:
# Initialize RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [6]:
print("Missing values in each column:")
print(X_encoded.isnull().sum())

Missing values in each column:
customer_id                                  0
age                                          0
dependents                                   0
annual_income                                0
loan_amount                                  0
loan_term                                    0
interest_rate                                0
loan_to_value_ratio                          0
credit_score                                 0
debt_to_income_ratio                         0
delinquencies                                0
credit_history_length                        0
default_amount                               0
repayment_tenure                             0
log_loan_amount                              0
age_income_interaction                       0
credit_to_income_ratio                       0
gender_Female                                0
gender_Male                                  0
marital_status_Divorced                      0
marital_status_Married       

In [7]:
# Fill missing values with the median for numeric columns ('log_debt_to_income_ratio')
X_encoded.fillna(X_encoded.median(), inplace=True)

In [8]:
print("Missing values in each column:")
print(X_encoded.isnull().sum())

Missing values in each column:
customer_id                              0
age                                      0
dependents                               0
annual_income                            0
loan_amount                              0
loan_term                                0
interest_rate                            0
loan_to_value_ratio                      0
credit_score                             0
debt_to_income_ratio                     0
delinquencies                            0
credit_history_length                    0
default_amount                           0
repayment_tenure                         0
log_loan_amount                          0
age_income_interaction                   0
credit_to_income_ratio                   0
gender_Female                            0
gender_Male                              0
marital_status_Divorced                  0
marital_status_Married                   0
marital_status_Single                    0
marital_status_Widowed 

In [9]:
# Fit the model to the data
rf_model.fit(X_encoded, y)

KeyboardInterrupt: 

In [None]:
# Get feature importances
feature_importances = rf_model.feature_importances_

In [None]:
# Create a DataFrame to store the feature importances
importance_df = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

In [None]:
# Print the most important features
print(importance_df.head(10))  # Top 10 features by importance

In [None]:
importance_df

In [10]:
# Drop customer_id from the dataset (worthless column)
X_encoded = X_encoded.drop(columns=['customer_id'])

In [None]:
# Refit the model
rf_model.fit(X_encoded, y)

In [None]:
# Recalculate feature importances
feature_importances = rf_model.feature_importances_

In [None]:
# Recreate a DataFrame to store the updated feature importances
importance_df = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

In [None]:
# Print the most important features again
importance_df.head(20)
# 'default_amount' feature contributes approximately 5.32% of the decision-making in the Random Forest model
# 'interest_rate': 5.10%
# etc.

In [None]:


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
xgb_model.fit(X_train, y_train)

# Check the feature importance from XGBoost
xgb_importance = xgb_model.feature_importances_

# Create a DataFrame for XGBoost feature importances
xgb_importance_df = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Importance': xgb_importance
}).sort_values(by='Importance', ascending=False)

# Print the top 10 features
print(xgb_importance_df.head(10))

In [None]:


# Cross-validation for Random Forest
rf_cv_scores = cross_val_score(rf_model, X_encoded, y, cv=5, scoring='roc_auc')
print("Random Forest AUC-ROC scores:", rf_cv_scores)
print("Random Forest Mean AUC-ROC:", rf_cv_scores.mean())

In [None]:
# Cross-validation for XGBoost
xgb_cv_scores = cross_val_score(xgb_model, X_encoded, y, cv=5, scoring='roc_auc')
print("XGBoost AUC-ROC scores:", xgb_cv_scores)
print("XGBoost Mean AUC-ROC:", xgb_cv_scores.mean())

In [None]:


# Initialize SHAP explainer for XGBoost
explainer = shap.TreeExplainer(xgb_model)

# Compute SHAP values for the test set
shap_values = explainer.shap_values(X_test)

# Plot SHAP summary plot
shap.summary_plot(shap_values, X_test)

## Logistic Regression Algorithm

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [12]:
# Initialize the Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000, random_state=42)

# Fit the model to the training data
logreg_model.fit(X_train, y_train)

In [15]:
# Perform cross-validation using AUC-ROC as the scoring metric
logreg_cv_scores = cross_val_score(logreg_model, X_encoded, y, cv=5, scoring='roc_auc')

# Print the cross-validation AUC-ROC scores
print("Logistic Regression AUC-ROC scores:", logreg_cv_scores)
print("Logistic Regression Mean AUC-ROC:", logreg_cv_scores.mean())

Logistic Regression AUC-ROC scores: [0.50433156 0.503581   0.50395051 0.50001962 0.50286054]
Logistic Regression Mean AUC-ROC: 0.5029486480498919


In [18]:
# Predict probabilities on the test set
y_pred_proba = logreg_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class (1)

# Calculate AUC-ROC on the test set
auc_roc_test = roc_auc_score(y_test, y_pred_proba)

# Print the test AUC-ROC
print("Test AUC-ROC for Logistic Regression:", auc_roc_test)

Test AUC-ROC for Logistic Regression: 0.506777213133351


In [19]:
from sklearn.metrics import classification_report

# Predict class labels on the test set
y_pred = logreg_model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67     20056
           1       0.00      0.00      0.00     19944

    accuracy                           0.50     40000
   macro avg       0.25      0.50      0.33     40000
weighted avg       0.25      0.50      0.33     40000

Confusion Matrix:
[[20056     0]
 [19944     0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Assuming X_encoded and y are your features and target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the new class distribution after SMOTE
from collections import Counter
print(f"Original class distribution: {Counter(y_train)}")
print(f"Resampled class distribution: {Counter(y_train_resampled)}")

Original class distribution: Counter({0: 80245, 1: 79755})
Resampled class distribution: Counter({1: 80245, 0: 80245})


In [22]:
# Initialize and train the Logistic Regression model on the resampled data
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train_resampled, y_train_resampled)

# Predict probabilities on the test set
y_pred_proba = logreg_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class (1)

# Calculate AUC-ROC on the test set
auc_roc_test = roc_auc_score(y_test, y_pred_proba)
print("Test AUC-ROC for Logistic Regression after SMOTE:", auc_roc_test)

Test AUC-ROC for Logistic Regression after SMOTE: 0.5054497752262377


In [23]:
# Predict class labels on the test set
y_pred = logreg_model.predict(X_test)

# Print the classification report
print("Classification Report after SMOTE:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix after SMOTE:")
print(confusion_matrix(y_test, y_pred))

Classification Report after SMOTE:
              precision    recall  f1-score   support

           0       0.50      0.58      0.54     20056
           1       0.50      0.42      0.46     19944

    accuracy                           0.50     40000
   macro avg       0.50      0.50      0.50     40000
weighted avg       0.50      0.50      0.50     40000

Confusion Matrix after SMOTE:
[[11554  8502]
 [11468  8476]]


In [24]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation using AUC-ROC as the scoring metric on resampled data
logreg_cv_scores = cross_val_score(logreg_model, X_train_resampled, y_train_resampled, cv=5, scoring='roc_auc')

# Print the cross-validation AUC-ROC scores
print("Logistic Regression AUC-ROC scores after SMOTE:", logreg_cv_scores)
print("Logistic Regression Mean AUC-ROC after SMOTE:", logreg_cv_scores.mean())

Logistic Regression AUC-ROC scores after SMOTE: [0.50076342 0.50049134 0.50374692 0.49661323 0.50481544]
Logistic Regression Mean AUC-ROC after SMOTE: 0.5012860712982312
