In [1]:
# Import the modules
import csv
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the dataset into a pandas DataFrame.
# The path is relative to the notebook's working directory.
data = Path('larosalía.csv')
df = pd.read_csv(filepath_or_buffer=data)

# Preview the first five rows
df.head(n=5)


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
0,1,20000,2.0,2.0,1.0,24.0,2,2.0,-1.0,-1.0,...,0.0,0,0.0,0,689.0,0.0,0.0,0,0.0,1
1,2,120000,2.0,2.0,2.0,26.0,-1,2.0,0.0,0.0,...,3272.0,3455,3261.0,0,1000.0,1000.0,1000.0,0,2000.0,1
2,3,90000,2.0,2.0,2.0,34.0,0,0.0,0.0,0.0,...,14331.0,14948,15549.0,1518,1500.0,1000.0,1000.0,1000,5000.0,0
3,4,50000,2.0,2.0,1.0,37.0,0,0.0,0.0,0.0,...,28314.0,28959,29547.0,2000,2019.0,1200.0,1100.0,1069,1000.0,0
4,5,50000,1.0,2.0,1.0,57.0,-1,0.0,-1.0,0.0,...,20940.0,19146,19131.0,2000,36681.0,10000.0,9000.0,689,679.0,0


In [4]:
# Continuous features that are standardised with StandardScaler
numeric_columns = [
    "LIMIT_BAL", "AGE",
    "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
    "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6",
]

# Categorical features and the target, carried over unscaled
passthrough_columns = [
    "SEX", "EDUCATION", "MARRIAGE",
    "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6",
    "default_payment_next_month",
]

# NOTE(review): the scaler is fitted on every row, before the train/test split
# done later in the notebook, so test-set statistics leak into the scaling.
# Consider fitting it on the training rows only.

# Create a DataFrame with the scaled data for the selected columns.
# The frame is built on df's own index so that the column assignment below
# aligns row-for-row even when df does not carry a default RangeIndex.
df_scaled = pd.DataFrame(
    StandardScaler().fit_transform(df[numeric_columns]),
    columns=numeric_columns,
    index=df.index,
)

# Copy the remaining (unscaled) columns to df_scaled
df_scaled[passthrough_columns] = df[passthrough_columns]

# Drop the rows that contain missing values. Per the original author, they are
# few enough that removing them should not affect the results significantly.
df_scaled = df_scaled.dropna()


In [5]:
# Categorical columns to one-hot encode
columns_to_dummies = ["SEX", "EDUCATION", "MARRIAGE", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]

# Convert categorical data to numeric indicator columns with pd.get_dummies.
# The dtype is pinned to uint8 (the default before pandas 2.0) so the encoded
# columns are 0/1 integers regardless of the installed pandas version; newer
# versions would otherwise produce booleans.
# NOTE(review): PAY_0 is an integer column while the other encoded columns are
# floats, so the generated names mix styles (e.g. "PAY_0_2" vs "PAY_2_2.0").
df_scaled = pd.get_dummies(df_scaled, columns=columns_to_dummies, dtype="uint8")
df_scaled.head()


Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,PAY_6_-2.0,PAY_6_-1.0,PAY_6_0.0,PAY_6_2.0,PAY_6_3.0,PAY_6_4.0,PAY_6_5.0,PAY_6_6.0,PAY_6_7.0,PAY_6_8.0
0,-1.138692,-1.244193,-0.643984,-0.649165,-0.670222,-0.674761,-0.664612,-0.654184,-0.341111,-0.226549,...,1,0,0,0,0,0,0,0,0,0
1,-0.362119,-1.027284,-0.660808,-0.668648,-0.641285,-0.62349,-0.607384,-0.599055,-0.341111,-0.213041,...,0,0,0,1,0,0,0,0,0,0
2,-0.595091,-0.159649,-0.297842,-0.49459,-0.483361,-0.450202,-0.417015,-0.391318,-0.249457,-0.191325,...,0,0,1,0,0,0,0,0,0,0
3,-0.905721,0.165714,-0.05523,-0.01062,0.035435,-0.231095,-0.184938,-0.154672,-0.220355,-0.168783,...,0,0,1,0,0,0,0,0,0,0
4,-0.905721,2.334803,-0.579692,-0.612831,-0.159933,-0.346642,-0.34748,-0.330762,-0.220355,1.336674,...,0,0,1,0,0,0,0,0,0,0


In [6]:
# Label column: per the interpretation cell, 0 = non-default, 1 = default
target_column = "default_payment_next_month"

# y holds the labels only
y = df_scaled[target_column]
# X holds every remaining column, i.e. the features
X = df_scaled.drop(columns=[target_column])

In [7]:
# Split the data into training and testing sets (train_test_split is imported
# in the import cell at the top of the notebook).
# random_state=1 makes the split reproducible.
# stratify=y keeps the class proportions of y in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Check the shape of X_train
X_train.shape

(21888, 91)

In [8]:
# Try random oversampling to check whether it improves the performance of the
# model. RandomOverSampler (from imbalanced-learn) is imported in the import
# cell at the top of the notebook.
# Only the training split is resampled; the test split is left untouched.

# Instantiate the random oversampler with random_state=1 for reproducibility
ros = RandomOverSampler(random_state=1)

# Fit the oversampler on the original training data and resample it
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

In [9]:
# Tally how many rows each label has after resampling
resampled_labels = pd.Series(y_train_resampled)
label_counts = resampled_labels.value_counts()

print(label_counts)

0    17040
1    17040
Name: default_payment_next_month, dtype: int64


In [10]:
# Create a random forest classifier.
# random_state=1 fixes the seed for reproducibility.
# n_jobs=-1 builds the 5000 trees on all available cores instead of one; the
# model settings are otherwise unchanged.
rf_model = RandomForestClassifier(n_estimators=5000, random_state=1, n_jobs=-1)



In [11]:
# Train the random forest on the oversampled training data
rf_model.fit(X=X_train_resampled, y=y_train_resampled)

In [12]:
# Predict the labels of the held-out test set
predictions = rf_model.predict(X=X_test)

In [17]:
# Accuracy of the predictions on the test set
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix, wrapped in a labelled DataFrame for display
# (rows are the actual classes, columns the predicted classes)
cm = confusion_matrix(y_test, predictions)
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

In [18]:
# Build the per-class precision / recall / f1 report as text
report_text = classification_report(y_test, predictions)

# Show the confusion matrix, the accuracy and the report, in that order
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5212,469
Actual 1,894,722


Accuracy Score : 0.8132109085925723
Classification Report
              precision    recall  f1-score   support

           0       0.85      0.92      0.88      5681
           1       0.61      0.45      0.51      1616

    accuracy                           0.81      7297
   macro avg       0.73      0.68      0.70      7297
weighted avg       0.80      0.81      0.80      7297



In [19]:

# Derive the quoted figures from the actual predictions instead of hard-coding
# them, so the narrative cannot drift from the metrics when the model or the
# data changes. The report keys are the string forms of the class labels.
report = classification_report(y_test, predictions, output_dict=True)
precision_0 = report["0"]["precision"]
recall_0 = report["0"]["recall"]
precision_1 = report["1"]["precision"]
recall_1 = report["1"]["recall"]
accuracy = report["accuracy"]

print("Interpretation of the model:")
print(f"1. Precision of 0 (non-default credits) is {precision_0:.2f}: Out of all instances predicted as non-default credits, {precision_0:.0%} are actually non-default credits.")
print(f"2. Precision of 1 (default credits) is {precision_1:.2f}: Out of all instances predicted as default credits, {precision_1:.0%} are actually default credits.")
print(f"3. Recall of 0 (non-default credits) is {recall_0:.2f}: The model correctly identifies {recall_0:.0%} of the non-default credits out of all actual non-default credits.")
print(f"4. Recall of 1 (default credits) is {recall_1:.2f}: The model correctly identifies {recall_1:.0%} of the default credits out of all actual default credits.")
print(f"5. Accuracy Score is {accuracy:.4f}: The model predicts the correct credit status (default or non-default) for approximately {accuracy:.2%} of the instances.")
# NOTE(review): the "improved" / "increased" wording compares against earlier
# models that are not part of this notebook - confirm it still holds.
print(f"At this moment, we have decided to keep this last model, a Random Forest with oversampler, because the overall performance has improved, and the precision of the default credits has increased to {precision_1:.2f}.")


Interpretation of the model:
1. Precision of 0 (non-default credits) is 0.85: Out of all instances predicted as non-default credits, 85% are actually non-default credits.
2. Precision of 1 (default credits) is 0.61: Out of all instances predicted as default credits, 61% are actually default credits.
3. Recall of 0 (non-default credits) is 0.92: The model correctly identifies 92% of the non-default credits out of all actual non-default credits.
4. Recall of 1 (default credits) is 0.45: The model correctly identifies 45% of the default credits out of all actual default credits.
5. Accuracy Score is 0.8132: The model predicts the correct credit status (default or non-default) for approximately 81.32% of the instances.
At this moment, we have decided to keep this last model, a Random Forest with oversampler, because the overall performance has improved, and the precision of the default credits has increased to 0.61.


In [20]:
# The fitted random forest exposes one importance value per feature
importances = rf_model.feature_importances_

# Pair every importance with its column name and rank them, highest first
ranked_features = list(zip(importances, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.06370783672014645, 'LIMIT_BAL'),
 (0.060289013186134575, 'AGE'),
 (0.05653406620810526, 'BILL_AMT1'),
 (0.051737726360045616, 'PAY_AMT1'),
 (0.05159693150310306, 'BILL_AMT2'),
 (0.049644428002254486, 'PAY_AMT2'),
 (0.04925966634827667, 'BILL_AMT3'),
 (0.04745722208281032, 'BILL_AMT4'),
 (0.04730636997139779, 'PAY_0_2'),
 (0.046837874772486264, 'BILL_AMT5'),
 (0.04660115925650171, 'BILL_AMT6'),
 (0.04418884647250364, 'PAY_AMT3'),
 (0.04395305232654605, 'PAY_AMT6'),
 (0.043144058865040046, 'PAY_AMT4'),
 (0.0418968490907355, 'PAY_AMT5'),
 (0.02657806242170745, 'PAY_2_2.0'),
 (0.02239591556824082, 'PAY_0_0'),
 (0.019117050305683943, 'PAY_3_2.0'),
 (0.014167610278929143, 'PAY_4_2.0'),
 (0.010904390383799898, 'PAY_5_2.0'),
 (0.010766634139811599, 'PAY_6_2.0'),
 (0.009652695930849838, 'EDUCATION_2.0'),
 (0.008922299849636738, 'MARRIAGE_1.0'),
 (0.008891965285660958, 'EDUCATION_1.0'),
 (0.008797139142200401, 'MARRIAGE_2.0'),
 (0.008539146214885095, 'SEX_1.0'),
 (0.008494588495298033, 'SEX_

In [23]:
# Random Forest feature importances (one value per column of X)
importances = rf_model.feature_importances_

# (importance, feature name) pairs, highest importance first
feature_importance_data = sorted(zip(importances, X.columns), reverse=True)

# CSV file path, relative to the notebook's working directory
csv_file_path = 'feature_importance.csv'

# Save the feature importance data to a CSV file (csv is imported in the
# import cell at the top of the notebook).
# newline='' lets the csv module control line endings; the explicit encoding
# keeps the file identical across platforms with different default encodings.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Importance', 'Feature'])
    writer.writerows(feature_importance_data)

print(f"Feature importances saved to {csv_file_path}")

Feature importances saved to feature_importance.csv


In [1]:

# NOTE(review): this cell repeats, word for word, the interpretation cell that
# appears earlier in the notebook, and its figures are hard-coded from the run
# shown there. Consider keeping only one copy.
interpretation_lines = (
    "Interpretation of the model:",
    "1. Precision of 0 (non-default credits) is 0.85: Out of all instances predicted as non-default credits, 85% are actually non-default credits.",
    "2. Precision of 1 (default credits) is 0.61: Out of all instances predicted as default credits, 61% are actually default credits.",
    "3. Recall of 0 (non-default credits) is 0.92: The model correctly identifies 92% of the non-default credits out of all actual non-default credits.",
    "4. Recall of 1 (default credits) is 0.45: The model correctly identifies 45% of the default credits out of all actual default credits.",
    "5. Accuracy Score is 0.8132: The model predicts the correct credit status (default or non-default) for approximately 81.32% of the instances.",
    "At this moment, we have decided to keep this last model, a Random Forest with oversampler, because the overall performance has improved, and the precision of the default credits has increased to 0.61.",
)

# One message per line, in the order listed above
print("\n".join(interpretation_lines))


Interpretation of the model:
1. Precision of 0 (non-default credits) is 0.85: Out of all instances predicted as non-default credits, 85% are actually non-default credits.
2. Precision of 1 (default credits) is 0.61: Out of all instances predicted as default credits, 61% are actually default credits.
3. Recall of 0 (non-default credits) is 0.92: The model correctly identifies 92% of the non-default credits out of all actual non-default credits.
4. Recall of 1 (default credits) is 0.45: The model correctly identifies 45% of the default credits out of all actual default credits.
5. Accuracy Score is 0.8132: The model predicts the correct credit status (default or non-default) for approximately 81.32% of the instances.
At this moment, we have decided to keep this last model, a Random Forest with oversampler, because the overall performance has improved, and the precision of the default credits has increased to 0.61.
