In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [3]:
data = pd.read_csv("customers.csv")
data = pd.get_dummies(data, columns=["SEX", "EDUCATION", "MARRIAGE"], drop_first=True)
print(data.isna().sum())

ID                            0
LIMIT_BAL                     0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default_payment_next_month    0
SEX_male                      0
EDUCATION_high school         0
EDUCATION_others              0
EDUCATION_university          0
MARRIAGE_others               0
MARRIAGE_single               0
dtype: int64


In [18]:
pay_columns = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
for col in pay_columns:
    new_col = f"LIMIT_BAL_PAY{col[4:]}"
    data[new_col] = data["LIMIT_BAL"] * data[col]


data.columns

Index(['ID', 'LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5',
       'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default_payment_next_month',
       'SEX_male', 'EDUCATION_high school', 'EDUCATION_others',
       'EDUCATION_university', 'MARRIAGE_others', 'MARRIAGE_single',
       'LIMIT_BAL_PAY__0', 'LIMIT_BAL_PAY__2', 'LIMIT_BAL_PAY__3',
       'LIMIT_BAL_PAY__4', 'LIMIT_BAL_PAY__5', 'LIMIT_BAL_PAY__6',
       'LIMIT_BAL_PAY_0', 'LIMIT_BAL_PAY_2', 'LIMIT_BAL_PAY_3',
       'LIMIT_BAL_PAY_4', 'LIMIT_BAL_PAY_5', 'LIMIT_BAL_PAY_6',
       'LIMIT_BAL_PAY0', 'LIMIT_BAL_PAY2', 'LIMIT_BAL_PAY3', 'LIMIT_BAL_PAY4',
       'LIMIT_BAL_PAY5', 'LIMIT_BAL_PAY6'],
      dtype='object')

In [21]:
data.head()

Unnamed: 0,ID,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,LIMIT_BAL_PAY_3,LIMIT_BAL_PAY_4,LIMIT_BAL_PAY_5,LIMIT_BAL_PAY_6,LIMIT_BAL_PAY0,LIMIT_BAL_PAY2,LIMIT_BAL_PAY3,LIMIT_BAL_PAY4,LIMIT_BAL_PAY5,LIMIT_BAL_PAY6
0,1,20000,24,2,2,-1,-1,-2,-2,3913.0,...,-20000,-20000,-40000,-40000,40000,40000,-20000,-20000,-40000,-40000
1,2,120000,26,-1,2,0,0,0,2,2682.0,...,0,0,0,240000,-120000,240000,0,0,0,240000
2,3,90000,34,0,0,0,0,0,0,29239.0,...,0,0,0,0,0,0,0,0,0,0
3,4,50000,37,0,0,0,0,0,0,46990.0,...,0,0,0,0,0,0,0,0,0,0
4,5,50000,57,-1,0,-1,0,0,0,8617.0,...,-50000,0,0,0,-50000,0,-50000,0,0,0


In [None]:

X = data.drop("default_payment_next_month", axis=1)
y = data["default_payment_next_month"]
y.head()

In [5]:
# Split up the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

classifier = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
    max_depth=10,
    min_samples_split=10
)

classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86      5841
           1       0.52      0.57      0.54      1659

    accuracy                           0.79      7500
   macro avg       0.69      0.71      0.70      7500
weighted avg       0.79      0.79      0.79      7500



In [6]:
distro = data["default_payment_next_month"].value_counts(normalize=True)
distro

default_payment_next_month
0    0.7788
1    0.2212
Name: proportion, dtype: float64