In [16]:
from sklearn.metrics import (roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
!pip install openpyxl xlrd
!pip install catboost
import seaborn as sns
import pandas as pd
import numpy as np 




# Data extraction & praparation

In [None]:
try:
    df = pd.read_excel(r"E:\programming_projects\Fintech_payments_risk_analysis\data\raw\default_of_credit_card_clients.xls", header=1)
    print("Full dataset loaded")
except FileNotFoundError:
    df = pd.read_csv("data/samples/default_sample.csv")
    print("Sample dataset loaded")



# Double check: let's drop the first column if it's just a row counter
# Sometimes CSV exports add an unnamed 'Unnamed: 0' column at the start
if 'Unnamed: 0' in df.columns:
    df.drop('Unnamed: 0', axis=1, inplace=True)


# English comment: Verify that column names are now correct (LIMIT_BAL, SEX, etc.)
print("Columns after cleaning:", df.columns.tolist())  

# Drop ID
df = df.drop(columns=['ID'])

# Fix EDUCATION
df['EDUCATION'] = df['EDUCATION'].replace([0, 5, 6], 4)

# Fix MARRIAGE
df['MARRIAGE'] = df['MARRIAGE'].replace(0, 3)

# Rename target
df = df.rename(columns={'default payment next month': 'default'})

Full dataset loaded
Columns after cleaning: ['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default payment next month']


# Feature engineering

In [3]:
df['UTILIZATION_SEP'] = df['BILL_AMT1'] / df['LIMIT_BAL']
df['UTILIZATION_AUG'] = df['BILL_AMT2'] / df['LIMIT_BAL']

df['PAY_RATIO_SEP'] = df['PAY_AMT1'] / (df['BILL_AMT2'] + 1)
df['BILL_CHG'] = df['BILL_AMT1'] - df['BILL_AMT2']

# Replace inf â†’ NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill NaN with 0 (safe after ratios)
df.fillna(0, inplace=True)

# Clip extreme ratios
df['PAY_RATIO_SEP'] = df['PAY_RATIO_SEP'].clip(0, 5)
df['UTILIZATION_SEP'] = df['UTILIZATION_SEP'].clip(0, 2)
df['UTILIZATION_AUG'] = df['UTILIZATION_AUG'].clip(0, 2)

In [4]:
X = df.drop(columns=['default'])
y = df['default']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [9]:
num_features = [
    'LIMIT_BAL', 'AGE',
    'BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6',
    'PAY_AMT1','PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6',
    'UTILIZATION_SEP','UTILIZATION_AUG','PAY_RATIO_SEP','BILL_CHG'
]

ordinal_features = [
    'PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6'
]

cat_features = [
    'SEX',
    'MARRIAGE',
    'EDUCATION'
]


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_features),
        ('ord', 'passthrough', ordinal_features)
    ],
    remainder='drop'
)

In [10]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("Train shape:", X_train_processed.shape)
print("Test shape:", X_test_processed.shape)

Train shape: (24000, 30)
Test shape: (6000, 30)


# Logistic Regression

In [None]:
# 1. Train Logistic Regression with class balancing
model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    n_jobs=-1
)

model.fit(X_train_processed, y_train)

# 2. Predict probabilities
y_pred_proba = model.predict_proba(X_test_processed)[:, 1]

# 3. ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)

# 4. Apply threshold
threshold = 0.5
y_pred = (y_pred_proba >= threshold).astype(int)

# 5. Metrics
print("ROC-AUC:", roc_auc)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

ROC-AUC: 0.7138349165813453
Accuracy: 0.6916666666666667
Precision: 0.3791030975496995
Recall: 0.6179351921627732
F1-score: 0.4699140401146132

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.71      0.78      4673
           1       0.38      0.62      0.47      1327

    accuracy                           0.69      6000
   macro avg       0.62      0.67      0.63      6000
weighted avg       0.76      0.69      0.71      6000

Confusion Matrix:
[[3330 1343]
 [ 507  820]]


# CatBoost Classifier

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import (
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)


y = df['default']

X = df.drop(columns=['default'])
cat_features = ['SEX', 'MARRIAGE', 'EDUCATION']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

cb = CatBoostClassifier(
    iterations=800,
    depth=6,
    learning_rate=0.05,
    loss_function='Logloss',
    eval_metric='AUC',
    class_weights=[1, 3.5], 
    random_seed=42,
    verbose=100
)

cb.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
    use_best_model=True
)


y_pred_proba = cb.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba >= 0.5).astype(int)


roc_auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"ROC-AUC   : {roc_auc:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-score  : {f1:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

0:	test: 0.7410014	best: 0.7410014 (0)	total: 200ms	remaining: 2m 39s
100:	test: 0.7813269	best: 0.7813269 (100)	total: 3.38s	remaining: 23.4s
200:	test: 0.7831729	best: 0.7835709 (188)	total: 6.52s	remaining: 19.4s
300:	test: 0.7820546	best: 0.7835709 (188)	total: 9.73s	remaining: 16.1s
400:	test: 0.7798509	best: 0.7835709 (188)	total: 13s	remaining: 13s
500:	test: 0.7782520	best: 0.7835709 (188)	total: 16.3s	remaining: 9.7s
600:	test: 0.7772893	best: 0.7835709 (188)	total: 19.5s	remaining: 6.46s
700:	test: 0.7758900	best: 0.7835709 (188)	total: 22.8s	remaining: 3.22s
799:	test: 0.7742690	best: 0.7835709 (188)	total: 26.1s	remaining: 0us

bestTest = 0.7835708541
bestIteration = 188

Shrink model to first 189 iterations.
ROC-AUC   : 0.7836
Precision : 0.4749
Recall    : 0.6285
F1-score  : 0.5410

Confusion Matrix:
[[3751  922]
 [ 493  834]]

Classification Report:
              precision    recall  f1-score   support

           0     0.8838    0.8027    0.8413      4673
           1  