# Fintech Project: Credit Card Default Prediction

## 1. Problem Statement
Predicting whether a credit card client will default on their payment next month is a critical task for financial institutions. 
This project uses the **'Default of Credit Card Clients'** dataset to build a predictive model. We will explore the data, clean it, 
visualize key patterns, and compare a baseline Logistic Regression model with an advanced XGBoost model.

**Goal**: Maximize predictive performance (accuracy/F1-score) and interpret key risk drivers.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from xgboost import XGBClassifier
from fpdf import FPDF
import os

# Setup for visualizations: apply the seaborn "whitegrid" style and make sure
# the directory that the figures are saved into exists.
sns.set(style="whitegrid")
# exist_ok=True replaces the check-then-create pattern: the call is a no-op
# when 'images' already exists and cannot fail if the directory appears
# between the check and the creation.
os.makedirs('images', exist_ok=True)

## 2. Data Loading
We will use the 'default-of-credit-card-clients' dataset from OpenML.

In [None]:
# OpenML data_id of the 'default-of-credit-card-clients' dataset.
OPENML_DATASET_ID = 42477

print("Loading dataset...")
data = fetch_openml(
    data_id=OPENML_DATASET_ID,
    as_frame=True,
    parser='auto',
)
# Work on the DataFrame exposed by the fetched bunch.
df = data.frame

print(f"Dataset Shape: {df.shape}")
# Last expression of the cell: shows the first rows when run in a notebook.
df.head()

## 3. Data Cleaning & Preprocessing
Check for missing values, fix column names, and make sure the target is an integer. (Feature scaling is applied later, in Section 5, after the train/test split.)

In [None]:
# OpenML returns anonymous column names (x1...x23 plus the target 'y'), so map
# them explicitly to the descriptive names used in the rest of the notebook.
column_mapping = {
    'x1': 'LIMIT_BAL', 'x2': 'SEX', 'x3': 'EDUCATION', 'x4': 'MARRIAGE', 'x5': 'AGE',
    'x6': 'PAY_1', 'x7': 'PAY_2', 'x8': 'PAY_3', 'x9': 'PAY_4', 'x10': 'PAY_5', 'x11': 'PAY_6',
    'x12': 'BILL_AMT1', 'x13': 'BILL_AMT2', 'x14': 'BILL_AMT3', 'x15': 'BILL_AMT4', 'x16': 'BILL_AMT5', 'x17': 'BILL_AMT6',
    'x18': 'PAY_AMT1', 'x19': 'PAY_AMT2', 'x20': 'PAY_AMT3', 'x21': 'PAY_AMT4', 'x22': 'PAY_AMT5', 'x23': 'PAY_AMT6',
    'y': 'Default',
}
df.rename(columns=column_mapping, inplace=True)

# The target is sometimes delivered separately from the frame. The rename
# above already turned any 'y' column into 'Default', so testing for
# 'Default' alone is sufficient here.
if 'Default' not in df.columns:
    if getattr(data, 'target', None) is not None:
        df['Default'] = data.target
    else:
        # Last resort: treat the final column of the frame as the target.
        df.rename(columns={df.columns[-1]: 'Default'}, inplace=True)

# Report missing values so gaps are visible before modelling. This only
# reports; no rows are dropped or imputed.
missing_per_column = df.isnull().sum()
total_missing = int(missing_per_column.sum())
print(f"Total missing values: {total_missing}")
if total_missing:
    print(missing_per_column[missing_per_column > 0])

# Ensure target is integer. Going through str and then to_numeric handles
# categorical/string labels ('0', '1') as well as float-like labels ('1.0'),
# which a direct str -> int cast would reject.
df['Default'] = pd.to_numeric(df['Default'].astype(str)).astype(int)

print("Target Distribution:")
print(df['Default'].value_counts(normalize=True))

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Each figure below is drawn on an explicit Axes, written to the images/
# directory, and then displayed.

# 1. Target Distribution Plot: bar chart of the row count per 'Default' value.
fig_target, ax_target = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Default', palette='coolwarm', ax=ax_target)
ax_target.set_title('Distribution of Default')
fig_target.savefig('images/target_dist.png')
plt.show()

# 2. Correlation Matrix: pairwise correlations of the frame's columns,
# rendered as a heatmap without cell annotations.
corr = df.corr()
fig_corr, ax_corr = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=False, cmap='RdBu', ax=ax_corr)
ax_corr.set_title('Correlation Heatmap')
fig_corr.savefig('images/correlation.png')
plt.show()

# 3. Limit Balance vs Default: distribution of LIMIT_BAL per 'Default' value.
fig_limit, ax_limit = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Default', y='LIMIT_BAL', palette='viridis', ax=ax_limit)
ax_limit.set_title('Credit Limit Balance vs Default Status')
fig_limit.savefig('images/limit_bal_vs_default.png')
plt.show()

## 5. Model Building

In [None]:
# Separate the target column from the predictor columns.
target_col = 'Default'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training split only, then apply the same transform
# to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data prepared for modeling.")

### Baseline Model: Logistic Regression

In [None]:
# Baseline: logistic regression fitted on the standardized training features.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)

# Hard class predictions feed the classification report; column 1 of the
# probability matrix is kept separately for the ROC/AUC comparison later on.
y_pred_lr = lr_model.predict(X_test_scaled)
lr_test_proba = lr_model.predict_proba(X_test_scaled)
y_prob_lr = lr_test_proba[:, 1]

print("Logistic Regression Report:")
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)

### Advanced Model: XGBoost

In [None]:
# Advanced model: gradient-boosted trees trained on the same scaled features
# as the baseline so the two models are compared on identical inputs.
# NOTE: `use_label_encoder` is intentionally not passed. It is deprecated in
# recent XGBoost releases and only triggers warnings there; the target is
# already encoded as integers 0/1, so no label encoding is needed.
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
xgb_model.fit(X_train_scaled, y_train)

# Hard class predictions for the classification report, plus column 1 of the
# probability matrix for the ROC/AUC comparison later on.
y_pred_xgb = xgb_model.predict(X_test_scaled)
y_prob_xgb = xgb_model.predict_proba(X_test_scaled)[:, 1]

print("XGBoost Report:")
print(classification_report(y_test, y_pred_xgb))

## 6. Evaluation & Comparison

In [None]:
# ROC Curve Comparison
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)
auc_lr = roc_auc_score(y_test, y_prob_lr)

fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_prob_xgb)
auc_xgb = roc_auc_score(y_test, y_prob_xgb)

plt.figure(figsize=(8, 6))
plt.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {auc_lr:.3f})')
plt.plot(fpr_xgb, tpr_xgb, label=f'XGBoost (AUC = {auc_xgb:.3f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend()
plt.savefig('images/roc_comparison.png')
plt.show()

print(f"Logistic Regression AUC: {auc_lr:.4f}")
print(f"XGBoost AUC: {auc_xgb:.4f}")

In [None]:
# Feature Importance (XGBoost)
importances = xgb_model.feature_importances_
features = X.columns
indices = np.argsort(importances)[::-1][:10]  # Top 10

plt.figure(figsize=(10, 6))
plt.title('Top 10 Feature Importances (XGBoost)')
plt.bar(range(len(indices)), importances[indices], align='center')
plt.xticks(range(len(indices)), [features[i] for i in indices], rotation=45)
plt.savefig('images/feature_importance.png')
plt.show()

## 7. Generate PDF Report

In [None]:
class PDFReport(FPDF):
    """FPDF subclass that adds the project title and a page-number footer.

    `header` and `footer` override the FPDF hooks of the same name; per the
    FPDF documentation the library calls them itself when pages are started
    and finished, so they are not invoked directly in this notebook.
    """

    def header(self):
        """Write the centred, bold 16pt report title followed by a 5-unit gap."""
        title = 'Fintech Project: Credit Default Prediction'
        self.set_font('Arial', 'B', 16)
        self.cell(0, 10, title, 0, 1, 'C')
        self.ln(5)

    def footer(self):
        """Write the centred, italic 8pt page number near the bottom edge."""
        page_label = f'Page {self.page_no()}'
        # A negative y is measured from the bottom of the page.
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, page_label, 0, 0, 'C')

def _add_section_heading(report, title):
    """Write a bold 14pt section heading, then switch back to 11pt body text."""
    report.set_font("Arial", 'B', 14)
    report.cell(0, 10, title, 0, 1)
    report.set_font("Arial", size=11)


# Layout of the side-by-side EDA images, in FPDF's default unit (mm).
EDA_IMAGE_WIDTH = 90
# Vertical space reserved for the image row. Assumes the saved EDA figures
# are no taller than ~65 mm at EDA_IMAGE_WIDTH (true for 6x4 and 10x6 inch
# figures); increase this if the figure sizes change.
EDA_ROW_HEIGHT = 65
# (image path, x position) for each image in the row.
EDA_IMAGES = (
    ('images/target_dist.png', 10),
    ('images/limit_bal_vs_default.png', 110),
)

pdf = PDFReport()
pdf.add_page()

# 1. Problem Statement
_add_section_heading(pdf, "1. Problem Statement")
pdf.multi_cell(0, 7, "The goal of this project is to predict credit card default based on client demographic and status patterns. "
                     "We utilized the UCI Credit Card Default dataset and compared Logistic Regression with XGBoost.")
pdf.ln(5)

# 2. EDA Summary
_add_section_heading(pdf, "2. Exploratory Data Analysis")
pdf.multi_cell(0, 7, "We observed the class balance and correlations. Key visualizations are attached below:")

# Add images side by side. Both are anchored to the same explicit y so they
# share a top edge: an image placed without y advances the cursor by its own
# height, which previously made the second image's offset land on top of the
# preceding text. With an explicit y the cursor does not move, so it is
# advanced by hand afterwards.
row_top = pdf.get_y()
placed_any_image = False
for image_path, x_pos in EDA_IMAGES:
    if os.path.exists(image_path):
        pdf.image(image_path, x=x_pos, y=row_top, w=EDA_IMAGE_WIDTH)
        placed_any_image = True
if placed_any_image:
    pdf.set_y(row_top + EDA_ROW_HEIGHT)
else:
    # Nothing was drawn: keep a small gap instead of an empty image row.
    pdf.ln(5)

# 3. Model Performance
_add_section_heading(pdf, "3. Model Performance")
pdf.cell(0, 10, f"Logistic Regression AUC: {auc_lr:.4f}", 0, 1)
pdf.cell(0, 10, f"XGBoost AUC: {auc_xgb:.4f}", 0, 1)
pdf.multi_cell(0, 7, "XGBoost generally captured non-linear relationships better than the baseline model.")
pdf.ln(5)

# No explicit y here, so the image is placed at the current cursor position.
if os.path.exists('images/roc_comparison.png'):
    pdf.image('images/roc_comparison.png', x=30, w=150)

# 4. Conclusion (always starts on a fresh page)
pdf.add_page()
_add_section_heading(pdf, "4. Conclusion & Key Findings")
pdf.multi_cell(0, 7, "Key drivers of default risk appear to be related to the most recent payment statuses (PAY_x features). "
                     "Future improvements could include hyperparameter tuning and handling class imbalance using SMOTE.")

pdf.output("Loan_Default_Prediction_Report.pdf")
print("PDF Report generated: Loan_Default_Prediction_Report.pdf")