# Exploratory Data Analysis — Credit Card Default

This notebook provides a visual exploration of the **Default of Credit Card Clients** dataset (UCI).  
We inspect distributions, correlations, and default-rate patterns to guide feature engineering and modelling.

In [None]:
import sys, os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Add src/ to path so we can reuse project utilities
sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), 'src'))
# Also try adding relative to the notebook's own location
sys.path.insert(0, os.path.join(os.path.abspath(''), '..', 'src'))

from utils import PLOTS_DIR, ensure_dirs, save_plot

ensure_dirs()
sns.set_theme(style='whitegrid', palette='muted', font_scale=1.1)
%matplotlib inline
print('Setup complete.')

## 1 · Load & Inspect the Dataset

In [None]:
from data_preprocessing import load_data, clean_data

# Keep the raw frame around untouched; all analysis below uses the cleaned one.
df_raw = load_data()
df = clean_data(df_raw)

# Fail fast with a clear message if the cleaned frame lacks a column that the
# cells below index directly, instead of a KeyError in the middle of a plot.
REQUIRED_COLUMNS = ['default', 'LIMIT_BAL', 'AGE',
                    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
missing_columns = [c for c in REQUIRED_COLUMNS if c not in df.columns]
assert not missing_columns, (
    f'Cleaned data is missing expected columns: {missing_columns}'
)

df.head()

In [None]:
# Quick structural summary: dimensions, dtype mix and total number of NaNs.
dtype_counts = df.dtypes.value_counts()
n_missing = df.isnull().sum().sum()

print(f'Shape : {df.shape}')
print(f'\nColumn types:\n{dtype_counts}')
print(f'\nMissing values: {n_missing}')

# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

## 2 · Target Class Distribution

In [None]:
# Bar chart of how many clients fall in each target class.
#
# value_counts() orders its result by frequency, not by class value, so the
# counts are re-indexed explicitly by class (0, then 1). This keeps every bar
# paired with the right label and colour regardless of which class is larger,
# and shows a zero-height bar if a class is absent.
class_labels = {0: 'No Default (0)', 1: 'Default (1)'}
counts = df['default'].value_counts().reindex(list(class_labels), fill_value=0)

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(list(class_labels.values()), counts.values,
              color=['#4CAF50', '#F44336'], edgecolor='black')

# Scale the label offset and the y-axis headroom with the data so the count
# annotations sit just above the bars and stay inside the axes.
tallest = max(int(counts.max()), 1)
ax.set_ylim(0, tallest * 1.15)
for bar, val in zip(bars, counts.values):
    ax.text(bar.get_x() + bar.get_width() / 2, val + 0.01 * tallest,
            f'{val:,}', ha='center', va='bottom',
            fontsize=12, fontweight='bold')

ax.set_ylabel('Count')
ax.set_title('Target Class Distribution')
save_plot(fig, 'target_distribution.png')
plt.show()

## 3 · Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns of the cleaned frame.
corr = df.corr(numeric_only=True)
# Boolean mask that is True on and above the diagonal; seaborn hides masked
# cells, so only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

heatmap_style = dict(annot=True, fmt='.2f', cmap='coolwarm',
                     center=0, linewidths=0.5)

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr, mask=mask, ax=ax, **heatmap_style)
ax.set_title('Correlation Heatmap', fontsize=15)
save_plot(fig, 'correlation_heatmap.png')
plt.show()

## 4 · Credit Limit Distribution

In [None]:
# Histogram (50 bins) with a KDE overlay of the credit limit column.
credit_limits = df['LIMIT_BAL']

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(
    credit_limits,
    bins=50,
    kde=True,
    color='#2196F3',
    edgecolor='white',
    ax=ax,
)
ax.set(title='Credit Limit Distribution', xlabel='Credit Limit (NT$)')
save_plot(fig, 'credit_limit_distribution.png')
plt.show()

## 5 · Age Distribution

In [None]:
# Histogram (40 bins) with a KDE overlay of client age.
ages = df['AGE']

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(
    ages,
    bins=40,
    kde=True,
    color='#FF9800',
    edgecolor='white',
    ax=ax,
)
ax.set(title='Age Distribution', xlabel='Age (years)')
save_plot(fig, 'age_distribution.png')
plt.show()

## 6 · Default Rate vs Past Payment Delays

In [None]:
# Repayment-status columns; one panel is drawn per column.
pay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

# 2 x 3 grid with a shared y-axis so default rates are comparable across panels.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 9), sharey=True)

for col, ax in zip(pay_cols, axes.flat):
    # Mean of the 0/1 target within each status value, expressed as a percentage.
    rates = 100 * df.groupby(col)['default'].mean()
    rates.plot.bar(ax=ax, color='#E91E63', edgecolor='black')
    ax.set(title=f'Default Rate by {col}',
           ylabel='Default Rate (%)',
           xlabel=col)
    # pandas rotates bar tick labels by default; keep them horizontal.
    ax.tick_params(axis='x', rotation=0)

fig.suptitle('Default Rate vs Past Payment Delay Status', fontsize=15, y=1.01)
fig.tight_layout()
save_plot(fig, 'default_rate_vs_payment_delays.png')
plt.show()

## 7 · Default Rate vs Credit Limit

In [None]:
# Default rate per credit-limit range.
#
# The limits are cut into 10 equal-width bins held in a standalone Series that
# is used as the group key. `df` itself is never modified, so there is no
# temporary column to clean up and the cell is safe to re-run (or to fail
# half-way) without leaving the shared frame in an altered state.
limit_bins = pd.cut(df['LIMIT_BAL'], bins=10)

# observed=True: only bins that actually contain clients get a bar.
default_by_limit = df.groupby(limit_bins, observed=True)['default'].mean() * 100

fig, ax = plt.subplots(figsize=(10, 5))
default_by_limit.plot(kind='bar', ax=ax, color='#9C27B0', edgecolor='black')
ax.set_title('Default Rate by Credit Limit Range')
ax.set_ylabel('Default Rate (%)')
ax.set_xlabel('Credit Limit Bin')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
save_plot(fig, 'default_rate_vs_credit_limit.png')
plt.show()

## 8 · Key EDA Take-aways

1. **Class imbalance** — ~22 % of clients default; we use `class_weight='balanced'` in training.  
2. **Payment delays are the strongest signal** — higher `PAY_0` values (longer delays) are associated with drastically higher default rates.  
3. **Lower credit limits → higher default risk** — the bank's own risk assessment captures valuable information.  
4. **Bill & pay amounts are right-skewed** — scaling is important for any distance-based method.  
5. **Demographic features** (sex, education, marriage) show weak individual correlations with default.