# EDA — Processed data, predictions and RL datasets

Quick exploratory analysis for: data splits, target balance, feature distributions, reward distribution, and policy comparisons (historical / BC / RL).

In [1]:
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

DATA_DIR = Path('../data/processed')
RL_DIR = Path('../data/rl')
MODELS_DIR = Path('../models')
REPORT_DIR = Path('../reports/figures')
REPORT_DIR.mkdir(parents=True, exist_ok=True)

print('Paths:')
print(' processed:', DATA_DIR.resolve())
print(' rl npz:  ', RL_DIR.resolve())
print(' models:  ', MODELS_DIR.resolve())
print(' output figs:', REPORT_DIR.resolve())

Paths:
 processed: C:\Users\mayan\OneDrive\Desktop\New folder\shodh-loan-project\data\processed
 rl npz:   C:\Users\mayan\OneDrive\Desktop\New folder\shodh-loan-project\data\rl
 models:   C:\Users\mayan\OneDrive\Desktop\New folder\shodh-loan-project\models
 output figs: C:\Users\mayan\OneDrive\Desktop\New folder\shodh-loan-project\reports\figures


In [13]:
# Robust feature_cols loading / debug
train = pd.read_parquet(DATA_DIR / 'train.parquet').reset_index(drop=True)
val = pd.read_parquet(DATA_DIR / 'val.parquet').reset_index(drop=True)
test = pd.read_parquet(DATA_DIR / 'test.parquet').reset_index(drop=True)

exclude = {'target', 'loan_amnt', 'int_rate', 'id', 'index'}
fpath = RL_DIR / 'feature_cols.csv'

if fpath.exists():
    print('feature_cols.csv found at', fpath)
    txt = fpath.read_text(errors='ignore')
    print('--- file preview (first 1k chars) ---\n', txt[:1000])
    try:
        fc = pd.read_csv(fpath, header=None).iloc[:,0].astype(str).tolist()
        print('Read as single-column list (first 20):', fc[:20])
    except Exception:
        try:
            df_fc = pd.read_csv(fpath)
            fc = df_fc.columns.tolist()
            print('Read header columns (first 20):', fc[:20])
        except Exception as e:
            print('Failed to read feature file:', e)
            fc = []
    # keep only valid columns that exist in the dataframe and not in exclude
    feature_cols = [c for c in fc if (c in train.columns) and (c not in exclude)]
    if not feature_cols:
        # try interpreting numeric strings as indices into train.columns
        idxs = []
        for x in fc:
            try:
                idxs.append(int(x))
            except Exception:
                pass
        if idxs:
            cols = list(train.columns)
            feature_cols = [cols[i] for i in idxs if 0 <= i < len(cols) and cols[i] not in exclude]
    if not feature_cols:
        print('feature_cols from file invalid or empty — falling back to automatic detection')
        feature_cols = [c for c in train.columns if c not in exclude]
else:
    feature_cols = [c for c in train.columns if c not in exclude]

print('Final feature_cols (n=%d). Sample:' % len(feature_cols), feature_cols[:20])

feature_cols.csv found at ..\data\rl\feature_cols.csv
--- file preview (first 1k chars) ---
 0
total_acc
installment
home_ownership
pub_rec
income_to_loan
pub_rec_bankruptcies
purpose
verification_status
sub_grade
term
grade
emp_length_years
dti
open_acc
revol_bal
annual_inc
revol_util
installment_to_income
fico_range_low
inq_last_6mths
credit_utilization
fico_score
delinq_2yrs
term_months
addr_state
mort_acc
emp_length
fico_range_high

Read as single-column list (first 20): ['0', 'total_acc', 'installment', 'home_ownership', 'pub_rec', 'income_to_loan', 'pub_rec_bankruptcies', 'purpose', 'verification_status', 'sub_grade', 'term', 'grade', 'emp_length_years', 'dti', 'open_acc', 'revol_bal', 'annual_inc', 'revol_util', 'installment_to_income', 'fico_range_low']
Final feature_cols (n=28). Sample: ['total_acc', 'installment', 'home_ownership', 'pub_rec', 'income_to_loan', 'pub_rec_bankruptcies', 'purpose', 'verification_status', 'sub_grade', 'term', 'grade', 'emp_length_years', 'dti', 'o

In [14]:
# Target / class balance
for name, df in [('train', train), ('val', val), ('test', test)]:
    counts = df['target'].value_counts().sort_index()
    pct = 100 * counts / counts.sum()
    print(f"{name}:\n {counts.to_dict()}  ({pct.round(2).to_dict()})")

# save simple chart
fig, ax = plt.subplots(figsize=(6,4))
sns.barplot(x=['fully_paid','default'], y=[(train['target']==0).mean(), (train['target']==1).mean()])
ax.set_ylabel('Proportion (train)')
fig.tight_layout()
fig.savefig(REPORT_DIR / 'target_balance_train.png', dpi=150)
plt.close(fig)
print('Saved target balance plot to', REPORT_DIR / 'target_balance_train.png')

train:
 {0: 115250, 1: 28750}  ({0: 80.03, 1: 19.97})
val:
 {0: 12806, 1: 3194}  ({0: 80.04, 1: 19.96})
test:
 {0: 32014, 1: 7986}  ({0: 80.04, 1: 19.96})
Saved target balance plot to ..\reports\figures\target_balance_train.png


In [4]:
# Numeric feature distributions (select up to 6 numeric features)
num_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(train[c])] 
sample_cols = num_cols[:6]
fig, axes = plt.subplots(2, 3, figsize=(14,8))
for ax, col in zip(axes.flatten(), sample_cols):
    sns.histplot(train[col].dropna(), bins=50, ax=ax, kde=False)
    ax.set_title(col)
fig.tight_layout()
fig.savefig(REPORT_DIR / 'numeric_feature_histograms.png', dpi=150)
plt.close(fig)
print('Saved numeric histograms to', REPORT_DIR / 'numeric_feature_histograms.png')

KeyError: '0'

In [5]:
# Correlation heatmap for numeric features (subset to speed up)
corr_cols = num_cols[:12]
corr = train[corr_cols].corr()
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(corr, annot=False, cmap='coolwarm', center=0, ax=ax)
fig.tight_layout()
fig.savefig(REPORT_DIR / 'corr_heatmap.png', dpi=150)
plt.close(fig)
print('Saved correlation heatmap to', REPORT_DIR / 'corr_heatmap.png')

NameError: name 'num_cols' is not defined

In [6]:
# Reward distribution and approve rates (using RL npz and model preds if available)
rl_npz = np.load(RL_DIR / 'test_rl.npz')
rewards = rl_npz['rewards']
fig, ax = plt.subplots(1,2, figsize=(12,4))
sns.histplot(rewards, bins=80, ax=ax[0])
ax[0].set_title('Reward distribution (test)')
ax[0].set_xlabel('reward')

# Load predictions if present
preds_path = MODELS_DIR / 'test_predictions.csv'
df_preds = pd.read_csv(preds_path) if preds_path.exists() else None
if df_preds is not None:
    approve_rate = df_preds.filter(regex='pred_label').iloc[:,0].mean()
else:
    approve_rate = None
ax[1].bar(['approve_rate'], [approve_rate if approve_rate is not None else np.nan])
ax[1].set_ylim(0,1)
ax[1].set_title('Sample approve rate (from preds)')
fig.tight_layout()
fig.savefig(REPORT_DIR / 'reward_and_approve_rate.png', dpi=150)
plt.close(fig)
print('Saved reward + approve-rate plot to', REPORT_DIR / 'reward_and_approve_rate.png')

Saved reward + approve-rate plot to ..\reports\figures\reward_and_approve_rate.png


In [7]:
# Compare policies: historical (from data), supervised (test_predictions.csv), BC (if available), RL (q_agent preds)
results = {}
n = len(rewards)

# historical policy: actions column if present in processed data (fallback: all approved)
if 'is_approved' in test.columns:
    hist_actions = test['is_approved'].astype(int).values
else:
    hist_actions = np.ones(n, dtype=int)
results['historical_avg_reward'] = (rewards * (hist_actions==1)).mean()
results['historical_approve_rate'] = hist_actions.mean()

if df_preds is not None:
    # choose pred_label_optimal if present else 0.5
    if 'pred_label_optimal' in df_preds.columns:
        sup_actions = df_preds['pred_label_optimal'].astype(int).values
    elif 'pred_label_0.5' in df_preds.columns:
        sup_actions = df_preds['pred_label_0.5'].astype(int).values
    else:
        sup_actions = (df_preds['pred_proba'] >= 0.5).astype(int).values
    results['supervised_avg_reward'] = (rewards * (sup_actions==1)).mean()
    results['supervised_approve_rate'] = sup_actions.mean()
else:
    results['supervised_avg_reward'] = np.nan
    results['supervised_approve_rate'] = np.nan

# RL Q-agent preds
q_preds_path = MODELS_DIR / 'rl_q' / 'q_agent_test_predictions.csv'
if q_preds_path.exists():
    q_df = pd.read_csv(q_preds_path)
    q_actions = q_df['action'].astype(int).values
    results['rl_avg_reward'] = (rewards * (q_actions==1)).mean()
    results['rl_approve_rate'] = q_actions.mean()
else:
    results['rl_avg_reward'] = np.nan
    results['rl_approve_rate'] = np.nan

pd.DataFrame([results]).T.rename(columns={0:'value'})
pd.DataFrame([results]).T.to_csv(REPORT_DIR / 'policy_comparison.csv')
print('Saved policy comparison CSV to', REPORT_DIR / 'policy_comparison.csv')
pd.DataFrame([results]).T

Saved policy comparison CSV to ..\reports\figures\policy_comparison.csv


Unnamed: 0,0
historical_avg_reward,-0.026583
historical_approve_rate,1.0
supervised_avg_reward,-0.039507
supervised_approve_rate,0.42685
rl_avg_reward,0.042508
rl_approve_rate,0.6107


## Next steps
- Inspect feature importance / model explanations (SHAP) for supervised model and RL Q-network.
- Tune reward function and retrain RL agent or train conservative objective.
- Add more EDA plots if you want (e.g., approval rate vs. credit score bins, PD vs. loan amount).

If you want, I can extend this notebook with more specific plots — tell me which figures you need.

In [12]:
# Add this cell after the "Next steps" markdown (adds SHAP explanations for supervised model and Q-agent)
try:
    import shap
except Exception:
    print("shap not installed. Install: pip install shap")
else:
    import torch
    import matplotlib.pyplot as plt
    from dataset import MLPClassifier
    from pathlib import Path

    REPORT_DIR = Path('../reports/figures')
    REPORT_DIR.mkdir(parents=True, exist_ok=True)

    X = train[feature_cols].fillna(0).astype(float)
    X_test = test[feature_cols].fillna(0).astype(float)
    sample_n = min(500, len(X_test))
    idx = np.random.RandomState(42).choice(len(X_test), sample_n, replace=False)
    X_back = X.sample(n=min(200, len(X)), random_state=42).values  # background for explainer
    X_samp = X_test.iloc[idx].values

    # Supervised model SHAP (probability of default -> use prob of default or paid)
    sup_ckpt_path = Path('../models/best_model.pth')
    if sup_ckpt_path.exists():
        try:
            ckpt = torch.load(sup_ckpt_path, map_location='cpu')
            input_dim = ckpt.get('input_dim', X.shape[1])
            hidden_dims = ckpt.get('hidden_dims', [256,128,64])
            dropout = ckpt.get('dropout', 0.3)
            sup_model = MLPClassifier(input_dim, hidden_dims, dropout)
            sup_model.load_state_dict(ckpt['model_state_dict'])
            sup_model.eval()

            def sup_predict_prob(x):
                with torch.no_grad():
                    t = torch.from_numpy(np.asarray(x, dtype=np.float32))
                    logits = sup_model(t)
                    probs = torch.sigmoid(logits).numpy().flatten()
                    return np.vstack([1 - probs, probs]).T  # shape (N,2)

            explainer = shap.Explainer(sup_predict_prob, X_back, feature_names=feature_cols)
            shap_vals = explainer(X_samp)
            plt.figure(figsize=(8,6))
            shap.summary_plot(shap_vals, features=pd.DataFrame(X_samp, columns=feature_cols), show=False)
            plt.tight_layout()
            plt.savefig(REPORT_DIR / 'shap_supervised_summary.png', dpi=150)
            plt.close()
            print("Saved supervised SHAP summary to", REPORT_DIR / 'shap_supervised_summary.png')
        except Exception as e:
            print("Supervised SHAP failed:", e)
    else:
        print("Supervised model checkpoint not found at", sup_ckpt_path)

    # Q-agent SHAP (prob approve = softmax(Q)[1])
    q_ckpt_path = Path('../models/rl_q/q_agent_best.pth')
    if q_ckpt_path.exists():
        try:
            # define simple QNetwork matching training script
            import torch.nn as nn
            class QNetwork(nn.Module):
                def __init__(self, input_dim, hidden_dims=(256,128)):
                    super().__init__()
                    layers = []
                    prev = input_dim
                    for h in hidden_dims:
                        layers.append(nn.Linear(prev, h))
                        layers.append(nn.ReLU())
                        prev = h
                    layers.append(nn.Linear(prev, 2))
                    self.net = nn.Sequential(*layers)
                def forward(self, x):
                    return self.net(x)

            q_ckpt = torch.load(q_ckpt_path, map_location='cpu')
            q_input = int(q_ckpt.get('input_dim', X.shape[1]))
            q_hidden = tuple(q_ckpt.get('hidden_dims', [256,128]))
            q_model = QNetwork(q_input, q_hidden)
            q_model.load_state_dict(q_ckpt['model_state_dict'])
            q_model.eval()

            def q_predict_prob(x):
                with torch.no_grad():
                    t = torch.from_numpy(np.asarray(x, dtype=np.float32))
                    qvals = q_model(t).numpy()
                    # softmax to get action probabilities
                    exp = np.exp(qvals - np.max(qvals, axis=1, keepdims=True))
                    probs = exp / exp.sum(axis=1, keepdims=True)
                    return probs  # shape (N,2)

            expl = shap.Explainer(q_predict_prob, X_back, feature_names=feature_cols)
            shap_vals_q = expl(X_samp)
            plt.figure(figsize=(8,6))
            shap.summary_plot(shap_vals_q, features=pd.DataFrame(X_samp, columns=feature_cols), show=False)
            plt.tight_layout()
            plt.savefig(REPORT_DIR / 'shap_qagent_summary.png', dpi=150)
            plt.close()
            print("Saved Q-agent SHAP summary to", REPORT_DIR / 'shap_qagent_summary.png')
        except Exception as e:
            print("Q-agent SHAP failed:", e)
    else:
        print("Q-agent checkpoint not found at", q_ckpt_path)

KeyError: "['0'] not in index"