
# Accounting-approach, Naive DD per Bharath and Shumway (2008). No solver.

Implements Bharath and Shumway naive DD. 
Uses $\hat V = E + F$,
$\hat\sigma_D = 0.05 + 0.25 \sigma_E$,
value-weighted $\hat\sigma_V$,
and $\hat\mu = r_{i,t−1}$. No solver.

This notebook follows the Naive DD approach issue from Bharath and Shumway (2008) which we call the accounting-approach distance-to-default (DD) workflow as it involves the use of proxies for the default intensity and volatility, without invoking a numerical solver. 



## 1. Environment setup

Install the minimal dependencies required to reproduce the accounting workflow locally.


In [1]:
# 1. Install needed packages (run once per environment)
%pip install pandas numpy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



## 2. Imports, configuration, and helper utilities

Load core libraries, set display defaults, and define helper functions used throughout the notebook.


In [2]:
import math
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

try:
    from scipy.stats import norm
    Phi = norm.cdf
except Exception:
    from math import erf
    def Phi(x):  # normal CDF fallback
        x = np.asarray(x, dtype=float)
        return 0.5*(1.0 + np.vectorize(erf)(x/np.sqrt(2.0)))

pd.set_option('display.width', 120)
pd.set_option('display.max_columns', 40)

MM = 1_000_000.0
T = 1.0


def find_repo_root(start: Path, marker: str = '.git') -> Path:
    """Walk up from *start* until a directory containing *marker* is found."""
    current = start.resolve()
    for candidate in [current, *current.parents]:
        if (candidate / marker).exists():
            return candidate
    return current


def winsorize_series(series: pd.Series, lower: float = 0.01, upper: float = 0.99) -> pd.Series:
    """Clip *series* to the given quantile range, ignoring NaNs."""
    clean = series.dropna()
    if clean.empty:
        return series
    ql, qh = np.nanpercentile(clean, [lower * 100, upper * 100])
    if not np.isfinite(ql) or not np.isfinite(qh) or ql > qh:
        return series
    return series.clip(ql, qh)


base_dir = find_repo_root(Path.cwd())
print(f"Repository root: {base_dir}")

model_fp   = base_dir / 'data' / 'clean' / 'Book2_clean.csv'
output_dir = base_dir / 'data' / 'outputs' / 'datasheet'
log_dir    = base_dir / 'data' / 'logs'

output_dir.mkdir(parents=True, exist_ok=True)
log_dir.mkdir(parents=True, exist_ok=True)

print(f"Accounting input -> {'FOUND' if model_fp.exists() else 'MISSING'}")

Repository root: /Users/guillaumebld/Documents/Graduate_Research/Professor Abol Jalilvand/fall2025/risk_bank/risk_bank
Accounting input -> FOUND



## 3. Load and normalise accounting inputs

Read the accounting file, harmonise column names, and enforce expected data types for key fields such as instrument, year, and balance sheet magnitudes.


In [3]:
print('[INFO] Loading accounting data…')
df_raw = pd.read_csv(model_fp)
print(f"→ {df_raw.shape[0]} rows before cleaning")

# Standardise column names we rely on
rename_map = {
    'nstrument': 'instrument',
    'weighted_average_cost_of_capital,_(%)': 'wacc_pct',
    'wacc_tax_rate,_(%)': 'wacc_tax_rate_pct',
    'wacc_cost_of_debt,_(%)': 'wacc_cost_of_debt_pct',
    'wacc_debt_weight,_(%)': 'wacc_debt_weight_pct',
    'wacc_equity_weight,_(%)': 'wacc_equity_weight_pct',
}

df = df_raw.rename(columns=rename_map).copy()

# Drop columns that are clearly placeholders
unnamed_cols = [c for c in df.columns if c.lower().startswith('unnamed')]
if unnamed_cols:
    df = df.drop(columns=unnamed_cols)

# Instrument string cleanup
if 'instrument' in df.columns:
    df['instrument'] = (df['instrument']
                        .astype(str)
                        .str.strip()
                        .str.replace('"', '', regex=False)
                        .str.upper())
else:
    raise KeyError('`instrument` column not found after renaming.')

# Year as integer panel key
df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')

# Numeric conversions for balance sheet figures
for col in ['total_assets', 'debt_total', 'price_to_book_value_per_share', 'd/e', 'rit', 'rit_rf', 'new_wacc']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Convert WACC weights from percentages to decimals when present
for src, dest in [('wacc_equity_weight_pct', 'wacc_equity_weight'),
                  ('wacc_debt_weight_pct', 'wacc_debt_weight')]:
    if src in df.columns:
        df[dest] = pd.to_numeric(df[src], errors='coerce') / 100.0

assert (df['total_assets'].dropna() >= 0).all() and (df['debt_total'].dropna() >= 0).all(), 'Assets/debt must be nonnegative'
if not df['rit'].between(-1, 1).all(skipna=True):
    print('[WARN] rit outside [-1,1]. Ensure rit is a decimal return, not percent.')

print(df[['instrument', 'year', 'total_assets', 'debt_total']].head())

[INFO] Loading accounting data…
→ 1425 rows before cleaning
[WARN] rit outside [-1,1]. Ensure rit is a decimal return, not percent.
  instrument  year  total_assets  debt_total
0        JPM  2016     2490972.0    495354.0
1        JPM  2017     2533600.0    494798.0
2        JPM  2018     2622532.0    533627.0
3        JPM  2019     2687379.0    516093.0
4        JPM  2020     3384757.0    542102.0



## 4. Construct book equity and market equity proxies

Compute book equity along with the three Bharath–Shumway equity proxies (price-to-book, D/E, and WACC weights). Select the best available proxy and record its source for auditability.


In [4]:
# Book equity in USD
required_cols = ['total_assets', 'debt_total']
# For banks: debt_total represents total liabilities
missing_required = [c for c in required_cols if c not in df.columns]
if missing_required:
    raise KeyError(f"Missing required columns: {missing_required}")

df['assets_usd'] = pd.to_numeric(df['total_assets'], errors='coerce') * MM
df['debt_usd'] = pd.to_numeric(df['debt_total'], errors='coerce') * MM
df['be_usd'] = (df['total_assets'] - df['debt_total']) * MM

# Market equity proxies
df['E_pb'] = np.where(
    (df['price_to_book_value_per_share'] > 0) & (df['be_usd'] > 0),
    df['price_to_book_value_per_share'] * df['be_usd'],
    np.nan
)

df['E_de'] = np.where(
    (df['d/e'] > 0) & (df['debt_usd'] > 0),
    df['debt_usd'] / df['d/e'],
    np.nan
)

equity_weight = pd.to_numeric(
    df.get('wacc_equity_weight', pd.Series(index=df.index, dtype=float)),
    errors='coerce'
)
debt_weight = pd.to_numeric(
    df.get('wacc_debt_weight', pd.Series(index=df.index, dtype=float)),
    errors='coerce'
)
tolerance = 1e-3
weights_sum = equity_weight + debt_weight
wacc_mask = (
    (equity_weight > 0)
    & (debt_weight > 0)
    & (np.abs(weights_sum - 1) <= tolerance)
)

df['E_wacc'] = np.where(
    wacc_mask,
    df['assets_usd'] * equity_weight,
    np.nan
)

# Prioritise proxies: price-to-book, then D/E, then WACC
values = []
sources = []
for _, row in df[['E_pb', 'E_de', 'E_wacc']].iterrows():
    value = np.nan
    source = 'missing'
    for key in ['E_pb', 'E_de', 'E_wacc']:
        v = row[key]
        if pd.notna(v) and v > 0:
            value = v
            source = key
            break
    values.append(value)
    sources.append(source)

df['E'] = values
df['E_source'] = sources

df['weak_E_proxy'] = df['E_source'].isin(['E_de', 'E_wacc'])

df['F'] = df['debt_usd']

print(df[['instrument', 'year', 'be_usd', 'E', 'E_source', 'F']].head())


  instrument  year        be_usd             E E_source             F
0        JPM  2016  1.995618e+12  2.688226e+12     E_pb  4.953540e+11
1        JPM  2017  2.038802e+12  3.252329e+12     E_pb  4.947980e+11
2        JPM  2018  2.088905e+12  2.898674e+12     E_pb  5.336270e+11
3        JPM  2019  2.171286e+12  3.983422e+12     E_pb  5.160930e+11
4        JPM  2020  2.842655e+12  4.418551e+12     E_pb  5.421020e+11



## 5. Load Equity Volatility (Daily Returns, 252-Day Window)

Load pre-calculated equity volatility following Bharath & Shumway (2008): Daily returns with 252-day window (year t-1 only), annualized using √252. Primary method uses all trading days from year t-1 (minimum 180 days). Fallbacks include partial year data and peer median imputation.


In [5]:
# Load DAILY equity volatility from pre-calculated file (Bharath & Shumway 2008)
print('[INFO] Loading sigma_E from equity_volatility_by_year_DAILY.csv (252-day window)...')

# Load the daily volatility file
vol_fp = base_dir / 'data' / 'clean' / 'equity_volatility_by_year_DAILY.csv'
equity_vol = pd.read_csv(vol_fp)

# Rename columns to match expected format
equity_vol_merge = equity_vol.rename(columns={
    'ticker': 'instrument',
    'sigma_E': 'sigma_E_tminus1',
    'method': 'sigma_E_method',
    'days_used': 'sigma_E_days'
})

# Merge into main DataFrame
df = df.merge(
    equity_vol_merge[['instrument', 'year', 'sigma_E_tminus1', 
                      'sigma_E_method', 'sigma_E_days']],
    on=['instrument', 'year'],
    how='left'
)

# Calculate window provenance for daily method
# Daily method uses year t-1 only (252 trading days)
# Example: year=2018 uses all of 2017 (window start = end = 2017)
df['sigmaE_window_end_year'] = df['year'] - 1
df['sigmaE_window_start_year'] = df['year'] - 1  # Daily uses year t-1 only
df['sigma_E_days'] = df['sigma_E_days'].fillna(0)

# Use sigma_E_tminus1 for calculations
df['sigma_E'] = df['sigma_E_tminus1']

print(f'  sigma_E_tminus1: {df["sigma_E_tminus1"].notna().sum()} non-null values')
print(f'  Window validation:')
print(f'    - All end years = t-1: {(df["sigmaE_window_end_year"] == df["year"] - 1).all()}')
print(f'    - All start <= end: {(df["sigmaE_window_start_year"] <= df["sigmaE_window_end_year"]).all()}')
print(f'  sigma_E method distribution:')
print(df["sigma_E_method"].value_counts())

# DATA QUALITY FILTER: Drop bank-years without complete volatility
print('\n[INFO] Applying data quality filter: Dropping bank-years without complete volatility...')
initial_count = len(df)
df = df[df['sigma_E'].notna()].copy()
filtered_count = len(df)
dropped_count = initial_count - filtered_count

print(f'  Initial bank-years: {initial_count:,}')
print(f'  With complete volatility: {filtered_count:,} ({filtered_count/initial_count*100:.1f}%)')
print(f'  Dropped (no volatility): {dropped_count:,} ({dropped_count/initial_count*100:.1f}%)')
print(f'  → Only bank-years with complete daily volatility will proceed to DD/PD calculation')

# Show retained bank-years by year
if dropped_count > 0:
    retained_by_year = df.groupby('year').size()
    print(f'\n  Bank-years retained by year:')
    for year, count in retained_by_year.items():
        print(f'    {year}: {count} banks')

# Build mu_hat_t = r_{i,t-1} with provenance tracking
print('[INFO] Computing mu_hat_t = r_{i,t-1} with fallbacks...')

df['mu_hat_from'] = 'rit_tminus1'
df['mu_source_year'] = df['year'] - 1
df['mu_hat'] = df.groupby('instrument', group_keys=False)['rit'].shift(1)

# Create size buckets for later imputation
print('[INFO] Creating size buckets...')
df = df.sort_values(['instrument', 'year']).reset_index(drop=True)

size_bucket = np.select(
    [df.get('dummylarge', 0) == 1, df.get('dummymid', 0) == 1],
    ['large', 'mid'],
    default='small'
)
df['size_bucket'] = size_bucket
print(f'  Size bucket counts: {df["size_bucket"].value_counts().to_dict()}')

# Create flags that were in old sigma_E calculation (now dummy flags)
# Since we're using pre-calculated sigma_E from file, these are all False
df['insufficient_returns'] = False  # All sigma_E come from file, none insufficient
df['imputed_sigmaE_sizebucket'] = False  # Will be set properly based on sigma_E_method

# Mark imputed values based on method from equity_volatility file
if 'sigma_E_method' in df.columns:
    df['imputed_sigmaE_sizebucket'] = df['sigma_E_method'] == 'imputed_peer'
    print(f'  Imputed sigma_E (imputed_peer): {df["imputed_sigmaE_sizebucket"].sum()} rows')


[INFO] Loading sigma_E from equity_volatility_by_year_DAILY.csv (252-day window)...
  sigma_E_tminus1: 1366 non-null values
  Window validation:
    - All end years = t-1: True
    - All start <= end: True
  sigma_E method distribution:
sigma_E_method
daily_252        1295
imputed_peer       55
daily_partial      16
Name: count, dtype: int64

[INFO] Applying data quality filter: Dropping bank-years without complete volatility...
  Initial bank-years: 1,431
  With complete volatility: 1,366 (95.5%)
  Dropped (no volatility): 65 (4.5%)
  → Only bank-years with complete daily volatility will proceed to DD/PD calculation

  Bank-years retained by year:
    2016: 68 banks
    2017: 131 banks
    2018: 194 banks
    2019: 203 banks
    2020: 207 banks
    2021: 207 banks
    2022: 208 banks
    2023: 148 banks
[INFO] Computing mu_hat_t = r_{i,t-1} with fallbacks...
[INFO] Creating size buckets...
  Size bucket counts: {'small': 1270, 'mid': 64, 'large': 32}
  Imputed sigma_E (imputed_peer): 

In [6]:
# TIME INTEGRITY ASSERTIONS
print('[INFO] Validating time integrity for accounting approach...')

# Import time checks
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd()))
from utils.time_checks import assert_time_integrity

# Assertion 1: sigma_E window must end at t-1
assert (df['sigmaE_window_end_year'] == df['year'] - 1).all(), \
    'sigma_E window end must be t-1 (no lookahead)'

# Assertion 2: mu_hat provenance is set
assert df['mu_hat_from'].notna().all(), 'mu_hat provenance must be tracked'

# Assertion 3: When using rit_tminus1, source year must be t-1
uses_lag = df['mu_hat_from'].eq('rit_tminus1')
if uses_lag.any():
    assert (df.loc[uses_lag, 'mu_source_year'] == df.loc[uses_lag, 'year'] - 1).all(), \
        'mu_hat source year must be t-1 when using lagged return'

# Run comprehensive time integrity check
assert_time_integrity(df)

print('[PASS] All time integrity assertions passed')
print(f'  - sigma_E uses only data up to t-1: {(df["sigmaE_window_end_year"] == df["year"] - 1).all()}')
print(f'  - mu_hat uses t-1 data: {(df[df["mu_hat_from"].eq("rit_tminus1")]["mu_source_year"] == df[df["mu_hat_from"].eq("rit_tminus1")]["year"] - 1).all()}')


[INFO] Validating time integrity for accounting approach...
[PASS] All time integrity assertions passed
  - sigma_E uses only data up to t-1: True
  - mu_hat uses t-1 data: True



## 6. Debt volatility proxy, asset proxies, and drift proxy

Derive the Bharath–Shumway debt volatility proxy, approximate asset value/volatility, and compute the drift proxy based on lagged equity returns with firm and size-bucket fallbacks.


In [7]:
# Debt and asset volatility proxies
df['sigma_D_hat'] = 0.05 + 0.25 * df['sigma_E']
df['V_hat'] = df['E'] + df['F']
valid_v = df['V_hat'] > 0

sigma_V_components = np.where(
    valid_v,
    (df['E'] / df['V_hat']) * df['sigma_E'] + (df['F'] / df['V_hat']) * df['sigma_D_hat'],
    np.nan
)
df['sigma_V_hat'] = sigma_V_components

df['sigma_V_hat'] = df['sigma_V_hat'].clip(lower=1e-6)

# Drift proxy using lagged returns
lagged_rit = df.groupby('instrument', group_keys=False)['rit'].shift(1)
firm_mean = (
    df.groupby('instrument', group_keys=False)['rit']
      .apply(lambda s: s.expanding().mean().shift(1))
)

df['mu_hat'] = lagged_rit
mask_mu = df['mu_hat'].isna()
df.loc[mask_mu, 'mu_hat'] = firm_mean[mask_mu]

size_median_mu = df.groupby('size_bucket')['mu_hat'].transform('median')
df['mu_hat'] = df['mu_hat'].fillna(size_median_mu)
df['mu_hat'] = df['mu_hat'].fillna(df['mu_hat'].median())

print(df[['instrument', 'year', 'sigma_D_hat', 'sigma_V_hat', 'mu_hat']].head())


  instrument  year  sigma_D_hat  sigma_V_hat    mu_hat
0       ABCB  2016     0.109790     0.233860  0.046529
1       ABCB  2017     0.113656     0.251598  0.366828
2       ABCB  2018     0.114907     0.256425  0.244121
3       ABCB  2019     0.121581     0.274489 -0.350111
4       ABCB  2020     0.113675     0.250830  0.033435



## 7. Compute naive distance to default (DD) and probability of default (PD)

Apply the Bharath–Shumway naive formulas using the proxies above. Probability of default is clipped to the [0, 1] interval.


In [8]:
valid_sigmaV = np.isfinite(df['sigma_V_hat']) & (df['sigma_V_hat'] > 0)
valid_inputs = df['E'].gt(0) & df['F'].gt(0) & valid_sigmaV & df['mu_hat'].notna()

# Bharath & Shumway naive DD: V_hat=E+F, sigma_D_hat=0.05+0.25*sigma_E,
# sigma_V_hat = value-weighted mix, mu_hat = lagged equity return. No solver.
df['DD_naive'] = np.where(
    valid_inputs,
    (np.log(df['V_hat'] / df['F']) + (df['mu_hat'] - 0.5 * df['sigma_V_hat'] ** 2) * T)
    / (df['sigma_V_hat'] * math.sqrt(T)),
    np.nan,
)
df['PD_naive'] = np.where(np.isfinite(df['DD_naive']), Phi(-df['DD_naive']), np.nan)
df['invalid_sigmaV'] = ~valid_sigmaV

print(df[['instrument', 'year', 'DD_naive', 'PD_naive']].head())
print(f"PD==0 count: {(df['PD_naive'] == 0).sum()}, PD==1 count: {(df['PD_naive'] == 1).sum()}")

  instrument  year   DD_naive      PD_naive
0       ABCB  2016  13.743862  2.771898e-43
1       ABCB  2017  16.600072  3.480372e-62
2       ABCB  2018  15.685909  9.442022e-56
3       ABCB  2019   8.181287  1.404143e-16
4       ABCB  2020  14.341664  6.006968e-47
PD==0 count: 3, PD==1 count: 0



## 8. Data-quality flags and status tracking

Capture the first applicable status flag (missing inputs, fallbacks, or imputations) as `naive_status` so downstream users understand how each observation was derived.


In [9]:
missing_E = ~np.isfinite(df['E'])
missing_F = ~np.isfinite(df['F'])
nonpos_EF = (df['E'] <= 0) | (df['F'] <= 0)

flag_specs = [
    ('invalid_sigmaV', df['invalid_sigmaV']),
    ('missing_E', missing_E | (df['E_source'] == 'missing')),
    ('missing_F', missing_F),
    ('nonpos_EF', nonpos_EF),
    ('insufficient_returns', df['insufficient_returns']),
    ('imputed_sigmaE_sizebucket', df['imputed_sigmaE_sizebucket']),
    ('fallback_E_from_de', df['E_source'] == 'E_de'),
    ('fallback_E_from_wacc', df['E_source'] == 'E_wacc'),
]

for name, mask in flag_specs:
    df[name] = mask.astype(bool)


def assign_status(idx: int) -> str:
    for name, _ in flag_specs:
        if bool(df.iloc[idx][name]):
            return name
    return 'ok'


naive_status = [assign_status(i) for i in range(len(df))]
df['naive_status'] = naive_status

status_counts = df['naive_status'].value_counts(dropna=False).sort_index()
print('Naive status counts:')
print(status_counts)

fallback_summary = {
    'invalid_sigmaV': int(df['invalid_sigmaV'].sum()),
    'fallback_E_from_de': int(df['fallback_E_from_de'].sum()),
    'fallback_E_from_wacc': int(df['fallback_E_from_wacc'].sum()),
    'imputed_sigmaE_sizebucket': int(df['imputed_sigmaE_sizebucket'].sum()),
    'insufficient_returns': int(df['insufficient_returns'].sum()),
}
print("\nFallback indicator counts:")
for label, count in fallback_summary.items():
    print(f"{label}: {count}")

e_source_counts = df['E_source'].value_counts(dropna=False).sort_index()
print("\nEquity source mix:")
print(e_source_counts)
weak_proxy_count = int(df['weak_E_proxy'].sum())
print(f"Weak equity proxy count: {weak_proxy_count}")

percentiles = [0.10, 0.25, 0.50, 0.75, 0.90]
dd_stats = df['DD_naive'].describe(percentiles=percentiles)
pd_stats = df['PD_naive'].describe(percentiles=percentiles)
dd_missing = int(df['DD_naive'].isna().sum())
pd_missing = int(df['PD_naive'].isna().sum())

print("\nDD_naive summary:")
print(dd_stats)
print(f"Rows with missing DD_naive: {dd_missing}")

print("\nPD_naive summary:")
print(pd_stats)
print(f"Rows with missing PD_naive: {pd_missing}")

log_lines = []
log_lines.append('=== Naive DD/PD Diagnostics ===')
log_lines.append(f'Total rows processed: {len(df)}')
log_lines.append('')

log_lines.append('Naive status counts:')
log_lines.extend([f"{status}: {count}" for status, count in status_counts.items()])
log_lines.append('')

log_lines.append('Fallback indicator counts:')
for label, count in fallback_summary.items():
    log_lines.append(f"{label}: {count}")
log_lines.append('')

log_lines.append('Equity source mix:')
log_lines.extend([f"{source}: {count}" for source, count in e_source_counts.items()])
log_lines.append('')
log_lines.append(f"Weak equity proxy count: {weak_proxy_count}")

log_lines.append('DD_naive summary:')
log_lines.extend([f"{idx}: {value}" for idx, value in dd_stats.items()])
log_lines.append(f'Rows with missing DD_naive: {dd_missing}')
log_lines.append('')

log_lines.append('PD_naive summary:')
log_lines.extend([f"{idx}: {value}" for idx, value in pd_stats.items()])
log_lines.append(f'Rows with missing PD_naive: {pd_missing}')
log_lines.append(f"PD==0 count: {(df['PD_naive'] == 0).sum()}, PD==1 count: {(df['PD_naive'] == 1).sum()}")

log_path = log_dir / 'dd_pd_accounting_log.txt'
log_path.write_text("\n".join(log_lines))
print(f"[INFO] Wrote diagnostics to {log_path}")

Naive status counts:
naive_status
imputed_sigmaE_sizebucket      55
nonpos_EF                      16
ok                           1295
Name: count, dtype: int64

Fallback indicator counts:
invalid_sigmaV: 0
fallback_E_from_de: 0
fallback_E_from_wacc: 0
imputed_sigmaE_sizebucket: 55
insufficient_returns: 0

Equity source mix:
E_source
E_pb    1366
Name: count, dtype: int64
Weak equity proxy count: 0

DD_naive summary:
count    1350.000000
mean       11.808152
std         5.497494
min        -5.719526
10%         5.391005
25%         8.547141
50%        11.238610
75%        14.281865
90%        17.737170
max        61.440271
Name: DD_naive, dtype: float64
Rows with missing DD_naive: 16

PD_naive summary:
count    1.350000e+03
mean     8.508458e-04
std      2.729152e-02
min      0.000000e+00
10%      1.213645e-70
25%      1.426443e-46
50%      1.317532e-29
75%      6.309154e-18
90%      3.508941e-08
max      1.000000e+00
Name: PD_naive, dtype: float64
Rows with missing PD_naive: 16
[INFO


## 9. Persist outputs and quick diagnostics

Save the naive DD/PD results and a percentile summary by year for quick reference.


In [10]:
# Archiving and timestamped output setup
from datetime import datetime
import pytz
import shutil
import glob
import os

def get_timestamp_cdt():
    """Generate timestamp in YYYYMMDD_HHMMSS format (CDT timezone)"""
    cdt = pytz.timezone('America/Chicago')
    return datetime.now(cdt).strftime('%Y%m%d_%H%M%S')

def archive_old_files(output_dir, archive_dir, dataset_type, max_keep=5):
    """Move old files of dataset_type to archive, keeping only max_keep most recent"""
    pattern = str(output_dir / f"{dataset_type}_*.csv")
    old_files = sorted(glob.glob(pattern), key=lambda x: os.path.getmtime(x), reverse=True)
    
    # Move all existing files to archive
    for old_file in old_files:
        archive_path = archive_dir / os.path.basename(old_file)
        shutil.move(old_file, str(archive_path))
        print(f"[ARCHIVE] Moved to archive: {os.path.basename(old_file)}")
    
    # Clean up archive to keep only max_keep files
    archive_pattern = str(archive_dir / f"{dataset_type}_*.csv")
    archive_files = sorted(glob.glob(archive_pattern), key=lambda x: os.path.getmtime(x), reverse=True)
    
    for old_archive in archive_files[max_keep:]:
        os.remove(old_archive)
        print(f"[CLEANUP] Removed old archive: {os.path.basename(old_archive)}")

# Rename columns to standard naming convention
df = df.rename(columns={'DD_naive': 'DD_a', 'PD_naive': 'PD_a'})

result_cols = [
    'instrument', 'year', 'E', 'E_source', 'weak_E_proxy', 'E_pb', 'E_de', 'E_wacc',
    'F', 'sigma_E', 'sigma_D_hat', 'sigma_V_hat', 'mu_hat',
    'DD_a', 'PD_a', 'naive_status'
]

# Setup archive directory
archive_dir = base_dir / 'archive' / 'datasets'
archive_dir.mkdir(parents=True, exist_ok=True)

# Archive old accounting files and save new one with timestamp
archive_old_files(output_dir, archive_dir, 'accounting', max_keep=5)

timestamp = get_timestamp_cdt()
dd_output = output_dir / f'accounting_{timestamp}.csv'
# Provenance columns for time integrity audit
provenance_cols = ["sigma_E_tminus1", "sigmaE_window_start_year", 
                   "sigmaE_window_end_year", "mu_hat", "mu_hat_from", 
                   "mu_source_year", "DD_a", "PD_a"]

df[result_cols].to_csv(dd_output, index=False)
print(f"[INFO] Saved accounting DD/PD results to {dd_output}")

cfg = {'T': T, 'ROLL_YEARS': 3, 'WINSOR_P': [0.01, 0.99], 'Phi': 'scipy' if 'norm' in globals() else 'erf_fallback', 'spec': 'Bharath–Shumway naive, no solver, v1'}
(Path(output_dir) / 'dd_pd_naive_config.json').write_text(pd.Series(cfg).to_json())

percentiles = [0.10, 0.25, 0.50, 0.75, 0.90]
percentile_columns = [f"p{int(p * 100)}" for p in percentiles]


def build_percentile_table(metric: str) -> pd.DataFrame:
    clean = df[['year', metric]].dropna()
    if clean.empty:
        empty = {'year': ['overall'], 'metric': [metric]}
        for col in percentile_columns:
            empty[col] = [np.nan]
        return pd.DataFrame(empty)

    by_year = (
        clean.groupby('year')[metric]
             .quantile(percentiles)
             .unstack(level=-1)
    )
    by_year.columns = percentile_columns
    by_year = by_year.reset_index()
    by_year.insert(0, 'metric', metric)

    overall_values = df[metric].dropna()
    overall_row = {'metric': metric, 'year': 'overall'}
    for col, pct in zip(percentile_columns, percentiles):
        overall_row[col] = overall_values.quantile(pct) if not overall_values.empty else np.nan

    combined = pd.concat([by_year, pd.DataFrame([overall_row])], ignore_index=True)
    ordered_columns = ['year', 'metric', *percentile_columns]
    return combined[ordered_columns]


summary_frames = [build_percentile_table(metric) for metric in ['DD_a', 'PD_a']]
summary = pd.concat(summary_frames, ignore_index=True)

summary_output = base_dir / 'data' / 'outputs' / 'analysis' / f'accounting_{timestamp}_summary.csv'
summary.to_csv(summary_output, index=False)
print(f"[INFO] Saved percentile summary to {summary_output}")

summary.head()

[ARCHIVE] Moved to archive: accounting_20251011_042604.csv
[CLEANUP] Removed old archive: accounting_20251005_033611.csv
[INFO] Saved accounting DD/PD results to /Users/guillaumebld/Documents/Graduate_Research/Professor Abol Jalilvand/fall2025/risk_bank/risk_bank/data/outputs/datasheet/accounting_20251014_022117.csv
[INFO] Saved percentile summary to /Users/guillaumebld/Documents/Graduate_Research/Professor Abol Jalilvand/fall2025/risk_bank/risk_bank/data/outputs/analysis/accounting_20251014_022117_summary.csv


Unnamed: 0,year,metric,p10,p25,p50,p75,p90
0,2016,DD_a,9.790408,11.161703,13.084508,15.147648,18.107119
1,2017,DD_a,8.812868,10.182653,12.274538,14.784404,17.406708
2,2018,DD_a,8.874889,10.493215,12.210076,14.9173,18.541829
3,2019,DD_a,7.70635,9.222354,11.874427,14.785628,18.33007
4,2020,DD_a,9.007549,10.955405,13.502497,17.318781,20.837078
