# Merging Accounting and Market DD/PD Datasets

This notebook merges the accounting-based and market-based distance-to-default (DD) and probability-of-default (PD) datasets into a single combined dataset.

**Workflow:**
1. Load the latest accounting and market datasets from `data/outputs/datasheet/`
2. Merge on `['instrument', 'year']`
3. Apply clear labeling to distinguish accounting vs market variables
4. Save timestamped merged dataset with archiving (max 5 archives)

In [1]:
# Setup and imports
from pathlib import Path
from datetime import datetime
import pytz
import shutil
import glob
import os
import pandas as pd
import numpy as np

# Find repository root
def find_repo_root(start: Path, marker: str = '.git') -> Path:
    current = start.resolve()
    for candidate in [current, *current.parents]:
        if (candidate / marker).exists():
            return candidate
    return current

base_dir = find_repo_root(Path.cwd())
output_dir = base_dir / 'data' / 'outputs' / 'datasheet'
archive_dir = base_dir / 'archive' / 'datasets'

print(f"Repository root: {base_dir}")
print(f"Output directory: {output_dir}")
print(f"Archive directory: {archive_dir}")

Repository root: /Users/guillaumebld/Documents/Graduate_Research/Professor Abol Jalilvand/fall2025/risk_bank/risk_bank
Output directory: /Users/guillaumebld/Documents/Graduate_Research/Professor Abol Jalilvand/fall2025/risk_bank/risk_bank/data/outputs/datasheet
Archive directory: /Users/guillaumebld/Documents/Graduate_Research/Professor Abol Jalilvand/fall2025/risk_bank/risk_bank/archive/datasets


In [2]:
# Helper functions
def get_timestamp_cdt():
    """Generate timestamp in YYYYMMDD_HHMMSS format (CDT timezone)"""
    cdt = pytz.timezone('America/Chicago')
    return datetime.now(cdt).strftime('%Y%m%d_%H%M%S')

def archive_old_files(output_dir, archive_dir, dataset_type, max_keep=5):
    """Move old files of dataset_type to archive, keeping only max_keep most recent"""
    pattern = str(output_dir / f"{dataset_type}_*.csv")
    old_files = sorted(glob.glob(pattern), key=lambda x: os.path.getmtime(x), reverse=True)
    
    # Move all existing files to archive
    for old_file in old_files:
        archive_path = archive_dir / os.path.basename(old_file)
        shutil.move(old_file, str(archive_path))
        print(f"[ARCHIVE] Moved to archive: {os.path.basename(old_file)}")
    
    # Clean up archive to keep only max_keep files
    archive_pattern = str(archive_dir / f"{dataset_type}_*.csv")
    archive_files = sorted(glob.glob(archive_pattern), key=lambda x: os.path.getmtime(x), reverse=True)
    
    for old_archive in archive_files[max_keep:]:
        os.remove(old_archive)
        print(f"[CLEANUP] Removed old archive: {os.path.basename(old_archive)}")

def get_latest_file(output_dir, dataset_type):
    """Get the most recent file of given dataset_type"""
    pattern = str(output_dir / f"{dataset_type}_*.csv")
    files = sorted(glob.glob(pattern), key=lambda x: os.path.getmtime(x), reverse=True)
    if not files:
        raise FileNotFoundError(f"No {dataset_type} files found in {output_dir}")
    return files[0]

In [3]:
# Load latest accounting and market datasets
accounting_file = get_latest_file(output_dir, 'accounting')
market_file = get_latest_file(output_dir, 'market')

print(f"Loading accounting data from: {os.path.basename(accounting_file)}")
print(f"Loading market data from: {os.path.basename(market_file)}")

df_accounting = pd.read_csv(accounting_file)
df_market = pd.read_csv(market_file)

print(f"\nAccounting dataset: {len(df_accounting)} rows")
print(f"Market dataset: {len(df_market)} rows")

Loading accounting data from: accounting_20251004_041436.csv
Loading market data from: market_20251004_050613.csv

Accounting dataset: 1425 rows
Market dataset: 1404 rows


In [4]:
# Merge datasets on instrument and year
merge_keys = ['instrument', 'year']

# Add prefixes to distinguish variables (except merge keys and final DD/PD)
accounting_cols_to_prefix = [c for c in df_accounting.columns 
                             if c not in merge_keys + ['DD_a', 'PD_a']]
market_cols_to_prefix = [c for c in df_market.columns 
                         if c not in merge_keys + ['DD_m', 'PD_m']]

df_accounting_prefixed = df_accounting.rename(
    columns={c: f'a_{c}' for c in accounting_cols_to_prefix}
)
df_market_prefixed = df_market.rename(
    columns={c: f'm_{c}' for c in market_cols_to_prefix}
)

# Perform outer merge to keep all observations
df_merged = pd.merge(
    df_accounting_prefixed,
    df_market_prefixed,
    on=merge_keys,
    how='outer',
    suffixes=('_a', '_m')
)

print(f"Merged dataset: {len(df_merged)} rows")
print(f"\nColumn count:")
print(f"  Accounting: {len(df_accounting.columns)}")
print(f"  Market: {len(df_market.columns)}")
print(f"  Merged: {len(df_merged.columns)}")

# Show sample
print(f"\nSample merged data:")
display(df_merged[['instrument', 'year', 'DD_a', 'PD_a', 'DD_m', 'PD_m']].head(10))

Merged dataset: 1425 rows

Column count:
  Accounting: 16
  Market: 59
  Merged: 73

Sample merged data:


Unnamed: 0,instrument,year,DD_a,PD_a,DD_m,PD_m
0,ABCB,2016,17.191718,1.531595e-66,,
1,ABCB,2017,22.14095,6.375977e-109,,
2,ABCB,2018,46.873017,0.0,28.963232,9.560799e-185
3,ABCB,2019,6.049042,7.285482e-10,4.133342,1.787627e-05
4,ABCB,2020,12.092687,5.772411e-34,6.880048,2.991617e-12
5,ABCB,2021,16.945177,1.044526e-64,10.279523,4.357884000000001e-25
6,ABCB,2022,13.409921,2.644752e-41,7.278021,1.69376e-13
7,ABCB,2023,17.757759,7.505416e-71,10.586485,1.721396e-26
8,ACNB,2019,17.095048,8.078189e-66,,
9,ACNB,2020,16.942326,1.0964e-64,,


In [5]:
# Archive old merged files and save new one with timestamp
archive_old_files(output_dir, archive_dir, 'merged', max_keep=5)

timestamp = get_timestamp_cdt()
merged_output = output_dir / f'merged_{timestamp}.csv'
df_merged.to_csv(merged_output, index=False)

print(f"[INFO] Merged dataset saved to: {merged_output}")
print(f"[INFO] Total rows: {len(df_merged)}")
print(f"[INFO] Total columns: {len(df_merged.columns)}")

[ARCHIVE] Moved to archive: merged_20251004_050658.csv
[INFO] Merged dataset saved to: /Users/guillaumebld/Documents/Graduate_Research/Professor Abol Jalilvand/fall2025/risk_bank/risk_bank/data/outputs/datasheet/merged_20251004_051328.csv
[INFO] Total rows: 1425
[INFO] Total columns: 73


In [6]:
# Create ESG dataset with DD/PD columns appended
print('[INFO] Creating ESG dataset with DD/PD columns...')

# Load ESG data
esg_file = base_dir / 'data' / 'esg_0718.csv'
if not esg_file.exists():
    print(f'[ERROR] ESG file not found: {esg_file}')
else:
    df_esg = pd.read_csv(esg_file)
    print(f'  Loaded ESG data: {len(df_esg)} rows, {len(df_esg.columns)} columns')
    
    # Extract DD/PD columns from accounting and market datasets
    dd_pd_data = df_merged[['instrument', 'year', 'DD_a', 'PD_a', 'DD_m', 'PD_m']].copy()
    
    # Merge ESG data with DD/PD
    df_esg_dd = pd.merge(
        df_esg,
        dd_pd_data,
        on=['instrument', 'year'],
        how='left'
    )
    
    print(f'  Merged ESG+DD/PD: {len(df_esg_dd)} rows, {len(df_esg_dd.columns)} columns')
    print(f'  New columns: DD_a, PD_a, DD_m, PD_m')
    
    # Archive old ESG+DD files
    archive_old_files(output_dir, archive_dir, 'esg_dd_pd', max_keep=5)
    
    # Save with timestamp
    esg_output = output_dir / f'esg_dd_pd_{timestamp}.csv'
    df_esg_dd.to_csv(esg_output, index=False)
    
    print(f'\n[INFO] ESG+DD/PD dataset saved to: {esg_output}')
    print(f'[INFO] Sample data:')
    display(df_esg_dd[['instrument', 'year', 'lnta', 'esg_score', 'DD_a', 'PD_a', 'DD_m', 'PD_m']].head(10))


[INFO] Creating ESG dataset with DD/PD columns...
  Loaded ESG data: 1427 rows, 49 columns
  Merged ESG+DD/PD: 1431 rows, 53 columns
  New columns: DD_a, PD_a, DD_m, PD_m

[INFO] ESG+DD/PD dataset saved to: /Users/guillaumebld/Documents/Graduate_Research/Professor Abol Jalilvand/fall2025/risk_bank/risk_bank/data/outputs/datasheet/esg_dd_pd_20251004_051328.csv
[INFO] Sample data:


Unnamed: 0,instrument,year,lnta,esg_score,DD_a,PD_a,DD_m,PD_m
0,JPM,2016,14.728184,81.521252,8.636867,2.888782e-18,,
1,JPM,2017,14.745152,82.353064,9.478464,1.290267e-21,,
2,JPM,2018,14.779651,80.046519,55.513711,0.0,35.0,1.1249110000000001e-268
3,JPM,2019,14.804077,83.907682,7.926925,1.123191e-15,4.995781,2.929913e-07
4,JPM,2020,15.034793,85.545384,10.417968,1.026457e-25,5.212109,9.33531e-08
5,JPM,2021,15.135549,82.867509,11.619534,1.639605e-31,6.430166,6.373234e-11
6,JPM,2022,15.114542,78.6728,11.84762,1.106948e-32,5.648837,8.076836e-09
7,JPM,2023,15.170158,79.512383,10.047577,4.707697e-24,5.674581,6.951444e-09
8,BAC,2016,14.598529,69.918468,7.767357,4.00703e-15,,
9,BAC,2017,14.640227,75.384317,9.018904,9.499007e-20,,


In [7]:
# Summary of ESG+DD/PD dataset
if 'df_esg_dd' in locals():
    print('=== ESG+DD/PD DATASET SUMMARY ===')
    print(f'\nTotal observations: {len(df_esg_dd)}')
    print(f'Observations with DD_a: {df_esg_dd["DD_a"].notna().sum()}')
    print(f'Observations with DD_m: {df_esg_dd["DD_m"].notna().sum()}')
    print(f'Observations with both DD_a and DD_m: {df_esg_dd[["DD_a", "DD_m"]].dropna().shape[0]}')
    print(f'\nUnique instruments: {df_esg_dd["instrument"].nunique()}')
    print(f'Year range: {df_esg_dd["year"].min()} - {df_esg_dd["year"].max()}')


=== ESG+DD/PD DATASET SUMMARY ===

Total observations: 1431
Observations with DD_a: 1415
Observations with DD_m: 936
Observations with both DD_a and DD_m: 936

Unique instruments: 244
Year range: 2016 - 2023


In [8]:
# Summary statistics
print("=== MERGED DATASET SUMMARY ===")
print(f"\nObservations with both DD_a and DD_m: {df_merged[['DD_a', 'DD_m']].dropna().shape[0]}")
print(f"Observations with only DD_a: {df_merged['DD_a'].notna().sum() - df_merged[['DD_a', 'DD_m']].dropna().shape[0]}")
print(f"Observations with only DD_m: {df_merged['DD_m'].notna().sum() - df_merged[['DD_a', 'DD_m']].dropna().shape[0]}")

print(f"\nDD_a statistics:")
print(df_merged['DD_a'].describe())

print(f"\nDD_m statistics:")
print(df_merged['DD_m'].describe())

print(f"\nPD_a statistics:")
print(df_merged['PD_a'].describe())

print(f"\nPD_m statistics:")
print(df_merged['PD_m'].describe())

=== MERGED DATASET SUMMARY ===

Observations with both DD_a and DD_m: 930
Observations with only DD_a: 479
Observations with only DD_m: 0

DD_a statistics:
count    1409.000000
mean       20.334403
std        14.666043
min        -3.665984
25%        12.242458
50%        16.616343
75%        22.917770
max       155.472624
Name: DD_a, dtype: float64

DD_m statistics:
count    930.000000
mean      12.198401
std        8.032074
min        1.946932
25%        6.439077
50%        9.538566
75%       15.106819
max       35.000000
Name: DD_m, dtype: float64

PD_a statistics:
count     1.409000e+03
mean      7.469649e-04
std       2.667248e-02
min       0.000000e+00
25%      1.545023e-116
50%       2.653631e-62
75%       9.218141e-35
max       9.998768e-01
Name: PD_a, dtype: float64

PD_m statistics:
count     9.300000e+02
mean      8.217333e-05
std       1.093972e-03
min      1.124911e-268
25%       7.685700e-52
50%       7.245797e-22
75%       6.013341e-11
max       2.577147e-02
Name: PD_m, d