# Merging Accounting and Market DD/PD Datasets

This notebook merges the accounting-based and market-based distance-to-default (DD) and probability-of-default (PD) datasets into a single combined dataset.

**Workflow:**
1. Load the latest accounting and market datasets from `data/outputs/datasheet/`
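2. Outer-merge on `['instrument', 'year']` so observations present in only one dataset are kept
3. Label columns by source: non-key columns are prefixed with `a_` (accounting) or `m_` (market) just before the merge, while the final `DD_a`/`PD_a` and `DD_m`/`PD_m` columns keep their names
4. Move any existing merged files to the archive (keeping at most 5 archived files), then save the new merged dataset under a timestamped filename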

In [None]:
# Setup and imports
from pathlib import Path
from datetime import datetime
import pytz
import shutil
import glob
import os
import pandas as pd
import numpy as np

# Find repository root
def find_repo_root(start: Path, marker: str = '.git') -> Path:
    """Return the closest directory at or above *start* that contains *marker*.

    ``start`` is resolved first; the search then checks that directory and
    each of its parents in turn, nearest first. When none of them contains
    ``marker``, the resolved ``start`` itself is returned as a fallback.
    """
    origin = start.resolve()
    lineage = (origin, *origin.parents)
    return next(
        (folder for folder in lineage if (folder / marker).exists()),
        origin,
    )

# Resolve the working locations relative to the repository root.
base_dir = find_repo_root(Path.cwd())
output_dir = base_dir.joinpath('data', 'outputs', 'datasheet')
archive_dir = base_dir.joinpath('archive', 'datasets')

# Echo the resolved paths so a wrong working directory is easy to spot.
print(
    f"Repository root: {base_dir}",
    f"Output directory: {output_dir}",
    f"Archive directory: {archive_dir}",
    sep="\n",
)

In [None]:
# Helper functions
def get_timestamp_cdt():
    """Return the current time in the America/Chicago zone as YYYYMMDD_HHMMSS.

    The zone is looked up by name, so the offset follows that zone's rules
    for the current date rather than being pinned to a fixed CDT offset.
    """
    central_zone = pytz.timezone('America/Chicago')
    now_central = datetime.now(tz=central_zone)
    return now_central.strftime('%Y%m%d_%H%M%S')

def archive_old_files(output_dir, archive_dir, dataset_type, max_keep=5):
    """Move existing ``{dataset_type}_*.csv`` files to the archive, then prune it.

    Every matching file in ``output_dir`` is moved into ``archive_dir``
    (nothing is left behind in ``output_dir``). Afterwards the archive is
    trimmed so that only the ``max_keep`` most recently modified files of
    this ``dataset_type`` remain; older ones are deleted.

    Args:
        output_dir: Directory holding the current dataset files (``str`` or
            path-like).
        archive_dir: Directory that receives the archived files (``str`` or
            path-like). It is created, including parents, if missing.
        dataset_type: Filename prefix, e.g. ``'merged'``.
        max_keep: Maximum number of archived files to retain for this
            ``dataset_type``. Must be non-negative.

    Raises:
        ValueError: If ``max_keep`` is negative.
    """
    # Validate before touching any files: a negative value would otherwise be
    # interpreted by slicing as "delete only the oldest archive".
    if max_keep < 0:
        raise ValueError(f"max_keep must be non-negative, got {max_keep}")

    def _newest_first(directory):
        # Matching files in `directory`, most recently modified first.
        pattern = os.path.join(os.fspath(directory), f"{dataset_type}_*.csv")
        return sorted(glob.glob(pattern), key=os.path.getmtime, reverse=True)

    # The archive directory may not exist yet (e.g. on a fresh checkout);
    # without this, the first move below would fail.
    os.makedirs(archive_dir, exist_ok=True)

    # Move all existing files to archive
    for old_file in _newest_first(output_dir):
        file_name = os.path.basename(old_file)
        shutil.move(old_file, os.path.join(os.fspath(archive_dir), file_name))
        print(f"[ARCHIVE] Moved to archive: {file_name}")

    # Clean up archive to keep only max_keep files
    for old_archive in _newest_first(archive_dir)[max_keep:]:
        os.remove(old_archive)
        print(f"[CLEANUP] Removed old archive: {os.path.basename(old_archive)}")

def get_latest_file(output_dir, dataset_type):
    """Return the path (as ``str``) of the newest ``{dataset_type}_*.csv`` file.

    "Newest" is decided by file modification time. Raises
    ``FileNotFoundError`` when ``output_dir`` holds no matching file.
    """
    candidates = glob.glob(str(output_dir / f"{dataset_type}_*.csv"))
    if candidates:
        return max(candidates, key=os.path.getmtime)
    raise FileNotFoundError(f"No {dataset_type} files found in {output_dir}")

In [None]:
# Locate the newest CSV for each source, then read both into DataFrames
accounting_file, market_file = (
    get_latest_file(output_dir, source) for source in ('accounting', 'market')
)

print(
    f"Loading accounting data from: {os.path.basename(accounting_file)}",
    f"Loading market data from: {os.path.basename(market_file)}",
    sep="\n",
)

df_accounting = pd.read_csv(accounting_file)
df_market = pd.read_csv(market_file)

# Row counts give a quick sanity check before merging.
print(
    f"\nAccounting dataset: {len(df_accounting)} rows",
    f"Market dataset: {len(df_market)} rows",
    sep="\n",
)

In [None]:
# Combine the accounting and market frames on (instrument, year)
merge_keys = ['instrument', 'year']

# Every column other than the join keys and the final DD/PD measures is
# tagged with its source: 'a_' for accounting, 'm_' for market.
accounting_cols_to_prefix = [
    col for col in df_accounting.columns
    if col not in (*merge_keys, 'DD_a', 'PD_a')
]
market_cols_to_prefix = [
    col for col in df_market.columns
    if col not in (*merge_keys, 'DD_m', 'PD_m')
]

df_accounting_prefixed = df_accounting.rename(
    columns={col: f'a_{col}' for col in accounting_cols_to_prefix}
)
df_market_prefixed = df_market.rename(
    columns={col: f'm_{col}' for col in market_cols_to_prefix}
)

# Outer join: rows present in only one of the two sources are retained.
df_merged = df_accounting_prefixed.merge(
    df_market_prefixed,
    how='outer',
    on=merge_keys,
    suffixes=('_a', '_m'),
)

print(f"Merged dataset: {len(df_merged)} rows")
print(
    "\nColumn count:",
    f"  Accounting: {len(df_accounting.columns)}",
    f"  Market: {len(df_market.columns)}",
    f"  Merged: {len(df_merged.columns)}",
    sep="\n",
)

# Preview the keys next to the four headline measures
print("\nSample merged data:")
display(df_merged[[*merge_keys, 'DD_a', 'PD_a', 'DD_m', 'PD_m']].head(10))

In [None]:
# Rotate existing merged outputs into the archive first, then write the new
# file. The order matters: archiving moves every merged_*.csv it finds.
archive_old_files(output_dir, archive_dir, 'merged', max_keep=5)

timestamp = get_timestamp_cdt()
merged_output = output_dir.joinpath(f'merged_{timestamp}.csv')
df_merged.to_csv(merged_output, index=False)

print(
    f"[INFO] Merged dataset saved to: {merged_output}",
    f"[INFO] Total rows: {len(df_merged)}",
    f"[INFO] Total columns: {len(df_merged.columns)}",
    sep="\n",
)

In [None]:
# Coverage and descriptive statistics for the merged DD/PD measures
print("=== MERGED DATASET SUMMARY ===")

# Not-null masks: a row counts as "both" when neither DD value is missing.
has_dd_a = df_merged['DD_a'].notna()
has_dd_m = df_merged['DD_m'].notna()
n_both = int((has_dd_a & has_dd_m).sum())

print(f"\nObservations with both DD_a and DD_m: {n_both}")
print(f"Observations with only DD_a: {int(has_dd_a.sum()) - n_both}")
print(f"Observations with only DD_m: {int(has_dd_m.sum()) - n_both}")

for measure in ('DD_a', 'DD_m', 'PD_a', 'PD_m'):
    print(f"\n{measure} statistics:")
    print(df_merged[measure].describe())