# Validate Pulled Data for VAE vs Heston Comparison

This notebook validates that all pulled WRDS data is consistent and ready for:
1. **VAE Training** - Volatility surface data
2. **Heston Calibration** - Zero curves + forward prices

We need to ensure:
- Same date coverage across all datasets
- Compatible maturity grids (or identify interpolation needs)
- No critical missing values

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------
# CONFIG
# -------------------
# Ticker and sample window; both are embedded in the raw file names built below.
TICKER = "AAPL"
START_DATE = "2016-01-01"
END_DATE = "2025-12-31"

# Root folder of the raw pulls, relative to the notebook's working directory.
RAW_DIR = Path("../../data/raw/ivydb")

# File paths
VS_PATH = RAW_DIR / "vol_surface" / f"{TICKER}_vsurfd_{START_DATE}_{END_DATE}.csv.gz"
PX_PATH = RAW_DIR / "security_price" / f"{TICKER}_underlying_{START_DATE}_{END_DATE}.csv.gz"
ZC_PATH = RAW_DIR / "zero_curve" / f"zero_curve_{START_DATE}_{END_DATE}.csv.gz"
STDOP_PATH = RAW_DIR / "std_option_price" / f"{TICKER}_stdopd_{START_DATE}_{END_DATE}.csv.gz"

# Report which input files exist before any cell tries to read them.
# The status glyphs were mis-encoded (UTF-8 read as Mac Roman); restored to the
# intended check mark / ballot X.
print("Data files:")
for name, path in [("Vol Surface", VS_PATH), ("Underlying Px", PX_PATH),
                    ("Zero Curve", ZC_PATH), ("Std Option", STDOP_PATH)]:
    exists = "✓" if path.exists() else "✗ MISSING"
    print(f"  {exists} {name}: {path.name}")

## 1. Load All Datasets

In [None]:
# Read the four gzip-compressed CSV extracts; the "date" column is parsed to datetime.
def _read_dated_csv(csv_path):
    """Load one raw extract with its "date" column parsed as datetime."""
    return pd.read_csv(csv_path, parse_dates=["date"])


vs = _read_dated_csv(VS_PATH)
px = _read_dated_csv(PX_PATH)
zc = _read_dated_csv(ZC_PATH)
stdop = _read_dated_csv(STDOP_PATH)

# Row/column counts give a first sanity check on each pull.
print("Dataset shapes:")
for prefix, frame in (
    ("  Volatility Surface: ", vs),
    ("  Underlying Prices:  ", px),
    ("  Zero Curve:         ", zc),
    ("  Std Option Prices:  ", stdop),
):
    print(f"{prefix}{frame.shape}")

In [None]:
# Quick peek at each dataset: banner, column list, first three rows.
banner = "=" * 60
previews = [
    ("VOLATILITY SURFACE (vsurfd)", vs),
    ("ZERO CURVE (zerocd)", zc),
    ("STD OPTION PRICES (stdopd)", stdop),
]

for position, (title, frame) in enumerate(previews):
    # Every banner after the first is preceded by a blank line.
    print(banner if position == 0 else "\n" + banner)
    print(title)
    print(banner)
    print(f"Columns: {list(frame.columns)}")
    display(frame.head(3))

## 2. Date Coverage Analysis

In [None]:
# Collect the distinct calendar dates covered by each dataset.
def _calendar_dates(frame):
    """Return the set of `datetime.date` values found in frame["date"]."""
    return set(frame["date"].dt.date)


vs_dates = _calendar_dates(vs)
px_dates = _calendar_dates(px)
zc_dates = _calendar_dates(zc)
stdop_dates = _calendar_dates(stdop)

# One line per dataset: number of distinct dates and the first/last date seen.
print("Date Coverage Summary:")
for prefix, covered in (
    ("  Vol Surface:  ", vs_dates),
    ("  Underlying:   ", px_dates),
    ("  Zero Curve:   ", zc_dates),
    ("  Std Option:   ", stdop_dates),
):
    print(f"{prefix}{len(covered):,} unique dates | {min(covered)} to {max(covered)}")

In [None]:
# Find intersection of all dates
common_dates = vs_dates & px_dates & zc_dates & stdop_dates
print(f"\nCommon dates across ALL datasets: {len(common_dates):,}")
if common_dates:
    print(f"  Range: {min(common_dates)} to {max(common_dates)}")
else:
    # min()/max() raise ValueError on an empty set. An empty intersection is
    # exactly the failure this notebook should report, not crash on.
    print("  Range: n/a (no date is shared by all four datasets)")

# Check what's missing from each
print("\nDates present in Vol Surface but missing from:")
print(f"  Underlying:  {len(vs_dates - px_dates):,}")
print(f"  Zero Curve:  {len(vs_dates - zc_dates):,}")
print(f"  Std Option:  {len(vs_dates - stdop_dates):,}")

# The key constraint: we need vs_dates to be a subset of the others for Heston
vs_minus_common = vs_dates - common_dates
if vs_minus_common:
    # Warn only when something is actually missing.
    print(f"\n⚠️  Vol surface dates NOT in common set: {len(vs_minus_common):,}")
    print(f"  Sample: {sorted(vs_minus_common)[:5]}...")
else:
    print(f"\n✓ Vol surface dates NOT in common set: {len(vs_minus_common):,}")

In [None]:
# Draw one horizontal strip of points per dataset so gaps in date coverage show up.
datasets = {
    "Vol Surface": vs_dates,
    "Underlying": px_dates,
    "Zero Curve": zc_dates,
    "Std Option": stdop_dates,
}
row_labels = list(datasets.keys())

fig, ax = plt.subplots(figsize=(14, 4))

for row, label in enumerate(row_labels):
    ordered = sorted(datasets[label])
    # Every point of a dataset shares the same y value (its row index).
    ax.scatter(ordered, [row] * len(ordered), s=1, alpha=0.5, label=label)

ax.set_yticks(range(len(datasets)))
ax.set_yticklabels(row_labels)
ax.set_xlabel("Date")
ax.set_title("Date Coverage by Dataset")
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 3. Maturity (Days) Grid Analysis

Critical question: Do the maturities in `vsurfd` match those in `zerocd` and `stdopd`?

In [None]:
# Sorted distinct maturities (the "days" column) in each dataset.
def _sorted_days(frame):
    """Return the distinct values of frame["days"] in ascending order."""
    return sorted(frame["days"].unique())


vs_days = _sorted_days(vs)
zc_days = _sorted_days(zc)
stdop_days = _sorted_days(stdop)

print("Maturity (days) Grid Summary:")

print(f"\nVol Surface ({len(vs_days)} unique):")
print(f"  {vs_days}")

# Only the first 20 zero-curve maturities are printed; an ellipsis marks truncation.
zc_ellipsis = '...' if len(zc_days) > 20 else ''
print(f"\nZero Curve ({len(zc_days)} unique):")
print(f"  {zc_days[:20]}{zc_ellipsis}")

print(f"\nStd Option ({len(stdop_days)} unique):")
print(f"  {stdop_days}")

In [None]:
# Check overlap between the vol-surface maturities and the other two grids.
vs_days_set = set(vs_days)
zc_days_set = set(zc_days)
stdop_days_set = set(stdop_days)

print("Maturity Grid Alignment:")
print(f"\nVol Surface days in Zero Curve:  {len(vs_days_set & zc_days_set)}/{len(vs_days_set)}")
print(f"Vol Surface days in Std Option:  {len(vs_days_set & stdop_days_set)}/{len(vs_days_set)}")

# Maturities present on the vol surface but absent from the other grid.
vs_not_in_zc = vs_days_set - zc_days_set
vs_not_in_stdop = vs_days_set - stdop_days_set

# The glyphs below were mis-encoded (UTF-8 read as Mac Roman); restored to the
# intended warning sign, arrow and check mark.
if vs_not_in_zc:
    print(f"\n⚠️  Vol Surface days NOT in Zero Curve: {sorted(vs_not_in_zc)}")
    print("   → Will need to INTERPOLATE zero rates for these maturities")
else:
    print("\n✓ All Vol Surface maturities found in Zero Curve")

if vs_not_in_stdop:
    print(f"\n⚠️  Vol Surface days NOT in Std Option: {sorted(vs_not_in_stdop)}")
    print("   → Will need to INTERPOLATE forward prices for these maturities")
else:
    print("\n✓ All Vol Surface maturities found in Std Option")

In [None]:
# Compare the three maturity grids, one horizontal row per dataset.
y_pos = {"Vol Surface": 0, "Zero Curve": 1, "Std Option": 2}

# (label, maturities, marker styling) in drawing order.
grid_specs = [
    ("Vol Surface", vs_days, dict(s=50, marker='o', alpha=0.7)),
    ("Zero Curve", zc_days, dict(s=20, marker='s', alpha=0.5)),
    ("Std Option", stdop_days, dict(s=50, marker='^', alpha=0.7)),
]

fig, ax = plt.subplots(figsize=(12, 4))

for label, day_grid, marker_kw in grid_specs:
    ax.scatter(day_grid, [y_pos[label]] * len(day_grid), label=label, **marker_kw)

ax.set_yticks(list(y_pos.values()))
ax.set_yticklabels(list(y_pos.keys()))
ax.set_xlabel("Days to Maturity")
ax.set_title("Maturity Grid Comparison")

# Leave a little margin on both sides of the longest maturity in any grid.
longest_maturity = max(max(vs_days), max(zc_days), max(stdop_days))
ax.set_xlim(-10, longest_maturity + 20)

ax.grid(True, alpha=0.3, axis='x')
ax.legend(loc='upper right')
plt.tight_layout()
plt.show()

## 4. Delta Grid Analysis (Vol Surface)

In [None]:
# Unique deltas and cp_flag combinations
vs_deltas = sorted(vs["delta"].unique())
vs_cp = sorted(vs["cp_flag"].unique())

print("Vol Surface Delta Grid:")
print(f"  Deltas ({len(vs_deltas)}): {vs_deltas}")
print(f"  CP flags: {vs_cp}")

# Expected grid size per date = number of distinct (days, delta, cp_flag) nodes
# that occur anywhere in the sample.
# A plain Cartesian product days x deltas x cp over-counts when a delta value is
# tied to one cp_flag, because half of the (delta, cp_flag) pairs never occur.
# NOTE(review): IvyDB vsurfd is believed to quote puts with negative deltas and
# calls with positive deltas — confirm against the delta list printed above.
grid_nodes = vs[["days", "delta", "cp_flag"]].drop_duplicates()
expected_grid_size = len(grid_nodes)
cartesian_size = len(vs_days) * len(vs_deltas) * len(vs_cp)

print(f"\nExpected full grid per date: {expected_grid_size} distinct (days, delta, cp_flag) nodes")
if cartesian_size != expected_grid_size:
    # Show the naive product too, so the discrepancy is visible to the reader.
    print(f"  (Cartesian product {len(vs_days)} days × {len(vs_deltas)} deltas × {len(vs_cp)} cp "
          f"= {cartesian_size} includes combinations that never occur)")

In [None]:
# Count vol-surface rows per date and compare with the expected grid size.
grid_size_per_date = vs.groupby("date").size()

print("Grid size per date:")
print(grid_size_per_date.describe())

# A date has a "full grid" when its row count equals the expected grid size exactly.
n_surface_dates = len(grid_size_per_date)
has_full_grid = grid_size_per_date == expected_grid_size
full_grid_dates = has_full_grid.sum()
full_grid_share = 100*full_grid_dates/n_surface_dates
print(f"\nDates with full grid: {full_grid_dates}/{n_surface_dates} ({full_grid_share:.1f}%)")

In [None]:
# Grid completeness over time: row count per date as a percentage of the expected grid.
pct_of_expected = grid_size_per_date / expected_grid_size * 100
completeness = pct_of_expected.reset_index()
completeness.columns = ["date", "pct_complete"]

# The median is used both for the reference line and for its legend label.
median_pct = completeness["pct_complete"].median()

fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(completeness["date"], completeness["pct_complete"], linewidth=0.5, alpha=0.7)
ax.axhline(100, color='g', linestyle='--', alpha=0.5, label='Full grid')
ax.axhline(median_pct, color='r', linestyle=':', alpha=0.5,
           label=f'Median ({median_pct:.1f}%)')

ax.set_xlabel("Date")
ax.set_ylabel("Grid Completeness (%)")
ax.set_title("Vol Surface Grid Completeness Over Time")
ax.set_ylim(0, 105)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Missing Values Analysis

In [None]:
# Count NaN, -99.99 and non-positive entries in the columns the models depend on.
iv_col = vs["impl_volatility"]
rate_col = zc["rate"]
fwd_col = stdop["forward_price"]

# heading -> list of (printed prefix, count)
missing_report = {
    "\nVol Surface:": [
        ("  impl_volatility NaN:       ", iv_col.isna().sum()),
        ("  impl_volatility == -99.99: ", (iv_col == -99.99).sum()),
        ("  dispersion == -99.99:      ", (vs["dispersion"] == -99.99).sum()),
    ],
    "\nZero Curve:": [
        ("  rate NaN:  ", rate_col.isna().sum()),
        ("  rate <= 0: ", (rate_col <= 0).sum()),
    ],
    "\nStd Option:": [
        ("  forward_price NaN:  ", fwd_col.isna().sum()),
        ("  forward_price <= 0: ", (fwd_col <= 0).sum()),
    ],
}

print("Missing Values Summary:")
for heading, rows in missing_report.items():
    print(heading)
    for prefix, count in rows:
        print(f"{prefix}{count:,}")

In [None]:
# Heatmap of call implied volatility by (days, delta) for one sample date.
# The sample date is taken from the row in the middle of the frame.
middle_row = len(vs)//2
sample_date = vs["date"].iloc[middle_row]
sample = vs[vs["date"] == sample_date].copy()

# Pivot the calls into a days x delta table of implied volatility.
is_call = sample["cp_flag"] == "C"
calls = sample[is_call].pivot_table(
    index="days",
    columns="delta",
    values="impl_volatility",
    aggfunc="first",
)

# Cells that are NaN or carry the -99.99 sentinel are blanked out before plotting.
mask = calls.isna() | (calls == -99.99)
visible_iv = calls.where(~mask)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(visible_iv, annot=False, cmap="YlOrRd", ax=ax,
            cbar_kws={"label": "Implied Volatility"})
ax.set_title(f"Call IV Surface on {str(sample_date)[:10]} (white = missing)")
ax.set_xlabel("Delta")
ax.set_ylabel("Days to Maturity")
plt.tight_layout()
plt.show()

## 6. Forward Price / Carry Rate Analysis

For Heston calibration, we need the carry rate $q(T)$. Inverting the forward relation $F(T) = S_0 \, e^{(r(T) - q(T))\,T}$ gives:
$$q(T) = r(T) - \frac{1}{T} \ln\left(\frac{F(T)}{S_0}\right)$$

In [None]:
# Assemble the three inputs needed for q(T): spot per date, forward and rate per (date, days).

# Spot: the underlying's closing price, with the column renamed to "spot".
px_spot = px.loc[:, ["date", "close"]].copy()
px_spot.columns = ["date", "spot"]

# Forward: the first forward_price found for each (date, days) pair.
fwd_by_node = stdop.groupby(["date", "days"])["forward_price"]
fwd = fwd_by_node.first().reset_index()

# Rates: the zero-curve columns needed downstream.
rates = zc.loc[:, ["date", "days", "rate"]].copy()

for prefix, table, suffix in (
    ("Spot prices: ", px_spot, " dates"),
    ("Forward prices: ", fwd, " (date, days) pairs"),
    ("Rates: ", rates, " (date, days) pairs"),
):
    print(f"{prefix}{len(table):,}{suffix}")

In [None]:
# Inner-join forwards with rates on (date, days), then attach the spot price by date.
# Only nodes that have all three inputs survive the two inner joins.
merged = (
    fwd
    .merge(rates, on=["date", "days"], how="inner")
    .merge(px_spot, on="date", how="inner")
)

n_merged_dates = merged['date'].nunique()
merged_day_grid = sorted(merged['days'].unique())

print(f"Merged (date, days) with spot, forward, rate: {len(merged):,} rows")
print(f"Unique dates: {n_merged_dates:,}")
print(f"Unique days: {merged_day_grid}")

In [None]:
# Compute implied carry rate q(T) = r(T) - (1/T) * ln(F(T) / S0)
# NOTE(review): this treats `rate` as a decimal (0.02 = 2%). IvyDB zero-curve
# rates are believed to be quoted in percent — confirm the units in the pulled
# file and divide by 100 first if so.
merged["T"] = merged["days"] / 365.0
merged["q"] = merged["rate"] - (1 / merged["T"]) * np.log(merged["forward_price"] / merged["spot"])

print("Implied Carry Rate q(T) Summary:")
print(merged["q"].describe())

# Check for anomalies
# A NaN q (e.g. from a missing or non-positive forward/spot) fails both range
# comparisons, so non-finite values are flagged explicitly rather than slipping
# through uncounted.
q_not_finite = ~np.isfinite(merged["q"])
q_out_of_range = (merged["q"] < -0.1) | (merged["q"] > 0.2)
anomalies = merged[q_not_finite | q_out_of_range]
print(f"\nAnomalous q values (< -10% or > 20%): {len(anomalies):,}")
print(f"  of which non-finite (NaN/inf): {q_not_finite.sum():,}")

In [None]:
# Plot the q(T) term structure for a handful of dates spread across the sample:
# every 500th distinct date, at most five of them.
distinct_dates = merged["date"].drop_duplicates()
sample_dates = distinct_dates.iloc[::500].head(5).tolist()

fig, ax = plt.subplots(figsize=(10, 5))
for d in sample_dates:
    on_date = merged["date"] == d
    curve = merged[on_date].sort_values("days")
    # q is shown in percent on the y-axis.
    ax.plot(curve["days"], curve["q"] * 100, 'o-', label=str(d)[:10], markersize=6)

ax.set_xlabel("Days to Maturity")
ax.set_ylabel("Carry Rate q(T) (%)")
ax.set_title("Implied Carry Rate Term Structure")
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Summary & Recommendations

In [None]:
# Final summary of the checks performed by the earlier cells.
# The emoji and status glyphs in the original strings were mis-encoded (UTF-8 read
# as Mac Roman); they are restored here.
print("=" * 70)
print("DATA VALIDATION SUMMARY")
print("=" * 70)

print("\n📅 DATE COVERAGE:")
print(f"   Vol Surface dates:        {len(vs_dates):,}")
print(f"   Common dates (all data):  {len(common_dates):,}")
# Share of vol-surface dates that have every Heston input; guard against an empty
# vol-surface date set so the summary never dies on a division by zero.
pct_common = 100 * len(common_dates) / len(vs_dates) if vs_dates else 0.0
status = "✓" if pct_common > 95 else "⚠️"
print(f"   {status} Coverage: {pct_common:.1f}%")

print("\n📊 MATURITY GRID:")
print(f"   Vol Surface maturities: {vs_days}")
print(f"   Std Option maturities:  {stdop_days}")
if vs_not_in_stdop:
    print(f"   ⚠️  Need interpolation for: {sorted(vs_not_in_stdop)}")
else:
    print("   ✓ All maturities aligned")
# The zero-curve grid was checked earlier but was missing from this summary.
if vs_not_in_zc:
    print(f"   ⚠️  Need zero-rate interpolation for: {sorted(vs_not_in_zc)}")

print("\n🔢 GRID COMPLETENESS:")
print(f"   Expected grid size: {expected_grid_size}")
print(f"   Median actual:      {grid_size_per_date.median():.0f}")
print(f"   Full grid dates:    {full_grid_dates}/{len(grid_size_per_date)}")

print("\n💰 HESTON INPUTS:")
print(f"   Forward prices available: {len(merged):,} (date, T) pairs")
print(f"   Carry rate q range: [{merged['q'].min():.4f}, {merged['q'].max():.4f}]")

print("\n" + "=" * 70)
print("RECOMMENDATIONS:")
print("=" * 70)

In [None]:
# Generate specific recommendations from the findings of the earlier cells.
# Each check appends one human-readable item; the list is printed numbered below.
recommendations = []

# Some vol-surface dates lack at least one of spot / rate / forward.
if pct_common < 100:
    missing_pct = 100 - pct_common
    recommendations.append(
        f"DATES: {missing_pct:.1f}% of vol surface dates lack complete Heston inputs. "
        f"Option 1: Filter to common dates only. Option 2: Forward-fill missing rates/forwards."
    )

# Vol-surface maturities with no matching standardized-option forward price.
if vs_not_in_stdop:
    recommendations.append(
        f"MATURITIES: Maturities {sorted(vs_not_in_stdop)} are in vol surface but not in stdopd. "
        f"Must interpolate forward prices for these tenors."
    )

# Vol-surface maturities with no matching zero-curve node.
if vs_not_in_zc:
    recommendations.append(
        f"ZERO CURVE: Maturities {sorted(vs_not_in_zc)} need rate interpolation."
    )

# Typical date has fewer than 90% of the expected grid rows.
if grid_size_per_date.median() < expected_grid_size * 0.9:
    recommendations.append(
        f"GRID SPARSITY: Median grid fill is {100*grid_size_per_date.median()/expected_grid_size:.0f}%. "
        f"VAE must handle missing values (masking) during training."
    )

# `anomalies` is always a DataFrame, so a length check is sufficient.
if len(anomalies) > 0:
    recommendations.append(
        f"CARRY RATES: {len(anomalies)} anomalous q(T) values detected. "
        f"Consider filtering or capping extreme values."
    )

# Check mark restored from its mis-encoded form.
if not recommendations:
    recommendations.append("✓ Data looks well-aligned! Ready for processing.")

for i, rec in enumerate(recommendations, 1):
    print(f"\n{i}. {rec}")

In [None]:
# Persist the sorted list of common dates as a one-column CSV for downstream steps.
# NOTE(review): RAW_DIR.parent is ".../data/raw", so this writes to
# ".../data/raw/processed/common_dates.csv" — confirm that is the intended location
# rather than ".../data/processed".
processed_dir = RAW_DIR.parent / "processed"
common_dates_path = processed_dir / "common_dates.csv"
processed_dir.mkdir(parents=True, exist_ok=True)

common_dates_df = pd.DataFrame({"date": sorted(common_dates)})
common_dates_df.to_csv(common_dates_path, index=False)

n_saved = len(common_dates_df)
print(f"Saved {n_saved} common dates to: {common_dates_path}")