# Pull Additional WRDS Data for Heston Calibration

This notebook pulls:
1. **Zero Curve** (`zerocd`) - Risk-free rates by days to maturity
2. **Standard Option Prices** (`stdopd`) - Standard option prices with forward prices

**Prerequisites:**
- WRDS account with OptionMetrics access
- Run `DataGetter.ipynb` first to get volatility surface data

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import wrds

# -------------------
# USER CONFIG
# -------------------
# The WRDS login name can be overridden with the WRDS_USERNAME environment
# variable so the notebook can be shared without editing a personal account
# name; the previous hard-coded value remains the default.
WRDS_USERNAME = os.environ.get("WRDS_USERNAME") or "acaraman"
LIB = "optionm"

TICKER = "AAPL"
START_DATE = "2016-01-01"
END_DATE = "2025-12-31"

# One entry per calendar year touched by [START_DATE, END_DATE]; the first four
# characters of each ISO date string are the year.
YEARS = list(range(int(START_DATE[:4]), int(END_DATE[:4]) + 1))

# -------------------
# OUTPUT PATHS
# -------------------
# Relative path: resolved against the kernel's working directory.
RAW_DIR = Path("../../data/raw/ivydb")

# Create the root and the per-dataset subdirectories (no-op when they exist).
for _out_dir in (RAW_DIR, RAW_DIR / "zero_curve", RAW_DIR / "std_option_price"):
    _out_dir.mkdir(parents=True, exist_ok=True)

print("Output directories:")
print(f"  Zero curve:       {(RAW_DIR / 'zero_curve').resolve()}")
print(f"  Std option price: {(RAW_DIR / 'std_option_price').resolve()}")

In [None]:
# Connect to WRDS
# `db` is read as a module-level global by the helper and pull functions in
# the cells below, so this cell must run before any of them is called.
db = wrds.Connection(wrds_username=WRDS_USERNAME)

In [None]:
# Helper functions
def list_tables(lib=LIB):
    """Return the table names of WRDS library `lib` as a set.

    Queries the live `db` connection on every call (no caching).
    """
    table_names = db.list_tables(library=lib)
    return set(table_names)

def describe_cols(table, lib=LIB):
    """Describe `lib.table` through the WRDS connection.

    Returns a 2-tuple: the list taken from the description's "name" column,
    and the full description object as returned by `db.describe_table`.
    """
    schema = db.describe_table(library=lib, table=table)
    column_names = schema["name"].tolist()
    return column_names, schema

def pick_col(cols, candidates, required=True):
    """Return the first entry of `cols` matching a candidate, ignoring case.

    Candidates are tried in order; the value returned keeps the spelling used
    in `cols`. When nothing matches, raises KeyError if `required` is true,
    otherwise returns None.
    """
    lowered = [name.lower() for name in cols]
    for wanted in candidates:
        try:
            position = lowered.index(wanted.lower())
        except ValueError:
            # This candidate is absent; try the next one.
            continue
        return cols[position]
    if not required:
        return None
    raise KeyError(f"Could not find any of {candidates} in columns: {cols}")

def has_table(table, lib=LIB):
    """Return True when `table` is among the names `list_tables(lib)` reports."""
    available = list_tables(lib)
    return table in available

# Visible confirmation that this cell (and its helper definitions) has run.
print("Helper functions defined.")

## 1. Resolve SECID for Ticker

In [None]:
def resolve_secid(ticker: str, start_date: str, end_date: str) -> int:
    """
    Resolve a ticker to a single SECID using the ``secnmd`` table.

    When the ticker maps to several SECIDs, the candidate with the most
    distinct dates in the yearly ``vsurfdYYYY`` tables inside
    ``[start_date, end_date]`` is returned (ties go to the lowest SECID).
    When there is only one candidate, or no coverage rows can be found, the
    lowest SECID is returned.

    Parameters
    ----------
    ticker : str
        Ticker symbol; upper-cased before the lookup. It is interpolated
        directly into the SQL text, so pass trusted values only.
    start_date, end_date : str
        Inclusive bounds of the coverage window, in a form ``pd.Timestamp``
        can parse (e.g. ``"2016-01-01"``). The years they span determine
        which ``vsurfdYYYY`` tables are scanned.

    Returns
    -------
    int
        The selected SECID.

    Raises
    ------
    RuntimeError
        If ``secnmd`` has no row for the ticker.
    """
    ticker = ticker.upper()

    # ORDER BY makes the fallback choice at the end of the function deterministic.
    q = f"""
    SELECT DISTINCT secid
    FROM {LIB}.secnmd
    WHERE ticker = '{ticker}'
    ORDER BY secid
    """
    secids = db.raw_sql(q)

    if secids.empty:
        raise RuntimeError(f"No SECID found for ticker {ticker}")

    print(f"Found {len(secids)} SECID(s) for {ticker}: {secids['secid'].tolist()}")

    # If multiple, choose one with most surface coverage
    if len(secids) > 1:
        secid_list = ",".join(str(int(x)) for x in secids["secid"].unique())

        # Scan the years implied by the arguments rather than the notebook-level
        # YEARS, so a call with a different window inspects the right tables.
        first_year = pd.Timestamp(start_date).year
        last_year = pd.Timestamp(end_date).year

        # List the library once instead of once per candidate year.
        available = list_tables()

        counts = []
        for year in range(first_year, last_year + 1):
            table = f"vsurfd{year}"
            if table not in available:
                continue
            q = f"""
            SELECT secid, COUNT(DISTINCT date) AS n_days
            FROM {LIB}.{table}
            WHERE secid IN ({secid_list})
              AND date BETWEEN '{start_date}' AND '{end_date}'
            GROUP BY secid
            """
            counts.append(db.raw_sql(q))

        if counts:
            cov = (pd.concat(counts, ignore_index=True)
                     .groupby("secid", as_index=False)["n_days"].sum()
                     .sort_values(["n_days", "secid"], ascending=[False, True]))
            # Every yearly query may have come back empty; in that case there is
            # nothing to rank, so fall through instead of indexing an empty frame.
            if not cov.empty:
                display(cov)
                return int(cov.iloc[0]["secid"])

    return int(secids.iloc[0]["secid"])

# Resolved once here; the standard-option pull further down filters on SECID.
SECID = resolve_secid(ticker=TICKER, start_date=START_DATE, end_date=END_DATE)
print(f"\nUsing SECID: {SECID}")

## 2. Pull Zero Curve Data

The zero curve provides risk-free rates by days to maturity. It is **not** security-specific — the same curve applies to every option on a given date, so it is pulled once with no SECID filter.

In [None]:
# Inspect the zero-curve schema: prefer the 2024 yearly table, then the
# unsuffixed table; if neither exists, list any table whose name contains 'zero'.
for _table, _header in (
    ("zerocd2024", "zerocd2024 columns:"),
    ("zerocd", "zerocd columns:"),
):
    if has_table(_table):
        cols, desc = describe_cols(_table)
        print(_header)
        display(desc)
        break
else:
    # Reached only when the loop never hit `break`, i.e. no candidate exists.
    print("Zero curve tables not found. Available tables:")
    print([t for t in list_tables() if 'zero' in t.lower()])

In [None]:
def pull_zero_curve(start_date: str, end_date: str) -> pd.DataFrame:
    """
    Pull zero curve data from the yearly ``zerocdYYYY`` tables, falling back
    to the unsuffixed ``zerocd`` table when no yearly table yields rows.

    Parameters
    ----------
    start_date, end_date : str
        Inclusive date bounds, in a form ``pd.Timestamp`` can parse
        (e.g. ``"2016-01-01"``). The years they span determine which yearly
        tables are queried.

    Returns
    -------
    pd.DataFrame
        Columns ``date``, ``days``, ``rate``; one row per (date, days),
        sorted by date then days.
        NOTE(review): the original docstring describes ``rate`` as
        continuously compounded; units (percent vs. decimal) are not visible
        here — confirm against the IvyDB manual.

    Raises
    ------
    RuntimeError
        If neither the yearly tables nor ``zerocd`` return any rows.
    """
    def _select(table: str) -> pd.DataFrame:
        # Single definition of the query shared by the yearly and fallback paths.
        q = f"""
        SELECT date, days, rate
        FROM {LIB}.{table}
        WHERE date BETWEEN '{start_date}' AND '{end_date}'
        """
        return db.raw_sql(q, date_cols=["date"])

    # Derive the years from the arguments rather than the notebook-level YEARS,
    # so a call with a different window queries the matching tables.
    first_year = pd.Timestamp(start_date).year
    last_year = pd.Timestamp(end_date).year

    # List the library once instead of once per candidate year.
    available = list_tables()

    frames = []

    # Try yearly tables first (zerocdYYYY)
    for year in range(first_year, last_year + 1):
        table = f"zerocd{year}"
        if table not in available:
            continue

        print(f"  Pulling from {table}...")
        df = _select(table)
        if not df.empty:
            print(f"    -> {len(df):,} rows")
            frames.append(df)

    # Fallback to zerocd if the yearly tables produced nothing
    if not frames and "zerocd" in available:
        print("  Pulling from zerocd...")
        df = _select("zerocd")
        if not df.empty:
            frames.append(df)

    if not frames:
        raise RuntimeError("Could not pull zero curve data")

    zc = pd.concat(frames, ignore_index=True)
    # Keep the first row seen for each (date, days) pair.
    zc = zc.drop_duplicates(subset=["date", "days"]).sort_values(["date", "days"])

    return zc

# Pull the curve for the configured window; `zc` is reused by the cells below.
print("Pulling zero curve data...")
zc = pull_zero_curve(start_date=START_DATE, end_date=END_DATE)
print(f"\nTotal rows: {len(zc):,}")

In [None]:
# Sanity summary of the pulled zero curve: coverage, maturity grid, rate range.
zc_dates, zc_days, zc_rates = zc['date'], zc['days'], zc['rate']

print("Zero Curve Summary:")
print(f"  Date range: {zc_dates.min()} to {zc_dates.max()}")
print(f"  Unique dates: {zc_dates.nunique():,}")
print(f"  Days range: {zc_days.min()} to {zc_days.max()}")
print(f"  Unique days: {zc_days.nunique()}")
print(f"  Rate range: {zc_rates.min():.4f} to {zc_rates.max():.4f}")

print("\nSample data:")
display(zc.head(20))

In [None]:
# Plot the zero curve on a handful of dates spread across the sample
import matplotlib.pyplot as plt

# Every 500th distinct date, capped at five curves.
distinct_dates = zc['date'].drop_duplicates()
sample_dates = distinct_dates.iloc[::500].head(5).tolist()

fig, ax = plt.subplots(figsize=(10, 5))
for d in sample_dates:
    curve = zc.loc[zc['date'] == d].sort_values('days')
    # NOTE(review): the * 100 assumes `rate` is stored as a decimal fraction;
    # if the source reports percent, the axis is off by 100x — confirm.
    ax.plot(
        curve['days'],
        curve['rate'] * 100,
        label=str(d)[:10],  # first 10 characters of the date's string form
        marker='o',
        markersize=3,
    )

ax.set(
    xlabel='Days to Maturity',
    ylabel='Rate (%)',
    title='Sample Zero Curves',
)
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Write the zero curve as a gzip-compressed CSV, named after the date window.
zc_filename = f"zero_curve_{START_DATE}_{END_DATE}.csv.gz"
zc_path = RAW_DIR / "zero_curve" / zc_filename
zc.to_csv(zc_path, compression="gzip", index=False)
print(f"Saved zero curve to: {zc_path}")

## 3. Pull Standard Option Prices

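The `stdopd` table contains **forward prices** for standard maturities. These forwards are needed to compute the dividend/carry yield $q(T)$:

$$q(T) = r(T) - \frac{1}{T} \ln\left(\frac{F(T)}{S_0}\right)$$

where $F(T)$ is the forward price for maturity $T$ (in years), $S_0$ is the spot price, and $r(T)$ is the continuously compounded zero rate for the same maturity.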

In [None]:
# Inspect the standard-option schema: prefer the 2024 yearly table, then the
# unsuffixed table; if neither exists, list any table whose name contains 'stdop'.
for _table, _header in (
    ("stdopd2024", "stdopd2024 columns:"),
    ("stdopd", "stdopd columns:"),
):
    if has_table(_table):
        cols, desc = describe_cols(_table)
        print(_header)
        display(desc)
        break
else:
    # Reached only when the loop never hit `break`, i.e. no candidate exists.
    print("Stdopd tables not found. Available tables:")
    print([t for t in list_tables() if 'stdop' in t.lower()])

In [None]:
def pull_std_option_price(secid: int, start_date: str, end_date: str) -> pd.DataFrame:
    """
    Pull standard option prices for one security from the yearly
    ``stdopdYYYY`` tables, falling back to the unsuffixed ``stdopd`` table
    when no yearly table yields rows.

    The selected columns include ``forward_price``, which the notebook uses
    as the forward for each standard maturity.

    Parameters
    ----------
    secid : int
        Security identifier used in the ``WHERE secid = ...`` filter.
    start_date, end_date : str
        Inclusive date bounds, in a form ``pd.Timestamp`` can parse
        (e.g. ``"2016-01-01"``). The years they span determine which yearly
        tables are queried.

    Returns
    -------
    pd.DataFrame
        Columns ``secid, date, days, cp_flag, forward_price, strike_price,
        impl_volatility, impl_premium, bs_price``; exact duplicate rows are
        removed and the result is sorted by date, days, cp_flag.

    Raises
    ------
    RuntimeError
        If neither the yearly tables nor ``stdopd`` return any rows.
    """
    def _select(table: str) -> pd.DataFrame:
        # Single definition of the query shared by the yearly and fallback paths.
        q = f"""
        SELECT secid, date, days, cp_flag,
               forward_price, strike_price,
               impl_volatility, impl_premium, bs_price
        FROM {LIB}.{table}
        WHERE secid = {secid}
          AND date BETWEEN '{start_date}' AND '{end_date}'
        """
        return db.raw_sql(q, date_cols=["date"])

    # Derive the years from the arguments rather than the notebook-level YEARS,
    # so a call with a different window queries the matching tables.
    first_year = pd.Timestamp(start_date).year
    last_year = pd.Timestamp(end_date).year

    # List the library once instead of once per candidate year.
    available = list_tables()

    frames = []

    # Try yearly tables first (stdopdYYYY)
    for year in range(first_year, last_year + 1):
        table = f"stdopd{year}"
        if table not in available:
            continue

        print(f"  Pulling from {table}...")
        df = _select(table)
        if not df.empty:
            print(f"    -> {len(df):,} rows")
            frames.append(df)

    # Fallback to stdopd if the yearly tables produced nothing
    if not frames and "stdopd" in available:
        print("  Pulling from stdopd...")
        df = _select("stdopd")
        if not df.empty:
            frames.append(df)

    if not frames:
        raise RuntimeError("Could not pull std option prices")

    stdop = pd.concat(frames, ignore_index=True)
    stdop = stdop.drop_duplicates().sort_values(["date", "days", "cp_flag"])

    return stdop

# Pull for the SECID resolved earlier; `stdop` is reused by the cells below.
print(f"Pulling standard option prices for SECID {SECID}...")
stdop = pull_std_option_price(secid=SECID, start_date=START_DATE, end_date=END_DATE)
print(f"\nTotal rows: {len(stdop):,}")

In [None]:
# Sanity summary of the standard option pull: coverage, maturities, flags.
stdop_dates, stdop_days = stdop['date'], stdop['days']

print("Standard Option Price Summary:")
print(f"  Date range: {stdop_dates.min()} to {stdop_dates.max()}")
print(f"  Unique dates: {stdop_dates.nunique():,}")
print(f"  Days range: {stdop_days.min()} to {stdop_days.max()}")
print(f"  Unique days: {sorted(stdop_days.unique())}")
print(f"  cp_flag values: {sorted(stdop['cp_flag'].unique())}")

print("\nForward price stats:")
display(stdop['forward_price'].describe())

print("\nSample data:")
display(stdop.head(20))

In [None]:
# Data-quality checks on forward_price: nulls, negative values, and how many
# distinct forwards appear within each (date, days) group.
fwd = stdop['forward_price']

# Null forwards, as a count and as a share of all rows
missing_fwd = fwd.isna().sum()
print(f"Missing forward_price: {missing_fwd:,} ({100*missing_fwd/len(stdop):.2f}%)")

# Negative forwards — possibly a sentinel encoding rather than real prices
sentinel_fwd = fwd.lt(0).sum()
print(f"Negative forward_price (sentinel?): {sentinel_fwd:,}")

# Calls and puts at the same (date, days) are expected to share one forward
fwd_by_day = stdop.groupby(['date', 'days'])['forward_price'].nunique()
print(f"\nForward prices per (date, days): min={fwd_by_day.min()}, max={fwd_by_day.max()}")

In [None]:
# Plot the forward-price term structure on the date found at the middle row
# of `stdop`, keeping the first row per maturity.
mid_row = len(stdop) // 2
sample_date = stdop['date'].iloc[mid_row]
on_sample_date = stdop['date'] == sample_date
sample = stdop.loc[on_sample_date].drop_duplicates(subset=['days'])

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(sample['days'], sample['forward_price'], 'o-', markersize=8)
ax.set(
    xlabel='Days to Maturity',
    ylabel='Forward Price ($)',
    title=f'Forward Prices on {str(sample_date)[:10]}',
)
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Write the standard option prices as a gzip-compressed CSV, named after the
# ticker and the date window.
stdop_filename = f"{TICKER}_stdopd_{START_DATE}_{END_DATE}.csv.gz"
stdop_path = RAW_DIR / "std_option_price" / stdop_filename
stdop.to_csv(stdop_path, compression="gzip", index=False)
print(f"Saved std option prices to: {stdop_path}")

## 4. Summary

In [None]:
# Recap of the run: configuration, row counts, and where each file was written.
_rule = "=" * 60

print(_rule)
print("SUMMARY")
print(_rule)
print(f"Ticker: {TICKER}")
print(f"SECID: {SECID}")
print(f"Date range: {START_DATE} to {END_DATE}")
print()
print(f"Zero curve:         {len(zc):,} rows")
print(f"  Saved to: {zc_path}")
print()
print(f"Std option prices:  {len(stdop):,} rows")
print(f"  Saved to: {stdop_path}")
print(_rule)

In [None]:
# Close WRDS connection
db.close()
print("WRDS connection closed.")