In [1]:
import ipywidgets as widgets
from pathlib import Path
import pandas as pd
import numpy as np

RAW_DIR = Path("../raw")   
PROCESSED_DIR = Path("../processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

TICKER = "AAPL"
START_DATE = "2016-01-01"
END_DATE   = "2025-12-31"

vs_path = RAW_DIR / f"{TICKER}_vsurfd_{START_DATE}_{END_DATE}.csv.gz"


vs = pd.read_csv(vs_path, parse_dates=["date"])


vs.shape


(908446, 9)

In [2]:
# Ensure types
vs["days"] = vs["days"].astype(int)
vs["cp_flag"] = vs["cp_flag"].str.upper().astype("category")

# Normalize delta: if it looks like 10..90 convert to 0.10..0.90
if vs["delta"].max() > 2:
    vs["delta"] = vs["delta"] / 100.0

print("date range:", vs["date"].min(), "->", vs["date"].max())
print("unique dates:", vs["date"].nunique())
print("unique days:", vs["days"].nunique())
print("unique deltas:", vs["delta"].nunique())
print("cp:", vs["cp_flag"].unique())


date range: 2016-01-04 00:00:00 -> 2025-08-29 00:00:00
unique dates: 2429
unique days: 11
unique deltas: 34
cp: ['P', 'C']
Categories (2, object): ['C', 'P']


In [3]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display

days_grid  = np.sort(vs["days"].unique())
delta_grid = np.sort(vs["delta"].unique())
dates = np.sort(vs["date"].unique())

def build_surface_matrix(day_vs, cp="C"):
    # pivot to days x delta
    mat = (day_vs[day_vs["cp_flag"] == cp]
           .pivot_table(index="days", columns="delta", values="impl_volatility", aggfunc="mean")
           .reindex(index=days_grid, columns=delta_grid))

    # fill missing values lightly for visualization
    mat = mat.copy()
    mat = mat.interpolate(axis=1, limit_direction="both")  # along delta
    mat = mat.interpolate(axis=0, limit_direction="both")  # along days
    mat = mat.ffill().bfill()

    return mat

In [4]:
date_slider = widgets.IntSlider(min=0, max=len(dates)-1, step=1, value=0, description="Date")
cp_toggle = widgets.ToggleButtons(options=[("Call", "C"), ("Put", "P")], description="Side")

out = widgets.Output()

def render(date_idx, cp):
    d = pd.to_datetime(dates[date_idx])
    day_vs = vs[vs["date"] == d]
    mat = build_surface_matrix(day_vs, cp=cp)

    # Build X/Y mesh for plotly
    X, Y = np.meshgrid(mat.columns.astype(float), mat.index.astype(int))
    Z = mat.values.astype(float)

    fig = go.Figure(data=[go.Surface(x=X, y=Y, z=Z)])
    fig.update_layout(
        title=f"{'CALL' if cp=='C' else 'PUT'} Vol Surface — {d.date()}",
        scene=dict(
            xaxis_title="Delta",
            yaxis_title="Days to Expiration",
            zaxis_title="Implied Vol",
        ),
        height=650,
        width=950,
        margin=dict(l=0, r=0, t=40, b=0),
    )
    fig.show()

def on_change(_=None):
    out.clear_output(wait=True)
    with out:
        render(date_slider.value, cp_toggle.value)

date_slider.observe(on_change, names="value")
cp_toggle.observe(on_change, names="value")

display(widgets.HBox([date_slider, cp_toggle]))
display(out)

on_change()


HBox(children=(IntSlider(value=0, description='Date', max=2428), ToggleButtons(description='Side', options=(('…

Output()

In [5]:
pd.set_option('display.max_rows', None)
vs.head()

Unnamed: 0,secid,date,days,delta,cp_flag,impl_volatility,impl_strike,impl_premium,dispersion
0,101594.0,2016-01-04,10,-0.9,P,0.351577,113.5996,8.665669,0.214947
1,101594.0,2016-01-04,10,-0.85,P,0.296881,110.8722,6.048973,0.144321
2,101594.0,2016-01-04,10,-0.8,P,0.267112,109.3283,4.620924,0.069316
3,101594.0,2016-01-04,10,-0.75,P,0.261231,108.4435,3.891608,0.023896
4,101594.0,2016-01-04,10,-0.7,P,0.263302,107.7605,3.397845,0.007847


In [6]:
import os, json
from pathlib import Path
import numpy as np
import pandas as pd

# ---- paths ----
PROCESSED_DIR = Path("../processed").resolve()
PARQUET_DIR = PROCESSED_DIR / "parquet"
META_DIR = PROCESSED_DIR / "meta"
PARQUET_DIR.mkdir(parents=True, exist_ok=True)
META_DIR.mkdir(parents=True, exist_ok=True)

OUT_PARQUET = PARQUET_DIR / "AAPL_vsurf_processed.parquet"
OUT_META = META_DIR / "AAPL_vsurf_processed_meta.json"

# ---- required columns ----
required = [
    "secid", "date", "days", "delta", "cp_flag",
    "impl_volatility", "impl_strike", "impl_premium", "dispersion"
]
missing = [c for c in required if c not in vs.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# ---- date parsing  ----
vs["date"] = pd.to_datetime(vs["date"], errors="raise").dt.normalize()

# ---- types  ----
vs["secid"] = vs["secid"].astype("int32")
vs["days"] = pd.to_numeric(vs["days"], errors="raise").astype("float32")
vs["delta"] = pd.to_numeric(vs["delta"], errors="raise").astype("float32")
vs["cp_flag"] = vs["cp_flag"].astype("category")  # efficient in parquet

for c in ["impl_volatility", "impl_strike", "impl_premium", "dispersion"]:
    vs[c] = pd.to_numeric(vs[c], errors="raise").astype("float32")

# ---- sanity filters  ----
mask = (
    (vs["days"] > 0) &
    np.isfinite(vs["impl_volatility"]) & (vs["impl_volatility"] > 0) &
    np.isfinite(vs["impl_strike"]) & (vs["impl_strike"] > 0) &
    np.isfinite(vs["impl_premium"]) & (vs["impl_premium"] >= 0) &
    np.isfinite(vs["dispersion"])
)
vs = vs.loc[mask].copy()

# ---- uniqueness check per (date, maturity, delta, cp_flag) ----
key = ["date", "days", "delta", "cp_flag"]
dup_count = vs.duplicated(subset=key).sum()
if dup_count > 0:
    # If duplicates exist, keep the first :
    vs = vs.drop_duplicates(subset=key, keep="first").copy()

# ---- sort for determinism/debugging ----
vs = vs.sort_values(["date", "days", "delta", "cp_flag"]).reset_index(drop=True)

# ---- write parquet ----
vs.to_parquet(
    OUT_PARQUET,
    index=False,
    engine="pyarrow",
    compression="zstd"  
)

# ---- write small metadata file  ----
meta = {
    "secid_min": int(vs["secid"].min()),
    "secid_max": int(vs["secid"].max()),
    "date_min": str(vs["date"].min().date()),
    "date_max": str(vs["date"].max().date()),
    "n_rows": int(len(vs)),
    "n_dates": int(vs["date"].nunique()),
    "n_days": int(vs["days"].nunique()),
    "n_delta": int(vs["delta"].nunique()),
    "cp_flag_levels": list(map(str, vs["cp_flag"].cat.categories)),
    "duplicate_rows_dropped": int(dup_count),
    "columns": list(vs.columns),
    "notes": "Minimal cleaning + deterministic dtypes; vendor signed delta preserved"
}

with open(OUT_META, "w") as f:
    json.dump(meta, f, indent=2)

print("Saved:", OUT_PARQUET)
print("Meta :", OUT_META)
meta

Saved: C:\Users\Admin\OneDrive\Desktop\Fifth Year\Computer Science\CS4490 Thesis\Codebase\Thesis\Data\processed\parquet\AAPL_vsurf_processed.parquet
Meta : C:\Users\Admin\OneDrive\Desktop\Fifth Year\Computer Science\CS4490 Thesis\Codebase\Thesis\Data\processed\meta\AAPL_vsurf_processed_meta.json


{'secid_min': 101594,
 'secid_max': 101594,
 'date_min': '2016-01-04',
 'date_max': '2025-08-29',
 'n_rows': 908446,
 'n_dates': 2429,
 'n_days': 11,
 'n_delta': 34,
 'cp_flag_levels': ['C', 'P'],
 'duplicate_rows_dropped': 0,
 'columns': ['secid',
  'date',
  'days',
  'delta',
  'cp_flag',
  'impl_volatility',
  'impl_strike',
  'impl_premium',
  'dispersion'],
 'notes': 'Minimal cleaning + deterministic dtypes; vendor signed delta preserved'}

In [8]:
from pathlib import Path
import numpy as np
import pandas as pd

# --- locate parquet robustly (root vs Data/processed notebook cwd) ---
candidates = [
    Path("./Data/processed/parquet/AAPL_vsurf_processed.parquet"),
    Path("./processed/parquet/AAPL_vsurf_processed.parquet"),
    Path("../Data/processed/parquet/AAPL_vsurf_processed.parquet"),
    Path("./parquet/AAPL_vsurf_processed.parquet"),
]
PARQUET_PATH = next((p for p in candidates if p.exists()), None)
if PARQUET_PATH is None:
    raise FileNotFoundError("Could not find AAPL_vsurf_processed.parquet in expected locations.")

vs = pd.read_parquet(PARQUET_PATH)

# --- basic cardinalities ---
vs["date"] = pd.to_datetime(vs["date"]).dt.normalize()
vs["delta_abs"] = vs["delta"].abs().round(6)   # stable grouping key

n_dates = vs["date"].nunique()
days_grid = np.sort(vs["days"].unique())
delta_abs_grid = np.sort(vs["delta_abs"].unique())
cp_levels = (
    list(vs["cp_flag"].cat.categories)
    if str(vs["cp_flag"].dtype) == "category"
    else sorted(vs["cp_flag"].unique())
)

print("Loaded:", PARQUET_PATH)
print("Rows:", len(vs))
print("Date range:", vs["date"].min().date(), "->", vs["date"].max().date())
print("Unique dates:", n_dates)
print("Unique maturities (days):", len(days_grid), "| min/max:", float(days_grid.min()), float(days_grid.max()))
print("Unique |delta|:", len(delta_abs_grid), "| min/max:", float(delta_abs_grid.min()), float(delta_abs_grid.max()))
print("cp_flag levels:", cp_levels)

# --- does each date have a "complete" surface on the global grid? (days x |delta| x cp_flag) ---
# count unique grid points actually present per date
combo_counts = (
    vs.drop_duplicates(["date", "days", "delta_abs", "cp_flag"])
      .groupby("date")
      .size()
)

expected_per_date = len(days_grid) * len(delta_abs_grid) * len(cp_levels)
coverage = combo_counts / expected_per_date

print("\nExpected unique grid points per date:", expected_per_date)
print("Coverage summary (actual/expected):")
print(coverage.describe(percentiles=[0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99]))

n_full = int((coverage == 1.0).sum())
print(f"\nDates with FULL global-grid coverage: {n_full}/{n_dates} ({n_full/n_dates:.2%})")

# --- how stable are maturities/deltas across dates? ---
days_per_date = vs.groupby("date")["days"].nunique()
deltas_per_date = vs.groupby("date")["delta_abs"].nunique()

print("\nUnique maturities per date (days) summary:")
print(days_per_date.describe(percentiles=[0.05, 0.5, 0.95]))
print("\nUnique |delta| per date summary:")
print(deltas_per_date.describe(percentiles=[0.05, 0.5, 0.95]))

# --- intersection grid (present on ALL dates) & top coverage grid points ---
days_coverage = (vs.drop_duplicates(["date", "days"]).groupby("days")["date"].nunique() / n_dates).sort_values(ascending=False)
delta_coverage = (vs.drop_duplicates(["date", "delta_abs"]).groupby("delta_abs")["date"].nunique() / n_dates).sort_values(ascending=False)

common_days = days_coverage[days_coverage == 1.0].index.to_numpy()
common_deltas = delta_coverage[delta_coverage == 1.0].index.to_numpy()

print("\nIntersection maturities (days) present on ALL dates:", len(common_days))
print("Intersection |delta| present on ALL dates:", len(common_deltas))

print("\nTop 15 maturities by date-coverage:")
print(days_coverage.head(15))
print("\nTop 15 |delta| by date-coverage:")
print(delta_coverage.head(15))

# --- show a concrete example of what's missing on the first incomplete date ---
incomplete_dates = coverage[coverage < 1.0].index
if len(incomplete_dates) > 0:
    d0 = incomplete_dates[0]
    df0 = vs[vs["date"] == d0].drop_duplicates(["days", "delta_abs", "cp_flag"])
    present = pd.MultiIndex.from_frame(df0[["days", "delta_abs", "cp_flag"]])
    all_idx = pd.MultiIndex.from_product([days_grid, delta_abs_grid, cp_levels], names=["days", "delta_abs", "cp_flag"])
    missing = all_idx.difference(present)

    print(f"\nFirst incomplete date: {d0.date()} | missing points: {len(missing)}")
    print("First 25 missing (days, |delta|, cp_flag):")
    print(list(missing[:25]))
else:
    print("\nAll dates are complete on the global grid (days x |delta| x cp_flag).")


Loaded: parquet\AAPL_vsurf_processed.parquet
Rows: 908446
Date range: 2016-01-04 -> 2025-08-29
Unique dates: 2429
Unique maturities (days): 11 | min/max: 10.0 730.0
Unique |delta|: 17 | min/max: 0.10000000149011612 0.8999999761581421
cp_flag levels: ['C', 'P']

Expected unique grid points per date: 374
Coverage summary (actual/expected):
count    2429.0
mean        1.0
std         0.0
min         1.0
1%          1.0
5%          1.0
10%         1.0
50%         1.0
90%         1.0
95%         1.0
99%         1.0
max         1.0
dtype: float64

Dates with FULL global-grid coverage: 2429/2429 (100.00%)

Unique maturities per date (days) summary:
count    2429.0
mean       11.0
std         0.0
min        11.0
5%         11.0
50%        11.0
95%        11.0
max        11.0
Name: days, dtype: float64

Unique |delta| per date summary:
count    2429.0
mean       17.0
std         0.0
min        17.0
5%         17.0
50%        17.0
95%        17.0
max        17.0
Name: delta_abs, dtype: float64

