In [1]:
"""
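Runs automated checks on ingested price and options data:
  - Price spike detection (daily-return z-score; IQR_MULTIPLIER is defined
    but no IQR check is implemented in this notebook)
  - Zero / negative price detection
  - OHLC consistency (high/low versus open/close)
  - Missing trading-date gaps
  - Data freshness check
  - Bid-ask inversion (bid > ask) and overly wide spreads
  - Implied volatility outliers
  - Sparse expirations (too few strikes per expiration and option type)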
"""

import logging
import sqlite3
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Module-level logger shared by every check in this notebook.
log = logging.getLogger(__name__)

# basicConfig() does nothing once the root logger already has handlers, but its
# arguments are still evaluated. Building the FileHandler unconditionally
# therefore opened (and never closed) a fresh handle on pipeline.log each time
# this cell was re-run. Only construct the handlers when they will be installed.
if not logging.getLogger().handlers:
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s  %(levelname)-8s  %(message)s",
        handlers=[logging.FileHandler("pipeline.log"), logging.StreamHandler()],
    )

# SQLite database file read by the loader functions.
DB_PATH = "quant.db"

In [3]:
# Thresholds
# Tunable limits read by the validation checks defined in this notebook.
# check_price_spikes: a daily return whose |z-score| exceeds this is a spike.
ZSCORE_SPIKE_THRESHOLD = 4.0    # daily return z-score beyond this is a spike
# NOTE(review): not referenced by any check visible in this notebook.
IQR_MULTIPLIER = 3.0            # IQR fence for price level outliers
# check_bid_ask_inversions: relative spread is (ask - bid) / mid.
MAX_BID_ASK_SPREAD_PCT = 0.50   # flag if (ask-bid)/mid > 50%
# check_iv_outliers: implied_vol strictly above this is reported as extreme.
MAX_IV_THRESHOLD = 5.0          # 500% IV is likely garbage data
# check_sparse_expirations: minimum row count per (expiration, option_type).
MIN_OPTION_ROWS_PER_EXPIRY = 5  # flag expirations with too few strikes

In [4]:
def load_price_data(engine) -> pd.DataFrame:
    """Load the full ``price_history`` table, oldest row first.

    Args:
        engine: SQLAlchemy engine to read from. When ``None`` the SQLite file
            at ``DB_PATH`` is opened directly instead.

    Returns:
        DataFrame holding every ``price_history`` column, with ``date``
        converted via ``pd.to_datetime``.
    """
    query = "SELECT * FROM price_history ORDER BY date ASC"
    if engine is not None:
        # Honour the caller's engine. It used to be accepted but ignored, so a
        # database other than DB_PATH could never be validated.
        with engine.connect() as conn:
            df = pd.read_sql(text(query), conn)
    else:
        # sqlite3's connection context manager only scopes a transaction; it
        # does not close the connection, so close it explicitly.
        conn = sqlite3.connect(DB_PATH)
        try:
            df = pd.read_sql(query, conn)
        finally:
            conn.close()
    df["date"] = pd.to_datetime(df["date"])
    return df

def load_options_data(engine) -> pd.DataFrame:
    """Load the full ``options_chain`` table.

    Args:
        engine: SQLAlchemy engine to read from. When ``None`` the SQLite file
            at ``DB_PATH`` is opened directly instead.

    Returns:
        DataFrame holding every ``options_chain`` column, with ``expiration``
        converted via ``pd.to_datetime``.
    """
    query = "SELECT * FROM options_chain"
    if engine is not None:
        # Honour the caller's engine. It used to be accepted but ignored, so a
        # database other than DB_PATH could never be validated.
        with engine.connect() as conn:
            df = pd.read_sql(text(query), conn)
    else:
        # sqlite3's connection context manager only scopes a transaction; it
        # does not close the connection, so close it explicitly.
        conn = sqlite3.connect(DB_PATH)
        try:
            df = pd.read_sql(query, conn)
        finally:
            conn.close()
    df["expiration"] = pd.to_datetime(df["expiration"])
    return df

In [5]:
# Daily return spikes 
def check_price_spikes(df: pd.DataFrame) -> pd.DataFrame:
    """Flag rows whose daily close-to-close return is a z-score outlier.

    Rows are ordered by ``date``, the percentage change of ``close`` is taken
    between consecutive rows, and each return is standardised against the mean
    and sample standard deviation of the finite returns. Rows with
    ``|z| > ZSCORE_SPIKE_THRESHOLD`` are logged and returned.

    NOTE(review): returns are computed across the whole frame in date order,
    which presumes the frame holds a single instrument — confirm against the
    ``price_history`` schema.

    Args:
        df: Price rows with at least ``date`` (datetime-like) and ``close``.

    Returns:
        The flagged rows with columns ``date``, ``close``, ``daily_return``,
        ``zscore``, ``anomaly_type`` and ``detail``.
    """
    df = df.copy().sort_values("date")
    df["daily_return"] = df["close"].pct_change()

    # A zero close makes the next return +/-inf. Left in, it turns the mean
    # into inf and the std into NaN, so every z-score becomes NaN and no spike
    # is ever reported. Estimate the distribution from finite returns only; the
    # infinite return itself still gets an infinite z-score and is flagged.
    finite_returns = df["daily_return"].replace([np.inf, -np.inf], np.nan)
    mean_ret = finite_returns.mean()
    std_ret = finite_returns.std()
    df["zscore"] = (df["daily_return"] - mean_ret) / std_ret

    spikes = df[df["zscore"].abs() > ZSCORE_SPIKE_THRESHOLD].copy()
    spikes["anomaly_type"] = "price_spike"
    # Build the messages from the two columns directly; this also behaves
    # predictably when no row was flagged.
    spikes["detail"] = [
        f"Return={ret:.2%}, Z={z:.2f}"
        for ret, z in zip(spikes["daily_return"], spikes["zscore"])
    ]

    log.info(f"[CHECK 1] Price spikes (|z| > {ZSCORE_SPIKE_THRESHOLD}): {len(spikes)} found")
    for day, note in zip(spikes["date"], spikes["detail"]):
        log.warning(f"  Spike on {day.date()}: {note}")

    return spikes[["date", "close", "daily_return", "zscore", "anomaly_type", "detail"]]

In [6]:
# Zero / negative prices
def check_zero_negative_prices(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows in which any of open/high/low/close is zero or negative.

    Args:
        df: Price rows with ``date``, ``open``, ``high``, ``low``, ``close``.

    Returns:
        The offending rows reduced to ``date``, ``close``, ``anomaly_type``
        and ``detail``. The full OHLC values are written to the log.
    """
    ohlc_fields = ["open", "high", "low", "close"]
    # One boolean per row: True when at least one OHLC field is <= 0.
    non_positive = df[ohlc_fields].le(0).any(axis=1)

    flagged = df.loc[non_positive].copy()
    flagged["anomaly_type"] = "zero_or_negative_price"
    flagged["detail"] = "One or more OHLC fields <= 0"

    log.info(f"[CHECK 2] Zero/negative prices: {len(flagged)} found")
    if len(flagged) > 0:
        log.warning(flagged[["date", *ohlc_fields]].to_string())
    return flagged[["date", "close", "anomaly_type", "detail"]]

In [7]:
# OHLC consistency 
def check_ohlc_consistency(df: pd.DataFrame) -> pd.DataFrame:
    """Return rows whose high/low do not bracket the open and close.

    A row is reported when ``high < low``, or when ``high`` is below either of
    ``open``/``close``, or when ``low`` is above either of them.

    Args:
        df: Price rows with ``date``, ``open``, ``high``, ``low``, ``close``.

    Returns:
        The offending rows with their OHLC values plus ``anomaly_type`` and
        ``detail``.
    """
    high, low = df["high"], df["low"]

    # Start from the high/low ordering, then add the open and close bounds.
    violated = high.lt(low)
    for field in ("open", "close"):
        violated = violated | high.lt(df[field]) | low.gt(df[field])

    broken = df.loc[violated].copy()
    broken["anomaly_type"] = "ohlc_inconsistency"
    broken["detail"] = "OHLC relationship violated"

    log.info(f"[CHECK 3] OHLC inconsistencies: {len(broken)} found")
    for _, bar in broken.iterrows():
        log.warning(f"  {bar['date'].date()} O={bar['open']} H={bar['high']} L={bar['low']} C={bar['close']}")
    return broken[["date", "open", "high", "low", "close", "anomaly_type", "detail"]]

In [8]:
# Missing trading day gaps
def check_missing_dates(df: pd.DataFrame, max_gap_days=5) -> pd.DataFrame:
    """Flag gaps longer than ``max_gap_days`` between consecutive dates.

    Rows are ordered by ``date`` and each row is compared with the one before
    it; the gap is measured in whole calendar days. (A normal weekend is a
    3-day gap; holidays can create 4-day gaps.)

    Args:
        df: Price rows with a datetime ``date`` column.
        max_gap_days: Largest gap, in calendar days, that is not reported.

    Returns:
        One row per gap, dated at the end of the gap, with columns ``date``,
        ``gap_days``, ``anomaly_type`` and ``detail``.
    """
    df = df.sort_values("date").copy()
    df["prev_date"] = df["date"].shift(1)
    df["gap_days"] = (df["date"] - df["prev_date"]).dt.days

    gaps = df[df["gap_days"] > max_gap_days].copy()
    gaps["anomaly_type"] = "missing_date_gap"
    # gap_days is float because the first row has no predecessor (NaN), so the
    # message used to read "Gap of 7.0 calendar days". Flagged rows are never
    # NaN, so format them as whole days.
    gaps["detail"] = [f"Gap of {int(days)} calendar days" for days in gaps["gap_days"]]

    log.info(f"[CHECK 4] Date gaps > {max_gap_days} days: {len(gaps)} found")
    for day, note in zip(gaps["date"], gaps["detail"]):
        log.warning(f"  Gap ending {day.date()}: {note}")

    return gaps[["date", "gap_days", "anomaly_type", "detail"]]

In [9]:
# Bid-ask inversions 
def check_bid_ask_inversions(df: pd.DataFrame) -> pd.DataFrame:
    """Report crossed quotes (bid > ask) and overly wide bid-ask spreads.

    Spread width is only evaluated for quotes where both sides are positive
    and not crossed; it is ``(ask - bid) / mid`` and is reported when it
    exceeds ``MAX_BID_ASK_SPREAD_PCT``.

    Args:
        df: Option rows with ``expiration``, ``option_type``, ``strike``,
            ``bid`` and ``ask``.

    Returns:
        Crossed rows followed by wide-spread rows, re-indexed from zero, with
        columns ``expiration``, ``option_type``, ``strike``, ``bid``, ``ask``,
        ``anomaly_type`` and ``detail``.
    """
    quotes = df.copy()
    bid, ask = quotes["bid"], quotes["ask"]

    # Crossed market: the bid is strictly above the ask.
    crossed = quotes[bid > ask].copy()
    crossed["anomaly_type"] = "bid_ask_inversion"
    crossed["detail"] = [
        f"bid={b:.2f} > ask={a:.2f}" for b, a in zip(crossed["bid"], crossed["ask"])
    ]

    # Spread width, measured only on two-sided, non-crossed quotes.
    two_sided = quotes[(bid > 0) & (ask > 0) & (ask >= bid)].copy()
    two_sided["mid"] = (two_sided["bid"] + two_sided["ask"]) / 2
    two_sided["spread_pct"] = (
        (two_sided["ask"] - two_sided["bid"]) / two_sided["mid"].replace(0, np.nan)
    )
    too_wide = two_sided[two_sided["spread_pct"] > MAX_BID_ASK_SPREAD_PCT].copy()
    too_wide["anomaly_type"] = "wide_bid_ask_spread"
    too_wide["detail"] = too_wide["spread_pct"].map("Spread={:.1%}".format)

    combined = pd.concat([crossed, too_wide], ignore_index=True)
    log.info(
        f"[CHECK 5] Bid-ask inversions: {len(crossed)} | Wide spreads: {len(too_wide)}"
    )
    report_columns = ["expiration", "option_type", "strike", "bid", "ask", "anomaly_type", "detail"]
    return combined[report_columns]

In [10]:
# Implied volatility outliers
def check_iv_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Report implied vols that are non-positive or above ``MAX_IV_THRESHOLD``.

    Args:
        df: Option rows with ``expiration``, ``option_type``, ``strike`` and
            ``implied_vol``.

    Returns:
        Non-positive rows followed by extreme rows, re-indexed from zero, with
        columns ``expiration``, ``option_type``, ``strike``, ``implied_vol``,
        ``anomaly_type`` and ``detail``.
    """
    chain = df.copy()
    iv = chain["implied_vol"]

    non_positive = chain.loc[iv.le(0)].copy()
    non_positive["anomaly_type"] = "zero_implied_vol"
    non_positive["detail"] = "IV is zero or negative — likely missing data"

    extreme = chain.loc[iv.gt(MAX_IV_THRESHOLD)].copy()
    extreme["anomaly_type"] = "extreme_implied_vol"
    extreme["detail"] = extreme["implied_vol"].map("IV={:.1%}".format)

    combined = pd.concat([non_positive, extreme], ignore_index=True)
    log.info(f"[CHECK 6] IV outliers — zero: {len(non_positive)}, extreme: {len(extreme)}")
    report_columns = ["expiration", "option_type", "strike", "implied_vol", "anomaly_type", "detail"]
    return combined[report_columns]

In [11]:
# Sparse expirations 
def check_sparse_expirations(df: pd.DataFrame) -> pd.DataFrame:
    """Report (expiration, option_type) groups with too few rows.

    Rows are counted per ``expiration``/``option_type`` pair; pairs with fewer
    than ``MIN_OPTION_ROWS_PER_EXPIRY`` rows are logged and returned.

    Args:
        df: Option rows with ``expiration`` (datetime) and ``option_type``.

    Returns:
        One row per sparse pair with columns ``expiration``, ``option_type``,
        ``strike_count``, ``anomaly_type`` and ``detail``.
    """
    per_expiry = (
        df.groupby(["expiration", "option_type"])
        .size()
        .rename("strike_count")
        .reset_index()
    )
    is_thin = per_expiry["strike_count"] < MIN_OPTION_ROWS_PER_EXPIRY
    thin = per_expiry.loc[is_thin].copy()
    thin["anomaly_type"] = "sparse_expiration"
    thin["detail"] = thin["strike_count"].map("Only {} strikes loaded".format)

    log.info(f"[CHECK 7] Sparse expirations: {len(thin)} found")
    for expiry, side, note in zip(thin["expiration"], thin["option_type"], thin["detail"]):
        log.warning(
            f"  {expiry.date()} {side}: {note}"
        )
    return thin

In [12]:
# Data freshness 
def check_data_freshness(price_df: pd.DataFrame, max_stale_days=5):
    """Log whether the newest price date is within ``max_stale_days`` of today.

    Nothing is returned; the outcome is only written to the log (a warning
    when the frame is empty or the data is stale, an info line otherwise).

    Args:
        price_df: Price rows with a datetime ``date`` column.
        max_stale_days: Maximum age, in whole days, that still counts as fresh.
    """
    if price_df.empty:
        log.warning("[CHECK 8] No price data to check freshness.")
        return

    newest = price_df["date"].max()
    age_days = (pd.Timestamp.today() - newest).days

    # Handle the fresh case first so the stale warning is the fall-through.
    if not age_days > max_stale_days:
        log.info(
            f"[CHECK 8] Data freshness OK — latest date: {newest.date()} ({age_days} days ago)"
        )
        return

    log.warning(
        f"[CHECK 8] Data is STALE — latest date: {newest.date()}, "
        f"{age_days} days ago (threshold: {max_stale_days})"
    )

In [13]:
# Summary report
def print_summary(anomaly_dict: dict):
    """Print a per-check PASS / issue-count table and return the grand total.

    Args:
        anomaly_dict: Maps a check's display name to its findings; only
            ``len()`` of each value is used.

    Returns:
        The sum of the lengths of all values in ``anomaly_dict``.
    """
    heavy_rule = "=" * 60

    print("\n" + heavy_rule)
    print("DATA QUALITY REPORT SUMMARY")
    print(heavy_rule)

    grand_total = 0
    for label, findings in anomaly_dict.items():
        found = len(findings)
        grand_total += found
        if found:
            outcome = f"   {found} ISSUE(S)"
        else:
            outcome = "  PASS"
        print(f"  {label:<35} {outcome}")

    print("-" * 60)
    print(f"  {'TOTAL ANOMALIES':<35} {grand_total}")
    print(heavy_rule + "\n")
    return grand_total

In [14]:
# Main entry
def run_all_checks(engine=None):
    """Load both tables, run every check, print the summary and return it.

    Each check is skipped when its input frame is empty, in which case an
    empty DataFrame stands in for its findings. The freshness check is always
    called and reports through the log only.

    Args:
        engine: Passed to the loader functions. When ``None`` a SQLAlchemy
            engine for the SQLite file at ``DB_PATH`` is created.

    Returns:
        ``(anomalies, total)`` where ``anomalies`` maps each check's display
        name to its findings and ``total`` is the value returned by
        ``print_summary``.
    """
    if engine is None:
        engine = create_engine(f"sqlite:///{DB_PATH}", echo=False)

    log.info("VALIDATION PIPELINE START")

    price_df = load_price_data(engine)
    options_df = load_options_data(engine)

    def _findings(check, frame):
        # An empty input cannot be checked; report "nothing found" instead.
        return pd.DataFrame() if frame.empty else check(frame)

    # Calls stay in the original order so the log lines keep their sequence.
    anomalies = {}

    # Price checks
    anomalies["Price Spikes"] = _findings(check_price_spikes, price_df)
    anomalies["Zero/Negative Prices"] = _findings(check_zero_negative_prices, price_df)
    anomalies["OHLC Inconsistencies"] = _findings(check_ohlc_consistency, price_df)
    anomalies["Missing Date Gaps"] = _findings(check_missing_dates, price_df)
    check_data_freshness(price_df)

    # Options checks
    anomalies["Bid-Ask Issues"] = _findings(check_bid_ask_inversions, options_df)
    anomalies["IV Outliers"] = _findings(check_iv_outliers, options_df)
    anomalies["Sparse Expirations"] = _findings(check_sparse_expirations, options_df)

    total = print_summary(anomalies)
    return anomalies, total


# Run the whole validation pipeline with its default engine when this code is
# executed with __name__ set to "__main__".
if __name__ == "__main__":
    run_all_checks()

2026-02-17 22:59:32,885  INFO      VALIDATION PIPELINE START
2026-02-17 22:59:32,901  INFO      [CHECK 1] Price spikes (|z| > 4.0): 4 found
2026-02-17 22:59:32,906  INFO      [CHECK 2] Zero/negative prices: 0 found
2026-02-17 22:59:32,908  INFO      [CHECK 3] OHLC inconsistencies: 0 found
2026-02-17 22:59:32,911  INFO      [CHECK 4] Date gaps > 5 days: 0 found
2026-02-17 22:59:32,913  INFO      [CHECK 8] Data freshness OK — latest date: 2026-02-17 (0 days ago)
2026-02-17 22:59:32,917  INFO      [CHECK 5] Bid-ask inversions: 0 | Wide spreads: 47
2026-02-17 22:59:32,921  INFO      [CHECK 6] IV outliers — zero: 0, extreme: 2
2026-02-17 22:59:32,924  INFO      [CHECK 7] Sparse expirations: 0 found



DATA QUALITY REPORT SUMMARY
  Price Spikes                           4 ISSUE(S)
  Zero/Negative Prices                  PASS
  OHLC Inconsistencies                  PASS
  Missing Date Gaps                     PASS
  Bid-Ask Issues                         47 ISSUE(S)
  IV Outliers                            2 ISSUE(S)
  Sparse Expirations                    PASS
------------------------------------------------------------
  TOTAL ANOMALIES                     53

