# Database Check
Validate raw + derived tables and provenance for the FINRA + Polygon pipeline.


In [None]:
from darkpool_analysis.config import load_config
from darkpool_analysis.db import get_connection
import pandas as pd

# Load the pipeline configuration and open a database connection at the path
# it specifies; `conn` is reused by every query cell in this notebook.
config = load_config()
conn = get_connection(config.db_path)


In [None]:
# Collect the "name" column of SHOW TABLES as a plain list; the bare `tables`
# expression on the last line displays it as the cell output.
tables = conn.execute("SHOW TABLES").df()["name"].tolist()
tables


In [None]:
from IPython.display import display

# For every table: print its name, then show its schema and its row count.
for table in tables:
    # Table names cannot be bound as query parameters, so they are
    # interpolated. Double-quote the identifier (doubling any embedded quote)
    # so names that are reserved words or contain special characters still
    # parse as a single identifier.
    quoted_table = '"' + table.replace('"', '""') + '"'
    print(table)
    display(conn.execute(f"DESCRIBE {quoted_table}").df())
    display(conn.execute(f"SELECT COUNT(*) AS n FROM {quoted_table}").df())


In [None]:
# Ticker used by the sample queries, plus the earliest and latest configured
# target dates that bound their date filters.
sample_symbol = "AMZN"
start_date, end_date = min(config.target_dates), max(config.target_dates)
(sample_symbol, start_date, end_date)


## Raw Table Samples


In [None]:
# Five most recent weekly rows (by week_start_date) for the sample symbol.
# Unlike the daily samples, this query is not restricted to the
# start_date/end_date window.
conn.execute("""
SELECT * FROM finra_otc_weekly_raw
WHERE symbol = ?
ORDER BY week_start_date DESC
LIMIT 5
""", [sample_symbol]).df()


In [None]:
# Up to five rows for the sample symbol whose trade_date falls inside the
# start_date..end_date window, newest first.
conn.execute("""
SELECT * FROM finra_short_daily_raw
WHERE symbol = ? AND trade_date BETWEEN ? AND ?
ORDER BY trade_date DESC
LIMIT 5
""", [sample_symbol, start_date, end_date]).df()


In [None]:
# Up to five rows for the sample symbol whose trade_date falls inside the
# start_date..end_date window, newest first.
conn.execute("""
SELECT * FROM polygon_daily_agg_raw
WHERE symbol = ? AND trade_date BETWEEN ? AND ?
ORDER BY trade_date DESC
LIMIT 5
""", [sample_symbol, start_date, end_date]).df()


In [None]:
# Up to five most recent trade rows for the sample symbol. The timestamp is
# cast to DATE so the window filter compares calendar days rather than
# full timestamps.
conn.execute("""
SELECT * FROM polygon_equity_trades_raw
WHERE symbol = ? AND timestamp::DATE BETWEEN ? AND ?
ORDER BY timestamp DESC
LIMIT 5
""", [sample_symbol, start_date, end_date]).df()


## Derived Table Samples


In [None]:
# Up to five rows for the sample symbol whose date falls inside the
# start_date..end_date window, newest first.
conn.execute("""
SELECT * FROM lit_direction_daily
WHERE symbol = ? AND date BETWEEN ? AND ?
ORDER BY date DESC
LIMIT 5
""", [sample_symbol, start_date, end_date]).df()


In [None]:
# Up to ten rows for the sample symbol whose date falls inside the
# start_date..end_date window, newest first.
conn.execute("""
SELECT * FROM daily_metrics
WHERE symbol = ? AND date BETWEEN ? AND ?
ORDER BY date DESC
LIMIT 10
""", [sample_symbol, start_date, end_date]).df()


In [None]:
# Five rows with the latest trade_date values, showing the coverage columns
# only. No symbol or date filter is applied, so rows for any index_symbol
# may appear.
conn.execute("""
SELECT index_symbol, trade_date, coverage_count, expected_constituent_count, coverage_pct
FROM index_constituent_short_agg_daily
ORDER BY trade_date DESC
LIMIT 5
""").df()


## Coverage and Provenance Checks


In [None]:
# Count daily_metrics rows per symbol inside the start_date..end_date window,
# listed alphabetically by symbol.
conn.execute("""
SELECT symbol, COUNT(*) AS rows
FROM daily_metrics
WHERE date BETWEEN ? AND ?
GROUP BY symbol
ORDER BY symbol
""", [start_date, end_date]).df()


In [None]:
# For the sample symbol over the window, show each day's
# otc_off_exchange_volume together with its otc_week_used and data_quality
# values, newest first. No LIMIT: every matching day is returned.
conn.execute("""
SELECT date, symbol, otc_off_exchange_volume, otc_week_used, data_quality
FROM daily_metrics
WHERE symbol = ? AND date BETWEEN ? AND ?
ORDER BY date DESC
""", [sample_symbol, start_date, end_date]).df()


## Sanity Checks


In [None]:
# Sanity check: list rows where either lit volume is non-positive yet
# log_buy_sell is still populated (presumably it should be NULL in that
# case -- confirm against the pipeline). An empty result means no such rows.
# ORDER BY keeps the LIMIT-ed sample reproducible between runs; without it
# the ten rows returned are arbitrary.
conn.execute("""
SELECT symbol, date, lit_buy_volume, lit_sell_volume, log_buy_sell
FROM lit_direction_daily
WHERE (lit_buy_volume <= 0 OR lit_sell_volume <= 0)
  AND log_buy_sell IS NOT NULL
ORDER BY symbol, date
LIMIT 10
""").df()


In [None]:
# Sanity check: list rows that record a short_ratio_denominator_type but whose
# short_ratio lies outside the [0, 1] range. An empty result means no such
# rows. ORDER BY keeps the LIMIT-ed sample reproducible between runs; without
# it the ten rows returned are arbitrary.
conn.execute("""
SELECT symbol, date, short_ratio, short_ratio_denominator_type
FROM daily_metrics
WHERE short_ratio_denominator_type IS NOT NULL
  AND (short_ratio < 0 OR short_ratio > 1)
ORDER BY symbol, date
LIMIT 10
""").df()


In [None]:
# Uniqueness check: return every (symbol, date) pair that occurs more than
# once in daily_metrics. An empty result means each pair appears at most once.
conn.execute("""
SELECT symbol, date, COUNT(*) AS n
FROM daily_metrics
GROUP BY symbol, date
HAVING n > 1
""").df()


In [None]:
# Uniqueness check: return every (symbol, date) pair that occurs more than
# once in lit_direction_daily. An empty result means each pair appears at
# most once.
conn.execute("""
SELECT symbol, date, COUNT(*) AS n
FROM lit_direction_daily
GROUP BY symbol, date
HAVING n > 1
""").df()


In [None]:
# Close the database connection opened at the top of the notebook.
conn.close()
