# Database Check
Quick validation of database health after repopulation.

| Cell | Description |
|------|-------------|
| 1 | Setup - imports and database connection |
| 2 | Quick table overview with row counts |
| 3 | Data Completeness - newest/oldest 5 days + NULL flagging |
| 4 | Multi-Table Health Summary - freshness check |
| 5 | Consolidated Sanity Checks - duplicates, invalid values |
| 6 | Cleanup - close connection |

**Usage:** Change `check_symbol` in Cell 3 to test different tickers. Cell 4 reuses the same `check_symbol`, so re-run Cells 3 and 4 together after changing it.

In [1]:
# Cell 1: Setup - imports and database connection
import sys
from pathlib import Path

# Add darkpool root to path for darkpool_analysis imports.
# When __file__ is not defined (the usual case inside a notebook kernel) the
# parent of the current working directory is used instead.
darkpool_root = Path(__file__).parent.parent if '__file__' in globals() else Path.cwd().parent
darkpool_root_entry = str(darkpool_root)
# Keep the root at the front of sys.path, but do not stack a new duplicate
# entry every time this cell is re-run.
if not sys.path or sys.path[0] != darkpool_root_entry:
    sys.path.insert(0, darkpool_root_entry)

from darkpool_analysis.config import load_config
from darkpool_analysis.db import get_connection
import pandas as pd

# `config` and `conn` are shared by every later cell; Cell 6 closes `conn`.
config = load_config()
conn = get_connection(config.db_path)

In [2]:
# Cell 2: Quick Table Overview - all tables with row counts
from IPython.display import display

# List every table, exposing the `name` column of SHOW TABLES as `table_name`.
overview = conn.execute("""
    SELECT name as table_name
    FROM (SHOW TABLES)
""").df()


def _quote_identifier(identifier):
    """Return `identifier` wrapped in double quotes for use as a SQL identifier.

    Embedded double quotes are doubled, so table names that are reserved words
    or contain special characters cannot break the generated statement.
    """
    return '"' + str(identifier).replace('"', '""') + '"'


# Add row counts: one COUNT(*) query per table, in the order listed above.
overview['rows'] = [
    conn.execute(f"SELECT COUNT(*) as n FROM {_quote_identifier(tbl)}").df()['n'].iloc[0]
    for tbl in overview['table_name']
]
display(overview)

Unnamed: 0,table_name,rows
0,composite_signal,0
1,daily_metrics,2953
2,finra_otc_weekly_raw,508
3,finra_short_daily_raw,2953
4,index_constituent_short_agg_daily,0
5,lit_direction_daily,2953
6,options_premium_daily,149927
7,options_premium_summary,2867
8,polygon_daily_agg_raw,2953
9,polygon_equity_trades_raw,1790336


In [3]:
# Cell 3: Data Completeness Check - newest/oldest 5 days with NULL field flagging
#
# Shows the 5 most recent and 5 earliest daily_metrics rows for one ticker and
# reports which columns contain NULLs across those rows.

check_symbol = "META"  # Change to test different tickers (also read by Cell 4)

print(f"{'='*60}")
print(f"DATA COMPLETENESS CHECK FOR: {check_symbol}")
print(f"{'='*60}")

# Get newest 5 days
print("\nüìÖ NEWEST 5 DAYS (daily_metrics):")
newest = conn.execute("""
    SELECT * FROM daily_metrics
    WHERE symbol = ?
    ORDER BY date DESC
    LIMIT 5
""", [check_symbol]).df()
display(newest)

# Get oldest 5 days
print("\nüìÖ OLDEST 5 DAYS (daily_metrics):")
oldest = conn.execute("""
    SELECT * FROM daily_metrics
    WHERE symbol = ?
    ORDER BY date ASC
    LIMIT 5
""", [check_symbol]).df()
display(oldest)

# Combine for NULL analysis. When the ticker has fewer than 10 rows the two
# slices overlap, so identical rows are collapsed to avoid counting them twice.
combined = pd.concat([newest, oldest], ignore_index=True).drop_duplicates(ignore_index=True)

# Count NULLs per column
print("\nüîç NULL FIELD ANALYSIS (across newest + oldest 10 rows):")
total_rows = len(combined)

if total_rows == 0:
    # No rows at all: percentages would be 0/0 and an "all populated" verdict
    # would be misleading, so report the missing data explicitly instead.
    print(f"\nALERT: no daily_metrics rows found for {check_symbol}; NULL analysis skipped")
else:
    null_counts = combined.isnull().sum()

    # Create summary dataframe (one row per daily_metrics column)
    null_summary = pd.DataFrame({
        'column': null_counts.index,
        'null_count': null_counts.values,
        'total_rows': total_rows,
        'null_pct': (null_counts.values / total_rows * 100).round(1),
        'status': ['‚ö†Ô∏è MISSING' if n > 0 else '‚úÖ OK' for n in null_counts.values]
    })

    # Show only columns with issues first, then OK columns
    null_summary_sorted = null_summary.sort_values(['null_count', 'column'], ascending=[False, True])
    display(null_summary_sorted)

    # Summary flag: list every column with at least one NULL, in table order
    missing = null_summary[null_summary['null_count'] > 0]
    missing_cols = missing['column'].tolist()
    if missing_cols:
        print(f"\n‚ö†Ô∏è  ALERT: {len(missing_cols)} columns have NULL values:")
        for col, pct in zip(missing['column'], missing['null_pct']):
            print(f"   - {col}: {pct}% null")
    else:
        print("\n‚úÖ ALL FIELDS POPULATED - No NULL values detected")

DATA COMPLETENESS CHECK FOR: META

üìÖ NEWEST 5 DAYS (daily_metrics):


Unnamed: 0,date,symbol,log_buy_sell,short_volume,short_exempt_volume,short_total_volume,short_sell_volume,short_ratio,short_ratio_z,short_buy_sell_ratio,...,accumulation_score_display,confidence,data_quality,has_otc,has_short,has_lit,has_price,pressure_context_label,inference_version,accumulation_short_z_source
0,2026-01-05,META,0.157048,1923528.0,30835.0,4827906.0,2904378.0,0.398419,-0.285768,0.662286,...,46.892168,0.7,PRE_OTC,True,True,True,True,Neutral,PhaseA_v1,short_buy_sell_ratio_z
1,2026-01-02,META,0.223324,2261841.0,17448.0,5486100.0,3224259.0,0.412286,-0.184593,0.701507,...,45.594187,0.7,PRE_OTC,True,True,True,True,Neutral,PhaseA_v1,short_buy_sell_ratio_z
2,2025-12-31,META,0.618645,1273796.0,10784.0,3126429.0,1852633.0,0.407428,-0.224176,0.68756,...,52.033115,0.7,PRE_OTC,True,True,True,True,Neutral,PhaseA_v1,short_buy_sell_ratio_z
3,2025-12-30,META,0.288783,1313715.0,8587.0,3974561.0,2660846.0,0.330531,-1.019352,0.493721,...,44.916122,0.7,PRE_OTC,True,True,True,True,Neutral,PhaseA_v1,short_buy_sell_ratio_z
4,2025-12-29,META,0.663229,1274723.0,6196.0,3534482.0,2259759.0,0.360653,-0.758616,0.564097,...,49.238457,0.7,PRE_OTC,True,True,True,True,Neutral,PhaseA_v1,short_buy_sell_ratio_z



üìÖ OLDEST 5 DAYS (daily_metrics):


Unnamed: 0,date,symbol,log_buy_sell,short_volume,short_exempt_volume,short_total_volume,short_sell_volume,short_ratio,short_ratio_z,short_buy_sell_ratio,...,accumulation_score_display,confidence,data_quality,has_otc,has_short,has_lit,has_price,pressure_context_label,inference_version,accumulation_short_z_source
0,2025-10-08,META,-0.307582,1514725.0,3651.0,4157006.0,2642281.0,0.364379,,0.573264,...,,1.0,OTC_ANCHORED,True,True,True,True,Neutral,PhaseA_v1,short_buy_sell_ratio_z
1,2025-10-09,META,0.46178,1811221.0,8189.0,5302166.0,3490945.0,0.3416,,0.518834,...,,1.0,OTC_ANCHORED,True,True,True,True,Neutral,PhaseA_v1,short_buy_sell_ratio_z
2,2025-10-10,META,-0.262213,2514237.0,7281.0,6557845.0,4043608.0,0.383394,,0.621781,...,,1.0,OTC_ANCHORED,True,True,True,True,Neutral,PhaseA_v1,short_buy_sell_ratio_z
3,2025-10-13,META,-0.572953,1360923.0,11561.0,4054471.0,2693548.0,0.33566,,0.505253,...,,1.0,OTC_ANCHORED,True,True,True,True,Neutral,PhaseA_v1,short_buy_sell_ratio_z
4,2025-10-14,META,0.452854,1248854.0,3788.0,3693124.0,2444270.0,0.338157,,0.510931,...,,1.0,OTC_ANCHORED,True,True,True,True,Neutral,PhaseA_v1,short_buy_sell_ratio_z



üîç NULL FIELD ANALYSIS (across newest + oldest 10 rows):


Unnamed: 0,column,null_count,total_rows,null_pct,status
45,accumulation_score,5,10,50.0,‚ö†Ô∏è MISSING
46,accumulation_score_display,5,10,50.0,‚ö†Ô∏è MISSING
15,finra_buy_volume_z,5,10,50.0,‚ö†Ô∏è MISSING
23,lit_buy_ratio_z,5,10,50.0,‚ö†Ô∏è MISSING
25,lit_flow_imbalance_z,5,10,50.0,‚ö†Ô∏è MISSING
39,otc_buy_ratio_z,5,10,50.0,‚ö†Ô∏è MISSING
43,otc_participation_z,5,10,50.0,‚ö†Ô∏è MISSING
32,return_z,5,10,50.0,‚ö†Ô∏è MISSING
10,short_buy_sell_ratio_z,5,10,50.0,‚ö†Ô∏è MISSING
8,short_ratio_z,5,10,50.0,‚ö†Ô∏è MISSING



‚ö†Ô∏è  ALERT: 13 columns have NULL values:
   - short_ratio_z: 50.0% null
   - short_buy_sell_ratio_z: 50.0% null
   - vw_flow_z: 50.0% null
   - finra_buy_volume_z: 50.0% null
   - lit_buy_ratio_z: 50.0% null
   - lit_flow_imbalance_z: 50.0% null
   - return_1d: 20.0% null
   - return_z: 50.0% null
   - otc_buy_ratio_z: 50.0% null
   - otc_participation_z: 50.0% null
   - otc_participation_delta: 30.0% null
   - accumulation_score: 50.0% null
   - accumulation_score_display: 50.0% null


In [4]:
# Cell 4: Multi-Table Data Health Summary - check all key tables for sample ticker
#
# For each table below, reports the row count and the oldest/newest date for
# `check_symbol` (set in Cell 3) and labels the table FRESH, STALE or NO DATA.
print(f"{'='*60}")
print(f"MULTI-TABLE HEALTH CHECK FOR: {check_symbol}")
print(f"{'='*60}")

# The newest row must be at most this many days old for a table to be "fresh".
FRESHNESS_WINDOW_DAYS = 7

# Define tables and their date columns: (table, date column, symbol column)
table_checks = [
    ("daily_metrics", "date", "symbol"),
    ("lit_direction_daily", "date", "symbol"),
    ("finra_short_daily_raw", "trade_date", "symbol"),
    ("polygon_daily_agg_raw", "trade_date", "symbol"),
]

health_rows = []
freshness_cutoff = pd.Timestamp.now() - pd.Timedelta(days=FRESHNESS_WINDOW_DAYS)

for table, date_col, sym_col in table_checks:
    try:
        # Table and column names come from the hard-coded list above; only the
        # symbol is bound as a query parameter.
        result = conn.execute(f"""
            SELECT
                COUNT(*) as row_count,
                MIN({date_col}) as oldest_date,
                MAX({date_col}) as newest_date
            FROM {table}
            WHERE {sym_col} = ?
        """, [check_symbol]).df()

        # Distinct names so the `oldest`/`newest` DataFrames from Cell 3 are
        # not overwritten with scalars.
        row_count = result['row_count'].iloc[0]
        oldest_date = result['oldest_date'].iloc[0]
        newest_date = result['newest_date'].iloc[0]

        # MIN/MAX over zero rows come back as missing values (None/NaT/NaN).
        # A plain truthiness test does not catch NaT or NaN, so test for
        # missing explicitly before comparing against the cutoff.
        if row_count == 0 or pd.isna(newest_date):
            freshness = "‚ùå NO DATA"
        elif pd.Timestamp(newest_date) >= freshness_cutoff:
            freshness = "‚úÖ FRESH"
        else:
            freshness = "‚ö†Ô∏è STALE"

        health_rows.append({
            'table': table,
            'rows': row_count,
            'oldest': oldest_date,
            'newest': newest_date,
            'freshness': freshness
        })
    except Exception as e:
        # Best effort: a failing table is reported in the summary (with a
        # truncated error message) rather than aborting the remaining checks.
        health_rows.append({
            'table': table,
            'rows': 0,
            'oldest': None,
            'newest': None,
            'freshness': f"‚ùå ERROR: {str(e)[:30]}"
        })

health_df = pd.DataFrame(health_rows)
display(health_df)

# Overall verdict: anything not labelled FRESH is listed for follow-up
stale_tables = health_df[health_df['freshness'].str.contains('STALE|ERROR|NO DATA', na=False)]['table'].tolist()
if stale_tables:
    print(f"\n‚ö†Ô∏è  ATTENTION: {len(stale_tables)} table(s) may need refresh:")
    for t in stale_tables:
        print(f"   - {t}")
else:
    print("\n‚úÖ ALL TABLES HAVE FRESH DATA")

MULTI-TABLE HEALTH CHECK FOR: META


Unnamed: 0,table,rows,oldest,newest,freshness
0,daily_metrics,61,2025-10-08,2026-01-05,‚úÖ FRESH
1,lit_direction_daily,61,2025-10-08,2026-01-05,‚úÖ FRESH
2,finra_short_daily_raw,61,2025-10-08,2026-01-05,‚úÖ FRESH
3,polygon_daily_agg_raw,61,2025-10-08,2026-01-05,‚úÖ FRESH



‚úÖ ALL TABLES HAVE FRESH DATA


In [5]:
# Cell 5: Consolidated Sanity Checks - duplicates and invalid values
banner = "=" * 60
print(f"{banner}")
print("SANITY CHECKS")
print(f"{banner}")


def _run_sanity_check(title, sql, ok_message, problem_message):
    """Print `title`, run `sql` on `conn`, and report whether it returned rows.

    An empty result prints `ok_message`. Otherwise `problem_message` is printed
    (with `{n}` replaced by the number of returned rows, if present) and the
    offending rows are displayed. The result DataFrame is returned.
    """
    print(title)
    offenders = conn.execute(sql).df()
    if len(offenders) > 0:
        print(problem_message.format(n=len(offenders)))
        display(offenders)
    else:
        print(ok_message)
    return offenders


# Check 1: Duplicate rows in daily_metrics
dupes_dm = _run_sanity_check(
    "\nüîç Duplicate rows in daily_metrics:",
    """
    SELECT symbol, date, COUNT(*) AS n
    FROM daily_metrics
    GROUP BY symbol, date
    HAVING n > 1
""",
    "   ‚úÖ No duplicates found",
    "   ‚ö†Ô∏è {n} duplicate(s) found:",
)

# Check 2: Duplicate rows in lit_direction_daily
dupes_lit = _run_sanity_check(
    "\nüîç Duplicate rows in lit_direction_daily:",
    """
    SELECT symbol, date, COUNT(*) AS n
    FROM lit_direction_daily
    GROUP BY symbol, date
    HAVING n > 1
""",
    "   ‚úÖ No duplicates found",
    "   ‚ö†Ô∏è {n} duplicate(s) found:",
)

# Check 3: Invalid short ratios (outside 0-1 range); at most 10 rows are shown
bad_ratios = _run_sanity_check(
    "\nüîç Invalid short ratios (outside 0-1):",
    """
    SELECT symbol, date, short_ratio
    FROM daily_metrics
    WHERE short_ratio IS NOT NULL AND (short_ratio < 0 OR short_ratio > 1)
    LIMIT 10
""",
    "   ‚úÖ All short ratios valid",
    "   ‚ö†Ô∏è Invalid ratios found:",
)

# Check 4: Zero/negative lit volumes with non-null ratios; at most 10 rows are shown
bad_lit = _run_sanity_check(
    "\nüîç Invalid lit volumes (zero/negative with computed ratio):",
    """
    SELECT symbol, date, lit_buy_volume, lit_sell_volume
    FROM lit_direction_daily
    WHERE (lit_buy_volume <= 0 OR lit_sell_volume <= 0)
      AND log_buy_sell IS NOT NULL
    LIMIT 10
""",
    "   ‚úÖ All lit volumes valid",
    "   ‚ö†Ô∏è Invalid volumes found:",
)

print("\n" + banner)

SANITY CHECKS

üîç Duplicate rows in daily_metrics:
   ‚úÖ No duplicates found

üîç Duplicate rows in lit_direction_daily:
   ‚úÖ No duplicates found

üîç Invalid short ratios (outside 0-1):
   ‚úÖ All short ratios valid

üîç Invalid lit volumes (zero/negative with computed ratio):
   ‚úÖ All lit volumes valid



In [6]:
# Cell 6: Cleanup - release the database connection opened in Cell 1.
# Earlier cells cannot be re-run after this until Cell 1 reconnects.
closed_message = "Database connection closed."
conn.close()
print(closed_message)

Database connection closed.
