# 02 — Data Quality Quick Viz
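Scans the Parquet files under `data/prices_1d` (falling back to `data/prices_1h` when the daily folder is missing) and summarizes, per symbol: freshness, coverage, NaN counts, duplicate timestamps, weekend bars, and ADR (average daily range) stats.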

In [None]:
from pathlib import Path

# Machine-specific project location; every other path below is derived from it.
# NOTE(review): this is an absolute local path - edit it when running elsewhere.
PROJECT_OVERRIDE = r"C:\Users\speed\Desktop\Forex CFD's system"
ROOT = Path(PROJECT_OVERRIDE)
CFG = ROOT / "config" / "baseline.yaml"
DATA_1D, DATA_1H = (ROOT / "data" / name for name in ("prices_1d", "prices_1h"))

# Daily bars take priority; the hourly folder is used only when the daily
# folder does not exist.
if DATA_1D.exists():
    DATA_DIR = DATA_1D
else:
    DATA_DIR = DATA_1H

print("Project root:", ROOT)
print("Using data dir:", DATA_DIR)

# Fail fast when the expected project layout is missing.
assert DATA_DIR.exists(), f"Expected data under {ROOT/'data'} (prices_1d or prices_1h)"
assert CFG.exists(), f"Missing baseline config at {CFG}"


Project root: C:\Users\speed\Desktop\Forex CFD's system
Using data dir: C:\Users\speed\Desktop\Forex CFD's system\data\prices_1d


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Collect every Parquet file in the selected data directory. Each file is
# reported as one symbol (the upper-cased file stem).
files = sorted(Path(DATA_DIR).glob('*.parquet'))
assert files, f'No parquet files in {DATA_DIR}'

# Timeframe label is the suffix of the directory name ("prices_1d" -> "1d").
# It is identical for every file, so compute it once instead of per iteration.
timeframe = DATA_DIR.name.split('_')[-1]

rows = []
for p in files:
    df = pd.read_parquet(p)

    # Ensure a DatetimeIndex, falling back to a 'Date' column when the stored
    # index is something else.
    if not isinstance(df.index, pd.DatetimeIndex):
        if 'Date' in df.columns:
            # errors='coerce' turns unparseable dates into NaT instead of raising.
            df = df.set_index(pd.to_datetime(df['Date'], utc=True, errors='coerce'))
        else:
            raise ValueError(f"{p} has no DatetimeIndex or Date column")

    # Normalise to UTC. Naive timestamps are labelled as UTC; tz-aware
    # timestamps in any other zone are converted, so that the reported
    # start_utc / end_utc values and the weekend check really are in UTC.
    if df.index.tz is None:
        df.index = df.index.tz_localize('UTC')
    else:
        df.index = df.index.tz_convert('UTC')
    df = df.sort_index()

    # Number of rows whose timestamp repeats an earlier row's timestamp.
    dup_ts = int(df.index.duplicated().sum())

    # Share of rows stamped on Saturday (5) or Sunday (6).
    weekends = (df.index.weekday >= 5)
    weekend_pct = float(100 * weekends.mean())

    # NaN count per price/volume column, limited to columns that exist.
    nans = {c: int(df[c].isna().sum()) for c in ['Open', 'High', 'Low', 'Close', 'Volume'] if c in df.columns}

    # Median daily range. The range of a calendar day is its highest High
    # minus its lowest Low; with one bar per day this equals that bar's
    # High-Low, and with intraday bars it spans the whole day rather than
    # averaging the individual bar ranges. Days without bars yield NaN and
    # are skipped by median().
    if {'High', 'Low'}.issubset(df.columns):
        daily_high = df['High'].resample('D').max()
        daily_low = df['Low'].resample('D').min()
        adr = (daily_high - daily_low).abs().median()
    else:
        adr = np.nan

    rows.append({
        'symbol': p.stem.upper(),
        'tf': timeframe,
        'bars': len(df),
        'start_utc': str(df.index.min()),
        'end_utc': str(df.index.max()),
        'dup_ts': dup_ts,
        'weekend_pct': weekend_pct,
        'adr_median': adr,
        **{f'nan_{k}': v for k, v in nans.items()}
    })

# One row per file, ordered by symbol.
table = pd.DataFrame(rows).sort_values('symbol').reset_index(drop=True)

# Optional rich display: used only when the helper package is installed.
# A missing package is expected and ignored; any other failure is reported
# (but not raised) so the summary below is still shown.
try:
    from caas_jupyter_tools import display_dataframe_to_user
    display_dataframe_to_user('Data quality summary', table)
except ImportError:
    pass
except Exception as exc:
    print(f"display_dataframe_to_user failed: {exc!r}")
table.head()


Unnamed: 0,symbol,tf,bars,start_utc,end_utc,dup_ts,weekend_pct,adr_median,nan_Open,nan_High,nan_Low,nan_Close,nan_Volume
0,EURUSD,1d,312,2022-01-02 00:00:00+00:00,2022-12-30 00:00:00+00:00,0,16.666667,0.008545,0,0,0,0,0
1,GBPUSD,1d,1563,2020-01-01 00:00:00+00:00,2024-12-31 00:00:00+00:00,0,16.570697,0.00838,0,0,0,0,0
2,USDJPY,1d,1564,2020-01-01 00:00:00+00:00,2024-12-31 00:00:00+00:00,0,16.624041,0.7125,0,0,0,0,0
3,XAUUSD,1d,1554,2020-01-01 00:00:00+00:00,2024-12-31 00:00:00+00:00,0,16.537967,21.22,0,0,0,0,0


In [None]:
import plotly.express as px
import pandas as pd

# Parse the end-of-coverage strings into timestamps on the summary table.
table['end_dt'] = pd.to_datetime(table['end_utc'])

# One bar chart per summary metric, each with symbols on the x-axis.
for metric, chart_title in (
    ('bars', 'Bars per symbol'),
    ('weekend_pct', 'Weekend bars (% of rows)'),
    ('adr_median', 'Median ADR (abs(High-Low))'),
):
    px.bar(table, x='symbol', y=metric, title=chart_title).show()
