In [25]:
from __future__ import annotations  # no installation needed

from pathlib import Path  # no installation needed
import os  # no installation needed
import sys  # no installation needed
import subprocess  # no installation needed
import json  # no installation needed
import pandas as pd  # already in env â€” no new install

# ---- paths
DATA_ROOT = Path(r"C:\Users\quantbase\Desktop\marketdata")
REPO_ROOT = Path(r"C:\Users\quantbase\Desktop\sydata")
MANIFEST  = DATA_ROOT / "meta" / "symbols.yml"
SRC       = REPO_ROOT / "src"

# make project importable + stable relative paths
sys.path.insert(0, str(SRC))
os.chdir(str(REPO_ROOT))

PY = sys.executable

# ---- run spec
BASKET = "core_major"
INTERVAL = "15m"

START_DAY = "2025-01-01"      # end-exclusive window below
END_DAY_EXCL = "2026-01-01"

# join target for inspection (single symbol first, then basket loop)
SYMBOL_EXAMPLE = "BTC-USDT"


In [26]:
# subprocess runner

def run_cmd(args: list[str]) -> tuple[int, str, str]:
    p = subprocess.run(args, capture_output=True, text=True)
    return p.returncode, p.stdout, p.stderr

def run_script(script: str, extra: list[str]) -> tuple[int, str, str]:
    args = [PY, script] + extra
    return run_cmd(args)


In [27]:
# 1. ingest data for `t` timeframe


# 1) spot klines (15m)
rc, out, err = run_script(
    "scripts/binance_fetch_raw_klines.py",
    [
        "--data-root", str(DATA_ROOT),
        "--manifest",  str(MANIFEST),
        "--basket",    "core_major",
        "--interval",  INTERVAL,          # "15m"
        "--start",     START_DAY,
        "--end",       END_DAY_EXCL,
    ],
)
(rc, out, err)


(0,
 '{\n  "ok": 7,\n  "out_paths": [\n    "C:\\\\Users\\\\quantbase\\\\Desktop\\\\marketdata\\\\raw\\\\binance\\\\klines\\\\symbol=BTC-USDT\\\\interval=15m\\\\part-1735689600000-1767225600000.parquet",\n    "C:\\\\Users\\\\quantbase\\\\Desktop\\\\marketdata\\\\raw\\\\binance\\\\klines\\\\symbol=ETH-USDT\\\\interval=15m\\\\part-1735689600000-1767225600000.parquet",\n    "C:\\\\Users\\\\quantbase\\\\Desktop\\\\marketdata\\\\raw\\\\binance\\\\klines\\\\symbol=SOL-USDT\\\\interval=15m\\\\part-1735689600000-1767225600000.parquet",\n    "C:\\\\Users\\\\quantbase\\\\Desktop\\\\marketdata\\\\raw\\\\binance\\\\klines\\\\symbol=BNB-USDT\\\\interval=15m\\\\part-1735689600000-1767225600000.parquet",\n    "C:\\\\Users\\\\quantbase\\\\Desktop\\\\marketdata\\\\raw\\\\binance\\\\klines\\\\symbol=XRP-USDT\\\\interval=15m\\\\part-1735689600000-1767225600000.parquet",\n    "C:\\\\Users\\\\quantbase\\\\Desktop\\\\marketdata\\\\raw\\\\binance\\\\klines\\\\symbol=ADA-USDT\\\\interval=15m\\\\part-1735689600

In [28]:
# 2) futures index + mark klines (15m)
rc, out, err = run_script(
    "scripts/ingest_um_mark_price_klines.py",
    [
        "--data-root", str(DATA_ROOT),
        "--manifest",  str(MANIFEST),
        "--basket",    BASKET,
        "--intervals", INTERVAL,
        "--start",     START_DAY,
        "--end",       END_DAY_EXCL,
    ],
)
(rc, out, err)

rc, out, err = run_script(
    "scripts/ingest_um_index_price_klines.py",
    [
        "--data-root", str(DATA_ROOT),
        "--manifest",  str(MANIFEST),
        "--basket",    BASKET,
        "--intervals", INTERVAL,
        "--start",     START_DAY,
        "--end",       END_DAY_EXCL,
    ],
)
(rc, out, err)

# 3) premium index klines (15m)
rc, out, err = run_script(
    "scripts/ingest_um_premium_index_klines.py",
    [
        "--data-root", str(DATA_ROOT),
        "--manifest",  str(MANIFEST),
        "--basket",    BASKET,
        "--intervals", INTERVAL,
        "--start",     START_DAY,
        "--end",       END_DAY_EXCL,
    ],
)
(rc, out, err)

# 4) funding (8h cadence; join later to 15m grid)
rc, out, err = run_script(
    "scripts/ingest_um_funding_rate.py",
    [
        "--data-root", str(DATA_ROOT),
        "--manifest",  str(MANIFEST),
        "--basket",    BASKET,
        "--start",     START_DAY,
        "--end",       END_DAY_EXCL,
    ],
)
(rc, out, err)

(0,
 '{\n  "ok": 0,\n  "missing_archive_file": 7,\n  "already_exists": 84,\n  "total": 91\n}\n',
 '')

In [29]:
# 2. file loaders (validate)


def load_spot_interval(symbol: str, interval: str) -> pd.DataFrame:
    p = DATA_ROOT / "raw" / "binance" / "klines" / f"symbol={symbol}" / f"interval={interval}"
    files = sorted(p.glob("part-*.parquet"))
    df = pd.concat((pd.read_parquet(f) for f in files), ignore_index=True) if files else pd.DataFrame()
    if len(df):
        df["ts"] = pd.to_datetime(df["open_time"], unit="ms", utc=True)
    return df

def load_monthly_parts(root: Path, year: int) -> pd.DataFrame:
    files = sorted(root.glob(f"part-{year}-*.parquet"))
    return pd.concat((pd.read_parquet(f) for f in files), ignore_index=True) if files else pd.DataFrame()

def load_range_monthly(dataset_root: Path, start_year: int, end_year: int) -> pd.DataFrame:
    dfs = []
    for y in range(start_year, end_year + 1):
        d = load_monthly_parts(dataset_root, y)
        if len(d):
            dfs.append(d)
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()


In [30]:
# 3. load 15m data

spot_df = load_spot_interval(SYMBOL_EXAMPLE, INTERVAL)

mark_root  = DATA_ROOT / "raw" / "binance" / "um_mark_price_klines"  / f"symbol={SYMBOL_EXAMPLE}" / f"interval={INTERVAL}"
index_root = DATA_ROOT / "raw" / "binance" / "um_index_price_klines" / f"symbol={SYMBOL_EXAMPLE}" / f"interval={INTERVAL}"
prem_root  = DATA_ROOT / "raw" / "binance" / "um_premium_index_klines" / f"symbol={SYMBOL_EXAMPLE}" / f"interval={INTERVAL}"
fund_root  = DATA_ROOT / "raw" / "binance" / "um_funding_rate" / f"symbol={SYMBOL_EXAMPLE}"

mark_df  = load_range_monthly(mark_root,  2025, 2025)
index_df = load_range_monthly(index_root, 2025, 2025)
prem_df  = load_range_monthly(prem_root,  2025, 2025)
fund_df  = load_range_monthly(fund_root,  2025, 2025)

# ensure ts exists for monthly datasets
for df in (mark_df, index_df):
    if len(df) and "ts" not in df.columns:
        df["ts"] = pd.to_datetime(df["open_time"], unit="ms", utc=True)

# prem already has ts in your schema; enforce type
if len(prem_df):
    prem_df["ts"] = pd.to_datetime(prem_df["ts"], utc=True)

# funding ts already exists; enforce type
if len(fund_df):
    fund_df["ts"] = pd.to_datetime(fund_df["ts"], utc=True)

(len(spot_df), len(mark_df), len(index_df), len(prem_df), len(fund_df))


(35041, 35040, 35040, 35040, 1095)

In [31]:
# 4. sanity


STEP_MS = 15 * 60 * 1000

def timegrid_report(df: pd.DataFrame, key: str = "open_time", step_ms: int = STEP_MS) -> dict:
    if df is None or len(df) == 0:
        return {"rows": 0}
    s = df.sort_values(key)[key].astype("int64").to_numpy()
    diffs = s[1:] - s[:-1]
    bad = int((diffs != step_ms).sum())
    return {
        "rows": int(len(df)),
        "key_is_unique": bool(df[key].is_unique),
        "monotonic": bool(pd.Series(s).is_monotonic_increasing),
        "step_ok": bool(bad == 0),
        "bad_steps": bad,
        "min": int(s.min()),
        "max": int(s.max()),
    }

{
    "spot":  timegrid_report(spot_df),
    "mark":  timegrid_report(mark_df),
    "index": timegrid_report(index_df),
    "prem":  timegrid_report(prem_df),
    "fund":  {"rows": int(len(fund_df)), "min_ts": str(fund_df["ts"].min()) if len(fund_df) else None},
}


{'spot': {'rows': 35041,
  'key_is_unique': True,
  'monotonic': True,
  'step_ok': True,
  'bad_steps': 0,
  'min': 1735689600000,
  'max': 1767225600000},
 'mark': {'rows': 35040,
  'key_is_unique': True,
  'monotonic': True,
  'step_ok': True,
  'bad_steps': 0,
  'min': 1735689600000,
  'max': 1767224700000},
 'index': {'rows': 35040,
  'key_is_unique': True,
  'monotonic': True,
  'step_ok': True,
  'bad_steps': 0,
  'min': 1735689600000,
  'max': 1767224700000},
 'prem': {'rows': 35040,
  'key_is_unique': True,
  'monotonic': True,
  'step_ok': True,
  'bad_steps': 0,
  'min': 1735689600000,
  'max': 1767224700000},
 'fund': {'rows': 1095, 'min_ts': '2025-01-01 00:00:00+00:00'}}

In [32]:
# Master join builder (15m)

def make_master_slice(spot_df, mark_df, index_df, prem_df, fund_df) -> pd.DataFrame:
    s = spot_df[["open_time","ts","close","volume","quote_volume","trades"]].rename(columns={"close":"spot_close"})
    m = mark_df[["open_time","close"]].rename(columns={"close":"mark_close"})
    i = index_df[["open_time","close"]].rename(columns={"close":"index_close"})
    p = prem_df[["open_time","close"]].rename(columns={"close":"premium_close"})

    master = s.merge(m, on="open_time", how="left")
    master = master.merge(i, on="open_time", how="left")
    master = master.merge(p, on="open_time", how="left")

    master["basis_mark_vs_spot"]  = (master["mark_close"]  / master["spot_close"]) - 1.0
    master["basis_index_vs_spot"] = (master["index_close"] / master["spot_close"]) - 1.0

    if len(fund_df):
        f = fund_df[["ts","funding_rate","funding_interval_hours"]].sort_values("ts")
        master = master.sort_values("ts")
        master = pd.merge_asof(master, f, on="ts", direction="backward", allow_exact_matches=True)
        master["fund_age_minutes"] = (master["ts"] - master["ts"].where(master["funding_rate"].notna())).dt.total_seconds() / 60.0

    cols = [
        "ts","open_time",
        "spot_close","mark_close","index_close",
        "basis_mark_vs_spot","basis_index_vs_spot",
        "premium_close",
        "funding_rate","funding_interval_hours",
        "volume","quote_volume","trades",
    ]
    cols = [c for c in cols if c in master.columns] + [c for c in master.columns if c not in cols]
    return master[cols].sort_values("ts")

master_df = make_master_slice(spot_df, mark_df, index_df, prem_df, fund_df)
master_df.head(10)


Unnamed: 0,ts,open_time,spot_close,mark_close,index_close,basis_mark_vs_spot,basis_index_vs_spot,premium_close,funding_rate,funding_interval_hours,volume,quote_volume,trades,fund_age_minutes
0,2025-01-01 00:00:00+00:00,1735689600000,93656.18,93637.2,93650.139149,-0.000203,-6.5e-05,-2.5e-05,0.0001,8,175.85673,16461790.0,19788,0.0
1,2025-01-01 00:15:00+00:00,1735690500000,93761.9,93743.4,93760.971489,-0.000197,-1e-05,-0.000197,0.0001,8,95.41749,8948674.0,20478,0.0
2,2025-01-01 00:30:00+00:00,1735691400000,93885.01,93864.328404,93885.908085,-0.00022,1e-05,-0.000272,0.0001,8,94.36416,8859841.0,21825,0.0
3,2025-01-01 00:45:00+00:00,1735692300000,94401.14,94363.6,94396.002979,-0.000398,-5.4e-05,-0.000172,0.0001,8,390.35172,36798500.0,31434,0.0
4,2025-01-01 01:00:00+00:00,1735693200000,94153.05,94121.032729,94152.674255,-0.00034,-4e-06,-0.000386,0.0001,8,284.57018,26761440.0,31418,0.0
5,2025-01-01 01:15:00+00:00,1735694100000,93825.86,93805.105567,93837.345957,-0.000221,0.000122,-0.000397,0.0001,8,102.66232,9640520.0,15068,0.0
6,2025-01-01 01:30:00+00:00,1735695000000,93923.14,93897.422305,93926.748936,-0.000274,3.8e-05,-0.000368,0.0001,8,78.97889,7418445.0,14018,0.0
7,2025-01-01 01:45:00+00:00,1735695900000,93607.74,93588.0,93612.465957,-0.000211,5e-05,-0.000248,0.0001,8,120.32317,11276200.0,19439,0.0
8,2025-01-01 02:00:00+00:00,1735696800000,93800.0,93767.9,93793.832766,-0.000342,-6.6e-05,-0.000335,0.0001,8,104.71689,9815654.0,20595,0.0
9,2025-01-01 02:15:00+00:00,1735697700000,93742.52,93715.4,93740.972766,-0.000289,-1.7e-05,-0.000268,0.0001,8,48.21903,4519803.0,10475,0.0


In [None]:
master_df.to_csv("master_dataset_example_BTC-USDT_15m_2025.csv", index=False)

In [15]:
# Join audit outputs (post-join gate)

audit = {
    "rows": len(master_df),
    "mark_missing_frac": float(master_df["mark_close"].isna().mean()),
    "index_missing_frac": float(master_df["index_close"].isna().mean()),
    "prem_missing_frac": float(master_df["premium_close"].isna().mean()),
    "funding_missing_frac": float(master_df["funding_rate"].isna().mean()) if "funding_rate" in master_df.columns else None,
}
audit


{'rows': 35041,
 'mark_missing_frac': 2.8537998344796096e-05,
 'index_missing_frac': 2.8537998344796096e-05,
 'prem_missing_frac': 2.8537998344796096e-05,
 'funding_missing_frac': 0.0}