In [1]:
import sys, os
from pathlib import Path
import yaml
import json

# Filesystem layout used by the rest of the notebook.
# NOTE(review): these are absolute, machine-specific paths; adjust when running elsewhere.
PROJECT_ROOT = Path(r"C:\Users\quantbase\Desktop\sydata")
SRC = PROJECT_ROOT / "src"
DATA_ROOT = Path(r"C:\Users\quantbase\Desktop\marketdata")
MANIFEST = DATA_ROOT / "meta" / "symbols.yml"

# Make `from sydata...` importable.
# Drop any earlier copy of the entry first so that re-running this cell does not
# stack duplicates on sys.path, while SRC still ends up at index 0.
sys.path[:] = [entry for entry in sys.path if entry != str(SRC)]
sys.path.insert(0, str(SRC))

# Make relative paths (scripts/, etc.) resolve predictably
os.chdir(str(PROJECT_ROOT))

# Environment report, so the saved output records where the notebook ran.
print("python:", sys.executable)
print("cwd:", Path.cwd())
print("sys.path[0]:", sys.path[0])
print("SRC exists:", SRC.exists())

python: c:\Users\quantbase\.conda\envs\sydata-311\python.exe
cwd: C:\Users\quantbase\Desktop\sydata
sys.path[0]: C:\Users\quantbase\Desktop\sydata\src
SRC exists: True


In [2]:
import numpy as np
import pandas as pd

# Instrument and bar size under study.
SYMBOL, INTERVAL = "BTC-USDT", "1h"
STEP_MS = 60 * 60 * 1000  # one 1h bar, expressed in milliseconds

# Analysis window as a half-open range [START, END): END itself is excluded.
START, END = "2025-01-01", "2026-01-01"

In [3]:
# Cell 2 — Utilities: strict audit + gap locator
def audit_kline_frame(df: pd.DataFrame, key: str = "open_time", step_ms: int = STEP_MS) -> dict:
    """Summarise key integrity of a kline frame.

    Parameters
    ----------
    df : frame containing the column named by ``key``.
    key : name of the column to audit.
    step_ms : expected difference between consecutive sorted key values.

    Returns
    -------
    dict with the same keys for every input size:
      rows          – number of rows in ``df``
      key_is_unique – whether ``df[key]`` has no duplicates
      monotonic     – whether the *sorted* keys are strictly increasing
                      (the frame is sorted first, so this does not describe
                      the row order of ``df`` as passed in)
      step_ok       – whether every consecutive sorted difference equals ``step_ms``
      min, max      – smallest / largest key as int (None for an empty frame)
      bad_steps     – number of consecutive differences not equal to ``step_ms``
    """
    keys = df.sort_values(key)[key].to_numpy()
    out = {
        "rows": int(len(df)),
        "key_is_unique": bool(df[key].is_unique),
    }

    if len(keys) <= 1:
        # Zero or one key: there are no consecutive differences to check.
        only = int(keys[0]) if len(keys) else None
        out["monotonic"] = True
        out["step_ok"] = True
        out["min"] = only
        out["max"] = only
        # Report bad_steps here as well so callers see one consistent schema.
        out["bad_steps"] = 0
        return out

    deltas = np.diff(keys)
    out["monotonic"] = bool(np.all(deltas > 0))
    out["step_ok"] = bool(np.all(deltas == step_ms))
    out["min"] = int(keys.min())
    out["max"] = int(keys.max())
    out["bad_steps"] = int(np.sum(deltas != step_ms))
    return out


def locate_gaps(df: pd.DataFrame, key: str = "open_time", step_ms: int = STEP_MS, n: int = 20) -> tuple[int, pd.DataFrame]:
    """Find positions where consecutive sorted keys do not differ by ``step_ms``.

    Returns ``(total_bad, report)``: ``total_bad`` counts every irregular
    step, while ``report`` lists only the first ``n`` of them. ``idx`` in the
    report is the position of the earlier key within the sorted key array.
    """
    keys = df.sort_values(key)[key].to_numpy()

    # With fewer than two keys there is nothing to difference.
    if len(keys) < 2:
        empty_report = pd.DataFrame(
            columns=["idx", "open_time_prev", "open_time_next", "delta_ms", "ts_prev", "ts_next"]
        )
        return 0, empty_report

    deltas = np.diff(keys)
    irregular = np.flatnonzero(deltas != step_ms)
    shown = irregular[:n]

    report = pd.DataFrame(
        {
            "idx": shown,
            "open_time_prev": keys[shown],
            "open_time_next": keys[shown + 1],
            "delta_ms": deltas[shown],
        }
    )
    # Readable UTC timestamps next to the raw millisecond keys.
    for side in ("prev", "next"):
        report[f"ts_{side}"] = pd.to_datetime(report[f"open_time_{side}"], unit="ms", utc=True)

    return int(irregular.size), report


In [None]:
# Cell 3 — Load spot klines (spine) and time-filter to [START, END)
P_SPOT = DATA_ROOT.joinpath(
    "raw", "binance", "klines",
    f"symbol={SYMBOL}", f"interval={INTERVAL}",
    "part-1577836800000-1767225600000.parquet",
)

spot_df = pd.read_parquet(P_SPOT)

# Strict schema sanity: both columns used downstream must be present.
assert "open_time" in spot_df.columns
assert "close" in spot_df.columns

# Sort by key, derive a UTC timestamp from the millisecond key, then clip to the window.
spot_df = (
    spot_df
    .sort_values("open_time")
    .assign(ts=lambda frame: pd.to_datetime(frame["open_time"], unit="ms", utc=True))
    .loc[lambda frame: (frame["ts"] >= START) & (frame["ts"] < END)]
    .copy()
)

spot_audit = audit_kline_frame(spot_df, key="open_time", step_ms=STEP_MS)
(spot_audit, spot_df.head(3))


({'rows': 8760,
  'key_is_unique': True,
  'monotonic': True,
  'step_ok': True,
  'min': 1735689600000,
  'max': 1767222000000,
  'bad_steps': 0},
            open_time      open      high       low     close     volume  \
 43816  1735689600000  93576.00  94509.42  93489.03  94401.14  755.99010   
 43817  1735693200000  94401.13  94408.72  93578.77  93607.74  586.53456   
 43818  1735696800000  93607.74  94105.12  93594.56  94098.91  276.78045   
 
           close_time  quote_volume  trades  taker_buy_base_volume  \
 43816  1735693199999  7.106881e+07   93525              421.08319   
 43817  1735696799999  5.509661e+07   79943              257.42023   
 43818  1735700399999  2.597409e+07   55078              185.35204   
 
        taker_buy_quote_volume ignore    symbol interval    venue  \
 43816            3.959678e+07      0  BTC-USDT       1h  binance   
 43817            2.418794e+07      0  BTC-USDT       1h  binance   
 43818            1.739377e+07      0  BTC-USDT       1h 

In [5]:
# Cell 5 — Load UM mark/index klines for the same year and audit
MROOT = DATA_ROOT / "raw" / "binance" / "um_mark_price_klines" / f"symbol={SYMBOL}" / f"interval={INTERVAL}"
IROOT = DATA_ROOT / "raw" / "binance" / "um_index_price_klines" / f"symbol={SYMBOL}" / f"interval={INTERVAL}"


def _load_um_klines(files) -> pd.DataFrame:
    """Read and concatenate parquet parts, sort by open_time, add a UTC ``ts``
    column derived from ``open_time`` (ms) and keep rows with START <= ts < END."""
    frame = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True).sort_values("open_time")
    frame["ts"] = pd.to_datetime(frame["open_time"], unit="ms", utc=True)
    # Time filter so the frame covers exactly the same window as the spot spine.
    return frame[(frame["ts"] >= START) & (frame["ts"] < END)].copy()


# NOTE(review): the glob pattern pins the year 2025 independently of START/END;
# keep the two in sync if the window is changed.
mark_files = sorted(MROOT.glob("part-2025-*.parquet"))
index_files = sorted(IROOT.glob("part-2025-*.parquet"))
assert len(mark_files) > 0, f"no mark-price parts found under {MROOT}"
assert len(index_files) > 0, f"no index-price parts found under {IROOT}"

# One shared loader for both datasets, so they are prepared identically.
mark_df = _load_um_klines(mark_files)
index_df = _load_um_klines(index_files)

mark_audit = audit_kline_frame(mark_df, key="open_time", step_ms=STEP_MS)
index_audit = audit_kline_frame(index_df, key="open_time", step_ms=STEP_MS)

(mark_audit, index_audit, mark_df.head(3), index_df.head(3))


({'rows': 8760,
  'key_is_unique': True,
  'monotonic': True,
  'step_ok': True,
  'min': 1735689600000,
  'max': 1767222000000,
  'bad_steps': 0},
 {'rows': 8760,
  'key_is_unique': True,
  'monotonic': True,
  'step_ok': True,
  'min': 1735689600000,
  'max': 1767222000000,
  'bad_steps': 0},
        open_time          open          high           low    close ignore_0  \
 0  1735689600000  93549.661752  94451.045610  93464.441981  94363.6        0   
 1  1735693200000  94363.600000  94365.592837  93557.185993  93588.0        0   
 2  1735696800000  93588.000000  94087.485142  93574.329979  94064.4        0   
 
       close_time ignore_1 ignore_2 ignore_3 ignore_4 ignore_5    symbol  \
 0  1735693199999        0     3600        0        0        0  BTC-USDT   
 1  1735696799999        0     3600        0        0        0  BTC-USDT   
 2  1735700399999        0     3600        0        0        0  BTC-USDT   
 
   venue_symbol interval    venue               dataset  \
 0      BTCUS

In [6]:
# Cell 6 — Enforce 1:1 key integrity + build master (spot spine, left-join)
# Hard fails if any of the three frames has duplicates on the join key.
assert all(frame["open_time"].is_unique for frame in (spot_df, mark_df, index_df))

# Reduce each source to (key, close) and give the close a source-specific name.
spot_1 = spot_df.loc[:, ["open_time", "close"]].rename(columns={"close": "spot_close"})
mark_1 = mark_df.loc[:, ["open_time", "close"]].rename(columns={"close": "mark_close"})
index_1 = index_df.loc[:, ["open_time", "close"]].rename(columns={"close": "index_close"})

# Spot is the spine: left joins keep every spot bar; validate re-checks 1:1 keys.
tmp = pd.merge(spot_1, mark_1, on="open_time", how="left", validate="one_to_one")
master_df = pd.merge(tmp, index_1, on="open_time", how="left", validate="one_to_one")

# UTC timestamp plus relative basis of mark / index close versus spot close.
master_df = master_df.assign(
    ts=lambda frame: pd.to_datetime(frame["open_time"], unit="ms", utc=True),
    basis_mark_vs_spot=lambda frame: frame["mark_close"] / frame["spot_close"] - 1.0,
    basis_index_vs_spot=lambda frame: frame["index_close"] / frame["spot_close"] - 1.0,
)

(master_df.shape, master_df.head(10))


((8760, 7),
        open_time  spot_close    mark_close   index_close  \
 0  1735689600000    94401.14  94363.600000  94396.002979   
 1  1735693200000    93607.74  93588.000000  93612.465957   
 2  1735696800000    94098.91  94064.400000  94091.546596   
 3  1735700400000    93838.04  93811.972187  93838.176829   
 4  1735704000000    93553.91  93544.900000  93555.690851   
 5  1735707600000    93792.02  93759.900000  93794.045532   
 6  1735711200000    93757.58  93734.200000  93756.584255   
 7  1735714800000    93684.10  93661.087326  93687.337447   
 8  1735718400000    93428.46  93400.100000  93435.559574   
 9  1735722000000    93413.06  93380.168965  93411.669149   
 
                          ts  basis_mark_vs_spot  basis_index_vs_spot  
 0 2025-01-01 00:00:00+00:00           -0.000398            -0.000054  
 1 2025-01-01 01:00:00+00:00           -0.000211             0.000050  
 2 2025-01-01 02:00:00+00:00           -0.000367            -0.000078  
 3 2025-01-01 03:00:00+00:0

In [7]:
# Cell 7 — Coverage diagnostics (should be near 0 for 2025 if complete)
# Fraction of spine rows whose mark / index close is missing after the left joins.
coverage = {
    f"{leg}_missing_frac": float(master_df[f"{leg}_close"].isna().mean())
    for leg in ("mark", "index")
}
# Fraction of rows where both joined closes are present.
coverage["both_present_frac"] = float(
    master_df[["mark_close", "index_close"]].notna().all(axis=1).mean()
)
coverage


{'mark_missing_frac': 0.0, 'index_missing_frac': 0.0, 'both_present_frac': 1.0}

In [8]:
# Cell 8 — Key-set equality checks (spot vs mark/index)
# master_df was built with left joins on the spot frame, so its keys are the spot keys.
s_spot, s_mark, s_idx = (
    set(frame["open_time"]) for frame in (master_df, mark_1, index_1)
)

# Set sizes followed by the size of each one-directional difference.
keyset_report = {
    "spot_keys": len(s_spot),
    "mark_keys": len(s_mark),
    "index_keys": len(s_idx),
    "spot_minus_mark": len(s_spot.difference(s_mark)),
    "spot_minus_index": len(s_spot.difference(s_idx)),
    "mark_minus_spot": len(s_mark.difference(s_spot)),
    "index_minus_spot": len(s_idx.difference(s_spot)),
}
keyset_report


{'spot_keys': 8760,
 'mark_keys': 8760,
 'index_keys': 8760,
 'spot_minus_mark': 0,
 'spot_minus_index': 0,
 'mark_minus_spot': 0,
 'index_minus_spot': 0}

In [9]:
# Cell 9 — If keyset differences exist, view examples
def show_missing_examples(base: set, other: set, n: int = 10):
    """Return the ``n`` smallest keys that are in ``base`` but not in ``other``.

    The result is a two-column frame: ``open_time`` holds the raw keys and
    ``ts`` holds the same keys interpreted as millisecond epochs in UTC.
    """
    missing_keys = sorted(base.difference(other))[:n]
    as_timestamps = pd.to_datetime(missing_keys, unit="ms", utc=True)
    return pd.DataFrame({"open_time": missing_keys, "ts": as_timestamps})

# Up to 25 spot keys absent from the mark and index key sets, in that order.
missing_mark_ex, missing_idx_ex = (
    show_missing_examples(s_spot, other_keys, n=25) for other_keys in (s_mark, s_idx)
)

(missing_mark_ex, missing_idx_ex)


(Empty DataFrame
 Columns: [open_time, ts]
 Index: [],
 Empty DataFrame
 Columns: [open_time, ts]
 Index: [])

In [10]:
# Cell 10 — Optional: attach funding (as-of backward, no lookahead)
FROOT = DATA_ROOT / "raw" / "binance" / "um_funding_rate" / f"symbol={SYMBOL}"
fund_files = sorted(FROOT.glob("part-2025-*.parquet"))
assert len(fund_files) > 0, f"no funding-rate parts found under {FROOT}"

fund_df = pd.concat([pd.read_parquet(f) for f in fund_files], ignore_index=True).sort_values("ts")
fund_df = fund_df[(fund_df["ts"] >= START) & (fund_df["ts"] < END)].copy()

# Columns this cell attaches to master_df.
funding_cols = ["funding_rate", "funding_interval_hours"]

# direction="backward" gives each bar the latest funding row with ts <= bar ts.
# Any funding columns left by a previous run of this cell are dropped first;
# otherwise merge_asof would emit suffixed duplicates (funding_rate_x / _y) and
# the "funding_rate" lookups below would raise KeyError on re-run.
master_df = pd.merge_asof(
    master_df.drop(columns=funding_cols, errors="ignore").sort_values("ts"),
    fund_df[["ts", *funding_cols]].sort_values("ts"),
    on="ts",
    direction="backward",
)

# Preview plus the fraction of bars with no funding row at or before their ts.
(master_df[["ts","spot_close","mark_close","index_close","funding_rate"]].head(20),
 float(master_df["funding_rate"].isna().mean()))


(                          ts  spot_close    mark_close   index_close  \
 0  2025-01-01 00:00:00+00:00    94401.14  94363.600000  94396.002979   
 1  2025-01-01 01:00:00+00:00    93607.74  93588.000000  93612.465957   
 2  2025-01-01 02:00:00+00:00    94098.91  94064.400000  94091.546596   
 3  2025-01-01 03:00:00+00:00    93838.04  93811.972187  93838.176829   
 4  2025-01-01 04:00:00+00:00    93553.91  93544.900000  93555.690851   
 5  2025-01-01 05:00:00+00:00    93792.02  93759.900000  93794.045532   
 6  2025-01-01 06:00:00+00:00    93757.58  93734.200000  93756.584255   
 7  2025-01-01 07:00:00+00:00    93684.10  93661.087326  93687.337447   
 8  2025-01-01 08:00:00+00:00    93428.46  93400.100000  93435.559574   
 9  2025-01-01 09:00:00+00:00    93413.06  93380.168965  93411.669149   
 10 2025-01-01 10:00:00+00:00    93326.23  93303.200000  93325.870000   
 11 2025-01-01 11:00:00+00:00    93444.64  93427.950241  93452.544468   
 12 2025-01-01 12:00:00+00:00    93820.76  93790.86

In [11]:
#------Saving-------