In [1]:
import sys, os
from pathlib import Path
import yaml
import json

# --- Notebook configuration ---------------------------------------------
# Locations of the project checkout, its `src` directory, the market-data
# store, and the symbol manifest that the cells below pass around.
PROJECT_ROOT = Path(r"C:\Users\quantbase\Desktop\sydata")
SRC = PROJECT_ROOT / "src"
DATA_ROOT = Path(r"C:\Users\quantbase\Desktop\marketdata")
MANIFEST = DATA_ROOT / "meta" / "symbols.yml"

# Make `from sydata...` importable.
# Remove any copy left by a previous run of this cell before inserting, so
# re-running the cell keeps exactly one entry and it stays at the front of
# the import search path.
_src_entry = str(SRC)
sys.path[:] = [entry for entry in sys.path if entry != _src_entry]
sys.path.insert(0, _src_entry)

# Make relative paths (scripts/, etc.) resolve predictably
os.chdir(str(PROJECT_ROOT))

# Echo the effective environment so a reader can confirm which interpreter
# and directories this run actually used.
print("python:", sys.executable)
print("cwd:", Path.cwd())
print("sys.path[0]:", sys.path[0])
print("SRC exists:", SRC.exists())

python: c:\Users\quantbase\.conda\envs\sydata-311\python.exe
cwd: C:\Users\quantbase\Desktop\sydata
sys.path[0]: C:\Users\quantbase\Desktop\sydata\src
SRC exists: True


In [2]:
import subprocess

# Run scripts/ingest_um_funding_rate.py in a child process, using the same
# interpreter as the kernel, for the `core_major` basket with the date
# bounds given below.
cmd = [
    sys.executable, "scripts/ingest_um_funding_rate.py",
    "--data-root", str(DATA_ROOT),
    "--manifest", str(MANIFEST),
    "--basket", "core_major",
    "--start", "2024-01-01",
    "--end",   "2024-02-01",
]

# capture_output/text: keep stdout and stderr as strings for display below.
cp = subprocess.run(cmd, cwd=str(PROJECT_ROOT), capture_output=True, text=True)

# Fail loudly on a non-zero exit status. Otherwise a "Run All" would carry on
# and the inspection cells could read stale or partial files under DATA_ROOT.
if cp.returncode != 0:
    raise RuntimeError(
        f"ingest_um_funding_rate.py exited with code {cp.returncode}\n"
        f"--- stderr (tail) ---\n{cp.stderr[-4000:]}"
    )

# Show the exit status plus the tails of both streams (bounded so a chatty
# run does not flood the notebook output).
cp.returncode, cp.stdout[-2000:], cp.stderr[-4000:]


(0,
 '{\n  "ok": 14,\n  "missing_archive_file": 0,\n  "already_exists": 0,\n  "total": 14\n}\n',
 '')

In [3]:
# ------ Inspect the parquet files written by the funding-rate ingest ------

In [4]:
import pandas as pd
from pathlib import Path

# Collect the 2024-01 funding-rate partition of every symbol directory.
# Build the path from DATA_ROOT (the value that was handed to the ingest
# script) rather than repeating the literal, so that changing DATA_ROOT in the
# config cell cannot leave this cell inspecting a different directory.
base = DATA_ROOT / "raw" / "binance" / "um_funding_rate"
parts = sorted(base.rglob("part-2024-01.parquet"))

# Number of partitions found, plus the first ten paths for a quick look.
len(parts), [str(p) for p in parts[:10]]


(7,
 ['C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\um_funding_rate\\symbol=ADA-USDT\\part-2024-01.parquet',
  'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\um_funding_rate\\symbol=BNB-USDT\\part-2024-01.parquet',
  'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\um_funding_rate\\symbol=BTC-USDT\\part-2024-01.parquet',
  'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\um_funding_rate\\symbol=ETH-USDT\\part-2024-01.parquet',
  'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\um_funding_rate\\symbol=LINK-USDT\\part-2024-01.parquet',
  'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\um_funding_rate\\symbol=SOL-USDT\\part-2024-01.parquet',
  'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\um_funding_rate\\symbol=XRP-USDT\\part-2024-01.parquet'])

In [5]:
# Load the first partition found by the previous cell and show its shape,
# column names, dtypes, first/last ten rows and the covered `ts` range.
#
# NOTE(review): in the recorded output `funding_rate` is int64 and equals 8 on
# every displayed row, while `mark_price` is ~1e-4. That looks like the
# funding-interval-hours and funding-rate columns being mis-mapped by the
# ingest step -- confirm against the ingest script before using these columns.
p = parts[0]
df = pd.read_parquet(p)

(
    df.shape,
    list(df.columns),
    df.dtypes,
    df.head(10),
    df.tail(10),
    df["ts"].min(),
    df["ts"].max(),
)


((93, 8),
 ['ts',
  'funding_time',
  'funding_rate',
  'mark_price',
  'symbol',
  'venue_symbol',
  'venue',
  'dataset'],
 ts              datetime64[ns, UTC]
 funding_time                  int64
 funding_rate                  int64
 mark_price                  float64
 symbol                       object
 venue_symbol                 object
 venue                        object
 dataset                      object
 dtype: object,
                          ts   funding_time  funding_rate  mark_price  \
 0 2024-01-01 00:00:00+00:00  1704067200000             8    0.000100   
 1 2024-01-01 08:00:00+00:00  1704096000000             8    0.000131   
 2 2024-01-01 16:00:00+00:00  1704124800000             8    0.000330   
 3 2024-01-02 00:00:00+00:00  1704153600000             8    0.000585   
 4 2024-01-02 08:00:00+00:00  1704182400000             8    0.000466   
 5 2024-01-02 16:00:00+00:00  1704211200000             8    0.000566   
 6 2024-01-03 00:00:00+00:00  1704240000000         

In [6]:
#-----Join onto spot 1h bars (no lookahead)-----

In [7]:
import pandas as pd
from pathlib import Path

DATA_ROOT = Path(r"C:\Users\quantbase\Desktop\marketdata")

# --- Spot 1h bars (BTC example) -------------------------------------------
KLINE_DIR = DATA_ROOT / "raw" / "binance" / "klines" / "symbol=BTC-USDT" / "interval=1h"
kfiles = sorted(KLINE_DIR.glob("part-*.parquet"))

# Read every partition and stack them into a single frame.
kline_frames = [pd.read_parquet(path) for path in kfiles]
k = pd.concat(kline_frames, ignore_index=True)

# Derive a tz-aware UTC timestamp from the millisecond `open_time`, order by
# it, then keep only bars inside [2024-01-01, 2024-02-01).
k["ts"] = pd.to_datetime(k["open_time"], unit="ms", utc=True)
k = k.sort_values("ts")
on_or_after_start = k["ts"] >= "2024-01-01"
before_end = k["ts"] < "2024-02-01"
k = k[on_or_after_start & before_end].copy()

# --- Funding (BTC partition picked explicitly) ----------------------------
f = DATA_ROOT / "raw" / "binance" / "um_funding_rate" / "symbol=BTC-USDT" / "part-2024-01.parquet"
fund = pd.read_parquet(f)
fund = fund.sort_values("ts")

# --- As-of backward join ---------------------------------------------------
# Each bar receives the most recent funding row whose `ts` is at or before the
# bar's own `ts`, so no funding observation from after the bar's timestamp is
# attached to it.
bar_cols = k[["ts","open","high","low","close","volume"]].sort_values("ts")
funding_cols = fund[["ts","funding_rate","mark_price"]].sort_values("ts")
joined = pd.merge_asof(
    bar_cols,
    funding_cols,
    on="ts",
    direction="backward",
)

# Preview of the first 25 joined rows and the share of bars without funding.
preview = joined[["ts","close","funding_rate","mark_price"]].head(25)
preview, joined["funding_rate"].isna().mean()


(                          ts     close  funding_rate  mark_price
 0  2024-01-01 00:00:00+00:00  42475.23             8    0.000374
 1  2024-01-01 01:00:00+00:00  42613.56             8    0.000374
 2  2024-01-01 02:00:00+00:00  42581.10             8    0.000374
 3  2024-01-01 03:00:00+00:00  42330.49             8    0.000374
 4  2024-01-01 04:00:00+00:00  42399.99             8    0.000374
 5  2024-01-01 05:00:00+00:00  42234.01             8    0.000374
 6  2024-01-01 06:00:00+00:00  42396.69             8    0.000374
 7  2024-01-01 07:00:00+00:00  42492.46             8    0.000374
 8  2024-01-01 08:00:00+00:00  42549.99             8    0.000272
 9  2024-01-01 09:00:00+00:00  42649.69             8    0.000272
 10 2024-01-01 10:00:00+00:00  42691.10             8    0.000272
 11 2024-01-01 11:00:00+00:00  42690.20             8    0.000272
 12 2024-01-01 12:00:00+00:00  42648.38             8    0.000272
 13 2024-01-01 13:00:00+00:00  42715.54             8    0.000272
 14 2024-0

In [8]:
# ---- Sanity check: list the rows where the joined funding rate changes ----

In [10]:
# Keep only bars that received a funding value, then flag every row whose
# funding rate differs from the row before it.
# The first row is always flagged: shift(1) yields NaN there, and any value
# compares as "not equal" to NaN.
# NOTE(review): the recorded output flags only that first row (rate == 8 all
# month), which suggests `funding_rate` does not hold an actual rate -- confirm
# the column mapping in the ingest step.
x = joined.dropna(subset=["funding_rate"]).copy()
rate = x["funding_rate"]
x["funding_change"] = rate.ne(rate.shift(1))


In [11]:
# Show up to the first 20 rows flagged as funding-rate changes, reduced to the
# timestamp and the rate.
x.loc[x["funding_change"]].head(20)[["ts", "funding_rate"]]

Unnamed: 0,ts,funding_rate
0,2024-01-01 00:00:00+00:00,8
