In [1]:
import sys, os
from pathlib import Path
import yaml
import json

PROJECT_ROOT = Path(r"C:\Users\quantbase\Desktop\sydata")
SRC = PROJECT_ROOT / "src"
DATA_ROOT = Path(r"C:\Users\quantbase\Desktop\marketdata")
MANIFEST = DATA_ROOT / "meta" / "symbols.yml"

# Make `from sydata...` importable
sys.path.insert(0, str(SRC))

# Make relative paths (scripts/, etc.) resolve predictably
os.chdir(str(PROJECT_ROOT))

print("python:", sys.executable)
print("cwd:", Path.cwd())
print("sys.path[0]:", sys.path[0])
print("SRC exists:", SRC.exists())

python: c:\Users\quantbase\.conda\envs\sydata-311\python.exe
cwd: C:\Users\quantbase\Desktop\sydata
sys.path[0]: C:\Users\quantbase\Desktop\sydata\src
SRC exists: True


In [5]:
SYMBOL = "BTC-USDT"
START = "2024-01-01"
END   = "2024-02-01"

In [8]:
#-----1) Load spot 1h klines (spine)----

In [6]:
import pandas as pd

KDIR = DATA_ROOT / "raw" / "binance" / "klines" / f"symbol={SYMBOL}" / "interval=1h"
kfiles = sorted(KDIR.glob("part-*.parquet"))
k = pd.concat([pd.read_parquet(f) for f in kfiles], ignore_index=True)

k["ts"] = pd.to_datetime(k["open_time"], unit="ms", utc=True)
k = k.sort_values("ts")
k = k[(k["ts"] >= START) & (k["ts"] < END)].copy()

k.shape, k[["ts","open","high","low","close","volume","trades"]].head(5)


((744, 16),
                              ts      open      high       low     close  \
 35032 2024-01-01 00:00:00+00:00  42283.58  42554.57  42261.02  42475.23   
 35033 2024-01-01 01:00:00+00:00  42475.23  42775.00  42431.65  42613.56   
 35034 2024-01-01 02:00:00+00:00  42613.57  42638.41  42500.00  42581.10   
 35035 2024-01-01 03:00:00+00:00  42581.09  42586.64  42230.08  42330.49   
 35036 2024-01-01 04:00:00+00:00  42330.50  42399.99  42209.46  42399.99   
 
            volume  trades  
 35032  1271.68108   47134  
 35033  1196.37856   50396  
 35034   685.21980   29863  
 35035   794.80391   38620  
 35036   715.41760   36038  )

In [9]:
#-----2) Load BVOL (resampled) and join---- 

In [7]:
b = pd.read_parquet(DATA_ROOT / "norm" / "bvol_resampled" / "symbol=BTCBVOLUSDT" / "bvol_1H.parquet")
b = b.rename(columns={"hour": "ts"}) if "hour" in b.columns else b
b = b.sort_values("ts")
b = b[(b["ts"] >= START) & (b["ts"] < END)].copy()

m = k.merge(b[["ts","bvol"]], on="ts", how="left")
m[["ts","close","bvol"]].head(10), m["bvol"].isna().mean()


(                         ts     close     bvol
 0 2024-01-01 00:00:00+00:00  42475.23  68.0201
 1 2024-01-01 01:00:00+00:00  42613.56  67.7106
 2 2024-01-01 02:00:00+00:00  42581.10  67.5697
 3 2024-01-01 03:00:00+00:00  42330.49  68.0067
 4 2024-01-01 04:00:00+00:00  42399.99  68.0199
 5 2024-01-01 05:00:00+00:00  42234.01  67.8465
 6 2024-01-01 06:00:00+00:00  42396.69  67.7140
 7 2024-01-01 07:00:00+00:00  42492.46  67.5710
 8 2024-01-01 08:00:00+00:00  42549.99  67.7485
 9 2024-01-01 09:00:00+00:00  42649.69  68.4588,
 np.float64(0.0))

In [10]:
#-----3) Load UM funding and join (no lookahead)----

In [12]:
f = pd.read_parquet(DATA_ROOT / "raw" / "binance" / "um_funding_rate" / f"symbol={SYMBOL}" / "part-2024-01.parquet")
f = f.sort_values("ts")

m = pd.merge_asof(
    m.sort_values("ts"),
    f[["ts","funding_rate","funding_interval_hours"]].sort_values("ts"),
    on="ts",
    direction="backward",
)

m[["ts","close","funding_rate","funding_interval_hours"]].head(20), m["funding_rate"].isna().mean()


(                          ts     close  funding_rate  funding_interval_hours
 0  2024-01-01 00:00:00+00:00  42475.23      0.000374                       8
 1  2024-01-01 01:00:00+00:00  42613.56      0.000374                       8
 2  2024-01-01 02:00:00+00:00  42581.10      0.000374                       8
 3  2024-01-01 03:00:00+00:00  42330.49      0.000374                       8
 4  2024-01-01 04:00:00+00:00  42399.99      0.000374                       8
 5  2024-01-01 05:00:00+00:00  42234.01      0.000374                       8
 6  2024-01-01 06:00:00+00:00  42396.69      0.000374                       8
 7  2024-01-01 07:00:00+00:00  42492.46      0.000374                       8
 8  2024-01-01 08:00:00+00:00  42549.99      0.000272                       8
 9  2024-01-01 09:00:00+00:00  42649.69      0.000272                       8
 10 2024-01-01 10:00:00+00:00  42691.10      0.000272                       8
 11 2024-01-01 11:00:00+00:00  42690.20      0.000272           

In [13]:
# 4) (Optional) Load spot aggtrades hourly flow (if present) and join

In [14]:
ADIR = DATA_ROOT / "raw" / "binance" / "spot_aggtrades" / f"symbol={SYMBOL}"
afiles = sorted(ADIR.rglob("part-2024-01.parquet"))
a = pd.concat([pd.read_parquet(f) for f in afiles], ignore_index=True)

a = a[(a["ts"] >= START) & (a["ts"] < END)].copy()
a["ts"] = pd.to_datetime(a["ts"], utc=True)
a["hour"] = a["ts"].dt.floor("1h")

# sign convention: is_buyer_maker True => sell-initiated => negative
a["signed_qty"] = a["qty"].where(~a["is_buyer_maker"], -a["qty"])

hourly = (
    a.groupby("hour", as_index=False)
     .agg(sum_qty=("qty","sum"), cvd=("signed_qty","sum"), trades=("agg_trade_id","count"))
     .rename(columns={"hour":"ts"})
)

m = m.merge(hourly, on="ts", how="left")
m[["ts","close","sum_qty","cvd","trades"]].head(20), m["cvd"].isna().mean()


ValueError: No objects to concatenate

In [15]:
#------INSPECT

In [16]:
# expected 1h grid for Jan
m.shape, m["ts"].is_monotonic_increasing, m["ts"].duplicated().any()


((744, 19), True, np.False_)

In [18]:
# missingness summary
m[["bvol","funding_rate"]].isna().mean().sort_values(ascending=False)


bvol            0.0
funding_rate    0.0
dtype: float64

In [19]:
# spot-check funding changes line up to 8-hour schedule
chg = m["funding_rate"].ne(m["funding_rate"].shift(1))
m.loc[chg, ["ts","funding_rate"]].head(20)


Unnamed: 0,ts,funding_rate
0,2024-01-01 00:00:00+00:00,0.000374
8,2024-01-01 08:00:00+00:00,0.000272
16,2024-01-01 16:00:00+00:00,0.000336
24,2024-01-02 00:00:00+00:00,0.000658
32,2024-01-02 08:00:00+00:00,0.000352
40,2024-01-02 16:00:00+00:00,0.000537
48,2024-01-03 00:00:00+00:00,0.000212
56,2024-01-03 08:00:00+00:00,0.000171
64,2024-01-03 16:00:00+00:00,0.0001
624,2024-01-27 00:00:00+00:00,9e-05


In [20]:
m.head(5)

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_volume,trades,taker_buy_base_volume,taker_buy_quote_volume,ignore,symbol,interval,venue,ts,bvol,funding_rate,funding_interval_hours
0,1704067200000,42283.58,42554.57,42261.02,42475.23,1271.68108,1704070799999,53957250.0,47134,682.57581,28957420.0,0,BTC-USDT,1h,binance,2024-01-01 00:00:00+00:00,68.0201,0.000374,8
1,1704070800000,42475.23,42775.0,42431.65,42613.56,1196.37856,1704074399999,50984890.0,50396,712.32227,30355650.0,0,BTC-USDT,1h,binance,2024-01-01 01:00:00+00:00,67.7106,0.000374,8
2,1704074400000,42613.57,42638.41,42500.0,42581.1,685.2198,1704077999999,29167380.0,29863,288.98864,12301020.0,0,BTC-USDT,1h,binance,2024-01-01 02:00:00+00:00,67.5697,0.000374,8
3,1704078000000,42581.09,42586.64,42230.08,42330.49,794.80391,1704081599999,33709050.0,38620,356.37209,15113000.0,0,BTC-USDT,1h,binance,2024-01-01 03:00:00+00:00,68.0067,0.000374,8
4,1704081600000,42330.5,42399.99,42209.46,42399.99,715.4176,1704085199999,30271620.0,36038,371.12012,15703620.0,0,BTC-USDT,1h,binance,2024-01-01 04:00:00+00:00,68.0199,0.000374,8
