In [8]:
import sys, os
from pathlib import Path
import yaml

# Machine-specific locations: the sydata checkout and the market-data store.
PROJECT_ROOT = Path(r"C:\Users\quantbase\Desktop\sydata")
SRC = PROJECT_ROOT / "src"
DATA_ROOT = Path(r"C:\Users\quantbase\Desktop\marketdata")
MANIFEST = DATA_ROOT / "meta" / "symbols.yml"

# Make `from sydata...` importable.
# Remove any earlier copy first so re-running this cell keeps exactly one entry
# (still at position 0) instead of prepending a duplicate on every run.
sys.path[:] = [entry for entry in sys.path if entry != str(SRC)]
sys.path.insert(0, str(SRC))

# Make relative paths (scripts/, etc.) resolve predictably
os.chdir(str(PROJECT_ROOT))

# Quick sanity report of the interpreter and path state.
print("python:", sys.executable)
print("cwd:", Path.cwd())
print("sys.path[0]:", sys.path[0])
print("SRC exists:", SRC.exists())


python: c:\Users\quantbase\.conda\envs\sydata-311\python.exe
cwd: C:\Users\quantbase\Desktop\sydata
sys.path[0]: C:\Users\quantbase\Desktop\sydata\src
SRC exists: True


In [9]:
# Parse the symbol manifest. The encoding is passed explicitly because
# Path.read_text() otherwise decodes with the platform locale codec, which is
# often not UTF-8 on Windows and would mis-decode non-ASCII characters.
spec = yaml.safe_load(MANIFEST.read_text(encoding="utf-8"))

basket_name = "core_major"
basket = spec["baskets"][basket_name]["symbols"]  # <- critical: end at ["symbols"]

# hard fail if you accidentally loaded keys like "core_major"
assert isinstance(basket, list) and all(isinstance(s, str) for s in basket), basket
basket


['BTC-USDT',
 'ETH-USDT',
 'SOL-USDT',
 'BNB-USDT',
 'XRP-USDT',
 'ADA-USDT',
 'LINK-USDT']

In [10]:
import sys, subprocess

# Runtime dependencies, installed with the same interpreter that runs this kernel
# (via `sys.executable -m pip`) so they land in the active environment.
pkgs = ["pandas", "numpy", "pyarrow", "requests", "pyyaml"]
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])


0

In [11]:
#----guards for basket loading----#

import re  # no installation needed

def load_basket(spec: dict, basket_name: str) -> list[str]:
    """Return the validated symbol list of one basket from the parsed manifest.

    Args:
        spec: Parsed manifest; read as ``spec["baskets"][basket_name]["symbols"]``.
        basket_name: Key of the basket under ``spec["baskets"]``.

    Returns:
        The basket's symbol list (the same list object stored in ``spec``).

    Raises:
        KeyError: If ``"baskets"``, the basket name or ``"symbols"`` is missing.
        TypeError: If the ``symbols`` entry is not a list.
        ValueError: If any entry is not a string of the form ``BASE-QUOTE``
            (alphanumeric, exactly one hyphen).
    """
    symbols = spec["baskets"][basket_name]["symbols"]
    if not isinstance(symbols, list):
        raise TypeError(f"Basket '{basket_name}' must be a list, got {type(symbols)}: {symbols}")

    pattern = re.compile(r"[A-Za-z0-9]+-[A-Za-z0-9]+")
    rejected = []
    for entry in symbols:
        # Non-strings are rejected before the regex is ever applied to them.
        is_canonical = isinstance(entry, str) and pattern.fullmatch(entry) is not None
        if not is_canonical:
            rejected.append(entry)

    if rejected:
        raise ValueError(f"Invalid canonical symbols in basket '{basket_name}': {rejected} (expected like 'BTC-USDT')")

    return symbols

In [12]:
#------Multi asset basket-------#

In [13]:
# Rebind `basket` through the validating helper so malformed symbols fail here,
# before any download is attempted.
basket = load_basket(spec, "core_major")

In [14]:
import pandas as pd
from pathlib import Path
from sydata.providers.binance_spot import BinanceSpotClient

# Spot klines client, constructed with no arguments; used by the download loop below.
client = BinanceSpotClient()

def to_ms_utc(ts: str) -> int:
    """Convert a timestamp string to integer epoch milliseconds in UTC.

    Args:
        ts: Anything ``pd.Timestamp`` can parse. Naive values are interpreted
            as UTC; timezone-aware values are converted to UTC.

    Returns:
        Milliseconds since the Unix epoch (sub-millisecond precision is floored).

    Raises:
        ValueError: If ``ts`` parses to ``NaT``. Without this guard the NaT
            sentinel's ``.value`` would be turned into a meaningless integer.
    """
    t = pd.Timestamp(ts)
    if pd.isna(t):
        raise ValueError(f"Cannot convert {ts!r} to epoch milliseconds: not a valid timestamp")
    if t.tzinfo is None:
        t = t.tz_localize("UTC")
    else:
        t = t.tz_convert("UTC")
    # Timestamp.value is nanoseconds since the epoch; floor-divide down to milliseconds.
    return int(t.value // 1_000_000)

# Kline intervals to download for every symbol in the basket.
intervals = ["1h"]  # add more later: ["1m","1h","1d"]
# Requested window as ISO-8601 UTC strings.
# NOTE(review): in the recorded run the last stored bar opens exactly at `end`
# (file names end in 1767225600000), so fetch_klines appears to treat the end
# bound as inclusive - confirm against the client.
start = "2020-01-01T00:00:00Z"
end   = "2026-01-01T00:00:00Z"

# Same bounds as epoch milliseconds, the unit passed to fetch_klines below.
start_ms = to_ms_utc(start)
end_ms   = to_ms_utc(end)

def raw_out_dir(data_root: Path, symbol: str, interval: str) -> Path:
    """Build the raw-klines output directory for one symbol and interval.

    The result is ``<data_root>/raw/binance/klines/symbol=<symbol>/interval=<interval>``.
    Nothing is created on disk; the caller is responsible for ``mkdir``.
    """
    segments = ("raw", "binance", "klines", f"symbol={symbol}", f"interval={interval}")
    return data_root.joinpath(*segments)

# Download every (symbol, interval) pair and store each non-empty result as one
# parquet part named after its first and last bar open times.
written = []
for symbol in basket:
    for interval in intervals:
        df = client.fetch_klines(symbol=symbol, interval=interval, start_ms=start_ms, end_ms=end_ms)
        if not df.empty:
            out_dir = raw_out_dir(DATA_ROOT, symbol, interval)
            out_dir.mkdir(parents=True, exist_ok=True)

            # The first and last open_time of the frame bound the part's file name.
            first_open, last_open = (int(v) for v in df["open_time"].iloc[[0, -1]])
            out_path = out_dir.joinpath(f"part-{first_open}-{last_open}.parquet")

            df.to_parquet(out_path, index=False)
            written.append(str(out_path))

written


['C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\klines\\symbol=BTC-USDT\\interval=1h\\part-1577836800000-1767225600000.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\klines\\symbol=ETH-USDT\\interval=1h\\part-1577836800000-1767225600000.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\klines\\symbol=SOL-USDT\\interval=1h\\part-1597125600000-1767225600000.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\klines\\symbol=BNB-USDT\\interval=1h\\part-1577836800000-1767225600000.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\klines\\symbol=XRP-USDT\\interval=1h\\part-1577836800000-1767225600000.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\klines\\symbol=ADA-USDT\\interval=1h\\part-1577836800000-1767225600000.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\klines\\symbol=LINK-USDT\\interval=1h\\part-1577836800000-1767225600000.parquet']

In [15]:
#-----BVOL Ingest-----#

In [None]:
from datetime import date, timedelta  # stdlib; was missing, so date(...) raised NameError on a fresh kernel

from sydata.providers.binance_bvol_index import BinanceBVOLIndexClient  # project-local

# NOTE: this rebinds `client`, which previously held the spot klines client.
client = BinanceBVOLIndexClient()

symbols = ["BTCBVOLUSDT", "ETHBVOLUSDT"]
start_d = date(2020, 1, 1)
end_d   = date(2026, 1, 1)   # end exclusive: the loop below stops before this date

# Walk the range one calendar day at a time, writing one parquet part per
# (symbol, day). Days for which the client returns an empty frame are skipped.
d = start_d
written = []
while d < end_d:
    for sym in symbols:
        df = client.fetch_day(sym, d)
        if df.empty:
            continue

        out_dir = DATA_ROOT / "raw" / "binance" / "bvol_index" / f"symbol={sym}"
        out_dir.mkdir(parents=True, exist_ok=True)

        out_path = out_dir / f"part-{d.isoformat()}.parquet"
        df.to_parquet(out_path, index=False)
        written.append(str(out_path))
    d += timedelta(days=1)

len(written), written[:3]


(1808,
 ['C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\bvol_index\\symbol=BTCBVOLUSDT\\part-2023-06-20.parquet',
  'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\bvol_index\\symbol=ETHBVOLUSDT\\part-2023-06-20.parquet',
  'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\bvol_index\\symbol=BTCBVOLUSDT\\part-2023-06-21.parquet'])

In [None]:
#--------timeframe BVOL (REWRITE PATHS FOR EVOL)-------#

In [4]:
import pandas as pd  # already in env - no new install
from pathlib import Path  # no installation needed

# Open a single daily BVOL part to inspect its shape and column names.
p = Path(r"C:\Users\quantbase\Desktop\marketdata\raw\binance\bvol_index\symbol=BTCBVOLUSDT\part-2025-12-14.parquet")
df = pd.read_parquet(p)

df.shape, df.columns.tolist()


((21959, 8),
 ['calc_time',
  'symbol',
  'base_asset',
  'quote_asset',
  'index_value',
  'file_date',
  'venue',
  'dataset'])

In [None]:
BVOL_DIR = Path(r"C:\Users\quantbase\Desktop\marketdata\raw\binance\bvol_index\symbol=BTCBVOLUSDT")
parts = sorted(BVOL_DIR.glob("part-*.parquet"))

# Read only the two columns needed downstream from every daily part.
dfs = [pd.read_parquet(part, columns=["calc_time", "index_value"]) for part in parts]

# Concatenate, map to canonical names, parse types, then de-duplicate and index by time.
# calc_time is epoch milliseconds. bvol stays in percent (45.0); divide by 100.0
# here if the decimal form (0.45) is wanted instead.
bv = (
    pd.concat(dfs, ignore_index=True)
    .rename(columns={"calc_time": "ts", "index_value": "bvol"})
    .assign(
        ts=lambda frame: pd.to_datetime(frame["ts"], unit="ms", utc=True),
        bvol=lambda frame: pd.to_numeric(frame["bvol"], errors="coerce"),
    )
    .dropna(subset=["ts", "bvol"])
    .drop_duplicates(subset=["ts"])
    .sort_values("ts")
    .set_index("ts")
)

bv.head(), bv.tail(), bv.shape


(                              bvol
 ts                                
 2023-06-20 00:11:40+00:00  47.3320
 2023-06-20 00:11:41+00:00  47.3321
 2023-06-20 00:11:42+00:00  47.3322
 2023-06-20 00:11:43+00:00  47.3322
 2023-06-20 00:11:44+00:00  47.3323,
                                      bvol
 ts                                       
 2025-12-14 06:05:54+00:00         44.9077
 2025-12-14 06:05:55.006000+00:00  44.9088
 2025-12-14 06:05:56+00:00         44.9100
 2025-12-14 06:05:57.001000+00:00  44.9111
 2025-12-14 06:05:58.009000+00:00  44.9141,
 (77960681, 1))

In [9]:
# Resample the tick-level BVOL series to the bar timeframe `t`

In [27]:
BAR = "1h"          # bar size as a pandas offset alias; change to "4h", "1D", etc.
FFILL_LIMIT = 24    # max consecutive empty bars to carry the last value forward (24 bars = 24 hours at "1h")

# Keep the last observation of each bar, then bridge short gaps only.
# Bars are labelled by their start time: in the recorded output the 00:00 bar
# holds a later value than the first tick of that hour.
bvol_bar = bv["bvol"].resample(BAR).last().ffill(limit=FFILL_LIMIT)

bvol_bar.head(), bvol_bar.tail(), bvol_bar.isna().mean()


(ts
 2023-06-20 00:00:00+00:00    47.5671
 2023-06-20 01:00:00+00:00    47.5040
 2023-06-20 02:00:00+00:00    47.7328
 2023-06-20 03:00:00+00:00    47.7538
 2023-06-20 04:00:00+00:00    47.6858
 Freq: h, Name: bvol, dtype: float64,
 ts
 2025-12-14 02:00:00+00:00    47.1572
 2025-12-14 03:00:00+00:00    47.1043
 2025-12-14 04:00:00+00:00    47.1202
 2025-12-14 05:00:00+00:00    44.7705
 2025-12-14 06:00:00+00:00    44.9141
 Freq: h, Name: bvol, dtype: float64,
 np.float64(0.0011009679343089132))

In [22]:
OUT = Path(r"C:\Users\quantbase\Desktop\marketdata\norm\bvol_resampled\symbol=BTCBVOLUSDT")
OUT.mkdir(parents=True, exist_ok=True)

# One file per bar size; the name embeds BAR exactly as spelled above.
out_path = OUT / f"bvol_{BAR}.parquet"

# reset_index() turns the named series into a two-column frame (ts, bvol) for storage.
bvol_bar.rename("bvol").reset_index().to_parquet(out_path, index=False)

out_path

WindowsPath('C:/Users/quantbase/Desktop/marketdata/norm/bvol_resampled/symbol=BTCBVOLUSDT/bvol_1H.parquet')

In [None]:
# Align the resampled BVOL series to our price (kline) data

In [23]:
# Hourly BTC-USDT klines, keyed by the UTC open time of each bar.
px = pd.read_parquet(r"C:\Users\quantbase\Desktop\marketdata\raw\binance\klines\symbol=BTC-USDT\interval=1h\part-1577836800000-1767225600000.parquet")
px["ts"] = pd.to_datetime(px["open_time"], unit="ms", utc=True)
px = px.sort_values("ts").reset_index(drop=True)

# Right-hand side of the as-of join: one row per BVOL bar, sorted by time.
b = bvol_bar.rename("bvol").reset_index().sort_values("ts")

# Backward as-of join: each price bar takes the most recent BVOL bar at or before it.
# The tolerance caps how stale that match may be. Without it, every price bar after
# the last BVOL bar would inherit the final BVOL value indefinitely, bypassing
# FFILL_LIMIT and reporting bvol_available=1 for stale data.
# pd.Timedelta(BAR) requires BAR to be a fixed-width alias such as "1h" or "1D".
feat = pd.merge_asof(
    px,
    b,
    on="ts",
    direction="backward",
    tolerance=pd.Timedelta(BAR) * FFILL_LIMIT,
)
feat["bvol_available"] = feat["bvol"].notna().astype("int8")


In [25]:
# Read the saved resampled series back as a spot check.
# NOTE(review): the file name is hard-coded as "bvol_1H", while the writer derives
# it from BAR (currently "1h"); the two agree only on a case-insensitive filesystem.
p = r"C:\Users\quantbase\Desktop\marketdata\norm\bvol_resampled\symbol=BTCBVOLUSDT\bvol_1H.parquet"
pd.read_parquet(p).head(27)

Unnamed: 0,ts,bvol
0,2023-06-20 00:00:00+00:00,47.5671
1,2023-06-20 01:00:00+00:00,47.504
2,2023-06-20 02:00:00+00:00,47.7328
3,2023-06-20 03:00:00+00:00,47.7538
4,2023-06-20 04:00:00+00:00,47.6858
5,2023-06-20 05:00:00+00:00,47.9582
6,2023-06-20 06:00:00+00:00,47.9588
7,2023-06-20 07:00:00+00:00,47.9135
8,2023-06-20 08:00:00+00:00,47.9136
9,2023-06-20 09:00:00+00:00,48.0318


#### MISCELLANEOUS / Tests

In [19]:
#--------Single asset-----------#

In [3]:
import pandas as pd
from sydata.providers.binance_spot import BinanceSpotClient  # src path fix in Cell 1 enables this

client = BinanceSpotClient()

# Six-hour smoke-test window, expressed as UTC timestamps.
start = pd.Timestamp("2026-01-01T00:00:00Z")
end   = pd.Timestamp("2026-01-01T06:00:00Z")

# Timestamp.value is in nanoseconds; floor-divide down to the milliseconds fetch_klines takes.
start_ms, end_ms = (int(stamp.value // 1_000_000) for stamp in (start, end))

df = client.fetch_klines(symbol="BTC-USDT", interval="1m", start_ms=start_ms, end_ms=end_ms)
df.head(), df.tail(), df.shape


(       open_time      open      high       low     close   volume  \
 0  1767225600000  87648.21  87648.22  87632.74  87648.00  4.08049   
 1  1767225660000  87648.00  87686.79  87647.99  87686.79  8.53093   
 2  1767225720000  87686.78  87686.79  87679.28  87682.91  4.08878   
 3  1767225780000  87682.91  87701.91  87682.91  87701.90  1.26772   
 4  1767225840000  87701.91  87701.91  87701.90  87701.91  0.47668   
 
       close_time   quote_volume  trades  taker_buy_base_volume  \
 0  1767225659999  357625.776862    1193                1.79879   
 1  1767225719999  747798.775607    1401                7.18527   
 2  1767225779999  358517.369518     938                1.52735   
 3  1767225839999  111167.931008    1070                0.95239   
 4  1767225899999   41805.744653     174                0.29612   
 
    taker_buy_quote_volume ignore    symbol interval    venue  
 0           157651.514951      0  BTC-USDT       1m  binance  
 1           629847.683013      0  BTC-USDT   

In [4]:
out_dir = DATA_ROOT / "raw" / "binance" / "klines" / "symbol=BTC-USDT" / "interval=1m"

# Fail before touching the filesystem, so an empty fetch does not leave an empty
# partition directory behind. The basket download loop checks in the same order.
if df.empty:
    raise RuntimeError("No rows returned; check time window / connectivity.")

out_dir.mkdir(parents=True, exist_ok=True)

# The part file is named after the open times of its first and last bar.
first_open = int(df["open_time"].iloc[0])
last_open  = int(df["open_time"].iloc[-1])
out_path = out_dir / f"part-{first_open}-{last_open}.parquet"

df.to_parquet(out_path, index=False)
out_path


WindowsPath('C:/Users/quantbase/Desktop/marketdata/raw/binance/klines/symbol=BTC-USDT/interval=1m/part-1767225600000-1767247200000.parquet')

In [5]:
import pandas as pd

# Round-trip check: re-read the part that was just written and confirm that no
# rows were lost. Only the row count is compared, not the values.
df2 = pd.read_parquet(out_path)
assert len(df2) == len(df)
df2.dtypes, df2.shape


(open_time                   int64
 open                      float64
 high                      float64
 low                       float64
 close                     float64
 volume                    float64
 close_time                  int64
 quote_volume              float64
 trades                      int64
 taker_buy_base_volume     float64
 taker_buy_quote_volume    float64
 ignore                     object
 symbol                     object
 interval                   object
 venue                      object
 dtype: object,
 (361, 15))

In [6]:
#-----------.py execution-----------#

In [None]:
import os, sys
import subprocess
from pathlib import Path

PROJECT_ROOT = Path(r"C:\Users\quantbase\Desktop\sydata")
SRC = PROJECT_ROOT / "src"
os.chdir(str(PROJECT_ROOT))

# The child process inherits this environment, so the script can import `sydata`.
os.environ["PYTHONPATH"] = str(SRC)

# Run the CLI with an argument list instead of a `!` shell line. The earlier form
# used PowerShell backtick continuations, which IPython does not join, so the
# argument lines were parsed as Python. An argv list also avoids quoting issues.
fetch_cmd = [
    sys.executable, "scripts/binance_fetch_raw_klines.py",
    "--data-root", r"C:\Users\quantbase\Desktop\marketdata",
    "--symbol", "BTC-USDT",
    "--interval", "1m",
    "--start", "2026-01-01T00:00:00Z",
    "--end", "2026-01-01T06:00:00Z",
]

# Capture output so it shows up in the notebook cell rather than the kernel's console.
completed = subprocess.run(fetch_cmd, capture_output=True, text=True)
print(completed.stdout, end="")
if completed.returncode != 0:
    print(completed.stderr, end="", file=sys.stderr)
# Raises CalledProcessError on a non-zero exit so failures are not silently ignored.
completed.check_returncode()
