In [2]:
import sys, os
from pathlib import Path
import yaml
import json

# Machine-specific locations of the project checkout and the market-data store.
PROJECT_ROOT = Path(r"C:\Users\quantbase\Desktop\sydata")
SRC = PROJECT_ROOT / "src"
DATA_ROOT = Path(r"C:\Users\quantbase\Desktop\marketdata")
MANIFEST = DATA_ROOT / "meta" / "symbols.yml"

# Make `from sydata...` importable.
# Any earlier copy of the entry is removed first, so re-running this cell leaves
# exactly one SRC entry at the front instead of stacking duplicates.
_src_entry = str(SRC)
sys.path[:] = [entry for entry in sys.path if entry != _src_entry]
sys.path.insert(0, _src_entry)

# Make relative paths (scripts/, etc.) resolve predictably
os.chdir(str(PROJECT_ROOT))

# Quick sanity report of the interpreter and path setup.
print("python:", sys.executable)
print("cwd:", Path.cwd())
print("sys.path[0]:", sys.path[0])
print("SRC exists:", SRC.exists())


python: c:\Users\quantbase\.conda\envs\sydata-311\python.exe
cwd: C:\Users\quantbase\Desktop\sydata
sys.path[0]: C:\Users\quantbase\Desktop\sydata\src
SRC exists: True


In [3]:
import os, sys
from pathlib import Path

# Machine-specific project checkout; relative paths below resolve against it.
PROJECT_ROOT = Path(r"C:\Users\quantbase\Desktop\sydata")
SRC = PROJECT_ROOT / "src"
os.chdir(str(PROJECT_ROOT))
# Set on the kernel's environment so processes launched from it see SRC on PYTHONPATH.
os.environ["PYTHONPATH"] = str(SRC)

DATA_ROOT = Path(r"C:\Users\quantbase\Desktop\marketdata")
# Derived from DATA_ROOT (same file as before) instead of repeating the absolute
# path as a plain string, so the two cannot drift apart and MANIFEST is a Path
# just like in the earlier setup cell.
MANIFEST = DATA_ROOT / "meta" / "symbols.yml"


In [4]:
import subprocess  # no installation needed

In [15]:
# Command-line options handed to the ingest script.
script_args = [
    "--data-root", DATA_ROOT,
    "--manifest", MANIFEST,
    "--basket", "core_major",
    "--start", "2025-01-01",
    "--end", "2025-12-31",
]
# The script path is relative, so it resolves against the current working directory.
cmd = [sys.executable, "scripts/ingest_spot_aggtrades.py", *script_args]

# Pipe both streams back as text (this is what capture_output=True expands to).
cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
# Show the exit code plus the tail of each stream.
cp.returncode, cp.stdout[-2000:], cp.stderr[-4000:]

(0,
 'C:\\Users\\quantbase\\Desktop\\marketdata\\meta\\runs\\2026-01-21T042119Z_spot_aggtrades_report.json\n',
 '')

In [16]:

# Load the most recently modified spot-aggTrades run report.
runs_dir = Path(DATA_ROOT) / "meta" / "runs"
report_paths = list(runs_dir.glob("*_spot_aggtrades_report.json"))
if not report_paths:
    # Without this guard, max() on an empty sequence fails with an unhelpful ValueError.
    raise FileNotFoundError(f"no *_spot_aggtrades_report.json files found in {runs_dir}")
latest = max(report_paths, key=lambda p: p.stat().st_mtime)
report = json.loads(latest.read_text())

latest, report["summary"]

(WindowsPath('C:/Users/quantbase/Desktop/marketdata/meta/runs/2026-01-21T042119Z_spot_aggtrades_report.json'),
 {'ok': 84, 'error': 0, 'rows_written': 1481503207})

In [17]:
# Keep only the report entries whose status is "error"; display the count and up to five samples.
errs = list(filter(lambda entry: entry["status"] == "error", report["results"]))
len(errs), errs[:5]


(0, [])

In [18]:
# Keep only the report entries whose status is "ok" and take the first one as a sample.
oks = list(filter(lambda entry: entry["status"] == "ok", report["results"]))
one = oks[0]
# Path of the file recorded for that sample entry.
one["file"]


'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\spot_aggtrades\\symbol=BTC-USDT\\year=2025\\month=01\\part-2025-01.parquet'

In [19]:
import pandas as pd
# Load the sampled part file and summarise its size, time span and first rows.
df = pd.read_parquet(one["file"])
ts_col = df["ts"]
df.shape, ts_col.min(), ts_col.max(), df.head(3)


((50938081, 8),
 Timestamp('2025-01-01 00:00:00.010866+0000', tz='UTC'),
 Timestamp('2025-01-31 23:59:59.923100+0000', tz='UTC'),
                                 ts  agg_trade_id    price      qty  \
 0 2025-01-01 00:00:00.010866+00:00    3358804174  93576.0  0.00136   
 1 2025-01-01 00:00:00.074095+00:00    3358804175  93576.0  0.00366   
 2 2025-01-01 00:00:00.091046+00:00    3358804176  93576.0  0.00786   
 
    is_buyer_maker    symbol    venue         dataset  
 0            True  BTC-USDT  binance  spot_aggtrades  
 1            True  BTC-USDT  binance  spot_aggtrades  
 2            True  BTC-USDT  binance  spot_aggtrades  )

In [None]:
#-------data inspect-------

In [5]:
def list_parts(dataset: str, symbol: str, n: int = 10):
    """Return up to ``n`` part-file paths for one symbol, newest first.

    Searches recursively for ``part-*.parquet`` under
    ``DATA_ROOT/raw/binance/<dataset>/symbol=<symbol>`` and orders the matches
    by file modification time, most recent first.

    Args:
        dataset: Dataset directory name under ``raw/binance``.
        symbol: Symbol used in the ``symbol=<symbol>`` directory name.
        n: Maximum number of paths to return.

    Returns:
        A list of path strings.
    """
    symbol_dir = DATA_ROOT / "raw" / "binance" / dataset / f"symbol={symbol}"

    def _modified_at(path):
        return path.stat().st_mtime

    newest_first = sorted(symbol_dir.rglob("part-*.parquet"), key=_modified_at, reverse=True)
    return list(map(str, newest_first[:n]))

list_parts("spot_aggtrades", "LINK-USDT", n=10)

['C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\spot_aggtrades\\symbol=LINK-USDT\\year=2025\\month=12\\part-2025-12.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\spot_aggtrades\\symbol=LINK-USDT\\year=2025\\month=11\\part-2025-11.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\spot_aggtrades\\symbol=LINK-USDT\\year=2025\\month=10\\part-2025-10.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\spot_aggtrades\\symbol=LINK-USDT\\year=2025\\month=09\\part-2025-09.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\spot_aggtrades\\symbol=LINK-USDT\\year=2025\\month=08\\part-2025-08.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\spot_aggtrades\\symbol=LINK-USDT\\year=2025\\month=07\\part-2025-07.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\spot_aggtrades\\symbol=LINK-USDT\\year=2025\\month=06\\part-2025-06.parquet',
 'C:\\Users\\quantbase\\Desktop\\marketda

In [7]:
import pandas as pd  # already in env - no new install

# Raw string, so backslashes are written once. The previous literal doubled them
# inside an r"" string, which put literal double separators into the path text.
p = r"C:\Users\quantbase\Desktop\marketdata\raw\binance\spot_aggtrades\symbol=LINK-USDT\year=2025\month=12\part-2025-12.parquet"
df = pd.read_parquet(p)

# Shape, column names, dtypes, and the first/last ten rows.
df.shape, df.columns.tolist(), df.dtypes, df.head(10), df.tail(10)


((1104893, 8),
 ['ts',
  'agg_trade_id',
  'price',
  'qty',
  'is_buyer_maker',
  'symbol',
  'venue',
  'dataset'],
 ts                datetime64[ns, UTC]
 agg_trade_id                    Int64
 price                         float64
 qty                           float64
 is_buyer_maker                boolean
 symbol                         object
 venue                          object
 dataset                        object
 dtype: object,
                                 ts  agg_trade_id  price     qty  \
 0 2025-12-01 00:00:03.424403+00:00     282613424  12.95    3.87   
 1 2025-12-01 00:00:05.444611+00:00     282613425  12.95   18.67   
 2 2025-12-01 00:00:06.470195+00:00     282613426  12.96   22.97   
 3 2025-12-01 00:00:06.699224+00:00     282613427  12.95  344.10   
 4 2025-12-01 00:00:06.699295+00:00     282613428  12.95  120.25   
 5 2025-12-01 00:00:06.701957+00:00     282613429  12.95    0.41   
 6 2025-12-01 00:00:06.710381+00:00     282613430  12.94   24.71   
 7 2025-12

In [8]:
# Earliest and latest timestamp in the loaded frame.
ts_col = df["ts"]
ts_col.min(), ts_col.max()


(Timestamp('2025-12-01 00:00:03.424403+0000', tz='UTC'),
 Timestamp('2025-12-31 23:59:56.872729+0000', tz='UTC'))

In [9]:
p = r"C:\Users\quantbase\Desktop\marketdata\norm\bvol_resampled\symbol=BTCBVOLUSDT\bvol_1H.parquet"
# Read the file once and reuse the frame for both previews (it was previously read twice).
bvol_1h = pd.read_parquet(p)
bvol_1h.head(20), bvol_1h.tail(20)

(                          ts     bvol
 0  2023-06-20 00:00:00+00:00  47.5671
 1  2023-06-20 01:00:00+00:00  47.5040
 2  2023-06-20 02:00:00+00:00  47.7328
 3  2023-06-20 03:00:00+00:00  47.7538
 4  2023-06-20 04:00:00+00:00  47.6858
 5  2023-06-20 05:00:00+00:00  47.9582
 6  2023-06-20 06:00:00+00:00  47.9588
 7  2023-06-20 07:00:00+00:00  47.9135
 8  2023-06-20 08:00:00+00:00  47.9136
 9  2023-06-20 09:00:00+00:00  48.0318
 10 2023-06-20 10:00:00+00:00  48.2342
 11 2023-06-20 11:00:00+00:00  48.4916
 12 2023-06-20 12:00:00+00:00  48.8335
 13 2023-06-20 13:00:00+00:00  48.9874
 14 2023-06-20 14:00:00+00:00  49.1983
 15 2023-06-20 15:00:00+00:00  49.1211
 16 2023-06-20 16:00:00+00:00  51.8741
 17 2023-06-20 17:00:00+00:00  52.7536
 18 2023-06-20 18:00:00+00:00  53.5247
 19 2023-06-20 19:00:00+00:00  53.1205,
                              ts     bvol
 21779 2025-12-13 11:00:00+00:00  46.3345
 21780 2025-12-13 12:00:00+00:00  46.3594
 21781 2025-12-13 13:00:00+00:00  46.3647
 21782 2025-

In [10]:
KLINE_DIR = Path(r"C:\Users\quantbase\Desktop\marketdata\raw\binance\klines\symbol=BTC-USDT\interval=1h")
# Pick the most recently modified part file directly with max() rather than
# sorting every match only to take the first element.
# Note: raises ValueError if the directory contains no matching part files.
kfile = max(KLINE_DIR.glob("part-*.parquet"), key=lambda part: part.stat().st_mtime)
k = pd.read_parquet(kfile)
# Shape, column names and the first ten rows.
k.shape, k.columns.tolist(), k.head(10)


((52577, 15),
 ['open_time',
  'open',
  'high',
  'low',
  'close',
  'volume',
  'close_time',
  'quote_volume',
  'trades',
  'taker_buy_base_volume',
  'taker_buy_quote_volume',
  'ignore',
  'symbol',
  'interval',
  'venue'],
        open_time     open     high      low    close      volume  \
 0  1577836800000  7195.24  7196.25  7175.46  7177.02  511.814901   
 1  1577840400000  7176.47  7230.00  7175.71  7216.27  883.052603   
 2  1577844000000  7215.52  7244.87  7211.41  7242.85  655.156809   
 3  1577847600000  7242.66  7245.00  7220.00  7225.01  783.724867   
 4  1577851200000  7225.00  7230.00  7215.03  7217.27  467.812578   
 5  1577854800000  7217.26  7229.76  7216.65  7224.21  344.670596   
 6  1577858400000  7224.24  7236.27  7221.51  7225.62  621.467023   
 7  1577862000000  7225.88  7232.94  7199.11  7209.83  627.344854   
 8  1577865600000  7209.83  7210.00  7180.00  7200.64  915.545974   
 9  1577869200000  7200.29  7210.51  7188.00  7188.77  636.386102   
 
       

In [11]:
#-----------inspect done-----------

In [None]:
#--------single asset test--------

In [5]:
from sydata.providers.binance_data_archive import BinanceDataArchiveClient
from sydata.datasets.spot_aggtrades import SpotAggTradesIngestor

# Build the archive client and an ingestor that uses it with DATA_ROOT.
client = BinanceDataArchiveClient()
ing = SpotAggTradesIngestor(client=client, data_root=DATA_ROOT)

# Single-asset smoke test; arguments are passed positionally exactly as given.
# NOTE(review): presumably (symbol, year, month) - confirm against ingest_month's signature.
target = ("LINK-USDT", 2024, 1)
info = ing.ingest_month(*target)
info


{'symbol': 'LINK-USDT',
 'venue_symbol': 'LINKUSDT',
 'year': 2024,
 'month': 1,
 'rows': 3437082,
 'ts_min': '2024-01-01T00:00:03.800000+00:00',
 'ts_max': '2024-01-31T23:59:59.413000+00:00',
 'file': 'C:\\Users\\quantbase\\Desktop\\marketdata\\raw\\binance\\spot_aggtrades\\symbol=LINK-USDT\\year=2024\\month=01\\part-2024-01.parquet'}

In [6]:
#---------inspect ingested data---------

In [7]:
import pandas as pd

# Read back the file reported by the ingest call and summarise it.
p = Path(info["file"])
df = pd.read_parquet(p)
ts_col = df["ts"]
df.shape, df.columns.tolist(), df.dtypes, df.head(10), ts_col.min(), ts_col.max()


((3437082, 8),
 ['ts',
  'agg_trade_id',
  'price',
  'qty',
  'is_buyer_maker',
  'symbol',
  'venue',
  'dataset'],
 ts                datetime64[ns, UTC]
 agg_trade_id                    Int64
 price                         float64
 qty                           float64
 is_buyer_maker                boolean
 symbol                         object
 venue                          object
 dataset                        object
 dtype: object,
                                 ts  agg_trade_id   price    qty  \
 0 2024-01-01 00:00:03.800000+00:00     217801983  14.940  18.17   
 1 2024-01-01 00:00:04.027000+00:00     217801984  14.940   5.96   
 2 2024-01-01 00:00:04.052000+00:00     217801985  14.941   0.72   
 3 2024-01-01 00:00:04.216000+00:00     217801986  14.940  10.03   
 4 2024-01-01 00:00:04.225000+00:00     217801987  14.940   6.53   
 5 2024-01-01 00:00:04.931000+00:00     217801988  14.940  22.51   
 6 2024-01-01 00:00:05.279000+00:00     217801989  14.938   0.52   
 7 2024-01

In [8]:
#------hourly aggregated data ingestion------

In [9]:
# Derive the per-trade helper columns on a new frame; `df` itself is left untouched.
# - hour: timestamp floored to the hour. The lowercase "h" alias is used because
#   the uppercase "H" alias is deprecated and emits a FutureWarning.
# - signed_qty: qty with sign -1 where is_buyer_maker is True, +1 where it is False.
d = df.assign(
    hour=df["ts"].dt.floor("1h"),
    signed_qty=df["qty"] * df["is_buyer_maker"].map({True: -1.0, False: 1.0}),
)

# One row per hour: total quantity, sum of signed quantity (cvd), and row count.
hourly = (
    d.groupby("hour", as_index=False)
     .agg(sum_qty=("qty", "sum"), cvd=("signed_qty", "sum"), trades=("qty", "size"))
)
hourly.head(10), hourly.tail(10), hourly.shape


  d["hour"] = d["ts"].dt.floor("1H")


(                       hour    sum_qty       cvd  trades
 0 2024-01-01 00:00:00+00:00  149564.96  -6511.92    3283
 1 2024-01-01 01:00:00+00:00  113669.77  -3746.79    1950
 2 2024-01-01 02:00:00+00:00   75408.84   6854.32    1544
 3 2024-01-01 03:00:00+00:00  124375.49 -22274.53    2702
 4 2024-01-01 04:00:00+00:00  132771.96  -8948.26    3150
 5 2024-01-01 05:00:00+00:00  145314.34 -24061.30    2191
 6 2024-01-01 06:00:00+00:00   62480.65  -3496.97    1538
 7 2024-01-01 07:00:00+00:00   94893.65  14515.77    1918
 8 2024-01-01 08:00:00+00:00  120408.41  -3879.89    2340
 9 2024-01-01 09:00:00+00:00  128490.00    199.82    2414,
                          hour    sum_qty       cvd  trades
 734 2024-01-31 14:00:00+00:00  344590.97   3054.15    5924
 735 2024-01-31 15:00:00+00:00  511285.10 -30722.98    8519
 736 2024-01-31 16:00:00+00:00  293366.72 -22620.12    4595
 737 2024-01-31 17:00:00+00:00  231098.08 -28795.08    3458
 738 2024-01-31 18:00:00+00:00  221981.74 -29422.78    3137
 

In [10]:
#---------consistency with price/klines data---------

In [11]:
import pandas as pd
from pathlib import Path

# Read every 1h kline part for the symbol; the month of interest is selected below.
KLINE_DIR = Path(r"C:\Users\quantbase\Desktop\marketdata\raw\binance\klines\symbol=LINK-USDT\interval=1h")
kfiles = sorted(KLINE_DIR.glob("part-*.parquet"))
k = pd.concat(list(map(pd.read_parquet, kfiles)), ignore_index=True)

# open_time is interpreted as epoch milliseconds and converted to a UTC timestamp column.
k = k.assign(ts=pd.to_datetime(k["open_time"], unit="ms", utc=True)).sort_values("ts")

# Rows with 2024-01-01 <= ts < 2024-02-01, reduced to the join key and the kline volume.
in_month = (k["ts"] >= "2024-01-01") & (k["ts"] < "2024-02-01")
k_m = k.loc[in_month, ["ts", "volume"]].rename(columns={"ts": "hour"})
# Inner join: only hours present on both sides are compared.
m = pd.merge(hourly, k_m, on="hour", how="inner")

# Difference between aggregated trade quantity and the kline-reported volume.
m["qty_minus_kline_vol"] = m["sum_qty"] - m["volume"]
m[["hour","sum_qty","volume","qty_minus_kline_vol"]].head(20), m["qty_minus_kline_vol"].abs().describe()


(                        hour    sum_qty     volume  qty_minus_kline_vol
 0  2024-01-01 00:00:00+00:00  149564.96  149566.19                -1.23
 1  2024-01-01 01:00:00+00:00  113669.77  113669.77                 0.00
 2  2024-01-01 02:00:00+00:00   75408.84   75408.84                 0.00
 3  2024-01-01 03:00:00+00:00  124375.49  124375.49                 0.00
 4  2024-01-01 04:00:00+00:00  132771.96  132771.96                 0.00
 5  2024-01-01 05:00:00+00:00  145314.34  145314.34                 0.00
 6  2024-01-01 06:00:00+00:00   62480.65   62480.65                 0.00
 7  2024-01-01 07:00:00+00:00   94893.65   94893.65                 0.00
 8  2024-01-01 08:00:00+00:00  120408.41  120408.41                 0.00
 9  2024-01-01 09:00:00+00:00  128490.00  128490.00                 0.00
 10 2024-01-01 10:00:00+00:00   76219.80   76219.80                 0.00
 11 2024-01-01 11:00:00+00:00   66138.08   66138.08                 0.00
 12 2024-01-01 12:00:00+00:00   72395.71   72395.71