# Binance Spot & Futures ML (Colab, Drive + .env ready)

This notebook:
- mounts **Google Drive**,
- reads **.env** from `/content/drive/MyDrive/Binance/.env` (vars: `API_KEY`, `Secret_Key`),
- saves reusable **data** to `/content/drive/MyDrive/Binance/data` and **artifacts** to `/content/drive/MyDrive/Binance/artifacts`,
- fetches Binance Spot/USDⓈ-M Futures data, engineers features, trains a **TCN quantile model**, and backtests with basic costs.

> Research only. Crypto is risky.


In [None]:
# Install packages (re-run after runtime reset if needed)
# IPython shell magic: runs pip inside the notebook runtime.
# pandas-ta is optional at runtime: the feature-engineering cell falls back to
# manual RSI/ATR formulas when the import fails.
# NOTE(review): binance-connector is not imported by any visible cell — confirm
# whether it is still needed now that klines come from local CSVs.
!pip -q install --upgrade python-dotenv pandas pandas-ta numpy requests tqdm scikit-learn binance-connector torch


In [None]:
# Mount Google Drive (required for .env path and saving data/artifacts)
# Every path used by later cells lives under /content/drive/MyDrive/Binance,
# so this cell has to run before any of them.
from google.colab import drive
drive.mount('/content/drive')


In [None]:

# Load .env from Google Drive
import os
from dotenv import load_dotenv

# Root folder on the mounted Drive; every other path is derived from it.
BASE_DIR = "/content/drive/MyDrive/Binance"
ENV_PATH = f"{BASE_DIR}/.env"
SPOT_DATA_DIR = f"{BASE_DIR}/spot_data"  # input: one <SYMBOL>.csv per symbol
DATA_DIR = f"{BASE_DIR}/data"
ARTIFACTS_DIR = f"{BASE_DIR}/artifacts"  # output: prepared datasets and backtests

# Output folders are created on demand; input locations must already exist.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# Fail early with an actionable message when the inputs are missing.
if not os.path.isdir(SPOT_DATA_DIR):
    raise FileNotFoundError(f"Expected spot data directory at {SPOT_DATA_DIR}. Please sync the CSV files generated by fetch_spot_ohlcv.py first.")

if not os.path.exists(ENV_PATH):
    raise FileNotFoundError(f"Expected .env at {ENV_PATH}. Please create it with API_KEY and Secret_Key.")

# Missing variables default to "" rather than raising.
# NOTE(review): API_KEY / SECRET_KEY are not referenced by any other visible
# cell — confirm whether the .env requirement is still needed.
load_dotenv(ENV_PATH)
API_KEY = os.getenv("API_KEY", "")
SECRET_KEY = os.getenv("Secret_Key", "")
# Only report whether a key was found; the key itself is never printed.
print("Loaded .env from", ENV_PATH, "(API key present:", bool(API_KEY), ")")
print("Spot data dir:", SPOT_DATA_DIR)


In [None]:

# Imports & config
import os, math, time, json, numpy as np, pandas as pd
from datetime import datetime, timedelta, timezone
from tqdm import tqdm
import torch, torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from typing import List, Dict, Any, Optional

# Reproducibility
# Seeds the NumPy and PyTorch RNGs. Python's `random` module (used for the
# retry jitter) is not seeded here.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# ----------- User config (edit here) -----------
# All window / horizon / gap values below are counted in bars (rows of the
# kline frame), not in wall-clock time.
CFG = {
    "data": {
        "symbol": "BTCUSDT",   # looked up as <symbol>.csv inside SPOT_DATA_DIR
        "market": "spot",      # fetch_klines() rejects anything but "spot"
        "interval": "1h",      # _validate_interval() only accepts "1h"
        "lookback_days": 365,  # trailing days kept, counted back from the newest bar; falsy keeps all
        "use_cache": True      # passed to fetch_klines(), which does not read it
    },
    "features": {
        "rsi_window": 14,
        "atr_window": 14,
        "vol_window": 96,      # rolling window for realised volatility (`rv`)
        "zscore_window": 288   # rolling window for the price z-score
    },
    "labels": {
        "horizon_steps": 12,           # bars ahead used for the forward log-return target
        "quantiles": [0.1, 0.5, 0.9],  # the backtest cell maps model outputs to q10/q50/q90 by position
        "barrier_bps": 10.0            # symmetric barrier in basis points (divided by 10000)
    },
    "model": {
        "hidden_channels": 64,
        "num_layers": 6,       # layer i uses dilation 2**i
        "dropout": 0.1,
        "lr": 1.5e-3,
        "batch_size": 512,
        "epochs": 15
    },
    "train": {
        "val_ratio": 0.2,          # fraction of windowed samples held out at the end
        "purged_gap_steps": 24     # samples skipped between the train and validation ranges
    },
    "backtest": {
        "fee_taker": 0.0004,       # charged per unit of position change
        "slippage_bps": 1.0,       # added to the fee, in basis points
        # NOTE: the backtest cell does not forward the next two values to
        # simulate(); position_from_quantiles() applies its own defaults,
        # which currently hold the same numbers.
        "max_leverage": 2.0,
        "vol_target_ann": 0.20,
        "funding_cost": False      # only has an effect when a `fundingRate` column exists
    },
    "runtime": {
        "device": "cuda" if torch.cuda.is_available() else "cpu"
    }
}
print('Using device:', CFG["runtime"]["device"])


In [None]:
# Utility: simple retry decorator
import random
def retry(backoff: float = 0.5, tries: int = 5, jitter: float = 0.2, exceptions=(Exception,)):
    """Build a decorator that retries a call with jittered exponential backoff.

    Args:
        backoff: Base delay in seconds before the first retry; doubled after
            every failed attempt.
        tries: Total number of attempts (including the first one). Must be >= 1.
        jitter: Relative jitter; each sleep is scaled by a random factor in
            ``[1 - jitter, 1 + jitter]``.
        exceptions: Exception class or tuple of classes that trigger a retry.
            Anything else propagates immediately.

    Returns:
        A decorator. The wrapped function returns whatever the original
        returns and re-raises the last exception once ``tries`` is exhausted.

    Raises:
        ValueError: If ``tries`` is smaller than 1 (the wrapped function would
            otherwise never be called and the wrapper would return ``None``).
    """
    import functools  # local import: this module is not imported at file level

    if tries < 1:
        raise ValueError(f"tries must be >= 1, got {tries}")

    def decorator(func):
        @functools.wraps(func)  # keep the wrapped function's name/docstring
        def wrapper(*args, **kwargs):
            delay = backoff
            for attempt in range(1, tries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    if attempt == tries:
                        raise
                    # Never sleep less than 50 ms, even with a tiny backoff.
                    time.sleep(max(0.05, delay * (1.0 + random.uniform(-jitter, jitter))))
                    delay *= 2.0
        return wrapper
    return decorator


In [None]:

# Local spot data utilities

def _interval_to_timedelta(interval: str) -> pd.Timedelta:
    try:
        value = int(interval[:-1])
        unit = interval[-1]
    except (ValueError, IndexError):
        raise ValueError(f"Unsupported interval format: {interval}")
    unit_map = {"m": "m", "h": "h", "d": "d"}
    if unit not in unit_map:
        raise ValueError(f"Unsupported interval unit: {interval}")
    return pd.to_timedelta(value, unit=unit_map[unit])

def _validate_interval(interval: str) -> None:
    if interval != "1h":
        raise ValueError(f"spot_data currently contains hourly klines. Requested interval '{interval}' is not available.")


In [None]:

# Fetch orchestrators using local CSV cache
# Columns every spot CSV must provide; extra columns are kept as-is.
SPOT_CSV_COLUMNS = ["timestamp", "open", "high", "low", "close", "volume"]

def _load_spot_csv(symbol: str) -> pd.DataFrame:
    """Read ``<SPOT_DATA_DIR>/<symbol>.csv`` and normalise its dtypes.

    ``timestamp`` is parsed as a UTC-aware datetime and the OHLCV columns are
    cast to float. Rows are returned in file order.

    Raises:
        FileNotFoundError: If the CSV for ``symbol`` does not exist.
        ValueError: If any of ``SPOT_CSV_COLUMNS`` is missing from the file.
    """
    csv_path = os.path.join(SPOT_DATA_DIR, f"{symbol}.csv")
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"Missing CSV for {symbol} at {csv_path}. Run fetch_spot_ohlcv.py to populate spot_data.")

    raw = pd.read_csv(csv_path)
    missing = [name for name in SPOT_CSV_COLUMNS if name not in raw.columns]
    if missing:
        raise ValueError(f"CSV for {symbol} is missing columns: {missing}")

    numeric_dtypes = {name: float for name in ("open", "high", "low", "close", "volume")}
    # assign() works on a copy, so the frame read from disk stays untouched.
    parsed = raw.assign(timestamp=pd.to_datetime(raw["timestamp"], utc=True))
    return parsed.astype(numeric_dtypes)

def fetch_klines(symbol: str, market: str, interval: str, lookback_days: int, use_cache: bool = True) -> pd.DataFrame:
    """Load klines for ``symbol`` from the local spot CSVs in Binance column layout.

    Args:
        symbol: Trading pair; selects ``<symbol>.csv``.
        market: Must be ``"spot"`` (case-insensitive).
        interval: Kline interval; only the value accepted by
            ``_validate_interval`` is allowed.
        lookback_days: Number of trailing days to keep, counted back from the
            newest ``open_time`` in the file. A falsy value keeps everything.
        use_cache: Accepted for signature compatibility; not read here.

    Returns:
        A frame sorted by ``open_time`` with the twelve kline columns. Columns
        the CSV does not provide (``qav``, ``n_trades``, ``tbbv``, ``tbqv``,
        ``ignore``) are filled with NaN.

    Raises:
        ValueError: For a non-spot market or an unsupported interval.
    """
    if market.lower() != "spot":
        raise ValueError("Local spot CSVs only support market='spot'.")
    _validate_interval(interval)

    klines = _load_spot_csv(symbol).rename(columns={"timestamp": "open_time"})
    klines["close_time"] = klines["open_time"] + _interval_to_timedelta(interval)

    # Pad the fields the local CSVs do not carry so the layout matches the API.
    for extra in ("qav", "n_trades", "tbbv", "tbqv", "ignore"):
        if extra not in klines.columns:
            klines[extra] = np.nan

    layout = [
        "open_time", "open", "high", "low", "close", "volume",
        "close_time", "qav", "n_trades", "tbbv", "tbqv", "ignore",
    ]
    klines = klines[layout].sort_values("open_time").reset_index(drop=True)

    if not lookback_days:
        return klines
    oldest_allowed = klines["open_time"].max() - timedelta(days=lookback_days)
    recent = klines[klines["open_time"] >= oldest_allowed]
    return recent.reset_index(drop=True)

def fetch_futures_metrics(*args, **kwargs) -> Dict[str, pd.DataFrame]:
    """Placeholder that always fails: futures metrics need the live API.

    Any arguments are accepted and ignored.

    Raises:
        ValueError: Always.
    """
    reason = "Futures metrics are not available when using local spot CSVs."
    raise ValueError(reason)


In [None]:
# Feature engineering
def _safe_import_pandas_ta():
    """Return the ``pandas_ta`` module, or ``None`` when it cannot be imported."""
    try:
        import pandas_ta as ta
        return ta
    except Exception:
        # Deliberately broad: any import-time failure selects the manual
        # RSI/ATR fallback in add_price_features().
        return None

def add_price_features(df: pd.DataFrame, rsi_window=14, atr_window=14, vol_window=96, zscore_window=288,
                       bars_per_year: Optional[float] = None) -> pd.DataFrame:
    """Append return, volatility, z-score, RSI, ATR and taker-ratio columns.

    Args:
        df: Kline frame with at least ``open_time``, ``high``, ``low`` and
            ``close``. The input is not modified.
        rsi_window: Look-back (in bars) for RSI.
        atr_window: Look-back (in bars) for ATR.
        vol_window: Rolling window (in bars) for realised volatility ``rv``.
        zscore_window: Rolling window (in bars) for the price z-score.
        bars_per_year: Number of bars per year used to annualise ``rv``. When
            ``None`` it is inferred from the median spacing of ``open_time``;
            if that is not possible the previous 1-minute constant
            (60*24*365) is used.

    Returns:
        A copy of ``df`` sorted by ``open_time`` with the added columns
        ``ret_1``, ``ret_5``, ``ret_20``, ``rv``, ``ma_long``, ``ma_std``,
        ``price_z``, ``rsi``, ``atr``, ``taker_ratio_quote`` and
        ``taker_ratio_base``.
    """
    df = df.copy().sort_values('open_time')
    df['close'] = df['close'].astype(float)
    log_close = np.log(df['close'])
    df['ret_1'] = log_close.diff()
    df['ret_5'] = log_close.diff(5)
    df['ret_20'] = log_close.diff(20)

    # Annualise with the real bar frequency. The former hard-coded factor
    # assumed 1-minute bars and overstated hourly volatility by sqrt(60).
    if bars_per_year is None:
        bars_per_year = 60 * 24 * 365
        if len(df) > 1:
            step = df['open_time'].diff().median()
            if isinstance(step, pd.Timedelta) and step > pd.Timedelta(0):
                bars_per_year = pd.Timedelta(days=365) / step

    # min_periods may not exceed the window, otherwise pandas raises.
    vol_min = min(vol_window, max(10, vol_window // 2))
    z_min = min(zscore_window, max(20, zscore_window // 2))

    df['rv'] = df['ret_1'].rolling(vol_window, min_periods=vol_min).std() * np.sqrt(bars_per_year)

    df['ma_long'] = df['close'].rolling(zscore_window, min_periods=z_min).mean()
    df['ma_std']  = df['close'].rolling(zscore_window, min_periods=z_min).std()
    df['price_z'] = (df['close'] - df['ma_long']) / (df['ma_std'] + 1e-9)

    ta = _safe_import_pandas_ta()
    if ta is not None:
        df['rsi'] = ta.rsi(df['close'], length=rsi_window)
        df['atr'] = ta.atr(high=df['high'], low=df['low'], close=df['close'], length=atr_window)
    else:
        # Manual fallback using simple rolling means.
        delta = df['close'].diff()
        gain = (delta.clip(lower=0)).rolling(rsi_window).mean()
        loss = (-delta.clip(upper=0)).rolling(rsi_window).mean()
        rs = gain / (loss + 1e-9)
        df['rsi'] = 100 - (100 / (1 + rs))
        prev_close = df['close'].shift()
        tr = np.maximum(df['high'] - df['low'],
                        np.maximum(abs(df['high'] - prev_close), abs(df['low'] - prev_close)))
        df['atr'] = tr.rolling(atr_window).mean()

    # Taker ratios stay NaN when the source does not carry taker volumes.
    if 'tbqv' in df.columns and 'qav' in df.columns:
        df['taker_ratio_quote'] = df['tbqv'] / (df['qav'] + 1e-9)
    else:
        df['taker_ratio_quote'] = np.nan

    if 'tbbv' in df.columns and 'volume' in df.columns:
        df['taker_ratio_base'] = df['tbbv'] / (df['volume'] + 1e-9)
    else:
        df['taker_ratio_base'] = np.nan

    return df

def merge_futures_metrics(df: pd.DataFrame, metrics: dict) -> pd.DataFrame:
    """As-of join optional futures metric frames onto the kline frame.

    Each entry of ``metrics`` (keys ``funding``, ``oi``, ``taker``, ``basis``)
    is skipped when it is absent, ``None`` or empty. Otherwise its
    ``timestamp`` column is renamed to ``open_time`` and the listed value
    columns are attached with a backward as-of merge, so every bar receives
    the most recent metric at or before its ``open_time``.

    Returns:
        A copy of ``df`` sorted by ``open_time`` with the metric columns added.
    """
    wanted_columns = (
        ("funding", ['fundingRate']),
        ("oi", ['sumOpenInterest', 'sumOpenInterestValue']),
        ("taker", ['buySellRatio', 'buyVol', 'sellVol']),
        ("basis", ['basis', 'basisRate', 'annualizedBasisRate', 'futuresPrice', 'indexPrice']),
    )
    merged = df.copy().sort_values('open_time')
    for key, value_cols in wanted_columns:
        frame = metrics.get(key)
        if frame is None or frame.empty:
            continue
        renamed = frame.rename(columns={'timestamp': 'open_time'})
        right = renamed[['open_time'] + value_cols].sort_values('open_time')
        merged = pd.merge_asof(merged, right, on='open_time', direction='backward')
    return merged

def finalize_features(df: pd.DataFrame) -> pd.DataFrame:
    """Sort, forward-fill and drop incomplete rows to get a dense feature frame.

    Columns that contain no data at all are removed first. ``fetch_klines``
    pads fields the local CSVs lack with NaN, and a plain ``dropna()`` on
    such a frame would delete every row.

    Args:
        df: Feature frame with an ``open_time`` column. Not modified.

    Returns:
        A new frame sorted by ``open_time`` with a fresh index, without
        all-NaN columns and without rows that still contain NaN after
        forward-filling. An empty input is returned (sorted) unchanged.
    """
    df = df.sort_values('open_time').reset_index(drop=True)
    if df.empty:
        # With zero rows every column counts as "all NaN"; keep the schema.
        return df

    value_cols = [col for col in df.columns if col not in ('open_time', 'close_time')]
    empty_cols = [col for col in value_cols if df[col].isna().all()]
    if empty_cols:
        df = df.drop(columns=empty_cols)

    for col in value_cols:
        if col not in empty_cols:
            df[col] = df[col].ffill()
    # Remaining NaNs are warm-up rows of the rolling features.
    df = df.dropna().reset_index(drop=True)
    return df


In [None]:
# Labels
def make_targets(df: pd.DataFrame, horizon_steps: int, quantiles: List[float]) -> pd.DataFrame:
    """Add the forward log-return target and one target column per quantile.

    Args:
        df: Frame with a ``close`` column. Not modified.
        horizon_steps: Number of rows to look ahead.
        quantiles: Quantile levels in (0, 1); each yields a column named
            ``y_qNN`` where ``NN`` is the level in percent.

    Returns:
        A copy of ``df`` with ``y = log(close[t + horizon] / close[t])`` and
        the ``y_qNN`` columns. Every ``y_qNN`` is a copy of ``y``. The last
        ``horizon_steps`` rows have NaN targets.
    """
    df = df.copy()
    df['y'] = np.log(df['close'].shift(-horizon_steps) / df['close'])
    for q in quantiles:
        # round() before int(): 0.29 * 100 is 28.999..., which int() would
        # truncate to 28 and mislabel the column.
        df[f'y_q{int(round(q * 100)):02d}'] = df['y']
    return df

def triple_barrier_meta(df: pd.DataFrame, horizon_steps: int, barrier_bps: float = 10.0) -> pd.DataFrame:
    """Label each bar by which symmetric price barrier is touched within the horizon.

    For bar ``i`` the next ``horizon_steps`` closes are compared with
    ``close[i] * (1 ± barrier_bps / 10000)``:

    * ``1``  – only the upper barrier is touched,
    * ``-1`` – only the lower barrier is touched,
    * ``0``  – neither or both are touched.

    The final ``horizon_steps`` bars are never evaluated and keep label ``0``.

    Returns:
        A copy of ``df`` with an integer ``meta_label`` column.
    """
    out = df.copy()
    closes = out['close'].values
    up_mult = 1.0 + barrier_bps/10000.0
    dn_mult = 1.0 - barrier_bps/10000.0

    meta = np.zeros(len(out), dtype=int)
    for start in range(len(out) - horizon_steps):
        entry = closes[start]
        future = closes[start + 1:start + horizon_steps + 1]
        touched_up = (future >= entry * up_mult).any()
        touched_dn = (future <= entry * dn_mult).any()
        if touched_up and touched_dn:
            continue  # ambiguous path keeps the neutral label
        if touched_up:
            meta[start] = 1
        elif touched_dn:
            meta[start] = -1

    out['meta_label'] = meta
    return out


In [None]:
# Model: TCN Quantile + Dataset/Scaler
class StandardScaler:
    """Minimal column-wise standardiser: subtract the mean, divide by the std.

    ``mean_`` and ``std_`` are ``None`` until ``fit`` has been called; after
    that they have shape ``(1, n_features)``.
    """

    def __init__(self):
        self.mean_ = None
        self.std_ = None

    def fit(self, X: np.ndarray):
        """Store per-column statistics of ``X``; returns ``None``."""
        column_mean = X.mean(axis=0, keepdims=True)
        # The small epsilon keeps constant columns from dividing by zero.
        column_std = X.std(axis=0, keepdims=True) + 1e-9
        self.mean_ = column_mean
        self.std_ = column_std

    def transform(self, X: np.ndarray):
        """Standardise ``X`` with the stored statistics."""
        centered = X - self.mean_
        return centered / self.std_

    def fit_transform(self, X: np.ndarray):
        """Fit on ``X`` and return the standardised ``X``."""
        self.fit(X)
        return self.transform(X)

class TSWindowDataset(Dataset):
    """Sliding-window dataset over a feature matrix and an aligned target vector.

    Sample ``idx`` pairs the ``window`` rows ``X[idx : idx + window]`` with
    the target of the row right after the window, ``y[idx + window]``.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray, window: int):
        self.X = X
        self.y = y
        self.window = window

    def __len__(self):
        usable = self.X.shape[0] - self.window
        return usable if usable > 0 else 0

    def __getitem__(self, idx):
        stop = idx + self.window
        features = self.X[idx:stop]
        target = self.y[stop]
        # Transpose to (channels, time), the layout Conv1d expects.
        features_t = torch.from_numpy(features.T.astype(np.float32))
        return features_t, torch.tensor(np.float32(target))

class Chomp1d(nn.Module):
    """Trim the last ``chomp_size`` time steps of a ``(batch, channels, time)`` tensor.

    Used after a symmetrically padded convolution to make it causal. A
    ``chomp_size`` of 0 (e.g. ``kernel_size == 1``) returns the input
    unchanged; slicing with ``[:-0]`` would otherwise yield an empty tensor.
    """

    def __init__(self, chomp_size):
        super().__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        if self.chomp_size == 0:
            return x
        return x[:, :, :-self.chomp_size].contiguous()

class TemporalBlock(nn.Module):
    """Residual block of two dilated causal convolutions.

    Each convolution is padded by ``(kernel_size - 1) * dilation`` and then
    trimmed by the same amount, followed by ReLU and dropout. A 1x1
    convolution projects the residual path when the channel counts differ.
    """

    def __init__(self, n_inputs, n_outputs, kernel_size, dilation, dropout):
        super().__init__()
        pad = dilation * (kernel_size - 1)
        conv_kwargs = {"padding": pad, "dilation": dilation}

        # First conv -> chomp -> ReLU -> dropout stage.
        self.conv1 = nn.Conv1d(n_inputs, n_outputs, kernel_size, **conv_kwargs)
        self.chomp1 = Chomp1d(pad)
        self.relu1 = nn.ReLU()
        self.drop1 = nn.Dropout(dropout)

        # Second stage, same shape on both sides.
        self.conv2 = nn.Conv1d(n_outputs, n_outputs, kernel_size, **conv_kwargs)
        self.chomp2 = Chomp1d(pad)
        self.relu2 = nn.ReLU()
        self.drop2 = nn.Dropout(dropout)

        # Residual projection is only needed when channel counts differ.
        if n_inputs != n_outputs:
            self.down = nn.Conv1d(n_inputs, n_outputs, 1)
        else:
            self.down = None
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden = self.conv1(x)
        hidden = self.drop1(self.relu1(self.chomp1(hidden)))
        hidden = self.conv2(hidden)
        hidden = self.drop2(self.relu2(self.chomp2(hidden)))
        shortcut = self.down(x) if self.down is not None else x
        return self.relu(hidden + shortcut)

class TCNQuantile(nn.Module):
    """Stack of temporal blocks with a 1x1 head that emits one value per quantile.

    Block ``i`` uses dilation ``2 ** i``. ``forward`` takes a
    ``(batch, in_channels, time)`` tensor and returns the head output at the
    last time step, shaped ``(batch, n_quantiles)``.
    """

    def __init__(self, in_channels, hidden_channels=64, num_layers=6, kernel_size=3, dropout=0.1, n_quantiles=3):
        super().__init__()
        # Only the first block sees the raw feature channels.
        widths = [in_channels] + [hidden_channels] * num_layers
        blocks = [
            TemporalBlock(widths[depth], widths[depth + 1], kernel_size, 2 ** depth, dropout)
            for depth in range(num_layers)
        ]
        self.net = nn.Sequential(*blocks)
        self.head = nn.Conv1d(hidden_channels, n_quantiles, kernel_size=1)

    def forward(self, x):
        per_step = self.head(self.net(x))
        return per_step[:, :, -1]

class PinballLoss(nn.Module):
    """Mean pinball (quantile) loss over a fixed list of quantile levels.

    ``preds`` has one column per quantile level; ``target`` holds one value
    per sample and is compared against every column.
    """

    def __init__(self, quantiles: List[float]):
        super().__init__()
        self.q = torch.tensor(quantiles).view(1, -1)

    def forward(self, preds, target):
        levels = self.q.to(preds.device)
        n_levels = preds.size(1)
        residual = target.view(-1, 1).repeat(1, n_levels) - preds
        # Under-prediction is weighted by q, over-prediction by (1 - q).
        under = levels * residual
        over = (levels - 1) * residual
        return torch.maximum(under, over).mean()

def train_tcn_quantile(
    X, y, window, quantiles, hidden_channels=64, num_layers=6, dropout=0.1,
    lr=1.5e-3, batch_size=512, epochs=15, device="cpu", val_ratio=0.2, purged_gap_steps=24
):
    """Train a ``TCNQuantile`` model with a chronological, purged train/val split.

    Samples are windows produced by ``TSWindowDataset``. The last
    ``val_ratio`` share of samples is used for validation, and
    ``purged_gap_steps`` samples immediately before it are dropped so
    training and validation targets do not overlap in time.

    Args:
        X: Feature matrix, one row per bar.
        y: Target vector aligned with ``X``.
        window: Number of bars per input window.
        quantiles: Quantile levels; one model output per level.
        hidden_channels, num_layers, dropout: Model hyper-parameters.
        lr, batch_size, epochs: Optimisation hyper-parameters.
        device: Torch device string.
        val_ratio: Fraction of samples held out for validation.
        purged_gap_steps: Samples skipped between train and validation.

    Returns:
        ``(model, scaler)``: the model restored to its best-validation
        weights, and the ``StandardScaler`` fitted on the training span only.

    Raises:
        ValueError: If the split leaves no training samples.
    """
    n = X.shape[0]
    n_samples = n - window
    n_val = int(val_ratio * n_samples)
    tr_end = n_samples - n_val - purged_gap_steps
    if tr_end <= 0:
        raise ValueError("Not enough samples for split; reduce val_ratio or purged_gap_steps.")
    tr_idx = list(range(tr_end))
    va_idx = list(range(tr_end + purged_gap_steps, n_samples))

    # Fit scaling statistics on the training span only to avoid look-ahead.
    scaler = StandardScaler()
    scaler.fit(X[:tr_end + window])
    Xs = scaler.transform(X)

    ds = TSWindowDataset(Xs, y, window)
    dl_tr = DataLoader(torch.utils.data.Subset(ds, tr_idx), batch_size=batch_size, shuffle=True)
    dl_va = DataLoader(torch.utils.data.Subset(ds, va_idx), batch_size=batch_size, shuffle=False)

    model = TCNQuantile(in_channels=Xs.shape[1], hidden_channels=hidden_channels, num_layers=num_layers,
                        dropout=dropout, n_quantiles=len(quantiles)).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = PinballLoss(quantiles)

    best = {"val": 1e9, "state": None}
    for ep in range(1, epochs + 1):
        model.train()
        tr_loss = 0.0
        for xb, yb in dl_tr:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad(set_to_none=True)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            tr_loss += loss.item() * xb.size(0)
        tr_loss /= max(1, len(dl_tr.dataset))

        model.eval()
        va_loss = 0.0
        with torch.no_grad():
            for xb, yb in dl_va:
                xb, yb = xb.to(device), yb.to(device)
                va_loss += loss_fn(model(xb), yb).item() * xb.size(0)
        va_loss /= max(1, len(dl_va.dataset))
        print(f"Epoch {ep:02d} | train {tr_loss:.6f} | val {va_loss:.6f}")
        # Without validation samples va_loss is a meaningless 0.0, so no
        # checkpoint is selected and the final weights are kept.
        if va_idx and va_loss < best["val"]:
            best["val"] = va_loss
            # state_dict() returns references to the live tensors, which later
            # optimiser steps overwrite. Clone to keep a real snapshot.
            best["state"] = {k: v.detach().clone() for k, v in model.state_dict().items()}
    if best["state"] is not None:
        model.load_state_dict(best["state"])
    return model, scaler


In [None]:
# Backtest
def position_from_quantiles(q10, q50, q90, rv, vol_target_ann=0.2, max_leverage=2.0):
    """Turn predicted return quantiles into a signed, volatility-scaled position.

    The raw signal is the median forecast divided by the q10–q90 spread,
    clipped to [-3, 3]. It is then scaled by ``vol_target_ann / rv`` and
    clipped to ``±max_leverage``.

    Args:
        q10, q50, q90: Predicted 10%, 50% and 90% return quantiles.
        rv: Annualised realised volatility. If it is not a real number above
            1e-6 (including NaN), a fixed scale of 0.1 is used.
        vol_target_ann: Target annualised volatility.
        max_leverage: Absolute cap on the returned position.

    Returns:
        The position as a Python float; 0.0 when the signal is not finite.
    """
    exp_ret = q50
    # The model does not enforce monotone quantiles. Use the magnitude of the
    # spread so that crossed quantiles (q90 < q10) cannot flip the trade sign.
    iqr = abs(q90 - q10) + 1e-9
    raw = np.clip(exp_ret / iqr, -3.0, 3.0)
    if not np.isfinite(raw):
        return 0.0  # stay flat instead of propagating NaN into the PnL
    vol_scale = (vol_target_ann / rv) if (isinstance(rv, (int, float, np.floating)) and rv > 1e-6) else 0.1
    return float(np.clip(raw * vol_scale, -max_leverage, max_leverage))

def simulate(df: pd.DataFrame, fee_taker=0.0004, slippage_bps=1.0, funding_cost=True,
             vol_target_ann=0.2, max_leverage=2.0, bars_per_year=None):
    """Run a bar-by-bar backtest from predicted quantiles.

    The position computed on bar ``t`` earns the return of bar ``t + 1``
    (``pos_shift`` is the position lagged by one bar). Every change in
    position pays ``fee_taker + slippage_bps / 10000`` per unit traded.

    Args:
        df: Frame with ``q10``, ``q50``, ``q90``, ``rv``, ``ret_1`` and
            ``open_time`` columns. Not modified.
        fee_taker: Proportional fee per unit of turnover.
        slippage_bps: Slippage per unit of turnover, in basis points.
        funding_cost: Charge funding when a ``fundingRate`` column exists,
            pro-rated over an 8-hour funding interval.
        vol_target_ann: Forwarded to ``position_from_quantiles``.
        max_leverage: Forwarded to ``position_from_quantiles``.
        bars_per_year: Bars per year for Sharpe annualisation. When ``None``
            it is inferred from the median ``open_time`` spacing; if that
            fails the previous 5-minute constant (252*24*12) is used.

    Returns:
        ``(frame, metrics)``: the frame carries ``pos``, ``pos_shift``,
        ``pnl`` and ``equity``; ``metrics`` holds ``"Total Return"``,
        ``"Sharpe ~"`` and ``"Max Drawdown"`` (largest peak-to-trough loss
        as a fraction of the running peak).
    """
    df = df.copy().reset_index(drop=True)
    df['pos'] = [
        position_from_quantiles(q10, q50, q90, rv,
                                vol_target_ann=vol_target_ann, max_leverage=max_leverage)
        for q10, q50, q90, rv in zip(df['q10'], df['q50'], df['q90'], df['rv'])
    ]
    df['pos_shift'] = df['pos'].shift().fillna(0.0)

    trades = (df['pos'] - df['pos_shift']).abs()
    trade_cost = trades * (fee_taker + slippage_bps/10000.0)

    df['pnl'] = df['pos_shift'] * df['ret_1'].fillna(0.0)

    if funding_cost and 'fundingRate' in df.columns:
        secs = df['open_time'].diff().dt.total_seconds().fillna(0.0)
        frac = secs / (8*3600.0)
        df['pnl'] -= df['pos_shift'].abs() * df['fundingRate'].fillna(0.0) * frac

    df['pnl'] -= trade_cost
    df['equity'] = (1.0 + df['pnl']).cumprod()

    ret = df['equity'].iloc[-1] - 1.0
    pnl = df['pnl'].fillna(0.0)

    # Annualise with the real bar frequency rather than assuming 5-minute bars.
    if bars_per_year is None:
        bars_per_year = 252 * 24 * 12
        if 'open_time' in df.columns and len(df) > 1:
            step = df['open_time'].diff().median()
            if isinstance(step, pd.Timedelta) and step > pd.Timedelta(0):
                bars_per_year = pd.Timedelta(days=365) / step
    ann_scale = np.sqrt(bars_per_year)
    sr = pnl.mean() / (pnl.std() + 1e-9) * ann_scale

    # Drawdown relative to the running peak, so it reads as a fraction lost.
    peak = df['equity'].cummax()
    mdd = ((peak - df['equity']) / peak).max()
    return df, {"Total Return": float(ret), "Sharpe ~": float(sr), "Max Drawdown": float(mdd)}


In [None]:

# Run pipeline with Drive caching and Drive outputs
# Pull the data settings into short globals; later cells reuse them for file names.
symbol   = CFG["data"]["symbol"]
market   = CFG["data"]["market"]
interval = CFG["data"]["interval"]
lb_days  = CFG["data"]["lookback_days"]
use_cache = CFG["data"]["use_cache"]

lb_desc = f"{lb_days} days" if lb_days else "full history"
print(f"Loading {symbol} {market} {interval} from local spot_data ({lb_desc})...")
df = fetch_klines(symbol, market, interval, lb_days, use_cache=use_cache)

df = add_price_features(df, CFG["features"]["rsi_window"], CFG["features"]["atr_window"],
                        CFG["features"]["vol_window"], CFG["features"]["zscore_window"])

# fetch_klines() already rejects non-spot markets (case-insensitively); this
# is a second, explicit guard for the exact value "futures".
if market == "futures":
    raise ValueError("Futures workflow is disabled when using local spot CSVs.")

# Forward-fill and drop rows that still contain NaN (rolling warm-up rows).
df = finalize_features(df)

# Targets
H = CFG["labels"]["horizon_steps"]
quantiles = CFG["labels"]["quantiles"]
df = make_targets(df, H, quantiles)
df = triple_barrier_meta(df, H, CFG["labels"]["barrier_bps"])
# The last H rows have no forward return, so they are dropped here.
df = df.dropna().reset_index(drop=True)
print("Final dataset size:", len(df))

# Persist the prepared dataset; the tag encodes the look-back used.
prep_tag = f"{lb_days}d" if lb_days else "all"
prep_path = os.path.join(ARTIFACTS_DIR, f"prepared_{symbol}_{market}_{interval}_{prep_tag}.csv")
df.to_csv(prep_path, index=False)
print("Saved prepared dataset to", prep_path)

# Last expression of the cell: the notebook renders it as a preview table.
df.head()


In [None]:
# Train
# Every column except the timestamps, the target `y`, its per-quantile copies
# (`y_q*`) and `meta_label` is used as a model input.
feature_cols = [c for c in df.columns if c not in ('open_time','close_time','y','meta_label') and not c.startswith('y_q')]
X = df[feature_cols].values.astype(np.float32)
y = df['y'].values.astype(np.float32)

# Input window length in bars: at least 64, or four times the label horizon.
window = max(64, CFG["labels"]["horizon_steps"] * 4)

# Returns the trained model and the scaler fitted on the training span; both
# are reused by the inference cell.
model, scaler = train_tcn_quantile(
    X, y, window, quantiles,
    hidden_channels=CFG["model"]["hidden_channels"],
    num_layers=CFG["model"]["num_layers"],
    dropout=CFG["model"]["dropout"],
    lr=CFG["model"]["lr"],
    batch_size=CFG["model"]["batch_size"],
    epochs=CFG["model"]["epochs"],
    device=CFG["runtime"]["device"],
    val_ratio=CFG["train"]["val_ratio"],
    purged_gap_steps=CFG["train"]["purged_gap_steps"]
)


In [None]:
# Inference for backtest + save to Drive
# Score every window with the scaler fitted during training, so inference uses
# exactly the statistics the model was trained with.
from torch.utils.data import DataLoader
dl = DataLoader(TSWindowDataset(scaler.transform(X), y, window), batch_size=1024, shuffle=False)

model.eval()
preds = []
with torch.no_grad():
    for xb, yb in dl:
        xb = xb.to(CFG["runtime"]["device"])
        preds.append(model(xb).cpu().numpy())
preds = np.concatenate(preds, axis=0)

# The backtest maps the model outputs to q10/q50/q90 by position.
if preds.shape[1] != 3:
    raise ValueError(f"Backtest expects exactly 3 quantile outputs (q10, q50, q90); got {preds.shape[1]}.")

# Sample i is labelled with row i + window, so drop the first `window` rows.
df_bt = df.iloc[window:].copy().reset_index(drop=True)
df_bt['q10'] = preds[:,0]; df_bt['q50'] = preds[:,1]; df_bt['q90'] = preds[:,2]

# NOTE: this run covers the whole span, including bars the model was trained on.
bt, metrics = simulate(df_bt,
                       fee_taker=CFG["backtest"]["fee_taker"],
                       slippage_bps=CFG["backtest"]["slippage_bps"],
                       funding_cost=CFG["backtest"]["funding_cost"])

print(json.dumps(metrics, indent=2))

# Also report the validation tail on its own. It matches the validation range
# of train_tcn_quantile (the last int(val_ratio * n_samples) samples).
n_val_bt = int(CFG["train"]["val_ratio"] * len(df_bt))
if n_val_bt > 1:
    _, metrics_val = simulate(df_bt.iloc[len(df_bt) - n_val_bt:],
                              fee_taker=CFG["backtest"]["fee_taker"],
                              slippage_bps=CFG["backtest"]["slippage_bps"],
                              funding_cost=CFG["backtest"]["funding_cost"])
    print("Validation-period metrics:")
    print(json.dumps(metrics_val, indent=2))

# Save the simulated frame (with pos / pnl / equity), not just its inputs.
bt_path = f"{ARTIFACTS_DIR}/bt_{symbol}_{market}_{interval}.csv"
bt.to_csv(bt_path, index=False)
print("Saved backtest CSV to", bt_path)


In [None]:
# Plot equity curve
import matplotlib.pyplot as plt

# Equity over time from the frame returned by simulate(); equity is the
# cumulative product of (1 + pnl).
plt.figure()
plt.plot(bt['open_time'], bt['equity'])
plt.title(f"Equity curve - {symbol} {market} {interval}")
plt.xlabel("Time"); plt.ylabel("Equity")
plt.show()
