In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import vectorbt as vbt

In [2]:
DATA_DIR = Path("../data")
BTC_1D = DATA_DIR / "btc" / "btc_usdt_1D.parquet"
BTC_1m = DATA_DIR / "btc" / "btc_usdt_1min.parquet"

In [3]:
df = pd.read_parquet(BTC_1D)
df

Unnamed: 0,time,open,high,low,close,volume
0,2017-08-17 00:00:00+00:00,4261.48,4485.39,4200.74,4285.08,795.150377
1,2017-08-18 00:00:00+00:00,4285.08,4371.52,3938.77,4108.37,1199.888264
2,2017-08-19 00:00:00+00:00,4108.37,4184.69,3850.00,4139.98,381.309763
3,2017-08-20 00:00:00+00:00,4139.98,4211.08,4032.62,4086.29,467.083022
4,2017-08-21 00:00:00+00:00,4069.13,4119.62,3911.79,4016.00,691.743060
...,...,...,...,...,...,...
3108,2026-02-19 00:00:00+00:00,66461.00,67320.00,65631.83,67003.73,14542.269880
3109,2026-02-20 00:00:00+00:00,67003.73,68318.39,66280.20,68020.01,35351.870910
3110,2026-02-21 00:00:00+00:00,68020.00,68698.70,67534.69,67975.93,8032.841310
3111,2026-02-22 00:00:00+00:00,67975.93,68245.00,67190.00,67643.40,8175.134840


In [4]:
df.set_index("time")

Unnamed: 0_level_0,open,high,low,close,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-08-17 00:00:00+00:00,4261.48,4485.39,4200.74,4285.08,795.150377
2017-08-18 00:00:00+00:00,4285.08,4371.52,3938.77,4108.37,1199.888264
2017-08-19 00:00:00+00:00,4108.37,4184.69,3850.00,4139.98,381.309763
2017-08-20 00:00:00+00:00,4139.98,4211.08,4032.62,4086.29,467.083022
2017-08-21 00:00:00+00:00,4069.13,4119.62,3911.79,4016.00,691.743060
...,...,...,...,...,...
2026-02-19 00:00:00+00:00,66461.00,67320.00,65631.83,67003.73,14542.269880
2026-02-20 00:00:00+00:00,67003.73,68318.39,66280.20,68020.01,35351.870910
2026-02-21 00:00:00+00:00,68020.00,68698.70,67534.69,67975.93,8032.841310
2026-02-22 00:00:00+00:00,67975.93,68245.00,67190.00,67643.40,8175.134840


In [5]:
# pyright: reportAttributeAccessIssue=false
df["log_ret_1"] = np.log(df["close"] / df["close"].shift(1))
df["log_ret_5"] = np.log(df["close"] / df["close"].shift(5))
df["log_ret_15"] = np.log(df["close"] / df["close"].shift(15))

df["hl_range"] = (df["high"] - df["low"]) / df["close"]
df["gap_return"] = (df["open"] - df["close"].shift(1)) / df["close"].shift(1)

atr= vbt.ATR.run(
    high=df["high"],
    low=df["low"],
    close=df["close"],
    window=14
)

df["atr_14"] = atr.atr # type: ignore

df["volatility_14"] = df["log_ret_1"].rolling(14).std()
df["volatility_30"] = df["log_ret_1"].rolling(30).std()

rsi = vbt.RSI.run(df["close"], window=14)
df["rsi_14"] = rsi.rsi # type: ignore

macd = vbt.MACD.run(df["close"])

df["macd"] = macd.macd
df["macd_signal"] = macd.signal
df["macd_hist"] = macd.hist

stoch = vbt.STOCH.run(
    high=df["high"],
    low=df["low"],
    close=df["close"]
)

df["stoch_k"] = stoch.percent_k
df["stoch_d"] = stoch.percent_d

ema_fast = vbt.MA.run(df["close"], window=20, ewm=True)
ema_slow = vbt.MA.run(df["close"], window=50, ewm=True)

df["ema_20"] = ema_fast.ma
df["ema_50"] = ema_slow.ma
df["ema_ratio"] = df["ema_20"] / df["ema_50"]

sma_fast = vbt.MA.run(df["close"], window=20)
sma_slow = vbt.MA.run(df["close"], window=50)

df["sma_20"] = sma_fast.ma
df["sma_50"] = sma_slow.ma
df["sma_ratio"] = df["sma_20"] / df["sma_50"]


bb = vbt.BBANDS.run(df["close"], window=20)

df["bb_lower"] = bb.lower
df["bb_middle"] = bb.middle
df["bb_upper"] = bb.upper
df["bb_width"] = (bb.upper - bb.lower) / bb.middle
df["bb_percent"] = (df["close"] - bb.lower) / (bb.upper - bb.lower)

df["volume_change"] = df["volume"].pct_change()

df["vol_regime"] = (
    df["volatility_14"]
    > df["volatility_14"].rolling(50).mean()
).astype(int)


df = df.dropna()
df

Unnamed: 0,time,open,high,low,close,volume,log_ret_1,log_ret_5,log_ret_15,hl_range,...,sma_20,sma_50,sma_ratio,bb_lower,bb_middle,bb_upper,bb_width,bb_percent,volume_change,vol_regime
49,2017-10-05 00:00:00+00:00,4208.59,4355.00,4110.00,4292.43,779.138638,0.019725,-0.019855,0.095876,0.057077,...,4009.9585,4159.1842,0.964121,3472.857060,4009.9585,4547.059940,0.267884,0.762959,-0.102856,0
50,2017-10-06 00:00:00+00:00,4318.99,4417.00,4292.00,4369.00,506.529176,0.017681,-0.002167,0.190829,0.028611,...,4042.6610,4160.8626,0.971592,3501.757775,4042.6610,4583.564225,0.267598,0.801661,-0.349886,0
51,2017-10-07 00:00:00+00:00,4369.00,4479.50,4312.56,4423.00,297.597500,0.012284,0.009769,0.207032,0.037744,...,4078.8115,4167.1552,0.978800,3537.705473,4078.8115,4619.917527,0.265325,0.818042,-0.412477,0
52,2017-10-08 00:00:00+00:00,4425.00,4658.00,4425.00,4640.00,518.462004,0.047896,0.073776,0.204990,0.050216,...,4109.0610,4177.1556,0.983698,3515.985629,4109.0610,4702.136371,0.288667,0.947615,0.742158,0
53,2017-10-09 00:00:00+00:00,4640.00,4889.98,4550.00,4786.95,646.463145,0.031179,0.128766,0.268425,0.071022,...,4152.9065,4191.1688,0.990871,3498.663799,4152.9065,4807.149201,0.315077,0.984563,0.246886,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3108,2026-02-19 00:00:00+00:00,66461.00,67320.00,65631.83,67003.73,14542.269880,0.008133,-0.041214,-0.087980,0.025195,...,70309.8990,82688.7344,0.850296,61950.678403,70309.8990,78669.119597,0.237782,0.302244,-0.061331,1
3109,2026-02-20 00:00:00+00:00,67003.73,68318.39,66280.20,68020.01,35351.870910,0.015054,-0.011875,0.078099,0.029965,...,69773.8450,82272.3538,0.848084,62320.062375,69773.8450,77227.627625,0.213655,0.382353,1.430973,0
3110,2026-02-21 00:00:00+00:00,68020.00,68698.70,67534.69,67975.93,8032.841310,-0.000648,-0.013393,-0.037597,0.017124,...,69324.2310,81831.9698,0.847153,62612.674807,69324.2310,76035.787193,0.193628,0.399554,-0.772775,0
3111,2026-02-22 00:00:00+00:00,67975.93,68245.00,67190.00,67643.40,8175.134840,-0.004904,0.002070,-0.024042,0.015596,...,68769.4705,81372.2776,0.845122,63606.827585,68769.4705,73932.113415,0.150143,0.390941,0.017714,0


In [None]:
# Should be used as target variable for supervised learning models
df["future_ret_1"] = np.log(df["close"].shift(-1) / df["close"])

df = df.dropna()
df

In [None]:
# XGBoost Specific code
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBClassifier

THRESHOLD = 0.01

df["target"] = 0
df.loc[df["future_ret_1"] > THRESHOLD, "target"] = 1
df.loc[df["future_ret_1"] < -THRESHOLD, "target"] = -1

df = df.dropna()

label_map = {-1: 0, 0: 1, 1: 2}
inv_label_map = {0: -1, 1: 0, 2: 1}

df["target_enc"] = df["target"].map(label_map)

FEATURE_COLS = [
    c for c in df.columns
    if c not in ["time", "target", "target_enc", "future_ret_1"]
]

X = df[FEATURE_COLS]
y = df["target_enc"]



tscv = TimeSeriesSplit(n_splits=5)

oof_proba = pd.DataFrame(
    index=X.index,
    columns=[0,1,2],   # class probabilities
    dtype=float
)

for train_idx, test_idx in tscv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train = y.iloc[train_idx]

    model = XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="multi:softprob",  # IMPORTANT
        num_class=3,
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    proba = model.predict_proba(X_test)
    oof_proba.iloc[test_idx] = proba

In [None]:
# Generate trading signals based on predicted probabilities

PROBABILITY_THRESHOLD = 0.55  # tune later

long_signal = oof_proba[2] > PROBABILITY_THRESHOLD
short_signal = oof_proba[0] > PROBABILITY_THRESHOLD

signals = pd.Series(0, index=df.index)
signals[long_signal] = 1
signals[short_signal] = -1

In [None]:
# Backtest the strategy using vectorbt

long_entries = signals == 1
short_entries = signals == -1

long_exits = signals != 1
short_exits = signals != -1

portfolio = vbt.Portfolio.from_signals(
    close=df["close"],
    entries=long_entries,
    exits=long_exits,
    short_entries=short_entries,
    short_exits=short_exits,
    fees=0.001,
    slippage=0.0005,
    freq="1D"
)

In [None]:
portfolio.stats()

In [None]:
portfolio.plot(**dict(width=1960, height=1080)).show()