In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_ta as ta

In [2]:
df = pd.read_parquet("../data/coin-data-hourly.parquet")

In [3]:
df

Unnamed: 0,Date,Close,High,Low,Open,Volume,Symbol
0,2024-01-20 00:00:00,41480.117188,41640.230469,41446.824219,41624.589844,0,BTC-USD
1,2024-01-20 01:00:00,41604.789062,41646.871094,41469.070312,41469.070312,0,BTC-USD
2,2024-01-20 02:00:00,41575.363281,41622.125000,41522.097656,41622.125000,0,BTC-USD
3,2024-01-20 03:00:00,41624.578125,41624.578125,41512.343750,41589.242188,0,BTC-USD
4,2024-01-20 04:00:00,41585.972656,41630.390625,41562.292969,41611.378906,0,BTC-USD
...,...,...,...,...,...,...,...
17333,2026-01-17 17:00:00,95323.453125,95398.703125,95259.070312,95277.421875,0,BTC-USD
17334,2026-01-17 18:00:00,95382.875000,95428.242188,95330.414062,95330.414062,0,BTC-USD
17335,2026-01-17 19:00:00,95261.351562,95396.429688,95247.890625,95396.429688,0,BTC-USD
17336,2026-01-17 20:00:00,95343.679688,95345.164062,95242.265625,95259.257812,0,BTC-USD


In [4]:
df = pd.read_parquet("../data/coin-data-hourly.parquet")


def add_features(df):
    df = df.copy()

    df.sort_values(["Symbol", "Date"], inplace=True)

    def compute_indicators(group):
        group.ta.ema(length=9, append=True)
        group.ta.ema(length=21, append=True)
        group.ta.ema(length=50, append=True)
        group.ta.rsi(length=14, append=True)
        group.ta.macd(fast=12, slow=26, signal=9, append=True)
        group.ta.bbands(length=20, std=2, append=True)
        group.ta.atr(length=14, append=True)
        group.ta.obv(append=True)
        group.ta.log_return(append=True)
        return group

    def rename_columns(group):
        rename_map = {
            "EMA_9": "EMA_SHORT",
            "EMA_21": "EMA_MID",
            "EMA_50": "EMA_LONG",
            "RSI_14": "RSI",
            "ATRr_14": "ATR",
            "OBV": "OBV",
            "LOGRET_1": "LOG_RET",
            "MACD_12_26_9": "MACD_LINE",
            "MACDh_12_26_9": "MACD_HIST",
            "MACDs_12_26_9": "MACD_SIGNAL",
            "BBU_20_2.0_2.0": "BB_UPPER",
            "BBL_20_2.0_2.0": "BB_LOWER",
            "BBM_20_2.0_2.0": "BB_MID",
            "BBP_20_2.0_2.0": "BB_PERCENT",
            "BBB_20_2.0_2.0": "BB_WIDTH",
            "BBU_20_2.0": "BB_UPPER",
            "BBL_20_2.0": "BB_LOWER",
            "BBM_20_2.0": "BB_MID",
        }

        existing_cols = set(group.columns)
        actual_rename_map = {k: v for k, v in rename_map.items() if k in existing_cols}
        group.rename(columns=actual_rename_map, inplace=True)
        return group

    def custom_features(group):
        group["dist_ema_short"] = (group["Close"] - group["EMA_SHORT"]) / group[
            "EMA_SHORT"
        ]
        group["dist_bb_upper"] = (group["Close"] - group["BB_UPPER"]) / group[
            "BB_UPPER"
        ]
        group["signal_rsi_oversold"] = (group["RSI"] < 30).astype(int)
        group["signal_rsi_overbought"] = (group["RSI"] > 70).astype(int)
        group["signal_macd_bullish"] = (group["MACD_HIST"] > 0).astype(int)

        return group

    def add_lags(group):
        cols_to_lag = ["LOG_RET", "RSI", "MACD_HIST"]
        for col in cols_to_lag:
            if col in group.columns:
                group[f"{col}_lag1"] = group[col].shift(1)
                group[f"{col}_lag2"] = group[col].shift(2)

        return group

    df = df.groupby("Symbol", group_keys=False).apply(compute_indicators)
    df = df.groupby("Symbol", group_keys=False).apply(rename_columns)
    df = df.groupby("Symbol", group_keys=False).apply(custom_features)
    df = df.groupby("Symbol", group_keys=False).apply(add_lags)

    df.dropna(inplace=True)

    return df


df = add_features(df)

# fmt: on

  df = df.groupby("Symbol", group_keys=False).apply(compute_indicators)
  df = df.groupby("Symbol", group_keys=False).apply(rename_columns)
  df = df.groupby("Symbol", group_keys=False).apply(custom_features)
  df = df.groupby("Symbol", group_keys=False).apply(add_lags)


In [5]:
df["next_8h_close"] = df.groupby("Symbol")["Close"].shift(-8)

threshold = 0.001

df["Target"] = (df["next_8h_close"] > df["Close"] * (1 + threshold)).astype(int)

df.dropna(subset=["next_8h_close"], inplace=True)
df.drop(columns=["next_8h_close"], inplace=True)

In [6]:
from sklearn.model_selection import train_test_split

df = df.sort_values(by="Date")

drop_cols = [
    "Open",
    "High",
    "Low",
    "Close",
    "Volume",
    "Target",
    "Date",
    "Symbol",
]

X = df.drop(columns=drop_cols, errors="ignore")
y = df["Target"]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, shuffle=False)
X_test, X_backtest, y_test, y_backtest = train_test_split(
    X_temp, y_temp, test_size=0.5, shuffle=False
)

In [7]:
X_train.columns

Index(['EMA_SHORT', 'EMA_MID', 'EMA_LONG', 'RSI', 'MACD_LINE', 'MACD_HIST',
       'MACD_SIGNAL', 'BB_LOWER', 'BB_MID', 'BB_UPPER', 'BB_WIDTH',
       'BB_PERCENT', 'ATR', 'OBV', 'LOG_RET', 'dist_ema_short',
       'dist_bb_upper', 'signal_rsi_oversold', 'signal_rsi_overbought',
       'signal_macd_bullish', 'LOG_RET_lag1', 'LOG_RET_lag2', 'RSI_lag1',
       'RSI_lag2', 'MACD_HIST_lag1', 'MACD_HIST_lag2'],
      dtype='object')

In [8]:
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score


ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)

params = {
    "n_estimators": 300,
    "max_depth": 3,
    "random_state": 42,
    "learning_rate": 0.005,
    "scale_pos_weight": ratio,
}

model = XGBClassifier(**params)
model.fit(X_train, y_train)


probs = model.predict_proba(X_test)[:, 1]

print(f"{'Threshold':<18} {'Precision':<10} {'Recall':<10} {'Trade Amount':<12}")
print("-" * 55)


for threshold in np.linspace(0.5, 0.6, 9):

    preds = (probs > threshold).astype(int)

    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    trade_count = preds.sum()

    print(f"{threshold:<18} {prec:.2f}       {rec:.2f}       {trade_count:<12}")

Threshold          Precision  Recall     Trade Amount
-------------------------------------------------------
0.5                0.47       0.29       490         
0.5125             0.44       0.23       404         
0.525              0.43       0.15       277         
0.5375             0.43       0.13       235         
0.55               0.46       0.10       167         
0.5625             0.46       0.10       163         
0.575              0.46       0.09       149         
0.5875             0.46       0.08       135         
0.6                0.47       0.07       126         


In [9]:
import optuna
from sklearn.metrics import precision_score, recall_score, average_precision_score

ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)


def objective(trial):

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 4),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.01, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-5, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-5, 10.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "scale_pos_weight": trial.suggest_float(
            "scale_pos_weight", ratio * 0.8, ratio * 1.2
        ),
        "random_state": 42,
        "n_jobs": -1,
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)

    probs = model.predict_proba(X_test)[:, 1]
    score = average_precision_score(y_test, probs)

    return score


study = optuna.create_study(direction="maximize")
print("Starting optimization...")
study.optimize(objective, n_trials=50)

print("-" * 55)
print(f"Best PR-AUC Score: {study.best_value:.4f}")
print("Best Params:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")
print("-" * 55)

print("\nValidating Best Model on Thresholds:")
best_model = XGBClassifier(**study.best_trial.params)
best_model.fit(X_train, y_train)
probs = best_model.predict_proba(X_test)[:, 1]

print(f"{'Threshold':<18} {'Precision':<10} {'Recall':<10} {'Trade Amount':<12}")
print("-" * 55)

for threshold in np.linspace(0.5, 0.75, 11):
    preds = (probs > threshold).astype(int)
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    trade_count = preds.sum()
    print(f"{threshold:<18.2f} {prec:.2f}       {rec:.2f}       {trade_count:<12}")

  from .autonotebook import tqdm as notebook_tqdm
[I 2026-01-18 00:55:32,066] A new study created in memory with name: no-name-dc230ad6-903f-4e0e-bed6-24e1102ab67d


Starting optimization...


[I 2026-01-18 00:55:33,231] Trial 0 finished with value: 0.4669154207283058 and parameters: {'n_estimators': 608, 'max_depth': 2, 'learning_rate': 0.00015155913058335804, 'reg_alpha': 0.042710918823264964, 'reg_lambda': 0.0013327458753297503, 'subsample': 0.642536843068904, 'colsample_bytree': 0.9023346696267714, 'scale_pos_weight': 1.007217694095495}. Best is trial 0 with value: 0.4669154207283058.
[I 2026-01-18 00:55:34,284] Trial 1 finished with value: 0.4686730598190657 and parameters: {'n_estimators': 988, 'max_depth': 4, 'learning_rate': 0.00016413870965368784, 'reg_alpha': 0.6962354001444228, 'reg_lambda': 0.0007739258309552704, 'subsample': 0.9934591269485835, 'colsample_bytree': 0.9390199019608073, 'scale_pos_weight': 1.0886742242624758}. Best is trial 1 with value: 0.4686730598190657.
[I 2026-01-18 00:55:34,971] Trial 2 finished with value: 0.4665806894689837 and parameters: {'n_estimators': 925, 'max_depth': 3, 'learning_rate': 0.00047100101987574143, 'reg_alpha': 0.00697418

-------------------------------------------------------
Best PR-AUC Score: 0.4788
Best Params:
  n_estimators: 949
  max_depth: 2
  learning_rate: 0.007996113745785134
  reg_alpha: 0.4946397755743551
  reg_lambda: 0.00027417142727959373
  subsample: 0.5277011678305367
  colsample_bytree: 0.8718750003473645
  scale_pos_weight: 0.9584380079129838
-------------------------------------------------------

Validating Best Model on Thresholds:
Threshold          Precision  Recall     Trade Amount
-------------------------------------------------------
0.50               0.47       0.34       565         
0.53               0.44       0.21       370         
0.55               0.44       0.14       247         
0.57               0.48       0.10       163         
0.60               0.50       0.08       130         
0.62               0.55       0.07       103         
0.65               0.58       0.05       65          
0.68               0.42       0.01       24          
0.70             

In [10]:
analysis = pd.DataFrame(
    {
        "Symbol": df.loc[X_backtest.index, "Symbol"],
        "Open": df.loc[X_backtest.index, "Open"],
        "Low": df.loc[X_backtest.index, "Low"],
        "High": df.loc[X_backtest.index, "High"],
        "Close": df.loc[X_backtest.index, "Close"],
        "Prob": probs,
        "Date": df.loc[X_backtest.index, "Date"],
    }
)

analysis = analysis.sort_values("Date").reset_index(drop=True)

returns = []
thresholds = np.linspace(0, 1, 20)

for threshold in thresholds:

    TAKE_PROFIT = 0.02
    STOP_LOSS = 0.01
    COMMISSION = 0.001
    INITIAL_CAPITAL = 10000.0
    HOLD_PERIOD = 8

    analysis["Signal"] = (analysis["Prob"] > threshold).astype(int)

    current_capital = INITIAL_CAPITAL
    in_trade = False
    entry_price = 0.0
    exit_step_limit = 0

    signals = analysis["Signal"].shift(1).fillna(0)

    for i in range(len(analysis)):
        row = analysis.iloc[i]

        current_open = row["Open"]
        current_low = row["Low"]
        current_high = row["High"]
        current_close = row["Close"]

        if in_trade:

            sl_price = entry_price * (1 - STOP_LOSS)
            hit_sl = current_low <= sl_price

            tp_price = entry_price * (1 + TAKE_PROFIT)
            hit_tp = current_high >= tp_price

            time_exit = i >= exit_step_limit

            exit_price = 0.0
            execute_exit = False

            if hit_sl:
                exit_price = (
                    min(current_open, sl_price) if current_open < sl_price else sl_price
                )

                execute_exit = True

            elif hit_tp:
                exit_price = (
                    max(current_open, tp_price) if current_open > tp_price else tp_price
                )

                execute_exit = True

            elif time_exit:
                exit_price = current_close
                execute_exit = True

            if execute_exit:
                raw_return = (exit_price - entry_price) / entry_price
                current_capital = current_capital * (1 + raw_return - COMMISSION)
                in_trade = False

        elif not in_trade:
            if signals[i] == 1:
                in_trade = True
                entry_price = current_open
                exit_step_limit = i + HOLD_PERIOD

    total_return = (current_capital - INITIAL_CAPITAL) / INITIAL_CAPITAL
    returns.append(total_return)


plt.figure(figsize=(10, 6))
plt.plot(thresholds, returns, marker="o")
plt.title("Return by Threshold")
plt.xlabel("Threshold")
plt.ylabel("Cumulative Return (%)")
plt.grid(True, alpha=0.3)

ValueError: array length 1728 does not match index length 1729

In [None]:
analysis

Unnamed: 0,Symbol,Open,Low,High,Close,Prob,Date,Signal
0,ETH-USD,3829.650635,3827.897461,3856.018799,3844.226562,0.519278,2025-10-31 01:00:00,0
1,BTC-USD,109594.359375,108690.320312,109641.195312,108804.484375,0.535598,2025-10-31 02:00:00,0
2,ETH-USD,3846.122559,3811.953369,3849.025635,3819.152100,0.519278,2025-10-31 02:00:00,0
3,BTC-USD,108831.820312,108667.062500,109263.820312,109224.187500,0.535598,2025-10-31 03:00:00,0
4,ETH-USD,3820.614502,3813.400879,3833.941162,3826.723877,0.519278,2025-10-31 03:00:00,0
...,...,...,...,...,...,...,...,...
3452,BTC-USD,95211.289062,95064.843750,95219.226562,95077.945312,0.469174,2026-01-17 10:00:00,0
3453,ETH-USD,3294.829102,3292.790771,3302.272705,3301.244873,0.502605,2026-01-17 11:00:00,0
3454,BTC-USD,95066.109375,95029.234375,95268.000000,95255.242188,0.471305,2026-01-17 11:00:00,0
3455,BTC-USD,95267.835938,95199.992188,95312.906250,95205.414062,0.471305,2026-01-17 12:00:00,0
