In [12]:
import random

import pandas as pd

from config import config
from logs.prediction_logger import PredictionLogger
from logs.signal_logger import SignalHistoryLogger
from ploting.plot_signals import plot_backtest_signals
from signals.model_preparation import prepare_features_only
from train_model.signal_model import SignalModel
from train_model.vol_model import VolatilityModel
from validation.bootstrap import bootstrap_accuracy_pvalue
from validation.metrics import print_classification_metrics


In [13]:
def update_signal(signal_logger, signal_type, timestamp, price, confidence_str):
    """Record ``signal_type`` at ``timestamp`` unless it is already present.

    The logger is first asked to drop the opposite signal for the same
    timestamp; the new signal is then added only when ``has_signal`` reports
    that it is not yet stored. ``confidence_str`` is forwarded as the
    ``trigger`` keyword of ``add_signal``.

    Any exception raised by the logger is caught and printed, so a logging
    failure never interrupts the caller. Always returns ``None``.
    """
    try:
        signal_logger.remove_opposite_signal(timestamp, signal_type)

        # Guard clause: nothing to do when the signal is already recorded.
        if signal_logger.has_signal(timestamp, signal_type):
            return

        signal_logger.add_signal(
            signal_type,
            timestamp,
            price,
            trigger=confidence_str,
        )
    except Exception as err:
        print(f"[ERROR] update_signal failed: {err}")


def run_one_day_backtest(csv_path=config.DATA_PATH, seed=None,
                         up_threshold=0.85, down_threshold=0.15):
    """Backtest the saved signal and volatility models on one random day.

    Loads the price CSV, picks one calendar day that has enough rows, builds
    features for that day and walks it bar by bar. For every bar the
    volatility model acts as a gate (its prediction must equal 1). When it
    passes, the second ``predict_proba`` column of the signal model (treated
    here as the probability of an UP move) is compared with the thresholds to
    emit an UP or DOWN prediction. Each prediction is recorded together with
    the close of the signal bar and the close of the following bar. Finally a
    summary, classification metrics, an optional bootstrap p-value and a plot
    are produced.

    Args:
        csv_path: CSV file with a ``timestamp`` column (used as the index) and
            a ``close`` column. Defaults to ``config.DATA_PATH``.
        seed: Optional seed for the day selection. ``None`` (the default)
            draws from the global ``random`` state exactly as before; any
            other value makes the selected day reproducible.
        up_threshold: Minimum UP probability required to emit an UP signal.
        down_threshold: Maximum UP probability at which a DOWN signal is
            emitted.

    Returns:
        None. All results are printed or plotted.
    """
    print("=== Backtest Mode: One Random Day ===")

    min_total_rows = 1440    # minimum rows in the whole file
    min_rows_per_day = 720   # minimum rows for a day to be eligible
    warmup_bars = 5          # first loop position; earlier bars are skipped
    min_feature_rows = 10    # minimum rows left after feature engineering
    min_bootstrap_rows = 20  # minimum predictions for the bootstrap test

    # === Load and preprocess data ===
    df_raw = pd.read_csv(csv_path, index_col="timestamp", parse_dates=True)
    if df_raw.index.tz is None:
        df_raw.index = df_raw.index.tz_localize("UTC")
    df_raw = df_raw.sort_index()

    if len(df_raw) < min_total_rows:
        print("[Backtest] Not enough data to simulate a full day.")
        return

    # === Select a valid day ===
    full_days = df_raw.groupby(df_raw.index.date).count()
    valid_dates = full_days[full_days['close'] >= min_rows_per_day].index

    if len(valid_dates) == 0:
        print("[Backtest] No valid days found.")
        return

    # A dedicated generator keeps a seeded run reproducible without touching
    # the global random state; without a seed the module-level RNG is used.
    rng = random.Random(seed) if seed is not None else random
    selected_day = pd.Timestamp(rng.choice(list(valid_dates)))
    print(f"[Backtest] Selected day: {selected_day.date()}")

    df_day = df_raw[df_raw.index.date == selected_day.date()].copy()
    df_day = prepare_features_only(df_day).dropna()

    if df_day.empty or len(df_day) < min_feature_rows:
        print("[Backtest] No usable data after feature engineering.")
        return

    # === Load trained models ===
    signal_model = SignalModel().load()
    vol_model = VolatilityModel().load()
    vol_features = vol_model.selected_features
    sig_features = signal_model.selected_features

    # === In-memory loggers ===
    prediction_logger = PredictionLogger(autosave=False)
    signal_logger = SignalHistoryLogger()

    # === Counter for passed signals ===
    pass_count = 0

    # At loop position i the signal bar is row i - 1 and the outcome bar is
    # row i. Rows are addressed directly instead of copying a growing window
    # on every iteration.
    for i in range(warmup_bars, len(df_day) - 1):
        try:
            signal_pos = i - 1
            now = df_day.index[signal_pos]
            signal_row = df_day.iloc[[signal_pos]]

            # Features missing from the frame are filled with 0.0.
            vol_input = signal_row.reindex(columns=vol_features, fill_value=0.0)
            sig_input = signal_row.reindex(columns=sig_features, fill_value=0.0)

            # === Volatility filter ===
            if vol_model.final_model.predict(vol_input)[0] != 1:
                continue

            # === Signal prediction ===
            prob = signal_model.final_model.predict_proba(sig_input)[0][1]

            if prob >= up_threshold:
                prediction = "UP"
                signal_type = "xgboost_bullish"
                directional_conf = prob
            elif prob <= down_threshold:
                prediction = "DOWN"
                signal_type = "xgboost_bearish"
                directional_conf = 1 - prob
            else:
                continue

            close_now = df_day['close'].iloc[signal_pos]
            close_next = df_day['close'].iloc[i]

            update_signal(signal_logger, signal_type, now, close_now,
                          f"Conf={directional_conf:.2%}")
            pass_count += 1

            # The logger's `close_now` receives the FOLLOWING bar's close and
            # `close_prev` the signal bar's close -- presumably so the hit is
            # scored on the move after the signal; verify against
            # PredictionLogger. `confidence` stays the raw UP probability
            # because it is later passed to the classification metrics.
            prediction_logger.record_prediction(
                timestamp=now,
                prediction=prediction,
                close_now=close_next,
                close_prev=close_now,
                confidence=prob
            )

        except Exception as e:
            # Best effort: report the failing bar and keep going.
            print(f"[Backtest Error] {e}")
            continue

    # === Summary: Count of passed signals ===
    print(f"\n[Summary] Total signals that passed confidence threshold: {pass_count}")

    # === Evaluation ===
    df_result = prediction_logger.to_dataframe()
    # An empty result frame may have no columns at all; avoid a KeyError.
    if "prediction" in df_result.columns:
        print(df_result['prediction'].value_counts())

    print("\n=== Backtest Results ===")
    print(f"Total predictions: {len(df_result)}")

    if not df_result.empty:
        print(f"Hit rate: {df_result['hit'].mean():.2%}")

        df_filtered = df_result[df_result['prediction'].isin(['UP', 'DOWN'])].copy()

        if not df_filtered.empty and df_filtered['hit'].notna().all():
            y_pred = df_filtered['prediction']
            # Rebuild the realised direction: a hit means the prediction was
            # right, a miss means the opposite direction occurred.
            y_true = pd.Series([
                pred if hit else ("DOWN" if pred == "UP" else "UP")
                for hit, pred in zip(df_filtered['hit'], y_pred)
            ], index=y_pred.index)
            # The fallback shares df_filtered's index so it aligns with y_pred.
            confidences = df_filtered.get(
                'confidence',
                pd.Series([0.5] * len(df_filtered), index=df_filtered.index),
            )
            print_classification_metrics(y_true, y_pred, confidences)
        else:
            print("[Warning] No valid directional predictions (UP/DOWN) with usable hit labels to evaluate.")

        if len(df_result) >= min_bootstrap_rows:
            pval, acc, base = bootstrap_accuracy_pvalue(df_result['hit'].values)
            print(f"Bootstrapped Accuracy: {acc:.2%} | p={pval:.4f} | Baseline: {base:.2%}")
        else:
            print("[Note] Not enough predictions for bootstrap evaluation.")

        # === Plotting ===
        df_plot = df_day.copy()
        if df_plot.index.tz is None:
            df_plot.index = df_plot.index.tz_localize("UTC")

        df_result_plot = df_result.copy()
        df_result_plot["timestamp"] = pd.to_datetime(df_result_plot["timestamp"], utc=True)
        df_result_plot["type"] = df_result_plot["prediction"].map({
            "UP": "xgboost_bullish",
            "DOWN": "xgboost_bearish"
        })
        # NOTE(review): `close_now` holds the following bar's close (see the
        # record_prediction call above) while `timestamp` is the signal bar,
        # so markers are drawn at the next bar's price -- confirm intent.
        df_result_plot["price"] = df_result_plot["close_now"]
        df_result_plot = df_result_plot[df_result_plot["type"].notna()]

        # .copy() so the column assignment below acts on an independent frame.
        df_result_plot = df_result_plot[
            (df_result_plot["timestamp"] >= df_plot.index.min()) &
            (df_result_plot["timestamp"] <= df_plot.index.max())
        ].copy()

        if df_result_plot.empty:
            print("[Debug] No signals to plot — check timestamp alignment or filtering logic.")
        else:
            print(f"[Debug] Plotting {len(df_result_plot)} signals.")

            # Show directional confidence: the stored value is the UP
            # probability, so DOWN signals display 1 - p, matching the label
            # passed to the signal logger.
            shown_conf = df_result_plot["confidence"].where(
                df_result_plot["prediction"] == "UP",
                1 - df_result_plot["confidence"],
            )
            df_result_plot["trigger"] = shown_conf.apply(lambda x: f"Conf={x:.2%}")
            plot_backtest_signals(
                df_plot,
                signal_df=df_result_plot[["timestamp", "type", "price", "trigger"]],
                symbol=config.SYMBOL,
                output_path=None,
                data_range=500
            )
    else:
        print("No valid predictions generated.")

In [14]:
# Run the backtest with the default CSV path (config.DATA_PATH); results are printed and plotted.
run_one_day_backtest()

=== Backtest Mode: One Random Day ===
[Backtest] Selected day: 2025-05-26
[Load] Model and features loaded.
[Load] Last trained: 2025-07-04T04:02:04.560046
[VolLoad] Model loaded with 4 features.
[VolLoad] Last trained: 2025-07-04T04:01:13.302181



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.




[Summary] Total signals that passed confidence threshold: 15
prediction
UP      11
DOWN     4
Name: count, dtype: int64

=== Backtest Results ===
Total predictions: 15
Hit rate: 33.33%

[Eval] Classification Report:
              precision    recall  f1-score   support

        DOWN       0.75      0.25      0.38        12
          UP       0.18      0.67      0.29         3

    accuracy                           0.33        15
   macro avg       0.47      0.46      0.33        15
weighted avg       0.64      0.33      0.36        15

Accuracy:  0.3333
Precision: 0.4659
Recall:    0.4583
F1 Score:  0.3304
AUC:       0.6944
[Note] Not enough predictions for bootstrap evaluation.
[Debug] Plotting 15 signals.
