# MetaMLP Walk-Forward Backtest

Walk-forward out-of-sample evaluation of the MetaMLP strategy on AUDCHF H1.

- Reuses `MetaMLP._build_features()` for feature engineering
- Trains one `MLPRegressor` per horizon on each fold
- Evaluates directional accuracy per horizon
- Builds consensus trading signals and runs a vectorbt portfolio backtest

In [1]:
import warnings
warnings.filterwarnings("ignore")

import MetaTrader5 as mt5
import pandas as pd

import numpy as np
import yaml
from datetime import datetime, timedelta
from itertools import combinations

from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, accuracy_score

import vectorbt as vbt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from metalib.metamlp import MetaMLP
from metalib.indicators import ols_tval_nb

# --- MT5 Connection & Data Load ---
mt5.initialize()

SYMBOL = "AUDCHF"
TIMEFRAME = mt5.TIMEFRAME_H1

end_date = datetime.now()
start_date = end_date - timedelta(days=1600)  # ~4.4 years

rates = mt5.copy_rates_range(SYMBOL, TIMEFRAME, start_date, end_date)
df = pd.DataFrame(rates)
df["time"] = pd.to_datetime(df["time"], unit="s")
df = df.set_index("time")

close = df["close"]
print(f"Loaded {len(df)} bars from {df.index[0]} to {df.index[-1]}")

Loaded 27221 bars from 2021-09-29 20:00:00 to 2026-02-13 23:00:00


In [28]:
# --- Load YAML config ---
with open("../config/prod/metamlp.yaml", "r") as f:
    cfg_all = yaml.safe_load(f)

cfg = cfg_all["AUDCHF_H1_MLP"]

rolling_windows = tuple(cfg["rolling_windows"])
thresholds = tuple(cfg["thresholds"])
horizons = tuple(cfg["horizons"])
hidden_layers = tuple(cfg["hidden_layers"])
max_iter = cfg["max_iter"]
fit_lookback_days = 365# cfg["fit_lookback_days"]
train_ratio = cfg["train_ratio"]
risk_reward = cfg["risk_reward"]

# Walk-forward parameters
train_window = fit_lookback_days * 24  # bars (H1)
test_window = 30 * 24                  # 30 days = 720 H1 bars
step_size = test_window                # non-overlapping

print(f"Train window: {train_window} bars ({fit_lookback_days} days)")
print(f"Test window:  {test_window} bars ({test_window // 24} days)")
print(f"Horizons: {horizons}")
print(f"Rolling windows: {rolling_windows}")
print(f"Thresholds: {thresholds}")
print(f"Hidden layers: {hidden_layers}, max_iter: {max_iter}")

Train window: 8760 bars (365 days)
Test window:  720 bars (30 days)
Horizons: (4, 6, 8)
Rolling windows: (24, 120, 528)
Thresholds: (-7.0, -5.0, -1.0, 1.0, 5.0, 7.0)
Hidden layers: (64, 32), max_iter: 500


In [29]:
# --- Build features using MetaMLP._build_features ---
mlp = MetaMLP(
    symbols=[SYMBOL],
    timeframe=TIMEFRAME,
    tag="walkforward_bt",
    size_position=0.05,
    rolling_windows=rolling_windows,
    thresholds=thresholds,
    horizons=horizons,
    hidden_layers=hidden_layers,
    max_iter=max_iter,
    fit_lookback_days=fit_lookback_days,
    train_ratio=train_ratio,
    risk_reward=risk_reward,
)

log_close = np.log(close)
log_ret = log_close.diff()

feat_df = mlp._build_features(log_ret)

# Build targets for each horizon
targets = {}
for h in horizons:
    targets[h] = log_close.shift(-h) - log_close

# Warmup mask: drop rows where features or any target is NaN
valid = feat_df.notna().all(axis=1)
for h in horizons:
    valid &= targets[h].notna()

feat_df = feat_df.loc[valid]
for h in horizons:
    targets[h] = targets[h].loc[valid]

feature_cols = list(feat_df.columns)
print(f"Feature matrix: {feat_df.shape}")
print(f"Valid date range: {feat_df.index[0]} to {feat_df.index[-1]}")
print(f"Features: {feature_cols}")

Feature matrix: (26685, 30)
Valid date range: 2021-10-29 20:00:00 to 2026-02-13 15:00:00
Features: ['z_24', 'ols_24', 'std_24', 'z_24_gt_-7.0', 'z_24_gt_-5.0', 'z_24_gt_-1.0', 'z_24_gt_1.0', 'z_24_gt_5.0', 'z_24_gt_7.0', 'z_120', 'ols_120', 'std_120', 'z_120_gt_-7.0', 'z_120_gt_-5.0', 'z_120_gt_-1.0', 'z_120_gt_1.0', 'z_120_gt_5.0', 'z_120_gt_7.0', 'z_528', 'ols_528', 'std_528', 'z_528_gt_-7.0', 'z_528_gt_-5.0', 'z_528_gt_-1.0', 'z_528_gt_1.0', 'z_528_gt_5.0', 'z_528_gt_7.0', 'z_diff_24_120', 'z_diff_24_528', 'z_diff_120_528']


In [30]:
# --- Walk-Forward Loop ---
n = len(feat_df)
results = []  # list of dicts per fold

# Determine fold start indices
fold_starts = list(range(train_window, n - test_window + 1, step_size))
print(f"Total bars: {n}, Number of folds: {len(fold_starts)}")

for fold_idx, test_start in enumerate(fold_starts):
    train_start = test_start - train_window
    test_end = min(test_start + test_window, n)

    X_train = feat_df.iloc[train_start:test_start].values
    X_test = feat_df.iloc[test_start:test_end].values
    test_index = feat_df.index[test_start:test_end]

    # Scale features
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    fold_preds = {}  # horizon -> array of predictions
    fold_actuals = {}  # horizon -> array of actuals
    fold_metrics = {"fold": fold_idx, "test_start": test_index[0], "test_end": test_index[-1]}

    for h in horizons:
        y_train = targets[h].iloc[train_start:test_start].values
        y_test = targets[h].iloc[test_start:test_end].values

        model = MLPRegressor(
            hidden_layer_sizes=hidden_layers,
            max_iter=max_iter,
            random_state=42,
            early_stopping=True,
            validation_fraction=0.15,
        )
        model.fit(X_train_s, y_train)

        y_pred = model.predict(X_test_s)
        fold_preds[h] = y_pred
        fold_actuals[h] = y_test

        dir_acc = accuracy_score(y_test > 0, y_pred > 0)
        r2 = r2_score(y_test, y_pred)
        fold_metrics[f"dir_acc_h{h}"] = dir_acc
        fold_metrics[f"r2_h{h}"] = r2

    # Store predictions for signal generation
    fold_metrics["test_index"] = test_index
    fold_metrics["preds"] = fold_preds
    fold_metrics["actuals"] = fold_actuals
    results.append(fold_metrics)

    print(
        f"Fold {fold_idx:2d} | "
        f"{fold_metrics['test_start'].date()} - {fold_metrics['test_end'].date()} | "
        + " | ".join(f"h{h} acc={fold_metrics[f'dir_acc_h{h}']:.3f}" for h in horizons)
    )

print(f"\nCompleted {len(results)} folds")

Total bars: 26685, Number of folds: 24
Fold  0 | 2023-03-27 - 2023-05-08 | h4 acc=0.503 | h6 acc=0.519 | h8 acc=0.528
Fold  1 | 2023-05-08 - 2023-06-19 | h4 acc=0.464 | h6 acc=0.486 | h8 acc=0.486
Fold  2 | 2023-06-19 - 2023-07-31 | h4 acc=0.500 | h6 acc=0.515 | h8 acc=0.512
Fold  3 | 2023-07-31 - 2023-09-11 | h4 acc=0.503 | h6 acc=0.522 | h8 acc=0.508
Fold  4 | 2023-09-11 - 2023-10-23 | h4 acc=0.540 | h6 acc=0.526 | h8 acc=0.524
Fold  5 | 2023-10-23 - 2023-12-04 | h4 acc=0.469 | h6 acc=0.478 | h8 acc=0.499
Fold  6 | 2023-12-04 - 2024-01-18 | h4 acc=0.543 | h6 acc=0.542 | h8 acc=0.539
Fold  7 | 2024-01-18 - 2024-02-29 | h4 acc=0.507 | h6 acc=0.506 | h8 acc=0.499
Fold  8 | 2024-02-29 - 2024-04-11 | h4 acc=0.532 | h6 acc=0.522 | h8 acc=0.529
Fold  9 | 2024-04-11 - 2024-05-23 | h4 acc=0.471 | h6 acc=0.547 | h8 acc=0.512
Fold 10 | 2024-05-23 - 2024-07-04 | h4 acc=0.499 | h6 acc=0.518 | h8 acc=0.508
Fold 11 | 2024-07-04 - 2024-08-15 | h4 acc=0.539 | h6 acc=0.514 | h8 acc=0.490
Fold 12 | 202

In [32]:
# --- Directional Accuracy Analysis ---
metrics_df = pd.DataFrame([
    {k: v for k, v in r.items() if k not in ("test_index", "preds", "actuals")}
    for r in results
])
metrics_df = metrics_df.set_index("fold")

# Summary table
acc_cols = [c for c in metrics_df.columns if c.startswith("dir_acc")]
summary = metrics_df[acc_cols].agg(["mean", "std", "min", "max"]).T
summary.index = [f"Horizon {h}" for h in horizons]
print("=== Directional Accuracy Summary ===")
display(summary)

# R2 summary
r2_cols = [c for c in metrics_df.columns if c.startswith("r2")]
r2_summary = metrics_df[r2_cols].agg(["mean", "std", "min", "max"]).T
r2_summary.index = [f"Horizon {h}" for h in horizons]
print("\n=== R2 Summary ===")
display(r2_summary)

# Plot directional accuracy over folds
fig = go.Figure()
for h in horizons:
    fig.add_trace(go.Scatter(
        x=metrics_df["test_start"],
        y=metrics_df[f"dir_acc_h{h}"],
        name=f"Horizon {h}",
        mode="lines+markers",
    ))
fig.add_hline(y=0.5, line_dash="dash", line_color="gray", annotation_text="50%")
fig.update_layout(
    title="Directional Accuracy per Fold",
    xaxis_title="Fold Start Date",
    yaxis_title="Accuracy",
    yaxis_tickformat=".0%",
    height=450,
)
fig.show()

=== Directional Accuracy Summary ===


Unnamed: 0,mean,std,min,max
Horizon 4,0.500347,0.024806,0.458333,0.543056
Horizon 6,0.504861,0.023811,0.458333,0.547222
Horizon 8,0.497743,0.024716,0.454167,0.541667



=== R2 Summary ===


Unnamed: 0,mean,std,min,max
Horizon 4,-564.691068,780.968936,-2768.525933,-19.530738
Horizon 6,-431.626989,671.432257,-2452.594042,-7.634885
Horizon 8,-253.084095,396.46728,-1894.235064,-5.526385


In [42]:
# --- Generate Trading Signals (replicating MetaMLP.signals() logic) ---

# Concatenate all OOS predictions into aligned DataFrames
pred_series = {h: [] for h in horizons}
actual_series = {h: [] for h in horizons}

for r in results:
    idx = r["test_index"]
    for h in horizons:
        pred_series[h].append(pd.Series(r["preds"][h], index=idx))
        actual_series[h].append(pd.Series(r["actuals"][h], index=idx))

preds_df = pd.DataFrame({f"pred_h{h}": pd.concat(pred_series[h]) for h in horizons})
actuals_df = pd.DataFrame({f"actual_h{h}": pd.concat(actual_series[h]) for h in horizons})

# Align with close prices
bt_close = close.loc[preds_df.index]
bt_log_close = np.log(bt_close)

# Consensus: all horizons agree on direction
all_positive = (preds_df > 0).all(axis=1)
all_negative = (preds_df < 0).all(axis=1)

# SMA filter: compute average of rolling SMAs
sma_values = pd.DataFrame()
for w in rolling_windows:
    sma_values[f"sma_{w}"] = np.exp(bt_log_close.rolling(w).mean())
sma_target = sma_values.mean(axis=1)

# Apply SMA filter (long only if sma_target > price, short only if sma_target < price)
long_signal = all_positive & (sma_target > bt_close)
short_signal = all_negative & (sma_target < bt_close)

# Compute TP/SL distances as percentages
tp_distance_pct = (abs(sma_target - bt_close) / bt_close)
sl_distance_pct = tp_distance_pct / risk_reward

print(f"OOS period: {preds_df.index[0]} to {preds_df.index[-1]}")
print(f"Total bars: {len(preds_df)}")
print(f"Long signals:  {long_signal.sum()} ({long_signal.mean():.1%})")
print(f"Short signals: {short_signal.sum()} ({short_signal.mean():.1%})")
print(f"No signal:     {(~long_signal & ~short_signal).sum()}")

OOS period: 2023-03-27 20:00:00 to 2026-01-07 18:00:00
Total bars: 17280
Long signals:  3272 (18.9%)
Short signals: 2600 (15.0%)
No signal:     11408


In [48]:
# --- Vectorbt Portfolio Backtest ---

# Build entry/exit arrays
# Entry: new signal that wasn't active in the prior bar
long_entries = long_signal & ~long_signal.shift(1, fill_value=False)
# long_exits = ~long_signal & long_signal.shift(1, fill_value=False)

short_entries = short_signal & ~short_signal.shift(1, fill_value=False)
# short_exits = ~short_signal & short_signal.shift(1, fill_value=False)

# Median SL/TP pct for stop parameters
median_tp_pct = tp_distance_pct.median()
median_sl_pct = sl_distance_pct.median()
print(f"Median TP distance: {median_tp_pct:.4%}")
print(f"Median SL distance: {median_sl_pct:.4%}")

portfolio = vbt.Portfolio.from_signals(
    bt_close,
    entries=long_entries,
    # exits=long_exits,
    short_entries=short_entries,
    # short_exits=short_exits,
    sl_stop=median_sl_pct,
    tp_stop=median_tp_pct,
    size=1.0,
    # fees=0.0001,  # ~1 pip spread cost
    freq="1h",
    init_cash=10_000,
)

print("\n=== Portfolio Stats ===")
display(portfolio.stats())

Median TP distance: 0.3908%
Median SL distance: 0.3553%

=== Portfolio Stats ===


Start                               2023-03-27 20:00:00
End                                 2026-01-07 18:00:00
Period                                720 days 00:00:00
Start Value                                     10000.0
End Value                                    10000.1013
Total Return [%]                               0.001013
Benchmark Return [%]                         -11.991065
Max Gross Exposure [%]                         0.006166
Total Fees Paid                                     0.0
Max Drawdown [%]                               0.000565
Max Drawdown Duration                 218 days 15:00:00
Total Trades                                        998
Total Closed Trades                                 997
Total Open Trades                                     1
Open Trade PnL                                  0.00003
Win Rate [%]                                  61.183551
Best Trade [%]                                 1.465687
Worst Trade [%]                               -1

In [49]:
# --- Summary & Visualizations ---

# 1) Equity curve
equity = portfolio.value()
fig = make_subplots(
    rows=3, cols=1,
    shared_xaxes=True,
    subplot_titles=("Equity Curve", "Drawdown", "AUDCHF Price"),
    vertical_spacing=0.06,
    row_heights=[0.45, 0.25, 0.30],
)

fig.add_trace(go.Scatter(
    x=equity.index, y=equity.values,
    name="Equity", line=dict(color="blue"),
), row=1, col=1)

# Drawdown
dd = portfolio.drawdown()
fig.add_trace(go.Scatter(
    x=dd.index, y=dd.values,
    name="Drawdown", fill="tozeroy",
    line=dict(color="red"),
), row=2, col=1)
fig.update_yaxes(tickformat=".1%", row=2, col=1)

# Price
fig.add_trace(go.Scatter(
    x=bt_close.index, y=bt_close.values,
    name="AUDCHF", line=dict(color="gray"),
), row=3, col=1)

fig.update_layout(height=900, title_text="MetaMLP Walk-Forward Backtest", showlegend=True)
fig.show()

# 2) Trade-level analysis
trades = portfolio.trades.records_readable
if len(trades) > 0:
    print(f"\n=== Trade Analysis ===")
    print(f"Total trades: {len(trades)}")
    print(f"Win rate:     {(trades['PnL'] > 0).mean():.1%}")
    print(f"Sahrpe:       {(trades['PnL'].mean()/trades['PnL'].std()):.1%}")
    print(f"Avg profit:   {trades.loc[trades['PnL'] > 0, 'PnL'].mean():.4f}")
    print(f"Avg loss:     {trades.loc[trades['PnL'] <= 0, 'PnL'].mean():.4f}")
    print(f"Profit factor: {trades.loc[trades['PnL'] > 0, 'PnL'].sum() / abs(trades.loc[trades['PnL'] <= 0, 'PnL'].sum()):.2f}")
    # print(f"Avg duration: {trades['Duration'].mean()}")
else:
    print("No trades generated.")


=== Trade Analysis ===
Total trades: 998
Win rate:     61.2%
Sahrpe:       4.6%
Avg profit:   0.0017
Avg loss:     -0.0024
Profit factor: 1.11
