Imports + global config

In [None]:
import os
import numpy as np
import pandas as pd

# --- Paths -------------------------------------------------------------------
DATA_PATH = "data/XAU_15m_data.csv"
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)  # CSV artefacts produced below are written here

# --- Bar grid used when reindexing / gap-filling -----------------------------
FREQ = "15min"
CALENDAR = "weekday"
MAX_FILL_BARS = 8  # forwarded as `max_fill_bars` to the gap filler

# --- Strategy / backtest settings --------------------------------------------
FORWARD_YEARS = 5  # length of the held-out forward window, in calendar years
TREND_PERIOD = 200
LAG = 1

ROUND_TRIP_BPS = 2.0

# --- Monte Carlo settings ----------------------------------------------------
MC_N_SIMS_FINAL = 10_000
MC_SEED0 = 123
MC_BURN = 2000

# --- Annualisation -----------------------------------------------------------
# Derive bars-per-day from FREQ instead of hard-coding 96, so that changing
# FREQ cannot silently leave the annualisation factor (and the MC horizon)
# out of sync with the bar grid.
_BAR = pd.Timedelta(FREQ)
_DAY = pd.Timedelta("1D")
if _DAY % _BAR != pd.Timedelta(0):
    raise ValueError(f"FREQ={FREQ!r} does not split a day into a whole number of bars")
BARS_PER_DAY = _DAY // _BAR  # 96 for 15-minute bars
# NOTE(review): 252 looks like the usual trading-days-per-year convention —
# confirm it is the intended count for the "weekday" calendar.
TRADING_DAYS_PER_YEAR = 252

PERIODS_PER_YEAR = BARS_PER_DAY * TRADING_DAYS_PER_YEAR
MC_YEARS = 1
MC_N_STEPS = PERIODS_PER_YEAR * MC_YEARS  # number of simulated bars per path


Load + clean + reindex + gap fill + report

In [None]:
from data_utilities import (
    load_xau_15m_csv,
    clean_xau_15m,
    reindex_and_fill_gaps,
    make_data_quality_report,
    print_data_quality_report,
)

# Load the raw CSV and run the cleaning pass; `clean_stats` carries the
# cleaning counters that are forwarded to the quality report below.
df_raw = load_xau_15m_csv(DATA_PATH)
df_clean, clean_stats = clean_xau_15m(df_raw)

# Put the cleaned bars on the regular FREQ / CALENDAR grid and fill gaps.
# NOTE(review): presumably gaps of up to MAX_FILL_BARS bars are filled and
# longer ones are left as NaN — confirm against data_utilities.
df_grid, expected, missing, filled = reindex_and_fill_gaps(
    df_clean,
    freq=FREQ,
    calendar=CALENDAR,
    max_fill_bars=MAX_FILL_BARS,
    fill_policy="ffill_close_only",
)

# Backtests only see bars that have a Close; grid slots still missing are dropped.
df_used = df_grid.dropna(subset=["Close"]).copy()

# Report on the *cleaned* frame. Passing the re-gridded frame here made
# "Clean rows" equal to the expected-bar count (and larger than "Raw rows"),
# which misreports how many rows survived cleaning.
rep = make_data_quality_report(
    df_raw=df_raw,
    df_clean=df_clean,
    clean_stats=clean_stats,
    expected_bars=expected,
    missing_bars=missing,
    filled_bars=filled,
    calendar=CALENDAR,
    freq=FREQ,
)

print_data_quality_report(rep)

print("\n=== DATA USED FOR BACKTESTS ===")
print("df_grid bars:", len(df_grid), "NaN Close:", df_grid["Close"].isna().sum())
print("df_used bars:", len(df_used), "NaN Close:", df_used["Close"].isna().sum())

# Persist the report (one row) and the exact bars used downstream.
pd.DataFrame([vars(rep)]).to_csv(
    os.path.join(OUT_DIR, "data_quality_report.csv"),
    index=False,
)
df_used.to_csv(os.path.join(OUT_DIR, "data_used_for_backtests.csv"))



=== DATA QUALITY REPORT ===
Timeframe: 15min
Calendar: weekday
Coverage: 2004-06-11 07:15:00 → 2025-12-01 04:15:00
Raw rows: 487977
Clean rows: 537685
Dropped invalid dates: 0
Dropped invalid prices: 0
Dropped duplicates: 0
Expected bars: 537685
Missing bars: 49708
Filled bars (short gaps): 28696
Left missing (long gaps): 21012

=== DATA USED FOR BACKTESTS ===
df_grid bars: 537685 NaN Close: 21012
df_used bars: 516673 NaN Close: 0


Split into in-sample and out-of-sample (forward test)

In [None]:
# Hold out the final FORWARD_YEARS (counted back from the last bar) as the
# forward test window; everything strictly before the cutoff is in-sample.
cutoff = df_used.index.max() - pd.DateOffset(years=FORWARD_YEARS)

before_cutoff = df_used.index < cutoff
from_cutoff = df_used.index >= cutoff

df_in = df_used[before_cutoff].copy()
df_fw = df_used[from_cutoff].copy()

# Same two summary lines, one per partition.
for label, part in (("IN-SAMPLE:", df_in), ("FORWARD :", df_fw)):
    print(label, part.index.min(), "→", part.index.max(), "bars:", len(part))


IN-SAMPLE: 2004-06-11 07:15:00 → 2020-12-01 04:00:00 bars: 398879
FORWARD : 2020-12-01 04:15:00 → 2025-12-01 04:15:00 bars: 117794


Historical optimization on the in-sample data; save the results to CSV

In [None]:
from optimization import optimize_ema_parameters

# Candidate EMA spans for the grid search.
# NOTE(review): the two ranges share the value 40, so (40, 40) is a possible
# pair — presumably the optimizer skips fast >= slow; verify.
fast_range = range(5, 41)
slow_range = range(40, 101)

# The full grid is also written to disk by the optimizer via `out_csv_path`.
hist_grid_csv = os.path.join(OUT_DIR, "hist_grid_results.csv")

hist_grid = optimize_ema_parameters(
    df_in=df_in,
    fast_range=fast_range,
    slow_range=slow_range,
    trend_period=TREND_PERIOD,
    lag=LAG,
    periods_per_year=PERIODS_PER_YEAR,
    round_trip_bps=ROUND_TRIP_BPS,
    out_csv_path=hist_grid_csv,
)

# Show the first ten rows of the result table.
hist_grid.head(10)


Unnamed: 0,fast_period,slow_period,total_return,cagr,max_drawdown,ann_vol,sharpe,sortino,calmar,n_entries,n_exits,n_flips,n_round_trips,avg_hold_bars,median_hold_bars,turnover,round_trip_bps,trend_period,lag
0,40,71,1.824598,0.065003,-0.210452,0.131362,0.545046,0.498893,0.308871,3091,3090,173,3090,66.077968,30.0,0.016363,2.0,200,1
1,39,69,1.845929,0.065489,-0.212093,0.131473,0.548168,0.500737,0.308774,3138,3137,185,3137,64.851498,29.0,0.016659,2.0,200,1
2,39,73,1.845755,0.065485,-0.213102,0.131426,0.54829,0.5026,0.307293,3101,3100,175,3100,65.980974,29.0,0.016424,2.0,200,1
3,40,67,1.893583,0.066562,-0.218899,0.131463,0.55586,0.507331,0.304078,3129,3128,184,3128,64.99968,29.0,0.016609,2.0,200,1
4,38,74,1.761698,0.063549,-0.212567,0.131308,0.534815,0.489712,0.298959,3109,3108,167,3108,65.686716,29.0,0.016424,2.0,200,1
5,38,70,1.845624,0.065482,-0.220074,0.131781,0.547144,0.50065,0.297544,3148,3147,197,3147,64.935832,29.0,0.016769,2.0,200,1
6,37,76,1.735194,0.062927,-0.212648,0.131391,0.530109,0.485983,0.29592,3100,3099,176,3099,65.976774,30.0,0.016424,2.0,200,1
7,40,68,1.837886,0.065306,-0.223548,0.131367,0.547198,0.499645,0.292132,3126,3125,184,3125,65.015035,29.0,0.016594,2.0,200,1
8,39,68,1.78088,0.063995,-0.221449,0.131506,0.5374,0.49081,0.288985,3151,3150,193,3150,64.630276,29.0,0.016764,2.0,200,1
9,40,69,1.71516,0.062453,-0.216346,0.131081,0.527655,0.481156,0.288673,3123,3122,181,3122,65.23407,29.0,0.016564,2.0,200,1


Fit GARCH on in-sample returns and save fit summary

In [None]:
from simulation import fit_garch_11, summarize_fit
from data_utilities import compute_log_returns

# In-sample log returns, computed with scale=100.0.
r_in = compute_log_returns(df_in, scale=100.0)

# Model settings kept together so they are easy to read and tweak.
garch_options = dict(
    mean="Zero",
    dist="normal",
    scale_factor=10.0,
    deseasonalise=True,
    rescale=False,
)
fit = fit_garch_11(returns=r_in, **garch_options)

# Save a one-row summary of the fit next to the other outputs.
fit_summary = summarize_fit(fit)
garch_summary_csv = os.path.join(OUT_DIR, "garch_fit_summary.csv")
pd.DataFrame([fit_summary]).to_csv(garch_summary_csv, index=False)

fit_summary


{'model': 'GARCH(1,1), mean=Zero, dist=normal, scale_factor=10.0, deseasonalise=True',
 'nobs': 398878,
 'loglikelihood': -514249.3698165637,
 'aic': 1028504.7396331273,
 'bic': 1028537.4288657815,
 'params': {'omega': 0.013051556518940287,
  'alpha[1]': 0.06334667407687776,
  'beta[1]': 0.9267747668783896},
 'convergence_flag': 0,
 'alpha_plus_beta': 0.9901214409552673}

Generate MC returns and prices

In [None]:
from simulation import simulate_garch_returns, returns_to_price_paths

# The simulation index is borrowed from the tail of the in-sample index,
# so the in-sample set must contain at least MC_N_STEPS bars.
if len(df_in) < MC_N_STEPS:
    raise ValueError("MC horizon longer than in-sample data. Reduce MC_YEARS or use shorter index.")

# Paths start from the last in-sample close.
start_price = float(df_in["Close"].iloc[-1])
index_sim = df_in.index[-MC_N_STEPS:]

sim_r = simulate_garch_returns(
    fit=fit,
    index=index_sim,
    n_sims=MC_N_SIMS_FINAL,
    seed=MC_SEED0,
    burn=MC_BURN,
    add_back_mean=True,
)
sim_p = returns_to_price_paths(sim_r, start_price=start_price)

print("sim_r:", sim_r.shape, "sim_p:", sim_p.shape)


sim_r: (24192, 10000) sim_p: (24192, 10000)


Path diagnostics and save to csv

In [None]:
from simulation import summarize_simulated_returns, summarize_simulated_paths

# Output locations for the three diagnostic tables.
ret_stats_csv = os.path.join(OUT_DIR, "mc_return_diagnostics.csv")
path_stats_csv = os.path.join(OUT_DIR, "mc_path_diagnostics_per_path.csv")
path_summary_csv = os.path.join(OUT_DIR, "mc_path_diagnostics_summary.csv")

# Diagnostics over the simulated returns.
ret_stats = summarize_simulated_returns(sim_r, periods_per_year=PERIODS_PER_YEAR)
ret_stats.to_csv(ret_stats_csv, index=False)

# Diagnostics over the simulated price paths.
path_stats = summarize_simulated_paths(sim_p, periods_per_year=PERIODS_PER_YEAR)
path_stats.to_csv(path_stats_csv, index=False)

# Distribution of the path statistics, one row per metric after transposing.
tail_percentiles = [0.01, 0.05, 0.5, 0.95, 0.99]
path_summary = path_stats.describe(percentiles=tail_percentiles).T
path_summary.to_csv(path_summary_csv)

ret_stats, path_summary


(       count      mean       std       min      p01       p05    median  \
 0  241920000  0.000302  0.124865 -3.943738 -0.34221 -0.195084  0.000295   
 
         p95       p99       max      skew  kurtosis  
 0  0.195706  0.342835  5.373327  0.001114  5.327784  ,
                 count      mean       std       min        1%        5%  \
 total_return  10000.0  0.096480  0.216412 -0.458318 -0.318870 -0.220334   
 max_drawdown  10000.0 -0.189251  0.067554 -0.477017 -0.388810 -0.321210   
 ann_vol       10000.0  0.194032  0.008331  0.170500  0.177597  0.181770   
 
                    50%       95%       99%       max  
 total_return  0.073846  0.487645  0.676988  1.268112  
 max_drawdown -0.176937 -0.102165 -0.085206 -0.058906  
 ann_vol       0.193333  0.208417  0.217636  0.264953  )

Select the top-10 historical finalists and save all per-simulation results to CSV

In [None]:
from simulation_eval import save_per_sim_results_finalists, summarize_mc_results_csv

# (fast, slow) pairs from the first ten rows of the historical grid.
finalist_rows = hist_grid.head(10)
top10_hist = list(zip(finalist_rows["fast_period"], finalist_rows["slow_period"]))

mc_results_hist_top10_csv = os.path.join(OUT_DIR, "mc_results_hist_top10.csv")
mc_summary_hist_top10_csv = os.path.join(OUT_DIR, "mc_summary_hist_top10.csv")

# Run every finalist on the simulated paths; the per-simulation results are
# written to `mc_results_hist_top10_csv`.
save_per_sim_results_finalists(
    prices_sim=sim_p,
    index=index_sim,
    param_list=top10_hist,
    out_csv_path=mc_results_hist_top10_csv,
    trend_period=TREND_PERIOD,
    lag=LAG,
    periods_per_year=PERIODS_PER_YEAR,
    round_trip_bps=ROUND_TRIP_BPS,
    sim_id_offset=0,
)

# Summarise the per-simulation CSV into one table and save it as well.
mc_summary_hist_top10 = summarize_mc_results_csv(
    mc_results_csv=mc_results_hist_top10_csv,
    out_summary_csv=mc_summary_hist_top10_csv,
    dd_threshold=-0.30,
    tail_q=0.05,
)

mc_summary_hist_top10.head(10)


Unnamed: 0,fast,slow,mc_mean_return,mc_median_return,mc_p05_return,mc_es05_return,mc_prob_loss,mc_mean_dd,mc_p95_dd,mc_prob_dd_le_thresh,mc_mean_sharpe,mc_p05_sharpe,mc_mean_sortino,mc_p05_sortino,mc_mean_calmar,mc_p05_calmar
0,40,67,-0.042442,-0.054717,-0.236149,-0.271098,0.657,-0.173393,-0.090578,0.0311,-0.345365,-2.014,-0.317855,-1.909057,-0.029844,-0.912356
1,40,68,-0.042514,-0.054707,-0.236295,-0.271507,0.6578,-0.173401,-0.090714,0.0312,-0.345606,-2.005364,-0.318212,-1.914387,-0.030487,-0.912602
2,40,71,-0.042368,-0.054948,-0.236858,-0.272436,0.6566,-0.173324,-0.090695,0.031,-0.345736,-2.013059,-0.317845,-1.912351,-0.028011,-0.911601
3,39,68,-0.042424,-0.054277,-0.23697,-0.271332,0.6595,-0.173461,-0.090724,0.0317,-0.344642,-2.011092,-0.317462,-1.916049,-0.029956,-0.913195
4,39,73,-0.042236,-0.05388,-0.237218,-0.273206,0.6572,-0.173391,-0.090613,0.0315,-0.344768,-2.028187,-0.317027,-1.923748,-0.026663,-0.912151
5,40,69,-0.042284,-0.055067,-0.237286,-0.272609,0.6558,-0.173371,-0.090108,0.033,-0.344663,-2.009902,-0.316971,-1.921873,-0.027079,-0.911501
6,39,69,-0.042526,-0.054715,-0.237341,-0.27153,0.6577,-0.173528,-0.09058,0.032,-0.345608,-2.008188,-0.318421,-1.9211,-0.029797,-0.912319
7,38,70,-0.042651,-0.055541,-0.237736,-0.27162,0.657,-0.17375,-0.091038,0.0333,-0.346361,-2.013945,-0.319344,-1.928564,-0.030269,-0.912463
8,38,74,-0.042731,-0.055338,-0.238231,-0.27336,0.6574,-0.173728,-0.090919,0.032,-0.347891,-2.026928,-0.320513,-1.913737,-0.02978,-0.912293
9,37,76,-0.042729,-0.055025,-0.239028,-0.274136,0.6534,-0.173923,-0.091061,0.0327,-0.348126,-2.026913,-0.320931,-1.925651,-0.028423,-0.912635


Pick the best EMA pair by in-sample rank and by MC robustness, then compare both on the forward 5 years (out-of-sample)

In [None]:
from simulation_eval import evaluate_one_path

def _first_pair(table, fast_col, slow_col):
    """Return the (fast, slow) periods from the first row of `table` as plain ints.

    The row is taken by position rather than by index label, matching the
    positional `.head(10)` used to pick the finalists, so the result does not
    depend on the table carrying a fresh 0..n-1 index. Casting to `int` avoids
    the float values (e.g. 40.0) that a mixed-dtype row lookup produces.

    Raises:
        ValueError: if `table` has no rows.
    """
    if len(table) == 0:
        raise ValueError("cannot pick a best parameter pair from an empty table")
    top_row = table.iloc[0]
    return int(top_row[fast_col]), int(top_row[slow_col])


# First row of each table is treated as its best candidate.
best_hist = _first_pair(hist_grid, "fast_period", "slow_period")
best_mc = _first_pair(mc_summary_hist_top10, "fast", "slow")

print("Best HIST (in-sample):", best_hist)
print("Best MC (robust among hist top10):", best_mc)

fw_prices = df_fw["Close"].astype(float)

# Score both candidates on the forward window with identical settings.
forward_results = {
    label: evaluate_one_path(
        prices=fw_prices,
        fast_period=fast,
        slow_period=slow,
        trend_period=TREND_PERIOD,
        lag=LAG,
        periods_per_year=PERIODS_PER_YEAR,
        round_trip_bps=ROUND_TRIP_BPS,
    )
    for label, (fast, slow) in (
        ("best_hist_in_sample", best_hist),
        ("best_mc_robust", best_mc),
    )
}
res_fw_hist = forward_results["best_hist_in_sample"]
res_fw_mc = forward_results["best_mc_robust"]

# One row per candidate: the model label followed by its metrics.
comparison_fw = pd.DataFrame(
    [{"model": label, **metrics} for label, metrics in forward_results.items()]
)

comparison_fw.to_csv(os.path.join(OUT_DIR, "forward_comparison.csv"), index=False)
comparison_fw


Best HIST (in-sample): (40.0, 71.0)
Best MC (robust among hist top10): (40.0, 67.0)


Unnamed: 0,model,fast,slow,total_return,cagr,max_drawdown,ann_vol,sharpe,sortino,calmar,n_entries,n_exits,n_flips,n_round_trips,avg_hold_bars,median_hold_bars,turnover,round_trip_bps,lag,trend_period
0,best_hist_in_sample,40,71,-0.060209,-0.012673,-0.233241,0.101506,-0.074906,-0.070915,-0.054333,1004,1004,54,1004,59.578685,26.5,0.017964,2.0,1,200
1,best_mc_robust,40,67,-0.096855,-0.020705,-0.243898,0.100766,-0.157262,-0.147081,-0.084892,1003,1003,54,1003,59.190429,26.0,0.017947,2.0,1,200
