In [7]:
# 1) Load config & seed
from pathlib import Path
import os
import sys
import json
import math
import random
from collections.abc import Mapping
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

try:
    notebook_path = Path(__file__).resolve()
except NameError:
    notebook_path = Path.cwd() / "research" / "notebooks" / "AMIE_v0.1.ipynb"
    __file__ = str(notebook_path)

repo_root = notebook_path.resolve().parents[2]
if str(repo_root / "src") not in sys.path:
    sys.path.insert(0, str(repo_root / "src"))

notebook_dir = notebook_path.parent
config_dir = repo_root / "config"


def _load_with_omegaconf(path: Path):
    from omegaconf import OmegaConf  # type: ignore

    cfg = OmegaConf.load(path)
    return OmegaConf.to_container(cfg, resolve=True)


def _load_with_yaml(path: Path):
    import yaml  # type: ignore

    with path.open("r", encoding="utf-8") as handle:
        return yaml.safe_load(handle)


try:
    raw_configs = {
        "global": _load_with_omegaconf(config_dir / "config.yaml"),
        "data": _load_with_omegaconf(config_dir / "data" / "synthetic.yaml"),
        "model": _load_with_omegaconf(config_dir / "model" / "kalman.yaml"),
        "backtest": _load_with_omegaconf(config_dir / "backtest" / "default.yaml"),
    }
    loader_backend = "omegaconf"
except ModuleNotFoundError:
    raw_configs = {
        "global": _load_with_yaml(config_dir / "config.yaml"),
        "data": _load_with_yaml(config_dir / "data" / "synthetic.yaml"),
        "model": _load_with_yaml(config_dir / "model" / "kalman.yaml"),
        "backtest": _load_with_yaml(config_dir / "backtest" / "default.yaml"),
    }
    loader_backend = "yaml"

configs = {}
for name, cfg in raw_configs.items():
    if isinstance(cfg, Mapping):
        configs[name] = dict(cfg)
    else:
        configs[name] = cfg

global_config = configs.get("global", {}) or {}
training_config = global_config.get("training", {}) if isinstance(global_config, dict) else {}
seed_candidates = [
    training_config.get("seed") if isinstance(training_config, dict) else None,
    global_config.get("run", {}).get("seed") if isinstance(global_config, dict) else None,
    42,
]
seed_value = next((int(v) for v in seed_candidates if isinstance(v, int)), 42)

random.seed(seed_value)
np.random.seed(seed_value)

backtest_config = configs.get("backtest", {}) or {}
fee_bps = float(backtest_config.get("fee_bps", 0.0))
slippage_bps = float(backtest_config.get("slippage_bps", 0.0))
assert fee_bps >= 0.0, "Fees must be non-negative"
assert slippage_bps >= 0.0, "Slippage must be non-negative"

model_config = configs.get("model", {}) or {}

effective_config = {
    "seed": seed_value,
    "loader_backend": loader_backend,
    "paths": {
        "repo_root": str(repo_root),
        "golden_parquet": str(repo_root / "tests/fixtures/golden/synthetic_data_seed42.parquet"),
    },
    "features": {
        "window_size": 20,
        "ewma_span": 20,
    },
    "model": {
        "process_noise": model_config.get("process_noise"),
        "observation_noise": model_config.get("observation_noise"),
        "warmup_period": model_config.get("warmup_period"),
        "instrument": model_config.get("instrument"),
    },
    "backtest": {
        "initial_capital": backtest_config.get("initial_capital", 100000.0),
        "fee_bps": fee_bps,
        "slippage_bps": slippage_bps,
        "max_position_size": backtest_config.get("max_position_size", 1.0),
    },
}

print(json.dumps(effective_config, indent=2, sort_keys=True))



{
  "backtest": {
    "fee_bps": 1.0,
    "initial_capital": 100000,
    "max_position_size": 1.0,
    "slippage_bps": 5.0
  },
  "features": {
    "ewma_span": 20,
    "window_size": 20
  },
  "loader_backend": "omegaconf",
  "model": {
    "instrument": "BTC-USD",
    "observation_noise": 0.01,
    "process_noise": 0.001,
    "warmup_period": 100
  },
  "paths": {
    "golden_parquet": "/home/ivan-yoga-ubuntu/VSCodeProjects/AMIE/tests/fixtures/golden/synthetic_data_seed42.parquet",
    "repo_root": "/home/ivan-yoga-ubuntu/VSCodeProjects/AMIE"
  },
  "seed": 42
}


In [8]:
# 2) Load synthetic data (or golden parquet)
from amie.data.sources.synthetic_lob import SyntheticLOBGenerator, SyntheticLOBConfig, SyntheticLOBRegime

def _generate_synthetic_dataframe(seed: int, cfg: dict) -> pd.DataFrame:
    regimes_raw = cfg.get("regimes", [])
    regimes = [SyntheticLOBRegime(**reg) for reg in regimes_raw] if regimes_raw else None
    config_kwargs = {
        "instrument": cfg.get("instrument", "BTC-USD"),
        "regime_duration": cfg.get("regime_duration", 250),
        "depth": cfg.get("depth", 3),
        "level_spread": cfg.get("level_spread", 10.0),
        "tick_interval_seconds": cfg.get("tick_interval_seconds", 1.0),
    }
    if regimes is not None:
        config_kwargs["regimes"] = regimes
    lob_config = SyntheticLOBConfig(**config_kwargs)
    generator = SyntheticLOBGenerator(seed=seed, config=lob_config)
    n_ticks = int(cfg.get("n_ticks", 5000))
    return generator.to_dataframe(n_ticks)

synthetic_cfg = configs.get("data", {}) or {}
golden_path = Path(effective_config["paths"]["golden_parquet"])

market_df = None
data_source = "synthetic_generator"
if golden_path.exists():
    try:
        market_df = pd.read_parquet(golden_path)
        data_source = "golden_parquet"
    except Exception as exc:
        print(f"Falling back to synthetic generator because parquet read failed: {exc}")

if market_df is None:
    market_df = _generate_synthetic_dataframe(seed_value, synthetic_cfg)
    if data_source == "golden_parquet":
        data_source = "golden_parquet_fallback"

market_df.loc[:, "ts"] = pd.to_datetime(market_df["ts"], utc=False)
market_df = market_df.sort_values("ts").set_index("ts")

required_columns = ["instrument", "price", "bid_price", "ask_price", "bid_qty", "ask_qty"]
missing_cols = [col for col in required_columns if col not in market_df.columns]
assert not missing_cols, f"Missing required market columns: {missing_cols}"

numeric_cols = market_df[required_columns].apply(pd.to_numeric, errors="coerce")
assert numeric_cols.notna().all().all(), "Non-numeric values detected in required columns"
market_df.loc[:, required_columns] = numeric_cols

assert market_df.index.is_monotonic_increasing, "Timestamp index must be increasing"
assert (market_df["ask_price"] >= market_df["bid_price"]).all(), "Ask must be >= bid"
assert (market_df["price"] > 0).all(), "Prices must be positive"

warmup_candidate = model_config.get("warmup_period", 0) or 0
minimum_length = max(warmup_candidate, 20)
assert len(market_df) >= minimum_length, "Data length is insufficient for warmup"

print(market_df.head(5))
print({"rows": len(market_df), "start": market_df.index[0], "end": market_df.index[-1], "source": data_source})



AssertionError: Non-numeric values detected in required columns

In [9]:
# 3) Compute features
from amie.features.transforms import FeatureComputer

feature_config = effective_config["features"]
feature_computer = FeatureComputer(window_size=feature_config["window_size"], ewma_span=feature_config["ewma_span"])
features_input = market_df.reset_index()
features_raw = feature_computer.compute(features_input)
features_raw["ts"] = pd.to_datetime(features_raw["ts"], utc=False)
features_df = features_raw.sort_values("ts").set_index("ts")

features_df["price"] = market_df["price"].values
features_df["spread_ratio"] = features_df["spread"]
features_df["spread"] = (market_df["ask_price"] - market_df["bid_price"]).values
features_df["bid_price"] = market_df["bid_price"].values
features_df["ask_price"] = market_df["ask_price"].values
features_df["imbalance"] = features_df["imbalance"].astype(float)

warmup_len = max(feature_computer.window_size, feature_computer.ewma_span)

assert features_df.index.equals(market_df.index), "Feature index misaligned with market data"
post_warmup = features_df.iloc[warmup_len:]
if not post_warmup.empty:
    finite_mask = np.isfinite(post_warmup.select_dtypes(include=[np.number]).to_numpy())
    assert finite_mask.all(), "Non-finite feature values after warmup"

print(features_df.head(10))
print(market_df[["price", "bid_price", "ask_price", "bid_qty", "ask_qty"]].head(3))
print({"warmup_len": warmup_len, "assumed_no_lookahead": True})



{"function": "FeatureComputer.compute", "duration_ms": 7.375212000624742, "timestamp": "2025-10-24T21:48:26.627874Z", "level": "info", "event": "profiled_function"}


                    instrument   returns  ewma_volatility   z_score  spread  \
ts                                                                            
2024-01-01 00:00:00    BTC-USD       NaN         0.000000       NaN    20.0   
2024-01-01 00:00:01    BTC-USD  0.000376         0.000000  0.000000    20.0   
2024-01-01 00:00:02    BTC-USD -0.000521         0.000634 -0.776069    20.0   
2024-01-01 00:00:03    BTC-USD -0.000007         0.000474  0.033081    20.0   
2024-01-01 00:00:04    BTC-USD  0.000352         0.000387  0.596995    20.0   
2024-01-01 00:00:05    BTC-USD  0.000451         0.000341  0.752909    20.0   
2024-01-01 00:00:06    BTC-USD -0.000344         0.000386 -0.497223    20.0   
2024-01-01 00:00:07    BTC-USD  0.000351         0.000352  0.596399    20.0   
2024-01-01 00:00:08    BTC-USD -0.000074         0.000340 -0.072691    20.0   
2024-01-01 00:00:09    BTC-USD -0.000062         0.000328 -0.053600    20.0   

                     imbalance         price  sprea

In [10]:
# 4) Kalman -> score, uncertainty
from amie.models.kalman import KalmanFilter

kalman_settings = {
    "instrument": model_config.get("instrument", str(features_df["instrument"].mode().iat[0])),
    "process_noise": float(model_config.get("process_noise", 1e-3)),
    "observation_noise": float(model_config.get("observation_noise", 1e-2)),
    "warmup_period": int(model_config.get("warmup_period", 10)),
}

kalman_model = KalmanFilter(**kalman_settings)
features_for_model = features_df.reset_index()[["ts", "instrument", "returns"]]
kalman_model.fit(features_for_model)
signals = kalman_model.predict(features_for_model)
signal_records = [signal.dict() for signal in signals]
signals_df = pd.DataFrame(signal_records)
signals_df["ts"] = pd.to_datetime(signals_df["ts"], utc=False)
signals_df = signals_df.sort_values("ts").set_index("ts")

signals_df = signals_df.join(features_df[["returns"]])
post_warmup_signals = signals_df.iloc[warmup_len:]
if not post_warmup_signals.empty:
    assert np.isfinite(post_warmup_signals["score"]).all(), "Non-finite scores after warmup"
    assert np.isfinite(post_warmup_signals["uncertainty"]).all(), "Non-finite uncertainties after warmup"
    assert (post_warmup_signals["uncertainty"] >= 0).all(), "Negative uncertainty detected"

print(signals_df[["score", "uncertainty", "returns"]].describe().T)
print(signals_df.tail(5))



{"function": "KalmanFilter.predict", "duration_ms": 20.515104002697626, "timestamp": "2025-10-24T21:48:26.834673Z", "level": "info", "event": "profiled_function"}


              count      mean       std       min       25%       50%  \
score        1000.0 -0.000026  0.000237 -0.000930 -0.000141 -0.000013   
uncertainty  1000.0  0.003318  0.031551  0.002149  0.002201  0.002263   
returns       999.0 -0.000027  0.000636 -0.002441 -0.000393 -0.000016   

                  75%       max  
score        0.000103  0.000871  
uncertainty  0.002380  1.000000  
returns      0.000351  0.002540  
                    instrument     score  uncertainty model_version   returns
ts                                                                           
2024-01-01 00:16:35    BTC-USD -0.000564     0.002319   kalman_v0.1 -0.000224
2024-01-01 00:16:36    BTC-USD -0.000595     0.002322   kalman_v0.1 -0.000705
2024-01-01 00:16:37    BTC-USD -0.000486     0.002302   kalman_v0.1 -0.000080
2024-01-01 00:16:38    BTC-USD -0.000366     0.002286   kalman_v0.1  0.000086
2024-01-01 00:16:39    BTC-USD -0.000291     0.002277   kalman_v0.1 -0.000002


In [11]:
# 5) Backtest -> plot equity & underwater; metrics table
from amie.strategy.policy import SignalPolicy
from amie.strategy.risk import RiskManager
from amie.strategy.execution import ExecutionSimulator
from amie.backtest.engine import BacktestEngine
from amie.backtest.metrics import BacktestMetrics

policy_config = {
    "threshold_multiplier": 2.0,
    "max_position_size": backtest_config.get("max_position_size", 1.0),
}
policy = SignalPolicy(policy_config)
risk_manager = RiskManager({"max_position_size": backtest_config.get("max_position_size", 1.0)})
execution_cfg = {
    "slippage_bps": backtest_config.get("slippage_bps", 0.0),
    "fee_bps": backtest_config.get("fee_bps", 0.0),
    "instrument": model_config.get("instrument", policy_config.get("instrument", "UNKNOWN")),
}
executor = ExecutionSimulator(execution_cfg)

backtest_engine = BacktestEngine(backtest_config, KalmanFilter(**kalman_settings), policy, risk_manager, executor)
backtest_features = features_df.copy().reset_index()
backtest_input_cols = ["ts", "instrument", "returns", "price", "spread"]
result_df = backtest_engine.run(backtest_features[backtest_input_cols])
result_df["ts"] = pd.to_datetime(result_df["ts"], utc=False)
result_df = result_df.set_index("ts").sort_index()

initial_capital = float(backtest_config.get("initial_capital", 100000.0))
prev_equity = result_df["equity"].shift(1).fillna(initial_capital)
result_df["returns"] = result_df["pnl"] / prev_equity.replace(0, np.nan)
result_df["returns"] = result_df["returns"].fillna(0.0)
result_df["underwater"] = -result_df["drawdown"]

assert np.isfinite(result_df["equity"]).all(), "Equity series contains non-finite values"
assert (result_df["underwater"] <= 1e-12).all(), "Underwater series must be <= 0"

if np.allclose(backtest_features["returns"], 0.0, atol=1e-12):
    assert result_df["pnl"].sum() <= 1e-8, "PnL should not be positive with zero returns"

notebook_dir.mkdir(parents=True, exist_ok=True)

equity_path = notebook_dir / "equity.png"
plt.figure(figsize=(10, 4))
plt.plot(result_df.index, result_df["equity"], label="Equity")
plt.title("Equity Curve")
plt.xlabel("Timestamp")
plt.ylabel("Equity")
plt.tight_layout()
plt.savefig(equity_path)
plt.close()

underwater_path = notebook_dir / "underwater.png"
plt.figure(figsize=(10, 4))
plt.plot(result_df.index, result_df["underwater"], label="Underwater")
plt.title("Underwater Curve")
plt.xlabel("Timestamp")
plt.ylabel("Drawdown")
plt.tight_layout()
plt.savefig(underwater_path)
plt.close()

metrics = BacktestMetrics(result_df).to_dataframe()
elapsed = (result_df.index[-1] - result_df.index[0]).total_seconds() if len(result_df) > 1 else 0.0
years = max(elapsed / (365.25 * 24 * 3600), 1.0 / 252.0)
end_equity = float(result_df["equity"].iloc[-1])
cagr = (end_equity / initial_capital) ** (1.0 / years) - 1.0 if end_equity > 0 else float("nan")
metrics["cagr"] = cagr

metrics_path = notebook_dir / "metrics.csv"
metrics.to_csv(metrics_path, index=False)

print({"equity_path": str(equity_path), "underwater_path": str(underwater_path), "metrics_path": str(metrics_path)})
print(metrics)
try:
    from IPython.display import display
    display(metrics)
except Exception as exc:
    print(f"Display unavailable: {exc}")



{"function": "KalmanFilter.predict", "duration_ms": 43.04760200102464, "timestamp": "2025-10-24T21:48:26.993174Z", "level": "info", "event": "profiled_function"}
backtest_summary total_trades=0 final_equity=nan sharpe=0.0000
{"function": "BacktestEngine.run", "duration_ms": 50.32477900022059, "timestamp": "2025-10-24T21:48:26.999864Z", "level": "info", "event": "profiled_function"}


AssertionError: Equity series contains non-finite values

In [12]:
# 6) Print constants + assumptions & failure modes
summary = {
    "fees_bps": float(backtest_config.get("fee_bps", 0.0)),
    "slippage_bps": float(backtest_config.get("slippage_bps", 0.0)),
    "feature_window": int(feature_computer.window_size),
    "ewma_span": int(feature_computer.ewma_span),
    "seed": int(seed_value),
    "risk_cap": float(policy_config.get("max_position_size", 1.0)),
    "threshold_multiplier": float(policy_config.get("threshold_multiplier", 2.0)),
    "warmup_len": int(warmup_len),
    "data_source": data_source,
}
print(json.dumps(summary, indent=2, sort_keys=True))
print("Assumptions & Failure Modes")
bullets = [
    "- No look-ahead: signals at t use data <= t-1; breach -> optimistic bias",
    "- Warmup handling: metrics ignore first warmup_len rows; mis-set -> NaNs/leakage",
    "- Determinism: same seed+config reproduces identical features/ticks; failed determinism -> flaky tests",
    "- Fees/slippage units: interpreted as bps fractions; mismatch -> wrong PnL",
    "- Data validity: bid<=ask, price>0, monotone time index; violation -> invalid features/backtest",
]
for line in bullets:
    print(line)



{
  "data_source": "golden_parquet",
  "ewma_span": 20,
  "feature_window": 20,
  "fees_bps": 1.0,
  "risk_cap": 1.0,
  "seed": 42,
  "slippage_bps": 5.0,
  "threshold_multiplier": 2.0,
  "warmup_len": 20
}
Assumptions & Failure Modes
- No look-ahead: signals at t use data <= t-1; breach -> optimistic bias
- Warmup handling: metrics ignore first warmup_len rows; mis-set -> NaNs/leakage
- Determinism: same seed+config reproduces identical features/ticks; failed determinism -> flaky tests
- Fees/slippage units: interpreted as bps fractions; mismatch -> wrong PnL
- Data validity: bid<=ask, price>0, monotone time index; violation -> invalid features/backtest
