# Algorithmic Trading Pipeline
# Complete Workflow: Data → Model → Strategy → Backtest → Analysis

This notebook demonstrates the full pipeline for:
1. Loading/creating data catalog
2. Model hyperparameter optimization
3. Strategy hyperparameter optimization
4. Final backtest execution
5. Performance analysis and visualization

---
## Stage 0: Setup & Configuration
Load dependencies and configuration

In [1]:
# Debugging aid: automatically reload edited project modules before each cell runs
%load_ext autoreload
%autoreload 2

# Core imports
from pathlib import Path
import yaml
import logging
import pandas as pd
import mlflow
from mlflow import MlflowClient
import tqdm as notebook_tqdm


# Nautilus Trader
from nautilus_trader.model.objects import Currency
from nautilus_trader.core.nautilus_pyo3 import CurrencyType
from nautilus_trader.persistence.catalog import ParquetDataCatalog
from nautilus_trader.model.data import TradeTick

# Project modules
from engine.databento_loader import DatabentoTickLoader
from engine.hparam_tuner import OptunaHparamsTuner
from engine.performance_plots import (
    get_frequency_params, align_series,
    plot_balance_breakdown, plot_cumulative_returns,
    plot_rolling_sharpe, plot_underwater,
    plot_active_returns, plot_portfolio_allocation
)

# Configure logging for the notebook session: DEBUG threshold, with each
# record prefixed by timestamp, logger name and level.
_log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_log_format, level=logging.DEBUG)

# Logger used by the cells below for stage banners and progress messages.
logger = logging.getLogger(__name__)

# MLflow client pointed at the file-based tracking store under logs/mlflow.
client = MlflowClient(tracking_uri="file:logs/mlflow")

IndentationError: unexpected indent (hparam_tuner.py, line 625)

In [None]:
def _read_yaml(path):
    """Parse the UTF-8 YAML file at ``path`` with ``yaml.safe_load`` and return the result."""
    return yaml.safe_load(path.read_text(encoding="utf-8"))


# Top-level Sapiens configuration; it names the strategy and the model to use.
sapiens_cfg_path = Path("configs/sapiens_config.yaml")
sapiens_cfg = _read_yaml(sapiens_cfg_path)

# Strategy config: located by strategy name, only its "STRATEGY" section is kept.
strategy_name = sapiens_cfg["SAPIENS_STRATEGY"]['strategy_name']
strategy_cfg_path = Path(f"strategies/{strategy_name}/strategy_config.yaml")
strategy_cfg = _read_yaml(strategy_cfg_path)["STRATEGY"]

# Model config: located by model name, only its "MODEL" section is kept.
model_name = sapiens_cfg["SAPIENS_MODEL"]['model_name']
model_cfg_path = Path(f"models/{model_name}/model_config.yaml")
model_cfg = _read_yaml(model_cfg_path)["MODEL"]

# Make sure the configured logs directory exists (parents included).
logs_dir = Path(sapiens_cfg["logs_dir"])
logs_dir.mkdir(parents=True, exist_ok=True)

# Echo the key settings so the reader can confirm what was loaded.
print("Configuration loaded successfully")
print(f"Model: {model_cfg['PARAMS']['model_name']}")
print(f"Strategy: {strategy_cfg['PARAMS']['strategy_name']}")
print(f"Backtest period: {sapiens_cfg['backtest_start']} to {sapiens_cfg['backtest_end']}")

---
## Stage 1: Data Catalog and HyperParameter Tuner setup
Load or create the Nautilus Trader data catalog from Databento tick data, then initialize the Optuna hyperparameter tuner

In [None]:
# Configuration
FORCE_RELOAD_CATALOG = False  # Set to True to rebuild the catalog even if one exists
CATALOG_PATH = None  # Custom catalog path, or None to use the loader's default

# Initialize loader
logger.info("Initializing Databento loader...")
loader = DatabentoTickLoader(
    cfg=strategy_cfg["PARAMS"],
    venue_name=strategy_cfg["PARAMS"]["venue_name"]
)

# An explicit CATALOG_PATH takes precedence over the loader's own catalog path.
catalog_path = Path(CATALOG_PATH) if CATALOG_PATH else loader.catalog_path

# Reuse the catalog when the loader reports one at this path, unless a rebuild is forced.
if not FORCE_RELOAD_CATALOG and loader.catalog_exists(catalog_path):
    logger.info(f"📂 Reusing existing catalog at: {catalog_path}")
    catalog = ParquetDataCatalog(path=str(catalog_path))
else:
    logger.info(f"🔄 Loading Databento ticks to catalog at: {catalog_path}")
    if FORCE_RELOAD_CATALOG:
        logger.info("Force reload enabled - rebuilding catalog")

    # Build the catalog through the loader and keep the catalog object it returns.
    catalog = loader.load_to_catalog(
        catalog_path=catalog_path,
    )

# Record the catalog location in the strategy config
# (presumably read downstream by the tuner/strategy - TODO confirm).
strategy_cfg["PARAMS"]["catalog_path"] = str(catalog_path)

# Verify catalog by collecting the instrument id values it contains.
# NOTE(review): an earlier variant, catalog.instruments(instrument_type=TradeTick),
# was dropped as too slow on a laptop, with a suggestion to use the loader's
# instruments property instead - confirm whether the query below is the intended one.
instruments = set(inst.id.value for inst in catalog.instruments())
print(f"\n✅ Catalog ready: {catalog.list_data_types()} data loaded")
# Sort so the printed universe is stable across runs (set iteration order is arbitrary).
print(f"Universe: {sorted(str(symbol) for symbol in instruments)}")


In [None]:
# Build the Optuna-based tuner from the configs and catalog prepared in the
# earlier cells; the logs directory doubles as its run directory.
tuner = OptunaHparamsTuner(
    run_dir=logs_dir,
    catalog=catalog,
    sapiens_config=sapiens_cfg,
    model_config=model_cfg,
    strategy_config=strategy_cfg,
)

# Report the configured trial budget for each optimization stage.
print("Hyperparameter tuner initialized")
print(f"Model trials: {sapiens_cfg['SAPIENS_MODEL']['optimization']['n_trials']}")
print(f"Strategy trials: {sapiens_cfg['SAPIENS_STRATEGY']['optimization']['n_trials']}")

---
## Stage 2: Model Hyperparameter Optimization
Optimize model hyperparameters using Optuna

In [None]:
# Run model hyperparameter optimization.
# Stage banner goes through the logger so it lines up with the tuner's own log output.
logger.info("\n" + "="*70)
logger.info("🔬 STAGE 2: MODEL HYPERPARAMETER OPTIMIZATION")
logger.info("="*70 + "\n")

# The result is indexed below by 'model_path' and 'mlflow_run_id'.
model_results = tuner.optimize_model()

print("\n✅ Model optimization complete!")
print(f"Best model path: {model_results['model_path']}")
print(f"MLflow run ID: {model_results['mlflow_run_id']}")

---
## Stage 3: Strategy Hyperparameter Optimization
Optimize strategy hyperparameters using best model from Stage 2

In [None]:
# Run strategy hyperparameter optimization.
# (autoreload is already enabled in the setup cell, so it is not loaded again here.)
logger.info("\n" + "="*70)
logger.info("📊 STAGE 3: STRATEGY HYPERPARAMETER OPTIMIZATION")
logger.info("="*70 + "\n")

# Optimize the strategy for the model named in the model config.
model_name = model_cfg['PARAMS']['model_name']
strategy_results = tuner.optimize_strategy(model_name=model_name)

print("\n✅ Strategy optimization complete!")
print(f"Best hyperparameters: {strategy_results['hparams']}")
print("\nBest metrics:")
for metric, value in strategy_results['metrics'].items():
    # Fall back to plain text for values that cannot take a float format
    # (e.g. None or a string), so a long optimization run does not end in a
    # formatting error.
    try:
        shown = f"{value:.4f}"
    except (TypeError, ValueError):
        shown = str(value)
    print(f"  {metric}: {shown}")
print(f"\nMLflow run ID: {strategy_results['mlflow_run_id']}")

---
## Stage 4: Final Backtest
Run final backtest on full period with optimized hyperparameters

In [None]:
# Fetch the strategy HPO run from MLflow and read its "optimization_id" tag,
# using an empty string when the tag is not set.
run = client.get_run(strategy_results["mlflow_run_id"])
optimization_id = run.data.tags.get("optimization_id", "")

# The backtest window comes straight from the Sapiens configuration.
backtest_start, backtest_end = (
    sapiens_cfg[key] for key in ("backtest_start", "backtest_end")
)

print(f"Running final backtest: {backtest_start} to {backtest_end}")

In [None]:
# Execute final backtest
logger.info("\n" + "="*70)
logger.info("🚀 STAGE 4: FINAL BACKTEST")
logger.info("="*70 + "\n")

# Run the backtest over the configured window, linked to the strategy HPO run
# and its optimization id. Returns a metrics mapping and a time-series object.
final_metrics, final_time_series = tuner.run_final_backtest(
    backtest_start=backtest_start,
    backtest_end=backtest_end,
    strategy_hpo_run_id=strategy_results["mlflow_run_id"],
    optimization_id=optimization_id
)

print("\n✅ Final backtest complete!")
print("\nFinal Performance Metrics:")
print("="*50)
for metric, value in sorted(final_metrics.items()):
    # Metric name is dot-padded to 40 columns and the value right-aligned in 10.
    # Values that cannot take a float format are shown as plain text instead
    # of raising.
    try:
        shown = f"{value:>10.4f}"
    except (TypeError, ValueError):
        shown = f"{value!s:>10}"
    print(f"{metric:.<40} {shown}")

---
## Stage 5: Performance Analysis
Detailed analysis and visualization of backtest results

In [None]:
# Load results from MLflow

# Look up the "Backtests" experiment. This step only reads results, so a
# missing experiment is reported as an error rather than created empty.
exp = client.get_experiment_by_name("Backtests")
if not exp:
    raise RuntimeError(
        'MLflow experiment "Backtests" not found - run the final backtest (Stage 4) first'
    )
exp_id = exp.experiment_id

# Get the most recent backtest run (newest start time first, one result).
# NOTE(review): this picks the latest run in the experiment, which is assumed
# to be the Stage 4 run executed above - confirm if runs can be logged concurrently.
runs = client.search_runs(
    experiment_ids=[exp_id],
    order_by=["start_time DESC"],
    max_results=1
)
if not runs:
    raise RuntimeError(
        'MLflow experiment "Backtests" contains no runs - run the final backtest (Stage 4) first'
    )
backtest_run = runs[0]
backtest_run_id = backtest_run.info.run_id

print(f"Loading backtest run: {backtest_run_id}")

In [None]:
# Download the account and positions reports logged with the backtest run.
acc_path = client.download_artifacts(run_id=backtest_run_id, path="account_report.json")
pos_path = client.download_artifacts(run_id=backtest_run_id, path="positions_report.json")

# Both reports are parsed the same way: first column as index, dates parsed.
# NOTE(review): the artifacts are named *.json but are read with read_csv -
# confirm the format they are actually written in.
read_opts = {"index_col": 0, "parse_dates": True}
account_df = pd.read_csv(acc_path, **read_opts)
positions_df = pd.read_csv(pos_path, **read_opts)

print(f"Account snapshots: {len(account_df)}")
print(f"Position records: {len(positions_df)}")

In [None]:
# Frequency settings derived from the strategy's configured `freq`.
freq = strategy_cfg['PARAMS']['freq']
freq_params = get_frequency_params(freq)

# Portfolio value: the 'total' column of the account rows in the strategy
# currency, taking the last value per resample bucket and forward-filling gaps.
currency_code = strategy_cfg['PARAMS']['currency']
currency_mask = account_df['currency'] == currency_code
portfolio_values = (
    account_df.loc[currency_mask, 'total']
    .resample(freq_params['resample_freq'])
    .last()
    .ffill()
)
# Period-over-period returns; the undefined first period is set to 0.
strategy_ret = portfolio_values.pct_change().fillna(0)

# Benchmark and risk-free close prices over the backtest window, from the catalog.
benchmark_ticker = strategy_cfg['PARAMS']['benchmark_ticker']
risk_free_ticker = strategy_cfg['PARAMS']['risk_free_ticker']
data_dict = tuner.get_ohlcv_data_from_catalog(
    frequency=freq,
    start=pd.Timestamp(backtest_start),
    end=pd.Timestamp(backtest_end),
    instrument_ids=[benchmark_ticker, risk_free_ticker],
)
benchmark_ret = data_dict[benchmark_ticker]['close'].pct_change()
rf_ret = data_dict[risk_free_ticker]['close'].pct_change()

# Align the three return series using the resample frequency.
strategy_ret, benchmark_ret, rf_ret = align_series(
    strategy_ret,
    benchmark_ret,
    rf_ret,
    freq_params['resample_freq'],
)

print(f"Returns calculated for {len(strategy_ret)} periods")

### 5.1: Balance Over Time

In [None]:
# Balance figure built from the account report, using the resample frequency
# derived from the strategy's `freq`.
fig = plot_balance_breakdown(
    resample_freq=freq_params['resample_freq'],
    account_df=account_df,
)

### 5.2: Cumulative Returns

In [None]:
# Cumulative-returns figure for the strategy against the benchmark.
fig = plot_cumulative_returns(strategy_ret=strategy_ret, benchmark_ret=benchmark_ret)

### 5.3: Rolling Sharpe Ratio

In [None]:
# Rolling window: one twelfth of the periods in a year, but never fewer than 10.
window = max(10, int(freq_params['periods_per_year'] / 12))

fig = plot_rolling_sharpe(
    strategy_ret=strategy_ret, benchmark_ret=benchmark_ret, rf_ret=rf_ret,
    window=window,
    annualization_factor=freq_params['annualization_factor'],
)

### 5.4: Drawdown Analysis

In [None]:
# Underwater (drawdown) figure computed from the strategy returns only.
fig = plot_underwater(
    strategy_ret=strategy_ret,
)

### 5.5: Active Returns

In [None]:
# Active-returns figure: strategy versus benchmark at the strategy frequency.
fig = plot_active_returns(strategy_ret=strategy_ret, benchmark_ret=benchmark_ret, freq=freq)

### 5.6: Portfolio Allocation

In [None]:
"""
fig = plot_portfolio_allocation(
    positions_df=positions_df,
    resample_freq=freq_params['resample_freq']
)
"""

### 5.7: Summary Statistics

In [None]:
# Calculate summary statistics
# Guard: every ratio below divides by the number of periods, so fail with a
# clear message instead of a ZeroDivisionError when the return series is empty.
n_periods = len(strategy_ret)
if n_periods == 0:
    raise ValueError("strategy_ret is empty - cannot compute summary statistics")

# Compounded return over the whole series, then scaled to a one-year horizon.
total_return = (1 + strategy_ret).prod() - 1
annualized_return = (1 + total_return) ** (freq_params['periods_per_year'] / n_periods) - 1
# NOTE(review): assumes 'annualization_factor' is the volatility scaling factor
# (typically sqrt(periods_per_year)) - confirm against get_frequency_params.
annualized_vol = strategy_ret.std() * freq_params['annualization_factor']
# Mean return in excess of the mean risk-free return, per unit of strategy
# volatility, annualized with the same factor.
sharpe_ratio = (strategy_ret.mean() - rf_ret.mean()) / strategy_ret.std() * freq_params['annualization_factor']

# Drawdown: relative distance of the cumulative curve from its running peak.
cumulative = (1 + strategy_ret).cumprod()
running_max = cumulative.expanding().max()
drawdown = (cumulative - running_max) / running_max
max_drawdown = drawdown.min()

# Win rate: share of periods with a strictly positive return.
winning_periods = (strategy_ret > 0).sum()
win_rate = winning_periods / n_periods

# Create summary DataFrame (values pre-formatted as text, except the period count).
summary = pd.DataFrame({
    'Metric': [
        'Total Return (%)',
        'Annualized Return (%)',
        'Annualized Volatility (%)',
        'Sharpe Ratio',
        'Max Drawdown (%)',
        'Win Rate (%)',
        'Number of Periods'
    ],
    'Value': [
        f"{total_return * 100:.2f}",
        f"{annualized_return * 100:.2f}",
        f"{annualized_vol * 100:.2f}",
        f"{sharpe_ratio:.2f}",
        f"{max_drawdown * 100:.2f}",
        f"{win_rate * 100:.2f}",
        n_periods
    ]
})

print("\n" + "="*60)
print("PERFORMANCE SUMMARY")
print("="*60)
print(summary.to_string(index=False))

---
## Comparison Matrix: All Models × Strategies

In [None]:
# Strategy HPO results matrix, keyed on the "total_pnl_pct" metric.
hpo_matrix = tuner.get_strategy_hpo_matrix(metric="total_pnl_pct")
print("\nStrategy HPO Results Matrix (total_pnl_pct):", hpo_matrix, sep="\n")

# Final backtest results matrix, keyed on the "sharpe_ratio" metric.
backtest_matrix = tuner.get_final_backtest_matrix(metric="sharpe_ratio")
print("\nFinal Backtest Results Matrix (sharpe_ratio):", backtest_matrix, sep="\n")

---
## Pipeline Complete ✅

**Next Steps:**
- Review MLflow UI: `mlflow ui --backend-store-uri logs/mlflow`
- Explore experiment tracking and compare runs
- Adjust hyperparameters in `configs/sapiens_config.yaml` (or in the strategy/model config files it selects) and rerun
- Export results for production deployment