
Research_mlflow.ipynb - Model Training & MLflow Visualization
==============================================================
This notebook demonstrates model lifecycle (initialize, predict, update) 
with MLflow tracking for training metrics visualization.


In [1]:
"""
research_mlflow.ipynb - Model Training & MLflow Visualization
==============================================================
This notebook demonstrates model lifecycle (initialize, predict, update) 
with MLflow tracking for training metrics visualization.
"""

# %% [markdown]
# # UMI Model Training with MLflow Tracking
# This notebook tests the model's core functions with sample data and visualizes training metrics

# %% Import dependencies
import calendar
import sys
import os
sys.path.append(os.path.abspath('..'))

import pandas as pd
import numpy as np
import torch
import mlflow
import mlflow.pytorch
from pathlib import Path
import yaml
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from pandas.tseries.frequencies import to_offset
import pandas_market_calendars as market_calendars

# Import model and utilities
from models.UMIModel.UMIModel import UMIModel
from algos.engine.data_loader import CsvBarLoader
from models.utils import freq2pdoffset, freq2pdoffset

# %% Load configuration
config_path = Path("configs/config.yaml")
cfg = yaml.safe_load(config_path.read_text(encoding="utf-8"))

print(f"Configuration loaded from: {config_path}")
print(f"Model: {cfg['model_name']}")
print(f"Data directory: {cfg['data_dir']}")
print(f"Frequency: {cfg['freq']}")
print(f"Retrain offset: {cfg['training']['retrain_offset']}")
print(f"Prediction length: {cfg['pred_len']} bars")

# %% Setup MLflow
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Model_Research")

print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
print(f"Active experiment: {mlflow.get_experiment_by_name('Model_Research').name}")

# %% Load sample data
# Initialize data loader
cols = ['Open', 'High', 'Low', 'Adj Close', 'Volume']
loader = CsvBarLoader(cfg=cfg, columns_to_load=cols)

# Select a subset of stocks for testing
test_universe = loader.universe[:10]  # Use first 10 stocks for testing
print(f"Test universe: {test_universe}")

# Prepare sample data windows
backtest_start = pd.Timestamp(cfg["backtest_start"], tz="UTC")
train_offset = freq2pdoffset(cfg["train_offset"])
valid_offset = freq2pdoffset(cfg["valid_offset"])
train_end = backtest_start + train_offset
valid_end = train_end

# Create data dictionary for selected stocks
sample_data = {}
calendar = market_calendars.get_calendar(cfg["calendar"])
days_range = calendar.schedule(start_date=backtest_start, end_date=valid_end)
timestamps = market_calendars.date_range(days_range, frequency=cfg["freq"]).normalize()
for ticker in test_universe:
    if ticker in loader._frames:
        df = loader._frames[ticker]
        # Get data up to validation end for initial training
        df.index = df.index.normalize()
        sample_data[ticker] = df.reindex(timestamps).dropna()
        print(f"  {ticker}: {len(sample_data[ticker])} bars")

assert len(sample_data) > 0 
# %% Initialize model with MLflow tracking
print("\n=== Model Initialization ===")

# Start MLflow run
with mlflow.start_run(run_name=f"research_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
    
    # Model parameters
    model_params = {
        "freq": cfg["freq"],
        "feature_dim": cfg["feature_dim"],
        "window_len": cfg["window_len"],
        "pred_len": cfg["pred_len"],
        "train_offset": freq2pdoffset(cfg["train_offset"]),
        "pred_offset": freq2pdoffset(cfg["freq"])*int(cfg["pred_len"]),
        "train_end": train_end,
        "valid_end": valid_end,
        "n_epochs": 1,  # Reduced for testing
        "batch_size": cfg["training"]["batch_size"],
        "patience": cfg["training"]["patience"],
        "pretrain_epochs": cfg["training"]["pretrain_epochs"],
        "training_mode": cfg["training"]["training_mode"],
        "close_idx": cfg["training"]["target_idx"],
        "warm_start": cfg["training"]["warm_start"],
        "warm_training_epochs": cfg["training"]["warm_training_epochs"],
        "save_backups": False,
        "data_dir": Path(cfg["data_dir"]),
        "model_dir": Path("logs/research_model"),
        "calendar": cfg["calendar"],   #Equity market calendar
    }
    
    # Use HP tuner like strategy does
    from algos.engine.hparam_tuner import OptunaHparamsTuner, split_hparam_cfg
    
    defaults, search_space = split_hparam_cfg(cfg["hparams"])
    hp_values = {}
    
    # Test hyperparameter tuning workflow
    if cfg["training"].get("tune_hparams", False):
        print("Testing HPO workflow...")
        tuner = OptunaHparamsTuner(
            model_name=cfg["model_name"],
            ModelClass=UMIModel,
            start=backtest_start,
            end=valid_end,
            logs_dir=Path("logs/research_hpo"),
            data=sample_data,
            model_params=model_params,
            defaults=defaults,
            search_space=search_space,
            n_trials=4,  # Just 2 trials for testing
            log=None,
        )
        best = tuner.optimize()
        hp_values = {**defaults, **best["hparams"]}
        print(f"HPO complete. Best hparams: {hp_values}")
    else:
        print("Using default hyperparameters...")
        hp_values = defaults

    # Create model instance
    model = UMIModel(**model_params, **hp_values)
    
    # Initialize model (trains from scratch)
    print("Training model from scratch...")
    best_val_loss = model.initialize(sample_data)
    print(f"Initial training complete. Best validation loss: {best_val_loss:.6f}")
    
    # Log model info
    mlflow.log_param("num_stocks", len(test_universe))
    mlflow.log_param("train_bars", len(sample_data[test_universe[0]]))
    
    # %% Test prediction
    print("\n=== Testing Prediction ===")
    
    # Prepare current data for prediction (use last window)
    current_time = valid_end
    active_mask = torch.ones(len(test_universe), dtype=torch.bool)
    
    predictions = model.predict(sample_data, current_time, active_mask)
    
    print(f"Predictions at {current_time}:")
    for ticker, pred in predictions.items():
        print(f"  {ticker}: {pred:.6f}")
    
    # Log predictions
    for ticker, pred in predictions.items():
        mlflow.log_metric(f"pred_{ticker}", pred)
    
    # %% Test walk-forward updates until retrain_offset
    print("\n=== Testing Walk-Forward Updates ===")
    print("This simulates the model's behavior during live trading:")
    print("1. Make predictions at each time step")
    print("2. Update model with warm-start training")
    print("3. Continue until retrain_offset is reached")
    print("4. Then re-initialize model from scratch")
    print("-" * 60)
    
    # Get update and retrain intervals from config
    from pandas.tseries.frequencies import to_offset
    update_freq = to_offset(cfg["freq"]) * cfg["pred_len"]  # Update after each prediction horizon
    retrain_offset = to_offset(cfg["training"]["retrain_offset"])
    
    print(f"Update frequency: {update_freq}")
    print(f"Retrain offset: {retrain_offset}")
    
    # Track predictions over time
    prediction_history = []
    current_time = valid_end
    last_retrain_time = valid_end
    update_count = 0
    
    # Walk forward until retrain_offset is reached
    while current_time - last_retrain_time < retrain_offset:
        # Move forward by prediction horizon
        current_time = current_time + update_freq
        update_count += 1
        
        # Check if we have data for this time
        data_available = all(
            ticker in loader._frames and 
            len(loader._frames[ticker][loader._frames[ticker].index <= current_time]) >= cfg["window_len"]
            for ticker in test_universe
        )
        
        if not data_available:
            print(f"Insufficient data at {current_time}, stopping walk-forward")
            break
        
        # Prepare data up to current time
        current_data = {}
        for ticker in test_universe:
            if ticker in loader._frames:
                df = loader._frames[ticker]
                mask = (df.index >= backtest_start) & (df.index <= current_time)
                current_data[ticker] = df[mask]
        
        # Make prediction
        predictions = model.predict(current_data, current_time, active_mask)
        
        # Store predictions
        pred_record = {"timestamp": current_time, "update_num": update_count}
        pred_record.update({f"pred_{ticker}": pred for ticker, pred in predictions.items()})
        prediction_history.append(pred_record)
        
        # Update model (warm-start training)
        print(f"Update {update_count}: {current_time.strftime('%Y-%m-%d')} - Updating model...")
        
        # Update model's time windows for warm-start training
        model.train_end = current_time
        model.valid_end = current_time
        
        model.update(current_data, current_time, active_mask)
        
        # Calculate time since last retrain for tracking
        time_since_retrain = (current_time - last_retrain_time).days
        
        # Log to MLflow
        for ticker, pred in predictions.items():
            mlflow.log_metric(f"walkfwd_pred_{ticker}", pred, step=update_count)
        mlflow.log_metric("walkfwd_update", update_count, step=update_count)
        mlflow.log_metric("days_since_retrain", time_since_retrain, step=update_count)
        
        # Print progress
        if update_count % 5 == 0 or update_count == 1:
            sample_pred = predictions[test_universe[0]]
            print(f"  Update {update_count}: Time={current_time.strftime('%Y-%m-%d')}, "
                  f"Sample pred ({test_universe[0]})={sample_pred:.6f}")
    
    print(f"\n✅ Completed {update_count} updates over {current_time - valid_end}")
    print(f"Time since last retrain: {current_time - last_retrain_time}")
    
    # Convert predictions to DataFrame for analysis
    pred_df = pd.DataFrame(prediction_history)
    pred_df.set_index('timestamp', inplace=True)
    
    # Visualize prediction evolution
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
    
    # Top plot: Prediction evolution
    for ticker in test_universe[:3]:  # Plot first 3 tickers
        pred_col = f"pred_{ticker}"
        if pred_col in pred_df.columns:
            ax1.plot(pred_df.index, pred_df[pred_col], marker='o', label=ticker, alpha=0.7)
    
    ax1.set_title("Prediction Evolution During Walk-Forward")
    ax1.set_xlabel("Time")
    ax1.set_ylabel("Predicted Return")
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    
    # Bottom plot: Prediction variance over time (model degradation indicator)
    pred_cols = [col for col in pred_df.columns if col.startswith('pred_')]
    if len(pred_cols) > 0:
        rolling_std = pred_df[pred_cols].rolling(window=min(5, len(pred_df))).std().mean(axis=1)
        ax2.plot(pred_df.index, rolling_std, marker='s', color='red', alpha=0.7)
        ax2.set_title("Model Stability (Rolling Std of Predictions)")
        ax2.set_xlabel("Time")
        ax2.set_ylabel("Prediction Volatility")
        ax2.grid(True, alpha=0.3)
        
        # Log degradation metric
        if len(rolling_std) > 1:
            degradation = (rolling_std.iloc[-1] - rolling_std.iloc[0]) / rolling_std.iloc[0] if rolling_std.iloc[0] != 0 else 0
            mlflow.log_metric("model_degradation_pct", degradation * 100)
            ax2.text(0.02, 0.98, f"Degradation: {degradation*100:.1f}%", 
                    transform=ax2.transAxes, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
    
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    # %% Test model re-initialization after retrain_offset
    print("\n=== Testing Model Re-initialization ===")
    print(f"Re-initializing model at {current_time} (after {update_count} updates)")
    
    # Prepare extended training data for re-initialization
    retrain_data = {}
    for ticker in test_universe:
        if ticker in loader._frames:
            df = loader._frames[ticker]
            # Include more recent data for retraining
            mask = (df.index >= backtest_start) & (df.index <= current_time)
            retrain_data[ticker] = df[mask]
    
    # Update model's train/valid end dates for re-initialization
    model.train_end = current_time - pd.DateOffset(days=30)  # Keep some data for validation
    model.valid_end = current_time
    
    # Re-initialize model (full retraining from scratch)
    print("Re-training model from scratch with updated data...")
    model._is_initialized = False  # Reset initialization flag
    new_val_loss = model.initialize(retrain_data)
    print(f"Re-initialization complete. New validation loss: {new_val_loss:.6f}")
    
    # Log re-initialization metrics
    mlflow.log_metric("retrain_val_loss", new_val_loss)
    mlflow.log_metric("retrain_after_updates", update_count)
    
    # Make predictions with re-initialized model
    retrain_predictions = model.predict(retrain_data, current_time, active_mask)
    
    print(f"\nPredictions after re-initialization:")
    for ticker in test_universe[:5]:
        if ticker in retrain_predictions:
            print(f"  {ticker}: {retrain_predictions[ticker]:.6f}")
    
    # %% Compare predictions before and after re-initialization
    print("\n=== Comparing Predictions: Before vs After Re-initialization ===")
    
    if len(prediction_history) > 0:
        # Get last predictions before re-initialization
        last_update_preds = prediction_history[-1]
        
        comparison_data = []
        for ticker in test_universe:
            before = last_update_preds.get(f"pred_{ticker}", 0)
            after = retrain_predictions.get(ticker, 0)
            change = after - before
            change_pct = (change / abs(before) * 100) if before != 0 else 0
            
            comparison_data.append({
                "Ticker": ticker,
                "Before Retrain": f"{before:.6f}",
                "After Retrain": f"{after:.6f}",
                "Change": f"{change:.6f}",
                "Change %": f"{change_pct:.2f}%"
            })
        
        comparison_df = pd.DataFrame(comparison_data)
        print(comparison_df.to_string(index=False))
        
        # Visualize the comparison
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
        
        # Plot 1: Before vs After
        tickers = [d["Ticker"] for d in comparison_data]
        before_vals = [float(d["Before Retrain"]) for d in comparison_data]
        after_vals = [float(d["After Retrain"]) for d in comparison_data]
        
        x = np.arange(len(tickers))
        width = 0.35
        
        ax1.bar(x - width/2, before_vals, width, label='Before Retrain', alpha=0.7)
        ax1.bar(x + width/2, after_vals, width, label='After Retrain', alpha=0.7)
        ax1.set_xlabel('Ticker')
        ax1.set_ylabel('Predicted Return')
        ax1.set_title('Predictions: Before vs After Re-initialization')
        ax1.set_xticks(x)
        ax1.set_xticklabels(tickers, rotation=45)
        ax1.legend()
        ax1.grid(True, alpha=0.3)
        
        # Plot 2: Change distribution
        changes = [float(d["Change"]) for d in comparison_data]
        ax2.bar(tickers, changes, alpha=0.7, color=['g' if c > 0 else 'r' for c in changes])
        ax2.set_xlabel('Ticker')
        ax2.set_ylabel('Change in Prediction')
        ax2.set_title('Change After Re-initialization')
        ax2.set_xticklabels(tickers, rotation=45)
        ax2.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
        ax2.grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()
        
        # Log comparison metrics
        avg_change = np.mean(np.abs(changes))
        mlflow.log_metric("avg_prediction_change_after_retrain", avg_change)
        print(f"\nAverage absolute change in predictions: {avg_change:.6f}")

# %% Visualize MLflow metrics
print("\n=== MLflow Metrics Visualization ===")

# Get the last run
client = mlflow.tracking.MlflowClient()
experiment = client.get_experiment_by_name("Model_Research")
runs = client.search_runs(experiment_ids=[experiment.experiment_id], max_results=1)

if runs:
    run_id = runs[0].info.run_id
    print(f"Visualizing metrics from run: {run_id}")
    
    # Get metrics history
    metric_names = ["train_loss_stock", "train_loss_market", "train_loss_pred", "val_loss"]
    
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    axes = axes.flatten()
    
    for i, metric_name in enumerate(metric_names):
        metric_history = client.get_metric_history(run_id, metric_name)
        if metric_history:
            steps = [m.step for m in metric_history]
            values = [m.value for m in metric_history]
            axes[i].plot(steps, values, marker='o')
            axes[i].set_title(metric_name)
            axes[i].set_xlabel("Epoch")
            axes[i].set_ylabel("Loss")
            axes[i].grid(True)
    
    plt.tight_layout()
    plt.show()
    
    # Print final metrics
    print("\n=== Final Metrics ===")
    for key, value in runs[0].data.metrics.items():
        print(f"{key}: {value:.6f}")

# %% Compare multiple training runs (if available)
print("\n=== Training Runs Comparison ===")

# Get all runs from the experiment
all_runs = client.search_runs(experiment_ids=[experiment.experiment_id])

if len(all_runs) > 1:
    # Create comparison dataframe
    comparison_data = []
    for run in all_runs:
        run_data = {
            "run_id": run.info.run_id[:8],
            "start_time": pd.Timestamp(run.info.start_time, unit='ms'),
            "duration_min": (run.info.end_time - run.info.start_time) / 60000,
        }
        # Add metrics
        for key, value in run.data.metrics.items():
            if "best" in key or "final" in key:
                run_data[key] = value
        comparison_data.append(run_data)
    
    comparison_df = pd.DataFrame(comparison_data)
    print(comparison_df.to_string())
else:
    print("Only one run available. Run the notebook multiple times to compare runs.")

# %% Summary statistics for walk-forward simulation
print("\n=== Walk-Forward Simulation Summary ===")
print(f"Total updates performed: {update_count}")
print(f"Time period covered: {valid_end} to {current_time}")
print(f"Days between updates: {update_freq.days if hasattr(update_freq, 'days') else 'varies'}")
print(f"Retrain offset: {retrain_offset}")

# Calculate prediction stability
if len(prediction_history) > 1:
    pred_cols = [col for col in pred_df.columns if col.startswith('pred_')]
    for col in pred_cols:
        ticker = col.replace('pred_', '')
        values = pred_df[col].values
        stability = np.std(values)
        drift = values[-1] - values[0] if len(values) > 0 else 0
        print(f"\n{ticker} predictions:")
        print(f"  Stability (std): {stability:.6f}")
        print(f"  Drift (last-first): {drift:.6f}")
        print(f"  Mean: {np.mean(values):.6f}")

# Log summary metrics
mlflow.log_metrics({
    "total_updates": update_count,
    "prediction_stability_mean": np.mean([np.std(pred_df[col].values) for col in pred_cols]),
    "final_retrain_loss": new_val_loss,
})

# %% MLflow UI Instructions
print("\n" + "="*60)
print("To view detailed metrics in MLflow UI:")
print("1. Open terminal in the notebook directory")
print("2. Run: mlflow ui --backend-store-uri file:./mlruns")
print("3. Open browser at http://localhost:5000")
print("4. Compare runs, view metrics, and analyze experiments")
print("="*60)

# %% Test hyperparameter impact
print("\n=== Testing Hyperparameter Impact ===")

# Test different learning rates
test_hparams = [
    {"lr_stage1": 0.0001, "lr_stage2": 0.00001},
    {"lr_stage1": 0.001, "lr_stage2": 0.0001},
    {"lr_stage1": 0.005, "lr_stage2": 0.001},
]

results = []
for hp_set in test_hparams:
    with mlflow.start_run(run_name=f"hp_test_lr{hp_set['lr_stage1']}"):
        # Update hyperparameters
        test_hp = hp_values.copy()
        test_hp.update(hp_set)
        
        # Create and train model
        test_model = UMIModel(**model_params, **test_hp)
        val_loss = test_model.initialize(sample_data)
        
        # Log results
        mlflow.log_params(hp_set)
        mlflow.log_metric("final_val_loss", val_loss)
        
        results.append({
            **hp_set,
            "val_loss": val_loss
        })
        
        print(f"LR={hp_set['lr_stage1']}: Val Loss={val_loss:.6f}")

# Display results
results_df = pd.DataFrame(results)
print("\n=== Hyperparameter Test Results ===")
print(results_df.to_string())

print("\n✅ Model testing complete! Check MLflow UI for detailed metrics.")

Configuration loaded from: configs\config.yaml
Model: UMIModel
Data directory: data/
Frequency: 1D
Retrain offset: 30B
Prediction length: 1 bars
MLflow tracking URI: file:./mlruns
Active experiment: Model_Research
Test universe: ['AAPL', 'AMZN', 'GOOGL']
  AAPL: 128 bars
  AMZN: 128 bars
  GOOGL: 128 bars

=== Model Initialization ===


  from .autonotebook import tqdm as notebook_tqdm


Using default hyperparameters...
Training model from scratch...




Initial training complete. Best validation loss: inf

=== Testing Prediction ===


AssertionError: tiker dataframe size mismatch