# 04 - Model Analysis

Analyze trained model predictions and performance.

**Contents:**
- Load trained models (Ridge, Transformer, TFT)
- Prediction vs actual scatter plots
- Direction accuracy analysis
- Attention weight visualization (Transformer)
- Feature importance (Ridge coefficients, TFT Variable Selection)

In [None]:
# === Colab Auto-Detection ===
# On Google Colab: clone the repository (if missing), install it in editable
# mode, mount Google Drive and copy any parquet files found there into the
# repository's data/ folders. Anywhere else: put ../src on the import path.
import os
import sys

if "google.colab" not in sys.modules:
    sys.path.insert(0, "../src")
else:
    import shutil
    import subprocess
    from pathlib import Path

    repo_dir = "/content/quant-lab"
    if not os.path.exists(repo_dir):
        subprocess.run(
            ["git", "clone", "https://github.com/Mohit1053/quant-lab.git", repo_dir],
            check=True,
        )
    # The editable install below targets ".", so the cwd must be the repo root.
    # NOTE(review): later cells read '../data/features' and '../outputs/models';
    # with the cwd set here those resolve outside the repo -- confirm on Colab.
    os.chdir(repo_dir)
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-e", "."], check=True)

    from google.colab import drive
    drive.mount("/content/drive", force_remount=False)

    # Copy (not symlink) parquet files from Drive when the folder is present.
    drive_data = Path("/content/drive/MyDrive/quant_lab/data")
    if drive_data.exists():
        for sub in ("raw", "cleaned", "features"):
            src = drive_data / sub
            if not src.exists():
                continue
            dst = Path("data") / sub
            dst.mkdir(parents=True, exist_ok=True)
            for parquet_file in src.glob("*.parquet"):
                shutil.copy(parquet_file, dst / parquet_file.name)
    print("Colab setup complete!")


In [None]:
import numpy as np
import pandas as pd
import torch
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from quant_lab.models.linear_baseline import RidgeBaseline
from quant_lab.features.feature_store import FeatureStore
from quant_lab.data.datasets import TemporalSplit, create_flat_datasets

In [None]:
# Load feature data and build the held-out test split.
store = FeatureStore('../data/features')
df = store.load_features('nifty50_features')

# Every column that is not an identifier or a raw price/volume field is
# treated as a model input.
base_cols = {'date', 'ticker', 'open', 'high', 'low', 'close', 'volume', 'adj_close'}
feature_cols = list(filter(lambda col: col not in base_cols, df.columns))


def _log_return(prices):
    # Log of the ratio between each price and the previous row's price.
    return np.log(prices / prices.shift(1))


# Fallback: derive the target per ticker from adj_close when the stored
# features do not already contain it. feature_cols was built above, so a
# target derived here is not among the inputs.
# NOTE(review): when the column already exists in the store it IS part of
# feature_cols -- confirm create_flat_datasets handles that as intended.
target_col = 'log_return_1d'
if target_col not in df.columns:
    df[target_col] = df.groupby('ticker')['adj_close'].transform(_log_return)

split = TemporalSplit(train_end='2021-12-31', val_end='2023-06-30')
datasets = create_flat_datasets(df, feature_cols, split, target_col=target_col)
X_test, y_test, meta_test = datasets['test']
print(f'Test set: {len(y_test)} samples')

In [None]:
# Load Ridge baseline, plot its test-set predictions against the actual
# targets, and print the evaluation metrics.
from pathlib import Path

model_path = Path('../outputs/models/ridge_baseline.pkl')
if not model_path.exists():
    print('No Ridge model found. Run: python scripts/run_pipeline.py')
else:
    # NOTE(review): load()'s return value is ignored here -- confirm that
    # RidgeBaseline.load restores the fitted state on `model` in place.
    model = RidgeBaseline()
    model.load(model_path)
    predictions = model.predict(X_test)

    axis_labels = {'x': 'Actual Return', 'y': 'Predicted Return'}
    fig = px.scatter(
        x=y_test,
        y=predictions,
        labels=axis_labels,
        title='Ridge Baseline: Predicted vs Actual Returns',
        opacity=0.3,
    )
    # Dashed y = x reference line, drawn over the fixed range [-0.05, 0.05].
    diagonal = go.Scatter(
        x=[-0.05, 0.05],
        y=[-0.05, 0.05],
        mode='lines',
        name='Perfect',
        line={'dash': 'dash', 'color': 'red'},
    )
    fig.add_trace(diagonal)
    fig.update_layout(height=500, template='plotly_white')
    fig.show()

    # Metrics
    metrics = model.evaluate(X_test, y_test)
    for metric_name, metric_value in metrics.items():
        print(f'{metric_name}: {metric_value:.6f}')

In [None]:
# Load Transformer model (if available).
# `Path` is the name imported in the Ridge baseline cell above.
transformer_path = Path('../outputs/models/transformer/final_model.pt')
if not transformer_path.exists():
    print('No Transformer model found. Run: python scripts/train_forecaster.py')
else:
    # Imported only when a checkpoint is present.
    from quant_lab.models.transformer.model import TransformerForecaster

    tf_model = TransformerForecaster.load(transformer_path)
    print(f'Transformer loaded: {tf_model.count_parameters()} parameters')

In [None]:
# Feature importance (Ridge coefficients)
# Runs only when the Ridge model file exists, i.e. under the same condition
# that lets the Ridge baseline cell define `model`.
if model_path.exists():
    importance = model.get_feature_importance(feature_cols)
    imp_df = pd.DataFrame({
        'feature': [*importance.keys()],
        'importance': [*importance.values()],
    })
    # Rank by absolute value so large negative entries are kept too, then
    # take the first 20 rows.
    imp_df = imp_df.sort_values(
        'importance', key=lambda col: col.abs(), ascending=False
    ).iloc[:20]

    fig = px.bar(
        imp_df,
        x='feature',
        y='importance',
        title='Ridge Feature Importance (Top 20)',
        color='importance',
        color_continuous_scale='RdBu_r',
    )
    fig.update_layout(height=400, template='plotly_white')
    fig.show()