In [1]:
# Mount Google Drive so the project and data folders under /content/drive are reachable.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch

# Report whether PyTorch can see a CUDA GPU. The device name is queried only
# when CUDA is available: torch.cuda.get_device_name(0) raises on a CPU-only
# runtime, which would abort a "Restart & Run All" before any work is done.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available - running on CPU")


True
Tesla T4


In [3]:
import os
import psutil

# Resident set size (RSS) of the kernel process, converted from bytes to GiB.
process = psutil.Process(os.getpid())
rss_bytes = process.memory_info().rss
mem_gb = rss_bytes / (1024 ** 3)
print(f"RAM used by this notebook: {mem_gb:.2f} GB")


RAM used by this notebook: 0.51 GB


In [4]:
%cd /content/drive/MyDrive/mlProject/stock-volatility-forecasting
!pip install -r requirements.txt


/content/drive/MyDrive/mlProject/stock-volatility-forecasting


In [5]:
import os
import psutil

# Resident set size (RSS) of the kernel process, converted from bytes to GiB.
process = psutil.Process(os.getpid())
rss_bytes = process.memory_info().rss
mem_gb = rss_bytes / (1024 ** 3)
print(f"RAM used by this notebook: {mem_gb:.2f} GB")


RAM used by this notebook: 0.52 GB


In [6]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# NOTE(review): this silences every warning category (including deprecations
# and numerical warnings) for the whole notebook.
warnings.filterwarnings('ignore')

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Scikit-learn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Plotting
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Custom modules
import sys

# Single source of truth for the project location on Drive.
PROJECT_ROOT = Path('/content/drive/MyDrive/mlProject/stock-volatility-forecasting')

# Make the project's src/ package importable; the membership test keeps
# sys.path from growing by one duplicate entry each time this cell is re-run.
src_dir = str(PROJECT_ROOT / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from data.loader import DataLoader as StockDataLoader, load_data
from data.preprocessor import FeatureEngineer, time_based_split
from models.transformer import create_transformer_model
from models.lstm import create_lstm_model
from models.baselines import ARIMABaseline, GARCHBaseline, HistoricalVolatilityBaseline, fit_baseline_models
from models.utils import create_dataloaders, train_model, evaluate, compute_metrics, load_model
from evaluation.compare import ModelComparator

# Logging
from loguru import logger
import yaml

# Configuration
with open(PROJECT_ROOT / 'config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Setup paths. "outputs/plots" is included because later cells write their
# HTML figures there; parents=True creates any missing intermediate folders.
for subdir in ("models", "outputs", "outputs/plots", "logs"):
    (PROJECT_ROOT / subdir).mkdir(parents=True, exist_ok=True)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")
print(f"Config loaded: {config.keys()}")

Using device: cuda
PyTorch version: 2.9.0+cu126
Config loaded: dict_keys(['data', 'features', 'splits', 'transformer', 'lstm', 'training', 'arima', 'garch', 'evaluation', 'paths', 'deployment'])


In [7]:
import os
import psutil

# Resident set size (RSS) of the kernel process, converted from bytes to GiB.
process = psutil.Process(os.getpid())
rss_bytes = process.memory_info().rss
mem_gb = rss_bytes / (1024 ** 3)
print(f"RAM used by this notebook: {mem_gb:.2f} GB")


RAM used by this notebook: 0.73 GB


## Section 2: Load All CSV Files

Load the complete 2.55GB dataset from stocks and ETFs directories.

## ⚠️ IMPORTANT: Check Variables Before Loading

Run this cell first to verify MAX_TICKERS is set correctly.

In [8]:
# ===================================================================
# PRE-FLIGHT CHECK - Verify Settings Before Loading
# ===================================================================
# Run this cell FIRST to verify your settings are correct.
# It reports (1) whether USE_RAW_DATA / MAX_TICKERS are already defined in the
# kernel and (2) how many CSV files are present in the stocks and ETFs folders.

print("="*70)
print("PRE-FLIGHT CHECK")
print("="*70)

# Check if variables exist in kernel. On a fresh kernel the names are not
# defined, so formatting them raises NameError and the except branch runs.
# NOTE(review): the loading cell later in this notebook sets MAX_TICKERS = 5000,
# so re-running this cell after loading reports "PROBLEM DETECTED" even though
# an integer limit is accepted there - confirm which behaviour is intended.
try:
    print(f"\nCurrent kernel state:")
    print(f"  USE_RAW_DATA exists: {USE_RAW_DATA}")
    print(f"  MAX_TICKERS exists: {MAX_TICKERS}")

    # Validate
    if MAX_TICKERS is not None:
        print()
        print("‚ùå PROBLEM DETECTED!")
        print(f"   MAX_TICKERS = {MAX_TICKERS} (should be None)")
        print()
        print("SOLUTION: Restart kernel to clear old variables")
        print("  1. Click: Kernel ‚Üí Restart")
        print("  2. Run cells from top in order")
        print()
    else:
        print()
        print("‚úì Settings look correct")
        print("‚úì MAX_TICKERS = None (will load ALL data)")

except NameError:
    print("\n‚úì Variables not set yet (this is normal on first run)")
    print("‚úì Proceeding to next cell will set them correctly")

# Check data paths
from pathlib import Path
stocks_path = Path("/content/drive/MyDrive/mlProject/Data/stocks")
etfs_path = Path("/content/drive/MyDrive/mlProject/Data/etfs")

# Always (re)define both file lists: a missing directory then counts as zero
# files instead of leaving the name undefined or holding a stale value from an
# earlier run of this cell.
stock_files = list(stocks_path.glob("*.csv")) if stocks_path.exists() else []
etf_files = list(etfs_path.glob("*.csv")) if etfs_path.exists() else []

print()
print("Data directory check:")
print(f"  Stocks: {stocks_path.absolute()}")
print(f"    Exists: {stocks_path.exists()}")
if stocks_path.exists():
    print(f"    Files: {len(stock_files):,} CSV files found")

print(f"  ETFs: {etfs_path.absolute()}")
print(f"    Exists: {etfs_path.exists()}")
if etfs_path.exists():
    print(f"    Files: {len(etf_files):,} CSV files found")

# Count whatever is present, so a dataset with only one of the two folders is
# reported as partial rather than as "no CSV files found".
total_files = len(stock_files) + len(etf_files)
print()
print(f"Total CSV files available: {total_files:,}")
if total_files > 5000:
    print("‚úì Full dataset detected!")
elif total_files > 100:
    print("‚ö†Ô∏è  Partial dataset detected")
elif total_files > 0:
    print("‚ö†Ô∏è  Only sample data detected")
else:
    print("‚ùå No CSV files found! Check data paths!")

print("="*70)

PRE-FLIGHT CHECK

Current kernel state:

‚úì Variables not set yet (this is normal on first run)
‚úì Proceeding to next cell will set them correctly

Data directory check:
  Stocks: /content/drive/MyDrive/mlProject/Data/stocks
    Exists: True
    Files: 5,884 CSV files found
  ETFs: /content/drive/MyDrive/mlProject/Data/etfs
    Exists: True
    Files: 2,165 CSV files found

Total CSV files available: 8,049
‚úì Full dataset detected!


In [9]:
import os
import psutil

# Resident set size (RSS) of the kernel process, converted from bytes to GiB.
process = psutil.Process(os.getpid())
rss_bytes = process.memory_info().rss
mem_gb = rss_bytes / (1024 ** 3)
print(f"RAM used by this notebook: {mem_gb:.2f} GB")


RAM used by this notebook: 0.73 GB


In [10]:
# # ===================================================================
# # LOAD ALL DATA - FULL 2.55GB DATASET
# # ===================================================================
# # This cell loads ALL 8,000+ tickers from C:/ML_Project/Data/
# # Takes 5-10 minutes depending on system

# import time

# # FORCE these values (ignore any previous kernel state)
# USE_RAW_DATA = True
# MAX_TICKERS = 2500  # None = load ALL tickers (do NOT use a number)

# print("="*70)
# print("LOADING FULL DATASET - ALL TICKERS")
# print("="*70)
# print(f"Configuration:")
# print(f"  USE_RAW_DATA = {USE_RAW_DATA}")
# print(f"  MAX_TICKERS = {MAX_TICKERS}")
# print(f"  Source: /content/drive/MyDrive/mlProject/Data/stocks and /content/drive/MyDrive/mlProject/Data/etfs")
# print("="*70)

# # Verification
# if MAX_TICKERS is not None:
#     print()
#     print("‚ö†Ô∏è  ERROR: MAX_TICKERS is not None!")
#     print(f"‚ö†Ô∏è  Current value: MAX_TICKERS = {MAX_TICKERS}")
#     print("‚ö†Ô∏è  This will limit the data loading!")
#     print()
#     print("Fix: Set MAX_TICKERS = None (not 0, not a number, but None)")
#     raise ValueError(f"MAX_TICKERS must be None to load all data, got {MAX_TICKERS}")

# print(f"\n‚úì Settings verified - will load ALL tickers")
# print(f"‚è≥ Loading... this takes 5-10 minutes\n")

# start_time = time.time()

# # Load the data
# df, summary = load_data(
#     config_path='/content/drive/MyDrive/mlProject/stock-volatility-forecasting/config.yaml',
#     use_raw_data=USE_RAW_DATA,
#     max_tickers=MAX_TICKERS  # Passing None here
# )

# load_time = time.time() - start_time

# print()
# print("="*70)
# print("DATA LOADING COMPLETE")
# print("="*70)
# print(f"‚è±Ô∏è  Time taken: {load_time:.1f} seconds ({load_time/60:.1f} minutes)")
# print(f"üìä Rows loaded: {summary['total_rows']:,}")
# print(f"üéØ Tickers: {summary['num_tickers']:,} total")
# print(f"   - Stocks: {summary['num_stocks']:,}")
# print(f"   - ETFs: {summary['num_etfs']:,}")
# print(f"üìÖ Date range: {summary['date_range'][0]} to {summary['date_range'][1]}")
# print(f"üíæ Memory: {summary['memory_mb']:.2f} MB ({summary['memory_mb']/1024:.2f} GB)")
# print(f"üìà Avg rows per ticker: {summary['avg_rows_per_ticker']:.1f}")
# print("="*70)

# # Verify we loaded ALL data (should be thousands of tickers)
# if summary['num_tickers'] < 100:
#     print()
#     print("‚ö†Ô∏è  WARNING: Only loaded", summary['num_tickers'], "tickers!")
#     print("‚ö†Ô∏è  Expected: 5,000-8,000 tickers for full dataset")
#     print("‚ö†Ô∏è  Something went wrong - check data paths in config.yaml")
# elif summary['num_tickers'] < 1000:
#     print()
#     print(f"‚ö†Ô∏è  Loaded {summary['num_tickers']} tickers - this seems low")
#     print("‚ö†Ô∏è  Full dataset should have 5,000-8,000 tickers")
# else:
#     print()
#     print(f"‚úì SUCCESS: Loaded {summary['num_tickers']:,} tickers - full dataset!")

# print()
# print("Sample data (first 10 rows):")
# display(df.head(10))

# print()
# print("Sample tickers (first 30):")
# tickers = sorted(df['Ticker'].unique())
# print(tickers[:30])

# print()
# print(f"DataFrame info:")
# print(f"  Shape: {df.shape}")
# print(f"  Columns: {df.columns.tolist()}")
# print(f"  Memory: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

# -------------------------------------------------------------------
# DATASET LOADING (optionally capped at MAX_TICKERS tickers)
# -------------------------------------------------------------------
# Calls load_data() with the project config; the banner below names the
# source folders:
#   /content/drive/MyDrive/mlProject/Data/stocks
#   /content/drive/MyDrive/mlProject/Data/etfs
# MAX_TICKERS is forwarded as max_tickers; None means "no limit".

import time

# --- Settings ---
USE_RAW_DATA = True
MAX_TICKERS = 5000   # int caps the number of tickers, None loads everything

banner = "="*70

print(banner)
print("LOADING DATASET")
print(banner)
print("Configuration:")
print(f"  USE_RAW_DATA = {USE_RAW_DATA}")
print(f"  MAX_TICKERS = {MAX_TICKERS}")
print("  Source: /content/drive/MyDrive/mlProject/Data/stocks and /content/drive/MyDrive/mlProject/Data/etfs")
print(banner)

# Both None and an integer limit are accepted; only the message differs.
print(
    "\n‚úì Settings verified - will load ALL tickers"
    if MAX_TICKERS is None
    else f"\n‚úì Settings verified - will load up to {MAX_TICKERS} tickers"
)

print("‚è≥ Loading... this may take several minutes depending on MAX_TICKERS\n")

# Time the load so the cost of this cell is visible in the output.
start_time = time.time()
df, summary = load_data(
    config_path='/content/drive/MyDrive/mlProject/stock-volatility-forecasting/config.yaml',
    use_raw_data=USE_RAW_DATA,
    max_tickers=MAX_TICKERS
)
load_time = time.time() - start_time

print()
print(banner)
print("DATA LOADING COMPLETE")
print(banner)
print(f"‚è±Ô∏è  Time taken: {load_time:.1f} seconds ({load_time/60:.1f} minutes)")
print(f"üìä Rows loaded: {summary['total_rows']:,}")
print(f"üéØ Tickers: {summary['num_tickers']:,} total")
print(f"   - Stocks: {summary['num_stocks']:,}")
print(f"   - ETFs: {summary['num_etfs']:,}")
print(f"üìÖ Date range: {summary['date_range'][0]} to {summary['date_range'][1]}")
print(f"üíæ Memory: {summary['memory_mb']:.2f} MB ({summary['memory_mb']/1024:.2f} GB)")
print(f"üìà Avg rows per ticker: {summary['avg_rows_per_ticker']:.1f}")
print(banner)

# Sanity check on the ticker count: nothing loaded, suspiciously few for an
# uncapped load, or fine. Every outcome is preceded by one blank line.
print()
if summary['num_tickers'] < 1:
    print("‚ö†Ô∏è  WARNING: No tickers loaded!")
    print("‚ö†Ô∏è  Check data paths in config.yaml and MAX_TICKERS setting.")
elif MAX_TICKERS is None and summary['num_tickers'] < 1000:
    print(f"‚ö†Ô∏è  Loaded {summary['num_tickers']} tickers - this seems low for FULL dataset")
    print("‚ö†Ô∏è  Full dataset should have 5,000-8,000 tickers.")
else:
    print(f"‚úì SUCCESS: Loaded {summary['num_tickers']:,} tickers")

print()
print("Sample data (first 10 rows):")
display(df.head(10))

print()
print("Sample tickers (first 30):")
tickers = list(df['Ticker'].unique())
tickers.sort()
print(tickers[:30])

print()
print("DataFrame info:")
print(f"  Shape: {df.shape}")
print(f"  Columns: {df.columns.tolist()}")
print(f"  Memory: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")


[32m2025-11-30 17:19:52.500[0m | [1mINFO    [0m | [36mdata.loader[0m:[36m__init__[0m:[36m27[0m - [1mDataLoader initialized with config: /content/drive/MyDrive/mlProject/stock-volatility-forecasting/config.yaml[0m
[32m2025-11-30 17:19:52.505[0m | [1mINFO    [0m | [36mdata.loader[0m:[36mload_all_csvs[0m:[36m129[0m - [1mLooking for stocks in: /content/drive/MyDrive/mlProject/Data/stocks[0m
[32m2025-11-30 17:19:52.507[0m | [1mINFO    [0m | [36mdata.loader[0m:[36mload_all_csvs[0m:[36m130[0m - [1mLooking for ETFs in: /content/drive/MyDrive/mlProject/Data/etfs[0m
[32m2025-11-30 17:19:52.510[0m | [1mINFO    [0m | [36mdata.loader[0m:[36mload_all_csvs[0m:[36m131[0m - [1mStocks path exists: True[0m
[32m2025-11-30 17:19:52.511[0m | [1mINFO    [0m | [36mdata.loader[0m:[36mload_all_csvs[0m:[36m132[0m - [1mETFs path exists: True[0m


LOADING DATASET
Configuration:
  USE_RAW_DATA = True
  MAX_TICKERS = 5000
  Source: /content/drive/MyDrive/mlProject/Data/stocks and /content/drive/MyDrive/mlProject/Data/etfs

‚úì Settings verified - will load up to 5000 tickers
‚è≥ Loading... this may take several minutes depending on MAX_TICKERS



[32m2025-11-30 17:19:52.700[0m | [1mINFO    [0m | [36mdata.loader[0m:[36mload_all_csvs[0m:[36m139[0m - [1mFound 5884 stock CSV files[0m
Loading stocks:   2%|‚ñè         | 76/5000 [00:53<1:40:34,  1.23s/it][32m2025-11-30 17:20:46.169[0m | [34m[1mDEBUG   [0m | [36mdata.loader[0m:[36mload_single_csv[0m:[36m59[0m - [34m[1mInsufficient data in /content/drive/MyDrive/mlProject/Data/stocks/SRACU.csv: 99 rows[0m
Loading stocks:   2%|‚ñè         | 107/5000 [00:53<19:16,  4.23it/s][32m2025-11-30 17:20:46.650[0m | [34m[1mDEBUG   [0m | [36mdata.loader[0m:[36mload_single_csv[0m:[36m59[0m - [34m[1mInsufficient data in /content/drive/MyDrive/mlProject/Data/stocks/SRAC.csv: 63 rows[0m
Loading stocks:   4%|‚ñé         | 186/5000 [00:54<01:38, 48.97it/s][32m2025-11-30 17:20:47.675[0m | [34m[1mDEBUG   [0m | [36mdata.loader[0m:[36mload_single_csv[0m:[36m59[0m - [34m[1mInsufficient data in /content/drive/MyDrive/mlProject/Data/stocks/SWT.csv: 99 rows[0m


DATA LOADING COMPLETE
‚è±Ô∏è  Time taken: 187.0 seconds (3.1 minutes)
üìä Rows loaded: 20,629,580
üéØ Tickers: 4,995 total
   - Stocks: 4,908
   - ETFs: 87
üìÖ Date range: 1962-01-02 00:00:00 to 2020-04-01 00:00:00
üíæ Memory: 3349.90 MB (3.27 GB)
üìà Avg rows per ticker: 4130.0

‚úì SUCCESS: Loaded 4,995 tickers

Sample data (first 10 rows):


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,Asset_Type
0,2018-06-28,21.25,21.74,21.0,21.4,21.4,7415000.0,BV,Stock
1,2018-06-29,21.370001,22.115,21.35,21.950001,21.950001,1467500.0,BV,Stock
2,2018-07-02,22.07,22.465,21.799999,22.200001,22.200001,1110400.0,BV,Stock
3,2018-07-03,22.309999,22.790001,22.309999,22.65,22.65,1013200.0,BV,Stock
4,2018-07-05,22.77,22.940001,21.9,22.25,22.25,2019900.0,BV,Stock
5,2018-07-06,22.0,22.200001,21.6,21.85,21.85,717700.0,BV,Stock
6,2018-07-09,22.01,22.344999,21.74,22.0,22.0,1112800.0,BV,Stock
7,2018-07-10,21.99,22.299999,21.93,22.030001,22.030001,419600.0,BV,Stock
8,2018-07-11,22.120001,22.17,21.83,21.9,21.9,291600.0,BV,Stock
9,2018-07-12,21.92,22.17,21.92,21.969999,21.969999,99400.0,BV,Stock



Sample tickers (first 30):
['BV', 'BVXV', 'BW', 'BWB', 'BWXT', 'BXP', 'BXS', 'BYD', 'BYND', 'BZH', 'BZM', 'CAAP', 'CABA', 'CABO', 'CACC', 'CALA', 'CALB', 'CALX', 'CAMP', 'CAPE', 'CAPR', 'CARA', 'CARE', 'CARG', 'CARO', 'CARR#', 'CARS', 'CASA', 'CASY', 'CATC']

DataFrame info:
  Shape: (20629580, 9)
  Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Ticker', 'Asset_Type']
  Memory: 3.12 GB


In [11]:
import os
import psutil

# Resident set size (RSS) of the kernel process, converted from bytes to GiB.
process = psutil.Process(os.getpid())
rss_bytes = process.memory_info().rss
mem_gb = rss_bytes / (1024 ** 3)
print(f"RAM used by this notebook: {mem_gb:.2f} GB")


RAM used by this notebook: 3.71 GB


## Section 3: Feature Engineering

Engineer 25 features from OHLCV data and compute realized volatility labels.

**Features**:
- Raw OHLCV: Open, High, Low, Close, Volume (5)
- Log returns, price ranges, ratios (3)
- Rolling statistics of returns (6)
- Rolling statistics of volume (6)
- Rolling volatility windows (3)
- Momentum indicators (2)

**Labels**: 5-day & 10-day realized volatility

In [None]:
# import sys

# # 1) Make sure Python can see your project
# sys.path.append('/content/drive/MyDrive/mlProject/stock-volatility-forecasting')

# from feature_engineer import FeatureEngineer

# # Initialize feature engineer
# engineer = FeatureEngineer(config_path='/content/drive/MyDrive/mlProject/stock-volatility-forecasting/config.yaml')

# print(f"Feature configuration:")
# print(f"  Sequence length: {engineer.sequence_length} days")
# print(f"  Horizons: {engineer.horizons}")
# print(f"  Rolling windows: {engineer.rolling_windows}")

# # Engineer features for all tickers
# print(f"\nEngineering features for {df['Ticker'].nunique()} tickers...")

# X, y, dates, ticker_labels = engineer.prepare_all_data(df)

# print(f"\n‚úì Features engineered successfully")
# print(f"  X shape: {X.shape} (samples, time_steps, features)")
# print(f"  y shape: {y.shape} (samples, horizons)")
# print(f"  dates shape: {dates.shape}")
# print(f"  Number of unique tickers: {len(set(ticker_labels))}")

# # Display feature statistics
# print(f"\nFeature statistics:")
# print(f"  Mean: {X.mean():.4f}")
# print(f"  Std: {X.std():.4f}")
# print(f"  Min: {X.min():.4f}")
# print(f"  Max: {X.max():.4f}")

# print(f"\nLabel statistics:")
# for i, horizon in enumerate(engineer.horizons):
#     print(f"  {horizon}-day volatility:")
#     print(f"    Mean: {y[:, i].mean():.6f}")
#     print(f"    Std: {y[:, i].std():.6f}")
#     print(f"    Min: {y[:, i].min():.6f}")
#     print(f"    Max: {y[:, i].max():.6f}")

# # Visualize label distributions
# fig = make_subplots(rows=1, cols=2, subplot_titles=['5-day Volatility', '10-day Volatility'])

# fig.add_trace(
#     go.Histogram(x=y[:, 0], nbinsx=50, name='5d Vol'),
#     row=1, col=1
# )

# fig.add_trace(
#     go.Histogram(x=y[:, 1], nbinsx=50, name='10d Vol'),
#     row=1, col=2
# )

# fig.update_layout(title="Realized Volatility Distributions", height=400, showlegend=False)
# fig.show()


# Build model inputs from the raw price DataFrame `df`.
# FeatureEngineer is imported from data.preprocessor in the setup cell.
engineer = FeatureEngineer(
    config_path="/content/drive/MyDrive/mlProject/stock-volatility-forecasting/config.yaml"
)

# Print the configuration and the progress message BEFORE the long-running
# call, so the output describes work that is about to start rather than work
# that has already finished.
print(f"Feature configuration:")
print(f"  Sequence length: {engineer.sequence_length} days")
print(f"  Horizons: {engineer.horizons}")
print(f"  Rolling windows: {engineer.rolling_windows}")

print(f"\nEngineering features for {df['Ticker'].nunique()} tickers...")

# NOTE(review): in the saved run this cell's log output is cut off and the next
# executed cell is numbered In [1] with 0.10 GB RSS, which suggests the kernel
# restarted during this call (possibly out of memory) - confirm, and consider
# lowering MAX_TICKERS if so.
X, y, dates, ticker_labels = engineer.prepare_all_data(df)

print(f"\n‚úì Features engineered successfully")
print(f"  X shape: {X.shape} (samples, time_steps, features)")
print(f"  y shape: {y.shape} (samples, horizons)")
print(f"  dates shape: {dates.shape}")
print(f"  Number of unique tickers: {len(set(ticker_labels))}")


[32m2025-11-30 17:30:29.781[0m | [1mINFO    [0m | [36mdata.preprocessor[0m:[36m__init__[0m:[36m30[0m - [1mFeatureEngineer initialized: T=60, horizons=[5, 10][0m
[32m2025-11-30 17:30:30.741[0m | [1mINFO    [0m | [36mdata.preprocessor[0m:[36mprepare_all_data[0m:[36m238[0m - [1mProcessing 4995 tickers...[0m
[32m2025-11-30 17:30:32.438[0m | [1mINFO    [0m | [36mdata.preprocessor[0m:[36mcreate_sequences[0m:[36m170[0m - [1mCreated 343 sequences of shape (343, 60, 25)[0m
[32m2025-11-30 17:30:34.879[0m | [1mINFO    [0m | [36mdata.preprocessor[0m:[36mcreate_sequences[0m:[36m170[0m - [1mCreated 1133 sequences of shape (1133, 60, 25)[0m
[32m2025-11-30 17:30:37.468[0m | [1mINFO    [0m | [36mdata.preprocessor[0m:[36mcreate_sequences[0m:[36m170[0m - [1mCreated 1312 sequences of shape (1312, 60, 25)[0m
[32m2025-11-30 17:30:40.956[0m | [1mINFO    [0m | [36mdata.preprocessor[0m:[36mcreate_sequences[0m:[36m170[0m - [1mCreated 417 seq

In [1]:
import os
import psutil

# Resident set size (RSS) of the kernel process, converted from bytes to GiB.
process = psutil.Process(os.getpid())
rss_bytes = process.memory_info().rss
mem_gb = rss_bytes / (1024 ** 3)
print(f"RAM used by this notebook: {mem_gb:.2f} GB")


RAM used by this notebook: 0.10 GB


## Section 4: Time-Based Train/Val/Test Split

Split data chronologically (NO SHUFFLE) to respect temporal order.

**Splits**: 70% train / 15% val / 15% test

In [None]:
# Split X / y / dates into train, validation and test parts using the ratios
# from the config (time_based_split comes from data.preprocessor).
split_result = time_based_split(
    X, y, dates,
    train_ratio=config['splits']['train'],
    val_ratio=config['splits']['val']
)
(X_train, X_val, X_test,
 y_train, y_val, y_test,
 dates_train, dates_val, dates_test) = split_result

print(f"‚úì Time-based split completed")

# Sample counts and share of the full dataset for each part.
print(f"\nSplit sizes:")
print(f"  Train: {len(X_train):,} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"  Val:   {len(X_val):,} samples ({len(X_val)/len(X)*100:.1f}%)")
print(f"  Test:  {len(X_test):,} samples ({len(X_test)/len(X)*100:.1f}%)")

# First and last date of each part.
print(f"\nDate ranges:")
print(f"  Train: {dates_train[0]} to {dates_train[-1]}")
print(f"  Val:   {dates_val[0]} to {dates_val[-1]}")
print(f"  Test:  {dates_test[0]} to {dates_test[-1]}")

# The scalers see the training part only; validation and test data are
# transformed with those already-fitted scalers.
print(f"\nFitting scalers on training data...")
engineer.fit_scalers(X_train, y_train)

# Features first (train, val, test), then labels, in the same order.
X_train_scaled, X_val_scaled, X_test_scaled = (
    engineer.transform_features(part) for part in (X_train, X_val, X_test)
)
y_train_scaled, y_val_scaled, y_test_scaled = (
    engineer.transform_labels(part) for part in (y_train, y_val, y_test)
)

print(f"‚úì Data scaled")

# Persist the fitted scalers next to the model checkpoints.
engineer.save_scalers("/content/drive/MyDrive/mlProject/stock-volatility-forecasting/models/scalers.pkl")
print(f"‚úì Scalers saved to /content/drive/MyDrive/mlProject/stock-volatility-forecasting/models/scalers.pkl")

# Wrap the scaled arrays in DataLoaders using the configured batch size.
train_loader, val_loader, test_loader = create_dataloaders(
    X_train_scaled, y_train_scaled,
    X_val_scaled, y_val_scaled,
    X_test_scaled, y_test_scaled,
    batch_size=config['training']['batch_size']
)

print(f"\n‚úì DataLoaders created")
print(f"  Train batches: {len(train_loader)}")
print(f"  Val batches: {len(val_loader)}")
print(f"  Test batches: {len(test_loader)}")

## Section 5: Hyperparameter Search - Transformer Models

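Systematic grid search over Transformer architectures to find the best configuration. Note: the code cell below trains a single Transformer with the hyperparameters from `config.yaml`; it does not iterate over the grid listed here.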

**Search Grid**:
- d_model: 64, 128, 256
- nhead: 4, 8
- num_layers: 2, 3
- dropout: 0.1, 0.2
- learning_rate: 1e-4, 3e-4, 1e-3
- batch_size: 64, 128

In [None]:
# Create Transformer model from the config and move it to the selected device.
print("Creating Transformer model...")
transformer_model = create_transformer_model(config, use_conv=False)
transformer_model = transformer_model.to(device)

print(f"‚úì Transformer created")
print(f"  Parameters: {transformer_model.count_parameters():,}")
print(f"  Architecture:")
print(transformer_model)

# Train Transformer; the checkpoint path is handed to train_model as save_path.
print(f"\nTraining Transformer...")
print(f"  Epochs: {config['training']['epochs']}")
print(f"  Batch size: {config['training']['batch_size']}")
print(f"  Learning rate: {config['training']['learning_rate']}")
print(f"  Patience: {config['training']['patience']}")

transformer_history = train_model(
    model=transformer_model,
    train_loader=train_loader,
    val_loader=val_loader,
    config=config,
    device=device,
    save_path="/content/drive/MyDrive/mlProject/stock-volatility-forecasting/models/transformer_best.pth"
)

print(f"\n‚úì Transformer training completed")
print(f"  Best val loss: {transformer_history['best_val_loss']:.6f}")
print(f"  Training time: {transformer_history['training_time']:.2f} seconds")
print(f"  Total epochs: {len(transformer_history['epochs'])}")

# Plot training curves (train vs. validation loss per epoch).
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=transformer_history['epochs'],
    y=transformer_history['train_loss'],
    mode='lines',
    name='Train Loss'
))

fig.add_trace(go.Scatter(
    x=transformer_history['epochs'],
    y=transformer_history['val_loss'],
    mode='lines',
    name='Val Loss'
))

fig.update_layout(
    title="Transformer Training Curves",
    xaxis_title="Epoch",
    yaxis_title="Loss (MSE)",
    height=500
)

# The setup cell creates outputs/ but not outputs/plots/, so make sure the
# folder exists before writing (Path is imported in the setup cell).
curves_path = Path("/content/drive/MyDrive/mlProject/stock-volatility-forecasting/outputs/plots/transformer_training_curves.html")
curves_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_html(str(curves_path))
fig.show()

print(f"‚úì Training curves saved")

## Section 6: LSTM Baseline Training

Train the LSTM baseline with the same data loaders and training configuration as the Transformer, then compare the validation-loss curves of the two models.

In [None]:
# ---------------------- LSTM baseline: build, train, compare ----------------------
heavy_rule = "="*70
light_rule = "-"*70

print(heavy_rule)
print("SECTION 6: LSTM BASELINE TRAINING")
print(heavy_rule)

# Step 1: build the LSTM from the config and move it to the selected device.
print("\n[1/3] Creating LSTM model...")
lstm_model = create_lstm_model(config, model_type='lstm').to(device)

print(f"‚úì LSTM created")
print(f"  Parameters: {lstm_model.count_parameters():,}")

# Step 2: train with the same loaders and config as the Transformer.
print(f"\n[2/3] Training LSTM (this may take several minutes)...")
print(f"  Config: {config['training']['epochs']} epochs max, batch_size={config['training']['batch_size']}, lr={config['training']['learning_rate']}")
print(f"  Early stopping patience: {config['training']['patience']}")
print("\nTraining progress (epoch-by-epoch):")
print(light_rule)

lstm_history = train_model(
    model=lstm_model,
    train_loader=train_loader,
    val_loader=val_loader,
    config=config,
    device=device,
    save_path="/content/drive/MyDrive/mlProject/stock-volatility-forecasting/models/lstm_best.pth"
)

print(light_rule)
print(f"\n‚úì LSTM training completed")
print(f"  Best val loss: {lstm_history['best_val_loss']:.6f}")
print(f"  Training time: {lstm_history['training_time']:.2f} seconds")
print(f"  Total epochs: {len(lstm_history['epochs'])}")

# Step 3: overlay both models' validation-loss curves.
# Requires transformer_history from the Transformer training cell.
print(f"\n[3/3] Generating comparison plots...")

fig = go.Figure()

val_histories = {'Transformer': transformer_history, 'LSTM': lstm_history}
for name, history in val_histories.items():
    curve = go.Scatter(
        x=history['epochs'],
        y=history['val_loss'],
        mode='lines',
        name=name
    )
    fig.add_trace(curve)

fig.update_layout(
    title="Model Comparison - Validation Loss",
    xaxis_title="Epoch",
    yaxis_title="Val Loss (MSE)",
    height=500
)

fig.show()

print(f"‚úì Section 6 completed")
print(heavy_rule)


## Section 7: Fit ARIMA & GARCH Baselines

Fit traditional time series models per ticker.

In [None]:
# Get unique tickers from training data.
# NOTE(review): this assumes ticker_labels is ordered like X before the split,
# so the first len(X_train) labels belong to training samples - confirm
# against time_based_split.
train_indices = range(len(X_train))
train_tickers = list(ticker_labels[:len(X_train)])

# Cap the number of tickers for speed. Sorting makes the selected subset
# reproducible; slicing a bare set picks a different subset on each run
# because string hashing is randomized per process.
MAX_BASELINE_TICKERS = 50
unique_train_tickers = sorted(set(train_tickers))[:MAX_BASELINE_TICKERS]

# Fail early with a clear message instead of an IndexError further down.
if not unique_train_tickers:
    raise ValueError("No training tickers available for fitting baseline models")

print(f"Fitting baseline models for {len(unique_train_tickers)} tickers...")

# Fit baseline models
baseline_models = fit_baseline_models(
    df=df,
    tickers=unique_train_tickers,
    config=config
)

print(f"\n‚úì Baseline models fitted")
print(f"  ARIMA models: {len(baseline_models['ARIMA'].models)}")
print(f"  GARCH models: {len(baseline_models['GARCH'].models)}")
print(f"  HistVol models: {len(baseline_models['HistVol'].historical_vols)}")

# Test baseline predictions on one example ticker.
test_ticker = unique_train_tickers[0]
print(f"\nExample predictions for {test_ticker}:")

for horizon in config['features']['horizons']:
    arima_pred = baseline_models['ARIMA'].predict(test_ticker, horizon)
    garch_pred = baseline_models['GARCH'].predict(test_ticker, horizon)
    hist_pred = baseline_models['HistVol'].predict(test_ticker, horizon)

    # Compare against None explicitly: a truthiness test would report a valid
    # forecast of exactly 0.0 as "N/A".
    print(f"  {horizon}-day volatility:")
    print(f"    ARIMA: {arima_pred:.6f}" if arima_pred is not None else "    ARIMA: N/A")
    print(f"    GARCH: {garch_pred:.6f}" if garch_pred is not None else "    GARCH: N/A")
    print(f"    HistVol: {hist_pred:.6f}" if hist_pred is not None else "    HistVol: N/A")

## Section 8: Evaluate All Models

Compute metrics (MSE, RMSE, MAE, R²) for all models on test set.

In [None]:
# Load best Transformer model. The architecture is rebuilt with the same
# arguments as in the training cell (use_conv=False) so the saved checkpoint
# matches the model regardless of the factory's default.
transformer_model = load_model(
    create_transformer_model(config, use_conv=False),
    "/content/drive/MyDrive/mlProject/stock-volatility-forecasting/models/transformer_best.pth",
    device
)

# Load best LSTM model (same factory arguments as in the training cell).
lstm_model = load_model(
    create_lstm_model(config, model_type='lstm'),
    "/content/drive/MyDrive/mlProject/stock-volatility-forecasting/models/lstm_best.pth",
    device
)

# Evaluate Transformer
print("Evaluating Transformer on test set...")
_, transformer_preds_scaled, test_targets_scaled = evaluate(
    transformer_model, test_loader, nn.MSELoss(), device
)

# Inverse transform predictions and targets back through the label scaler, so
# metrics are computed on unscaled values.
transformer_preds = engineer.inverse_transform_labels(transformer_preds_scaled)
test_targets = engineer.inverse_transform_labels(test_targets_scaled)

transformer_metrics = compute_metrics(test_targets, transformer_preds, ['5d', '10d'])
print(f"‚úì Transformer metrics computed")

# Evaluate LSTM (targets are reused from the Transformer evaluation above).
print("Evaluating LSTM on test set...")
_, lstm_preds_scaled, _ = evaluate(
    lstm_model, test_loader, nn.MSELoss(), device
)

lstm_preds = engineer.inverse_transform_labels(lstm_preds_scaled)
lstm_metrics = compute_metrics(test_targets, lstm_preds, ['5d', '10d'])
print(f"‚úì LSTM metrics computed")

# Create metrics table: one row per (model, horizon) pair.
metrics_data = []

for model_name, metrics in [('Transformer', transformer_metrics), ('LSTM', lstm_metrics)]:
    for horizon, horizon_metrics in metrics.items():
        metrics_data.append({
            'Model': model_name,
            'Horizon': horizon,
            **horizon_metrics
        })

metrics_df = pd.DataFrame(metrics_data)

print(f"\n{'='*80}")
print(f"MODEL PERFORMANCE ON TEST SET")
print(f"{'='*80}")
display(metrics_df)

# Save metrics
metrics_df.to_csv("/content/drive/MyDrive/mlProject/stock-volatility-forecasting/outputs/test_metrics.csv", index=False)
print(f"\n‚úì Metrics saved to /content/drive/MyDrive/mlProject/stock-volatility-forecasting/outputs/test_metrics.csv")

# Visualize metrics: a 2x2 grid with one panel per metric.
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('RMSE', 'MAE', 'R¬≤', 'MSE')
)

for i, metric in enumerate(['RMSE', 'MAE', 'R2', 'MSE']):
    # Map the flat metric index onto the 1-based subplot grid position.
    row = i // 2 + 1
    col = i % 2 + 1

    for horizon in ['5d', '10d']:
        df_h = metrics_df[metrics_df['Horizon'] == horizon]

        fig.add_trace(
            go.Bar(
                x=df_h['Model'],
                y=df_h[metric],
                name=horizon,
                showlegend=(i==0)  # one legend entry per horizon, not per panel
            ),
            row=row, col=col
        )

fig.update_layout(title="Test Set Metrics Comparison", height=800)

# The setup cell creates outputs/ but not outputs/plots/, so make sure the
# folder exists before writing (Path is imported in the setup cell).
metrics_plot_path = Path("/content/drive/MyDrive/mlProject/stock-volatility-forecasting/outputs/plots/test_metrics_comparison.html")
metrics_plot_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_html(str(metrics_plot_path))
fig.show()

## Section 9: Regime Analysis

Analyze model performance in high vs low volatility regimes.

In [None]:
# Initialize comparator
comparator = ModelComparator(horizons=['5d', '10d'])

# Add model results (unscaled targets and predictions from the evaluation cell).
comparator.add_model_results('Transformer', test_targets, transformer_preds)
comparator.add_model_results('LSTM', test_targets, lstm_preds)

# Regime analysis using the quantile thresholds from the config.
print("Performing regime analysis...")
regime_df = comparator.regime_analysis(
    quantile_high=config['evaluation']['high_vol_quantile'],
    quantile_low=config['evaluation']['low_vol_quantile']
)

# NOTE(review): the banner hard-codes the 75th/25th percentiles while the
# thresholds actually come from config['evaluation'] - keep them in sync.
print(f"\n{'='*80}")
print(f"REGIME ANALYSIS (High Vol ‚â• 75th percentile, Low Vol ‚â§ 25th percentile)")
print(f"{'='*80}")
display(regime_df)

# Save regime analysis
regime_df.to_csv("/content/drive/MyDrive/mlProject/stock-volatility-forecasting/outputs/regime_analysis.csv", index=False)
print(f"\n‚úì Regime analysis saved")

# Visualize regime performance: RMSE in the left panel, MAE in the right.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('RMSE by Regime', 'MAE by Regime')
)

for horizon in ['5d', '10d']:
    df_h = regime_df[regime_df['Horizon'] == horizon]

    # RMSE
    for regime in ['High Vol', 'Low Vol']:
        df_r = df_h[df_h['Regime'] == regime]
        fig.add_trace(
            go.Bar(
                x=df_r['Model'],
                y=df_r['RMSE'],
                name=f'{horizon} - {regime}',
                showlegend=True
            ),
            row=1, col=1
        )

    # MAE (legend hidden: the entries would duplicate the RMSE panel's)
    for regime in ['High Vol', 'Low Vol']:
        df_r = df_h[df_h['Regime'] == regime]
        fig.add_trace(
            go.Bar(
                x=df_r['Model'],
                y=df_r['MAE'],
                name=f'{horizon} - {regime}',
                showlegend=False
            ),
            row=1, col=2
        )

fig.update_layout(title="Performance by Volatility Regime", height=500)

# The setup cell creates outputs/ but not outputs/plots/, so make sure the
# folder exists before writing (Path is imported in the setup cell).
regime_plot_path = Path("/content/drive/MyDrive/mlProject/stock-volatility-forecasting/outputs/plots/regime_analysis.html")
regime_plot_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_html(str(regime_plot_path))
fig.show()

# Insights: per model and horizon, compare RMSE in the two regimes.
print(f"\n{'='*60}")
print(f"REGIME INSIGHTS")
print(f"{'='*60}")

for model in ['Transformer', 'LSTM']:
    for horizon in ['5d', '10d']:
        df_model = regime_df[(regime_df['Model'] == model) & (regime_df['Horizon'] == horizon)]

        high_rows = df_model[df_model['Regime'] == 'High Vol']['RMSE'].values
        low_rows = df_model[df_model['Regime'] == 'Low Vol']['RMSE'].values

        # A model/horizon pair may have no row for a regime; report that
        # instead of failing with an IndexError on .values[0].
        if len(high_rows) == 0 or len(low_rows) == 0:
            print(f"{model} ({horizon}): regime metrics unavailable")
            print()
            continue

        high_rmse = high_rows[0]
        low_rmse = low_rows[0]

        print(f"{model} ({horizon}):")
        print(f"  High Vol RMSE: {high_rmse:.6f}")
        print(f"  Low Vol RMSE: {low_rmse:.6f}")
        print(f"  Ratio: {high_rmse/low_rmse:.2f}x")
        print()

## Section 10: Error Analysis

Analyze the Transformer's prediction errors on the 5-day horizon and identify its worst predictions.

In [None]:
# Error analysis for Transformer (5-day horizon)
print("Analyzing Transformer errors (5-day horizon)...")

# Folder for the HTML exports written by this cell; created up front so
# `write_html` cannot fail on a missing directory.
# NOTE(review): Section 12 exports the same plots again as
# `Transformer_5d_*.html` (capitalised, n_samples=1000) — confirm both sets are wanted.
error_plots_dir = Path("/content/drive/MyDrive/mlProject/stock-volatility-forecasting/outputs/plots")
error_plots_dir.mkdir(parents=True, exist_ok=True)

error_analysis_5d = comparator.error_analysis('Transformer', horizon_idx=0)

print(f"\n{'='*60}")
print(f"ERROR STATISTICS (5-day)")
print(f"{'='*60}")
print(f"Mean error: {error_analysis_5d['mean_error']:.6f}")
print(f"Std error: {error_analysis_5d['std_error']:.6f}")
print(f"Mean absolute error: {error_analysis_5d['mean_abs_error']:.6f}")
print(f"Median absolute error: {error_analysis_5d['median_abs_error']:.6f}")
print(f"Max error: {error_analysis_5d['max_error']:.6f}")

print(f"\nError percentiles:")
for pct, value in error_analysis_5d['error_percentiles'].items():
    print(f"  {pct}: {value:.6f}")

# Plot residuals
fig = comparator.plot_residuals('Transformer', horizon_idx=0)
fig.write_html(str(error_plots_dir / "transformer_5d_residuals.html"))
fig.show()

# Plot predictions vs actual
fig = comparator.plot_predictions_vs_actual(
    'Transformer',
    horizon_idx=0,
    n_samples=500
)
fig.write_html(str(error_plots_dir / "transformer_5d_predictions.html"))
fig.show()

# Identify worst predictions
# `np.asarray` makes the positional indexing below work for any array-like.
# Assumes the three sequences are 1-D and aligned sample by sample — TODO confirm.
errors = np.asarray(error_analysis_5d['errors'])
y_true_5d = np.asarray(error_analysis_5d['y_true'])
y_pred_5d = np.asarray(error_analysis_5d['y_pred'])
abs_errors = np.abs(errors)

# `argsort` is ascending: keep the last (largest) ten and reverse them so the
# table starts with the single worst prediction.
worst_indices = np.argsort(abs_errors)[-10:][::-1]

print(f"\n{'='*60}")
print(f"TOP {len(worst_indices)} WORST PREDICTIONS")
print(f"{'='*60}")

worst_df = pd.DataFrame({
    'Index': worst_indices,
    'True_Vol': y_true_5d[worst_indices],
    'Pred_Vol': y_pred_5d[worst_indices],
    'Error': errors[worst_indices],
    'Abs_Error': abs_errors[worst_indices]
})

display(worst_df)

# Error distribution by magnitude
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=abs_errors,
    nbinsx=50,
    name='Absolute Errors'
))

# Dashed marker at the mean absolute error reported by the comparator.
fig.add_vline(
    x=error_analysis_5d['mean_abs_error'],
    line_dash="dash",
    line_color="red",
    annotation_text="Mean"
)

fig.update_layout(
    title="Error Distribution (5-day Transformer)",
    xaxis_title="Absolute Error",
    yaxis_title="Frequency",
    height=400
)

fig.show()

## Section 11: Save Best Models and Scalers

Verify that the models and scalers saved during training are present on disk, then write a `metadata.json` summary alongside them for production deployment.

In [None]:
# Models and scalers already saved during training
# Let's verify and create a summary
import json


def _json_default(value):
    """Fallback encoder for `json.dump`.

    Converts NumPy scalars and arrays to plain Python numbers / lists so they
    stay numeric in the JSON file. Anything else that `json` cannot encode is
    stringified.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


print(f"{'='*60}")
print(f"SAVED MODELS SUMMARY")
print(f"{'='*60}")

models_dir = Path("/content/drive/MyDrive/mlProject/stock-volatility-forecasting/models")

saved_files = {
    'Transformer Model': models_dir / "transformer_best.pth",
    'LSTM Model': models_dir / "lstm_best.pth",
    'Scalers': models_dir / "scalers.pkl"
}

# Track what is missing so the final status line reflects the checks above
# instead of announcing success unconditionally.
missing_artifacts = []

for name, path in saved_files.items():
    if path.exists():
        size_mb = path.stat().st_size / (1024 * 1024)
        print(f"✓ {name}: {path}")
        print(f"  Size: {size_mb:.2f} MB")
    else:
        missing_artifacts.append(name)
        print(f"✗ {name}: NOT FOUND")

if missing_artifacts:
    print(f"\n✗ Missing artifacts: {', '.join(missing_artifacts)}")
else:
    print(f"\n✓ All models saved successfully")

# Save model metadata
metadata = {
    'transformer': {
        'path': str(saved_files['Transformer Model']),
        'parameters': transformer_model.count_parameters(),
        'best_val_loss': transformer_history['best_val_loss'],
        'training_time': transformer_history['training_time'],
        'test_metrics': transformer_metrics
    },
    'lstm': {
        'path': str(saved_files['LSTM Model']),
        'parameters': lstm_model.count_parameters(),
        'best_val_loss': lstm_history['best_val_loss'],
        'training_time': lstm_history['training_time'],
        'test_metrics': lstm_metrics
    },
    'config': config,
    'data_summary': summary
}

metadata_path = models_dir / "metadata.json"
# The metadata is written even when artifacts are missing (best-effort, as
# before), so make sure the target folder exists.
models_dir.mkdir(parents=True, exist_ok=True)
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2, default=_json_default)

print(f"✓ Model metadata saved to {metadata_path}")

## Section 12: Generate Comparison Plots and Export Results

Create comprehensive visualizations and export all results.

In [None]:
# Generate all comparison plots
print("Generating comparison plots...")

# One place for every location this cell reads from or writes to.
project_root = Path("/content/drive/MyDrive/mlProject/stock-volatility-forecasting")
models_dir = project_root / "models"
outputs_dir = project_root / "outputs"
plots_dir = outputs_dir / "plots"
# Created up front so none of the exports below fails on a missing directory.
plots_dir.mkdir(parents=True, exist_ok=True)


def _format_model_section(title, model, history, metrics, horizon_labels=('5d', '10d')):
    """Render the Markdown report section for one trained model.

    Args:
        title: Heading text for the section (e.g. 'Transformer').
        model: Object exposing `count_parameters()`.
        history: Mapping with 'training_time' and 'best_val_loss'.
        metrics: Mapping horizon label -> mapping with 'RMSE', 'MAE', 'R2'.
        horizon_labels: Horizon keys to report, in order.

    Returns:
        The section as a single string without a trailing newline.
    """
    lines = [
        f"### {title}",
        f"- Parameters: {model.count_parameters():,}",
        f"- Training time: {history['training_time']:.2f}s",
        f"- Best val loss: {history['best_val_loss']:.6f}",
    ]
    for label in horizon_labels:
        horizon_metrics = metrics[label]
        lines += [
            "",
            # '5d' -> '5-day volatility'
            f"**{label.rstrip('d')}-day volatility:**",
            f"- RMSE: {horizon_metrics['RMSE']:.6f}",
            f"- MAE: {horizon_metrics['MAE']:.6f}",
            f"- R²: {horizon_metrics['R2']:.4f}",
        ]
    return "\n".join(lines)


# Overall comparison
fig_comparison = comparator.plot_comparison()
fig_comparison.write_html(str(plots_dir / "model_comparison.html"))
print(f"✓ Model comparison saved")

# Predictions vs actual for both models and horizons
for model_name in ['Transformer', 'LSTM']:
    for i, horizon in enumerate(['5d', '10d']):
        # Predictions vs actual (written to disk through `save_path`)
        comparator.plot_predictions_vs_actual(
            model_name,
            horizon_idx=i,
            n_samples=1000,
            save_path=str(plots_dir / f"{model_name}_{horizon}_predictions.html")
        )

        # Residuals (written to disk through `save_path`)
        comparator.plot_residuals(
            model_name,
            horizon_idx=i,
            save_path=str(plots_dir / f"{model_name}_{horizon}_residuals.html")
        )

print(f"✓ All plots generated")

# Export all results
print("\nExporting results...")
comparator.export_results(str(outputs_dir))

# Create summary report
transformer_section = _format_model_section(
    'Transformer', transformer_model, transformer_history, transformer_metrics
)
lstm_section = _format_model_section(
    'LSTM', lstm_model, lstm_history, lstm_metrics
)

summary_report = f"""
# Stock Volatility Forecasting - Training Summary

## Dataset
- Total samples: {summary['total_rows']:,}
- Tickers: {summary['num_tickers']} ({summary['num_stocks']} stocks, {summary['num_etfs']} ETFs)
- Date range: {summary['date_range'][0]} to {summary['date_range'][1]}
- Features: {config['features']['num_features']}
- Sequence length: {config['features']['sequence_length']} days

## Data Splits
- Train: {len(X_train):,} samples ({len(X_train)/len(X)*100:.1f}%)
- Val: {len(X_val):,} samples ({len(X_val)/len(X)*100:.1f}%)
- Test: {len(X_test):,} samples ({len(X_test)/len(X)*100:.1f}%)

## Model Performance (Test Set)

{transformer_section}

{lstm_section}

## Files Generated
- Models: `{models_dir / 'transformer_best.pth'}`, `{models_dir / 'lstm_best.pth'}`
- Scalers: `{models_dir / 'scalers.pkl'}`
- Metrics: `{outputs_dir / 'test_metrics.csv'}`, `{outputs_dir / 'comparison_results.csv'}`, `{outputs_dir / 'regime_analysis.csv'}`
- Plots: `{plots_dir / '*.html'}`

## Next Steps
1. Deploy models to production (see `app/app.py`)
2. Run inference on new data
3. Monitor model performance
4. Retrain periodically with new data
"""

summary_path = outputs_dir / "TRAINING_SUMMARY.md"
# Explicit encoding: the report contains non-ASCII text (R²), so do not rely
# on the platform default.
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write(summary_report)

print(f"✓ Summary report saved to {summary_path}")

print(f"\n{'='*80}")
print(f"TRAINING PIPELINE COMPLETED SUCCESSFULLY")
print(f"{'='*80}")
print(f"\nAll results saved to:")
print(f"  Models: {models_dir}/")
print(f"  Outputs: {outputs_dir}/")
print(f"  Plots: {plots_dir}/")
print(f"\nTo deploy the model:")
# Absolute path: a relative `cd` depends on where the reader's shell is.
print(f"  cd {project_root / 'app'}")
print(f"  streamlit run app.py")