In [2]:
from google.colab import drive
drive.mount('/content/drive')
from tqdm.notebook import tqdm

ModuleNotFoundError: No module named 'google.colab'

In [3]:
# Imports
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader


# Seeds
import random, os
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [4]:
#df_ibm = pd.read_csv("/content/drive/MyDrive/data/stocks/Apple.csv")
data_folder = "/content/drive/MyDrive/data/stocks/"
stock_files = glob.glob(os.path.join(data_folder, "*.csv"))
stock_files

[]

In [362]:
def preprocess_stock_new(df_raw, train_frac: float, val_frac: float):
    minutely = df_raw.copy()

    # turn column to datetime, add helper day column
    minutely["Date Time"] = pd.to_datetime(minutely["Date Time"])
    minutely = minutely.sort_values("Date Time")
    minutely["day"] = minutely["Date Time"].dt.date

    # minutely log returns withing days from close
    minutely["log_close"] = np.log(minutely["Close"].astype(float))
    minutely["ret_1m"] = minutely.groupby("day")["log_close"].diff()
    # remove first minute of each day without returns
    minutely = minutely[minutely["Date Time"].dt.time != pd.to_datetime("09:35").time()]

    # Calculate the RV for each grouped 380 row day
    group_sums = minutely.groupby("day")["ret_1m"].transform(lambda x: np.sqrt(np.sum(x**2)))
    
    # Add the group sums as a new column
    minutely["realized_vol"] = group_sums
    minutely["log_rv"] = np.log(minutely["realized_vol"].replace(0.0, 1e-12))
    minutely = minutely.reset_index(drop=True)

    # calculate train test split; make sure no day is cut in half between the two data splits
    train_idx = int(np.floor(minutely.shape[0] * train_frac / 380) * 380)
    val_idx = int(np.floor(minutely.shape[0] * val_frac / 380) * 380)
    # Split the DataFrame into training and testing sets
    df_train = minutely.iloc[:train_idx]
    df_val = minutely.iloc[train_idx:train_idx+val_idx]
    df_test = minutely.iloc[train_idx+val_idx:]

    return df_train, df_val, df_test

In [364]:
def create_sequences_array(df, n_days):
    # create lists to fill with subsequences and their Realized Volatility targets
    sequence_list = []
    sequence_target = []
    
    # Loop through the DataFrame to create subsequences
    for i in range(int(len(df)/380-n_days)):
        
        # Extract a subsequence of 21 days (each day with 380 rows)
        tmp_subsequence = df.iloc[i * 380: (n_days + i) * 380]
        arr = tmp_subsequence["ret_1m"].to_numpy()   
        
        arr = arr.reshape(1, 380*n_days, 1)   
        sequence_list.append(arr)

        # Try to get the target value for the current subsequence
        try:
            subsequence_target = df.iloc[(n_days + i) * 380]["log_rv"]
            sequence_target.append(subsequence_target)
        except:
            # Print a message if an IndexError occurs (likely in the last iteration)
            # since the last 21 day window doesn't have a RV target
            print("last iteration")


    arr = np.vstack(sequence_list)

    return arr, np.array(sequence_target)

In [318]:
def scaler(X_train, X_val, X_test, d):
    
    mu = X_train[:, :, :d].mean(axis=(0, 1), keepdims=True)
    sd = X_train[:, :, :d].std(axis=(0, 1), keepdims=True) + 1e-8

    def scale(X):
        X_scaled = X.copy()
        X_scaled[:, :, :d] = (X_scaled[:, :, :d] - mu) / sd
        return X_scaled

    X_train = scale(X_train)
    X_val   = scale(X_val)
    X_test  = scale(X_test)

    return X_train, X_val, X_test

In [348]:
# ---- CUDA-only loader: pinned memory + persistent workers ----
def mk_loader(X, y, bs=64, shuffle=False, num_workers=2):
    x_t = torch.tensor(X, dtype=torch.float32)
    y_t = torch.tensor(y, dtype=torch.float32)
    ds = TensorDataset(x_t, y_t)
    return DataLoader(
        ds, batch_size=bs, shuffle=shuffle,
        pin_memory=True, num_workers=num_workers,
        persistent_workers=(num_workers > 0)
    )

In [349]:
# ---- Tiny Transformer with learnable positional embeddings ----
class VolTransformerTiny(nn.Module):
    def __init__(self, d_in, d_model=128, nhead=4, num_layers=3,
                 p_drop=0.1, use_cls=True, ff_mult=4, max_len=4096):
        super().__init__()
        self.use_cls = use_cls
        self.embed = nn.Linear(d_in, d_model)

        # positional embedding for tokens (add +1 for CLS position)
        self.pos_emb = nn.Embedding(max_len + 1, d_model)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead,
            dim_feedforward=ff_mult*d_model,
            dropout=p_drop, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)

        if use_cls:
            self.cls = nn.Parameter(torch.zeros(1, 1, d_model))
            nn.init.normal_(self.cls, mean=0.0, std=0.02)

        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Dropout(p_drop),
            nn.Linear(d_model, 1)
        )

    def forward(self, x):  # x: [B, L, d_in]
        B, L, _ = x.shape
        h = self.embed(x)  # [B, L, d_model]

        # add positional embeddings
        # if CLS is used, its position index = 0, the rest shifted by +1
        if self.use_cls:
            pos_idx = torch.arange(1, L+1, device=x.device).unsqueeze(0).expand(B, -1)
            h = h + self.pos_emb(pos_idx)
            cls = self.cls.expand(B, -1, -1)
            # CLS gets position 0
            h = torch.cat([cls, h], dim=1)  # [B, L+1, d_model]
            h = h + torch.cat([self.pos_emb(torch.zeros(B,1, dtype=torch.long, device=x.device)),
                               torch.zeros_like(h[:,1:])], dim=1)
        else:
            pos_idx = torch.arange(0, L, device=x.device).unsqueeze(0).expand(B, -1)
            h = h + self.pos_emb(pos_idx)

        h = self.encoder(h)
        pooled = h[:, 0] if self.use_cls else h.mean(dim=1)
        return self.head(pooled).squeeze(-1)


In [9]:
# ---- CUDA-only trainer with AMP ----
def train_model_cuda(Xtr, ytr, Xva, yva, *,
                     d_model=128, nhead=4, num_layers=3,
                     batch_size=64, lr=1e-3, max_epochs=50,
                     weight_decay=1e-2, p_drop=0.1, patience=10,
                     ff_mult=4, ModelClass=None):
    assert torch.cuda.is_available(), "CUDA expected on Colab GPU runtime"
    device = torch.device("cuda")
    torch.backends.cudnn.benchmark = True  # speed up for fixed shapes

    # Data
    train_loader = mk_loader(Xtr, ytr, bs=batch_size, shuffle=True)
    val_loader   = mk_loader(Xva, yva, bs=batch_size, shuffle=False)

    # Model
    if ModelClass is None:
        ModelClass = VolTransformerTiny  # use your class defined elsewhere
    model = ModelClass(
        d_in=Xtr.shape[-1], d_model=d_model, nhead=nhead,
        num_layers=num_layers, p_drop=p_drop, ff_mult=ff_mult,
        max_len=max(Xtr.shape[1], Xva.shape[1]) + 1
    ).to(device)

    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scaler = torch.amp.GradScaler("cuda")

    best_val, best_state, wait = float('inf'), None, 0

    for epoch in range(1, max_epochs + 1):
        # ---- train ----
        model.train()
        tr_loss, n = 0.0, 0
        for xb, yb in tqdm(train_loader, desc=f"Epoch {epoch}/{max_epochs}", leave=False):
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)

            opt.zero_grad(set_to_none=True)
            with torch.amp.autocast(device_type="cuda"):
                pred = model(xb)
                loss = F.mse_loss(pred, yb)
            scaler.scale(loss).backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt)
            scaler.update()

            tr_loss += loss.item() * xb.size(0)
            n += xb.size(0)
        tr_loss /= max(1, n)

        # ---- validate ----
        model.eval()
        va_loss, n = 0.0, 0
        with torch.no_grad(), torch.amp.autocast(device_type="cuda"):
            for xb, yb in val_loader:
                xb = xb.to(device, non_blocking=True)
                yb = yb.to(device, non_blocking=True)
                pred = model(xb)
                loss = F.mse_loss(pred, yb)
                va_loss += loss.item() * xb.size(0)
                n += xb.size(0)
        va_loss /= max(1, n)

        improved = va_loss < best_val
        print(f"Epoch {epoch:03d} | Train {tr_loss:.6f} | Val {va_loss:.6f}{' *' if improved else ''}")

        if improved:
            best_val = va_loss
            best_state = {k: v.detach().cpu() for k, v in model.state_dict().items()}
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

    if best_state is not None:
        model.load_state_dict(best_state)
        model.to(device)
        print(f"Restored best model with validation loss: {best_val:.6f}")
    else:
        print("Warning: no improvement; returning final weights")

    return model

In [10]:
# ---- CUDA-only evaluation ----
def eval_mse_cuda(X, y, model, bs=64):
    device = torch.device("cuda")
    loader = mk_loader(X, y, bs=bs, shuffle=False)
    model.eval()
    tot, n = 0.0, 0
    with torch.no_grad(), torch.amp.autocast(device_type="cuda"):
        for xb, yb in loader:
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)
            pred = model(xb)
            loss = F.mse_loss(pred, yb)
            tot += loss.item() * xb.size(0)
            n += xb.size(0)
    return tot / max(1, n)

# ---- CUDA-only prediction ----
def predict_array_cuda(X, model, bs=64):
    device = torch.device("cuda")
    loader = mk_loader(X, np.zeros(len(X), dtype=np.float32), bs=bs, shuffle=False)
    outs = []
    model.eval()
    with torch.no_grad(), torch.amp.autocast(device_type="cuda"):
        for xb, _ in loader:
            xb = xb.to(device, non_blocking=True)
            outs.append(model(xb).float().cpu().numpy())
    return np.concatenate(outs, axis=0)

In [491]:
def get_data_train_model(file_list, *, train_frac=0.7, val_frac=0.15,
                n_days_context=3,
                d_model=128, nhead=4, num_layers=3,
                batch_size=64, lr=1e-3, max_epochs=50, patience=10,
                save_dir="/content/drive/MyDrive/data"):
    # --- 1) build per-stock splits and remember lengths for later slicing ---
    Xtr_list, ytr_list = [], []
    Xva_list, yva_list = [], []
    Xte_list, yte_list = [], []
    names, te_lengths = [], []
    X_test_dic = {}
    y_test_dic = {}

    for path in file_list:
        df_raw = pd.read_csv(path)
        stock_name = os.path.splitext(os.path.basename(path))[0]
        print(stock_name)
        # time-based split per stock
        train_df, val_df, test_df = preprocess_stock_new(df_raw, train_frac, val_frac)

        # window into sequences (shape: [N, seq_len, feat_dim])
        X_tr, y_tr = create_sequences_array(train_df, n_days_context)
        X_va, y_va = create_sequences_array(val_df, n_days_context)
        X_te, y_te = create_sequences_array(test_df, n_days_context)

        Xtr_list.append(X_tr); ytr_list.append(y_tr)
        Xva_list.append(X_va); yva_list.append(y_va)
        Xte_list.append(X_te); yte_list.append(y_te)
        X_test_dic[stock_name] = X_te
        y_test_dic[stock_name] = y_te

        names.append(stock_name)
        te_lengths.append(len(X_te))
    
    # --- 2) concatenate across stocks to make one big dataset ---
    X_train = np.concatenate(Xtr_list, axis=0)
    y_train = np.concatenate(ytr_list, axis=0)
    X_val   = np.concatenate(Xva_list, axis=0)
    y_val   = np.concatenate(yva_list, axis=0)
    X_test = np.concatenate(Xte_list, axis=0)
    y_test = np.concatenate(yte_list, axis=0)

    # --- 3) scale AFTER concatenation: fit on big train, transform val + big test ---
    # NOTE: we call your existing scaler once so it fits on X_train and applies to val/test.
    X_train_scaled, X_val_scaled, X_test_big_scaled = scaler(X_train, X_val, X_test, 1)

    # --- 4) train once on the pooled dataset ---
    start_time = time.time()
    model = train_model_cuda(
        X_train_scaled, y_train, X_val_scaled, y_val,
        d_model=d_model, nhead=nhead, num_layers=num_layers,
        batch_size=batch_size, lr=lr, max_epochs=max_epochs, patience=patience
    )
    end_time = time.time()
    minutes = int((end_time - start_time) // 60)
    seconds = (end_time - start_time) % 60
    print(f"Training completed in {minutes} minutes and {seconds:.2f} seconds.")

    return model, X_test_dic, y_test_dic

In [492]:
def run_test(model, X_test_dic, y_test_dic):
    out = {}
    model_mse_counter = 0
    naive_mse_counter = 0
    for stock in X_test_dic.keys():
        stock_dic = {}
        
        stock_pred = predict_array_cuda(X_test_dic[stock], model)
        naive_pred = np.roll(y_test_dic[stock], 1)
        test_mse = np.mean((y_pred - y_test_dic[stock])**2)
        naive_mse = np.mean((naive_pred[1:] - y_test_dic[stock][1:])**2)
        
        stock_dic["y_pred"] = stock_pred
        stock_dic["y_test"] = y_test_dic[stock]
        stock_dic["transformer_MSE"] = test_mse
        stock_dic["naive_pred"] = naive_pred
        stock_dic["naive_MSE"] = naive_mse

        out[stock] = stock_dic
        mse_counter += test_mse
        naive_mse_counter += naive_mse

        make_plot(stock_pred, y_test_dic[stock], test_mse, naive_mse)

    return out, mse_counter / len(X_test_dic), naive_mse_counter / len(X_test_dic)

In [481]:
def make_plot(stock, y_pred, y_test, test_mse, naive_mse):

    plt.figure(figsize=(12, 6))
    plt.plot(y_test, label="True log RV", color="darkblue")
    plt.plot(y_pred,   label="Transformer", alpha=0.8, color="darkorange")
    plt.title(f"{stock_name} (Test) | Naive: {naive_mse:.4f} | Transformer: {test_mse:.4f}")
    plt.xlabel("Test days"); plt.ylabel("log Realized Volatility")
    plt.legend()
    #fig_path = os.path.join(save_dir, f"{stock_name}_dm{d_model}_h{nhead}_L{num_layers}_ctx{n_days_context}.png")
    #plt.savefig(fig_path, dpi=300, bbox_inches="tight")
    plt.show(); plt.close()

In [None]:
data_folder = "data/"
stock_files = glob.glob(os.path.join(data_folder, "*.csv"))

In [497]:
model, X_test_dic, y_test_dic = get_data_train_model(stock_files[:2], train_frac=0.7, val_frac = 0.15, n_days_context=2,
        d_model=64, nhead=2, num_layers=3,
        batch_size=64, lr=1e-3, max_epochs=50, patience=10,
        save_dir="/content/drive/MyDrive/data")

Home Depot
United Health


NameError: name 'train_model_cuda' is not defined

In [496]:
result_dic, model_MSE_avg, naive_MSE_avg = run_test(model, X_test_dic, y_test_dic)

TypeError: test_stocks() got an unexpected keyword argument 'add_calendar'