In [7]:
from pathlib import Path
import pandas as pd
import sys
import os
import torch
import numpy as np

current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
sys.path.append(parent_dir)

In [8]:
from src.data.stocks import read_and_concat_all_stocks
from src.training.preparation import (
    time_series_train_test_split,
    separate_the_target_column,
    prepare_nn_multistep_dataset
)
from src.training.feature_engineering import add_date_as_feature, add_lagged_features

In [9]:
from src.utils.config import load_config

In [5]:
load_config()

{'tickers': {'tickers_list_filename': 'sp500_tickers_list',
  'tickers_list_save_path': './metadata/tickers_list/',
  'tickers_wikipedia_url': 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
  'exclude_tickers': ['GOOG']},
 'stock': {'stock_data_start_date': '2010-01-01',
  'stock_data_save_path': './data/raw_stocks/'},
 'configs': {'train_config': './configs/train/train_cfg.yaml'},
 'train_config': {'target': {'target_column': 'AAPL', 'prediction_window': 7},
  'train_test_split': {'validation_size': 30},
  'feature_engineering': {'n_lags': 1}}}

In [7]:
import yaml

In [8]:
with open("../configs/main.yaml", "r") as f:
                temp_config = yaml.safe_load(f)

In [10]:
temp_config['configs']

{'train_config_path': './configs/train/train_cfg.yaml'}

In [3]:
data_folder = "../data/"
data_folder_path = Path(data_folder)

target_stocks = [
    "AAPL",
    "GOOGL",
    "AMZN",
    "TSLA",
    "NVDA",
    "MSFT",
    "META"
]

target_column = "AAPL"
n_lags = 10
output_size = 5
validation_size = 30
hidden_dim = 256
learning_rate = 0.0001
batch_size = 128
n_epochs = 100

In [4]:
from src.training.preparation import prepare_data_for_training

In [5]:
X_train, y_train, X_valid, y_valid = prepare_data_for_training(data_folder_path, target_column, n_lags=1, validation_size=30, prediction_window=7, save_path="./")

In [6]:
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

(torch.Size([3883, 426]),
 torch.Size([3883, 7]),
 torch.Size([30, 426]),
 torch.Size([30, 7]))

In [7]:
# Get the input size
input_size = X_train.shape[1]

In [10]:
from src.training.models import FeedforwardRegressor
from src.training.loss_functions import masked_mse_loss
from src.training.trainers import trainer
from src.utils.checks import check_datatypes
from src.utils.path import verify_saving_path, verify_existing_dir
from src.training.utils import validate_device

from torch.utils.data import DataLoader, TensorDataset
from datetime import datetime

import torch.nn as nn

In [11]:
from pathlib import Path

In [14]:
def read_processed_stock_data(data_folder):
    """
    Find the most recent dated subfolder in the given data folder,
    then load and return the PyTorch tensors stored in the files:
    'X_train.pt', 'X_valid.pt', 'y_train.pt', and 'y_valid.pt'.

    Assumes that the subfolder names are dates in the format 'YYYY-MM-DD'
    and that each subfolder contains the above four files.

    Parameters
    ----------
    data_folder : str or Path
        Path to the main data folder containing date-named subfolders.

    Returns
    -------
    tuple of torch.Tensor
        Returns a tuple with four tensors in this order:
        (X_train, X_valid, y_train, y_valid)

    Raises
    ------
    FileNotFoundError
        If the data_folder does not exist, is not a directory,
        or if any of the expected .pt files are missing.
    ValueError
        If no valid date-named subfolders are found or folder names do not match the date format.
    """
    import torch

    # Convert input to Path object if needed
    data_folder = Path(data_folder)

    # Verify that the directory exists
    verify_existing_dir(data_folder)

    # List all subfolders inside data_folder
    subfolders = data_folder.glob("*")

    # Get the most recent folder by date-parsing folder names
    most_recent_folder = max(
        subfolders,
        key=lambda f: datetime.strptime(f.name, "%Y-%m-%d")
    )

    # Define expected files inside the most recent folder
    expected_files = ["X_train.pt", "X_valid.pt", "y_train.pt", "y_valid.pt"]

    # Build full paths and check if all files exist
    file_paths = {}
    for fname in expected_files:
        fpath = most_recent_folder / fname
        if not fpath.is_file():
            raise FileNotFoundError(f"Expected file not found: {fpath}")
        file_paths[fname] = fpath

    # Load the tensors using torch.load
    X_train = torch.load(file_paths["X_train.pt"])
    X_valid = torch.load(file_paths["X_valid.pt"])
    y_train = torch.load(file_paths["y_train.pt"])
    y_valid = torch.load(file_paths["y_valid.pt"])

    return X_train, X_valid, y_train, y_valid

In [16]:
import src.training.loss_functions as loss_functions

In [18]:
x = getattr(loss_functions, "masked_mse_loss")

In [19]:
x

<function src.training.loss_functions.masked_mse_loss(preds, targets)>

In [15]:
read_processed_stock_data("../data/processed_data/")

(tensor([[1.9932e+01, 6.4319e+00, 1.8497e+01,  ..., 2.0100e+03, 1.0000e+00,
          0.0000e+00],
         [1.9715e+01, 6.4430e+00, 1.8347e+01,  ..., 2.0100e+03, 1.0000e+00,
          1.0000e+00],
         [1.9645e+01, 6.3405e+00, 1.8449e+01,  ..., 2.0100e+03, 1.0000e+00,
          2.0000e+00],
         ...,
         [1.2027e+02, 2.0267e+02, 1.3353e+02,  ..., 2.0250e+03, 6.0000e+00,
          1.0000e+00],
         [1.1947e+02, 1.9878e+02, 1.3468e+02,  ..., 2.0250e+03, 6.0000e+00,
          2.0000e+00],
         [1.1866e+02, 1.9920e+02, 1.3584e+02,  ..., 2.0250e+03, 6.0000e+00,
          3.0000e+00]]),
 tensor([[1.1683e+02, 1.9645e+02, 1.3501e+02,  ..., 2.0250e+03, 6.0000e+00,
          4.0000e+00],
         [1.1877e+02, 1.9842e+02, 1.3341e+02,  ..., 2.0250e+03, 6.0000e+00,
          0.0000e+00],
         [1.1585e+02, 1.9564e+02, 1.3169e+02,  ..., 2.0250e+03, 6.0000e+00,
          1.0000e+00],
         ...,
         [1.2035e+02, 2.1376e+02, 1.2564e+02,  ..., 2.0250e+03, 7.0000e+00,
   

In [13]:
FeedforwardRegressor.__name__

'FeedforwardRegressor'

In [21]:
round(113.123241, 3)

113.123

In [16]:
def train_model(model_architecture,
                X_train,
                y_train,
                X_valid,
                y_valid,
                loss_fn,
                prediction_window=7,
                batch_size=32,
                n_epochs=100,
                learning_rate=0.001,
                shuffle=True,
                model_params=None,
                device='cpu',
                save_path="./"):

    """
    Train a PyTorch model using the provided architecture and datasets, then save the traced model.

    Parameters
    ----------
    model_architecture : type
        A PyTorch module class (not instance) representing the model architecture. It must accept `input_dim` and 
        `output_dim` as keyword arguments in addition to any custom `model_params`.
    
    X_train : torch.Tensor
        Training input features of shape (n_samples, n_features).
    
    y_train : torch.Tensor
        Training target values of shape (n_samples, prediction_window).
    
    X_valid : torch.Tensor
        Validation input features of shape (n_samples, n_features).
    
    y_valid : torch.Tensor
        Validation target values of shape (n_samples, prediction_window).
    
    loss_fn : callable
        Loss function used during training. Must accept predicted and true outputs as inputs.
    
    prediction_window : int, optional
        Number of future steps to predict, by default 7.
    
    batch_size : int, optional
        Number of samples per training batch, by default 32.
    
    n_epochs : int, optional
        Number of epochs to train for, by default 100.
    
    learning_rate : float, optional
        Learning rate for the optimizer, by default 0.001.
    
    shuffle : bool, optional
        Whether to shuffle the training data during batching, by default True.
    
    model_params : dict or None, optional
        Additional keyword arguments to pass to the model constructor, by default None.
    
    device : str, optional
        Device on which to train the model (e.g., 'cpu' or 'cuda'), by default 'cpu'.
    
    save_path : str or pathlib.Path, optional
        Directory path where the traced model will be saved, by default "./".

    Returns
    -------
    model : torch.nn.Module
        The trained PyTorch model instance.

    """

    data_types_schema = [
        ("model_architecture", model_architecture, nn.Module),
        ("X_train", X_train, torch.Tensor),
        ("y_train", y_train, torch.Tensor),
        ("X_valid", X_valid, torch.Tensor),
        ("y_valid", y_valid, torch.Tensor),
        ("loss_fn", loss_fn),
        ("prediction_window", prediction_window, int),
        ("batch_size", batch_size, int),
        ("n_epochs", n_epochs, int),
        ("learning_rate", learning_rate, float),
        ("shuffle", shuffle, bool),
        ("model_params", model_params, dict),
        ("device", device, str),
        ("save_path", save_path, (str, Path))
    ]

    # Check if all data types provided are correct
    check_datatypes(data_types_schema)

    # Check if the provided saving path is correctly provided
    save_path = Path(save_path)
    verify_saving_path(save_path)

    # Get todays date
    todays_date = datetime.today().strftime('%Y_%m_%d_%H_%M_%S')

    # Model params is an empty dictionary if None is provided
    model_params = {} if model_params is None else model_params

    # Get the input size
    input_size = X_train.shape[1]

    # Wrap training data into a DataLoader
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)

    # Define the model
    model = model_architecture(input_dim=input_size,
                               output_dim=prediction_window,
                               **model_params).to(device)

    # Define the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    model, val_loss = trainer(
        model=model,
        train_loader=train_loader,
        X_valid=X_valid,
        y_valid=y_valid,
        optimizer=optimizer,
        loss_fn=masked_mse_loss,
        n_epochs=n_epochs,
        device=device
    )

    # Note that it's assumed that input is 1D here
    example_input = torch.randn(1, input_size)

    # Produce the computational graph
    traced_model = torch.jit.trace(model.cpu(), example_input)

    # Save the model
    traced_model.save(save_path / f"{model_architecture.__name__}_{todays_date}_{val_loss}.pt")

    return model
    

In [17]:
trained_model = train_model(
    FeedforwardRegressor,
    X_train,
    y_train,
    X_valid,
    y_valid,
    masked_mse_loss,
    prediction_window=7,
    batch_size=32,
    n_epochs=100,
    learning_rate=0.001,
    shuffle=True,
    model_params={"hidden_dim":512},
    device='cuda',
    save_path="./"
)

Epoch 1/100 - Train Loss: 240.8914 - Val Loss: 36.1221
Epoch 2/100 - Train Loss: 96.0538 - Val Loss: 129.5992
Epoch 3/100 - Train Loss: 54.8940 - Val Loss: 46.0809
Epoch 4/100 - Train Loss: 49.1767 - Val Loss: 539.7546
Epoch 5/100 - Train Loss: 53.0057 - Val Loss: 240.5298
Epoch 6/100 - Train Loss: 38.7454 - Val Loss: 47.2514
Epoch 7/100 - Train Loss: 38.1695 - Val Loss: 148.9607
Epoch 8/100 - Train Loss: 33.9217 - Val Loss: 166.2247
Epoch 9/100 - Train Loss: 27.7492 - Val Loss: 397.8635
Epoch 10/100 - Train Loss: 35.4550 - Val Loss: 28.9498
Epoch 11/100 - Train Loss: 26.3158 - Val Loss: 154.5502
Epoch 12/100 - Train Loss: 35.2254 - Val Loss: 29.7353
Epoch 13/100 - Train Loss: 22.8817 - Val Loss: 35.8374
Epoch 14/100 - Train Loss: 18.9910 - Val Loss: 25.6086
Epoch 15/100 - Train Loss: 19.7127 - Val Loss: 56.0552
Epoch 16/100 - Train Loss: 34.5863 - Val Loss: 120.6449
Epoch 17/100 - Train Loss: 19.1132 - Val Loss: 29.1526
Epoch 18/100 - Train Loss: 30.0356 - Val Loss: 62.8390
Epoch 19/1

In [14]:
model = trained_model

In [15]:
import matplotlib.pyplot as plt

In [16]:
# 1. Move model and data to appropriate device
model.eval()
X_train_cpu = X_train.to(device)
y_train_cpu = y_train.to(device)
X_valid_cpu = X_valid.to(device)
y_valid_cpu = y_valid.to(device)

# 2. Predict outputs
with torch.no_grad():
    y_train_pred = model(X_train_cpu)
    y_valid_pred = model(X_valid_cpu)

# 3. Convert to numpy
y_train_true = y_train_cpu.cpu().numpy()
y_train_pred = y_train_pred.cpu().numpy()
y_valid_true = y_valid_cpu.cpu().numpy()
y_valid_pred = y_valid_pred.cpu().numpy()

NameError: name 'device' is not defined

In [43]:
y_train_true.shape, y_train_pred.shape, y_valid_true.shape, y_valid_pred.shape

((3874, 5), (3874, 5), (30, 5), (30, 5))

In [35]:
def collect_predictions_per_timestamp(y_pred):
    y_pred = np.asarray(y_pred)
    n_windows, window = y_pred.shape
    

    # Store predictions as lists per timestamp
    all_preds = [[] for _ in range(n_windows)]

    for i in range(n_windows):
        for j in range(window):
            t = i + j
            if t < n_windows and y_pred[i, j] != 0:
                all_preds[t].append(y_pred[i, j])

    # Pad to length 7 (or fill with NaNs) for consistency
    padded_preds = np.full((n_windows, output_size), np.nan)
    for t, preds in enumerate(all_preds):
        padded_preds[t, :len(preds)] = preds

    return padded_preds

In [90]:
def plot_train_valid_with_preds(y_train,
                                y_train_pred,
                                y_valid,
                                y_valid_pred,
                                title="Train and Validation Sets Plot With Predictions",
                                x_label="Timestamp",
                                y_label="Value",
                                figsize=(12, 5),
                                zoom_out=1
                                ):
                                
    plt.figure(figsize=figsize)

    # Train and it's target
    train_size = len(y_train)
    plt.plot(range(train_size), y_train, label="Train Target", color="tab:blue")

    train_predicted_avg = collect_predictions_per_timestamp(y_train_pred)
    plt.plot(range(train_size), np.nanmean(train_predicted_avg, axis=1), label="Train Predicted", color="tab:orange")

    # Validation and it's target
    valid_size = len(y_valid)
    valid_start = train_size + 1
    valid_end = train_size + len(y_valid) + 1 
    plt.plot(range(valid_start, valid_end), y_valid, label="Validation Target", color="tab:blue")

    valid_predicted = collect_predictions_per_timestamp(y_valid_pred)
    plt.plot(range(valid_start, valid_end), np.nanmean(valid_predicted, axis=1), label="Validation Predicted", color="brown")
    plt.plot(range(valid_start, valid_end), valid_predicted, color="gray", alpha=0.3)

    plt.axvline(x=train_size, color='gray', linestyle='--', label='Change to validation set')

    validation_y = y_valid.mean()
    validation_x = (train_size + valid_end) // 2
    figsize_x_over_y = figsize[0] / figsize[1]

    plt.xlim(validation_x-zoom_out*valid_size*figsize_x_over_y, validation_x+zoom_out*valid_size*figsize_x_over_y)
    plt.ylim(validation_y-zoom_out*valid_size, validation_y+zoom_out*valid_size)

    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    plt.legend()
    plt.show()

In [1]:
plot_train_valid_with_preds(y_train_raw,
                            y_train_pred,
                            y_valid_raw,
                            y_valid_pred,
                            zoom_out=1
                            )

NameError: name 'plot_train_valid_with_preds' is not defined