Resources:

https://docs.pytorch.org/docs/stable/generated/torch.nn.LSTM.html
https://machinelearningmastery.com/lstm-for-time-series-prediction-in-pytorch/




#### Dependencies

In [54]:
import os
import yfinance as yf
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr, pearsonr
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from datetime import datetime, timedelta
from ta.momentum import RSIIndicator, StochasticOscillator
from ta.trend import SMAIndicator, MACD, CCIIndicator
from ta.momentum import WilliamsRIndicator
from ta.volume import ChaikinMoneyFlowIndicator


In [2]:
print(torch.__version__)
print(torch.version.cuda) 
print(torch.cuda.is_available())   # True if a GPU is detected
print(torch.cuda.device_count())   # Number of GPUs available
print(torch.cuda.get_device_name(0))  # Name of the first GPU

2.9.0+cu128
12.8
True
1
NVIDIA GeForce RTX 4080


#### Useful functions

In [16]:
def load_price_data(
    ticker: str,
    start_date: str,
    end_date: str,
    pred_target: int,
    leadup_days: int = 30,
    interval: str = "1d",
    sma_vals=[10, 20]
    
):
    """
    Fetches price data with extra leadup_days for indicator warmup,
    computes technical indicators, and trims back to the exact start_date.
    """

    # Compute leadup start date
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    leadup_start_dt = start_dt - timedelta(days=leadup_days)
    leadup_start = leadup_start_dt.strftime("%Y-%m-%d")

    # Fetch data
    ticker_obj = yf.Ticker(ticker)
    df = ticker_obj.history(
        start=leadup_start,
        end=end_date,
        interval=interval,
        auto_adjust=False,
        actions=False
    )

    # Ensure columns are consistent
    df = df.copy()
    df.index = df.index.normalize()
    df.columns = df.columns.str.lower().str.replace(" ", "_")

    # === Add Indicators ===
    # SMA
    for i in sma_vals:
        df[f"sma_{i}"] = SMAIndicator(df["close"], window=i).sma_indicator()

    # Stochastic Oscillator
    stoch = StochasticOscillator(
        df["high"], df["low"], df["close"],
        window=14, smooth_window=3
    )
    df["stoch_k"] = stoch.stoch()
    df["stoch_d"] = stoch.stoch_signal()

    # MACD
    macd = MACD(df["close"], window_slow=26, window_fast=12, window_sign=9)
    df["macd"] = macd.macd()
    df["macd_signal"] = macd.macd_signal()
    df["macd_hist"] = macd.macd_diff()

    # CCI
    df["cci"] = CCIIndicator(df["high"], df["low"], df["close"], window=20).cci()

    # Williams %R
    df["williams_r"] = WilliamsRIndicator(df["high"], df["low"], df["close"], lbp=14).williams_r()

    # RSI
    df["rsi"] = RSIIndicator(df["close"], window=14).rsi()

    # ADOSC (Chaikin Money Flow)
    df["adosc"] = ChaikinMoneyFlowIndicator(
        high=df["high"],
        low=df["low"],
        close=df["close"],
        volume=df["volume"],
        window=20
    ).chaikin_money_flow()
    
    # Target value
    # df[f"{pred_target}-day_target"] = (df["close"].shift(-pred_target) - df["close"]) / df["close"]
    
    # Target: cumulative future log return over pred_target days
    daily_log_return = np.log(df["close"] / df["close"].shift(1))
    df[f"{pred_target}_day_target"] = (
        daily_log_return.rolling(pred_target).sum().shift(-pred_target)
    )

    # Drop NaNs caused by indicator warmup
    df = df.dropna()

    # Trim back to the actual start_date (remove leadup)
    # df = df[df.index >= start_date]

    return df



In [21]:
def make_sequences(df, target_col ,feature_cols, seq_len=10):
    X, y = [], []
    
    values = df[feature_cols].values
    targets = target_col.values

    for i in range(len(df) - seq_len):
        X.append(values[i:i+seq_len])
        y.append(targets[i+seq_len])

    return np.array(X), np.array(y)

In [72]:
def plot_and_save_metrics(metrics_df, target_horizon, save_table=True):
      """Creates and saves three metric plots:
         (1) RMSE + MAE together
         (2) Directional Accuracy
         (3) Spearman Correlation
         Files are named using the prediction horizon.
      """
      
      # Create directory name and ensure it exists
      folder_name = f"results_target_{target_horizon}"
      os.makedirs(folder_name, exist_ok=True)

      years = metrics_df["year"]

      # --- Plot 1: RMSE + MAE ---
      fig1, ax1 = plt.subplots(figsize=(8, 4))
      ax1.plot(years, metrics_df["rmse"], marker="o", label="RMSE")
      ax1.plot(years, metrics_df["mae"], marker="o", label="MAE")
      ax1.set_title(f"RMSE & MAE ({target_horizon}-Day Horizon)")
      ax1.set_xlabel("Year")
      ax1.set_ylabel("Error Value")
      ax1.grid(True, alpha=0.3)
      ax1.legend()
      
      filename1 = f"metrics_target_{target_horizon}_RMSE_MAE.png"
      fig1.tight_layout()
      fig1.savefig(os.path.join(folder_name, "RMSE_MAE.png"), dpi=150)
      plt.close(fig1)


      # --- Plot 2: Directional Accuracy ---
      
      fig2, ax2 = plt.subplots(figsize=(8, 4))
      ax2.plot(years, metrics_df["directional_accuracy (%)"], marker="s", linestyle="--")
      ax2.set_ylim(0, 100)
      ax2.set_title(f"Directional Accuracy ({target_horizon}-Day Horizon)")
      ax2.set_xlabel("Year")
      ax2.set_ylabel("Directional Accuracy (%)")
      ax2.grid(True, alpha=0.3)

      filename2 = f"metrics_target_{target_horizon}_Directional_Accuracy.png"
      fig2.tight_layout()
      fig2.savefig(os.path.join(folder_name, "Directional_Accuracy.png"), dpi=150)
      plt.close(fig2)


      # --- Plot 3: Spearman Correlation ---
      fig3, ax3 = plt.subplots(figsize=(8, 4))
      ax3.plot(years, metrics_df["spearman"], marker="s", linestyle=":")
      ax3.set_title(f"Spearman Correlation ({target_horizon}-Day Horizon)")
      ax3.set_xlabel("Year")
      ax3.set_ylabel("Spearman")
      ax3.grid(True, alpha=0.3)

      filename3 = f"metrics_target_{target_horizon}_Spearman.png"
      fig3.tight_layout()
      fig3.savefig(os.path.join(folder_name, "Spearman.png"), dpi=150)
      plt.close(fig3)
      
      
      # save the df for good measure
      df_file = f"metrics_target_{target_horizon}.csv"
      # === optional table save ===
      if save_table:
        metrics_df.to_csv(os.path.join(folder_name, "metrics.csv"), index=False)
      

      print("\nSaved plots:")
      print(f" - {filename1}")
      print(f" - {filename2}")
      print(f" - {filename3}")
      print(f" - {df_file}")


#### Model

In [73]:
class LSTMPredictor(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, num_layers=2, dropout=0.2):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # x: (batch, seq_len, features)
        output, (hidden, _) = self.lstm(x)
        last_hidden = hidden[-1]
        return self.fc(last_hidden)
    

In [74]:
def save_experiment(model: LSTMPredictor, metrics: pd.DataFrame, target_horizon: int):
    folder = f"results_target_{target_horizon}"
    os.makedirs(folder, exist_ok=True)

    # ----- Save model -----
    model_path = os.path.join(folder, f"model_target_{target_horizon}.pt")
    torch.save(model.state_dict(), model_path)

    # ----- Save metrics -----
    metrics.to_csv(os.path.join(folder, "metrics.csv"), index=False)

    print(f"\nSaved model and metrics to: {folder}")

#### Training Loop

In [75]:
def walk_forward_training_loop(df, target_col, feature_cols, 
    SEQ_LEN=10,
    START_YEAR=2015,
    END_YEAR=2025,
    LEARNING_RATE=0.001,
    HIDDEN_DIM=64,
    NUM_LAYERS=2,
    DROPOUT=0.2,
    EPOCHS=10):
    
    """
    Walk Forward logic:

    | Window | Train     | Test | Scaling Rule                 |
    | ------ | --------- | ---- | ---------------------------- |
    | #1     | 2015-2017 | 2018 | Fit scaler only on 2015-2017 |
    | #2     | 2015-2018 | 2019 | Fit scaler only on 2015-2018 |
    | #3     | 2015-2019 | 2020 | Fit scaler only on 2015-2019 |
    etc...
    
    This must be done to avoid data-leakage
    
    """

    years = list(range(START_YEAR, END_YEAR))  # test years

    results = []

    for year in years:
        print(f"\n=== WALK {year} ===")

        # Determine the true first trading day of the year
        test_start = df[df.index >= f"{year}-01-01"].index.min()
        if pd.isna(test_start):
            print(f"[SKIP] No rows available for year {year}")
            continue

        train = df[df.index < test_start]
        test  = df[df.index >= test_start]

        # ðŸ”¥ Fix: ensure training data exists BEFORE scaling
        if len(train) < SEQ_LEN:
            print(f"[WAIT] Not enough historical rows before {test_start} "
                f"({len(train)} available, need â‰¥ {SEQ_LEN}). Skipping this boundary.")
            continue

        # Scaling
        scaler = StandardScaler()
        scaled_train = scaler.fit_transform(train[feature_cols])
        scaled_test  = scaler.transform(test[feature_cols])

        # Assign scaled values
        train_scaled = train.copy()
        test_scaled  = test.copy()
        train_scaled[feature_cols] = scaled_train
        test_scaled[feature_cols] = scaled_test

        # Make sequences
        X_train, y_train = make_sequences(train_scaled, target_col, feature_cols, SEQ_LEN)
        X_test,  y_test  = make_sequences(test_scaled,  target_col, feature_cols, SEQ_LEN)

        # Convert to tensors
        X_train_t = torch.tensor(X_train, dtype=torch.float32)
        y_train_t = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
        X_test_t  = torch.tensor(X_test, dtype=torch.float32)

        loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=32, shuffle=True)

        # Train a fresh LSTM
        model = LSTMPredictor(
            input_dim=len(feature_cols), 
            hidden_dim=HIDDEN_DIM,
            num_layers=NUM_LAYERS,
            dropout=DROPOUT
        )
        opt = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        loss_fn = nn.MSELoss()

        for epoch in range(EPOCHS):
            for xb, yb in loader:
                opt.zero_grad()
                pred = model(xb)
                loss = loss_fn(pred, yb)
                loss.backward()
                opt.step()
        
        
        
        # Predict full test window
        preds = model(X_test_t).detach().numpy().flatten()
        pred_dates = test_scaled.index[SEQ_LEN:]

        results.append(pd.DataFrame({
            "date": pred_dates,
            "prediction": preds,
            "actual": y_test
        }))

    return results, model

#### Operation Pipeline

In [94]:
#INIT PARAMS
    # Dataset/Time
TICKER = "AMZN"
START = "2015-01-01"
END = "2025-01-01"
Y_START = 2015
Y_END = 2022
WARMUP = 70
TARGET = 30
    # Model
LOOKBACK = 10
DIM = 64
LAYERS = 2
DROPOUT = 0.2
LEARNING_RATE = 0.001
# 1. Load full data
df = load_price_data(
    ticker=TICKER,
    start_date=START,
    end_date=END,
    leadup_days=WARMUP,
    pred_target=TARGET
)

# 2. Create walk-forward boundary

# leave last two years for final testing untouched during training.
train_df = df[df.index < "2023-01-01"]
test_df  = df[df.index >= "2023-01-01"]


# 3. Train the model using WalkForward testing split
# use all columns as features except target column
feature_cols = [col for col in df.columns if col not in [f"{TARGET}_day_target"]]
target_col = df[f"{TARGET}_day_target"]

results, model = walk_forward_training_loop(
    train_df, target_col, 
    feature_cols, 
    SEQ_LEN=LOOKBACK, 
    START_YEAR=Y_START, 
    END_YEAR=Y_END, 
    LEARNING_RATE= LEARNING_RATE, 
    HIDDEN_DIM= DIM, 
    NUM_LAYERS= LAYERS, 
    DROPOUT= DROPOUT
    
)




=== WALK 2015 ===

=== WALK 2016 ===

=== WALK 2017 ===

=== WALK 2018 ===

=== WALK 2019 ===

=== WALK 2020 ===

=== WALK 2021 ===


In [95]:
year_stats = []

for df_year in results:  # each entry is a year dataframe

    year = df_year['date'].iloc[0].year  # extract year from first row's date

    preds = df_year["prediction"].values
    actual = df_year["actual"].values

    # Metrics
    rmse = np.sqrt(mean_squared_error(actual, preds))
    mae = mean_absolute_error(actual, preds)
    
    # directional accuracy
    direction_accuracy = (np.sign(preds) == np.sign(actual)).mean()

    # Spearman correlation (ignore NaNs)
    spearman_val, _ = spearmanr(actual, preds, nan_policy='omit')

    year_stats.append({
        "year": year,
        "rmse": rmse,
        "mae": mae,
        "directional_accuracy (%)": round(direction_accuracy * 100, 2),
        "spearman": spearman_val
    })

# Convert results into a table
metrics_df = pd.DataFrame(year_stats).sort_values("year").reset_index(drop=True)
metrics_df

Unnamed: 0,year,rmse,mae,directional_accuracy (%),spearman
0,2015,0.112573,0.088688,49.8,0.130792
1,2016,0.631487,0.598403,34.19,0.111015
2,2017,0.297951,0.259338,30.53,-0.036511
3,2018,0.213248,0.18635,70.78,0.019047
4,2019,0.202947,0.170748,43.29,0.066105
5,2020,0.348165,0.324884,23.59,0.190341
6,2021,0.12581,0.100786,52.13,0.21127


In [96]:
plot_and_save_metrics(metrics_df, TARGET)
save_experiment(model, metrics_df, TARGET)


Saved plots:
 - metrics_target_30_RMSE_MAE.png
 - metrics_target_30_Directional_Accuracy.png
 - metrics_target_30_Spearman.png
 - metrics_target_30.csv

Saved model and metrics to: results_target_30
