Resources:

- https://docs.pytorch.org/docs/stable/generated/torch.nn.LSTM.html
- https://machinelearningmastery.com/lstm-for-time-series-prediction-in-pytorch/




#### Dependencies

In [1]:
import os
import re
import json
import yfinance as yf
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr, pearsonr
import numpy as np
import random
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from datetime import datetime, timedelta
from ta.momentum import RSIIndicator, StochasticOscillator
from ta.trend import SMAIndicator, MACD, CCIIndicator
from ta.momentum import WilliamsRIndicator
from ta.volume import ChaikinMoneyFlowIndicator


In [2]:
print(torch.__version__)
print(torch.version.cuda) 
print(torch.cuda.is_available())   # True if a GPU is detected
print(torch.cuda.device_count())   # Number of GPUs available
# print(torch.cuda.get_device_name(0))  # Name of the first GPU

2.9.0+cpu
None
False
0


#### Useful functions

In [45]:
def load_price_data(
    ticker: str,
    start_date: str,
    end_date: str,
    pred_target: int,
    leadup_days: int = 30,
    interval: str = "1d",
    sma_vals=[10, 20]
):
    """
    Fetches price data with extra leadup_days for indicator warmup,
    computes technical indicators, and trims back to the exact start_date.
    """

    # Compute leadup start date
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")
    leadup_start_dt = start_dt - timedelta(days=leadup_days)
    leadup_start = leadup_start_dt.strftime("%Y-%m-%d")

    # Fetch data
    ticker_obj = yf.Ticker(ticker)
    df = ticker_obj.history(
        start=leadup_start,
        end=end_date,
        interval=interval,
        auto_adjust=False,
        actions=False
    )

    # Ensure columns are consistent
    df = df.copy()
    df.index = df.index.normalize()
    df.columns = df.columns.str.lower().str.replace(" ", "_")

    # === Add Indicators ===
    # SMA
    for i in sma_vals:
        df[f"sma_{i}"] = SMAIndicator(df["close"], window=i).sma_indicator()

    # Stochastic Oscillator
    stoch = StochasticOscillator(
        df["high"], df["low"], df["close"],
        window=14, smooth_window=3
    )
    df["stoch_k"] = stoch.stoch()
    df["stoch_d"] = stoch.stoch_signal()

    # MACD
    macd = MACD(df["close"], window_slow=26, window_fast=12, window_sign=9)
    df["macd"] = macd.macd()
    df["macd_signal"] = macd.macd_signal()
    df["macd_hist"] = macd.macd_diff()

    # CCI
    df["cci"] = CCIIndicator(df["high"], df["low"], df["close"], window=20).cci()

    # Williams %R
    df["williams_r"] = WilliamsRIndicator(df["high"], df["low"], df["close"], lbp=14).williams_r()

    # RSI
    df["rsi"] = RSIIndicator(df["close"], window=14).rsi()

    # ADOSC (Chaikin Money Flow)
    df["adosc"] = ChaikinMoneyFlowIndicator(
        high=df["high"],
        low=df["low"],
        close=df["close"],
        volume=df["volume"],
        window=20
    ).chaikin_money_flow()
    
    # Target value
    df[f"{pred_target}-day_target"] = (df["close"].shift(-pred_target) - df["close"]) / df["close"]

    # Drop NaNs caused by indicator warmup
    df = df.dropna()

    # Trim back to the actual start_date (remove leadup)
    df = df[df.index >= start_date]

    return df

In [4]:
def make_sequences(df, target_col ,feature_cols, seq_len=10):
    X, y = [], []
    
    values = df[feature_cols].values
    targets = target_col.values

    for i in range(len(df) - seq_len):
        X.append(values[i:i+seq_len])
        y.append(targets[i+seq_len])

    return np.array(X), np.array(y)

In [5]:
def plot_and_save_metrics(metrics_df, target_horizon, save_table=True):
      """Creates and saves three metric plots:
         (1) RMSE + MAE together
         (2) Directional Accuracy
         (3) Spearman Correlation
         (4) Metrics table of all years
         Files are named using the prediction horizon.
      """
      
      # Create directory name and ensure it exists
      base_folder = "results"
      folder_name = os.path.join(base_folder, f"target_{target_horizon}")
      os.makedirs(folder_name, exist_ok=True)

      years = metrics_df["year"]

      # --- Plot 1: RMSE + MAE ---
      fig1, ax1 = plt.subplots(figsize=(8, 4))
      ax1.plot(years, metrics_df["rmse"], marker="o", label="RMSE")
      ax1.plot(years, metrics_df["mae"], marker="o", label="MAE")
      ax1.set_title(f"RMSE & MAE ({target_horizon}-Day Horizon)")
      ax1.set_xlabel("Year")
      ax1.set_ylabel("Error Value")
      ax1.grid(True, alpha=0.3)
      ax1.legend()
      
      filename1 = f"metrics_target_{target_horizon}_RMSE_MAE.png"
      fig1.tight_layout()
      fig1.savefig(os.path.join(folder_name, "RMSE_MAE.png"), dpi=150)
      plt.close(fig1)


      # --- Plot 2: Directional Accuracy ---
      
      fig2, ax2 = plt.subplots(figsize=(8, 4))
      ax2.plot(years, metrics_df["directional_accuracy (%)"], marker="s", linestyle="--")
      ax2.set_ylim(0, 100)
      ax2.set_title(f"Directional Accuracy ({target_horizon}-Day Horizon)")
      ax2.set_xlabel("Year")
      ax2.set_ylabel("Directional Accuracy (%)")
      ax2.grid(True, alpha=0.3)

      filename2 = f"metrics_target_{target_horizon}_Directional_Accuracy.png"
      fig2.tight_layout()
      fig2.savefig(os.path.join(folder_name, "Directional_Accuracy.png"), dpi=150)
      plt.close(fig2)


      # --- Plot 3: Spearman Correlation ---
      fig3, ax3 = plt.subplots(figsize=(8, 4))
      ax3.plot(years, metrics_df["spearman"], marker="s", linestyle=":")
      ax3.set_title(f"Spearman Correlation ({target_horizon}-Day Horizon)")
      ax3.set_xlabel("Year")
      ax3.set_ylabel("Spearman")
      ax3.grid(True, alpha=0.3)

      filename3 = f"metrics_target_{target_horizon}_Spearman.png"
      fig3.tight_layout()
      fig3.savefig(os.path.join(folder_name, "Spearman.png"), dpi=150)
      plt.close(fig3)
      
      
      # save the df for good measure
      df_file = f"metrics_target_{target_horizon}.csv"
      # optional table save 
      if save_table:
        metrics_df.to_csv(os.path.join(folder_name, "metrics.csv"), index=False)
      

      print("\nSaved plots:")
      print(f" - {filename1}")
      print(f" - {filename2}")
      print(f" - {filename3}")
      print(f" - {df_file}")


In [6]:
def collect_target_metrics(base_path="."):
    """
    Collects metrics.csv files from folders named result_target_[TARGET]
    and returns a combined dataframe sorted by TARGET value.
    """

    combined_results = []
    print("Folders found:", os.listdir(base_path))
    
    # Regex to detect folders like result_target_5, result_target_10, etc.
    folder_pattern = re.compile(r"target_(\d+)")
    for folder_name in os.listdir(base_path):
        
        match = folder_pattern.match(folder_name)
        if match:
            target_value = int(match.group(1))
            metrics_file = os.path.join(base_path, folder_name, "metrics.csv")
            # print(f"Checking: {metrics_file}")

            if os.path.exists(metrics_file):
                df = pd.read_csv(metrics_file)
                df["pred_target"] = target_value
                combined_results.append(df)
            else:
                print(f"WARNING: {metrics_file} not found.")

    # Combine all collected data
    if combined_results:
        final_df = pd.concat(combined_results, ignore_index=True)
        final_df = final_df.set_index(["year", "pred_target"]).sort_index()
        return final_df
    else:
        print("No metrics.csv files found.")
        return pd.DataFrame()

In [7]:
def generate_metrics(results):
    """
    Generate yearly prediction performance metrics from model results.

    Parameters
    ----------
    results : list[pd.DataFrame]
        A list of DataFrames where each contains:
        ['date', 'prediction', 'actual']

    Returns
    -------
    pd.DataFrame
        Metrics sorted by year containing:
        RMSE, MAE, Directional Accuracy (%), and Spearman Rank Correlation.
    """

    year_stats = []

    for df_year in results:  # each entry is a year dataframe

        year = df_year['date'].iloc[0].year  # extract year from first row's date

        preds = df_year["prediction"].values
        actual = df_year["actual"].values

        # Metrics
        rmse = np.sqrt(mean_squared_error(actual, preds))
        mae = mean_absolute_error(actual, preds)
        
        # directional accuracy
        direction_accuracy = (np.sign(preds) == np.sign(actual)).mean()

        # Spearman correlation (ignore NaNs)
        spearman_val, _ = spearmanr(actual, preds, nan_policy='omit')

        year_stats.append({
            "year": year,
            "rmse": rmse,
            "mae": mae,
            "directional_accuracy (%)": round(direction_accuracy * 100, 2),
            "spearman": spearman_val
        })

    # Convert results into a table
    return pd.DataFrame(year_stats).sort_values("year").reset_index(drop=True)

#### Model

In [8]:
class LSTMPredictor(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, num_layers=2, dropout=0.2):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # x: (batch, seq_len, features)
        output, (hidden, _) = self.lstm(x)
        last_hidden = hidden[-1]
        return self.fc(last_hidden)
    

In [9]:
def save_experiment(model, config, metrics_df, target_horizon):
    
    os.makedirs("models", exist_ok=True)

    # Save checkpoint
    checkpoint_path = os.path.join("models", f"experiment_target_{target_horizon}.pth")
    torch.save({
        "model_config": config,
        "weights": model.state_dict(),
        "metrics": metrics_df.to_dict(),
        "target_horizon": target_horizon
    }, checkpoint_path)

    # Save config separately as JSON
    config_path = os.path.join("models", f"config_target_{target_horizon}.json")
    with open(config_path, "w") as f:
        json.dump(config, f, indent=4)

    print(f"\nSaved model + config to models/:")
    print(f" - {checkpoint_path}")
    print(f" - {config_path}")

In [10]:
def load_model_for_target(target, input_dim, hidden_dim, num_layers, dropout):
    model_path = f"model_target_{target}.pt"
    
    model = LSTMPredictor(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        dropout=dropout
    )
    
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model

In [11]:
def save_checkpoint(model: LSTMPredictor, config, metrics, target):
    checkpoint = {
        "model_config": config,
        "weights": model.state_dict(),
        "metrics": metrics.to_dict(),
        "target_horizon": target
    }

    os.makedirs("models", exist_ok=True)
    path = f"models/experiment_target_{target}.pth"
    torch.save(checkpoint, path)

    print(f"Saved experiment: {path}")

In [12]:
def load_checkpoint(path):
    checkpoint = torch.load(path)
    model = LSTMPredictor(**checkpoint["model_config"])
    model.load_state_dict(checkpoint["weights"])
    return model, checkpoint

In [13]:
def save_config(config, target):
    """
    Stores the hyperparameter configuration of the trained LSTM
    """
    folder = "models"
    os.makedirs(folder, exist_ok=True)

    config_path = os.path.join(folder, f"config_target_{target}.json")

    with open(config_path, "w") as f:
        json.dump(config, f, indent=4)

    print(f"[âœ“] Saved config to: {config_path}")

In [14]:
def load_config(target):
    path = f"models/config_target_{target}.json"
    with open(path, "r") as f:
        return json.load(f)

In [15]:
def make_seed(fixed=None):
    """Returns a deterministic seed if provided, otherwise generates a random one."""
    return fixed if fixed is not None else np.random.randint(1, 1_000_000)

In [16]:
def apply_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

#### Training Loop

In [17]:
def walk_forward_training_loop(df, target_col, feature_cols,
    PRED_TARGET=15, 
    SEQ_LEN=10,
    START_YEAR=2015,
    END_YEAR=2025,
    LEARNING_RATE=0.001,
    HIDDEN_DIM=64,
    NUM_LAYERS=2,
    DROPOUT=0.2,
    WEIGHT_SEED=42,
    BATCH_SIZE=32,
    EPOCHS=10):

    results = []

    for year in range(START_YEAR, END_YEAR):
        print(f"\n=== WALK {year} ===")

        # find actual first trading day of the year
        test_start = df[df.index >= f"{year}-01-01"].index.min()
        if pd.isna(test_start):
            print(f"[SKIP] No rows for year {year}")
            continue

        # rolling expand training set, strict 1-year forecast window
        train = df[df.index < test_start]
        test  = df[(df.index >= test_start) & (df.index < f"{year+1}-01-01")]

        if len(train) < SEQ_LEN:
            print(f"[WAIT] Not enough history ({len(train)} rows, need {SEQ_LEN}) â€” skipping.")
            continue

        # scale on train only
        scaler = StandardScaler()
        scaled_train = scaler.fit_transform(train[feature_cols])
        scaled_test  = scaler.transform(test[feature_cols])

        train_scaled = train.copy()
        test_scaled  = test.copy()
        train_scaled[feature_cols] = scaled_train
        test_scaled[feature_cols]  = scaled_test

        # Make sequences
        X_train, y_train = make_sequences(train_scaled, target_col, feature_cols, SEQ_LEN)
        X_test,  y_test  = make_sequences(test_scaled,  target_col, feature_cols, SEQ_LEN)

        # tensors
        X_train_t = torch.tensor(X_train, dtype=torch.float32)
        y_train_t = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
        X_test_t  = torch.tensor(X_test, dtype=torch.float32)

        # reproducibility
        apply_seed(WEIGHT_SEED)

        # fresh model
        model = LSTMPredictor(
            input_dim=len(feature_cols),
            hidden_dim=HIDDEN_DIM,
            num_layers=NUM_LAYERS,
            dropout=DROPOUT
        )

        loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=BATCH_SIZE, shuffle=False)

        # optimizer
        opt = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        loss_fn = nn.MSELoss()

        # train
        for epoch in range(EPOCHS):
            for xb, yb in loader:
                opt.zero_grad()
                loss = loss_fn(model(xb), yb)
                loss.backward()
                opt.step()

        # predict
        preds = model(X_test_t).detach().numpy().flatten()
        pred_dates = test_scaled.index[SEQ_LEN:]

        results.append(pd.DataFrame({
            "date": pred_dates,
            "prediction": preds,
            "actual": y_test[-len(preds):]
        }))

    config = {
        "input_dim": len(feature_cols),
        "hidden_dim": HIDDEN_DIM,
        "num_layers": NUM_LAYERS,
        "dropout": DROPOUT,
        "seq_len": SEQ_LEN,
        "target_horizon": PRED_TARGET,
        "learning_rate": LEARNING_RATE,
        "epochs": EPOCHS,
        "weight_init_seed": WEIGHT_SEED,
        "batch_size": BATCH_SIZE
    }

    return results, model, config

#### Operation Pipeline

In [47]:
#INIT PARAMS
    # Dataset/Time
TICKER = "AMZN"
START = "2015-01-01"
END = "2025-01-01"
Y_START = 2015
Y_END = 2022
WARMUP = 70
TARGET = 15
    # Model
SEED = make_seed(42)
LOOKBACK = 10
DIM = 64
LAYERS = 2
DROPOUT = 0.2
LEARNING_RATE = 0.001
BATCH_SIZE = 32
EPOCHS = 10

In [43]:
# 1. Load full data
df = load_price_data(
    ticker=TICKER,
    start_date=START,
    end_date=END,
    leadup_days=WARMUP,
    pred_target=TARGET
)

# 2. Create walk-forward boundary

# leave last two years for final testing untouched during training.
train_df = df[df.index < "2023-01-01"]
test_df  = df[df.index >= "2023-01-01"]


# 3. Train the model using WalkForward testing split
# use all columns as features except target column
feature_cols = [col for col in df.columns if col not in [f"{TARGET}_day_target"]]
target_col = df[f"{TARGET}_day_target"]

results, model, config = walk_forward_training_loop(
    train_df, target_col, 
    feature_cols,
    PRED_TARGET=TARGET,
    SEQ_LEN=LOOKBACK, 
    START_YEAR=Y_START, 
    END_YEAR=Y_END, 
    LEARNING_RATE= LEARNING_RATE, 
    HIDDEN_DIM= DIM, 
    NUM_LAYERS= LAYERS, 
    DROPOUT= DROPOUT,
    EPOCHS=EPOCHS,
    WEIGHT_SEED=SEED,
    BATCH_SIZE=BATCH_SIZE
    
)

# 4. Convert results into a table
metrics_df = generate_metrics(results=results)

# 5. store results
plot_and_save_metrics(metrics_df, TARGET)
save_experiment(model, config, metrics_df, TARGET)




1 Failed download:
['META']: YFTzMissingError('possibly delisted; no timezone found')


ValueError: [ERROR] No data returned for ticker 'META'.

In [20]:
metrics_df

NameError: name 'metrics_df' is not defined

In [21]:
metrics_combined = collect_target_metrics(r"C:\Projects\Workspaces\NN\results")
metrics_combined
# metrics_combined.swaplevel("year", "pred_target")

Folders found: ['target_15', 'target_30']


Unnamed: 0_level_0,Unnamed: 1_level_0,rmse,mae,directional_accuracy (%),spearman
year,pred_target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015,15,0.083687,0.06877,70.25,-0.102168
2015,30,0.186073,0.156129,63.77,0.234522
2016,15,0.134896,0.113871,39.26,-0.032589
2016,30,0.568233,0.540696,34.47,0.057996
2017,15,0.101579,0.080987,34.85,0.119427
2017,30,0.247933,0.196842,39.8,-0.128993
2018,15,0.141982,0.11844,65.56,-0.259904
2018,30,0.208363,0.185902,71.42,-0.049284
2019,15,0.076832,0.056928,73.55,0.192271
2019,30,0.255556,0.219676,39.78,0.024605


#### meta-experiment layer

Goal: Finding the most suitable prediction horizon to use as baseline

In [22]:
def generate_prediction_targets(df: pd.DataFrame, pred_targets: list) -> pd.DataFrame:
    """
    Adds multiple prediction horizon target columns to the dataframe using
    cumulative log returns.

    Example:
      pred_targets = [1, 5, 10, 15]
      â†’ columns: ["1_day_target", "5_day_target", "10_day_target", "15_day_target"]
    """

    df = df.copy()

    # Compute daily log returns once
    daily_log_return = np.log(df["close"] / df["close"].shift(1))

    for t in pred_targets:
        df[f"{t}_day_target"] = (
            daily_log_return.rolling(t).sum().shift(-t)
        )

    # Drop NaNs only once (after all targets are added)
    df = df.dropna()

    return df

In [23]:
def run_multi_horizon_experiments(
    df,
    horizons,
    base_seed=42,   # each horizon will mutate from this
    seq_len=10,
    start_year=2015,
    end_year=2022,
    learning_rate=0.001,
    hidden_dim=64,
    num_layers=2,
    dropout=0.2,
    batch_size=32,
    epochs=10
):

    all_results = []  # stores summary for comparison

    for i, horizon in enumerate(horizons):

        

        print("\n" + "="*50)
        print(f"ðŸš€ Running experiment for horizon: {horizon} days")
        print("="*50)

        # 1) Assign target column name for this horizon
        target_col_name = f"{horizon}_day_target"

        if target_col_name not in df.columns:
            raise ValueError(f"Missing column {target_col_name}. Did load_price_data generate these?")

        target_col = df[target_col_name]

        # 2) Feature cols = everything except target
        feature_cols = [c for c in df.columns if c != target_col_name]

        # 3) Make reproducible but horizon-unique seed
        seed = base_seed + (i * 137)  # guarantees different but traceable seeds

        # 4) Walk-forward training
        results, model, config = walk_forward_training_loop(
            df[df.index < "2023-01-01"],   # train
            target_col,
            feature_cols,
            WEIGHT_SEED=seed,
            PRED_TARGET=horizon,
            SEQ_LEN=seq_len,
            START_YEAR=start_year,
            END_YEAR=end_year,
            LEARNING_RATE=learning_rate,
            HIDDEN_DIM=hidden_dim,
            NUM_LAYERS=num_layers,
            DROPOUT=dropout,
            BATCH_SIZE=batch_size,
            EPOCHS=epochs
        )

        # 5) Compute metrics
        metrics_df = generate_metrics(results)

        # 6) Save model + metrics + plots
        plot_and_save_metrics(metrics_df, horizon)
        save_experiment(model, config, metrics_df, horizon)

        # 7) Store summary row
        all_results.append({
            "horizon": horizon,
            "weight_seed": seed,
            "avg_rmse": metrics_df["rmse"].mean(),
            "avg_mae": metrics_df["mae"].mean(),
            "avg_directional_accuracy": metrics_df["directional_accuracy (%)"].mean(),
            "avg_spearman": metrics_df["spearman"].mean()
        })

    # return final comparison table
    return pd.DataFrame(all_results)

In [24]:
HORIZONS = [1, 5, 10, 15, 20, 25, 30]

In [46]:
df = load_price_data(
    ticker=TICKER,
    start_date=START,
    end_date=END,
    leadup_days=WARMUP,
    pred_target=TARGET
)



$META: possibly delisted; no timezone found


AttributeError: 'Index' object has no attribute 'normalize'

In [None]:
multi_horizon_df = generate_prediction_targets(df, HORIZONS)
multi_horizon_df

In [None]:




summary_df = run_multi_horizon_experiments(
    df=train_df,
    horizons=HORIZONS,
    base_seed=SEED,
    seq_len=LOOKBACK,
    start_year=Y_START,
    end_year=Y_END,
    learning_rate=LEARNING_RATE,
    hidden_dim=DIM,
    num_layers=LAYERS,
    dropout=DROPOUT,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS
)

summary_df.to_csv("results/horizon_comparison_summary.csv", index=False)
print(summary_df)


ðŸš€ Running experiment for horizon: 1 days


ValueError: Missing column 1_day_target. Did load_price_data generate these?

#### NEAT implementation of NeuroEvolutionary network

In [32]:
print(yf.Ticker("AMZN").get_info().get("exchangeTimezoneName"))

America/New_York
