In [1]:
# Cisco Dacanay
#

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split
import torchvision.models as models
import kagglehub
import os
import pandas as pd
import numpy as np
from datetime import timedelta

# Fetch the most recent published version of the NYSE dataset via kagglehub;
# the call returns the local directory that holds the files.
dataset_handle = "dgawlik/nyse"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

# Display which files are available in the dataset directory
os.listdir(path)

Path to dataset files: C:\Users\darth\.cache\kagglehub\datasets\dgawlik\nyse\versions\3


['fundamentals.csv',
 'prices-split-adjusted.csv',
 'prices.csv',
 'securities.csv']

In [2]:
# Load the two CSVs this notebook uses from the downloaded dataset directory.
# os.path.join builds the paths with the platform's separator.
fundamentals = pd.read_csv(os.path.join(path, "fundamentals.csv")) # SEC annual filing reports, 449 securities
prices = pd.read_csv(os.path.join(path, "prices-split-adjusted.csv")) # Split-adjusted prices, 502 securities
# securities = pd.read_csv(path + "/securities.csv") # List of securities/tickers, 506 securities

# limit data size for faster testing
# fundamentals = fundamentals.iloc[:20].copy()

# Filter out securities that aren't in fundamentals data.
# .copy() makes the filtered result an independent frame, so assigning columns
# on it later does not raise SettingWithCopyWarning.
prices = prices[prices["symbol"].isin(fundamentals["Ticker Symbol"])].copy()

# Not using securities csv since only the ticker symbol is being used;
# build the one-column ticker table directly from the fundamentals data.
securities = pd.DataFrame(fundamentals["Ticker Symbol"].unique(), columns=["Ticker symbol"])


In [3]:
# Convert date strings in dataset to datetime.
# pd.to_datetime parses a whole column in one vectorized call, which avoids
# running a Python-level loop over every price row.
# NOTE(review): this assumes each column uses a single consistent date format;
# confirm against the CSVs if parsing raises on a newer pandas version.
fundamentals["Period Ending"] = pd.to_datetime(fundamentals["Period Ending"])

# assign() returns a new frame instead of writing into `prices` in place, so
# this is safe even when `prices` was produced by boolean filtering.
prices = prices.assign(date=pd.to_datetime(prices["date"]))

In [4]:
# Input features: 2 consecutive years of fundamentals reports data, price data in between those 2 reports (predicting future price based off ~1 year of price data and 2 SEC filings)
#
# Each dataset row is built from one ticker and one pair of consecutive reports:
#   features = price_count sampled closing prices + fundamentals of the first report + fundamentals of the second report
#   label    = last closing price in the period, prediction_lead rows after the last sampled price

price_count = 50 # how many price data points will be given to the models (should be around 250 data points per year)
prediction_lead = 22 # how many business days ahead the price will be predicted (22 business days ~ 30 days)

feature_data = []
label_data = []

for ticker in securities["Ticker symbol"]:
  # Reports for this ticker, oldest first, so that consecutive rows bound one reporting period
  # (sorted explicitly instead of relying on the row order of the CSV)
  ticker_reports = fundamentals[fundamentals["Ticker Symbol"] == ticker].sort_values("Period Ending", kind="stable")
  report_dates = ticker_reports["Period Ending"].tolist()
  # Only the Accounts Payable..Treasury Stock columns are used; the columns after Treasury Stock are left out because not all rows have data
  report_data = ticker_reports.loc[:, "Accounts Payable":"Treasury Stock"]

  # Select and date-order this ticker's prices once, instead of re-scanning the full prices frame for every period.
  # Chronological order matters: the sampling below and the iloc[-1] label both index by position.
  ticker_prices = prices[prices["symbol"] == ticker].sort_values("date", kind="stable")
  ticker_dates = ticker_prices["date"]
  ticker_closes = ticker_prices["close"] # chose to use closing price for each day, not sure if it matters

  # Get ticker prices between dates of 2 reports
  for period in range(len(report_dates) - 1):
    price_start_date = report_dates[period]
    price_end_date = report_dates[period + 1]

    in_period = (ticker_dates > price_start_date) & (ticker_dates <= price_end_date)
    period_prices_full = ticker_closes[in_period]
    if len(period_prices_full) <= prediction_lead: # skip periods that don't have any price data before prediction lead
      continue

    # avg index distance between sampled prices (float for better distribution);
    # the last sampled index is len - 1 - prediction_lead, leaving the final prediction_lead rows unseen by the features
    step_size = (len(period_prices_full) - 1 - prediction_lead) / max(1, (price_count - 1))
    period_prices = [
      period_prices_full.iloc[int(round(count * step_size))] # calculate index to sample from and cast to int
      for count in range(price_count)
    ]

    # Combine sampled prices and both reports into one feature row
    fundamentals_start = report_data.iloc[period].tolist()
    fundamentals_end = report_data.iloc[period + 1].tolist()
    feature_data.append(period_prices + fundamentals_start + fundamentals_end)
    label_data.append(period_prices_full.iloc[-1])

# Make array of column names (same label slice as the feature extraction above, so names and values stay in sync)
fundamentals_columns = fundamentals.loc[:, "Accounts Payable":"Treasury Stock"].columns
fundamentals_columns_start = [f"{name} (Start)" for name in fundamentals_columns]
fundamentals_columns_end = [f"{name} (End)" for name in fundamentals_columns]
price_columns = [f"Price {i}" for i in range(1, price_count + 1)]
data_columns = price_columns + fundamentals_columns_start + fundamentals_columns_end + ["Final Price"]

print(f"Dataset: {len(label_data)} rows")
# Create dataframe from data and column names (not using, going straight from list to tensor instead)
# df = pd.DataFrame([row + [label] for row, label in zip(feature_data, label_data)], columns=data_columns)

Dataset: 1318 rows


In [5]:
# Create dataset tensors: one float32 row per sample for the features, one float32 value per sample for the labels
feature_tensor = torch.tensor(np.array(feature_data), dtype=torch.float32)
label_tensor = torch.tensor(np.array(label_data), dtype=torch.float32)

# Note: Zach commented this out to normalize data without labels, did a different split in cell below.
# Kept as real comments (not a bare string literal) so the cell does not echo the text as its output.
# dataset = TensorDataset(feature_tensor, label_tensor)
# # Split data
# train_set, validation_set = train_test_split(dataset, test_size=0.2, random_state=42)
# validation_set, test_set = train_test_split(validation_set, test_size=0.5, random_state=42)

In [6]:
# Split into train, validation, and test (70% / 20% / remainder), then normalize
# the features using statistics from the training rows only.

dataset_size = len(feature_tensor)
train_size = int(0.7 * dataset_size)
val_size = int(0.20 * dataset_size)
test_size = dataset_size - train_size - val_size

# Decide split membership first, from a seeded permutation, so the split is
# reproducible across kernel restarts.
split_seed = 42
shuffled_indices = torch.randperm(dataset_size, generator=torch.Generator().manual_seed(split_seed)).tolist()
train_indices = shuffled_indices[:train_size]
val_indices = shuffled_indices[train_size:train_size + val_size]
test_indices = shuffled_indices[train_size + val_size:]

# Normalize features. mean/std come from the training rows only; computing them
# over all rows would leak validation/test information into training.
train_features = feature_tensor[train_indices]
mean = train_features.mean(dim=0, keepdim=True)
std = train_features.std(dim=0, keepdim=True)
# A column with zero (or undefined) spread would divide by zero and produce NaN/inf;
# use 1 for those columns so they simply become (value - mean).
std = torch.where(std > 0, std, torch.ones_like(std))
normalized_features = (feature_tensor - mean) / std

# Combine features and labels into a dataset
dataset = TensorDataset(normalized_features, label_tensor)

# Subset is the same wrapper type that random_split returns
train_dataset = torch.utils.data.Subset(dataset, train_indices)
val_dataset = torch.utils.data.Subset(dataset, val_indices)
test_dataset = torch.utils.data.Subset(dataset, test_indices)

In [None]:
# Write each split to its own file in the current working directory
for split_file, split_data in (
  ("train_set.pt", train_dataset),
  ("val_set.pt", val_dataset),
  ("test_set.pt", test_dataset),
):
  torch.save(split_data, split_file)