In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import random
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [2]:
BASE_DIR = 'data_feed/'
# Set the hyperparameters
HIDDEN_SIZE = 32
NUM_EPOCHS = 100
LAG = 10
N_STOCK = 20
BATCH_SIZE = 32

# Data Preprocessing

In [3]:
# check if data_feed directory exist if not 
assert os.path.exists(BASE_DIR)

In [4]:
# list all the files in the data_feed directory
files = os.listdir('data_feed/')

In [5]:
def rename_columns(df: pd.DataFrame, file_name: str) -> None:
    """
    modify columns names (exept date) into the following structure: 
        *column_name*_*stock_name* 
        
    Example: Open_AAPL
    """
    df.rename(columns=dict(zip(df.columns[1:], df.columns[1:] + '_' + file_name[:-4])), inplace=True)

In [6]:
# Read First File into a pandas dataframe
df = pd.read_csv(BASE_DIR+files[0])
rename_columns(df, files[0])

# Merge Everything into a unique DataFrame
for fn in files[1:]:
    df2 = pd.read_csv(BASE_DIR+fn)
    rename_columns(df2, fn)
    result = df.merge(df2, on='Date')
    df = result

df.set_index('Date',inplace=True)
df.index = pd.to_datetime(df.index)

In [7]:
cols = [col for col in df.columns if "Adj Close" in col]
df = df[cols]


# Save all the data onto a file
df.to_csv('AGGREGATED_DATA.csv')

In [9]:
df_copy = df.copy(deep=True)
for col in df.columns:
    df_copy[f'Ret_{col}'] = df_copy[col] / df_copy[col].shift(1)

In [10]:
df_copy

Unnamed: 0_level_0,Adj Close_BRK-A,Adj Close_JPM,Adj Close_MCD,Adj Close_MSFT,Adj Close_MA,Adj Close_NKE,Adj Close_KO,Adj Close_V,Adj Close_UNH,Adj Close_HD,...,Ret_Adj Close_GOOGL,Ret_Adj Close_WMT,Ret_Adj Close_AAPL,Ret_Adj Close_PFE,Ret_Adj Close_META,Ret_Adj Close_XOM,Ret_Adj Close_PG,Ret_Adj Close_AMZN,Ret_Adj Close_DIS,Ret_Adj Close_JNJ
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,139610.0,33.557884,67.764648,22.668226,47.946102,23.057142,27.239819,36.085690,46.601398,50.027027,...,,,,,,,,,,
2013-01-03,140549.0,33.490253,68.148109,22.364561,48.014698,23.292871,27.239819,36.113567,44.422573,49.885166,...,1.000581,0.993645,0.987378,0.997685,0.991786,0.998197,0.993659,1.004547,1.002153,0.998588
2013-01-04,140803.0,34.083870,67.561600,21.945997,48.012821,23.519707,27.283293,36.408501,44.508015,49.790600,...,1.019760,1.003779,0.972146,1.004255,1.035650,1.004630,1.002031,1.002592,1.019137,1.011451
2013-01-07,140190.0,34.121441,68.358658,21.904963,48.844883,23.555288,27.022470,36.668633,44.508015,49.522652,...,0.995637,0.990443,0.994118,1.000771,1.022949,0.988422,0.993197,1.035925,0.976624,0.997904
2013-01-08,141000.0,34.189060,68.381233,21.790056,48.684105,23.306211,26.834129,37.010017,43.918453,49.822124,...,0.998027,1.002778,1.002691,1.001539,0.987763,1.006255,0.998397,0.992252,0.995880,1.000140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-11,493000.0,136.050003,293.233490,309.433533,383.390015,121.819016,63.860001,231.009995,488.760010,285.633667,...,1.043132,1.003736,1.001095,0.991818,1.011627,0.981901,1.002337,1.018060,0.912695,0.995917
2023-05-12,491182.0,134.100006,294.576355,308.296051,381.920013,119.815605,64.110001,231.380005,491.230011,288.393799,...,1.008064,0.999674,0.994582,0.993880,0.991603,0.999905,1.010169,0.982885,0.996533,0.998696
2023-05-15,495900.0,135.229996,294.337616,308.784973,383.410004,119.436852,63.939999,232.809998,486.859985,286.477600,...,0.991490,0.992226,0.997103,0.994913,1.021599,1.001907,1.000321,1.008525,1.009458,0.992350
2023-05-16,498620.0,134.320007,292.596832,311.059998,380.239990,116.097847,63.220001,230.470001,479.720001,280.311981,...,1.025749,0.986173,1.000000,0.995963,0.999833,0.975730,0.998269,1.019784,0.979754,0.998684


In [None]:
split_date = df.index[0] + pd.offsets.DateOffset(years=8)
train_data = df[df.index <= split_date]
test_data = df[df.index > split_date]
test_data.to_csv('TEST.csv')
train_data.to_csv('TRAIN.csv')

# Data Preparation for the Model

In [11]:
test_data = pd.read_csv('TEST.csv')
train_data = pd.read_csv('TRAIN.csv')

In [None]:
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

In [None]:
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(train_data.values)
scaled_test = scaler.transform(test_data.values)

In [None]:
class TimeSeriesdataset(Dataset):
    def __init__(self, lag: int, data: np.ndarray, device: torch.device):
        self.lag = lag
        self.data = data
        self.device = device

    def __len__(self):
        lenght = len(self.data)
        return lenght - (self.lag + 1)

    def __getitem__(self, idx):
        X = self.data[idx:idx+self.lag, :].flatten()
        Y = self.data[idx+self.lag, :]
        return torch.Tensor(X, device=self.device), torch.Tensor(Y, device=self.device)

In [None]:
train_dataset = TimeSeriesdataset(lag=LAG, data=scaled_train, device=device)
test_dataset = TimeSeriesdataset(lag=LAG, data=scaled_test, device=device)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out)
        return out

In [None]:
# Create the RNN model
model = RNN(input_size=LAG*N_STOCK, hidden_size=HIDDEN_SIZE, output_size=N_STOCK).to(device)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

In [None]:
# Training loop
running_loss = 0.
last_loss = 0.
for epoch in range(NUM_EPOCHS):
    for i, batch in enumerate(train_dataloader):
        X, y = batch
        predict = model(X)
        loss = criterion(predict, y)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        last_loss = running_loss/10
        print(f'Epoch: {epoch + 1}/{NUM_EPOCHS}, Loss: {last_loss}')
        running_loss = 0

In [None]:
# Generate predictions
model.eval()
predictions = []
with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        X, y = batch
        predict = model(X)
        predictions.append(predict.detach().numpy())

# Plots

In [None]:
# join everything into a unique matrix
all_pred = np.zeros(N_STOCK)
for b in predictions:
    for p in b:
        all_pred = np.vstack((all_pred, p))
# Remove zero vector
all_pred = all_pred[1:, :]
all_pred = scaler.inverse_transform(all_pred)

In [None]:
all_real = np.concatenate((scaled_train, scaled_test))
all_real = scaler.inverse_transform(all_real)

In [None]:
all_real.shape[0]

In [None]:
# idx = 0
for idx, file_name in enumerate(files):
    stock = file_name[:-4]
    index_test_start = all_real[:, 0].shape[0] - all_pred.shape[0]
    plt.figure()
    plt.plot( range(all_real.shape[0]) , all_real[:, idx], label='real')
    plt.plot( range(index_test_start,all_real.shape[0]), all_pred[:, idx], color='orange', label='forecast')
    plt.axvline(x=index_test_start, color='red', linestyle='--')
    plt.title(stock)
    plt.legend()
    plt.show()