In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import datetime
from tickers import getTickers
import tqdm

In [2]:
# The five class ids 0..4 (presumably the ids returned by StockDataset.getLabel — TODO confirm).
labels = list(range(5))


class StockDataset(Dataset):
    """Sliding-window dataset over one ticker's daily rows.

    Rows are read from ``data/{ticker}.csv``. Column 0 must be a
    ``%Y-%m-%d`` date string and column 4 is treated as the closing price.

    Sample ``i`` is the pair ``(window, label)`` where ``window`` holds rows
    ``i .. i + context - 1`` (every column except the date, followed by
    weekday / day-of-month / month) and ``label`` is the class (0-4) of the
    close-to-close move on row ``i + context``.
    """

    def __init__(self, ticker, context = 5) -> None:
        """Load ``data/{ticker}.csv`` and remember the window length ``context``."""
        super().__init__()
        self.context = context
        # Plain nested lists keep per-row slicing in __getitem__ cheap.
        self.info = pd.read_csv(f"data/{ticker}.csv").values.tolist()

    def __len__(self):
        """Number of complete (window, next-day) samples; 0 if the file is too short."""
        # A file with <= context rows cannot yield a single sample. Reporting
        # len(self.info) there would advertise indices that __getitem__ cannot serve.
        return max(len(self.info) - self.context, 0)

    def __getitem__(self, index):
        """Return ``(previousDays, label)`` for sample ``index``.

        Negative indices count from the end. Raises ``IndexError`` when
        ``index`` is out of range.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError(f"sample index out of range for dataset of size {size}")

        # Row position of the day whose move is being predicted.
        target = index + self.context

        # Each window row: all non-date columns followed by the calendar features.
        window = [day[1:] + self.transformDay(day[0])
                  for day in self.info[target - self.context: target]]
        previousDays = torch.tensor(window, dtype=torch.float32)

        truthClosingPrice = self.info[target][4]
        dayBeforeClosingPrice = self.info[target - 1][4]
        label = torch.tensor(self.getLabel(truthClosingPrice, dayBeforeClosingPrice))

        return previousDays, label

    def transformDay(self, date):
        """Turn a ``%Y-%m-%d`` string into ``[weekday, day_of_month, month]``."""
        dt = datetime.datetime.strptime(date, "%Y-%m-%d")
        return [dt.weekday(), dt.day, dt.month]

    def getLabel(self, truth, dayBefore):
        """Bucket the move from ``dayBefore`` to ``truth`` into a class 0-4.

        0: <= -5%, 1: (-5%, -1%], 2: (-1%, 1%), 3: [1%, 5%), 4: >= 5%.

        Raises ``ValueError`` if either price is NaN and ``ZeroDivisionError``
        if ``truth`` is zero.
        """
        # NOTE(review): the move is measured relative to today's close (truth),
        # not the previous close. Kept as-is so existing class counts and class
        # weights remain valid — confirm this is the intended definition.
        change = round((truth - dayBefore) / truth, 4)
        if change != change:  # NaN never equals itself
            raise ValueError(f"cannot label NaN price change (truth={truth}, dayBefore={dayBefore})")
        if change <= -0.05: return 0
        if change <= -0.01: return 1
        if change < 0.01: return 2
        if change < 0.05: return 3
        return 4

        





# Smoke test: build the AAPL dataset and show the sample at index 0 and at index 3010.
dkng = StockDataset("AAPL")
first_sample, late_sample = dkng[0], dkng[3010]
print(first_sample, late_sample)



(tensor([[7.6225e+00, 7.6607e+00, 7.5850e+00, 7.6432e+00, 6.5530e+00, 4.9373e+08,
         0.0000e+00, 4.0000e+00, 1.0000e+00],
        [7.6643e+00, 7.6996e+00, 7.6161e+00, 7.6564e+00, 6.5644e+00, 6.0190e+08,
         1.0000e+00, 5.0000e+00, 1.0000e+00],
        [7.6564e+00, 7.6868e+00, 7.5268e+00, 7.5346e+00, 6.4599e+00, 5.5216e+08,
         2.0000e+00, 6.0000e+00, 1.0000e+00],
        [7.5625e+00, 7.5714e+00, 7.4661e+00, 7.5207e+00, 6.4480e+00, 4.7713e+08,
         3.0000e+00, 7.0000e+00, 1.0000e+00],
        [7.5107e+00, 7.5714e+00, 7.4664e+00, 7.5707e+00, 6.4909e+00, 4.4761e+08,
         4.0000e+00, 8.0000e+00, 1.0000e+00]]), tensor(2)) (tensor([[1.7928e+02, 1.8114e+02, 1.7075e+02, 1.7226e+02, 1.7226e+02, 1.5019e+08,
         3.0000e+00, 1.6000e+01, 1.2000e+01],
        [1.6993e+02, 1.7347e+02, 1.6969e+02, 1.7114e+02, 1.7114e+02, 1.9543e+08,
         4.0000e+00, 1.7000e+01, 1.2000e+01],
        [1.6828e+02, 1.7058e+02, 1.6746e+02, 1.6975e+02, 1.6975e+02, 1.0750e+08,
         0.0000

In [3]:
class Encoder(nn.Module):
    """Input dropout followed by a batch-first GRU.

    ``context`` is accepted but not used by this module.
    """

    def __init__(self, num_hidden = 128, num_layers = 1, context = 5, features = 9, dropout = 0.1) -> None:
        super().__init__()
        self.rnn = nn.GRU(
            input_size=features,
            hidden_size=num_hidden,
            num_layers=num_layers,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return the GRU's ``(output, state)`` pair for the dropped-out input ``x``."""
        return self.rnn(self.dropout(x))

class PlainDecoder(nn.Module):
    """Batch-first GRU followed by a linear layer with a single output unit.

    ``context`` is accepted but not used. A dropout layer is registered but
    ``forward`` does not apply it.
    """

    def __init__(self, num_hidden = 128, num_layers = 1, context = 5, features = 9, dropout = 0.1) -> None:
        super().__init__()
        self.rnn = nn.GRU(
            input_size=features,
            hidden_size=num_hidden,
            num_layers=num_layers,
            batch_first=True,
        )
        self.dense = nn.Linear(in_features=num_hidden, out_features=1)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, state):
        """Run the GRU from ``state`` over ``x`` and project every step to one value."""
        hidden_seq, _ = self.rnn(x, state)
        return self.dense(hidden_seq)

class Seq2Seq(nn.Module):
    """Encoder/decoder pair that emits one value per input window.

    The encoder reads the whole window; its final hidden state initialises
    the decoder, which is fed only the most recent day.
    """

    def __init__(self, num_hidden = 128, num_layers = 1, context = 5, features = 9, dropout = 0.1) -> None:
        super().__init__()
        # Forward the hyper-parameters: building both halves with their defaults
        # made every argument of this constructor a silent no-op.
        shared = dict(
            num_hidden=num_hidden,
            num_layers=num_layers,
            context=context,
            features=features,
            dropout=dropout,
        )
        self.encoder = Encoder(**shared)
        self.decoder = PlainDecoder(**shared)

    def forward(self, x):
        """Map ``x`` (batch, days, features) to a (batch, 1) tensor."""
        _, encoder_hidden = self.encoder(x)  # hidden --> layer, batch, hidden dim
        most_recent_day = x[:, -1:, :]  # batch, 1, features
        return self.decoder(most_recent_day, encoder_hidden).squeeze(2)


class PlainModel(nn.Module):
    """Input dropout, a batch-first GRU and a 5-way linear head.

    ``context`` is accepted but not used by this module.
    """

    def __init__(self, num_hidden = 128, num_layers = 1, context = 5, features = 9, dropout = 0.1) -> None:
        super().__init__()
        self.rnn = nn.GRU(
            input_size=features,
            hidden_size=num_hidden,
            num_layers=num_layers,
            batch_first=True,
        )
        self.dense = nn.Linear(in_features=num_hidden, out_features=5)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return the head's output at the last time step, shape (batch, 5)."""
        hidden_seq, _ = self.rnn(self.dropout(x))
        per_step = self.dense(hidden_seq)
        return per_step[:, -1]


In [173]:
# Train the Seq2Seq regressor with MSE loss.
# NOTE: this cell iterates `dev`, the list of DataLoaders built in a later
# cell, so it only runs after that cell has been executed.
model = Seq2Seq()
optimizer = optim.Adam(model.parameters(), lr = 0.001)
loss = nn.MSELoss()
epochs = 25

model.train()  # make sure dropout is active even if an eval cell ran first
for epoch in range(epochs):
    total_mse = 0.0
    for train_loader in dev:
        for x, y in train_loader:

            optimizer.zero_grad()
            # Drop only the trailing size-1 dim; a bare squeeze() would also
            # collapse the batch dim when a batch holds a single sample.
            output = model(x).squeeze(1)

            # MSELoss needs floating-point targets matching the predictions.
            batch_loss = loss(output, y.float())
            batch_loss.backward()
            optimizer.step()
            # .item() detaches the value so the running total does not keep
            # every batch's autograd graph alive.
            total_mse += batch_loss.item()
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, total_mse))

Epoch [1/25], Loss: 4633211.0000
Epoch [2/25], Loss: 4100686.7500
Epoch [3/25], Loss: 3881414.2500
Epoch [4/25], Loss: 3672417.7500
Epoch [5/25], Loss: 3440325.5000
Epoch [6/25], Loss: 3367491.7500
Epoch [7/25], Loss: 3316024.5000
Epoch [8/25], Loss: 3275573.7500
Epoch [9/25], Loss: 3214323.7500
Epoch [10/25], Loss: 3154447.0000
Epoch [11/25], Loss: 3207470.2500
Epoch [12/25], Loss: 3107849.5000
Epoch [13/25], Loss: 3127691.2500
Epoch [14/25], Loss: 3062318.2500
Epoch [15/25], Loss: 3020305.2500
Epoch [16/25], Loss: 3085181.0000
Epoch [17/25], Loss: 3023238.5000
Epoch [18/25], Loss: 3035671.5000
Epoch [19/25], Loss: 3002042.2500
Epoch [20/25], Loss: 2991512.7500
Epoch [21/25], Loss: 3043677.2500
Epoch [22/25], Loss: 2953368.0000
Epoch [23/25], Loss: 2911113.2500
Epoch [24/25], Loss: 2908077.5000
Epoch [25/25], Loss: 2888042.5000


In [4]:
def getWeight(numClasses, total, classTotal):
    """Return ``total / (numClasses * classTotal)``.

    Raises ``ZeroDivisionError`` when ``numClasses * classTotal`` is zero.
    """
    denominator = numClasses * classTotal
    return total / denominator


from itertools import chain
dev_tickers = ['AAPL','TSLA','COKE', 'AMAT']
dev = [StockDataset(symbol) for symbol in dev_tickers]

# Tally how many samples fall into each of the five classes across the dev datasets.
c = dict.fromkeys(range(5), 0)
for dataset in dev:
    for position in range(len(dataset)):
        _, target = dataset[position]
        c[target.item()] += 1

print(c)
total_sample = sum(c.values())
print(total_sample)

# One weight per class, in class-id order, computed by getWeight.
weights = [getWeight(5, total_sample, class_count) for class_count in c.values()]

print(weights)

# From here on `dev` holds one shuffled DataLoader per dev ticker instead of the raw datasets.
dev = [DataLoader(dataset, batch_size = 64, shuffle= True) for dataset in dev]
# dev = list(chain.from_iterable(dev))
# print(len(dev))



{0: 265, 1: 2653, 2: 5558, 3: 3180, 4: 266}
11922
[8.997735849056603, 0.8987561251413494, 0.429003238575027, 0.749811320754717, 8.96390977443609]


In [5]:
train_tickers = getTickers()
# Load each ticker's CSV exactly once. Filtering on len(StockDataset(ticker))
# and then constructing the dataset again read every kept file twice.
candidates = (StockDataset(ticker) for ticker in train_tickers)
# Keep only tickers whose dataset has exactly 3011 samples.
train = [dataset for dataset in candidates if len(dataset) == 3011]
print(f"Training set has {len(train)} tickers")
train = [DataLoader(ticker, batch_size = 64, shuffle= True) for ticker in train]
# The first 20 loaders are held out for evaluation; the rest are used for training.
test = train[:20]
train = train[20:]

Training set has 1503 tickers


In [None]:
# Train the 5-class PlainModel with class-weighted cross-entropy.
model = PlainModel(features=9)
optimizer = optim.Adam(model.parameters(), lr = 0.001)
# `weights` comes from the dev-set class counts computed in an earlier cell.
classWeights = torch.tensor(weights, dtype=torch.float32)
loss = nn.CrossEntropyLoss(weight=classWeights)

epochs = 2

model.train()  # re-enable dropout in case an evaluation cell ran before this one
for epoch in range(epochs):
    # Name kept for compatibility with other cells; this is a cross-entropy total.
    total_mse = 0.0
    for train_loader in train:
        for x, y in train_loader:

            optimizer.zero_grad()
            # PlainModel already returns (batch, 5). A bare squeeze() would
            # drop the batch dim for a single-sample batch and break the loss.
            output = model(x)

            batch_loss = loss(output, y)
            batch_loss.backward()
            optimizer.step()
            # .item() keeps the running total from retaining every batch's graph.
            total_mse += batch_loss.item()
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, total_mse))
    


In [16]:
# Evaluate accuracy on the held-out loaders.
# The model must not be updated here: calling backward()/optimizer.step()
# inside this loop trained on the test set while scoring it.
model.eval()
correct, total = 0,0
with torch.no_grad():  # inference only, no autograd bookkeeping
    for loader in test:
        for x,y in loader:
            logits = model(x)
            # argmax over logits equals argmax over softmax(logits).
            predictions = logits.argmax(dim = 1)

            for pred, truth in zip(predictions, y):
                print(pred, truth)

            correct += (predictions == y).sum().item()
            total += y.size(0)


print(f"{correct}/{total}")


tensor(2) tensor(1)
tensor(2) tensor(2)
tensor(2) tensor(2)
tensor(2) tensor(2)
tensor(2) tensor(1)
tensor(2) tensor(2)
tensor(2) tensor(1)
tensor(2) tensor(2)
tensor(2) tensor(3)
tensor(2) tensor(1)
tensor(2) tensor(1)
tensor(2) tensor(2)
tensor(2) tensor(1)
tensor(2) tensor(4)
tensor(2) tensor(2)
tensor(2) tensor(0)
tensor(2) tensor(3)
tensor(2) tensor(3)
tensor(2) tensor(1)
tensor(2) tensor(1)
tensor(2) tensor(3)
tensor(2) tensor(2)
tensor(2) tensor(1)
tensor(2) tensor(2)
tensor(2) tensor(4)
tensor(2) tensor(2)
tensor(2) tensor(1)
tensor(2) tensor(1)
tensor(2) tensor(3)
tensor(2) tensor(2)
tensor(2) tensor(3)
tensor(2) tensor(2)
tensor(2) tensor(3)
tensor(2) tensor(4)
tensor(2) tensor(3)
tensor(2) tensor(2)
tensor(2) tensor(1)
tensor(2) tensor(1)
tensor(2) tensor(2)
tensor(2) tensor(0)
tensor(2) tensor(1)
tensor(2) tensor(1)
tensor(2) tensor(3)
tensor(2) tensor(2)
tensor(2) tensor(0)
tensor(2) tensor(2)
tensor(2) tensor(1)
tensor(2) tensor(2)
tensor(2) tensor(2)
tensor(2) tensor(3)
