In [1]:
import time
import numpy as np
import pandas as pd
import datetime as datetime

import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt

# All code was written by Derek Shore, unless otherwise noted.

In [2]:
# load training/test features 
training_features = pd.read_hdf('../build_dataset/aapl_spy_training_examples.h5', key='data')
test_features = pd.read_hdf('../build_dataset/aapl_spy_test_examples.h5', key='data')

training_target = pd.read_hdf('../build_dataset/aapl_spy_training_target.h5', key='data')
test_target = pd.read_hdf('../build_dataset/aapl_spy_test_target.h5', key='data')

# https://pytorch.org/docs/stable/generated/torch.eye.html
training_target = torch.eye(3)[training_target.values]
test_target = torch.eye(3)[test_target.values]

In [3]:
# Get a list of the technical indicators
technical_keys = np.unique([i[0] for i in training_features.keys()])

# Build a dictionary from the loaded training_feature DataFrame
training_technical_dict = {}
for technical in technical_keys:
    training_technical_dict[technical] = training_features[technical]

# Build a dictionary from the loaded test_feature DataFrame
test_technical_dict = {}
for technical in technical_keys:
    test_technical_dict[technical] = test_features[technical]

In [4]:
# of technicals : # of dates : # of stock tickers
# original data
#training_shape = (109, 9262, 501)
#test_shape = (109, 1040, 501)

# of technicals : # of dates : # of stock tickers
# reduced tickers/technicals data
training_shape = (16, 6764, 2)
test_shape = (16, 756, 2)

training_features_tensor = torch.zeros(training_shape)
test_features_tensor = torch.zeros(test_shape)

In [5]:
# https://pytorch.org/docs/stable/generated/torch.Tensor.index_copy_.html

for index, key in enumerate(technical_keys): 
    training_features_tensor[index, :, :].copy_(torch.tensor(training_technical_dict[key].values)) 
    
for index, key in enumerate(technical_keys):
    test_features_tensor[index, :, :].copy_(torch.tensor(test_technical_dict[key].values)) 

In [6]:
# Based on the tutorial found in the PyTorch documentation
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

class StockDataset(Dataset):
    def __init__(self, stock_tensor, stock_target_labels_df, example_length):
        self.stock_tensor = stock_tensor
        self.example_length = example_length
        self.stock_targets = stock_target_labels_df

    def __len__(self):
        # The number of examples in the stock dataset
        return self.stock_tensor.shape[1] - self.example_length + 1

    def __getitem__(self, this_index):
        start_index = this_index
        end_index = this_index + self.example_length
        
        # I'm doing the transpose here so that the dimensionality goes: 
        # number of features : # of stocks : # of dates
        features = self.stock_tensor[:,start_index:end_index,:].clone().detach().transpose(1,2)
        label = torch.squeeze(self.stock_targets[this_index].clone().detach(), -1)

        return features, label

In [7]:
device = "mps"

In [8]:
# # Based on the tutorial found in the PyTorch documentation
# # https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

class StockCNN(nn.Module):
    def __init__(self):
        super(StockCNN, self).__init__()
        
        # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
        self.this_conv1 = nn.Conv2d(16, 32, kernel_size=2, padding=2)
        self.this_conv2 = nn.Conv2d(32, 32, kernel_size=2, padding=2)
        
        # https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html
        self.this_relu1 = nn.ReLU()
        self.this_relu2 = nn.ReLU()
        self.this_relu3 = nn.ReLU()
        
        # https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html
        self.this_pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.this_pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        # https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
        self.this_linear1 = nn.Linear(320, 224)
        self.this_linear2 = nn.Linear(224, 3)
            
        # https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html
        self.this_softmax = nn.Softmax(dim=1)

    def forward(self, this_features):
        #print(this_features.shape)
        this_features = self.this_conv1(this_features)
        this_features = self.this_relu1(this_features)
        this_features = self.this_pool1(this_features)
        
        this_features = self.this_conv2(this_features)
        this_features = self.this_relu2(this_features)
        this_features = self.this_pool2(this_features)

        this_features = this_features.view(-1, this_features.shape[1] * this_features.shape[2] * this_features.shape[3])

        this_features = self.this_linear1(this_features)
        this_features = self.this_relu3(this_features)
        
        this_features = self.this_linear2(this_features)
        this_features = self.this_softmax(this_features)
        
        return this_features

In [9]:
# Based on the tutorial found in the PyTorch documentation
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

batch_size = 64

stock_training_dataset = StockDataset(training_features_tensor, training_target, 14)
stock_test_dataset = StockDataset(test_features_tensor, test_target, 14)

stock_training_dataloader = DataLoader(stock_training_dataset, batch_size=batch_size, shuffle=True)
stock_test_dataloader = DataLoader(stock_test_dataset, batch_size=batch_size, shuffle=False)

In [10]:
# https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html

def train(stock_training_dataloader, stock_model, loss_fn, optimizer):
    size = len(stock_training_dataloader.dataset)
    
    average_loss = 0
    average_accuracy = 0
    
    predictions = []

    for batch, (features, targets) in enumerate(stock_training_dataloader):
        features, targets = features.to(device), targets.to(device)

        # Compute prediction error
        pred = stock_model(features)
        loss = loss_fn(pred.view(-1), targets.view(-1))

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        loss = loss.item() * len(pred) / size
        average_loss += loss
        
        pred_indices = torch.argmax(pred.cpu(), dim=1)
        predictions.append(pred_indices.tolist())
        real_indices = torch.argmax(targets.reshape(pred.shape[0],3).cpu(), dim=1)
        this_accuracy = sum(pred_indices == real_indices)/len(real_indices)
        
        average_accuracy += this_accuracy.item() * len(pred) / size

    return average_loss, average_accuracy, predictions

In [11]:
def test(stock_test_dataloader, stock_model, loss_fn):
    size = len(stock_test_dataloader.dataset)

    average_loss = 0
    average_accuracy = 0
    
    predictions = []
    
    for batch, (features, targets) in enumerate(stock_test_dataloader):
        features, targets = features.to(device), targets.to(device)

        # Compute prediction error
        pred = stock_model(features)
        loss = loss_fn(pred.view(-1), targets.view(-1))

        loss = loss.item() * len(pred) / size
        average_loss += loss
        
        pred_indices = torch.argmax(pred.cpu(), dim=1)
        predictions.append(pred_indices.tolist())
        real_indices = torch.argmax(targets.reshape(pred.shape[0],3).cpu(), dim=1)
        this_accuracy = sum(pred_indices == real_indices)/len(real_indices)
        
        average_accuracy += this_accuracy.item() * len(pred) / size
        
    return average_loss, average_accuracy, predictions

In [12]:
# This code was originally created by Rocco
# Modified for application to CNNs by Derek

def rolling_window_cv(batch_size=64, lr=1e-4, num_folds=5, epochs=10, device='mps'):
    train_data_size = len(stock_training_dataset)
    n = train_data_size // num_folds
    validation_errors = []
    
    training_losses = np.zeros([num_folds, epochs])
    training_accuracies = np.zeros([num_folds, epochs])
    
    val_losses = np.zeros([num_folds, epochs])
    val_accuracies = np.zeros([num_folds, epochs])

    for fold in range(num_folds):
        print(f"Fold {fold+1} of {num_folds}")

        start_idx = fold * n
        end_idx = (fold + 1) * n

        k_train_data_fold_indices = list(range(0, start_idx)) + list(range(end_idx, train_data_size))
        
        # https://pytorch.org/docs/stable/data.html#torch.utils.data.Subset
        train_data_fold = torch.utils.data.Subset(stock_training_dataset, k_train_data_fold_indices)
        val_data_fold = torch.utils.data.Subset(stock_training_dataset, range(start_idx, end_idx))

        training_dataloader = DataLoader(train_data_fold, batch_size=batch_size, shuffle=True)
        val_dataloader = DataLoader(val_data_fold, batch_size=batch_size, shuffle=False)
        
        # Fresh model and optimizer for each fold
        model = StockCNN().to(device)
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
        loss_fn = nn.CrossEntropyLoss()

        fold_losses = []
        fold_accuarcies = []
        
        fold_val_losses = []
        fold_val_accuarcies = []
        
        for epoch in range(epochs):
            epoch_losses, epoch_accuracies = train(training_dataloader, model, loss_fn, optimizer)
            fold_losses.append(epoch_losses)
            fold_accuarcies.append(epoch_accuracies)
            
            this_val_losses, this_val_accuracies = test(val_dataloader, model, loss_fn)
            fold_val_losses.append(this_val_losses)
            fold_val_accuarcies.append(this_val_accuracies)
       
        training_losses[fold] = fold_losses
        training_accuracies[fold] = fold_accuarcies
        
        val_losses[fold] = fold_val_losses
        val_accuracies[fold] = fold_val_accuarcies
    
    
    print("Average Training Losses")
    final_training_losses = np.round(training_losses.mean(axis=0),2)
    print(final_training_losses)
    print()
    
    print("Average Training Accuracy:")
    final_training_accuracies = np.round(training_accuracies.mean(axis=0),2)
    print(final_training_accuracies)
    print()
    
    print("Validation Losses:")
    final_val_losses = np.round(val_losses.mean(axis=0),2)
    print(final_val_losses)
    print()
    
    print("Validation Accuracy:")
    final_val_accuracies = np.round(val_accuracies.mean(axis=0),2)
    print(final_val_accuracies)
    print()
    
    return final_training_losses, final_training_accuracies, final_val_losses, final_val_accuracies

In [13]:
%%time
#final_training_losses1, final_training_accuracies1, \
#           final_val_losses1, final_val_accuracies1 = rolling_window_cv(batch_size=32, lr=1e-4, num_folds=5, epochs=50)

CPU times: user 22 µs, sys: 2 µs, total: 24 µs
Wall time: 6.2 µs


In [14]:
%%time
#final_training_losses2, final_training_accuracies2, \
#           final_val_losses2, final_val_accuracies2 = rolling_window_cv(batch_size=64, lr=1e-4, num_folds=5, epochs=50)

CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 4.29 µs


In [15]:
%%time
#final_training_losses3, final_training_accuracies3, \
#          final_val_losses3, final_val_accuracies3 = rolling_window_cv(batch_size=128, lr=1e-4, num_folds=5, epochs=50)

CPU times: user 24 µs, sys: 2 µs, total: 26 µs
Wall time: 5.25 µs


In [16]:
%%time
#final_training_losses4, final_training_accuracies4, \
#           final_val_losses4, final_val_accuracies4 = rolling_window_cv(batch_size=256, lr=1e-4, num_folds=5, epochs=50)

CPU times: user 10 µs, sys: 2 µs, total: 12 µs
Wall time: 8.11 µs


In [17]:
epoch_nums = range(1,51)

# plt.plot(epoch_nums, final_val_accuracies1, label = 'BS = 32')
# plt.plot(epoch_nums, final_val_accuracies2, label = 'BS = 64')
# plt.plot(epoch_nums, final_val_accuracies3, label = 'BS = 128')
# plt.plot(epoch_nums, final_val_accuracies4, label = 'BS = 256')

# plt.xlabel('Epoch #')
# plt.ylabel('Accuracy')
# plt.title('Validation Accuracy as a Function of Batch Size (BS)')
# plt.legend()

# # Show the graph
# plt.show()

In [18]:
%%time
#final_training_losses5, final_training_accuracies5, \
#           final_val_losses5, final_val_accuracies5 = rolling_window_cv(batch_size=64, lr=1e-5, num_folds=5, epochs=50)

CPU times: user 13 µs, sys: 1 µs, total: 14 µs
Wall time: 5.25 µs


In [19]:
%%time
#final_training_losses6, final_training_accuracies6, \
#           final_val_losses6, final_val_accuracies6 = rolling_window_cv(batch_size=64, lr=1e-4, num_folds=5, epochs=50)

CPU times: user 33 µs, sys: 1 µs, total: 34 µs
Wall time: 5.96 µs


In [20]:
%%time
#final_training_losses7, final_training_accuracies7, \
#           final_val_losses7, final_val_accuracies7 = rolling_window_cv(batch_size=64, lr=1e-3, num_folds=5, epochs=50)

CPU times: user 22 µs, sys: 1e+03 ns, total: 23 µs
Wall time: 5.25 µs


In [21]:
%%time
#final_training_losses8, final_training_accuracies8, \
#           final_val_losses8, final_val_accuracies8 = rolling_window_cv(batch_size=64, lr=1e-2, num_folds=5, epochs=50)

CPU times: user 12 µs, sys: 1 µs, total: 13 µs
Wall time: 3.81 µs


In [22]:
epoch_nums = range(1,51)

# plt.plot(epoch_nums, final_val_accuracies5, label = 'LR = 1e-5')
# plt.plot(epoch_nums, final_val_accuracies6, label = 'LR = 1e-4')
# plt.plot(epoch_nums, final_val_accuracies7, label = 'LR = 1e-3')
# plt.plot(epoch_nums, final_val_accuracies8, label = 'LR = 1e-2')

# plt.xlabel('Epoch #')
# plt.ylabel('Accuracy')
# plt.title('Validation Accuracy as a Function of Learning Rate (LR)')
# plt.legend()

# # Show the graph
# plt.show()

In [23]:
epoch_nums = range(1,51)

# plt.plot(epoch_nums, final_training_accuracies2, label = 'Training')
# plt.plot(epoch_nums, final_val_accuracies2, label = 'Validation')


# plt.xlabel('Epoch #')
# plt.ylabel('Accuracy')
# plt.title('Accuracy as a Function of Training Epochs')
# plt.legend()

# # Show the graph
# plt.show()

In [34]:
stock_model = StockCNN().to(device)

# https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
loss_fn = nn.CrossEntropyLoss()

# https://pytorch.org/docs/stable/generated/torch.optim.SGD.html
optimizer = torch.optim.SGD(stock_model.parameters(), lr=1e-4, momentum=0.9)

In [35]:
epochs = 40

epoch_losses = []
epoch_accuracies = []
predictions = None
print("Epoch # : Accuracy")
test_losses = []
test_accuracies = []

for epoch in range(epochs):
    this_loss, this_accuracy, predictions = train(stock_training_dataloader, stock_model, loss_fn, optimizer)
    epoch_losses.append(this_loss)
    epoch_accuracies.append(this_accuracy)
    print("Epoch", str(epoch + 1), ":", str(np.round(this_accuracy, 2)))
    test_epoch_losses, test_epoch_accuracies, _ = test(stock_test_dataloader, stock_model, loss_fn)
    test_losses.append(test_epoch_losses)
    test_accuracies.append(test_epoch_accuracies)
    
print(epoch_losses)
print()
print(epoch_accuracies)
#print(predictions)
print(test_losses)
print()
print(test_accuracies)

Epoch # : Accuracy
Epoch 1 : 0.32
Epoch 2 : 0.34
Epoch 3 : 0.36
Epoch 4 : 0.41
Epoch 5 : 0.42
Epoch 6 : 0.44
Epoch 7 : 0.44
Epoch 8 : 0.45
Epoch 9 : 0.47
Epoch 10 : 0.51
Epoch 11 : 0.58
Epoch 12 : 0.66
Epoch 13 : 0.74
Epoch 14 : 0.81
Epoch 15 : 0.84
Epoch 16 : 0.87
Epoch 17 : 0.88
Epoch 18 : 0.9
Epoch 19 : 0.91
Epoch 20 : 0.92
Epoch 21 : 0.92
Epoch 22 : 0.92
Epoch 23 : 0.93
Epoch 24 : 0.94
Epoch 25 : 0.93
Epoch 26 : 0.94
Epoch 27 : 0.95
Epoch 28 : 0.94
Epoch 29 : 0.95
Epoch 30 : 0.95
Epoch 31 : 0.95
Epoch 32 : 0.95
Epoch 33 : 0.96
Epoch 34 : 0.95
Epoch 35 : 0.96
Epoch 36 : 0.97
Epoch 37 : 0.97
Epoch 38 : 0.96
Epoch 39 : 0.96
Epoch 40 : 0.96
[335.4705205032198, 335.3450369322817, 335.1996691475937, 335.0244214805633, 334.8273638551666, 334.57524926410633, 334.2715515421508, 333.9279786557591, 333.4283472580162, 332.5927052442065, 330.23929276012194, 324.59511293465783, 319.2321452549205, 314.99724333509835, 312.3035818785672, 310.3721405976332, 309.12572827887993, 308.0424155753871, 307

In [36]:
pd.DataFrame(epoch_losses).to_csv('training_losses_4_report.csv')
pd.DataFrame(epoch_accuracies).to_csv('training_accuracies_4_report.csv')
pd.DataFrame(test_losses).to_csv('test_losses_4_report.csv')
pd.DataFrame(test_accuracies).to_csv('test_accuracies_4_report.csv')

In [26]:
pred_list = []

for i in predictions:
    for j in i:
        pred_list.append(j)
        


In [27]:
training_target = pd.read_hdf('../build_dataset/aapl_spy_training_target.h5', key='data')
test_target = pd.read_hdf('../build_dataset/aapl_spy_test_target.h5', key='data')

In [28]:
pd.DataFrame(np.array(pred_list) - 1).to_csv('cnn_training_predictions.csv')

In [29]:
pd.DataFrame(epoch_accuracies, index = range(1,len(epoch_accuracies)+1)).to_csv('cnn_training_accuracies.csv')

In [30]:
average_loss, average_accuracy, predictions = test(stock_test_dataloader, stock_model, loss_fn)

In [31]:
pred_list = []

for i in predictions:
    for j in i:
        pred_list.append(j)
        

In [32]:
pd.DataFrame(np.array(pred_list) - 1).to_csv('cnn_test_predictions.csv')

In [33]:
average_accuracy

0.35262449540969976