In [145]:
# Data Load
import pandas as pd
import numpy as np
import requests
import datetime
from tqdm import tqdm
import pickle

In [132]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# 1. Load Data

In [7]:
nasdaq = pd.read_csv("data/NASDAQ_tickers.csv",header=None)

In [102]:
# We will test with only 500 stocks here
tickers_nasdaq = nasdaq.iloc[:,0].to_list()
tickers = tickers_nasdaq[:500]

In [103]:
def get_price_data(tickers, start_date, end_date):
    
    data = pd.DataFrame([])
    for ticker in tqdm(tickers):
        headers = {
            'Content-Type': 'json'
        }
        token = 'f9e28eedc7515e073aeef37b03d9c51dad4c1706'
        startDate = start_date
        endDate = end_date
        url = f"https://api.tiingo.com/tiingo/daily/{ticker}/prices"
        params = {'Ticker':ticker, 'startDate': startDate, 'endDate':endDate, 'token':token}
        requestResponse = requests.get(url, headers=headers, params = params)
        data_list = requestResponse.json()

        try:
            df = pd.DataFrame(data_list)
            df = df[['date','adjClose']]
            df['date'] = pd.to_datetime(df['date']).dt.date
            df = df.set_index('date')
            df.columns = [ticker]

            data = pd.concat([data, df], axis=1)

        except:
            print(f"{ticker} does not have data")
            
    return data

In [104]:
data = get_price_data(tickers, '2013-01-01', '2017-12-31')        

  1%|          | 6/500 [00:08<10:51,  1.32s/it]

AAXN does not have data


  2%|▏         | 11/500 [00:15<10:17,  1.26s/it]

ACET does not have data


  8%|▊         | 39/500 [00:58<10:08,  1.32s/it]

AKRX does not have data


 17%|█▋        | 83/500 [02:06<09:04,  1.31s/it]

ATHN does not have data


 50%|████▉     | 249/500 [06:27<05:28,  1.31s/it]

DGLD does not have data


 52%|█████▏    | 261/500 [06:45<05:05,  1.28s/it]

DSLV does not have data


 69%|██████▉   | 344/500 [08:54<03:22,  1.30s/it]

FOX does not have data


 69%|██████▉   | 345/500 [08:54<02:53,  1.12s/it]

FOXA does not have data


 76%|███████▌  | 381/500 [09:49<02:35,  1.30s/it]

GPOR does not have data


 91%|█████████ | 455/500 [11:43<00:57,  1.27s/it]

ILG does not have data


100%|██████████| 500/500 [12:54<00:00,  1.55s/it]


In [111]:
# Drop stocks with NA values for simplicity
nasdaq_test_data = data.dropna(axis=1)

In [112]:
with open('data/nasdaq_test_price.pickle', 'wb') as f:
    pickle.dump(nasdaq_test_data, f, pickle.HIGHEST_PROTOCOL)

In [113]:
%time
# load
with open('data/nasdaq_test_price.pickle', 'rb') as f:
    nasdaq_data = pickle.load(f)

CPU times: user 0 ns, sys: 21 µs, total: 21 µs
Wall time: 37.7 µs


In [115]:
nasdaq_data.head()

Unnamed: 0_level_0,AABA,AAON,AAPL,AAWW,AAXJ,ABAX,ABCB,ABMD,ACGL,ACHC,...,JAZZ,JBHT,JBLU,JBSS,JCOM,JJSF,JKHY,JKI,JMBA,JOBS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,5.516981,8.894916,17.011521,45.58,53.233575,36.28919,11.861925,13.69,14.793333,24.37,...,54.58,56.432451,5.9,13.594767,27.844603,57.646466,36.760707,23.157976,11.75,24.05
2013-01-03,5.434556,9.049646,16.79667,44.76,52.828462,36.193238,11.824944,13.45,14.75,24.19,...,55.21,56.635844,5.95,13.601933,27.757014,57.969114,36.814701,23.22951,11.75,24.315
2013-01-04,5.456536,9.095647,16.328928,45.19,52.828462,36.135666,11.926644,13.41,14.876667,23.63,...,55.62,57.209044,5.95,13.530268,27.923434,58.049776,36.895691,23.434664,12.4,26.51
2013-01-07,5.330151,8.995281,16.232875,44.84,52.414729,35.962952,11.899924,13.2495,14.73,24.15,...,55.5,56.783767,5.97,13.644932,27.809568,56.822548,36.958684,23.316236,12.499,26.4
2013-01-08,5.401586,9.016191,16.276564,45.53,51.975139,36.452309,11.926644,13.21,14.75,24.9392,...,56.01,56.700561,5.94,13.609099,27.888398,56.848808,36.787704,23.197095,12.7,26.255


# 2. Preprocessing

In [124]:
# First, make a copy of the dataframe and calculate the return so that we can get the rank for everyday
return_rank = nasdaq_data.copy()
return_rank = return_rank.pct_change().rank(axis=1)
return_rank.head()

Unnamed: 0_level_0,AABA,AAON,AAPL,AAWW,AAXJ,ABAX,ABCB,ABMD,ACGL,ACHC,...,JAZZ,JBHT,JBLU,JBSS,JCOM,JJSF,JKHY,JKI,JMBA,JOBS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,,,,,,,,,,,...,,,,,,,,,,
2013-01-03,82.0,446.0,101.0,61.0,146.0,221.0,206.0,62.0,214.0,150.0,...,410.0,336.0,387.0,288.0,204.0,362.0,304.0,326.0,274.0,408.0
2013-01-04,232.0,255.0,13.0,331.0,159.5,124.0,309.0,105.0,308.0,17.0,...,291.0,342.0,159.5,86.0,275.0,196.0,209.0,317.0,478.0,481.0
2013-01-07,42.0,128.0,199.0,174.0,173.0,216.0,267.0,120.0,142.0,455.0,...,270.0,179.0,360.0,402.0,239.0,55.0,340.0,208.0,398.0,234.0
2013-01-08,418.0,310.0,318.0,431.0,132.0,420.0,306.0,217.0,298.0,466.0,...,390.0,252.0,179.0,227.0,321.0,282.0,192.0,178.0,435.0,174.0


In [135]:
# Normalize the price of each stock via dividing it by its maximum value throughout the entire 2013-2017 dataset
normalized_data = nasdaq_data.apply(lambda x: x / x.max())
normalized_data.head()

Unnamed: 0_level_0,AABA,AAON,AAPL,AAWW,AAXJ,ABAX,ABCB,ABMD,ACGL,ACHC,...,JAZZ,JBHT,JBLU,JBSS,JCOM,JJSF,JKHY,JKI,JMBA,JOBS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,0.275333,0.239532,0.402108,0.667839,0.726574,0.56387,0.244267,0.068539,0.433483,0.293721,...,0.282769,0.504152,0.218357,0.2045,0.320092,0.393618,0.322227,0.473141,0.674125,0.371371
2013-01-03,0.271219,0.243698,0.39703,0.655824,0.721045,0.562379,0.243506,0.067338,0.432213,0.291551,...,0.286033,0.505969,0.220207,0.204608,0.319085,0.395821,0.3227,0.474603,0.674125,0.375463
2013-01-04,0.272316,0.244937,0.385973,0.662125,0.721045,0.561485,0.2456,0.067137,0.435925,0.284802,...,0.288157,0.51109,0.220207,0.20353,0.320998,0.396371,0.32341,0.478794,0.711417,0.409358
2013-01-07,0.266009,0.242234,0.383703,0.656996,0.715398,0.558801,0.24505,0.066334,0.431627,0.291069,...,0.287535,0.507291,0.220947,0.205255,0.319689,0.387992,0.323962,0.476375,0.717097,0.407659
2013-01-08,0.269574,0.242798,0.384736,0.667106,0.709398,0.566405,0.2456,0.066136,0.432213,0.300581,...,0.290177,0.506547,0.219837,0.204716,0.320596,0.388171,0.322463,0.473941,0.728629,0.40542


In [133]:
# In addition to the normalized closing price, we calculate four more sequential features: 
# 5, 10, 20, and 30 days moving averages which represent the weekly and monthly trends.


# 3. Sliding Window

In [143]:
list(normalized_data.columns)

['AABA',
 'AAON',
 'AAPL',
 'AAWW',
 'AAXJ',
 'ABAX',
 'ABCB',
 'ABMD',
 'ACGL',
 'ACHC',
 'ACIW',
 'ACOR',
 'ACTA',
 'ACWI',
 'ACWX',
 'ACXM',
 'ADBE',
 'ADI',
 'ADP',
 'ADRD',
 'ADRE',
 'ADSK',
 'ADTN',
 'ADUS',
 'AEGN',
 'AEIS',
 'AFAM',
 'AFSI',
 'AGII',
 'AGNC',
 'AGYS',
 'AHGP',
 'AIA',
 'AIMC',
 'AKAM',
 'ALCO',
 'ALGN',
 'ALGT',
 'ALKS',
 'ALNY',
 'ALOG',
 'ALXN',
 'AMAG',
 'AMAT',
 'AMBA',
 'AMCX',
 'AMED',
 'AMGN',
 'AMNB',
 'AMOT',
 'AMSF',
 'AMSWA',
 'AMTD',
 'AMWD',
 'AMZN',
 'ANAT',
 'ANCX',
 'ANDE',
 'ANGO',
 'ANIK',
 'ANSS',
 'AOBC',
 'AOSL',
 'APEI',
 'APOG',
 'ARCB',
 'ARCC',
 'ARII',
 'ARLP',
 'ARNA',
 'AROW',
 'ARTNA',
 'ASCMA',
 'ASFI',
 'ASML',
 'ASPS',
 'ASTE',
 'ATLO',
 'ATNI',
 'ATRC',
 'ATRI',
 'ATRO',
 'ATVI',
 'AVAV',
 'AVHI',
 'AVNW',
 'AZPN',
 'BABY',
 'BANF',
 'BANR',
 'BBBY',
 'BCPC',
 'BDGE',
 'BECN',
 'BELFB',
 'BFIN',
 'BGFV',
 'BIB',
 'BIDU',
 'BIIB',
 'BIS',
 'BJRI',
 'BKCC',
 'BLKB',
 'BLMN',
 'BLMT',
 'BMRC',
 'BMRN',
 'BMTC',
 'BOBE',
 'BOFI',
 '

In [157]:
window_size = 60
data = normalized_data.copy()
tickers = list(normalized_data.columns)
X = []
y = []
for ticker in tqdm(tickers):
    ticker_data = data[ticker].values
    
    for i in range(len(ticker_data) - window_size):
        X.append(ticker_data[i:i+window_size])
        y.append(ticker_data[i+window_size])
    
X = np.stack(X, 0)
y = np.stack(y, 0)

100%|██████████| 481/481 [00:00<00:00, 1033.79it/s]


In [159]:
X.shape

(576719, 60)

# 4. Sequential Embedding Layer

In [152]:
batch_size = 1024

X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train)

train_data = []
for i in tqdm(range(len(X_train_tensor))):
    train_data.append([X_train_tensor[i], y_train_tensor[i]])
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, pin_memory=True)

X_valid_tensor = torch.FloatTensor(X_valid)
y_valid_tensor = torch.FloatTensor(y_valid)

valid_data = []
for i in tqdm(range(len(X_valid_tensor))):
    valid_data.append([X_valid_tensor[i], y_valid_tensor[i]])
valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, shuffle=False, pin_memory=True)

0it [00:00, ?it/s]


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [153]:
state_size = 32
input_size = X_train.shape[2]
output_size = 1

AttributeError: 'list' object has no attribute 'shape'

In [154]:
cuda = torch.cuda.is_available() # False
if cuda:
    device = 'cuda:0'
else:
    device = 'cpu'
cuda, device

(True, 'cuda:0')

In [155]:
path = './models/LSTM_{}input_{}state_{}output_{}window_TotalReturn_LSTMs'.format(input_size, state_size, output_size, window_size)
print(path)

num_epoch = int(1e4)
lr = 1e-4
weight_decay = 1e-8
criterion = nn.MSELoss()

NameError: name 'input_size' is not defined

In [156]:
class Model(nn.Module):
    
    def __init__(self, input_size, state_size, output_size):
        super(Model, self).__init__()
        
        self.input_size = input_size
        self.state_size = state_size
        self.output_size = output_size
        self.device = device
        
        self.lstm = nn.LSTM(self.input_size, self.state_size, num_layers=2, batch_first=True)
        self.out = nn.Linear(self.state_size, self.output_size)
        
    def forward(self, x):
        ## x: (batch, sequence_len, input_size)
        
        out, _ = self.lstm(x)

        pred1 = self.out(out1[0]) 
        pred2 = self.out(out2[0]) 
        pred3 = self.out(out3[0])
        ## pred: (batch, 1)

        return pred1, pred2, pred3