In [1]:
# import warnings
# warnings.filterwarnings('ignore')

import os
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Technical Analysis
from talib import RSI, BBANDS, ATR, NATR, MACD

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# neural network
from tqdm import tqdm
import torch
import torch.nn as nn
  # our model
from LSTM import LSTM

# Data Preprocessing

In [2]:
# def read_tickers_sp500(file_path):
#     with open(file_path, 'r') as file:
#         tickers_sp500 = [line.strip() for line in file]
#     return tickers_sp500

# tickers_sp500 = read_tickers_sp500('tickers_sp500.txt')
# print(tickers_sp500)

In [3]:
# Download data

# Stocks list
# top 7 MarketCap in S&P500(^GSPC)
# tickers = ['^GSPC', 'AAPL', 'MSFT', 'AMZN', 'NVDA', 'GOOGL', 'TSLA', 'META']
tickers = ['AAPL', 'MSFT', 'AMZN', 'NVDA', 'GOOGL', 'TSLA', 'META', 'AMD', 'INTC', 'NFLX', 'PYPL', 'ASML']
# tickers = tickers_sp500
start_date = None
end_date = None

df_prices_download = yf.download(tickers=tickers, start=start_date, end=end_date, group_by='ticker')

[*********************100%%**********************]  12 of 12 completed


In [4]:
# Format into large table
# col: OHLCV
# rows(multi-index): Ticker, Date

df_prices = df_prices_download.stack(level=0, dropna=False)
df_prices = df_prices.swaplevel(0, 1)
df_prices = df_prices.loc[tickers].sort_index(level='Ticker')
df_prices.dropna(inplace=True)

# Use 'Adj Close' instead of 'Close'
df_prices.drop('Close', axis=1, inplace=True)
df_prices.rename(columns={'Adj Close': 'Close'}, inplace=True)

df_prices

  df_prices = df_prices_download.stack(level=0, dropna=False)


Unnamed: 0_level_0,Price,Open,High,Low,Close,Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,1980-12-12,0.128348,0.128906,0.128348,0.099192,469033600.0
AAPL,1980-12-15,0.122210,0.122210,0.121652,0.094017,175884800.0
AAPL,1980-12-16,0.113281,0.113281,0.112723,0.087117,105728000.0
AAPL,1980-12-17,0.115513,0.116071,0.115513,0.089273,86441600.0
AAPL,1980-12-18,0.118862,0.119420,0.118862,0.091861,73449600.0
...,...,...,...,...,...,...
TSLA,2024-05-06,183.800003,187.559998,182.199997,184.759995,84390300.0
TSLA,2024-05-07,182.399994,183.259995,177.399994,177.809998,75045900.0
TSLA,2024-05-08,171.589996,176.059998,170.149994,174.720001,79969500.0
TSLA,2024-05-09,175.009995,175.619995,171.369995,171.970001,65701300.0


### Remove stocks with less than 10 years of data

In [5]:
min_obs_days = 10 * 252 # NYSE and NASDAQ average 252 trading days a year
num_obs = df_prices.groupby(level='Ticker').size()

tickers_to_drop = num_obs[num_obs < min_obs_days].index
print(f'Amount tickers_to_drop: {tickers_to_drop.size}')
print(f'tickers_to_drop: {tickers_to_drop}')

df_prices = df_prices.drop(tickers_to_drop, level='Ticker')

Amount tickers_to_drop: 1
tickers_to_drop: Index(['PYPL'], dtype='object', name='Ticker')


# Feature Engineering

### RSI - Relative Strength Index
RSI compares the magnitude of recent price changes across stocks to identify stocks as overbought or oversold.

In [6]:
rsi = df_prices.groupby(level='Ticker', group_keys=False).Close.apply(RSI)
df_prices['RSI'] = rsi

### Bollinger Bands
Bollinger Bands is a technical analysis tool used to determine where prices are high and low relative to each other.

In [7]:
def compute_bb(close):
    high, mid, low = BBANDS(np.log1p(close), timeperiod=20)
    return pd.DataFrame({'BB_High': high,
                         'BB_Mid': mid, # SMA20
                         'BB_Low': low},
                        index=close.index)

bbands = df_prices.groupby(level='Ticker', group_keys=False).Close.apply(compute_bb)
df_prices = pd.concat([df_prices, bbands], axis=1)

### ATR - Average True Range
The average true range (ATR) indicator shows the volatility of the market.

In [8]:
by_ticker = df_prices.groupby('Ticker', group_keys=False)

def compute_atr(stock_data):
    atr = ATR(stock_data.High,
              stock_data.Low,
              stock_data.Close,
              timeperiod=14)
    return atr.sub(atr.mean()).div(atr.std())

df_prices['ATR'] = by_ticker.apply(compute_atr)
# Normalized Average True Range (NATR)
df_prices['NATR'] = by_ticker.apply(lambda x: NATR(high=x.High, low=x.Low, close=x.Close))

### MACD - Moving Average Convergence/Divergence

In [9]:
def compute_macd(close):
    macd = MACD(close)[0]
    return macd.sub(macd.mean()).div(macd.std())

df_prices['MACD'] = df_prices.groupby(level='Ticker', group_keys=False).Close.apply(compute_macd)

## Determine Investment Universe

### Dollar Volume

In [10]:
# Close: USD
# Volumn: Amount
df_prices['Dollar_Volume'] = (df_prices.loc[:, 'Close']
                           .mul(df_prices.loc[:, 'Volume'], axis=0))

df_prices.Dollar_Volume /= 1e6 # Dollar_Volume: Million USD

### Resample Monthly Freq.
- drop OHL_V
- use Dollar_Volumn instead of Volume

In [11]:
remian_cols = [c for c in df_prices.columns.unique(0) if c not in ['Dollar_Volume', 'Volume', 'Open', 'High', 'Low']]

# New data frame: 'data' - load to model
data = (
    pd.concat(
        [
        # avg(1M) Dollar_Volume
            df_prices.unstack("Ticker")
            .Dollar_Volume.resample('M')
            .mean()
            .stack("Ticker")
            .to_frame("Dollar_Volume"),
        # (Adj)Close & Technical Indicators
            df_prices.unstack("Ticker")[remian_cols]
            .resample('M')
            .last()
            .stack("Ticker")
        ],
        axis=1
    )
    .swaplevel()
    .sort_index(level='Ticker')
    .dropna()
)

data.info()

  df_prices.unstack("Ticker")
  df_prices.unstack("Ticker")[remian_cols]


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3820 entries, ('AAPL', Timestamp('1981-01-31 00:00:00')) to ('TSLA', Timestamp('2024-05-31 00:00:00'))
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Dollar_Volume  3820 non-null   float64
 1   Close          3820 non-null   float64
 2   RSI            3820 non-null   float64
 3   BB_High        3820 non-null   float64
 4   BB_Mid         3820 non-null   float64
 5   BB_Low         3820 non-null   float64
 6   ATR            3820 non-null   float64
 7   NATR           3820 non-null   float64
 8   MACD           3820 non-null   float64
dtypes: float64(9)
memory usage: 300.6+ KB


  df_prices.unstack("Ticker")[remian_cols]


In [12]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Dollar_Volume,Close,RSI,BB_High,BB_Mid,BB_Low,ATR,NATR,MACD
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAPL,1981-01-31,3.151991,0.097467,39.642862,0.110904,0.103202,0.095500,-0.734545,32.588143,-0.122072
AAPL,1981-02-28,1.541886,0.091430,46.976888,0.096494,0.087478,0.078462,-0.736967,30.902431,-0.126576
AAPL,1981-03-31,2.610709,0.084529,44.668320,0.091408,0.081691,0.071973,-0.738274,31.185326,-0.122769
AAPL,1981-04-30,2.409549,0.097898,57.303034,0.097653,0.090244,0.082834,-0.736762,29.164623,-0.119101
AAPL,1981-05-31,3.168914,0.114286,75.524108,0.108313,0.095319,0.082326,-0.734781,27.493025,-0.116530
...,...,...,...,...,...,...,...,...,...,...
TSLA,2024-01-31,23894.259794,187.289993,26.168632,5.548875,5.369038,5.189201,0.981318,4.728262,-2.580594
TSLA,2024-02-29,19463.383858,201.880005,52.255213,5.331063,5.266266,5.201469,0.791999,3.886996,-0.437606
TSLA,2024-03-31,16726.508798,175.789993,45.271376,5.266777,5.176023,5.085268,0.739286,4.304150,-0.901836
TSLA,2024-04-30,18861.231179,183.279999,58.014280,5.266724,5.111188,4.955652,1.225468,5.541325,-0.102560


### Select 10 most-traded equities
Select the 10 most-traded stocks based on a 5-year rolling average of dollar volume.

In [13]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Dollar_Volume,Close,RSI,BB_High,BB_Mid,BB_Low,ATR,NATR,MACD
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAPL,1981-01-31,3.151991,0.097467,39.642862,0.110904,0.103202,0.095500,-0.734545,32.588143,-0.122072
AAPL,1981-02-28,1.541886,0.091430,46.976888,0.096494,0.087478,0.078462,-0.736967,30.902431,-0.126576
AAPL,1981-03-31,2.610709,0.084529,44.668320,0.091408,0.081691,0.071973,-0.738274,31.185326,-0.122769
AAPL,1981-04-30,2.409549,0.097898,57.303034,0.097653,0.090244,0.082834,-0.736762,29.164623,-0.119101
AAPL,1981-05-31,3.168914,0.114286,75.524108,0.108313,0.095319,0.082326,-0.734781,27.493025,-0.116530
...,...,...,...,...,...,...,...,...,...,...
TSLA,2024-01-31,23894.259794,187.289993,26.168632,5.548875,5.369038,5.189201,0.981318,4.728262,-2.580594
TSLA,2024-02-29,19463.383858,201.880005,52.255213,5.331063,5.266266,5.201469,0.791999,3.886996,-0.437606
TSLA,2024-03-31,16726.508798,175.789993,45.271376,5.266777,5.176023,5.085268,0.739286,4.304150,-0.901836
TSLA,2024-04-30,18861.231179,183.279999,58.014280,5.266724,5.111188,4.955652,1.225468,5.541325,-0.102560


In [14]:
data['Dollar_Volume'] = (data.loc[:, 'Dollar_Volume']
                         .unstack('Ticker')
                         .rolling(window=5*12, min_periods=12)
                         .mean()
                         .stack()
                         .swaplevel())

data['Dollar_Volume_Rank'] = (data
                           .groupby('Date')
                           .Dollar_Volume
                           .rank(ascending=False))

data = data[data.Dollar_Volume_Rank < 10].drop(['Dollar_Volume', 'Dollar_Volume_Rank'], axis=1)

## Monthly Return

In [15]:
outlier_cutoff = 0.01 # winsorize returns at the [1%, 99%]
lags = [1, 3, 6, 12]
returns = []

for lag in lags:
    returns.append(data
                   .Close
                   .unstack('Ticker')
                   .sort_index()
                   .pct_change(lag)
                   .stack('Ticker')
                   .pipe(lambda x: x.clip(lower=x.quantile(outlier_cutoff),
                                          upper=x.quantile(1-outlier_cutoff)))
                   .add(1)
                   .pow(1/lag)
                   .sub(1)
                   .to_frame(f'Return_{lag}m')
                   )

df_returns = pd.concat(returns, axis=1).swaplevel().sort_index(level='Ticker')
df_returns.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3666 entries, ('AAPL', Timestamp('1982-01-31 00:00:00')) to ('TSLA', Timestamp('2024-05-31 00:00:00'))
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Return_1m   3666 non-null   float64
 1   Return_3m   3644 non-null   float64
 2   Return_6m   3611 non-null   float64
 3   Return_12m  3545 non-null   float64
dtypes: float64(4)
memory usage: 146.0+ KB


  returns.append(data
  returns.append(data
  returns.append(data
  returns.append(data


In [16]:
# merge returns -> data
# drop 'Close', use 'Returns' instead
data = data.join(df_returns).drop('Close', axis=1).dropna()
data

Unnamed: 0_level_0,Unnamed: 1_level_0,RSI,BB_High,BB_Mid,BB_Low,ATR,NATR,MACD,Return_1m,Return_3m,Return_6m,Return_12m
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AAPL,1982-12-31,47.403443,0.111810,0.101131,0.090452,-0.733044,32.924823,-0.120462,-0.062741,0.178553,0.152478,0.025343
AAPL,1983-01-31,72.314662,0.136634,0.110507,0.084379,-0.728839,28.384148,-0.111827,0.368198,0.172243,0.177537,0.059733
AAPL,1983-02-28,60.740668,0.154891,0.143017,0.131143,-0.723543,30.303026,-0.111723,0.116206,0.126984,0.167675,0.079349
AAPL,1983-03-31,46.380966,0.143759,0.138385,0.133011,-0.723940,32.329294,-0.121576,-0.073972,0.122464,0.150167,0.079481
AAPL,1983-04-30,64.387237,0.172024,0.146498,0.120972,-0.720760,29.691712,-0.113907,0.195264,0.073027,0.121539,0.108005
...,...,...,...,...,...,...,...,...,...,...,...,...
TSLA,2024-01-31,26.168632,5.548875,5.369038,5.189201,0.981318,4.728262,-2.580594,-0.246257,-0.023014,-0.057639,0.006529
TSLA,2024-02-29,52.255213,5.331063,5.266266,5.201469,0.791999,3.886996,-0.437606,0.077901,-0.056129,-0.040106,-0.001565
TSLA,2024-03-31,45.271376,5.266777,5.176023,5.085268,0.739286,4.304150,-0.901836,-0.129235,-0.108952,-0.057144,-0.013709
TSLA,2024-04-30,58.014280,5.266724,5.111188,4.955652,1.225468,5.541325,-0.102560,0.042608,-0.007188,-0.015133,0.009147


## Price Momentum
This factor computes the total return for a given number of prior trading days d.

In [17]:
for lag in [3, 6, 12]:
    data[f'Momentum_{lag}'] = data[f'Return_{lag}m'].sub(data.Return_1m) # 3Xm - 1m
    if lag > 3:
        data[f'Momentum_3_{lag}'] = data[f'Return_{lag}m'].sub(data.Return_3m) # 6Xm - 3m

In [18]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,RSI,BB_High,BB_Mid,BB_Low,ATR,NATR,MACD,Return_1m,Return_3m,Return_6m,Return_12m,Momentum_3,Momentum_6,Momentum_3_6,Momentum_12,Momentum_3_12
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AAPL,1982-12-31,47.403443,0.111810,0.101131,0.090452,-0.733044,32.924823,-0.120462,-0.062741,0.178553,0.152478,0.025343,0.241294,0.215219,-0.026075,0.088083,-0.153211
AAPL,1983-01-31,72.314662,0.136634,0.110507,0.084379,-0.728839,28.384148,-0.111827,0.368198,0.172243,0.177537,0.059733,-0.195955,-0.190661,0.005294,-0.308464,-0.112510
AAPL,1983-02-28,60.740668,0.154891,0.143017,0.131143,-0.723543,30.303026,-0.111723,0.116206,0.126984,0.167675,0.079349,0.010778,0.051468,0.040691,-0.036858,-0.047635
AAPL,1983-03-31,46.380966,0.143759,0.138385,0.133011,-0.723940,32.329294,-0.121576,-0.073972,0.122464,0.150167,0.079481,0.196436,0.224139,0.027703,0.153454,-0.042983
AAPL,1983-04-30,64.387237,0.172024,0.146498,0.120972,-0.720760,29.691712,-0.113907,0.195264,0.073027,0.121539,0.108005,-0.122237,-0.073725,0.048511,-0.087259,0.034978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TSLA,2024-01-31,26.168632,5.548875,5.369038,5.189201,0.981318,4.728262,-2.580594,-0.246257,-0.023014,-0.057639,0.006529,0.223243,0.188618,-0.034624,0.252786,0.029544
TSLA,2024-02-29,52.255213,5.331063,5.266266,5.201469,0.791999,3.886996,-0.437606,0.077901,-0.056129,-0.040106,-0.001565,-0.134030,-0.118007,0.016023,-0.079466,0.054565
TSLA,2024-03-31,45.271376,5.266777,5.176023,5.085268,0.739286,4.304150,-0.901836,-0.129235,-0.108952,-0.057144,-0.013709,0.020283,0.072091,0.051808,0.115526,0.095243
TSLA,2024-04-30,58.014280,5.266724,5.111188,4.955652,1.225468,5.541325,-0.102560,0.042608,-0.007188,-0.015133,0.009147,-0.049796,-0.057741,-0.007945,-0.033461,0.016335


## Date Indicators

In [19]:
dates = data.index.get_level_values('Date')
data['Year'] = dates.year
data['Month'] = dates.month

## Target: Holding Period Returns
1 month target holding period\
= to predict return in next 1 month

In [20]:
data['target'] = data.groupby(level='Ticker')['Return_1m'].shift(-1)
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3270 entries, ('AAPL', Timestamp('1982-12-31 00:00:00')) to ('TSLA', Timestamp('2024-04-30 00:00:00'))
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   RSI            3270 non-null   float64
 1   BB_High        3270 non-null   float64
 2   BB_Mid         3270 non-null   float64
 3   BB_Low         3270 non-null   float64
 4   ATR            3270 non-null   float64
 5   NATR           3270 non-null   float64
 6   MACD           3270 non-null   float64
 7   Return_1m      3270 non-null   float64
 8   Return_3m      3270 non-null   float64
 9   Return_6m      3270 non-null   float64
 10  Return_12m     3270 non-null   float64
 11  Momentum_3     3270 non-null   float64
 12  Momentum_6     3270 non-null   float64
 13  Momentum_3_6   3270 non-null   float64
 14  Momentum_12    3270 non-null   float64
 15  Momentum_3_12  3270 non-null   float64
 16  Year           32

In [21]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,RSI,BB_High,BB_Mid,BB_Low,ATR,NATR,MACD,Return_1m,Return_3m,Return_6m,Return_12m,Momentum_3,Momentum_6,Momentum_3_6,Momentum_12,Momentum_3_12,Year,Month,target
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AAPL,1982-12-31,47.403443,0.111810,0.101131,0.090452,-0.733044,32.924823,-0.120462,-0.062741,0.178553,0.152478,0.025343,0.241294,0.215219,-0.026075,0.088083,-0.153211,1982,12,0.368198
AAPL,1983-01-31,72.314662,0.136634,0.110507,0.084379,-0.728839,28.384148,-0.111827,0.368198,0.172243,0.177537,0.059733,-0.195955,-0.190661,0.005294,-0.308464,-0.112510,1983,1,0.116206
AAPL,1983-02-28,60.740668,0.154891,0.143017,0.131143,-0.723543,30.303026,-0.111723,0.116206,0.126984,0.167675,0.079349,0.010778,0.051468,0.040691,-0.036858,-0.047635,1983,2,-0.073972
AAPL,1983-03-31,46.380966,0.143759,0.138385,0.133011,-0.723940,32.329294,-0.121576,-0.073972,0.122464,0.150167,0.079481,0.196436,0.224139,0.027703,0.153454,-0.042983,1983,3,0.195264
AAPL,1983-04-30,64.387237,0.172024,0.146498,0.120972,-0.720760,29.691712,-0.113907,0.195264,0.073027,0.121539,0.108005,-0.122237,-0.073725,0.048511,-0.087259,0.034978,1983,4,0.143569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TSLA,2023-12-31,52.070118,5.575414,5.513506,5.451598,0.961960,3.522393,0.848762,0.034988,-0.002323,-0.008646,0.060220,-0.037312,-0.043635,-0.006323,0.025232,0.062543,2023,12,-0.246257
TSLA,2024-01-31,26.168632,5.548875,5.369038,5.189201,0.981318,4.728262,-2.580594,-0.246257,-0.023014,-0.057639,0.006529,0.223243,0.188618,-0.034624,0.252786,0.029544,2024,1,0.077901
TSLA,2024-02-29,52.255213,5.331063,5.266266,5.201469,0.791999,3.886996,-0.437606,0.077901,-0.056129,-0.040106,-0.001565,-0.134030,-0.118007,0.016023,-0.079466,0.054565,2024,2,-0.129235
TSLA,2024-03-31,45.271376,5.266777,5.176023,5.085268,0.739286,4.304150,-0.901836,-0.129235,-0.108952,-0.057144,-0.013709,0.020283,0.072091,0.051808,0.115526,0.095243,2024,3,0.042608


# Model

## Train-Test Split

In [22]:
X = data.drop('target', axis=1)
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

## Train Model

In [23]:
if torch.cuda.is_available():
    # Nvidia CUDA
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    # Apple Metal
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f'Device: {device}')

Device: mps


In [24]:
naive_model = None
linear_model = LinearRegression()
# logistic_model = LogisticRegression()
svr_model = SVR()
rfr_model = RandomForestRegressor()
lstm_model = LSTM()

models = {
    'NaiveForecast': naive_model,
    'LinearRegression': linear_model,
    # 'LogisticRegression': logistic_model,
    'SVR': svr_model,
    'RFR': rfr_model,
    'LSTM': lstm_model
}

In [25]:
results = {}

for model_name, model in models.items():
    print("Training: ", model_name)

    if model_name == 'NaiveForecast':
        y_pred_test = y_test.shift(1).fillna(y_test.iloc[0]) # Backward Filling
        y_pred_train = y_train.shift(1).fillna(y_train.iloc[0])
    elif model_name == 'LSTM':
        X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

        train_data = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)

        loss_function = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        epochs = 30

        model.to(device)
        model.train()

        for i in tqdm(range(epochs), desc='LSTM'):
            total_loss = 0
            for batch_X, batch_y in train_loader:
                batch_X = batch_X[:,:,None].to(device)
                batch_y = batch_y[:,None].to(device)

                optimizer.zero_grad()
                model.reset_hidden_state()
                y_pred = model(batch_X)

                loss = loss_function(y_pred, batch_y)
                loss.backward()
                optimizer.step()

                total_loss += loss.to(torch.device('cpu')).item()

        model.eval()

        with torch.no_grad():
            X_test_tensor = X_test_tensor[:,:,None].to(device)
            y_pred_test = model(X_test_tensor)
            y_pred_test = y_pred_test.to(torch.device('cpu')).numpy().reshape(-1)
    else:
        if model_name == 'LogisticRegression':
            y_train_bool = (y_train > 0) # Continuos -> Discrete(bool)
            model.fit(X_train, y_train_bool)
        else:
            model.fit(X_train, y_train)

        y_pred_test = model.predict(X_test)
        y_pred_train = model.predict(X_train)

    mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
    mse = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
    rmse = np.sqrt(mse)
    direction = ( np.mean(np.sign(y_pred_test) == np.sign(y_test)) )

    y_pred_test = pd.Series(y_pred_test, index=y_test.index).sort_index()
    y_pred_train = pd.Series(y_pred_train, index=y_train.index).sort_index()
    y_predicted = pd.concat([y_pred_test, y_pred_train]).sort_index()

    results[model_name] = {
        'model': model,
        'y_pred_test': y_pred_test,
        'y_pred_train': y_pred_train,
        'y_predicted': y_predicted,
        'mae': mae,
        'mse': mse,
        'rmse': rmse,
        'direction': direction
    }

Training:  NaiveForecast
Training:  LinearRegression
Training:  SVR
Training:  RFR
Training:  LSTM


LSTM: 100%|██████████| 30/30 [00:33<00:00,  1.11s/it]


In [26]:
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"MAE: {metrics['mae']:.4f}")
    print(f"MSE: {metrics['mse']:.4f}")
    print(f"RMSE: {metrics['rmse']:.4f}")
    print(f"Direction: {metrics['direction']:.4f}")
    print()

Model: NaiveForecast
MAE: 0.1379
MSE: 0.0319
RMSE: 0.1786
Direction: 0.5229

Model: LinearRegression
MAE: 0.0964
MSE: 0.0164
RMSE: 0.1280
Direction: 0.5459

Model: SVR
MAE: 0.0968
MSE: 0.0167
RMSE: 0.1293
Direction: 0.5382

Model: RFR
MAE: 0.0955
MSE: 0.0160
RMSE: 0.1266
Direction: 0.5719

Model: LSTM
MAE: 0.1127
MSE: 0.0203
RMSE: 0.1424
Direction: 0.5382



In [27]:
df_predicted = pd.DataFrame()

for model_name, _ in results.items():
    df_predicted = pd.concat([df_predicted, results[model_name]['y_predicted'].rename(f'Predicted_{model_name}')], axis=1).sort_index()

df_predicted = df_predicted.rename_axis(index=['Ticker', 'Date'])
df_predicted.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3270 entries, ('AAPL', Timestamp('1982-12-31 00:00:00')) to ('TSLA', Timestamp('2024-04-30 00:00:00'))
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Predicted_NaiveForecast     3270 non-null   float64
 1   Predicted_LinearRegression  3270 non-null   float64
 2   Predicted_SVR               3270 non-null   float64
 3   Predicted_RFR               3270 non-null   float64
 4   Predicted_LSTM              3270 non-null   float64
dtypes: float64(5)
memory usage: 158.1+ KB


In [28]:
df_predicted

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted_NaiveForecast,Predicted_LinearRegression,Predicted_SVR,Predicted_RFR,Predicted_LSTM
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,1982-12-31,0.024705,0.036095,0.022606,0.137131,0.085715
AAPL,1983-01-31,0.077554,0.029296,0.023887,0.096602,0.096602
AAPL,1983-02-28,0.091157,0.016557,0.023075,0.065281,0.078616
AAPL,1983-03-31,0.285809,0.016942,0.022062,0.143245,0.143245
AAPL,1983-04-30,0.017443,0.020601,0.023461,0.081468,0.081468
...,...,...,...,...,...,...
TSLA,2023-12-31,0.017903,0.033372,0.024390,-0.136265,-0.136265
TSLA,2024-01-31,0.068103,0.030556,0.021960,0.075873,0.075873
TSLA,2024-02-29,-0.067006,0.017661,0.023927,0.015584,0.078958
TSLA,2024-03-31,0.325011,0.004024,0.023485,0.010217,0.079798


In [29]:
# Combine 'df_predicted' to 'data'
data = data.join(df_predicted)
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3270 entries, ('AAPL', Timestamp('1982-12-31 00:00:00')) to ('TSLA', Timestamp('2024-04-30 00:00:00'))
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RSI                         3270 non-null   float64
 1   BB_High                     3270 non-null   float64
 2   BB_Mid                      3270 non-null   float64
 3   BB_Low                      3270 non-null   float64
 4   ATR                         3270 non-null   float64
 5   NATR                        3270 non-null   float64
 6   MACD                        3270 non-null   float64
 7   Return_1m                   3270 non-null   float64
 8   Return_3m                   3270 non-null   float64
 9   Return_6m                   3270 non-null   float64
 10  Return_12m                  3270 non-null   float64
 11  Momentum_3                  3270 non-null   float64
 12  Momentum_6 

# Backtesting

## Vectorized Backtest

In [30]:
fwd_returns = data['Return_1m'].unstack('Ticker').sort_index()

for model_name, _ in models.items():
    predictions = data[f'Predicted_{model_name}'].unstack('Ticker').sort_index()
    N_LONG = N_SHORT = 3

    long_signals = ((predictions
                    .where(predictions > 0)
                    .rank(axis=1, ascending=False) > N_LONG)
                    .astype(int))
    short_signals = ((predictions
                    .where(predictions < 0)
                    .rank(axis=1) > N_SHORT)
                    .astype(int))

    long_returns = long_signals.mul(fwd_returns).mean(axis=1)
    short_returns = short_signals.mul(-fwd_returns).mean(axis=1)
    strategy_returns = long_returns.add(short_returns).to_frame('Strategy')

    print(model_name)
    print(f"Cumulative Return: {strategy_returns['Strategy'].sum() * 100}")
    print('-----')

NaiveForecast
Cumulative Return: 166.53693899862418
-----
LinearRegression
Cumulative Return: 149.4683247093879
-----
SVR
Cumulative Return: -530.4097244194986
-----
RFR
Cumulative Return: 269.6087694031337
-----
LSTM
Cumulative Return: 317.8204019294703
-----
