In [1]:
# import warnings
# warnings.filterwarnings('ignore')

import copy
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xg
import yfinance as yf

# Technical Analysis
from talib import RSI, BBANDS, ATR, NATR, MACD

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV

# neural network
from tqdm import tqdm
import torch
import torch.nn as nn
# our model
from LSTM import LSTM

# Data Preprocessing

In [2]:
# def read_tickers_sp500(file_path):
#     with open(file_path, 'r') as file:
#         tickers_sp500 = [line.strip() for line in file]
#     return tickers_sp500

# tickers_sp500 = read_tickers_sp500('tickers_sp500.txt')
# print(tickers_sp500)

In [3]:
# Download price history from Yahoo Finance.

# Universe: the S&P 500 index (^GSPC) plus a basket of large-cap stocks.
# Earlier variants used only the top-7 S&P 500 market caps, or the full
# constituent list (`tickers_sp500`).
tickers = [
    '^GSPC',
    'AAPL', 'MSFT', 'AMZN', 'NVDA', 'GOOGL', 'TSLA', 'META',
    'AMD', 'INTC', 'NFLX', 'PYPL', 'ASML',
]
start_date, end_date = '2014-05-01', '2024-05-01'

# group_by='ticker' puts the ticker on the outer column level, which the
# reshaping cell below relies on when it stacks level 0.
df_prices_download = yf.download(
    tickers=tickers,
    start=start_date,
    end=end_date,
    group_by='ticker',
)

[*********************100%%**********************]  13 of 13 completed


In [4]:
# Reshape the download into one long table:
#   columns            -> OHLCV fields
#   rows (MultiIndex)  -> (Ticker, Date)
df_prices = (
    df_prices_download
    .stack(level=0, dropna=False)      # move the ticker level from columns to rows
    .swaplevel(0, 1)                   # (Date, Ticker) -> (Ticker, Date)
    .loc[tickers]
    .sort_index(level='Ticker')
    .dropna()
    # Use 'Adj Close' in place of the raw 'Close'
    .drop(columns='Close')
    .rename(columns={'Adj Close': 'Close'})
)

df_prices

  df_prices = df_prices_download.stack(level=0, dropna=False)


Unnamed: 0_level_0,Price,Open,High,Low,Close,Volume
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2014-05-01,21.142857,21.242857,20.941429,18.581392,2.440480e+08
AAPL,2014-05-02,21.155001,21.221430,21.061071,18.615944,1.915144e+08
AAPL,2014-05-05,21.076429,21.464287,21.071428,18.879213,2.870672e+08
AAPL,2014-05-06,21.492857,21.586071,21.228930,18.673439,3.745644e+08
AAPL,2014-05-07,21.258928,21.331785,20.990356,18.608097,2.828644e+08
...,...,...,...,...,...,...
^GSPC,2024-04-24,5084.859863,5089.479980,5047.020020,5071.629883,3.656740e+09
^GSPC,2024-04-25,5019.879883,5057.750000,4990.580078,5048.419922,3.958050e+09
^GSPC,2024-04-26,5084.649902,5114.620117,5073.140137,5099.959961,3.604140e+09
^GSPC,2024-04-29,5114.129883,5123.490234,5088.649902,5116.169922,3.447450e+09


# Feature Engineering

### RSI - Relative Strength Index
RSI compares the magnitude of recent price changes across stocks to identify stocks as overbought or oversold.

In [5]:
# RSI is computed per ticker on the (adjusted) close, using talib's default
# look-back period.
close_by_ticker = df_prices.groupby(level='Ticker', group_keys=False)['Close']
rsi = close_by_ticker.apply(RSI)
df_prices['RSI'] = rsi

### Bollinger Bands
Bollinger Bands is a technical analysis tool used to determine where prices are high and low relative to each other.

In [6]:
def compute_bb(close):
    """Compute 20-period Bollinger Bands on the log-transformed close.

    Parameters
    ----------
    close : pandas.Series
        Close prices of a single ticker.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``close`` with columns ``BB_High``, ``BB_Mid`` (the
        20-period moving average, SMA20) and ``BB_Low``. The bands are in
        ``log1p(price)`` units, not in price units.
    """
    upper, middle, lower = BBANDS(np.log1p(close), timeperiod=20)
    bands = dict(zip(('BB_High', 'BB_Mid', 'BB_Low'), (upper, middle, lower)))
    return pd.DataFrame(bands, index=close.index)

bbands = df_prices.groupby(level='Ticker', group_keys=False).Close.apply(compute_bb)

# Drop any BB_* columns left by a previous run of this cell before attaching
# the fresh ones; a plain concat would otherwise append duplicate column
# labels every time the cell is re-executed.
df_prices = pd.concat(
    [df_prices.drop(columns=bbands.columns, errors='ignore'), bbands],
    axis=1,
)

### ATR - Average True Range
The average true range (ATR) indicator shows the volatility of the market.

In [7]:
by_ticker = df_prices.groupby('Ticker', group_keys=False)


def compute_atr(stock_data):
    """Return the 14-period ATR of one ticker as a z-score.

    Parameters
    ----------
    stock_data : pandas.DataFrame
        Rows of a single ticker with ``High``, ``Low`` and ``Close`` columns.

    Returns
    -------
    pandas.Series
        ATR minus its mean, divided by its standard deviation. Both statistics
        are taken over all rows passed in, i.e. the ticker's whole history.
        NOTE(review): that includes dates later than any given row, so the
        feature is not strictly point-in-time.
    """
    raw_atr = ATR(stock_data.High, stock_data.Low, stock_data.Close, timeperiod=14)
    return (raw_atr - raw_atr.mean()) / raw_atr.std()


df_prices['ATR'] = by_ticker.apply(compute_atr)

# Normalized Average True Range (NATR), talib default period
natr = by_ticker.apply(
    lambda frame: NATR(high=frame.High, low=frame.Low, close=frame.Close)
)
df_prices['NATR'] = natr

### MACD - Moving Average Convergence/Divergence

In [8]:
def compute_macd(close):
    """Return the z-scored MACD line of one ticker.

    Parameters
    ----------
    close : pandas.Series
        Close prices of a single ticker.

    Returns
    -------
    pandas.Series
        The first output of talib's ``MACD`` (default periods), centred on its
        mean and scaled by its standard deviation over the whole input series.
    """
    macd_line = MACD(close)[0]
    centred = macd_line - macd_line.mean()
    return centred / macd_line.std()


close_groups = df_prices.groupby(level='Ticker', group_keys=False)['Close']
df_prices['MACD'] = close_groups.apply(compute_macd)

## Determine Investment Universe

### Dollar Volume

In [9]:
# Close  : price in USD
# Volume : traded amount
# Their product is the traded value; dividing by 1e6 expresses it in
# millions of USD.
df_prices['Dollar_Volume'] = df_prices['Close'] * df_prices['Volume'] / 1e6

# Remove rows with any missing value (e.g. the indicator warm-up period).
# Done in place so that objects already holding a reference to df_prices
# (such as `by_ticker`) see the same cleaned frame.
df_prices.dropna(inplace=True)

In [10]:
# Columns carried into the model table: everything except the raw OHLV fields
# and Dollar_Volume, which is handled separately below.
excluded_cols = {'Dollar_Volume', 'Volume', 'Open', 'High', 'Low'}
remian_cols = [c for c in df_prices.columns.unique(0) if c not in excluded_cols]

# Wide layout: one row per date, ticker as the inner column level
prices_wide = df_prices.unstack("Ticker")

# Mean Dollar_Volume per calendar day
dollar_volume_daily = (
    prices_wide.Dollar_Volume
    .resample('D')
    .mean()
    .stack("Ticker")
    .to_frame("Dollar_Volume")
)

# Last (Adj)Close and technical indicators per calendar day
indicators_daily = (
    prices_wide[remian_cols]
    .resample('D')
    .last()
    .stack("Ticker")
)

# New data frame: 'data' - the table loaded into the models.
# The daily resample adds non-trading days as empty rows; dropna() removes them.
data = (
    pd.concat([dollar_volume_daily, indicators_daily], axis=1)
    .swaplevel()
    .sort_index(level='Ticker')
    .dropna()
)

data.info()

  df_prices.unstack("Ticker")[remian_cols]


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 31996 entries, ('AAPL', Timestamp('2014-06-18 00:00:00')) to ('^GSPC', Timestamp('2024-04-30 00:00:00'))
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Dollar_Volume  31996 non-null  float64
 1   Close          31996 non-null  float64
 2   RSI            31996 non-null  float64
 3   BB_High        31996 non-null  float64
 4   BB_Mid         31996 non-null  float64
 5   BB_Low         31996 non-null  float64
 6   ATR            31996 non-null  float64
 7   NATR           31996 non-null  float64
 8   MACD           31996 non-null  float64
dtypes: float64(9)
memory usage: 2.4+ MB


## Lagged Returns (daily timeframe)

In [11]:
outlier_cutoff = 0.01  # winsorize returns at the [1%, 99%] quantiles
# Look-back windows in trading days (a monthly variant used [1, 3, 6, 12]).
lags = [1, 5, 10, 21, 42, 63]

# One column per ticker, rows in date order, so pct_change(lag) looks back
# `lag` rows.
close_wide = data.Close.unstack('Ticker').sort_index()

returns = []
for lag in lags:
    lag_return = close_wide.pct_change(lag).stack('Ticker')

    # Clip extreme values at the pooled (all tickers, all dates) quantiles
    floor = lag_return.quantile(outlier_cutoff)
    ceiling = lag_return.quantile(1 - outlier_cutoff)
    clipped = lag_return.clip(lower=floor, upper=ceiling)

    # Convert the lag-period return into its per-day geometric average
    per_day = (clipped + 1) ** (1 / lag) - 1
    returns.append(per_day.to_frame(f'Return_{lag}d'))

df_returns = pd.concat(returns, axis=1).swaplevel().sort_index(level='Ticker')
df_returns.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 31983 entries, ('AAPL', Timestamp('2014-06-19 00:00:00')) to ('^GSPC', Timestamp('2024-04-30 00:00:00'))
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Return_1d   31983 non-null  float64
 1   Return_5d   31931 non-null  float64
 2   Return_10d  31866 non-null  float64
 3   Return_21d  31723 non-null  float64
 4   Return_42d  31450 non-null  float64
 5   Return_63d  31177 non-null  float64
dtypes: float64(6)
memory usage: 1.6+ MB


In [12]:
# Attach the lagged-return columns to the feature table, then drop the price
# level itself: the models work with returns instead of 'Close'.
features_with_returns = data.join(df_returns)
data = features_with_returns.drop(columns='Close').dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 31177 entries, ('AAPL', Timestamp('2014-09-17 00:00:00')) to ('^GSPC', Timestamp('2024-04-30 00:00:00'))
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Dollar_Volume  31177 non-null  float64
 1   RSI            31177 non-null  float64
 2   BB_High        31177 non-null  float64
 3   BB_Mid         31177 non-null  float64
 4   BB_Low         31177 non-null  float64
 5   ATR            31177 non-null  float64
 6   NATR           31177 non-null  float64
 7   MACD           31177 non-null  float64
 8   Return_1d      31177 non-null  float64
 9   Return_5d      31177 non-null  float64
 10  Return_10d     31177 non-null  float64
 11  Return_21d     31177 non-null  float64
 12  Return_42d     31177 non-null  float64
 13  Return_63d     31177 non-null  float64
dtypes: float64(14)
memory usage: 3.6+ MB


In [13]:
# Re-display the summary of the merged feature table (indicators + lagged returns).
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 31177 entries, ('AAPL', Timestamp('2014-09-17 00:00:00')) to ('^GSPC', Timestamp('2024-04-30 00:00:00'))
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Dollar_Volume  31177 non-null  float64
 1   RSI            31177 non-null  float64
 2   BB_High        31177 non-null  float64
 3   BB_Mid         31177 non-null  float64
 4   BB_Low         31177 non-null  float64
 5   ATR            31177 non-null  float64
 6   NATR           31177 non-null  float64
 7   MACD           31177 non-null  float64
 8   Return_1d      31177 non-null  float64
 9   Return_5d      31177 non-null  float64
 10  Return_10d     31177 non-null  float64
 11  Return_21d     31177 non-null  float64
 12  Return_42d     31177 non-null  float64
 13  Return_63d     31177 non-null  float64
dtypes: float64(14)
memory usage: 3.6+ MB


## Price Momentum
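This factor is based on the return over a given number of prior trading days d. Each momentum feature below is the difference between the per-day return over d days and the return over a shorter window (1 day or 5 days).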

In [17]:
# Momentum features on the day timeframe.
# (A month-based variant used lags [3, 6, 12] on Return_{lag}m columns,
#  measured against Return_1m and Return_3m.)
for lag in [5, 10, 21, 42, 63]:
    lag_return = data[f'Return_{lag}d']

    # d-day return relative to the 1-day return
    data[f'Momentum_{lag}'] = lag_return - data['Return_1d']

    # d-day return relative to the 5-day return (only defined for d > 5)
    if lag > 5:
        data[f'Momentum_5_{lag}'] = lag_return - data['Return_5d']

data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 31177 entries, ('AAPL', Timestamp('2014-09-17 00:00:00')) to ('^GSPC', Timestamp('2024-04-30 00:00:00'))
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Dollar_Volume  31177 non-null  float64
 1   RSI            31177 non-null  float64
 2   BB_High        31177 non-null  float64
 3   BB_Mid         31177 non-null  float64
 4   BB_Low         31177 non-null  float64
 5   ATR            31177 non-null  float64
 6   NATR           31177 non-null  float64
 7   MACD           31177 non-null  float64
 8   Return_1d      31177 non-null  float64
 9   Return_5d      31177 non-null  float64
 10  Return_10d     31177 non-null  float64
 11  Return_21d     31177 non-null  float64
 12  Return_42d     31177 non-null  float64
 13  Return_63d     31177 non-null  float64
 14  Momentum_5     31177 non-null  float64
 15  Momentum_10    31177 non-null  float64
 16  Momentum_5_10  

## Date Indicators

In [19]:
# Calendar features taken from the 'Date' level of the index
dates = data.index.get_level_values('Date')
data['Year'], data['Month'] = dates.year, dates.month

## Target: Holding Period Returns
1-day target holding period:\
the model predicts the return over the next trading day (tomorrow).

In [20]:
# Target = the next row's 1-day return of the same ticker. Shifting inside
# each ticker group keeps one ticker's returns from leaking into another's.
next_day_return = data.groupby(level='Ticker')['Return_1d'].shift(-1)

# The last row of every ticker has no following day and is dropped.
data = data.assign(target=next_day_return).dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 31164 entries, ('AAPL', Timestamp('2014-09-17 00:00:00')) to ('^GSPC', Timestamp('2024-04-29 00:00:00'))
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Dollar_Volume  31164 non-null  float64
 1   RSI            31164 non-null  float64
 2   BB_High        31164 non-null  float64
 3   BB_Mid         31164 non-null  float64
 4   BB_Low         31164 non-null  float64
 5   ATR            31164 non-null  float64
 6   NATR           31164 non-null  float64
 7   MACD           31164 non-null  float64
 8   Return_1d      31164 non-null  float64
 9   Return_5d      31164 non-null  float64
 10  Return_10d     31164 non-null  float64
 11  Return_21d     31164 non-null  float64
 12  Return_42d     31164 non-null  float64
 13  Return_63d     31164 non-null  float64
 14  Momentum_5     31164 non-null  float64
 15  Momentum_10    31164 non-null  float64
 16  Momentum_5_10  

## Save data to local

In [28]:
DATA_PATH = 'data'

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs(DATA_PATH, exist_ok=True)

# The (Ticker, Date) index is written as the leading columns of each file.
df_prices.to_csv(f'{DATA_PATH}/prices.csv', index=True)
df_returns.to_csv(f'{DATA_PATH}/returns.csv', index=True)
data.to_csv(f'{DATA_PATH}/data.csv', index=True)

In [23]:
# Summary of the price/indicator table that was written to prices.csv.
df_prices.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 31996 entries, ('AAPL', Timestamp('2014-06-18 00:00:00')) to ('^GSPC', Timestamp('2024-04-30 00:00:00'))
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Open           31996 non-null  float64
 1   High           31996 non-null  float64
 2   Low            31996 non-null  float64
 3   Close          31996 non-null  float64
 4   Volume         31996 non-null  float64
 5   RSI            31996 non-null  float64
 6   BB_High        31996 non-null  float64
 7   BB_Mid         31996 non-null  float64
 8   BB_Low         31996 non-null  float64
 9   ATR            31996 non-null  float64
 10  NATR           31996 non-null  float64
 11  MACD           31996 non-null  float64
 12  Dollar_Volume  31996 non-null  float64
dtypes: float64(13)
memory usage: 3.3+ MB


In [24]:
# Summary of the lagged-return table that was written to returns.csv.
df_returns.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 31983 entries, ('AAPL', Timestamp('2014-06-19 00:00:00')) to ('^GSPC', Timestamp('2024-04-30 00:00:00'))
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Return_1d   31983 non-null  float64
 1   Return_5d   31931 non-null  float64
 2   Return_10d  31866 non-null  float64
 3   Return_21d  31723 non-null  float64
 4   Return_42d  31450 non-null  float64
 5   Return_63d  31177 non-null  float64
dtypes: float64(6)
memory usage: 2.6+ MB


In [25]:
# Summary of the final model table (features + target) that was written to data.csv.
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 31164 entries, ('AAPL', Timestamp('2014-09-17 00:00:00')) to ('^GSPC', Timestamp('2024-04-29 00:00:00'))
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Dollar_Volume  31164 non-null  float64
 1   RSI            31164 non-null  float64
 2   BB_High        31164 non-null  float64
 3   BB_Mid         31164 non-null  float64
 4   BB_Low         31164 non-null  float64
 5   ATR            31164 non-null  float64
 6   NATR           31164 non-null  float64
 7   MACD           31164 non-null  float64
 8   Return_1d      31164 non-null  float64
 9   Return_5d      31164 non-null  float64
 10  Return_10d     31164 non-null  float64
 11  Return_21d     31164 non-null  float64
 12  Return_42d     31164 non-null  float64
 13  Return_63d     31164 non-null  float64
 14  Momentum_5     31164 non-null  float64
 15  Momentum_10    31164 non-null  float64
 16  Momentum_5_10  

# Model

In [None]:
# Simple Split

# X = data.drop('target', axis=1)
# y = data.target

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Ntest = 1000
# df_train = df_to_model.iloc[1:-Ntest]
# df_test = df_to_model.iloc[-Ntest:-1]

# X_train = df_train.drop(columns=['^GSPC_log_return'])
# X_test = df_test.drop(columns=['^GSPC_log_return'])

# y_train = df_train['^GSPC_log_return']
# y_test = df_test['^GSPC_log_return']

## Train-Test Split
**Strategy:** Time Series Cross-Validator:

A variation of KFold. In the kth split, it returns first k folds as train set and the (k+1)th fold as test set.


In [None]:
# Rows x columns available for each ticker
for ticker in data.index.unique('Ticker'):
    ticker_shape = data.loc[ticker].shape
    print(f"{ticker}  size: {ticker_shape}")

In [None]:
# split_data[ticker][part] is a list with one entry per CV fold, where
# part is one of 'X_train', 'X_test', 'y_train', 'y_test'.
split_data = dict()

tscv = TimeSeriesSplit(n_splits=3)
fold_keys = ('X_train', 'X_test', 'y_train', 'y_test')

for ticker in data.index.unique('Ticker'):
    ticker_frame = data.loc[ticker]
    X = ticker_frame.drop(columns=['target'])
    y = ticker_frame['target']

    ticker_data = {key: [] for key in fold_keys}
    for train_index, test_index in tscv.split(ticker_frame):
        fold_parts = (
            X.iloc[train_index, :],
            X.iloc[test_index, :],
            y.iloc[train_index],
            y.iloc[test_index],
        )
        for key, part in zip(fold_keys, fold_parts):
            ticker_data[key].append(part)

    split_data[ticker] = ticker_data

print(split_data.keys())
print(split_data['AAPL']["X_train"][0].shape, split_data['AAPL']["y_train"][0].shape,)

## Train Model

In [None]:
# Pick the compute backend: Nvidia CUDA first, then Apple Metal (MPS),
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    backend = "cuda"
elif torch.backends.mps.is_available():
    backend = "mps"
else:
    backend = "cpu"

device = torch.device(backend)

print(f'Device: {device}')

In [None]:
# Candidate models: NaiveForecast, LinearRegression, SVR, RFR, XGB, LSTM.
# 'NaiveForecast' has no estimator object (None); the training loop handles
# it as a special case. A LogisticRegression entry is currently disabled.
naive_model = None
linear_model = LinearRegression()
svr_model = SVR()
rfr_model = RandomForestRegressor()
xgb_model = xg.XGBRegressor()
lstm_model = LSTM()

# Display name -> model, in the order the models are trained and reported
models = dict(
    NaiveForecast=naive_model,
    LinearRegression=linear_model,
    SVR=svr_model,
    RFR=rfr_model,
    XGB=xgb_model,
    LSTM=lstm_model,
)

In [None]:
def _fit_predict_lstm(lstm, X_train, y_train, X_test, epochs=30, batch_size=32, lr=0.001):
    """Train `lstm` on one fold and return its test-set predictions.

    Parameters
    ----------
    lstm : torch.nn.Module
        Untrained network exposing ``reset_hidden_state()``; trained in place.
    X_train, X_test : pandas.DataFrame
        Feature tables of the fold.
    y_train : pandas.Series
        Training target.
    epochs, batch_size, lr : training hyper-parameters (Adam optimiser, MSE loss).

    Returns
    -------
    numpy.ndarray
        1-D array with one prediction per row of ``X_test``.
    """
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

    train_data = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

    loss_function = nn.MSELoss()
    optimizer = torch.optim.Adam(lstm.parameters(), lr=lr)

    lstm.to(device)
    lstm.train()

    for _ in tqdm(range(epochs), desc='LSTM'):
        for batch_X, batch_y in train_loader:
            # Add a trailing axis: (batch, n_features) -> (batch, n_features, 1).
            # NOTE(review): presumably the network reads the features as a
            # sequence of scalar steps - confirm against LSTM.py.
            batch_X = batch_X[:, :, None].to(device)
            batch_y = batch_y[:, None].to(device)

            optimizer.zero_grad()
            lstm.reset_hidden_state()
            loss = loss_function(lstm(batch_X), batch_y)
            loss.backward()
            optimizer.step()

    lstm.eval()
    with torch.no_grad():
        # Same reset as before every training batch, so the state left by the
        # last training batch is not reused for the test pass.
        lstm.reset_hidden_state()
        y_pred = lstm(X_test_tensor[:, :, None].to(device))

    return y_pred.to(torch.device('cpu')).numpy().reshape(-1)


# results[ticker][model name] -> metrics, predictions and fitted model of the
# fold with the lowest test MSE.
results = dict()

for ticker in data.index.unique('Ticker'):

    print("Training Ticker:", ticker)

    results[ticker] = dict()
    folds = split_data[ticker]

    # `fold_idx` instead of `iter`, which would shadow the builtin for the
    # rest of the notebook.
    for fold_idx in range(len(folds['X_train'])):
        X_train = folds['X_train'][fold_idx]
        X_test = folds['X_test'][fold_idx]
        y_train = folds['y_train'][fold_idx]
        y_test = folds['y_test'][fold_idx]

        for name, template in models.items():

            # Fit a fresh copy of the untrained template for every
            # (ticker, fold). Reusing one shared instance would (a) make every
            # stored 'model' point at the most recently fitted object and
            # (b) let the LSTM keep weights learned on other tickers/folds.
            model = copy.deepcopy(template)

            if name == 'NaiveForecast':
                # Tomorrow's forecast = today's value. The first test day has
                # no predecessor inside the test window, so it is seeded with
                # the last training value rather than with the value being
                # predicted.
                y_pred_test = y_test.shift(1).fillna(y_train.iloc[-1])
            elif name == 'LSTM':
                y_pred_test = _fit_predict_lstm(model, X_train, y_train, X_test)
            else:
                model.fit(X_train, y_train)
                y_pred_test = model.predict(X_test)

            mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
            mse = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
            rmse = np.sqrt(mse)
            # Share of days on which the predicted sign matches the actual sign
            direction = np.mean(np.sign(y_pred_test) == np.sign(y_test))

            best_so_far = results[ticker].get(name)
            if best_so_far is None or mse < best_so_far['mse']:

                results[ticker][name] = {
                    'iter': fold_idx,
                    'model': model,
                    'y_test': y_test,
                    'y_pred_test': y_pred_test,
                    'mae': mae,
                    'mse': mse,
                    'rmse': rmse,
                    'direction': direction
                }

In [None]:
for ticker in results.keys():

    # Only the index is reported, to prevent too much printing
    if ticker == "^GSPC":

        for name, metrics in results[ticker].items():
            summary_lines = [
                f"Ticker: {ticker}",
                f"Model: {name} (Best Iteration: {metrics['iter']})",
                f"MAE: {metrics['mae']:.4f}",
                f"MSE: {metrics['mse']:.4f}",
                f"RMSE: {metrics['rmse']:.4f}",
                f"Direction: {metrics['direction']:.4f}",
            ]
            print("\n".join(summary_lines))

            # Actual vs. predicted target over the best fold's test window
            test_dates = metrics['y_test'].index
            fig, ax = plt.subplots(figsize=(21, 9))
            ax.plot(test_dates, metrics['y_test'], label='Actual', marker='o')
            ax.plot(test_dates, metrics['y_pred_test'], label='Predicted', marker='x')
            ax.set_xlabel('Date')
            ax.set_ylabel('target')
            ax.set_title(f'Actual vs Predicted {ticker} ({name})')
            ax.legend()
            plt.show()

# Hyper Parameter Tuning
**Strategy**: Grid Search CV

Exhaustive search over specified parameter values for an estimator.

In [None]:
# Estimators that go through the grid search. The naive forecast, the
# (disabled) logistic regression and the LSTM are not tuned here.
tuned_linear_model = LinearRegression()
tuned_svr_model = SVR()
tuned_rfr_model = RandomForestRegressor()
tuned_xgb_model = xg.XGBRegressor()

# Display name -> base estimator handed to GridSearchCV
tuned_models = dict(
    tuned_LinearRegression=tuned_linear_model,
    tuned_SVR=tuned_svr_model,
    tuned_RFR=tuned_rfr_model,
    tuned_XGB=tuned_xgb_model,
)

In [None]:
# Folder for the persisted searches; joblib.dump does not create it.
TUNED_MODEL_DIR = "./models/tuned"
os.makedirs(TUNED_MODEL_DIR, exist_ok=True)

# Hyper-parameter grid per model name
param_grids = {
    'tuned_LinearRegression': {
        'fit_intercept': [True, False],
    },
    'tuned_SVR': {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': np.arange(1, 5),
        # 'gamma': ['scale', 'auto'],
        'epsilon': [0.3, 0.5, 0.7, 1],
        'coef0': [0, 0.01, 0.1],
    },
    'tuned_RFR': {
        # 'n_estimators': [100, 200, 300],
        # 'criterion': ['squared_error', 'friedman_mse'],
        # 'min_samples_split': np.arange(2, 5),
        'min_samples_leaf': np.arange(1, 10, 2),
        'min_impurity_decrease': [0.0, 0.1, 0.5],
        'warm_start': [False, True],
    },
    'tuned_XGB': {
        'eta': [0.01, 0.05, 0.1, 0.2],
        'subsample': [0.5, 0.7, 0.9],
    },
}

# Inner validation splits that respect time order. GridSearchCV's default
# K-fold would score candidates on periods that precede part of their
# training data.
inner_cv = TimeSeriesSplit(n_splits=3)

# tuned_results[ticker][model name] -> metrics, predictions and fitted search
# of the fold with the lowest test MSE.
tuned_results = dict()

for ticker in data.index.unique('Ticker'):

    print("Training Ticker:", ticker)

    tuned_results[ticker] = dict()
    folds = split_data[ticker]

    # `fold_idx` instead of `iter`, which would shadow the builtin.
    for fold_idx in range(len(folds['X_train'])):
        print("  Iteration:", fold_idx)
        X_train = folds['X_train'][fold_idx]
        X_test = folds['X_test'][fold_idx]
        y_train = folds['y_train'][fold_idx]
        y_test = folds['y_test'][fold_idx]

        for name, estimator in tuned_models.items():

            search = GridSearchCV(estimator, param_grids[name], cv=inner_cv)

            if name == 'tuned_SVR':
                # Features and target are scaled by 10 for the SVR and the
                # prediction is scaled back.
                # NOTE(review): presumably to bring the returns closer to the
                # epsilon grid - confirm.
                search.fit(X_train * 10, y_train * 10)
                y_pred_test = search.predict(X_test * 10) / 10
            else:
                search.fit(X_train, y_train)
                y_pred_test = search.predict(X_test)

            print(f"    Model: {name}  |  Tuned Parameters: {search.best_params_}")

            mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
            mse = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
            rmse = np.sqrt(mse)
            # Share of days on which the predicted sign matches the actual sign
            direction = np.mean(np.sign(y_pred_test) == np.sign(y_test))

            # Compare against the best *tuned* result so far. Looking the name
            # up in `results` (the untuned run) never finds a 'tuned_*' key,
            # so every fold would overwrite the stored result and saved model.
            best_so_far = tuned_results[ticker].get(name)
            if best_so_far is None or mse < best_so_far['mse']:

                joblib.dump(search, f"{TUNED_MODEL_DIR}/{ticker}_{name}.sav")
                tuned_results[ticker][name] = {
                    'iter': fold_idx,
                    'model': search,
                    'y_test': y_test,
                    'y_pred_test': y_pred_test,
                    'mae': mae,
                    'mse': mse,
                    'rmse': rmse,
                    'direction': direction,
                }

In [None]:
for ticker in tuned_results.keys():

    # Only the index is reported, to prevent too much printing
    if ticker == "^GSPC":

        for name, metrics in tuned_results[ticker].items():
            summary_lines = [
                f"Ticker: {ticker}",
                f"Model: {name} (Best Iteration: {metrics['iter']})",
                f"MAE: {metrics['mae']:.4f}",
                f"MSE: {metrics['mse']:.4f}",
                f"RMSE: {metrics['rmse']:.4f}",
                f"Direction: {metrics['direction']:.4f}",
            ]
            print("\n".join(summary_lines))

            # Actual vs. predicted target over the best fold's test window
            test_dates = metrics['y_test'].index
            fig, ax = plt.subplots(figsize=(21, 9))
            ax.plot(test_dates, metrics['y_test'], label='Actual', marker='o')
            ax.plot(test_dates, metrics['y_pred_test'], label='Predicted', marker='x')
            ax.set_xlabel('Date')
            ax.set_ylabel('target')
            ax.set_title(f'Actual vs Predicted {ticker} ({name})')
            ax.legend()
            plt.show()

# Trading

Strategy
- Position (buy / sell / do nothing) is based on the predicted '(log)return' of the next day
    - return > 0 : buy
    - return = 0 : do nothing
    - return < 0 : sell
- buy/sell all of portfolio in each transaction
- no short position

In [None]:
# The back-test runs on the index only.
TRADING_TICKER = '^GSPC'

# Benchmark: realised target over the last (most recent) test fold. Taken
# explicitly from split_data instead of relying on whatever `y_test` the
# training loops happened to leave behind.
y_test = split_data[TRADING_TICKER]['y_test'][-1]

df_trading = pd.DataFrame({'^GSPC_log_return': y_test})  # on test set

for model_name in models:
    # `results` is keyed by ticker first, then by model name.
    metrics = results[TRADING_TICKER][model_name]

    # Give the predictions the dates they belong to (some models return a
    # bare array), then align them with the benchmark window.
    pred_log_return = pd.Series(
        np.asarray(metrics['y_pred_test']).reshape(-1),
        index=metrics['y_test'].index,
    )
    # 1: Buy (hold the index), 0: Sell (stay in cash)
    position = (pred_log_return > 0).astype(float).reindex(df_trading.index)

    # NOTE(review): `results` keeps each model's best fold, which is not
    # necessarily the last one. Dates without a stored prediction stay NaN
    # here and are skipped by the sums in the cells below.
    # NOTE(review): the column is named "log_return", but the target is built
    # from pct_change-based returns - confirm the exp(sum) compounding used
    # downstream is intended.
    df_trading[model_name + '_log_return'] = position * df_trading['^GSPC_log_return']

df_trading

In [None]:
df_trading.index = pd.to_datetime(df_trading.index)

# Length of the trading window, split into whole 365-day years and
# 30-day months for display.
first_day, last_day = df_trading.index[0], df_trading.index[-1]
total_days = (last_day - first_day).days
years, leftover_days = divmod(total_days, 365)
months = leftover_days // 30

In [None]:
# Gross return = exp(sum of per-day returns) over the trading window
buy_and_hold_total_return = np.exp(y_test.sum())
gross_by_model = {
    model_name: np.exp(df_trading[model_name + '_log_return'].sum())
    for model_name in models
}
since = (df_trading.index[0]).date()

# (section header, conversion applied to a gross return before printing)
report_sections = [
    (f'Total(Gross) Return ({years} years {months} months)',
     lambda gross: gross),
    (f'\nNet Return ({years} years {months} months)',
     lambda gross: (gross-1)),
    (f'\nNet Return (per year)',
     lambda gross: (gross-1)/total_days*365),
]

for header, convert in report_sections:
    print(header)
    print('----------')
    print(f'Buy & Hold (since {since}): {convert(buy_and_hold_total_return) *100:.2f}%')
    for model_name, gross in gross_by_model.items():
        print(f'{model_name}: {convert(gross) *100:.2f}%')