### This is a test project to predict short-term stock returns using streaming data from public API

Data sources:
- Alphavantage: daily/intraday data with history of 2 years. Intraday data is delayed by a few days.
- yfinance: Intraday data is near-realtime, maybe delay of 1 min. Shorter sample window, 60d for itd data.


for more vendor api options, see:
https://nordicapis.com/10-real-time-stock-data-apis/
https://algotrading101.com/learn/yahoo-finance-api-guide/
https://algotrading101.com/learn/yfinance-guide/


Another streaming data project may be to use Google Trends data, see a template on Kaggle.

In [43]:
# !pip install yfinance

import numpy as np
import pandas as pd
import os, time, warnings, random, shap, requests, optuna, datetime
import seaborn as sns
import matplotlib.pyplot as plt
import functools as ft
import yfinance as yf


from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier, XGBRegressor

pd.set_option('display.max_columns', 100)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore') 

time0 = time.time()

os.chdir('/home/jupyter/projects_gcp_cpu/spx/src')
os.getcwd()

'/home/jupyter/projects_gcp_cpu/spx/src'

Datapull using AV APIs:

In [44]:
# data = yf.download(
#         tickers = "^GSPC ^IXIC ^RUT EEM",
#         period = "60d",
#         interval = "2m",
#         ignore_tz = True,
#         group_by = 'ticker',
#         auto_adjust = True,
#         prepost = False,
#         threads = True,
#         proxy = None
#     )

# display(data.head(2), data.tail(6))

In [45]:
tickerStrings = ['^GSPC', '^IXIC', '^RUT', 'EEM']
df_list = list()
for ticker in tickerStrings:
    data = yf.download(ticker, group_by="Ticker", period='60d', interval='30m', prepost=False, auto_adjust=True)
    data['ticker'] = ticker  
    df_list.append(data)

df = pd.concat(df_list)
df

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Volume,ticker
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-09-07 09:30:00,3909.429932,3929.600098,3906.030029,3920.530029,169448784,^GSPC
2022-09-07 10:00:00,3920.250000,3926.209961,3911.020020,3925.469971,154008120,^GSPC
2022-09-07 10:30:00,3925.659912,3935.969971,3924.489990,3930.520020,137701341,^GSPC
2022-09-07 11:00:00,3930.469971,3947.000000,3926.469971,3941.550049,129448725,^GSPC
2022-09-07 11:30:00,3941.540039,3946.750000,3937.280029,3942.399902,99217335,^GSPC
...,...,...,...,...,...,...
2022-11-30 14:00:00,39.410000,39.650002,39.404999,39.570000,8148791,EEM
2022-11-30 14:30:00,39.579899,39.599998,39.494999,39.549999,6592872,EEM
2022-11-30 15:00:00,39.549999,39.669998,39.520000,39.639999,7457692,EEM
2022-11-30 15:30:00,39.639999,39.680000,39.500000,39.500000,22085670,EEM


In [48]:
df = df[['Close', 'ticker']]
df.replace({'^GSPC':'Spx', '^IXIC':'Nasdaq', '^RUT':'Russel'}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3104 entries, 2022-09-07 09:30:00 to 2022-11-30 16:00:00
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   3104 non-null   float64
 1   ticker  3104 non-null   object 
dtypes: float64(1), object(1)
memory usage: 72.8+ KB


In [49]:
df = (df.pivot_table(index=['Datetime'], columns='ticker', values='Close'))
df

ticker,EEM,Nasdaq,Russel,Spx
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-09-07 09:30:00,38.119999,11607.953125,1797.582275,3920.530029
2022-09-07 10:00:00,38.214199,11597.743164,1801.146851,3925.469971
2022-09-07 10:30:00,38.284100,11606.208984,1804.016113,3930.520020
2022-09-07 11:00:00,38.334999,11648.340820,1805.438843,3941.550049
2022-09-07 11:30:00,38.355000,11665.584961,1806.501587,3942.399902
...,...,...,...,...
2022-11-30 14:00:00,39.570000,11336.531250,1871.074463,4034.229980
2022-11-30 14:30:00,39.549999,11315.137695,1865.741577,4035.320068
2022-11-30 15:00:00,39.639999,11387.146484,1871.581299,4057.459961
2022-11-30 15:30:00,39.500000,11465.195312,1885.863770,4078.770020


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 776 entries, 2022-09-07 09:30:00 to 2022-11-30 16:00:00
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   EEM     776 non-null    float64
 1   Nasdaq  776 non-null    float64
 2   Russel  776 non-null    float64
 3   Spx     776 non-null    float64
dtypes: float64(4)
memory usage: 30.3 KB


In [51]:
df.columns = ['EEM', 'Nasdaq', 'Russel', 'Spx']
df['time'] = df.index.time
df['date'] = df.index.date
df

Unnamed: 0_level_0,EEM,Nasdaq,Russel,Spx,time,date
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-09-07 09:30:00,38.119999,11607.953125,1797.582275,3920.530029,09:30:00,2022-09-07
2022-09-07 10:00:00,38.214199,11597.743164,1801.146851,3925.469971,10:00:00,2022-09-07
2022-09-07 10:30:00,38.284100,11606.208984,1804.016113,3930.520020,10:30:00,2022-09-07
2022-09-07 11:00:00,38.334999,11648.340820,1805.438843,3941.550049,11:00:00,2022-09-07
2022-09-07 11:30:00,38.355000,11665.584961,1806.501587,3942.399902,11:30:00,2022-09-07
...,...,...,...,...,...,...
2022-11-30 14:00:00,39.570000,11336.531250,1871.074463,4034.229980,14:00:00,2022-11-30
2022-11-30 14:30:00,39.549999,11315.137695,1865.741577,4035.320068,14:30:00,2022-11-30
2022-11-30 15:00:00,39.639999,11387.146484,1871.581299,4057.459961,15:00:00,2022-11-30
2022-11-30 15:30:00,39.500000,11465.195312,1885.863770,4078.770020,15:30:00,2022-11-30


In [52]:
df = df.fillna(method='ffill')
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 776 entries, 2022-09-07 09:30:00 to 2022-11-30 16:00:00
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   EEM     776 non-null    float64
 1   Nasdaq  776 non-null    float64
 2   Russel  776 non-null    float64
 3   Spx     776 non-null    float64
 4   time    776 non-null    object 
 5   date    776 non-null    object 
dtypes: float64(4), object(2)
memory usage: 42.4+ KB


In [53]:
dayclose = df[df.time==datetime.time(15, 30, 0)]
dayopen = df[df.time==datetime.time(9, 30, 0)]
display(df, dayopen.head(), dayclose.head())

Unnamed: 0_level_0,EEM,Nasdaq,Russel,Spx,time,date
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-09-07 09:30:00,38.119999,11607.953125,1797.582275,3920.530029,09:30:00,2022-09-07
2022-09-07 10:00:00,38.214199,11597.743164,1801.146851,3925.469971,10:00:00,2022-09-07
2022-09-07 10:30:00,38.284100,11606.208984,1804.016113,3930.520020,10:30:00,2022-09-07
2022-09-07 11:00:00,38.334999,11648.340820,1805.438843,3941.550049,11:00:00,2022-09-07
2022-09-07 11:30:00,38.355000,11665.584961,1806.501587,3942.399902,11:30:00,2022-09-07
...,...,...,...,...,...,...
2022-11-30 14:00:00,39.570000,11336.531250,1871.074463,4034.229980,14:00:00,2022-11-30
2022-11-30 14:30:00,39.549999,11315.137695,1865.741577,4035.320068,14:30:00,2022-11-30
2022-11-30 15:00:00,39.639999,11387.146484,1871.581299,4057.459961,15:00:00,2022-11-30
2022-11-30 15:30:00,39.500000,11465.195312,1885.863770,4078.770020,15:30:00,2022-11-30


Unnamed: 0_level_0,EEM,Nasdaq,Russel,Spx,time,date
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-09-07 09:30:00,38.119999,11607.953125,1797.582275,3920.530029,09:30:00,2022-09-07
2022-09-08 09:30:00,38.299999,11782.061523,1826.878906,3974.850098,09:30:00,2022-09-08
2022-09-09 09:30:00,39.025002,12025.96875,1870.292236,4045.350098,09:30:00,2022-09-09
2022-09-12 09:30:00,39.490002,12253.59375,1903.255615,4112.160156,09:30:00,2022-09-12
2022-09-13 09:30:00,38.860001,11876.735352,1863.523682,4011.689941,09:30:00,2022-09-13


Unnamed: 0_level_0,EEM,Nasdaq,Russel,Spx,time,date
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-09-07 15:30:00,38.619999,11789.628906,1831.217041,3980.02002,15:30:00,2022-09-07
2022-09-08 15:30:00,38.439999,11861.103516,1846.412231,4005.860107,15:30:00,2022-09-08
2022-09-09 15:30:00,39.0,12112.06543,1882.81665,4067.620117,15:30:00,2022-09-09
2022-09-12 15:30:00,39.59,12266.00293,1905.813721,4110.919922,15:30:00,2022-09-12
2022-09-13 15:30:00,38.360001,11629.938477,1831.536377,3932.689941,15:30:00,2022-09-13


In [54]:
df['spx_ret'] = 100*(df.Spx/df.Spx.shift(1)-1)
df['s_spx_ret_30m'] = (100*(df.Spx/df.Spx.shift(1)-1)).shift(1)
df['s_spx_ret_1h'] = (100*(df.Spx/df.Spx.shift(2)-1)).shift(1)
df['s_spx_ret_2h'] = (100*(df.Spx/df.Spx.shift(4)-1)).shift(1)
display(df.shape, df.head(7))

(776, 10)

Unnamed: 0_level_0,EEM,Nasdaq,Russel,Spx,time,date,spx_ret,s_spx_ret_30m,s_spx_ret_1h,s_spx_ret_2h
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-09-07 09:30:00,38.119999,11607.953125,1797.582275,3920.530029,09:30:00,2022-09-07,,,,
2022-09-07 10:00:00,38.214199,11597.743164,1801.146851,3925.469971,10:00:00,2022-09-07,0.126002,,,
2022-09-07 10:30:00,38.2841,11606.208984,1804.016113,3930.52002,10:30:00,2022-09-07,0.128648,0.126002,,
2022-09-07 11:00:00,38.334999,11648.34082,1805.438843,3941.550049,11:00:00,2022-09-07,0.280625,0.128648,0.254812,
2022-09-07 11:30:00,38.355,11665.584961,1806.501587,3942.399902,11:30:00,2022-09-07,0.021561,0.280625,0.409634,
2022-09-07 12:00:00,38.319901,11641.611328,1802.598022,3937.090088,12:00:00,2022-09-07,-0.134685,0.021561,0.302247,0.557829
2022-09-07 12:30:00,38.34,11678.947266,1807.955566,3949.25,12:30:00,2022-09-07,0.308855,-0.134685,-0.113152,0.296018


In [55]:
df.loc[df.time < datetime.time(10, 0, 0), 'spx_ret'] = np.nan
df.loc[df.time < datetime.time(10, 30, 0), 's_spx_ret_30m'] = np.nan
df.loc[df.time < datetime.time(11, 0, 0), 's_spx_ret_1h'] = np.nan
df.loc[df.time < datetime.time(12, 0, 0), 's_spx_ret_2h'] = np.nan

# df = df[df.time >= datetime.time(12, 0, 0)]
df

Unnamed: 0_level_0,EEM,Nasdaq,Russel,Spx,time,date,spx_ret,s_spx_ret_30m,s_spx_ret_1h,s_spx_ret_2h
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-09-07 09:30:00,38.119999,11607.953125,1797.582275,3920.530029,09:30:00,2022-09-07,,,,
2022-09-07 10:00:00,38.214199,11597.743164,1801.146851,3925.469971,10:00:00,2022-09-07,0.126002,,,
2022-09-07 10:30:00,38.284100,11606.208984,1804.016113,3930.520020,10:30:00,2022-09-07,0.128648,0.126002,,
2022-09-07 11:00:00,38.334999,11648.340820,1805.438843,3941.550049,11:00:00,2022-09-07,0.280625,0.128648,0.254812,
2022-09-07 11:30:00,38.355000,11665.584961,1806.501587,3942.399902,11:30:00,2022-09-07,0.021561,0.280625,0.409634,
...,...,...,...,...,...,...,...,...,...,...
2022-11-30 14:00:00,39.570000,11336.531250,1871.074463,4034.229980,14:00:00,2022-11-30,0.823990,1.277465,1.429706,1.176567
2022-11-30 14:30:00,39.549999,11315.137695,1865.741577,4035.320068,14:30:00,2022-11-30,0.027021,0.823990,2.111981,2.333705
2022-11-30 15:00:00,39.639999,11387.146484,1871.581299,4057.459961,15:00:00,2022-11-30,0.548653,0.027021,0.851233,2.293109
2022-11-30 15:30:00,39.500000,11465.195312,1885.863770,4078.770020,15:30:00,2022-11-30,0.525207,0.548653,0.575822,2.699964


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 776 entries, 2022-09-07 09:30:00 to 2022-11-30 16:00:00
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   EEM            776 non-null    float64
 1   Nasdaq         776 non-null    float64
 2   Russel         776 non-null    float64
 3   Spx            776 non-null    float64
 4   time           776 non-null    object 
 5   date           776 non-null    object 
 6   spx_ret        716 non-null    float64
 7   s_spx_ret_30m  656 non-null    float64
 8   s_spx_ret_1h   596 non-null    float64
 9   s_spx_ret_2h   476 non-null    float64
dtypes: float64(8), object(2)
memory usage: 66.7+ KB


In [57]:
df

Unnamed: 0_level_0,EEM,Nasdaq,Russel,Spx,time,date,spx_ret,s_spx_ret_30m,s_spx_ret_1h,s_spx_ret_2h
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-09-07 09:30:00,38.119999,11607.953125,1797.582275,3920.530029,09:30:00,2022-09-07,,,,
2022-09-07 10:00:00,38.214199,11597.743164,1801.146851,3925.469971,10:00:00,2022-09-07,0.126002,,,
2022-09-07 10:30:00,38.284100,11606.208984,1804.016113,3930.520020,10:30:00,2022-09-07,0.128648,0.126002,,
2022-09-07 11:00:00,38.334999,11648.340820,1805.438843,3941.550049,11:00:00,2022-09-07,0.280625,0.128648,0.254812,
2022-09-07 11:30:00,38.355000,11665.584961,1806.501587,3942.399902,11:30:00,2022-09-07,0.021561,0.280625,0.409634,
...,...,...,...,...,...,...,...,...,...,...
2022-11-30 14:00:00,39.570000,11336.531250,1871.074463,4034.229980,14:00:00,2022-11-30,0.823990,1.277465,1.429706,1.176567
2022-11-30 14:30:00,39.549999,11315.137695,1865.741577,4035.320068,14:30:00,2022-11-30,0.027021,0.823990,2.111981,2.333705
2022-11-30 15:00:00,39.639999,11387.146484,1871.581299,4057.459961,15:00:00,2022-11-30,0.548653,0.027021,0.851233,2.293109
2022-11-30 15:30:00,39.500000,11465.195312,1885.863770,4078.770020,15:30:00,2022-11-30,0.525207,0.548653,0.575822,2.699964


In [58]:
dayopen.reset_index(drop=True, inplace=True)
dayopen.rename(columns={'Spx':'spx_open'}, inplace=True)
dayopen.head()

dayclose.reset_index(drop=True, inplace=True)
dayclose.sort_values(by='date')
dayclose.rename(columns={'Spx':'spx_close'}, inplace=True)
dayclose_l1 = dayclose.copy()
dayclose_l2 = dayclose.copy()

dayclose_l1['spx_close_l1'] = dayclose_l1.spx_close.shift(1)
dayclose_l2['spx_close_l2'] = dayclose_l2.spx_close.shift(2)

display(dayclose_l1.head(), dayclose_l2.head())

Unnamed: 0,EEM,Nasdaq,Russel,spx_close,time,date,spx_close_l1
0,38.619999,11789.628906,1831.217041,3980.02002,15:30:00,2022-09-07,
1,38.439999,11861.103516,1846.412231,4005.860107,15:30:00,2022-09-08,3980.02002
2,39.0,12112.06543,1882.81665,4067.620117,15:30:00,2022-09-09,4005.860107
3,39.59,12266.00293,1905.813721,4110.919922,15:30:00,2022-09-12,4067.620117
4,38.360001,11629.938477,1831.536377,3932.689941,15:30:00,2022-09-13,4110.919922


Unnamed: 0,EEM,Nasdaq,Russel,spx_close,time,date,spx_close_l2
0,38.619999,11789.628906,1831.217041,3980.02002,15:30:00,2022-09-07,
1,38.439999,11861.103516,1846.412231,4005.860107,15:30:00,2022-09-08,
2,39.0,12112.06543,1882.81665,4067.620117,15:30:00,2022-09-09,3980.02002
3,39.59,12266.00293,1905.813721,4110.919922,15:30:00,2022-09-12,4005.860107
4,38.360001,11629.938477,1831.536377,3932.689941,15:30:00,2022-09-13,4067.620117


In [59]:
df = pd.merge(df, dayopen[['date', 'spx_open']], on=['date'], how='left')
df = pd.merge(df, dayclose_l1[['date', 'spx_close_l1']], on=['date'], how='left')
df = pd.merge(df, dayclose_l2[['date', 'spx_close_l2']], on=['date'], how='left')
df

Unnamed: 0,EEM,Nasdaq,Russel,Spx,time,date,spx_ret,s_spx_ret_30m,s_spx_ret_1h,s_spx_ret_2h,spx_open,spx_close_l1,spx_close_l2
0,38.119999,11607.953125,1797.582275,3920.530029,09:30:00,2022-09-07,,,,,3920.530029,,
1,38.214199,11597.743164,1801.146851,3925.469971,10:00:00,2022-09-07,0.126002,,,,3920.530029,,
2,38.284100,11606.208984,1804.016113,3930.520020,10:30:00,2022-09-07,0.128648,0.126002,,,3920.530029,,
3,38.334999,11648.340820,1805.438843,3941.550049,11:00:00,2022-09-07,0.280625,0.128648,0.254812,,3920.530029,,
4,38.355000,11665.584961,1806.501587,3942.399902,11:30:00,2022-09-07,0.021561,0.280625,0.409634,,3920.530029,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,39.570000,11336.531250,1871.074463,4034.229980,14:00:00,2022-11-30,0.823990,1.277465,1.429706,1.176567,3954.030029,3957.350098,3964.560059
772,39.549999,11315.137695,1865.741577,4035.320068,14:30:00,2022-11-30,0.027021,0.823990,2.111981,2.333705,3954.030029,3957.350098,3964.560059
773,39.639999,11387.146484,1871.581299,4057.459961,15:00:00,2022-11-30,0.548653,0.027021,0.851233,2.293109,3954.030029,3957.350098,3964.560059
774,39.500000,11465.195312,1885.863770,4078.770020,15:30:00,2022-11-30,0.525207,0.548653,0.575822,2.699964,3954.030029,3957.350098,3964.560059


In [60]:
df['s_spx_ret_open'] = (100*(df.Spx/df.spx_open-1)).shift(1)
df['s_spx_ret_close1'] = (100*(df.Spx/df.spx_close_l1-1)).shift(1)
df['s_spx_ret_close2'] = (100*(df.Spx/df.spx_close_l2-1)).shift(1)

df.head()

Unnamed: 0,EEM,Nasdaq,Russel,Spx,time,date,spx_ret,s_spx_ret_30m,s_spx_ret_1h,s_spx_ret_2h,spx_open,spx_close_l1,spx_close_l2,s_spx_ret_open,s_spx_ret_close1,s_spx_ret_close2
0,38.119999,11607.953125,1797.582275,3920.530029,09:30:00,2022-09-07,,,,,3920.530029,,,,,
1,38.214199,11597.743164,1801.146851,3925.469971,10:00:00,2022-09-07,0.126002,,,,3920.530029,,,0.0,,
2,38.2841,11606.208984,1804.016113,3930.52002,10:30:00,2022-09-07,0.128648,0.126002,,,3920.530029,,,0.126002,,
3,38.334999,11648.34082,1805.438843,3941.550049,11:00:00,2022-09-07,0.280625,0.128648,0.254812,,3920.530029,,,0.254812,,
4,38.355,11665.584961,1806.501587,3942.399902,11:30:00,2022-09-07,0.021561,0.280625,0.409634,,3920.530029,,,0.536152,,


In [None]:
# grab dayopen, dayclose, create 3 more signals for spx by joining dfs.
# then wrap up evth into a function or loop and do it for all indices.
# then xgb evth on evth and see what sticks.

In [61]:
t_df = df[['date', 
           'time', 
           'spx_ret', 
           's_spx_ret_30m', 
           's_spx_ret_1h', 
           's_spx_ret_2h', 
           's_spx_ret_open',
           's_spx_ret_close1',
           's_spx_ret_close2']]
t_df.rename(columns={'spx_ret':'target'}, inplace=True)
t_df

Unnamed: 0,date,time,target,s_spx_ret_30m,s_spx_ret_1h,s_spx_ret_2h,s_spx_ret_open,s_spx_ret_close1,s_spx_ret_close2
0,2022-09-07,09:30:00,,,,,,,
1,2022-09-07,10:00:00,0.126002,,,,0.000000,,
2,2022-09-07,10:30:00,0.128648,0.126002,,,0.126002,,
3,2022-09-07,11:00:00,0.280625,0.128648,0.254812,,0.254812,,
4,2022-09-07,11:30:00,0.021561,0.280625,0.409634,,0.536152,,
...,...,...,...,...,...,...,...,...,...
771,2022-11-30,14:00:00,0.823990,1.277465,1.429706,1.176567,1.194477,1.109579,0.925700
772,2022-11-30,14:30:00,0.027021,0.823990,2.111981,2.333705,2.028309,1.942711,1.757318
773,2022-11-30,15:00:00,0.548653,0.027021,0.851233,2.293109,2.055878,1.970257,1.784814
774,2022-11-30,15:30:00,0.525207,0.548653,0.575822,2.699964,2.615810,2.529720,2.343259


In [62]:
t_df = t_df.dropna()
t_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 31 to 775
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              457 non-null    object 
 1   time              457 non-null    object 
 2   target            457 non-null    float64
 3   s_spx_ret_30m     457 non-null    float64
 4   s_spx_ret_1h      457 non-null    float64
 5   s_spx_ret_2h      457 non-null    float64
 6   s_spx_ret_open    457 non-null    float64
 7   s_spx_ret_close1  457 non-null    float64
 8   s_spx_ret_close2  457 non-null    float64
dtypes: float64(7), object(2)
memory usage: 35.7+ KB


In [65]:
X = t_df[['s_spx_ret_30m',
          's_spx_ret_1h',
          's_spx_ret_2h', 
          's_spx_ret_open',
          's_spx_ret_close1',
          's_spx_ret_close2']]
y = t_df['target']

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=int(0.2*X.shape[0]))
display(X_train.shape, X_test.shape, y_train.shape, X_train.head())

(366, 6)

(91, 6)

(366,)

Unnamed: 0,s_spx_ret_30m,s_spx_ret_1h,s_spx_ret_2h,s_spx_ret_open,s_spx_ret_close1,s_spx_ret_close2
285,0.196202,-0.056445,-0.424719,-0.538964,-0.788971,-0.975068
415,-0.090541,0.144614,-0.498679,-1.014883,-0.882582,-1.570744
153,-0.328428,-0.185969,-0.220432,-0.19366,-0.864683,-2.58507
440,0.346098,0.584637,0.430875,0.456076,1.363531,3.783936
694,-0.120761,-0.160023,-0.49413,-0.49413,-0.761515,-0.271308


In [79]:
xgbm = XGBRegressor(eta=0.015, max_depth=3)
xgbm.fit(X_train, y_train)

# rdm = Ridge()
# rdm.fit(X_train, y_train)



print('In sample, xgb: ', r2_score(y_train, xgbm.predict(X_train)))
print('Out of sample, xgb: ', r2_score(y_test, xgbm.predict(X_test)))


In sample, xgb:  0.08055855040909743
Out of sample, xgb:  -0.14296550515713768


In [None]:
data = yf.download(  # or pdr.get_data_yahoo(...
        # tickers list or string as well
        tickers = "^GSPC ^IXIC ^RUT EEM",

        # use "period" instead of start/end
        # valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
        # (optional, default is '1mo')
        period = "60d",

        # fetch data by interval (including intraday if period < 60 days)
        # valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
        # (optional, default is '1d')
        interval = "2m",

        # Whether to ignore timezone when aligning ticker data from 
        # different timezones. Default is True. False may be useful for 
        # minute/hourly data.
        ignore_tz = True,

        # group by ticker (to access via data['SPY'])
        # (optional, default is 'column')
        group_by = 'ticker',

        # adjust all OHLC automatically
        # (optional, default is False)
        auto_adjust = True,

        # download pre/post regular market hours data
        # (optional, default is False)
        prepost = False,

        # use threads for mass downloading? (True/False/Integer)
        # (optional, default is True)
        threads = True,

        # proxy URL scheme use use when downloading?
        # (optional, default is None)
        proxy = None
    )

display(data.head(2), data.tail(6))

In [4]:
url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY_EXTENDED&symbol=SPY&interval=5min&slice=year1month1&apikey=KBYUCPQEHAG67WNC&datatype=csv'
#adjusted=true&
spy = pd.read_csv(url)
spy

# at 9:50am monday there is still no data after friday close.
# apparently, there is one calendar day of delay.

url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY_EXTENDED&symbol=EEM&interval=1min&slice=year1month1&apikey=KBYUCPQEHAG67WNC&datatype=csv'
eem = pd.read_csv(url)
print(eem.shape)

In [16]:
url = 'https://www.alphavantage.co/query?function=CRYPTO_INTRADAY&symbol=ETH&market=USD&interval=5min&apikey=KBYUCPQEHAG67WNC'
eth = pd.read_csv(url)
eth

Unnamed: 0,{
0,"""Information"": ""Thank you for using Alpha ..."
1,}


In [12]:
eth[0:30]

Unnamed: 0,{
0,"""Information"": ""Thank you for using Alpha ..."
1,}


In [15]:
eth.iloc[0,0]

'    "Information": "Thank you for using Alpha Vantage! This is a premium endpoint. You may subscribe to any of the premium plans at https://www.alphavantage.co/premium/ to instantly unlock all premium endpoints"'

In [None]:
# can try predicting 10 yield, try smth like xgb 200, 4, 0.04

In [67]:
# it is hard to get any positive results at all using daily freq major assets
# can try to exploit intraday lead-lag effects
# i.e., pick up intraday major assets (3 indices) and try to predict less liquid assets.
# can try btc or etfs of small stocks. e.g., eem or eems etfs. 

In [88]:
url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY_EXTENDED&symbol=EEM&interval=1min&slice=year1month1&apikey=KBYUCPQEHAG67WNC&datatype=csv'
eem = pd.read_csv(url)
print(eem.shape)

# EEMS is even less liquid, try eema emxc

url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY_EXTENDED&symbol=EEMS&interval=1min&slice=year1month1&apikey=KBYUCPQEHAG67WNC&datatype=csv'
eems = pd.read_csv(url)
print(eems.shape)

url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY_EXTENDED&symbol=EWX&interval=1min&slice=year1month1&apikey=KBYUCPQEHAG67WNC&datatype=csv'
ewx = pd.read_csv(url)
print(ewx.shape)

url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY_EXTENDED&symbol=EEMA&interval=1min&slice=year1month1&apikey=KBYUCPQEHAG67WNC&datatype=csv'
eema = pd.read_csv(url)
print(eema.shape)

url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY_EXTENDED&symbol=EMXC&interval=1min&slice=year1month1&apikey=KBYUCPQEHAG67WNC&datatype=csv'
emxc = pd.read_csv(url)
print(emxc.shape)

(9752, 6)
(1210, 6)
(1230, 6)
(1384, 6)
(6033, 6)


In [87]:
display(spy[192:250], eem[:50], emxc[:50])

Unnamed: 0,time,open,high,low,close,volume
192,2022-11-21 16:01:00,394.58,394.69,394.53,394.62,195089
193,2022-11-21 16:00:00,394.49,394.79,394.48,394.58,1210352
194,2022-11-21 15:59:00,394.425,394.5,394.38,394.49,686851
195,2022-11-21 15:58:00,394.44,394.5,394.39,394.425,255209
196,2022-11-21 15:57:00,394.72,394.72,394.42,394.4345,284264
197,2022-11-21 15:56:00,394.8,394.8,394.63,394.7,278622
198,2022-11-21 15:55:00,395.06,395.12,394.77,394.8,354828
199,2022-11-21 15:54:00,394.9,395.08,394.83,395.06,283090
200,2022-11-21 15:53:00,394.75,394.92,394.74,394.9,147763
201,2022-11-21 15:52:00,394.63,394.92,394.59,394.75,188787


Unnamed: 0,time,open,high,low,close,volume
0,2022-11-21 19:18:00,37.55,37.55,37.55,37.55,206
1,2022-11-21 19:06:00,37.55,37.55,37.55,37.55,1000
2,2022-11-21 18:46:00,37.55,37.55,37.55,37.55,145
3,2022-11-21 18:43:00,37.55,37.55,37.55,37.55,270
4,2022-11-21 18:01:00,37.55,37.55,37.55,37.55,461
5,2022-11-21 17:51:00,37.55,37.55,37.55,37.55,100
6,2022-11-21 17:00:00,37.55,37.55,37.55,37.55,2000
7,2022-11-21 16:34:00,37.53,37.53,37.52,37.52,4600
8,2022-11-21 16:30:00,37.55,37.55,37.54,37.54,9599
9,2022-11-21 16:26:00,37.55,37.55,37.55,37.55,7501


Unnamed: 0,time,open,high,low,close,volume
0,2022-11-21 16:00:00,48.781,48.8,48.76,48.76,5387
1,2022-11-21 15:59:00,48.78,48.78,48.7745,48.7745,874
2,2022-11-21 15:58:00,48.77,48.7725,48.77,48.7725,993
3,2022-11-21 15:56:00,48.7892,48.7892,48.77,48.77,559
4,2022-11-21 15:55:00,48.7601,48.7601,48.7601,48.7601,422
5,2022-11-21 15:54:00,48.7601,48.79,48.7601,48.79,2033
6,2022-11-21 15:52:00,48.785,48.785,48.785,48.785,212
7,2022-11-21 15:51:00,48.78,48.78,48.77,48.77,334
8,2022-11-21 15:50:00,48.78,48.785,48.78,48.785,8183
9,2022-11-21 15:46:00,48.75,48.77,48.75,48.77,597


In [17]:
# fix date, create returns and rate differences over 1d, 5d, 21d, 63d, 121d, 252d, 504d for all 4 variables
# then create signals by lagging evth by 1d
# then clean evth and fir xgb (around 30 features)

#### step 1:
build simple XGB model
#### step 2:
deploy this model via Cloud Run and static web app, fixed model
#### step 3
deploy model via Cloud Run and Flask with dynamic model, retrained daily