### This is a modeling script for a project to predict index returns at 2-minute frequency

In [11]:
import numpy as np
import pandas as pd
import os, time, warnings, random, shap, requests, optuna, datetime, joblib
import seaborn as sns
import matplotlib.pyplot as plt
import functools as ft
import yfinance as yf

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier, XGBRegressor
from google.cloud import bigquery, storage

pd.set_option('display.max_columns', 100)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore') 

project_name = 'GCP-pp2'
project_id = 'valid-heuristic-369117'
regionn = 'us-west1'

time0 = time.time()

os.chdir('/home/jupyter/projects_gcp_cpu/spx/src')
os.getcwd()

'/home/jupyter/projects_gcp_cpu/spx/src'

In [2]:
tickerStrings = ['^GSPC', '^IXIC', '^RUT', 'EEM', 'EMXC', 'EEMA', 'VTHR']
df_list = list()
for ticker in tickerStrings:
    data = yf.download(ticker, 
                       group_by="Ticker", 
                       period='60d', 
                       interval='2m', 
                       prepost=False, 
                       auto_adjust=True)
    data['ticker'] = ticker  
    df_list.append(data)

df = pd.concat(df_list)
df = df[['Close', 'ticker']]
df.replace({'^GSPC':'Spx', '^IXIC':'Nasdaq', '^RUT':'Russel'}, inplace=True)
df = (df.pivot_table(index=['Datetime'], columns='ticker', values='Close'))
df.columns = ['EEM', 'EEMA', 'EMXC', 'Nasdaq', 'Russel', 'Spx', 'VTHR']

df['time'] = df.index.time
df['date'] = df.index.date

df = df.fillna(method='ffill')
dayclose = df[df.time==datetime.time(15, 58, 0)]
dayopen = df[df.time==datetime.time(9, 30, 0)]
dayopen.reset_index(drop=True, inplace=True)
dayclose.reset_index(drop=True, inplace=True)
dayclose.sort_values(by='date')
display(df, dayopen.head(), dayclose.head())
df0 = df.copy()

# df['hour'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.hour
# df['minute'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.minute

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-10-21 09:30:00,34.634998,55.570000,45.400002,10576.550781,1710.077637,3659.689941,164.059998,09:30:00,2022-10-21
2022-10-21 09:32:00,34.654999,55.660000,45.410000,10586.665039,1709.384399,3660.110107,164.169998,09:32:00,2022-10-21
2022-10-21 09:34:00,34.610001,55.619999,45.360001,10547.528320,1704.969971,3652.659912,164.020004,09:34:00,2022-10-21
2022-10-21 09:36:00,34.709999,55.619999,45.498699,10586.677734,1705.815674,3665.989990,164.020004,09:36:00,2022-10-21
2022-10-21 09:38:00,34.689999,55.619999,45.463501,10583.666016,1704.521973,3662.169922,163.869995,09:38:00,2022-10-21
...,...,...,...,...,...,...,...,...,...
2022-12-02 15:52:00,39.564999,65.579300,50.395901,11445.749023,1890.934937,4066.689941,182.669998,15:52:00,2022-12-02
2022-12-02 15:54:00,39.560001,65.579300,50.389999,11453.005859,1891.065552,4069.070068,182.669998,15:54:00,2022-12-02
2022-12-02 15:56:00,39.560001,65.579300,50.400002,11461.099609,1892.414673,4071.340088,182.669998,15:56:00,2022-12-02
2022-12-02 15:58:00,39.549999,65.500000,50.380001,11461.688477,1892.738037,4071.550049,182.850006,15:58:00,2022-12-02


Unnamed: 0,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date
0,34.634998,55.57,45.400002,10576.550781,1710.077637,3659.689941,164.059998,09:30:00,2022-10-21
1,33.950001,54.200001,45.580002,10831.625,1744.627563,3765.52002,168.860001,09:30:00,2022-10-24
2,34.084999,54.450001,45.431999,11022.798828,1753.145142,3805.649902,170.169998,09:30:00,2022-10-25
3,34.349998,54.880001,45.830002,10956.992188,1803.124023,3829.889893,171.710007,09:30:00,2022-10-26
4,34.59,55.560001,46.139999,10987.374023,1822.050537,3847.090088,173.0,09:30:00,2022-10-27


Unnamed: 0,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date
0,35.299999,56.529999,46.360001,10859.959961,1741.45874,3752.75,168.300003,15:58:00,2022-10-21
1,33.919998,54.119999,45.59,10952.712891,1748.230347,3797.790039,170.050003,15:58:00,2022-10-24
2,34.220001,54.77,45.880001,11200.161133,1795.591431,3860.199951,173.130005,15:58:00,2022-10-25
3,34.790001,55.939999,46.105,10967.536133,1804.481689,3830.620117,172.009995,15:58:00,2022-10-26
4,34.509998,55.259998,46.099998,10792.367188,1806.232666,3808.870117,171.240005,15:58:00,2022-10-27


In [3]:
### now i wanna do feature engineering for all assets 

asset_list = ['Spx', 'Nasdaq', 'Russel', 'EMXC', 'EEMA', 'EEM', 'VTHR']

for asset in asset_list:
    
    df[asset + '_ret'] = 100*(df[asset]/df[asset].shift(1)-1)
    df['s_' + asset + '_ret_1prd'] = (100*(df[asset]/df[asset].shift(1)-1)).shift(1)
    df['s_' + asset + '_ret_2prd'] = (100*(df[asset]/df[asset].shift(2)-1)).shift(1)
    df['s_' + asset + '_ret_4prd'] = (100*(df[asset]/df[asset].shift(4)-1)).shift(1)
    display(df.shape, df.head(5))

    df.loc[df.time < datetime.time(9, 32, 0), 's_' + asset + '_1prd'] = np.nan
    df.loc[df.time < datetime.time(9, 33, 0), 's_' + asset + '_2prd'] = np.nan
    df.loc[df.time < datetime.time(9, 35, 0), 's_' + asset + '_4prd'] = np.nan

    dayopen.rename(columns={asset:asset+'_open'}, inplace=True)
    dayopen.head()
    dayclose.rename(columns={asset:asset+'_close'}, inplace=True)
    dayclose_l1 = dayclose.copy()
    dayclose_l2 = dayclose.copy()
    dayclose_l1[asset+'_close_l1'] = dayclose_l1[asset+'_close'].shift(1)
    dayclose_l2[asset+'_close_l2'] = dayclose_l2[asset+'_close'].shift(2)

    # display(dayclose_l1.head(), dayclose_l2.head())

    df = pd.merge(df, dayopen[['date', asset + '_open']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l1[['date', asset + '_close_l1']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l2[['date', asset + '_close_l2']], on=['date'], how='left')

    df['s_' + asset + '_ret_open'] = (100*(df[asset]/df[asset + '_open']-1)).shift(1)
    df['s_' + asset + '_ret_close1'] = (100*(df[asset]/df[asset + '_close_l1']-1)).shift(1)
    df['s_' + asset + '_ret_close2'] = (100*(df[asset]/df[asset + '_close_l2']-1)).shift(1)

    cols_todrop = [x for x in list(df.columns) if asset in x and 'ret' not in x]
    df.drop(columns = cols_todrop, inplace=True)

display(time.time() - time0, df.head())

(5762, 13)

Unnamed: 0_level_0,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-10-21 09:30:00,34.634998,55.57,45.400002,10576.550781,1710.077637,3659.689941,164.059998,09:30:00,2022-10-21,,,,
2022-10-21 09:32:00,34.654999,55.66,45.41,10586.665039,1709.384399,3660.110107,164.169998,09:32:00,2022-10-21,0.011481,,,
2022-10-21 09:34:00,34.610001,55.619999,45.360001,10547.52832,1704.969971,3652.659912,164.020004,09:34:00,2022-10-21,-0.203551,0.011481,,
2022-10-21 09:36:00,34.709999,55.619999,45.498699,10586.677734,1705.815674,3665.98999,164.020004,09:36:00,2022-10-21,0.364942,-0.203551,-0.192094,
2022-10-21 09:38:00,34.689999,55.619999,45.463501,10583.666016,1704.521973,3662.169922,163.869995,09:38:00,2022-10-21,-0.104203,0.364942,0.160648,


(5762, 19)

Unnamed: 0,EEM,EEMA,EMXC,Nasdaq,Russel,VTHR,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd
0,34.634998,55.57,45.400002,10576.550781,1710.077637,164.059998,09:30:00,2022-10-21,,,,,,,,,,,
1,34.654999,55.66,45.41,10586.665039,1709.384399,164.169998,09:32:00,2022-10-21,0.011481,,,,0.0,,,0.095629,,,
2,34.610001,55.619999,45.360001,10547.52832,1704.969971,164.020004,09:34:00,2022-10-21,-0.203551,0.011481,,,0.011481,,,-0.369679,0.095629,,
3,34.709999,55.619999,45.498699,10586.677734,1705.815674,164.020004,09:36:00,2022-10-21,0.364942,-0.203551,-0.192094,,-0.192094,,,0.371171,-0.369679,-0.274404,
4,34.689999,55.619999,45.463501,10583.666016,1704.521973,163.869995,09:38:00,2022-10-21,-0.104203,0.364942,0.160648,,0.172147,,,-0.028448,0.371171,0.00012,


(5762, 25)

Unnamed: 0,EEM,EEMA,EMXC,Russel,VTHR,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,Russel_ret,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd
0,34.634998,55.57,45.400002,1710.077637,164.059998,09:30:00,2022-10-21,,,,,,,,,,,,,,,,,,
1,34.654999,55.66,45.41,1709.384399,164.169998,09:32:00,2022-10-21,0.011481,,,,0.0,,,0.095629,,,,0.0,,,-0.040538,,,
2,34.610001,55.619999,45.360001,1704.969971,164.020004,09:34:00,2022-10-21,-0.203551,0.011481,,,0.011481,,,-0.369679,0.095629,,,0.095629,,,-0.258247,-0.040538,,
3,34.709999,55.619999,45.498699,1705.815674,164.020004,09:36:00,2022-10-21,0.364942,-0.203551,-0.192094,,-0.192094,,,0.371171,-0.369679,-0.274404,,-0.274404,,,0.049602,-0.258247,-0.29868,
4,34.689999,55.619999,45.463501,1704.521973,163.869995,09:38:00,2022-10-21,-0.104203,0.364942,0.160648,,0.172147,,,-0.028448,0.371171,0.00012,,0.095749,,,-0.075841,0.049602,-0.208773,


(5762, 31)

Unnamed: 0,EEM,EEMA,EMXC,VTHR,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,Russel_ret,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,EMXC_ret,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd
0,34.634998,55.57,45.400002,164.059998,09:30:00,2022-10-21,,,,,,,,,,,,,,,,,,,,,,,,,
1,34.654999,55.66,45.41,164.169998,09:32:00,2022-10-21,0.011481,,,,0.0,,,0.095629,,,,0.0,,,-0.040538,,,,0.0,,,0.022023,,,
2,34.610001,55.619999,45.360001,164.020004,09:34:00,2022-10-21,-0.203551,0.011481,,,0.011481,,,-0.369679,0.095629,,,0.095629,,,-0.258247,-0.040538,,,-0.040538,,,-0.110106,0.022023,,
3,34.709999,55.619999,45.498699,164.020004,09:36:00,2022-10-21,0.364942,-0.203551,-0.192094,,-0.192094,,,0.371171,-0.369679,-0.274404,,-0.274404,,,0.049602,-0.258247,-0.29868,,-0.29868,,,0.305773,-0.110106,-0.088108,
4,34.689999,55.619999,45.463501,163.869995,09:38:00,2022-10-21,-0.104203,0.364942,0.160648,,0.172147,,,-0.028448,0.371171,0.00012,,0.095749,,,-0.075841,0.049602,-0.208773,,-0.249226,,,-0.077361,0.305773,0.19533,


(5762, 37)

Unnamed: 0,EEM,EEMA,VTHR,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,Russel_ret,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,EMXC_ret,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,EEMA_ret,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd
0,34.634998,55.57,164.059998,09:30:00,2022-10-21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,34.654999,55.66,164.169998,09:32:00,2022-10-21,0.011481,,,,0.0,,,0.095629,,,,0.0,,,-0.040538,,,,0.0,,,0.022023,,,,0.0,,,0.161958,,,
2,34.610001,55.619999,164.020004,09:34:00,2022-10-21,-0.203551,0.011481,,,0.011481,,,-0.369679,0.095629,,,0.095629,,,-0.258247,-0.040538,,,-0.040538,,,-0.110106,0.022023,,,0.022023,,,-0.071867,0.161958,,
3,34.709999,55.619999,164.020004,09:36:00,2022-10-21,0.364942,-0.203551,-0.192094,,-0.192094,,,0.371171,-0.369679,-0.274404,,-0.274404,,,0.049602,-0.258247,-0.29868,,-0.29868,,,0.305773,-0.110106,-0.088108,,-0.088108,,,0.0,-0.071867,0.089975,
4,34.689999,55.619999,163.869995,09:38:00,2022-10-21,-0.104203,0.364942,0.160648,,0.172147,,,-0.028448,0.371171,0.00012,,0.095749,,,-0.075841,0.049602,-0.208773,,-0.249226,,,-0.077361,0.305773,0.19533,,0.217396,,,0.0,0.0,-0.071867,


(5762, 43)

Unnamed: 0,EEM,VTHR,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,Russel_ret,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,EMXC_ret,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,EEMA_ret,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,EEM_ret,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd
0,34.634998,164.059998,09:30:00,2022-10-21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,34.654999,164.169998,09:32:00,2022-10-21,0.011481,,,,0.0,,,0.095629,,,,0.0,,,-0.040538,,,,0.0,,,0.022023,,,,0.0,,,0.161958,,,,0.0,,,0.057746,,,
2,34.610001,164.020004,09:34:00,2022-10-21,-0.203551,0.011481,,,0.011481,,,-0.369679,0.095629,,,0.095629,,,-0.258247,-0.040538,,,-0.040538,,,-0.110106,0.022023,,,0.022023,,,-0.071867,0.161958,,,0.161958,,,-0.129846,0.057746,,
3,34.709999,164.020004,09:36:00,2022-10-21,0.364942,-0.203551,-0.192094,,-0.192094,,,0.371171,-0.369679,-0.274404,,-0.274404,,,0.049602,-0.258247,-0.29868,,-0.29868,,,0.305773,-0.110106,-0.088108,,-0.088108,,,0.0,-0.071867,0.089975,,0.089975,,,0.288929,-0.129846,-0.072175,
4,34.689999,163.869995,09:38:00,2022-10-21,-0.104203,0.364942,0.160648,,0.172147,,,-0.028448,0.371171,0.00012,,0.095749,,,-0.075841,0.049602,-0.208773,,-0.249226,,,-0.077361,0.305773,0.19533,,0.217396,,,0.0,0.0,-0.071867,,0.089975,,,-0.057622,0.288929,0.158708,


(5762, 49)

Unnamed: 0,VTHR,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,Russel_ret,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,EMXC_ret,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,EEMA_ret,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,EEM_ret,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd,s_EEM_ret_open,s_EEM_ret_close1,s_EEM_ret_close2,VTHR_ret,s_VTHR_ret_1prd,s_VTHR_ret_2prd,s_VTHR_ret_4prd
0,164.059998,09:30:00,2022-10-21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,164.169998,09:32:00,2022-10-21,0.011481,,,,0.0,,,0.095629,,,,0.0,,,-0.040538,,,,0.0,,,0.022023,,,,0.0,,,0.161958,,,,0.0,,,0.057746,,,,0.0,,,0.067049,,,
2,164.020004,09:34:00,2022-10-21,-0.203551,0.011481,,,0.011481,,,-0.369679,0.095629,,,0.095629,,,-0.258247,-0.040538,,,-0.040538,,,-0.110106,0.022023,,,0.022023,,,-0.071867,0.161958,,,0.161958,,,-0.129846,0.057746,,,0.057746,,,-0.091365,0.067049,,
3,164.020004,09:36:00,2022-10-21,0.364942,-0.203551,-0.192094,,-0.192094,,,0.371171,-0.369679,-0.274404,,-0.274404,,,0.049602,-0.258247,-0.29868,,-0.29868,,,0.305773,-0.110106,-0.088108,,-0.088108,,,0.0,-0.071867,0.089975,,0.089975,,,0.288929,-0.129846,-0.072175,,-0.072175,,,0.0,-0.091365,-0.024377,
4,163.869995,09:38:00,2022-10-21,-0.104203,0.364942,0.160648,,0.172147,,,-0.028448,0.371171,0.00012,,0.095749,,,-0.075841,0.049602,-0.208773,,-0.249226,,,-0.077361,0.305773,0.19533,,0.217396,,,0.0,0.0,-0.071867,,0.089975,,,-0.057622,0.288929,0.158708,,0.216546,,,-0.091458,0.0,-0.091365,


2.4821343421936035

Unnamed: 0,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,Russel_ret,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,EMXC_ret,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,EEMA_ret,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,EEM_ret,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd,s_EEM_ret_open,s_EEM_ret_close1,s_EEM_ret_close2,VTHR_ret,s_VTHR_ret_1prd,s_VTHR_ret_2prd,s_VTHR_ret_4prd,s_VTHR_ret_open,s_VTHR_ret_close1,s_VTHR_ret_close2
0,09:30:00,2022-10-21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,09:32:00,2022-10-21,0.011481,,,,0.0,,,0.095629,,,,0.0,,,-0.040538,,,,0.0,,,0.022023,,,,0.0,,,0.161958,,,,0.0,,,0.057746,,,,0.0,,,0.067049,,,,0.0,,
2,09:34:00,2022-10-21,-0.203551,0.011481,,,0.011481,,,-0.369679,0.095629,,,0.095629,,,-0.258247,-0.040538,,,-0.040538,,,-0.110106,0.022023,,,0.022023,,,-0.071867,0.161958,,,0.161958,,,-0.129846,0.057746,,,0.057746,,,-0.091365,0.067049,,,0.067049,,
3,09:36:00,2022-10-21,0.364942,-0.203551,-0.192094,,-0.192094,,,0.371171,-0.369679,-0.274404,,-0.274404,,,0.049602,-0.258247,-0.29868,,-0.29868,,,0.305773,-0.110106,-0.088108,,-0.088108,,,0.0,-0.071867,0.089975,,0.089975,,,0.288929,-0.129846,-0.072175,,-0.072175,,,0.0,-0.091365,-0.024377,,-0.024377,,
4,09:38:00,2022-10-21,-0.104203,0.364942,0.160648,,0.172147,,,-0.028448,0.371171,0.00012,,0.095749,,,-0.075841,0.049602,-0.208773,,-0.249226,,,-0.077361,0.305773,0.19533,,0.217396,,,0.0,0.0,-0.071867,,0.089975,,,-0.057622,0.288929,0.158708,,0.216546,,,-0.091458,0.0,-0.091365,,-0.024377,,


In [4]:
### do modeling ###

t_df = df.copy()
t_df.rename(columns={'VTHR_ret':'target'}, inplace=True)
t_df.drop(columns = ['time', 'date', 'Spx_ret', 'Nasdaq_ret', 'Russel_ret', 'EEMA_ret', 'EEM_ret', 'EMXC_ret', 'VXUS_ret'], 
          inplace=True,
          errors = 'ignore')

t_df = t_df.dropna()
display(t_df.info())

y = t_df.pop('target')
X = t_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=int(0.2*X.shape[0]))
display(X_train.shape, X_test.shape, y_train.shape, X_train.head())

time1 = time.time()


enm = ElasticNet()
parameters = {'alpha':[0.0005, 0.001, 0.002, 0.003, 0.005], 
              'l1_ratio':[0, 0.02, 0.05, 0.1, 0.25, 0.5, 1]}
enmgs = GridSearchCV(enm, parameters, scoring='r2', cv=4)
enmgs.fit(X_train, y_train)
print(enmgs.best_params_)
enmt = ElasticNet(**enmgs.best_params_)
enmt.fit(X_train, y_train)

print('In sample, ElasticNet: ', r2_score(y_train, enmgs.predict(X_train)))
print('Out of sample, ElasticNet: ', r2_score(y_test, enmgs.predict(X_test)))

print('Total time: ', time.time()-time0)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5265 entries, 391 to 5761
Data columns (total 43 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   s_Spx_ret_1prd       5265 non-null   float64
 1   s_Spx_ret_2prd       5265 non-null   float64
 2   s_Spx_ret_4prd       5265 non-null   float64
 3   s_Spx_ret_open       5265 non-null   float64
 4   s_Spx_ret_close1     5265 non-null   float64
 5   s_Spx_ret_close2     5265 non-null   float64
 6   s_Nasdaq_ret_1prd    5265 non-null   float64
 7   s_Nasdaq_ret_2prd    5265 non-null   float64
 8   s_Nasdaq_ret_4prd    5265 non-null   float64
 9   s_Nasdaq_ret_open    5265 non-null   float64
 10  s_Nasdaq_ret_close1  5265 non-null   float64
 11  s_Nasdaq_ret_close2  5265 non-null   float64
 12  s_Russel_ret_1prd    5265 non-null   float64
 13  s_Russel_ret_2prd    5265 non-null   float64
 14  s_Russel_ret_4prd    5265 non-null   float64
 15  s_Russel_ret_open    5265 non-null  

None

(4212, 42)

(1053, 42)

(4212,)

Unnamed: 0,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd,s_EEM_ret_open,s_EEM_ret_close1,s_EEM_ret_close2,s_VTHR_ret_1prd,s_VTHR_ret_2prd,s_VTHR_ret_4prd,s_VTHR_ret_open,s_VTHR_ret_close1,s_VTHR_ret_close2
2609,-0.070221,-0.144284,-0.169208,-0.052348,-0.753593,-0.20697,-0.147026,-0.156299,-0.199107,-0.073599,-1.011686,-0.528215,-0.069507,-0.060012,-0.057779,-0.030522,-0.966478,-0.99642,0.0,-0.010347,-0.0414,0.082903,0.103647,1.321865,-0.068565,-0.102812,-0.102812,0.0,-1.035479,-0.22249,-0.041161,-0.069035,-0.124201,-0.096631,-0.835846,-0.096631,0.0,-0.06443,-0.052728,-0.029298,-0.715747,-0.198884
2948,0.100263,0.149459,0.420785,0.311272,0.485035,5.984277,0.15246,0.274638,0.635059,0.983818,1.029106,8.384935,0.117629,0.246703,0.399795,1.135307,1.557012,7.680056,0.080114,0.100166,0.280954,0.442214,1.627015,4.550692,0.064414,0.306697,0.306697,-0.288832,2.642878,7.452875,0.052577,0.092044,0.289857,0.461932,2.394413,6.223833,0.218427,0.299906,0.299906,0.488646,0.737634,6.292269
5202,-0.024578,-0.02331,-0.139709,-0.372445,-0.297675,-0.478994,0.00605,8.9e-05,-0.103581,0.105076,0.221571,-0.396787,-0.014994,-0.065654,-0.122376,-0.620591,-0.412809,-0.121397,-0.019897,-0.039786,-0.119279,-0.475432,1.208707,2.384353,0.0,0.0,0.139033,0.356097,2.563289,5.073758,0.051022,0.063788,-0.036189,0.051022,2.068971,4.294646,0.0,0.016942,0.016942,-0.365679,-0.365679,-0.37689
3115,0.042532,0.017759,0.067319,0.942648,1.117505,6.65136,0.064203,0.060277,0.096076,1.983638,2.029375,9.458033,-0.031859,-0.048104,-5.8e-05,0.615009,1.034545,7.126088,0.029906,0.015742,0.000395,0.844217,2.03376,4.969139,0.032124,0.032124,0.032124,-0.064186,2.874129,7.694962,0.013112,-0.026204,-0.032761,0.686293,2.623089,6.461062,-0.01112,0.0,0.066813,0.966184,1.216355,6.797387
2951,0.123465,0.106097,0.305826,0.517269,0.691388,6.201923,0.159529,0.161035,0.454897,1.288765,1.33419,8.712232,0.126372,0.191532,0.329758,1.349592,1.772191,7.908208,0.089986,0.099988,0.25035,0.613066,1.799882,4.728533,0.0,0.112489,0.176975,-0.176668,2.75834,7.573747,0.141795,0.144424,0.24973,0.659892,2.59618,6.433146,0.0,0.013973,0.282916,0.553309,0.802457,6.360666


{'alpha': 0.003, 'l1_ratio': 0.1}
In sample, ElasticNet:  0.08646324859199683
Out of sample, ElasticNet:  0.06758642058450604
Total time:  19.79577875137329


In [5]:
feature_names = X_test.columns
feature_importance = pd.DataFrame(list(zip(feature_names, np.abs(enmt.coef_))),
                                 columns=['col_name','feature_importance_vals'])
feature_importance.sort_values(by=['feature_importance_vals'],
                              ascending=False, inplace=True)

feature_importance.head(10)

Unnamed: 0,col_name,feature_importance_vals
40,s_VTHR_ret_close1,0.076917
41,s_VTHR_ret_close2,0.063291
4,s_Spx_ret_close1,0.059778
5,s_Spx_ret_close2,0.042837
39,s_VTHR_ret_open,0.01782
2,s_Spx_ret_4prd,0.016509
9,s_Nasdaq_ret_open,0.01649
16,s_Russel_ret_close1,0.016128
7,s_Nasdaq_ret_2prd,0.015741
21,s_EMXC_ret_open,0.010845


In [13]:
artifact_filename_en = 'en_model.pkl'

os.chdir('/home/jupyter/project_repos/spg_stocks/spg_stocks/stocks-app')
joblib.dump(enmt, artifact_filename_en)


model_bucket = 'gs://pmykola-projectsgcp-artifacts/spg-stocks/'
storage_path = os.path.join(model_bucket, artifact_filename_en)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_en)


file = open(artifact_filename_en, "rb")
trained_model = joblib.load(file)
prediction = trained_model.predict([list(X_test.iloc[1,:])])
print('EN model', prediction)


EN model [0.00321355]
