## This is a modeling script for a project to predict index returns at 2-minute frequency

#### Current iteration is basically v0.1 for stocks_ete project

In [1]:
import numpy as np
import pandas as pd
import os, time, warnings, random, shap, requests, optuna, datetime, joblib, scipy
import seaborn as sns
import matplotlib.pyplot as plt
import functools as ft
import yfinance as yf
import scipy.stats as stats
from sklearn.utils.fixes import loguniform


from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from google.cloud import bigquery, storage

pd.set_option('display.max_columns', 100)
pd.set_option("display.min_rows", 8)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore') 

project_name = 'GCP-pp3'
project_id = 'polished-vault-379315'
regionn = 'us-west1'

time0 = time.time()

# os.chdir('/home/jupyter/projects_gcp_cpu/spx/src')
os.getcwd()

'/home/jupyter/project_repos/spg_stocks'

In [2]:
tickerStrings = ['^GSPC', 
                 '^IXIC', 
                 '^DJI',  
                 '^RUT', 
                 '^RUMIC', 
                 'URTH',
                 '^SPG100',
                 '^SPG1200',
                 'GSG',
                 '^SPG100',
                 '^SPG1200',
                 '^BKTAS',
                 'FEZ',
                 'IEUR',
                 '^SPGSCL',
                 '^SPGSGCP',
                 'SHV',
                 'SHY',
                 'IEI',
                 'IEF',
                 'TLT',
                 'EEM', 
                 'EMXC', 
                 'EEMA', 
                 'VTHR',
                 'IWC']
df_list = list()
for ticker in tickerStrings:
    data = yf.download(ticker, 
                       group_by="Ticker", 
                       period='60d', 
                       interval='2m', 
                       prepost=False, 
                       auto_adjust=True)
    data['ticker'] = ticker  
    df_list.append(data)

df = pd.concat(df_list)
df = df[['Close', 'ticker']]
df.replace({'^GSPC':'Spx', 
            '^IXIC':'Nasdaq', 
            '^DJI':'DJI',
            '^RUT':'Russell',
            '^RUMIC':'Russellmicro',
            'URTH':'MSCIw',
            'GSG':'GSCI',
            '^SPG100':'SPG100',
            '^SPG1200':'SPG1200',            
            '^BKTAS':'SPAsia50',
            'FEZ':'Stoxx50',
            'IEUR':'EU1400',
            '^SPGSCL':'Oil',
            '^SPGSGCP':'Gold',
            'IEI':'Tnotes',
            'IEF':'Tnotes_long',
            'SHV':'Tbills',
            'SHY':'Tnotes_short',
            'TLT':'Tbonds',
           }, inplace=True)
df = (df.pivot_table(index=['Datetime'], columns='ticker', values='Close'))
df0 = df.copy()
df

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

ticker,DJI,EEM,EEMA,EMXC,EU1400,GSCI,Gold,IWC,MSCIw,Nasdaq,Oil,Russell,Russellmicro,SPAsia50,SPG100,SPG1200,Spx,Stoxx50,Tbills,Tbonds,Tnotes,Tnotes_long,Tnotes_short,VTHR
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2023-04-10 09:30:00-04:00,33354.500000,39.240002,66.309998,49.342701,52.549999,20.610001,141.680695,101.099998,117.080002,11961.982422,441.214386,1746.987427,635.994873,1560.880005,2842.120117,3069.629883,4075.560059,44.840000,110.195000,107.750000,117.970001,99.750000,82.114998,181.210007
2023-04-10 09:32:00-04:00,33356.960938,39.260101,,49.314999,52.500000,20.620001,141.708893,,116.962898,11963.609375,441.707703,1746.566772,635.821411,1559.939941,2842.129883,3069.540039,4075.899902,44.850498,110.199997,107.620003,117.940002,99.699997,82.110001,
2023-04-10 09:34:00-04:00,33351.199219,39.249901,,49.335999,52.490002,20.600000,141.567795,,116.849998,11944.889648,440.819794,1746.223877,635.638306,1559.589966,2839.409912,3068.040039,4073.199951,44.849998,110.190002,107.690002,117.949997,99.706200,82.119499,
2023-04-10 09:36:00-04:00,33365.070312,39.259998,66.099998,,52.520000,20.610001,141.673599,,,11945.699219,441.312988,1746.954956,635.277405,1558.819946,2839.879883,3069.280029,4074.330078,44.869999,110.190102,107.635002,117.940002,99.695000,82.114998,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-19 15:54:00-04:00,33449.621094,38.980000,65.250000,50.360001,54.049999,19.240000,,104.459999,,12662.286133,,1773.896973,657.872375,1530.599976,2974.649902,3147.010010,4194.790039,46.235001,110.290001,101.160004,116.860001,97.629997,81.790001,
2023-05-19 15:56:00-04:00,33455.609375,38.990002,65.245003,50.369999,54.044998,19.245001,,104.430000,120.470001,12664.101562,,1773.907471,657.745056,1530.609985,2975.629883,3147.449951,4195.180176,46.224998,110.290001,101.129997,116.860001,97.620003,81.785004,
2023-05-19 15:58:00-04:00,33424.878906,38.999001,65.050003,50.375000,54.029999,19.240000,,104.459999,120.410004,12658.373047,,1773.626221,657.744751,1530.089966,2974.110107,3146.080078,4191.970215,46.189999,110.290001,101.120003,116.839996,97.610001,81.790001,186.339294
2023-05-21 16:00:00-04:00,,,,,,,139.875198,,,,393.127197,,,,,,,,,,,,,


In [3]:
# df_list

In [4]:
df['time'] = df.index.time
df['date'] = df.index.date

df = df.fillna(method='ffill')
dayclose = df[df.time==datetime.time(15, 58, 0)]
dayopen = df[df.time==datetime.time(9, 30, 0)]
dayopen.reset_index(drop=True, inplace=True)
dayclose.reset_index(drop=True, inplace=True)
dayclose.sort_values(by='date')
display(df.head(), dayopen.head(), dayclose.head())
df.head()
df0 = df.copy()

# df['hour'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.hour
# df['minute'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.minute

ticker,DJI,EEM,EEMA,EMXC,EU1400,GSCI,Gold,IWC,MSCIw,Nasdaq,Oil,Russell,Russellmicro,SPAsia50,SPG100,SPG1200,Spx,Stoxx50,Tbills,Tbonds,Tnotes,Tnotes_long,Tnotes_short,VTHR,time,date
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2023-04-10 09:30:00-04:00,33354.5,39.240002,66.309998,49.342701,52.549999,20.610001,141.680695,101.099998,117.080002,11961.982422,441.214386,1746.987427,635.994873,1560.880005,2842.120117,3069.629883,4075.560059,44.84,110.195,107.75,117.970001,99.75,82.114998,181.210007,09:30:00,2023-04-10
2023-04-10 09:32:00-04:00,33356.960938,39.260101,66.309998,49.314999,52.5,20.620001,141.708893,101.099998,116.962898,11963.609375,441.707703,1746.566772,635.821411,1559.939941,2842.129883,3069.540039,4075.899902,44.850498,110.199997,107.620003,117.940002,99.699997,82.110001,181.210007,09:32:00,2023-04-10
2023-04-10 09:34:00-04:00,33351.199219,39.249901,66.309998,49.335999,52.490002,20.6,141.567795,101.099998,116.849998,11944.889648,440.819794,1746.223877,635.638306,1559.589966,2839.409912,3068.040039,4073.199951,44.849998,110.190002,107.690002,117.949997,99.7062,82.119499,181.210007,09:34:00,2023-04-10
2023-04-10 09:36:00-04:00,33365.070312,39.259998,66.099998,49.335999,52.52,20.610001,141.673599,101.099998,116.849998,11945.699219,441.312988,1746.954956,635.277405,1558.819946,2839.879883,3069.280029,4074.330078,44.869999,110.190102,107.635002,117.940002,99.695,82.114998,181.210007,09:36:00,2023-04-10
2023-04-10 09:38:00-04:00,33394.238281,39.299999,66.334999,49.306,52.52,20.610001,141.758301,101.316803,116.895401,11946.828125,441.707703,1748.898926,635.660278,1559.5,2840.189941,3070.219971,4076.219971,44.900002,110.197998,107.609802,117.940002,99.684998,82.119301,181.210007,09:38:00,2023-04-10


ticker,DJI,EEM,EEMA,EMXC,EU1400,GSCI,Gold,IWC,MSCIw,Nasdaq,Oil,Russell,Russellmicro,SPAsia50,SPG100,SPG1200,Spx,Stoxx50,Tbills,Tbonds,Tnotes,Tnotes_long,Tnotes_short,VTHR,time,date
0,33354.5,39.240002,66.309998,49.342701,52.549999,20.610001,141.680695,101.099998,117.080002,11961.982422,441.214386,1746.987427,635.994873,1560.880005,2842.120117,3069.629883,4075.560059,44.84,110.195,107.75,117.970001,99.75,82.114998,181.210007,09:30:00,2023-04-10
1,33608.109375,39.735001,66.629997,49.900002,53.029999,20.620001,142.252197,102.779999,117.949997,12068.006836,438.539795,1780.577637,646.037842,1576.640015,2850.26001,3097.469971,4109.399902,45.345001,110.209,107.010002,117.860001,99.535004,82.089996,182.936096,09:30:00,2023-04-11
2,33840.03125,39.790001,66.449997,50.360001,53.639999,20.879999,143.4095,103.970001,118.709999,12124.086914,450.180695,1802.421509,652.884338,1576.660034,2862.080078,3117.719971,4131.540039,45.730099,110.235001,107.125,118.260101,99.93,82.192902,183.910294,09:30:00,2023-04-12
3,33691.710938,39.860001,65.75,50.509998,54.040001,21.049999,145.011307,102.43,118.455002,12012.972656,454.083008,1779.912964,644.625244,1553.48999,2862.600098,3109.689941,4104.169922,46.110001,110.275002,107.118896,118.370003,99.910004,82.260101,183.089996,09:30:00,2023-04-13
4,34023.691406,39.75,66.5,50.41,54.310001,20.99,143.755295,104.709999,119.360001,12119.128906,453.08551,1798.380127,653.873047,1557.449951,2894.860107,3133.290039,4142.700195,46.355,110.269997,105.280197,117.625,99.029999,82.029999,184.589996,09:30:00,2023-04-14


ticker,DJI,EEM,EEMA,EMXC,EU1400,GSCI,Gold,IWC,MSCIw,Nasdaq,Oil,Russell,Russellmicro,SPAsia50,SPG100,SPG1200,Spx,Stoxx50,Tbills,Tbonds,Tnotes,Tnotes_long,Tnotes_short,VTHR,time,date
0,33583.71875,39.43,66.489998,49.549999,52.889999,20.59,141.426697,102.531403,117.809998,12080.524414,437.169708,1772.550171,644.302979,1567.560059,2850.310059,3087.100098,4108.660156,45.215,110.190002,106.779999,117.849998,99.489998,82.07,182.75,15:58:00,2023-04-10
1,33686.640625,39.689999,66.519997,49.959999,53.099998,20.790001,142.470993,103.389999,117.959999,12032.628906,446.749908,1786.652954,648.819763,1573.910034,2844.800049,3097.97998,4109.129883,45.310001,110.199997,106.989998,117.790001,99.459999,82.040001,183.149994,15:58:00,2023-04-11
2,33650.96875,39.380001,65.75,50.064999,53.540001,21.01,142.887299,102.050003,117.879997,11929.892578,455.760101,1773.595947,642.474854,1547.859985,2846.709961,3096.5,4092.330078,45.700001,110.220001,106.889999,118.129997,99.720001,82.150002,182.309998,15:58:00,2023-04-12
3,34027.089844,39.959999,66.82,50.630001,54.259998,20.93,145.032501,104.020203,119.470001,12165.261719,449.764313,1796.401733,653.350952,1556.530029,2894.909912,3131.51001,4145.910156,46.330002,110.285004,106.059998,118.019997,99.419998,82.184998,184.679993,15:58:00,2023-04-13
4,33879.378906,39.700001,66.394997,50.400002,54.18,20.98,142.245193,102.589996,119.160004,12124.474609,451.770203,1781.274902,646.06012,1550.800049,2894.73999,3127.199951,4137.470215,46.290001,110.269997,105.040001,117.544998,98.93,82.0,184.050003,15:58:00,2023-04-14


In [5]:
# save raw file. later this will go BQ warehouse.
repo_path = os.getcwd()
repo_path
os.chdir(repo_path + '/data')
print(f'Path: {os.getcwd()}')

df.to_csv('raw_prices_26f.csv')
os.chdir(repo_path)
print(f'Path: {os.getcwd()}')

display(df.count())
display(df)

Path: /home/jupyter/project_repos/spg_stocks/data
Path: /home/jupyter/project_repos/spg_stocks


ticker
DJI             5851
EEM             5851
EEMA            5851
EMXC            5851
EU1400          5851
GSCI            5851
Gold            5851
IWC             5851
MSCIw           5851
Nasdaq          5851
Oil             5851
Russell         5851
Russellmicro    5851
SPAsia50        5851
SPG100          5851
SPG1200         5851
Spx             5851
Stoxx50         5851
Tbills          5851
Tbonds          5851
Tnotes          5851
Tnotes_long     5851
Tnotes_short    5851
VTHR            5851
time            5851
date            5851
dtype: int64

ticker,DJI,EEM,EEMA,EMXC,EU1400,GSCI,Gold,IWC,MSCIw,Nasdaq,Oil,Russell,Russellmicro,SPAsia50,SPG100,SPG1200,Spx,Stoxx50,Tbills,Tbonds,Tnotes,Tnotes_long,Tnotes_short,VTHR,time,date
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2023-04-10 09:30:00-04:00,33354.500000,39.240002,66.309998,49.342701,52.549999,20.610001,141.680695,101.099998,117.080002,11961.982422,441.214386,1746.987427,635.994873,1560.880005,2842.120117,3069.629883,4075.560059,44.840000,110.195000,107.750000,117.970001,99.750000,82.114998,181.210007,09:30:00,2023-04-10
2023-04-10 09:32:00-04:00,33356.960938,39.260101,66.309998,49.314999,52.500000,20.620001,141.708893,101.099998,116.962898,11963.609375,441.707703,1746.566772,635.821411,1559.939941,2842.129883,3069.540039,4075.899902,44.850498,110.199997,107.620003,117.940002,99.699997,82.110001,181.210007,09:32:00,2023-04-10
2023-04-10 09:34:00-04:00,33351.199219,39.249901,66.309998,49.335999,52.490002,20.600000,141.567795,101.099998,116.849998,11944.889648,440.819794,1746.223877,635.638306,1559.589966,2839.409912,3068.040039,4073.199951,44.849998,110.190002,107.690002,117.949997,99.706200,82.119499,181.210007,09:34:00,2023-04-10
2023-04-10 09:36:00-04:00,33365.070312,39.259998,66.099998,49.335999,52.520000,20.610001,141.673599,101.099998,116.849998,11945.699219,441.312988,1746.954956,635.277405,1558.819946,2839.879883,3069.280029,4074.330078,44.869999,110.190102,107.635002,117.940002,99.695000,82.114998,181.210007,09:36:00,2023-04-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-19 15:54:00-04:00,33449.621094,38.980000,65.250000,50.360001,54.049999,19.240000,139.812302,104.459999,120.440002,12662.286133,392.907990,1773.896973,657.872375,1530.599976,2974.649902,3147.010010,4194.790039,46.235001,110.290001,101.160004,116.860001,97.629997,81.790001,186.210098,15:54:00,2023-05-19
2023-05-19 15:56:00-04:00,33455.609375,38.990002,65.245003,50.369999,54.044998,19.245001,139.812302,104.430000,120.470001,12664.101562,392.907990,1773.907471,657.745056,1530.609985,2975.629883,3147.449951,4195.180176,46.224998,110.290001,101.129997,116.860001,97.620003,81.785004,186.210098,15:56:00,2023-05-19
2023-05-19 15:58:00-04:00,33424.878906,38.999001,65.050003,50.375000,54.029999,19.240000,139.812302,104.459999,120.410004,12658.373047,392.907990,1773.626221,657.744751,1530.089966,2974.110107,3146.080078,4191.970215,46.189999,110.290001,101.120003,116.839996,97.610001,81.790001,186.339294,15:58:00,2023-05-19
2023-05-21 16:00:00-04:00,33424.878906,38.999001,65.050003,50.375000,54.029999,19.240000,139.875198,104.459999,120.410004,12658.373047,393.127197,1773.626221,657.744751,1530.089966,2974.110107,3146.080078,4191.970215,46.189999,110.290001,101.120003,116.839996,97.610001,81.790001,186.339294,16:00:00,2023-05-21


In [6]:
### now i want to do feature engineering for all assets 

asset_list = ['Spx', 
              'Nasdaq', 
              'DJI', 
              'Russellmicro',
              'Russell',
              'SPG100',
              'SPG1200',
              'Tbills',
              'Tnotes',
              'Tbonds',
              'Oil',
              'Gold',
              'EMXC', 
              'EEMA', 
              'EEM', 
              'VTHR',
              'IWC']

df = df[asset_list+['date', 'time']]
print(f'DF shape:{df.shape}')

for asset in asset_list:
    
    df[asset + '_ret'] = 100*(df[asset]/df[asset].shift(1)-1)
    df['s_' + asset + '_ret_1prd'] = (100*(df[asset]/df[asset].shift(1)-1)).shift(1)
    df['s_' + asset + '_ret_2prd'] = (100*(df[asset]/df[asset].shift(2)-1)).shift(1)
    df['s_' + asset + '_ret_4prd'] = (100*(df[asset]/df[asset].shift(4)-1)).shift(1)
    df['s_' + asset + '_ret_10prd'] = (100*(df[asset]/df[asset].shift(10)-1)).shift(1)
    df['s_' + asset + '_ret_30prd'] = (100*(df[asset]/df[asset].shift(3)-1)).shift(1)
    print(f'Data shape: {df.shape}')

    df.loc[df.time < datetime.time(9, 32, 0), 's_' + asset + '_1prd'] = np.nan
    df.loc[df.time < datetime.time(9, 34, 0), 's_' + asset + '_2prd'] = np.nan
    df.loc[df.time < datetime.time(9, 38, 0), 's_' + asset + '_4prd'] = np.nan
    df.loc[df.time < datetime.time(9, 50, 0), 's_' + asset + '_10prd'] = np.nan

    dayopen.rename(columns={asset:asset+'_open'}, inplace=True)
    # dayopen.head()
    dayclose.rename(columns={asset:asset+'_close'}, inplace=True)
    dayclose_l1 = dayclose.copy()
    dayclose_l2 = dayclose.copy()
    dayclose_l1[asset+'_close_l1'] = dayclose_l1[asset+'_close'].shift(1)
    dayclose_l2[asset+'_close_l2'] = dayclose_l2[asset+'_close'].shift(2)
    
    df = pd.merge(df, dayopen[['date', asset + '_open']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l1[['date', asset + '_close_l1']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l2[['date', asset + '_close_l2']], on=['date'], how='left')

    df['s_' + asset + '_ret_open'] = (100*(df[asset]/df[asset + '_open']-1)).shift(1)
    df['s_' + asset + '_ret_close1'] = (100*(df[asset]/df[asset + '_close_l1']-1)).shift(1)
    df['s_' + asset + '_ret_close2'] = (100*(df[asset]/df[asset + '_close_l2']-1)).shift(1)

    cols_todrop = [x for x in list(df.columns) if asset in x and 'ret' not in x]
    df.drop(columns = cols_todrop, inplace=True)

print(f'Time to do feature engineering: {time.time() - time0:.2f}')
display(df.head())

DF shape:(5851, 19)
Data shape: (5851, 25)
Data shape: (5851, 33)
Data shape: (5851, 41)
Data shape: (5851, 49)
Data shape: (5851, 57)
Data shape: (5851, 65)
Data shape: (5851, 73)
Data shape: (5851, 81)
Data shape: (5851, 89)
Data shape: (5851, 97)
Data shape: (5851, 105)
Data shape: (5851, 113)
Data shape: (5851, 121)
Data shape: (5851, 129)
Data shape: (5851, 137)
Data shape: (5851, 145)
Data shape: (5851, 153)
Time to do feature engineering: 15.83


ticker,date,time,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_10prd,s_Spx_ret_30prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_10prd,s_Nasdaq_ret_30prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,DJI_ret,s_DJI_ret_1prd,s_DJI_ret_2prd,s_DJI_ret_4prd,s_DJI_ret_10prd,s_DJI_ret_30prd,s_DJI_ret_open,s_DJI_ret_close1,s_DJI_ret_close2,Russellmicro_ret,s_Russellmicro_ret_1prd,s_Russellmicro_ret_2prd,s_Russellmicro_ret_4prd,s_Russellmicro_ret_10prd,s_Russellmicro_ret_30prd,s_Russellmicro_ret_open,s_Russellmicro_ret_close1,s_Russellmicro_ret_close2,Russell_ret,s_Russell_ret_1prd,s_Russell_ret_2prd,s_Russell_ret_4prd,s_Russell_ret_10prd,s_Russell_ret_30prd,s_Russell_ret_open,s_Russell_ret_close1,s_Russell_ret_close2,SPG100_ret,s_SPG100_ret_1prd,s_SPG100_ret_2prd,...,s_Gold_ret_10prd,s_Gold_ret_30prd,s_Gold_ret_open,s_Gold_ret_close1,s_Gold_ret_close2,EMXC_ret,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_10prd,s_EMXC_ret_30prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,EEMA_ret,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_10prd,s_EEMA_ret_30prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,EEM_ret,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd,s_EEM_ret_10prd,s_EEM_ret_30prd,s_EEM_ret_open,s_EEM_ret_close1,s_EEM_ret_close2,VTHR_ret,s_VTHR_ret_1prd,s_VTHR_ret_2prd,s_VTHR_ret_4prd,s_VTHR_ret_10prd,s_VTHR_ret_30prd,s_VTHR_ret_open,s_VTHR_ret_close1,s_VTHR_ret_close2,IWC_ret,s_IWC_ret_1prd,s_IWC_ret_2prd,s_IWC_ret_4prd,s_IWC_ret_10prd,s_IWC_ret_30prd,s_IWC_ret_open,s_IWC_ret_close1,s_IWC_ret_close2
0,2023-04-10,09:30:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2023-04-10,09:32:00,0.008339,,,,,,0.0,,,0.013601,,,,,,0.0,,,0.007378,,,,,,0.0,,,-0.027274,,,,,,0.0,,,-0.024079,,,,,,0.0,,,0.000344,,,...,,,0.0,,,-0.056143,,,,,,0.0,,,0.0,,,,,,0.0,,,0.051222,,,,,,0.0,,,0.0,,,,,,0.0,,,0.0,,,,,,0.0,,
2,2023-04-10,09:34:00,-0.066242,0.008339,,,,,0.008339,,,-0.156472,0.013601,,,,,0.013601,,,-0.017273,0.007378,,,,,0.007378,,,-0.028798,-0.027274,,,,,-0.027274,,,-0.019633,-0.024079,,,,,-0.024079,,,-0.095702,0.000344,,...,,,0.019903,,,0.042583,-0.056143,,,,,-0.056143,,,0.0,0.0,,,,,0.0,,,-0.025982,0.051222,,,,,0.051222,,,0.0,0.0,,,,,0.0,,,0.0,0.0,,,,,0.0,,
3,2023-04-10,09:36:00,0.027745,-0.066242,-0.057909,,,,-0.057909,,,0.006778,-0.156472,-0.142892,,,,-0.142892,,,0.041591,-0.017273,-0.009896,,,,-0.009896,,,-0.056778,-0.028798,-0.056065,,,,-0.056065,,,0.041866,-0.019633,-0.043707,,,,-0.043707,,,0.016552,-0.095702,-0.095359,...,,,-0.079686,,,0.0,0.042583,-0.013583,,,,-0.013583,,,-0.316693,0.0,0.0,,,,0.0,,,0.025726,-0.025982,0.025227,,,,0.025227,,,0.0,0.0,0.0,,,,0.0,,,0.0,0.0,0.0,,,,0.0,,
4,2023-04-10,09:38:00,0.046385,0.027745,-0.038515,,,-0.030179,-0.030179,,,0.00945,0.006778,-0.149705,,,-0.136125,-0.136125,,,0.087421,0.041591,0.024311,,,0.031691,0.031691,,,0.060269,-0.056778,-0.08556,,,-0.11281,-0.11281,,,0.111278,0.041866,0.022226,,,-0.001859,-0.001859,,,0.010918,0.016552,-0.079166,...,,-0.005008,-0.005008,,,-0.060805,0.0,0.042583,,,-0.013583,-0.013583,,,0.355523,-0.316693,-0.316693,,,-0.316693,-0.316693,,,0.101887,0.025726,-0.000262,,,0.05096,0.05096,,,0.0,0.0,0.0,,,0.0,0.0,,,0.214446,0.0,0.0,,,0.0,0.0,,


In [7]:
# save preprocessed file. later this will go to BQ feature store.
repo_path = os.getcwd()
os.chdir(repo_path + '/data')
print(f'Path: {os.getcwd()}')

df.to_csv('features.csv')
os.chdir(repo_path)
print(f'Path: {os.getcwd()}')

Path: /home/jupyter/project_repos/spg_stocks/data
Path: /home/jupyter/project_repos/spg_stocks


In [8]:
### modeling ###

t_df = df.copy()
# t_df.rename(columns={'VTHR_ret':'target'}, inplace=True)
t_df.rename(columns={'IWC_ret':'target'}, inplace=True)

colstodrop = [col for col in df.columns if col.endswith('_ret')]
t_df.drop(columns = ['time', 'date'] + colstodrop, 
          inplace = True,
          errors = 'ignore')

t_df = t_df.dropna()
display(t_df.info())

y = t_df.pop('target')
X = t_df
print(f'Data preprocessng time: , {time.time()-time0:.2f} sec')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=int(0.2*X.shape[0]))
display(X_train.shape, X_test.shape, y_train.shape, X_train.head())
time1 = time.time()


# grid search: works fine and is faster than randomizedsearch, but is less maintainable.
# time_modeling = time.time()
# enm = SGDRegressor()
# parameters = {'alpha':[0.0005, 0.001, 0.002, 0.003, 0.005], 
#               'l1_ratio':[0, 0.02, 0.05, 0.1, 0.25, 0.5, 1]}
# enmgs = GridSearchCV(enm, parameters, scoring='r2', cv=4)
# enmgs.fit(X_train, y_train)
# print(enmgs.best_params_)
# enmt = ElasticNet(**enmgs.best_params_)
# enmt.fit(X_train, y_train)

# print(f'In sample, ElasticNet: , {r2_score(y_train, enmgs.predict(X_train))}')
# print(f'Out of sample, ElasticNet: , {r2_score(y_test, enmgs.predict(X_test))}')

# print(f'Modeling time: , {time.time()-time_modeling:.2f} sec')
# print(f'Total time: , {time.time()-time0:.2f} sec')

time_modeling = time.time()
enm = SGDRegressor()
param_dist = {
    "l1_ratio": stats.beta(a=0.8, b=1.4, loc=0.01),
    # "l1_ratio": loguniform(0.01, 1),
    "alpha": loguniform(2e-4, 5e-1),
}

# run randomized search
n_iter_search = 100
enmrs = RandomizedSearchCV(
    enm, 
    param_distributions=param_dist, 
    n_iter=n_iter_search,
    cv=4
)
enmrs.fit(X_train, y_train)
print(enmrs.best_params_)
enmt = ElasticNet(**enmrs.best_params_)
enmt.fit(X_train, y_train)
print(f'In sample, ElasticNet: , {r2_score(y_train, enmrs.predict(X_train))}')
print(f'Out of sample, ElasticNet: , {r2_score(y_test, enmrs.predict(X_test))}')

print(f'Modeling time: , {time.time()-time_modeling:.2f} sec')
print(f'Total time: , {time.time()-time0:.2f} sec')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5460 entries, 391 to 5850
Columns: 137 entries, s_Spx_ret_1prd to s_IWC_ret_close2
dtypes: float64(137)
memory usage: 5.7 MB


None

Data preprocessng time: , 17.27 sec


(4368, 136)

(1092, 136)

(4368,)

ticker,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_10prd,s_Spx_ret_30prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_10prd,s_Nasdaq_ret_30prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,s_DJI_ret_1prd,s_DJI_ret_2prd,s_DJI_ret_4prd,s_DJI_ret_10prd,s_DJI_ret_30prd,s_DJI_ret_open,s_DJI_ret_close1,s_DJI_ret_close2,s_Russellmicro_ret_1prd,s_Russellmicro_ret_2prd,s_Russellmicro_ret_4prd,s_Russellmicro_ret_10prd,s_Russellmicro_ret_30prd,s_Russellmicro_ret_open,s_Russellmicro_ret_close1,s_Russellmicro_ret_close2,s_Russell_ret_1prd,s_Russell_ret_2prd,s_Russell_ret_4prd,s_Russell_ret_10prd,s_Russell_ret_30prd,s_Russell_ret_open,s_Russell_ret_close1,s_Russell_ret_close2,s_SPG100_ret_1prd,s_SPG100_ret_2prd,s_SPG100_ret_4prd,s_SPG100_ret_10prd,s_SPG100_ret_30prd,s_SPG100_ret_open,s_SPG100_ret_close1,s_SPG100_ret_close2,s_SPG1200_ret_1prd,s_SPG1200_ret_2prd,...,s_Oil_ret_close1,s_Oil_ret_close2,s_Gold_ret_1prd,s_Gold_ret_2prd,s_Gold_ret_4prd,s_Gold_ret_10prd,s_Gold_ret_30prd,s_Gold_ret_open,s_Gold_ret_close1,s_Gold_ret_close2,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_10prd,s_EMXC_ret_30prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_10prd,s_EEMA_ret_30prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd,s_EEM_ret_10prd,s_EEM_ret_30prd,s_EEM_ret_open,s_EEM_ret_close1,s_EEM_ret_close2,s_VTHR_ret_1prd,s_VTHR_ret_2prd,s_VTHR_ret_4prd,s_VTHR_ret_10prd,s_VTHR_ret_30prd,s_VTHR_ret_open,s_VTHR_ret_close1,s_VTHR_ret_close2,s_IWC_ret_1prd,s_IWC_ret_2prd,s_IWC_ret_4prd,s_IWC_ret_10prd,s_IWC_ret_30prd,s_IWC_ret_open,s_IWC_ret_close1,s_IWC_ret_close2
5244,0.002191,-0.029819,0.011156,-0.03176,-0.033215,-0.063251,-0.312091,-0.006062,-0.004076,-0.028524,0.009107,-0.052134,-0.025325,0.327828,0.176442,0.834518,0.008484,-0.022012,0.007044,-0.025975,-0.03279,-0.296195,-0.731655,-0.583469,0.025142,0.026752,0.034106,-0.016526,0.039757,-0.303057,-1.044057,0.11569,0.00612,-0.01401,0.011448,-0.052117,0.001044,-0.47122,-1.114667,0.029582,-0.002722,-0.021407,-0.008498,-0.017334,-0.029221,0.186288,0.160704,0.289779,0.005145,-0.019304,...,-0.323657,1.156805,0.0,0.0,0.0,0.0,0.0,-1.13603,-1.639094,-1.412391,0.020093,-0.020077,0.0,-0.040153,-0.020077,-0.260411,-0.459815,0.443819,-0.445374,-0.445374,-0.445374,-0.445374,-0.445374,-0.169314,-0.947563,1.742979,0.025614,0.0,0.03843,-0.038411,0.0,-0.166204,-0.572962,1.270906,0.0,0.0,0.0,0.151861,0.0,-0.266554,-0.456074,-0.03817,0.082117,0.023165,0.023165,0.013352,0.023165,-0.484812,-1.006898,0.249509
4469,0.064685,0.103597,0.227508,0.329158,0.114953,-0.16569,0.634336,0.180768,0.062444,0.081459,0.197191,0.309813,0.099776,0.190244,1.25467,0.620638,0.059413,0.11457,0.234219,0.332019,0.113042,-0.542063,0.004527,-0.161637,0.037705,0.074741,0.169462,0.217707,0.101414,-0.543013,0.705217,0.916301,0.046113,0.104245,0.253989,0.274934,0.16177,-0.650957,0.616983,0.348417,0.046189,0.063852,0.152984,0.239201,0.079496,0.202035,0.751017,0.389836,0.043863,0.07014,...,-1.572172,-0.856085,0.0,0.0,0.0,0.0,0.0,-0.076996,-0.173278,0.299157,0.041146,0.041146,0.051083,0.130695,0.051083,-0.048252,0.250352,0.070978,0.0,0.0,0.17737,0.17737,0.0,-0.092301,-0.138382,-0.763942,0.025519,0.051061,0.076606,0.153329,0.063827,0.114953,0.102164,-0.558237,0.0,0.0,0.0,0.133799,0.0,-0.320215,0.311326,-0.038095,0.02915,0.126452,0.072916,0.253216,0.126452,-0.309897,0.704364,0.921964
2072,0.028816,0.037782,0.063957,0.06881,0.031489,-0.073543,-0.070639,0.010415,0.037013,0.03659,0.071374,0.069656,0.021594,-0.284909,-0.528243,-0.420358,0.007588,0.02635,0.03868,0.061859,0.021613,-0.09272,0.044182,0.105923,0.025384,0.036884,0.081783,0.118409,0.049256,-0.40947,-0.45557,-0.056886,0.033666,0.049875,0.093471,0.131129,0.059328,-0.105637,-0.201593,-0.075615,0.024512,0.032452,0.048682,0.058355,0.02313,0.083925,-0.08756,0.115038,0.020501,0.030113,...,1.412603,2.067978,-0.004964,-0.019983,-0.054967,0.009995,-0.044984,0.401732,0.442121,-0.980643,0.033513,0.033513,0.033513,0.014328,0.033513,-0.030259,0.050478,-0.411974,0.0,0.0,0.0,0.0,0.0,0.724134,-0.621127,-1.588023,0.0,0.012871,0.012871,0.012871,0.002062,-0.077138,-0.358973,-1.320468,0.0,0.0,0.0,0.0,0.0,-0.146917,-0.146917,-0.266307,-0.007712,-0.007712,-0.072293,-0.072293,-0.007712,-0.627875,-0.38916,-0.072293
2289,-0.018629,0.007596,0.003184,-0.086959,-0.004908,-1.011993,-1.411791,-1.322351,-0.027902,-0.010813,-0.050405,-0.155028,-0.037992,-1.305759,-1.732802,-2.014555,-0.011767,0.003609,0.027538,-0.037806,0.008638,-0.771247,-0.940202,-0.74419,0.009574,0.015526,-0.005088,-0.060555,0.006747,-1.673438,-2.52278,-2.865026,0.013835,0.0208,0.030829,-0.014745,0.017785,-1.256251,-2.190536,-2.321825,-0.020196,-0.008356,-0.029943,-0.09602,-0.023332,-0.801427,-1.018201,-0.998748,-0.009714,0.007771,...,-2.260036,-1.142937,0.0,0.0,0.0,0.0,0.0,0.310028,0.310028,0.778725,0.020691,0.030938,0.0295,-0.020269,0.020488,-0.590183,-1.572061,-1.413079,0.0,0.07753,0.07753,0.07753,0.07753,-0.625702,-1.859357,-2.303804,-0.026218,0.015465,0.0,-0.117853,-0.013111,-0.62533,-1.978927,-2.20513,0.0,0.0,0.0,0.0,0.0,-0.841756,-1.541382,-1.289583,0.005428,0.005428,0.005428,0.005428,0.005428,-1.514869,-2.331149,-2.622079
5597,-0.010567,0.030516,0.010334,-0.069855,0.044942,0.19833,0.103638,1.293707,0.024348,0.020822,-0.0301,-0.157455,0.006101,0.504644,0.658486,1.921875,-0.027882,0.050578,0.047473,0.012537,0.086082,-0.234938,-0.517468,0.716064,0.035878,0.026805,-0.013421,-0.166561,0.019636,-0.160278,-0.395706,1.846172,0.031496,0.043462,3.5e-05,-0.092758,0.031227,0.050496,-0.351407,1.844071,-0.00746,-0.001017,-0.0227,-0.08632,-0.002713,0.001687,0.12585,0.564889,-0.007042,0.0205,...,-1.591451,1.256359,0.0,0.0,0.0,0.0,0.0,-0.105936,-1.227709,-1.644937,0.009991,0.019984,0.030772,-0.05989,0.019984,0.019984,-0.318598,0.724347,0.0,-0.347064,0.004166,-0.381362,-0.347064,-0.530534,-0.530534,-0.193659,0.051541,0.025769,0.051541,-0.025746,0.061082,-0.295321,-0.88077,-0.448716,0.0,0.0,-0.148397,-0.132223,-0.148397,0.273656,0.143413,1.255813,0.0,0.028993,-0.067587,-0.28902,0.019324,-0.737041,-0.533372,1.910203


{'alpha': 0.010783866798131723, 'l1_ratio': 0.29629832454786775}
In sample, ElasticNet: , 0.10493560400973057
Out of sample, ElasticNet: , 0.03564688752870182
Modeling time: , 13.85 sec
Total time: , 31.20 sec


In [9]:
feature_names = X_test.columns
feature_importance = pd.DataFrame(list(zip(feature_names, np.abs(enmt.coef_))),
                                 columns=['col_name','feature_importance_vals'])
feature_importance.sort_values(by=['feature_importance_vals'],
                              ascending=False, inplace=True)

feature_importance.head(10)

Unnamed: 0,col_name,feature_importance_vals
135,s_IWC_ret_close2,0.001244
87,s_Oil_ret_close2,0.000536
38,s_Russell_ret_close1,0.000508
100,s_EMXC_ret_30prd,0.0
99,s_EMXC_ret_10prd,0.0
98,s_EMXC_ret_4prd,0.0
97,s_EMXC_ret_2prd,0.0
96,s_EMXC_ret_1prd,0.0
69,s_Tnotes_ret_open,0.0
94,s_Gold_ret_close1,0.0


Experimenting with a distribution for l1-ratio hyperparameter.
Takeaway: beta generalized (log)uniform distribution and allows you to finutune it as needed.

In [10]:
# N = 500000
# draws = scipy.stats.beta.rvs(a=1,b=3,size=N)
# sns.histplot(x=draws, bins=100)
# plt.show()
# draws = scipy.stats.beta.rvs(a=0.8,b=1.4,size=N)
# sns.histplot(x=draws, bins=100)
# plt.show()
# # draws = scipy.stats.beta.rvs(a=0.8,b=3,size=N)
# # sns.histplot(x=draws)
# # plt.show()
# draws = loguniform.rvs(0.01,1,size=N)
# sns.histplot(x=draws, bins=100)
# plt.show()

In [11]:
# N = 500000
# u = scipy.stats.uniform.rvs(0.01,1,size=10000)
# lu1 = loguniform.rvs(0.01,1,size=N)
# lu2 = scipy.stats.beta.rvs(a=0.8,b=1.4,loc=0.01,size=N)
# # sns.histplot(x=lu1)

In [12]:
# from matplotlib import pyplot

# bins = np.linspace(0, 1, 100)

# pyplot.hist(lu1, bins, alpha=0.5, label='Loguniform1')
# pyplot.hist(lu2, bins, alpha=0.5, label='Beta')
# pyplot.legend(loc='upper right')
# pyplot.show()

In [13]:
# artifact_filename_en = 'EN_model.pkl'

# os.chdir('/home/jupyter/project_repos/spg_stocks/stocks-app')
# joblib.dump(enmt, artifact_filename_en)

# model_bucket = 'gs://mpg3-stocks/artifacts'
# storage_path = os.path.join(model_bucket, artifact_filename_en)
# blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
# blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_en)

# file = open(artifact_filename_en, "rb")
# trained_model = joblib.load(file)

# prediction = trained_model.predict([list(X_test.iloc[1,:])])
# print('EN model', prediction)

In [14]:
prediction = enmrs.predict([list(X_test.iloc[1,:])])
print('EN model', prediction)

EN model [0.01529305]
