### This is a modeling script for a project to predict index returns at 2-minute frequency

In [37]:
import numpy as np
import pandas as pd
import os, time, warnings, random, shap, requests, optuna, datetime, joblib
import seaborn as sns
import matplotlib.pyplot as plt
import functools as ft
import yfinance as yf

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from google.cloud import bigquery, storage

pd.set_option('display.max_columns', 100)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore') 

project_name = 'GCP-pp3'
project_id = 'polished-vault-379315'
regionn = 'us-west1'

time0 = time.time()

# os.chdir('/home/jupyter/projects_gcp_cpu/spx/src')
os.getcwd()

'/home/jupyter/project_repos/spg_stocks'

In [47]:
tickerStrings = ['^GSPC', 
                 '^IXIC', 
                 '^DJI',  
                 '^RUT', 
                 '^RUMIC', 
                 'URTH',
                 '^SPG100',
                 '^SPG1200',
                 'GSG',
                 '^SPG100',
                 '^SPG1200',
                 '^BKTAS',
                 'FEZ',
                 'IEUR',
                 '^SPGSCL',
                 '^SPGSGCP',
                 'SHV',
                 'SHY',
                 'IEI',
                 'IEF',
                 'TLT',
                 'EEM', 
                 'EMXC', 
                 'EEMA', 
                 'VTHR']
df_list = list()
for ticker in tickerStrings:
    data = yf.download(ticker, 
                       group_by="Ticker", 
                       period='60d', 
                       interval='2m', 
                       prepost=False, 
                       auto_adjust=True)
    data['ticker'] = ticker  
    df_list.append(data)

df = pd.concat(df_list)
df = df[['Close', 'ticker']]
df.replace({'^GSPC':'Spx', 
            '^IXIC':'Nasdaq', 
            '^DJI':'DJI',
            '^RUT':'Russell',
            '^RUMIC':'Russellmicro',
            'URTH':'MSCIw',
            'GSG':'GSCI',
            '^SPG100':'SPG100',
            '^SPG1200':'SPG1200',            
            '^BKTAS':'SPAsia50',
            'FEZ':'Stoxx50',
            'IEUR':'EU1400',
            '^SPGSCL':'Oil',
            '^SPGSGCP':'Gold',
            'IEI':'Tnotes',
            'IEF':'Tnotes_long',
            'SHV':'Tbills',
            'SHY':'Tnotes_short',
            'TLT':'Tbonds',
           }, inplace=True)
df = (df.pivot_table(index=['Datetime'], columns='ticker', values='Close'))
df0 = df.copy()
df

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

ticker,DJI,EEM,EEMA,EMXC,EU1400,GSCI,Gold,MSCIw,Nasdaq,Oil,Russell,Russellmicro,SPAsia50,SPG100,SPG1200,Spx,Stoxx50,Tbills,Tbonds,Tnotes,Tnotes_long,Tnotes_short,VTHR
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2023-04-10 09:30:00-04:00,33354.500000,39.240002,66.309998,49.342701,52.549999,20.610001,141.680695,117.080002,11961.982422,441.214386,1746.987427,635.994873,1560.880005,2842.120117,3069.629883,4075.560059,44.840000,110.195000,107.750000,117.970001,99.750000,82.114998,181.210007
2023-04-10 09:32:00-04:00,33356.960938,39.260101,,49.314999,52.500000,20.620001,141.708893,116.962898,11963.609375,441.707703,1746.566772,635.821411,1559.939941,2842.129883,3069.540039,4075.899902,44.850498,110.199997,107.620003,117.940002,99.699997,82.110001,
2023-04-10 09:34:00-04:00,33351.199219,39.249901,,49.335999,52.490002,20.600000,141.567795,116.849998,11944.889648,440.819794,1746.223877,635.638306,1559.589966,2839.409912,3068.040039,4073.199951,44.849998,110.190002,107.690002,117.949997,99.706200,82.119499,
2023-04-10 09:36:00-04:00,33365.070312,39.259998,66.099998,,52.520000,20.610001,141.673599,,11945.699219,441.312988,1746.954956,635.277405,1558.819946,2839.879883,3069.280029,4074.330078,44.869999,110.190102,107.635002,117.940002,99.695000,82.114998,
2023-04-10 09:38:00-04:00,33394.238281,39.299999,66.334999,49.306000,,,141.758301,116.895401,11946.828125,441.707703,1748.898926,635.660278,1559.500000,2840.189941,3070.219971,4076.219971,44.900002,110.197998,107.609802,117.940002,99.684998,82.119301,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-19 15:50:00-04:00,33440.691406,38.970001,65.044197,50.348999,54.020000,,,120.410004,12661.015625,,1773.239746,657.748535,1530.079956,2974.090088,3146.139893,4193.370117,46.215199,110.290001,101.105003,116.830002,97.599998,81.775002,
2023-05-19 15:52:00-04:00,33444.519531,38.980000,65.376099,50.360001,54.040001,19.230000,,120.440002,12660.219727,,1773.284180,657.668823,1530.449951,2974.189941,3146.330078,4193.790039,46.229900,110.294998,101.109901,116.845001,97.605003,81.779800,
2023-05-19 15:54:00-04:00,33449.621094,38.980000,65.250000,50.360001,54.049999,19.240000,,,12662.286133,,1773.896973,657.872375,1530.599976,2974.649902,3147.010010,4194.790039,46.235001,110.290001,101.160004,116.860001,97.629997,81.790001,
2023-05-19 15:56:00-04:00,33455.609375,38.990002,65.245003,50.369999,54.044998,19.245001,,120.470001,12664.101562,,1773.907471,657.745056,1530.609985,2975.629883,3147.449951,4195.180176,46.224998,110.290001,101.129997,116.860001,97.620003,81.785004,


In [48]:
df_list

[                                  Open         High          Low        Close    Volume ticker
 Datetime                                                                                      
 2023-04-10 09:30:00-04:00  4085.199951  4085.199951  4075.560059  4075.560059   9164797  ^GSPC
 2023-04-10 09:32:00-04:00  4075.320068  4075.899902  4073.909912  4075.899902  14935066  ^GSPC
 2023-04-10 09:34:00-04:00  4075.889893  4076.389893  4072.550049  4073.199951  14821251  ^GSPC
 2023-04-10 09:36:00-04:00  4073.110107  4075.030029  4072.919922  4074.330078  15626372  ^GSPC
 2023-04-10 09:38:00-04:00  4074.239990  4076.250000  4072.969971  4076.219971  14051856  ^GSPC
 ...                                ...          ...          ...          ...       ...    ...
 2023-05-19 15:50:00-04:00  4191.680176  4193.580078  4189.290039  4193.370117  25842000  ^GSPC
 2023-05-19 15:52:00-04:00  4193.259766  4193.970215  4192.680176  4193.790039  22933000  ^GSPC
 2023-05-19 15:54:00-04:00  4193.839844 

In [49]:
# df = df0.copy()
# display(df.index)
# df.index = pd.to_datetime(df.index, utc=True, infer_datetime_format=True)
# # df.index = df.index.tz_localize('UTC')
# display(df.index)

In [50]:
df['time'] = df.index.time
df['date'] = df.index.date

In [51]:
df = df.fillna(method='ffill')
dayclose = df[df.time==datetime.time(15, 58, 0)]
dayopen = df[df.time==datetime.time(9, 30, 0)]
dayopen.reset_index(drop=True, inplace=True)
dayclose.reset_index(drop=True, inplace=True)
dayclose.sort_values(by='date')
display(df, dayopen.head(), dayclose.head())
df.head()
df0 = df.copy()

# df['hour'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.hour
# df['minute'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.minute

ticker,DJI,EEM,EEMA,EMXC,EU1400,GSCI,Gold,MSCIw,Nasdaq,Oil,Russell,Russellmicro,SPAsia50,SPG100,SPG1200,Spx,Stoxx50,Tbills,Tbonds,Tnotes,Tnotes_long,Tnotes_short,VTHR,time,date
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
2023-04-10 09:30:00-04:00,33354.500000,39.240002,66.309998,49.342701,52.549999,20.610001,141.680695,117.080002,11961.982422,441.214386,1746.987427,635.994873,1560.880005,2842.120117,3069.629883,4075.560059,44.840000,110.195000,107.750000,117.970001,99.750000,82.114998,181.210007,09:30:00,2023-04-10
2023-04-10 09:32:00-04:00,33356.960938,39.260101,66.309998,49.314999,52.500000,20.620001,141.708893,116.962898,11963.609375,441.707703,1746.566772,635.821411,1559.939941,2842.129883,3069.540039,4075.899902,44.850498,110.199997,107.620003,117.940002,99.699997,82.110001,181.210007,09:32:00,2023-04-10
2023-04-10 09:34:00-04:00,33351.199219,39.249901,66.309998,49.335999,52.490002,20.600000,141.567795,116.849998,11944.889648,440.819794,1746.223877,635.638306,1559.589966,2839.409912,3068.040039,4073.199951,44.849998,110.190002,107.690002,117.949997,99.706200,82.119499,181.210007,09:34:00,2023-04-10
2023-04-10 09:36:00-04:00,33365.070312,39.259998,66.099998,49.335999,52.520000,20.610001,141.673599,116.849998,11945.699219,441.312988,1746.954956,635.277405,1558.819946,2839.879883,3069.280029,4074.330078,44.869999,110.190102,107.635002,117.940002,99.695000,82.114998,181.210007,09:36:00,2023-04-10
2023-04-10 09:38:00-04:00,33394.238281,39.299999,66.334999,49.306000,52.520000,20.610001,141.758301,116.895401,11946.828125,441.707703,1748.898926,635.660278,1559.500000,2840.189941,3070.219971,4076.219971,44.900002,110.197998,107.609802,117.940002,99.684998,82.119301,181.210007,09:38:00,2023-04-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-19 15:50:00-04:00,33440.691406,38.970001,65.044197,50.348999,54.020000,19.230000,139.812302,120.410004,12661.015625,392.907990,1773.239746,657.748535,1530.079956,2974.090088,3146.139893,4193.370117,46.215199,110.290001,101.105003,116.830002,97.599998,81.775002,186.210098,15:50:00,2023-05-19
2023-05-19 15:52:00-04:00,33444.519531,38.980000,65.376099,50.360001,54.040001,19.230000,139.812302,120.440002,12660.219727,392.907990,1773.284180,657.668823,1530.449951,2974.189941,3146.330078,4193.790039,46.229900,110.294998,101.109901,116.845001,97.605003,81.779800,186.210098,15:52:00,2023-05-19
2023-05-19 15:54:00-04:00,33449.621094,38.980000,65.250000,50.360001,54.049999,19.240000,139.812302,120.440002,12662.286133,392.907990,1773.896973,657.872375,1530.599976,2974.649902,3147.010010,4194.790039,46.235001,110.290001,101.160004,116.860001,97.629997,81.790001,186.210098,15:54:00,2023-05-19
2023-05-19 15:56:00-04:00,33455.609375,38.990002,65.245003,50.369999,54.044998,19.245001,139.812302,120.470001,12664.101562,392.907990,1773.907471,657.745056,1530.609985,2975.629883,3147.449951,4195.180176,46.224998,110.290001,101.129997,116.860001,97.620003,81.785004,186.210098,15:56:00,2023-05-19


ticker,DJI,EEM,EEMA,EMXC,EU1400,GSCI,Gold,MSCIw,Nasdaq,Oil,Russell,Russellmicro,SPAsia50,SPG100,SPG1200,Spx,Stoxx50,Tbills,Tbonds,Tnotes,Tnotes_long,Tnotes_short,VTHR,time,date
0,33354.5,39.240002,66.309998,49.342701,52.549999,20.610001,141.680695,117.080002,11961.982422,441.214386,1746.987427,635.994873,1560.880005,2842.120117,3069.629883,4075.560059,44.84,110.195,107.75,117.970001,99.75,82.114998,181.210007,09:30:00,2023-04-10
1,33608.109375,39.735001,66.629997,49.900002,53.029999,20.620001,142.252197,117.949997,12068.006836,438.539795,1780.577637,646.037842,1576.640015,2850.26001,3097.469971,4109.399902,45.345001,110.209,107.010002,117.860001,99.535004,82.089996,182.936096,09:30:00,2023-04-11
2,33840.03125,39.790001,66.449997,50.360001,53.639999,20.879999,143.4095,118.709999,12124.086914,450.180695,1802.421509,652.884338,1576.660034,2862.080078,3117.719971,4131.540039,45.730099,110.235001,107.125,118.260101,99.93,82.192902,183.910294,09:30:00,2023-04-12
3,33691.710938,39.860001,65.75,50.509998,54.040001,21.049999,145.011307,118.455002,12012.972656,454.083008,1779.912964,644.625244,1553.48999,2862.600098,3109.689941,4104.169922,46.110001,110.275002,107.118896,118.370003,99.910004,82.260101,183.089996,09:30:00,2023-04-13
4,34023.691406,39.75,66.5,50.41,54.310001,20.99,143.755295,119.360001,12119.128906,453.08551,1798.380127,653.873047,1557.449951,2894.860107,3133.290039,4142.700195,46.355,110.269997,105.280197,117.625,99.029999,82.029999,184.589996,09:30:00,2023-04-14


ticker,DJI,EEM,EEMA,EMXC,EU1400,GSCI,Gold,MSCIw,Nasdaq,Oil,Russell,Russellmicro,SPAsia50,SPG100,SPG1200,Spx,Stoxx50,Tbills,Tbonds,Tnotes,Tnotes_long,Tnotes_short,VTHR,time,date
0,33583.71875,39.43,66.489998,49.549999,52.889999,20.59,141.426697,117.809998,12080.524414,437.169708,1772.550171,644.302979,1567.560059,2850.310059,3087.100098,4108.660156,45.215,110.190002,106.779999,117.849998,99.489998,82.07,182.75,15:58:00,2023-04-10
1,33686.640625,39.689999,66.519997,49.959999,53.099998,20.790001,142.470993,117.959999,12032.628906,446.749908,1786.652954,648.819763,1573.910034,2844.800049,3097.97998,4109.129883,45.310001,110.199997,106.989998,117.790001,99.459999,82.040001,183.149994,15:58:00,2023-04-11
2,33650.96875,39.380001,65.75,50.064999,53.540001,21.01,142.887299,117.879997,11929.892578,455.760101,1773.595947,642.474854,1547.859985,2846.709961,3096.5,4092.330078,45.700001,110.220001,106.889999,118.129997,99.720001,82.150002,182.309998,15:58:00,2023-04-12
3,34027.089844,39.959999,66.82,50.630001,54.259998,20.93,145.032501,119.470001,12165.261719,449.764313,1796.401733,653.350952,1556.530029,2894.909912,3131.51001,4145.910156,46.330002,110.285004,106.059998,118.019997,99.419998,82.184998,184.679993,15:58:00,2023-04-13
4,33879.378906,39.700001,66.394997,50.400002,54.18,21.0,142.245193,119.160004,12124.474609,451.770203,1781.274902,646.06012,1550.800049,2894.73999,3127.199951,4137.470215,46.290001,110.269997,105.040001,117.544998,98.93,82.0,184.050003,15:58:00,2023-04-14


In [52]:
# stopped here

In [41]:
### now i wanna do feature engineering for all assets 

asset_list = ['Spx', 'Nasdaq', 'Russel', 'EMXC', 'EEMA', 'EEM', 'VTHR']

for asset in asset_list:
    
    df[asset + '_ret'] = 100*(df[asset]/df[asset].shift(1)-1)
    df['s_' + asset + '_ret_1prd'] = (100*(df[asset]/df[asset].shift(1)-1)).shift(1)
    df['s_' + asset + '_ret_2prd'] = (100*(df[asset]/df[asset].shift(2)-1)).shift(1)
    df['s_' + asset + '_ret_4prd'] = (100*(df[asset]/df[asset].shift(4)-1)).shift(1)
    print(f'Data shape: {df.shape}')

    df.loc[df.time < datetime.time(9, 32, 0), 's_' + asset + '_1prd'] = np.nan
    df.loc[df.time < datetime.time(9, 33, 0), 's_' + asset + '_2prd'] = np.nan
    df.loc[df.time < datetime.time(9, 35, 0), 's_' + asset + '_4prd'] = np.nan

    dayopen.rename(columns={asset:asset+'_open'}, inplace=True)
    # dayopen.head()
    dayclose.rename(columns={asset:asset+'_close'}, inplace=True)
    dayclose_l1 = dayclose.copy()
    dayclose_l2 = dayclose.copy()
    dayclose_l1[asset+'_close_l1'] = dayclose_l1[asset+'_close'].shift(1)
    dayclose_l2[asset+'_close_l2'] = dayclose_l2[asset+'_close'].shift(2)
    
    df = pd.merge(df, dayopen[['date', asset + '_open']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l1[['date', asset + '_close_l1']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l2[['date', asset + '_close_l2']], on=['date'], how='left')

    df['s_' + asset + '_ret_open'] = (100*(df[asset]/df[asset + '_open']-1)).shift(1)
    df['s_' + asset + '_ret_close1'] = (100*(df[asset]/df[asset + '_close_l1']-1)).shift(1)
    df['s_' + asset + '_ret_close2'] = (100*(df[asset]/df[asset + '_close_l2']-1)).shift(1)

    cols_todrop = [x for x in list(df.columns) if asset in x and 'ret' not in x]
    df.drop(columns = cols_todrop, inplace=True)

print(f'Time to do feature engineering: {time.time() - time0}')
display(df.head())

Data shape: (5, 13)
Data shape: (5, 19)
Data shape: (5, 25)
Data shape: (5, 31)
Data shape: (5, 37)
Data shape: (5, 43)
Data shape: (5, 49)
Time to do feature engineering: 1.8718185424804688


Unnamed: 0,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,Russel_ret,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,EMXC_ret,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,EEMA_ret,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,EEM_ret,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd,s_EEM_ret_open,s_EEM_ret_close1,s_EEM_ret_close2,VTHR_ret,s_VTHR_ret_1prd,s_VTHR_ret_2prd,s_VTHR_ret_4prd,s_VTHR_ret_open,s_VTHR_ret_close1,s_VTHR_ret_close2
0,15:50:00,2023-05-10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,15:52:00,2023-05-10,0.026358,,,,,,,0.016601,,,,,,,0.011902,,,,,,,0.0,,,,,,,0.0,,,,,,,0.0,,,,,,,,,,,,,
2,15:54:00,2023-05-10,-0.034812,0.026358,,,,,,-0.03983,0.016601,,,,,,0.02457,0.011902,,,,,,0.019873,0.0,,,,,,0.0,0.0,,,,,,-0.012763,0.0,,,,,,0.0,,,,,,
3,15:56:00,2023-05-10,-0.002172,-0.034812,-0.008463,,,,,-0.016447,-0.03983,-0.023236,,,,,-0.026271,0.02457,0.036475,,,,,-0.019869,0.019873,0.019873,,,,,0.0,0.0,0.0,,,,,-0.025548,-0.012763,-0.012763,,,,,0.0,0.0,,,,,
4,15:58:00,2023-05-10,0.021521,-0.002172,-0.036983,,,,,0.033581,-0.016447,-0.05627,,,,,0.02201,-0.026271,-0.001707,,,,,0.0,-0.019869,0.0,,,,,0.0,0.0,0.0,,,,,0.0511,-0.025548,-0.038308,,,,,0.0,0.0,0.0,,,,


In [42]:
### do modeling ###

t_df = df.copy()
t_df.rename(columns={'VTHR_ret':'target'}, inplace=True)
t_df.drop(columns = ['time', 'date', 'Spx_ret', 'Nasdaq_ret', 'Russel_ret', 'EEMA_ret', 'EEM_ret', 'EMXC_ret', 'VXUS_ret'], 
          inplace=True,
          errors = 'ignore')

t_df = t_df.dropna()
display(t_df.info())

y = t_df.pop('target')
X = t_df
print(f'Data preprocessng time: , {time.time()-time0:.2f} sec')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=int(0.2*X.shape[0]))
display(X_train.shape, X_test.shape, y_train.shape, X_train.head())
time1 = time.time()

enm = ElasticNet()
parameters = {'alpha':[0.0005, 0.001, 0.002, 0.003, 0.005], 
              'l1_ratio':[0, 0.02, 0.05, 0.1, 0.25, 0.5, 1]}
enmgs = GridSearchCV(enm, parameters, scoring='r2', cv=4)
enmgs.fit(X_train, y_train)
print(enmgs.best_params_)
enmt = ElasticNet(**enmgs.best_params_)
enmt.fit(X_train, y_train)

print(f'In sample, ElasticNet: , {r2_score(y_train, enmgs.predict(X_train))}')
print(f'Out of sample, ElasticNet: , {r2_score(y_test, enmgs.predict(X_test))}')

print(f'Total time: , {time.time()-time0:.2f} sec')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 43 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   s_Spx_ret_1prd       0 non-null      float64
 1   s_Spx_ret_2prd       0 non-null      float64
 2   s_Spx_ret_4prd       0 non-null      float64
 3   s_Spx_ret_open       0 non-null      float64
 4   s_Spx_ret_close1     0 non-null      float64
 5   s_Spx_ret_close2     0 non-null      float64
 6   s_Nasdaq_ret_1prd    0 non-null      float64
 7   s_Nasdaq_ret_2prd    0 non-null      float64
 8   s_Nasdaq_ret_4prd    0 non-null      float64
 9   s_Nasdaq_ret_open    0 non-null      float64
 10  s_Nasdaq_ret_close1  0 non-null      float64
 11  s_Nasdaq_ret_close2  0 non-null      float64
 12  s_Russel_ret_1prd    0 non-null      float64
 13  s_Russel_ret_2prd    0 non-null      float64
 14  s_Russel_ret_4prd    0 non-null      float64
 15  s_Russel_ret_open    0 non-null      float64
 16  s_

None

Data preprocessng time: , 1.97 sec


ValueError: test_size=0 should be either positive and smaller than the number of samples 0 or a float in the (0, 1) range

In [34]:
# feature_names = X_test.columns
# feature_importance = pd.DataFrame(list(zip(feature_names, np.abs(enmt.coef_))),
#                                  columns=['col_name','feature_importance_vals'])
# feature_importance.sort_values(by=['feature_importance_vals'],
#                               ascending=False, inplace=True)

# feature_importance.head(10)

In [6]:
artifact_filename_en = 'EN_model.pkl'

os.chdir('/home/jupyter/project_repos/spg_stocks/stocks-app')
joblib.dump(enmt, artifact_filename_en)

model_bucket = 'gs://mpg3-stocks/artifacts'
storage_path = os.path.join(model_bucket, artifact_filename_en)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_en)

file = open(artifact_filename_en, "rb")
trained_model = joblib.load(file)
prediction = trained_model.predict([list(X_test.iloc[1,:])])
print('EN model', prediction)


EN model [0.01598725]
