### This is a modeling script for a project to predict index returns at 2-minute frequency

In [1]:
import numpy as np
import pandas as pd
import os, time, warnings, random, shap, requests, optuna, datetime, joblib
import seaborn as sns
import matplotlib.pyplot as plt
import functools as ft
import yfinance as yf

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from google.cloud import bigquery, storage

pd.set_option('display.max_columns', 100)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore') 

project_name = 'GCP-pp2'
project_id = 'polished-vault-379315'
regionn = 'us-west1'

time0 = time.time()

# os.chdir('/home/jupyter/projects_gcp_cpu/spx/src')
os.getcwd()

'/home/jupyter/project_repos/spg_stocks'

In [2]:
tickerStrings = ['^GSPC', '^IXIC', '^RUT', 'EEM', 'EMXC', 'EEMA', 'VTHR']
df_list = list()
for ticker in tickerStrings:
    data = yf.download(ticker, 
                       group_by="Ticker", 
                       period='60d', 
                       interval='2m', 
                       prepost=False, 
                       auto_adjust=True)
    data['ticker'] = ticker  
    df_list.append(data)

df = pd.concat(df_list)
df = df[['Close', 'ticker']]
df.replace({'^GSPC':'Spx', '^IXIC':'Nasdaq', '^RUT':'Russel'}, inplace=True)
df = (df.pivot_table(index=['Datetime'], columns='ticker', values='Close'))
df.columns = ['EEM', 'EEMA', 'EMXC', 'Nasdaq', 'Russel', 'Spx', 'VTHR']

df['time'] = df.index.time
df['date'] = df.index.date

df = df.fillna(method='ffill')
dayclose = df[df.time==datetime.time(15, 58, 0)]
dayopen = df[df.time==datetime.time(9, 30, 0)]
dayopen.reset_index(drop=True, inplace=True)
dayclose.reset_index(drop=True, inplace=True)
dayclose.sort_values(by='date')
display(df, dayopen.head(), dayclose.head())
df0 = df.copy()

# df['hour'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.hour
# df['minute'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.minute

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-02-24 09:30:00-05:00,38.365002,64.500000,48.200001,11401.920898,1880.470093,3959.310059,178.720001,09:30:00,2023-02-24
2023-02-24 09:32:00-05:00,38.415001,64.250603,48.228001,11411.784180,1882.089600,3963.639893,178.720001,09:32:00,2023-02-24
2023-02-24 09:34:00-05:00,38.419998,64.250603,48.228001,11413.026367,1882.927979,3963.570068,178.720001,09:34:00,2023-02-24
2023-02-24 09:36:00-05:00,38.439999,64.250603,48.220001,11414.869141,1882.817139,3965.540039,178.720001,09:36:00,2023-02-24
2023-02-24 09:38:00-05:00,38.480000,64.500000,48.270000,11431.043945,1884.397095,3968.909912,178.720001,09:38:00,2023-02-24
...,...,...,...,...,...,...,...,...,...
2023-04-14 12:20:00-04:00,39.599998,66.501404,50.259998,12063.862305,1776.204346,4123.959961,183.360596,12:20:00,2023-04-14
2023-04-14 12:22:00-04:00,39.610001,66.501404,50.259998,12066.233398,1776.204346,4124.529785,183.360596,12:22:00,2023-04-14
2023-04-14 12:24:00-04:00,39.590000,66.501404,50.270000,12064.986328,1776.204346,4124.439941,183.388794,12:24:00,2023-04-14
2023-04-14 12:26:00-04:00,39.590000,66.501404,50.270000,12063.056641,1776.204346,4123.890137,183.388794,12:26:00,2023-04-14


Unnamed: 0,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date
0,38.365002,64.5,48.200001,11401.920898,1880.470093,3959.310059,178.720001,09:30:00,2023-02-24
1,38.48,64.43,48.279999,11509.564453,1904.872437,4001.340088,180.699997,09:30:00,2023-02-27
2,38.310001,64.529999,48.189999,11451.158203,1898.162964,3975.360107,179.770004,09:30:00,2023-02-28
3,39.189999,65.769997,48.720001,11454.742188,1896.210449,3962.689941,179.119995,09:30:00,2023-03-01
4,38.849998,65.220001,48.43,11283.30957,1881.650146,3931.879883,178.520004,09:30:00,2023-03-02


Unnamed: 0,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date
0,38.32,64.43,48.16,11396.826172,1890.438232,3970.419922,179.199997,15:58:00,2023-02-24
1,38.485001,64.529999,48.240002,11467.080078,1896.615479,3982.22998,179.389999,15:58:00,2023-02-27
2,38.200001,64.239998,47.970001,11454.893555,1897.916626,3969.280029,179.089996,15:58:00,2023-02-28
3,39.060001,65.68,48.75,11380.023438,1898.395264,3951.629883,178.520004,15:58:00,2023-03-01
4,39.220001,66.220001,48.68,11461.352539,1902.360107,3981.290039,179.860001,15:58:00,2023-03-02


In [3]:
### now i wanna do feature engineering for all assets 

asset_list = ['Spx', 'Nasdaq', 'Russel', 'EMXC', 'EEMA', 'EEM', 'VTHR']

for asset in asset_list:
    
    df[asset + '_ret'] = 100*(df[asset]/df[asset].shift(1)-1)
    df['s_' + asset + '_ret_1prd'] = (100*(df[asset]/df[asset].shift(1)-1)).shift(1)
    df['s_' + asset + '_ret_2prd'] = (100*(df[asset]/df[asset].shift(2)-1)).shift(1)
    df['s_' + asset + '_ret_4prd'] = (100*(df[asset]/df[asset].shift(4)-1)).shift(1)
    print(f'Data shape: {df.shape}')

    df.loc[df.time < datetime.time(9, 32, 0), 's_' + asset + '_1prd'] = np.nan
    df.loc[df.time < datetime.time(9, 33, 0), 's_' + asset + '_2prd'] = np.nan
    df.loc[df.time < datetime.time(9, 35, 0), 's_' + asset + '_4prd'] = np.nan

    dayopen.rename(columns={asset:asset+'_open'}, inplace=True)
    # dayopen.head()
    dayclose.rename(columns={asset:asset+'_close'}, inplace=True)
    dayclose_l1 = dayclose.copy()
    dayclose_l2 = dayclose.copy()
    dayclose_l1[asset+'_close_l1'] = dayclose_l1[asset+'_close'].shift(1)
    dayclose_l2[asset+'_close_l2'] = dayclose_l2[asset+'_close'].shift(2)
    
    df = pd.merge(df, dayopen[['date', asset + '_open']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l1[['date', asset + '_close_l1']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l2[['date', asset + '_close_l2']], on=['date'], how='left')

    df['s_' + asset + '_ret_open'] = (100*(df[asset]/df[asset + '_open']-1)).shift(1)
    df['s_' + asset + '_ret_close1'] = (100*(df[asset]/df[asset + '_close_l1']-1)).shift(1)
    df['s_' + asset + '_ret_close2'] = (100*(df[asset]/df[asset + '_close_l2']-1)).shift(1)

    cols_todrop = [x for x in list(df.columns) if asset in x and 'ret' not in x]
    df.drop(columns = cols_todrop, inplace=True)

print(f'Time to do feature engineering: {time.time() - time0}')
display(df.head())

Data shape: (6720, 13)
Data shape: (6720, 19)
Data shape: (6720, 25)
Data shape: (6720, 31)
Data shape: (6720, 37)
Data shape: (6720, 43)
Data shape: (6720, 49)
Time to do feature engineering: 3.859210252761841


Unnamed: 0,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,Russel_ret,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,EMXC_ret,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,EEMA_ret,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,EEM_ret,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd,s_EEM_ret_open,s_EEM_ret_close1,s_EEM_ret_close2,VTHR_ret,s_VTHR_ret_1prd,s_VTHR_ret_2prd,s_VTHR_ret_4prd,s_VTHR_ret_open,s_VTHR_ret_close1,s_VTHR_ret_close2
0,09:30:00,2023-02-24,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,09:32:00,2023-02-24,0.109358,,,,0.0,,,0.086505,,,,0.0,,,0.086122,,,,0.0,,,0.058091,,,,0.0,,,-0.386662,,,,0.0,,,0.130325,,,,0.0,,,0.0,,,,0.0,,
2,09:34:00,2023-02-24,-0.001762,0.109358,,,0.109358,,,0.010885,0.086505,,,0.086505,,,0.044545,0.086122,,,0.086122,,,0.0,0.058091,,,0.058091,,,0.0,-0.386662,,,-0.386662,,,0.013009,0.130325,,,0.130325,,,0.0,0.0,,,0.0,,
3,09:36:00,2023-02-24,0.049702,-0.001762,0.107595,,0.107595,,,0.016146,0.010885,0.0974,,0.0974,,,-0.005887,0.044545,0.130706,,0.130706,,,-0.016587,0.0,0.058091,,0.058091,,,0.0,0.0,-0.386662,,-0.386662,,,0.052057,0.013009,0.143351,,0.143351,,,0.0,0.0,0.0,,0.0,,
4,09:38:00,2023-02-24,0.084979,0.049702,0.047939,,0.15735,,,0.141699,0.016146,0.027033,,0.113562,,,0.083914,-0.005887,0.038656,,0.124812,,,0.10369,-0.016587,-0.016587,,0.041495,,,0.388163,0.0,0.0,,-0.386662,,,0.104061,0.052057,0.065073,,0.195483,,,0.0,0.0,0.0,,0.0,,


In [4]:
### do modeling ###

t_df = df.copy()
t_df.rename(columns={'VTHR_ret':'target'}, inplace=True)
t_df.drop(columns = ['time', 'date', 'Spx_ret', 'Nasdaq_ret', 'Russel_ret', 'EEMA_ret', 'EEM_ret', 'EMXC_ret', 'VXUS_ret'], 
          inplace=True,
          errors = 'ignore')

t_df = t_df.dropna()
display(t_df.info())

y = t_df.pop('target')
X = t_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=int(0.2*X.shape[0]))
display(X_train.shape, X_test.shape, y_train.shape, X_train.head())
time1 = time.time()

enm = ElasticNet()
parameters = {'alpha':[0.0005, 0.001, 0.002, 0.003, 0.005], 
              'l1_ratio':[0, 0.02, 0.05, 0.1, 0.25, 0.5, 1]}
enmgs = GridSearchCV(enm, parameters, scoring='r2', cv=4)
enmgs.fit(X_train, y_train)
print(enmgs.best_params_)
enmt = ElasticNet(**enmgs.best_params_)
enmt.fit(X_train, y_train)

print(f'In sample, ElasticNet: , {r2_score(y_train, enmgs.predict(X_train))}')
print(f'Out of sample, ElasticNet: , {r2_score(y_test, enmgs.predict(X_test))}')

print(f'Total time: , {time.time()-time0}')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6240 entries, 391 to 6630
Data columns (total 43 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   s_Spx_ret_1prd       6240 non-null   float64
 1   s_Spx_ret_2prd       6240 non-null   float64
 2   s_Spx_ret_4prd       6240 non-null   float64
 3   s_Spx_ret_open       6240 non-null   float64
 4   s_Spx_ret_close1     6240 non-null   float64
 5   s_Spx_ret_close2     6240 non-null   float64
 6   s_Nasdaq_ret_1prd    6240 non-null   float64
 7   s_Nasdaq_ret_2prd    6240 non-null   float64
 8   s_Nasdaq_ret_4prd    6240 non-null   float64
 9   s_Nasdaq_ret_open    6240 non-null   float64
 10  s_Nasdaq_ret_close1  6240 non-null   float64
 11  s_Nasdaq_ret_close2  6240 non-null   float64
 12  s_Russel_ret_1prd    6240 non-null   float64
 13  s_Russel_ret_2prd    6240 non-null   float64
 14  s_Russel_ret_4prd    6240 non-null   float64
 15  s_Russel_ret_open    6240 non-null  

None

(4992, 42)

(1248, 42)

(4992,)

Unnamed: 0,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd,s_EEM_ret_open,s_EEM_ret_close1,s_EEM_ret_close2,s_VTHR_ret_1prd,s_VTHR_ret_2prd,s_VTHR_ret_4prd,s_VTHR_ret_open,s_VTHR_ret_close1,s_VTHR_ret_close2
1675,0.037745,-0.052295,-0.01861,-0.397604,-0.280932,-1.801919,0.034269,-0.061604,-0.014838,-0.320787,-0.069221,-1.319435,0.006015,-0.033929,-0.035365,-0.604899,-0.517766,-1.632872,-0.081701,-0.081701,-0.112303,0.0,0.493011,-0.871328,0.0,0.0,-0.122615,-0.16852,-0.458288,-1.645277,0.0,-0.051528,-0.025775,-0.064404,0.141963,-1.560522,0.0,0.0,0.058619,-0.400134,-0.31149,-1.791881
6440,0.065601,0.029985,-0.024603,-0.024603,0.264643,-0.14528,0.13191,0.141866,0.220554,0.220554,0.936331,0.056836,0.112956,0.055606,0.081153,0.081153,0.437612,-0.296394,0.099048,0.099048,0.039597,0.039597,0.928793,1.140912,0.0,0.0,0.0,0.0,0.0,-1.157542,0.137957,0.175658,0.150521,0.150521,1.371247,0.57949,0.0,0.0,0.0,0.0,0.427842,-0.032759
3278,0.025632,0.116595,0.085834,0.49568,0.635525,-0.463414,0.034967,0.126618,0.087253,0.361782,0.175955,-0.558097,0.047579,0.052407,0.00432,0.310858,1.057686,-1.555894,0.010509,0.073591,0.052557,0.284455,0.623683,-0.157325,0.0,0.0,0.0,0.210192,-0.244947,-0.35419,0.023886,0.050691,0.050427,0.53067,0.236641,-0.398936,0.0,0.0,0.0,0.421439,0.63349,-0.631161
2791,-0.042235,-0.075034,0.134328,1.591687,0.910851,0.225019,-0.006875,0.017203,0.249302,2.008042,1.481795,1.529699,-0.038252,-0.095257,0.160733,1.951517,0.634484,-1.168566,-0.042249,-0.084461,0.063436,0.78807,0.852518,-1.086957,0.0,0.0,0.0,0.627682,0.56376,-0.853293,-0.093221,-0.093221,0.053339,0.860446,0.697982,-1.133369,0.0,0.0,0.0,1.28296,0.91737,-0.122629
2697,0.112847,0.146908,0.050982,0.577997,-0.868946,0.780691,0.097121,0.11795,0.020164,0.993763,-0.153207,2.006416,0.11041,-0.001374,-0.138993,0.462004,-2.036098,-0.172972,0.085409,0.04269,0.0,0.321057,-2.027595,-2.129882,0.0,-0.183947,-0.183947,-0.341867,-1.870987,-2.070314,0.094136,0.121071,0.094136,0.282941,-1.910908,-1.820337,0.0,0.0,-0.045952,0.560532,-1.023776,0.863621


{'alpha': 0.002, 'l1_ratio': 0.05}
In sample, ElasticNet: , 0.07655986778121282
Out of sample, ElasticNet: , 0.052625022878438066
Total time: , 34.11257076263428


In [5]:
# feature_names = X_test.columns
# feature_importance = pd.DataFrame(list(zip(feature_names, np.abs(enmt.coef_))),
#                                  columns=['col_name','feature_importance_vals'])
# feature_importance.sort_values(by=['feature_importance_vals'],
#                               ascending=False, inplace=True)

# feature_importance.head(10)

In [6]:
artifact_filename_en = 'EN_model.pkl'

os.chdir('/home/jupyter/project_repos/spg_stocks/stocks-app')
joblib.dump(enmt, artifact_filename_en)

model_bucket = 'gs://mpg3-stocks/artifacts'
storage_path = os.path.join(model_bucket, artifact_filename_en)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_en)

file = open(artifact_filename_en, "rb")
trained_model = joblib.load(file)
prediction = trained_model.predict([list(X_test.iloc[1,:])])
print('EN model', prediction)


EN model [0.01598725]
