### This is a modeling script for a project to predict index returns at 2-minute frequency

In [39]:
import numpy as np
import pandas as pd
import os, time, warnings, random, shap, requests, optuna, datetime, joblib
import seaborn as sns
import matplotlib.pyplot as plt
import functools as ft
import yfinance as yf

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from google.cloud import bigquery, storage

pd.set_option('display.max_columns', 100)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore') 

project_name = 'GCP-pp2'
project_id = 'polished-vault-379315'
regionn = 'us-west1'

time0 = time.time()

# os.chdir('/home/jupyter/projects_gcp_cpu/spx/src')
os.getcwd()

'/home/jupyter/project_repos/spg_stocks'

In [40]:
tickerStrings = ['^GSPC', '^IXIC', '^RUT', 'EEM', 'EMXC', 'EEMA', 'VTHR']
df_list = list()
for ticker in tickerStrings:
    data = yf.download(ticker, 
                       group_by="Ticker", 
                       period='60d', 
                       interval='2m', 
                       prepost=False, 
                       auto_adjust=True)
    data['ticker'] = ticker  
    df_list.append(data)

df = pd.concat(df_list)
df = df[['Close', 'ticker']]
df.replace({'^GSPC':'Spx', '^IXIC':'Nasdaq', '^RUT':'Russel'}, inplace=True)
df = (df.pivot_table(index=['Datetime'], columns='ticker', values='Close'))
df.columns = ['EEM', 'EEMA', 'EMXC', 'Nasdaq', 'Russel', 'Spx', 'VTHR']

df['time'] = df.index.time
df['date'] = df.index.date

df = df.fillna(method='ffill')
dayclose = df[df.time==datetime.time(15, 58, 0)]
dayopen = df[df.time==datetime.time(9, 30, 0)]
dayopen.reset_index(drop=True, inplace=True)
dayclose.reset_index(drop=True, inplace=True)
dayclose.sort_values(by='date')
display(df, dayopen.head(), dayclose.head())
df0 = df.copy()

# df['hour'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.hour
# df['minute'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.minute

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-05-10 15:50:00-04:00,39.154999,64.879997,50.310001,12305.97168,1759.024414,4136.649902,,15:50:00,2023-05-10
2023-05-10 15:52:00-04:00,39.154999,64.879997,50.310001,12308.014648,1759.233765,4137.740234,183.910004,15:52:00,2023-05-10
2023-05-10 15:54:00-04:00,39.150002,64.879997,50.32,12303.112305,1759.666016,4136.299805,183.910004,15:54:00,2023-05-10
2023-05-10 15:56:00-04:00,39.139999,64.879997,50.310001,12301.088867,1759.203735,4136.209961,183.910004,15:56:00,2023-05-10
2023-05-10 15:58:00-04:00,39.16,64.879997,50.310001,12305.219727,1759.590942,4137.100098,183.910004,15:58:00,2023-05-10


Unnamed: 0,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date


Unnamed: 0,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date
0,39.16,64.879997,50.310001,12305.219727,1759.590942,4137.100098,183.910004,15:58:00,2023-05-10


In [41]:
### now i wanna do feature engineering for all assets 

asset_list = ['Spx', 'Nasdaq', 'Russel', 'EMXC', 'EEMA', 'EEM', 'VTHR']

for asset in asset_list:
    
    df[asset + '_ret'] = 100*(df[asset]/df[asset].shift(1)-1)
    df['s_' + asset + '_ret_1prd'] = (100*(df[asset]/df[asset].shift(1)-1)).shift(1)
    df['s_' + asset + '_ret_2prd'] = (100*(df[asset]/df[asset].shift(2)-1)).shift(1)
    df['s_' + asset + '_ret_4prd'] = (100*(df[asset]/df[asset].shift(4)-1)).shift(1)
    print(f'Data shape: {df.shape}')

    df.loc[df.time < datetime.time(9, 32, 0), 's_' + asset + '_1prd'] = np.nan
    df.loc[df.time < datetime.time(9, 33, 0), 's_' + asset + '_2prd'] = np.nan
    df.loc[df.time < datetime.time(9, 35, 0), 's_' + asset + '_4prd'] = np.nan

    dayopen.rename(columns={asset:asset+'_open'}, inplace=True)
    # dayopen.head()
    dayclose.rename(columns={asset:asset+'_close'}, inplace=True)
    dayclose_l1 = dayclose.copy()
    dayclose_l2 = dayclose.copy()
    dayclose_l1[asset+'_close_l1'] = dayclose_l1[asset+'_close'].shift(1)
    dayclose_l2[asset+'_close_l2'] = dayclose_l2[asset+'_close'].shift(2)
    
    df = pd.merge(df, dayopen[['date', asset + '_open']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l1[['date', asset + '_close_l1']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l2[['date', asset + '_close_l2']], on=['date'], how='left')

    df['s_' + asset + '_ret_open'] = (100*(df[asset]/df[asset + '_open']-1)).shift(1)
    df['s_' + asset + '_ret_close1'] = (100*(df[asset]/df[asset + '_close_l1']-1)).shift(1)
    df['s_' + asset + '_ret_close2'] = (100*(df[asset]/df[asset + '_close_l2']-1)).shift(1)

    cols_todrop = [x for x in list(df.columns) if asset in x and 'ret' not in x]
    df.drop(columns = cols_todrop, inplace=True)

print(f'Time to do feature engineering: {time.time() - time0}')
display(df.head())

Data shape: (5, 13)
Data shape: (5, 19)
Data shape: (5, 25)
Data shape: (5, 31)
Data shape: (5, 37)
Data shape: (5, 43)
Data shape: (5, 49)
Time to do feature engineering: 1.8718185424804688


Unnamed: 0,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,Russel_ret,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,EMXC_ret,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,EEMA_ret,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,EEM_ret,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd,s_EEM_ret_open,s_EEM_ret_close1,s_EEM_ret_close2,VTHR_ret,s_VTHR_ret_1prd,s_VTHR_ret_2prd,s_VTHR_ret_4prd,s_VTHR_ret_open,s_VTHR_ret_close1,s_VTHR_ret_close2
0,15:50:00,2023-05-10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,15:52:00,2023-05-10,0.026358,,,,,,,0.016601,,,,,,,0.011902,,,,,,,0.0,,,,,,,0.0,,,,,,,0.0,,,,,,,,,,,,,
2,15:54:00,2023-05-10,-0.034812,0.026358,,,,,,-0.03983,0.016601,,,,,,0.02457,0.011902,,,,,,0.019873,0.0,,,,,,0.0,0.0,,,,,,-0.012763,0.0,,,,,,0.0,,,,,,
3,15:56:00,2023-05-10,-0.002172,-0.034812,-0.008463,,,,,-0.016447,-0.03983,-0.023236,,,,,-0.026271,0.02457,0.036475,,,,,-0.019869,0.019873,0.019873,,,,,0.0,0.0,0.0,,,,,-0.025548,-0.012763,-0.012763,,,,,0.0,0.0,,,,,
4,15:58:00,2023-05-10,0.021521,-0.002172,-0.036983,,,,,0.033581,-0.016447,-0.05627,,,,,0.02201,-0.026271,-0.001707,,,,,0.0,-0.019869,0.0,,,,,0.0,0.0,0.0,,,,,0.0511,-0.025548,-0.038308,,,,,0.0,0.0,0.0,,,,


In [42]:
### do modeling ###

t_df = df.copy()
t_df.rename(columns={'VTHR_ret':'target'}, inplace=True)
t_df.drop(columns = ['time', 'date', 'Spx_ret', 'Nasdaq_ret', 'Russel_ret', 'EEMA_ret', 'EEM_ret', 'EMXC_ret', 'VXUS_ret'], 
          inplace=True,
          errors = 'ignore')

t_df = t_df.dropna()
display(t_df.info())

y = t_df.pop('target')
X = t_df
print(f'Data preprocessng time: , {time.time()-time0:.2f} sec')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=int(0.2*X.shape[0]))
display(X_train.shape, X_test.shape, y_train.shape, X_train.head())
time1 = time.time()

enm = ElasticNet()
parameters = {'alpha':[0.0005, 0.001, 0.002, 0.003, 0.005], 
              'l1_ratio':[0, 0.02, 0.05, 0.1, 0.25, 0.5, 1]}
enmgs = GridSearchCV(enm, parameters, scoring='r2', cv=4)
enmgs.fit(X_train, y_train)
print(enmgs.best_params_)
enmt = ElasticNet(**enmgs.best_params_)
enmt.fit(X_train, y_train)

print(f'In sample, ElasticNet: , {r2_score(y_train, enmgs.predict(X_train))}')
print(f'Out of sample, ElasticNet: , {r2_score(y_test, enmgs.predict(X_test))}')

print(f'Total time: , {time.time()-time0:.2f} sec')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 43 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   s_Spx_ret_1prd       0 non-null      float64
 1   s_Spx_ret_2prd       0 non-null      float64
 2   s_Spx_ret_4prd       0 non-null      float64
 3   s_Spx_ret_open       0 non-null      float64
 4   s_Spx_ret_close1     0 non-null      float64
 5   s_Spx_ret_close2     0 non-null      float64
 6   s_Nasdaq_ret_1prd    0 non-null      float64
 7   s_Nasdaq_ret_2prd    0 non-null      float64
 8   s_Nasdaq_ret_4prd    0 non-null      float64
 9   s_Nasdaq_ret_open    0 non-null      float64
 10  s_Nasdaq_ret_close1  0 non-null      float64
 11  s_Nasdaq_ret_close2  0 non-null      float64
 12  s_Russel_ret_1prd    0 non-null      float64
 13  s_Russel_ret_2prd    0 non-null      float64
 14  s_Russel_ret_4prd    0 non-null      float64
 15  s_Russel_ret_open    0 non-null      float64
 16  s_

None

Data preprocessng time: , 1.97 sec


ValueError: test_size=0 should be either positive and smaller than the number of samples 0 or a float in the (0, 1) range

In [34]:
# feature_names = X_test.columns
# feature_importance = pd.DataFrame(list(zip(feature_names, np.abs(enmt.coef_))),
#                                  columns=['col_name','feature_importance_vals'])
# feature_importance.sort_values(by=['feature_importance_vals'],
#                               ascending=False, inplace=True)

# feature_importance.head(10)

In [6]:
artifact_filename_en = 'EN_model.pkl'

os.chdir('/home/jupyter/project_repos/spg_stocks/stocks-app')
joblib.dump(enmt, artifact_filename_en)

model_bucket = 'gs://mpg3-stocks/artifacts'
storage_path = os.path.join(model_bucket, artifact_filename_en)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client(project=project_id))
blob.upload_from_filename(os.getcwd()+'/'+artifact_filename_en)

file = open(artifact_filename_en, "rb")
trained_model = joblib.load(file)
prediction = trained_model.predict([list(X_test.iloc[1,:])])
print('EN model', prediction)


EN model [0.01598725]
