### This is a script to run periodic model performance evaluation using the most recent 1 day of data

In [1]:
import numpy as np
import pandas as pd
import os, time, warnings, random, shap, requests, optuna, datetime, joblib, pytz, gcsfs
import seaborn as sns
import matplotlib.pyplot as plt
import functools as ft
import yfinance as yf
from google.cloud import storage
from io import BytesIO


from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier, XGBRegressor

pd.set_option('display.max_columns', 100)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore') 

time0 = time.time()

bucket_path = 'gs://pmykola-streaming-projects/spg-stocks/data/'

pull_time = datetime.datetime.now()
pull_time = pull_time.astimezone(pytz.timezone('America/New_York'))
pull_time = pull_time.replace(tzinfo=None)
now_time = (str(pull_time.month) + '_' + 
str(pull_time.day) + '_' +
str(pull_time.hour) + ':'  +
str(pull_time.minute) + ':' +
str(pull_time.second))

In [2]:
datafiles = !gsutil ls gs://pmykola-streaming-projects/spg-stocks/data
start_file = [x for x in datafiles if ('data_start_' in x)]
datafiles = [x for x in datafiles if ('auto_data_last_' in x) & ('pull_time' in x)]
assert len(start_file) == 1
start_file = start_file[0]
start_file

'gs://pmykola-streaming-projects/spg-stocks/data/data_start_20221021.csv'

In [3]:
df = pd.read_csv(start_file)
df.Datetime = pd.to_datetime(df.Datetime)

df_new = pd.DataFrame(columns = df.columns)
for file in datafiles:
    temp_df = pd.read_csv(file)
    df_new = pd.concat([df_new, temp_df], axis=0)
    # remove duplicates
df_new.reset_index(inplace=True, drop=True)
df_new.drop_duplicates(inplace=True)
df_new.Datetime = pd.to_datetime(df_new.Datetime)
df_new.sort_values(by='Datetime')
df_new

Unnamed: 0,Datetime,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR
0,2022-12-06 09:30:00,39.240002,65.089996,49.399899,11229.620117,1836.604736,3999.110107,179.300003
1,2022-12-06 09:32:00,39.230000,65.110001,49.389999,11220.044922,1837.946777,4000.229980,179.360001
2,2022-12-06 09:34:00,39.189999,65.040001,49.389999,11208.439453,1837.892456,3996.840088,
3,2022-12-06 09:36:00,39.230000,65.019997,49.419998,11215.238281,1838.812744,3998.750000,179.259995
4,2022-12-06 09:38:00,39.200001,65.029999,49.419998,11206.872070,1838.209717,3995.010010,179.220001
...,...,...,...,...,...,...,...,...
387,2022-12-07 15:52:00,38.855000,,49.154999,10959.493164,1807.726440,3932.590088,
388,2022-12-07 15:54:00,38.860001,,49.139999,10965.423828,1807.849487,3935.709961,
389,2022-12-07 15:56:00,38.855000,,49.139999,10964.358398,,3935.780029,
390,2022-12-07 15:58:00,38.820000,,49.160000,10957.579102,,3933.449951,


In [4]:
df = pd.read_csv(start_file)
df.Datetime = pd.to_datetime(df.Datetime)
df = pd.concat([df, df_new])
df.drop_duplicates(inplace=True)
df.sort_values(by='Datetime')
df

Unnamed: 0,Datetime,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR
0,2022-10-21 09:30:00,34.634998,55.570000,45.400002,10576.550781,1710.077637,3659.689941,164.059998
1,2022-10-21 09:32:00,34.654999,55.660000,45.410000,10586.665039,1709.384399,3660.110107,164.169998
2,2022-10-21 09:34:00,34.610001,55.619999,45.360001,10547.528320,1704.969971,3652.659912,164.020004
3,2022-10-21 09:36:00,34.709999,,45.498699,10586.677734,1705.815674,3665.989990,
4,2022-10-21 09:38:00,34.689999,,45.463501,10583.666016,1704.521973,3662.169922,163.869995
...,...,...,...,...,...,...,...,...
387,2022-12-07 15:52:00,38.855000,,49.154999,10959.493164,1807.726440,3932.590088,
388,2022-12-07 15:54:00,38.860001,,49.139999,10965.423828,1807.849487,3935.709961,
389,2022-12-07 15:56:00,38.855000,,49.139999,10964.358398,,3935.780029,
390,2022-12-07 15:58:00,38.820000,,49.160000,10957.579102,,3933.449951,


In [5]:
df['time'] = df.Datetime.dt.time
df['date'] = df.Datetime.dt.date

df = df.fillna(method='ffill')
dayclose = df[df.time==datetime.time(15, 58, 0)]
dayopen = df[df.time==datetime.time(9, 30, 0)]
dayopen.reset_index(drop=True, inplace=True)
dayclose.reset_index(drop=True, inplace=True)
dayclose.sort_values(by='date')

df

Unnamed: 0,Datetime,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date
0,2022-10-21 09:30:00,34.634998,55.570000,45.400002,10576.550781,1710.077637,3659.689941,164.059998,09:30:00,2022-10-21
1,2022-10-21 09:32:00,34.654999,55.660000,45.410000,10586.665039,1709.384399,3660.110107,164.169998,09:32:00,2022-10-21
2,2022-10-21 09:34:00,34.610001,55.619999,45.360001,10547.528320,1704.969971,3652.659912,164.020004,09:34:00,2022-10-21
3,2022-10-21 09:36:00,34.709999,55.619999,45.498699,10586.677734,1705.815674,3665.989990,164.020004,09:36:00,2022-10-21
4,2022-10-21 09:38:00,34.689999,55.619999,45.463501,10583.666016,1704.521973,3662.169922,163.869995,09:38:00,2022-10-21
...,...,...,...,...,...,...,...,...,...,...
387,2022-12-07 15:52:00,38.855000,64.500000,49.154999,10959.493164,1807.726440,3932.590088,176.169998,15:52:00,2022-12-07
388,2022-12-07 15:54:00,38.860001,64.500000,49.139999,10965.423828,1807.849487,3935.709961,176.169998,15:54:00,2022-12-07
389,2022-12-07 15:56:00,38.855000,64.500000,49.139999,10964.358398,1807.849487,3935.780029,176.169998,15:56:00,2022-12-07
390,2022-12-07 15:58:00,38.820000,64.500000,49.160000,10957.579102,1807.849487,3933.449951,176.169998,15:58:00,2022-12-07


In [11]:
df.date.unique()[len(df.date.unique())-2]

datetime.date(2022, 12, 6)

In [12]:
df.date.max()

datetime.date(2022, 12, 7)

In [6]:
### now i wanna do feature engineering for all assets 

asset_list = ['Spx', 'Nasdaq', 'Russel', 'EMXC', 'EEMA', 'EEM', 'VTHR']

for asset in asset_list:
    
    df[asset + '_ret'] = 100*(df[asset]/df[asset].shift(1)-1)
    df['s_' + asset + '_ret_1prd'] = (100*(df[asset]/df[asset].shift(1)-1)).shift(1)
    df['s_' + asset + '_ret_2prd'] = (100*(df[asset]/df[asset].shift(2)-1)).shift(1)
    df['s_' + asset + '_ret_4prd'] = (100*(df[asset]/df[asset].shift(4)-1)).shift(1)
    # display(df.shape, df.head(5))

    df.loc[df.time < datetime.time(9, 32, 0), 's_' + asset + '_1prd'] = np.nan
    df.loc[df.time < datetime.time(9, 33, 0), 's_' + asset + '_2prd'] = np.nan
    df.loc[df.time < datetime.time(9, 35, 0), 's_' + asset + '_4prd'] = np.nan

    dayopen.rename(columns={asset:asset+'_open'}, inplace=True)
    # dayopen.head()
    dayclose.rename(columns={asset:asset+'_close'}, inplace=True)
    dayclose_l1 = dayclose.copy()
    dayclose_l2 = dayclose.copy()
    dayclose_l1[asset+'_close_l1'] = dayclose_l1[asset+'_close'].shift(1)
    dayclose_l2[asset+'_close_l2'] = dayclose_l2[asset+'_close'].shift(2)

    df = pd.merge(df, dayopen[['date', asset + '_open']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l1[['date', asset + '_close_l1']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l2[['date', asset + '_close_l2']], on=['date'], how='left')

    df['s_' + asset + '_ret_open'] = (100*(df[asset]/df[asset + '_open']-1)).shift(1)
    df['s_' + asset + '_ret_close1'] = (100*(df[asset]/df[asset + '_close_l1']-1)).shift(1)
    df['s_' + asset + '_ret_close2'] = (100*(df[asset]/df[asset + '_close_l2']-1)).shift(1)

    cols_todrop = [x for x in list(df.columns) if asset in x and 'ret' not in x]
    df.drop(columns = cols_todrop, inplace=True)

display(time.time() - time0, df.tail())

4.189150810241699

Unnamed: 0,Datetime,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,Russel_ret,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,EMXC_ret,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,EEMA_ret,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,EEM_ret,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd,s_EEM_ret_open,s_EEM_ret_close1,s_EEM_ret_close2,VTHR_ret,s_VTHR_ret_1prd,s_VTHR_ret_2prd,s_VTHR_ret_4prd,s_VTHR_ret_open,s_VTHR_ret_close1,s_VTHR_ret_close2
6344,2022-12-07 15:52:00,15:52:00,2022-12-07,0.033579,0.020864,0.011192,0.045299,-0.061772,-0.259545,-1.697358,0.031785,0.016386,-0.001595,0.056374,-0.192232,-0.544009,-2.544562,0.030856,-0.034552,-0.044325,-0.015817,0.035847,-0.319578,-1.804915,0.030524,-0.02035,-0.02035,-0.030514,0.040718,-0.06101,-0.707218,0.0,0.0,0.0,0.0,0.734033,-0.37071,-0.799749,0.013128,-0.013126,-0.038859,-0.038859,0.231943,-0.499677,-0.601509,0.0,0.0,0.0,-0.024402,-0.232191,-0.283014,-1.855154
6345,2022-12-07 15:54:00,15:54:00,2022-12-07,0.079334,0.033579,0.05445,0.046303,-0.028214,-0.226054,-1.664349,0.054114,0.031785,0.048176,0.042745,-0.160508,-0.512397,-2.513586,0.006807,0.030856,-0.003707,0.00131,0.066714,-0.28882,-1.774616,-0.030514,0.030524,0.010167,0.010167,0.071254,-0.030505,-0.67691,0.0,0.0,0.0,0.0,0.734033,-0.37071,-0.799749,0.012871,0.013128,0.0,-0.038589,0.245101,-0.486615,-0.58846,0.0,0.0,0.0,0.0,-0.232191,-0.283014,-1.855154
6346,2022-12-07 15:56:00,15:56:00,2022-12-07,0.00178,0.079334,0.112939,0.124144,0.051097,-0.146899,-1.586336,-0.009716,0.054114,0.085917,0.08432,-0.10648,-0.45856,-2.460831,0.0,0.006807,0.037665,-0.006678,0.073525,-0.282033,-1.76793,0.0,-0.030514,0.0,-0.02035,0.040718,-0.06101,-0.707218,0.0,0.0,0.0,0.0,0.734033,-0.37071,-0.799749,-0.012869,0.012871,0.026001,-0.012868,0.258004,-0.473806,-0.575665,0.0,0.0,0.0,0.0,-0.232191,-0.283014,-1.855154
6347,2022-12-07 15:58:00,15:58:00,2022-12-07,-0.059202,0.00178,0.081116,0.13561,0.052878,-0.145122,-1.584584,-0.06183,-0.009716,0.044393,0.092591,-0.116186,-0.468231,-2.470309,0.0,0.0,0.006807,0.003099,0.073525,-0.282033,-1.76793,0.040701,0.0,-0.030514,-0.02035,0.040718,-0.06101,-0.707218,0.0,0.0,0.0,0.0,0.734033,-0.37071,-0.799749,-0.090078,-0.012869,0.0,0.0,0.245101,-0.486615,-0.58846,0.0,0.0,0.0,0.0,-0.232191,-0.283014,-1.855154
6348,2022-12-07 16:00:00,16:00:00,2022-12-07,0.011948,-0.059202,-0.057423,0.055451,-0.006355,-0.204238,-1.642848,0.008894,-0.06183,-0.071541,0.014315,-0.177945,-0.529772,-2.530611,0.0,0.0,0.0,0.037665,0.073525,-0.282033,-1.76793,0.0,0.040701,0.040701,0.040701,0.081435,-0.020334,-0.666805,0.0,0.0,0.0,0.0,0.734033,-0.37071,-0.799749,0.0,-0.090078,-0.102936,-0.076962,0.154802,-0.576254,-0.678008,0.0,0.0,0.0,0.0,-0.232191,-0.283014,-1.855154


In [7]:
### do prediction ###

storage_client = storage.Client()
bucket_name='pmykola-streaming-projects'
model_path='spg-stocks/artifacts/en_model.pkl'

bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(model_path)
model_file = BytesIO()
blob.download_to_file(model_file)
trained_model=joblib.load(model_file)

this_day = df.loc[df.date == df.date.max()]
print(f'{this_day.shape[0]} observations this day')
X = this_day.copy()
X.drop(columns = ['Datetime',
                  'time', 
                  'date', 
                  'Spx_ret', 
                  'Nasdaq_ret', 
                  'Russel_ret', 
                  'EEMA_ret', 
                  'EEM_ret', 
                  'EMXC_ret', 
                  'VXUS_ret', 
                  'VTHR_ret'], 
                  inplace=True,
                  errors = 'ignore')

if(X.count().sum() < X.shape[1]):
    print(f'''There are {X.shape[1] - X.count().sum()} missing values. 
          There will be an error''')

y = this_day.VTHR_ret
y_hat = trained_model.predict(X)
#print(trained_model.predict(X))

print('Total time: ', time.time()-time0)

196 observations this day
Total time:  4.3377463817596436


In [8]:
model_rmse = mean_squared_error(y, y_hat)
constant_rmse = mean_squared_error(y, np.zeros(len(y)))

performance = pd.DataFrame([[100*(r2_score(y, y_hat)), model_rmse, constant_rmse, 100*(1-model_rmse/constant_rmse)]], 
                           columns = ['R2', 'model_rmse', 'constant_rmse', 'rmse_improvement'])

file_name = 'm1_performance_' + \
str(df.date.max().year) + \
str(df.date.max().month) + \
str(df.date.max().day) + \
'_pull_time_' + \
now_time + \
'.csv'
# performance.to_csv('gs://pmykola-streaming-projects/spg-stocks/artifacts' + '/' + file_name)

In [13]:
performance.index = [df.date.max()]

In [14]:
performance

Unnamed: 0,R2,model_rmse,constant_rmse,rmse_improvement
2022-12-07,16.760033,0.004709,0.005659,16.789598


In [11]:
df.date.max()

datetime.date(2022, 12, 7)