### This is a script to run periodic model performance evaluation using the most recent 1 day of data

In [1]:
import numpy as np
import pandas as pd
import os, time, warnings, random, shap, requests, optuna, datetime, joblib, pytz
import seaborn as sns
import matplotlib.pyplot as plt
import functools as ft
import yfinance as yf

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier, XGBRegressor

pd.set_option('display.max_columns', 100)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore') 

time0 = time.time()

os.chdir('/home/jupyter/projects_gcp_cpu/spx/src')
os.getcwd()

'/home/jupyter/projects_gcp_cpu/spx/src'

In [2]:
df = pd.read_csv('gs://pmykola-streaming-projects/spg-stocks/data/data_start_20221021.csv')
df.Datetime = pd.to_datetime(df.Datetime)

df1d = pd.read_csv('gs://pmykola-streaming-projects/spg-stocks/data/data_last_2022126.csv')
df1d.Datetime = pd.to_datetime(df1d.Datetime)

df = pd.concat([df, df1d])
df.sort_values(by='Datetime')
df

Unnamed: 0,Datetime,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR
0,2022-10-21 09:30:00,34.634998,55.570000,45.400002,10576.550781,1710.077637,3659.689941,164.059998
1,2022-10-21 09:32:00,34.654999,55.660000,45.410000,10586.665039,1709.384399,3660.110107,164.169998
2,2022-10-21 09:34:00,34.610001,55.619999,45.360001,10547.528320,1704.969971,3652.659912,164.020004
3,2022-10-21 09:36:00,34.709999,,45.498699,10586.677734,1705.815674,3665.989990,
4,2022-10-21 09:38:00,34.689999,,45.463501,10583.666016,1704.521973,3662.169922,163.869995
...,...,...,...,...,...,...,...,...
191,2022-12-06 15:52:00,39.064999,,49.200001,11018.590820,1811.978882,3941.929932,
192,2022-12-06 15:54:00,39.070000,,49.189999,11017.306641,1811.827271,3941.419922,
193,2022-12-06 15:56:00,39.064999,,49.180000,11009.246094,1811.182251,3938.770020,
194,2022-12-06 15:58:00,39.044998,,49.169998,11016.256836,1812.962646,3941.500000,176.669998


In [3]:
df['time'] = df.Datetime.dt.time
df['date'] = df.Datetime.dt.date

df = df.fillna(method='ffill')
dayclose = df[df.time==datetime.time(15, 58, 0)]
dayopen = df[df.time==datetime.time(9, 30, 0)]
dayopen.reset_index(drop=True, inplace=True)
dayclose.reset_index(drop=True, inplace=True)
dayclose.sort_values(by='date')

df

Unnamed: 0,Datetime,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date
0,2022-10-21 09:30:00,34.634998,55.570000,45.400002,10576.550781,1710.077637,3659.689941,164.059998,09:30:00,2022-10-21
1,2022-10-21 09:32:00,34.654999,55.660000,45.410000,10586.665039,1709.384399,3660.110107,164.169998,09:32:00,2022-10-21
2,2022-10-21 09:34:00,34.610001,55.619999,45.360001,10547.528320,1704.969971,3652.659912,164.020004,09:34:00,2022-10-21
3,2022-10-21 09:36:00,34.709999,55.619999,45.498699,10586.677734,1705.815674,3665.989990,164.020004,09:36:00,2022-10-21
4,2022-10-21 09:38:00,34.689999,55.619999,45.463501,10583.666016,1704.521973,3662.169922,163.869995,09:38:00,2022-10-21
...,...,...,...,...,...,...,...,...,...,...
191,2022-12-06 15:52:00,39.064999,64.739998,49.200001,11018.590820,1811.978882,3941.929932,176.699997,15:52:00,2022-12-06
192,2022-12-06 15:54:00,39.070000,64.739998,49.189999,11017.306641,1811.827271,3941.419922,176.699997,15:54:00,2022-12-06
193,2022-12-06 15:56:00,39.064999,64.739998,49.180000,11009.246094,1811.182251,3938.770020,176.699997,15:56:00,2022-12-06
194,2022-12-06 15:58:00,39.044998,64.739998,49.169998,11016.256836,1812.962646,3941.500000,176.669998,15:58:00,2022-12-06


In [4]:
### now i wanna do feature engineering for all assets 

asset_list = ['Spx', 'Nasdaq', 'Russel', 'EMXC', 'EEMA', 'EEM', 'VTHR']

for asset in asset_list:
    
    df[asset + '_ret'] = 100*(df[asset]/df[asset].shift(1)-1)
    df['s_' + asset + '_ret_1prd'] = (100*(df[asset]/df[asset].shift(1)-1)).shift(1)
    df['s_' + asset + '_ret_2prd'] = (100*(df[asset]/df[asset].shift(2)-1)).shift(1)
    df['s_' + asset + '_ret_4prd'] = (100*(df[asset]/df[asset].shift(4)-1)).shift(1)
    # display(df.shape, df.head(5))

    df.loc[df.time < datetime.time(9, 32, 0), 's_' + asset + '_1prd'] = np.nan
    df.loc[df.time < datetime.time(9, 33, 0), 's_' + asset + '_2prd'] = np.nan
    df.loc[df.time < datetime.time(9, 35, 0), 's_' + asset + '_4prd'] = np.nan

    dayopen.rename(columns={asset:asset+'_open'}, inplace=True)
    # dayopen.head()
    dayclose.rename(columns={asset:asset+'_close'}, inplace=True)
    dayclose_l1 = dayclose.copy()
    dayclose_l2 = dayclose.copy()
    dayclose_l1[asset+'_close_l1'] = dayclose_l1[asset+'_close'].shift(1)
    dayclose_l2[asset+'_close_l2'] = dayclose_l2[asset+'_close'].shift(2)

    df = pd.merge(df, dayopen[['date', asset + '_open']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l1[['date', asset + '_close_l1']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l2[['date', asset + '_close_l2']], on=['date'], how='left')

    df['s_' + asset + '_ret_open'] = (100*(df[asset]/df[asset + '_open']-1)).shift(1)
    df['s_' + asset + '_ret_close1'] = (100*(df[asset]/df[asset + '_close_l1']-1)).shift(1)
    df['s_' + asset + '_ret_close2'] = (100*(df[asset]/df[asset + '_close_l2']-1)).shift(1)

    cols_todrop = [x for x in list(df.columns) if asset in x and 'ret' not in x]
    df.drop(columns = cols_todrop, inplace=True)

display(time.time() - time0, df.tail())

0.6075882911682129

Unnamed: 0,Datetime,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,Russel_ret,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,EMXC_ret,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,EEMA_ret,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,EEM_ret,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd,s_EEM_ret_open,s_EEM_ret_close1,s_EEM_ret_close2,VTHR_ret,s_VTHR_ret_1prd,s_VTHR_ret_2prd,s_VTHR_ret_4prd,s_VTHR_ret_open,s_VTHR_ret_close1,s_VTHR_ret_close2
6148,2022-12-06 15:52:00,15:52:00,2022-12-06,0.001016,0.006343,0.083273,0.108944,-1.430824,-1.431805,-3.18454,-0.014267,-0.005441,0.076783,0.135411,-1.869759,-1.973917,-3.852185,0.025202,0.055307,0.072364,0.155268,-1.365694,-1.568361,-4.290911,0.00549,0.025005,0.055525,0.116607,-0.410119,-0.591436,-2.347561,0.0,0.0,0.030897,0.061825,-0.537715,-0.430635,-1.160309,0.0,-0.012546,0.064031,0.102491,-0.445981,-0.051172,-1.226297,0.0,0.118982,0.118982,0.392025,-1.450087,-1.55989,-3.363418
6149,2022-12-06 15:54:00,15:54:00,2022-12-06,-0.012938,0.001016,0.007358,0.064478,-1.429822,-1.430803,-3.183557,-0.011655,-0.014267,-0.019707,0.039801,-1.88376,-1.987903,-3.865902,-0.008367,0.025202,0.080523,0.133291,-1.340836,-1.543554,-4.26679,-0.02033,0.00549,0.030496,0.07709,-0.404652,-0.585979,-2.3422,0.0,0.0,0.0,0.030897,-0.537715,-0.430635,-1.160309,0.012802,0.0,-0.012546,0.064031,-0.445981,-0.051172,-1.226297,0.0,0.0,0.118982,0.118982,-1.450087,-1.55989,-3.363418
6150,2022-12-06 15:56:00,15:56:00,2022-12-06,-0.067232,-0.012938,-0.011922,0.07134,-1.442576,-1.443556,-3.196083,-0.073163,-0.011655,-0.02592,0.050843,-1.895195,-1.999326,-3.877106,-0.0356,-0.008367,0.016833,0.089209,-1.349091,-1.551792,-4.2748,-0.020326,-0.02033,-0.014841,0.040676,-0.424899,-0.606189,-2.362053,0.0,0.0,0.0,0.030897,-0.537715,-0.430635,-1.160309,-0.0128,0.012802,0.012802,0.076841,-0.433236,-0.038376,-1.213652,0.0,0.0,0.0,0.118982,-1.450087,-1.55989,-3.363418
6151,2022-12-06 15:58:00,15:58:00,2022-12-06,0.06931,-0.067232,-0.080162,-0.072809,-1.508838,-1.509818,-3.261167,0.06368,-0.073163,-0.084809,-0.104499,-1.966971,-2.071025,-3.947432,0.0983,-0.0356,-0.043965,0.036523,-1.384211,-1.58684,-4.308879,-0.020338,-0.020326,-0.040651,-0.010168,-0.445139,-0.626392,-2.381899,0.0,0.0,0.0,0.0,-0.537715,-0.430635,-1.160309,-0.051198,-0.0128,0.0,-0.012546,-0.445981,-0.051172,-1.226297,-0.016977,0.0,0.0,0.118982,-1.450087,-1.55989,-3.363418
6152,2022-12-06 16:00:00,16:00:00,2022-12-06,-0.006089,0.06931,0.002032,-0.009891,-1.440573,-1.441554,-3.194116,-0.012411,0.06368,-0.009529,-0.035446,-1.904543,-2.008664,-3.886265,-0.021364,0.0983,0.062665,0.079508,-1.287272,-1.490099,-4.214814,-0.081345,-0.020338,-0.04066,-0.055494,-0.465386,-0.646602,-2.401752,0.0,0.0,0.0,0.0,-0.537715,-0.430635,-1.160309,-0.012799,-0.051198,-0.063992,-0.051198,-0.496951,-0.102343,-1.276867,0.0,-0.016977,-0.016977,-0.016977,-1.466818,-1.576603,-3.379824


In [5]:
### do prediction ###

model_path = '/home/jupyter/project_repos/spg_stocks/spg_stocks/stocks-app/en_model.pkl'
trained_model = joblib.load(open(model_path, "rb"))

this_day = df.loc[df.date == df.date.max()]
print(f'{this_day.shape[0]} observations this day')
X = this_day.copy()
X.drop(columns = ['Datetime',
                  'time', 
                  'date', 
                  'Spx_ret', 
                  'Nasdaq_ret', 
                  'Russel_ret', 
                  'EEMA_ret', 
                  'EEM_ret', 
                  'EMXC_ret', 
                  'VXUS_ret', 
                  'VTHR_ret'], 
                  inplace=True,
                  errors = 'ignore')

if(X.count().sum() < X.shape[1]):
    print(f'''There are {X.shape[1] - X.count().sum()} missing values. 
          There will be an error''')

y = this_day.VTHR_ret
y_hat = trained_model.predict(X)
#print(trained_model.predict(X))

print('Total time: ', time.time()-time0)

196 observations this day
Total time:  0.666630744934082


In [11]:
model_rmse = mean_squared_error(y, y_hat)
constant_rmse = mean_squared_error(y, np.zeros(len(y)))

print(f'R2 is {100*(r2_score(y, y_hat))}%')
print(f'RMSE of the model is {model_rmse}%')
print(f'RMSE of the constant model is {constant_rmse}%')
print(f'RMSE improvement is {100*(1-model_rmse/constant_rmse)}%')

R2 is 12.185785931477911%
RMSE of the model is 0.003488191623526129%
RMSE of the constant model is 0.004037652138549758%
RMSE improvement is 13.608416380837197%


In [14]:
performance = pd.DataFrame([[100*(r2_score(y, y_hat)), model_rmse, constant_rmse, 100*(1-model_rmse/constant_rmse)]], 
                           columns = ['R2', 'model_rmse', 'constant_rmse', 'rmse_improvement'])
performance

Unnamed: 0,R2,model_rmse,constant_rmse,rmse_improvement
0,12.185786,0.003488,0.004038,13.608416


In [20]:
file_name = 'm1_performance_' + str(df.date.max().year) + str(df.date.max().month) + str(df.date.max().day) + '.csv'
performance.to_csv('gs://pmykola-streaming-projects/spg-stocks/artifacts' + '/' + file_name)