### This is a script to run periodic model performance evaluation using the most recent 1 day of data

In [36]:
import numpy as np
import pandas as pd
import os, time, warnings, random, shap, requests, optuna, datetime, joblib, pytz, gcsfs
import seaborn as sns
import matplotlib.pyplot as plt
import functools as ft
import yfinance as yf
from google.cloud import storage
from io import BytesIO



from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier, XGBRegressor

pd.set_option('display.max_columns', 100)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore') 

time0 = time.time()

bucket_path = 'gs://pmykola-streaming-projects/spg-stocks/data/'

pull_time = datetime.datetime.now()
pull_time = pull_time.astimezone(pytz.timezone('America/New_York'))
pull_time = pull_time.replace(tzinfo=None)
now_time = (str(pull_time.month) + '_' + 
str(pull_time.day) + '_' +
str(pull_time.hour) + ':'  +
str(pull_time.minute) + ':' +
str(pull_time.second))

In [12]:
datafiles = !gsutil ls gs://pmykola-streaming-projects/spg-stocks/data
start_file = [x for x in datafiles if ('data_start_' in x)]
datafiles = [x for x in datafiles if ('auto_data_last_' in x) & ('pull_time' in x)]
assert len(start_file) == 1
start_file = start_file[0]
start_file

'gs://pmykola-streaming-projects/spg-stocks/data/data_start_20221021.csv'

In [13]:
df = pd.read_csv(start_file)
df.Datetime = pd.to_datetime(df.Datetime)

df_new = pd.DataFrame(columns = df.columns)
for file in datafiles:
    temp_df = pd.read_csv(file)
    df_new = pd.concat([df_new, temp_df], axis=0)
    # remove duplicates
df_new.reset_index(inplace=True, drop=True)
df_new.drop_duplicates(inplace=True)
df_new.Datetime = pd.to_datetime(df_new.Datetime)
df_new.sort_values(by='Datetime')
df_new

Unnamed: 0,Datetime,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR
0,2022-12-06 09:30:00,39.240002,65.089996,49.399899,11229.620117,1836.604736,3999.110107,179.300003
1,2022-12-06 09:32:00,39.230000,65.110001,49.389999,11220.044922,1837.946777,4000.229980,179.360001
2,2022-12-06 09:34:00,39.189999,65.040001,49.389999,11208.439453,1837.892456,3996.840088,
3,2022-12-06 09:36:00,39.230000,65.019997,49.419998,11215.238281,1838.812744,3998.750000,179.259995
4,2022-12-06 09:38:00,39.200001,65.029999,49.419998,11206.872070,1838.209717,3995.010010,179.220001
...,...,...,...,...,...,...,...,...
191,2022-12-06 15:52:00,39.064999,,49.200001,11018.576172,1811.978882,3941.929932,
192,2022-12-06 15:54:00,39.070000,,49.189999,11017.687500,1811.827271,3941.500000,
193,2022-12-06 15:56:00,39.064999,,49.180000,11008.847656,1811.182251,3938.770020,
194,2022-12-06 15:58:00,39.044998,,49.169998,11015.938477,1812.962646,3941.500000,176.669998


In [14]:
df = pd.read_csv(start_file)
df.Datetime = pd.to_datetime(df.Datetime)
df = pd.concat([df, df_new])
df.drop_duplicates(inplace=True)
df.sort_values(by='Datetime')
df

Unnamed: 0,Datetime,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR
0,2022-10-21 09:30:00,34.634998,55.570000,45.400002,10576.550781,1710.077637,3659.689941,164.059998
1,2022-10-21 09:32:00,34.654999,55.660000,45.410000,10586.665039,1709.384399,3660.110107,164.169998
2,2022-10-21 09:34:00,34.610001,55.619999,45.360001,10547.528320,1704.969971,3652.659912,164.020004
3,2022-10-21 09:36:00,34.709999,,45.498699,10586.677734,1705.815674,3665.989990,
4,2022-10-21 09:38:00,34.689999,,45.463501,10583.666016,1704.521973,3662.169922,163.869995
...,...,...,...,...,...,...,...,...
191,2022-12-06 15:52:00,39.064999,,49.200001,11018.576172,1811.978882,3941.929932,
192,2022-12-06 15:54:00,39.070000,,49.189999,11017.687500,1811.827271,3941.500000,
193,2022-12-06 15:56:00,39.064999,,49.180000,11008.847656,1811.182251,3938.770020,
194,2022-12-06 15:58:00,39.044998,,49.169998,11015.938477,1812.962646,3941.500000,176.669998


In [15]:
df['time'] = df.Datetime.dt.time
df['date'] = df.Datetime.dt.date

df = df.fillna(method='ffill')
dayclose = df[df.time==datetime.time(15, 58, 0)]
dayopen = df[df.time==datetime.time(9, 30, 0)]
dayopen.reset_index(drop=True, inplace=True)
dayclose.reset_index(drop=True, inplace=True)
dayclose.sort_values(by='date')

df

Unnamed: 0,Datetime,EEM,EEMA,EMXC,Nasdaq,Russel,Spx,VTHR,time,date
0,2022-10-21 09:30:00,34.634998,55.570000,45.400002,10576.550781,1710.077637,3659.689941,164.059998,09:30:00,2022-10-21
1,2022-10-21 09:32:00,34.654999,55.660000,45.410000,10586.665039,1709.384399,3660.110107,164.169998,09:32:00,2022-10-21
2,2022-10-21 09:34:00,34.610001,55.619999,45.360001,10547.528320,1704.969971,3652.659912,164.020004,09:34:00,2022-10-21
3,2022-10-21 09:36:00,34.709999,55.619999,45.498699,10586.677734,1705.815674,3665.989990,164.020004,09:36:00,2022-10-21
4,2022-10-21 09:38:00,34.689999,55.619999,45.463501,10583.666016,1704.521973,3662.169922,163.869995,09:38:00,2022-10-21
...,...,...,...,...,...,...,...,...,...,...
191,2022-12-06 15:52:00,39.064999,64.739998,49.200001,11018.576172,1811.978882,3941.929932,176.699997,15:52:00,2022-12-06
192,2022-12-06 15:54:00,39.070000,64.739998,49.189999,11017.687500,1811.827271,3941.500000,176.699997,15:54:00,2022-12-06
193,2022-12-06 15:56:00,39.064999,64.739998,49.180000,11008.847656,1811.182251,3938.770020,176.699997,15:56:00,2022-12-06
194,2022-12-06 15:58:00,39.044998,64.739998,49.169998,11015.938477,1812.962646,3941.500000,176.669998,15:58:00,2022-12-06


In [16]:
### now i wanna do feature engineering for all assets 

asset_list = ['Spx', 'Nasdaq', 'Russel', 'EMXC', 'EEMA', 'EEM', 'VTHR']

for asset in asset_list:
    
    df[asset + '_ret'] = 100*(df[asset]/df[asset].shift(1)-1)
    df['s_' + asset + '_ret_1prd'] = (100*(df[asset]/df[asset].shift(1)-1)).shift(1)
    df['s_' + asset + '_ret_2prd'] = (100*(df[asset]/df[asset].shift(2)-1)).shift(1)
    df['s_' + asset + '_ret_4prd'] = (100*(df[asset]/df[asset].shift(4)-1)).shift(1)
    # display(df.shape, df.head(5))

    df.loc[df.time < datetime.time(9, 32, 0), 's_' + asset + '_1prd'] = np.nan
    df.loc[df.time < datetime.time(9, 33, 0), 's_' + asset + '_2prd'] = np.nan
    df.loc[df.time < datetime.time(9, 35, 0), 's_' + asset + '_4prd'] = np.nan

    dayopen.rename(columns={asset:asset+'_open'}, inplace=True)
    # dayopen.head()
    dayclose.rename(columns={asset:asset+'_close'}, inplace=True)
    dayclose_l1 = dayclose.copy()
    dayclose_l2 = dayclose.copy()
    dayclose_l1[asset+'_close_l1'] = dayclose_l1[asset+'_close'].shift(1)
    dayclose_l2[asset+'_close_l2'] = dayclose_l2[asset+'_close'].shift(2)

    df = pd.merge(df, dayopen[['date', asset + '_open']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l1[['date', asset + '_close_l1']], on=['date'], how='left')
    df = pd.merge(df, dayclose_l2[['date', asset + '_close_l2']], on=['date'], how='left')

    df['s_' + asset + '_ret_open'] = (100*(df[asset]/df[asset + '_open']-1)).shift(1)
    df['s_' + asset + '_ret_close1'] = (100*(df[asset]/df[asset + '_close_l1']-1)).shift(1)
    df['s_' + asset + '_ret_close2'] = (100*(df[asset]/df[asset + '_close_l2']-1)).shift(1)

    cols_todrop = [x for x in list(df.columns) if asset in x and 'ret' not in x]
    df.drop(columns = cols_todrop, inplace=True)

display(time.time() - time0, df.tail())

8.137312412261963

Unnamed: 0,Datetime,time,date,Spx_ret,s_Spx_ret_1prd,s_Spx_ret_2prd,s_Spx_ret_4prd,s_Spx_ret_open,s_Spx_ret_close1,s_Spx_ret_close2,Nasdaq_ret,s_Nasdaq_ret_1prd,s_Nasdaq_ret_2prd,s_Nasdaq_ret_4prd,s_Nasdaq_ret_open,s_Nasdaq_ret_close1,s_Nasdaq_ret_close2,Russel_ret,s_Russel_ret_1prd,s_Russel_ret_2prd,s_Russel_ret_4prd,s_Russel_ret_open,s_Russel_ret_close1,s_Russel_ret_close2,EMXC_ret,s_EMXC_ret_1prd,s_EMXC_ret_2prd,s_EMXC_ret_4prd,s_EMXC_ret_open,s_EMXC_ret_close1,s_EMXC_ret_close2,EEMA_ret,s_EEMA_ret_1prd,s_EEMA_ret_2prd,s_EEMA_ret_4prd,s_EEMA_ret_open,s_EEMA_ret_close1,s_EEMA_ret_close2,EEM_ret,s_EEM_ret_1prd,s_EEM_ret_2prd,s_EEM_ret_4prd,s_EEM_ret_open,s_EEM_ret_close1,s_EEM_ret_close2,VTHR_ret,s_VTHR_ret_1prd,s_VTHR_ret_2prd,s_VTHR_ret_4prd,s_VTHR_ret_open,s_VTHR_ret_close1,s_VTHR_ret_close2
6148,2022-12-06 15:52:00,15:52:00,2022-12-06,0.001016,0.006343,0.083273,0.108944,-1.430824,-1.431805,-3.18454,-0.014232,-0.005742,0.067544,0.132515,-1.865384,-1.974082,-3.852346,0.025202,0.055307,0.072364,0.155268,-1.365694,-1.568361,-4.290911,0.00549,0.025005,0.055525,0.116607,-0.410119,-0.591436,-2.347561,0.0,0.0,0.030897,0.061825,-0.537715,-0.430635,-1.160309,0.0,-0.012546,0.064031,0.102491,-0.445981,-0.051172,-1.226297,0.0,0.118982,0.118982,0.392025,-1.450087,-1.55989,-3.363418
6149,2022-12-06 15:54:00,15:54:00,2022-12-06,-0.010907,0.001016,0.007358,0.064478,-1.429822,-1.430803,-3.183557,-0.008065,-0.014232,-0.019973,0.051928,-1.879351,-1.988033,-3.86603,-0.008367,0.025202,0.080523,0.133291,-1.340836,-1.543554,-4.26679,-0.02033,0.00549,0.030496,0.07709,-0.404652,-0.585979,-2.3422,0.0,0.0,0.0,0.030897,-0.537715,-0.430635,-1.160309,0.012802,0.0,-0.012546,0.064031,-0.445981,-0.051172,-1.226297,0.0,0.0,0.118982,0.118982,-1.450087,-1.55989,-3.363418
6150,2022-12-06 15:56:00,15:56:00,2022-12-06,-0.069262,-0.010907,-0.009891,0.073373,-1.440573,-1.441554,-3.194116,-0.080233,-0.008065,-0.022296,0.045234,-1.887264,-1.995938,-3.873783,-0.0356,-0.008367,0.016833,0.089209,-1.349091,-1.551792,-4.2748,-0.020326,-0.02033,-0.014841,0.040676,-0.424899,-0.606189,-2.362053,0.0,0.0,0.0,0.030897,-0.537715,-0.430635,-1.160309,-0.0128,0.012802,0.012802,0.076841,-0.433236,-0.038376,-1.213652,0.0,0.0,0.0,0.118982,-1.450087,-1.55989,-3.363418
6151,2022-12-06 15:58:00,15:58:00,2022-12-06,0.06931,-0.069262,-0.080162,-0.072809,-1.508838,-1.509818,-3.261167,0.06441,-0.080233,-0.088292,-0.108247,-1.965983,-2.07457,-3.950908,0.0983,-0.0356,-0.043965,0.036523,-1.384211,-1.58684,-4.308879,-0.020338,-0.020326,-0.040651,-0.010168,-0.445139,-0.626392,-2.381899,0.0,0.0,0.0,0.0,-0.537715,-0.430635,-1.160309,-0.051198,-0.0128,0.0,-0.012546,-0.445981,-0.051172,-1.226297,-0.016977,0.0,0.0,0.118982,-1.450087,-1.55989,-3.363418
6152,2022-12-06 16:00:00,16:00:00,2022-12-06,-0.006089,0.06931,0.0,-0.009891,-1.440573,-1.441554,-3.194116,-0.009521,0.06441,-0.015875,-0.038167,-1.902839,-2.011496,-3.889043,-0.021364,0.0983,0.062665,0.079508,-1.287272,-1.490099,-4.214814,-0.081345,-0.020338,-0.04066,-0.055494,-0.465386,-0.646602,-2.401752,0.0,0.0,0.0,0.0,-0.537715,-0.430635,-1.160309,-0.012799,-0.051198,-0.063992,-0.051198,-0.496951,-0.102343,-1.276867,0.0,-0.016977,-0.016977,-0.016977,-1.466818,-1.576603,-3.379824


In [24]:
def load_joblib(path):
    fs = gcsfs.GCSFileSystem()
    with fs.open(f'path') as f:
        return joblib.load(f)

In [27]:
model_path = 'gs://pmykola-streaming-projects/spg-stocks/artifacts/en_model.pkl'
trained_model = joblib.load(open(model_path, "rb"))

FileNotFoundError: [Errno 2] No such file or directory: 'gs://pmykola-streaming-projects/spg-stocks/artifacts/en_model.pkl'

In [42]:
#trained_model = load_joblib(model_path)

In [52]:
storage_client = storage.Client()
bucket_name='pmykola-streaming-projects'
model_bucket='en_model.pkl'

bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(model_bucket)
model_file = BytesIO()
blob.download_to_file(model_file)
model=joblib.load(model_file)

In [8]:
### do prediction ###

model_path = '/home/jupyter/project_repos/spg_stocks/spg_stocks/stocks-app/en_model.pkl'
trained_model = joblib.load(open(model_path, "rb"))

this_day = df.loc[df.date == df.date.max()]
print(f'{this_day.shape[0]} observations this day')
X = this_day.copy()
X.drop(columns = ['Datetime',
                  'time', 
                  'date', 
                  'Spx_ret', 
                  'Nasdaq_ret', 
                  'Russel_ret', 
                  'EEMA_ret', 
                  'EEM_ret', 
                  'EMXC_ret', 
                  'VXUS_ret', 
                  'VTHR_ret'], 
                  inplace=True,
                  errors = 'ignore')

if(X.count().sum() < X.shape[1]):
    print(f'''There are {X.shape[1] - X.count().sum()} missing values. 
          There will be an error''')

y = this_day.VTHR_ret
y_hat = trained_model.predict(X)
#print(trained_model.predict(X))

print('Total time: ', time.time()-time0)

196 observations this day
Total time:  3.7465178966522217


In [9]:
model_rmse = mean_squared_error(y, y_hat)
constant_rmse = mean_squared_error(y, np.zeros(len(y)))

performance = pd.DataFrame([[100*(r2_score(y, y_hat)), model_rmse, constant_rmse, 100*(1-model_rmse/constant_rmse)]], 
                           columns = ['R2', 'model_rmse', 'constant_rmse', 'rmse_improvement'])

file_name = 'm1_performance_' + \
str(df.date.max().year) + \
str(df.date.max().month) + \
str(df.date.max().day) + \
'_pull_time_' + \
now_time + \
'.csv'
performance.to_csv('gs://pmykola-streaming-projects/spg-stocks/artifacts' + '/' + file_name)

In [10]:
performance

Unnamed: 0,R2,model_rmse,constant_rmse,rmse_improvement
0,12.187655,0.003488,0.004038,13.610255
