In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics, model_selection, preprocessing
import warnings
import xgboost
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Directory holding the competition files; change it here if the data lives
# somewhere else.
DATA_DIR = '../input/optiver-realized-volatility-prediction'

train_csv_route = f'{DATA_DIR}/train.csv'
test_csv_route = f'{DATA_DIR}/test.csv'

# Order-book and trade tables, stored as parquet.
book_train = pd.read_parquet(f'{DATA_DIR}/book_train.parquet')
book_test = pd.read_parquet(f'{DATA_DIR}/book_test.parquet')
trade_train = pd.read_parquet(f'{DATA_DIR}/trade_train.parquet')
trade_test = pd.read_parquet(f'{DATA_DIR}/trade_test.parquet')

# Target table (train) and the rows to predict (test).
train = pd.read_csv(train_csv_route)
test = pd.read_csv(test_csv_route)

# Name -> frame mapping used to summarise every table in one loop.
data = {
    'book_train':book_train,
    'book_test':book_test,
    'trade_train':trade_train,
    'trade_test':trade_test,
    'train':train,
    'test':test
}

In [3]:
def set_check(df,df_name):
    """Print a short structural summary of ``df`` under the label ``df_name``.

    The summary is a banner line followed by four sections: column names,
    per-column dtypes, the frame's shape and the per-column null counts.
    Nothing is returned.
    """
    rule = "-" * 40
    print(f"{rule} For  {df_name!s}  Set {rule}")

    sections = (
        f"column names: {list(df.columns)}",
        f"data types: {list(df.dtypes)}",
        f"shape of data that we are dealing with in this dataset: {df.shape}",
        f"Null data for each columns?: \n{df.isnull().sum()}",
    )
    for section in sections:
        # Every section is followed by exactly one blank line.
        print(section, end="\n\n")

# Print the summary for every loaded frame. Note that the loop variables
# `df_name` and `df` stay bound at module level after the loop finishes.
for df_name,df in data.items():
    set_check(df,df_name)

---------------------------------------- For  book_train  Set ----------------------------------------
column names: ['time_id', 'seconds_in_bucket', 'bid_price1', 'ask_price1', 'bid_price2', 'ask_price2', 'bid_size1', 'ask_size1', 'bid_size2', 'ask_size2', 'stock_id']

data types: [dtype('int16'), dtype('int16'), dtype('float32'), dtype('float32'), dtype('float32'), dtype('float32'), dtype('int32'), dtype('int32'), dtype('int32'), dtype('int32'), CategoricalDtype(categories=[  0,   1,  10, 100, 101, 102, 103, 104, 105, 107,
                  ...
                   89,   9,  90,  93,  94,  95,  96,  97,  98,  99],
                 ordered=False)]

shape of data that we are dealing with in this dataset: (167253289, 11)

Null data for each columns?: 
time_id              0
seconds_in_bucket    0
bid_price1           0
ask_price1           0
bid_price2           0
ask_price2           0
bid_size1            0
ask_size1            0
bid_size2            0
ask_size2            0
stock_id   

In [4]:
# Restrict the three training frames to a single stock (stock_id 0).
# `.copy()` gives every filtered frame its own data, so columns that are
# added or overwritten on it later are never written into a slice of the
# full frame (the usual trigger of pandas' SettingWithCopyWarning).
book_train = book_train[book_train['stock_id']==0].copy()
trade_train = trade_train[trade_train['stock_id']==0].copy()
train = train[train['stock_id']==0].copy()
print(f"booking_training set of stock id 0 is: {book_train.shape}")
print(f"trading_training set of stock id 0 is: {trade_train.shape}")
print(f"actual recorded volatility training set set of stock id 0 is: {train.shape}")

booking_training set of stock id 0 is: (917553, 11)
trading_training set of stock id 0 is: (123443, 6)
actual recorded volatility training set set of stock id 0 is: (3830, 3)


In [5]:
# configuring class that groups the functions
class Optiver_feature_engineered:
    """Collection of small feature helpers for the book and trade tables.

    The helper methods are stateless: none of them reads ``self.df`` or
    ``self.df_name``; those attributes are only stored by ``__init__``.
    """
    # TODO: complete docstring for this

    def __init__(self,df=None,df_name=None):
        """Store an optional frame and its label (not used by the helpers)."""
        self.df = df
        self.df_name = df_name

    def BAS(self,ask_price,bid_price):
        """Return the relative spread ``ask / bid - 1`` for each price pair.

        The two iterables are paired with ``zip`` (so iteration stops at the
        shorter one) and the result is a plain Python list.
        """
        return [ask_p/bid_p - 1 for ask_p,bid_p in zip(ask_price,bid_price)]

    def WAP(self,df):
        """Return a weighted average of two price columns.

        Columns are taken by position from ``df``: price, weight, price,
        weight. The result is
        ``(price0*weight0 + price1*weight1) / (weight0 + weight1)``.
        """
        first_price, first_weight, second_price, second_weight = (
            df[name] for name in df.columns[:4]
        )
        wap = (first_price*first_weight + second_price*second_weight)/(first_weight + second_weight)
        return wap

    def log_return(self,list_stock_prices):
        """Return the first difference of the natural log of the prices.

        A pandas object is used as-is, so its index is preserved; any other
        sequence (list, tuple, ndarray) is wrapped in a Series first. The
        first element of the result is NaN because it has no predecessor.
        """
        if not isinstance(list_stock_prices, (pd.Series, pd.DataFrame)):
            list_stock_prices = pd.Series(list_stock_prices)
        return np.log(list_stock_prices).diff()

    def realized_volatility(self,series_log_return):
        """Return the square root of the sum of squared log returns.

        NaN entries (such as the leading NaN produced by ``log_return``) are
        ignored. For pandas input this comes from pandas' NaN-skipping sum;
        for any other array-like it is done explicitly with ``np.nansum``,
        so both kinds of input give the same answer.
        """
        if isinstance(series_log_return, (pd.Series, pd.DataFrame)):
            return np.sqrt(np.sum(series_log_return**2))
        squared = np.square(np.asarray(series_log_return))
        return np.sqrt(np.nansum(squared))

In [6]:
# preprocessing for booking dataset:
# BAS
# WAP
# Log return
# Calculated volatility


# The helper methods do not read any instance state, so the helper object is
# built without a frame. This keeps the cell independent of whatever `df`
# variable happens to be bound in the kernel when it runs.
fe = Optiver_feature_engineered()
def preprocessings_book(df):
    """Aggregate order-book rows into one feature row per ``time_id``.

    Parameters
    ----------
    df : DataFrame with columns ``time_id``, ``seconds_in_bucket``,
        ``bid_price1/2``, ``ask_price1/2``, ``bid_size1/2``, ``ask_size1/2``.

    Returns
    -------
    DataFrame with ``time_id`` as a regular column, followed by the mean
    prices, spreads (``BAS1/2``), weighted prices (``WAP1/2``) and log returns
    (``logr1/2``), the size-weighted mean second of the bids and asks
    (``seconds_bids``, ``seconds_asks``), the summed sizes and the realized
    volatilities ``vol_1`` / ``vol_2``.

    The input frame is not modified.
    """
    # Work on a private copy: the caller's frame stays untouched, so calling
    # this function twice on the same frame gives the same result (the
    # `+ 1` below would otherwise be applied again on every call).
    df = df.copy()

    df['seconds_in_bucket'] = df['seconds_in_bucket'] + 1 # cuz 0 seconds will mess up the data internally
    df['seconds_bids'] = df['seconds_in_bucket']*(df['bid_size1']+df['bid_size2'])
    df['seconds_asks'] = df['seconds_in_bucket']*(df['ask_size1']+df['ask_size2'])
    df['BAS1'] = fe.BAS(df['ask_price1'],df['bid_price1'])
    df['BAS2'] = fe.BAS(df['ask_price2'],df['bid_price2'])
    df['WAP1'] = fe.WAP(df[['bid_price1','ask_size1','ask_price1','bid_size1']])
    df['WAP2'] = fe.WAP(df[['bid_price2','ask_size2','ask_price2','bid_size2']])

    # `transform` always returns a result aligned with the original rows, so
    # it can be assigned straight back. `apply` may prepend the group key to
    # the index for a same-length result, and the assignment then fails.
    df['logr1'] = df.groupby(['time_id'])['WAP1'].transform(fe.log_return)
    df['logr2'] = df.groupby(['time_id'])['WAP2'].transform(fe.log_return)

    apply_functions = {"bid_price1":"mean",
                       "bid_price2":"mean",
                       "ask_price1":"mean",
                       "ask_price2":"mean",
                       "BAS1":"mean",
                       "BAS2":"mean",
                       "WAP1":"mean", # null values to be ignored when taking mean
                       "WAP2":"mean", # null values to be ignored when taking mean
                       "logr1":"mean",
                       "logr2":"mean",
                       "seconds_bids":"sum",
                       "seconds_asks":"sum",
                       'bid_size1':"sum",
                       'bid_size2':"sum",
                       'ask_size1':"sum",
                       'ask_size2':"sum"
                      }
    # Grouping is done after every derived column exists.
    by_time = df.groupby(['time_id'])
    df_feature = by_time.agg(apply_functions)
    df_feature['vol_1'] = by_time['logr1'].apply(fe.realized_volatility)
    df_feature['vol_2'] = by_time['logr2'].apply(fe.realized_volatility)

    # Turn the size-weighted sums of seconds into size-weighted means.
    df_feature['seconds_bids'] = df_feature['seconds_bids']/(df_feature['bid_size1'] + df_feature['bid_size2'])
    df_feature['seconds_asks'] = df_feature['seconds_asks']/(df_feature['ask_size1'] + df_feature['ask_size2'])

    # `seconds_in_bucket` is deliberately left out of the aggregation, so
    # there is nothing to drop from the result.
    return df_feature.reset_index()

# Preprocessing for trading dataset:

def preprocessings_trade(df):
    """Aggregate trade rows into one feature row per ``time_id``.

    Parameters
    ----------
    df : DataFrame with columns ``time_id``, ``seconds_in_bucket``,
        ``price``, ``size`` and ``order_count``.

    Returns
    -------
    DataFrame with ``time_id`` as a regular column, followed by the summed
    ``order_count`` and ``size``, the size-weighted mean second
    (``seconds_size``), the mean ``price``, the mean log return (``logr_p``),
    the price range (``spread``) and the realized volatility ``vol_p``.

    The input frame is not modified.
    """
    # Work on a private copy so repeated calls on the same frame are
    # idempotent (the `+ 1` shift is not applied again).
    df = df.copy()

    df['seconds_in_bucket'] = df['seconds_in_bucket'] + 1
    df['seconds_size'] = df['seconds_in_bucket']*df['size']
    # `transform` keeps the original row index, so the result can be assigned
    # back directly.
    df['logr_p'] = df.groupby(['time_id'])['price'].transform(fe.log_return)

    apply_func = {
        'order_count':'sum',
        'size':'sum',
        'seconds_size':'sum',
        'price':'mean',
        'logr_p':'mean'
    }

    by_time = df.groupby(['time_id'])
    df_feature = by_time.agg(apply_func)
    df_feature['spread'] = by_time['price'].max() - by_time['price'].min()
    df_feature['vol_p'] = by_time['logr_p'].apply(fe.realized_volatility)

    # Turn the size-weighted sum of seconds into a size-weighted mean.
    df_feature['seconds_size'] = df_feature['seconds_size']/df_feature['size']

    # `seconds_in_bucket` is not aggregated, so it never has to be dropped.
    return df_feature.reset_index()

# Build the per-time_id feature tables for the book and trade data.
# NOTE(review): unlike the training frames, book_test / trade_test are not
# filtered to stock_id 0 in the cells shown here - confirm that is intended.
book_train_feature = preprocessings_book(book_train)
book_test_feature = preprocessings_book(book_test)
trade_train_feature = preprocessings_trade(trade_train)
trade_test_feature = preprocessings_trade(trade_test)

# The merge of these feature tables with `train` / `test` is done once, in the
# following cell; the identical merge that used to be repeated here was
# redundant.

In [7]:
# Training table: book features, left-joined with the trade features, then
# right-joined onto the target rows; rows with any missing value are removed.
dataset = book_train_feature.merge(trade_train_feature, how='left', on=['time_id'])
df = (
    dataset
    .merge(train, how='right', on=['time_id'])
    .drop(columns='stock_id')
    .dropna()
)

# Validation table: same joins against `test`; rows with missing features are
# kept so that every test row is still present.
dataset_val = book_test_feature.merge(trade_test_feature, how='left', on=['time_id'])
df_val = (
    dataset_val
    .merge(test, how='right', on=['time_id'])
    .drop(columns=['stock_id','row_id'])
)

In [8]:
# book_train_feature.describe(percentiles = [i/10 for i in range(1,10)])

from IPython.display import display

# Show the first rows of both tables: the training table is rendered through
# display(), the validation table as the cell's final expression.
display(df.head())
df_val.head()

Unnamed: 0,time_id,bid_price1,bid_price2,ask_price1,ask_price2,BAS1,BAS2,WAP1,WAP2,logr1,...,vol_1,vol_2,order_count,size,seconds_size,price,logr_p,spread,vol_p,target
0,5,1.003314,1.003139,1.004169,1.00432,0.000852,0.001178,1.003725,1.003661,7.613697e-06,...,0.004499,0.006999,110,3179,302.602705,1.003722,3.4e-05,0.002379,0.002006,0.004136
1,11,1.000011,0.99987,1.000406,1.000541,0.000394,0.000671,1.000239,1.000206,1.810376e-06,...,0.001204,0.002476,57,1289,415.373157,1.000206,2.8e-05,0.001104,0.000901,0.001445
2,16,0.999204,0.999007,0.999929,1.000127,0.000725,0.001121,0.999542,0.99968,-1.109168e-05,...,0.002369,0.004801,68,2161,367.290606,0.999204,-0.000106,0.003064,0.001961,0.002168
3,31,0.998445,0.998255,0.999304,0.999413,0.000861,0.00116,0.998832,0.998633,-2.376725e-05,...,0.002574,0.003637,59,1962,366.336391,0.99902,-0.000162,0.002259,0.001561,0.002195
4,62,0.999407,0.999216,0.999804,0.999913,0.000397,0.000697,0.999619,0.999626,-1.02174e-08,...,0.001894,0.003257,89,1791,323.361809,0.999618,-1.3e-05,0.000792,0.000871,0.001747


Unnamed: 0,time_id,bid_price1,bid_price2,ask_price1,ask_price2,BAS1,BAS2,WAP1,WAP2,logr1,...,ask_size2,vol_1,vol_2,order_count,size,seconds_size,price,logr_p,spread,vol_p
0,4,1.000049,0.999656,1.000606,1.000721,0.000557,0.001066,1.000405,1.00055,0.000147,...,59.0,0.000294,0.000252,11.0,201.0,26.40796,1.000151,-0.000143,0.000295,0.000295
1,32,,,,,,,,,,...,,,,,,,,,,
2,34,,,,,,,,,,...,,,,,,,,,,


In [9]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn import linear_model, kernel_ridge, svm, neighbors, gaussian_process, cross_decomposition, tree, ensemble, neural_network
import xgboost

In [10]:
# Candidate regressors that are cross-validated and compared below.
Regress_alg = [
    # Support Vector Machine - Regressions
#     svm.SVR(), # kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1
    svm.NuSVR(), # nu=0.5, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, tol=0.001, cache_size=200, verbose=False, max_iter=-1
    # LinearSVR shuffles the data internally; a fixed random_state keeps its
    # scores comparable between runs.
    svm.LinearSVR(random_state=42), # epsilon=0.0, tol=0.0001, C=1.0, loss='epsilon_insensitive', fit_intercept=True, intercept_scaling=1.0, dual=True, verbose=0, max_iter=1000
]

In [11]:
# bring the right tool for splitting the data and normalizing the data
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import preprocessing

# Split the data
# A fixed random_state makes the hold-out split - and therefore every score
# reported below - reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop(columns='target'), df['target'], test_size=.2, random_state=42
)


# Normalizing the train,test predictor variables.
scaler = preprocessing.StandardScaler()

# Normalize the train predictors. The scaled values go into a new frame (same
# columns and index) instead of being written into the frame returned by the
# split.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)

# Apply normalization traits to the test predictors (statistics come from the
# training split only).
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

print(f'Train dataset shape: {X_train.shape}')
print(f'Test dataset shape: {X_test.shape}')
print(f'Train target dataset shape: {Y_train.shape}')
print(f'Test target dataset shape: {Y_test.shape}')

def rmspe(y_true, y_pred):
    '''
    Negated Root Mean Square Percentage Error between two arrays.

    The relative error (y_true - y_pred) / y_true is squared, averaged and
    square-rooted; the result is returned with a minus sign, so values closer
    to zero mean a smaller error.
    '''
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return -np.sqrt(mean_squared)

# Wrap rmspe as a scikit-learn scorer. rmspe itself already returns the
# negated error, and no sign-related option is passed to make_scorer here.
rmspe_loss = metrics.make_scorer(rmspe)

Train dataset shape: (3064, 26)
Test dataset shape: (766, 26)
Train target dataset shape: (3064,)
Test target dataset shape: (766,)


In [12]:
# Pay Homage to Titanic dataset kaggler
MLA_columns = ['algorithm Name', 'algorithm Parameters','algorithm Train R2', 'algorithm Test R2','algorithm Test RMSPE','algorithm Time']

#create table to compare MLA predictions
# (alias of the training target; not used anywhere else in this cell)
MLA_predict = Y_train

scoring = {
    'r2': 'r2',
    'rmspe': rmspe_loss
}

# Collect one record per algorithm and build the comparison table in a single
# step. Filling an empty frame cell by cell leaves every column as `object`
# dtype; building from records gives the score columns a numeric dtype.
MLA_records = []
for alg in Regress_alg:

    #set name and parameters
    MLA_name = alg.__class__.__name__

    #score model with cross validation:
    cv_results = cross_validate(alg, X_train, Y_train, return_train_score=True, scoring=scoring) # 5 fold 

    MLA_records.append({
        'algorithm Name': MLA_name,
        'algorithm Parameters': str(alg.get_params()),
        'algorithm Train R2': cv_results['train_r2'].mean(),
        'algorithm Test R2': cv_results['test_r2'].mean(),
        'algorithm Test RMSPE': cv_results['test_rmspe'].mean(),
        'algorithm Time': cv_results['fit_time'].mean(),
    })

MLA_compare = pd.DataFrame(MLA_records, columns = MLA_columns)

# The RMSPE score is stored negated, so sorting in descending order puts the
# smallest error first.
MLA_compare = MLA_compare.sort_values(by = ['algorithm Test RMSPE'], ascending = False)
MLA_compare

Unnamed: 0,algorithm Name,algorithm Parameters,algorithm Train R2,algorithm Test R2,algorithm Test RMSPE,algorithm Time
0,NuSVR,"{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'd...",0.985075,0.481761,-0.508788,4.31844
1,LinearSVR,"{'C': 1.0, 'dual': True, 'epsilon': 0.0, 'fit_...",0.516482,0.496491,-0.532575,0.194766


In [13]:
# Fit every candidate on the training split and keep its hold-out predictions
# as one column named after the estimator class.
prediction_columns = []
for alg in Regress_alg:
    model = alg.fit(X_train,Y_train)
    prediction_columns.append(pd.Series(model.predict(X_test),name=alg.__class__.__name__))

# Concatenate once at the end. This avoids seeding the table with an empty
# Series and then dropping the dummy column it leaves behind.
predicted = pd.concat(prediction_columns,axis=1) if prediction_columns else pd.DataFrame()

In [16]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Hold-out predictions side by side with the true targets (positional index).
predicted_graph = pd.concat([predicted,Y_test.reset_index(drop=True)],axis=1)

# End points of the y = x reference line. They are passed to plotly directly;
# wrapping them in a variable called `df` would overwrite the training table
# that earlier cells read.
x = [0,0.018]
y = x

# fig1: predicted vs true scatter with an OLS trend line.
# fig2: the two reference points only.
fig1 = px.scatter(predicted_graph, x='NuSVR', y='target', opacity=0.65, trendline='ols', trendline_color_override='blue')
fig2 = px.scatter(x=x,y=y,opacity=1.0,trendline='ols',trendline_color_override='red')

# Combine the pieces in a single figure: raw points, fitted line, reference.
fig = make_subplots()

fig.add_trace((go.Scatter(x=fig1.data[0]['x'],y=fig1.data[0]['y'],name='data',mode='markers',opacity=0.65)))
fig.add_trace((go.Scatter(x=fig1.data[1]['x'],y=fig1.data[1]['y'],name='linear fit to pred vs true',fill='tonexty')))
fig.add_trace((go.Scatter(x=fig2.data[0]['x'],y=fig2.data[0]['y'],name='reference, 45 deg line',fill='tonexty')))