## Main code for Kaggle - Optiver Realized Volatility Prediction
@LaurentMombaerts 

In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


**Lib Import / Data loading**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import glob
import time

# ML
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
#from keras.utils import to_categorical

# Maths
import nolds
from scipy.interpolate import interp1d

# Paths tricks
import os
from pathlib import Path

# Support code
from support_file import *

datapath = os.path.join(str(Path.home()), 'ownCloud', 'Data', 'Kaggle', 'optiver-realized-volatility-prediction')

# Load dataset
train = pd.read_csv(os.path.join(datapath,'train.csv')) 
all_stocks_ids = train['stock_id'].unique()
all_time_ids = train['time_id'].unique()

train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]

**Functions**

In [3]:
# Competition metric
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

# Prediction function (chose here which prediction strategy to use)
def prediction_function(pred,book_path_train,trade_path_train,targets,book_path_test,trade_path_test):
    
    if pred == 'naive':
        # Naive prediction (persistence model)
        prediction = past_realized_volatility_per_stock(list_file=book_path_train,prediction_column_name='pred')
        
        # Merge and evaluate results
        prediction = train.merge(prediction[['row_id','pred']], on = ['row_id'], how = 'left')
        print(prediction.head(5))

        # Estimate performances
        R2 = round(r2_score(y_true = prediction['target'], y_pred = prediction['pred']),3)
        RMSPE = round(rmspe(y_true = prediction['target'], y_pred = prediction['pred']),3)

        print('--')
        print(f'Performance of prediction: R2 score: {R2}, RMSPE: {RMSPE}')
        
        prediction = prediction.drop(columns=['target'])
        prediction = prediction.rename(columns={'pred': 'target'})

    if pred == 'stupid_RF':
        # Stupid nonlinear regression between persistence and next volatility (random forest)
        prediction = stupidForestPrediction(book_path_train=book_path_train,
                                            prediction_column_name='pred',
                                            train_targets_pd=targets,
                                            book_path_test=book_path_test)
        
    if pred == 'entropy_based':
        prediction = entropy_Prediction(book_path_train=book_path_train,
                                            prediction_column_name='pred',
                                            train_targets_pd=targets,
                                            book_path_test=book_path_test)
        
    return prediction

**Test code**

In [4]:
# Try a prediction code

# Glob book file train (contains all paths for each file in this folder)
list_order_book_file_train = glob.glob(os.path.join(datapath,'book_train.parquet','*')) 
list_order_book_file_test = glob.glob(os.path.join(datapath,'book_test.parquet','*'))
list_trade_file_train = glob.glob(os.path.join(datapath,'trade_train.parquet','*')) 
list_trade_file_test = glob.glob(os.path.join(datapath,'trade_test.parquet','*'))

# Given variables
pred = 'entropy_based'
book_path_train = list_order_book_file_train
trade_path_train = list_trade_file_train
targets = train
book_path_test = list_order_book_file_test
trade_path_test = list_trade_file_test

In [5]:
def entropy_from_book(book_stock_time):
    wap = compute_wap(book_stock_time)
    t_init = book_stock_time['seconds_in_bucket']
    t_new = np.arange(np.max(t_init))
    
    # Closest neighbour interpolation (no changes in wap between lines)
    nearest = interp1d(t_init, wap, kind='nearest')
    resampled_wap = nearest(t_new)
    
    # Compute sample entropy
    sampleEntropy = nolds.sampen(resampled_wap)
    
    return sampleEntropy

In [40]:
book_all_features = pd.DataFrame()


#for file in book_path_train:
start = time.time()

file = book_path_train[0]
book_stock = pd.read_parquet(file)
stock_id = file.split('=')[1]
print('stock id computing = ' + str(stock_id))
for time_id in all_time_ids:     
    # Access book data at this time + stock
    book_stock_time = book_stock[book_stock['time_id'] == time_id]
    
    # Create feature matrix
    book_features = pd.DataFrame()
    book_features['stock_id'] = [stock_id]
    book_features['time_id'] = [time_id]
    book_features['row_id'] = book_features['time_id'].apply(lambda x:f'{stock_id}-{x}')
    
    # Hand-designed features
    book_features['volatility'] = realized_volatility_from_book_pd(book_stock_time=book_stock_time)
    book_features['entropy'] = entropy_from_book(book_stock_time=book_stock_time)  
        
    
    
    # Concatenate features, rows
    book_all_features = pd.concat([book_all_features,book_features])

print('Computing one stock entropy took', time.time() - start, 'seconds.')

# Merge targets
#book_all_features = book_all_features.merge(train, on = ['row_id'])
book_all_features = train.merge(book_all_features, on = ['row_id'])

stock id computing = 17
Computing one stock entropy took 140.93975806236267 seconds.


In [10]:
aa = all_stocks_ids.reshape(1,-1)

print(type(aa))
aa = list(range(1,20))
print(type(aa))

onehot_encoder = OneHotEncoder(sparse=False).fit(aa)
hot_encoded_stock = onehot_encoder.transform(17)

<class 'numpy.ndarray'>
<class 'list'>


ValueError: Expected 2D array, got 1D array instead:
array=[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [41]:
book_all_features

Unnamed: 0,row_id,target,stock_id,time_id,volatility,entropy
0,17-5,0.004615,17,5,0.004091,0.665200
1,17-11,0.002474,17,11,0.002155,0.239818
2,17-16,0.002831,17,16,0.002566,0.629409
3,17-31,0.002201,17,31,0.002221,0.206020
4,17-62,0.002090,17,62,0.002155,0.243821
...,...,...,...,...,...,...
3825,17-32751,0.002470,17,32751,0.003227,0.253120
3826,17-32753,0.002465,17,32753,0.002969,0.333961
3827,17-32758,0.002946,17,32758,0.002157,0.311557
3828,17-32763,0.006927,17,32763,0.003487,0.334461


**Main evaluation code**

In [15]:
# Glob book file train (contains all paths for each file in this folder)
list_order_book_file_train = glob.glob(os.path.join(datapath,'book_train.parquet','*')) 
list_order_book_file_test = glob.glob(os.path.join(datapath,'book_test.parquet','*'))
list_trade_file_train = glob.glob(os.path.join(datapath,'trade_train.parquet','*')) 
list_trade_file_test = glob.glob(os.path.join(datapath,'trade_test.parquet','*'))

# Compute predictions
prediction = prediction_function(pred='stupid_RF',
                                 book_path_train=list_order_book_file_train,
                                 trade_path_train=list_trade_file_train,
                                 targets=train,
                                 book_path_test=list_order_book_file_test,
                                 trade_path_test=list_trade_file_test)

**Notes**

In [14]:
prediction

Unnamed: 0,row_id,target
0,0-4,0.001062
