## Main code for Kaggle - Optiver Realized Volatility Prediction
@LaurentMombaerts 

In [11]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


## MACHINE TO SET UP

In [12]:
###########################
# Execution environment selector. The data-loading code further down
# branches on this value and recognises 'local' and 'kaggle'.
machine = 'local'
###########################

**Lib Import / Data loading**

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import glob
import time

# ML
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Maths
from scipy.interpolate import interp1d
from arch import arch_model

# Paths tricks
import os
from pathlib import Path

# Support code
from support_file import *
from information_measures import *

# Resolve where the competition CSV files live for the selected machine.
# `datapath` is forwarded to the feature/prediction code later on; it is a
# real directory locally and the placeholder 0 on Kaggle.
if machine == 'local':
    datapath = os.path.join(str(Path.home()), 'ownCloud', 'Data', 'Kaggle', 'optiver-realized-volatility-prediction')
    csv_dir = datapath
elif machine == 'kaggle':
    csv_dir = '../input/optiver-realized-volatility-prediction'
    datapath = 0
else:
    # Fail here rather than with a NameError on `train`/`test` further down.
    raise ValueError(f"Unknown machine {machine!r}: expected 'local' or 'kaggle'")

# Load dataset
train = pd.read_csv(os.path.join(csv_dir, 'train.csv'))
all_stocks_ids = train['stock_id'].unique()
all_time_ids = train['time_id'].unique()

# Keep only the '<stock_id>-<time_id>' key and the target.
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id', 'target']]

# Load test ids
test = pd.read_csv(os.path.join(csv_dir, 'test.csv'))
test = test.drop(['stock_id', 'time_id'], axis=1)
    

**Functions**

In [4]:
# Competition metric
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of y_pred against y_true.

    Each error is expressed relative to the corresponding true value before
    being squared, averaged and square-rooted.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_error)

# Prediction function (choose here which prediction strategy to use)
def prediction_function(pred, machine, targets, all_stocks_ids, datapath):
    """Run the prediction strategy named by `pred` and return a row_id/target frame.

    Parameters
    ----------
    pred : str
        Strategy name: 'entropy', 'new_version' or 'garch'.
    machine : str
        'local' evaluates on the training data; any other value produces
        predictions for the test set (CatBoost strategies only).
    targets :
        Training targets passed to the model's `fit` (CatBoost strategies).
    all_stocks_ids :
        Forwarded unchanged to the feature-computation function.
    datapath :
        Forwarded to the feature-computation function; for 'garch' it is the
        directory that contains 'book_train.parquet'.

    Returns
    -------
    pandas.DataFrame
        Frame holding a 'row_id' and a 'target' column.

    Raises
    ------
    ValueError
        If `pred` is not one of the supported strategies.
    NotImplementedError
        If `pred == 'garch'` is requested on a non-local machine.
    """
    if pred == 'entropy':
        return _catboost_predictions(computeFeatures_wEntropy, machine, targets, all_stocks_ids, datapath)

    if pred == 'new_version':
        return _catboost_predictions(computeFeatures_july, machine, targets, all_stocks_ids, datapath)

    if pred == 'garch':
        if machine != 'local':
            # Only the local evaluation path exists for this strategy.
            raise NotImplementedError("The 'garch' strategy is only implemented for machine='local'")
        return _garch_local_evaluation(datapath)

    raise ValueError(f"Unknown prediction strategy {pred!r}: expected 'entropy', 'garch' or 'new_version'")


def _catboost_predictions(compute_features, machine, targets, all_stocks_ids, datapath):
    """Fit a default CatBoostRegressor on the train features and predict."""
    # Features computation (same call order as before: test first, then train).
    # NOTE(review): the test features are unused when machine == 'local'; the
    # call is kept because the feature function may have side effects - confirm.
    df_features_encoded_test = compute_features(machine=machine, dataset='test', all_stocks_ids=all_stocks_ids, datapath=datapath)
    df_features_encoded_train = compute_features(machine=machine, dataset='train', all_stocks_ids=all_stocks_ids, datapath=datapath)

    # Training model
    X = df_features_encoded_train.drop(['row_id'], axis=1)
    y = targets
    model = CatBoostRegressor(verbose=0)
    model.fit(X, y)

    if machine == 'local':
        # Local runs score the model on the data it was trained on.
        scored_features = df_features_encoded_train
        yhat = model.predict(X)
        print('New model catboost perf : ', rmspe(y, yhat))
    else:
        # Predicting targets from test
        scored_features = df_features_encoded_test
        yhat = model.predict(scored_features.drop(['row_id'], axis=1))

    # Pair each prediction with the row_id of the frame it was computed from.
    # Built positionally so a non-default index on the feature frame cannot
    # misalign ids and predictions.
    return pd.DataFrame({'row_id': scored_features['row_id'].to_numpy(), 'target': yhat})


def _garch_local_evaluation(datapath):
    """Fit GARCH per stock on the local train books, print scores, return predictions."""
    book_path_train = glob.glob(os.path.join(datapath, 'book_train.parquet', '*'))

    # fit garch and predict
    prediction = garch_volatility_per_stock(list_file=book_path_train, prediction_column_name='pred')

    # Merge and evaluate results
    # NOTE: relies on the module-level `train` frame (row_id, target).
    prediction = train.merge(prediction[['row_id', 'pred']], on=['row_id'], how='left')
    prediction = prediction[prediction.pred.notnull()]

    # Estimate performances
    R2 = round(r2_score(y_true=prediction['target'], y_pred=prediction['pred']), 3)
    RMSPE = round(rmspe(y_true=prediction['target'], y_pred=prediction['pred']), 3)

    print('--')
    print(f'Performance of prediction: R2 score: {R2}, RMSPE: {RMSPE}')

    # Replace the true target by the prediction to get a submission-shaped frame.
    prediction = prediction.drop(columns=['target'])
    prediction = prediction.rename(columns={'pred': 'target'})

    return prediction

**Submission**

In [None]:
# Run the selected prediction strategy and write its output to submission.csv
# (index column omitted).
df_submission = prediction_function(
    pred='new_version',
    machine=machine,
    targets=train['target'],
    all_stocks_ids=all_stocks_ids,
    datapath=datapath,
)
df_submission.to_csv('submission.csv', index=False)