In [1]:
import os
import gc
import pickle
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from sklearn.model_selection import KFold

import xgboost as xgb

tqdm.pandas()
%matplotlib inline


import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the competition CSVs and print a schema summary for each of them.
train = pd.read_csv("../input/optiver-realized-volatility-prediction/train.csv")
test = pd.read_csv("../input/optiver-realized-volatility-prediction/test.csv")
ss = pd.read_csv("../input/optiver-realized-volatility-prediction/sample_submission.csv")

# (heading, frame) pairs, reported in the order train -> test -> sample submission
for heading, frame in (
    ("Train Info: \n", train),
    ("\nTest Info: \n", test),
    ("\nSample Submission Format: \n", ss),
):
    print(heading)
    frame.info()

Train Info: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428932 entries, 0 to 428931
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   stock_id  428932 non-null  int64  
 1   time_id   428932 non-null  int64  
 2   target    428932 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 9.8 MB

Test Info: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   stock_id  3 non-null      int64 
 1   time_id   3 non-null      int64 
 2   row_id    3 non-null      object
dtypes: int64(2), object(1)
memory usage: 200.0+ bytes

Sample Submission Format: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   row_id  3 non-null      object 
 1   target  3 non-null    

# Feature Engineering and Dataset

Alright, we now have a basic idea of the what and the how of the problem statement, so let's get to it. There are already some pretty good baselines, so instead of describing more feature-centric approaches, I will build an XGBoost (XGB) pipeline here. Let's go.

In [3]:
# feature utils
def calculate_wap(df, rank="1"):
    """
    Weighted average price (WAP) of a single order-book level.

    For the level selected by ``rank`` the value computed for every row is:

        (bid_price * ask_size + bid_size * ask_price) / (bid_size + ask_size)

    :param df: order-book table holding the ``bid_price{rank}``, ``ask_price{rank}``,
        ``bid_size{rank}`` and ``ask_size{rank}`` columns
    :param rank: suffix of the book level to use (this notebook calls it with "1" and "2")
    :return: the row-wise WAP for the requested level
    """
    bid_price = df[f"bid_price{rank}"]
    ask_price = df[f"ask_price{rank}"]
    bid_size = df[f"bid_size{rank}"]
    ask_size = df[f"ask_size{rank}"]

    # Each price is weighted by the size resting on the opposite side of the book.
    cross_weighted = bid_price * ask_size + bid_size * ask_price
    depth = bid_size + ask_size
    return cross_weighted / depth


def calculate_logreturn(series):
    """Return the first difference of the natural logarithm of ``series``."""
    log_values = np.log(series)
    return log_values.diff()


def calculate_rv(series):
    """
    Square root of the sum of squared entries of ``series``.

    The notebook's configuration applies it to log-return columns as the
    realized-volatility aggregate.
    """
    sum_of_squares = np.sum(np.square(series))
    return np.sqrt(sum_of_squares)


def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct_values = np.unique(series)
    return distinct_values.size


def get_stats_window(df, seconds_in_bucket, features_dict, add_suffix=False):
    """
    Aggregate per-``time_id`` statistics over the tail of each time bucket.

    Rows whose ``seconds_in_bucket`` is below the threshold are discarded; the
    remaining rows are grouped by ``time_id`` and aggregated as described by
    ``features_dict``.

    :param df: table with ``time_id``, ``seconds_in_bucket`` and every column
        named in ``features_dict``
    :param seconds_in_bucket: inclusive lower bound of the window (0 keeps every row)
    :param features_dict: mapping ``column -> list of aggregations`` handed to
        ``DataFrameGroupBy.agg``
    :param add_suffix: when True, ``"_<seconds_in_bucket>"`` is appended to EVERY
        column, including the key (``"time_id_"`` becomes ``"time_id__<seconds>"``)
    :return: one row per ``time_id``; feature columns are named
        ``"<column>_<aggregation>"`` and the key column is ``"time_id_"``
    """
    in_window = df["seconds_in_bucket"] >= seconds_in_bucket
    df_feature = df.loc[in_window].groupby(["time_id"]).agg(features_dict).reset_index()

    # agg() with lists of functions yields two-level column labels; flatten them.
    # The key column is ("time_id", "") and therefore becomes "time_id_".
    df_feature.columns = ["_".join(col) for col in df_feature.columns]

    if add_suffix:
        df_feature = df_feature.add_suffix("_" + str(seconds_in_bucket))

    return df_feature

In [4]:
# configs
class cfg:
    """Notebook-wide configuration: input paths, aggregation specs and model parameters."""
    
    # Locations of the competition files, keyed by "<split>_<kind>".
    paths = {
        # train path
        "train_csv": "../input/optiver-realized-volatility-prediction/train.csv",
        "train_book": "../input/optiver-realized-volatility-prediction/book_train.parquet",
        "train_trade": "../input/optiver-realized-volatility-prediction/trade_train.parquet",

        # test path
        "test_csv": "../input/optiver-realized-volatility-prediction/test.csv",
        "test_book": "../input/optiver-realized-volatility-prediction/book_test.parquet",
        "test_trade": "../input/optiver-realized-volatility-prediction/trade_test.parquet",
    }

    # Order-book aggregations: column -> functions applied per time_id.
    # Key order and list order determine the order of the generated feature
    # columns, so keep them stable once models have been trained on them.
    feature_dict_book = {
        "wap1": [np.sum, np.mean, np.std],
        "wap2": [np.sum, np.mean, np.std],
        "log_return1": [np.sum, calculate_rv, np.mean, np.std],
        "log_return2": [np.sum, calculate_rv, np.mean, np.std],
        "wap_balance": [np.sum, np.mean, np.std],
        "volume_imbalance": [np.sum, np.mean, np.std],
        "total_volume": [np.sum, np.mean, np.std],
        "price_spread1": [np.sum, np.mean, np.std],
        "price_spread2": [np.sum, np.mean, np.std],
        "bid_spread": [np.sum, np.mean, np.std],
        "ask_spread": [np.sum, np.mean, np.std],
    }

    # Trade aggregations: column -> functions applied per time_id.
    feature_dict_trade = {
        "log_return": [calculate_rv],
        "seconds_in_bucket": [count_unique],
        "size": [np.sum],
        "order_count": [np.mean]
    }
    
    # Parameter sets for xgb.train, keyed by model name.
    model_params = {
        "xgb_bl": {
            "objective": "reg:squarederror",
            "booster": "gbtree",
            "nthread": -1,
            "eta": 0.3,
            "max_depth": 8,
            "min_child_weight": 1,
            # NOTE(review): the XGBoost docs describe "gradient_based" sampling as
            # GPU-only; confirm this is intended while tree_method below stays
            # commented out.
            "sampling_method": "gradient_based",
#             "tree_method": "gpu_hist"  # turn it on for GPU
        }
    }

In [5]:
# order book features
def get_book_features(file_path):
    """
    Build per-``time_id`` order-book features for one stock partition.

    The book is enriched with WAP, log-return, imbalance and spread columns, then
    aggregated with ``cfg.feature_dict_book`` over the whole bucket and over the
    tails starting at 450, 300 and 150 seconds.

    :param file_path: parquet partition path containing ``"stock_id=<id>"``; the
        text after the first "=" is used as the stock id in ``row_id``
    :return: one row per ``time_id`` holding the full-bucket features, then the
        ``_450``, ``_300`` and ``_150`` features, then ``row_id``
        (``"<stock_id>-<time_id>"``)
    """
    book_df = pd.read_parquet(file_path)

    # calculate wap
    book_df["wap1"] = calculate_wap(book_df, rank="1")
    book_df["wap2"] = calculate_wap(book_df, rank="2")

    # calculate log return inside each time_id
    # transform() always returns a result aligned with the original row index.
    # groupby.apply() prepends the group keys on pandas >= 2.0, which breaks the
    # column assignment.
    book_df["log_return1"] = book_df.groupby(["time_id"])["wap1"].transform(calculate_logreturn)
    book_df["log_return2"] = book_df.groupby(["time_id"])["wap2"].transform(calculate_logreturn)

    # calculate balance
    book_df["wap_balance"] = abs(book_df["wap1"] - book_df["wap2"])
    book_df["volume_imbalance"] = abs(
        (book_df["ask_size1"] + book_df["ask_size2"]) - (book_df["bid_size1"] + book_df["bid_size2"]))
    book_df["total_volume"] = (
        book_df["ask_size1"] + book_df["ask_size2"] + book_df["bid_size1"] + book_df["bid_size2"])

    # calculate spread (relative to the mid price of the level)
    book_df["price_spread1"] = (book_df["ask_price1"] - book_df["bid_price1"]) / (
            (book_df["ask_price1"] + book_df["bid_price1"]) / 2)
    book_df["price_spread2"] = (book_df["ask_price2"] - book_df["bid_price2"]) / (
            (book_df["ask_price2"] + book_df["bid_price2"]) / 2)

    book_df["bid_spread"] = book_df["bid_price1"] - book_df["bid_price2"]
    book_df["ask_spread"] = book_df["ask_price1"] - book_df["ask_price2"]

    # full-bucket statistics form the base table
    book_df_merged = get_stats_window(book_df, seconds_in_bucket=0, features_dict=cfg.feature_dict_book)

    # merge the tail-window statistics; the 450 -> 300 -> 150 order fixes the
    # column order of the result
    for window_start in (450, 300, 150):
        window_key = f"time_id__{window_start}"
        window_stats = get_stats_window(book_df, seconds_in_bucket=window_start,
                                        features_dict=cfg.feature_dict_book, add_suffix=True)
        book_df_merged = book_df_merged.merge(
            window_stats, how="left", left_on="time_id_", right_on=window_key
        ).drop(columns=[window_key])

    # the stock id is the same for every row, so parse it from the path once
    stock_id = file_path.split("=")[1]
    book_df_merged["row_id"] = stock_id + "-" + book_df_merged["time_id_"].astype(str)

    return book_df_merged.drop(columns=["time_id_"])

# trade features
def get_trade_features(file_path):
    """
    Build per-``time_id`` trade features for one stock partition.

    Trade prices are turned into log returns per ``time_id`` and aggregated with
    ``cfg.feature_dict_trade`` over the whole bucket and over the tails starting
    at 450, 300 and 150 seconds. Every feature column is prefixed with ``"trade_"``.

    :param file_path: parquet partition path containing ``"stock_id=<id>"``; the
        text after the first "=" is used as the stock id in ``row_id``
    :return: one row per ``time_id`` holding the ``trade_*`` features and
        ``row_id`` (``"<stock_id>-<time_id>"``)
    """
    trade_df = pd.read_parquet(file_path)

    # transform() always returns a result aligned with the original row index.
    # groupby.apply() prepends the group keys on pandas >= 2.0, which breaks the
    # column assignment.
    trade_df["log_return"] = trade_df.groupby(["time_id"])["price"].transform(calculate_logreturn)

    # full-bucket statistics form the base table
    trade_df_merged = get_stats_window(trade_df, seconds_in_bucket=0, features_dict=cfg.feature_dict_trade)

    # merge the tail-window statistics; the 450 -> 300 -> 150 order fixes the
    # column order of the result
    for window_start in (450, 300, 150):
        window_key = f"time_id__{window_start}"
        window_stats = get_stats_window(trade_df, seconds_in_bucket=window_start,
                                        features_dict=cfg.feature_dict_trade, add_suffix=True)
        trade_df_merged = trade_df_merged.merge(
            window_stats, how="left", left_on="time_id_", right_on=window_key
        ).drop(columns=[window_key])

    trade_df_merged = trade_df_merged.add_prefix("trade_")

    # the stock id is the same for every row, so parse it from the path once
    stock_id = file_path.split("=")[1]
    trade_df_merged["row_id"] = stock_id + "-" + trade_df_merged["trade_time_id_"].astype(str)

    return trade_df_merged.drop(columns=["trade_time_id_"])

In [6]:
class GetData:
    """
    Assemble the feature table for a set of (stock_id, time_id) rows.

    The per-stock book and trade features are computed in parallel and joined
    onto the input rows through ``row_id``. Stock-level and time-level
    aggregates of the volatility columns are appended afterwards.
    """

    def __init__(self, df, book_path, trade_path):
        """
        :param df: frame with at least ``stock_id`` and ``time_id`` columns
        :param book_path: directory holding the ``stock_id=<id>`` order-book partitions
        :param trade_path: directory holding the ``stock_id=<id>`` trade partitions
        """
        # Work on a private copy so the caller's frame is never modified.
        self.df = df.copy(deep=True)
        self.order_book_path = book_path
        self.trade_path = trade_path

        self._get_rowid()

    def _get_rowid(self):
        """Add the ``row_id`` join key, formatted as ``"<stock_id>-<time_id>"``."""
        self.df["row_id"] = self.df["stock_id"].astype(str) + "-" + self.df["time_id"].astype(str)

    def _group_stats(self, key, suffix, columns):
        """Return mean/std/max/min of ``columns`` per ``key`` with flat, suffixed column names."""
        stats = self.df.groupby([key])[columns].agg(['mean', 'std', 'max', 'min']).reset_index()
        # flatten the two-level labels; the key column becomes "<key>_"
        stats.columns = ['_'.join(col) for col in stats.columns]
        return stats.add_suffix('_' + suffix)

    def get_time_stock(self):
        """
        Append per-stock and per-time aggregates of the volatility columns.

        For every column in ``vol_cols`` the mean/std/max/min across all rows of
        the same ``stock_id`` (suffix ``_stock``) and of the same ``time_id``
        (suffix ``_time``) are merged onto ``self.df``.

        :return: the updated ``self.df``
        """
        vol_cols = ['log_return1_calculate_rv', 'log_return2_calculate_rv',
                    'log_return1_calculate_rv_450', 'log_return2_calculate_rv_450',
                    'log_return1_calculate_rv_300', 'log_return2_calculate_rv_300',
                    'log_return1_calculate_rv_150', 'log_return2_calculate_rv_150',
                    'trade_log_return_calculate_rv', 'trade_log_return_calculate_rv_450',
                    'trade_log_return_calculate_rv_300', 'trade_log_return_calculate_rv_150']

        # Compute both aggregate tables before merging either of them.
        df_stock_id = self._group_stats('stock_id', 'stock', vol_cols)
        df_time_id = self._group_stats('time_id', 'time', vol_cols)

        # Merge with original dataframe, then discard the duplicated key columns
        merged = self.df.merge(df_stock_id, how='left', left_on=['stock_id'], right_on=['stock_id__stock'])
        merged = merged.merge(df_time_id, how='left', left_on=['time_id'], right_on=['time_id__time'])
        self.df = merged.drop(columns=['stock_id__stock', 'time_id__time'])
        return self.df

    def process_features(self, list_stock_ids):
        """
        Compute book and trade features for every stock id, one joblib task per stock.

        :param list_stock_ids: iterable of stock ids whose partitions should be read
        :return: the per-stock feature frames stacked into one frame keyed by ``row_id``
        """
        def parallel_helper(stock_id):
            book_sample_path = os.path.join(self.order_book_path, f"stock_id={stock_id}")
            trade_sample_path = os.path.join(self.trade_path, f"stock_id={stock_id}")

            # left join: keep every book row even when the stock had no trades
            return pd.merge(get_book_features(book_sample_path), get_trade_features(trade_sample_path),
                            on="row_id",
                            how="left")

        per_stock = Parallel(n_jobs=-1, verbose=1)(delayed(parallel_helper)(stock_id) for stock_id in list_stock_ids)
        return pd.concat(per_stock, ignore_index=True)

    def get_features(self):
        """
        Run the full pipeline: per-stock features, join on ``row_id``, then aggregates.

        Note that the result is stored back into ``self.df``.

        :return: the final feature frame (same object as ``self.df``)
        """
        features_df = self.process_features(self.df["stock_id"].unique())
        self.df = self.df.merge(features_df, on=["row_id"], how="left")

        return self.get_time_stock()

# Modeling

In [7]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` measured relative to ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)


def feval_rmspe(y_pred, xgb_dtrain):
    """
    Evaluation callback handed to ``xgb.train`` through ``feval``.

    :param y_pred: predictions for the rows of ``xgb_dtrain``
    :param xgb_dtrain: matrix whose labels are read with ``get_label()``
    :return: the tuple ``("RMSPE", score)``
    """
    labels = xgb_dtrain.get_label()
    score = rmspe(labels, y_pred)
    return "RMSPE", score


class TrainFer:
    """
    K-fold XGBoost trainer and ensemble predictor.

    ``train`` fits one booster per fold and pickles it into ``model_path``.
    ``infer`` loads every model file found there and averages their predictions.
    """

    def __init__(self, params_dict, n_splits, model_path, random_state=2021):
        """
        :param params_dict: parameter dictionary forwarded to ``xgb.train``
        :param n_splits: number of cross-validation folds used by ``train``
        :param model_path: directory the fold models are written to / read from
            (created when it does not exist yet)
        :param random_state: seed of the shuffled K-fold split
        """
        self.params = params_dict
        self.n_splits = n_splits
        self.random_state = random_state
        self.model_path = model_path

        if not os.path.isdir(model_path):
            os.makedirs(model_path)

    def train(self, X, y):
        """
        Fit one model per fold, save each to ``model_path`` and print the out-of-fold RMSPE.

        :param X: feature frame; must contain a ``stock_id`` column
        :param y: target series, positionally aligned with ``X``
        """
        oof_predictions = np.zeros(X.shape[0])
        kfold = KFold(n_splits=self.n_splits, random_state=self.random_state, shuffle=True)

        for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
            print(f"\nFold - {fold}\n")

            # assign() returns a new frame, so the cast never writes into a slice of X
            x_train = X.iloc[train_idx].assign(stock_id=lambda part: part["stock_id"].astype(int))
            x_val = X.iloc[val_idx].assign(stock_id=lambda part: part["stock_id"].astype(int))
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # weights of 1 / y^2 turn the squared error into a squared percentage error
            dtrain = xgb.DMatrix(x_train, label=y_train, weight=1/np.square(y_train), enable_categorical=True)
            dval = xgb.DMatrix(x_val, label=y_val, weight=1/np.square(y_val), enable_categorical=True)

            # NOTE(review): newer XGBoost releases deprecate `feval` in favour of
            # `custom_metric`. Confirm against the installed version.
            model = xgb.train(self.params,
                              dtrain=dtrain,
                              num_boost_round=100,
                              evals=[(dtrain, "dtrain"), (dval, "dval")],
                              verbose_eval=10,
                              feval=feval_rmspe)

            # context manager guarantees the file is flushed and closed
            with open(os.path.join(self.model_path, f"xgb_bl_{fold}.pkl"), "wb") as model_file:
                pickle.dump(model, model_file)
            oof_predictions[val_idx] = model.predict(dval)

        rmspe_score = rmspe(y, oof_predictions)
        print(f"OOF RMSPE: {rmspe_score}")

    def infer(self, x_test):
        """
        Average the predictions of every model stored in ``model_path``.

        :param x_test: feature matrix accepted by ``xgb.DMatrix``
        :return: array with one averaged prediction per row of ``x_test``
        :raises FileNotFoundError: when ``model_path`` contains no model file
        """
        # Sorted for a deterministic load order; sub-directories are skipped.
        model_files = sorted(
            name for name in os.listdir(self.model_path)
            if os.path.isfile(os.path.join(self.model_path, name))
        )
        if not model_files:
            raise FileNotFoundError(f"no model files found in {self.model_path!r}")

        n_models = len(model_files)
        test_predictions = np.zeros(x_test.shape[0])
        dtest = xgb.DMatrix(x_test)

        for name in model_files:
            # SECURITY: pickle.load can execute arbitrary code. Only point
            # model_path at files you produced or trust.
            with open(os.path.join(self.model_path, name), "rb") as model_file:
                model = pickle.load(model_file)
            # divide by the number of models actually loaded, not a fixed fold count
            test_predictions += model.predict(dtest) / n_models

        return test_predictions

In [8]:
# Build the test feature table: per-stock book/trade features are joined onto
# the rows of `test`, then the stock-level and time-level aggregates are added.
test_data = GetData(test, cfg.paths["test_book"], cfg.paths["test_trade"])
test_df = test_data.get_features()
# Preview the first rows of the resulting feature table.
test_df.head()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.2s finished


Unnamed: 0,stock_id,time_id,row_id,wap1_sum,wap1_mean,wap1_std,wap2_sum,wap2_mean,wap2_std,log_return1_sum,...,trade_log_return_calculate_rv_450_max_time,trade_log_return_calculate_rv_450_min_time,trade_log_return_calculate_rv_300_mean_time,trade_log_return_calculate_rv_300_std_time,trade_log_return_calculate_rv_300_max_time,trade_log_return_calculate_rv_300_min_time,trade_log_return_calculate_rv_150_mean_time,trade_log_return_calculate_rv_150_std_time,trade_log_return_calculate_rv_150_max_time,trade_log_return_calculate_rv_150_min_time
0,0,4,0-4,3.001215,1.000405,0.00017,3.00165,1.00055,0.000153,0.000294,...,,,,,,,,,,
1,0,32,0-32,,,,,,,,...,,,,,,,,,,
2,0,34,0-34,,,,,,,,...,,,,,,,,,,


In [9]:
# Inference only: no training happens in this notebook.
# model_path is presumably an attached dataset holding the fold models written
# by TrainFer.train — TODO confirm its contents.
tester = TrainFer(cfg.model_params["xgb_bl"], n_splits=5, model_path="../input/orvpxgbbaselinelocalv1/")
# row_id and time_id are dropped before prediction; stock_id stays in as a feature.
preds = tester.infer(test_df.drop(columns=["row_id", "time_id"]))

In [10]:
# Attach the predictions to the test rows and write the submission file.
# Assumes `preds` has one entry per row of `test`, in the same order (i.e. the
# merges that produced test_df neither reordered nor duplicated rows) — TODO confirm.
test["target"] = preds
test[["row_id", "target"]].to_csv("submission.csv", index=False)

### Work In Progress