In [24]:
import os
import sys

from pathlib import Path

import numpy as np
import pandas as pd
import wandb

sys.path.append("..")
from otc.metrics.metrics import effective_spread


In [2]:
# project id exposed through the GCLOUD_PROJECT environment variable
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"
run = wandb.init(project="thesis", entity="fbv")

# W&B artifact reference and the file to read from each artifact
dataset, fname_dataset = "fbv/thesis/train_val_test_ultra:v0", "test_set_extended_20"
results, fname_results = "fbv/thesis/results_classical_clf:v0", "results_classical_clf_ise"

# unscaled data first, classifier results second;
# `artifact` stays bound to the results artifact afterwards
data_dir = run.use_artifact(dataset).download()  # type: ignore
artifact = run.use_artifact(results)  # type: ignore
results_dir = artifact.download()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact train_val_test_ultra:v0, 3391.53MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.0
[34m[1mwandb[0m: Downloading large artifact results_classical_clf:v0, 63.63MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.0


In [33]:
# Load the evaluation set and the classifier results and merge them column-wise.
# Only the columns needed for the midpoint, the quoted spread and `buy_sell`
# (presumably the true trade direction -- TODO confirm) are read from the data set.
columns = ["TRADE_PRICE", "bid_ex", "ask_ex", "buy_sell"]

eval_data = pd.read_parquet(
    Path(data_dir, fname_dataset), engine="fastparquet", columns=columns
)
results_data = pd.read_parquet(
    Path(results_dir, fname_results), engine="fastparquet"
)

# `pd.concat(..., axis=1)` aligns rows on the index (outer join), so equal length
# alone is not a sufficient guard: two frames of the same length but with
# different indices would silently be merged into NaN-padded extra rows.
assert len(eval_data) == len(results_data), "data set and results differ in length"
assert eval_data.index.equals(
    results_data.index
), "data set and results are not indexed identically"

# every column of the results frame plus `buy_sell` is evaluated further below
results_col = [*results_data.columns.tolist(), "buy_sell"]

composed_data = pd.concat([eval_data, results_data], axis=1)
# midpoint and width of the exchange quote (`bid_ex` / `ask_ex`)
composed_data["mid"] = (composed_data["bid_ex"] + composed_data["ask_ex"]) / 2
composed_data["spread"] = composed_data["ask_ex"] - composed_data["bid_ex"]

# the inputs are no longer needed once merged; drop the references
del eval_data, results_data


In [34]:
# preview the first rows of the merged frame, including the derived `mid` and `spread` columns
composed_data.head()

Unnamed: 0,TRADE_PRICE,bid_ex,ask_ex,buy_sell,tick_all,tick_ex,quote_best,quote_ex,lr_ex,lr_best,...,clnv_best,rev_clnv_ex,rev_clnv_best,trade_size_ex->tick_all,trade_size_ex->quote_best,trade_size_ex->quote_best->quote_ex,quote_best->quote_ex,trade_size_ex->depth_ex->quote_best->rev_lr_ex,mid,spread
39342171,0.52,0.52,0.6,-1,1.0,1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,0.56,0.08
39342172,7.82,7.6,8.15,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,7.875,0.55
39342173,28.889999,28.799999,32.049999,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,30.424999,3.25
39342174,2.25,1.85,2.25,1,1.0,-1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.05,0.4
39342175,1.7,1.7,1.95,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.825,0.25


In [35]:
# One row per evaluated column: [name, nominal effective spread, relative effective spread].
# `results_col` holds the prediction columns and `buy_sell`, so the actual and the
# predicted spreads are computed by the same call, first in nominal then in relative mode.
results = [
    [
        col,
        *(
            effective_spread(
                composed_data[col],
                composed_data["TRADE_PRICE"],
                composed_data["mid"],
                mode=mode,  # type: ignore
            )
            for mode in ("nominal", "relative")
        ),
    ]
    for col in results_col
]

# quoted spread as reference: NaN-ignoring mean of the spread itself (nominal)
# and of the spread divided by the midpoint (relative)
results.append(
    [
        "quoted_spread",
        np.nanmean(composed_data["spread"]),
        np.nanmean(composed_data["spread"] / composed_data["mid"]),
    ]
)

In [36]:
# tabulate the collected rows and order them by nominal spread, smallest first
results_df = pd.DataFrame(
    results,
    columns=["approach", "avg_nominal_spread", "avg_relative_spread"],
).sort_values(by="avg_nominal_spread")
results_df

Unnamed: 0,approach,avg_nominal_spread,avg_relative_spread
16,trade_size_ex->tick_all,-0.00619,-0.002024
21,buy_sell,0.004965,0.037225
17,trade_size_ex->quote_best,0.010034,0.039476
20,trade_size_ex->depth_ex->quote_best->rev_lr_ex,0.013795,0.042804
18,trade_size_ex->quote_best->quote_ex,0.013795,0.042804
1,tick_ex,0.015624,0.010794
0,tick_all,0.021816,0.024974
11,rev_emo_best,0.038908,0.072111
10,rev_emo_ex,0.04167,0.078861
9,emo_best,0.043213,0.075759


In [37]:
# write the summary table to a CSV file (relative path, resolved against the working directory)
results_df.to_csv("effective_spread_classical.csv")

In [40]:
# export the same table as LaTeX through the Styler; `siunitx=True` requests siunitx-compatible output
results_df.style.to_latex("effective_spread_classical.tex", siunitx=True)