In [None]:
import sys

import gcsfs
import google.auth
import numpy as np
import pandas as pd
import wandb
from catboost import CatBoostClassifier, Pool
from google.colab import auth
from numpy.testing import assert_almost_equal
from pandas._testing.asserters import assert_almost_equal
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
from tqdm.notebook import tqdm

# Add the parent directory to the module search path so that modules located
# one level above this notebook can be imported.
sys.path.append("..")



In [None]:
# Open a Weights & Biases run that tracks the creation of this dataset.
run = wandb.init(
    entity="fbv",
    project="thesis",
    job_type="dataset-creation",
)

# Artifact to which the cells below attach references to the written files.
dataset = wandb.Artifact(type="preprocessed_data", name="train_val_test_extended")


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Authenticate the Colab user against Google Cloud, then hand the resulting
# default credentials to gcsfs for access to the storage bucket.
auth.authenticate_user()
credentials, _ = google.auth.default()
fs = gcsfs.GCSFileSystem(token=credentials, project="thesis")


In [None]:
# Find every preprocessed part file in the bucket and turn each match into a
# fully qualified gs:// URI.
files = [
    f"gs://{match}"
    for match in fs.glob(
        "thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_min_mem_usage_extended_part_*.parquet",
        recursive=True,
    )
]

# Only these columns are read from each part file.
columns = [
    # contract identification and trade
    "QUOTE_DATETIME",
    "ROOT",
    "EXPIRATION",
    "STRK_PRC",
    "OPTION_TYPE",
    "TRADE_SIZE",
    "TRADE_PRICE",
    # quotes
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    # neighbouring prices and remaining attributes
    "price_all_lead",
    "price_all_lag",
    "optionid",
    "day_vol",
    "price_ex_lead",
    "price_ex_lag",
    "issue_type",
    "myn",
    "buy_sell",
]

# Read the parts one by one (with a progress bar) and stack them into one frame.
dfs = []
for part_uri in tqdm(files):
    dfs.append(pd.read_parquet(part_uri, columns=columns))
df = pd.concat(dfs)

# The individual parts are no longer needed once concatenated.
del dfs

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Total memory footprint of the combined frame in bytes, summed over all
# columns and the index (deep=True requests a deep inspection of object data).
df.memory_usage(deep=True).sum()


10042066301

In [None]:
# Preview the first rows, transposed so that the columns are listed vertically.
df.head().T


Unnamed: 0,0,1,2,3,4
QUOTE_DATETIME,2005-05-02 09:30:02,2005-05-02 09:30:03,2005-05-02 09:30:03,2005-05-02 09:30:03,2005-05-02 09:30:03
ROOT,YNU,SYQ,SWG,QAX,ORQ
EXPIRATION,2006-01-21 00:00:00,2005-06-18 00:00:00,2005-05-21 00:00:00,2005-06-18 00:00:00,2005-12-17 00:00:00
STRK_PRC,2.5,15.0,105.0,25.0,14.0
OPTION_TYPE,C,C,C,C,C
TRADE_SIZE,10,10,50,10,15
TRADE_PRICE,2.05,3.9,11.2,0.2,0.25
BEST_BID,1.9,3.6,11.1,0.0,0.3
BEST_ASK,2.1,3.8,11.4,0.25,0.4
ask_ex,2.1,,11.4,0.25,0.45


In [None]:
# check against some stats from sub panel A.1 in Grauer et al
#
# Numeric comparisons use numpy's public `assert_allclose` with a purely
# absolute tolerance (rtol=0), so the allowed deviation is exactly `atol`.

# trade size
stats_trade_size = df["TRADE_SIZE"].agg(["mean", "median", "std"])

np.testing.assert_allclose(
    stats_trade_size.to_numpy(dtype=float),
    [13.62, 4.0, 77.75],
    rtol=0,
    atol=0.1,
)

# moneyness; price underlying / strike
# TODO: Request price for underlyings?

# time to maturity in whole days (computed for inspection only, not asserted)
stats_time_to_maturity = (df["EXPIRATION"] - df["QUOTE_DATETIME"]).dt.days
stats_time_to_maturity = stats_time_to_maturity.agg(["mean", "median", "std"])

# no of observations
stats_n = len(df)
assert stats_n == 49203747

# trade_size = quote size; TRADE_SIZE
stats_trades_with_quote_size_bid = df["bid_size_ex"].eq(df["TRADE_SIZE"])
stats_trades_with_quote_size_ask = df["ask_size_ex"].eq(df["TRADE_SIZE"])

# either ask or bid must be equal, but not both (XOR)
# TODO: mismatch Grauer et. al report 22.28 % -> 0.10956509064238543
stats_trade_with_quote_size = (
    stats_trades_with_quote_size_bid ^ stats_trades_with_quote_size_ask
).sum() / stats_n

# no of buys: share of rows whose `buy_sell` value is >= 0
stats_buy_trades = df["buy_sell"].ge(0).sum() / stats_n
np.testing.assert_allclose(stats_buy_trades, 0.4746, rtol=0, atol=0.01)

# underlyings per day
# stats_underlyings_per_day = df.groupby(['UNDERLYING_SYMBOL','QUOTE_DATETIME']).count().agg(['mean','median','std'])


## create subsample 🔢

In [None]:
# Calendar year to extract into its own parquet file.
year = 2017

# Destination of the single-year extract inside the project bucket.
output_path = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_extended_{year}.parquet"

# Keep only the trades whose quote timestamp falls into the selected year.
df_sub = df.loc[df["QUOTE_DATETIME"].dt.year.eq(year)]
df_sub.to_parquet(output_path)

# Register the written file as a reference in the W&B artifact.
dataset.add_reference(output_path, name=f"data_preprocessed_extended_{year}")


[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_extended_2017.parquet/data_preprocessed_extended_2017>]

In [None]:
# Calendar year to extract into its own parquet file.
year = 2015

# Destination of the single-year extract inside the project bucket.
output_path = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_extended_{year}.parquet"

# Keep only the trades whose quote timestamp falls into the selected year.
df_sub = df.loc[df["QUOTE_DATETIME"].dt.year.eq(year)]
df_sub.to_parquet(output_path)

# Register the written file as a reference in the W&B artifact.
dataset.add_reference(output_path, name=f"data_preprocessed_extended_{year}")


[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_extended_2015.parquet/data_preprocessed_extended_2015>]

In [None]:
# Drop the reference to the yearly subsample; no later cell reads it.
del df_sub

## train-test-split ⚗️

In [None]:
# Training split: every trade whose quote timestamp lies within the window
# below (`between` includes both bounds by default).
train = df.loc[
    df["QUOTE_DATETIME"].between(
        left="2005-05-02 00:00:01",
        right="2013-10-24 23:59:00",
    )
]

# Row count is kept for the consistency check against the other splits.
len_train = len(train)
print(f"train ratio: {len_train / len(df)}")


train ratio: 0.5997575753732739


In [None]:
# Validation split: every trade whose quote timestamp lies within the window
# below (`between` includes both bounds by default).
val = df[df.QUOTE_DATETIME.between("2013-10-25 00:00:01", "2015-11-05 23:59:00")]

# Row count is kept for the consistency check against the other splits.
len_val = len(val)
# Share of all observations that ended up in the validation split.
print(f"val ratio: {len_val / len(df)}")


train ratio: 0.1998191519845023


In [None]:
# Test split: every trade whose quote timestamp lies within the window below
# (`between` includes both bounds by default).
test = df[df.QUOTE_DATETIME.between("2015-11-06 00:00:01", "2017-05-31 23:59:00")]

# Row count is kept for the consistency check against the other splits.
len_test = len(test)
# Share of all observations that ended up in the test split.
print(f"test ratio: {len_test / len(df)}")


train ratio: 0.20042327264222376


In [None]:
# The three date windows must partition the data: no row may be dropped by a
# gap between the windows or counted in more than one split.
assert len(df) == sum((len_train, len_val, len_test))


In [None]:
# Display the training split for a visual check of its first and last rows.
train


Unnamed: 0,QUOTE_DATETIME,ROOT,EXPIRATION,STRK_PRC,OPTION_TYPE,TRADE_SIZE,TRADE_PRICE,BEST_BID,BEST_ASK,ask_ex,...,ask_size_ex,price_all_lead,price_all_lag,optionid,day_vol,price_ex_lead,price_ex_lag,issue_type,myn,buy_sell
0,2005-05-02 09:30:02,YNU,2006-01-21,2.5,C,10,2.05,1.90,2.10,2.10,...,20.0,1.90,1.90,21060388.0,10.0,1.90,2.10,0,1.742000,1
1,2005-05-02 09:30:03,SYQ,2005-06-18,15.0,C,10,3.90,3.60,3.80,,...,,4.00,4.00,31624184.0,10.0,4.60,4.00,0,1.235000,1
2,2005-05-02 09:30:03,SWG,2005-05-21,105.0,C,50,11.20,11.10,11.40,11.40,...,300.0,11.80,11.00,31620976.0,50.0,11.90,11.00,%,1.105381,-1
3,2005-05-02 09:30:03,QAX,2005-06-18,25.0,C,10,0.20,0.00,0.25,0.25,...,86.0,0.15,0.15,31560072.0,10.0,0.15,0.15,0,0.799000,1
4,2005-05-02 09:30:03,ORQ,2005-12-17,14.0,C,15,0.25,0.30,0.40,0.45,...,399.0,0.35,0.35,25240212.0,17.0,0.35,0.35,0,0.826429,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29510315,2013-10-24 16:13:41,IWM,2013-11-08,106.0,P,43,0.15,0.15,0.17,0.19,...,26.0,0.16,0.16,101818080.0,1197.0,0.16,0.16,%,0.953452,-1
29510316,2013-10-24 16:13:52,XLE,2014-01-18,77.0,P,1,0.60,0.56,0.60,0.60,...,20.0,0.42,0.72,100555800.0,1.0,0.47,0.72,%,0.893479,1
29510317,2013-10-24 16:13:52,XLE,2014-01-18,71.0,P,1,0.23,0.21,0.26,0.26,...,528.0,0.19,0.30,80840608.0,1.0,0.20,0.30,%,0.823857,-1
29510318,2013-10-24 16:14:33,IWM,2013-11-08,106.0,P,31,0.16,0.16,0.19,0.20,...,21.0,0.15,0.15,101818080.0,1197.0,0.15,0.15,%,0.953580,-1


In [None]:
# Display the validation split for a visual check of its first and last rows.
val


Unnamed: 0,QUOTE_DATETIME,ROOT,EXPIRATION,STRK_PRC,OPTION_TYPE,TRADE_SIZE,TRADE_PRICE,BEST_BID,BEST_ASK,ask_ex,...,ask_size_ex,price_all_lead,price_all_lag,optionid,day_vol,price_ex_lead,price_ex_lag,issue_type,myn,buy_sell
29510320,2013-10-25 09:30:00,IWM,2014-03-31,97.0,P,20,1.47,1.38,1.62,,...,,1.60,1.62,100723144.0,20.0,1.12,2.73,%,0.871793,-1
29510321,2013-10-25 09:30:00,IWM,2014-09-30,105.0,P,20,6.27,5.85,6.31,6.31,...,11.0,6.32,7.69,101786576.0,20.0,5.92,10.29,%,0.943693,1
29510322,2013-10-25 09:30:00,UNG,2013-12-21,18.0,C,2,1.32,1.19,1.44,1.44,...,82.0,1.30,1.25,101943840.0,2.0,1.02,1.19,%,1.040278,1
29510323,2013-10-25 09:30:00,VXX,2013-10-25,14.5,P,20,1.66,1.62,1.70,1.70,...,172.0,1.62,1.60,101658624.0,82.0,1.62,1.60,%,1.127966,1
29510324,2013-10-25 09:30:00,GRPN,2013-10-25,10.5,P,1,0.85,0.00,0.00,,...,,0.50,0.86,101849752.0,4.0,0.65,0.86,0,1.080803,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39342166,2015-11-05 16:14:37,UVXY,2015-11-06,27.0,C,1,0.88,0.86,0.96,0.96,...,21.0,0.70,0.90,109295920.0,4.0,0.46,1.88,%,0.985926,-1
39342167,2015-11-05 16:14:40,QQQ,2015-12-18,109.0,C,2,6.59,6.51,6.68,6.68,...,1077.0,6.58,6.42,105521016.0,18.0,6.64,6.42,%,1.051789,-1
39342168,2015-11-05 16:14:53,SPY,2015-12-24,220.0,P,1,11.42,11.08,11.60,11.60,...,10.0,12.63,,109720304.0,2.0,11.41,11.41,%,1.047120,1
39342169,2015-11-05 16:14:53,SPY,2015-12-24,220.0,P,1,11.41,11.08,11.60,11.60,...,10.0,11.42,11.42,109720304.0,2.0,12.63,,%,1.047120,1


In [None]:
# Display the test split for a visual check of its first and last rows.
test


Unnamed: 0,QUOTE_DATETIME,ROOT,EXPIRATION,STRK_PRC,OPTION_TYPE,TRADE_SIZE,TRADE_PRICE,BEST_BID,BEST_ASK,ask_ex,...,ask_size_ex,price_all_lead,price_all_lag,optionid,day_vol,price_ex_lead,price_ex_lag,issue_type,myn,buy_sell
39342171,2015-11-06 09:30:00,EWZ,2015-12-18,22.0,P,2,0.520000,0.520000,0.600000,0.600000,...,11.0,0.57,0.460000,107453656.0,3.0,0.59,0.45,%,0.921659,-1
39342172,2015-11-06 09:30:00,TSLA,2015-11-27,230.0,C,1,7.820000,7.600000,8.150000,8.150000,...,1.0,8.16,8.500000,109398624.0,1.0,4.97,8.37,0,1.001696,-1
39342173,2015-11-06 09:30:00,TSLA,2017-01-20,260.0,C,1,28.889999,28.799999,32.049999,32.049999,...,1.0,30.23,30.799999,105940216.0,3.0,29.90,29.00,0,0.886115,-1
39342174,2015-11-06 09:30:00,VB,2015-11-20,115.0,C,1,2.250000,1.850000,2.150000,2.250000,...,10.0,0.64,1.480000,109232832.0,1.0,,,%,1.009261,1
39342175,2015-11-06 09:30:00,VB,2015-12-18,117.0,C,1,1.700000,1.700000,1.950000,1.950000,...,10.0,0.65,2.200000,107538000.0,1.0,,,%,0.991880,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49203742,2017-05-31 16:12:19,SVXY,2017-06-02,152.0,C,1,3.540000,2.720000,4.350000,4.350000,...,10.0,4.75,3.000000,115528016.0,1.0,5.24,3.85,%,1.015066,-1
49203743,2017-05-31 16:12:45,SPY,2017-12-15,236.0,P,6,7.270000,7.220000,7.290000,7.290000,...,1250.0,7.20,7.610000,113308776.0,6.0,5.37,9.01,%,0.976255,1
49203744,2017-05-31 16:13:39,NDX,2017-06-02,5690.0,P,12,1.000000,0.400000,1.600000,2.400000,...,13.0,0.47,1.400000,115919712.0,83.0,0.47,1.40,A,0.980586,-1
49203745,2017-05-31 16:14:02,DIA,2017-06-02,212.0,C,2,0.050000,0.030000,0.070000,0.070000,...,22.0,0.03,0.010000,115563008.0,2.0,0.06,0.14,%,0.991769,1


In [None]:
# Write the training split to the bucket and register the file as a reference
# in the W&B artifact. The path is a plain string: it has no placeholders.
output_path = "gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_extended_60.parquet"
train.to_parquet(output_path)
dataset.add_reference(output_path, name="train_set_extended_60")


[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_extended_60.parquet/train_set_extended_60>]

In [None]:
# Write the validation split to the bucket and register the file as a
# reference in the W&B artifact. The path is a plain string: it has no
# placeholders.
output_path = "gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_extended_20.parquet"
val.to_parquet(output_path)
dataset.add_reference(output_path, name="val_set_extended_20")


[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_extended_20.parquet/val_set_extended_20>]

In [None]:
# Write the test split to the bucket and register the file as a reference in
# the W&B artifact. The path is a plain string: it has no placeholders.
output_path = "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_extended_20.parquet"
test.to_parquet(output_path)
dataset.add_reference(output_path, name="test_set_extended_20")


[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_extended_20.parquet/test_set_extended_20>]

In [None]:
# Log the artifact to save it as an output of this run. At this point it holds
# the references added by the cells above (yearly extracts and the three splits).
run.log_artifact(dataset)

# Close the W&B run once the artifact has been handed over.
wandb.finish()


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

## Time consistency

Check if features maintain their predictive power over time, by training on the first $10~\%$ of the training set and predicting the last $10~\%$, feature by feature. Remove or further investigate features whose accuracy is just above or below $0.5$. Technique found in this [notebook](https://www.kaggle.com/code/cdeotte/xgb-fraud-with-magic-0-9600/notebook).

In [4]:
# Load the feature-engineered training set (log-normalized classical and size
# features, going by the folder name) from the bucket.
data = pd.read_parquet(
    path="gs://thesis-bucket-option-trade-classification/data/classical_size_features_log_normalized/train_set_extended_60.parquet",
    engine="fastparquet",
)

In [5]:
# Goal: predict the last 10 % of the training set with a model fitted on the
# first 10 % of its observations. Accuracy should be above 50 %.
# `pop` returns the target column and removes it from the feature frame in place.
label = data.pop("buy_sell")

In [6]:
# Number of rows in one tenth of the training data (rounded down).
n_tenth = len(data) // 10

# Positional offset at which the trailing tenth starts. An explicit
# non-negative offset is used instead of `-len(data)//10`, because that
# expression parses as `(-len(data)) // 10` and floors away from zero, which
# makes the tail one row longer than the head whenever the length is not a
# multiple of ten.
tail_start = len(data) - n_tenth

# Head of the data: used to fit the single-feature models.
y_train = label.iloc[:n_tenth]
X_train = data.iloc[:n_tenth, :]

# Tail of the data: used to score the single-feature models.
y_test = label.iloc[tail_start:]
X_test = data.iloc[tail_start:, :]

# The full frame and target are not needed once the slices exist.
del label, data

In [7]:
# Number of target values in the fitting slice.
y_train.shape

(2951032,)

In [8]:
# Keyword arguments shared by every single-feature CatBoost model.
params = {
    # overfitting detector type and console verbosity
    "od_type": "Iter",
    "logging_level": "Silent",
    # objective and compute device
    "loss_function": "Logloss",
    "task_type": "GPU",
    # no column is declared categorical
    "cat_features": None,
    # fixed seed for repeatable runs
    "random_seed": 42,
    # evaluation metric, iteration budget and early-stopping patience
    "eval_metric": "Accuracy",
    "iterations": 1000,
    "early_stopping_rounds": 100,
}

In [13]:
# Feature names to iterate over; one model is fitted per column.
columns = X_train.columns

In [21]:
# Collects one [feature name, accuracy] pair per column.
results = []
for col in tqdm(columns):
    # Restrict both slices to the single feature under test.
    train_feature = X_train[[col]]
    test_feature = X_test[[col]]

    # Fit on the early slice; the late slice serves as the evaluation set and
    # is also the data the accuracy is measured on.
    model = CatBoostClassifier(**params)
    model.fit(train_feature, y_train, eval_set=(test_feature, y_test))

    acc = model.score(test_feature, y_test)
    results.append([col, acc])

  0%|          | 0/24 [00:00<?, ?it/s]

In [24]:
# Tabulate the per-feature scores and show them from weakest to strongest.
results_df = pd.DataFrame(data=results, columns=["feature", "accuracy"])
results_df.sort_values("accuracy", ascending=True)

Unnamed: 0,feature,accuracy
10,chg_ex_lag,0.518701
21,TRADE_SIZE,0.522728
9,chg_ex_lead,0.527728
12,chg_all_lag,0.547701
8,bid_ask_ratio_ex,0.548045
3,rel_ask_size_ex,0.550789
23,ask_size_ex,0.55215
19,price_ex_lag,0.552589
11,chg_all_lead,0.554612
0,TRADE_PRICE,0.554639


Few features are actually consistent over time and more informative than a random guess. These include features related to the proximity of the quote and the relative bid size.

Some features like `chg_ex_lead` are hard to exclude. Better weight observations, as suggested in `3.0c-feature-engineering.ipynb`.