In [1]:
import sys
import os

import gcsfs
import numpy as np
import pandas as pd
import wandb
from catboost import CatBoostClassifier, Pool
from numpy.testing import assert_almost_equal
from pandas._testing.asserters import assert_almost_equal
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
from tqdm.auto import tqdm

sys.path.append("..")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run configuration: which exchange's trades to process and for which setup.
exchange = "cboe"  # alternative: "ise"
strategy = "transfer"  # alternative: "supervised"

# number of partial files; "ise" has 50, any other exchange falls back to 38
n_parts_by_exchange = {"ise": 50}
max_i = n_parts_by_exchange.get(exchange, 38)

In [3]:
# Open a Weights & Biases run and create the artifact that later cells attach
# the exported files to. The artifact name encodes exchange and strategy.
run = wandb.init(entity="fbv", project="thesis", job_type="dataset-creation")
artifact_name = f"{exchange}_{strategy}_raw"
dataset = wandb.Artifact(name=artifact_name, type="preprocessed_data")


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Export the Google Cloud project id through the environment before any GCS
# access happens (presumably picked up by the Google client libraries — confirm).
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

# NOTE(review): the file system is created with project="thesis" while
# GCLOUD_PROJECT above names a different id — confirm which one is intended.
# NOTE(review): `fs` is not referenced again in the visible cells; the parquet
# reads and writes use gs:// URLs directly.
fs = gcsfs.GCSFileSystem(project="thesis")



In [6]:

# Paths of the partial parquet files for the selected exchange. Only the
# "unsupervised" strategy reads the "unmatched" files; every other strategy
# reads the "matched" ones.
match_prefix = "unmatched" if strategy == "unsupervised" else "matched"
files = [
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/"
    f"{match_prefix}_{exchange}_quotes_min_mem_usage_extended_part_{i:04d}.parquet"
    for i in range(max_i)
]

# asks = [f"ASK_{i}" for i in range(1, 17)]
# bids = [f"BID_{i}" for i in range(1, 17)]

# Columns to read from every part; everything else in the files is skipped.
columns = [
    "QUOTE_DATETIME", "ROOT", "EXPIRATION", "STRK_PRC", "OPTION_TYPE",
    "TRADE_SIZE", "TRADE_PRICE",
    "BEST_BID", "BEST_ASK",
    "ask_ex", "bid_ex", "bid_size_ex", "ask_size_ex",
    "price_all_lead", "price_all_lag",
    "optionid", "day_vol",
    "price_ex_lead", "price_ex_lag",
    "issue_type", "myn",
    # *asks,
    # *bids,
    "buy_sell",
]

# Read the parts one after another (with a progress bar) and stack them into
# a single frame; no list of partial frames is kept around afterwards.
df = pd.concat(pd.read_parquet(part, columns=columns) for part in tqdm(files))


  0%|          | 0/38 [00:00<?, ?it/s][A
  3%|▎         | 1/38 [00:07<04:20,  7.04s/it][A
  5%|▌         | 2/38 [00:14<04:27,  7.43s/it][A
  8%|▊         | 3/38 [00:22<04:17,  7.36s/it][A
 11%|█         | 4/38 [00:30<04:22,  7.71s/it][A
 13%|█▎        | 5/38 [00:37<04:12,  7.64s/it][A
 16%|█▌        | 6/38 [00:45<04:05,  7.67s/it][A
 18%|█▊        | 7/38 [00:53<03:59,  7.71s/it][A
 21%|██        | 8/38 [01:00<03:48,  7.61s/it][A
 24%|██▎       | 9/38 [01:07<03:36,  7.48s/it][A
 26%|██▋       | 10/38 [01:15<03:28,  7.45s/it][A
 29%|██▉       | 11/38 [01:22<03:22,  7.49s/it][A
 32%|███▏      | 12/38 [01:30<03:14,  7.47s/it][A
 34%|███▍      | 13/38 [01:37<03:03,  7.32s/it][A
 37%|███▋      | 14/38 [01:44<02:56,  7.35s/it][A
 39%|███▉      | 15/38 [01:51<02:46,  7.22s/it][A
 42%|████▏     | 16/38 [01:58<02:38,  7.18s/it][A
 45%|████▍     | 17/38 [02:05<02:30,  7.18s/it][A
 47%|████▋     | 18/38 [02:13<02:26,  7.31s/it][A
 50%|█████     | 19/38 [02:20<02:19,  7.35s/it]

In [7]:
# total memory footprint of the frame in bytes; deep=True also inspects the
# contents of object-dtype columns instead of only counting pointers
df.memory_usage(deep=True).sum()


5529034042

In [8]:
# preview the first rows; transposed so that each column becomes one line
df.head().T


Unnamed: 0,0,1,2,3,4
QUOTE_DATETIME,2011-02-08 15:44:23,2011-02-25 12:05:06,2011-01-24 10:20:49,2011-01-24 10:20:49,2011-01-24 09:50:25
ROOT,A,A,A,A,A
EXPIRATION,2011-03-19 00:00:00,2011-03-19 00:00:00,2011-03-19 00:00:00,2011-03-19 00:00:00,2011-03-19 00:00:00
STRK_PRC,36.0,37.0,38.0,40.0,42.0
OPTION_TYPE,C,C,P,P,C
TRADE_SIZE,1,7,10,10,1
TRADE_PRICE,8.22,5.27,0.62,1.07,2.08
BEST_BID,7.95,5.1,0.59,1.06,2.08
BEST_ASK,8.7,5.4,0.62,1.11,2.16
ask_ex,8.7,5.4,0.63,1.11,2.16


In [9]:
# number of rows (trades) that were loaded
len(df)

37155412

In [15]:
# check against some stats from sub panel A.1 in Grauer et al

# no of observations
stats_n = len(df)

# trade size
stats_trade_size = df["TRADE_SIZE"].agg(["mean", "median", "std"])

# time to maturity in whole days (computed for reference; not asserted below)
stats_time_to_maturity = (
    (df["EXPIRATION"] - df["QUOTE_DATETIME"]).dt.days.agg(["mean", "median", "std"])
)

# trade_size = quote size; TRADE_SIZE
stats_trades_with_quote_size_bid = df["bid_size_ex"].eq(df["TRADE_SIZE"])
stats_trades_with_quote_size_ask = df["ask_size_ex"].eq(df["TRADE_SIZE"])

# share of trades whose size equals the ask or the bid quote size
stats_trade_with_quote_size = (
    stats_trades_with_quote_size_bid | stats_trades_with_quote_size_ask
).sum() / stats_n

# share of buys (rows with buy_sell >= 0)
stats_buy_trades = df["buy_sell"].ge(0).sum() / stats_n

# Reference values per exchange. They are only checked for the supervised and
# transfer strategies; any other exchange/strategy skips the checks.
expected_by_exchange = {
    "ise": {
        "n": 49203747,
        "quote_size": 0.2281,
        "trade_size": [13.62, 4.0, 77.75],
        "buys": 0.4746,
    },
    "cboe": {
        "n": 37155412,
        "quote_size": 0.1397,
        "trade_size": [18.14, 5.0, 223.24],
        "buys": 0.4500,
    },
}

if strategy in ("supervised", "transfer") and exchange in expected_by_exchange:
    expected = expected_by_exchange[exchange]
    assert stats_n == expected["n"]
    # `assert_almost_equal` is the pandas variant here (it is imported after
    # the numpy one and shadows it); the numpy variant has no `atol` argument.
    assert_almost_equal(stats_trade_with_quote_size, expected["quote_size"], atol=0.01)
    assert_almost_equal(stats_trade_size.values.tolist(), expected["trade_size"], atol=0.1)
    assert_almost_equal(stats_buy_trades, expected["buys"], atol=0.01)

## train-test-split ⚗️

In [11]:
# order the trades chronologically; rebinding keeps the cell free of in-place mutation
df = df.sort_values(by="QUOTE_DATETIME")

In [12]:
# Boolean masks that select the rows of each split by quote time.
# `between` includes both end points. Combinations of exchange and strategy
# that are not listed below define no mask at all.
quote_time = df["QUOTE_DATETIME"]
setup = (exchange, strategy)

if setup == ("ise", "supervised"):
    train_range = quote_time.between("2005-05-02 00:00:01", "2013-10-24 23:59:00")
    val_range = quote_time.between("2013-10-25 00:00:01", "2015-11-05 23:59:00")
    test_range = quote_time.between("2015-11-06 00:00:01", "2017-05-31 23:59:00")
elif setup == ("cboe", "supervised"):
    train_range = quote_time.between("2011-01-01 00:00:01", "2015-06-15 23:59:00")
    val_range = quote_time.between("2015-06-16 00:00:01", "2016-10-12 23:59:00")
    test_range = quote_time.between("2016-10-13 00:00:01", "2017-10-31 23:59:00")
elif setup == ("cboe", "transfer"):
    # use everything after *ISE* validation set for transfer learning
    test_range = quote_time.between("2015-11-06 00:00:01", "2017-10-31 23:59:00")
    

In [13]:

# Cut the frame into the splits selected by the masks, print each split's
# share of all rows, then write every split to the bucket and reference the
# written file on the W&B artifact.
export_dir = "gs://thesis-bucket-option-trade-classification/data/preprocessed"
n_total = len(df)
splits = {}

if strategy == "supervised":
    train = df[train_range]
    len_train = len(train)
    print(f"train ratio: {len_train / n_total}")

    val = df[val_range]
    len_val = len(val)
    print(f"val ratio: {len_val / n_total}")

    test = df[test_range]
    len_test = len(test)
    print(f"test ratio: {len_test / n_total}")

    # the three splits must cover every row exactly once
    assert len_train + len_val + len_test == n_total

    splits = {"train": train, "val": val, "test": test}

elif strategy == "transfer":
    # transfer learning only exports a test set
    test = df[test_range]
    len_test = len(test)
    print(f"test ratio: {len_test / n_total}")

    splits = {"test": test}

for split_name, split_frame in splits.items():
    output_path = f"{export_dir}/{exchange}_{strategy}_{split_name}.parquet"
    split_frame.to_parquet(output_path)
    dataset.add_reference(output_path, name=f"{split_name}_set")
    

test ratio: 0.344262849245219




In [14]:
# Log the artifact to save it as an output of this run
run.log_artifact(dataset)

# end the W&B run that was started with wandb.init
wandb.finish()


## Relevant length of dataset⏲️

In [2]:
# Load the train and validation sets from the `ise_log_standardized` folder.
ise_dir = "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized"
train = pd.read_parquet(f"{ise_dir}/train_set.parquet", engine="fastparquet")
val = pd.read_parquet(f"{ise_dir}/val_set.parquet", engine="fastparquet")



In [3]:
# Rebind `val` to the `val_set_20` file; any frame previously held in `val`
# is replaced.
val_20_path = "gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set_20.parquet"
val = pd.read_parquet(val_20_path, engine="fastparquet")

In [4]:
# training set: features are every column except the target `buy_sell`
X_train = train.drop(columns="buy_sell")
y_train = train["buy_sell"]

In [5]:
# validation set: features are every column except the target `buy_sell`
X_val = val.drop(columns="buy_sell")
y_val = val["buy_sell"]

In [6]:
# preview the first rows of the training features
X_train.head()

Unnamed: 0,TRADE_PRICE,bid_ask_size_ratio_ex,rel_bid_size_ex,rel_ask_size_ex,depth_ex,prox_ex,prox_best,spread_ex,spread_best,bid_ask_ratio_ex,...,date_month_cos,date_day_sin,date_day_cos,date_weekday_sin,date_weekday_cos,date_time_sin,date_time_cos,bin_option_type,bin_issue_type,bin_root
0,-0.075434,-0.056979,-0.080504,0.050436,0.077184,0.596202,0.545705,-0.022887,-0.000591,0.263251,...,-0.866025,0.394356,0.918958,0.0,1.0,0.608646,-0.793442,0,1,8337
1,0.435948,-0.07849,-0.105558,-0.108601,0.005799,0.027073,2.111057,-0.48766,-0.000591,-3.523268,...,-0.866025,0.394356,0.918958,0.0,1.0,0.608588,-0.793486,0,1,5899
2,1.419895,-0.075206,-0.050857,-0.055588,0.005799,-0.352349,-0.323935,0.209498,0.000184,0.551698,...,-0.866025,0.394356,0.918958,0.0,1.0,0.608588,-0.793486,0,0,5864
3,-1.081623,-0.07849,-0.105558,-0.071615,-0.049509,0.710028,0.650062,0.093306,-0.000203,-3.523268,...,-0.866025,0.394356,0.918958,0.0,1.0,0.608588,-0.793486,0,1,4943
4,-1.03759,-0.050867,-0.104091,-0.096643,1.907473,-1.111186,-2.06321,-0.022887,-0.001365,-1.198212,...,-0.866025,0.394356,0.918958,0.0,1.0,0.608588,-0.793486,0,1,4458


In [7]:
# Learning curves: train CatBoost on the most recent 10 %, 20 %, ..., 100 % of
# the training set, once with uniform and once with exponentially increasing
# sample weights, and record train / validation accuracy for every fit.
results_p = []
percentages = np.linspace(0.1, 1, 10)

# The model settings are the same for every fit, so they are defined once.
kwargs_cat = {
    # "iterations": 1000,
    # "grow_policy": "symmetric",
    # "border_count": 254,
    "logging_level": "Silent",
    "task_type": "GPU",
    "random_seed": 42,
    "eval_metric": "Accuracy",
    # "early_stopping_rounds": 100,
}

# Every model is scored on the same, complete validation set. Slicing the
# validation data with the training length (as was done before) changes the
# evaluation set whenever `length` is shorter than the validation set, which
# makes the accuracies of different training sizes incomparable.
val_pool = Pool(data=X_val, label=y_val)

n_train = len(y_train)
# Exponential weights over the full training set: the oldest observation gets
# 0.001, the newest 1. Each run takes the tail matching its training window, so
# a given observation carries the same weight regardless of the window size.
exponential_weights = np.geomspace(0.001, 1, num=n_train)

for p in tqdm(percentages):
    # Take the last `length` rows; slicing from the end keeps the ordering.
    # NOTE: a `length` of 0 (fewer than 10 training rows) would select the
    # whole set, because `iloc[-0:]` is `iloc[0:]`.
    length = int(n_train * p)
    timestamp = np.linspace(0, 1, length)
    X_recent = X_train.iloc[-length:]
    y_recent = y_train.iloc[-length:]

    # The loop variable is deliberately not called `strategy`: that name holds
    # the notebook-level configuration and must not be overwritten here.
    for weighting in ["uniform", "exponential"]:
        if weighting == "uniform":
            weight = np.ones(length)
        else:
            weight = exponential_weights[-length:]

        train_pool = Pool(
            data=X_recent,
            label=y_recent,
            # cat_features=cat_features,
            weight=weight,
            timestamp=timestamp,
        )

        clf = CatBoostClassifier(**kwargs_cat)
        clf.fit(
            train_pool,
            eval_set=val_pool,
        )

        train_acc = clf.score(train_pool)
        val_acc = clf.score(val_pool)

        # "start"/"end" are positions relative to the end of the training set;
        # the "strategy" key is kept so that the result columns stay unchanged
        res = {
            "start": -length,
            "end": -1,
            "train_acc": train_acc,
            "val_acc": val_acc,
            "strategy": weighting,
        }
        print(res)
        results_p.append(res)


  0%|          | 0/10 [00:00<?, ?it/s]

{'start': -2951032, 'end': -1, 'train_acc': 0.8503154150819103, 'val_acc': 0.7097832215984103, 'strategy': 'uniform'}
{'start': -2951032, 'end': -1, 'train_acc': 0.8502127391366817, 'val_acc': 0.7102562764483746, 'strategy': 'exponential'}
{'start': -5902064, 'end': -1, 'train_acc': 0.8560598461826235, 'val_acc': 0.7301994692026382, 'strategy': 'uniform'}
{'start': -5902064, 'end': -1, 'train_acc': 0.8558826200461398, 'val_acc': 0.7314344608936806, 'strategy': 'exponential'}
{'start': -8853096, 'end': -1, 'train_acc': 0.8605411033609034, 'val_acc': 0.7502536965599379, 'strategy': 'uniform'}
{'start': -8853096, 'end': -1, 'train_acc': 0.8597372037985356, 'val_acc': 0.7516163836922134, 'strategy': 'exponential'}
{'start': -11804128, 'end': -1, 'train_acc': 0.8644420833118719, 'val_acc': 0.7552953151954804, 'strategy': 'uniform'}
{'start': -11804128, 'end': -1, 'train_acc': 0.8623050343066425, 'val_acc': 0.7575980352021201, 'strategy': 'exponential'}
{'start': -14755160, 'end': -1, 'train

In [8]:
# one row per fitted model; the columns are the keys of the result dicts
results_df = pd.DataFrame(results_p)

In [9]:
# display the learning-curve results
results_df

Unnamed: 0,start,end,train_acc,val_acc,strategy
0,-2951032,-1,0.850315,0.709783,uniform
1,-2951032,-1,0.850213,0.710256,exponential
2,-5902064,-1,0.85606,0.730199,uniform
3,-5902064,-1,0.855883,0.731434,exponential
4,-8853096,-1,0.860541,0.750254,uniform
5,-8853096,-1,0.859737,0.751616,exponential
6,-11804128,-1,0.864442,0.755295,uniform
7,-11804128,-1,0.862305,0.757598,exponential
8,-14755160,-1,0.868171,0.753912,uniform
9,-14755160,-1,0.864821,0.757307,exponential


In [10]:
# persist the results next to the notebook (relative path); the row index is
# written as the first, unnamed column
results_df.to_csv("learning_curves_gbm_default_params.csv")

## Time consistency

Check if features maintain their predictive power over time, by training on the first $10~\%$ of the training set and predicting the last $10~\%$ feature by feature. Remove features or further investigate features where accuracy is just above or below $0.5$. Technique found in this [notebook](https://www.kaggle.com/code/cdeotte/xgb-fraud-with-magic-0-9600/notebook).

In [4]:
# training set used for the time-consistency check
train_extended_path = "gs://thesis-bucket-option-trade-classification/data/classical_size_features_log_normalized/train_set_extended_60.parquet"
data = pd.read_parquet(train_extended_path, engine="fastparquet")

In [5]:
# Goal: predict the last 10 % of the training set from a model fitted on the
# first 10 %, one feature at a time. Accuracy should be above 50 %.
# `pop` returns the target column and removes it from `data` in one step.
label = data.pop("buy_sell")

In [6]:
# Chronological split: fit on the first 10 % of the rows, evaluate on the
# last 10 %.
#
# The slice size is computed once with plain floor division. The previous
# `-len(data)//10` negated *before* dividing; floor division of a negative
# number rounds away from zero, so the test slice was one row longer than the
# train slice whenever the length was not a multiple of 10.
n_tenth = len(data) // 10
if n_tenth == 0:
    # `iloc[-0:]` would silently select every row instead of none
    raise ValueError("need at least 10 rows to split off the first and last 10 %")

y_train = label.iloc[:n_tenth]
y_test = label.iloc[-n_tenth:]

X_train = data.iloc[:n_tenth, :]
X_test = data.iloc[-n_tenth:, :]

# drop the references to the full frame and target; only the slices are used below
del label, data

In [7]:
# number of observations in the training slice
y_train.shape

(2951032,)

In [8]:
# CatBoost settings shared by all single-feature models: log loss objective,
# accuracy as evaluation metric, at most 1000 iterations with iteration-based
# overfitting detection / early stopping after 100 rounds, fixed seed, GPU.
params = dict(
    od_type="Iter",
    logging_level="Silent",
    loss_function="Logloss",
    task_type="GPU",
    cat_features=None,
    random_seed=42,
    eval_metric="Accuracy",
    iterations=1000,
    early_stopping_rounds=100,
)

In [13]:
# feature names to iterate over; note that this rebinds the name `columns`,
# which an earlier cell uses for the list of raw columns to read
columns = X_train.columns

In [21]:
# Fit one model per feature and record its accuracy on the test slice.
results = []
for col in tqdm(columns):
    # double brackets keep a one-column DataFrame, so the model sees exactly one feature
    X_train_col, X_test_col = X_train[[col]], X_test[[col]]

    model = CatBoostClassifier(**params)
    # the test slice is also passed as eval set
    model.fit(X_train_col, y_train, eval_set=(X_test_col, y_test))

    results.append([col, model.score(X_test_col, y_test)])

  0%|          | 0/24 [00:00<?, ?it/s]

In [24]:
# table of per-feature accuracies, shown with the weakest features first
accuracy_columns = ["feature", "accuracy"]
results_df = pd.DataFrame(data=results, columns=accuracy_columns)
results_df.sort_values("accuracy", ascending=True)

Unnamed: 0,feature,accuracy
10,chg_ex_lag,0.518701
21,TRADE_SIZE,0.522728
9,chg_ex_lead,0.527728
12,chg_all_lag,0.547701
8,bid_ask_ratio_ex,0.548045
3,rel_ask_size_ex,0.550789
23,ask_size_ex,0.55215
19,price_ex_lag,0.552589
11,chg_all_lead,0.554612
0,TRADE_PRICE,0.554639


Few features are actually consistent over time and more informative than a random guess. These include features related to the proximity of the quote and the relative bid size.

Some features like `chg_ex_lead` are hard to exclude. It is better to weight observations instead, as suggested in `3.0c-feature-engineering.ipynb`.