In [1]:
import sys
import os

import gcsfs
import numpy as np
import pandas as pd
import wandb
from catboost import CatBoostClassifier, Pool
from numpy.testing import assert_almost_equal
from pandas._testing.asserters import assert_almost_equal
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
# from tqdm.notebook import tqdm

sys.path.append("..")

In [2]:
exchange = "ise"
strategy =  "unsupervised"
max_i = 30 # number of partial files

In [76]:
# connect to weights and biases
run = wandb.init(project="thesis", job_type="dataset-creation", entity="fbv")
dataset = wandb.Artifact(name=f"{exchange}_{strategy}_raw", type="preprocessed_data")


In [4]:
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

fs = gcsfs.GCSFileSystem(project="thesis")



In [37]:

files = [f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{'matched' if strategy == 'supervised' else 'unmatched'}_{exchange}_quotes_min_mem_usage_extended_part_{i:04d}.parquet" for i in range(0, max_i)]

# asks = [f"ASK_{i}" for i in range(1, 17)]
# bids = [f"BID_{i}" for i in range(1, 17)]

columns = [
    "QUOTE_DATETIME",
    "ROOT",
    "EXPIRATION",
    "STRK_PRC",
    "OPTION_TYPE",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    "price_all_lead",
    "price_all_lag",
    "optionid",
    # "day_vol",
    "price_ex_lead",
    "price_ex_lag",
    "issue_type",
    "myn",
    # *asks,
    # *bids,
    #"buy_sell",
]

dfs = [pd.read_parquet(gc_file, columns=columns) for gc_file in files]
df = pd.concat(dfs)

del dfs

In [38]:
df.memory_usage(deep=True).sum()


4246414955

In [39]:
df.sort_values(by="QUOTE_DATETIME", inplace=True)

In [40]:
# There are a few duplicates in the unlabelled dataset, removed see p. 7

subset = ['QUOTE_DATETIME', 'ROOT',
         'OPTION_TYPE', 'EXPIRATION', 'STRK_PRC', 
       'TRADE_SIZE', 'TRADE_PRICE', 'BEST_BID', 'BEST_ASK', 'ask_ex', 'bid_ex',
       'bid_size_ex', 'ask_size_ex', 
       # FIXME is different for loballed and unlabelled trades
       # 'optionid','issue_type', 'myn', 
       # 'price_all_lead', 'price_all_lag',
       # 'price_ex_lead', 'price_ex_lag', 
       ]

# df.drop_duplicates(keep='first', subset=subset, inplace=True)

In [41]:
# FIXME: replace with generic name
columns = ["buy_sell", *df.columns.tolist()]
labelled_df = pd.read_parquet("gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_ultra_60.parquet", columns=columns)

In [42]:
# slice labelled dataframe to range of unlabelled
date_range = labelled_df.QUOTE_DATETIME.between(df.QUOTE_DATETIME.min(), df.QUOTE_DATETIME.max())
labelled_df = labelled_df[date_range]

In [43]:
# labelled_df.drop_duplicates(keep='first', subset=subset, inplace=True)

In [55]:
labelled_df["duplicated"] = labelled_df.duplicated(subset=subset)

In [85]:
labelled_df[(labelled_df["QUOTE_DATETIME"]=="2012-10-24 09:30:14")  & (labelled_df["optionid"]==83414152.0)].head(20).T

Unnamed: 0,25414881,25414882,25414885
buy_sell,-1,-1,-1
QUOTE_DATETIME,2012-10-24 09:30:14,2012-10-24 09:30:14,2012-10-24 09:30:14
ROOT,CVX,CVX,CVX
EXPIRATION,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00
STRK_PRC,100.0,100.0,100.0
OPTION_TYPE,P,P,P
TRADE_SIZE,5,5,40
TRADE_PRICE,4.35,4.35,4.32
BEST_BID,4.35,4.35,4.35
BEST_ASK,4.5,4.5,4.5


In [44]:
len_labelled_df = len(labelled_df)
len_df = len(df)

In [45]:
# merge with indicator and find out which ones can be labelled
df_w_indicator = pd.merge(df, labelled_df, on=['QUOTE_DATETIME', 'ROOT', 
                                            'OPTION_TYPE', 
                                            'EXPIRATION', 'STRK_PRC',
       'TRADE_SIZE', 'TRADE_PRICE', 'BEST_BID', 'BEST_ASK', 'ask_ex', 'bid_ex', 'bid_size_ex', 'ask_size_ex',
       # myn seems to be different
       #'issue_type', 'optionid',
       # 'price_all_lead', 'price_all_lag', # FIXME is different for loballed and unlabelled trades
       # 'price_ex_lead', 'price_ex_lag', # FIXME is different for loballed and unlabelled trades 
       ], how="left", indicator='exists', suffixes=("_unlabelled", "_labelled"), sort=True)

In [47]:
df_w_indicator["exists"].value_counts()

left_only     25746535
both           4191137
right_only           0
Name: exists, dtype: int64

In [48]:
len(df_w_indicator)

29937672

In [49]:
len(df)

29842162

In [50]:
len(labelled_df)

4095627

In [51]:
df_w_indicator.sort_index(axis=1, inplace=True)

In [52]:
df_w_indicator[df_w_indicator["exists"]=="both"].head(20).T

Unnamed: 0,0,21,24,61,63,65,66,83,127,145,146,150,151,152,153,159,163,164,184,185
BEST_ASK,0.75,5.0,7.8,0.0,0.0,0.3,0.65,0.25,0.0,14.8,0.35,8.6,5.7,0.55,0.0,0.02,0.0,0.0,1.7,4.65
BEST_BID,0.5,4.6,7.5,0.0,0.0,0.15,0.5,0.0,0.0,10.2,0.25,8.1,5.2,0.3,0.0,0.01,0.0,0.0,1.6,4.5
EXPIRATION,2013-01-19 00:00:00,2012-12-22 00:00:00,2012-10-26 00:00:00,2012-12-22 00:00:00,2013-01-19 00:00:00,2013-03-16 00:00:00,2013-03-16 00:00:00,2012-12-22 00:00:00,2014-01-18 00:00:00,2014-01-18 00:00:00,2012-11-17 00:00:00,2012-11-17 00:00:00,2013-01-19 00:00:00,2013-01-19 00:00:00,2013-01-19 00:00:00,2012-10-26 00:00:00,2013-01-19 00:00:00,2013-01-19 00:00:00,2013-04-20 00:00:00,2013-04-20 00:00:00
OPTION_TYPE,C,C,C,P,P,P,P,C,C,P,P,C,C,C,C,P,P,C,P,P
QUOTE_DATETIME,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:01,2012-10-24 09:30:01
ROOT,ALXN,ERX,FB,FB,FB,FB,FB,IRBT,NFLX,NFLX,OIL,QCOR,QCOR,QCOR,QCOR,QQQ,UCO,XLK,EBAY,EBAY
STRK_PRC,125.0,50.0,16.5,17.0,16.0,14.0,17.0,25.0,27.5,55.0,20.0,19.0,24.0,46.0,47.0,60.0,34.0,33.0,42.0,50.0
TRADE_PRICE,0.75,4.6,7.62,0.25,0.25,0.3,0.65,0.25,33.700001,11.38,0.35,8.1,5.2,0.3,0.25,0.01,7.9,0.05,1.62,4.61
TRADE_SIZE,4,1,20,1,10,1,10,4,1,1,50,1,10,1,2,4,2,1,2,2
ask_ex,0.75,5.0,7.8,,,0.3,0.65,0.25,,14.8,0.35,8.6,5.7,0.55,,0.02,,,1.72,4.65


In [66]:
# add fields
df_w_indicator["buy_sell"] = 0

In [67]:
df_w_indicator["day_vol"] = df_w_indicator.groupby([df_w_indicator['QUOTE_DATETIME'].dt.date, df_w_indicator['optionid_unlabelled']])['TRADE_SIZE'].transform('sum')

In [68]:
df_w_indicator

Unnamed: 0,BEST_ASK,BEST_BID,EXPIRATION,OPTION_TYPE,QUOTE_DATETIME,ROOT,STRK_PRC,TRADE_PRICE,TRADE_SIZE,ask_ex,...,optionid_unlabelled,price_all_lag_labelled,price_all_lag_unlabelled,price_all_lead_labelled,price_all_lead_unlabelled,price_ex_lag_labelled,price_ex_lag_unlabelled,price_ex_lead_labelled,price_ex_lead_unlabelled,day_vol
0,0.75,0.50,2013-01-19,C,2012-10-24 09:30:00,ALXN,125.0,0.75,4,0.75,...,81791240.0,1.0,1.00,0.63,0.63,1.0,,0.3,0.30,31.0
1,0.00,0.00,2013-02-16,C,2012-10-24 09:30:00,ALXN,125.0,1.55,10,,...,82265992.0,,6.60,,0.50,,,,0.16,10.0
2,0.00,0.00,2012-11-17,P,2012-10-24 09:30:00,ALXN,80.0,0.35,10,,...,81567600.0,,0.65,,0.52,,,,0.30,522.0
3,0.00,0.00,2013-01-19,P,2012-10-24 09:30:00,ALXN,90.0,3.20,10,,...,81708440.0,,3.65,,3.80,,,,3.80,23.0
4,0.00,0.00,2013-05-18,P,2012-10-24 09:30:00,ALXN,60.0,1.20,10,,...,83336664.0,,1.35,,1.25,,,,1.25,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29937667,2.20,2.18,2013-12-21,C,2013-10-24 16:14:36,QQQ,82.0,2.18,5,2.21,...,100511416.0,,2.20,,2.25,,2.20,,,3864.0
29937668,2.20,2.18,2013-12-21,C,2013-10-24 16:14:36,QQQ,82.0,2.18,5,2.21,...,100511416.0,,2.20,,2.25,,2.20,,,3864.0
29937669,2.20,2.18,2013-12-21,C,2013-10-24 16:14:36,QQQ,82.0,2.18,10,2.21,...,100511416.0,,2.20,,2.25,,2.20,,,3864.0
29937670,2.20,2.18,2013-12-21,C,2013-10-24 16:14:36,QQQ,82.0,2.18,14,2.21,...,100511416.0,,2.20,,2.25,,2.20,,,3864.0


In [77]:
train = df_w_indicator[df_w_indicator["exists"]=="left_only"].drop(columns=['issue_type_labelled', 'myn_labelled',  'optionid_labelled', 'price_all_lag_labelled', 'price_all_lead_labelled', 'price_ex_lag_labelled', 'price_ex_lead_labelled', "exists"])
train.columns = train.columns.str.replace(r'_unlabelled$', '')

  train.columns = train.columns.str.replace(r'_unlabelled$', '')


In [78]:
train.head().T

Unnamed: 0,1,2,3,4,5
BEST_ASK,0.0,0.0,0.0,0.0,1.25
BEST_BID,0.0,0.0,0.0,0.0,1.2
EXPIRATION,2013-02-16 00:00:00,2012-11-17 00:00:00,2013-01-19 00:00:00,2013-05-18 00:00:00,2013-05-18 00:00:00
OPTION_TYPE,C,P,P,P,P
QUOTE_DATETIME,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00
ROOT,ALXN,ALXN,ALXN,ALXN,ALXN
STRK_PRC,125.0,80.0,90.0,60.0,60.0
TRADE_PRICE,1.55,0.35,3.2,1.2,1.25
TRADE_SIZE,10,10,10,10,10
ask_ex,,,,,1.25


In [79]:
output_path = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_{strategy}_train.parquet"
train.to_parquet(output_path)
dataset.add_reference(output_path, name="train_set")



[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/ise_unsupervised_train.parquet/train_set>]

In [80]:
# Log the artifact to save it as an output of this run
run.log_artifact(dataset)
wandb.finish()
