In [1]:
import os

import gcsfs
import numpy as np
import pandas as pd
import wandb

from tqdm.auto import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
exchange = "ise"
strategy = "unsupervised"
max_i = 30  # number of partial files


In [3]:
# connect to weights and biases
run = wandb.init(project="thesis", job_type="dataset-creation", entity="fbv")
dataset = wandb.Artifact(name=f"{exchange}_{strategy}_raw", type="preprocessed_data")


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

fs = gcsfs.GCSFileSystem(project="thesis")




In [5]:
files = [
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{'matched' if strategy == 'supervised' else 'unmatched'}_{exchange}_quotes_min_mem_usage_extended_part_{i:04d}.parquet"
    for i in range(0, max_i)
]

columns = [
    "QUOTE_DATETIME",
    "ROOT",
    "EXPIRATION",
    "STRK_PRC",
    "OPTION_TYPE",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    "price_all_lead",
    "price_all_lag",
    "optionid",
    "price_ex_lead",
    "price_ex_lag",
    "issue_type",
    "myn",
]

dfs = [pd.read_parquet(gc_file, columns=columns) for gc_file in tqdm(files)]
df = pd.concat(dfs)

del dfs


100%|██████████| 30/30 [01:12<00:00,  2.42s/it]


In [6]:
df.memory_usage(deep=True).sum()


4246414955

In [7]:
df.sort_values(by="QUOTE_DATETIME", inplace=True)


In [8]:
# There are a few duplicates in the unlabelled dataset, removed see p. 7

subset = [
    "QUOTE_DATETIME",
    "ROOT",
    "OPTION_TYPE",
    "EXPIRATION",
    "STRK_PRC",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    # FIXME is different for loballed and unlabelled trades
    # 'optionid','issue_type', 'myn',
    # 'price_all_lead', 'price_all_lag',
    # 'price_ex_lead', 'price_ex_lag',
]

# df.drop_duplicates(keep='first', subset=subset, inplace=True)


In [9]:
columns = ["buy_sell", *df.columns.tolist()]
labelled_df = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_supervised_train.parquet",
    columns=columns,
)


In [10]:
# slice labelled dataframe to range of unlabelled
date_range = labelled_df.QUOTE_DATETIME.between(
    df.QUOTE_DATETIME.min(), df.QUOTE_DATETIME.max()
)
labelled_df = labelled_df[date_range]


In [11]:
# labelled_df.drop_duplicates(keep='first', subset=subset, inplace=True)


In [12]:
labelled_df["duplicated"] = labelled_df.duplicated(subset=subset)


In [13]:
labelled_df["duplicated"].value_counts()


False    4066366
True       29261
Name: duplicated, dtype: int64

In [14]:
labelled_df[labelled_df["optionid"] == 83414152.0].head(20).T


index,25414885,25414881,25414882,25421396,25484785,25502223,25573097,25573112,25605276,25627039,25648828,25648830,25651784,25657637,25679640,25682745,25759804,25768170,25773219,25793669
buy_sell,-1,-1,-1,-1,-1,1,-1,-1,1,-1,1,1,1,-1,-1,-1,-1,-1,-1,-1
QUOTE_DATETIME,2012-10-24 09:30:14,2012-10-24 09:30:14,2012-10-24 09:30:14,2012-10-24 13:28:33,2012-11-05 09:30:38,2012-11-06 12:49:53,2012-11-14 10:59:07,2012-11-14 10:59:33,2012-11-16 11:42:41,2012-11-19 13:45:08,2012-11-21 12:51:45,2012-11-21 12:51:45,2012-11-21 15:03:45,2012-11-23 10:37:52,2012-11-27 10:43:24,2012-11-27 12:49:36,2012-12-05 14:53:02,2012-12-06 12:06:23,2012-12-06 15:47:03,2012-12-10 13:06:10
ROOT,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX
EXPIRATION,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00
STRK_PRC,100.0,100.0,100.0,100.0,100.0,100.0,90.0,105.0,95.0,100.0,100.0,100.0,95.0,105.0,80.0,105.0,100.0,100.0,100.0,100.0
OPTION_TYPE,P,P,P,P,P,P,C,P,C,P,P,P,C,P,C,P,P,P,P,P
TRADE_SIZE,40,5,5,1,1,6,1,2,30,1,2,5,1,2,5,15,25,91,4,1
TRADE_PRICE,4.32,4.35,4.35,4.27,4.69,4.06,14.77,8.62,10.1,5.44,5.32,5.32,11.29,6.93,24.66,7.21,4.44,4.2,4.12,3.95
BEST_BID,4.35,4.35,4.35,4.25,4.6,4.0,14.75,8.6,10.05,5.4,5.2,5.2,11.15,6.9,24.549999,7.15,4.4,4.2,4.05,3.9
BEST_ASK,4.5,4.5,4.5,4.3,4.8,4.15,15.0,8.65,10.15,5.55,5.35,5.35,11.45,7.0,25.5,7.3,4.5,4.25,4.2,4.0


In [15]:
len_labelled_df = len(labelled_df)
len_df = len(df)


In [16]:
# merge with indicator and find out which ones can be labelled
df_w_indicator = pd.merge(
    df,
    labelled_df,
    on=[
        "QUOTE_DATETIME",
        "ROOT",
        "OPTION_TYPE",
        "EXPIRATION",
        "STRK_PRC",
        "TRADE_SIZE",
        "TRADE_PRICE",
        "BEST_BID",
        "BEST_ASK",
        "ask_ex",
        "bid_ex",
        "bid_size_ex",
        "ask_size_ex",
        # myn seems to be different
        #'issue_type', 'optionid',
        # 'price_all_lead', 'price_all_lag', # FIXME is different for loballed and unlabelled trades
        # 'price_ex_lead', 'price_ex_lag', # FIXME is different for loballed and unlabelled trades
    ],
    how="left",
    indicator="exists",
    suffixes=("_unlabelled", "_labelled"),
)


In [17]:
df_w_indicator["exists"].value_counts()


left_only     25746535
both           4191137
right_only           0
Name: exists, dtype: int64

In [18]:
len(df_w_indicator)


29937672

In [19]:
len(df)


29842162

In [20]:
len(labelled_df)


4095627

In [21]:
# sort columns lexi.
df_w_indicator.sort_index(axis=1, inplace=True)


In [22]:
df_w_indicator[df_w_indicator["exists"] == "both"].head(20).T


Unnamed: 0,0,4,17,29,30,38,75,87,97,99,103,109,123,126,127,143,160,163,172,187
BEST_ASK,0.0,0.75,0.35,0.25,0.02,14.8,8.6,0.0,0.3,0.0,0.0,7.8,0.0,5.7,0.55,0.0,0.65,5.0,8.9,4.65
BEST_BID,0.0,0.5,0.25,0.0,0.01,10.2,8.1,0.0,0.15,0.0,0.0,7.5,0.0,5.2,0.3,0.0,0.5,4.6,8.0,4.5
EXPIRATION,2013-01-19 00:00:00,2013-01-19 00:00:00,2012-11-17 00:00:00,2012-12-22 00:00:00,2012-10-26 00:00:00,2014-01-18 00:00:00,2012-11-17 00:00:00,2013-01-19 00:00:00,2013-03-16 00:00:00,2013-01-19 00:00:00,2012-12-22 00:00:00,2012-10-26 00:00:00,2014-01-18 00:00:00,2013-01-19 00:00:00,2013-01-19 00:00:00,2013-01-19 00:00:00,2013-03-16 00:00:00,2012-12-22 00:00:00,2013-06-22 00:00:00,2013-04-20 00:00:00
OPTION_TYPE,C,C,P,C,P,P,C,P,P,C,P,C,C,C,C,P,P,C,C,P
QUOTE_DATETIME,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:01,2012-10-24 09:30:01
ROOT,XLK,ALXN,OIL,IRBT,QQQ,NFLX,QCOR,FB,FB,QCOR,FB,FB,NFLX,QCOR,QCOR,UCO,FB,ERX,FB,EBAY
STRK_PRC,33.0,125.0,20.0,25.0,60.0,55.0,19.0,16.0,14.0,47.0,17.0,16.5,27.5,24.0,46.0,34.0,17.0,50.0,17.0,50.0
TRADE_PRICE,0.05,0.75,0.35,0.25,0.01,11.38,8.1,0.25,0.3,0.25,0.25,7.62,33.700001,5.2,0.3,7.9,0.65,4.6,8.0,4.61
TRADE_SIZE,1,4,50,4,4,1,1,10,1,2,1,20,1,10,1,2,10,1,1,2
ask_ex,,0.75,0.35,0.25,0.02,14.8,8.6,,0.3,,,7.8,,5.7,0.55,,0.65,5.0,8.9,4.65


## Analysis of Accucacies

In [23]:
foo = df_w_indicator[df_w_indicator["exists"] == "both"][
    ["price_ex_lag_labelled", "price_ex_lag_unlabelled", "buy_sell", "TRADE_PRICE"]
]

foo["tick_unlabelled"] = np.where(
    foo["TRADE_PRICE"] > foo["price_ex_lag_unlabelled"],
    1,
    np.where(foo["TRADE_PRICE"] < foo["price_ex_lag_unlabelled"], -1, np.nan),
)
foo["tick_labelled"] = np.where(
    foo["TRADE_PRICE"] > foo["price_ex_lag_labelled"],
    1,
    np.where(foo["TRADE_PRICE"] < foo["price_ex_lag_labelled"], -1, np.nan),
)


In [24]:
foo.head()


Unnamed: 0,price_ex_lag_labelled,price_ex_lag_unlabelled,buy_sell,TRADE_PRICE,tick_unlabelled,tick_labelled
0,0.06,,1.0,0.05,,-1.0
4,1.0,,1.0,0.75,,-1.0
17,0.2,,-1.0,0.35,,1.0
29,0.8,,1.0,0.25,,-1.0
30,0.02,,1.0,0.01,,-1.0


In [25]:
acc_unlabelled = (foo.buy_sell == foo.tick_unlabelled).sum() / len(foo)
acc_labelled = (foo.buy_sell == foo.tick_labelled).sum() / len(foo)

print(acc_unlabelled)
print(acc_labelled)


0.4599821003226571
0.4655459842997258


## Creation of Unsupervised Sample

In [26]:
df_w_indicator.head(5).T


Unnamed: 0,0,1,2,3,4
BEST_ASK,0.0,13.7,0.0,0.0,0.75
BEST_BID,0.0,12.3,0.0,0.0,0.5
EXPIRATION,2013-01-19 00:00:00,2012-11-17 00:00:00,2013-01-19 00:00:00,2012-12-22 00:00:00,2013-01-19 00:00:00
OPTION_TYPE,C,P,C,C,C
QUOTE_DATETIME,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00
ROOT,XLK,ISRG,FWLT,FWLT,ALXN
STRK_PRC,33.0,545.0,24.0,24.0,125.0
TRADE_PRICE,0.05,13.2,1.17,0.92,0.75
TRADE_SIZE,1,10,10,10,4
ask_ex,,,,,0.75


In [27]:
# add fields
df_w_indicator["buy_sell"] = 0


In [28]:
# create day_vol of option series. Note optionid is not enough for option series. Use key (p. 8) from paper instead.
df_w_indicator["day_vol"] = df_w_indicator.groupby(
    [
        df_w_indicator["QUOTE_DATETIME"].dt.date,
        df_w_indicator["OPTION_TYPE"],
        df_w_indicator["ROOT"],
        df_w_indicator["STRK_PRC"],
        df_w_indicator["EXPIRATION"],
    ]
)["TRADE_SIZE"].transform("sum")


In [29]:
df_w_indicator


Unnamed: 0,BEST_ASK,BEST_BID,EXPIRATION,OPTION_TYPE,QUOTE_DATETIME,ROOT,STRK_PRC,TRADE_PRICE,TRADE_SIZE,ask_ex,...,optionid_unlabelled,price_all_lag_labelled,price_all_lag_unlabelled,price_all_lead_labelled,price_all_lead_unlabelled,price_ex_lag_labelled,price_ex_lag_unlabelled,price_ex_lead_labelled,price_ex_lead_unlabelled,day_vol
0,0.00,0.00,2013-01-19,C,2012-10-24 09:30:00,XLK,33.0,0.05,1,,...,65911088.0,0.06,0.06,0.02,0.02,0.06,,0.01,0.01,1
1,13.70,12.30,2012-11-17,P,2012-10-24 09:30:00,ISRG,545.0,13.20,10,,...,83291848.0,,12.50,,15.40,,,,12.48,10
2,0.00,0.00,2013-01-19,C,2012-10-24 09:30:00,FWLT,24.0,1.17,10,,...,81797024.0,,1.32,,1.01,,,,1.01,10
3,0.00,0.00,2012-12-22,C,2012-10-24 09:30:00,FWLT,24.0,0.92,10,,...,83425032.0,,0.84,,1.05,,,,1.05,27
4,0.75,0.50,2013-01-19,C,2012-10-24 09:30:00,ALXN,125.0,0.75,4,0.75,...,81791240.0,1.00,1.00,0.63,0.63,1.00,,0.30,0.30,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29937667,2.20,2.18,2013-12-21,C,2013-10-24 16:14:36,QQQ,82.0,2.18,5,2.21,...,100511416.0,,2.20,,2.25,,2.20,,,1007
29937668,2.20,2.18,2013-12-21,C,2013-10-24 16:14:36,QQQ,82.0,2.18,3,2.21,...,100511416.0,,2.20,,2.25,,2.20,,,1007
29937669,2.20,2.18,2013-12-21,C,2013-10-24 16:14:36,QQQ,82.0,2.18,5,2.21,...,100511416.0,,2.20,,2.25,,2.20,,,1007
29937670,2.20,2.18,2013-12-21,C,2013-10-24 16:14:36,QQQ,82.0,2.18,10,2.21,...,100511416.0,,2.20,,2.25,,2.20,,,1007


In [30]:
train = df_w_indicator[df_w_indicator["exists"] == "left_only"].drop(
    columns=[
        "issue_type_labelled",
        "myn_labelled",
        "optionid_labelled",
        "price_all_lag_labelled",
        "price_all_lead_labelled",
        "price_ex_lag_labelled",
        "price_ex_lead_labelled",
        "exists",
        "duplicated",
    ]
)
train.columns = train.columns.str.replace(r"_unlabelled$", "", regex=True)


In [31]:
train.head().T


Unnamed: 0,1,2,3,5,6
BEST_ASK,13.7,0.0,0.0,0.51,0.75
BEST_BID,12.3,0.0,0.0,0.01,0.01
EXPIRATION,2012-11-17 00:00:00,2013-01-19 00:00:00,2012-12-22 00:00:00,2012-11-17 00:00:00,2012-11-17 00:00:00
OPTION_TYPE,P,C,C,C,C
QUOTE_DATETIME,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00
ROOT,ISRG,FWLT,FWLT,NFLX,NFLX
STRK_PRC,545.0,24.0,24.0,80.0,70.0
TRADE_PRICE,13.2,1.17,0.92,0.39,0.56
TRADE_SIZE,10,10,10,1,1
ask_ex,,,,0.51,0.75


In [32]:
train.describe()


Unnamed: 0,BEST_ASK,BEST_BID,STRK_PRC,TRADE_PRICE,TRADE_SIZE,ask_ex,ask_size_ex,bid_ex,bid_size_ex,buy_sell,myn,optionid,price_all_lag,price_all_lead,price_ex_lag,price_ex_lead,day_vol
count,25746540.0,25746540.0,25746540.0,25746540.0,25746540.0,25657330.0,25657330.0,25657330.0,25657330.0,25746535.0,25737040.0,25738661.0,25619740.0,25585100.0,25224220.0,25186020.0,25746540.0
mean,3.784462,3.688445,219.18,3.737355,21.48826,3.808429,163.0709,3.668497,158.4982,0.0,7.62595,92915504.0,3.733369,3.736671,3.732189,3.730572,1350.718
std,11.12051,10.84083,330.718,10.88925,215.1659,10.96597,470.7714,10.82188,454.9106,0.0,1849.851,11753841.0,10.76787,10.94402,10.55607,10.60334,3950.758
min,0.0,0.0,0.22,0.01,1.0,0.0,0.0,0.0,-20.0,0.0,-1.3182e-05,34917692.0,0.01,0.01,0.01,0.01,1.0
25%,0.48,0.44,38.5,0.46,1.0,0.49,13.0,0.44,13.0,0.0,0.9453334,83371480.0,0.46,0.46,0.47,0.46,61.0
50%,1.25,1.2,90.0,1.22,5.0,1.25,38.0,1.18,37.0,0.0,0.9836326,100374712.0,1.22,1.22,1.23,1.22,252.0
75%,3.1,3.0,270.0,3.05,11.0,3.15,104.0,2.98,103.0,0.0,1.003285,100906736.0,3.05,3.05,3.05,3.05,1035.0
max,10000.4,2215.7,4700.0,2215.7,208561.0,2220.5,23268.0,2215.7,22195.0,0.0,918332.9,127872056.0,2215.7,10000.0,2215.7,2215.7,348121.0


In [33]:
output_path = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_{strategy}_train.parquet"
train.to_parquet(output_path)
dataset.add_reference(output_path, name="train_set")




[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/ise_unsupervised_train.parquet/train_set>]

In [34]:
# Log the artifact to save it as an output of this run
run.log_artifact(dataset)
wandb.finish()
