In [None]:
import os

import gcsfs
import pandas as pd
import wandb

from tqdm.auto import tqdm


In [None]:
# Run configuration shared by the cells below.
# `exchange` is interpolated into the input/output file names and the artifact name.
exchange = "ise"
# `strategy` picks the "matched" (supervised) or "unmatched" (anything else) input files
# and is part of the artifact / output file name.
strategy = "unsupervised"
# Partial files with indices 0 .. max_i - 1 are read.
max_i = 30  # number of partial files


In [None]:
# Connect to Weights & Biases: start a run for this dataset-creation job and
# prepare the artifact that the written train set is attached to at the end of the notebook.
run = wandb.init(project="thesis", job_type="dataset-creation", entity="fbv")
dataset = wandb.Artifact(name=f"{exchange}_{strategy}_raw", type="preprocessed_data")


In [None]:
# Set the GCLOUD_PROJECT environment variable for this process.
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

# NOTE(review): `project="thesis"` differs from the GCLOUD_PROJECT value above, and `fs`
# is not referenced by any later cell visible here — confirm which project id is intended
# and whether this handle is still needed.
fs = gcsfs.GCSFileSystem(project="thesis")


In [None]:
# Supervised runs read the "matched" partial files, every other strategy the "unmatched" ones.
match_prefix = "matched" if strategy == "supervised" else "unmatched"

# One path per partial file, numbered 0000 .. max_i - 1.
files = [
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{match_prefix}_{exchange}_quotes_min_mem_usage_extended_part_{i:04d}.parquet"
    for i in range(max_i)
]

# Only these columns are read from each partial file.
columns = [
    # contract / trade identification
    "QUOTE_DATETIME",
    "ROOT",
    "EXPIRATION",
    "STRK_PRC",
    "OPTION_TYPE",
    # trade and quotes
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    # remaining columns
    "price_all_lead",
    "price_all_lag",
    "optionid",
    "price_ex_lead",
    "price_ex_lag",
    "issue_type",
    "myn",
]

# Read every partial file (with a progress bar) and stack them into one frame.
dfs = [pd.read_parquet(part_path, columns=columns) for part_path in tqdm(files)]
df = pd.concat(dfs)

# The per-file frames are no longer needed once concatenated.
del dfs


In [None]:
# Total memory footprint of `df`, summed over all columns (deep=True also inspects object columns).
df.memory_usage(deep=True).sum()


In [None]:
# Order the unlabelled trades chronologically.
df = df.sort_values(by="QUOTE_DATETIME")


In [None]:
# Peek at the first rows of the sorted unlabelled trades.
df.head()


In [None]:
# Load the supervised train set with the same columns as `df` plus the `buy_sell` label.
labelled_path = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_supervised_train.parquet"
columns = ["buy_sell"] + df.columns.tolist()
labelled_df = pd.read_parquet(labelled_path, columns=columns)


In [None]:
# Peek at the first rows of the labelled trades.
labelled_df.head()


In [None]:
# Restrict the labelled trades to the time span covered by the unlabelled trades
# (`between` includes both end points by default).
date_range = labelled_df.QUOTE_DATETIME.between(
    df.QUOTE_DATETIME.min(), df.QUOTE_DATETIME.max()
)
# Take an explicit copy: later cells add columns to `labelled_df`, and assigning to a
# boolean-mask slice of another frame raises SettingWithCopyWarning.
labelled_df = labelled_df[date_range].copy()


In [None]:
# Columns that identify a trade when comparing rows.
# Deliberately left out because they can differ between labelled and unlabelled trades:
#   'optionid', 'issue_type', 'myn',
#   'price_all_lead', 'price_all_lag',
#   'price_ex_lead', 'price_ex_lag'
subset = [
    "QUOTE_DATETIME",
    "ROOT",
    "OPTION_TYPE",
    "EXPIRATION",
    "STRK_PRC",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
]

# Flag every repeated occurrence (all but the first) of a trade within the labelled set.
is_repeat = labelled_df.duplicated(subset=subset)
labelled_df["duplicated"] = is_repeat


In [None]:
# How many labelled trades are flagged as duplicates vs. not.
labelled_df["duplicated"].value_counts()


In [None]:
# Spot check: show up to 20 labelled trades of a single option id, transposed for readability.
# NOTE(review): the id is hard-coded; presumably picked as an example with duplicates — confirm.
labelled_df[labelled_df["optionid"] == 83414152.0].head(20).T


In [None]:
# Keep the labelled frame's index as a regular column so it is carried through the merge below
# (the merge does not preserve either input's index).
labelled_df["index_labelled"] = labelled_df.index


In [None]:
# Remember the row counts of both frames before merging.
len_labelled_df, len_df = len(labelled_df), len(df)


In [None]:
# Left-join the unlabelled trades onto the labelled ones to find out which trades already
# carry a label. The `exists` column reports per row whether a match was found
# ("both") or not ("left_only").
# Not part of the key because they seem to differ between labelled and unlabelled trades:
#   'myn', 'issue_type', 'optionid',
#   'price_all_lead', 'price_all_lag',
#   'price_ex_lead', 'price_ex_lag'
merge_keys = [
    "QUOTE_DATETIME",
    "ROOT",
    "OPTION_TYPE",
    "EXPIRATION",
    "STRK_PRC",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
]

df_w_indicator = df.merge(
    labelled_df,
    how="left",
    on=merge_keys,
    suffixes=("_unlabelled", "_labelled"),
    indicator="exists",
)


In [None]:
# Inspect the first 50 merged rows.
df_w_indicator.head(50)


In [None]:
# Unmatched rows have no `index_labelled`. Fill the gaps by linear interpolation: the
# labelled index increases in integer steps (1 -> 2), so the interpolated floats place the
# unlabelled trades between the integer indices of the labelled frame.
# The result is assigned back instead of calling `interpolate(..., inplace=True)` on the
# column selection: that chained in-place form is deprecated and does not update the frame
# under pandas Copy-on-Write.
# NOTE(review): with the default fill direction, rows before the first matched trade keep a
# NaN index — confirm whether that is acceptable.
df_w_indicator["index_labelled"] = df_w_indicator["index_labelled"].interpolate(
    method="linear"
)
df_w_indicator = df_w_indicator.set_index(keys="index_labelled", drop=True)


In [None]:
# Check the frame after re-indexing on `index_labelled`.
df_w_indicator.head()


In [None]:
# Row count after the merge; compare with the two input frames below.
len(df_w_indicator)


In [None]:
# Row count of the unlabelled input frame.
len(df)


In [None]:
# Row count of the labelled input frame.
len(labelled_df)


In [None]:
# Sort columns lexicographically by name.
df_w_indicator = df_w_indicator.sort_index(axis=1)


In [None]:
# Show up to 20 trades that were found in both frames (i.e. already labelled), transposed.
df_w_indicator[df_w_indicator["exists"] == "both"].head(20).T


## Creation of Unsupervised Sample

In [None]:
# Use the last 6 months only (both end points inclusive). May be increased later.
date_range = df_w_indicator.QUOTE_DATETIME.between(
    "2013-04-24 00:00:00", "2013-10-24 16:14:48"
)

# Take an explicit copy: the following cells add columns to this frame, and assigning to a
# boolean-mask slice of another frame raises SettingWithCopyWarning.
df_w_indicator = df_w_indicator[date_range].copy()


In [None]:
# First five rows of the six-month window, transposed.
df_w_indicator.head(5).T


In [None]:
# Add fields: overwrite `buy_sell` (brought in from the labelled frame by the merge) with 0
# for every row.
# NOTE(review): presumably 0 stands for "no label" in the unsupervised set — confirm.
df_w_indicator["buy_sell"] = 0


In [None]:
# Daily traded volume per option series. `optionid` alone does not identify an option
# series, so the key from the paper (p. 8) is used instead: trade date, option type, root,
# strike price and expiration.
trade_date = df_w_indicator["QUOTE_DATETIME"].dt.date
series_key = [trade_date] + [
    df_w_indicator[name]
    for name in ("OPTION_TYPE", "ROOT", "STRK_PRC", "EXPIRATION")
]

# `transform` broadcasts each group's total trade size back to every row of the group.
df_w_indicator["day_vol"] = (
    df_w_indicator.groupby(series_key)["TRADE_SIZE"].transform("sum")
)


In [None]:
# Display the frame including the new `buy_sell` and `day_vol` columns.
df_w_indicator


In [None]:
# Columns that only carry information from the labelled frame or from the merge bookkeeping.
labelled_only_columns = [
    "issue_type_labelled",
    "myn_labelled",
    "optionid_labelled",
    "price_all_lag_labelled",
    "price_all_lead_labelled",
    "price_ex_lag_labelled",
    "price_ex_lead_labelled",
    "exists",
    "duplicated",
]

# Keep only trades without a match in the labelled frame and drop the helper columns.
without_label = df_w_indicator["exists"] == "left_only"
train = df_w_indicator[without_label].drop(columns=labelled_only_columns)

# Strip the merge suffix so the columns carry their original names again.
train.columns = train.columns.str.replace(r"_unlabelled$", "", regex=True)


In [None]:
# First rows of the unsupervised train set, transposed.
train.head().T


In [None]:
# Summary statistics of the unsupervised train set.
train.describe()


In [None]:
# Write the train set to the bucket and register the file by reference on the artifact
# under the name "train_set".
output_path = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_{strategy}_train.parquet"
train.to_parquet(output_path)
dataset.add_reference(output_path, name="train_set")


In [None]:
# Log the artifact to save it as an output of this run, then close the run.
run.log_artifact(dataset)
wandb.finish()
