In [1]:
import os

import gcsfs
import numpy as np
import pandas as pd
import wandb

from tqdm.auto import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
exchange = "ise"
strategy = "unsupervised"
max_i = 30  # number of partial files


In [3]:
# connect to weights and biases
run = wandb.init(project="thesis", job_type="dataset-creation", entity="fbv")
dataset = wandb.Artifact(name=f"{exchange}_{strategy}_raw", type="preprocessed_data")


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

fs = gcsfs.GCSFileSystem(project="thesis")




In [5]:
files = [
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{'matched' if strategy == 'supervised' else 'unmatched'}_{exchange}_quotes_min_mem_usage_extended_part_{i:04d}.parquet"
    for i in range(0, max_i)
]

columns = [
    "QUOTE_DATETIME",
    "ROOT",
    "EXPIRATION",
    "STRK_PRC",
    "OPTION_TYPE",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    "price_all_lead",
    "price_all_lag",
    "optionid",
    "price_ex_lead",
    "price_ex_lag",
    "issue_type",
    "myn",
]

dfs = [pd.read_parquet(gc_file, columns=columns) for gc_file in tqdm(files)]
df = pd.concat(dfs)

del dfs


100%|██████████| 30/30 [01:13<00:00,  2.44s/it]


In [6]:
df.memory_usage(deep=True).sum()


4246414955

In [7]:
df.sort_values(by="QUOTE_DATETIME", inplace=True)


In [8]:
df.head()

Unnamed: 0,QUOTE_DATETIME,ROOT,EXPIRATION,STRK_PRC,OPTION_TYPE,TRADE_SIZE,TRADE_PRICE,BEST_BID,BEST_ASK,ask_ex,bid_ex,bid_size_ex,ask_size_ex,price_all_lead,price_all_lag,optionid,price_ex_lead,price_ex_lag,issue_type,myn
0,2012-10-24 09:30:00,XLK,2013-01-19,33.0,C,1,0.05,0.0,0.0,,,,,0.02,0.06,65911088.0,0.01,,%,0.887576
106,2012-10-24 09:30:00,ISRG,2012-11-17,545.0,P,10,13.2,12.3,13.7,,,,,15.4,12.5,83291848.0,12.48,,0,0.991901
107,2012-10-24 09:30:00,FWLT,2013-01-19,24.0,C,10,1.17,0.0,0.0,,,,,1.01,1.32,81797024.0,1.01,,0,0.994792
108,2012-10-24 09:30:00,FWLT,2012-12-22,24.0,C,10,0.92,0.0,0.0,,,,,1.05,0.84,83425032.0,1.05,,0,0.994792
109,2012-10-24 09:30:00,ALXN,2013-01-19,125.0,C,4,0.75,0.5,0.75,0.75,0.5,11.0,11.0,0.63,1.0,81791240.0,0.3,,0,0.814


In [9]:
columns = ["buy_sell", *df.columns.tolist()]
labelled_df = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_supervised_train.parquet",
    columns=columns,
)


In [10]:
labelled_df.head()

Unnamed: 0_level_0,buy_sell,QUOTE_DATETIME,ROOT,EXPIRATION,STRK_PRC,OPTION_TYPE,TRADE_SIZE,TRADE_PRICE,BEST_BID,BEST_ASK,...,bid_ex,bid_size_ex,ask_size_ex,price_all_lead,price_all_lag,optionid,price_ex_lead,price_ex_lag,issue_type,myn
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,2005-05-02 09:30:02,YNU,2006-01-21,2.5,C,10,2.05,1.9,2.1,...,1.9,131.0,20.0,1.9,1.9,21060388.0,1.9,2.1,0,1.742
1,1,2005-05-02 09:30:03,SYQ,2005-06-18,15.0,C,10,3.9,3.6,3.8,...,,,,4.0,4.0,31624184.0,4.6,4.0,0,1.235
2,-1,2005-05-02 09:30:03,SWG,2005-05-21,105.0,C,50,11.2,11.1,11.4,...,11.1,300.0,300.0,11.8,11.0,31620976.0,11.9,11.0,%,1.105381
3,1,2005-05-02 09:30:03,QAX,2005-06-18,25.0,C,10,0.2,0.0,0.25,...,0.0,0.0,86.0,0.15,0.15,31560072.0,0.15,0.15,0,0.799
4,-1,2005-05-02 09:30:03,ORQ,2005-12-17,14.0,C,15,0.25,0.3,0.4,...,0.25,3356.0,399.0,0.35,0.35,25240212.0,0.35,0.35,0,0.826429


In [11]:
# slice labelled dataframe to range of unlabelled
date_range = labelled_df.QUOTE_DATETIME.between(
    df.QUOTE_DATETIME.min(), df.QUOTE_DATETIME.max()
)
labelled_df = labelled_df[date_range]


In [12]:
subset = [
    "QUOTE_DATETIME",
    "ROOT",
    "OPTION_TYPE",
    "EXPIRATION",
    "STRK_PRC",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    # FIXME is different for loballed and unlabelled trades
    # 'optionid','issue_type', 'myn',
    # 'price_all_lead', 'price_all_lag',
    # 'price_ex_lead', 'price_ex_lag',
]

labelled_df["duplicated"] = labelled_df.duplicated(subset=subset)


In [13]:
labelled_df["duplicated"].value_counts()


False    4066366
True       29261
Name: duplicated, dtype: int64

In [14]:
labelled_df[labelled_df["optionid"] == 83414152.0].head(20).T


index,25414885,25414881,25414882,25421396,25484785,25502223,25573097,25573112,25605276,25627039,25648828,25648830,25651784,25657637,25679640,25682745,25759804,25768170,25773219,25793669
buy_sell,-1,-1,-1,-1,-1,1,-1,-1,1,-1,1,1,1,-1,-1,-1,-1,-1,-1,-1
QUOTE_DATETIME,2012-10-24 09:30:14,2012-10-24 09:30:14,2012-10-24 09:30:14,2012-10-24 13:28:33,2012-11-05 09:30:38,2012-11-06 12:49:53,2012-11-14 10:59:07,2012-11-14 10:59:33,2012-11-16 11:42:41,2012-11-19 13:45:08,2012-11-21 12:51:45,2012-11-21 12:51:45,2012-11-21 15:03:45,2012-11-23 10:37:52,2012-11-27 10:43:24,2012-11-27 12:49:36,2012-12-05 14:53:02,2012-12-06 12:06:23,2012-12-06 15:47:03,2012-12-10 13:06:10
ROOT,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX,CVX
EXPIRATION,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00,2013-06-22 00:00:00
STRK_PRC,100.0,100.0,100.0,100.0,100.0,100.0,90.0,105.0,95.0,100.0,100.0,100.0,95.0,105.0,80.0,105.0,100.0,100.0,100.0,100.0
OPTION_TYPE,P,P,P,P,P,P,C,P,C,P,P,P,C,P,C,P,P,P,P,P
TRADE_SIZE,40,5,5,1,1,6,1,2,30,1,2,5,1,2,5,15,25,91,4,1
TRADE_PRICE,4.32,4.35,4.35,4.27,4.69,4.06,14.77,8.62,10.1,5.44,5.32,5.32,11.29,6.93,24.66,7.21,4.44,4.2,4.12,3.95
BEST_BID,4.35,4.35,4.35,4.25,4.6,4.0,14.75,8.6,10.05,5.4,5.2,5.2,11.15,6.9,24.549999,7.15,4.4,4.2,4.05,3.9
BEST_ASK,4.5,4.5,4.5,4.3,4.8,4.15,15.0,8.65,10.15,5.55,5.35,5.35,11.45,7.0,25.5,7.3,4.5,4.25,4.2,4.0


In [15]:
labelled_df['index_labelled'] = labelled_df.index

In [16]:
len_labelled_df = len(labelled_df)
len_df = len(df)


In [17]:
# merge with indicator and find out which ones can be labelled
df_w_indicator = pd.merge(
    df,
    labelled_df,
    on=[
        "QUOTE_DATETIME",
        "ROOT",
        "OPTION_TYPE",
        "EXPIRATION",
        "STRK_PRC",
        "TRADE_SIZE",
        "TRADE_PRICE",
        "BEST_BID",
        "BEST_ASK",
        "ask_ex",
        "bid_ex",
        "bid_size_ex",
        "ask_size_ex",
        # myn seems to be different
        #'issue_type', 'optionid',
        # 'price_all_lead', 'price_all_lag', # FIXME is different for loballed and unlabelled trades
        # 'price_ex_lead', 'price_ex_lag', # FIXME is different for loballed and unlabelled trades
    ],
    how="left",
    indicator="exists",
    suffixes=("_unlabelled", "_labelled"),
)


In [18]:
df_w_indicator.head(50)

Unnamed: 0,QUOTE_DATETIME,ROOT,EXPIRATION,STRK_PRC,OPTION_TYPE,TRADE_SIZE,TRADE_PRICE,BEST_BID,BEST_ASK,ask_ex,...,price_all_lead_labelled,price_all_lag_labelled,optionid_labelled,price_ex_lead_labelled,price_ex_lag_labelled,issue_type_labelled,myn_labelled,duplicated,index_labelled,exists
0,2012-10-24 09:30:00,XLK,2013-01-19,33.0,C,1,0.05,0.0,0.0,,...,0.02,0.06,65911088.0,0.01,0.06,%,0.887576,False,25414693.0,both
1,2012-10-24 09:30:00,ISRG,2012-11-17,545.0,P,10,13.2,12.3,13.7,,...,,,,,,,,,,left_only
2,2012-10-24 09:30:00,FWLT,2013-01-19,24.0,C,10,1.17,0.0,0.0,,...,,,,,,,,,,left_only
3,2012-10-24 09:30:00,FWLT,2012-12-22,24.0,C,10,0.92,0.0,0.0,,...,,,,,,,,,,left_only
4,2012-10-24 09:30:00,ALXN,2013-01-19,125.0,C,4,0.75,0.5,0.75,0.75,...,0.63,1.0,81791240.0,0.3,1.0,0,0.814,False,25414707.0,both
5,2012-10-24 09:30:00,NFLX,2012-11-17,80.0,C,1,0.39,0.01,0.51,0.51,...,,,,,,,,,,left_only
6,2012-10-24 09:30:00,NFLX,2012-11-17,70.0,C,1,0.56,0.01,0.75,0.75,...,,,,,,,,,,left_only
7,2012-10-24 09:30:00,NFLX,2012-11-17,75.0,C,2,0.2,0.01,0.75,0.75,...,,,,,,,,,,left_only
8,2012-10-24 09:30:00,ISRG,2012-11-17,535.0,P,10,9.4,8.7,9.6,,...,,,,,,,,,,left_only
9,2012-10-24 09:30:00,NFLX,2012-10-26,80.0,C,3,0.01,0.0,0.01,0.01,...,,,,,,,,,,left_only


In [19]:
# interpolate missing indices. index increases 1 -> 2. So filling with float seems ok. will be inserted between int of labelled df.
df_w_indicator["index_labelled"].interpolate("linear", inplace=True)
df_w_indicator.set_index(keys = "index_labelled", drop=True, inplace=True)

In [20]:
df_w_indicator.head()


Unnamed: 0_level_0,QUOTE_DATETIME,ROOT,EXPIRATION,STRK_PRC,OPTION_TYPE,TRADE_SIZE,TRADE_PRICE,BEST_BID,BEST_ASK,ask_ex,...,buy_sell,price_all_lead_labelled,price_all_lag_labelled,optionid_labelled,price_ex_lead_labelled,price_ex_lag_labelled,issue_type_labelled,myn_labelled,duplicated,exists
index_labelled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25414693.0,2012-10-24 09:30:00,XLK,2013-01-19,33.0,C,1,0.05,0.0,0.0,,...,1.0,0.02,0.06,65911088.0,0.01,0.06,%,0.887576,False,both
25414696.5,2012-10-24 09:30:00,ISRG,2012-11-17,545.0,P,10,13.2,12.3,13.7,,...,,,,,,,,,,left_only
25414700.0,2012-10-24 09:30:00,FWLT,2013-01-19,24.0,C,10,1.17,0.0,0.0,,...,,,,,,,,,,left_only
25414703.5,2012-10-24 09:30:00,FWLT,2012-12-22,24.0,C,10,0.92,0.0,0.0,,...,,,,,,,,,,left_only
25414707.0,2012-10-24 09:30:00,ALXN,2013-01-19,125.0,C,4,0.75,0.5,0.75,0.75,...,1.0,0.63,1.0,81791240.0,0.3,1.0,0,0.814,False,both


In [21]:
len(df_w_indicator)


29937672

In [22]:
len(df)


29842162

In [23]:
len(labelled_df)


4095627

In [24]:
# sort columns lexigraphically
df_w_indicator.sort_index(axis=1, inplace=True)


In [25]:
df_w_indicator[df_w_indicator["exists"] == "both"].head(20).T


index_labelled,25414693.0,25414707.0,25414708.0,25414705.0,25414706.0,25414709.0,25414710.0,25414696.0,25414697.0,25414698.0,25414695.0,25414694.0,25414699.0,25414703.0,25414704.0,25414702.0,25414700.0,25414701.0,25414724.0,25414725.0
BEST_ASK,0.0,0.75,0.35,0.25,0.02,14.8,8.6,0.0,0.3,0.0,0.0,7.8,0.0,5.7,0.55,0.0,0.65,5.0,8.9,4.65
BEST_BID,0.0,0.5,0.25,0.0,0.01,10.2,8.1,0.0,0.15,0.0,0.0,7.5,0.0,5.2,0.3,0.0,0.5,4.6,8.0,4.5
EXPIRATION,2013-01-19 00:00:00,2013-01-19 00:00:00,2012-11-17 00:00:00,2012-12-22 00:00:00,2012-10-26 00:00:00,2014-01-18 00:00:00,2012-11-17 00:00:00,2013-01-19 00:00:00,2013-03-16 00:00:00,2013-01-19 00:00:00,2012-12-22 00:00:00,2012-10-26 00:00:00,2014-01-18 00:00:00,2013-01-19 00:00:00,2013-01-19 00:00:00,2013-01-19 00:00:00,2013-03-16 00:00:00,2012-12-22 00:00:00,2013-06-22 00:00:00,2013-04-20 00:00:00
OPTION_TYPE,C,C,P,C,P,P,C,P,P,C,P,C,C,C,C,P,P,C,C,P
QUOTE_DATETIME,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:00,2012-10-24 09:30:01,2012-10-24 09:30:01
ROOT,XLK,ALXN,OIL,IRBT,QQQ,NFLX,QCOR,FB,FB,QCOR,FB,FB,NFLX,QCOR,QCOR,UCO,FB,ERX,FB,EBAY
STRK_PRC,33.0,125.0,20.0,25.0,60.0,55.0,19.0,16.0,14.0,47.0,17.0,16.5,27.5,24.0,46.0,34.0,17.0,50.0,17.0,50.0
TRADE_PRICE,0.05,0.75,0.35,0.25,0.01,11.38,8.1,0.25,0.3,0.25,0.25,7.62,33.700001,5.2,0.3,7.9,0.65,4.6,8.0,4.61
TRADE_SIZE,1,4,50,4,4,1,1,10,1,2,1,20,1,10,1,2,10,1,1,2
ask_ex,,0.75,0.35,0.25,0.02,14.8,8.6,,0.3,,,7.8,,5.7,0.55,,0.65,5.0,8.9,4.65


## Analysis of Accucacies

In [26]:
# foo = df_w_indicator[df_w_indicator["exists"] == "both"][
#     ["price_ex_lag_labelled", "price_ex_lag_unlabelled", "buy_sell", "TRADE_PRICE"]
# ]

# foo["tick_unlabelled"] = np.where(
#     foo["TRADE_PRICE"] > foo["price_ex_lag_unlabelled"],
#     1,
#     np.where(foo["TRADE_PRICE"] < foo["price_ex_lag_unlabelled"], -1, np.nan),
# )
# foo["tick_labelled"] = np.where(
#     foo["TRADE_PRICE"] > foo["price_ex_lag_labelled"],
#     1,
#     np.where(foo["TRADE_PRICE"] < foo["price_ex_lag_labelled"], -1, np.nan),
# )


In [27]:
# foo.head()


In [28]:
# acc_unlabelled = (foo.buy_sell == foo.tick_unlabelled).sum() / len(foo)
# acc_labelled = (foo.buy_sell == foo.tick_labelled).sum() / len(foo)

# print(acc_unlabelled)
# print(acc_labelled)


## Creation of Unsupervised Sample

In [29]:
# use last 6 months. May increase later
date_range = df_w_indicator.QUOTE_DATETIME.between(
        "2013-04-24 00:00:00", "2013-10-24 16:14:48"
)

df_w_indicator = df_w_indicator[date_range]

In [30]:
df_w_indicator.head(5).T


index_labelled,2.724860e+07,2.724860e+07.1,2.724860e+07.2,2.724860e+07.3,2.724860e+07.4
BEST_ASK,0.0,0.0,0.62,7.7,0.0
BEST_BID,0.0,0.0,0.55,7.5,0.0
EXPIRATION,2014-01-18 00:00:00,2013-05-18 00:00:00,2013-06-22 00:00:00,2014-01-18 00:00:00,2013-05-18 00:00:00
OPTION_TYPE,P,P,C,C,C
QUOTE_DATETIME,2013-04-24 09:30:00,2013-04-24 09:30:00,2013-04-24 09:30:00,2013-04-24 09:30:00,2013-04-24 09:30:00
ROOT,LIFE,CTSH,CSCO,CSCO,AKAM
STRK_PRC,75.0,55.0,21.0,13.0,36.0
TRADE_PRICE,1.95,0.45,0.47,7.7,1.86
TRADE_SIZE,9,11,10,10,1
ask_ex,,,,7.7,


In [31]:
# add fields
df_w_indicator["buy_sell"] = 0


In [32]:
# create day_vol of option series. Note optionid is not enough for option series. Use key (p. 8) from paper instead.
df_w_indicator["day_vol"] = df_w_indicator.groupby(
    [
        df_w_indicator["QUOTE_DATETIME"].dt.date,
        df_w_indicator["OPTION_TYPE"],
        df_w_indicator["ROOT"],
        df_w_indicator["STRK_PRC"],
        df_w_indicator["EXPIRATION"],
    ]
)["TRADE_SIZE"].transform("sum")


In [33]:
df_w_indicator


Unnamed: 0_level_0,BEST_ASK,BEST_BID,EXPIRATION,OPTION_TYPE,QUOTE_DATETIME,ROOT,STRK_PRC,TRADE_PRICE,TRADE_SIZE,ask_ex,...,optionid_unlabelled,price_all_lag_labelled,price_all_lag_unlabelled,price_all_lead_labelled,price_all_lead_unlabelled,price_ex_lag_labelled,price_ex_lag_unlabelled,price_ex_lead_labelled,price_ex_lead_unlabelled,day_vol
index_labelled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2.724860e+07,0.00,0.00,2014-01-18,P,2013-04-24 09:30:00,LIFE,75.0,1.95,9,,...,83371840.0,3.52,3.52,1.85,1.85,2.1,2.10,1.90,1.90,10
2.724860e+07,0.00,0.00,2013-05-18,P,2013-04-24 09:30:00,CTSH,55.0,0.45,11,,...,100877112.0,,,,0.40,,,,0.67,28
2.724860e+07,0.62,0.55,2013-06-22,C,2013-04-24 09:30:00,CSCO,21.0,0.47,10,,...,100470896.0,,0.57,,0.55,,0.76,,0.56,67
2.724860e+07,7.70,7.50,2014-01-18,C,2013-04-24 09:30:00,CSCO,13.0,7.70,10,7.70,...,81721152.0,7.85,7.85,7.65,7.65,8.1,8.10,7.65,7.65,20
2.724860e+07,0.00,0.00,2013-05-18,C,2013-04-24 09:30:00,AKAM,36.0,1.86,1,,...,83259176.0,,1.87,,1.76,,1.83,,1.80,219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2.951032e+07,2.20,2.18,2013-12-21,C,2013-10-24 16:14:36,QQQ,82.0,2.18,5,2.21,...,100511416.0,,2.20,,2.25,,2.20,,,1007
2.951032e+07,2.20,2.18,2013-12-21,C,2013-10-24 16:14:36,QQQ,82.0,2.18,3,2.21,...,100511416.0,,2.20,,2.25,,2.20,,,1007
2.951032e+07,2.20,2.18,2013-12-21,C,2013-10-24 16:14:36,QQQ,82.0,2.18,5,2.21,...,100511416.0,,2.20,,2.25,,2.20,,,1007
2.951032e+07,2.20,2.18,2013-12-21,C,2013-10-24 16:14:36,QQQ,82.0,2.18,10,2.21,...,100511416.0,,2.20,,2.25,,2.20,,,1007


In [34]:
train = df_w_indicator[df_w_indicator["exists"] == "left_only"].drop(
    columns=[
        "issue_type_labelled",
        "myn_labelled",
        "optionid_labelled",
        "price_all_lag_labelled",
        "price_all_lead_labelled",
        "price_ex_lag_labelled",
        "price_ex_lead_labelled",
        "exists",
        "duplicated",
    ]
)
train.columns = train.columns.str.replace(r"_unlabelled$", "", regex=True)


In [35]:
train.head().T


index_labelled,2.724860e+07,2.724860e+07.1,2.724860e+07.2,2.724860e+07.3,2.724860e+07.4
BEST_ASK,0.0,0.62,0.0,2.02,3.1
BEST_BID,0.0,0.55,0.0,1.82,2.99
EXPIRATION,2013-05-18 00:00:00,2013-06-22 00:00:00,2013-05-18 00:00:00,2013-05-18 00:00:00,2013-06-22 00:00:00
OPTION_TYPE,P,C,C,P,C
QUOTE_DATETIME,2013-04-24 09:30:00,2013-04-24 09:30:00,2013-04-24 09:30:00,2013-04-24 09:30:00,2013-04-24 09:30:00
ROOT,CTSH,CSCO,AKAM,AKAM,QQQ
STRK_PRC,55.0,21.0,36.0,35.0,67.0
TRADE_PRICE,0.45,0.47,1.86,2.02,3.03
TRADE_SIZE,11,10,1,1,10
ask_ex,,,,2.02,3.1


In [36]:
train.describe()


Unnamed: 0,BEST_ASK,BEST_BID,STRK_PRC,TRADE_PRICE,TRADE_SIZE,ask_ex,ask_size_ex,bid_ex,bid_size_ex,buy_sell,myn,optionid,price_all_lag,price_all_lead,price_ex_lag,price_ex_lead,day_vol
count,13069820.0,13069820.0,13069820.0,13069820.0,13069820.0,13037050.0,13037050.0,13037050.0,13037050.0,13069819.0,13067720.0,13067737.0,13005850.0,12995060.0,12840040.0,12754520.0,13069820.0
mean,3.423623,3.321758,210.0583,3.375032,20.87727,3.448208,147.6947,3.303517,140.1309,0.0,1.408313,97806760.0,3.369529,3.372703,3.366137,3.356368,1208.536
std,9.310712,8.769208,333.2693,8.831281,221.7725,8.909276,406.891,8.759763,382.8098,0.0,566.8405,7445766.5,8.665418,9.06148,8.371501,8.354015,3573.619
min,0.0,0.0,0.22,0.01,1.0,0.0,0.0,0.0,-20.0,0.0,8.167976e-06,48498828.0,0.01,0.01,0.01,0.01,1.0
25%,0.5,0.47,40.0,0.49,1.0,0.51,13.0,0.46,12.0,0.0,0.9468513,100419784.0,0.49,0.49,0.5,0.49,54.0
50%,1.27,1.22,92.0,1.25,5.0,1.29,36.0,1.2,35.0,0.0,0.984138,100892352.0,1.25,1.25,1.26,1.25,224.0
75%,3.05,2.94,230.0,2.99,10.0,3.1,97.0,2.92,94.0,0.0,1.003228,101274944.0,2.99,3.0,3.0,2.99,937.0
max,10000.4,2215.7,4700.0,2215.7,208561.0,2220.5,21645.0,2215.7,15861.0,0.0,915121.9,127872040.0,2215.7,10000.0,2215.7,2215.7,256913.0


In [37]:
output_path = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/{exchange}_{strategy}_train.parquet"
train.to_parquet(output_path)
dataset.add_reference(output_path, name="train_set")




[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/ise_unsupervised_train.parquet/train_set>]

In [38]:
# Log the artifact to save it as an output of this run
run.log_artifact(dataset)
wandb.finish()
