<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/baseline/notebooks/data_preprocessing_loading_splitting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gcsfs
!pip install modin
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import glob
import os
import random

from distributed import Client

import gcsfs
import google.auth
from google.colab import auth

import modin.config as cfg
import modin.pandas as pd
from modin.config import ProgressBar

import numpy as np
from numpy.testing import assert_allclose
from numpy.testing import assert_almost_equal
# NOTE: the next import rebinds `assert_almost_equal` and therefore shadows the
# NumPy function imported on the line above.
from pandas._testing.asserters import assert_almost_equal

from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm

import wandb


In [3]:
# open the Weights & Biases run that the rest of the notebook refers to as `run`
run = wandb.init(
    entity="fbv",
    project="thesis",
    job_type="dataset-creation",
)

[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# authenticate the Colab user, then build a Google Cloud Storage filesystem
# handle from the resulting default credentials
auth.authenticate_user()
credentials = google.auth.default()[0]  # second tuple element is not needed here
fs_prefix = "gs://"
fs = gcsfs.GCSFileSystem(token=credentials, project="thesis")

In [9]:
# set fixed seed
def seed_everything(seed: int) -> None:
  """
  Seed the global random number generators for reproducibility of results.

  Seeds Python's `random` module and NumPy's legacy global generator (the one
  scikit-learn falls back to when no `random_state` is passed), and exports
  `PYTHONHASHSEED`.

  Note: `PYTHONHASHSEED` is only read at interpreter start-up, so setting it
  here does not change string hashing in the running kernel; it is inherited
  by processes that are launched afterwards.

  Args:
    seed: integer seed. NumPy only accepts seeds in [0, 2**32), so the value
      is folded into that range for NumPy; `random` and the environment
      variable receive the value unchanged.
  """
  os.environ["PYTHONHASHSEED"] = str(seed)
  random.seed(seed)
  np.random.seed(seed % 2**32)

seed_everything(42)

In [10]:
# run modin on a Dask cluster started by `Client()` and show modin progress bars
cfg.Engine.put("dask")
ProgressBar.enable()
client = Client()


def part_sort_key(path):
  """
  Sort key that orders part files by their trailing numeric index.

  A plain string sort places `..._part_10` before `..._part_2` when the index
  is not zero-padded. Paths without a numeric suffix are sorted after all
  numbered parts, by name.
  """
  suffix = os.path.splitext(path)[0].rsplit("_", 1)[-1]
  if suffix.isdecimal():
    return (0, int(suffix), path)
  return (1, 0, path)


files = fs.glob("thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_min_mem_usage_part_*.parquet", recursive=True)
if not files:
  # fail early with a clear message instead of an opaque error from `pd.concat`
  raise FileNotFoundError("no preprocessed part files matched the glob pattern")

# The split further down uses `shuffle=False` and therefore depends on row order.
# NOTE(review): assumes the part index reflects chronological order - confirm.
files = [fs_prefix + sub for sub in sorted(files, key=part_sort_key)]

columns = ['UNDERLYING_SYMBOL', 'QUOTE_DATETIME', 'SEQUENCE_NUMBER', 'ROOT',
       'EXPIRATION', 'STRK_PRC', 'OPTION_TYPE', 'TRADE_SIZE', 'TRADE_PRICE',
       'BEST_BID', 'BEST_ASK', 'order_id', 'ask_ex', 'bid_ex', 'bid_size_ex',
       'ask_size_ex', 'price_all_lead', 'price_all_lag', 'optionid',
       'day_vol', 'price_ex_lead', 'price_ex_lag', 'buy_sell']

# read the parts one by one (so that tqdm can report progress), concatenate once
dfs = [pd.read_parquet(gc_file, columns=columns) for gc_file in tqdm(files)]
df = pd.concat(dfs)

  0%|          | 0/50 [00:00<?, ?it/s]

In [11]:
# memory footprint of the whole frame, summed over all columns
bytes_per_column = df.memory_usage(deep=True)
bytes_per_column.sum()



Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

5506741766

In [12]:
# show the first rows transposed, i.e. one line per column
df.head().transpose()

Unnamed: 0,0,1,2,3,4
UNDERLYING_SYMBOL,BRCD,SYMC,SPY,ADRX,ORCL
QUOTE_DATETIME,2005-05-02 09:30:02,2005-05-02 09:30:03,2005-05-02 09:30:03,2005-05-02 09:30:03,2005-05-02 09:30:03
SEQUENCE_NUMBER,72515,65366,65373,79195,95870
ROOT,YNU,SYQ,SWG,QAX,ORQ
EXPIRATION,2006-01-21 00:00:00,2005-06-18 00:00:00,2005-05-21 00:00:00,2005-06-18 00:00:00,2005-12-17 00:00:00
STRK_PRC,2.5,15.0,105.0,25.0,14.0
OPTION_TYPE,C,C,C,C,C
TRADE_SIZE,10,10,50,10,15
TRADE_PRICE,2.05,3.9,11.2,0.2,0.25
BEST_BID,1.9,3.6,11.1,0.0,0.3


In [16]:
# check against some stats from sub panel A.1 in Grauer et al
# Comparisons use the public `numpy.testing.assert_allclose` with a purely
# absolute tolerance (`rtol=0`) instead of the private pandas testing helper.

# trade size
stats_trade_size = df['TRADE_SIZE'].agg(['mean','median','std'])

assert_allclose(stats_trade_size.values.tolist(), [13.62,4.0,77.75], rtol=0, atol=0.1)

# moneyness; price underlying / strike
# TODO: Request price for underlyings?

# time to maturity (in days); computed for inspection only, not asserted yet
stats_time_to_maturity = (df['EXPIRATION'] - df['QUOTE_DATETIME']).dt.days
stats_time_to_maturity = stats_time_to_maturity.agg(['mean','median','std'])

# no of observations
stats_n = len(df)
assert stats_n == 49203747, f"unexpected number of trades: {stats_n}"

# trade_size = quote size; TRADE_SIZE
stats_trades_with_quote_size_bid = df['bid_size_ex'].eq(df['TRADE_SIZE'])
stats_trades_with_quote_size_ask = df['ask_size_ex'].eq(df['TRADE_SIZE'])

# either ask or bid must be equal, but not both (XOR)
# TODO: mismatch Grauer et. al report 22.28 % -> 0.10956509064238543
stats_trade_with_quote_size = (stats_trades_with_quote_size_bid ^ stats_trades_with_quote_size_ask).sum() / stats_n

# share of buys (rows with buy_sell >= 0) among all trades
stats_buy_trades = df['buy_sell'].ge(0).sum() / stats_n
assert_allclose(stats_buy_trades, 0.4746, rtol=0, atol=0.01)

# underlyings per day
# stats_underlyings_per_day = df.groupby(['UNDERLYING_SYMBOL','QUOTE_DATETIME']).count().agg(['mean','median','std'])



## create subsample 🔢

In [18]:
# preview the first five rows
df.head(n=5)

Unnamed: 0,UNDERLYING_SYMBOL,QUOTE_DATETIME,SEQUENCE_NUMBER,ROOT,EXPIRATION,STRK_PRC,OPTION_TYPE,TRADE_SIZE,TRADE_PRICE,BEST_BID,...,bid_ex,bid_size_ex,ask_size_ex,price_all_lead,price_all_lag,optionid,day_vol,price_ex_lead,price_ex_lag,buy_sell
0,BRCD,2005-05-02 09:30:02,72515,YNU,2006-01-21,2.5,C,10,2.05,1.9,...,1.9,131.0,20.0,1.9,1.9,21060388.0,10.0,1.9,2.1,1
1,SYMC,2005-05-02 09:30:03,65366,SYQ,2005-06-18,15.0,C,10,3.9,3.6,...,,,,4.0,4.0,31624184.0,10.0,4.6,4.0,1
2,SPY,2005-05-02 09:30:03,65373,SWG,2005-05-21,105.0,C,50,11.2,11.1,...,11.1,300.0,300.0,11.8,11.0,31620976.0,50.0,11.9,11.0,-1
3,ADRX,2005-05-02 09:30:03,79195,QAX,2005-06-18,25.0,C,10,0.2,0.0,...,0.0,0.0,86.0,0.15,0.15,31560072.0,10.0,0.15,0.15,1
4,ORCL,2005-05-02 09:30:03,95870,ORQ,2005-12-17,14.0,C,15,0.25,0.3,...,0.25,3356.0,399.0,0.35,0.35,25240212.0,17.0,0.35,0.35,-1


In [19]:
year = 2017

# write all trades whose quote timestamp falls into `year` to their own parquet file
output_path = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_{year}.parquet"
df_sub = df[df['QUOTE_DATETIME'].dt.year == year]
df_sub.to_parquet(output_path)

# Track the file as a reference artifact. The artifact has to be logged to the
# run; constructing it and adding a reference alone does not record it.
dataset_at = wandb.Artifact(f'data_preprocessed_{year}',type="preprocessed_data")
dataset_at.add_reference(output_path)
run.log_artifact(dataset_at)

Exception: ignored

In [None]:
year = 2015

# write all trades whose quote timestamp falls into `year` to their own parquet file
output_path = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_{year}.parquet"
df_sub = df[df['QUOTE_DATETIME'].dt.year == year]
df_sub.to_parquet(output_path)

# Track the file as a reference artifact. The artifact has to be logged to the
# run; constructing it and adding a reference alone does not record it.
dataset_at = wandb.Artifact(f'data_preprocessed_{year}',type="preprocessed_data")
dataset_at.add_reference(output_path)
run.log_artifact(dataset_at)

## train-test-split ⚗️

In [20]:
# 60-20-20 split without shuffling, so the existing row order is preserved:
# first hold out the last 20 % of the rows as test set, then take the last
# quarter of the remaining 80 % (= 20 % of all rows) as validation set
no_shuffle = {"shuffle": False}
train, test = train_test_split(df, test_size=0.2, **no_shuffle)
train, val = train_test_split(train, test_size=0.25, **no_shuffle)

In [None]:
# persist the training split (plain string: the path contains no placeholders)
output_path = "gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_60.parquet"
train.to_parquet(output_path)

# Track the file as a reference artifact. The artifact has to be logged to the
# run; constructing it and adding a reference alone does not record it.
dataset_at = wandb.Artifact('train_set_60', type="preprocessed_data")
dataset_at.add_reference(output_path)
run.log_artifact(dataset_at)

In [None]:
# persist the validation split (plain string: the path contains no placeholders)
output_path = "gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_20.parquet"
val.to_parquet(output_path)

# Track the file as a reference artifact. The artifact has to be logged to the
# run; constructing it and adding a reference alone does not record it.
dataset_at = wandb.Artifact('val_set_20', type="preprocessed_data")
dataset_at.add_reference(output_path)
run.log_artifact(dataset_at)

In [None]:
# persist the test split (plain string: the path contains no placeholders)
output_path = "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_20.parquet"
test.to_parquet(output_path)

# Track the file as a reference artifact. The artifact has to be logged to the
# run; constructing it and adding a reference alone does not record it.
dataset_at = wandb.Artifact('test_set_20', type="preprocessed_data")
dataset_at.add_reference(output_path)
run.log_artifact(dataset_at)