<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/baseline/notebooks/data_preprocessing_loading_splitting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install the google cloud storage and weights & biases clients imported below
# NOTE(review): versions are unpinned -- consider pinning them for reproducibility
!pip install gcsfs
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
import random

import gcsfs
import google.auth
from google.colab import auth

from numpy.testing import assert_almost_equal
from pandas._testing.asserters import assert_almost_equal
import pandas as pd

from tqdm.notebook import tqdm

import wandb


In [3]:
# open a weights & biases run that tracks the creation of the data sets
run = wandb.init(
    entity="fbv",
    project="thesis",
    job_type="dataset-creation",
)

# artifact collecting references to the parquet files written further below
dataset = wandb.Artifact(type="preprocessed_data", name='train_val_test')

[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# authenticate the colab user and open a file system handle on google cloud storage
auth.authenticate_user()
credentials, _ = google.auth.default()
fs = gcsfs.GCSFileSystem(
    token=credentials,
    project="thesis",
)

# protocol that is prepended to bucket paths
fs_prefix = "gs://"

In [5]:
# set fixed seed
def seed_everything(seed):
  """
  Seed the random number generators for reproducibility of results.

  Sets ``PYTHONHASHSEED`` and seeds both Python's ``random`` module and
  numpy's global generator (pandas falls back to the latter, e.g. in
  ``DataFrame.sample`` without ``random_state``).

  Args:
    seed (int): seed value; numpy requires an integer in [0, 2**32 - 1].
  """
  # local import: the notebook itself only imports from numpy.testing
  import numpy as np

  # only picked up by interpreters started afterwards (e.g. subprocesses);
  # hash randomisation of the running kernel is fixed at start-up
  os.environ["PYTHONHASHSEED"] = str(seed)
  random.seed(seed)
  np.random.seed(seed)

seed_everything(42)

In [6]:
# locate all preprocessed partitions in the bucket and prepend the protocol,
# so that pandas can resolve the paths
files = fs.glob("thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_min_mem_usage_part_*.parquet", recursive=True)
files = [fs_prefix + sub for sub in files]

# columns to load from every partition
columns = ['UNDERLYING_SYMBOL', 'QUOTE_DATETIME', 'SEQUENCE_NUMBER', 'ROOT',
       'EXPIRATION', 'STRK_PRC', 'OPTION_TYPE', 'TRADE_SIZE', 'TRADE_PRICE',
       'BEST_BID', 'BEST_ASK', 'order_id', 'ask_ex', 'bid_ex', 'bid_size_ex',
       'ask_size_ex', 'price_all_lead', 'price_all_lag', 'optionid',
       'day_vol', 'price_ex_lead', 'price_ex_lag', 'buy_sell']

# read partition by partition and combine into a single frame
frames = [pd.read_parquet(gc_file, columns=columns) for gc_file in tqdm(files)]
df = pd.concat(frames)

# release the per-partition frames: concat copied their data, and keeping the
# list referenced would hold the whole data set in memory a second time
del frames

  0%|          | 0/50 [00:00<?, ?it/s]

In [7]:
# total memory footprint of the combined frame in bytes, summed over all columns
df.memory_usage(deep=True).sum()

10742886851

In [8]:
# transpose the first rows, so that every column is listed as a row
df.head().T

Unnamed: 0,0,1,2,3,4
UNDERLYING_SYMBOL,BRCD,SYMC,SPY,ADRX,ORCL
QUOTE_DATETIME,2005-05-02 09:30:02,2005-05-02 09:30:03,2005-05-02 09:30:03,2005-05-02 09:30:03,2005-05-02 09:30:03
SEQUENCE_NUMBER,72515,65366,65373,79195,95870
ROOT,YNU,SYQ,SWG,QAX,ORQ
EXPIRATION,2006-01-21 00:00:00,2005-06-18 00:00:00,2005-05-21 00:00:00,2005-06-18 00:00:00,2005-12-17 00:00:00
STRK_PRC,2.5,15.0,105.0,25.0,14.0
OPTION_TYPE,C,C,C,C,C
TRADE_SIZE,10,10,50,10,15
TRADE_PRICE,2.05,3.9,11.2,0.2,0.25
BEST_BID,1.9,3.6,11.1,0.0,0.3


In [9]:
# compare selected summary statistics with sub panel A.1 in Grauer et al
# NOTE: `assert_almost_equal` resolves to the pandas helper (it is imported
# after the numpy function of the same name), which is why `atol` can be used

# trade size
stats_trade_size = df['TRADE_SIZE'].agg(['mean', 'median', 'std'])

assert_almost_equal(
    stats_trade_size.values.tolist(),
    [13.62, 4.0, 77.75],
    atol=0.1,
)

# moneyness; price underlying / strike
# TODO: Request price for underlyings?

# time to maturity in days (computed, but not yet checked against the paper)
stats_time_to_maturity = (
    (df['EXPIRATION'] - df['QUOTE_DATETIME'])
    .dt.days
    .agg(['mean', 'median', 'std'])
)

# no of observations
stats_n = len(df)
assert stats_n == 49203747

# trade_size = quote size; TRADE_SIZE
stats_trades_with_quote_size_bid = df['bid_size_ex'] == df['TRADE_SIZE']
stats_trades_with_quote_size_ask = df['ask_size_ex'] == df['TRADE_SIZE']

# either ask or bid must be equal, but not both (XOR)
# TODO: mismatch Grauer et. al report 22.28 % -> 0.10956509064238543
stats_trade_with_quote_size = (
    stats_trades_with_quote_size_bid ^ stats_trades_with_quote_size_ask
).sum() / stats_n

# no of buys
stats_buy_trades = (df['buy_sell'] >= 0).sum() / stats_n
assert_almost_equal(stats_buy_trades, 0.4746, atol=0.01)

# underlyings per day
# stats_underlyings_per_day = df.groupby(['UNDERLYING_SYMBOL','QUOTE_DATETIME']).count().agg(['mean','median','std'])

## create subsample 🔢

In [10]:
# store all trades quoted in the given year as a separate parquet file
year = 2017

output_path = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_{year}.parquet"
is_in_year = df['QUOTE_DATETIME'].dt.year.eq(year)
df_sub = df.loc[is_in_year]
df_sub.to_parquet(output_path)

# register the written file with the artifact
dataset.add_reference(output_path, name=f'data_preprocessed_{year}')

[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_2017.parquet/data_preprocessed_2017>]

In [11]:
# store all trades quoted in the given year as a separate parquet file
year = 2015

output_path = f"gs://thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_{year}.parquet"
is_in_year = df['QUOTE_DATETIME'].dt.year.eq(year)
df_sub = df.loc[is_in_year]
df_sub.to_parquet(output_path)

# register the written file with the artifact
dataset.add_reference(output_path, name=f'data_preprocessed_{year}')

[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_2015.parquet/data_preprocessed_2015>]

## train-test-split ⚗️

In [12]:
# training set: all trades from 2005-05-02 up to and including 2013-10-24.
# The interval is half-open on whole days (start <= t < end), so that no
# timestamp close to midnight can fall between two neighbouring splits.
train = df[(df.QUOTE_DATETIME >= "2005-05-02") & (df.QUOTE_DATETIME < "2013-10-25")]

len_train = len(train)
print(f"train ratio: {len_train / len(df)}")

train ratio: 0.5997575753732739


In [13]:
# validation set: all trades from 2013-10-25 up to and including 2015-11-05.
# The interval is half-open on whole days (start <= t < end), so that no
# timestamp close to midnight can fall between two neighbouring splits.
val = df[(df.QUOTE_DATETIME >= "2013-10-25") & (df.QUOTE_DATETIME < "2015-11-06")]

len_val = len(val)
print(f"val ratio: {len_val / len(df)}")

train ratio: 0.1998191519845023


In [14]:
# test set: all trades from 2015-11-06 up to and including 2017-05-31.
# The interval is half-open on whole days (start <= t < end), so that no
# timestamp close to midnight can fall between two neighbouring splits.
test = df[(df.QUOTE_DATETIME >= "2015-11-06") & (df.QUOTE_DATETIME < "2017-06-01")]

len_test = len(test)
print(f"test ratio: {len_test / len(df)}")


train ratio: 0.20042327264222376


In [15]:
# the row counts of the three splits must add up to the size of the full data set
assert len(df) == len_train + len_val + len_test

In [16]:
# display the training set (the rendered output is truncated to the first and last rows)
train

Unnamed: 0,UNDERLYING_SYMBOL,QUOTE_DATETIME,SEQUENCE_NUMBER,ROOT,EXPIRATION,STRK_PRC,OPTION_TYPE,TRADE_SIZE,TRADE_PRICE,BEST_BID,...,bid_ex,bid_size_ex,ask_size_ex,price_all_lead,price_all_lag,optionid,day_vol,price_ex_lead,price_ex_lag,buy_sell
0,BRCD,2005-05-02 09:30:02,72515,YNU,2006-01-21,2.5,C,10,2.05,1.90,...,1.90,131.0,20.0,1.90,1.90,21060388.0,10.0,1.90,2.10,1
1,SYMC,2005-05-02 09:30:03,65366,SYQ,2005-06-18,15.0,C,10,3.90,3.60,...,,,,4.00,4.00,31624184.0,10.0,4.60,4.00,1
2,SPY,2005-05-02 09:30:03,65373,SWG,2005-05-21,105.0,C,50,11.20,11.10,...,11.10,300.0,300.0,11.80,11.00,31620976.0,50.0,11.90,11.00,-1
3,ADRX,2005-05-02 09:30:03,79195,QAX,2005-06-18,25.0,C,10,0.20,0.00,...,0.00,0.0,86.0,0.15,0.15,31560072.0,10.0,0.15,0.15,1
4,ORCL,2005-05-02 09:30:03,95870,ORQ,2005-12-17,14.0,C,15,0.25,0.30,...,0.25,3356.0,399.0,0.35,0.35,25240212.0,17.0,0.35,0.35,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29510315,IWM,2013-10-24 16:13:41,2761533800,IWM,2013-11-08,106.0,P,43,0.15,0.15,...,0.15,43.0,26.0,0.16,0.16,101818080.0,1197.0,0.16,0.16,-1
29510316,XLE,2013-10-24 16:13:52,1900753200,XLE,2014-01-18,77.0,P,1,0.60,0.56,...,0.56,10.0,20.0,0.42,0.72,100555800.0,1.0,0.47,0.72,1
29510317,XLE,2013-10-24 16:13:52,1900753201,XLE,2014-01-18,71.0,P,1,0.23,0.21,...,0.21,340.0,528.0,0.19,0.30,80840608.0,1.0,0.20,0.30,-1
29510318,IWM,2013-10-24 16:14:33,2764251401,IWM,2013-11-08,106.0,P,31,0.16,0.16,...,0.16,31.0,21.0,0.15,0.15,101818080.0,1197.0,0.15,0.15,-1


In [17]:
# display the validation set (the rendered output is truncated to the first and last rows)
val

Unnamed: 0,UNDERLYING_SYMBOL,QUOTE_DATETIME,SEQUENCE_NUMBER,ROOT,EXPIRATION,STRK_PRC,OPTION_TYPE,TRADE_SIZE,TRADE_PRICE,BEST_BID,...,bid_ex,bid_size_ex,ask_size_ex,price_all_lead,price_all_lag,optionid,day_vol,price_ex_lead,price_ex_lag,buy_sell
29510320,IWM,2013-10-25 09:30:00,1901202,IWM,2014-03-31,97.0,P,20,1.47,1.38,...,,,,1.60,1.62,100723144.0,20.0,1.12,2.73,-1
29510321,IWM,2013-10-25 09:30:00,1901203,IWM,2014-09-30,105.0,P,20,6.27,5.85,...,5.85,115.0,11.0,6.32,7.69,101786576.0,20.0,5.92,10.29,1
29510322,UNG,2013-10-25 09:30:00,2497905,UNG,2013-12-21,18.0,C,2,1.32,1.19,...,1.19,82.0,82.0,1.30,1.25,101943840.0,2.0,1.02,1.19,1
29510323,VXX,2013-10-25 09:30:00,2693702,VXX,2013-10-25,14.5,P,20,1.66,1.62,...,1.62,99.0,172.0,1.62,1.60,101658624.0,82.0,1.62,1.60,1
29510324,GRPN,2013-10-25 09:30:00,2310303,GRPN,2013-10-25,10.5,P,1,0.85,0.00,...,,,,0.50,0.86,101849752.0,4.0,0.65,0.86,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39342166,UVXY,2015-11-05 16:14:37,50723741,UVXY,2015-11-06,27.0,C,1,0.88,0.86,...,0.86,12.0,21.0,0.70,0.90,109295920.0,4.0,0.46,1.88,-1
39342167,QQQ,2015-11-05 16:14:40,114141336,QQQ,2015-12-18,109.0,C,2,6.59,6.51,...,6.51,1078.0,1077.0,6.58,6.42,105521016.0,18.0,6.64,6.42,-1
39342168,SPY,2015-11-05 16:14:53,31158048,SPY,2015-12-24,220.0,P,1,11.42,11.08,...,11.08,10.0,10.0,12.63,,109720304.0,2.0,11.41,11.41,1
39342169,SPY,2015-11-05 16:14:53,98266912,SPY,2015-12-24,220.0,P,1,11.41,11.08,...,11.08,10.0,10.0,11.42,11.42,109720304.0,2.0,12.63,,1


In [18]:
# display the test set (the rendered output is truncated to the first and last rows)
test

Unnamed: 0,UNDERLYING_SYMBOL,QUOTE_DATETIME,SEQUENCE_NUMBER,ROOT,EXPIRATION,STRK_PRC,OPTION_TYPE,TRADE_SIZE,TRADE_PRICE,BEST_BID,...,bid_ex,bid_size_ex,ask_size_ex,price_all_lead,price_all_lag,optionid,day_vol,price_ex_lead,price_ex_lag,buy_sell
39342171,EWZ,2015-11-06 09:30:00,234098,EWZ,2015-12-18,22.0,P,2,0.520000,0.520000,...,0.520000,31.0,11.0,0.57,0.460000,107453656.0,3.0,0.59,0.45,-1
39342172,TSLA,2015-11-06 09:30:00,134917,TSLA,2015-11-27,230.0,C,1,7.820000,7.600000,...,7.600000,1.0,1.0,8.16,8.500000,109398624.0,1.0,4.97,8.37,-1
39342173,TSLA,2015-11-06 09:30:00,135092,TSLA,2017-01-20,260.0,C,1,28.889999,28.799999,...,28.799999,1.0,1.0,30.23,30.799999,105940216.0,3.0,29.90,29.00,-1
39342174,VB,2015-11-06 09:30:00,188332,VB,2015-11-20,115.0,C,1,2.250000,1.850000,...,1.850000,10.0,10.0,0.64,1.480000,109232832.0,1.0,,,1
39342175,VB,2015-11-06 09:30:00,188332,VB,2015-12-18,117.0,C,1,1.700000,1.700000,...,1.700000,10.0,10.0,0.65,2.200000,107538000.0,1.0,,,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49203742,SVXY,2017-05-31 16:12:19,225274471,SVXY,2017-06-02,152.0,C,1,3.540000,2.720000,...,2.720000,20.0,10.0,4.75,3.000000,115528016.0,1.0,5.24,3.85,-1
49203743,SPY,2017-05-31 16:12:45,190004593,SPY,2017-12-15,236.0,P,6,7.270000,7.220000,...,7.190000,753.0,1250.0,7.20,7.610000,113308776.0,6.0,5.37,9.01,1
49203744,^NDX,2017-05-31 16:13:39,35914334,NDX,2017-06-02,5690.0,P,12,1.000000,0.400000,...,0.000000,0.0,13.0,0.47,1.400000,115919712.0,83.0,0.47,1.40,-1
49203745,DIA,2017-05-31 16:14:02,72897705,DIA,2017-06-02,212.0,C,2,0.050000,0.030000,...,0.000000,0.0,22.0,0.03,0.010000,115563008.0,2.0,0.06,0.14,1


In [19]:
# write the training set to the bucket and register the file with the artifact
output_path = "gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_60.parquet"
train.to_parquet(output_path)

dataset.add_reference(output_path, name='train_set_60')

[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_60.parquet/train_set_60>]

In [20]:
# write the validation set to the bucket and register the file with the artifact
output_path = "gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_20.parquet"
val.to_parquet(output_path)

dataset.add_reference(output_path, name='val_set_20')

[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_20.parquet/val_set_20>]

In [21]:
# write the test set to the bucket and register the file with the artifact
output_path = "gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_20.parquet"
test.to_parquet(output_path)

dataset.add_reference(output_path, name='test_set_20')

[<ManifestEntry ref: gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_20.parquet/test_set_20>]

In [22]:
# Log the artifact to save it as an output of this run
# (it holds the references added via `dataset.add_reference` above)
run.log_artifact(dataset)

# close the weights & biases run
wandb.finish()