<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/baseline/notebooks/data_preprocessing_loading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install modin
!pip install gcsfs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting modin
  Downloading modin-0.16.2-py3-none-any.whl (957 kB)
[K     |████████████████████████████████| 957 kB 4.4 MB/s 
  Downloading modin-0.16.1-py3-none-any.whl (956 kB)
[K     |████████████████████████████████| 956 kB 27.9 MB/s 
[?25h  Downloading modin-0.16.0-py3-none-any.whl (956 kB)
[K     |████████████████████████████████| 956 kB 54.9 MB/s 
[?25h  Downloading modin-0.12.1-py3-none-any.whl (761 kB)
[K     |████████████████████████████████| 761 kB 33.1 MB/s 
Installing collected packages: modin
Successfully installed modin-0.12.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gcsfs
  Downloading gcsfs-2022.10.0-py2.py3-none-any.whl (25 kB)
Installing collected packages: gcsfs
Successfully installed gcsfs-2022.10.0


In [6]:
import gcsfs
import google.auth
from google.colab import auth
# Authenticate the Colab user (interactive login).
auth.authenticate_user()

# connect to google cloud storage
# google.auth.default() yields a (credentials, project id) pair; both are
# handed to gcsfs to build the file system handle `fs`.
credentials, project_id = google.auth.default()
fs = gcsfs.GCSFileSystem(project=project_id, token=credentials)

In [4]:
import numpy as np
import modin.pandas as pd
import modin.config as cfg
from modin.config import ProgressBar
from distributed import Client
# Run modin on the Dask execution engine.
cfg.Engine.put("dask")
# Switch on modin's progress bar.
ProgressBar.enable()

from sklearn.metrics import accuracy_score


from tqdm.notebook import tqdm

In [None]:
# NOTE(review): this repeats the authentication already performed in the cell
# that builds the GCS file system — presumably a leftover; confirm whether it
# can be removed.
from google.colab import auth
auth.authenticate_user()


In [5]:
# Start a dask.distributed client with default settings.
# NOTE(review): presumably modin's Dask engine picks this client up — confirm.
client = Client()

# Location of the input data on Google Cloud Storage; judging by the file
# name, these are preprocessed ISE trades matched with quotes for 2015.
file_path = "gs://thesis-bucket-option-trade-classification/data/preprocessed/matched_ise_quotes_2015.parquet"

# Load the parquet file through modin's pandas API.
df = pd.read_parquet(file_path)

In [None]:
# Preview the first rows, transposed so that every column is shown as a row.
df.head().T

Unnamed: 0,34736684,34736685,34736686,34736687,34736688
UNDERLYING_SYMBOL,LINE,LINE,AMZN,TLT,TLT
QUOTE_DATETIME,2015-01-02 09:30:00,2015-01-02 09:30:00,2015-01-02 09:30:00,2015-01-02 09:30:00,2015-01-02 09:30:00
SEQUENCE_NUMBER,74466,67183330,8588013,7331304,7331305
ROOT,LINE,LINE,AMZN,TLT,TLT
EXPIRATION,2015-04-17 00:00:00,2015-04-17 00:00:00,2015-02-20 00:00:00,2015-01-02 00:00:00,2015-01-02 00:00:00
STRK_PRC,9.0,18.0,390.0,124.0,126.0
OPTION_TYPE,C,C,C,P,P
TRADE_SIZE,5,5,1,20,20
TRADE_PRICE,2.14,0.04,1.42,0.01,0.18
BEST_BID,1.8,0.0,0.0,0.0,0.05


In [None]:
# Sanity check: count the trades whose price equals the lead price
# (`price_ex_lead`). A count of 0 means the lead price always differs from the
# trade price.
# The counter is deliberately not named `sum`: that would shadow the builtin
# for every cell that runs afterwards.
n_price_eq_lead = (df['TRADE_PRICE'] == df['price_ex_lead']).sum()
print(n_price_eq_lead)

0


In [None]:
# Quote rule: compare the trade price with the midpoint of the ask and bid.
# Above the midpoint -> 1, below the midpoint -> -1, otherwise -> NaN.
mid = (df['ask_ex'] + df['bid_ex']) * 0.5
above_mid = df['TRADE_PRICE'] > mid
below_mid = df['TRADE_PRICE'] < mid
qr = np.where(above_mid, 1, np.where(below_mid, -1, np.nan))
df['pred_quote_rule'] = qr

In [None]:
# Count the trades executed exactly at the quote midpoint `mid`; the quote
# rule assigns no side (NaN) to these trades.
# The counter is deliberately not named `sum`: that would shadow the builtin
# for every cell that runs afterwards.
n_at_mid = (df['TRADE_PRICE'] == mid).sum()
print(n_at_mid)

632780


In [None]:
# depth rule p. 14
# Only trades at the quote midpoint are classified: a larger ask size -> 1,
# a larger bid size -> -1, everything else -> NaN.
at_mid = df['TRADE_PRICE'] == mid
ask_deeper = df['ask_size_ex'] > df['bid_size_ex']
bid_deeper = df['ask_size_ex'] < df['bid_size_ex']
dr = np.where(at_mid & ask_deeper, 1, np.where(at_mid & bid_deeper, -1, np.nan))
df['pred_depth_rule'] = dr

In [None]:
# tick rule
# FIXME: Discuss with Grauer et al. what is used in table 9: ISE trades only or all trades?
# NOTE(review): the comparison uses the *lead* price (`price_ex_lead`), while
# the cell labelled "reverse tick rule" uses a lag price — confirm that the two
# labels are not swapped.
# NOTE(review): every row where the comparison is False (which includes NaN
# prices) is assigned -1.0 — confirm this is intended.
price_above_lead = df['TRADE_PRICE'] > df['price_ex_lead']
tt = np.where(price_above_lead, 1.0, -1.0)
df['pred_tick_test'] = tt

In [None]:
# Trade size rule: when the trade size exactly matches the quoted bid or ask
# size, the quote likely came from a customer; the market maker found it
# attractive and therefore filled it completely.
# Trades whose size equals the quoted bid size are classified as customer buys
# (1.0) and trades whose size equals the quoted ask size as customer sells
# (-1.0). All remaining trades keep the prediction of the fallback rule.
# NOTE(review): if the size matches both quotes, the bid check takes
# precedence and the trade becomes a buy — confirm this is intended.

ts_eq_bid = df['bid_size_ex'] == df['TRADE_SIZE']
ts_eq_ask = df['ask_size_ex'] == df['TRADE_SIZE']

# Resolve the ask match / fallback first, then let the bid match override it.
sell_else_quote = np.where(ts_eq_ask, -1.0, qr)
sell_else_tick = np.where(ts_eq_ask, -1.0, tt)
df['pred_trade_size_quote_rule'] = np.where(ts_eq_bid, 1.0, sell_else_quote)
df['pred_trade_size_tick_test'] = np.where(ts_eq_bid, 1.0, sell_else_tick)

In [None]:
# Trade size rule with the depth rule as fallback: a bid-size match gives 1.0,
# an ask-size match gives -1.0, otherwise the depth rule prediction is kept.
sell_else_depth = np.where(ts_eq_ask, -1.0, dr)
df['pred_trade_size_depth_rule'] = np.where(ts_eq_bid, 1.0, sell_else_depth)

In [None]:
# reverse tick rule
# NOTE(review): the comparison uses a *lag* price (`price_all_lag`, "all"
# rather than "ex"), while the cell labelled "tick rule" uses the lead price —
# confirm that the two labels are not swapped.
# NOTE(review): every row where the comparison is False (which includes NaN
# prices) is assigned -1.0 — confirm this is intended.
price_above_lag = df['TRADE_PRICE'] > df['price_all_lag']
df['pred_rev_tick_test'] = np.where(price_above_lag, 1.0, -1.0)

In [None]:
# Preview the first rows again, transposed so that every column is shown as a
# row, after the prediction columns have been added.
df.head().T

Unnamed: 0,34736684,34736685,34736686,34736687,34736688
UNDERLYING_SYMBOL,LINE,LINE,AMZN,TLT,TLT
QUOTE_DATETIME,2015-01-02 09:30:00,2015-01-02 09:30:00,2015-01-02 09:30:00,2015-01-02 09:30:00,2015-01-02 09:30:00
SEQUENCE_NUMBER,74466,67183330,8588013,7331304,7331305
ROOT,LINE,LINE,AMZN,TLT,TLT
EXPIRATION,2015-04-17 00:00:00,2015-04-17 00:00:00,2015-02-20 00:00:00,2015-01-02 00:00:00,2015-01-02 00:00:00
STRK_PRC,9.0,18.0,390.0,124.0,126.0
OPTION_TYPE,C,C,C,P,P
TRADE_SIZE,5,5,1,20,20
TRADE_PRICE,2.14,0.04,1.42,0.01,0.18
BEST_BID,1.8,0.0,0.0,0.0,0.05


In [None]:
# Discuss with C. Grauer how results were calculated in table 9. Assign randomly (table 3) (?)
# Trades the quote rule leaves unclassified (NaN) are assigned -1 or 1 at
# random so that accuracy can be computed over all trades.
# A single column is a Series, so the element-wise method is `map` (the same
# call the depth-rule cells use); `applymap` is a DataFrame method.
# NOTE(review): no random seed is set in the visible notebook, so the printed
# accuracy may vary slightly between runs.
pred_quote_rule_random = df['pred_quote_rule'].map(
    lambda side: side if not np.isnan(side) else np.random.choice([-1, 1])
)

acc = accuracy_score(df['buy_sell'], pred_quote_rule_random)
print(f"{acc:.3%}")

56.073%


In [None]:
# Discuss with C. Grauer how results were calculated in table 9. Assign randomly (table 3) (?)
# Trades that the trade size rule with quote-rule fallback leaves unclassified
# (NaN) are assigned -1 or 1 at random so that accuracy can be computed over
# all trades.
# A single column is a Series, so the element-wise method is `map` (the same
# call the depth-rule cells use); `applymap` is a DataFrame method.
# NOTE(review): no random seed is set in the visible notebook, so the printed
# accuracy may vary slightly between runs.
pred_trade_size_quote_rule_random = df['pred_trade_size_quote_rule'].map(
    lambda side: side if not np.isnan(side) else np.random.choice([-1, 1])
)

acc = accuracy_score(df['buy_sell'], pred_trade_size_quote_rule_random)
print(f"{acc:.3%}")

65.387%


In [None]:
# Discuss with C. Grauer how results were calculated in table 9. Assign randomly (table 3) (?)
pred_depth_rule_random = df['pred_depth_rule'].map(lambda l: l if not np.isnan(l) else np.random.choice([-1, 1]))

acc = accuracy_score(df['buy_sell'], pred_depth_rule_random)
print(f"{acc:.3%}")

51.728%


In [None]:
# Discuss with C. Grauer how results were calculated in table 9. Assign randomly (table 3) (?)
pred_trade_size_depth_rule_random = df['pred_trade_size_depth_rule'].map(lambda l: l if not np.isnan(l) else np.random.choice([-1, 1]))

acc = accuracy_score(df['buy_sell'], pred_trade_size_depth_rule_random)
print(f"{acc:.3%}")

56.629%


In [None]:
# Accuracy of the trade size rule with tick-test fallback, scored against the
# `buy_sell` column and printed as a percentage.
acc = accuracy_score(y_true=df['buy_sell'], y_pred=df['pred_trade_size_tick_test'])
print(f"{acc:.3%}")

54.617%


In [None]:
# Accuracy of the `pred_tick_test` column, scored against the `buy_sell`
# column and printed as a percentage.
acc = accuracy_score(y_true=df['buy_sell'], y_pred=df['pred_tick_test'])
print(f"{acc:.3%}")

49.181%


In [None]:
# Accuracy of the `pred_rev_tick_test` column, scored against the `buy_sell`
# column and printed as a percentage.
acc = accuracy_score(y_true=df['buy_sell'], y_pred=df['pred_rev_tick_test'])
print(f"{acc:.3%}")

53.864%
