<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/baseline/notebooks/data_preprocessing_loading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install modin into the notebook runtime. The recorded output of this cell shows
# pip settling on modin 0.12.1 after trying several newer releases.
# NOTE(review): the version is not pinned, so a fresh run may resolve differently.
!pip install modin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting modin
  Downloading modin-0.16.2-py3-none-any.whl (957 kB)
[K     |████████████████████████████████| 957 kB 4.5 MB/s 
  Downloading modin-0.16.1-py3-none-any.whl (956 kB)
[K     |████████████████████████████████| 956 kB 60.8 MB/s 
[?25h  Downloading modin-0.16.0-py3-none-any.whl (956 kB)
[K     |████████████████████████████████| 956 kB 59.6 MB/s 
[?25h  Downloading modin-0.12.1-py3-none-any.whl (761 kB)
[K     |████████████████████████████████| 761 kB 68.9 MB/s 
Installing collected packages: modin
Successfully installed modin-0.12.1


In [2]:
# use gcs fuse to access google cloud storage
# https://stackoverflow.com/a/60450255/5755604
# register the gcsfuse apt repository and its signing key, then install the package
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

# mount google cloud storage as a drive under ./gcs
# NOTE(review): `mkdir gcs` presumably reports an error when the cell is re-run
# and the directory already exists -- confirm whether that matters here.
!mkdir gcs
!gcsfuse thesis-bucket-option-trade-classification gcs

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  2537  100  2537    0     0  50740      0 --:--:-- --:--:-- --:--:-- 50740
OK
29 packages can be upgraded. Run 'apt list --upgradable' to see them.
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following NEW packages will be installed:
  gcsfuse
0 upgraded, 1 newly installed, 0 to remove and 29 not upgraded.
Need to get 13.3 MB of archives.
After this operation, 30.7 MB of additional disk space will be used.
Selecting previously unselected package gcsfuse.
(Reading database ... 123942 files and directories currently installed.)
Preparing to unpack .../gcsfuse_0.41.8_amd64.deb ...
Unpacking gcsfuse (0.41.8) ...
Setting up gcsfuse (0.41.8) ...
2022/10/29 14:1

In [3]:
import numpy as np
import modin.pandas as pd
import modin.config as cfg
from modin.config import ProgressBar
from distributed import Client
# select dask as the modin execution engine (done before any dataframe is created)
cfg.Engine.put("dask")
# turn on modin's progress bar
ProgressBar.enable()

from sklearn.metrics import accuracy_score


from tqdm.notebook import tqdm

In [4]:
# authenticate the current Colab user
# NOTE(review): presumably required for access to the cloud storage bucket; the
# gcsfuse mount is issued in an earlier cell -- confirm the intended order.
from google.colab import auth
auth.authenticate_user()


In [5]:
# start a dask distributed client with default arguments
client = Client()

# parquet file, addressed through the ./gcs directory
file_path = "./gcs/data/preprocessed/matched_ise_quotes_2015.parquet"

# read the parquet file into a modin dataframe
df = pd.read_parquet(file_path)

In [6]:
# preview the first rows, transposed so that every column is displayed as a row
df.head().T

Unnamed: 0,34736684,34736685,34736686,34736687,34736688
UNDERLYING_SYMBOL,LINE,LINE,AMZN,TLT,TLT
QUOTE_DATETIME,2015-01-02 09:30:00,2015-01-02 09:30:00,2015-01-02 09:30:00,2015-01-02 09:30:00,2015-01-02 09:30:00
SEQUENCE_NUMBER,74466,67183330,8588013,7331304,7331305
ROOT,LINE,LINE,AMZN,TLT,TLT
EXPIRATION,2015-04-17 00:00:00,2015-04-17 00:00:00,2015-02-20 00:00:00,2015-01-02 00:00:00,2015-01-02 00:00:00
STRK_PRC,9.0,18.0,390.0,124.0,126.0
OPTION_TYPE,C,C,C,P,P
TRADE_SIZE,5,5,1,20,20
TRADE_PRICE,2.14,0.04,1.42,0.01,0.18
BEST_BID,1.8,0.0,0.0,0.0,0.05


In [20]:
# Sanity check: count the trades whose price equals the lead price.
# The count is stored under a descriptive name instead of `sum`, because
# rebinding `sum` would shadow the builtin for the rest of the kernel session.
n_price_eq_lead = (df['TRADE_PRICE'] == df['price_ex_lead']).sum()
print(n_price_eq_lead)

0


In [8]:
# quote rule: compare the trade price with the midpoint of ask and bid
# 1 -> trade price above the midpoint
# -1 -> trade price below the midpoint
# NaN -> neither of the two (e.g. trade exactly at the midpoint)
mid = (df['ask_ex'] + df['bid_ex']) * 0.5
above_mid = df['TRADE_PRICE'] > mid
below_mid = df['TRADE_PRICE'] < mid
qr = np.where(above_mid, 1, np.where(below_mid, -1, np.nan))
df['pred_quote_rule'] = qr

In [9]:
# Count the trades executed exactly at the quote midpoint.
# A descriptive name is used instead of `sum`, because rebinding `sum` would
# shadow the builtin for the rest of the kernel session.
n_at_mid = (df['TRADE_PRICE'] == mid).sum()
print(n_at_mid)

632780


In [10]:
# tick rule
# FIXME: Discuss with Grauer et al. what is used in table 9: ISE or all?
# NOTE(review): the comparison uses `price_ex_lead`, while the reverse tick rule
# uses `price_all_lag` -- confirm that lead/lag and ex/all are assigned as intended.
# Trades that are not strictly above the reference price are labelled -1.0.
is_above_lead = df['TRADE_PRICE'] > df['price_ex_lead']
tt = np.where(is_above_lead, 1.0, -1.0)
df['pred_tick_test'] = tt

In [11]:
# main idea of our new “trade size rule” is that when the trade size matches exactly either the bid or ask quote size, it is likely that the quote came from a customer, the market maker found it attractive and, therefore, decided to fill it completely.
# we propose to classify trades for which the trade size is equal to the quoted bid size as customer buys and those with a trade size equal to the ask size as customer sells.

# masks: trade size equals the quoted bid size / the quoted ask size
ts_eq_bid = df['TRADE_SIZE'] == df['bid_size_ex']
ts_eq_ask = df['TRADE_SIZE'] == df['ask_size_ex']


def _trade_size_rule(fallback):
    """Return 1.0 where the size matches the bid, -1.0 where it matches the ask, else `fallback`.

    The bid match takes precedence when both masks are true.
    """
    return np.where(ts_eq_bid, 1.0, np.where(ts_eq_ask, -1.0, fallback))


# trade size rule, falling back to the quote rule and to the tick test respectively
df['pred_trade_size_quote_rule'] = _trade_size_rule(qr)
df['pred_trade_size_tick_test'] = _trade_size_rule(tt)

In [12]:
# reverse tick rule: 1.0 when the trade price is strictly above `price_all_lag`, otherwise -1.0
is_above_lag = df['TRADE_PRICE'] > df['price_all_lag']
df['pred_rev_tick_test'] = np.where(is_above_lag, 1.0, -1.0)

In [13]:
# preview the first rows again (transposed) after the prediction columns were added
df.head().T

Unnamed: 0,34736684,34736685,34736686,34736687,34736688
UNDERLYING_SYMBOL,LINE,LINE,AMZN,TLT,TLT
QUOTE_DATETIME,2015-01-02 09:30:00,2015-01-02 09:30:00,2015-01-02 09:30:00,2015-01-02 09:30:00,2015-01-02 09:30:00
SEQUENCE_NUMBER,74466,67183330,8588013,7331304,7331305
ROOT,LINE,LINE,AMZN,TLT,TLT
EXPIRATION,2015-04-17 00:00:00,2015-04-17 00:00:00,2015-02-20 00:00:00,2015-01-02 00:00:00,2015-01-02 00:00:00
STRK_PRC,9.0,18.0,390.0,124.0,126.0
OPTION_TYPE,C,C,C,P,P
TRADE_SIZE,5,5,1,20,20
TRADE_PRICE,2.14,0.04,1.42,0.01,0.18
BEST_BID,1.8,0.0,0.0,0.0,0.05


In [14]:
# Discuss with C. Grauer how results were calculated in table 9. Assign randomly (table 3) (?)
# Trades the quote rule leaves unclassified (NaN) are assigned a random side.
# The fill is vectorised and drawn from a seeded generator so that the reported
# accuracy is reproducible; it does not rely on `Series.applymap`, which is not
# part of the pandas Series API, nor on the unseeded global random state.
rng = np.random.default_rng(42)
quote_rule = df['pred_quote_rule'].to_numpy()
random_side = rng.choice([-1.0, 1.0], size=len(quote_rule))
# numpy array, positionally aligned with the rows of `df`
pred_quote_rule_random = np.where(np.isnan(quote_rule), random_side, quote_rule)

acc = accuracy_score(df['buy_sell'], pred_quote_rule_random)
print(f"{acc:.3%}")

56.073%


In [15]:
# Discuss with C. Grauer how results were calculated in table 9. Assign randomly (table 3) (?)
# Trades that remain unclassified (NaN) after the trade size rule with quote rule
# fallback are assigned a random side. The fill is vectorised and drawn from a
# seeded generator so that the reported accuracy is reproducible; it does not
# rely on `Series.applymap`, which is not part of the pandas Series API, nor on
# the unseeded global random state.
rng = np.random.default_rng(42)
trade_size_quote_rule = df['pred_trade_size_quote_rule'].to_numpy()
random_side = rng.choice([-1.0, 1.0], size=len(trade_size_quote_rule))
# numpy array, positionally aligned with the rows of `df`
pred_trade_size_quote_rule_random = np.where(
    np.isnan(trade_size_quote_rule), random_side, trade_size_quote_rule
)

acc = accuracy_score(df['buy_sell'], pred_trade_size_quote_rule_random)
print(f"{acc:.3%}")

65.387%


In [16]:
# accuracy of the trade size rule with tick test fallback against the `buy_sell` labels
acc = accuracy_score(
    y_true=df['buy_sell'],
    y_pred=df['pred_trade_size_tick_test'],
)
print(f"{acc:.3%}")

54.617%


In [17]:
# accuracy of the tick test against the `buy_sell` labels
acc = accuracy_score(
    y_true=df['buy_sell'],
    y_pred=df['pred_tick_test'],
)
print(f"{acc:.3%}")

49.181%


In [18]:
# accuracy of the reverse tick test against the `buy_sell` labels
acc = accuracy_score(
    y_true=df['buy_sell'],
    y_pred=df['pred_rev_tick_test'],
)
print(f"{acc:.3%}")

53.864%
