Run `pip install .` first to install all dependencies.

In [1]:
!pip install gcsfs==2022.10.0
!pip install google-auth==2.15.0
!pip install psutil==5.9.4
!pip install wandb
!pip install fastparquet
!pip install numpy
!pip install pandas
!pip install catboost
!pip install scipy

Defaulting to user installation because normal site-packages is not writeable
Collecting gcsfs==2022.10.0
  Using cached gcsfs-2022.10.0-py2.py3-none-any.whl (25 kB)
Collecting google-auth-oauthlib
  Using cached google_auth_oauthlib-0.8.0-py2.py3-none-any.whl (19 kB)
Collecting fsspec==2022.10.0
  Using cached fsspec-2022.10.0-py3-none-any.whl (138 kB)
Collecting google-auth>=1.2
  Using cached google_auth-2.15.0-py2.py3-none-any.whl (177 kB)
Collecting google-cloud-storage
  Using cached google_cloud_storage-2.7.0-py2.py3-none-any.whl (110 kB)
Collecting pyasn1-modules>=0.2.1
  Using cached pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)
Collecting cachetools<6.0,>=2.0.0
  Using cached cachetools-5.2.0-py3-none-any.whl (9.3 kB)
Collecting rsa<5,>=3.1.4
  Using cached rsa-4.9-py3-none-any.whl (34 kB)
Collecting requests-oauthlib>=0.7.0
  Using cached requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5
  U

In [2]:
import os

from catboost import CatBoostClassifier, Pool
import numpy as np

import gcsfs
import numpy as np
import pandas as pd

import wandb

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.exceptions import NotFittedError

from scipy import stats

In [3]:
# connect to weights and biases
run = wandb.init(project="thesis", job_type="dataset-creation", entity="fbv")


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# import google.auth
# from google.colab import auth
# # connect to google cloud storage
# auth.authenticate_user()
# credentials, _ = google.auth.default()
fs = gcsfs.GCSFileSystem(project="thesis")
#fs = gcsfs.GCSFileSystem(project="thesis", credentials=credentials)



In [5]:
# reduce number of imported cols due to memory issues
columns = [
    "QUOTE_DATETIME",
    "ROOT",
    "EXPIRATION",
    "STRK_PRC",
    "OPTION_TYPE",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    "price_all_lead",
    "price_all_lag",
    "price_ex_lead",
    "price_ex_lag",
    "buy_sell",
]


In [6]:
train = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_extended_60.parquet",
    engine="fastparquet",
    columns=columns,
)
val = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_extended_20.parquet",
    engine="fastparquet",
    columns=columns,
)



## Box Cox Test

In [25]:
train.min()

  train.min()


QUOTE_DATETIME    2005-05-02 09:30:02
ROOT                                A
EXPIRATION        2005-05-21 00:00:00
STRK_PRC                          0.5
TRADE_SIZE                          1
TRADE_PRICE                      0.01
BEST_BID                          0.0
BEST_ASK                          0.0
ask_ex                            0.0
bid_ex                            0.0
bid_size_ex                       0.0
ask_size_ex                       0.0
price_all_lead                   0.01
price_all_lag                    0.01
price_ex_lead                    0.01
price_ex_lag                     0.01
buy_sell                           -1
dtype: object

In [19]:
features = ["STRK_PRC",
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_BID",
    "BEST_ASK",
    "ask_ex",
    "bid_ex",
    "bid_size_ex",
    "ask_size_ex",
    "price_all_lead",
    "price_all_lag",
    "price_ex_lead",
    "price_ex_lag"]

results = []

for feature in features:
    # By default, the scipy implementation of Box-Cox transform finds the lambda # parameter 
    # that will make the output the closest to a normal distribution
    
    # Const ensures all features are strictly positive. See Zheng & Casari p. 25
    # Convert to 64 bit float due to overflow warning
    vals = np.array(train[[feature]].values.flatten() + 1, dtype=np.float64)
    rc_bc, bc_params = stats.boxcox(vals)
    print([feature, bc_params])
    results.append([feature, bc_params])

['STRK_PRC', -0.12641371428231268]
['TRADE_SIZE', -0.3797862007138544]
['TRADE_PRICE', -0.4469276853869847]
['BEST_BID', -0.442183050684345]
['BEST_ASK', -0.44226354895096626]
['ask_ex', 8.472135811722177]
['bid_ex', 8.472135811722177]
['bid_size_ex', 8.472135811722177]
['ask_size_ex', 8.472135811722177]
['price_all_lead', 8.472135811722177]
['price_all_lag', 8.472135811722177]
['price_ex_lead', 8.472135811722177]
['price_ex_lag', 8.472135811722177]


In [20]:
results = pd.DataFrame(results)
results

Unnamed: 0,0,1
0,STRK_PRC,-0.126414
1,TRADE_SIZE,-0.379786
2,TRADE_PRICE,-0.446928
3,BEST_BID,-0.442183
4,BEST_ASK,-0.442264
5,ask_ex,8.472136
6,bid_ex,8.472136
7,bid_size_ex,8.472136
8,ask_size_ex,8.472136
9,price_all_lead,8.472136


## Prepare dataset

In [21]:
scaler = MinMaxScaler(feature_range=[-1, 1])

def transform(data: pd.DataFrame) -> pd.DataFrame:

    # # date features
    x = pd.DataFrame(data={"TRADE_PRICE": data["TRADE_PRICE"]}, index=data.index)

    # TODO: speak with caroline
    # x["bid_ex"].replace({0.0:np.NaN}, inplace=True)
    # x["ask_ex"].replace({0.0:np.NaN}, inplace=True)

    # size features
    x["bid_ask_size_ratio_ex"] = data["bid_size_ex"] / data["ask_size_ex"]
    x["rel_bid_size_ex"] = data["TRADE_SIZE"] / data["bid_size_ex"]
    x["rel_ask_size_ex"] = data["TRADE_SIZE"] / data["ask_size_ex"]
    x["depth_ex"] = data["bid_size_ex"] - data["ask_size_ex"]
    
    # classical
    mid_ex = 0.5 * (data["ask_ex"] + data["bid_ex"])
    mid_best = 0.5 * (data["BEST_ASK"] + data["BEST_BID"])
    spread_ex = (data["ask_ex"] - data["bid_ex"])
    spread_best = (data["BEST_ASK"] - data["BEST_BID"])
    
    x["prox_ex"] = (data["TRADE_PRICE"] - mid_ex) / (0.5 * spread_ex) 
    x["prox_best"] = (data["TRADE_PRICE"] - mid_best) / (0.5 * spread_best)  

    x["spread_ex"] = spread_ex
    x["spread_best"] = spread_best

    x["bid_ask_ratio_ex"] = data["bid_ex"] / data["ask_ex"]
    x["price_rel_nbb"] = (data["BEST_ASK"] - data["TRADE_PRICE"]) / (data["BEST_ASK"] - mid_best)
    x["price_rel_nbo"] = (data["TRADE_PRICE"] - data["BEST_BID"]) / (mid_best - data["BEST_BID"])
    
    
    # calculate change
    x["chg_ex_lead"] = data["TRADE_PRICE"] - data["price_ex_lead"]
    x["chg_ex_lag"] = data["TRADE_PRICE"] - data["price_ex_lag"]
    x["chg_all_lead"] = data["TRADE_PRICE"] - data["price_all_lead"]
    x["chg_all_lag"] = data["TRADE_PRICE"] - data["price_all_lag"]

    # log transformed features
    x[
         [
             "ask_ex",
             "bid_ex",
             "BEST_ASK",
             "BEST_BID",
             "TRADE_PRICE",
             "price_all_lag",
             "price_all_lead",
             "price_ex_lag",
             "price_ex_lead",
             "TRADE_SIZE", 
             "bid_size_ex", 
             "ask_size_ex",
         ]
     ] = np.log1p(data[
         [
             "ask_ex",
             "bid_ex",
             "BEST_ASK",
             "BEST_BID",
             "TRADE_PRICE",
             "price_all_lag",
             "price_all_lead",
             "price_ex_lag",
             "price_ex_lead",
             "TRADE_SIZE", 
             "bid_size_ex", 
             "ask_size_ex"
         ]
     ]
     )

    # impute with zeros
    x.replace([np.inf, -np.inf], np.nan, inplace=True)
    x.fillna(0, inplace=True)

    # scale to [-1, 1]
    try:
        x[x.columns] = scaler.transform(x)
        print("try")
    except NotFittedError as e:
        x[x.columns] = scaler.fit_transform(x)
        print("except")

    x["buy_sell"] = data["buy_sell"]
    return x


In [9]:
X_train = transform(train)
y_train = X_train.buy_sell
X_train.drop(columns=["buy_sell"], inplace=True)

del train

except


In [10]:
X_val = transform(val)
y_val = X_val.buy_sell
X_val.drop(columns=["buy_sell"], inplace=True)
del val

try


## Sanity Check with `catboost`

In [11]:
params = {
        "od_type": "Iter",
        "logging_level": "Silent",
        "loss_function": "Logloss",
        "task_type": "GPU",
        "cat_features": None,
        "random_seed": 42,
        "eval_metric":"Accuracy",
       "iterations":2000,
       "early_stopping_rounds":100,
       "grow_policy":"Lossguide",
}


weight = np.geomspace(0.001, 1, num=len(y_train))
# keep ordering of data
timestamp = np.linspace(0, 1, len(y_train))

model = CatBoostClassifier(**params)
train_pool = Pool(X_train, y_train,weight = weight, timestamp=timestamp)
model.fit(train_pool, eval_set=(X_val,y_val))

print(model.score(X_val, y_val))

0.7521829816176018


In [14]:
feature_importance = model.get_feature_importance(prettified=True)
feature_importance

Unnamed: 0,Feature Id,Importances
0,prox_ex,15.528679
1,ask_size_ex,14.496162
2,bid_size_ex,11.419066
3,prox_best,11.275916
4,price_rel_nbo,10.073817
5,bid_ask_size_ratio_ex,7.577764
6,price_rel_nbb,7.139597
7,rel_bid_size_ex,4.310755
8,spread_ex,3.742201
9,rel_ask_size_ex,2.9751


In [16]:
# 0.7333797810178145 (logs applied) to     x["bid_ask_size_ratio_ex"] x["rel_bid_size_ex"]   x["rel_ask_size_ex"] 
# 0.7331946683482763 (all from above + log returns)
# 0.734859665271541 (all above + compare mids, ask, and bid at exchange with nation wide)
# 0.7349013664773161 (all above + trade_price == bid_ex == ask_ex)
# 0.7351464881990674 (price normed size)
# 0.7352807457396116 (all above +  x["price_rel_nbo"] +  x["price_rel_nbb"])
# 0.74296292 (some removed, see above, max. iterations reached. Actually there is no reason to stop ensemble)
# 0.745053067 convert some ratios to percentages (all other things the same as above)
# 0.7434765583282902 without any scaler, zero imputer, symmetric tree
# 0.7445394305242655 "grow_policy" = "Lossguide" -> trained for 5000 iterations
# 0.7450428963013065 impute with -999 instead of 0 -> trained for 5000 iterations
# 0.7436240382023729 TODO: ask Caroline: What happens with LR if bid_ex = 0 or ask_ex is?   x["bid_ex"].replace({0.0:np.NaN}, inplace=True) and  x["ask_ex"].replace({0.0:np.NaN}, inplace=True)
# 0.7447164063731647 chg from previous trade to successive trade
# 0.7445821488326205 chg from previous trade + no trade indicator
# 0.7476558328290199 "spread" feature ex
# 0.747136093410701 "spread" feature best ex + fixed typo
# 0.7475317463142745 removed features with low importance + impute with -1

## Sanity Check against `lightgbm`

In [17]:
%%script false --no-raise-error
!pip install lightgbm

In [18]:
%%script false --no-raise-error
import lightgbm as lgb

In [19]:
%%script false --no-raise-error
lgb_params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':None,
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':200,
                    'max_bin':255,
                    'verbose':-1,
                    'seed': 42,
                    'eval_metric':'accuracy',
                    "device": "cpu",
                }

In [20]:
%%script false --no-raise-error
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,weight=X_train.index.values, free_raw_data=False)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, free_raw_data=False)

In [21]:
%%script false --no-raise-error
bst = lgb.LGBMClassifier(**lgb_params)
bst = lgb.train(lgb_params,
                train_set=lgb_train,
                valid_sets=lgb_eval,
                early_stopping_rounds=100
)

In [22]:
%%script false --no-raise-error
pred = bst.predict(X_val)
pred = np.rint(pred)
pred[pred==0]=-1
print((y_val == pred).mean())

## Write to file

In [22]:
train = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/train_set_extended_60.parquet",
    engine="fastparquet",
    columns=columns,
)
val = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_extended_20.parquet",
    engine="fastparquet",
    columns=columns,
)
test = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/test_set_extended_20.parquet",
    engine="fastparquet",
    columns=columns,
)

In [23]:
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

name = "ise_classic_size_log_normalized"
#dataset = wandb.Artifact(name=name, type="preprocessed_data")

# reset scaler
scaler = MinMaxScaler(feature_range=[-1, 1])

output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/train_set_60.parquet"
)
train = transform(train)
train.to_parquet(output_path)
#dataset.add_reference(output_path)

del train
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/val_set_20.parquet"
)
val = transform(val)
val.to_parquet(output_path)
#dataset.add_reference(output_path)

del val
output_path = (
    f"gs://thesis-bucket-option-trade-classification/data/{name}/test_set_20.parquet"
)
test = transform(test)
test.to_parquet(output_path)
#dataset.add_reference(output_path)

except
try
try


In [24]:
test.describe()

Unnamed: 0,TRADE_PRICE,bid_ask_size_ratio_ex,rel_bid_size_ex,rel_ask_size_ex,depth_ex,prox_ex,prox_best,spread_ex,spread_best,bid_ask_ratio_ex,...,BEST_ASK,BEST_BID,price_all_lag,price_all_lead,price_ex_lag,price_ex_lead,TRADE_SIZE,bid_size_ex,ask_size_ex,buy_sell
count,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,...,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0,9861576.0
mean,-0.7061342,-0.999836,-0.999829,-0.9998783,-0.0001517021,-0.4496103,-0.2217283,-0.3360071,-0.9974931,-0.7037069,...,-0.8189535,-0.7144768,-0.7731023,-0.8203375,-0.7321574,-0.7479942,-0.8128382,-0.4204213,-0.3656733,-0.02805495
std,0.2711683,0.001158564,0.001354047,0.0008864031,0.003358728,0.000628755,0.0007700381,0.006190585,7.246308e-06,0.1178656,...,0.1602128,0.2718154,0.2112202,0.1696631,0.270856,0.2639858,0.1995284,0.3001729,0.2637453,0.9996064
min,-1.0,-1.0,-1.0,-1.0,-0.1027665,-0.5057553,-0.32,-1.061782,-0.9978192,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,-0.9195079,-0.999995,-0.9999933,-0.9999936,-0.0004050061,-0.4499189,-0.2219634,-0.3387931,-0.9974964,-0.7333333,...,-0.9448681,-0.9294147,-0.939713,-0.9544227,-0.9518099,-0.9648794,-1.0,-0.5834429,-0.5544227,-1.0
50%,-0.777799,-0.9999808,-0.9999701,-0.9999773,-5.000075e-06,-0.4495944,-0.221739,-0.3382184,-0.9974957,-0.6525424,...,-0.8607004,-0.787817,-0.8287887,-0.8662185,-0.8065964,-0.8251519,-0.8643808,-0.4034553,-0.3925944,-1.0
75%,-0.5676084,-0.9999556,-0.999864,-0.9998918,0.000265004,-0.4493566,-0.2215071,-0.3363218,-0.9974932,-0.6220183,...,-0.7362892,-0.5769865,-0.665807,-0.7341432,-0.5959212,-0.6147051,-0.6664538,-0.2222069,-0.1915243,1.0
max,1.096489,-0.8631786,-0.06186268,-0.3209091,0.1369371,-0.40672,0.1930437,1.895977,-0.997229,-0.1,...,0.2405476,1.096545,0.5909646,0.268222,1.029699,1.000336,0.9747139,0.6547516,0.6051181,1.0


In [26]:
test.columns

Index(['TRADE_PRICE', 'bid_ask_size_ratio_ex', 'rel_bid_size_ex',
       'rel_ask_size_ex', 'depth_ex', 'prox_ex', 'prox_best', 'spread_ex',
       'spread_best', 'bid_ask_ratio_ex', 'price_rel_nbb', 'price_rel_nbo',
       'chg_ex_lead', 'chg_ex_lag', 'chg_all_lead', 'chg_all_lag', 'ask_ex',
       'bid_ex', 'BEST_ASK', 'BEST_BID', 'price_all_lag', 'price_all_lead',
       'price_ex_lag', 'price_ex_lead', 'TRADE_SIZE', 'bid_size_ex',
       'ask_size_ex', 'buy_sell'],
      dtype='object')

In [None]:
#run.log_artifact(dataset)
#run.finish()