In [None]:
from __future__ import annotations

import os
from pathlib import Path
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import wandb
from tqdm.auto import tqdm

from otc.features.build_features import (
    features_classical_size,
)

In [None]:
# Both constants are interpolated into the name of the dataset artifact
# that is requested from wandb further down.
EXCHANGE = "ise"
STRATEGY = "supervised"

In [None]:
# name of the (labelled) dataset artifact, built from EXCHANGE and STRATEGY
dataset = f"fbv/thesis/{EXCHANGE}_{STRATEGY}_none:latest"

# NOTE(review): the project id is hard-coded here; consider reading it from the
# environment or a config file instead of committing it to the notebook.
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"

# wandb run through which the artifacts are requested below
run = wandb.init(project="thesis", entity="fbv")

In [None]:
def _read_train_set(artifact_dir: str, src: str) -> pd.DataFrame:
    """Read the training split of a downloaded artifact and tag its origin.

    Args:
        artifact_dir (str): local directory returned by `artifact.download()`.
        src (str): marker written to the `src` column of the result.

    Returns:
        pd.DataFrame: the columns `buy_sell`, those listed in
        `features_classical_size`, and `src`.
    """
    frame = pd.read_parquet(
        Path(artifact_dir, "train_set.parquet"),
        columns=["buy_sell", *features_classical_size],
    )
    frame["src"] = src
    return frame


# download labelled
artifact_labelled = run.use_artifact(dataset)
artifact_dir_labelled = artifact_labelled.download()

# download unlabelled
unlabelled_dataset = dataset.replace("supervised", "unsupervised")
if unlabelled_dataset == dataset:
    # `replace` was a no-op: both downloads would fetch the very same artifact
    # and every comparison below would trivially show identical distributions.
    raise ValueError(
        f"Cannot derive the unlabelled artifact name from '{dataset}'; "
        "expected the name to contain 'supervised'."
    )
artifact_unlabelled = run.use_artifact(unlabelled_dataset)
artifact_dir_unlabelled = artifact_unlabelled.download()

# same file name and column subset for both sources; `src` tells them apart
x_train_unlabelled = _read_train_set(artifact_dir_unlabelled, "unlabelled")
x_train_labelled = _read_train_set(artifact_dir_labelled, "labelled")

In [None]:
# first rows of the unlabelled set, transposed so that every column is shown as a row
x_train_unlabelled.head().T

In [None]:
# smallest index label of the unlabelled set
x_train_unlabelled.index.min()

In [None]:
# largest index label of the unlabelled set
x_train_unlabelled.index.max()

In [None]:
# slice to same time range as unlabelled trades
# slice to same time range as unlabelled trades
# NOTE(review): the bounds are hard-coded positional (`iloc`) offsets; presumably
# they were read off the min / max index of the unlabelled set -- confirm, and
# confirm that row positions and index labels coincide for the labelled set.
# The frame is overwritten with its own slice, so running this cell a second
# time slices the already-sliced frame again.
x_train_labelled = x_train_labelled.iloc[27248577:29510319]

In [None]:
# display the labelled frame (the sliced one when the cells are run in order)
x_train_labelled

In [None]:
# Pool both sources and keep a 10 % subsample for plotting. The fixed seed makes
# the subsample -- and every plot / statistic derived from it -- reproducible
# across kernel restarts.
data = pd.concat([x_train_labelled, x_train_unlabelled]).sample(
    frac=0.10, random_state=42
)

In [None]:
def plot_kde_src(var_name: str, clip: List[float] | None = None):
    """Compare the labelled and unlabelled distribution of the feature 'var_name'.

    Draws the cumulative kernel density estimate of both sources on a single
    axes and prints the deciles (10 % to 90 %) of both sources side by side.
    Reads the notebook-level frame `data`, which has to provide the columns
    'src' and 'var_name'.

    Args:
        var_name (str): name of feature
        clip (List[float] | None, optional): clipping range, forwarded to
            `sns.kdeplot`. Defaults to None.
    """
    # 0.1, 0.2, ..., 0.9 -- the end point 1.0 is excluded on purpose
    quantiles = np.linspace(0.1, 1, num=9, endpoint=False)

    # filter once per source and reuse the result for statistics and plot
    sources = ["unlabelled", "labelled"]
    values = {src: data.loc[data["src"] == src, var_name] for src in sources}

    # computed before plotting so that an unsuitable column fails early
    stats = pd.concat(
        [values[src].quantile(quantiles) for src in sources],
        keys=sources,
        axis=1,
    )

    fig, ax = plt.subplots()
    for src in sources:
        sns.kdeplot(
            data=values[src],
            clip=clip,
            label=src,
            cumulative=True,
            common_grid=True,
            # draw on the axes created above instead of the implicit current one
            ax=ax,
        )
    ax.set_title(f"Distribution of '{var_name}'")
    ax.legend()
    sns.move_legend(ax, "lower center", bbox_to_anchor=(0.5, -0.3))
    plt.show()
    # the function is called in a loop over all features; release the figure
    plt.close(fig)

    print(stats)

In [None]:
# columns available in the pooled, subsampled frame
data.columns

In [None]:
# every column except `buy_sell` and the source marker gets its own plot
keys = data.columns.drop(["buy_sell", "src"]).tolist()

for key in tqdm(keys):
    plot_kde_src(key)

## Proximity to Quotes

In [None]:
def plot_hist(unlabelled, labelled, title, bins=50, value_range=(-2, 2)):
    """Overlay the normalised histograms of the unlabelled and labelled values.

    Both histograms share the same number of bins and the same value range, so
    their bars line up; values outside `value_range` are not shown.

    Args:
        unlabelled: values of the unlabelled source (array-like).
        labelled: values of the labelled source (array-like).
        title: title of the figure.
        bins (int, optional): number of bins per histogram. Defaults to 50.
        value_range (tuple, optional): lower and upper end of the binned range.
            Defaults to (-2, 2).
    """
    fig, ax = plt.subplots()
    for values, label in ((unlabelled, "unlabelled"), (labelled, "labelled")):
        ax.hist(
            values,
            bins=bins,
            alpha=0.5,
            label=label,
            density=True,
            range=value_range,
        )
    ax.set_title(title)
    ax.legend()
    plt.show()
    # release the figure once it has been rendered
    plt.close(fig)

In [None]:
var_name = "prox_ex"
# one row/column lookup per source instead of chained indexing
unlabelled = data.loc[data["src"] == "unlabelled", var_name]
labelled = data.loc[data["src"] == "labelled", var_name]

In [None]:
# overlay both sources; the feature name doubles as the figure title
plot_hist(unlabelled, labelled, var_name)

In [None]:
var_name = "prox_best"
# one row/column lookup per source instead of chained indexing
unlabelled = data.loc[data["src"] == "unlabelled", var_name]
labelled = data.loc[data["src"] == "labelled", var_name]

In [None]:
# overlay both sources; the feature name doubles as the figure title
plot_hist(unlabelled, labelled, var_name)

## Bid Size == Trade Size || Ask Size == Trade Size

In [None]:
var_name = "rel_bid_size_ex"

# one row/column lookup per source instead of chained indexing
unlabelled = data.loc[data["src"] == "unlabelled", var_name]
labelled = data.loc[data["src"] == "labelled", var_name]

In [None]:
# overlay both sources; the feature name doubles as the figure title
plot_hist(unlabelled, labelled, var_name)

In [None]:
var_name = "rel_ask_size_ex"

# one row/column lookup per source instead of chained indexing
unlabelled = data.loc[data["src"] == "unlabelled", var_name]
labelled = data.loc[data["src"] == "labelled", var_name]

In [None]:
# overlay both sources; the feature name doubles as the figure title
plot_hist(unlabelled, labelled, var_name)

## Absolute Spread

In [None]:
var_name = "spread"

# absolute spread: `ask_ex` minus `bid_ex`, stored as an extra column of `data`
data[var_name] = data["ask_ex"].sub(data["bid_ex"])

In [None]:
# one row/column lookup per source instead of chained indexing
unlabelled = data.loc[data["src"] == "unlabelled", var_name]
labelled = data.loc[data["src"] == "labelled", var_name]

In [None]:
# NOTE(review): plot_hist bins only values within [-2, 2] unless told
# otherwise -- verify that this window suits the spread.
plot_hist(unlabelled, labelled, var_name)

## Delta to previous / next trade price

In [None]:
var_name = "prc_delta"
# `TRADE_PRICE` minus `price_ex_lead`, stored as an extra column of `data`
data[var_name] = data["TRADE_PRICE"].sub(data["price_ex_lead"])

In [None]:
# one row/column lookup per source instead of chained indexing
unlabelled = data.loc[data["src"] == "unlabelled", var_name]
labelled = data.loc[data["src"] == "labelled", var_name]

In [None]:
# NOTE(review): plot_hist bins only values within [-2, 2] unless told
# otherwise -- verify that this window suits the price delta.
plot_hist(unlabelled, labelled, var_name)