# LOB volumes analysis

The goal of this notebook is to build an empirical volume distribution for each level of the limit order book (LOB). Random volumes can then be sampled from these distributions in the simulation, e.g. to simulate reactions to the agent's limit orders.

In [None]:
import os
import datetime

import matplotlib.pyplot as plt
import pandas as pd
import polars as pl

from data.utils import get_list_of_dates_between, set_plot_style, ensure_dir_exists

In [None]:
# Enable the polars string cache and apply the project's plot styling.
pl.enable_string_cache(True)
# NOTE(review): set_plot_style comes from data.utils; presumably it configures
# matplotlib defaults for the figures below — confirm.
set_plot_style()

### Load the in-sample data

Load the in-sample dataset from the daily parquet files.

In [None]:
# Market to analyse: the SOL-USDT pair on BIT.COM
exchange, symbol = "BIT.COM", "SOL-USDT"

In [None]:
# Set parameters
# Date range of the data to load and the directory holding the daily files.
start_date = datetime.datetime(year=2023, month=9, day=1)
end_date = datetime.datetime(year=2023, month=9, day=10)  # Use the insample data
path = os.path.join(os.getcwd(), "datasets")
# NOTE(review): `second` is not referenced in the cells shown here — confirm
# whether it is still needed.
second = False

In [None]:
# Get the list of dates
# NOTE(review): presumably one entry per day from start_date to end_date;
# whether end_date itself is included depends on get_list_of_dates_between
# (data.utils) — confirm.
dates = get_list_of_dates_between(start_date, end_date)

In [None]:
# Load the data
prefix = "order_book"

# Read one parquet file per day and concatenate them in a single call.
# Collecting the frames first avoids re-copying the accumulated data on every
# iteration, and `df` no longer depends on the first element of `dates`
# comparing equal to `start_date` in order to be initialised.
daily_frames = []
for date in dates:
    file_name = f"{exchange}_{symbol}_{prefix}_{date.strftime('%Y_%m_%d')}.parquet"
    df_single = pd.read_parquet(os.path.join(path, file_name))
    daily_frames.append(df_single)

# Fail with a clear message instead of a NameError on `df` further down.
if not daily_frames:
    raise ValueError("No dates to load: check start_date and end_date.")

df = pd.concat(daily_frames)
df.sort_index(inplace=True)

In [None]:
# Display the concatenated order book DataFrame for a quick visual check.
df

In [None]:
# Plot one histogram per order book level (the first three levels), pooling
# the bid and ask sizes of that level into a single sample.
for i in range(3):
    bid_sizes = df[f"bid_{i}_size"].values
    ask_sizes = df[f"ask_{i}_size"].values
    vols_level = [*bid_sizes, *ask_sizes]

    # Counts are drawn on a log scale (log=True) with 100 bins.
    fig = plt.figure(figsize=(10, 5))
    plt.hist(vols_level, bins=100, log=True)
    plt.title(f"Volume distribution for level {i+1}")
    plt.xlabel("Volume")
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# Pool the bid and ask sizes of each of the first three book levels into one
# sample per level. pd.concat keeps the data in pandas instead of turning
# every value into a Python object in an intermediate list; ignore_index=True
# gives each sample a fresh 0..n-1 index.
vols_level_0, vols_level_1, vols_level_2 = (
    pd.concat(
        [df[f"bid_{level}_size"], df[f"ask_{level}_size"]],
        ignore_index=True,
    )
    for level in range(3)
)

# NOTE: the volumes are stored as the size observed at each level. An
# alternative considered earlier was to halve them, under the assumption that
# there are on average 2 orders per level; that scaling is currently disabled.

# Save the data
distributions_dir = os.path.join(os.getcwd(), "distributions")
ensure_dir_exists(distributions_dir)

# Save all three samples as pickle files
for level, vols in enumerate((vols_level_0, vols_level_1, vols_level_2)):
    vols.to_pickle(os.path.join(distributions_dir, f"volumes_level_{level}.pkl"))

In [None]:
class EmpiricalOrderVolumeDistribution:
    """
    Class for sampling order volumes from the empirical distribution estimated
    on the insample order book data.

    One pickled ``pd.Series`` of observed volumes is loaded per order book
    level (0, 1 and 2) and exposed as ``vols_level_0``, ``vols_level_1`` and
    ``vols_level_2``. Sampling draws a single observation from the series of
    the requested level.
    """

    def __init__(self, distributions_dir: "str | None" = None) -> None:
        """
        Initialize the class by loading the volume distributions from the pickle
        files.

        Args:
            distributions_dir: Directory containing ``volumes_level_0.pkl``,
                ``volumes_level_1.pkl`` and ``volumes_level_2.pkl``. Defaults
                to the ``distributions`` folder in the current working
                directory.
        """
        if distributions_dir is None:
            distributions_dir = os.path.join(os.getcwd(), "distributions")

        self.vols_level_0 = self._load_level(distributions_dir, 0)
        self.vols_level_1 = self._load_level(distributions_dir, 1)
        self.vols_level_2 = self._load_level(distributions_dir, 2)

    @staticmethod
    def _load_level(distributions_dir: str, level: int) -> pd.Series:
        """Load the pickled volume sample of one order book level."""
        # NOTE: pd.read_pickle unpickles arbitrary objects, so
        # `distributions_dir` must only point at trusted files.
        return pd.read_pickle(
            os.path.join(distributions_dir, f"volumes_level_{level}.pkl")
        )

    def sample(self, level: int) -> float:
        """
        Sample a volume from the empirical distribution.

        Args:
            level: The level of the order book to sample from.

        Returns:
            The sampled volume.

        Raises:
            ValueError: If ``level`` is not 0, 1 or 2.
        """
        # The attributes are read at call time so that a reassigned
        # distribution is picked up by subsequent samples.
        volumes_by_level = {
            0: self.vols_level_0,
            1: self.vols_level_1,
            2: self.vols_level_2,
        }
        volumes = volumes_by_level.get(level)
        if volumes is None:
            raise ValueError("Level must be between 0 and 2.")
        # Series.sample() draws one row by default; unwrap it to a scalar.
        return volumes.sample().values[0]

In [None]:
# Quick check of the sampler: draw 100 volumes from level 2 and print each one.
dist = EmpiricalOrderVolumeDistribution()
n_draws = 100
for i in range(n_draws):
    sampled_volume = dist.sample(2)
    print(sampled_volume)
    