# Exploratory analysis of trade data

In this notebook I explore the datasets and plot some of the data.

In [None]:
import os
import datetime
import polars as pl
import matplotlib.pyplot as plt
import pandas as pd

from data.utils import (
    get_list_of_second_timestamps,
    get_rnd_id,
    set_plot_style,
    ensure_dir_exists
)

# Apply the shared plot style helper imported from data.utils before any
# figure is created (presumably adjusts matplotlib defaults — confirm there).
set_plot_style()

In [None]:
# Indicate whether to save figures (the plotting cells only call savefig when this is True)
save_fig = False

# Set path for figures saving
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
FIGURES_PATH = "/home/juraj/Projects/thesis-market-making/thesis/images"
# Runs regardless of save_fig; presumably creates the directory when missing — confirm in data.utils.
ensure_dir_exists(FIGURES_PATH)

In [None]:
# Set the exchange and symbol
# Both values are interpolated into the parquet file names that get loaded
# and into the names of the saved figures. Keep exactly one exchange active.
# exchange = "BINANCE"
# exchange = "OKX"
# exchange = "GATEIO"
exchange = "BIT.COM"

# Trading pair to analyse; note that the plot axis labels hard-code "SOL".
symbol = "SOL-USDT"

### Load multiple dataframes

In [None]:
# Set parameters
# First and last day to load; the range is inclusive on both ends.
start_date = datetime.datetime(2023, 9, 1)
end_date = datetime.datetime(2023, 9, 13)
# Daily parquet files are read from ./datasets relative to the working directory.
path = os.path.join(os.getcwd(), "datasets")
# NOTE(review): `second` is not read by any of the visible cells — confirm whether it is still needed.
second = False

In [None]:
# Build one datetime per day, from start_date through end_date inclusive
n_days = (end_date - start_date).days + 1
dates = [start_date + datetime.timedelta(days=offset) for offset in range(n_days)]

In [None]:
# Load the data
# Read one parquet file per day and concatenate them in a single call.
# Growing `df` with pd.concat inside the loop re-copies every previously
# loaded row on each iteration, and the "first date" special case leaves
# `df` undefined when `dates` is empty.
prefix = "trades"
daily_frames = []
for date in dates:
    file_name = f"{exchange}_{symbol}_{prefix}_{date.strftime('%Y_%m_%d')}.parquet"
    df_single = pd.read_parquet(os.path.join(path, file_name))
    daily_frames.append(df_single)

# Raises ValueError("No objects to concatenate") if no dates were generated
df = pd.concat(daily_frames)

# Index the combined trades by the time they were received
df.set_index("received_time", inplace=True)

In [None]:
# Print per-day trade statistics, then the averages over all loaded days.
# The avg_* names hold running totals while the loop runs and only become
# means after being divided by the number of days below.
avg_buy_volume = avg_sell_volume = 0
avg_buy_orders = avg_sell_orders = 0

prefix = "trades"
for date in dates:
    file_name = f"{exchange}_{symbol}_{prefix}_{date.strftime('%Y_%m_%d')}.parquet"
    df_single = pd.read_parquet(os.path.join(path, file_name))
    print(f"Statistics for date: {date.strftime('%Y-%m-%d')}")

    # Split the day's trades by side
    side = df_single["side"]
    buy_orders = df_single.loc[side == "buy"]
    sell_orders = df_single.loc[side == "sell"]

    # Trade counts per side
    n_buy = len(buy_orders)
    n_sell = len(sell_orders)
    avg_buy_orders += n_buy
    avg_sell_orders += n_sell
    print(f"Number of buy orders: {n_buy}")
    print(f"Number of sell orders: {n_sell}")

    # Traded quantity per side
    buy_volume = buy_orders["quantity"].sum()
    sell_volume = sell_orders["quantity"].sum()
    avg_buy_volume += buy_volume
    avg_sell_volume += sell_volume
    print(f"Total buy volume: {round(buy_volume, 2)}")
    print(f"Total sell volume: {round(sell_volume, 2)}")

    # Traded quantity across both sides (computed but not printed)
    total_volume = df_single["quantity"].sum()
    print()

# Turn the accumulated totals into per-day averages
num_days = len(dates)
avg_buy_orders /= num_days
avg_sell_orders /= num_days
avg_buy_volume /= num_days
avg_sell_volume /= num_days

print(f"Average number of buy orders: {round(avg_buy_orders, 2)}")
print(f"Average number of sell orders: {round(avg_sell_orders, 2)}")
print(f"Average buy volume: {round(avg_buy_volume, 2)}")
print(f"Average sell volume: {round(avg_sell_volume, 2)}")

In [None]:
# Preview the first 10 rows of the combined multi-day trades frame
df.head(10)

In [None]:
# Split the combined trades into buy-side and sell-side frames
is_buy = df["side"] == "buy"
is_sell = df["side"] == "sell"
buy_orders = df.loc[is_buy]
sell_orders = df.loc[is_sell]

# Report how many trades fall on each side
print(f"Number of buy orders: {len(buy_orders)}")
print(f"Number of sell orders: {len(sell_orders)}")

### Visualize buy and sell volumes

In [None]:
# Tick formatter that renders timestamps as hour:minute ("%H:%M") instead of
# full timestamps; applied to the x-axis of time-series plots.
from matplotlib.dates import DateFormatter
date_format = DateFormatter("%H:%M")

In [None]:
# Define custom colors (hex codes)
color_green = "#13961a"  # buy-side plots
color_red = "#eb5c14"  # sell-side plots

In [None]:
# Buy-side trade sizes over time
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(buy_orders["quantity"], color=color_green)
ax.set_xlabel("Time")
ax.set_ylabel("Volume (SOL)")
fig.tight_layout()

# Write the figure to disk only when saving is enabled
if save_fig:
    fig.savefig(f"{FIGURES_PATH}/{exchange}_{symbol}_buy_volume.pdf")

In [None]:
# # Compute the changes in incoming volume
# buy_volume_diff = buy_orders["quantity"].diff()

# # Plot the changes in incoming buy volume
# plt.figure(figsize=(10, 4))
# plt.plot(buy_volume_diff, color=color_green)
# plt.xlabel('Time')
# plt.ylabel('Volume change')
# plt.tight_layout()
# plt.show()

In [None]:
# # Merge the above two plots into one figure with two subplots
# fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
# ax1.plot(buy_orders['quantity'], color=color_green)
# ax1.set_ylabel('Volume')
# ax2.plot(buy_volume_diff, color=color_green)
# ax2.set_ylabel('Volume change')
# plt.xlabel('Time')
# plt.tight_layout()

# # Change tick label size
# # ax1.tick_params(axis='x', labelsize=18)
# # ax1.tick_params(axis='y', labelsize=18)
# plt.show()

# # Save the figure
# # if save_fig:
# #     fig.savefig(f"{FIGURES_PATH}/buy_volume.pdf")

In [None]:
# buy_volume_diff.describe()

In [None]:
# Sell-side trade sizes over time
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(sell_orders["quantity"], color=color_red)
ax.set_xlabel("Time")
ax.set_ylabel("Volume (SOL)")
fig.tight_layout()

# Write the figure to disk only when saving is enabled
if save_fig:
    fig.savefig(f"{FIGURES_PATH}/{exchange}_{symbol}_sell_volume.pdf")

In [None]:
# # Compute the changes in incoming volume
# sell_volume_diff = sell_orders["quantity"].diff()

# # Plot the changes in incoming sell volume
# plt.figure(figsize=(10, 4))
# plt.plot(sell_volume_diff, color=color_red)
# plt.xlabel('Time')
# plt.ylabel('Volume change')
# plt.tight_layout()
# plt.show()

In [None]:
# # Merge the above two plots into one figure with two subplots
# fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
# ax1.plot(sell_orders['quantity'], color=color_red)
# ax1.set_ylabel('Volume')
# ax2.plot(sell_volume_diff, color=color_red)
# ax2.set_ylabel('Volume change')
# plt.xlabel('Time')
# plt.tight_layout()
# plt.show()

# # Save the figure
# if save_fig:
#     fig.savefig(f"{FIGURES_PATH}/sell_volume.pdf")

In [None]:
# sell_volume_diff.describe()

### Volume histograms

In [None]:
# buy_orders_describe = buy_orders.filter()
# buy_orders.describe()

In [None]:
# Histogram of buy trade sizes, with counts on a log scale.
# No outlier filtering is applied: every buy trade is included.
fig = plt.figure(figsize=(12, 4))
plt.hist(buy_orders['quantity'], bins=100, color=color_green, edgecolor='black', linewidth=1.1, log=True)
plt.xlabel('Volume (SOL)')
plt.ylabel('Count')
plt.tight_layout()

# Save the figure before showing it: with a blocking backend the figure
# window may already be closed by the time plt.show() returns.
if save_fig:
    fig.savefig(f"{FIGURES_PATH}/{exchange}_{symbol}_buy_volume_hist.pdf")

plt.show()

In [None]:
# Histogram of sell trade sizes, with counts on a log scale.
# No outlier filtering is applied: every sell trade is included.
fig = plt.figure(figsize=(12, 4))
plt.hist(sell_orders['quantity'], bins=100, color=color_red, edgecolor='black', linewidth=1.1, log=True)
plt.xlabel('Volume (SOL)')
plt.ylabel('Count')
plt.tight_layout()

# Save the figure before showing it: with a blocking backend the figure
# window may already be closed by the time plt.show() returns.
if save_fig:
    fig.savefig(f"{FIGURES_PATH}/{exchange}_{symbol}_sell_volume_hist.pdf")

plt.show()

In [None]:
# Summary statistics of the buy-side trades (DataFrame.describe output)
buy_orders.describe()

In [None]:
# Summary statistics of the sell-side trades (DataFrame.describe output)
sell_orders.describe()

### Trade flow imbalance

In [None]:
# Resample the data to 1 minute intervals: total traded quantity per minute, per side.
# resample() needs a datetime-like index — assumes the received_time index is one; TODO confirm.
buy_orders_1min = buy_orders["quantity"].resample("1min").sum()
# NOTE(review): named `sell_order_1min` (singular), unlike `buy_orders_1min`;
# the imbalance cell refers to it by this exact name.
sell_order_1min = sell_orders["quantity"].resample("1min").sum()

In [None]:
# Compute the order flow imbalance: (buy - sell) / (buy + sell) per minute.
# The two series are resampled independently, so their minute bins need not
# cover the same range. Plain `+` / `-` would produce NaN for any bin present
# on only one side; add/sub with fill_value=0 treats the missing side as zero
# traded quantity instead.
eps = 1e-8
denominator = buy_orders_1min.add(sell_order_1min, fill_value=0)
# Avoid division by zero in minutes without any trades (the numerator is 0 there too)
denominator = denominator.replace(0, eps)
imbalance = buy_orders_1min.sub(sell_order_1min, fill_value=0) / denominator

In [None]:
# Describe the order flow imbalance statistics
imbalance.describe()

In [None]:
# Visualize a window of the order flow imbalance
# Positional bounds into the 1-minute imbalance series (a 360-bin window)
start_index = 720
end_index = 1080

fig = plt.figure(figsize=(12, 4))
# .iloc makes the positional slicing explicit instead of relying on how
# Series.__getitem__ interprets integer slices for this index type
plt.plot(imbalance.iloc[start_index:end_index], color='black')
plt.gca().xaxis.set_major_formatter(date_format)
plt.xlabel('Time (hours)')
plt.ylabel('Order flow imbalance')
plt.tight_layout()

# Save the figure before showing it: with a blocking backend the figure
# window may already be closed by the time plt.show() returns.
if save_fig:
    fig.savefig(f"{FIGURES_PATH}/{exchange}_{symbol}_trade_flow_imbalance.pdf")

plt.show()


### Volume differences analysis

### Load single dataframe

In [None]:
# Set parameters
# Single day to analyse; this rebinds `date`, which the multi-day loops used as their loop variable.
date = datetime.datetime(2023, 9, 1)
# Same ./datasets directory (relative to the working directory) as the multi-day section
path = os.path.join(os.getcwd(), "datasets")
# NOTE(review): `second` is not read by any of the visible cells — confirm whether it is still needed.
second = True

In [None]:
# Helper from data.utils; presumably returns one timestamp per second of `date` — confirm there.
seconds = get_list_of_second_timestamps(date)

In [None]:
# Load the data
# Single-day trades file, read with polars this time (the multi-day section
# used pandas), so `df` is rebound to a polars DataFrame from here on.
prefix = "trades"
file_name = f"{exchange}_{symbol}_{prefix}_{date.strftime('%Y_%m_%d')}.parquet"
df = pl.read_parquet(os.path.join(path, file_name))

In [None]:
# Preview the first 10 rows of the single-day polars frame
df.head(10)

In [None]:
# Split the single-day trades into buy-side and sell-side frames
side_col = pl.col('side')
buy_orders = df.filter(side_col == 'buy')
sell_orders = df.filter(side_col == 'sell')

In [None]:
# Report how many trades fall on each side
n_buy = buy_orders.height
n_sell = sell_orders.height
print(f"Number of buy orders: {n_buy}")
print(f"Number of sell orders: {n_sell}")

In [None]:
# Every trade on a given side must carry a distinct received_time;
# the buy side is checked first, then the sell side.
for side_frame in (buy_orders, sell_orders):
    received = side_frame["received_time"]
    assert len(received.unique()) == len(received)