# Exploratory Analysis: LOB Microstructure

This notebook explores the synthetic limit order book data and microstructure features.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style to the plots drawn later in the notebook.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## Load Data

In [None]:
# Read the raw limit order book data (Parquet) into the working frame `df`,
# report its dimensions, and display the leading rows as the cell output.
raw_lob_path = '../data/raw/lob_data.parquet'
df = pd.read_parquet(raw_lob_path)
print(f"Data shape: {df.shape}")
df.head()

## Basic Statistics

In [None]:
# Derive two columns from the level-1 quote columns and store them on `df`
# (later cells read df['mid_price']).
best_bid = df['bid_price_1']
best_ask = df['ask_price_1']
df['mid_price'] = (best_bid + best_ask) / 2  # midpoint of level-1 bid and ask
df['spread'] = best_ask - best_bid           # level-1 ask minus level-1 bid

# Print describe() for each derived column under its own heading.
for heading, column in (("Mid Price Statistics:", 'mid_price'),
                        ("\nSpread Statistics:", 'spread')):
    print(heading)
    print(df[column].describe())

## Price Dynamics

In [None]:
# Plot the mid price over the first `max_events` rows.
# The window is taken once, positionally (.iloc), so the x values, y values
# and title always describe the same rows regardless of the index type.
max_events = 5000
mid_window = df['mid_price'].iloc[:max_events]

plt.figure(figsize=(14, 6))
plt.plot(mid_window.index, mid_window, linewidth=0.8)
plt.xlabel('Event Index')
plt.ylabel('Mid Price')
# Report the number of rows actually drawn; this is smaller than `max_events`
# when the data set is shorter than the requested window.
plt.title(f'Mid Price Evolution (First {len(mid_window)} Events)')
plt.grid(True, alpha=0.3)
plt.show()

## Order Book Depth

In [None]:
# Draw one row of the book as a horizontal bar chart: bid sizes extend to the
# right of zero, ask sizes to the left.
snapshot_idx = 1000
snapshot = df.iloc[snapshot_idx]

# Gather price and size for levels 1 through 5 on each side.
bid_prices, bid_sizes, ask_prices, ask_sizes = [], [], [], []
for i in range(1, 6):
    bid_prices.append(snapshot[f'bid_price_{i}'])
    bid_sizes.append(snapshot[f'bid_size_{i}'])
    ask_prices.append(snapshot[f'ask_price_{i}'])
    ask_sizes.append(snapshot[f'ask_size_{i}'])

fig, ax = plt.subplots(figsize=(10, 6))
# Ask widths are negated so the two sides mirror each other around x = 0.
book_sides = (
    (bid_prices, bid_sizes, 'green', 'Bid'),
    (ask_prices, [-size for size in ask_sizes], 'red', 'Ask'),
)
for prices, widths, colour, side_label in book_sides:
    ax.barh(prices, widths, height=0.005, color=colour, alpha=0.6, label=side_label)
ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)
ax.set(
    xlabel='Size',
    ylabel='Price',
    title=f'Order Book Snapshot at Event {snapshot_idx}',
)
ax.legend()
plt.show()

## Feature Exploration

In [None]:
# Load the processed feature table, if it exists, and rank a subset of its
# columns by correlation with the forward mid-price return.
# Only the file read is guarded: a missing file is reported (the message
# points at the pipeline), while any other failure in the analysis surfaces.
try:
    features = pd.read_parquet('../data/processed/features.parquet')
except FileNotFoundError:
    print("Features not yet generated. Run pipeline first.")
else:
    print(f"Features shape: {features.shape}")
    print(f"\nFeature columns: {features.columns.tolist()}")

    # Forward return: pct_change(5) at row t is p[t]/p[t-5] - 1, and shift(-5)
    # moves it back five rows, so row t holds p[t+5]/p[t] - 1.
    future_return = df['mid_price'].pct_change(5).shift(-5)

    # Examine the FIRST 20 numeric/boolean columns in file order (this is not
    # a ranking). Non-numeric columns are skipped because Series.corr cannot
    # convert them to float and would abort the cell.
    max_features = 20
    candidate_cols = features.select_dtypes(include=['number', 'bool']).columns[:max_features]

    # NOTE(review): Series.corr aligns the two series on index labels, so this
    # assumes `features` shares df's index — confirm against the pipeline.
    correlations = {}
    for col in candidate_cols:
        corr = features[col].corr(future_return)
        # NaN results (e.g. constant columns, no overlapping rows) are dropped.
        if not np.isnan(corr):
            correlations[col] = corr

    # Order by absolute correlation, strongest first.
    corr_df = pd.DataFrame(list(correlations.items()), columns=['Feature', 'Correlation'])
    corr_df = corr_df.sort_values('Correlation', key=abs, ascending=False)

    print("\nTop features by correlation with 5-tick future return:")
    print(corr_df.head(10))

## Trade Analysis

In [None]:
# Select the rows whose event_type equals 'trade' and report their count and
# their share of all rows in `df`.
trades = df.loc[df['event_type'] == 'trade']
print(f"Number of trades: {len(trades)}")
print(f"Trade frequency: {len(trades) / len(df) * 100:.2f}%")

# Size statistics are printed only when at least one trade row exists.
if not trades.empty:
    print("\nTrade size statistics:")
    print(trades['trade_size'].describe())