# Feature Engineering Deep Dive
## Microstructure Features: OFI, Microprice, Imbalance, and More

This notebook provides a detailed analysis of:
- Order Flow Imbalance (OFI) calculation and interpretation
- Microprice vs mid-price
- Queue imbalance at multiple levels
- Volume and volatility features
- Feature correlations and importance
- Predictive power analysis

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from src.config import *
from utils.feature_utils import (
    calculate_microprice, calculate_spread, calculate_queue_imbalance,
    calculate_ofi, calculate_depth_imbalance
)
from utils.io_utils import read_parquet

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'seaborn-v0_8-darkgrid' matplotlib style to all subsequent figures.
plt.style.use('seaborn-v0_8-darkgrid')

## 1. Load Order Book Data

In [None]:
# Load preprocessed order book data for one instrument and trading day.
date = "2025-09-15"
instrument_id = "AAPL.P.XNAS"

file_path = INTERIM_DATA_PATH / f"date={date}" / f"{instrument_id}.parquet"

# Fall back to a synthetic order book when the parquet file doesn't exist.
if file_path.exists():
    df = read_parquet(file_path)
else:
    print("Generating synthetic order book data...")
    n_samples = 2000
    base_price = 150.0
    tick_size = 0.01

    # Seeded generator so that Restart & Run All reproduces the same figures.
    rng = np.random.default_rng(42)

    # All levels hang off ONE shared mid-price random walk. Drawing an
    # independent walk per side/level (with level 1 of both sides starting at
    # base_price) lets bids drift above asks, i.e. a crossed book with
    # negative spreads and unordered levels.
    mid_path = base_price + np.cumsum(rng.standard_normal(n_samples) * 0.001)

    # Quoted spread of 1-3 ticks per snapshot: always strictly positive, but
    # still varying so the spread plots are not degenerate.
    half_spread = rng.integers(1, 4, n_samples) * tick_size / 2

    # Collect columns in a dict and build the frame once. Column order is
    # ts_event, then bid_px/bid_sz/ask_px/ask_sz for each level.
    synthetic = {
        'ts_event': pd.date_range(f'{date} 09:30:00', periods=n_samples, freq='100ms')
    }
    for i in range(1, N_LEVELS + 1):
        # Deeper levels sit one extra tick further away from the mid.
        level_offset = half_spread + (i - 1) * tick_size
        synthetic[f'bid_px_{i}'] = mid_path - level_offset
        # Sizes: uniform integer base plus Gaussian noise, floored at 10.
        synthetic[f'bid_sz_{i}'] = np.maximum(
            rng.integers(50, 200, n_samples) + rng.standard_normal(n_samples) * 20, 10
        )
        synthetic[f'ask_px_{i}'] = mid_path + level_offset
        synthetic[f'ask_sz_{i}'] = np.maximum(
            rng.integers(50, 200, n_samples) + rng.standard_normal(n_samples) * 20, 10
        )

    df = pd.DataFrame(synthetic)

print(f"Loaded {len(df)} order book snapshots")
print(f"Time range: {df['ts_event'].min()} to {df['ts_event'].max()}")

## 2. Microprice Analysis
### Volume-weighted mid price vs simple mid price

In [None]:
# Top-of-book microprice (size-weighted) next to the plain mid price.
best_bid_px, best_ask_px = df['bid_px_1'], df['ask_px_1']
best_bid_sz, best_ask_sz = df['bid_sz_1'], df['ask_sz_1']

df['microprice'] = calculate_microprice(
    best_bid_px.values,
    best_bid_sz.values,
    best_ask_px.values,
    best_ask_sz.values,
    method="weighted"
)
df['mid_price'] = (best_bid_px + best_ask_px) / 2
df['microprice_diff'] = df['microprice'] - df['mid_price']

# Summary numbers reused by both the figure and the printout.
diff_mean = df['microprice_diff'].mean()
diff_std = df['microprice_diff'].std()

fig, axes = plt.subplots(2, 1, figsize=(14, 8))
price_ax, diff_ax = axes

# Top panel: first 500 snapshots of both price series.
n_shown = 500
timestamps = df['ts_event'].iloc[:n_shown]
for column, series_label in (('mid_price', 'Mid Price'), ('microprice', 'Microprice')):
    price_ax.plot(timestamps, df[column].iloc[:n_shown],
                  label=series_label, linewidth=1, alpha=0.7)
price_ax.set_ylabel('Price')
price_ax.set_title('Microprice vs Mid Price')
price_ax.legend()
price_ax.grid(True, alpha=0.3)

# Bottom panel: distribution of the microprice/mid gap over all snapshots.
diff_ax.hist(df['microprice_diff'], bins=50, alpha=0.7, edgecolor='black')
diff_ax.axvline(0, color='red', linestyle='--')
diff_ax.axvline(diff_mean, color='green', linestyle='--',
                label=f"Mean: {diff_mean:.6f}")
diff_ax.set_xlabel('Microprice - Mid Price')
diff_ax.set_ylabel('Frequency')
diff_ax.set_title('Microprice Deviation from Mid Price')
diff_ax.legend()
diff_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Microprice statistics:")
print(f"  Mean difference: {diff_mean:.6f}")
print(f"  Std difference: {diff_std:.6f}")
print(f"  Correlation with future returns: (to be calculated)")

## 3. Order Flow Imbalance (OFI)
### Multiple time windows

In [None]:
# Order Flow Imbalance from level-1 quotes, one column per rolling window.
ofi_windows = [10, 50, 100]
top_of_book = (df['bid_px_1'], df['bid_sz_1'], df['ask_px_1'], df['ask_sz_1'])

for window in ofi_windows:
    df[f'ofi_{window}'] = calculate_ofi(*top_of_book, window=window)

# One stacked panel per window, limited to the first 1000 snapshots.
fig, axes = plt.subplots(3, 1, figsize=(14, 10))
timestamps = df['ts_event'].iloc[:1000]

for ax, window in zip(axes, ofi_windows):
    ax.plot(timestamps, df[f'ofi_{window}'].iloc[:1000], linewidth=1)
    ax.axhline(0, color='red', linestyle='--', alpha=0.5)
    ax.set_ylabel('OFI')
    ax.set_title(f'Order Flow Imbalance (window={window})')
    ax.grid(True, alpha=0.3)

# Only the bottom panel carries the time-axis label.
axes[2].set_xlabel('Time')
plt.tight_layout()
plt.show()

# Per-window summary statistics over the full series.
print("OFI Statistics:")
for window in ofi_windows:
    ofi_series = df[f'ofi_{window}']
    print(f"\nWindow {window}:")
    print(f"  Mean: {ofi_series.mean():.2f}")
    print(f"  Std: {ofi_series.std():.2f}")
    print(f"  Skewness: {ofi_series.skew():.3f}")

## 4. Queue Imbalance at Multiple Levels

In [None]:
# Depth imbalance features, histograms per level, and cross-level correlation.
# NOTE(review): assumes calculate_depth_imbalance returns a DataFrame aligned
# to df's index with columns named 'imbalance_L<k>' - confirm against the helper.
imbalance_df = calculate_depth_imbalance(df, n_levels=N_LEVELS)

# Drop any copies left by a previous run of this cell before attaching the
# fresh ones. A plain concat duplicates the columns on re-run, after which
# df[col] is a DataFrame and the .mean() format strings below fail.
df = pd.concat(
    [df.drop(columns=imbalance_df.columns, errors='ignore'), imbalance_df],
    axis=1,
)

# Levels shown in the 2x2 grid; levels absent from the data are skipped.
imbalance_levels = [1, 3, 5, 10]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for ax, level in zip(axes, imbalance_levels):
    col = f'imbalance_L{level}'
    if col not in df.columns:
        # Hide the panel instead of leaving an empty, unlabeled axes.
        ax.set_visible(False)
        continue
    level_mean = df[col].mean()
    ax.hist(df[col].dropna(), bins=50, alpha=0.7, edgecolor='black')
    ax.axvline(0, color='red', linestyle='--')
    ax.axvline(level_mean, color='green', linestyle='--',
               label=f"Mean: {level_mean:.3f}")
    ax.set_xlabel('Imbalance')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Imbalance at {level} Level(s)')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Correlation between imbalance at different levels
imbalance_cols = [
    f'imbalance_L{level}'
    for level in imbalance_levels
    if f'imbalance_L{level}' in df.columns
]
if imbalance_cols:
    corr_matrix = df[imbalance_cols].corr()

    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, linewidths=1)
    plt.title('Correlation: Imbalance Across Levels')
    plt.tight_layout()
    plt.show()

## 5. Volume and Spread Features

In [None]:
# Aggregate book volume across all levels, plus level-1 spread features.
bid_cols = [f'bid_sz_{i}' for i in range(1, N_LEVELS + 1)]
ask_cols = [f'ask_sz_{i}' for i in range(1, N_LEVELS + 1)]

df['total_bid_volume'] = df[bid_cols].sum(axis=1)
df['total_ask_volume'] = df[ask_cols].sum(axis=1)
df['total_volume'] = df['total_bid_volume'] + df['total_ask_volume']
# An empty ask side would make the ratio +inf, which turns every correlation
# involving volume_ratio into NaN later on. Treat it as missing instead.
df['volume_ratio'] = df['total_bid_volume'] / df['total_ask_volume'].replace(0, np.nan)

# Spread features from the best bid/ask; the helper returns the absolute
# spread and the spread in basis points, in that order.
spread_abs, spread_bps = calculate_spread(df['bid_px_1'].values, df['ask_px_1'].values)
df['spread_abs'] = spread_abs
df['spread_bps'] = spread_bps

# Visualizations (time-series and scatter panels use the first 1000 snapshots)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Total volume over time
axes[0, 0].plot(df['ts_event'][:1000], df['total_volume'][:1000], linewidth=1)
axes[0, 0].set_ylabel('Total Volume')
axes[0, 0].set_title('Total Order Book Volume')
axes[0, 0].grid(True, alpha=0.3)

# Volume ratio, with a reference line at 1.0 (balanced book)
axes[0, 1].plot(df['ts_event'][:1000], df['volume_ratio'][:1000], linewidth=1, color='purple')
axes[0, 1].axhline(1.0, color='red', linestyle='--', alpha=0.5)
axes[0, 1].set_ylabel('Volume Ratio (Bid/Ask)')
axes[0, 1].set_title('Bid-Ask Volume Ratio')
axes[0, 1].grid(True, alpha=0.3)

# Spread distribution; missing values are dropped first, as in the imbalance
# histograms, because hist() cannot bin NaN.
spread_median = df['spread_bps'].median()
axes[1, 0].hist(df['spread_bps'].dropna(), bins=50, alpha=0.7, edgecolor='black', color='orange')
axes[1, 0].axvline(spread_median, color='red', linestyle='--',
                   label=f"Median: {spread_median:.2f} bps")
axes[1, 0].set_xlabel('Spread (bps)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Spread Distribution')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Volume vs Spread
axes[1, 1].scatter(df['total_volume'][:1000], df['spread_bps'][:1000],
                   alpha=0.3, s=10)
axes[1, 1].set_xlabel('Total Volume')
axes[1, 1].set_ylabel('Spread (bps)')
axes[1, 1].set_title('Volume vs Spread')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Feature Correlation Analysis

In [None]:
# Candidate features for the correlation study; any that were not computed
# earlier in the notebook are silently skipped.
key_features = [
    'microprice_diff', 'ofi_10', 'ofi_50', 'ofi_100',
    'imbalance_L1', 'imbalance_L3', 'imbalance_L5',
    'spread_bps', 'total_volume', 'volume_ratio'
]
existing_features = [name for name in key_features if name in df.columns]

# Pairwise correlation matrix of the available features.
corr_matrix = df[existing_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Matrix', fontsize=14)
plt.tight_layout()
plt.show()

# Walk the strict upper triangle (row-major) so each unordered pair is
# reported once and the diagonal is skipped.
print("\nHighly Correlated Feature Pairs (|r| > 0.7):")
feature_names = corr_matrix.columns
for row, col in zip(*np.triu_indices(len(feature_names), k=1)):
    r_value = corr_matrix.iloc[row, col]
    if abs(r_value) > 0.7:
        print(f"  {feature_names[row]} <-> {feature_names[col]}: {r_value:.3f}")

## 7. Predictive Power Analysis
### Correlation with future price movements

In [None]:
# Forward mid-price returns over several horizons (in snapshots); the last
# `horizon` rows of each column are NaN because no future price exists.
horizons = [5, 10, 50, 100]  # Forward-looking steps

mid = df['mid_price']
for horizon in horizons:
    df[f'future_return_{horizon}'] = (mid.shift(-horizon) - mid) / mid

# Features to score; anything not computed earlier is skipped.
predictive_features = [
    name
    for name in ['microprice_diff', 'ofi_10', 'ofi_50', 'ofi_100',
                 'imbalance_L1', 'imbalance_L3']
    if name in df.columns
]

# Feature x horizon table of correlations with the future return.
correlations = pd.DataFrame(
    {
        f'horizon_{horizon}': {
            feature: df[feature].corr(df[f'future_return_{horizon}'])
            for feature in predictive_features
        }
        for horizon in horizons
    },
    index=predictive_features,
    columns=[f'horizon_{h}' for h in horizons],
    dtype=object,
)

# Plot heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(correlations.astype(float), annot=True, fmt='.3f', cmap='RdYlGn', center=0,
            linewidths=1, cbar_kws={"label": "Correlation"})
plt.xlabel('Prediction Horizon (steps)')
plt.ylabel('Feature')
plt.title('Feature Correlation with Future Returns')
plt.tight_layout()
plt.show()

# Rank features by mean absolute correlation across all horizons.
print("\nFeature Predictive Power (avg abs correlation):")
avg_corr = (
    correlations.astype(float)
    .abs()
    .mean(axis=1)
    .sort_values(ascending=False)
)
for feature, corr in avg_corr.items():
    print(f"  {feature:.<30} {corr:.4f}")

## 8. Conclusions

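### Key Insights:

> **Note:** The points below summarize what these features are expected to show on real order book data. If the notebook ran on the synthetic fallback data from Section 1, the book is randomly generated, so check each point against the plots above before relying on it.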

1. **Microprice**: Provides better price discovery than simple mid-price, especially during high imbalance
2. **OFI**: Stronger predictive power at shorter time windows (10-50 steps)
3. **Queue Imbalance**: Level 1 imbalance is most predictive, deeper levels add marginal value
4. **Volume Features**: Bid-ask volume ratio correlates with short-term price direction
5. **Spread**: Widens during uncertainty, acts as liquidity indicator

### Feature Importance Ranking:

Based on the predictive power analysis in Section 7:
1. OFI (short windows)
2. Queue imbalance (Level 1)
3. Microprice deviation
4. Volume ratio
5. Spread dynamics