# Exploratory Data Analysis
## Market Microstructure Modeling Platform

This notebook performs initial exploration of order book data:
- Data distribution analysis
- Order book snapshot visualization
- Spread and liquidity analysis
- Feature correlation analysis

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.config import *
from utils.io_utils import read_parquet
from utils.plotting_utils import plot_orderbook_snapshot, plot_spread_analysis

# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
# Apply a dark-grid plotting style to every figure created below.
# NOTE(review): the 'seaborn-v0_8-*' style names look like the ones used by
# recent matplotlib releases — confirm the project's pinned matplotlib
# version recognises this name.
plt.style.use('seaborn-v0_8-darkgrid')

## 1. Load Preprocessed Data

In [None]:
# Pick the trading day and instrument to explore, then locate the matching
# preprocessed parquet file under the interim data directory.
date = "2025-09-15"
instrument_id = "AAPL.P.XNAS"

file_path = INTERIM_DATA_PATH / f"date={date}" / f"{instrument_id}.parquet"

if not file_path.exists():
    # Nothing to load: `df` is left undefined, which is what the
    # `'df' in locals()` guards in the later cells check for.
    print(f"Data file not found: {file_path}")
    print("Please run 0_data_preprocessing.py first")
else:
    df = read_parquet(file_path)
    # Quick sanity check: row count and the span of event timestamps.
    print(f"Loaded {len(df)} rows")
    print(f"Time range: {df['ts_event'].min()} to {df['ts_event'].max()}")

## 2. Basic Statistics

In [None]:
# Report the frame dimensions and summary statistics for the top-of-book
# price columns and the spread. Skipped when the load cell found no data.
if 'df' in locals():
    summary_cols = ['bid_px_1', 'ask_px_1', 'mid_px', 'spread_bps']

    print("Dataset shape:", df.shape)
    print("\nBasic statistics:")
    print(df.loc[:, summary_cols].describe())

## 3. Spread Analysis

In [None]:
# Plot the spread diagnostics for the loaded frame. The guard skips this cell
# when the load cell did not define `df` (i.e. the parquet file was missing).
if 'df' in locals():
    # plot_spread_analysis is imported from utils.plotting_utils and is told to
    # read the spread from the 'spread_bps' column.
    # NOTE(review): presumably returns a matplotlib Figure — confirm in the helper.
    fig = plot_spread_analysis(df, spread_col='spread_bps')
    plt.show()

## 4. Order Book Depth Visualization

In [None]:
def level_values(row, prefix, depth=10):
    """Read one per-level field of an order book snapshot.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series``)
        A single snapshot, indexed by column names of the form
        ``{prefix}_{level}`` such as ``bid_px_1`` or ``ask_sz_3``.
    prefix : str
        Column prefix without the level suffix, e.g. ``'bid_px'``.
    depth : int, default 10
        Number of levels to read, starting at level 1.

    Returns
    -------
    list
        The values for levels ``1..depth`` in ascending level order.

    Raises
    ------
    KeyError
        If any of the requested level columns is missing from ``row``.
    """
    return [row[f'{prefix}_{level}'] for level in range(1, depth + 1)]


if 'df' in locals():
    if len(df) == 0:
        # df.iloc[0] would raise IndexError on an empty frame; report instead.
        print("Order book data is empty; skipping snapshot plot")
    else:
        # Use the middle row of the frame as a representative snapshot. The
        # choice is deterministic (not random), so reruns show the same book.
        sample_idx = len(df) // 2
        sample_row = df.iloc[sample_idx]

        bid_prices = level_values(sample_row, 'bid_px')
        bid_sizes = level_values(sample_row, 'bid_sz')
        ask_prices = level_values(sample_row, 'ask_px')
        ask_sizes = level_values(sample_row, 'ask_sz')

        fig = plot_orderbook_snapshot(
            np.array(bid_prices),
            np.array(bid_sizes),
            np.array(ask_prices),
            np.array(ask_sizes),
            title=f"Order Book Snapshot at {sample_row['ts_event']}"
        )
        plt.show()

## 5. Price and Volume Time Series

In [None]:
# Two stacked panels over event time: mid price on top, summed book depth
# per side underneath. Skipped when the load cell found no data.
if 'df' in locals():
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(14, 8))

    # Top panel: mid price path.
    ax1.plot(df['ts_event'], df['mid_px'], linewidth=0.5)
    ax1.set(ylabel='Mid Price', title='Mid Price Evolution')
    ax1.grid(True, alpha=0.3)

    # Sum the size columns of levels 1-10 on each side. The totals are stored
    # on `df` itself because the correlation cell reads them from there.
    for side in ('bid', 'ask'):
        df[f'total_{side}_volume'] = df[[f'{side}_sz_{i}' for i in range(1, 11)]].sum(axis=1)

    # Bottom panel: one line per side of the book.
    for side, side_label in (('bid', 'Bid Volume'), ('ask', 'Ask Volume')):
        ax2.plot(df['ts_event'], df[f'total_{side}_volume'], label=side_label, alpha=0.7)
    ax2.set(ylabel='Volume', xlabel='Time', title='Order Book Depth')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 6. Correlation Analysis

In [None]:
def correlation_features(frame, depth=10):
    """Build the feature table used for the correlation heatmap.

    The per-side total volumes are derived here from the per-level size
    columns, so this cell does not rely on another cell having added them to
    the frame beforehand. ``frame`` itself is not modified.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``bid_px_1``, ``ask_px_1``, ``mid_px``, ``spread_bps``
        and the size columns ``bid_sz_1..bid_sz_{depth}`` and
        ``ask_sz_1..ask_sz_{depth}``.
    depth : int, default 10
        Number of book levels summed into the total-volume columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the six top-of-book columns followed by
        ``total_bid_volume`` and ``total_ask_volume``.

    Raises
    ------
    KeyError
        If a required column is missing from ``frame``.
    """
    top_of_book = ['bid_px_1', 'ask_px_1', 'mid_px', 'spread_bps',
                   'bid_sz_1', 'ask_sz_1']
    bid_size_cols = [f'bid_sz_{level}' for level in range(1, depth + 1)]
    ask_size_cols = [f'ask_sz_{level}' for level in range(1, depth + 1)]
    return frame[top_of_book].assign(
        total_bid_volume=frame[bid_size_cols].sum(axis=1),
        total_ask_volume=frame[ask_size_cols].sum(axis=1),
    )


if 'df' in locals():
    # Calculate correlations for key features
    features = ['bid_px_1', 'ask_px_1', 'mid_px', 'spread_bps',
                'bid_sz_1', 'ask_sz_1', 'total_bid_volume', 'total_ask_volume']

    # Selecting by `features` keeps the heatmap's row/column order explicit.
    corr_matrix = correlation_features(df)[features].corr()

    # Explicit figure/axes so the heatmap and its title target the same axes.
    corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, linewidths=1, cbar_kws={"shrink": 0.8}, ax=corr_ax)
    corr_ax.set_title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

## 7. Conclusions

Summary of findings (**TODO**: fill in each item below after running the cells above):
- Spread characteristics
- Liquidity patterns
- Data quality assessment
- Next steps for feature engineering