# End-to-End Pipeline
## Complete Workflow: Raw Data → Predictions → Strategy → Evaluation

This notebook demonstrates the complete Market Microstructure Modeling pipeline:

1. **Data Preprocessing**: Clean and validate raw L2 data
2. **Feature Engineering**: Extract microstructure features
3. **Model Training**: Train DeepLOB or use baselines
4. **Strategy Execution**: Convert signals to trades
5. **Performance Evaluation**: Calculate P&L, TCA, and metrics
6. **Reporting**: Generate visualizations and reports

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from src.config import *
from utils.io_utils import read_parquet, write_parquet

%matplotlib inline
plt.style.use('seaborn-v0_8-darkgrid')

print("Market Microstructure Modeling Platform")
print("="*50)

## Step 1: Data Preprocessing
### Load, clean, and validate raw order book data

In [None]:
from src.zero_data_preprocessing import (
    preprocess_orderbook, preprocess_trades, OrderBookValidator, DataCleaner
)
from utils.io_utils import read_instrument_info, read_trading_calendar

# Configuration: the trading day and instrument processed by every later step.
date = "2025-09-15"
instrument_id = "AAPL.P.XNAS"

print(f"Processing: {instrument_id} on {date}")
print("\nStep 1: Data Preprocessing")
print("-" * 50)

# Raw snapshots are looked up as date=<date>/instrument_id=<id>.csv under
# the snapshots root.
raw_file = ORDERBOOK_SNAPSHOTS_PATH / f"date={date}" / f"instrument_id={instrument_id}.csv"

if raw_file.exists():
    # Load metadata
    instrument_info = read_instrument_info(INSTRUMENT_INFO_PATH)
    calendar = read_trading_calendar(TRADING_CALENDAR_PATH)

    # Preprocess orderbook
    df_orderbook = preprocess_orderbook(
        date=date,
        instrument_id=instrument_id,
        instrument_info=instrument_info,
        calendar=calendar,
        output_path=INTERIM_DATA_PATH
    )

    print(f"Preprocessed order book: {df_orderbook.shape}")
    print(f"Time range: {df_orderbook['ts_event'].min()} to {df_orderbook['ts_event'].max()}")
else:
    print(f"Raw data not found: {raw_file}")
    print("Using synthetic data for demonstration...")

    # Generate synthetic order book data.
    # A local, seeded generator keeps the demo reproducible without touching
    # NumPy's global random state.
    n_samples = 1000
    rng = np.random.default_rng(42)
    df_orderbook = pd.DataFrame({
        # Use the configured date so synthetic timestamps match the banner above.
        'ts_event': pd.date_range(f'{date} 09:30:00', periods=n_samples, freq='100ms'),
        'instrument_id': instrument_id,
        'venue': 'XNAS',
    })

    # Generate order book levels around one shared noisy mid price.
    # Drawing bid and ask noise independently around the same base price (as
    # the previous version did) crosses the book on roughly half of the rows
    # and produces negative spreads.
    base_price = 150.0
    tick = 0.01
    half_spread = tick / 2
    synthetic_mid = base_price + rng.standard_normal(n_samples) * tick

    for i in range(1, N_LEVELS + 1):
        # Distance of level i from the mid; the per-row jitter is bounded by
        # 0.4 * tick, so adjacent levels can never swap order and the best
        # ask always stays above the best bid.
        depth_offset = half_spread + (i - 1) * tick
        df_orderbook[f'bid_px_{i}'] = synthetic_mid - depth_offset - rng.uniform(0, 0.4 * tick, n_samples)
        df_orderbook[f'bid_sz_{i}'] = rng.integers(10, 200, n_samples)
        df_orderbook[f'ask_px_{i}'] = synthetic_mid + depth_offset + rng.uniform(0, 0.4 * tick, n_samples)
        df_orderbook[f'ask_sz_{i}'] = rng.integers(10, 200, n_samples)

    # Derived top-of-book columns that the later steps read directly.
    df_orderbook['mid_px'] = (df_orderbook['bid_px_1'] + df_orderbook['ask_px_1']) / 2
    df_orderbook['spread_bps'] = ((df_orderbook['ask_px_1'] - df_orderbook['bid_px_1']) / df_orderbook['mid_px']) * 10000

    print(f"Generated synthetic data: {df_orderbook.shape}")

## Step 2: Feature Engineering
### Extract microstructure features (OFI, Microprice, Imbalance, etc.)

In [None]:
from src.one_feature_engineering import FeatureExtractor, create_labels

print("\nStep 2: Feature Engineering", "-" * 50, sep="\n")

# Build the feature frame from the order book snapshots.
extractor = FeatureExtractor(n_levels=N_LEVELS)
df_features = extractor.extract_all_features(df_orderbook)

# Every column other than the timestamp/identifier columns is listed as a feature.
non_feature_cols = {'ts_event', 'instrument_id', 'venue'}
feature_cols = [col for col in df_features.columns if col not in non_feature_cols]

print(f"Features extracted: {len(df_features.columns)} columns")
print(f"\nKey features:")
print(feature_cols[:15])

# Attach the supervised-learning target; the result is expected to carry a
# 'label' column, which is summarised below.
df_features = create_labels(df_features)
label_counts = df_features['label'].value_counts()

print(f"\nLabel distribution:")
print(label_counts)
print(f"\nFinal dataset shape: {df_features.shape}")

## Step 3: Model Training/Loading
### Instantiate a baseline model and generate placeholder signals (in practice, load a trained DeepLOB or Transformer model)

In [None]:
from src.three_model_baselines import AvellanedaStoikov

print("\nStep 3: Model Selection")
print("-" * 50)

# For demonstration, use Avellaneda-Stoikov baseline
# In practice, load trained DeepLOB or Transformer model

print("Using Avellaneda-Stoikov baseline for demonstration")
# NOTE: the baseline is instantiated here, but the placeholder signals below
# are derived from the L1 imbalance feature only and do not call it.
as_model = AvellanedaStoikov(
    gamma=0.1,
    k=1.5,
    T=1.0,
    max_inventory=100
)

# Generate simple predictions based on imbalance
# In practice, use model.predict()
imbalance_tilt = df_features['imbalance_L1'] * 0.3

# Clip the directional probabilities to [0, 1] *before* deriving the neutral
# class, so all three columns are computed from the same (valid) values.
prob_up = (0.5 + imbalance_tilt).clip(0, 1)
prob_down = (0.5 - imbalance_tilt).clip(0, 1)

df_features['signal_prob_up'] = prob_up
df_features['signal_prob_down'] = prob_down
# Neutral receives whatever mass is left. With this placeholder that is ~0,
# and the floor at 0 stops float rounding from yielding a tiny negative
# probability.
df_features['signal_prob_neutral'] = (1 - prob_up - prob_down).clip(lower=0)

print("Signal probabilities generated")
print(f"Mean prob up: {df_features['signal_prob_up'].mean():.3f}")
print(f"Mean prob down: {df_features['signal_prob_down'].mean():.3f}")

## Step 4: Strategy Execution
### Convert signals to trading actions with risk management

In [None]:
from src.five_strategy_engine import StrategyEngine

print("\nStep 4: Strategy Execution", "-" * 50, sep="\n")

# One engine instance carries the position and P&L state for this instrument.
strategy = StrategyEngine(instrument_id)

# Per-fill and per-step records; both are turned into DataFrames after the loop.
trades = []
positions = []

# The simulation is capped at the first 500 feature rows.
n_steps = min(500, len(df_features))
for idx in range(n_steps):
    row = df_features.iloc[idx]
    ts = row['ts_event']

    # Class probabilities in the order [down, neutral, up].
    pred_proba = np.array([
        row[col]
        for col in ('signal_prob_down', 'signal_prob_neutral', 'signal_prob_up')
    ])

    signal = strategy.generate_signal(ts, pred_proba)

    # Only a non-neutral signal is turned into an order; the engine may still
    # decline by returning None.
    order = None
    if signal is not None and signal.direction != "NEUTRAL":
        order = strategy.signal_to_order(
            signal,
            current_bid=row['bid_px_1'],
            current_ask=row['ask_px_1'],
            mid_price=row['mid_px']
        )

    if order is not None:
        # Simulated fill: the full quantity executes at the order's own price.
        fill_price = order.price
        strategy.process_fill(order, fill_price, order.quantity)
        trades.append({
            'timestamp': ts,
            'side': order.side.value,
            'price': fill_price,
            'quantity': order.quantity,
            'signal_strength': signal.strength
        })

    # Mark the open position to the current mid on every step, trade or not.
    strategy.calculate_unrealized_pnl(row['mid_px'])

    # Snapshot the position after the mark-to-market update.
    pos_state = strategy.position
    positions.append({
        'timestamp': ts,
        'quantity': pos_state.quantity,
        'realized_pnl': pos_state.realized_pnl,
        'unrealized_pnl': pos_state.unrealized_pnl,
        'total_pnl': pos_state.realized_pnl + pos_state.unrealized_pnl
    })

trades_df = pd.DataFrame(trades)
positions_df = pd.DataFrame(positions)

# End-of-run summary taken from the engine's final position state.
final_position = strategy.position
print(f"Total trades executed: {len(trades_df)}")
print(f"Final position: {final_position.quantity}")
print(f"Realized P&L: ${final_position.realized_pnl:.2f}")
print(f"Unrealized P&L: ${final_position.unrealized_pnl:.2f}")
print(f"Total P&L: ${final_position.realized_pnl + final_position.unrealized_pnl:.2f}")

## Step 5: Performance Evaluation
### Calculate comprehensive performance metrics

In [None]:
from utils.metrics_utils import calculate_performance_summary

print("\nStep 5: Performance Evaluation")
print("-" * 50)

# Calculate returns on an equity curve (capital base + cumulative P&L).
# Taking pct_change() of the cumulative P&L itself divides by the previous
# P&L value, which can be zero (giving +/-inf that fillna does not remove)
# or negative (flipping the sign of the "return").
# NOTE(review): 100,000 is an assumed notional capital base — replace it with
# the capital actually allocated to the strategy.
initial_capital = 100_000.0
equity_curve = initial_capital + positions_df['total_pnl']
positions_df['returns'] = (
    equity_curve.pct_change()
    .replace([np.inf, -np.inf], np.nan)  # guard: equity exactly 0 on the prior step
    .fillna(0)                           # first row has no prior value
)

# Performance summary; trades are passed only when at least one was executed.
perf_summary = calculate_performance_summary(
    positions_df['returns'],
    trades_df if len(trades_df) > 0 else None
)

print("\nPerformance Metrics:")
for key, value in perf_summary.items():
    # Numeric metrics get fixed 4-decimal formatting; anything else prints as-is.
    if isinstance(value, (int, float)):
        print(f"  {key}: {value:.4f}")
    else:
        print(f"  {key}: {value}")

## Step 6: Visualization
### Generate key performance charts

In [None]:
# 2x2 dashboard: cumulative P&L, position size, trade sizes, and returns.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Cumulative P&L, with a dashed break-even line at zero.
axes[0, 0].plot(positions_df['timestamp'], positions_df['total_pnl'], linewidth=2)
axes[0, 0].axhline(0, color='red', linestyle='--', alpha=0.5)
axes[0, 0].set_ylabel('Total P&L ($)')
axes[0, 0].set_title('Cumulative P&L')
axes[0, 0].grid(True, alpha=0.3)

# Position size over time, with a flat-position reference line.
axes[0, 1].plot(positions_df['timestamp'], positions_df['quantity'], linewidth=1)
axes[0, 1].axhline(0, color='black', linestyle='-', alpha=0.3)
axes[0, 1].set_ylabel('Position Size')
axes[0, 1].set_title('Position Evolution')
axes[0, 1].grid(True, alpha=0.3)

# Trade size distribution; label the panel instead of leaving it blank when
# the strategy executed no trades.
axes[1, 0].set_title('Trade Size Distribution')
if len(trades_df) > 0:
    axes[1, 0].hist(trades_df['quantity'], bins=20, alpha=0.7, edgecolor='black')
    axes[1, 0].set_xlabel('Trade Size')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].grid(True, alpha=0.3)
else:
    axes[1, 0].text(
        0.5, 0.5, 'No trades executed',
        ha='center', va='center', transform=axes[1, 0].transAxes
    )

# Returns distribution. Keep only finite values: dropna() removes NaN but
# not +/-inf, and hist() raises when the data range is not finite.
returns = positions_df['returns']
finite_returns = returns[np.isfinite(returns)]
axes[1, 1].hist(finite_returns, bins=30, alpha=0.7, edgecolor='black')
axes[1, 1].axvline(0, color='red', linestyle='--')
axes[1, 1].set_xlabel('Returns')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('Returns Distribution')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Step 7: Generate Report
### Create summary report

In [None]:
from src.eight_reporting import PerformanceReporter

print("\nStep 7: Reporting")
print("-" * 50)

reporter = PerformanceReporter(output_path=CHARTS_PATH)

# Assemble the payload handed to the reporter.
results = {
    'pnl_summary': {
        # Last recorded total P&L from the simulation loop.
        'total_net_pnl': positions_df['total_pnl'].iloc[-1],
        'num_trades': len(trades_df),
        'final_position': strategy.position.quantity
    },
    'performance': perf_summary,
    'tca': {
        # Hard-coded placeholders, not measured values.
        'avg_slippage_bps': 1.2,  # Placeholder
        'avg_total_cost_bps': 2.8   # Placeholder
    }
}

reporter.generate_summary_report(results)
print("Summary report generated")

# Save outputs. Create the directory first so to_csv does not fail on a
# fresh checkout (assumes REPORTS_PATH is a pathlib.Path, as its use with
# the / operator suggests).
REPORTS_PATH.mkdir(parents=True, exist_ok=True)

positions_path = REPORTS_PATH / "positions.csv"
positions_df.to_csv(positions_path, index=False)
saved_outputs = [positions_path]

# trades.csv is written, and therefore listed, only when trades exist.
if len(trades_df) > 0:
    trades_path = REPORTS_PATH / "trades.csv"
    trades_df.to_csv(trades_path, index=False)
    saved_outputs.append(trades_path)

# NOTE(review): the reporter is constructed with output_path=CHARTS_PATH, yet
# the summary has always been listed under REPORTS_PATH — confirm where
# generate_summary_report actually writes summary_report.md.
saved_outputs.append(REPORTS_PATH / 'summary_report.md')

print("\nOutputs saved to:")
for saved_path in saved_outputs:
    print(f"  - {saved_path}")

## Summary

### Pipeline Complete!

We have successfully demonstrated the full pipeline:

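1. **Data Preprocessing**: Cleaned and validated order book data (or generated synthetic data when the raw file was unavailable)
2. **Feature Engineering**: Extracted 50+ microstructure features
3. **Model Application**: Generated trading signals (here, placeholder probabilities derived from L1 imbalance; replace with trained-model predictions)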
4. **Strategy Execution**: Executed trades with risk controls
5. **Performance Evaluation**: Calculated comprehensive metrics
6. **Reporting**: Generated visualizations and reports

### Next Steps:

- Fine-tune model hyperparameters
- Test on multiple instruments and time periods
- Optimize transaction costs
- Run capacity analysis
- Deploy to production environment