# Advanced Features and Extensibility

This notebook demonstrates the advanced features added to the HFT simulator, including machine learning strategies, arbitrage trading, and the extensible strategy framework.

## Learning Objectives

By the end of this notebook, you will understand:
- Machine learning integration in trading strategies
- Statistical arbitrage and pairs trading concepts
- Modular strategy development framework
- Custom component creation and integration
- Advanced backtesting and walk-forward analysis

## Key Concepts

### Advanced Strategy Types
- **Machine Learning**: Pattern recognition and predictive modeling
- **Statistical Arbitrage**: Mean reversion in asset relationships
- **Pairs Trading**: Long/short positions in correlated assets
- **Modular Framework**: Component-based strategy construction

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Import HFT simulator components
from src.strategies.ml_strategy import MLTradingStrategy, MLFeatureEngineer
from src.strategies.arbitrage_strategy import PairsTradingStrategy, StatisticalArbitrageStrategy
from src.strategies.strategy_framework import (
    StrategyFactory, StrategyConfig, StrategyType, StrategyTester,
    ModularStrategy, MovingAverageSignalGenerator, PositionSizeRiskManager, LimitOrderManager
)
from src.engine.order_types import Order, MarketDataPoint
from src.utils.constants import OrderSide, OrderType
from src.utils.helpers import Timer

# Global figure styling for the notebook: load matplotlib's seaborn-flavoured
# style sheet first, then select seaborn's "husl" colour palette.
plt.style.use("seaborn-v0_8")
sns.set_palette(palette="husl")

# Printed only if every statement above in this cell ran without raising.
print("✅ All imports successful!")

## 1. Machine Learning Strategy Demonstration

Let's explore the machine learning-based trading strategy:

In [None]:
# Generate sample market data for ML strategy
def generate_ml_market_data(n_points=5000):
    """Build a seeded synthetic tick table for the ML strategy demo.

    The price path is 100.0 plus a linear ramp from 0 to 5, Gaussian noise
    scaled by 0.5, and two sine waves (amplitude 2 over 4π and amplitude 1
    over 8π). Best bid/ask sit symmetrically around the price, separated by a
    uniformly drawn spread between 0.01 and 0.05.

    Args:
        n_points: Number of rows to generate; timestamps are one second apart
            starting at 2023-01-01 09:30.

    Returns:
        pandas.DataFrame with columns ``timestamp``, ``price``, ``volume``,
        ``best_bid``, ``best_ask``, ``bid_volume`` and ``ask_volume``.

    Note:
        Reseeds NumPy's global random generator with 42 on every call.
    """
    np.random.seed(42)

    # Random draws happen in a fixed order (noise, volume, spread, bid volume,
    # ask volume) so the seeded output is reproducible.
    gaussian_noise = 0.5 * np.random.randn(n_points)

    # Deterministic ingredients of the price path
    ramp = np.linspace(0, 5, n_points)
    slow_wave = np.sin(np.linspace(0, 4 * np.pi, n_points)) * 2
    fast_wave = np.sin(np.linspace(0, 8 * np.pi, n_points)) * 1

    # Accumulate in the order: level, ramp, noise, slow wave, fast wave
    prices = 100.0 + ramp
    for component in (gaussian_noise, slow_wave, fast_wave):
        prices = prices + component

    # One row per second from the session start
    session_open = datetime(2023, 1, 1, 9, 30)
    timestamps = [session_open + timedelta(seconds=offset) for offset in range(n_points)]

    traded_volume = np.random.randint(100, 1000, n_points)

    # Quotes straddle the price by half the spread on each side
    half_spread = np.random.uniform(0.01, 0.05, n_points) / 2
    bid_depth = np.random.randint(500, 2000, n_points)
    ask_depth = np.random.randint(500, 2000, n_points)

    columns = {
        'timestamp': timestamps,
        'price': prices,
        'volume': traded_volume,
        'best_bid': prices - half_spread,
        'best_ask': prices + half_spread,
        'bid_volume': bid_depth,
        'ask_volume': ask_depth,
    }
    return pd.DataFrame(columns)

# Build the 3,000-row synthetic tick table that the ML cells below work on
ml_data = generate_ml_market_data(n_points=3000)
row_count = len(ml_data)
print(f"Generated {row_count:,} data points for ML strategy")

# The first rows are left as the cell's last expression for rich display
ml_data.head()

In [None]:
# Four-panel overview of the synthetic ML data set
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax1, ax2), (ax3, ax4) = axes

# Series derived from the raw table, reused by the plots and the printout
spread = ml_data['best_ask'] - ml_data['best_bid']
returns = ml_data['price'].pct_change().dropna()

# Top-left: the price path
ax1.plot(ml_data.index, ml_data['price'], linewidth=1, alpha=0.8)
ax1.set(title='Price Series with Patterns', ylabel='Price')

# Top-right: quoted bid-ask spread
ax2.plot(ml_data.index, spread, color='orange', linewidth=1)
ax2.set(title='Bid-Ask Spread', ylabel='Spread')

# Bottom-left: every 50th volume observation, to keep the bar chart readable
volume_sample = ml_data['volume'].iloc[::50]
ax3.bar(volume_sample.index, volume_sample, alpha=0.7, color='green')
ax3.set(title='Trading Volume (Sample)', ylabel='Volume')

# Bottom-right: histogram of tick-to-tick percentage returns
ax4.hist(returns, bins=50, alpha=0.7, color='purple')
ax4.set(title='Price Returns Distribution', xlabel='Returns', ylabel='Frequency')

# Same light grid on every panel
for panel in (ax1, ax2, ax3, ax4):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Headline numbers for the generated data
price_low = ml_data['price'].min()
price_high = ml_data['price'].max()
print("📊 Data Statistics:")
print(f"   • Price range: ${price_low:.2f} - ${price_high:.2f}")
print(f"   • Average spread: ${spread.mean():.4f}")
print(f"   • Return volatility: {returns.std():.4f}")

In [None]:
# Settings handed to the ML strategy constructor
ml_config = dict(
    lookback_periods=[5, 10, 20],
    prediction_horizon=3,
    confidence_threshold=0.65,
    position_size=500,
    retrain_frequency=500,
    min_training_samples=200,
)

ml_strategy = MLTradingStrategy("MLTEST", ml_config)

# Echo the main settings: (label, config key, trailing unit text)
print("🤖 ML Strategy Initialized")
ml_summary_rows = (
    ("Lookback periods", 'lookback_periods', ""),
    ("Prediction horizon", 'prediction_horizon', " ticks"),
    ("Confidence threshold", 'confidence_threshold', ""),
    ("Retrain frequency", 'retrain_frequency', " trades"),
)
for label, key, unit in ml_summary_rows:
    print(f"   • {label}: {ml_config[key]}{unit}")

In [None]:
# Run the feature engineer on the first 500 rows of the ML data
feature_engineer = MLFeatureEngineer()
sample_data = ml_data.head(500).copy()
features_df = feature_engineer.create_features(sample_data, [5, 10, 20])

n_original_columns = len(sample_data.columns)
n_feature_columns = len(features_df.columns)
n_generated_features = len(feature_engineer.feature_names)
print("🔧 Feature Engineering Results:")
print(f"   • Original columns: {n_original_columns}")
print(f"   • Feature columns: {n_feature_columns}")
print(f"   • Generated features: {n_generated_features}")

# Pick out columns whose name mentions one of these feature families
feature_keywords = ('return', 'momentum', 'volatility', 'rsi')
key_features = []
for column in features_df.columns:
    if any(keyword in column for keyword in feature_keywords):
        key_features.append(column)

print("\n📈 Key Features Generated:")
for feature in key_features[:10]:  # first ten only
    print(f"   • {feature}")

# Correlation heatmap over the first eight key features, using complete rows only
feature_sample = features_df[key_features[:8]].dropna()
if len(feature_sample) > 0:
    correlation_matrix = feature_sample.corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
    )
    plt.title('Feature Correlation Matrix (Sample)')
    plt.tight_layout()
    plt.show()

## 2. Pairs Trading Strategy

Let's explore statistical arbitrage using pairs trading:

In [None]:
# Generate correlated pair data
def generate_pair_data(n_points=2000, correlation=0.8):
    """Generate seeded price/volume tables for two correlated, mean-reverting assets.

    Per-step returns of both assets are Gaussian with a 2% standard deviation
    and the requested correlation. On top of that, every step closes 5% of the
    gap between the two prices (split evenly between the assets), so the
    spread keeps being pulled back toward its target of 0.

    The gap is divided by the pair's average price before it is added to the
    returns. Adding the raw price difference to a fractional return instead
    over-corrects by roughly the price level (about 100x here): the spread
    flips sign and grows on every step until prices turn negative and
    overflow.

    Args:
        n_points: Number of rows per asset; timestamps are one minute apart
            starting at 2023-01-01 09:30.
        correlation: Target correlation between the two return series. Must
            lie in [-1, 1] for the square root below to be real.

    Returns:
        dict with keys 'ASSET_A' and 'ASSET_B', each a pandas.DataFrame with
        columns ``timestamp``, ``price`` and ``volume``. Asset A starts at
        100.0 and asset B at 98.0.

    Note:
        Reseeds NumPy's global random generator with 42 on every call.
    """
    np.random.seed(42)

    # Correlated per-step returns: B shares `correlation` of A's shock and
    # gets an independent shock for the remainder.
    returns_a = np.random.randn(n_points) * 0.02
    returns_b = correlation * returns_a + np.sqrt(1 - correlation**2) * np.random.randn(n_points) * 0.02

    # Mean-reversion settings: where the spread is pulled to, and the fraction
    # of the remaining gap closed per step.
    spread_target = 0
    spread_reversion = 0.05

    prices_a = [100.0]
    prices_b = [98.0]

    for i in range(1, n_points):
        prev_a = prices_a[-1]
        prev_b = prices_b[-1]

        # Express the gap as a fraction of the average price so that it is in
        # the same units as the returns it is combined with.
        mid_price = 0.5 * (prev_a + prev_b)
        relative_gap = (prev_a - prev_b - spread_target) / mid_price

        # Each asset takes half of the correction, in opposite directions.
        pull = spread_reversion * relative_gap * 0.5

        prices_a.append(prev_a * (1 + returns_a[i] - pull))
        prices_b.append(prev_b * (1 + returns_b[i] + pull))

    # Create timestamps
    start_time = datetime(2023, 1, 1, 9, 30)
    timestamps = [start_time + timedelta(minutes=i) for i in range(n_points)]

    # The price lists are seeded with one value each; trimming keeps every
    # column the same length even when n_points is 0.
    return {
        'ASSET_A': pd.DataFrame({
            'timestamp': timestamps,
            'price': prices_a[:n_points],
            'volume': np.random.randint(500, 1500, n_points)
        }),
        'ASSET_B': pd.DataFrame({
            'timestamp': timestamps,
            'price': prices_b[:n_points],
            'volume': np.random.randint(500, 1500, n_points)
        })
    }

# Build the two-asset data set used by the pairs-trading cells
pair_data = generate_pair_data(1500, correlation=0.85)
asset_a_prices = pair_data['ASSET_A']['price']
asset_b_prices = pair_data['ASSET_B']['price']
print(f"Generated pair data with {len(pair_data['ASSET_A'])} points each")

# Correlation measured on the generated price levels
correlation = asset_a_prices.corr(asset_b_prices)
print(f"Actual correlation: {correlation:.3f}")

In [None]:
# Four-panel view of the pair: prices, scatter, spread and spread z-score
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax1, ax2), (ax3, ax4) = axes

asset_a = pair_data['ASSET_A']
asset_b = pair_data['ASSET_B']

# Top-left: both price paths on one axis
for frame, name in ((asset_a, 'Asset A'), (asset_b, 'Asset B')):
    ax1.plot(frame.index, frame['price'], label=name, linewidth=1.5)
ax1.set(title='Price Series - Correlated Assets', ylabel='Price')
ax1.legend()

# Top-right: price of B against price of A
ax2.scatter(asset_a['price'], asset_b['price'], alpha=0.6, s=10)
ax2.set(
    xlabel='Asset A Price',
    ylabel='Asset B Price',
    title=f'Price Relationship (Correlation: {correlation:.3f})',
)

# Bottom-left: raw spread with its mean and ±2 standard deviation bands
spread = asset_a['price'] - asset_b['price']
spread_mean = spread.mean()
spread_std = spread.std()
ax3.plot(spread.index, spread, color='purple', linewidth=1)
ax3.axhline(y=spread_mean, color='red', linestyle='--', alpha=0.7, label='Mean')
ax3.axhline(y=spread_mean + 2*spread_std, color='orange', linestyle='--', alpha=0.7, label='+2σ')
ax3.axhline(y=spread_mean - 2*spread_std, color='orange', linestyle='--', alpha=0.7, label='-2σ')
ax3.set(title='Price Spread (A - B)', ylabel='Spread')
ax3.legend()

# Bottom-right: standardised spread with symmetric entry/exit levels;
# only the positive line of each pair carries the legend label
z_score = (spread - spread_mean) / spread_std
ax4.plot(z_score.index, z_score, color='green', linewidth=1)
ax4.axhline(y=0, color='black', linestyle='-', alpha=0.5)
for level, colour, legend_label in ((2, 'red', 'Entry Threshold'), (0.5, 'blue', 'Exit Threshold')):
    ax4.axhline(y=level, color=colour, linestyle='--', alpha=0.7, label=legend_label)
    ax4.axhline(y=-level, color=colour, linestyle='--', alpha=0.7)
ax4.set(title='Spread Z-Score', ylabel='Z-Score')
ax4.legend()

# Same light grid on every panel
for panel in (ax1, ax2, ax3, ax4):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary of the spread; opportunities are points with |z| above 2
n_opportunities = int((z_score.abs() > 2).sum())
print("📊 Pair Analysis:")
print(f"   • Spread mean: {spread_mean:.4f}")
print(f"   • Spread std: {spread_std:.4f}")
print(f"   • Z-score range: {z_score.min():.2f} to {z_score.max():.2f}")
print(f"   • Opportunities (|z| > 2): {n_opportunities}")

In [None]:
# Settings handed to the pairs-trading strategy constructor
pairs_config = dict(
    lookback_period=100,
    entry_threshold=2.0,
    exit_threshold=0.5,
    stop_loss_threshold=3.5,
    position_size=1000,
    min_correlation=0.7,
)

pairs_strategy = PairsTradingStrategy(('ASSET_A', 'ASSET_B'), pairs_config)

# Echo the thresholds (printed in σ units) followed by the position size
print("📈 Pairs Trading Strategy Initialized")
sigma_rows = (
    ("Entry threshold", 'entry_threshold'),
    ("Exit threshold", 'exit_threshold'),
    ("Stop loss", 'stop_loss_threshold'),
)
for label, key in sigma_rows:
    print(f"   • {label}: ±{pairs_config[key]} σ")
print(f"   • Position size: {pairs_config['position_size']}")

## 3. Modular Strategy Framework

Let's explore the extensible strategy framework:

In [None]:
# Create the factory and ask it which components it knows about
factory = StrategyFactory()
available_components = factory.get_available_components()

print("🏭 Strategy Factory Initialized")
print("\n📦 Available Components:")

# One heading per component type (underscores to spaces, title-cased),
# followed by a bullet per component
for component_type, components in available_components.items():
    heading = component_type.replace('_', ' ').title()
    print(f"\n{heading}:")
    for component in components:
        print(f"   • {component}")

In [None]:
# Component specifications, one list per pluggable role
signal_generator_specs = [
    {
        'name': 'MovingAverageSignalGenerator',
        'parameters': {'short_window': 10, 'long_window': 30},
    },
]
risk_manager_specs = [
    {
        'name': 'PositionSizeRiskManager',
        'parameters': {
            'max_position_size': 1000,
            'max_total_exposure': 5000,
            'risk_per_trade': 0.02,
        },
    },
]
order_manager_specs = [
    {
        'name': 'LimitOrderManager',
        'parameters': {'price_offset': 0.01, 'default_volume': 100},
    },
]

# Assemble the full strategy configuration from the pieces above
modular_config = StrategyConfig(
    name="MovingAverageCrossover",
    strategy_type=StrategyType.MOMENTUM,
    parameters={
        'components': {
            'signal_generators': signal_generator_specs,
            'risk_managers': risk_manager_specs,
            'order_managers': order_manager_specs,
        },
    },
    risk_limits={'max_drawdown': 0.1, 'max_daily_loss': 0.05},
    performance_targets={'min_sharpe_ratio': 1.0, 'min_win_rate': 0.55},
)

# Let the factory build the strategy for the "MODULAR" symbol
modular_strategy = factory.create_strategy("MODULAR", modular_config)

# Report what was built, including how many components of each role were attached
n_signal_generators = len(modular_strategy.components['signal_generators'])
n_risk_managers = len(modular_strategy.components['risk_managers'])
n_order_managers = len(modular_strategy.components['order_managers'])

print("🔧 Modular Strategy Created")
print(f"   • Strategy name: {modular_config.name}")
print(f"   • Strategy type: {modular_config.strategy_type.value}")
print(f"   • Components: {n_signal_generators} signal generators, "
      f"{n_risk_managers} risk managers, "
      f"{n_order_managers} order managers")

In [None]:
# Replay the first 200 ML data rows through the modular strategy
test_data = ml_data.head(200).copy()
orders_generated = []   # every order returned, flattened
signals_timeline = []   # one record per row that produced at least one order

print("🧪 Testing Modular Strategy...")

for _, row in test_data.iterrows():
    # Wrap the row in the market data type the strategy consumes
    market_point = MarketDataPoint(
        timestamp=row['timestamp'],
        symbol="MODULAR",
        price=row['price'],
        volume=row['volume'],
    )

    # Context is reset to the same capital and a flat position before every tick
    modular_strategy.update_context('capital', 100000)
    modular_strategy.update_context('current_position', 0)

    orders = modular_strategy.generate_signals(market_point)
    if not orders:
        continue

    orders_generated.extend(orders)
    signals_timeline.append({
        'timestamp': row['timestamp'],
        'price': row['price'],
        'num_orders': len(orders),
        'order_types': [order.side.value for order in orders],
    })

print("✅ Strategy Testing Complete")
print(f"   • Total orders generated: {len(orders_generated)}")
print(f"   • Signal events: {len(signals_timeline)}")

if signals_timeline:
    # Flatten the per-event side lists, then count 'bid' and 'ask' entries
    all_sides = [side for event in signals_timeline for side in event['order_types']]
    buy_signals = all_sides.count('bid')
    sell_signals = all_sides.count('ask')
    print("\n📊 Signal Summary:")
    print(f"   • Buy signals: {buy_signals}")
    print(f"   • Sell signals: {sell_signals}")

In [None]:
# Visualize strategy signals
if signals_timeline:
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

    # Price chart with signals
    ax1.plot(test_data.index, test_data['price'], linewidth=1, alpha=0.8, label='Price')

    # Collect marker positions per side. Buy and sell markers are drawn as
    # separate scatters so that each one gets its own, correctly coloured
    # legend entry (a single mixed-colour scatter can only carry one label).
    buy_points = []
    sell_points = []

    for signal in signals_timeline:
        # Find the row of test_data that produced this signal
        matching_rows = test_data[test_data['timestamp'] == signal['timestamp']]
        if matching_rows.empty:
            continue

        point = (matching_rows.index[0], signal['price'])
        # An event containing any 'bid' order counts as a buy, otherwise a sell
        if 'bid' in signal['order_types']:
            buy_points.append(point)
        else:
            sell_points.append(point)

    marker_groups = (
        (buy_points, 'green', 'Buy Signal'),
        (sell_points, 'red', 'Sell Signal'),
    )
    for points, colour, legend_label in marker_groups:
        if not points:
            continue  # no markers of this side, so no legend entry either
        xs, ys = zip(*points)
        ax1.scatter(xs, ys, c=colour, s=50, alpha=0.8, zorder=5, label=legend_label)

    ax1.set_title('Modular Strategy Signals')
    ax1.set_ylabel('Price')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Signal frequency over time
    signal_counts = [len(s['order_types']) for s in signals_timeline]
    ax2.bar(range(len(signals_timeline)), signal_counts, alpha=0.7, color='blue')
    ax2.set_title('Signal Generation Frequency')
    ax2.set_xlabel('Signal Event')
    ax2.set_ylabel('Number of Orders')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"\n📈 Strategy Performance:")
    print(f"   • Average orders per signal: {np.mean(signal_counts):.1f}")
    print(f"   • Signal generation rate: {len(signals_timeline)/len(test_data)*100:.1f}% of data points")

## 4. Strategy Performance Comparison

Let's compare the performance of different advanced strategies:

In [None]:
# Get comprehensive strategy information
print("📊 STRATEGY COMPARISON")
print("=" * 60)

# ML Strategy Info
ml_info = ml_strategy.get_strategy_info()
print(f"\n🤖 ML Strategy:")
print(f"   • Models trained: {ml_info['models_trained']}")
print(f"   • Feature count: {ml_info['feature_count']}")
print(f"   • Buffer size: {ml_info['buffer_size']}")
print(f"   • Confidence threshold: {ml_info['parameters']['confidence_threshold']}")

# Pairs Strategy Info
# NOTE(review): the ':.4f' format below needs a numeric hedge ratio — confirm
# get_strategy_info() never reports None for it before any data is processed.
pairs_info = pairs_strategy.get_strategy_info()
print(f"\n📈 Pairs Trading Strategy:")
print(f"   • Symbol pair: {pairs_info['symbol_pair']}")
print(f"   • Hedge ratio: {pairs_info['hedge_ratio']:.4f}")
print(f"   • Current spread: {pairs_info['current_spread']}")
print(f"   • Current z-score: {pairs_info['current_z_score']}")

# Modular Strategy Info
modular_info = modular_strategy.get_strategy_info()
print(f"\n🔧 Modular Strategy:")
print(f"   • Strategy name: {modular_info['strategy_config']['name']}")
print(f"   • Strategy type: {modular_info['strategy_config']['type']}")
print(f"   • Signals generated: {modular_info['performance']['signals_generated']}")
print(f"   • Orders created: {modular_info['performance']['orders_created']}")

## 5. Advanced Features Summary

The HFT simulator now includes powerful advanced features:

In [None]:
# Advanced features summary
# Each entry is (section heading, bullet points); printed in this order.
feature_sections = [
    ("🤖 Machine Learning Integration", [
        "Automated feature engineering from market microstructure",
        "Random Forest and Gradient Boosting models",
        "Real-time model retraining and adaptation",
        "Confidence-based signal filtering",
        "Cross-validation and performance tracking",
    ]),
    ("📈 Statistical Arbitrage", [
        "Pairs trading with cointegration analysis",
        "Multi-asset statistical arbitrage",
        "Dynamic hedge ratio calculation",
        "Z-score based entry/exit signals",
        "Mean reversion modeling",
    ]),
    ("🔧 Extensible Framework", [
        "Modular component architecture",
        "Plugin-based signal generators",
        "Configurable risk management",
        "Custom order management strategies",
        "YAML/JSON configuration support",
    ]),
    ("⚡ Performance Optimizations", [
        "Vectorized order book operations",
        "Parallel data processing",
        "Memory-efficient data structures",
        "Batch order processing",
        "Parquet format support",
    ]),
    ("🧪 Testing & Validation", [
        "Walk-forward analysis",
        "Cross-validation frameworks",
        "Performance benchmarking",
        "Component-level testing",
        "Strategy comparison tools",
    ]),
]

next_steps = [
    "Experiment with custom strategy components",
    "Implement your own ML features",
    "Test with real market data",
    "Optimize parameters for your use case",
    "Deploy strategies in paper trading environment",
]

print("🚀 ADVANCED FEATURES SUMMARY")
print("=" * 60)

# A blank line precedes every section heading
for heading, bullets in feature_sections:
    print(f"\n{heading}:")
    for bullet in bullets:
        print(f"   • {bullet}")

print("\n" + "=" * 60)
print("✅ Advanced features demonstration complete!")
print("\n💡 Next Steps:")
for step in next_steps:
    print(f"   • {step}")