In [32]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta
from itertools import product

class SmartOrderRouter:
    """Heuristic splitter of a parent order into market and limit child orders.

    The constructor stores the three tuning parameters of the Cont & Kukanov
    cost model plus three fixed cost constants. The simplified ``allocate``
    below does not read any of them yet, so in this version they have no
    effect on the resulting allocation.
    """

    def __init__(self, lambda_over, lambda_under, theta_queue):
        """Store the tuning parameters and the fixed cost constants.

        Args:
            lambda_over: Stored as ``self.lambda_over``.
            lambda_under: Stored as ``self.lambda_under``.
            theta_queue: Stored as ``self.theta_queue``.
        """
        self.lambda_over = lambda_over
        self.lambda_under = lambda_under
        self.theta_queue = theta_queue
        self.h = 0.005  # Half of bid-ask spread
        self.f = 0.003  # Market order fee
        self.r = 0.002  # Limit order rebate (simplified)

    def allocate(self, venues, remaining_qty):
        """Split ``remaining_qty`` into one market slice and equal limit slices.

        This is a simplified stand-in for the Cont & Kukanov optimal
        allocation: orders below 1000 shares send 10% to market orders,
        larger orders send 40%; the rest is divided evenly across venues.

        Args:
            venues: Sequence of venue records; only its length is used.
            remaining_qty: Quantity still to be executed.

        Returns:
            Tuple ``(market_qty, limit_qtys)`` where ``limit_qtys`` has one
            entry per venue. With no venues the result is ``(0.0, [])``.
        """
        venue_count = len(venues)
        if venue_count == 0:
            # Nothing to route to; avoids dividing by zero below.
            return 0.0, []

        if remaining_qty < 1000:
            # Small orders - mostly limit orders
            market_qty = max(0, remaining_qty * 0.1)
        else:
            # Large orders - more market orders
            market_qty = remaining_qty * 0.4

        per_venue_limit = (remaining_qty - market_qty) / venue_count
        limit_qtys = [per_venue_limit] * venue_count

        return market_qty, limit_qtys



In [33]:
def load_and_preprocess_data(filepath):
    """Read the quote CSV and collapse it to one row per (ts_event, publisher_id).

    Rows are ordered by ``ts_event`` then ``publisher_id`` and reduced with
    pandas' ``GroupBy.first``, which by default takes the first non-null
    value of each column within a group. A ``timestamp`` column is then
    derived from ``ts_event`` interpreted as nanoseconds since the epoch.

    Args:
        filepath: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        DataFrame with the key columns first, the remaining columns after
        them, and the added ``timestamp`` column.
    """
    key_columns = ['ts_event', 'publisher_id']

    raw = pd.read_csv(filepath)
    ordered = raw.sort_values(key_columns)

    # One record per publisher per event time.
    snapshots = ordered.groupby(key_columns).first().reset_index()

    # Nanosecond epoch -> pandas datetime
    snapshots['timestamp'] = pd.to_datetime(snapshots['ts_event'], unit='ns')
    return snapshots

In [34]:
def run_backtest(df, params, total_shares=5000):
    """Replay the quote snapshots and simulate routing ``total_shares``.

    For every ``ts_event`` group, the router proposes a market quantity and
    one limit quantity per venue. Market quantity is filled against venues
    in ascending ask-price order, capped by each venue's displayed size.
    Limit quantities are filled with a simplified probability
    ``size / (queue + limit_qty)`` (capped at 1).

    Simplifications visible in this code: limit fills are priced at the
    venue's ask price, and they are not netted against shares already taken
    from the same venue by market orders in the same snapshot.

    Args:
        df: DataFrame with columns ``ts_event``, ``publisher_id``,
            ``ask_px_00`` and ``ask_sz_00``.
        params: Keyword arguments passed to ``SmartOrderRouter``.
        total_shares: Parent order size to execute.

    Returns:
        Dict with ``total_cost``, ``avg_price`` (0 when nothing filled),
        ``filled_shares``, ``remaining_qty`` and ``execution_log`` (a list
        of per-fill dicts).
    """
    router = SmartOrderRouter(**params)
    remaining_qty = total_shares
    total_cost = 0
    filled_shares = 0
    execution_log = []

    for ts, snapshot in df.groupby('ts_event'):
        if remaining_qty <= 0:
            break

        # Iterate the three needed columns directly instead of iterrows().
        quotes = zip(
            snapshot['publisher_id'],
            snapshot['ask_px_00'],
            snapshot['ask_sz_00'],
        )
        venues = []
        for publisher_id, price, size in quotes:
            # A missing price would turn total_cost into NaN, and a missing
            # size would let min() return the full order quantity, so such
            # quotes are not usable as venues.
            if pd.isna(price) or pd.isna(size):
                continue
            venues.append({
                'publisher_id': publisher_id,
                'price': price,
                'size': size,
                'queue': size  # Simplified queue position
            })

        if not venues:
            continue

        # Get allocation
        market_qty, limit_qtys = router.allocate(venues, remaining_qty)

        # Execute market orders (take liquidity)
        if market_qty > 0:
            # Sort venues by best price
            venues_sorted = sorted(venues, key=lambda x: x['price'])
            for venue in venues_sorted:
                if market_qty <= 0:
                    break
                fill = min(market_qty, venue['size'])
                if fill <= 0:
                    # Venue shows no size; do not log an empty execution.
                    continue
                cost = fill * venue['price']
                total_cost += cost
                filled_shares += fill
                remaining_qty -= fill
                market_qty -= fill
                execution_log.append({
                    'ts': ts,
                    'type': 'market',
                    'venue': venue['publisher_id'],
                    'price': venue['price'],
                    'shares': fill,
                    'cost': cost
                })

        # Execute limit orders (provide liquidity)
        for i, venue in enumerate(venues):
            if i >= len(limit_qtys) or limit_qtys[i] <= 0:
                continue

            # Simplified fill probability - would use actual queue dynamics in practice
            depth = venue['queue'] + limit_qtys[i]
            fill_prob = min(1, venue['size'] / depth) if depth > 0 else 0
            fill = min(limit_qtys[i] * fill_prob, venue['size'])

            if fill > 0:
                cost = fill * venue['price']
                total_cost += cost
                filled_shares += fill
                remaining_qty -= fill
                execution_log.append({
                    'ts': ts,
                    'type': 'limit',
                    'venue': venue['publisher_id'],
                    'price': venue['price'],
                    'shares': fill,
                    'cost': cost
                })

    avg_price = total_cost / filled_shares if filled_shares > 0 else 0
    return {
        'total_cost': total_cost,
        'avg_price': avg_price,
        'filled_shares': filled_shares,
        'remaining_qty': remaining_qty,
        'execution_log': execution_log
    }

In [35]:
def run_baselines(df, total_shares=5000):
    """Compute three reference execution costs for ``total_shares``.

    * naive: buy everything at the lowest ask of the first timestamp.
    * twap: split the order equally over the 60-second intervals that
      contain at least one quote, paying each interval's mean ask.
    * vwap: pay the displayed-size-weighted mean ask over the whole frame.

    Args:
        df: DataFrame with columns ``ts_event``, ``timestamp``,
            ``ask_px_00`` and ``ask_sz_00``. ``ts_event`` is read as
            nanoseconds since the epoch; the first and last unique values
            are taken as the start and end of the period.
        total_shares: Order size used for every baseline.

    Returns:
        Dict keyed by ``naive``, ``twap`` and ``vwap``; each value holds
        ``total_cost``, ``avg_price`` and ``filled_shares``.
    """
    # Get all unique timestamps
    timestamps = df['ts_event'].unique()

    # Naive best ask - take all shares at first best price
    if len(timestamps) > 0:
        first_snapshot = df[df['ts_event'] == timestamps[0]]
        best_venue = first_snapshot.loc[first_snapshot['ask_px_00'].idxmin()]
        naive_cost = total_shares * best_venue['ask_px_00']
        naive_avg = best_venue['ask_px_00']
    else:
        naive_cost = 0
        naive_avg = 0

    # TWAP - split equally over 60-second intervals
    twap_cost = 0
    twap_avg = 0
    if len(timestamps) > 0:
        start_time = pd.to_datetime(timestamps[0], unit='ns')
        end_time = pd.to_datetime(timestamps[-1], unit='ns')
        duration = (end_time - start_time).total_seconds()
        # Whole 60-second windows only; a trailing partial window is not
        # included (at least one window is always used).
        intervals = max(1, int(duration / 60))

        # Mean ask of every interval that actually has quotes.
        interval_prices = []
        for i in range(intervals):
            interval_start = start_time + timedelta(seconds=60*i)
            interval_end = interval_start + timedelta(seconds=60)

            in_window = (
                (df['timestamp'] >= interval_start) &
                (df['timestamp'] < interval_end)
            )
            prices = df.loc[in_window, 'ask_px_00'].dropna()
            if not prices.empty:
                interval_prices.append(prices.mean())

        if interval_prices:
            # Split the order over the intervals that can be priced so the
            # cost always covers all of total_shares; dividing a partial
            # cost by total_shares would understate the average price.
            shares_per_interval = total_shares / len(interval_prices)
            for price in interval_prices:
                twap_cost += shares_per_interval * price
            twap_avg = twap_cost / total_shares if total_shares > 0 else 0

    # VWAP - weight by displayed size
    if len(df) > 0:
        total_size = df['ask_sz_00'].sum()
        if total_size > 0:
            vwap_price = (df['ask_px_00'] * df['ask_sz_00']).sum() / total_size
            vwap_cost = total_shares * vwap_price
        else:
            vwap_cost = 0
            vwap_price = 0
    else:
        vwap_cost = 0
        vwap_price = 0

    return {
        'naive': {
            'total_cost': naive_cost,
            'avg_price': naive_avg,
            'filled_shares': total_shares if naive_cost > 0 else 0
        },
        'twap': {
            'total_cost': twap_cost,
            'avg_price': twap_avg,
            'filled_shares': total_shares if twap_cost > 0 else 0
        },
        'vwap': {
            'total_cost': vwap_cost,
            'avg_price': vwap_price,
            'filled_shares': total_shares if vwap_cost > 0 else 0
        }
    }

In [36]:
def parameter_search(df, param_grid, total_shares=5000):
    """Grid-search router parameters and keep the cheapest backtest.

    Every combination of ``lambda_over``, ``lambda_under`` and
    ``theta_queue`` from ``param_grid`` is backtested. Runs that fill
    nothing are ignored. The remaining runs are ranked by average fill
    price rather than by total cost, because total cost is lower for any
    run that simply fills fewer shares.

    Args:
        df: Quote DataFrame passed through to ``run_backtest``.
        param_grid: Dict with iterables under the keys ``lambda_over``,
            ``lambda_under`` and ``theta_queue``.
        total_shares: Parent order size passed through to ``run_backtest``.

    Returns:
        Tuple ``(best_params, best_result)``; both are ``None`` when no
        combination produced any fill.
    """
    best_result = None
    best_avg_price = float('inf')
    best_params = None

    # Generate all parameter combinations
    param_combinations = product(
        param_grid['lambda_over'],
        param_grid['lambda_under'],
        param_grid['theta_queue']
    )

    for lo, lu, tq in param_combinations:
        params = {
            'lambda_over': lo,
            'lambda_under': lu,
            'theta_queue': tq
        }

        result = run_backtest(df, params, total_shares)

        # Per-share price keeps runs with different fill sizes comparable.
        if result['filled_shares'] > 0 and result['avg_price'] < best_avg_price:
            best_avg_price = result['avg_price']
            best_result = result
            best_params = params

    return best_params, best_result

def calculate_savings(router_result, baseline_results):
    """Express the router's price advantage over each baseline in basis points.

    For every baseline the value is
    ``(baseline_avg - router_avg) / router_avg * 10000``; it is ``0.0``
    when the router's average price is zero.

    Args:
        router_result: Dict containing ``avg_price``.
        baseline_results: Dict with ``naive``, ``twap`` and ``vwap``
            entries, each containing ``avg_price``.

    Returns:
        Dict with the keys ``vs_naive``, ``vs_twap`` and ``vs_vwap``.
    """
    labelled_baselines = (
        ('vs_naive', 'naive'),
        ('vs_twap', 'twap'),
        ('vs_vwap', 'vwap'),
    )

    savings = {}
    for label, baseline_name in labelled_baselines:
        reference_price = baseline_results[baseline_name]['avg_price']
        router_price = router_result['avg_price']
        if router_price == 0:
            savings[label] = 0.0
        else:
            savings[label] = ((reference_price - router_price) / router_price) * 10000

    return savings

def main(filepath='l1_day.csv'):
    """Run the full pipeline and print the results as JSON.

    Loads the quote data, grid-searches the router parameters, computes the
    baseline strategies and the savings against them, then prints a single
    JSON document to stdout.

    Args:
        filepath: CSV file with the quote data; defaults to ``l1_day.csv``.

    Raises:
        RuntimeError: If no parameter combination filled any shares, so
            there is no router result to report.
    """
    # Load and preprocess data
    df = load_and_preprocess_data(filepath)

    # Define parameter search grid
    param_grid = {
        'lambda_over': [0.0005, 0.001, 0.002],
        'lambda_under': [0.0005, 0.001, 0.002],
        'theta_queue': [0.00005, 0.0001, 0.0002]
    }

    # Run parameter search
    best_params, best_result = parameter_search(df, param_grid)
    if best_result is None:
        # Without this check the code below fails with an opaque
        # "'NoneType' object is not subscriptable".
        raise RuntimeError(
            "Parameter search produced no fills; cannot report router results"
        )

    # Run baselines
    baseline_results = run_baselines(df)

    # Calculate savings
    savings = calculate_savings(best_result, baseline_results)

    # Prepare output JSON (the execution log is deliberately left out)
    output = {
        'best_parameters': best_params,
        'router_results': {
            'total_cost': best_result['total_cost'],
            'avg_price': best_result['avg_price'],
            'filled_shares': best_result['filled_shares']
        },
        'baselines': {
            name: {
                'total_cost': baseline_results[name]['total_cost'],
                'avg_price': baseline_results[name]['avg_price']
            }
            for name in ('naive', 'twap', 'vwap')
        },
        'savings_bps': savings
    }

    # Print JSON output
    print(json.dumps(output, indent=2))

# Run the whole pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()

{
  "best_parameters": {
    "lambda_over": 0.0005,
    "lambda_under": 0.0005,
    "theta_queue": 5e-05
  },
  "router_results": {
    "total_cost": 1114104.4787570855,
    "avg_price": 222.82089575141703,
    "filled_shares": 5000.000000000002
  },
  "baselines": {
    "naive": {
      "total_cost": 1114150.0,
      "avg_price": 222.83
    },
    "twap": {
      "total_cost": 1115343.5786612884,
      "avg_price": 223.0687157322577
    },
    "vwap": {
      "total_cost": 1115319.1590477992,
      "avg_price": 223.06383180955982
    }
  },
  "savings_bps": {
    "vs_naive": 0.4085904309954322,
    "vs_twap": 11.121936297982234,
    "vs_vwap": 10.90275027050481
  }
}
