### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO
import json
import re
import ast
import statsmodels.api as sm
import warnings
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import pacf
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
from pmdarima.arima.utils import ndiffs
import scipy.optimize as opt
from functools import partial

# Show every row and every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

### Pulling In Log Data

In [None]:
def parse_log_file(file_path):
    """Parse a backtest log file into a sandbox dictionary and a set of DataFrames.

    The file must contain three sections introduced by the literal headers
    ``Sandbox logs:``, ``Activities log:`` and ``Trade History:`` (in that order):

    * the sandbox section holds multi-line JSON objects, each with a
      ``lambdaLog`` string that is either plain JSON or JSON wrapped between
      ``LATEST\\n`` and ``\\nEND``;
    * the activities section is a ``;``-separated CSV table;
    * the trade-history section is a Python/JSON literal (a list of dicts).

    Parameters
    ----------
    file_path : str
        Path of the log file to read.

    Returns
    -------
    tuple
        ``(sandbox_dicts, market_trade_info_df, own_trade_info_df,
        trader_orders_df, trader_data_df, position_df, bid_df, ask_df,
        own_trades_df, market_trades_df, activities_df, trade_history_df)``.
        ``sandbox_dicts`` maps each state timestamp to its parsed log entry.

    Raises
    ------
    Exception
        ``"Error parsing log"`` when a ``lambdaLog`` payload is not valid JSON;
        the underlying error is attached as ``__cause__``.
    AttributeError
        When one of the three section headers is missing (``re.search``
        returns ``None``).
    """

    with open(file_path, 'r') as file:
        content = file.read()

    # Extract sections
    sandbox_section = re.search(r'Sandbox logs:(.*?)Activities log:', content, re.DOTALL).group(1).strip()
    activities_section = re.search(r'Activities log:(.*?)Trade History:', content, re.DOTALL).group(1).strip()
    trade_section = re.search(r'Trade History:(.*?)$', content, re.DOTALL).group(1).strip()

    # Function to parse multi-line JSON objects
    def extract_json_objects(json_string):
        # Split the string into lines
        lines = json_string.strip().split('\n')

        # Initialize variables
        json_objects = []
        current_object = ""
        inside_object = False

        for line in lines:
            if line.strip().startswith('{'):
                # A line starting with '{' opens a new object.
                inside_object = True
                current_object = line
            elif line.strip().endswith('}') and inside_object:
                # A line ending with '}' closes the object being accumulated.
                current_object += line
                json_objects.append(json.loads(current_object))
                current_object = ""
                inside_object = False
            elif inside_object:
                current_object += line

        return json_objects

    # Parse each section
    sandbox_logs = extract_json_objects(sandbox_section)
    activities_list = pd.read_csv(StringIO(activities_section), sep=';')
    trade_list = ast.literal_eval(trade_section)

    sandbox_dicts = {}

    own_trades_df = pd.DataFrame(columns=['symbol', 'price', 'quantity', 'buyer', 'seller', 'timestamp'])

    market_trades_df = pd.DataFrame(columns=['symbol', 'price', 'quantity', 'buyer', 'seller', 'timestamp'])

    bid_df = pd.DataFrame(columns=['symbol','bid_price', 'bid_volume','timestamp'])
    ask_df = pd.DataFrame(columns=['symbol','ask_price', 'ask_volume','timestamp'])

    position_df = pd.DataFrame(columns=['symbol', 'position', 'timestamp'])

    trader_orders_df = pd.DataFrame(columns=['timestamp', 'symbol', 'price', 'quantity', 'type'])
    trader_data_df = pd.DataFrame(columns=['timestamp', 'symbol', 'orderbook_theo', 'signal_theo', 'return', 'residual', 'expected_return'])

    # One entry per 100-unit step, from 0 through len(sandbox_logs) * 100 inclusive.
    timestamps  = list(range(0, 1 + (len(sandbox_logs) * 100), 100))

    market_trade_info_df = pd.DataFrame(columns=['timestamp', 'symbol', 'average_weighted_price', 'total_volume'])
    own_trade_info_df = pd.DataFrame(columns=['timestamp', 'symbol', 'average_weighted_price', 'total_volume'])

    for log in sandbox_logs:

        lambda_log = log['lambdaLog']
        try:
            if "LATEST\n" in lambda_log and "\nEND" in lambda_log:
                start_idx = lambda_log.find("LATEST\n") + len("LATEST\n")
                end_idx = lambda_log.find("\nEND")

                json_content = lambda_log[start_idx:end_idx]
                parsed = json.loads(json_content)

            else:
                parsed = json.loads(lambda_log)

        except (ValueError, TypeError) as err:
            # json.JSONDecodeError is a ValueError; TypeError covers a non-string payload.
            # Chain the cause so the offending payload error stays visible in the traceback.
            raise Exception("Error parsing log") from err


        sandbox_dicts[parsed['state']['timestamp']] = parsed

        parsed_own_trades_df = pd.DataFrame(parsed['state']["own_trades"], columns=['symbol', 'price', 'quantity', 'buyer', 'seller', 'timestamp'])

        # Keep only the trades stamped with the previous step (state timestamp - 100).
        parsed_own_trades_df = parsed_own_trades_df[parsed_own_trades_df['timestamp']  == parsed['state']['timestamp']- 100]

        if not parsed_own_trades_df.empty:
            own_trades_df = pd.concat([own_trades_df, parsed_own_trades_df], ignore_index=True)

        parsed_market_trades_df = pd.DataFrame(parsed['state']["market_trades"], columns=['symbol', 'price', 'quantity', 'buyer', 'seller', 'timestamp'])

        parsed_market_trades_df = parsed_market_trades_df[parsed_market_trades_df['timestamp']  == parsed['state']['timestamp']- 100]

        if not parsed_market_trades_df.empty:
            market_trades_df = pd.concat([market_trades_df, parsed_market_trades_df], ignore_index=True)

        for symbol, depth in parsed['state']['order_depths'].items():

            # depth[0] holds the bid side and depth[1] the ask side, each as {price: volume}.
            parsed_bid_df = pd.DataFrame(list(depth[0].items()), columns=['bid_price', 'bid_volume'])
            parsed_ask_df = pd.DataFrame(list(depth[1].items()), columns=['ask_price', 'ask_volume'])

            # Ask volumes are stored as absolute values.
            parsed_ask_df['ask_volume'] = np.abs(parsed_ask_df['ask_volume'])

            parsed_bid_df['symbol'] = symbol
            parsed_ask_df['symbol'] = symbol

            parsed_bid_df['timestamp'] = parsed['state']['timestamp']
            parsed_ask_df['timestamp'] = parsed['state']['timestamp']

            bid_df = pd.concat([bid_df, parsed_bid_df], ignore_index=True)
            ask_df = pd.concat([ask_df, parsed_ask_df], ignore_index=True)

        parsed_position_df = pd.DataFrame(list(parsed['state']['position'].items()), columns=['symbol', 'position'])

        parsed_position_df['timestamp'] = parsed['state']['timestamp']

        position_df = pd.concat([position_df, parsed_position_df], ignore_index=True)

        # trader_data is itself a JSON string embedded in the state.
        parsed_trader_data = json.loads(parsed['state']['trader_data'])

        orders_df = pd.DataFrame(columns = ['symbol', 'price', 'quantity', 'type', 'order_type','offset_asked'])

        symbol_list = list(parsed_trader_data['orderbook_theos'].keys())
        # Take the most recent ([-1]) value of each per-symbol series; expected_return is a scalar per symbol.
        info_df = pd.DataFrame({'symbol': symbol_list,
                                     'orderbook_theo': [parsed_trader_data['orderbook_theos'][symbol][-1] for symbol in symbol_list],
                                     'signal_theo': [parsed_trader_data['signal_theos'][symbol][-1] for symbol in symbol_list],
                                        'return': [parsed_trader_data['return'][symbol][-1] for symbol in symbol_list],
                                        'residual': [parsed_trader_data['residual'][symbol][-1] for symbol in symbol_list],
                                        'expected_return': [parsed_trader_data['expected_return'][symbol] for symbol in symbol_list]
                                        })

        info_df['timestamp'] = parsed['state']['timestamp']

        market_trade_info_df = pd.concat([market_trade_info_df, pd.DataFrame({'timestamp': parsed['state']['timestamp'],
                                             'symbol': list(parsed_trader_data['market_trades_data'].keys()),
                                             'average_weighted_price': [values['average_weighted_price'] for values in parsed_trader_data['market_trades_data'].values()],
                                             'total_volume': [values['total_volume'] for values in parsed_trader_data['market_trades_data'].values()],
                                        })], ignore_index=True)

        own_trade_info_df = pd.concat([own_trade_info_df, pd.DataFrame({'timestamp': parsed['state']['timestamp'],
                                        'symbol': list(parsed_trader_data['own_trades_data'].keys()),
                                        'average_weighted_price': [values['average_weighted_price'] for values in parsed_trader_data['own_trades_data'].values()],
                                        'total_volume': [values['total_volume'] for values in parsed_trader_data['own_trades_data'].values()],
                                })], ignore_index=True)


        for symbol in parsed_trader_data['maker_orders'].keys():
            maker_orders_df = pd.DataFrame(parsed_trader_data['maker_orders'][symbol], columns=['symbol', 'price', 'quantity','offset_asked']) if len(parsed_trader_data['maker_orders'][symbol]) > 0 else pd.DataFrame(columns=['symbol', 'price', 'quantity','offset_asked'])
            taker_orders_df = pd.DataFrame(parsed_trader_data['_taker_orders'][symbol], columns=['symbol', 'price', 'quantity']) if len(parsed_trader_data['_taker_orders'][symbol]) > 0 else pd.DataFrame(columns=['symbol', 'price', 'quantity'])

            maker_orders_df['type'] = 'maker'
            taker_orders_df['type'] = 'taker'
            taker_orders_df['offset_asked'] = np.nan

            # Skip empty frames so they do not take part in the concat.
            dfs_to_concat = [df for df in [orders_df, maker_orders_df, taker_orders_df] if not df.empty]
            if dfs_to_concat:
                orders_df = pd.concat(dfs_to_concat, ignore_index=True)


        orders_df['timestamp'] = parsed['state']['timestamp']

        if not info_df.empty:
            trader_data_df = pd.concat([trader_data_df, info_df], ignore_index=True)
        if not orders_df.empty:
            trader_orders_df = pd.concat([trader_orders_df, orders_df], ignore_index=True)

    # Align each row with the NEXT return of the same symbol. A per-group shift
    # keeps the 'symbol' column regardless of how groupby.apply treats grouping columns.
    trader_data_df['return'] = trader_data_df.groupby('symbol')['return'].shift(-1)
    trader_data_df = trader_data_df.sort_values('timestamp', kind='mergesort').reset_index(drop=True)

    # The last row of each symbol has no next return; use 0.
    trader_data_df['return'] = trader_data_df['return'].fillna(0)

    market_trades_df = market_trades_df.sort_values('timestamp')

    symbols = position_df['symbol'].unique()

    result_dfs = []

    # Expand positions onto the full timestamp grid, carrying the last known
    # position forward and treating "never seen" as flat (0).
    for symbol in symbols:
        symbol_data = position_df[position_df['symbol'] == symbol]

        symbol_all_times = pd.DataFrame({
                'symbol': symbol,
                'timestamp': timestamps
            })

        symbol_merged = pd.merge(symbol_all_times, symbol_data,
                                    on=['symbol', 'timestamp'], how='left')

        symbol_merged = symbol_merged.sort_values('timestamp')
        symbol_merged['position'] = symbol_merged['position'].ffill().fillna(0).astype('float64')

        result_dfs.append(symbol_merged)

    position_df = pd.concat(result_dfs, ignore_index=True)

    activities_df, trade_history_df = pd.DataFrame(activities_list), pd.DataFrame(trade_list)

    # Conversion for market_trade_info_df
    market_trade_info_df['timestamp'] = market_trade_info_df['timestamp'].astype(int)
    market_trade_info_df['symbol'] = market_trade_info_df['symbol'].astype(str)
    market_trade_info_df['average_weighted_price'] = market_trade_info_df['average_weighted_price'].astype(float)
    market_trade_info_df['total_volume'] = market_trade_info_df['total_volume'].astype(int)

    own_trade_info_df['timestamp'] = own_trade_info_df['timestamp'].astype(int)
    own_trade_info_df['symbol'] = own_trade_info_df['symbol'].astype(str)
    own_trade_info_df['average_weighted_price'] = own_trade_info_df['average_weighted_price'].astype(float)
    own_trade_info_df['total_volume'] = own_trade_info_df['total_volume'].astype(int)

    # Conversion for trader_orders_df
    trader_orders_df['timestamp'] = trader_orders_df['timestamp'].astype(int)
    trader_orders_df['symbol'] = trader_orders_df['symbol'].astype(str)
    trader_orders_df['price'] = trader_orders_df['price'].astype(int)
    trader_orders_df['quantity'] = trader_orders_df['quantity'].astype(int)
    trader_orders_df['type'] = trader_orders_df['type'].astype(str)
    # offset_asked is already float64

    # Conversion for trader_data_df
    trader_data_df['timestamp'] = trader_data_df['timestamp'].astype(int)
    trader_data_df['symbol'] = trader_data_df['symbol'].astype(str)
    trader_data_df['orderbook_theo'] = trader_data_df['orderbook_theo'].astype(float)
    trader_data_df['signal_theo'] = trader_data_df['signal_theo'].astype(float)

    # Conversion for position_df
    position_df['symbol'] = position_df['symbol'].astype(str)
    position_df['timestamp'] = position_df['timestamp'].astype(int)
    # position is already float64

    # Conversion for bid_df
    bid_df['symbol'] = bid_df['symbol'].astype(str)
    bid_df['bid_price'] = bid_df['bid_price'].astype(int)
    bid_df['bid_volume'] = bid_df['bid_volume'].astype(int)
    bid_df['timestamp'] = bid_df['timestamp'].astype(int)

    # Conversion for ask_df
    ask_df['symbol'] = ask_df['symbol'].astype(str)
    ask_df['ask_price'] = ask_df['ask_price'].astype(int)
    ask_df['ask_volume'] = ask_df['ask_volume'].astype(int)
    ask_df['timestamp'] = ask_df['timestamp'].astype(int)

    # Conversion for own_trades_df
    own_trades_df['symbol'] = own_trades_df['symbol'].astype(str)
    # price is already float64
    own_trades_df['quantity'] = own_trades_df['quantity'].astype(int)
    own_trades_df['buyer'] = own_trades_df['buyer'].astype(str)
    own_trades_df['seller'] = own_trades_df['seller'].astype(str)
    own_trades_df['timestamp'] = own_trades_df['timestamp'].astype(int)

    # Conversion for market_trades_df
    market_trades_df['symbol'] = market_trades_df['symbol'].astype(str)
    # price is already float64
    market_trades_df['quantity'] = market_trades_df['quantity'].astype(int)
    market_trades_df['buyer'] = market_trades_df['buyer'].astype(str)
    market_trades_df['seller'] = market_trades_df['seller'].astype(str)
    market_trades_df['timestamp'] = market_trades_df['timestamp'].astype(int)

    activities_df['product'] = activities_df['product'].astype(str)

    trade_history_df['buyer'] = trade_history_df['buyer'].astype(str)
    trade_history_df['seller'] = trade_history_df['seller'].astype(str)
    trade_history_df['symbol'] = trade_history_df['symbol'].astype(str)
    trade_history_df['currency'] = trade_history_df['currency'].astype(str)

    return sandbox_dicts, market_trade_info_df, own_trade_info_df, trader_orders_df, trader_data_df, position_df, bid_df, ask_df, own_trades_df, market_trades_df,  activities_df, trade_history_df


In [None]:
# Location of the backtest log to analyse.
log_file_path = "C://Users//marco//Downloads//6d902f9a-0384-4028-bd0a-5d7f80eb878c.log"

# Parse the log once and bind every returned frame to a notebook-level name.
(
    sandbox_dicts,
    market_trade_info_df,
    own_trade_info_df,
    trader_orders_df,
    trader_data_df,
    position_df,
    bid_df,
    ask_df,
    own_trades_df,
    market_trades_df,
    activities_df,
    trade_history_df,
) = parse_log_file(log_file_path)

### Description of All of the Dataframes

1.   market_trade_info_df: average weighted price, total_volume traded of market trades during timestamp

        We receive information about these trades in run() at timestamp t + 100

2.   own_trade_info_df: average weighted price, total_volume traded of own trades during timestamp

        We receive information about these trades in run() at timestamp t + 100

3.   trader_orders_df: timestamp, price, quantity, type of order (maker or taker), and offset demanded if maker

        Explaining Offset Demanded:
        - If the trader is a maker, the offset demanded is (Edge of Order Price to Theo) / (Std of Price Return)

4.   trader_data_df: timestamp, symbol, orderbook_theo, signal_theo, return, residual, expected_return

5.   position_df: current position at timestamp t

6.   bid_df: symbol, bid price, volume, timestamp

7.   ask_df: symbol, ask price, volume, timestamp

8.   own_trades_df: symbol, price, quantity, buyer, seller, timestamp

9.   market_trades_df: symbol, price, quantity, buyer, seller, timestamp

10.  activities_df: day, timestamp, product, Top 3 Order Book Levels, mid_price, profit_and_loss

11.  trade_history_df: market_trades_df + own_trades_df

### Sandbox Dictionary

1.   Parsed Dictionary From JSON


## Generating Theo = f(Orderbook, Market Trades, Own Trades)

Our goal is to generate a theo that is as close as possible to imc_theo_(t+1). We want every single trade we make to mark in as well as possible.



### Functions

In [None]:
def build_position_trajectory_with_pnl(own_trades: pd.DataFrame) -> pd.DataFrame:
    """Replay own trades and track position, cost basis and realized PnL per product.

    Parameters
    ----------
    own_trades : pd.DataFrame
        Trades with columns ``symbol``, ``price``, ``quantity``, ``buyer``,
        ``seller`` and ``timestamp``. A row counts as a buy when ``buyer`` is
        ``"SUBMISSION"``, as a sell when ``seller`` is ``"SUBMISSION"``, and is
        ignored otherwise.

    Returns
    -------
    pd.DataFrame
        One row per ``(product, timestamp)`` with columns ``position``,
        ``cost_basis``, ``realized_pnl`` and ``realized_pnl_cum``. Every product
        gets a seed row at timestamp 0 (position 0, other fields NaN). The state
        after a trade at time ``t`` is recorded at ``t + 100``; when several
        trades share a timestamp the groupby ``last()`` keeps the latest
        non-null value of each column.
    """
    # Stable sort: trades sharing a timestamp are replayed in their input order,
    # so cost basis / realized PnL are deterministic.
    trades_sorted = own_trades.sort_values(by="timestamp", kind="mergesort")

    pos_records = [{
            "timestamp": 0,
            "product": product,
            "position": 0,
            "cost_basis": np.nan,
            "realized_pnl": np.nan,
            "realized_pnl_cum": np.nan,
        } for product in own_trades['symbol'].unique()]

    # Running state by product
    state = {}

    for _, row in trades_sorted.iterrows():
        product = row["symbol"]
        t = row["timestamp"]
        qty = row["quantity"]
        px = row["price"]

        # Determine if we're buying or selling
        if row["buyer"] == "SUBMISSION":
            delta_pos = +qty  # We're buying
        elif row["seller"] == "SUBMISSION":
            delta_pos = -qty  # We're selling
        else:
            continue  # Not our trade

        # Initialize state for this product if not seen before
        if product not in state:
            state[product] = {
                "pos": 0.0,
                "cost_basis": 0.0,
                "realized_cum": 0.0
            }

        old_pos = state[product]["pos"]
        old_cb = state[product]["cost_basis"]
        old_realized = state[product]["realized_cum"]

        if delta_pos == 0:
            # Zero-quantity trade: nothing changes. Handling it explicitly prevents
            # falling through every branch and reusing values computed for the
            # previous row (which may belong to another product).
            new_pos = old_pos
            new_cb = old_cb
            trade_realized = 0.0
        elif abs(old_pos) < 1e-10:
            # Flat: this trade opens a new position at its own price.
            new_pos = delta_pos
            new_cb = px
            trade_realized = 0.0
        elif (old_pos > 0) == (delta_pos > 0):
            # Adding to the existing position: volume-weighted average cost basis.
            new_pos = old_pos + delta_pos
            new_cb = (old_pos * old_cb + delta_pos * px) / new_pos
            trade_realized = 0.0
        else:
            # Trading against the position: PnL is realized on the closed quantity only.
            closed_qty = min(abs(delta_pos), abs(old_pos))
            direction = 1 if old_pos > 0 else -1
            trade_realized = direction * closed_qty * (px - old_cb)

            new_pos = old_pos + delta_pos
            if abs(delta_pos) <= abs(old_pos):
                new_cb = old_cb  # Reducing: cost basis stays the same
            else:
                new_cb = px  # Flipping: the remainder is a new position at this price

        # Update cumulative realized PnL
        new_realized_cum = old_realized + trade_realized

        # Clean up tiny values due to floating point issues
        if abs(new_pos) < 1e-10:
            new_pos = 0.0
            new_cb = 0.0

        # Update state
        state[product]["pos"] = new_pos
        state[product]["cost_basis"] = new_cb
        state[product]["realized_cum"] = new_realized_cum

        # Record this position update (stamped one 100-unit step after the trade)
        pos_records.append({
            "timestamp": t + 100,
            "product": product,
            "position": new_pos,
            "cost_basis": new_cb,
            "realized_pnl": trade_realized,
            "realized_pnl_cum": new_realized_cum
        })

    position_df = pd.DataFrame(pos_records)

    # Collapse multiple updates at the same (product, timestamp) to the last one.
    position_df = (
        position_df
        .sort_values(["product", "timestamp"])
        .groupby(["product", "timestamp"], as_index=False)
        .last()
    )

    return position_df

In [None]:
def microprice_theo(symbol, timestamp, bid_df, ask_df, market_trade_info_df, own_trade_info_df, betas) -> float:
    """Volume-weighted microprice for one symbol at one timestamp.

    Parameters
    ----------
    symbol, timestamp
        Select the rows of every input frame (columns ``symbol``/``timestamp``).
    bid_df, ask_df : pd.DataFrame
        Order-book levels with ``bid_price``/``bid_volume`` and
        ``ask_price``/``ask_volume``. Levels are weighted in row order.
    market_trade_info_df, own_trade_info_df : pd.DataFrame
        Per-timestamp trade summaries with ``total_volume`` and
        ``average_weighted_price``. A missing row is treated as zero volume.
    betas : sequence of float (list, tuple or ndarray)
        ``betas[:-2]`` are the weights of book levels 2, 3, ... (level 1 always
        has weight 1); ``betas[-2]`` weights own-trade volume and ``betas[-1]``
        weights market-trade volume.

    Returns
    -------
    float
        The microprice, or NaN when either side of the book is missing/empty
        or the total weighted volume is (numerically) zero.

    Raises
    ------
    ValueError
        If ``betas`` has fewer than two entries.
    """

    def _rows_at(frame):
        # Rows of `frame` for the requested symbol and timestamp.
        return frame[(frame['symbol'] == symbol) & (frame['timestamp'] == timestamp)]

    def _trade_summary(frame):
        # (total_volume, average_weighted_price) of the first matching row, or zeros if none.
        rows = _rows_at(frame)
        if rows.empty:
            return 0.0, 0.0
        return rows['total_volume'].values[0], rows['average_weighted_price'].values[0]

    def _weighted_side(prices, volumes, weights):
        # (weighted average price, total weighted volume) of one book side.
        depth = min(len(prices), len(weights))
        prices = np.asarray(prices[:depth], dtype=float)
        volumes = np.asarray(volumes[:depth], dtype=float)
        usable = ~np.isnan(prices) & ~np.isnan(volumes) & (volumes > 0)
        weighted_volumes = volumes[usable] * weights[:depth][usable]
        total = weighted_volumes.sum()
        if not total > 0:
            return np.nan, 0.0
        return (prices[usable] * weighted_volumes).sum() / total, total

    # Normalise betas to a float array: list + tuple raises TypeError and
    # list + ndarray broadcasts an addition instead of concatenating.
    weights = np.asarray(betas, dtype=float).ravel()
    if weights.size < 2:
        raise ValueError("betas must contain at least the own-trade and market-trade weights")
    level_weights = np.concatenate(([1.0], weights[:-2]))
    own_trades_weight, market_trades_weight = weights[-2:]

    bid_rows, ask_rows = _rows_at(bid_df), _rows_at(ask_df)
    bid_volumes = bid_rows['bid_volume'].values
    ask_volumes = ask_rows['ask_volume'].values

    if len(bid_volumes) == 0 or len(ask_volumes) == 0 or max(bid_volumes) == 0 or max(ask_volumes) == 0:
        return np.nan

    weighted_bid, total_bid_vol = _weighted_side(bid_rows['bid_price'].values, bid_volumes, level_weights)
    weighted_ask, total_ask_vol = _weighted_side(ask_rows['ask_price'].values, ask_volumes, level_weights)

    # A side whose weighted volume is zero has no defined weighted price.
    if total_bid_vol <= 0 or total_ask_vol <= 0:
        return np.nan

    own_volume, own_average_price = _trade_summary(own_trade_info_df)
    market_volume, market_average_price = _trade_summary(market_trade_info_df)

    weighted_own_vol = own_volume * own_trades_weight
    weighted_market_vol = market_volume * market_trades_weight
    # Only multiply by the average price when there is volume, so an undefined
    # price at zero volume cannot contaminate the numerator.
    weighted_own_average_price = own_average_price * weighted_own_vol if weighted_own_vol > 0 else 0
    weighted_market_average_price = market_average_price * weighted_market_vol if weighted_market_vol > 0 else 0

    total_vol = total_bid_vol + total_ask_vol + weighted_own_vol + weighted_market_vol
    if total_vol < 1e-9:
        return np.nan

    # Cross-weighting: the ask price is weighted by bid volume and vice versa.
    microprice = (weighted_ask * total_bid_vol + weighted_bid * total_ask_vol + weighted_own_average_price + weighted_market_average_price) / total_vol

    return microprice

In [None]:
def compute_predicted_pnl(
    symbol: str,
    position_steps: pd.DataFrame,
    activities_df: pd.DataFrame,
    betas: tuple
) -> pd.DataFrame:
    """Join activities with position state and compare the implied and model theo.

    Parameters
    ----------
    symbol : str
        Symbol passed to ``microprice_theo`` for every row.
    position_steps : pd.DataFrame
        Position trajectory with columns ``product``, ``timestamp``,
        ``position``, ``cost_basis``, ``realized_pnl``, ``realized_pnl_cum``.
    activities_df : pd.DataFrame
        Activities with at least ``product``, ``timestamp`` and
        ``profit_and_loss``.
    betas : sequence of float
        Weights forwarded to ``microprice_theo``.

    Returns
    -------
    pd.DataFrame
        The merged frame with extra columns ``theo_price`` (model microprice),
        ``imc_theo`` (price implied by the reported PnL:
        ``cost_basis + (profit_and_loss - realized_pnl_cum) / position``) and
        ``error_theo`` (``imc_theo - theo_price``).

    Notes
    -----
    Reads the notebook-level frames ``bid_df``, ``ask_df``,
    ``market_trade_info_df`` and ``own_trade_info_df``.
    """
    sort_keys = ["product", "timestamp"]
    act_df = activities_df.sort_values(sort_keys, ascending=[True, True])
    position_steps = position_steps.sort_values(sort_keys, ascending=[True, True])

    merged = pd.merge(
        act_df,
        position_steps,
        on=sort_keys,
        how="left"
    )

    # Carry the last known state forward within each product only, so one
    # product's position never leaks into the first rows of the next product.
    state_cols = ["position", "cost_basis", "realized_pnl", "realized_pnl_cum"]
    merged[state_cols] = merged.groupby("product")[state_cols].ffill().fillna(0.0)

    # A list comprehension (rather than DataFrame.apply) also works on an empty frame.
    merged["theo_price"] = [
        microprice_theo(symbol, ts, bid_df, ask_df, market_trade_info_df, own_trade_info_df, betas)
        for ts in merged["timestamp"]
    ]
    merged["imc_theo"] = merged['cost_basis'] + ((merged['profit_and_loss'] - merged['realized_pnl_cum']) / merged['position'])

    # Flat positions divide by zero; treat the resulting +/-inf as missing.
    # Assign the result back: a chained inplace replace on a column selection
    # does not modify `merged` under pandas Copy-on-Write.
    merged['imc_theo'] = merged['imc_theo'].replace([np.inf, -np.inf], np.nan)

    merged['imc_theo'] = merged.groupby("product")['imc_theo'].ffill()

    merged["error_theo"] = merged["imc_theo"] - merged["theo_price"]

    return merged

In [None]:
def mse_pnl(position_steps, activities_df, own_trades, betas, verbose=False):
    """Mean squared error between the PnL-implied theo and the microprice theo.

    Parameters
    ----------
    position_steps : pd.DataFrame
        Position trajectory for one product (see ``compute_predicted_pnl``).
    activities_df : pd.DataFrame
        Activities for the same single product; its ``product`` column
        supplies the symbol passed to ``compute_predicted_pnl``.
    own_trades : pd.DataFrame
        Accepted for interface compatibility; not used by the computation.
    betas : sequence of float
        Microprice weights being evaluated.
    verbose : bool, default False
        Print ``betas`` on every evaluation.

    Returns
    -------
    tuple
        ``(df, mse)`` where ``df`` is the frame returned by
        ``compute_predicted_pnl`` and ``mse`` the mean of the squared non-NaN
        ``error_theo`` values.

    Raises
    ------
    ValueError
        If ``activities_df`` does not contain exactly one product.
    """
    products = pd.unique(activities_df['product'])
    if len(products) != 1:
        raise ValueError(
            f"mse_pnl expects activities for exactly one product, got {len(products)}"
        )

    # compute_predicted_pnl takes (symbol, position_steps, activities_df, betas).
    df = compute_predicted_pnl(products[0], position_steps, activities_df, betas)

    if verbose:
        print(betas)

    mse = np.mean(df["error_theo"].dropna() ** 2)

    return df, mse

In [None]:
def objective(betas, position_pnl_df, activities_df, own_trades_df, verbose=False):
    """Scalar objective for the optimiser: the MSE component of ``mse_pnl``.

    ``betas`` comes first so the function can be handed to an optimiser after
    the remaining arguments are bound; the frame returned by ``mse_pnl`` is
    discarded.
    """
    evaluation = mse_pnl(position_pnl_df, activities_df, own_trades_df, betas, verbose=verbose)
    return evaluation[1]

In [None]:
# Products we traded, and the sorted grid of activity timestamps.
symbols = own_trades_df['symbol'].unique()
timestamps = sorted(activities_df['timestamp'].unique())

# Position / cost basis / realized PnL after each own trade.
position_pnl_df = build_position_trajectory_with_pnl(own_trades_df)

# Expand every product onto the full timestamp grid; only `position` is
# forward-filled (flat = 0 before the first record), other columns stay sparse.
result_dfs = []
for symbol in symbols:
    symbol_all_times = pd.DataFrame({'product': symbol, 'timestamp': timestamps})
    symbol_data = position_pnl_df.loc[position_pnl_df['product'] == symbol]

    symbol_merged = (
        symbol_all_times
        .merge(symbol_data, on=['product', 'timestamp'], how='left')
        .sort_values('timestamp')
    )
    symbol_merged['position'] = (
        symbol_merged['position'].ffill().fillna(0).astype('float64')
    )

    result_dfs.append(symbol_merged)

position_df = pd.concat(result_dfs, ignore_index=True)

In [None]:
# Starting point and box constraints for the four microprice weights
# (two extra book-level weights, own-trade weight, market-trade weight).
init_guess = (1, 1, 1, 1)

bounds = [(0, 2)]*4

betas = {}
cleaned_theos = {}

for product in position_df['product'].unique():

    # MSE at the starting point, for comparison with the optimised value.
    init_mse = objective(init_guess,
                         position_pnl_df[position_pnl_df['product'] == product],
                         activities_df[activities_df['product'] == product],
                         own_trades_df[own_trades_df['symbol'] == product])

    # Bind the per-product data so the optimiser only varies `betas`.
    product_objective = partial(objective,
                               position_pnl_df = position_pnl_df[position_pnl_df['product'] == product],
                               activities_df = activities_df[activities_df['product'] == product],
                               own_trades_df = own_trades_df[own_trades_df['symbol'] == product],
                               verbose=False)


    result = opt.minimize(
        product_objective,
        x0=init_guess,
        method='SLSQP',
        bounds=bounds,
        options={"eps": 1e-4}
    )

    # Re-evaluate at the optimum to get the full per-timestamp frame.
    mse_df, mse_val = mse_pnl(
        position_df[position_df['product'] == product],
        activities_df[activities_df['product'] == product],
        own_trades_df[own_trades_df['symbol'] == product],
        result.x,
        verbose=False
    )

    cleaned_theos[product] = mse_df[['timestamp', 'theo_price','imc_theo']].copy()

    # Tag the rows with their product so the concatenated frame can be merged
    # on ['timestamp', 'symbol'] later in the notebook.
    cleaned_theos[product].loc[:, 'symbol'] = product

    cleaned_theos[product].loc[:, 'return_theo'] = cleaned_theos[product]['theo_price'].pct_change().shift(-1).fillna(0)
    cleaned_theos[product].loc[:, 'return_mtm'] = (cleaned_theos[product]['imc_theo'].shift(-1) / cleaned_theos[product]['theo_price']) - 1

    cleaned_theos[product].loc[:, 'next_imc_theo'] = cleaned_theos[product]['imc_theo'].shift(-1)

    print(f"Optimization result for product {product}:", result)
    print(f"Best betas for product {product}:", result.x)
    print(f"Initial MSE for product {product}:", init_mse)
    print(f"MSE for product {product}:", result.fun)
    betas[product] = result.x


cleaned_theos_df = pd.concat(cleaned_theos.values(), ignore_index=True)

## Signal Creation

### Objective Function

### HAR Regression Function

### ARIMA Regression Function


### Testing For P and Q

## Production Quoting Optimization

### Preprocessing

In [None]:
# Examining Quantity Filled

maker_orders = trader_orders_df[trader_orders_df['type'] == 'maker'].copy()

# Remember each maker order's row label in trader_orders_df so fills can be
# mapped back to the right row after merging (merge resets the index).
maker_orders['original_index'] = maker_orders.index

buy_trades = own_trades_df[own_trades_df['buyer'] == 'SUBMISSION'].copy()
sell_trades = own_trades_df[own_trades_df['seller'] == 'SUBMISSION'].copy()

# Buy orders have positive quantity, sell orders negative; match each side
# against our own trades at the same timestamp, price and symbol.
buy_merged = maker_orders[maker_orders['quantity'] > 0].merge(
    buy_trades,
    on=['timestamp', 'price', 'symbol'],
    how='left',
    suffixes=('', '_trade')
)

sell_merged = maker_orders[maker_orders['quantity'] < 0].merge(
    sell_trades,
    on=['timestamp', 'price', 'symbol'],
    how='left',
    suffixes=('', '_trade')
)

filled_orders = pd.concat([buy_merged, sell_merged], ignore_index=True)

# Total traded quantity per original order row. Summing handles an order matched
# by several trades; unmatched orders (NaN quantity_trade) sum to 0.
quantity_filled_by_order = filled_orders.groupby('original_index')['quantity_trade'].sum()

# Align on the original row labels; orders that are not maker orders get 0.
trader_orders_df['quantity_filled'] = quantity_filled_by_order.reindex(trader_orders_df.index, fill_value=0)

trader_orders_df['quantity_filled'] = trader_orders_df['quantity_filled'].fillna(0)

# Examining 1 Tick Markout

# Build the next-tick theo lookup from the per-product frames so that the
# 'symbol' merge key is always present.
next_theo_by_symbol = pd.concat(
    [theo_df[['timestamp', 'next_imc_theo']].assign(symbol=product) for product, theo_df in cleaned_theos.items()],
    ignore_index=True
)

trader_orders_df = trader_orders_df.merge(next_theo_by_symbol, on=['timestamp', 'symbol'], how='left').merge(trader_data_df[['timestamp', 'symbol', 'signal_theo']], on=['timestamp', 'symbol'], how='left')

# Work on a copy so the new columns are not written to a slice of trader_orders_df.
filled_orders = trader_orders_df[trader_orders_df['quantity_filled'] > 0].copy()

# Order side comes from the sign of the order quantity (positive = buy).
is_buy_order = filled_orders['quantity'] > 0

filled_orders['expected_edge_per_contract'] = np.where(is_buy_order, filled_orders['signal_theo'] - filled_orders['price'], filled_orders['price'] - filled_orders['signal_theo'])
filled_orders['retained_edge_per_contract'] = np.where(is_buy_order, filled_orders['next_imc_theo'] - filled_orders['price'], filled_orders['price'] - filled_orders['next_imc_theo'])

filled_orders['expected_edge'] = filled_orders['expected_edge_per_contract'] * filled_orders['quantity_filled']
filled_orders['retained_edge'] = filled_orders['retained_edge_per_contract'] * filled_orders['quantity_filled']

# Sell orders carry a negative quantity; use its magnitude so the ratio lies in [0, 1].
filled_orders['fill_percentage'] = filled_orders['quantity_filled'] / filled_orders['quantity'].abs()

filled_orders['offset_bucket'] = pd.cut(filled_orders['offset_asked'], bins=[-np.inf, 0, 0.5, 1, 1.5, 2, np.inf], labels=['-', '0-0.5', '0.5-1', '1-1.5', '1.5-2', '2+'])
filled_orders['fill_percentage_bucket'] = pd.cut(filled_orders['fill_percentage'], bins=[-np.inf, 0.3, 0.8, 1], labels=['0-0.3', '0.3-0.8', '0.8-1'])


### Edge Retained Statistics

For every product, we want to calculate the following statistics:
1.   Cash Edge, Percentage of Edge Retained Over 1 Tick
2.   Cash Edge, Percentage of Edge Retained By Offset Bucket [(0, 0.5), (0.5, 1), (1, 1.5), (1.5, 2), (2+)]
3.   Cash Edge, Percentage of Edge Retained Over Fill Percentage Bucket [(0, 0.3), (0.3, 0.8), (0.8, 1)] And Offset Bucket [(0, 1), (1, 2), (2+)]

In [None]:
# Cash edge (retained vs expected) per symbol over one tick
cash_edge_retained_summary = (
    filled_orders.groupby(['symbol'])[['retained_edge', 'expected_edge']].sum()
)
display(cash_edge_retained_summary)

# Share of the expected edge that was retained, per symbol
percentage_edge_retained_summary = (
    cash_edge_retained_summary['retained_edge'] / cash_edge_retained_summary['expected_edge']
).to_frame('percentage_retained')
display(percentage_edge_retained_summary)

# Same cash figures, split by offset bucket
cash_edge_retained_offset_summary = (
    filled_orders.groupby(['symbol', 'offset_bucket'])[['retained_edge', 'expected_edge']].sum()
)
display(cash_edge_retained_offset_summary)

# Retained share by offset bucket
percentage_edge_retained_offset_summary = (
    cash_edge_retained_offset_summary['retained_edge'] / cash_edge_retained_offset_summary['expected_edge']
).to_frame('percentage_retained')
display(percentage_edge_retained_offset_summary)

# Cash figures split by offset bucket and fill-percentage bucket
cash_edge_retained_offset_fill_summary = (
    filled_orders
    .groupby(['symbol', 'offset_bucket', 'fill_percentage_bucket'])[['retained_edge', 'expected_edge']]
    .sum()
)
display(cash_edge_retained_offset_fill_summary)

# Retained share by offset bucket and fill-percentage bucket
percentage_edge_retained_offset_fill_summary = (
    cash_edge_retained_offset_fill_summary['retained_edge'] / cash_edge_retained_offset_fill_summary['expected_edge']
).to_frame('percentage_retained')
display(percentage_edge_retained_offset_fill_summary)


### Autocorrelation of Trade Direction

Given dPos_maker(t), we want correlation of dPos_maker(t) with dPos_maker(t-1)

## Production Taking Optimization

### Taking Opportunities, How Many Do We Capture

In [None]:

# NOTE(review): rainforest_resin_df is not defined in the visible part of this
# notebook -- confirm where it is built before running this cell.
# Fraction of rows whose previous weighted mid is below the current best bid ...
print((rainforest_resin_df['weighted_midprice_level_1'].shift(1) < rainforest_resin_df['bid_price_1']).mean())
# ... and whose previous weighted mid is above the current best ask.
print((rainforest_resin_df['weighted_midprice_level_1'].shift(1) > rainforest_resin_df['ask_price_1']).mean())

### OOS Signal / Return Performance

In [None]:
# Regress the realised return on the expected return, one symbol at a time.
for symbol in trader_data_df['symbol'].unique():
    symbol_mask = trader_data_df['symbol'] == symbol
    # Cast explicitly: these columns are not converted to float when the log is
    # parsed, and OLS rejects object-dtype data.
    X = trader_data_df[symbol_mask][['expected_return']].astype(float)
    y = trader_data_df[symbol_mask]['return'].astype(float).fillna(0)
    # has_constant='add' always adds the intercept column, even when
    # expected_return happens to be constant for this symbol.
    signal_model = sm.OLS(y, sm.add_constant(X, has_constant='add')).fit()
    # Look the slope up by label; positional integer access on a labelled
    # Series is deprecated.
    beta = signal_model.params['expected_return']
    # With a single regressor and an intercept, |corr| = sqrt(R^2) and the sign
    # of the correlation is the sign of the slope.
    correlation = np.sign(beta) * np.sqrt(max(signal_model.rsquared, 0.0))
    print(f"Symbol: {symbol}, R-squared: {signal_model.rsquared}, Correlation: {correlation}, Beta: {beta}")

### Plotting Expected Return vs Return

In [None]:
# One scatter panel per symbol: expected return (x) against realised return (y).
n_symbols = len(trader_data_df['symbol'].unique())

fig, axes = plt.subplots(1, n_symbols, figsize=(5*n_symbols, 5), sharey=True)

# With a single panel plt.subplots returns a bare Axes; wrap it so indexing works.
if n_symbols == 1:
    axes = np.array([axes])

for i, symbol in enumerate(trader_data_df['symbol'].unique()):
    symbol_mask = trader_data_df['symbol'] == symbol
    expected = trader_data_df[symbol_mask]['expected_return']
    realised = trader_data_df[symbol_mask]['return']

    panel = axes[i]
    panel.scatter(expected, realised)

    # Red y = x reference line spanning the range of expected returns.
    min_val, max_val = min(expected), max(expected)
    panel.plot([min_val, max_val], [min_val, max_val], 'r-')

    panel.set_title(f'{symbol} Signal vs Return')
    panel.set_xlabel('Expected Return')
    panel.set_ylabel('Return')

plt.tight_layout()
plt.show()

### Edge Retention Analysis - Taking Orders

## Inventory Management


### Plotting Position

In [None]:
# Plot the position of every product over time (one panel per product).
# position_df is keyed by 'symbol' as returned by parse_log_file and by
# 'product' after it is rebuilt from own trades, so accept either column.
position_key = 'product' if 'product' in position_df.columns else 'symbol'
position_symbols = position_df[position_key].unique()

n_symbols = len(position_symbols)

fig, axes = plt.subplots(1, n_symbols, figsize=(5*n_symbols, 5), sharey=True)

# With a single panel plt.subplots returns a bare Axes; wrap it so indexing works.
if n_symbols == 1:
    axes = np.array([axes])

for i, symbol in enumerate(position_symbols):
    symbol_positions = position_df[position_df[position_key] == symbol].sort_values('timestamp')

    # A position holds until the next update, so draw it as a step function.
    axes[i].step(symbol_positions['timestamp'], symbol_positions['position'], where='post')
    # Zero line to make long/short periods easy to read.
    axes[i].axhline(0, color='r', linewidth=0.8)

    axes[i].set_title(f'{symbol} Position')
    axes[i].set_xlabel('Timestamp')
    axes[i].set_ylabel('Position')

plt.tight_layout()
plt.show()

### Plotting Position And PnL

## Random Code

In [None]:
import itables
# Use a dedicated alias: `opt` is already the notebook's alias for
# scipy.optimize, and rebinding it would break opt.minimize on a later re-run.
import itables.options as itables_opt
itables_opt.precision = 8
# Render own trades as an interactive table.
itables.show(own_trades_df)