In [63]:
#Python Libraries
from typing import List, Optional

#External libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from joblib import dump, load

In [39]:
# Input files, given relative to the notebook's working directory.
EXEC_PATH = "./assignment 3/executions.csv"
QUOTES_PATH = "./assignment 4/quotes_2025-09-10_small.csv.gz"

# Ticker subset used for the first round of experiments.
KEEP_TICKERS = [
    "AAPL",
    "MSFT",
    "NVDA",
    "BRK.B",
    "HXHX",
    "SFHG",
    "DIDIY",
    "INEO",
    "THH",
    "FMCC",
    "LKNCY",
    "IVFH",
]

# Session bounds as datetime.time objects; the loaders compare them against
# the time-of-day component of each timestamp (both ends inclusive).
MARKET_OPEN, MARKET_CLOSE = (
    pd.to_datetime(hhmm).time() for hhmm in ("09:30", "16:00")
)

In [40]:
def load_quotes(path: str, tickers: Optional[List[str]]) -> pd.DataFrame:
    """Load quotes from a gzip-compressed CSV, filtered to tickers and market hours.

    The file is streamed in chunks of 1,000,000 rows so that only rows
    surviving the filters are kept in memory.

    Parameters
    ----------
    path : str
        Location of the gzip-compressed quotes CSV.
    tickers : list of str or None
        Symbols to keep (matched against the ``ticker`` column). ``None``
        disables the ticker filter.

    Returns
    -------
    pd.DataFrame
        Surviving rows with ``sip_timestamp`` converted to datetime and a
        fresh 0..n-1 index. If nothing survives, an empty frame with the
        expected quote columns is returned.
    """
    dtypes = {
        "ticker":    "string",   # matches executions['Symbol']
        "bid_price": "float32",
        "ask_price": "float32",
        "bid_size":  "int32",
        "ask_size":  "int32",
        # bid_exchange / ask_exchange are left to pandas' type inference
    }

    chunk_size = 1_000_000
    kept: List[pd.DataFrame] = []

    # The context manager closes the underlying file handle even if a chunk
    # raises part-way through the file.
    with pd.read_csv(
        path,
        compression="gzip",
        dtype=dtypes,
        low_memory=False,
        chunksize=chunk_size,
    ) as reader:
        for chunk in reader:
            if tickers is not None:
                chunk = chunk[chunk["ticker"].isin(tickers)]

            if chunk.empty:
                continue

            # ns since epoch -> datetime64[ns]. assign() builds a new frame, so
            # we never write into a filtered slice (no SettingWithCopyWarning).
            # NOTE(review): epoch values decode to naive UTC wall-clock times;
            # confirm the feed is already in exchange-local time before relying
            # on the 09:30-16:00 filter below.
            chunk = chunk.assign(
                sip_timestamp=pd.to_datetime(
                    chunk["sip_timestamp"].astype("int64"),
                    unit="ns",
                    errors="coerce",
                )
            )

            # keep only market hours (both bounds inclusive)
            time_of_day = chunk["sip_timestamp"].dt.time
            chunk = chunk[(time_of_day >= MARKET_OPEN) & (time_of_day <= MARKET_CLOSE)]

            if not chunk.empty:
                kept.append(chunk)

    if kept:
        return pd.concat(kept, ignore_index=True)

    # Nothing matched: hand back an empty frame with the expected columns so
    # downstream column lookups still work.
    return pd.DataFrame(
        columns=[
            "ticker", "bid_price", "ask_price",
            "bid_size", "ask_size", "sip_timestamp",
            "bid_exchange", "ask_exchange",
        ]
    )

def load_executions(path: str, tickers: Optional[List[str]]) -> pd.DataFrame:
    """Load executions from CSV, filtered to tickers and market hours.

    Parameters
    ----------
    path : str
        Location of the executions CSV.
    tickers : list of str or None
        Symbols to keep (matched against the ``Symbol`` column). ``None``
        disables the ticker filter.

    Returns
    -------
    pd.DataFrame
        Rows whose ``OrderTransactTime`` time-of-day lies within
        [MARKET_OPEN, MARKET_CLOSE]. The original row index is preserved and
        the frame is an independent copy, so callers may modify it freely.
    """
    dtypes = {
        "ClOrdID": "string",
        "Symbol":  "string",   # matches quotes['ticker']
        "Side":    "Int8",     # nullable int; 1 = buy, else sell
        "OrderQty": "Int32",
        "LimitPrice": "float32",
        "AvgPx":      "float32",
        "LastMkt":    "string",  # exchange
    }

    # `infer_datetime_format` is deprecated since pandas 2.0 (it only emitted a
    # warning here); format inference is the parser's default behaviour.
    executions = pd.read_csv(
        path,
        dtype=dtypes,
        parse_dates=["OrderTransactTime", "ExecutionTransactTime"],
        low_memory=False,
    )

    if tickers is not None:
        executions = executions[executions["Symbol"].isin(tickers)]

    # keep only market hours based on order time (both bounds inclusive)
    order_time = executions["OrderTransactTime"].dt.time
    in_session = (order_time >= MARKET_OPEN) & (order_time <= MARKET_CLOSE)

    # .copy() detaches the result from the frame it was sliced from, so later
    # in-place operations by callers do not raise SettingWithCopyWarning.
    return executions[in_session].copy()

def attach_quotes(executions: pd.DataFrame, quotes: pd.DataFrame) -> pd.DataFrame:
    """
    Attach the most recent quote (per symbol) at or before OrderTransactTime.

    Neither input frame is modified: sorting is done on copies.

    Assumes:
    - executions['Symbol'] and quotes['ticker'] are strings
    - executions['OrderTransactTime'] and quotes['sip_timestamp'] are datetime64[ns]

    Returns a new frame with one row per execution, ordered by
    OrderTransactTime, carrying the matched quote columns. Executions with no
    earlier quote for their symbol get missing values in the quote columns.
    """

    # merge_asof requires both sides sorted by the ON key (time). Sorting
    # out-of-place avoids silently reordering the caller's frames.
    executions_sorted = executions.sort_values(["OrderTransactTime", "Symbol"])
    quotes_sorted = quotes.sort_values(["sip_timestamp", "ticker"])

    return pd.merge_asof(
        executions_sorted,
        quotes_sorted,
        left_on="OrderTransactTime",
        right_on="sip_timestamp",
        left_by="Symbol",
        right_by="ticker",
        direction="backward",       # last quote <= order time
        allow_exact_matches=True,
    )


In [41]:
# Load quotes for the experiment tickers only; the bare `quotes` expression
# renders the resulting frame as the cell output.
quotes = load_quotes(QUOTES_PATH, KEEP_TICKERS)
quotes

Unnamed: 0,ticker,bid_price,ask_price,bid_size,ask_size,bid_exchange,ask_exchange,sip_timestamp
0,AAPL,233.350006,233.419998,2,1,11,12,2025-09-10 09:30:02.629839703
1,AAPL,233.350006,233.419998,1,1,11,12,2025-09-10 09:30:04.025984936
2,AAPL,233.350006,233.419998,3,1,11,12,2025-09-10 09:32:24.670486058
3,AAPL,233.350006,233.419998,2,1,11,12,2025-09-10 09:32:45.433953660
4,AAPL,233.350006,233.389999,2,1,11,11,2025-09-10 09:32:52.772468663
...,...,...,...,...,...,...,...,...
2274176,THH,5.650000,5.860000,1,1,8,7,2025-09-10 15:11:16.937714642
2274177,THH,5.650000,5.870000,1,1,8,8,2025-09-10 15:11:16.937847764
2274178,THH,5.700000,5.870000,1,1,20,8,2025-09-10 15:14:11.544400331
2274179,THH,5.700000,5.840000,1,10,20,12,2025-09-10 15:25:15.993176985


In [46]:
# Load executions for the same ticker subset, restricted to market hours by
# order time; the bare `executions` expression renders the frame.
executions = load_executions(EXEC_PATH, tickers=KEEP_TICKERS)
executions

  executions = pd.read_csv(


Unnamed: 0,ClOrdID,ExecutionTransactTime,OrderTransactTime,Symbol,Side,OrderQty,LimitPrice,AvgPx,LastMkt
62367,ID87422,2025-09-10 09:32:32.518,2025-09-10 09:32:32.519,NVDA,1,,,,
62368,ID87422,2025-09-10 09:32:32.621,2025-09-10 09:32:32.621,NVDA,1,10,173.899994,0.0,ID1516
63002,ID88268,2025-09-10 09:34:25.865,2025-09-10 09:34:25.866,NVDA,1,,,,
63003,ID88268,2025-09-10 09:34:25.969,2025-09-10 09:34:25.968,NVDA,1,76,164.000000,0.0,ID1516
63169,ID88499,2025-09-10 09:35:01.374,2025-09-10 09:35:01.374,NVDA,2,3,177.000000,,
...,...,...,...,...,...,...,...,...,...
329323,ID452074,2025-09-10 15:59:53.136,2025-09-10 15:59:53.135,NVDA,2,5,177.919998,0.0,ID1516
329338,ID452096,2025-09-10 15:59:54.030,2025-09-10 15:59:54.032,NVDA,2,,,,
329339,ID452096,2025-09-10 15:59:54.139,2025-09-10 15:59:54.139,NVDA,2,10,180.000000,0.0,ID1516
329357,ID452120,2025-09-10 15:59:56.706,2025-09-10 15:59:56.707,NVDA,2,13,178.000000,,


In [47]:
# Join each execution to the latest quote for its symbol at or before the
# order time; this merged frame is the base for feature/target construction.
df_features = attach_quotes(executions, quotes)
df_features

Unnamed: 0,ClOrdID,ExecutionTransactTime,OrderTransactTime,Symbol,Side,OrderQty,LimitPrice,AvgPx,LastMkt,ticker,bid_price,ask_price,bid_size,ask_size,bid_exchange,ask_exchange,sip_timestamp
0,ID87422,2025-09-10 09:32:32.518,2025-09-10 09:32:32.519,NVDA,1,,,,,NVDA,174.059998,174.089996,3.0,7.0,8.0,11.0,2025-09-10 09:32:31.665844308
1,ID87422,2025-09-10 09:32:32.621,2025-09-10 09:32:32.621,NVDA,1,10,173.899994,0.0,ID1516,NVDA,174.059998,174.089996,3.0,7.0,8.0,11.0,2025-09-10 09:32:31.665844308
2,ID88268,2025-09-10 09:34:25.865,2025-09-10 09:34:25.866,NVDA,1,,,,,NVDA,174.029999,174.050003,3.0,4.0,11.0,8.0,2025-09-10 09:34:22.831229981
3,ID88268,2025-09-10 09:34:25.969,2025-09-10 09:34:25.968,NVDA,1,76,164.000000,0.0,ID1516,NVDA,174.029999,174.050003,3.0,4.0,11.0,8.0,2025-09-10 09:34:22.831229981
4,ID88499,2025-09-10 09:35:01.374,2025-09-10 09:35:01.374,NVDA,2,3,177.000000,,,NVDA,174.009995,174.020004,7.0,1.0,11.0,11.0,2025-09-10 09:34:59.944885716
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7897,ID452074,2025-09-10 15:59:53.136,2025-09-10 15:59:53.135,NVDA,2,5,177.919998,0.0,ID1516,NVDA,177.860001,177.869995,4.0,5.0,12.0,15.0,2025-09-10 15:59:53.126032685
7898,ID452096,2025-09-10 15:59:54.030,2025-09-10 15:59:54.032,NVDA,2,,,,,NVDA,177.860001,177.869995,4.0,2.0,12.0,8.0,2025-09-10 15:59:54.013277886
7899,ID452096,2025-09-10 15:59:54.139,2025-09-10 15:59:54.139,NVDA,2,10,180.000000,0.0,ID1516,NVDA,177.860001,177.869995,5.0,2.0,12.0,8.0,2025-09-10 15:59:54.107780459
7900,ID452120,2025-09-10 15:59:56.706,2025-09-10 15:59:56.707,NVDA,2,13,178.000000,,,NVDA,177.860001,177.869995,3.0,6.0,12.0,15.0,2025-09-10 15:59:56.399432112


In [53]:
# Row count per execution venue. groupby drops rows whose LastMkt is missing,
# so the counts only cover executions that report an exchange.
df_features.groupby("LastMkt").size()

LastMkt
ID1516      4218
ID1792       128
ID211917       6
ID282763      10
ID295386       4
ID29608      114
ID40869       28
ID412967       2
ID422100       1
dtype: int64

In [51]:
# Non-null count per column for exchange ID1792 (reveals executions that got
# no matching quote).
# NOTE(review): the saved output lists price_improvement and side_num, which
# are only created when add_price_improvement/prepare_data run in a later
# cell - this cell was executed out of order; re-run the notebook top to bottom.
df_features[df_features['LastMkt']=='ID1792'].count()

ClOrdID                  128
ExecutionTransactTime    128
OrderTransactTime        128
Symbol                   128
Side                     128
OrderQty                 128
LimitPrice               128
AvgPx                    128
LastMkt                  128
ticker                    95
bid_price                 95
ask_price                 95
bid_size                  95
ask_size                  95
bid_exchange              95
ask_exchange              95
sip_timestamp             95
price_improvement         95
side_num                 128
dtype: int64

In [56]:
# Seed shared by the train/test split and the random forest.
RANDOM_SEED = 42

# Model inputs, in the column order the per-exchange models are fitted on.
FEATURE_COLS = [
    # order attributes
    "side_num", "OrderQty", "LimitPrice",
    # quote prevailing at order time
    "bid_price", "ask_price", "bid_size", "ask_size",
]

# Regression target.
TARGET_COL = "price_improvement"

def add_price_improvement(df: pd.DataFrame) -> None:
    """
    Adds 'price_improvement' (float32) to df in place.

    - Side 1 = buy → reference is ask_price, improvement = ask_price - AvgPx
    - Side != 1 = sell → reference is bid_price, improvement = AvgPx - bid_price
    - Side missing → improvement is NaN (the direction is unknown)

    Positive = better than NBBO.

    Expects the columns Side, ask_price, bid_price and AvgPx. Returns None.
    """
    side = df["Side"]

    # Side may be a nullable integer column; comparing it yields <NA> for
    # missing sides, which np.where cannot evaluate. Build a plain boolean
    # mask and remember which rows had no side so they can be blanked out.
    is_buy = (side == 1).fillna(False).to_numpy(dtype=bool)
    side_known = side.notna().to_numpy()

    ref_price = np.where(is_buy, df["ask_price"], df["bid_price"])

    # NOTE(review): the sample data contains rows with AvgPx == 0.0; for those
    # the result is simply +/- the reference price. Confirm whether 0 marks
    # "no fill" and should be excluded before training.
    improvement = np.where(
        is_buy,
        ref_price - df["AvgPx"],   # buy: cheaper than ask is good
        df["AvgPx"] - ref_price,   # sell: higher than bid is good
    ).astype("float32")

    improvement[~side_known] = np.nan

    df["price_improvement"] = improvement

def prepare_data(df: pd.DataFrame) -> None:
    """Prepare merged executions+quotes dataframe for model training.

    Works in place on ``df`` and returns None.

    - Encode side into side_num: +1 for buy (Side == 1), -1 for sell.
    - Drop rows that have NaN in any feature column or in price_improvement.
    - Drop rows whose Side is missing, since side_num cannot be derived for
      them.
    """
    # Side may be a nullable integer column; a missing side compares to <NA>,
    # which np.where cannot evaluate. Use a plain boolean mask here and let
    # dropna below remove the rows whose side is unknown.
    is_buy = (df["Side"] == 1).fillna(False).to_numpy(dtype=bool)

    # encode side in place
    df["side_num"] = np.where(is_buy, 1, -1).astype("int8")

    # drop rows with missing values in any feature, the target, or Side
    cols_needed = FEATURE_COLS + [TARGET_COL, "Side"]
    df.dropna(subset=cols_needed, inplace=True)

In [64]:
def train_models_per_exchange(df: pd.DataFrame, min_samples: int = 50) -> tuple[dict, dict]:
    """Train one regression model per exchange and report R² and MSE.

    For every exchange (``LastMkt`` value) with at least ``min_samples`` rows,
    the rows are split 80/20 into train and test sets, a scaler + random
    forest pipeline is tuned with a 3-fold grid search (scored by R²) on the
    training part, and the best pipeline is evaluated on the test part.

    Assumes `prepare_data(df)` has already been called so that:
    - side_num exists
    - rows with NaN in features/target have been dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Prepared frame containing LastMkt, the feature columns and the target.
    min_samples : int, default 50
        Exchanges with fewer rows than this are skipped.

    Returns
    -------
    (models, metrics)
        ``models`` maps exchange -> fitted best pipeline; ``metrics`` maps
        exchange -> {"r2": ..., "mse": ...} measured on the held-out test set.
    """
    models: dict = {}
    metrics: dict = {}

    # The search grid is the same for every exchange, so build it once.
    param_grid = {
        "model__n_estimators": [50, 100],
        "model__max_depth": [None, 10],
    }

    for exch, d in df.groupby("LastMkt"):
        # skip exchanges with too little data
        if len(d) < min_samples:
            continue

        X = d[FEATURE_COLS].astype("float32")
        y = d[TARGET_COL].astype("float32")

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=RANDOM_SEED
        )

        pipe = Pipeline(
            steps=[
                ("scaler", StandardScaler()),
                ("model", RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=-1)),
            ]
        )

        grid = GridSearchCV(
            pipe,
            param_grid=param_grid,
            cv=3,
            n_jobs=-1,
            scoring="r2",
        )
        grid.fit(X_train, y_train)

        # refit on the whole training set with the best parameters
        best_model = grid.best_estimator_

        # R² on test set
        r2 = best_model.score(X_test, y_test)
        # MSE on test set
        y_pred = best_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        models[exch] = best_model
        metrics[exch] = {"r2": r2, "mse": mse}

        print(f"Exchange {exch}: R² = {r2:.3f}, MSE = {mse:.6f} (n={len(d)})")

    return models, metrics

In [None]:
# Build the target and the side encoding directly on df_features (both helpers
# modify it in place), fit one model per exchange, and persist the dict of
# fitted pipelines with joblib.
add_price_improvement(df_features)
prepare_data(df_features)
models, metrics = train_models_per_exchange(df_features)
dump(models, "per_exchange_price_improvement_models.joblib")

Exchange ID1516: R² = 0.595, MSE = 12132.001042 (n=4049)
Exchange ID1792: R² = 0.998, MSE = 134.442780 (n=95)
Exchange ID29608: R² = 0.902, MSE = 0.004217 (n=114)


['per_exchange_price_improvement_models.joblib']

In [None]:
# Run the router's unit tests with pytest.
!pytest ./test_somewhat_smart_order_router.py

platform win32 -- Python 3.13.7, pytest-8.4.2, pluggy-1.6.0
rootdir: c:\Users\gonza\Documents\UChicago\Quarters\Fall I\FINM 32400 - Python for Financial Data Science\assignment4_order_router
plugins: anyio-4.11.0
collected 2 items

test_somewhat_smart_order_router.py [32m.[0m[32m.[0m[32m                                   [100%][0m



In [None]:
# Preview black's formatting changes as a coloured diff; --diff does not write the file.
!black ./somewhat_smart_order_router.py --diff --color

[1m--- somewhat_smart_order_router.py	2025-11-18 06:03:59.135551+00:00[0m
[1m+++ somewhat_smart_order_router.py	2025-11-18 17:03:12.884595+00:00[0m
[36m@@ -35,18 +35,18 @@[0m
         _models = load(MODELS_PATH)
     return _models
 
 
 def best_price_improvement(
[31m-        symbol:         str,[0m
[31m-        side:           str,[0m
[31m-        quantity:       int,[0m
[31m-        limit_price:    float,[0m
[31m-        bid_price:      float,[0m
[31m-        ask_price:      float,[0m
[31m-        bid_size:       int,[0m
[31m-        ask_size:       int,[0m
[32m+    symbol: str,[0m
[32m+    side: str,[0m
[32m+    quantity: int,[0m
[32m+    limit_price: float,[0m
[32m+    bid_price: float,[0m
[32m+    ask_price: float,[0m
[32m+    bid_size: int,[0m
[32m+    ask_size: int,[0m
 ) -> str:
     """Return the exchange with the highest predicted price improvement.
 
     The features passed here must match those used when training the models
 
[36m@@ 

would reformat somewhat_smart_order_router.py

All done! ✨ 🍰 ✨
1 file would be reformatted.


In [None]:
# Lint the router module with pylint.
!pylint ./somewhat_smart_order_router.py

************* Module somewhat_smart_order_router
somewhat_smart_order_router.py:32:4: W0603: Using the global statement (global-statement)
somewhat_smart_order_router.py:39:0: R0913: Too many arguments (8/5) (too-many-arguments)
somewhat_smart_order_router.py:39:0: R0917: Too many positional arguments (8/5) (too-many-positional-arguments)
somewhat_smart_order_router.py:39:0: R0914: Too many local variables (16/15) (too-many-locals)

------------------------------------------------------------------
Your code has been rated at 8.71/10 (previous run: 8.71/10, +0.00)



In [None]:
# Lint the router module with ruff.
!ruff check ./somewhat_smart_order_router.py

All checks passed!
