In [1]:
# Cell 1: Imports & Data Load

import pandas as pd

# Load the first 25 000 rows
df = pd.read_csv('../files/first_25000_rows.csv')
df.head()


Unnamed: 0,ts_recv,ts_event,rtype,publisher_id,instrument_id,action,side,depth,price,size,...,ask_sz_08,bid_ct_08,ask_ct_08,bid_px_09,ask_px_09,bid_sz_09,ask_sz_09,bid_ct_09,ask_ct_09,symbol
0,2024-10-21T11:54:29.221230963Z,2024-10-21T11:54:29.221064336Z,10,2,38,C,B,1,233.62,2,...,155,1,7,233.25,234.13,55,400,2,1,AAPL
1,2024-10-21T11:54:29.223936626Z,2024-10-21T11:54:29.223769812Z,10,2,38,A,B,0,233.67,2,...,155,1,7,233.25,234.13,55,400,2,1,AAPL
2,2024-10-21T11:54:29.225196809Z,2024-10-21T11:54:29.225030400Z,10,2,38,A,B,0,233.67,3,...,155,1,7,233.25,234.13,55,400,2,1,AAPL
3,2024-10-21T11:54:29.712600612Z,2024-10-21T11:54:29.712434212Z,10,2,38,A,B,2,233.52,200,...,155,1,7,233.25,234.13,55,400,2,1,AAPL
4,2024-10-21T11:54:29.764839221Z,2024-10-21T11:54:29.764673165Z,10,2,38,C,B,2,233.52,200,...,155,1,7,233.25,234.13,55,400,2,1,AAPL


In [2]:
# Cell 2: Parse & Sort

# Convert event timestamp to datetime and sort
df['ts_event'] = pd.to_datetime(df['ts_event'])
df = df.sort_values(['symbol','ts_event']).reset_index(drop=True)
df[['symbol','ts_event']].head()


Unnamed: 0,symbol,ts_event
0,AAPL,2024-10-21 11:54:29.221064336+00:00
1,AAPL,2024-10-21 11:54:29.223769812+00:00
2,AAPL,2024-10-21 11:54:29.225030400+00:00
3,AAPL,2024-10-21 11:54:29.712434212+00:00
4,AAPL,2024-10-21 11:54:29.764673165+00:00


In [3]:
# Cell 3: Compute Best-Level OFI per Event

# Shift prior best bid/ask quotes
df['prev_bid_px_00'] = df['bid_px_00'].shift(1)
df['prev_bid_sz_00'] = df['bid_sz_00'].shift(1)
df['prev_ask_px_00'] = df['ask_px_00'].shift(1)
df['prev_ask_sz_00'] = df['ask_sz_00'].shift(1)

# Flow function
def compute_flow(curr_px, prev_px, curr_sz, prev_sz):
    if curr_px > prev_px:
        return  curr_sz
    elif curr_px < prev_px:
        return -curr_sz
    else:
        return curr_sz - prev_sz

# Bid‐ and ask‐flows
df['bid_flow_0'] = df.apply(lambda r: compute_flow(
    r['bid_px_00'], r['prev_bid_px_00'],
    r['bid_sz_00'], r['prev_bid_sz_00']), axis=1)

df['ask_flow_0'] = df.apply(lambda r: compute_flow(
    r['prev_ask_px_00'], r['ask_px_00'],
    r['prev_ask_sz_00'], r['ask_sz_00']), axis=1)

# Best‐level OFI per event
df['ofi_level_0'] = df['bid_flow_0'] - df['ask_flow_0']
df[['bid_flow_0','ask_flow_0','ofi_level_0']].head()


Unnamed: 0,bid_flow_0,ask_flow_0,ofi_level_0
0,,,
1,2.0,0.0,2.0
2,3.0,0.0,3.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0


In [4]:
# Cell 4: Aggregate Best-Level OFI per Minute & Symbol

df['minute'] = df['ts_event'].dt.floor('min')
ofi_symbol_min = (
    df.groupby(['symbol','minute'])['ofi_level_0']
      .sum()
      .reset_index()
      .rename(columns={'ofi_level_0':'ofi_best_level'})
)
ofi_symbol_min.head()


Unnamed: 0,symbol,minute,ofi_best_level
0,AAPL,2024-10-21 11:54:00+00:00,-133.0
1,AAPL,2024-10-21 11:55:00+00:00,-820.0
2,AAPL,2024-10-21 11:56:00+00:00,-268.0
3,AAPL,2024-10-21 11:57:00+00:00,103.0
4,AAPL,2024-10-21 11:58:00+00:00,789.0


In [5]:
# Cell 5: Pivot to Wide Form (symbols as columns)

ofi_wide = (
    ofi_symbol_min
      .pivot(index='minute', columns='symbol', values='ofi_best_level')
      .fillna(0)
)
ofi_wide.head()


symbol,AAPL
minute,Unnamed: 1_level_1
2024-10-21 11:54:00+00:00,-133.0
2024-10-21 11:55:00+00:00,-820.0
2024-10-21 11:56:00+00:00,-268.0
2024-10-21 11:57:00+00:00,103.0
2024-10-21 11:58:00+00:00,789.0


In [6]:
# Cell 6: Compute Cross-Asset OFI

# Total OFI across all symbols at each minute
total_ofi = ofi_wide.sum(axis=1)

# For each symbol: cross-asset OFI = total OFI – that symbol’s OFI
cross_ofi = ofi_wide.copy()
for sym in ofi_wide.columns:
    cross_ofi[sym] = total_ofi - ofi_wide[sym]

cross_ofi.head()


symbol,AAPL
minute,Unnamed: 1_level_1
2024-10-21 11:54:00+00:00,0.0
2024-10-21 11:55:00+00:00,0.0
2024-10-21 11:56:00+00:00,0.0
2024-10-21 11:57:00+00:00,0.0
2024-10-21 11:58:00+00:00,0.0


In [7]:
# Cell 7: Melt & (Optional) Export/Plot

# Back to long form
ofi_cross = (
    cross_ofi
      .reset_index()
      .melt(id_vars='minute', var_name='symbol', value_name='ofi_cross_asset')
)
ofi_cross.head()

# Optionally save or plot:
# ofi_cross.to_csv('ofi_cross_asset_per_minute.csv', index=False)
# cross_ofi.plot(figsize=(10,4), title='Cross-Asset OFI per Symbol')


Unnamed: 0,minute,symbol,ofi_cross_asset
0,2024-10-21 11:54:00+00:00,AAPL,0.0
1,2024-10-21 11:55:00+00:00,AAPL,0.0
2,2024-10-21 11:56:00+00:00,AAPL,0.0
3,2024-10-21 11:57:00+00:00,AAPL,0.0
4,2024-10-21 11:58:00+00:00,AAPL,0.0
