## Main Notebook for the task

### Step 1 Compute OFI metrics 
1. Derive multi-level OFI metrics (up to 5 levels) for each stock in the dataset.
2. Integrate these multi-level OFIs into a single metric using Principal Component Analysis (PCA) or another dimensionality reduction method.

#### Data-preprocessing
I followed the instructions and retrieved data from AAPL, AMGN, TSLA, JPM, XOM from period 12/07/2024 to 01/07/2025 (one month).  

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import databento as db

In [6]:
print(np.__version__)

1.23.5


In [7]:
data_directory=[]

data_directory_AAPL=f"/Users/mic/Desktop/Github/Cross-Impact-Analysis-of-Order-Flow-Imbalance-OFI-/data/XNAS-20250107-APPL"
data_directory_AMGN=f"/Users/mic/Desktop/Github/Cross-Impact-Analysis-of-Order-Flow-Imbalance-OFI-/data/XNAS-20250107-AMGN"
data_directory_TSLA=f"/Users/mic/Desktop/Github/Cross-Impact-Analysis-of-Order-Flow-Imbalance-OFI-/data/XNAS-20250107-TSLA"
data_directory_JPM=f"/Users/mic/Desktop/Github/Cross-Impact-Analysis-of-Order-Flow-Imbalance-OFI-/data/XNAS-20250107-JPM"
data_directory_XOM=f"/Users/mic/Desktop/Github/Cross-Impact-Analysis-of-Order-Flow-Imbalance-OFI-/data/XNAS-20250107-XOM"

data_directory.append(data_directory_AAPL)
data_directory.append(data_directory_AMGN)
data_directory.append(data_directory_TSLA)
data_directory.append(data_directory_JPM)
data_directory.append(data_directory_XOM)

data_directory

['/Users/mic/Desktop/Github/Cross-Impact-Analysis-of-Order-Flow-Imbalance-OFI-/data/XNAS-20250107-APPL',
 '/Users/mic/Desktop/Github/Cross-Impact-Analysis-of-Order-Flow-Imbalance-OFI-/data/XNAS-20250107-AMGN',
 '/Users/mic/Desktop/Github/Cross-Impact-Analysis-of-Order-Flow-Imbalance-OFI-/data/XNAS-20250107-TSLA',
 '/Users/mic/Desktop/Github/Cross-Impact-Analysis-of-Order-Flow-Imbalance-OFI-/data/XNAS-20250107-JPM',
 '/Users/mic/Desktop/Github/Cross-Impact-Analysis-of-Order-Flow-Imbalance-OFI-/data/XNAS-20250107-XOM']

Making the downloads into dataframe. There are .dbn.zst files for days of MBP-10 order books, we transform it into pandas dataframe and concatenate data for days into a big dataset for each of the five companies. 

In [8]:
combined_dataframes=[]
dataframes=[]
for data_directory_company in data_directory:
    for filename in os.listdir(data_directory_company):
        if filename.endswith(".dbn.zst"):
            file_path=os.path.join(data_directory_company, filename)
            file_df=db.DBNStore.from_file(file_path).to_df()
            dataframes.append(file_df)
    combined_dataframes.append(pd.concat(dataframes, ignore_index=True))
    dataframes=[]

In [9]:
len(combined_dataframes)

5

In [10]:
combined_df_AAPL=combined_dataframes[0]
combined_df_AMGN=combined_dataframes[1]
combined_df_TSLA=combined_dataframes[2]
combined_df_JPM=combined_dataframes[3]
combined_df_XOM=combined_dataframes[4]


In [11]:
combined_df_AAPL.shape

(25602477, 73)

In [12]:
%store combined_df_AAPL
%store combined_df_AMGN
%store combined_df_TSLA
%store combined_df_JPM
%store combined_df_XOM

Stored 'combined_df_AAPL' (DataFrame)
Stored 'combined_df_AMGN' (DataFrame)
Stored 'combined_df_TSLA' (DataFrame)
Stored 'combined_df_JPM' (DataFrame)
Stored 'combined_df_XOM' (DataFrame)


In [13]:
combined_df_AAPL.columns

Index(['ts_event', 'rtype', 'publisher_id', 'instrument_id', 'action', 'side',
       'depth', 'price', 'size', 'flags', 'ts_in_delta', 'sequence',
       'bid_px_00', 'ask_px_00', 'bid_sz_00', 'ask_sz_00', 'bid_ct_00',
       'ask_ct_00', 'bid_px_01', 'ask_px_01', 'bid_sz_01', 'ask_sz_01',
       'bid_ct_01', 'ask_ct_01', 'bid_px_02', 'ask_px_02', 'bid_sz_02',
       'ask_sz_02', 'bid_ct_02', 'ask_ct_02', 'bid_px_03', 'ask_px_03',
       'bid_sz_03', 'ask_sz_03', 'bid_ct_03', 'ask_ct_03', 'bid_px_04',
       'ask_px_04', 'bid_sz_04', 'ask_sz_04', 'bid_ct_04', 'ask_ct_04',
       'bid_px_05', 'ask_px_05', 'bid_sz_05', 'ask_sz_05', 'bid_ct_05',
       'ask_ct_05', 'bid_px_06', 'ask_px_06', 'bid_sz_06', 'ask_sz_06',
       'bid_ct_06', 'ask_ct_06', 'bid_px_07', 'ask_px_07', 'bid_sz_07',
       'ask_sz_07', 'bid_ct_07', 'ask_ct_07', 'bid_px_08', 'ask_px_08',
       'bid_sz_08', 'ask_sz_08', 'bid_ct_08', 'ask_ct_08', 'bid_px_09',
       'ask_px_09', 'bid_sz_09', 'ask_sz_09', 'bid_ct_09', '

Notice that the columns bid_px_N denote the bid price at level N (top level then N=00), bid_sz_N denote the bid size at level N. Similarily for ask levels. 

We implement the best level OFI algorithm as described in the paper

In [14]:
def compute_best_level_ofi(df):
    """
    Compute best-level OFI (Order Flow Imbalance)
    We assume df has columns:
      'bid_px_01', 'bid_sz_01',
      'ask_px_01', 'ask_sz_01'
    
    Algorithm based on the paper
    """
    # First, shift columns to compare current row with previous
    df['prev_bid_px'] = df['bid_px_01'].shift()
    df['prev_bid_sz'] = df['bid_sz_01'].shift()
    df['prev_ask_px'] = df['ask_px_01'].shift()
    df['prev_ask_sz'] = df['ask_sz_01'].shift()

    # 1) BID-SIDE OFI
    # Masks for NaN (the first row), up, down, same
    mask_bid_nan  = df['prev_bid_px'].isna()
    mask_bid_up   = df['bid_px_01']  > df['prev_bid_px']
    mask_bid_down = df['bid_px_01']  < df['prev_bid_px']
    mask_bid_same = ~(mask_bid_up | mask_bid_down | mask_bid_nan)
    
    # Use np.select to assign the correct value for each row
    of_bid = np.select(
        [mask_bid_nan, mask_bid_up, mask_bid_down, mask_bid_same],
        [0.0, df['bid_sz_01'], -df['bid_sz_01'], df['bid_sz_01'] - df['prev_bid_sz']],
        default=0.0
    )

    # 2) ASK-SIDE OFI
    # Similarily
    mask_ask_nan  = df['prev_ask_px'].isna()
    mask_ask_down = df['ask_px_01']  < df['prev_ask_px']
    mask_ask_up   = df['ask_px_01']  > df['prev_ask_px']
    mask_ask_same = ~(mask_ask_down | mask_ask_up | mask_ask_nan)
    
    of_ask = np.select(
        [mask_ask_nan, mask_ask_down, mask_ask_up, mask_ask_same],
        [0.0, df['ask_sz_01'], -df['ask_sz_01'], df['ask_sz_01'] - df['prev_ask_sz']],
        default=0.0
    )
    
    # ---------------------------
    # 3) Combine to get best-level OFI
    # ---------------------------
    df['best_level_ofi'] = of_bid - of_ask
    return df['best_level_ofi']

In [16]:
best_level_ofi_AAPL = compute_best_level_ofi(combined_df_AAPL)



In [17]:
best_level_ofi_AAPL.shape

(25602477,)

In [18]:
best_level_ofi_AMGN = compute_best_level_ofi(combined_df_AMGN)
best_level_ofi_TSLA = compute_best_level_ofi(combined_df_TSLA)
best_level_ofi_JPM = compute_best_level_ofi(combined_df_JPM)
best_level_ofi_XOM = compute_best_level_ofi(combined_df_XOM)

In [19]:
best_level_ofi_AMGN.shape

(1550548,)

In [20]:
def compute_multi_level_ofi(df, levels=range(2,6)):
    """
     multi-level OFI computation for levels 2..5 

    Returns a DataFrame of only the multi-level OFI columns:
        ofi_lvl_2, ofi_lvl_3, ofi_lvl_4, ofi_lvl_5
    """
    df = df.copy()
    ofi_results = {}

    for lvl in levels:
        px_bid_col = f'bid_px_0{lvl}'
        sz_bid_col = f'bid_sz_0{lvl}'
        px_ask_col = f'ask_px_0{lvl}'
        sz_ask_col = f'ask_sz_0{lvl}'

        # Shift columns to compare to previous row
        df[f'prev_bid_px_{lvl}'] = df[px_bid_col].shift()
        df[f'prev_bid_sz_{lvl}'] = df[sz_bid_col].shift()
        df[f'prev_ask_px_{lvl}'] = df[px_ask_col].shift()
        df[f'prev_ask_sz_{lvl}'] = df[sz_ask_col].shift()

        # Bid side
        mask_bid_nan  = df[f'prev_bid_px_{lvl}'].isna()  # first row or missing
        mask_bid_up   = df[px_bid_col] > df[f'prev_bid_px_{lvl}']
        mask_bid_down = df[px_bid_col] < df[f'prev_bid_px_{lvl}']
        mask_bid_same = ~(mask_bid_nan | mask_bid_up | mask_bid_down)

        of_bid = np.select(
            [mask_bid_nan, mask_bid_up, mask_bid_down, mask_bid_same],
            [0.0, df[sz_bid_col], -df[sz_bid_col], df[sz_bid_col] - df[f'prev_bid_sz_{lvl}']],
            default=0.0
        )

        # Ask side
        mask_ask_nan  = df[f'prev_ask_px_{lvl}'].isna()
        mask_ask_down = df[px_ask_col] < df[f'prev_ask_px_{lvl}']
        mask_ask_up   = df[px_ask_col] > df[f'prev_ask_px_{lvl}']
        mask_ask_same = ~(mask_ask_nan | mask_ask_down | mask_ask_up)

        of_ask = np.select(
            [mask_ask_nan, mask_ask_down, mask_ask_up, mask_ask_same],
            [0.0, df[sz_ask_col], -df[sz_ask_col], df[sz_ask_col] - df[f'prev_ask_sz_{lvl}']],
            default=0.0
        )

        # ---------------------------
        # Combine to get OFI at this level
        # ---------------------------
        ofi_lvl = of_bid - of_ask
        ofi_results[f'ofi_lvl_{lvl}'] = ofi_lvl

    # Append the results to the DataFrame
    for col_name, col_array in ofi_results.items():
        df[col_name] = col_array

    # Return only the columns of interest (the multi-level OFI columns)
    ofi_cols = list(ofi_results.keys())
    return df[ofi_cols]


In [21]:
multi_level_ofi_AAPL = compute_multi_level_ofi(combined_df_AAPL)
multi_level_ofi_AAPL.head()

Unnamed: 0,ofi_lvl_2,ofi_lvl_3,ofi_lvl_4,ofi_lvl_5
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0


In [22]:
multi_level_ofi_AAPL.shape

(25602477, 4)

In [23]:
multi_level_ofi_AMGN = compute_multi_level_ofi(combined_df_AMGN)
multi_level_ofi_TSLA = compute_multi_level_ofi(combined_df_TSLA)
multi_level_ofi_JPM = compute_multi_level_ofi(combined_df_JPM)
multi_level_ofi_XOM = compute_multi_level_ofi(combined_df_XOM)

In [24]:
combined_level_ofi_AAPL=pd.concat([best_level_ofi_AAPL, multi_level_ofi_AAPL], axis=1)
combined_level_ofi_AAPL.shape

(25602477, 5)

In [25]:
combined_level_ofi_AMGN=pd.concat([best_level_ofi_AMGN, multi_level_ofi_AMGN], axis=1)
combined_level_ofi_TSLA=pd.concat([best_level_ofi_TSLA, multi_level_ofi_TSLA], axis=1)
combined_level_ofi_JPM=pd.concat([best_level_ofi_JPM, multi_level_ofi_JPM], axis=1)
combined_level_ofi_XOM=pd.concat([best_level_ofi_XOM, multi_level_ofi_XOM], axis=1)

Once we have the OFI level 1-5 for each 5 stocks, we can then perform PCA and look for the dominant pinciple component and use it to calculate the ""Integrated OFI""

In [1]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scalar=StandardScaler()
combined_level_ofi_AAPL_scaled=scalar.fit_transform(combined_level_ofi_AAPL.T)

pca=PCA(n_components=1)
aapl_PCA=pca.fit(combined_level_ofi_AAPL_scaled)

NameError: name 'combined_level_ofi_AAPL' is not defined

In [34]:
combined_level_ofi_AMGN_scaled=scalar.fit_transform(combined_level_ofi_AMGN)
combined_level_ofi_TSLA_scaled=scalar.fit_transform(combined_level_ofi_TSLA)
combined_level_ofi_JPM_scaled=scalar.fit_transform(combined_level_ofi_JPM)
combined_level_ofi_XOM_scaled=scalar.fit_transform(combined_level_ofi_XOM)

amgn_PCA=pca.fit(combined_level_ofi_AMGN_scaled)
tsla_PCA=pca.fit(combined_level_ofi_TSLA_scaled)
jpm_PCA=pca.fit(combined_level_ofi_JPM_scaled)
xom_PCA=pca.fit(combined_level_ofi_XOM_scaled)

  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(values, mapper, convert=convert)
  return lib.map_infer(val

In [37]:
aapl_PCA.explained_variance_ratio_

array([0.98608582])

In [39]:
aapl_PCA.components_

array([[-0.44235018, -0.44828824, -0.44901729, -0.44864271, -0.44773559]])