# Step 3: Calculating Multi-Level OFI

In [2]:
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('../files/first_25000_rows.csv')
df['ts_recv'] = pd.to_datetime(df['ts_recv'])

print("Step 3: Multi-Level OFI Calculation")
print("===================================")

# First, let's examine the depth structure across all levels
def analyze_depth_structure(df):
    depth_stats = {}
    
    for level in range(10):
        level_str = f"{level:02d}"
        bid_col = f'bid_sz_{level_str}'
        ask_col = f'ask_sz_{level_str}'
        
        bid_stats = {
            'mean': df[bid_col].mean(),
            'std': df[bid_col].std(),
            'non_zero_pct': (df[bid_col] > 0).mean() * 100
        }
        
        ask_stats = {
            'mean': df[ask_col].mean(),
            'std': df[ask_col].std(),
            'non_zero_pct': (df[ask_col] > 0).mean() * 100
        }
        
        depth_stats[level] = {'bid': bid_stats, 'ask': ask_stats}
    
    return depth_stats

depth_analysis = analyze_depth_structure(df)

print("Order book depth analysis:")
print("Level | Bid Avg Size | Ask Avg Size | Bid Fill % | Ask Fill %")
print("------|--------------|--------------|------------|------------")

for level in range(10):
    bid_avg = depth_analysis[level]['bid']['mean']
    ask_avg = depth_analysis[level]['ask']['mean']
    bid_fill = depth_analysis[level]['bid']['non_zero_pct']
    ask_fill = depth_analysis[level]['ask']['non_zero_pct']
    
    print(f"  {level:2d}  |    {bid_avg:8.1f}  |    {ask_avg:8.1f}  |    {bid_fill:6.1f}% |    {ask_fill:6.1f}%")

Step 3: Multi-Level OFI Calculation
Order book depth analysis:
Level | Bid Avg Size | Ask Avg Size | Bid Fill % | Ask Fill %
------|--------------|--------------|------------|------------
   0  |       286.7  |       122.8  |     100.0% |     100.0%
   1  |       268.5  |       154.8  |     100.0% |     100.0%
   2  |       328.1  |       130.1  |     100.0% |     100.0%
   3  |       360.1  |       151.6  |     100.0% |     100.0%
   4  |       304.8  |       182.3  |     100.0% |     100.0%
   5  |       191.6  |       134.1  |     100.0% |     100.0%
   6  |       158.3  |       171.3  |     100.0% |     100.0%
   7  |        86.3  |       267.7  |     100.0% |     100.0%
   8  |        89.9  |       259.9  |     100.0% |     100.0%
   9  |        68.4  |       185.7  |     100.0% |     100.0%


In [3]:
# Step 3.1: Calculate multi-level order flows

def calculate_multilevel_flows(df):
    df_copy = df.copy()
    df_copy = df_copy.sort_values('ts_recv').reset_index(drop=True)
    
    # Initialize flow columns for all levels
    for level in range(10):
        level_str = f"{level:02d}"
        df_copy[f'bid_flow_{level_str}'] = 0.0
        df_copy[f'ask_flow_{level_str}'] = 0.0
    
    # Calculate flows for each level and each row
    for i in range(1, len(df_copy)):
        current = df_copy.iloc[i]
        previous = df_copy.iloc[i-1]
        
        for level in range(10):
            level_str = f"{level:02d}"
            
            # Current and previous values for this level
            curr_bid_px = current[f'bid_px_{level_str}']
            prev_bid_px = previous[f'bid_px_{level_str}']
            curr_bid_sz = current[f'bid_sz_{level_str}']
            prev_bid_sz = previous[f'bid_sz_{level_str}']
            
            curr_ask_px = current[f'ask_px_{level_str}']
            prev_ask_px = previous[f'ask_px_{level_str}']
            curr_ask_sz = current[f'ask_sz_{level_str}']
            prev_ask_sz = previous[f'ask_sz_{level_str}']
            
            # Bid flow calculation for this level
            if curr_bid_px > prev_bid_px:
                df_copy.loc[i, f'bid_flow_{level_str}'] = curr_bid_sz
            elif curr_bid_px == prev_bid_px:
                df_copy.loc[i, f'bid_flow_{level_str}'] = curr_bid_sz - prev_bid_sz
            elif curr_bid_px < prev_bid_px:
                df_copy.loc[i, f'bid_flow_{level_str}'] = -prev_bid_sz
                
            # Ask flow calculation for this level
            if curr_ask_px < prev_ask_px:
                df_copy.loc[i, f'ask_flow_{level_str}'] = curr_ask_sz
            elif curr_ask_px == prev_ask_px:
                df_copy.loc[i, f'ask_flow_{level_str}'] = curr_ask_sz - prev_ask_sz
            elif curr_ask_px > prev_ask_px:
                df_copy.loc[i, f'ask_flow_{level_str}'] = -prev_ask_sz
    
    return df_copy

print("\nCalculating multi-level order flows...")
df_multilevel = calculate_multilevel_flows(df)

# Show sample of the calculated flows
sample_cols = ['ts_recv', 'bid_flow_00', 'ask_flow_00', 'bid_flow_01', 'ask_flow_01', 'bid_flow_02', 'ask_flow_02']
print("\nSample multi-level flows (first 3 levels):")
print(df_multilevel[sample_cols].head(8))


Calculating multi-level order flows...

Sample multi-level flows (first 3 levels):
                              ts_recv  bid_flow_00  ask_flow_00  bid_flow_01  \
0 2024-10-21 11:54:29.221230963+00:00          0.0          0.0          0.0   
1 2024-10-21 11:54:29.223936626+00:00          2.0          0.0          0.0   
2 2024-10-21 11:54:29.225196809+00:00          3.0          0.0          0.0   
3 2024-10-21 11:54:29.712600612+00:00          0.0          0.0          0.0   
4 2024-10-21 11:54:29.764839221+00:00          0.0          0.0          0.0   
5 2024-10-21 11:54:29.764851707+00:00          0.0          0.0        400.0   
6 2024-10-21 11:54:36.289594629+00:00          0.0          0.0          0.0   
7 2024-10-21 11:54:37.990960617+00:00          0.0        200.0          0.0   

   ask_flow_01  bid_flow_02  ask_flow_02  
0          0.0          0.0          0.0  
1          0.0          0.0          0.0  
2          0.0          0.0          0.0  
3          0.0        2

In [5]:
# Step 3.2: Calculate OFI for each level separately

def calculate_individual_level_ofi(df_flows, time_window_seconds=60):
    ofi_results = {}
    
    # Calculate instant OFI for each level
    for level in range(10):
        level_str = f"{level:02d}"
        bid_flow_col = f'bid_flow_{level_str}'
        ask_flow_col = f'ask_flow_{level_str}'
        ofi_col = f'ofi_{level_str}'
        
        df_flows[ofi_col] = df_flows[bid_flow_col] - df_flows[ask_flow_col]
    
    # Aggregate over time windows
    df_flows_indexed = df_flows.set_index('ts_recv')
    
    aggregated_data = []
    
    for level in range(10):
        level_str = f"{level:02d}"
        ofi_col = f'ofi_{level_str}'
        
        level_ofi = df_flows_indexed[ofi_col].resample(f'{time_window_seconds}s').sum().reset_index()
        level_ofi.columns = ['timestamp', f'ofi_level_{level_str}']
        
        if level == 0:
            aggregated_data = level_ofi
        else:
            aggregated_data = aggregated_data.merge(level_ofi, on='timestamp', how='outer')
    
    return aggregated_data

# Calculate individual level OFIs
individual_ofi = calculate_individual_level_ofi(df_multilevel, time_window_seconds=60)

print(f"\nIndividual level OFI calculated for {len(individual_ofi)} time intervals")
print("\nSample individual level OFI values:")
sample_ofi_cols = ['timestamp'] + [f'ofi_level_{i:02d}' for i in range(5)]
print(individual_ofi[sample_ofi_cols].head())

# Statistics for each level
print(f"\nOFI statistics by level:")
print("Level | Mean    | Std     | Min     | Max     | Non-zero %")
print("------|---------|---------|---------|---------|----------")

for level in range(10):
    level_str = f"{level:02d}"
    col = f'ofi_level_{level_str}'
    
    mean_val = individual_ofi[col].mean()
    std_val = individual_ofi[col].std()
    min_val = individual_ofi[col].min()
    max_val = individual_ofi[col].max()
    nonzero_pct = (individual_ofi[col] != 0).mean() * 100
    
    print(f"  {level:2d}  | {mean_val:7.1f} | {std_val:7.1f} | {min_val:7.1f} | {max_val:7.1f} | {nonzero_pct:8.1f}%")


Individual level OFI calculated for 71 time intervals

Sample individual level OFI values:
                  timestamp  ofi_level_00  ofi_level_01  ofi_level_02  \
0 2024-10-21 11:54:00+00:00        -195.0         600.0          10.0   
1 2024-10-21 11:55:00+00:00        -916.0         974.0        1228.0   
2 2024-10-21 11:56:00+00:00         199.0           0.0        1200.0   
3 2024-10-21 11:57:00+00:00         201.0         399.0         392.0   
4 2024-10-21 11:58:00+00:00         863.0         931.0         555.0   

   ofi_level_03  ofi_level_04  
0         -30.0          29.0  
1         152.0          10.0  
2           0.0          -4.0  
3        -121.0        -610.0  
4         182.0         599.0  

OFI statistics by level:
Level | Mean    | Std     | Min     | Max     | Non-zero %
------|---------|---------|---------|---------|----------
   0  |  -169.5 |   761.9 | -4344.0 |  1380.0 |     94.4%
   1  |  -299.4 |  1212.9 | -5458.0 |  2800.0 |     98.6%
   2  |   457.9 | 

In [6]:
# Step 3.3: Combine levels into multi-level OFI

def calculate_combined_multilevel_ofi(individual_ofi_df, method='simple_sum'):
    result_df = individual_ofi_df.copy()
    
    if method == 'simple_sum':
        # Simple sum of all levels
        ofi_cols = [f'ofi_level_{i:02d}' for i in range(10)]
        result_df['multilevel_ofi'] = result_df[ofi_cols].sum(axis=1)
        
    elif method == 'weighted_sum':
        # Weight levels by their typical activity (inverse of level number)
        weights = [1.0 / (i + 1) for i in range(10)]
        weights = np.array(weights) / np.sum(weights)  # Normalize
        
        multilevel_ofi = 0
        for level in range(10):
            level_str = f"{level:02d}"
            col = f'ofi_level_{level_str}'
            multilevel_ofi += weights[level] * result_df[col]
        
        result_df['multilevel_ofi'] = multilevel_ofi
        
    elif method == 'distance_weighted':
        # Weight by distance from mid price (closer levels get higher weight)
        weights = np.exp(-np.arange(10) * 0.5)  # Exponential decay
        weights = weights / np.sum(weights)  # Normalize
        
        multilevel_ofi = 0
        for level in range(10):
            level_str = f"{level:02d}"
            col = f'ofi_level_{level_str}'
            multilevel_ofi += weights[level] * result_df[col]
        
        result_df['multilevel_ofi'] = multilevel_ofi
    
    return result_df

# Calculate multi-level OFI using different methods
methods = ['simple_sum', 'weighted_sum', 'distance_weighted']
multilevel_results = {}

for method in methods:
    result = calculate_combined_multilevel_ofi(individual_ofi, method=method)
    multilevel_results[method] = result
    
    print(f"\nMulti-level OFI using {method}:")
    print(f"  Mean: {result['multilevel_ofi'].mean():.2f}")
    print(f"  Std: {result['multilevel_ofi'].std():.2f}")
    print(f"  Range: [{result['multilevel_ofi'].min():.2f}, {result['multilevel_ofi'].max():.2f}]")

# Compare the methods
print(f"\nComparison of multi-level OFI methods:")
print("Timestamp               | Simple Sum | Weighted   | Distance W.")
print("------------------------|------------|------------|------------")

for i in range(min(8, len(individual_ofi))):
    ts = individual_ofi.iloc[i]['timestamp'].strftime('%H:%M:%S')
    simple = multilevel_results['simple_sum'].iloc[i]['multilevel_ofi']
    weighted = multilevel_results['weighted_sum'].iloc[i]['multilevel_ofi']
    distance = multilevel_results['distance_weighted'].iloc[i]['multilevel_ofi']
    
    print(f"{ts}               | {simple:10.1f} | {weighted:10.1f} | {distance:10.1f}")


Multi-level OFI using simple_sum:
  Mean: 220.32
  Std: 4736.23
  Range: [-23387.00, 9533.00]

Multi-level OFI using weighted_sum:
  Mean: -41.55
  Std: 585.79
  Range: [-3421.32, 776.06]

Multi-level OFI using distance_weighted:
  Mean: -58.83
  Std: 668.86
  Range: [-3965.08, 843.01]

Comparison of multi-level OFI methods:
Timestamp               | Simple Sum | Weighted   | Distance W.
------------------------|------------|------------|------------
11:54:00               |     1438.0 |       80.4 |       81.9
11:55:00               |     1089.0 |       -8.7 |       58.9
11:56:00               |     1396.0 |      204.3 |      253.5
11:57:00               |     -632.0 |       95.5 |      182.2
11:58:00               |     3550.0 |      579.5 |      685.2
11:59:00               |      732.0 |      211.4 |      299.1
12:00:00               |     -356.0 |       84.3 |       98.0
12:01:00               |     2843.0 |      244.6 |      293.2


In [7]:
# Step 3.4: Compare multi-level vs best-level OFI

# Load the best-level OFI we calculated in Step 2
try:
    best_level_ofi = pd.read_csv('best_level_ofi.csv')
    best_level_ofi['timestamp'] = pd.to_datetime(best_level_ofi['timestamp'])
    
    # Merge with multi-level results
    comparison_df = best_level_ofi.merge(
        multilevel_results['simple_sum'][['timestamp', 'multilevel_ofi']], 
        on='timestamp', 
        how='inner'
    )
    
    print(f"\nComparison: Best-Level vs Multi-Level OFI")
    print(f"Correlation: {comparison_df['best_level_ofi'].corr(comparison_df['multilevel_ofi']):.3f}")
    
    print(f"\nSample comparison:")
    print("Timestamp               | Best Level | Multi Level | Difference")
    print("------------------------|------------|-------------|------------")
    
    for i in range(min(8, len(comparison_df))):
        ts = comparison_df.iloc[i]['timestamp'].strftime('%H:%M:%S')
        best = comparison_df.iloc[i]['best_level_ofi']
        multi = comparison_df.iloc[i]['multilevel_ofi']
        diff = multi - best
        
        print(f"{ts}               | {best:10.1f} | {multi:11.1f} | {diff:10.1f}")
        
    # Additional statistics
    print(f"\nAdditional comparison statistics:")
    print(f"Best-level OFI  - Mean: {comparison_df['best_level_ofi'].mean():.2f}, Std: {comparison_df['best_level_ofi'].std():.2f}")
    print(f"Multi-level OFI - Mean: {comparison_df['multilevel_ofi'].mean():.2f}, Std: {comparison_df['multilevel_ofi'].std():.2f}")
    
    # Information gain from multiple levels
    additional_info = comparison_df['multilevel_ofi'] - comparison_df['best_level_ofi']
    print(f"Additional info from deeper levels - Mean: {additional_info.mean():.2f}, Std: {additional_info.std():.2f}")
    
except FileNotFoundError:
    print("Best-level OFI file not found. Skipping comparison.")


Comparison: Best-Level vs Multi-Level OFI
Correlation: 0.711

Sample comparison:
Timestamp               | Best Level | Multi Level | Difference
------------------------|------------|-------------|------------
11:54:00               |     -195.0 |      1438.0 |     1633.0
11:55:00               |     -916.0 |      1089.0 |     2005.0
11:56:00               |      199.0 |      1396.0 |     1197.0
11:57:00               |      201.0 |      -632.0 |     -833.0
11:58:00               |      863.0 |      3550.0 |     2687.0
11:59:00               |      208.0 |       732.0 |      524.0
12:00:00               |      498.0 |      -356.0 |     -854.0
12:01:00               |     -199.0 |      2843.0 |     3042.0

Additional comparison statistics:
Best-level OFI  - Mean: -169.49, Std: 761.90
Multi-level OFI - Mean: 220.32, Std: 4736.23
Additional info from deeper levels - Mean: 389.82, Std: 4228.82


In [9]:
# Step 3.5: Save results and summary

# Save the individual level OFIs
individual_ofi.to_csv('individual_level_ofi.csv', index=False)

# Save the combined multi-level OFI (using simple sum method)
final_multilevel = multilevel_results['simple_sum'][['timestamp', 'multilevel_ofi']]
final_multilevel.to_csv('multilevel_ofi.csv', index=False)

print(f"\nResults saved:")
print(f"- individual_level_ofi.csv: OFI for each level separately")
print(f"- multilevel_ofi.csv: Combined multi-level OFI")

# Final summary
print(f"\nStep 3 Summary - Multi-Level OFI:")
print(f"================================")
print(f"- Calculated OFI for all 10 order book levels")
print(f"- Level 0 (best) has highest activity: {(individual_ofi['ofi_level_00'] != 0).mean()*100:.1f}% non-zero")
print(f"- Deeper levels show decreasing activity")
print(f"- Multi-level OFI captures {individual_ofi.shape[1]-1} dimensions of order flow")
print(f"- Time intervals: {len(individual_ofi)}")

# Show which levels contribute most
level_contributions = {}
for level in range(10):
    level_str = f"{level:02d}"
    col = f'ofi_level_{level_str}'
    level_contributions[level] = abs(individual_ofi[col]).mean()

print(f"\nAverage absolute contribution by level:")
for level, contrib in level_contributions.items():
    print(f"Level {level:2d}: {contrib:6.1f}")



Results saved:
- individual_level_ofi.csv: OFI for each level separately
- multilevel_ofi.csv: Combined multi-level OFI

Step 3 Summary - Multi-Level OFI:
- Calculated OFI for all 10 order book levels
- Level 0 (best) has highest activity: 94.4% non-zero
- Deeper levels show decreasing activity
- Multi-level OFI captures 10 dimensions of order flow
- Time intervals: 71

Average absolute contribution by level:
Level  0:  477.0
Level  1:  868.7
Level  2:  897.3
Level  3:  616.7
Level  4:  539.0
Level  5:  390.3
Level  6:  341.9
Level  7:  307.5
Level  8:  326.7
Level  9:  305.7
