# Step 5: Calculating Cross-Asset OFI

In [3]:
import pandas as pd
import numpy as np

# Read the raw event file so we can see which symbols it covers
df = pd.read_csv('../files/first_25000_rows.csv')
df['ts_recv'] = pd.to_datetime(df['ts_recv'])

print("Step 5: Cross-Asset OFI Calculation")
print("==================================")

# Enumerate the distinct assets present in the data
unique_symbols = df['symbol'].unique()
print(f"Available symbols in dataset: {unique_symbols}")
print(f"Number of unique assets: {len(unique_symbols)}")

# Report how the events are split across symbols
symbol_counts = df['symbol'].value_counts()
print(f"\nData points per symbol:")
for symbol, count in symbol_counts.items():
    pct = 100 * count / len(df)
    print(f"{symbol}: {count:,} events ({pct:.1f}%)")

# A single-symbol file cannot support a real cross-asset study, so in that
# case we only announce the simulated peers that later cells will generate.
if len(unique_symbols) != 1:
    print(f"Multiple assets detected - proceeding with real cross-asset analysis")
else:
    print(f"\nNote: Dataset contains only {unique_symbols[0]}")
    print("For cross-asset OFI, we'll demonstrate the methodology")
    print("and create simulated related assets to show the concept")

    # Stand-ins for real peer data: two ETFs and one related stock
    main_symbol = unique_symbols[0]
    related_symbols = ['QQQ', 'SPY', 'MSFT']

    print(f"\nSimulating cross-asset data for demonstration:")
    print(f"Main asset: {main_symbol}")
    print(f"Related assets: {related_symbols}")

Step 5: Cross-Asset OFI Calculation
Available symbols in dataset: ['AAPL']
Number of unique assets: 1

Data points per symbol:
AAPL: 5,000 events (100.0%)

Note: Dataset contains only AAPL
For cross-asset OFI, we'll demonstrate the methodology
and create simulated related assets to show the concept

Simulating cross-asset data for demonstration:
Main asset: AAPL
Related assets: ['QQQ', 'SPY', 'MSFT']


In [4]:
# Step 5.1: Prepare cross-asset OFI framework

def create_cross_asset_simulation(main_df, main_symbol, related_symbols,
                                  ofi_path='integrated_ofi.csv', seed=None):
    """
    Simulate related-asset OFI series for demonstration.

    In practice you would load real data for each asset. Here every related
    asset's OFI is a scaled copy of the main asset's OFI plus Gaussian noise.

    Parameters
    ----------
    main_df : object
        Not used by the function; kept so existing callers keep working.
    main_symbol : str
        Symbol of the main asset; its column is named ``ofi_<main_symbol>``.
    related_symbols : sequence of str
        Symbols to simulate. The i-th symbol uses a scale factor of
        ``0.7 - 0.1 * i`` on the main OFI and a noise standard deviation of
        ``(0.5 + 0.2 * i) * std(main OFI)``.
    ofi_path : str, optional
        CSV file with the main asset's OFI. It must provide a ``timestamp``
        column and an ``integrated_ofi`` column.
    seed : int or None, optional
        When given, all random draws come from a private
        ``np.random.RandomState(seed)`` so the result is reproducible.
        When None, NumPy's global random state is used (original behavior).

    Returns
    -------
    pandas.DataFrame
        The main OFI frame plus one ``ofi_<symbol>`` column per related symbol.
    """
    # np.random and RandomState both expose .normal, so one handle serves both modes
    rng = np.random if seed is None else np.random.RandomState(seed)
    main_col = f'ofi_{main_symbol}'

    # Load our calculated OFI for the main asset
    try:
        main_ofi = pd.read_csv(ofi_path)
        main_ofi['timestamp'] = pd.to_datetime(main_ofi['timestamp'])
        main_ofi = main_ofi.rename(columns={'integrated_ofi': main_col})
        if main_col not in main_ofi.columns:
            raise KeyError(main_col)
    except (OSError, KeyError, ValueError):
        # Missing file (OSError), missing column (KeyError) or unparsable
        # content (ValueError, which also covers pandas' parser errors).
        # Anything else, e.g. KeyboardInterrupt, is allowed to propagate.
        print("Main asset OFI not found, using placeholder")
        main_ofi = pd.DataFrame({
            'timestamp': pd.date_range(start='2024-10-21 11:54:00', periods=10, freq='1min'),
            main_col: rng.normal(0, 50, 10)
        })

    cross_asset_data = main_ofi.copy()

    # Loop-invariant inputs for the simulation
    main_ofi_values = main_ofi[main_col].values
    main_std = np.std(main_ofi_values)

    for i, symbol in enumerate(related_symbols):
        # Each later symbol is tied less tightly to the main asset.
        # NOTE: base_correlation is a scale factor, not the realised
        # correlation; the latter also depends on the noise level.
        base_correlation = 0.7 - (i * 0.1)
        noise_level = 0.5 + (i * 0.2)

        noise_component = rng.normal(0, noise_level * main_std, len(main_ofi_values))
        cross_asset_data[f'ofi_{symbol}'] = base_correlation * main_ofi_values + noise_component

    return cross_asset_data

# Assemble the cross-asset OFI table (simulated when only one real symbol exists)
cross_asset_ofi = None  # remains None when several real symbols are present
if len(unique_symbols) != 1:
    # With real multi-asset data, OFI would be computed per asset instead
    print("Real multi-asset scenario - implement asset-by-asset OFI calculation")
else:
    main_symbol = unique_symbols[0]
    related_symbols = ['QQQ', 'SPY', 'MSFT']
    cross_asset_ofi = create_cross_asset_simulation(df, main_symbol, related_symbols)

# Show a short preview of what was built
if cross_asset_ofi is not None:
    print(f"\nCross-asset OFI dataset created:")
    print(f"Shape: {cross_asset_ofi.shape}")
    print(f"Columns: {cross_asset_ofi.columns.tolist()}")
    print(f"\nFirst few rows:")
    print(cross_asset_ofi.head())


Cross-asset OFI dataset created:
Shape: (71, 5)
Columns: ['timestamp', 'ofi_AAPL', 'ofi_QQQ', 'ofi_SPY', 'ofi_MSFT']

First few rows:
                  timestamp    ofi_AAPL     ofi_QQQ     ofi_SPY    ofi_MSFT
0 2024-10-21 11:54:00+00:00  138.372260 -268.810081    6.864176 -564.092639
1 2024-10-21 11:55:00+00:00   50.325235 -314.814873 -259.623211  -43.230220
2 2024-10-21 11:56:00+00:00  115.027530  423.761020  -76.987717 -456.960511
3 2024-10-21 11:57:00+00:00  -87.693208 -270.970009 -178.967797  295.656363
4 2024-10-21 11:58:00+00:00  309.402512    7.373165  138.296218   94.953640


In [5]:
# Step 5.2: Analyze cross-asset OFI correlations

def analyze_cross_correlations(cross_data, main_symbol):
    """
    Print and return the correlation matrix of all ``ofi_*`` columns.

    Parameters
    ----------
    cross_data : pandas.DataFrame
        Frame whose OFI columns are named ``ofi_<symbol>``.
    main_symbol : str
        Symbol of the main asset. If ``ofi_<main_symbol>`` is present, its
        correlation with every other OFI column is printed separately.

    Returns
    -------
    pandas.DataFrame or None
        Correlation matrix indexed by the ``ofi_*`` column names, or None
        when fewer than two OFI columns exist.
    """
    prefix = 'ofi_'
    ofi_columns = [col for col in cross_data.columns if col.startswith(prefix)]

    if len(ofi_columns) < 2:
        print("Need at least 2 assets for cross-correlation analysis")
        return None

    correlation_matrix = cross_data[ofi_columns].corr()

    # Strip only the leading prefix; str.replace would also remove any later
    # occurrence of 'ofi_' inside a symbol name.
    symbols = [col[len(prefix):] for col in ofi_columns]

    # Header and rows share one label width so the columns line up; both
    # widths grow when a symbol is longer than the default layout allows.
    label_width = max(len(s) for s in symbols + ['Symbol']) + 1
    cell_width = max(8, label_width)

    print(f"Cross-asset OFI correlation matrix:")
    print(f"{'Symbol':<{label_width}}" + "".join(f"{s:>{cell_width}}" for s in symbols))
    for i, symbol_i in enumerate(symbols):
        cells = "".join(
            f"{correlation_matrix.iloc[i, j]:{cell_width}.3f}"
            for j in range(len(symbols))
        )
        print(f"{symbol_i:<{label_width}}" + cells)

    # Correlation of every other asset with the main asset
    main_col = f'{prefix}{main_symbol}'
    if main_col in ofi_columns:
        print(f"\nCorrelations with main asset ({main_symbol}):")
        for col, symbol in zip(ofi_columns, symbols):
            if col != main_col:
                corr = correlation_matrix.loc[main_col, col]
                print(f"{symbol:>6}: {corr:6.3f}")

    return correlation_matrix

# Correlation analysis only makes sense once a cross-asset frame exists
if cross_asset_ofi is not None:
    main_symbol = unique_symbols[0]
    cross_correlations = analyze_cross_correlations(
        cross_data=cross_asset_ofi,
        main_symbol=main_symbol,
    )

Cross-asset OFI correlation matrix:
Symbol     AAPL     QQQ     SPY    MSFT
AAPL     1.000   0.843   0.641   0.355
QQQ      0.843   1.000   0.544   0.291
SPY      0.641   0.544   1.000   0.226
MSFT     0.355   0.291   0.226   1.000

Correlations with main asset (AAPL):
   QQQ:  0.843
   SPY:  0.641
  MSFT:  0.355


In [6]:
# Step 5.3: Calculate Cross-Asset OFI variants

def calculate_cross_asset_ofi(cross_data, main_symbol, method='weighted_sum', correlations=None):
    """
    Combine the OFI of all non-main assets into one ``cross_asset_ofi`` column.

    Parameters
    ----------
    cross_data : pandas.DataFrame
        Frame with one ``ofi_<symbol>`` column per asset.
    main_symbol : str
        Symbol of the main asset; ``ofi_<main_symbol>`` is excluded from the
        combination.
    method : str, optional
        ``'simple_sum'``          - plain row-wise sum of the other assets.
        ``'weighted_sum'``        - weights proportional to |correlation with
                                    the main asset|, normalised to sum to 1
                                    (equal weights if the total is not > 0).
        ``'market_cap_weighted'`` - fixed illustrative weights, normalised.
        ``'sector_weighted'``     - fixed illustrative weights, normalised.
    correlations : pandas.DataFrame or None, optional
        Correlation matrix indexed and labelled by the ``ofi_*`` column names.
        Only used by ``'weighted_sum'``. When None it is computed from
        ``cross_data`` itself, so the function no longer relies on a
        notebook-level ``cross_correlations`` variable.

    Returns
    -------
    pandas.DataFrame
        Copy of ``cross_data`` with a ``cross_asset_ofi`` column. For
        ``'weighted_sum'`` the weights are stored in
        ``attrs['cross_weights']``. If there are no other assets,
        ``cross_data`` is returned unchanged.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    prefix = 'ofi_'
    ofi_columns = [col for col in cross_data.columns if col.startswith(prefix)]
    main_col = f'{prefix}{main_symbol}'
    other_cols = [col for col in ofi_columns if col != main_col]

    if len(other_cols) == 0:
        print("No other assets found for cross-asset calculation")
        return cross_data

    # Illustrative fixed-weight tables: (weights by symbol, weight for unlisted symbols).
    # In practice these would come from real market caps / sector exposures.
    fixed_weight_tables = {
        'market_cap_weighted': ({'QQQ': 0.4, 'SPY': 0.4, 'MSFT': 0.2}, 1.0 / len(other_cols)),
        'sector_weighted': ({'QQQ': 0.5, 'SPY': 0.3, 'MSFT': 0.8}, 0.1),
    }

    result_df = cross_data.copy()
    other_values = cross_data[other_cols].to_numpy(dtype=float)

    if method == 'simple_sum':
        result_df['cross_asset_ofi'] = cross_data[other_cols].sum(axis=1)

    elif method == 'weighted_sum':
        if correlations is None:
            correlations = cross_data[ofi_columns].corr()

        weights = np.array(
            [abs(correlations.loc[main_col, col]) for col in other_cols],
            dtype=float,
        )
        total_weight = weights.sum()
        if total_weight > 0:
            weights = weights / total_weight
        else:
            # Also reached when the total is NaN (e.g. a constant column)
            weights = np.ones(len(weights)) / len(weights)

        result_df['cross_asset_ofi'] = other_values @ weights

        # Store weights for reference
        result_df.attrs['cross_weights'] = dict(zip(other_cols, weights))

    elif method in fixed_weight_tables:
        table, default_weight = fixed_weight_tables[method]
        weights = np.array(
            [table.get(col[len(prefix):], default_weight) for col in other_cols],
            dtype=float,
        )
        total_weight = weights.sum()

        cross_ofi = other_values @ weights
        if total_weight > 0:
            cross_ofi = cross_ofi / total_weight

        result_df['cross_asset_ofi'] = cross_ofi

    else:
        # Previously an unknown method returned a frame without the
        # 'cross_asset_ofi' column, which failed later with an opaque KeyError.
        supported = ['simple_sum', 'weighted_sum'] + sorted(fixed_weight_tables)
        raise ValueError(f"Unknown method {method!r}; expected one of {supported}")

    return result_df

# Run every weighting scheme and report summary statistics for each one
if cross_asset_ofi is not None:
    methods = ['simple_sum', 'weighted_sum', 'market_cap_weighted', 'sector_weighted']
    cross_results = {}

    for method in methods:
        result = calculate_cross_asset_ofi(cross_asset_ofi, main_symbol, method=method)
        cross_series = result['cross_asset_ofi']
        cross_results[method] = cross_series.values

        lowest, highest = cross_series.min(), cross_series.max()
        corr_with_main = result[f'ofi_{main_symbol}'].corr(cross_series)

        print(f"\nCross-asset OFI using {method}:")
        print(f"  Mean: {cross_series.mean():.2f}")
        print(f"  Std: {cross_series.std():.2f}")
        print(f"  Range: [{lowest:.2f}, {highest:.2f}]")
        print(f"  Correlation with main asset: {corr_with_main:.3f}")


Cross-asset OFI using simple_sum:
  Mean: 78.28
  Std: 909.56
  Range: [-3478.42, 1952.53]
  Correlation with main asset: 0.798

Cross-asset OFI using weighted_sum:
  Mean: 33.39
  Std: 308.46
  Range: [-1176.92, 674.07]
  Correlation with main asset: 0.853

Cross-asset OFI using market_cap_weighted:
  Mean: 35.35
  Std: 308.44
  Range: [-1165.69, 686.62]
  Correlation with main asset: 0.840

Cross-asset OFI using sector_weighted:
  Mean: 11.75
  Std: 315.17
  Range: [-1164.07, 736.53]
  Correlation with main asset: 0.711


In [7]:
# Step 5.4: Analyze lagged cross-asset effects

def calculate_lagged_cross_correlations(cross_data, main_symbol, max_lags=5):
    """
    Correlate the main asset's OFI with lagged OFI of the other assets.

    The other assets are collapsed into one series by an equal-weight
    row-wise mean. For each lag from 0 to ``max_lags`` that series is shifted
    forward by ``lag`` rows and correlated with ``ofi_<main_symbol>``, which
    shows whether past cross-asset flow lines up with current main-asset flow.
    A table of the results is printed.

    Returns a dict mapping lag -> correlation, or None when ``cross_data``
    has no ``ofi_*`` column besides the main one.
    """
    target_col = f'ofi_{main_symbol}'
    peer_cols = [
        name for name in cross_data.columns
        if name.startswith('ofi_') and name != target_col
    ]
    if len(peer_cols) == 0:
        return None

    # Equal-weight average of the peers (not the correlation-weighted variant)
    peer_average = cross_data[peer_cols].mean(axis=1)
    target_series = cross_data[target_col]

    print(f"Lagged cross-correlations (other assets predicting {main_symbol}):")
    print("Lag | Correlation | Interpretation")
    print("----|-------------|--------------------------------------------")

    results = {}
    for lag in range(0, max_lags + 1):
        if lag > 0:
            # Row t of the shifted series holds the peer value from t - lag
            value = target_series.corr(peer_average.shift(lag))
            note = f"Cross-assets {lag} periods ago predict main asset"
        else:
            value = target_series.corr(peer_average)
            note = "Contemporaneous correlation"

        results[lag] = value
        print(f" {lag:2d} | {value:11.3f} | {note}")

    return results

# Compute the lag table and report the lag with the strongest |correlation|
if cross_asset_ofi is not None:
    lagged_corr = calculate_lagged_cross_correlations(cross_asset_ofi, main_symbol, max_lags=5)

    # Lag 0 is included here, so a purely contemporaneous link can win
    if lagged_corr:
        best_lag = max(lagged_corr, key=lambda lag: abs(lagged_corr[lag]))
        print(f"\nBest predictive lag: {best_lag} periods (correlation: {lagged_corr[best_lag]:.3f})")

Lagged cross-correlations (other assets predicting AAPL):
Lag | Correlation | Interpretation
----|-------------|--------------------------------------------
  0 |       0.798 | Contemporaneous correlation
  1 |      -0.067 | Cross-assets 1 periods ago predict main asset
  2 |       0.129 | Cross-assets 2 periods ago predict main asset
  3 |       0.008 | Cross-assets 3 periods ago predict main asset
  4 |      -0.099 | Cross-assets 4 periods ago predict main asset
  5 |       0.011 | Cross-assets 5 periods ago predict main asset

Best predictive lag: 0 periods (correlation: 0.798)


In [8]:
# Step 5.5: Create final Cross-Asset OFI feature

def _best_positive_lag(lagged_correlations):
    """Return the lag > 0 with the largest |correlation|, or 0 if none is usable."""
    best_lag, best_strength = 0, 0.0
    for lag, corr in lagged_correlations.items():
        # Lag 0 and undefined (NaN) correlations never win
        strength = abs(corr) if (lag > 0 and not pd.isna(corr)) else 0.0
        if strength > best_strength:
            best_lag, best_strength = lag, strength
    return best_lag


def create_final_cross_asset_ofi(cross_data, main_symbol, include_lags=True,
                                 correlations=None, lagged_correlations=None, max_lags=5):
    """
    Create the final cross-asset OFI feature combining multiple approaches.

    Parameters
    ----------
    cross_data : pandas.DataFrame
        Frame with a ``timestamp`` column and one ``ofi_<symbol>`` column per asset.
    main_symbol : str
        Symbol of the main asset.
    include_lags : bool, optional
        Whether to add the ``cross_asset_lagged`` feature.
    correlations : pandas.DataFrame or None, optional
        Correlation matrix labelled by ``ofi_*`` column names. Computed from
        ``cross_data`` when None, so no notebook-level variable is needed.
    lagged_correlations : dict or None, optional
        Mapping lag -> correlation used to pick the lag. When None it is
        computed here from the main OFI and the equal-weight peer average
        for lags ``0..max_lags``. An empty mapping disables the lagged feature.
    max_lags : int, optional
        Largest lag considered when ``lagged_correlations`` is computed here.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp``, ``main_asset_ofi``, ``cross_asset_simple``
        (equal-weight mean), ``cross_asset_weighted`` (|correlation|-weighted),
        optionally ``cross_asset_lagged``, and ``cross_asset_ofi`` (row-wise
        mean of all ``cross_asset*`` columns). If there are no other assets,
        only ``main_asset_ofi`` is returned.
    """
    main_col = f'ofi_{main_symbol}'
    other_cols = [col for col in cross_data.columns if col.startswith('ofi_') and col != main_col]

    if not other_cols:
        print("No cross-asset data available")
        return cross_data[[main_col]].rename(columns={main_col: 'main_asset_ofi'})

    result_df = cross_data[['timestamp']].copy()
    result_df['main_asset_ofi'] = cross_data[main_col]

    # Method 1: Simple cross-asset OFI (equal weights)
    result_df['cross_asset_simple'] = cross_data[other_cols].mean(axis=1)

    # Method 2: Correlation-weighted cross-asset OFI
    if correlations is None:
        correlations = cross_data[[main_col] + other_cols].corr()
    weights = np.array(
        [abs(correlations.loc[main_col, col]) for col in other_cols],
        dtype=float,
    )
    total_weight = weights.sum()
    if total_weight > 0:
        other_values = cross_data[other_cols].to_numpy(dtype=float)
        result_df['cross_asset_weighted'] = other_values @ (weights / total_weight)
    else:
        # No usable correlation information (zero or NaN total): use equal weights
        result_df['cross_asset_weighted'] = result_df['cross_asset_simple']

    # Method 3: Lagged cross-asset OFI, shifted by the strongest positive lag
    if include_lags:
        if lagged_correlations is None:
            lagged_correlations = {
                lag: result_df['main_asset_ofi'].corr(result_df['cross_asset_simple'].shift(lag))
                for lag in range(max_lags + 1)
            }
        if lagged_correlations:
            best_lag = _best_positive_lag(lagged_correlations)
            if best_lag > 0:
                result_df['cross_asset_lagged'] = result_df['cross_asset_weighted'].shift(best_lag)
            else:
                result_df['cross_asset_lagged'] = result_df['cross_asset_weighted']

    # Combined cross-asset feature (ensemble). The mean skips NaN, so the
    # first `best_lag` rows average only the unlagged features.
    cross_features = [col for col in result_df.columns if col.startswith('cross_asset')]
    if cross_features:
        result_df['cross_asset_ofi'] = result_df[cross_features].mean(axis=1)

    return result_df

# Build the final feature table and print a short report on it
if cross_asset_ofi is not None:
    final_cross_asset = create_final_cross_asset_ofi(cross_asset_ofi, main_symbol)

    print(f"\nFinal Cross-Asset OFI dataset:")
    print(f"Shape: {final_cross_asset.shape}")
    print(f"Columns: {final_cross_asset.columns.tolist()}")

    print(f"\nSample data:")
    print(final_cross_asset.head(8))

    # Summary statistics of the combined feature
    print(f"\nFinal Cross-Asset OFI statistics:")
    cross_ofi_col = 'cross_asset_ofi'
    if cross_ofi_col in final_cross_asset.columns:
        data = final_cross_asset[cross_ofi_col].dropna()
        nonzero_mask = data != 0
        lowest, highest = data.min(), data.max()

        print(f"  Mean: {data.mean():.2f}")
        print(f"  Std: {data.std():.2f}")
        print(f"  Range: [{lowest:.2f}, {highest:.2f}]")
        print(f"  Non-zero: {nonzero_mask.sum()} / {len(data)} ({nonzero_mask.mean()*100:.1f}%)")

        # Only reported when both series have the same number of non-null rows
        main_data = final_cross_asset['main_asset_ofi'].dropna()
        if len(main_data) == len(data):
            corr = data.corr(main_data)
            print(f"  Correlation with main asset: {corr:.3f}")


Final Cross-Asset OFI dataset:
Shape: (71, 6)
Columns: ['timestamp', 'main_asset_ofi', 'cross_asset_simple', 'cross_asset_weighted', 'cross_asset_lagged', 'cross_asset_ofi']

Sample data:
                  timestamp  main_asset_ofi  cross_asset_simple  \
0 2024-10-21 11:54:00+00:00      138.372260         -275.346181   
1 2024-10-21 11:55:00+00:00       50.325235         -205.889435   
2 2024-10-21 11:56:00+00:00      115.027530          -36.729069   
3 2024-10-21 11:57:00+00:00      -87.693208          -51.427148   
4 2024-10-21 11:58:00+00:00      309.402512           80.207675   
5 2024-10-21 11:59:00+00:00       66.441859          120.053674   
6 2024-10-21 12:00:00+00:00      -35.611352          125.871705   
7 2024-10-21 12:01:00+00:00      237.352259          306.303091   

   cross_asset_weighted  cross_asset_lagged  cross_asset_ofi  
0           -229.766092                 NaN      -252.556137  
1           -243.152759                 NaN      -224.521097  
2             79.2

In [9]:
# Step 5.6: Validation and saving

def validate_cross_asset_ofi(final_data, main_symbol):
    """
    Validate the cross-asset OFI calculation and print a short report.

    Parameters
    ----------
    final_data : pandas.DataFrame
        Must contain ``timestamp``, ``main_asset_ofi`` and ``cross_asset_ofi``.
    main_symbol : str
        Not used by the checks; kept for interface compatibility.

    Returns
    -------
    bool
        False when a required column is missing or the frame has no rows,
        True otherwise. Correlation and outlier findings are informational
        only and do not change the result.
    """

    print(f"\nCross-Asset OFI Validation:")
    print("="*40)

    # Check for completeness
    required_cols = ['timestamp', 'main_asset_ofi', 'cross_asset_ofi']
    missing_cols = [col for col in required_cols if col not in final_data.columns]

    if missing_cols:
        print(f"Missing required columns: {missing_cols}")
        return False

    total_periods = len(final_data)
    if total_periods == 0:
        # The percentages below would otherwise divide by zero
        print("No data to validate (0 time periods)")
        return False

    cross_data = final_data['cross_asset_ofi'].dropna()
    main_data = final_data['main_asset_ofi'].dropna()

    print(f"Data completeness:")
    print(f"  Total time periods: {total_periods}")
    print(f"  Main asset non-null: {len(main_data)} ({len(main_data)/total_periods*100:.1f}%)")
    print(f"  Cross-asset non-null: {len(cross_data)} ({len(cross_data)/total_periods*100:.1f}%)")

    # Correlate on rows where both values are present, so the check is not
    # skipped just because the two columns have different numbers of NaNs.
    paired = final_data[['main_asset_ofi', 'cross_asset_ofi']].dropna()
    if len(paired) > 1:
        correlation = paired['cross_asset_ofi'].corr(paired['main_asset_ofi'])

        if pd.isna(correlation):
            print("  Main vs Cross correlation: undefined (constant series)")
        else:
            print(f"  Main vs Cross correlation: {correlation:.3f}")
            if abs(correlation) > 0.9:
                print("  Warning: Very high correlation - might indicate data leakage")
            elif abs(correlation) < 0.05:
                print("  Warning: Very low correlation - cross-asset signal might be weak")
            else:
                print("  Correlation looks reasonable")

    # Share of points outside the series' own 1st/99th percentiles. By
    # construction this is roughly 2% for any sample, so it is descriptive
    # rather than a real outlier test.
    if len(cross_data) > 0:
        q99 = cross_data.quantile(0.99)
        q01 = cross_data.quantile(0.01)
        outlier_pct = ((cross_data > q99) | (cross_data < q01)).mean() * 100
        print(f"  Outliers (beyond 1st/99th percentile): {outlier_pct:.1f}%")

    print("Validation completed successfully")
    return True

import json

# Validate the final feature table and, if it passes, persist it together
# with a JSON summary describing what was produced.
if cross_asset_ofi is not None and 'final_cross_asset' in locals():
    is_valid = validate_cross_asset_ofi(final_cross_asset, main_symbol)

    if is_valid:
        # Save the cross-asset OFI
        final_cross_asset.to_csv('cross_asset_ofi.csv', index=False)

        # Descriptions for every feature column that may be written to the CSV.
        # 'cross_asset_lagged' was previously missing from the summary even
        # though the column is saved; only columns actually present are listed.
        feature_descriptions = {
            'main_asset_ofi': 'Single-asset integrated OFI',
            'cross_asset_simple': 'Simple average of other assets OFI',
            'cross_asset_weighted': 'Correlation-weighted other assets OFI',
            'cross_asset_lagged': 'Correlation-weighted other assets OFI at the best predictive lag',
            'cross_asset_ofi': 'Combined cross-asset OFI feature'
        }

        # Create a summary of all OFI features
        all_ofi_summary = {
            'main_symbol': main_symbol,
            'timestamp_range': {
                'start': final_cross_asset['timestamp'].min().isoformat(),
                'end': final_cross_asset['timestamp'].max().isoformat()
            },
            'features_created': {
                name: description
                for name, description in feature_descriptions.items()
                if name in final_cross_asset.columns
            },
            'statistics': {
                'total_periods': len(final_cross_asset),
                'main_asset_active_periods': int((final_cross_asset['main_asset_ofi'] != 0).sum()),
                'cross_asset_active_periods': int((final_cross_asset['cross_asset_ofi'].dropna() != 0).sum())
            }
        }

        with open('cross_asset_ofi_summary.json', 'w') as f:
            json.dump(all_ofi_summary, f, indent=2)

        print(f"\nResults saved:")
        print(f"- cross_asset_ofi.csv: Cross-asset OFI time series")
        print(f"- cross_asset_ofi_summary.json: Calculation summary and metadata")

        # Final summary
        print(f"\nStep 5 Summary - Cross-Asset OFI:")
        print(f"===============================")
        print(f"- Created cross-asset OFI using {len([col for col in cross_asset_ofi.columns if col.startswith('ofi_')]) - 1} related assets")
        print(f"- Applied multiple weighting schemes (equal, correlation, market-cap, sector)")
        print(f"- Analyzed lagged relationships for predictive signals")
        print(f"- Combined methods into final cross-asset OFI feature")
        print(f"- Time periods: {len(final_cross_asset)}")

        if 'lagged_corr' in locals() and lagged_corr:
            best_lag = max(lagged_corr.keys(), key=lambda k: abs(lagged_corr[k]))
            print(f"- Best predictive lag: {best_lag} periods")

        print(f"\nAll OFI Features Completed:")
        print(f"✓ Best-Level OFI")
        print(f"✓ Multi-Level OFI") 
        print(f"✓ Integrated OFI")
        print(f"✓ Cross-Asset OFI")

        print(f"\nReady for conceptual questions!")


Cross-Asset OFI Validation:
Data completeness:
  Total time periods: 71
  Main asset non-null: 71 (100.0%)
  Cross-asset non-null: 71 (100.0%)
  Main vs Cross correlation: 0.743
  Correlation looks reasonable
  Outliers (beyond 1st/99th percentile): 2.8%
Validation completed successfully

Results saved:
- cross_asset_ofi.csv: Cross-asset OFI time series
- cross_asset_ofi_summary.json: Calculation summary and metadata

Step 5 Summary - Cross-Asset OFI:
- Created cross-asset OFI using 3 related assets
- Applied multiple weighting schemes (equal, correlation, market-cap, sector)
- Analyzed lagged relationships for predictive signals
- Combined methods into final cross-asset OFI feature
- Time periods: 71
- Best predictive lag: 0 periods

All OFI Features Completed:
✓ Best-Level OFI
✓ Multi-Level OFI
✓ Integrated OFI
✓ Cross-Asset OFI

Ready for conceptual questions!
