In [None]:
# Test Data Feature Engineering Pipeline
# Apply same feature engineering as training data
#
# This cell sets up the environment (imports, Google Drive mount), loads the
# raw test set into the global `test` DataFrame and defines every
# feature-engineering helper used by the cells below.

import pandas as pd
import numpy as np
import warnings
# Silences every warning for the rest of the session (including pandas
# deprecation/performance warnings raised by the helpers below).
warnings.filterwarnings('ignore')


# Colab-only: makes the user's Google Drive available under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


# Load Test Data
# DRIVE_PATH is reused by the final cell when writing the engineered output.
DRIVE_PATH = '/content/drive/MyDrive/DRW Crypto Market Prediction/'
test = pd.read_parquet(DRIVE_PATH + 'test.parquet')
print(f"Test data loaded: {test.shape}")

# Hull Moving Average Functions
def hull_moving_average(series, period):
    """Calculate the Hull Moving Average (HMA) of a series.

    The HMA is built from linearly weighted moving averages (WMA), where the
    most recent observation in a window gets the largest weight:

        raw = 2 * WMA(series, int(period / 2)) - WMA(series, period)
        hma = WMA(raw, int(sqrt(period)))

    Parameters
    ----------
    series : pd.Series
        Input values.
    period : int
        Base look-back window of the HMA.

    Returns
    -------
    pd.Series
        HMA values on the same index as ``series``. Leading positions where a
        full window is not yet available are NaN. If ``series`` is shorter
        than ``period`` an all-NaN series is returned.
    """
    if len(series) < period:
        return pd.Series([np.nan] * len(series), index=series.index)

    half_period = int(period / 2)
    sqrt_period = int(np.sqrt(period))

    def _weighted_ma(values, window):
        # Weights 1..window and their sum are loop-invariant, so build them
        # once per rolling pass instead of once per window.
        weights = np.arange(1, window + 1)
        weight_total = np.sum(weights)
        # raw=True hands each window to the callback as a NumPy array rather
        # than constructing a Series per window, which is dramatically faster
        # on long inputs while producing the same weighted average.
        return values.rolling(window=window).apply(
            lambda x: np.sum(x * weights) / weight_total,
            raw=True,
        )

    wma_half = _weighted_ma(series, half_period)
    wma_full = _weighted_ma(series, period)

    raw_hma = 2 * wma_half - wma_full
    hma = _weighted_ma(raw_hma, sqrt_period)

    return hma

def create_hma_features(df, price_features, periods=[5, 10, 20, 50, 100, 200]):
    """Add Hull Moving Average columns to ``df`` for a subset of price columns.

    Only the first 20 entries of ``price_features`` are processed (to keep
    memory usage bounded). For each of them and each value in ``periods`` a
    column named ``hma_<feature>_<period>`` is written into ``df`` in place.

    Returns the (mutated) ``df`` and the list of created column names, in
    creation order.
    """
    created_columns = []

    # Feature-major, period-minor ordering of the generated columns.
    feature_period_pairs = (
        (name, span) for name in price_features[:20] for span in periods
    )
    for name, span in feature_period_pairs:
        column_name = f'hma_{name}_{span}'
        df[column_name] = hull_moving_average(df[name], span)
        created_columns.append(column_name)

    return df, created_columns

# Market Regime Functions
def classify_market_regime_crypto(df, lookback=20, vol_threshold=0.02, high_vol_threshold=0.04):
    """Label every row of ``df`` with a market regime derived from column ``X1``.

    Uses the rolling mean and rolling standard deviation (window ``lookback``)
    of the one-step percentage change of ``X1``:

    * ``'HIGH_VOL'`` - rolling std is above ``high_vol_threshold``
    * ``'DOWN'``     - otherwise, rolling mean is below ``-vol_threshold``
    * ``'UP'``       - otherwise, rolling mean is above ``vol_threshold``
    * ``'STABLE'``   - everything else (including rows without a full window)

    Returns a Series of labels on ``df.index``.
    """
    pct_moves = df['X1'].pct_change()
    window = pct_moves.rolling(lookback)
    average_move = window.mean()
    move_std = window.std()

    is_high_vol = move_std > high_vol_threshold
    is_up = average_move > vol_threshold
    is_down = average_move < -vol_threshold

    # Apply labels from lowest to highest precedence; each later mask
    # overrides the earlier ones, so HIGH_VOL wins over the trend labels.
    labels = pd.Series('STABLE', index=df.index)
    labels = labels.mask(is_up, 'UP')
    labels = labels.mask(is_down, 'DOWN')
    labels = labels.mask(is_high_vol, 'HIGH_VOL')

    return labels

def create_regime_specific_features(df, regime, lookback_periods=[5, 10, 20, 50]):
    """Add regime-related features derived from column ``X1`` to ``df`` in place.

    Features created (in this order):

    * ``vol_cluster_short`` / ``_medium`` / ``_long`` - rolling std of the
      one-step percentage change of ``X1`` over 5 / 10 / 20 rows.
    * ``trend_strength_<p>`` and ``trend_direction_<p>`` for p in 20, 50, 100
      (only when ``len(df) >= p``) - absolute value and sign of the relative
      change between the first and last value of each p-row window.
    * ``momentum_<p>`` for p in 5, 10, 20, 50 - ``X1.pct_change(p)``.
    * ``regime_persistence`` - rolling (10) std of the numerically encoded
      ``regime`` labels (STABLE=0, UP=1, DOWN=-1, HIGH_VOL=2).
    * ``market_momentum_<p>`` for p in 10, 20, 50 - signed first-to-last
      relative change of each p-row window.
    * ``market_volatility`` - rolling (20) std of the percentage change.
    * ``relative_perf_<col>`` for ``X230`` / ``X231`` when present -
      ``col / X1 - 1``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain column ``X1``. Mutated in place.
    regime : pd.Series
        Regime labels aligned with ``df``.
    lookback_periods : list
        Kept for backward compatibility. The volatility-cluster labels are
        tied directly to their windows (5/10/20), which is exactly what the
        default value produced; previously a non-default list could produce
        duplicate ``vol_cluster_short`` columns or an ``IndexError``.

    Returns
    -------
    (pd.DataFrame, list)
        The same ``df`` object and the names of the created columns.
    """
    features = []

    price = df['X1']
    returns = price.pct_change()

    # The first-to-last relative change of a window is needed by both the
    # trend and the market-momentum features; compute each period only once.
    window_change_cache = {}

    def window_change(period):
        if period not in window_change_cache:
            # raw=True passes NumPy arrays instead of building a Series per
            # window; the arithmetic is unchanged. A zero first value yields 0
            # rather than a division by zero.
            window_change_cache[period] = price.rolling(period).apply(
                lambda w: (w[-1] - w[0]) / w[0] if w[0] != 0 else 0,
                raw=True,
            )
        return window_change_cache[period]

    # Volatility clustering
    for period, label in ((5, 'short'), (10, 'medium'), (20, 'long')):
        vol_col = f'vol_cluster_{label}'
        df[vol_col] = returns.rolling(period).std()
        features.append(vol_col)

    # Trend strength and direction
    for period in [20, 50, 100]:
        if period <= len(df):
            trend_strength_col = f'trend_strength_{period}'
            trend_direction_col = f'trend_direction_{period}'

            price_change = window_change(period)
            df[trend_strength_col] = price_change.abs()
            df[trend_direction_col] = np.sign(price_change)

            features.extend([trend_strength_col, trend_direction_col])

    # Momentum indicators
    for period in [5, 10, 20, 50]:
        momentum_col = f'momentum_{period}'
        df[momentum_col] = price.pct_change(period)
        features.append(momentum_col)

    # Regime persistence
    regime_numeric = regime.map({'STABLE': 0, 'UP': 1, 'DOWN': -1, 'HIGH_VOL': 2})
    df['regime_persistence'] = regime_numeric.rolling(10).std()
    features.append('regime_persistence')

    # Market momentum
    for period in [10, 20, 50]:
        momentum_col = f'market_momentum_{period}'
        df[momentum_col] = window_change(period)
        features.append(momentum_col)

    # Market volatility
    df['market_volatility'] = returns.rolling(20).std()
    features.append('market_volatility')

    # Relative performance (for available X features), in df column order
    available_x_features = [col for col in df.columns if col in ('X230', 'X231')]
    for feature in available_x_features:
        rel_perf_col = f'relative_perf_{feature}'
        df[rel_perf_col] = df[feature] / price - 1
        features.append(rel_perf_col)

    return df, features

# Rolling Statistics Functions
def create_rolling_statistics(df, price_features, windows=[5, 10, 20, 50]):
    """Add rolling mean/std (and min/max for short windows) columns to ``df``.

    Only the first 20 entries of ``price_features`` are considered, and
    entries that are not columns of ``df`` are skipped. For every remaining
    feature and every window the columns ``<feature>_rolling_mean_<w>`` and
    ``<feature>_rolling_std_<w>`` are written; windows of at most 20 rows
    additionally get ``<feature>_rolling_min_<w>`` and
    ``<feature>_rolling_max_<w>``.

    Returns the (mutated) ``df`` and the list of created column names.
    """
    created = []

    # Cap the number of source columns to keep memory usage bounded.
    for name in price_features[:20]:
        if name not in df.columns:
            continue

        for span in windows:
            roller = df[name].rolling(span)

            # Mean and standard deviation are produced for every window.
            for stat in ('mean', 'std'):
                column = f'{name}_rolling_{stat}_{span}'
                df[column] = getattr(roller, stat)()
                created.append(column)

            # Extremes are only produced for the shorter windows.
            if span > 20:
                continue
            low_column = f'{name}_rolling_min_{span}'
            high_column = f'{name}_rolling_max_{span}'
            df[low_column] = roller.min()
            df[high_column] = roller.max()
            created.extend([low_column, high_column])

    return df, created

def create_feature_interactions(df, feature_groups, max_interactions=50):
    """Add pairwise ratio and relative-spread columns between feature groups.

    Each group is first reduced to the names that exist in ``df`` (empty
    groups are dropped). For every pair of distinct groups, taken in order,
    the first three features of each are combined:

    * ``<a>_div_<b>_ratio``        = a / (b + 1e-8)
    * ``<a>_div_<b>_spread_ratio`` = (a - b) / (b + 1e-8)

    Generation stops as soon as ``max_interactions`` columns were created.
    Returns the (mutated) ``df`` and the list of created column names.
    """
    created = []

    # Keep only the columns that exist, and at most five per group.
    usable_groups = []
    for group in feature_groups:
        present = [name for name in group if name in df.columns]
        if present:
            usable_groups.append(present[:5])

    group_count = len(usable_groups)
    for left_idx in range(group_count):
        for right_idx in range(left_idx + 1, group_count):
            for numerator in usable_groups[left_idx][:3]:
                for denominator in usable_groups[right_idx][:3]:
                    # Budget exhausted: nothing further can be created.
                    if len(created) >= max_interactions:
                        return df, created
                    if numerator == denominator:
                        continue

                    ratio_name = f'{numerator}_div_{denominator}_ratio'
                    df[ratio_name] = df[numerator] / (df[denominator] + 1e-8)
                    created.append(ratio_name)

                    # The spread is only added while budget remains.
                    if len(created) >= max_interactions:
                        continue
                    spread_name = f'{numerator}_div_{denominator}_spread_ratio'
                    df[spread_name] = (df[numerator] - df[denominator]) / (df[denominator] + 1e-8)
                    created.append(spread_name)

    return df, created

def create_market_microstructure_features(df):
    """Add order-book, volume, momentum and velocity features to ``df`` in place.

    Optional inputs are used only when the corresponding columns exist:

    * ``bid_qty`` and ``ask_qty`` -> ``bid_ask_spread`` (ask - bid) and
      ``bid_ask_ratio`` (bid / (ask + 1e-8))
    * ``buy_qty`` and ``sell_qty`` -> ``order_flow_imbalance``
    * ``volume`` -> ``volume_ma_5``, ``volume_ma_20``, ``volume_ratio``

    Column ``X1`` is required and yields ``price_momentum_3/5/10``. First and
    second differences (``_velocity`` / ``_acceleration``) are added for
    ``X1`` and, when present, ``X2``.

    Returns the (mutated) ``df`` and the list of created column names.
    """
    created = []
    eps = 1e-8

    # Quantity difference / ratio between the two book sides
    if {'bid_qty', 'ask_qty'}.issubset(df.columns):
        bids, asks = df['bid_qty'], df['ask_qty']
        df['bid_ask_spread'] = asks - bids
        df['bid_ask_ratio'] = bids / (asks + eps)
        created += ['bid_ask_spread', 'bid_ask_ratio']

    # Normalised buy/sell imbalance
    if {'buy_qty', 'sell_qty'}.issubset(df.columns):
        buys, sells = df['buy_qty'], df['sell_qty']
        df['order_flow_imbalance'] = (buys - sells) / (buys + sells + eps)
        created.append('order_flow_imbalance')

    # Volume moving averages and volume relative to its 20-row average
    if 'volume' in df.columns:
        volume = df['volume']
        df['volume_ma_5'] = volume.rolling(5).mean()
        df['volume_ma_20'] = volume.rolling(20).mean()
        df['volume_ratio'] = volume / (df['volume_ma_20'] + eps)
        created += ['volume_ma_5', 'volume_ma_20', 'volume_ratio']

    # Percentage change of X1 over several horizons
    for horizon in (3, 5, 10):
        name = f'price_momentum_{horizon}'
        df[name] = df['X1'].pct_change(horizon)
        created.append(name)

    # First and second differences for the first two X columns
    for source in ('X1', 'X2'):
        if source not in df.columns:
            continue
        velocity_name = f'{source}_velocity'
        acceleration_name = f'{source}_acceleration'

        velocity = df[source].diff()
        df[velocity_name] = velocity
        df[acceleration_name] = velocity.diff()

        created += [velocity_name, acceleration_name]

    return df, created

def create_time_based_features(df):
    """Add row-position features to ``df`` in place.

    Created columns:

    * ``position``           - 0-based row number
    * ``position_mod_100``   - row number modulo 100
    * ``position_mod_1000``  - row number modulo 1000
    * ``position_sin`` / ``position_cos`` - sine and cosine of the row number
      mapped onto a cycle of 1000 rows

    Returns the (mutated) ``df`` and the list of created column names.
    """
    # Row number within the frame
    df['position'] = range(len(df))
    row_number = df['position']

    # Remainders for two cycle lengths
    df['position_mod_100'] = row_number % 100
    df['position_mod_1000'] = row_number % 1000

    # Angle on a 1000-row cycle, shared by the sine and cosine encodings
    angle = 2 * np.pi * row_number / 1000
    df['position_sin'] = np.sin(angle)
    df['position_cos'] = np.cos(angle)

    created = [
        'position',
        'position_mod_100',
        'position_mod_1000',
        'position_sin',
        'position_cos',
    ]
    return df, created


Mounted at /content/drive
Test data loaded: (538150, 786)


In [2]:
# Apply All Feature Engineering
# Runs the six feature builders defined in the first cell, in order, against
# the global `test` frame. Every builder adds its columns to the frame it is
# given and returns that frame together with the list of new column names;
# those lists are summed up in the final cell.
# NOTE(review): the recorded output of this cell stops with a
# KeyboardInterrupt during step 1, so the later steps did not run in the
# saved session.
print("\n1. Creating HMA features...")
# Get price features
# All columns whose name starts with 'X'; create_hma_features and
# create_rolling_statistics each use only the first 20 of them.
price_features = [col for col in test.columns if col.startswith('X')]
test, hma_features = create_hma_features(test, price_features)
print(f"HMA features created: {len(hma_features)}")

print("\n2. Creating regime features...")
# The regime labels are computed from column X1 and then fed into the
# regime feature builder (used there for 'regime_persistence').
regime = classify_market_regime_crypto(test)
test, regime_features = create_regime_specific_features(test, regime)
print(f"Regime features created: {len(regime_features)}")

print("\n3. Creating rolling statistics...")
test, rolling_features = create_rolling_statistics(test, price_features)
print(f"Rolling features created: {len(rolling_features)}")

print("\n4. Creating feature interactions...")
# Three groups (raw price columns, HMA columns, regime columns); interactions
# are built between groups, capped by create_feature_interactions' default
# max_interactions.
feature_groups = [price_features[:10], hma_features[:20], regime_features[:10]]
test, interaction_features = create_feature_interactions(test, feature_groups)
print(f"Interaction features created: {len(interaction_features)}")

print("\n5. Creating microstructure features...")
test, microstructure_features = create_market_microstructure_features(test)
print(f"Microstructure features created: {len(microstructure_features)}")

print("\n6. Creating time-based features...")
test, time_features = create_time_based_features(test)
print(f"Time features created: {len(time_features)}")



1. Creating HMA features...


KeyboardInterrupt: 

In [None]:

# Save Engineered Test Data
# Requires the previous cell to have run to completion: it reads `test` and
# all six *_features lists defined there.
print(f"\nFinal test data shape: {test.shape}")
# Sum of the per-step feature-name lists returned by the builders.
total_new_features = len(hma_features) + len(regime_features) + len(rolling_features) + len(interaction_features) + len(microstructure_features) + len(time_features)
print(f"Total new features created: {total_new_features}")

# Save engineered test data
# Written next to the input file under DRIVE_PATH; index=False means the
# DataFrame index is not stored in the parquet file.
test.to_parquet(DRIVE_PATH + 'test_with_features.parquet', index=False)
print(f"Saved engineered test data to: test_with_features.parquet")

print("\nFeature engineering completed!")
print("Now run the submission pipeline with the engineered test data.")