In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

print("Loading Data")
# Load the data with HMA and regime features
train = pd.read_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/train_with_features.parquet')
print(f"Loaded data shape: {train.shape}")

# Verify features are present
hma_features = [col for col in train.columns if 'hma_' in col]
regime_features = [col for col in train.columns if any(x in col for x in ['regime_', 'vol_cluster_', 'trend_', 'momentum_', 'market_'])]
print(f"HMA features: {len(hma_features)}")
print(f"Regime features: {len(regime_features)}")

# Get price-like features (excluding target and time columns)
price_features = [col for col in train.columns if any(x in col for x in ['X', 'Y']) and col not in ['time_id', 'target']]
print(f"Price-like features: {len(price_features)}")

Mounted at /content/drive
Loading Data
Loaded data shape: (525886, 1508)
HMA features: 220
Regime features: 132
Price-like features: 1502


In [7]:
# VERIFY time_id and target exist
print(f"Has time_id: {'time_id' in train.columns}")
print(f"Has target: {'target' in train.columns}")

# Get price-like features (excluding target and time columns)
price_features = [col for col in train.columns if any(x in col for x in ['X', 'Y']) and col not in ['time_id', 'target']]
print(f"Price-like features: {len(price_features)}")

Has time_id: False
Has target: False
Price-like features: 4564


In [2]:
# rolling stats function

def create_rolling_statistics(df, features, windows=[5, 10, 20, 50, 100]):
    """
    Create rolling statistics for given features
    """
    print(f"Creating rolling statistics for {len(features)} features with windows {windows}")

    rolling_features = []

    for feature in features:
        for window in windows:
            # Rolling mean
            col_name = f'{feature}_rolling_mean_{window}'
            df[col_name] = df[feature].rolling(window=window, min_periods=1).mean()
            rolling_features.append(col_name)

            # Rolling std
            col_name = f'{feature}_rolling_std_{window}'
            df[col_name] = df[feature].rolling(window=window, min_periods=1).std()
            rolling_features.append(col_name)

            # Rolling min/max
            col_name = f'{feature}_rolling_min_{window}'
            df[col_name] = df[feature].rolling(window=window, min_periods=1).min()
            rolling_features.append(col_name)

            col_name = f'{feature}_rolling_max_{window}'
            df[col_name] = df[feature].rolling(window=window, min_periods=1).max()
            rolling_features.append(col_name)

            # Rolling quantiles
            col_name = f'{feature}_rolling_q25_{window}'
            df[col_name] = df[feature].rolling(window=window, min_periods=1).quantile(0.25)
            rolling_features.append(col_name)

            col_name = f'{feature}_rolling_q75_{window}'
            df[col_name] = df[feature].rolling(window=window, min_periods=1).quantile(0.75)
            rolling_features.append(col_name)

    print(f"Created {len(rolling_features)} rolling statistics features")
    return df, rolling_features

In [3]:
# feature interactions function

def create_feature_interactions(df, feature_groups, max_interactions=1000):
    """
    Create meaningful feature interactions
    """
    print("Creating feature interactions...")

    interaction_features = []
    interaction_count = 0

    # 1. Price feature interactions (ratios, spreads)
    price_features = [col for col in df.columns if any(x in col for x in ['X', 'Y']) and col not in ['time_id', 'target']]

    # Create some key price ratios
    for i in range(min(10, len(price_features))):
        for j in range(i+1, min(11, len(price_features))):
            if interaction_count >= max_interactions:
                break

            feat1, feat2 = price_features[i], price_features[j]

            # Ratio
            col_name = f'{feat1}_div_{feat2}'
            df[col_name] = df[feat1] / (df[feat2] + 1e-8)
            interaction_features.append(col_name)
            interaction_count += 1

            # Spread
            col_name = f'{feat1}_minus_{feat2}'
            df[col_name] = df[feat1] - df[feat2]
            interaction_features.append(col_name)
            interaction_count += 1

    # 2. HMA feature interactions
    hma_features = [col for col in df.columns if 'hma_' in col]

    # Create HMA ratios for different windows
    hma_windows = [5, 10, 20, 50, 100]
    for i, window1 in enumerate(hma_windows):
        for j, window2 in enumerate(hma_windows[i+1:], i+1):
            if interaction_count >= max_interactions:
                break

            # Find HMA features for these windows
            hma_short = [f for f in hma_features if f'_{window1}' in f]
            hma_long = [f for f in hma_features if f'_{window2}' in f]

            # Create interactions between short and long HMA
            for short_feat in hma_short[:5]:  # Limit to avoid too many features
                for long_feat in hma_long[:5]:
                    if interaction_count >= max_interactions:
                        break

                    base_short = short_feat.replace(f'_hma_{window1}', '')
                    base_long = long_feat.replace(f'_hma_{window2}', '')

                    if base_short == base_long:
                        # Ratio of short to long HMA
                        col_name = f'{short_feat}_div_{long_feat}'
                        df[col_name] = df[short_feat] / (df[long_feat] + 1e-8)
                        interaction_features.append(col_name)
                        interaction_count += 1

    # 3. Regime feature interactions
    regime_features = [col for col in df.columns if any(x in col for x in ['regime_', 'vol_cluster_', 'trend_', 'momentum_'])]

    # Create regime strength indicators
    for regime_feat in regime_features[:10]:  # Limit to avoid explosion
        # Interaction with volatility
        vol_features = [col for col in df.columns if 'rolling_std' in col][:5]
        for vol_feat in vol_features:
            if interaction_count >= max_interactions:
                break

            col_name = f'{regime_feat}_times_{vol_feat}'
            df[col_name] = df[regime_feat] * df[vol_feat]
            interaction_features.append(col_name)
            interaction_count += 1

    print(f"Created {len(interaction_features)} interaction features")
    return df, interaction_features

In [4]:
# market microstrucutre function

def create_market_microstructure_features(df):
    """
    Create market microstructure features
    """
    print("Creating market microstructure features...")

    microstructure_features = []

    # 1. Bid-ask spread proxies
    price_features = [col for col in df.columns if any(x in col for x in ['X', 'Y']) and col not in ['time_id', 'target']]

    for i in range(0, len(price_features), 2):
        if i+1 < len(price_features):
            ask_feat = price_features[i]
            bid_feat = price_features[i+1]

            # Spread
            col_name = f'{ask_feat}_minus_{bid_feat}_spread'
            df[col_name] = df[ask_feat] - df[bid_feat]
            microstructure_features.append(col_name)

            # Spread ratio
            col_name = f'{ask_feat}_div_{bid_feat}_spread_ratio'
            df[col_name] = df[ask_feat] / (df[bid_feat] + 1e-8)
            microstructure_features.append(col_name)

    # 2. Order book imbalance
    for i in range(0, len(price_features), 4):
        if i+3 < len(price_features):
            # Assume order book levels
            level1_ask = price_features[i]
            level1_bid = price_features[i+1]
            level2_ask = price_features[i+2]
            level2_bid = price_features[i+3]

            # Order book imbalance
            col_name = f'ob_imbalance_{i//4}'
            df[col_name] = (df[level1_ask] + df[level2_ask]) / (df[level1_bid] + df[level2_bid] + 1e-8)
            microstructure_features.append(col_name)

    # 3. Price momentum indicators
    for feature in price_features[:20]:  # Limit to avoid too many features
        # Price acceleration
        col_name = f'{feature}_acceleration'
        df[col_name] = df[feature].diff().diff()
        microstructure_features.append(col_name)

        # Price velocity
        col_name = f'{feature}_velocity'
        df[col_name] = df[feature].diff()
        microstructure_features.append(col_name)

    print(f"Created {len(microstructure_features)} microstructure features")
    return df, microstructure_features

In [5]:
# time based features

def create_time_based_features(df):
    """
    Create time-based features
    """
    print("Creating time-based features...")

    time_features = []

    # 1. Time of day features (if time_id is available)
    if 'time_id' in df.columns:
        # Extract hour from time_id (assuming it's in seconds or similar)
        df['hour'] = (df['time_id'] // 3600) % 24
        df['minute'] = (df['time_id'] // 60) % 60

        # Cyclical encoding
        df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
        df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
        df['minute_sin'] = np.sin(2 * np.pi * df['minute'] / 60)
        df['minute_cos'] = np.cos(2 * np.pi * df['minute'] / 60)

        time_features.extend(['hour_sin', 'hour_cos', 'minute_sin', 'minute_cos'])

    # 2. Sequential position features
    df['position'] = range(len(df))
    df['position_mod_100'] = df['position'] % 100
    df['position_mod_1000'] = df['position'] % 1000

    # Cyclical encoding for position
    df['position_sin'] = np.sin(2 * np.pi * df['position'] / 100)
    df['position_cos'] = np.cos(2 * np.pi * df['position'] / 100)

    time_features.extend(['position_mod_100', 'position_mod_1000', 'position_sin', 'position_cos'])

    print(f"Created {len(time_features)} time-based features")
    return df, time_features

In [6]:
# main

# 1. Rolling Statistics
print("\n1. Creating rolling statistics...")
train, rolling_features = create_rolling_statistics(train, price_features[:20])  # Limit to avoid memory issues

# 2. Feature Interactions
print("\n2. Creating feature interactions...")
train, interaction_features = create_feature_interactions(train, [price_features, hma_features, regime_features])

# 3. Market Microstructure
print("\n3. Creating microstructure features...")
train, microstructure_features = create_market_microstructure_features(train)

# 4. Time-based features
print("\n4. Creating time-based features...")
train, time_features = create_time_based_features(train)

# Save results (MAKE SURE TO KEEP time_id and target)
print("\n saving results")
# Ensure time_id and target are included
if 'time_id' not in train.columns or 'target' not in train.columns:
    print("ERROR: time_id or target missing!")
else:
    train.to_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/train_with_rolling_interactions.parquet', index=False)
    print("✅ Saved with time_id and target")

# Summary
print(f"\n final summary ")
print(f"Final dataset shape: {train.shape}")
print(f"Rolling features created: {len(rolling_features)}")
print(f"Interaction features created: {len(interaction_features)}")
print(f"Microstructure features created: {len(microstructure_features)}")
print(f"Time features created: {len(time_features)}")

# Memory usage
print(f"\nMemory usage: {train.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\n Rolling statistics and interactions completed!")
print("Saved to: /content/drive/MyDrive/DRW Crypto Market Prediction/train_with_rolling_interactions.parquet")


1. Creating rolling statistics...
Creating rolling statistics for 20 features with windows [5, 10, 20, 50, 100]
Created 600 rolling statistics features

2. Creating feature interactions...
Creating feature interactions...
Created 160 interaction features

3. Creating microstructure features...
Creating market microstructure features...
Created 2867 microstructure features

4. Creating time-based features...
Creating time-based features...
Created 4 time-based features

 saving results
ERROR: time_id or target missing!

 final summary 
Final dataset shape: (525886, 5140)
Rolling features created: 600
Interaction features created: 160
Microstructure features created: 2867
Time features created: 4

Memory usage: 20626.68 MB

 Rolling statistics and interactions completed!
Saved to: /content/drive/MyDrive/DRW Crypto Market Prediction/train_with_rolling_interactions.parquet
