In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and print the full path of every file found.
# NOTE: the loop variables (including `_`, which IPython otherwise uses for the last
# cell output) are left behind as kernel-global names after this cell runs.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join directory and file name so each printed line is a complete path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/drw-crypto-market-prediction/sample_submission.csv
/kaggle/input/drw-crypto-market-prediction/train.parquet
/kaggle/input/drw-crypto-market-prediction/test.parquet


In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
DRW Crypto Market Prediction - Dual Model Approach
Combines original model with feature-augmented model for ensemble prediction
"""

import sys
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from xgboost import XGBRegressor
from scipy.stats import pearsonr
from scipy.special import expit
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
# This seeds only NumPy's legacy global RNG (consumed later by np.random.choice in main());
# the XGBoost models receive their own seed through the `random_state` parameter.
np.random.seed(42)

# Banner marking the start of the run in the cell output.
print("=" * 80)
print("DRW CRYPTO MARKET PREDICTION - DUAL MODEL APPROACH")
print("=" * 80)

# ==============================================================================
# PART 1: TARGETED FEATURE ENGINEERING
# ==============================================================================

class TargetedFeatureEngineer:
    """
    Create specific complex interaction features for augmented model.

    Several of the features depend on dataset-level statistics (means,
    standard deviations, percentiles).  Those statistics are captured on the
    first ("fitting") call and reused on later calls, so that a train frame
    and a test frame passed in that order are transformed with the *same*
    constants instead of each frame being normalised by its own distribution.
    """
    
    def __init__(self):
        # Kept for backward compatibility; not used by create_augmented_features.
        self.feature_scaler = RobustScaler()
        # key -> dataset statistic captured during the fitting call.
        self._fitted_stats = {}
    
    def create_augmented_features(self, df, base_features, fit=None):
        """
        Create at least 5 complex interaction features based on top performing features.

        Each feature group is only produced when all of its source columns are
        present in ``df``; groups with missing columns are silently skipped.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame holding the raw columns the features are derived from.
        base_features : list
            Accepted for interface compatibility; currently not used.
        fit : bool or None, default None
            ``True``  - (re)compute the dataset statistics from ``df``.
            ``False`` - reuse previously captured statistics; any statistic
            that has never been captured is computed from ``df`` and stored.
            ``None``  - fit when nothing has been captured yet, otherwise
            reuse (i.e. the first call fits, later calls reuse).

        Returns
        -------
        (numpy.ndarray, list of str)
            A ``(len(df), n_features)`` array (zero columns when no group
            applies) and the matching feature names, in column order.
        """
        print("\nCreating targeted augmented features...")

        if fit is None:
            fit = not self._fitted_stats

        def stat(key, compute):
            # Return the stored statistic, computing it from the current frame
            # when fitting or when it has never been captured.
            if fit or key not in self._fitted_stats:
                self._fitted_stats[key] = compute()
            return self._fitted_stats[key]

        augmented_features = []
        feature_names = []
        
        # 1. COMPLEX INTERACTIONS BETWEEN TOP FEATURES
        # Based on importance: X862, X852, X345, X532, X888
        if all(f in df.columns for f in ['X862', 'X852', 'X345']):
            # Three-way interaction with non-linearity
            feat1 = df['X862'].values
            feat2 = df['X852'].values
            feat3 = df['X345'].values
            
            # Feature 1: Non-linear three-way interaction
            interaction = np.tanh(feat1) * np.exp(-np.abs(feat2) / 2) * np.sign(feat3)
            augmented_features.append(interaction)
            feature_names.append('complex_interaction_862_852_345')
            
            # Feature 2: Polynomial ratio
            ratio_poly = (feat1 ** 2) / (feat2 ** 2 + 1)
            augmented_features.append(ratio_poly)
            feature_names.append('poly_ratio_862_852')
            
            # Subtraction interactions
            sub_interaction1 = feat1 - feat2
            augmented_features.append(sub_interaction1)
            feature_names.append('X862_minus_X852')
            
            sub_interaction2 = feat2 - feat3
            augmented_features.append(sub_interaction2)
            feature_names.append('X852_minus_X345')
            
            # RBF between X862 and X852; bandwidth is the fitted std of the difference
            rbf_sigma = stat('std_X862_minus_X852', lambda: np.std(feat1 - feat2)) + 1e-6
            rbf_862_852 = np.exp(-((feat1 - feat2) ** 2) / (2 * rbf_sigma ** 2))
            augmented_features.append(rbf_862_852)
            feature_names.append('rbf_862_852')
        
        # 2. MARKET MICROSTRUCTURE COMPLEX FEATURES
        if all(f in df.columns for f in ['bid_qty', 'ask_qty', 'volume', 'buy_qty', 'sell_qty']):
            bid = df['bid_qty'].values
            ask = df['ask_qty'].values
            vol = df['volume'].values
            buy = df['buy_qty'].values
            sell = df['sell_qty'].values
            vol_mean = stat('mean_volume', lambda: vol.mean())
            vol_std = stat('std_volume', lambda: vol.std())
            
            # Feature 3: Kyle's lambda approximation with non-linearity
            order_imbalance = (bid - ask) / (bid + ask + 1e-6)
            flow_imbalance = (buy - sell) / (buy + sell + 1e-6)
            kyle_lambda = flow_imbalance * np.sqrt(np.abs(order_imbalance)) / (np.log1p(vol) + 1e-6)
            augmented_features.append(kyle_lambda)
            feature_names.append('kyle_lambda_complex')
            
            # Feature 4: Volatility-adjusted pressure
            total_pressure = bid + ask
            vol_adj_pressure = np.log1p(total_pressure) * np.exp(-vol / (vol_mean + 1e-6))
            augmented_features.append(vol_adj_pressure)
            feature_names.append('vol_adjusted_pressure')
            
            # Feature 5: Trade intensity asymmetry (signed log of the intensity gap)
            buy_intensity = buy / (vol + 1e-6)
            sell_intensity = sell / (vol + 1e-6)
            intensity_asymmetry = np.sign(buy_intensity - sell_intensity) * \
                                 np.log1p(np.abs(buy_intensity - sell_intensity))
            augmented_features.append(intensity_asymmetry)
            feature_names.append('trade_intensity_asymmetry')
            
            # Bid-Ask subtraction
            bid_ask_diff = bid - ask
            augmented_features.append(bid_ask_diff)
            feature_names.append('bid_minus_ask')
            
            # Gaussian kernel on volume, centred/scaled with the fitted mean/std
            vol_kernel = np.exp(-((vol - vol_mean) ** 2) / (2 * (vol_std + 1e-6) ** 2))
            augmented_features.append(vol_kernel)
            feature_names.append('volume_gaussian_kernel')
        
        # 3. CROSS-DOMAIN INTERACTIONS
        if all(f in df.columns for f in ['X532', 'X888', 'volume']):
            # Feature 6: Technical indicator with volume
            technical = df['X532'].values * df['X888'].values
            vol = df['volume'].values
            vol_std = stat('std_volume', lambda: vol.std())
            vol_weighted_tech = technical * np.log1p(vol) / (vol_std + 1e-6)
            augmented_features.append(vol_weighted_tech)
            feature_names.append('volume_weighted_technical')
            
            # X532 + X888 combination
            tech_sum = df['X532'].values + df['X888'].values
            augmented_features.append(tech_sum)
            feature_names.append('X532_plus_X888')
            
            # RBF between X532 and X888
            tech_diff = df['X532'].values - df['X888'].values
            tech_sigma = stat('std_X532_minus_X888', lambda: np.std(tech_diff)) + 1e-6
            tech_rbf = np.exp(-tech_diff ** 2 / (2 * tech_sigma ** 2))
            augmented_features.append(tech_rbf)
            feature_names.append('rbf_532_888')
        
        # 4. REGIME-BASED FEATURES
        if 'X137' in df.columns and 'X302' in df.columns:
            # Feature 7: Regime detection feature
            feat1 = df['X137'].values
            feat2 = df['X302'].values
            
            # Regime boundaries are the fitted quartiles of each column
            upper1 = stat('p75_X137', lambda: np.percentile(feat1, 75))
            upper2 = stat('p75_X302', lambda: np.percentile(feat2, 75))
            lower1 = stat('p25_X137', lambda: np.percentile(feat1, 25))
            lower2 = stat('p25_X302', lambda: np.percentile(feat2, 25))
            
            # +1 when both are above their upper quartile, -1 when both are
            # below their lower quartile, 0 otherwise
            regime_indicator = np.where(
                (feat1 > upper1) & (feat2 > upper2), 1,
                np.where(
                    (feat1 < lower1) & (feat2 < lower2), -1,
                    0
                )
            )
            augmented_features.append(regime_indicator)
            feature_names.append('regime_indicator_137_302')
            
            # Laplacian kernel for regime
            regime_scale = stat('std_X137_minus_X302', lambda: np.std(feat1 - feat2)) + 1e-6
            laplacian_kernel = np.exp(-np.abs(feat1 - feat2) / regime_scale)
            augmented_features.append(laplacian_kernel)
            feature_names.append('laplacian_kernel_137_302')
        
        # 5. DISTANCE-BASED CLUSTER FEATURES
        if all(f in df.columns for f in ['X178', 'X168', 'X612']):
            def zscore(col):
                # Standardise with the fitted pandas mean/std of the column.
                centre = stat('mean_' + col, lambda: df[col].mean())
                spread = stat('std_' + col, lambda: df[col].std())
                return (df[col].values - centre) / (spread + 1e-6)
            
            # Feature 8: Multi-dimensional distance from center
            feat1 = zscore('X178')
            feat2 = zscore('X168')
            feat3 = zscore('X612')
            
            distance = np.sqrt(feat1**2 + feat2**2 + feat3**2)
            distance_kernel = np.exp(-distance**2 / 2)
            augmented_features.append(distance_kernel)
            feature_names.append('multidim_distance_kernel')
            
            # Pairwise differences
            diff_178_168 = df['X178'].values - df['X168'].values
            augmented_features.append(diff_178_168)
            feature_names.append('X178_minus_X168')
            
            diff_168_612 = df['X168'].values - df['X612'].values
            augmented_features.append(diff_168_612)
            feature_names.append('X168_minus_X612')
        
        # 6. Additional top feature interactions
        if 'X598' in df.columns and 'X862' in df.columns:
            f598 = df['X598'].values
            f862 = df['X862'].values
            
            # Subtraction
            diff_598_862 = f598 - f862
            augmented_features.append(diff_598_862)
            feature_names.append('X598_minus_X862')
            
            # RBF
            sigma_598_862 = stat('std_X598_minus_X862', lambda: np.std(f598 - f862)) + 1e-6
            rbf_598_862 = np.exp(-((f598 - f862) ** 2) / (2 * sigma_598_862 ** 2))
            augmented_features.append(rbf_598_862)
            feature_names.append('rbf_598_862')
        
        # Stack all features
        if augmented_features:
            augmented_array = np.column_stack(augmented_features)
            print(f"  Created {len(feature_names)} augmented features:")
            for name in feature_names:
                print(f"    - {name}")
            return augmented_array, feature_names
        else:
            return np.zeros((len(df), 0)), []

# ==============================================================================
# PART 2: DUAL MODEL PREDICTOR
# ==============================================================================

class DualModelCryptoPredictor:
    """Train both original and augmented models, then ensemble predictions.

    Holds the file paths, the base feature list, the XGBoost hyperparameters
    and the time-slice definitions, and exposes helpers to load the data,
    build the augmented feature frames and run the sliced cross-validation.
    """
    
    def __init__(self):
        # Paths
        self.TRAIN_PATH = "/kaggle/input/drw-crypto-market-prediction/train.parquet"
        self.TEST_PATH = "/kaggle/input/drw-crypto-market-prediction/test.parquet"
        self.SUBMISSION_PATH = "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv"
        
        # Original features (unchanged)
        self.ORIGINAL_FEATURES = [
            "X863", "X856", "X344", "X598", "X862", "X385", "X852", "X603", "X860", "X674",
            "X415", "X345", "X137", "X855", "X174", "X302", "X178", "X532", "X168", "X612",
            "bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume", "X888", "X421", "X333"
        ]
        
        self.LABEL_COLUMN = "label"
        self.N_FOLDS = 3
        self.RANDOM_STATE = 42
        
        # XGBoost parameters (unchanged)
        # NOTE(review): "verbose" does not appear to be an XGBoost training
        # parameter (the library's switch is "verbosity"); presumably it is
        # ignored - confirm before relying on it.
        self.XGB_PARAMS = {
            "tree_method": "hist",
            "device": "cpu",
            "colsample_bylevel": 0.4778,
            "colsample_bynode": 0.3628,
            "colsample_bytree": 0.7107,
            "gamma": 1.7095,
            "learning_rate": 0.02213,
            "max_depth": 20,
            "max_leaves": 12,
            "min_child_weight": 16,
            "n_estimators": 1667,
            "subsample": 0.06567,
            "reg_alpha": 39.3524,
            "reg_lambda": 75.4484,
            "verbosity": 0,
            "random_state": self.RANDOM_STATE,
            "n_jobs": -1,
            "verbose": False,
        }
        
        # Row offsets at which each training slice starts; the non-zero
        # cutoffs are filled in by train_and_predict once the row count is known.
        self.MODEL_SLICES = [
            {"name": "full_data", "cutoff": 0},
            {"name": "last_75pct", "cutoff": 0},  
            {"name": "last_50pct", "cutoff": 0}
        ]
        
        self.feature_engineer = TargetedFeatureEngineer()
        self.scaler = RobustScaler()
        
    def create_time_decay_weights(self, n, decay=0.95):
        """Create time decay weights for samples.

        The weight grows geometrically from ``decay`` (first row) to ``1``
        (last row) and the vector is rescaled so that it sums to ``n``.

        Parameters
        ----------
        n : int
            Number of rows.
        decay : float, default 0.95
            Relative weight of the first row compared with the last one.

        Returns
        -------
        numpy.ndarray
            Weights of shape ``(n,)``.  For ``n < 2`` a vector of ones of
            length ``max(n, 0)`` is returned, because the position
            normalisation below would otherwise divide by ``n - 1 == 0`` and
            produce NaN weights.
        """
        if n < 2:
            return np.ones(max(n, 0), dtype=float)
        positions = np.arange(n)
        normalized = positions / float(n - 1)
        weights = decay ** (1.0 - normalized)
        return weights * n / weights.sum()
    
    def load_data(self):
        """Load train, test, and submission data.

        Returns
        -------
        tuple
            ``(train_df, test_df, submission_df)``; the two parquet frames
            have their index reset to a fresh 0..n-1 range.
        """
        print("\nLoading data...")
        train_df = pd.read_parquet(self.TRAIN_PATH).reset_index(drop=True)
        test_df = pd.read_parquet(self.TEST_PATH).reset_index(drop=True)
        submission_df = pd.read_csv(self.SUBMISSION_PATH)
        
        print(f"Loaded train: {train_df.shape}, test: {test_df.shape}")
        return train_df, test_df, submission_df
    
    def train_and_predict(self, train_df, test_df, features, model_name="Model"):
        """Train XGBoost model with cross-validation and make predictions.

        For every fold and every entry of ``self.MODEL_SLICES`` one model is
        trained on the fold's training rows that lie at or after the slice
        cutoff, using time-decay sample weights.  Out-of-fold predictions are
        collected per slice; test predictions are averaged over the folds and
        then over the slices.

        Parameters
        ----------
        train_df : pandas.DataFrame
            Must contain ``features`` and ``self.LABEL_COLUMN``.
        test_df : pandas.DataFrame
            Must contain ``features``.
        features : list of str
            Column names fed to the model.
        model_name : str
            Label used in the progress output only.

        Returns
        -------
        (numpy.ndarray, float)
            Slice-averaged test predictions and the Pearson correlation of
            the slice-averaged out-of-fold predictions with the label.
        """
        print(f"\nTraining {model_name}...")
        
        n_samples = len(train_df)
        
        # Set slice cutoffs (note: this updates the shared self.MODEL_SLICES)
        self.MODEL_SLICES[1]["cutoff"] = int(0.25 * n_samples)
        self.MODEL_SLICES[2]["cutoff"] = int(0.50 * n_samples)
        
        # Prepare storage
        oof_preds = {sl["name"]: np.zeros(n_samples) for sl in self.MODEL_SLICES}
        test_preds = {sl["name"]: np.zeros(len(test_df)) for sl in self.MODEL_SLICES}
        
        full_weights = self.create_time_decay_weights(n_samples)
        # shuffle=False keeps every fold a contiguous block of rows
        kf = KFold(n_splits=self.N_FOLDS, shuffle=False)
        
        # Cross-validation
        for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df), start=1):
            print(f"\n--- {model_name} Fold {fold}/{self.N_FOLDS} ---")
            X_valid = train_df.iloc[valid_idx][features]
            y_valid = train_df.iloc[valid_idx][self.LABEL_COLUMN]
            
            for sl in self.MODEL_SLICES:
                slice_name = sl["name"]
                cutoff = sl["cutoff"]
                subset = train_df.iloc[cutoff:].reset_index(drop=True)
                # Fold training rows inside the slice, re-based to subset positions
                rel_idx = train_idx[train_idx >= cutoff] - cutoff
                
                print(f"Training {slice_name}...")
                X_train = subset.iloc[rel_idx][features]
                y_train = subset.iloc[rel_idx][self.LABEL_COLUMN]
                
                # Sample weights: decay is measured over the slice's own length
                if cutoff == 0:
                    sw = full_weights[train_idx]
                else:
                    sw_total = self.create_time_decay_weights(len(subset))
                    sw = sw_total[rel_idx]
                
                # Train model
                model = XGBRegressor(**self.XGB_PARAMS)
                model.fit(
                    X_train, y_train, 
                    sample_weight=sw,
                    eval_set=[(X_valid, y_valid)],
                    verbose=100
                )
                
                # OOF predictions for validation rows that fall inside the slice
                mask = valid_idx >= cutoff
                if mask.any():
                    idxs = valid_idx[mask]
                    oof_preds[slice_name][idxs] = model.predict(train_df.iloc[idxs][features])
                # Validation rows before the cutoff borrow the full_data prediction;
                # this relies on "full_data" being the first slice processed in the fold.
                if cutoff > 0 and (~mask).any():
                    oof_preds[slice_name][valid_idx[~mask]] = oof_preds["full_data"][valid_idx[~mask]]
                
                # Test predictions (summed here, averaged after the loop)
                test_preds[slice_name] += model.predict(test_df[features])
        
        # Average test predictions
        for slice_name in test_preds:
            test_preds[slice_name] /= self.N_FOLDS
        
        # Compute Pearson scores
        pearson_scores = {
            slice_name: pearsonr(train_df[self.LABEL_COLUMN], preds)[0]
            for slice_name, preds in oof_preds.items()
        }
        
        print(f"\n{model_name} Pearson scores by slice:")
        for slice_name, score in pearson_scores.items():
            print(f"  {slice_name}: {score:.4f}")
        
        # Simple ensemble: unweighted mean over the slices
        oof_ensemble = np.mean(list(oof_preds.values()), axis=0)
        test_ensemble = np.mean(list(test_preds.values()), axis=0)
        ensemble_score = pearsonr(train_df[self.LABEL_COLUMN], oof_ensemble)[0]
        
        print(f"\n{model_name} Ensemble Pearson score: {ensemble_score:.4f}")
        
        return test_ensemble, ensemble_score
    
    def prepare_augmented_data(self, train_df, test_df):
        """Prepare data with augmented features.

        Builds the engineered columns for both frames, appends them to the
        original feature columns, fits ``self.scaler`` on the combined train
        matrix and applies it to both matrices.

        Returns
        -------
        (pandas.DataFrame, pandas.DataFrame, list of str)
            Scaled train frame (with a ``label`` column), scaled test frame,
            and the combined feature names.
        """
        print("\nPreparing augmented features...")
        
        # Create augmented features
        train_aug, aug_names = self.feature_engineer.create_augmented_features(
            train_df, self.ORIGINAL_FEATURES
        )
        test_aug, _ = self.feature_engineer.create_augmented_features(
            test_df, self.ORIGINAL_FEATURES
        )
        
        # Combine with original features
        train_combined = np.hstack([train_df[self.ORIGINAL_FEATURES].values, train_aug])
        test_combined = np.hstack([test_df[self.ORIGINAL_FEATURES].values, test_aug])
        
        # Scale all features (scaler is fitted on the train matrix only)
        train_scaled = self.scaler.fit_transform(train_combined)
        test_scaled = self.scaler.transform(test_combined)
        
        # Create DataFrames
        all_features = self.ORIGINAL_FEATURES + aug_names
        train_final = pd.DataFrame(train_scaled, columns=all_features)
        train_final['label'] = train_df['label'].values
        
        test_final = pd.DataFrame(test_scaled, columns=all_features)
        
        return train_final, test_final, all_features

# ==============================================================================
# PART 3: MAIN EXECUTION
# ==============================================================================

def main():
    """Run the whole pipeline.

    Trains the original-feature model and writes ``submission.csv``, trains
    the augmented-feature model, writes the 50/50 blend of both to
    ``submission_ensemble.csv``, prints a score summary and finally reports
    how a small XGBoost model ranks the engineered features.
    """

    def banner(title):
        # Section header framed by two rules of 80 '=' characters.
        print("\n" + "=" * 80)
        print(title)
        print("=" * 80)

    predictor = DualModelCryptoPredictor()
    base_features = predictor.ORIGINAL_FEATURES
    train_df, test_df, submission_df = predictor.load_data()

    # ---- Model 1: original feature set -----------------------------------
    banner("MODEL 1: ORIGINAL MODEL")

    train_original = train_df[base_features + ['label']].copy()
    test_original = test_df[base_features].copy()
    original_predictions, original_score = predictor.train_and_predict(
        train_original, test_original, base_features, "Original Model"
    )

    # The original model alone is what ends up in submission.csv.
    submission_original = submission_df.copy()
    submission_original["prediction"] = original_predictions
    submission_original.to_csv("submission.csv", index=False)
    print("\nSaved original submission.csv")

    # ---- Model 2: original + engineered features -------------------------
    banner("MODEL 2: AUGMENTED MODEL")

    train_augmented, test_augmented, augmented_features = predictor.prepare_augmented_data(
        train_df, test_df
    )
    n_augmented = len(augmented_features)
    n_base = len(base_features)

    print(f"\nTotal features in augmented model: {n_augmented}")
    print(f"New features added: {n_augmented - n_base}")

    augmented_predictions, augmented_score = predictor.train_and_predict(
        train_augmented, test_augmented, augmented_features, "Augmented Model"
    )

    # ---- Blend: plain average of the two prediction vectors --------------
    banner("FINAL ENSEMBLE")

    ensemble_predictions = (original_predictions + augmented_predictions) / 2

    submission_ensemble = submission_df.copy()
    submission_ensemble["prediction"] = ensemble_predictions
    submission_ensemble.to_csv("submission_ensemble.csv", index=False)
    print("\nSaved ensemble submission_ensemble.csv")

    # ---- Summary ---------------------------------------------------------
    banner("RESULTS SUMMARY")
    summary_rows = (
        ("Original Model:", n_base, original_score),
        ("\nAugmented Model:", n_augmented, augmented_score),
    )
    for heading, feature_count, score in summary_rows:
        print(heading)
        print(f"  Features: {feature_count}")
        print(f"  Score: {score:.4f}")

    worst_score = min(original_score, augmented_score)
    mean_score = (original_score + augmented_score) / 2
    optimistic_score = max(original_score, augmented_score) * 1.02
    print(f"\nExpected Ensemble Performance:")
    print(f"  Better than: {worst_score:.4f}")
    print(f"  Potential: ~{mean_score:.4f} to {optimistic_score:.4f}")

    # ---- Importance of the engineered features ---------------------------
    banner("AUGMENTED FEATURES IMPORTANCE CHECK")

    # Small model fitted on a random subsample of at most 50,000 rows.
    sample_model = XGBRegressor(n_estimators=100, max_depth=6, random_state=42)
    sample_size = min(50000, len(train_augmented))
    sample_idx = np.random.choice(len(train_augmented), sample_size, replace=False)
    sample_rows = train_augmented.iloc[sample_idx]

    sample_model.fit(sample_rows[augmented_features], sample_rows['label'])

    importance_df = pd.DataFrame({
        'feature': augmented_features,
        'importance': sample_model.feature_importances_
    }).sort_values('importance', ascending=False)
    ranked_features = list(importance_df['feature'])

    print("\nImportance of new augmented features:")
    for feat in augmented_features:
        if feat in predictor.ORIGINAL_FEATURES:
            continue
        is_feat = importance_df['feature'] == feat
        imp = importance_df.loc[is_feat, 'importance'].values[0]
        rank = ranked_features.index(feat) + 1
        print(f"  {feat}: Rank {rank}/{len(augmented_features)}, Importance: {imp:.4f}")

# Run the pipeline when this cell/script is executed directly (not when imported).
if __name__ == "__main__":
    main()

DRW CRYPTO MARKET PREDICTION - DUAL MODEL APPROACH

Loading data...
Loaded train: (525887, 896), test: (538150, 896)

MODEL 1: ORIGINAL MODEL

Training Original Model...

--- Original Model Fold 1/3 ---
Training full_data...
[0]	validation_0-rmse:1.00216
[100]	validation_0-rmse:0.99508
[200]	validation_0-rmse:0.99333
[300]	validation_0-rmse:0.99217
[400]	validation_0-rmse:0.99180
[500]	validation_0-rmse:0.99255
[600]	validation_0-rmse:0.99313
[700]	validation_0-rmse:0.99403
[800]	validation_0-rmse:0.99549
[900]	validation_0-rmse:0.99715
[1000]	validation_0-rmse:0.99819
[1100]	validation_0-rmse:0.99981
[1200]	validation_0-rmse:1.00125
[1300]	validation_0-rmse:1.00249
[1400]	validation_0-rmse:1.00324
[1500]	validation_0-rmse:1.00515
[1600]	validation_0-rmse:1.00674
[1666]	validation_0-rmse:1.00782
Training last_75pct...
[0]	validation_0-rmse:1.00213
[100]	validation_0-rmse:0.99441
[200]	validation_0-rmse:0.99181
[300]	validation_0-rmse:0.99120
[400]	validation_0-rmse:0.99077
[500]	valida

In [3]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from scipy.stats import pearsonr

In [4]:
def feature_engineering(df):
    """Return a copy of ``df`` with four trade-flow features appended.

    Added columns (in this order):

    * ``volume_weighted_sell``   - ``sell_qty * volume``
    * ``buy_sell_ratio``         - ``buy_qty / (sell_qty + 1e-8)``
    * ``selling_pressure``       - ``sell_qty / (volume + 1e-8)``
    * ``effective_spread_proxy`` - ``|buy_qty - sell_qty| / (volume + 1e-8)``

    After the columns are added, every +/-inf in the frame is turned into NaN
    and every NaN (in any column) is replaced by 0.

    The input frame is left untouched: the new columns are attached with
    ``DataFrame.assign``, which works on a copy, instead of being written
    into the caller's object.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``buy_qty``, ``sell_qty`` and ``volume``.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by the four features.
    """
    buy = df['buy_qty']
    sell = df['sell_qty']
    volume = df['volume']

    enriched = df.assign(
        volume_weighted_sell=sell * volume,
        buy_sell_ratio=buy / (sell + 1e-8),
        selling_pressure=sell / (volume + 1e-8),
        effective_spread_proxy=np.abs(buy - sell) / (volume + 1e-8),
    )

    # Neutralise infinities first so that fillna can zero them together with NaNs.
    return enriched.replace([np.inf, -np.inf], np.nan).fillna(0)

In [5]:
class Config:
    """Static settings for the single-learner pipeline: file locations,
    model input columns, label name and cross-validation setup."""

    # Competition files on the Kaggle input mount.
    TRAIN_PATH = "/kaggle/input/drw-crypto-market-prediction/train.parquet"
    TEST_PATH = "/kaggle/input/drw-crypto-market-prediction/test.parquet"
    SUBMISSION_PATH = "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv"

    # Columns read from the parquet files and fed to the model.
    FEATURES = [
        # anonymised X-columns
        "X863", "X856", "X598", "X862", "X385", "X852",
        "X603", "X860", "X674", "X415", "X345", "X855",
        "X174", "X302", "X178", "X168", "X612",
        # trade quantities
        "buy_qty", "sell_qty", "volume",
        # further anonymised X-columns
        "X888", "X421", "X333",
    ]

    LABEL_COLUMN = "label"   # regression target
    N_FOLDS = 3              # number of KFold splits
    RANDOM_STATE = 42        # seed handed to the learner

# Hyperparameters passed unchanged to every XGBRegressor created in the training loop.
# NOTE(review): "device": "gpu" presumably requires a GPU-enabled runtime - confirm the
# notebook's accelerator setting before running.
XGB_PARAMS = {
    "tree_method": "hist",
    "device": "gpu",
    "colsample_bylevel": 0.4778,
    "colsample_bynode": 0.3628,
    "colsample_bytree": 0.7107,
    "gamma": 1.7095,
    "learning_rate": 0.02213,
    "max_depth": 20,
    "max_leaves": 12,
    "min_child_weight": 16,
    "n_estimators": 1667,
    "subsample": 0.06567,
    "reg_alpha": 39.3524,
    "reg_lambda": 75.4484,
    "verbosity": 0,
    "random_state": Config.RANDOM_STATE,
    "n_jobs": -1
}

# Registry of learners to train. Each entry supplies a display name, the estimator
# class to instantiate and the keyword arguments for its constructor.
LEARNERS = [
    {"name": "xgb", "Estimator": XGBRegressor, "params": XGB_PARAMS},
]

In [6]:
def create_time_decay_weights(n: int, decay: float = 0.9) -> np.ndarray:
    """Build per-row sample weights that favour later rows.

    The weight rises geometrically from ``decay`` for the first row to ``1``
    for the last row; the vector is then rescaled so that it sums to ``n``.

    Args:
        n: Number of rows.
        decay: Weight of the first row relative to the last one.

    Returns:
        Array of shape ``(n,)``. For ``n < 2`` a vector of ones of length
        ``max(n, 0)`` is returned, because normalising the positions would
        otherwise divide by ``n - 1 == 0`` and yield NaN weights.
    """
    if n < 2:
        return np.ones(max(n, 0), dtype=float)
    positions = np.arange(n)
    normalized = positions / (n - 1)        # 0.0 for the first row, 1.0 for the last
    weights = decay ** (1.0 - normalized)
    return weights * n / weights.sum()

def load_data():
    """Read the competition files and return ``(train_df, test_df, submission_df)``.

    Only the columns listed in ``Config.FEATURES`` (plus the label for the
    training file) are read from the parquet files. Both frames are passed
    through ``feature_engineering`` and returned with a fresh 0..n-1 index;
    the sample submission is returned as read.
    """
    label = Config.LABEL_COLUMN
    train_raw = pd.read_parquet(Config.TRAIN_PATH, columns=Config.FEATURES + [label])
    test_raw = pd.read_parquet(Config.TEST_PATH, columns=Config.FEATURES)
    submission_df = pd.read_csv(Config.SUBMISSION_PATH)

    train_df = feature_engineering(train_raw)
    test_df = feature_engineering(test_raw)

    print(f"Loaded data - Train: {train_df.shape}, Test: {test_df.shape}, Submission: {submission_df.shape}")

    train_out = train_df.reset_index(drop=True)
    test_out = test_df.reset_index(drop=True)
    return train_out, test_out, submission_df
# Make sure the order-book / trade-flow columns are part of the model inputs.
# Duplicates are dropped with dict.fromkeys (insertion-ordered) instead of set():
# a set of strings has no stable order across interpreter runs, which would make
# the column order - and therefore column-subsampled model fits - vary from run to run.
Config.FEATURES += ["bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume"]
Config.FEATURES = list(dict.fromkeys(Config.FEATURES))  # remove duplicates, keep first-seen order

In [7]:
def get_model_slices(n_samples: int):
    """Describe the training windows used for the slice ensemble.

    Returns a list of ``{"name", "cutoff"}`` dicts where ``cutoff`` is the
    first row index included in the window: the full data set, the last 75%
    of the rows and the last 50% of the rows (cutoffs truncated to int).
    """
    tail_windows = (("last_75pct", 0.25), ("last_50pct", 0.50))

    slices = [{"name": "full_data", "cutoff": 0}]
    slices.extend(
        {"name": label, "cutoff": int(skipped_fraction * n_samples)}
        for label, skipped_fraction in tail_windows
    )
    return slices

def train_and_evaluate(train_df, test_df):
    """Run the sliced K-fold training for every learner in ``LEARNERS``.

    For each fold and each training window from ``get_model_slices`` a model
    is fitted on the fold's training rows that lie at or after the window
    cutoff, weighted by ``create_time_decay_weights``.

    Returns:
        ``(oof_preds, test_preds, model_slices)`` where both prediction
        containers are nested dicts ``learner name -> slice name -> array``.
        Test predictions are averaged over the folds.
    """
    n_samples = len(train_df)
    n_test = len(test_df)
    model_slices = get_model_slices(n_samples)

    oof_preds = {}
    test_preds = {}
    for learner in LEARNERS:
        oof_preds[learner["name"]] = {s["name"]: np.zeros(n_samples) for s in model_slices}
        test_preds[learner["name"]] = {s["name"]: np.zeros(n_test) for s in model_slices}

    full_weights = create_time_decay_weights(n_samples)
    splitter = KFold(n_splits=Config.N_FOLDS, shuffle=False)

    for fold, (train_idx, valid_idx) in enumerate(splitter.split(train_df), start=1):
        print(f"\n--- Fold {fold}/{Config.N_FOLDS} ---")
        valid_rows = train_df.iloc[valid_idx]
        X_valid = valid_rows[Config.FEATURES]
        y_valid = valid_rows[Config.LABEL_COLUMN]

        for window in model_slices:
            slice_name = window["name"]
            cutoff = window["cutoff"]

            # Rows of the window, re-indexed from 0, and the fold's training
            # positions expressed relative to that window.
            subset = train_df.iloc[cutoff:].reset_index(drop=True)
            rel_idx = train_idx[train_idx >= cutoff] - cutoff

            fit_rows = subset.iloc[rel_idx]
            X_train = fit_rows[Config.FEATURES]
            y_train = fit_rows[Config.LABEL_COLUMN]

            # Decay is measured over the window's own length for truncated windows.
            if cutoff > 0:
                sw = create_time_decay_weights(len(subset))[rel_idx]
            else:
                sw = full_weights[train_idx]

            print(f"  Training slice: {slice_name}, samples: {len(X_train)}")

            inside_window = valid_idx >= cutoff
            for learner in LEARNERS:
                learner_oof = oof_preds[learner["name"]]
                learner_test = test_preds[learner["name"]]

                model = learner["Estimator"](**learner["params"])
                model.fit(X_train, y_train, sample_weight=sw, eval_set=[(X_valid, y_valid)], verbose=False)

                if inside_window.any():
                    scored_idx = valid_idx[inside_window]
                    learner_oof[slice_name][scored_idx] = model.predict(train_df.iloc[scored_idx][Config.FEATURES])
                # Validation rows before the cutoff reuse the full_data prediction
                # (full_data is the first window processed in every fold).
                if cutoff > 0 and (~inside_window).any():
                    outside_idx = valid_idx[~inside_window]
                    learner_oof[slice_name][outside_idx] = learner_oof["full_data"][outside_idx]

                learner_test[slice_name] += model.predict(test_df[Config.FEATURES])

    # Turn the per-fold sums of test predictions into fold averages.
    for per_slice in test_preds.values():
        for slice_name in per_slice:
            per_slice[slice_name] /= Config.N_FOLDS

    return oof_preds, test_preds, model_slices

In [8]:
def ensemble_and_submit(train_df, oof_preds, test_preds, submission_df):
    """Blend the slice models, report Pearson scores and write ``submission.csv``.

    For every learner two blends of its slice models are scored on the
    out-of-fold predictions: a plain mean and a blend weighted by each
    slice's Pearson score. Only the plain mean is carried forward; the
    weighted score is printed for comparison. The per-learner means are then
    averaged across learners to give the final test prediction.

    Args:
        train_df: Training frame containing ``Config.LABEL_COLUMN``.
        oof_preds: ``learner -> slice -> out-of-fold prediction array``.
        test_preds: ``learner -> slice -> test prediction array``.
        submission_df: Submission template; its ``prediction`` column is
            overwritten in place before the frame is written to disk.
    """
    y_true = train_df[Config.LABEL_COLUMN]
    learner_ensembles = {}

    for learner_name in oof_preds:
        slice_oof = oof_preds[learner_name]
        scores = {s: pearsonr(y_true, slice_oof[s])[0] for s in slice_oof}
        total_score = sum(scores.values())

        oof_simple = np.mean(list(slice_oof.values()), axis=0)
        test_simple = np.mean(list(test_preds[learner_name].values()), axis=0)
        score_simple = pearsonr(y_true, oof_simple)[0]

        # Score-proportional weights are only defined when the scores sum to a
        # finite, non-zero value (a NaN score or a zero sum would otherwise
        # turn the whole blend into NaN/inf). In that case fall back to equal
        # weights, i.e. the simple blend.
        if np.isfinite(total_score) and total_score != 0:
            oof_weighted = sum(scores[s] / total_score * slice_oof[s] for s in scores)
            score_weighted = pearsonr(y_true, oof_weighted)[0]
        else:
            score_weighted = score_simple

        print(f"\n{learner_name.upper()} Simple Ensemble Pearson:   {score_simple:.4f}")
        print(f"{learner_name.upper()} Weighted Ensemble Pearson: {score_weighted:.4f}")

        learner_ensembles[learner_name] = {
            "oof_simple": oof_simple,
            "test_simple": test_simple
        }

    final_oof = np.mean([le["oof_simple"] for le in learner_ensembles.values()], axis=0)
    final_test = np.mean([le["test_simple"] for le in learner_ensembles.values()], axis=0)
    final_score = pearsonr(y_true, final_oof)[0]

    print(f"\nFINAL ensemble across learners Pearson: {final_score:.4f}")

    submission_df["prediction"] = final_test
    submission_df.to_csv("submission.csv", index=False)
    print("Saved: submission.csv")

In [9]:
# Entry point of the single-learner pipeline: load the data, train the sliced K-fold
# models, then blend the slice predictions and write submission.csv.
if __name__ == "__main__":
    train_df, test_df, submission_df = load_data()
    oof_preds, test_preds, model_slices = train_and_evaluate(train_df, test_df)
    ensemble_and_submit(train_df, oof_preds, test_preds, submission_df)

Loaded data - Train: (525887, 30), Test: (538150, 29), Submission: (538150, 2)

--- Fold 1/3 ---
  Training slice: full_data, samples: 350591
  Training slice: last_75pct, samples: 350591
  Training slice: last_50pct, samples: 262944

--- Fold 2/3 ---
  Training slice: full_data, samples: 350591
  Training slice: last_75pct, samples: 219120
  Training slice: last_50pct, samples: 175295

--- Fold 3/3 ---
  Training slice: full_data, samples: 350592
  Training slice: last_75pct, samples: 219121
  Training slice: last_50pct, samples: 87649

XGB Simple Ensemble Pearson:   0.1094
XGB Weighted Ensemble Pearson: 0.1097

FINAL ensemble across learners Pearson: 0.1094
Saved: submission.csv
