In [1]:
from google.colab import drive
drive.mount('/content/drive')

# Quick distribution check
import pandas as pd

train = pd.read_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/train_final_selected.parquet')
test = pd.read_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/test.parquet')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Train date range: {train.index.min()} to {train.index.max()}")
print(f"Test date range: {test.index.min()} to {test.index.max()}")


Mounted at /content/drive
Train shape: (525886, 402)
Test shape: (538150, 786)
Train date range: 0 to 525885
Test date range: 1 to 538150


In [1]:
# Quick drift detection on training data across time periods
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np

# Load your training data
train = pd.read_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/train_final_selected.parquet')
selected_features_df = pd.read_csv('/content/drive/MyDrive/DRW Crypto Market Prediction/selected_features.csv')
final_features = selected_features_df['feature'].tolist()

print(f"Checking drift across time periods for {len(final_features)} features")

# Split training data into early vs late periods
mid_point = len(train) // 2
early_period = train.iloc[:mid_point]
late_period = train.iloc[mid_point:]

print(f"Early period: {len(early_period)} samples")
print(f"Late period: {len(late_period)} samples")

# Check drift for each feature
high_drift_features = []
drift_scores = []

for feature in final_features:
    early_mean = early_period[feature].mean()
    late_mean = late_period[feature].mean()

    # Calculate drift score
    drift = abs(early_mean - late_mean) / (abs(early_mean) + 1e-8)
    drift_scores.append(drift)

    if drift > 1.0:  # High drift threshold
        high_drift_features.append((feature, drift))

print(f"\n=== Drift Analysis ===")
print(f"Mean drift score: {np.mean(drift_scores):.4f}")
print(f"Max drift score: {np.max(drift_scores):.4f}")
print(f"Features with high drift (>1.0): {len(high_drift_features)}")

if len(high_drift_features) > 0:
    print(f"\nTop 5 most drifted features:")
    high_drift_features.sort(key=lambda x: x[1], reverse=True)
    for feat, score in high_drift_features[:5]:
        print(f"  {feat}: {score:.4f}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Checking drift across time periods for 400 features
Early period: 262943 samples
Late period: 262943 samples

=== Drift Analysis ===
Mean drift score: nan
Max drift score: nan
Features with high drift (>1.0): 217

Top 5 most drifted features:
  X304: 130.4998
  X388: 84.3425
  X587_div_X588_spread_ratio: 46.9160
  X3_rolling_mean_50: 36.4383
  X258: 29.7867


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


In [1]:
# dectecting distribution drift

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np


# Load 400 CV-selected features
train = pd.read_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/train_final_selected.parquet')
selected_features_df = pd.read_csv('/content/drive/MyDrive/DRW Crypto Market Prediction/selected_features.csv')
final_features = selected_features_df['feature'].tolist()

print(f"Starting with {len(final_features)} CV-selected features")
X = train[final_features]
y = train['label']

class DistributionDriftDetector:
    """
    Detect distribution drift between train and validation periods.
    """

    def __init__(self, drift_threshold=0.5):
        self.drift_threshold = drift_threshold
        self.drift_scores = {}

    def calculate_ks_statistic(self, train_data, val_data):
        """
        Calculate Kolmogorov-Smirnov statistic for distribution comparison.
        """
        ks_stat, p_value = stats.ks_2samp(train_data, val_data)
        return ks_stat, p_value

    def detect_drift(self, X_train, X_val, feature_names=None):
        """
        Detect distribution drift for all features.
        """
        if feature_names is None:
            feature_names = X_train.columns

        drift_scores = {}

        for feature in tqdm(feature_names, desc="Detecting drift"):
            if feature in X_train.columns and feature in X_val.columns:
                train_data = X_train[feature].dropna()
                val_data = X_val[feature].dropna()

                if len(train_data) > 10 and len(val_data) > 10:
                    ks_stat, p_value = self.calculate_ks_statistic(train_data, val_data)

                    drift_scores[feature] = {
                        'ks_statistic': ks_stat,
                        'p_value': p_value,
                        'has_drift': p_value < self.drift_threshold,
                        'drift_severity': ks_stat
                    }

        self.drift_scores = drift_scores
        return drift_scores

    def get_drift_free_features(self, drift_scores):
        """
        Get features without significant drift.
        """
        drift_free = [feature for feature, scores in drift_scores.items()
                     if not scores['has_drift']]
        return drift_free

    def plot_drift_analysis(self, drift_scores, top_n=20):
        """
        Plot drift analysis results.
        """
        # Sort by drift severity
        sorted_drift = sorted(drift_scores.items(),
                            key=lambda x: x[1]['drift_severity'], reverse=True)

        features = [f[0] for f in sorted_drift[:top_n]]
        severities = [f[1]['drift_severity'] for f in sorted_drift[:top_n]]
        p_values = [f[1]['p_value'] for f in sorted_drift[:top_n]]

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Drift severity
        ax1.barh(range(len(features)), severities)
        ax1.set_yticks(range(len(features)))
        ax1.set_yticklabels(features)
        ax1.set_title('Top Features by Drift Severity')
        ax1.set_xlabel('KS Statistic')

        # P-values
        ax2.scatter(severities, p_values, alpha=0.7)
        ax2.axhline(y=self.drift_threshold, color='r', linestyle='--', label=f'Threshold ({self.drift_threshold})')
        ax2.set_xlabel('KS Statistic')
        ax2.set_ylabel('P-value')
        ax2.set_title('Drift Severity vs P-value')
        ax2.legend()

        plt.tight_layout()
        plt.show()

        return fig

print("Distribution drift detector loaded!")

Mounted at /content/drive
Starting with 400 CV-selected features
Distribution drift detector loaded!


In [2]:
# forward / backward feature selection

class ForwardBackwardSelector:
    """
    Forward/backward feature selection with CV validation.
    """

    def __init__(self, cv_runner, min_improvement=0.001, max_features=100):
        self.cv_runner = cv_runner
        self.min_improvement = min_improvement
        self.max_features = max_features
        self.selected_features = []
        self.selection_history = []

    def forward_selection(self, X, y, candidate_features):
        """
        Forward selection: add features one by one.
        """
        selected = []
        remaining = candidate_features.copy()

        print("Starting forward selection...")

        while len(selected) < self.max_features and remaining:
            best_feature = None
            best_score = -1

            for feature in tqdm(remaining, desc=f"Testing {len(remaining)} features"):
                # Add feature temporarily
                test_features = selected + [feature]
                X_test = X[test_features]

                # Run CV
                results = self.cv_runner.run_cv(X_test, y)
                score = results['mean_score']

                if score > best_score:
                    best_score = score
                    best_feature = feature

            # Check if improvement is significant
            if best_feature and best_score > self.min_improvement:
                selected.append(best_feature)
                remaining.remove(best_feature)

                self.selection_history.append({
                    'feature': best_feature,
                    'score': best_score,
                    'n_features': len(selected)
                })

                print(f"Added {best_feature}, score: {best_score:.4f}, n_features: {len(selected)}")
            else:
                break

        self.selected_features = selected
        return selected

    def backward_elimination(self, X, y, initial_features):
        """
        Backward elimination: remove features one by one.
        """
        current_features = initial_features.copy()

        print("Starting backward elimination...")

        while len(current_features) > 10:  # Keep at least 10 features
            worst_feature = None
            best_score = -1

            for feature in tqdm(current_features, desc=f"Testing removal of {len(current_features)} features"):
                # Remove feature temporarily
                test_features = [f for f in current_features if f != feature]
                X_test = X[test_features]

                # Run CV
                results = self.cv_runner.run_cv(X_test, y)
                score = results['mean_score']

                if score > best_score:
                    best_score = score
                    worst_feature = feature

            # Remove worst feature if it improves score
            if worst_feature and best_score > self.min_improvement:
                current_features.remove(worst_feature)
                print(f"Removed {worst_feature}, score: {best_score:.4f}, n_features: {len(current_features)}")
            else:
                break

        return current_features

print("Forward/backward selector loaded!")

Forward/backward selector loaded!


In [3]:
# regularization-based selection

class RegularizationSelector:
    """
    Feature selection using L1/L2 regularization.
    """

    def __init__(self, l1_ratio=0.5, max_features=100):
        self.l1_ratio = l1_ratio
        self.max_features = max_features

    def select_with_regularization(self, X, y, cv_runner):
        """
        Select features using regularization.
        """
        from sklearn.linear_model import ElasticNet
        from sklearn.preprocessing import StandardScaler

        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Use ElasticNet with L1 penalty
        model = ElasticNet(
            alpha=0.01,  # Regularization strength
            l1_ratio=self.l1_ratio,  # L1 vs L2 ratio
            random_state=42,
            max_iter=1000
        )

        # Fit model
        model.fit(X_scaled, y)

        # Get feature importance (absolute coefficients)
        feature_importance = np.abs(model.coef_)

        # Select features with non-zero coefficients
        selected_indices = np.where(feature_importance > 0)[0]
        selected_features = X.columns[selected_indices].tolist()

        # Limit number of features
        if len(selected_features) > self.max_features:
            # Sort by importance and take top features
            feature_scores = list(zip(selected_features, feature_importance[selected_indices]))
            feature_scores.sort(key=lambda x: x[1], reverse=True)
            selected_features = [f[0] for f in feature_scores[:self.max_features]]

        return selected_features

print("Regularization selector loaded!")

Regularization selector loaded!


In [4]:
# selection pipeline

class ComprehensiveFeatureSelector:
    """
    Comprehensive feature selection pipeline combining multiple strategies.
    """

    def __init__(self, cv_runner, max_features=200):
        self.cv_runner = cv_runner
        self.max_features = max_features
        # Remove stability_selector since we already did that
        self.drift_detector = DistributionDriftDetector()
        self.forward_backward_selector = ForwardBackwardSelector(cv_runner)
        self.regularization_selector = RegularizationSelector()

        self.final_features = []
        self.selection_summary = {}

    def run_comprehensive_selection(self, X, y, X_train=None, X_val=None):
        """
        Run comprehensive feature selection pipeline.
        """
        print("COMPREHENSIVE FEATURE SELECTION")

        # Start with provided features (your 400 CV-selected ones)
        starting_features = list(X.columns)
        print(f"Starting with {len(starting_features)} pre-selected features")

        # Step 1: Distribution drift detection
        if X_train is not None and X_val is not None:
            print("\n1. Distribution drift detection...")
            drift_scores = self.drift_detector.detect_drift(
                X_train, X_val, starting_features
            )
            drift_free_features = self.drift_detector.get_drift_free_features(drift_scores)
            print(f"Found {len(drift_free_features)} drift-free features")
        else:
            drift_free_features = starting_features

        # Step 2: Forward selection
        print("\n2. Forward selection...")
        forward_selected = self.forward_backward_selector.forward_selection(
            X[drift_free_features], y, drift_free_features
        )
        print(f"Forward selection: {len(forward_selected)} features")

        # Step 3: Regularization-based selection
        print("\n3. Regularization-based selection...")
        reg_selected = self.regularization_selector.select_with_regularization(
            X[forward_selected], y, self.cv_runner
        )
        print(f"Regularization selection: {len(reg_selected)} features")

        # Step 4: Final validation
        print("\n4. Final validation...")
        X_final = X[reg_selected]
        final_results = self.cv_runner.run_cv(X_final, y)

        # Store results
        self.final_features = reg_selected
        self.selection_summary = {
            'starting_features': len(starting_features),
            'drift_free_features': len(drift_free_features),
            'forward_selected': len(forward_selected),
            'final_features': len(reg_selected),
            'final_score': final_results['mean_score'],
            'final_std': final_results['std_score']
        }

        return reg_selected, final_results

    def plot_selection_summary(self):
        """
        Plot feature selection summary.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Selection pipeline summary
        stages = ['Starting', 'Drift-Free', 'Forward', 'Final']
        counts = [
            self.selection_summary['starting_features'],
            self.selection_summary['drift_free_features'],
            self.selection_summary['forward_selected'],
            self.selection_summary['final_features']
        ]

        ax1.bar(stages, counts)
        ax1.set_title('Feature Count at Each Selection Stage')
        ax1.set_ylabel('Number of Features')

        # Final performance
        ax2.bar(['Final Score'], [self.selection_summary['final_score']])
        ax2.set_title('Final CV Score')
        ax2.set_ylabel('Pearson Correlation')
        ax2.set_ylim(0, 1)

        plt.tight_layout()
        plt.show()

        return fig

In [5]:
# copying CV functions again

class CrossValidationRunner:
    """
    Cross-validation runner for feature selection.
    """

    def __init__(self, model_class, model_params, cv_folds=5, random_state=42):
        self.model_class = model_class
        self.model_params = model_params
        self.cv_folds = cv_folds
        self.random_state = random_state
        self.feature_importance = {}

    def run_cv(self, X, y):
        """
        Run cross-validation and return results.
        """
        from sklearn.model_selection import KFold
        from sklearn.metrics import mean_squared_error
        import numpy as np

        kf = KFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state)
        scores = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # Train model
            model = self.model_class(**self.model_params)
            model.fit(X_train, y_train)

            # Predict and score
            y_pred = model.predict(X_val)
            score = np.sqrt(mean_squared_error(y_val, y_pred))
            scores.append(score)

            # Store feature importance
            if hasattr(model, 'feature_importances_'):
                self.feature_importance[fold] = dict(zip(X.columns, model.feature_importances_))

        return {
            'scores': scores,
            'mean_score': np.mean(scores),
            'std_score': np.std(scores)
        }

In [6]:
# main

import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, Ridge
import lightgbm as lgb
import gc
from tqdm import tqdm
from scipy import stats
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import seaborn as sns
import os
warnings.filterwarnings('ignore')

train = pd.read_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/train_final_selected.parquet')
selected_features_df = pd.read_csv('/content/drive/MyDrive/DRW Crypto Market Prediction/selected_features.csv')
final_features = selected_features_df['feature'].tolist()

X = train[final_features]  # Use 400 selected features
y = train['label']

print(f"Starting with {X.shape[1]} CV-selected features")

# 2.5. Create CV runner (THIS WAS MISSING!)
cv_runner = CrossValidationRunner(
    model_class=lgb.LGBMRegressor,
    model_params={
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 6,
        'random_state': 42,
        'verbose': -1
    },
    cv_folds=3,  # Reduced for speed
    random_state=42
)

# 3. Create feature selector
feature_selector = ComprehensiveFeatureSelector(cv_runner, max_features=200)

# 4. Split data for drift detection
split_idx = int(len(X) * 0.6)
X_train_drift = X.iloc[:split_idx]
X_val_drift = X.iloc[split_idx:split_idx + int(len(X) * 0.2)]

# 5. Run comprehensive feature selection
selected_features, final_results = feature_selector.run_comprehensive_selection(
    X, y, X_train_drift, X_val_drift
)


print(f"Final features: {len(selected_features)}")
print(f"Final CV score: {final_results['mean_score']:.6f} ± {final_results['std_score']:.6f}")

# Save final features
final_features_df = pd.DataFrame({'feature': selected_features})
final_features_df.to_csv('/content/drive/MyDrive/DRW Crypto Market Prediction/final_optimized_features.csv', index=False)

Starting with 400 CV-selected features
COMPREHENSIVE FEATURE SELECTION
Starting with 400 pre-selected features

1. Distribution drift detection...


Detecting drift: 100%|██████████| 400/400 [00:42<00:00,  9.35it/s]


Found 0 drift-free features

2. Forward selection...
Starting forward selection...
Forward selection: 0 features

3. Regularization-based selection...


ValueError: at least one array or dtype is required