In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import gc
warnings.filterwarnings('ignore')

print("Loading Data")
# Load the data with all features (HMA + regime + rolling + interactions)
train = pd.read_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/train_with_rolling_interactions.parquet')
print(f"Loaded data shape: {train.shape}")

# Verify features are present
hma_features = [col for col in train.columns if 'hma_' in col]
regime_features = [col for col in train.columns if any(x in col for x in ['regime_', 'vol_cluster_', 'trend_', 'momentum_', 'market_'])]
rolling_features = [col for col in train.columns if 'rolling_' in col]
interaction_features = [col for col in train.columns if any(x in col for x in ['_div_', '_minus_', '_times_', '_spread', '_imbalance', '_velocity', '_acceleration'])]

print(f"HMA features: {len(hma_features)}")
print(f"Regime features: {len(regime_features)}")
print(f"Rolling features: {len(rolling_features)}")
print(f"Interaction features: {len(interaction_features)}")

# Get all feature columns (excluding target and time)
feature_cols = [col for col in train.columns if col not in ['time_id', 'label']]
print(f"Total features: {len(feature_cols)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading Data
Loaded data shape: (525886, 5140)
HMA features: 440
Regime features: 364
Rolling features: 1300
Interaction features: 3027
Total features: 5139


In [1]:
# custom time series CV class

class WalkForwardTimeSeriesCV:
    """
    Custom walk-forward time series cross-validation
    """
    def __init__(self, n_splits=5, test_size=0.2, gap=0):
        self.n_splits = n_splits
        self.test_size = test_size
        self.gap = gap

    def split(self, X, y=None):
        n_samples = len(X)
        test_size = int(n_samples * self.test_size)

        for i in range(self.n_splits):
            # Calculate split points
            split_point = n_samples - (self.n_splits - i) * test_size

            if split_point <= 0:
                continue

            train_end = split_point - self.gap
            test_start = split_point

            if train_end <= 0 or test_start >= n_samples:
                continue

            train_indices = list(range(0, train_end))
            test_indices = list(range(test_start, min(test_start + test_size, n_samples)))

            yield train_indices, test_indices

    def get_n_splits(self, X, y=None):
        return self.n_splits

# Initialize CV
tscv = WalkForwardTimeSeriesCV(n_splits=5, test_size=0.1, gap=100)
print("Time series CV initialized")

Time series CV initialized


In [2]:
# feature selection functions

def select_features_stability(X, y, n_features=500):
    """
    Select features based on stability across CV folds
    """
    print("Selecting features based on stability...")

    feature_scores = {}
    tscv = WalkForwardTimeSeriesCV(n_splits=3, test_size=0.1, gap=100)

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Train LightGBM for feature importance
        model = lgb.LGBMRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=6,
            random_state=42,
            verbose=-1
        )

        model.fit(X_train, y_train)

        # Get feature importance
        importance = model.feature_importances_

        for i, feature in enumerate(X.columns):
            if feature not in feature_scores:
                feature_scores[feature] = []
            feature_scores[feature].append(importance[i])

    # Calculate stability (lower std = more stable)
    feature_stability = {}
    for feature, scores in feature_scores.items():
        if len(scores) >= 2:
            stability_score = 1 / (1 + np.std(scores))  # Higher is better
            feature_stability[feature] = stability_score

    # Select top features by stability
    sorted_features = sorted(feature_stability.items(), key=lambda x: x[1], reverse=True)
    selected_features = [f[0] for f in sorted_features[:n_features]]

    print(f"Selected {len(selected_features)} stable features")
    return selected_features

def select_features_correlation(X, y, threshold=0.95):
    """
    Remove highly correlated features
    """
    print("Removing highly correlated features...")

    # Calculate correlation matrix
    corr_matrix = X.corr().abs()

    # Find highly correlated features
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > threshold)]

    print(f"Removed {len(to_drop)} highly correlated features")
    return [col for col in X.columns if col not in to_drop]

def select_features_importance(X, y, n_features=500):
    """
    Select features based on LightGBM importance
    """
    print("Selecting features based on importance...")

    # Train a quick model to get feature importance
    model = lgb.LGBMRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        verbose=-1
    )

    model.fit(X, y)

    # Get feature importance
    importance = model.feature_importances_
    feature_importance = dict(zip(X.columns, importance))

    # Select top features
    sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
    selected_features = [f[0] for f in sorted_features[:n_features]]

    print(f"Selected {len(selected_features)} important features")
    return selected_features

In [5]:
# main (# Step 1: Remove correlated features in chunks)

X = train[feature_cols]
y = train['label']
print(f"Starting with {X.shape[1]} features")

# Process correlation in smaller chunks
chunk_size = 500  # Smaller chunks for correlation
uncorr_features = []

for i in range(0, len(feature_cols), chunk_size):
    chunk_features = feature_cols[i:i+chunk_size]
    print(f"Processing correlation chunk {i//chunk_size + 1}: {len(chunk_features)} features")

    X_chunk = X[chunk_features]

    # Calculate correlation within chunk
    corr_matrix = X_chunk.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]

    # Keep features not in to_drop
    keep_from_chunk = [f for f in chunk_features if f not in to_drop]
    uncorr_features.extend(keep_from_chunk)

    print(f"Kept {len(keep_from_chunk)} from {len(chunk_features)} features")

print(f"After correlation removal: {len(uncorr_features)} features")

# Save intermediate result
import pickle
with open('uncorr_features.pkl', 'wb') as f:
    pickle.dump(uncorr_features, f)

# Memory cleanup
del X, corr_matrix, upper_tri, X_chunk
gc.collect()

Starting with 5139 features
Processing correlation chunk 1: 500 features
Kept 286 from 500 features
Processing correlation chunk 2: 500 features
Kept 357 from 500 features
Processing correlation chunk 3: 500 features
Kept 239 from 500 features
Processing correlation chunk 4: 500 features
Kept 183 from 500 features
Processing correlation chunk 5: 500 features
Kept 336 from 500 features
Processing correlation chunk 6: 500 features
Kept 458 from 500 features
Processing correlation chunk 7: 500 features
Kept 428 from 500 features
Processing correlation chunk 8: 500 features
Kept 377 from 500 features
Processing correlation chunk 9: 500 features
Kept 353 from 500 features
Processing correlation chunk 10: 500 features
Kept 489 from 500 features
Processing correlation chunk 11: 139 features
Kept 101 from 139 features
After correlation removal: 3607 features


0

In [4]:
# main (Step 2: Stability selection)
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import pickle
import warnings
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import gc
warnings.filterwarnings('ignore')

print("Reloading training data...")
train = pd.read_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/train_with_rolling_interactions.parquet')

# Load uncorrelated features
with open('uncorr_features.pkl', 'rb') as f:
    uncorr_features = pickle.load(f)

print(f"Processing {len(uncorr_features)} features in chunks")

# Process stability in chunks
chunk_size = 200  # Small chunks for stability
stable_candidates = []
y = train['label']

for i in range(0, len(uncorr_features), chunk_size):
    chunk_features = uncorr_features[i:i+chunk_size]
    print(f"Stability chunk {i//chunk_size + 1}/{(len(uncorr_features)-1)//chunk_size + 1}: {len(chunk_features)} features")

    try:
        X_chunk = train[chunk_features]

        # Simple stability test - train 3 models on different data splits
        feature_scores = {}

        for split in [0.3, 0.5, 0.7]:  # 3 different train sizes
            split_size = int(len(X_chunk) * split)
            X_split = X_chunk.iloc[:split_size]
            y_split = y.iloc[:split_size]

            # Quick model
            model = lgb.LGBMRegressor(
                n_estimators=50,  # Reduced for speed
                learning_rate=0.1,
                max_depth=5,
                random_state=42,
                verbose=-1
            )

            model.fit(X_split, y_split)

            # Store importance
            for j, feature in enumerate(chunk_features):
                if feature not in feature_scores:
                    feature_scores[feature] = []
                feature_scores[feature].append(model.feature_importances_[j])

        # Calculate stability for this chunk
        chunk_stable = []
        for feature, scores in feature_scores.items():
            if len(scores) >= 2:
                stability = 1 / (1 + np.std(scores))  # Higher = more stable
                chunk_stable.append((feature, stability))

        # Take top 50% from each chunk
        chunk_stable.sort(key=lambda x: x[1], reverse=True)
        top_from_chunk = [f[0] for f in chunk_stable[:len(chunk_stable)//2]]
        stable_candidates.extend(top_from_chunk)

        print(f"  Added {len(top_from_chunk)} stable features from chunk")

        # Clean up
        del X_chunk, X_split, model
        gc.collect()

    except Exception as e:
        print(f"  Chunk failed: {e}, skipping...")

print(f"Total stable candidates: {len(stable_candidates)}")

# Final selection - take top 400
if len(stable_candidates) > 400:
    # Quick final ranking
    X_final = train[stable_candidates]
    model = lgb.LGBMRegressor(n_estimators=50, random_state=42, verbose=-1)
    model.fit(X_final, y)

    importance = dict(zip(stable_candidates, model.feature_importances_))
    sorted_final = sorted(importance.items(), key=lambda x: x[1], reverse=True)
    stable_features = [f[0] for f in sorted_final[:400]]
else:
    stable_features = stable_candidates

print(f"Final stable features: {len(stable_features)}")

# Save result
with open('stable_features.pkl', 'wb') as f:
    pickle.dump(stable_features, f)

print("Step 2 completed and saved!")
gc.collect()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reloading training data...
Processing 3607 features in chunks
Stability chunk 1/19: 200 features
  Added 100 stable features from chunk
Stability chunk 2/19: 200 features
  Added 100 stable features from chunk
Stability chunk 3/19: 200 features
  Added 100 stable features from chunk
Stability chunk 4/19: 200 features
  Added 100 stable features from chunk
Stability chunk 5/19: 200 features
  Added 100 stable features from chunk
Stability chunk 6/19: 200 features
  Added 100 stable features from chunk
Stability chunk 7/19: 200 features
  Added 100 stable features from chunk
Stability chunk 8/19: 200 features
  Added 100 stable features from chunk
Stability chunk 9/19: 200 features
  Added 100 stable features from chunk
Stability chunk 10/19: 200 features
  Added 100 stable features from chunk
Stability chunk 11/19: 200 features
  Added 100 stable features from

8

In [6]:
# main (Step 3: Final importance selection)

feature_cols = [col for col in train.columns if col not in ['time_id', 'label', 'index']]
# Load stable features
import pickle
with open('stable_features.pkl', 'rb') as f:
    stable_features = pickle.load(f)

X_stable = train[stable_features]
y = train['label']

print(f"Running importance selection on {len(stable_features)} features")

# Final selection
final_features = select_features_importance(X_stable, y, n_features=500)

print(f"Final features: {len(final_features)}")

# Save selected features
selected_features_df = pd.DataFrame({'feature': final_features})
selected_features_df.to_csv('/content/drive/MyDrive/DRW Crypto Market Prediction/selected_features.csv', index=False)

print(f"\n Feature selection completed!")
print(f"Original features: {len(feature_cols)}")
print(f"Final features: {len(final_features)}")
print("Selected features saved to: selected_features.csv")

# Memory cleanup
del X_stable
gc.collect()

Running importance selection on 400 features
Selecting features based on importance...
Selected 400 important features
Final features: 400

 Feature selection completed!
Original features: 5139
Final features: 400
Selected features saved to: selected_features.csv


42

In [8]:

# Cross-validation with selected features

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import gc
import warnings
warnings.filterwarnings('ignore')

# Load train data
train = pd.read_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/train_with_rolling_interactions.parquet')
if 'time_id' not in train.columns:
    train = train.reset_index()
    train['time_id'] = range(len(train))

# Load selected features
selected_features_df = pd.read_csv('/content/drive/MyDrive/DRW Crypto Market Prediction/selected_features.csv')
final_features = selected_features_df['feature'].tolist()

print(f"Data loaded: {train.shape}")
print(f"Selected features: {len(final_features)}")

X_final = train[final_features]
y = train['label']

print(f"X_final shape: {X_final.shape}")
print(f"y shape: {y.shape}")

cv_scores = []
feature_importance_cv = {}

tscv = WalkForwardTimeSeriesCV(n_splits=5, test_size=0.1, gap=100)

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_final)):
    print(f"\nFold {fold + 1}/5")

    X_train, X_val = X_final.iloc[train_idx], X_final.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train model
    model = lgb.LGBMRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=8,
        num_leaves=31,
        random_state=42,
        verbose=-1
    )

    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_val)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    cv_scores.append(rmse)

    # Store feature importance
    for i, feature in enumerate(X_final.columns):
        if feature not in feature_importance_cv:
            feature_importance_cv[feature] = []
        feature_importance_cv[feature].append(model.feature_importances_[i])

    print(f"Fold {fold + 1} RMSE: {rmse:.6f}")

print(f"\n CV results")
print(f"Mean RMSE: {np.mean(cv_scores):.6f}")
print(f"Std RMSE: {np.std(cv_scores):.6f}")
print(f"Min RMSE: {np.min(cv_scores):.6f}")
print(f"Max RMSE: {np.max(cv_scores):.6f}")

# Save CV results
cv_results = pd.DataFrame({
    'fold': range(1, len(cv_scores) + 1),
    'rmse': cv_scores
})
cv_results.to_csv('/content/drive/MyDrive/DRW Crypto Market Prediction/cv_results.csv', index=False)

print("\n Cross-validation completed!")
print(" CV results saved to: cv_results.csv")

Data loaded: (525886, 5142)
Selected features: 400
X_final shape: (525886, 400)
y shape: (525886,)

Fold 1/5
Fold 1 RMSE: 0.970242

Fold 2/5
Fold 2 RMSE: 1.068473

Fold 3/5
Fold 3 RMSE: 1.017032

Fold 4/5
Fold 4 RMSE: 1.089529

Fold 5/5
Fold 5 RMSE: 1.068730

 CV results
Mean RMSE: 1.042801
Std RMSE: 0.043460
Min RMSE: 0.970242
Max RMSE: 1.089529

 Cross-validation completed!
 CV results saved to: cv_results.csv


In [1]:
# Create final dataset with selected features

import pandas as pd
train = pd.read_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/train_with_rolling_interactions.parquet')

# Create time_id from the RangeIndex
train['time_id'] = train.index
print(f"Created time_id from index: {train['time_id'].head()}")

selected_features_df = pd.read_csv('/content/drive/MyDrive/DRW Crypto Market Prediction/selected_features.csv')
final_features = selected_features_df['feature'].tolist()

print(f"Reloaded {len(final_features)} selected features")

# Add back target and time_id
final_dataset = train[['time_id', 'label'] + final_features]
print(f"Final dataset shape: {final_dataset.shape}")

# Save final dataset
final_dataset.to_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/train_final_selected.parquet', index=False)

print(f"\ Final dataset created!")
print(f"Shape: {final_dataset.shape}")
print(" Saved to: train_final_selected.parquet")


Created time_id from index: 0    0
1    1
2    2
3    3
4    4
Name: time_id, dtype: int64
Reloaded 400 selected features
Final dataset shape: (525886, 402)
\ Final dataset created!
Shape: (525886, 402)
 Saved to: train_final_selected.parquet


NameError: name 'X_final' is not defined