In [18]:
# configurations

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
import pickle
import json
from datetime import datetime
warnings.filterwarnings('ignore')

class LightGBMTrainer:
    """Random-search hyperparameter tuner for LightGBM regressors.

    Scoring is delegated to ``cv_runner``, which must expose
    ``run_cv(X, y, params)`` and return a dict containing ``'mean_score'``
    and ``'std_score'``. Higher scores are treated as better.
    """

    def __init__(self, cv_runner, pearson_scorer):
        """Store collaborators and initialise empty result containers.

        Args:
            cv_runner: Object providing ``run_cv(X, y, params)``.
            pearson_scorer: Scoring callable. It is stored for callers but is
                not used by this class itself (scoring happens in ``cv_runner``).
        """
        self.cv_runner = cv_runner
        self.pearson_scorer = pearson_scorer
        self.best_params = {}
        self.final_models = {}
        self.feature_importance = {}
        self.training_history = {}

    def get_winner_parameters(self):
        """Return the search grid as a ``{param_name: [candidate, ...]}`` dict.

        Only ``learning_rate``, ``num_leaves`` and ``n_estimators`` have more
        than one candidate; every other entry is a fixed setting.
        """
        param_grid = {
            'objective': ['regression'],  # Squared loss as per 2nd place
            'metric': ['rmse'],
            'boosting_type': ['gbdt'],
            'learning_rate': [0.01, 0.05, 0.1, 0.3],  # Winners focused on this
            'num_leaves': [31, 50, 100, 200],  # Winners tuned this
            'n_estimators': [500, 1000, 2000],  # With early stopping
            'feature_fraction': [0.9],  # Winners kept default
            'bagging_fraction': [0.8],  # Winners kept default
            'bagging_freq': [5],
            'verbose': [-1],
            'random_state': [42],
            # NOTE(review): early stopping needs a validation set at fit time;
            # whoever fits with these params must supply one or drop this key.
            'early_stopping_rounds': [50]
        }

        return param_grid

    def _sample_combinations(self, param_combinations, max_combinations, random_state):
        """Pick at most ``max_combinations`` entries, reproducibly.

        Sampling is done on indices with a dedicated seeded generator, so the
        result does not depend on (or disturb) NumPy's global random state.
        Returns the full list unchanged when it is already small enough or
        when ``max_combinations`` is ``None``.
        """
        if max_combinations is None or len(param_combinations) <= max_combinations:
            return list(param_combinations)

        rng = np.random.default_rng(random_state)
        chosen = rng.choice(len(param_combinations), size=max_combinations, replace=False)
        return [param_combinations[int(i)] for i in chosen]

    def optimize_hyperparameters(self, X, y, max_combinations=20, random_state=42):
        """Evaluate a random subset of the grid with the CV runner.

        Args:
            X: Feature matrix passed through to ``cv_runner.run_cv``.
            y: Target passed through to ``cv_runner.run_cv``.
            max_combinations: Upper bound on the number of grid points tried.
            random_state: Seed for the grid sub-sampling, so repeated runs
                evaluate the same combinations.

        Returns:
            ``(best_params, results)`` where ``results`` is a list of
            ``{'params', 'score', 'std_score'}`` dicts sorted best-first.
            ``best_params`` is ``{}`` if no combination produced a valid score.

        Side effects:
            Sets ``self.best_params`` and fills ``self.training_history`` with
            ``'hyperparameter_results'`` and ``'failed_combinations'``.
        """
        param_grid = self.get_winner_parameters()
        param_combinations = self._sample_combinations(
            list(ParameterGrid(param_grid)), max_combinations, random_state
        )

        # -inf rather than -1 so that any real score beats the sentinel.
        best_score = -np.inf
        best_params = {}
        results = []
        failures = []

        print(f"Testing {len(param_combinations)} parameter combinations...")

        for params in tqdm(param_combinations, desc="Parameter tuning"):
            try:
                # Run CV with these parameters
                cv_results = self.cv_runner.run_cv(X, y, params)
                score = cv_results['mean_score']

                results.append({
                    'params': params,
                    'score': score,
                    'std_score': cv_results['std_score']
                })

                # A NaN score compares False here and therefore never wins.
                if score > best_score:
                    best_score = score
                    best_params = params.copy()
                    print(f"New best score: {score:.4f} with params: {params}")

            except Exception as e:
                # Best-effort search: one bad combination must not abort the
                # run, but keep a record so failures are not lost in the log.
                print(f"Error with params {params}: {e}")
                failures.append({'params': params, 'error': repr(e)})

        # Sort best-first; NaN scores are pushed to the end.
        results.sort(
            key=lambda r: -np.inf if np.isnan(r['score']) else r['score'],
            reverse=True
        )

        self.best_params = best_params
        self.training_history['hyperparameter_results'] = results
        self.training_history['failed_combinations'] = failures

        if not best_params:
            print("Warning: no parameter combination produced a valid score")

        print(f"\nBest parameters: {best_params}")
        print(f"Best CV score: {best_score:.4f}")

        return best_params, results

print("LightGBM trainer setup complete!")

LightGBM trainer setup complete!


In [19]:
# market regime models

class MarketRegimeModels:
    """
    Train and apply one LightGBM regressor per market regime.

    ``regime_models`` maps regime label -> fitted model and
    ``regime_performance`` maps regime label -> in-sample Pearson correlation.
    """

    # Parameter names LightGBM interprets as an early-stopping request.
    # Early stopping needs a validation set, which the plain ``fit(X, y)``
    # calls in this class do not provide, so these keys are dropped first.
    _EARLY_STOPPING_KEYS = (
        'early_stopping_rounds',
        'early_stopping_round',
        'early_stopping',
        'n_iter_no_change',
    )

    def __init__(self, cv_runner, best_params):
        """
        Args:
            cv_runner: Stored for callers; not used by the methods below.
            best_params: Keyword arguments for ``lgb.LGBMRegressor``.
        """
        self.cv_runner = cv_runner
        self.best_params = best_params
        self.regime_models = {}
        self.regime_performance = {}

    def _estimator_params(self):
        """Return a copy of ``best_params`` without early-stopping keys."""
        return {
            key: value
            for key, value in (self.best_params or {}).items()
            if key not in self._EARLY_STOPPING_KEYS
        }

    def train_regime_models(self, X, y, regime_labels, min_samples=1000):
        """
        Train separate models for each market regime.

        Args:
            X: Feature frame; rows are selected with a boolean mask.
            y: Target, same length as ``X``.
            regime_labels: pandas Series of regime labels, same length as ``X``.
            min_samples: Regimes with fewer rows than this are skipped.

        Returns:
            ``self.regime_models`` (regime label -> fitted model).
        """
        print("=== MARKET REGIME MODELS (9th Place Strategy) ===")

        # Identify regimes
        unique_regimes = regime_labels.unique()
        print(f"Found regimes: {unique_regimes}")

        estimator_params = self._estimator_params()

        for regime in unique_regimes:
            print(f"\nTraining model for {regime} regime...")

            # Positional boolean mask, so selection does not depend on the
            # index labels of X, y and regime_labels agreeing.
            regime_mask = np.asarray(regime_labels == regime)
            X_regime = X[regime_mask]
            y_regime = y[regime_mask]

            if len(X_regime) < min_samples:  # Skip if too few samples
                print(f"Skipping {regime} regime - too few samples ({len(X_regime)})")
                continue

            print(f"Regime {regime}: {len(X_regime)} samples")

            # Train model for this regime
            model = lgb.LGBMRegressor(**estimator_params)
            model.fit(X_regime, y_regime)

            # Store model
            self.regime_models[regime] = model

            # Evaluate performance. This is computed on the training rows
            # themselves, so it is an in-sample figure, not a validation score.
            y_pred = model.predict(X_regime)
            score = np.corrcoef(y_regime, y_pred)[0, 1]
            self.regime_performance[regime] = score

            print(f"Regime {regime} performance: {score:.4f}")

        return self.regime_models

    def predict_with_regime_models(self, X, regime_labels, fallback_model=None):
        """
        Make predictions using regime-specific models.

        Args:
            X: Feature frame.
            regime_labels: pandas Series of regime labels, same length as ``X``.
            fallback_model: Optional model with ``predict``; used for rows
                whose regime has no trained model. Without it those rows keep
                the initial value 0.0 and a warning is printed.

        Returns:
            1-D NumPy array of predictions, one per row of ``X``.
        """
        n_rows = len(X)
        predictions = np.zeros(n_rows)
        covered = np.zeros(n_rows, dtype=bool)
        regime_counts = {}

        for regime, model in self.regime_models.items():
            regime_mask = np.asarray(regime_labels == regime)
            n_hits = int(regime_mask.sum())
            if n_hits > 0:
                predictions[regime_mask] = model.predict(X[regime_mask])
                covered |= regime_mask
                regime_counts[regime] = n_hits

        # Rows whose regime was skipped during training (or is new) would
        # otherwise silently receive a prediction of 0.0.
        uncovered = ~covered
        n_uncovered = int(uncovered.sum())
        if n_uncovered > 0:
            if fallback_model is not None:
                predictions[uncovered] = fallback_model.predict(X[uncovered])
                print(f"Used fallback model for {n_uncovered} rows without a regime model")
            else:
                print(f"Warning: {n_uncovered} rows have no regime model; predictions left at 0.0")

        print(f"Predictions by regime: {regime_counts}")
        return predictions

print("Market regime models setup complete!")

Market regime models setup complete!


In [20]:
# model training

class FinalModelTrainer:
    """
    Fit one LightGBM regressor on the full training set and persist it.

    After ``train_final_model`` the instance holds the fitted model, a
    feature -> importance mapping and in-sample training metrics.
    """

    # Parameter names LightGBM interprets as an early-stopping request.
    # Early stopping needs a validation set, which ``fit(X, y)`` below does
    # not provide, so these keys are removed before building the estimator.
    _EARLY_STOPPING_KEYS = (
        'early_stopping_rounds',
        'early_stopping_round',
        'early_stopping',
        'n_iter_no_change',
    )

    def __init__(self, best_params):
        """
        Args:
            best_params: Keyword arguments for ``lgb.LGBMRegressor``.
        """
        self.best_params = best_params
        self.final_model = None
        self.feature_importance = {}
        self.training_metrics = {}

    def _estimator_params(self):
        """Return a copy of ``best_params`` without early-stopping keys."""
        return {
            key: value
            for key, value in (self.best_params or {}).items()
            if key not in self._EARLY_STOPPING_KEYS
        }

    def train_final_model(self, X, y, save_path=None):
        """
        Train final model on all available data.

        Args:
            X: 2-D feature matrix (``X.shape[1]`` is read); column names are
                used for the importance mapping when ``X`` has ``columns``.
            y: Target values, same length as ``X``.
            save_path: If given, the model bundle is pickled to this path.

        Returns:
            The fitted ``lgb.LGBMRegressor``.
        """

        print(f"Training on {len(X)} samples with {X.shape[1]} features")

        # Train model
        self.final_model = lgb.LGBMRegressor(**self._estimator_params())
        self.final_model.fit(X, y)

        # Get feature importance
        if hasattr(self.final_model, 'feature_importances_'):
            feature_names = getattr(X, 'columns', None)
            if feature_names is None:
                # Plain arrays carry no names; fall back to positional labels.
                feature_names = [f"feature_{i}" for i in range(X.shape[1])]
            self.feature_importance = dict(
                zip(feature_names, self.final_model.feature_importances_)
            )

        # Calculate training metrics on plain arrays so pandas index
        # alignment cannot interfere. These are in-sample figures.
        y_true = np.asarray(y, dtype=float).ravel()
        y_pred = np.asarray(self.final_model.predict(X), dtype=float).ravel()
        errors = y_true - y_pred
        self.training_metrics = {
            'pearson_correlation': float(np.corrcoef(y_true, y_pred)[0, 1]),
            'rmse': float(np.sqrt(np.mean(errors ** 2))),
            'mae': float(np.mean(np.abs(errors)))
        }

        print(f"Training metrics:")
        for metric, value in self.training_metrics.items():
            print(f"  {metric}: {value:.4f}")

        # Save model
        if save_path:
            self.save_model(save_path)

        return self.final_model

    def save_model(self, save_path):
        """
        Pickle the model together with its metadata to ``save_path``.

        The bundle contains the model, feature importance, training metrics,
        the original ``best_params`` and an ISO-format timestamp.
        """
        model_data = {
            'model': self.final_model,
            'feature_importance': self.feature_importance,
            'training_metrics': self.training_metrics,
            'best_params': self.best_params,
            'timestamp': datetime.now().isoformat()
        }

        with open(save_path, 'wb') as f:
            pickle.dump(model_data, f)

        print(f"Model saved to {save_path}")

    def load_model(self, load_path):
        """
        Restore a bundle written by ``save_model``.

        SECURITY: ``pickle.load`` can execute arbitrary code. Only load files
        that this notebook (or another trusted source) produced.
        """
        with open(load_path, 'rb') as f:
            model_data = pickle.load(f)

        self.final_model = model_data['model']
        self.feature_importance = model_data['feature_importance']
        self.training_metrics = model_data['training_metrics']
        self.best_params = model_data['best_params']

        print(f"Model loaded from {load_path}")

print("Final model trainer setup complete!")

Final model trainer setup complete!


In [21]:
# model diagnostics

class ModelDiagnostics:
    """
    Plotting and per-regime analysis helpers for a fitted model.

    The constructor only stores its arguments; the plotting methods take the
    data they need explicitly.
    """

    def __init__(self, final_model, feature_importance, training_metrics):
        """
        Args:
            final_model: Fitted model (stored; not used by the methods below).
            feature_importance: Mapping feature name -> importance value.
            training_metrics: Metrics dict (stored; not used by the methods below).
        """
        self.final_model = final_model
        self.feature_importance = feature_importance
        self.training_metrics = training_metrics

    def plot_feature_importance(self, top_n=20):
        """
        Plot the ``top_n`` most important features as a horizontal bar chart.

        Returns:
            List of ``(feature, importance)`` tuples, most important first.
            An empty list is returned when no importance data is available,
            so callers can always iterate over the result.
        """
        if not self.feature_importance:
            print("No feature importance available")
            return []

        # Sort features by importance
        sorted_features = sorted(self.feature_importance.items(),
                               key=lambda x: x[1], reverse=True)
        top_features = sorted_features[:top_n]

        features = [f[0] for f in top_features]
        importances = [f[1] for f in top_features]

        plt.figure(figsize=(12, 8))
        plt.barh(range(len(features)), importances)
        plt.yticks(range(len(features)), features)
        plt.xlabel('Feature Importance')
        plt.title('Top Feature Importance')
        plt.gca().invert_yaxis()  # largest bar at the top
        plt.tight_layout()
        plt.show()

        return top_features

    def plot_prediction_analysis(self, y_true, y_pred):
        """
        Draw a 2x2 panel: predicted-vs-actual scatter, residual scatter,
        overlaid value histograms and the residual histogram.

        Args:
            y_true: Actual target values.
            y_pred: Predicted values, same length and order as ``y_true``.

        Returns:
            The matplotlib ``Figure``.
        """
        # Work on plain arrays: a pandas Series would be aligned by index
        # label in ``y_true - y_pred`` instead of by position.
        y_true = np.asarray(y_true, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Scatter plot
        axes[0, 0].scatter(y_true, y_pred, alpha=0.5)
        axes[0, 0].plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--')
        axes[0, 0].set_xlabel('Actual')
        axes[0, 0].set_ylabel('Predicted')
        axes[0, 0].set_title('Predictions vs Actual')

        # Residual plot
        residuals = y_true - y_pred
        axes[0, 1].scatter(y_pred, residuals, alpha=0.5)
        axes[0, 1].axhline(y=0, color='r', linestyle='--')
        axes[0, 1].set_xlabel('Predicted')
        axes[0, 1].set_ylabel('Residuals')
        axes[0, 1].set_title('Residual Plot')

        # Distribution comparison
        axes[1, 0].hist(y_true, alpha=0.7, label='Actual', bins=30)
        axes[1, 0].hist(y_pred, alpha=0.7, label='Predicted', bins=30)
        axes[1, 0].set_xlabel('Value')
        axes[1, 0].set_ylabel('Frequency')
        axes[1, 0].set_title('Distribution Comparison')
        axes[1, 0].legend()

        # Residual distribution
        axes[1, 1].hist(residuals, bins=30)
        axes[1, 1].set_xlabel('Residual')
        axes[1, 1].set_ylabel('Frequency')
        axes[1, 1].set_title('Residual Distribution')

        plt.tight_layout()
        plt.show()

        return fig

    def analyze_regime_performance(self, y_true, y_pred, regime_labels):
        """
        Compute and plot Pearson correlation and sample count per regime.

        Regimes with 10 or fewer rows are ignored.

        Args:
            y_true: Actual target values.
            y_pred: Predicted values, same length and order as ``y_true``.
            regime_labels: pandas Series of regime labels, same length.

        Returns:
            Dict regime -> ``{'correlation', 'rmse', 'samples'}``.
        """
        # Positional arrays avoid index-label alignment surprises.
        y_true = np.asarray(y_true, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()

        regime_performance = {}

        for regime in regime_labels.unique():
            mask = np.asarray(regime_labels == regime)
            n_samples = int(mask.sum())
            if n_samples > 10:
                regime_true = y_true[mask]
                regime_pred = y_pred[mask]
                corr = np.corrcoef(regime_true, regime_pred)[0, 1]
                rmse = np.sqrt(np.mean((regime_true - regime_pred) ** 2))

                regime_performance[regime] = {
                    'correlation': corr,
                    'rmse': rmse,
                    'samples': n_samples
                }

        # Plot regime performance
        regimes = list(regime_performance.keys())
        correlations = [regime_performance[r]['correlation'] for r in regimes]
        samples = [regime_performance[r]['samples'] for r in regimes]

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

        ax1.bar(regimes, correlations)
        ax1.set_title('Performance by Market Regime')
        ax1.set_ylabel('Pearson Correlation')
        ax1.tick_params(axis='x', rotation=45)

        ax2.bar(regimes, samples)
        ax2.set_title('Samples by Market Regime')
        ax2.set_ylabel('Number of Samples')
        ax2.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

        return regime_performance

print("Model diagnostics setup complete!")

Model diagnostics setup complete!


In [22]:
# test data processing & predictions

class TestDataProcessor:
    """
    Prepare test data, generate predictions and write a submission file.
    """

    def __init__(self, final_model, feature_importance):
        """
        Args:
            final_model: Model with ``predict``; used when no regime models
                are supplied and as fallback for rows no regime model covers.
            feature_importance: Stored for callers; not used by the methods below.
        """
        self.final_model = final_model
        self.feature_importance = feature_importance

    def process_test_data(self, test_data, feature_engineering_pipeline):
        """
        Return a copy of ``test_data``.

        NOTE: this is still a placeholder. ``feature_engineering_pipeline`` is
        accepted but not applied, so the caller must pass data that already
        has the training-time features.
        """
        print("TEST DATA PROCESSING")

        # Apply same feature engineering
        # This should include:
        # - HMA features
        # - Market regime features
        # - Rolling statistics
        # - Cross-feature interactions

        processed_test = test_data.copy()

        # Add your feature engineering steps here
        # (This will depend on your specific pipeline)

        return processed_test

    def generate_predictions(self, processed_test, regime_models=None, regime_labels=None):
        """
        Generate predictions for test data.

        With both ``regime_models`` and ``regime_labels`` given, each row is
        predicted by the model of its regime. Rows whose regime has no model
        are predicted by ``self.final_model`` when one is set; otherwise they
        keep the initial value 0.0 and a warning is printed. Without regime
        inputs, ``self.final_model`` predicts every row.

        Args:
            processed_test: Feature frame.
            regime_models: Optional dict regime label -> model with ``predict``.
            regime_labels: Optional pandas Series of regime labels, same
                length as ``processed_test``.

        Returns:
            Array of predictions, one per row.
        """
        print("GENERATING PREDICTIONS")

        if regime_models and regime_labels is not None:
            # Use regime-specific models (9th place strategy)
            n_rows = len(processed_test)
            predictions = np.zeros(n_rows)
            covered = np.zeros(n_rows, dtype=bool)

            for regime, model in regime_models.items():
                # Positional mask; does not rely on matching index labels.
                regime_mask = np.asarray(regime_labels == regime)
                if regime_mask.sum() > 0:
                    predictions[regime_mask] = model.predict(processed_test[regime_mask])
                    covered |= regime_mask

            uncovered = ~covered
            n_uncovered = int(uncovered.sum())
            if n_uncovered > 0:
                if self.final_model is not None:
                    predictions[uncovered] = self.final_model.predict(processed_test[uncovered])
                    print(f"Used final model for {n_uncovered} rows without a regime model")
                else:
                    print(f"Warning: {n_uncovered} rows have no regime model; predictions left at 0.0")

            print(f"Generated predictions using regime models")
        else:
            # Use single final model
            predictions = self.final_model.predict(processed_test)
            print(f"Generated predictions using final model")

        return predictions

    def create_submission(self, predictions, sample_submission_path, output_path,
                          target_column='label'):
        """
        Write ``predictions`` into a copy of the sample submission.

        Args:
            predictions: One value per row of the sample submission, in order.
            sample_submission_path: CSV used as the template.
            output_path: Destination CSV (written without the index).
            target_column: Column that receives the predictions.
                NOTE(review): confirm this matches the competition's expected
                column name.

        Returns:
            The submission DataFrame that was written.

        Raises:
            ValueError: If the number of predictions differs from the number
                of rows in the sample submission.
        """
        # Load sample submission
        sample_sub = pd.read_csv(sample_submission_path)

        # Assign by position: a pandas Series would otherwise be aligned on
        # its index labels and could silently produce NaN rows.
        values = np.asarray(predictions).ravel()
        if len(values) != len(sample_sub):
            raise ValueError(
                f"Got {len(values)} predictions for a sample submission "
                f"with {len(sample_sub)} rows"
            )
        sample_sub[target_column] = values

        # Save submission
        sample_sub.to_csv(output_path, index=False)
        print(f"Submission saved to {output_path}")

        return sample_sub

print("Test data processor setup complete!")

Test data processor setup complete!


In [23]:
from google.colab import drive
drive.mount('/content/drive')

# Read the selected-feature list and the matching training frame from Drive
selected_features_df = pd.read_csv('/content/drive/MyDrive/DRW Crypto Market Prediction/selected_features.csv')
feature_names = list(selected_features_df['feature'])
train = pd.read_parquet('/content/drive/MyDrive/DRW Crypto Market Prediction/train_final_selected.parquet')

print("Loaded data shape: {}".format(train.shape))
print("Selected features: {}".format(len(feature_names)))

# Split into the model inputs and the target column ('label')
X = train.loc[:, feature_names]
y = train.loc[:, 'label']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded data shape: (525886, 402)
Selected features: 400


In [24]:
!pip install optuna



In [25]:
# Core data science libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Sklearn imports
from sklearn.model_selection import cross_val_score, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression

# LightGBM
import lightgbm as lgb

# Optimization
from scipy.stats import pearsonr
from scipy.optimize import minimize
import optuna

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8')

# System and utilities
import gc
import os
import pickle
import joblib
from datetime import datetime
import time
from tqdm import tqdm

# Statistics
from scipy import stats
from scipy.stats import spearmanr

# Additional utilities
import itertools
from collections import defaultdict
import json

# Memory optimization
import psutil


In [26]:
# Complete CrossValidationRunner with hyperparameter fix
class CrossValidationRunner:
    """Expanding-window time-series cross-validation for LightGBM regressors.

    Each fold trains on all rows from the start up to ``gap`` rows before the
    validation window and validates on the following ``test_size`` fraction
    of the data. Validation windows are laid out back-to-back at the end of
    the data set.
    """

    # Parameter names LightGBM interprets as an early-stopping request.
    # Early stopping needs a validation set at fit time, which ``run_cv``
    # deliberately does not pass, so these keys are removed before fitting.
    _EARLY_STOPPING_KEYS = (
        'early_stopping_rounds',
        'early_stopping_round',
        'early_stopping',
        'n_iter_no_change',
    )

    def __init__(self, n_splits=5, test_size=0.2, gap=100, scorer=None):
        """
        Args:
            n_splits: Number of validation windows to attempt.
            test_size: Fraction of all rows used for each validation window.
            gap: Rows dropped between the end of training and the start of
                validation.
            scorer: Callable ``scorer(y_true, y_pred) -> float``. Defaults to
                the module-level ``pearson_scorer``, which must be defined
                before an instance is created without an explicit scorer.
        """
        self.n_splits = n_splits
        self.test_size = test_size
        self.gap = gap
        self.scorer = scorer or pearson_scorer

    def split(self, X, y=None):
        """Yield ``(train_indices, test_indices)`` lists of row positions.

        Folds whose split point, training end or validation start falls
        outside the data are skipped, so fewer than ``n_splits`` folds may be
        produced (e.g. the defaults ``n_splits=5, test_size=0.2`` give 4).
        """
        n_samples = len(X)
        test_size = int(n_samples * self.test_size)

        for i in range(self.n_splits):
            split_point = n_samples - (self.n_splits - i) * test_size
            if split_point <= 0:
                continue

            train_end = split_point - self.gap
            test_start = split_point

            if train_end <= 0 or test_start >= n_samples:
                continue

            train_indices = list(range(0, train_end))
            test_indices = list(range(test_start, min(test_start + test_size, n_samples)))

            yield train_indices, test_indices

    def run_cv(self, X, y, model_params):
        """Run cross-validation and return scores.

        Args:
            X: Feature frame supporting ``.iloc``.
            y: Target supporting ``.iloc``.
            model_params: Keyword arguments for ``lgb.LGBMRegressor``.
                Early-stopping keys are ignored; the caller's dict is not
                modified.

        Returns:
            Dict with ``'scores'`` (one per fold), ``'mean_score'`` and
            ``'std_score'``. When no fold could be formed the list is empty
            and both statistics are NaN.
        """
        cv_params = {
            key: value
            for key, value in model_params.items()
            if key not in self._EARLY_STOPPING_KEYS
        }

        scores = []

        for train_idx, val_idx in self.split(X, y):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model = lgb.LGBMRegressor(**cv_params)
            model.fit(X_train, y_train)

            # Predict and score
            y_pred = model.predict(X_val)
            scores.append(self.scorer(y_val, y_pred))

        if not scores:
            # Explicit NaN instead of relying on np.mean([]) and its warning.
            return {
                'scores': [],
                'mean_score': float('nan'),
                'std_score': float('nan')
            }

        return {
            'scores': scores,
            'mean_score': np.mean(scores),
            'std_score': np.std(scores)
        }

# Pearson-correlation scoring helper
def pearson_scorer(y_true, y_pred):
    """Return the Pearson correlation of the two inputs, or 0.0 if it is NaN."""
    from scipy.stats import pearsonr
    coefficient = pearsonr(y_true, y_pred)[0]
    if np.isnan(coefficient):
        return 0.0
    return coefficient

# Instantiate the shared CV runner used by the training cell below
cv_runner = CrossValidationRunner(
    n_splits=5,
    test_size=0.1,
    gap=100,
    scorer=pearson_scorer,
)
print(" Fixed CV runner created with proper early stopping handling")

 Fixed CV runner created with proper early stopping handling


In [None]:
# main
#
# End-to-end training driver: build regime labels, tune hyperparameters,
# train regime and final models, run diagnostics and save the results.
# Requires X, y, train, cv_runner, pearson_scorer and the classes defined in
# the cells above.

# Parameter names LightGBM interprets as an early-stopping request. The
# trainers below call fit(X, y) without a validation set, which early
# stopping requires, so these keys are removed from the tuned parameters.
EARLY_STOPPING_KEYS = (
    'early_stopping_rounds',
    'early_stopping_round',
    'early_stopping',
    'n_iter_no_change',
)

# 3. Create balanced regime labels
print("Creating balanced regime labels...")

# Better regime creation using quantiles for balance.
# NOTE(review): regimes are derived from the rolling volatility of the target
# itself. The target is not available at prediction time, so these labels
# cannot be reproduced for test data as written - confirm the intended source.
vol_col = y.rolling(20).std()
regime_labels = pd.qcut(vol_col.dropna(), q=4, labels=['LOW_VOL', 'STABLE', 'HIGH_VOL', 'EXTREME'])

# Extend to match original length: the first rows have no rolling value and
# are assigned to 'STABLE'.
regime_labels = regime_labels.reindex(y.index).fillna('STABLE')
print("Created balanced regime labels from target volatility")

print(f"Data shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Regime distribution:\n{regime_labels.value_counts()}")

# 4. Initialize trainers
print("\nInitializing trainers...")
trainer = LightGBMTrainer(cv_runner, pearson_scorer)

# 5. Hyperparameter optimization
print("\n 1. Hyperparameter optimization")
# Seed NumPy's global generator so the sampled parameter combinations are the
# same on every run of this cell.
np.random.seed(42)
best_params, hp_results = trainer.optimize_hyperparameters(X, y, max_combinations=20)
print(f"Best parameters found: {best_params}")

# Parameters actually used for fitting (early stopping removed, see above).
train_params = {k: v for k, v in best_params.items() if k not in EARLY_STOPPING_KEYS}

# 6. Market regime models
if regime_labels is not None:
    print("\n 2. Training market regime models")
    regime_trainer = MarketRegimeModels(cv_runner, train_params)
    regime_models = regime_trainer.train_regime_models(X, y, regime_labels)
    print(f"Trained {len(regime_models)} regime-specific models")
else:
    regime_models = None
    print("Skipping regime models - no regime labels available")

# 7. Final model training
print("\n 3. Training final model")
final_trainer = FinalModelTrainer(train_params)
final_model = final_trainer.train_final_model(
    X, y,
    save_path='/content/drive/MyDrive/DRW Crypto Market Prediction/final_model.pkl'
)
# NaN default (not a string) so the ':.4f' format below can never raise.
train_corr = final_trainer.training_metrics.get('pearson_correlation', float('nan'))
print(f"Final model trained with correlation: {train_corr:.4f}")

# 8. Model diagnostics
print("\n 4. Model diagnostics")
diagnostics = ModelDiagnostics(
    final_trainer.final_model,
    final_trainer.feature_importance,
    final_trainer.training_metrics
)

# Plot feature importance
print("Generating feature importance plot...")
# The plot helper returns None when there is nothing to plot; normalise to a
# list so the code below can iterate safely.
top_features = diagnostics.plot_feature_importance(top_n=20) or []

# Plot prediction analysis
print("Generating prediction analysis...")
y_pred = final_trainer.final_model.predict(X)
diagnostics.plot_prediction_analysis(y, y_pred)

# Regime performance analysis
if regime_labels is not None:
    print("Analyzing regime performance...")
    regime_perf = diagnostics.analyze_regime_performance(y, y_pred, regime_labels)

# 9. Save results
print("\n=== 5. Saving results ===")
# Save feature importance
feature_importance_df = pd.DataFrame({
    'feature': [f[0] for f in top_features],
    'importance': [f[1] for f in top_features]
})
feature_importance_df.to_csv('/content/drive/MyDrive/DRW Crypto Market Prediction/feature_importance.csv', index=False)

# Save predictions. Fall back to the frame's index when there is no
# 'time_id' column, so a missing column cannot fail the run after training.
time_ids = train['time_id'] if 'time_id' in train.columns else train.index
predictions_df = pd.DataFrame({
    'time_id': np.asarray(time_ids),
    'actual': np.asarray(y),
    'predicted': np.asarray(y_pred)
})
predictions_df.to_csv('/content/drive/MyDrive/DRW Crypto Market Prediction/training_predictions.csv', index=False)

print("\n=== TRAINING COMPLETE ===")
print(f" Best parameters: {best_params}")
print(f" Training correlation: {train_corr:.4f}")
print(f" Top 5 features: {[f[0] for f in top_features[:5]]}")
print(f" Model saved to: final_model.pkl")
print(f" Results saved to Google Drive")

# Memory cleanup. After this, the data-loading cell must be re-run before
# this cell can be executed again.
del X, y, train
gc.collect()

Creating balanced regime labels...
Created balanced regime labels from target volatility
Data shape: (525886, 400)
Target shape: (525886,)
Regime distribution:
label
STABLE      131486
LOW_VOL     131467
EXTREME     131467
HIGH_VOL    131466
Name: count, dtype: int64

Initializing trainers...

 1. Hyperparameter optimization
Testing 20 parameter combinations...


Parameter tuning:   5%|▌         | 1/20 [04:29<1:25:11, 269.04s/it]

New best score: 0.0446 with params: {'bagging_fraction': 0.8, 'bagging_freq': 5, 'boosting_type': 'gbdt', 'early_stopping_rounds': 50, 'feature_fraction': 0.9, 'learning_rate': 0.1, 'metric': 'rmse', 'n_estimators': 500, 'num_leaves': 50, 'objective': 'regression', 'random_state': 42, 'verbose': -1}


Parameter tuning:  30%|███       | 6/20 [57:13<2:08:21, 550.08s/it]

New best score: 0.0488 with params: {'bagging_fraction': 0.8, 'bagging_freq': 5, 'boosting_type': 'gbdt', 'early_stopping_rounds': 50, 'feature_fraction': 0.9, 'learning_rate': 0.01, 'metric': 'rmse', 'n_estimators': 500, 'num_leaves': 50, 'objective': 'regression', 'random_state': 42, 'verbose': -1}
