In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/drw-crypto-market-prediction/sample_submission.csv
/kaggle/input/drw-crypto-market-prediction/train.parquet
/kaggle/input/drw-crypto-market-prediction/test.parquet


In [2]:
import pandas as pd

train = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/train.parquet')
train.head()


Unnamed: 0,bid_qty,ask_qty,buy_qty,sell_qty,volume,X1,X2,X3,X4,X5,...,X772,X773,X774,X775,X776,X777,X778,X779,X780,label
2023-03-01 00:00:00,15.283,8.425,176.405,44.984,221.389,0.181844,-0.63786,0.006652,0.13687,0.116698,...,0.333753,-0.009992,-0.695595,-0.444077,-0.191238,-0.184251,-0.471897,-0.625428,-0.553991,0.562539
2023-03-01 00:01:00,38.59,2.336,525.846,321.95,847.796,0.489497,-0.075619,0.431594,0.5224,0.475255,...,0.333657,-0.01004,-0.696226,-0.452866,-0.200082,-0.188929,-0.472842,-0.625832,-0.554426,0.533686
2023-03-01 00:02:00,0.442,60.25,159.227,136.369,295.596,0.260121,-0.444684,0.100695,0.224729,0.203282,...,0.333667,-0.010037,-0.696832,-0.461383,-0.208786,-0.193571,-0.473785,-0.626236,-0.55486,0.546505
2023-03-01 00:03:00,4.865,21.016,335.742,124.963,460.705,0.099976,-0.666728,-0.123858,0.019197,0.014459,...,0.333174,-0.010279,-0.697391,-0.469628,-0.21735,-0.198175,-0.474726,-0.626639,-0.555294,0.357703
2023-03-01 00:04:00,27.158,3.451,98.411,44.407,142.818,0.270893,-0.325973,0.116336,0.234311,0.214073,...,0.333171,-0.010283,-0.69794,-0.477622,-0.22578,-0.202745,-0.475666,-0.627043,-0.555728,0.362452


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, BayesianRidge
import xgboost as xgb
import lightgbm as lgb
import gc

SEED = 2024
np.random.seed(SEED)

# Load
print("📦 Loading Data...")
train = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/train.parquet')
test = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/test.parquet')
sample_submission = pd.read_csv('/kaggle/input/drw-crypto-market-prediction/sample_submission.csv')

# Feature Engineering
def engineer_features(df):
    df = df.copy()
    df['spread'] = np.abs(df['ask_qty'] - df['bid_qty']) / (df['ask_qty'] + df['bid_qty'] + 1e-6)
    df['ofi'] = (df['bid_qty'] - df['ask_qty']) / (df['bid_qty'] + df['ask_qty'] + 1e-6)
    df['buy_sell_ratio'] = (df['buy_qty'] + 1e-6) / (df['sell_qty'] + 1e-6)
    df['volume_imbalance'] = (df['buy_qty'] - df['sell_qty']) / (df['buy_qty'] + df['sell_qty'] + 1e-6)
    df['log_volume'] = np.log1p(df['volume'])
    df['liquidity_imbalance'] = (df['bid_qty'] - df['ask_qty']) / (df['bid_qty'] + df['ask_qty'] + 1e-6)
    df.replace([np.inf, -np.inf], 0, inplace=True)
    df.fillna(0, inplace=True)
    return df.astype(np.float32)

def add_simple_rolling(df, features, windows=[3, 5], lags=[1]):
    df_new = df.copy()
    for col in features:
        for w in windows:
            df_new[f'{col}_rollmean{w}'] = df[col].rolling(window=w, min_periods=1).mean()
            df_new[f'{col}_ema{w}'] = df[col].ewm(span=w, adjust=False).mean()
        for l in lags:
            df_new[f'{col}_lag{l}'] = df[col].shift(l)
    df_new.replace([np.inf, -np.inf], 0, inplace=True)
    df_new.fillna(0, inplace=True)
    return df_new.astype(np.float32)

print("🛠 Feature Engineering...")
train = engineer_features(train)
test = engineer_features(test)

core_feats = ['spread', 'ofi', 'buy_sell_ratio', 'volume_imbalance', 'liquidity_imbalance', 'log_volume']
print(f"Using core features: {core_feats}")

train = add_simple_rolling(train, core_feats)
test = add_simple_rolling(test, core_feats)

ignore = set(['timestamp', 'asset', 'label', 'log_return_forward_1s'])
feature_cols = [col for col in train.columns if col not in ignore and not train[col].isnull().all()]

min_lag = 5
train = train.iloc[min_lag:].reset_index(drop=True)

# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train[feature_cols])
X_test_scaled = scaler.transform(test[feature_cols])

y_raw = train['label']
y = (y_raw - y_raw.mean()) / (y_raw.std() + 1e-6)

# Time-based Split
N = len(X_train_scaled)
split = int(0.85 * N)
gap = int(0.05 * N)
X_tr = X_train_scaled[:split]
y_tr = y.iloc[:split]
X_val = X_train_scaled[split + gap:]
y_val = y.iloc[split + gap:]

del train
gc.collect()

print("🎯 Training Models...")

# XGBoost
model_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.012,
    max_depth=5,
    n_estimators=500,
    subsample=0.85,
    colsample_bytree=0.7,
    tree_method='hist',
    reg_alpha=0.4,
    reg_lambda=1.5,
    early_stopping_rounds=30,
    random_state=SEED,
    verbosity=0
)
model_xgb.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
xgb_val = model_xgb.predict(X_val)

# LightGBM
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    learning_rate=0.01,
    max_depth=6,
    n_estimators=500,
    subsample=0.85,
    colsample_bytree=0.75,
    reg_alpha=0.3,
    reg_lambda=1.2,
    max_bin=96,
    force_col_wise=True,
    random_state=SEED
)
model_lgb.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric='rmse')
lgb_val = model_lgb.predict(X_val)

# Ridge
ridge = Ridge(alpha=1.0)
ridge.fit(X_tr, y_tr)
ridge_val = ridge.predict(X_val)

# Bayesian Ridge
bayes = BayesianRidge()
bayes.fit(X_tr, y_tr)
bayes_val = bayes.predict(X_val)

# Blend
w_xgb = 0.45
w_lgb = 0.3
w_ridge = 0.15
w_bayes = 0.10

val_blend = w_xgb * xgb_val + w_lgb * lgb_val + w_ridge * ridge_val + w_bayes * bayes_val
val_corr = np.corrcoef(y_val, val_blend)[0, 1]
print("📈 Validation Pearson Correlation (blended):", round(val_corr, 6))

del X_tr, X_val, y_tr, y_val
gc.collect()

print("✏️ Predicting on Test Set...")
xgb_pred = model_xgb.predict(X_test_scaled)
lgb_pred = model_lgb.predict(X_test_scaled)
ridge_pred = ridge.predict(X_test_scaled)
bayes_pred = bayes.predict(X_test_scaled)

test_preds = w_xgb * xgb_pred + w_lgb * lgb_pred + w_ridge * ridge_pred + w_bayes * bayes_pred
test_preds_final = test_preds * y_raw.std() + y_raw.mean()

print("💾 Saving Submission...")
submission = sample_submission.copy()
submission['prediction'] = test_preds_final
submission.to_csv('submission.csv', index=False)
print("✅ submission.csv saved with", len(submission), "rows.")


📦 Loading Data...
🛠 Feature Engineering...
Using core features: ['spread', 'ofi', 'buy_sell_ratio', 'volume_imbalance', 'liquidity_imbalance', 'log_volume']
🎯 Training Models...
[LightGBM] [Info] Total Bins 78804
[LightGBM] [Info] Number of data points in the train set: 446998, number of used features: 821
[LightGBM] [Info] Start training from score -0.009776
📈 Validation Pearson Correlation (blended): 0.065957
✏️ Predicting on Test Set...
💾 Saving Submission...
✅ submission.csv saved with 538150 rows.
