In [1]:
import numpy as np
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/drw-crypto-market-prediction/sample_submission.csv
/kaggle/input/drw-crypto-market-prediction/train.parquet
/kaggle/input/drw-crypto-market-prediction/test.parquet


In [2]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from scipy.stats import pearsonr

class Config:
    TRAIN_PATH       = "/kaggle/input/drw-crypto-market-prediction/train.parquet"
    TEST_PATH        = "/kaggle/input/drw-crypto-market-prediction/test.parquet"
    SUBMISSION_PATH  = "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv"
    FEATURES         = [
        "X863","X856","X344","X598","X862","X385","X852","X603",
        "X860","X674","X415","X345","X137","X855","X174","X302",
        "X178","X532","X168","X612",
        "bid_qty","ask_qty","buy_qty","sell_qty","volume"
    ]
    LABEL_COLUMN     = "label"
    N_FOLDS          = 3
    RANDOM_STATE     = 42

# Hyperparameters for XGBoost and LightGBM
XGB_PARAMS = {
    "tree_method": "hist",
    "device": "gpu",
    "colsample_bylevel": 0.4778,
    "colsample_bynode": 0.3628,
    "colsample_bytree": 0.7107,
    "gamma": 1.7095,
    "learning_rate": 0.02213,
    "max_depth": 20,
    "max_leaves": 12,
    "min_child_weight": 16,
    "n_estimators": 1667,
    "subsample": 0.06567,
    "reg_alpha": 39.3524,
    "reg_lambda": 75.4484,
    "verbosity": 0,
    "random_state": Config.RANDOM_STATE,
    "n_jobs": -1,
    "verbose": False,
}

LGBM_PARAMS = {
    "boosting_type": "gbdt",
    "device": "gpu",
    "n_jobs": -1,
    "verbose": -1,
    "random_state": Config.RANDOM_STATE,
    "colsample_bytree": 0.5039,
    "learning_rate": 0.01260,
    "min_child_samples": 20,
    "min_child_weight": 0.1146,
    "n_estimators": 915,
    "num_leaves": 145,
    "reg_alpha": 19.2447,
    "reg_lambda": 55.5046,
    "subsample": 0.9709,
    "max_depth": 9
}

LEARNERS = [
    {"name": "xgb",  "Estimator": XGBRegressor,  "params": XGB_PARAMS},
    {"name": "lgbm", "Estimator": LGBMRegressor, "params": LGBM_PARAMS}
]

MODEL_SLICES = [
    {"name": "full_data",   "cutoff": 0},
    {"name": "last_75pct",  "cutoff": 0},  # to be set after loading
    {"name": "last_50pct",  "cutoff": 0}
]


def create_time_decay_weights(n: int, decay: float = 0.95) -> np.ndarray:
    positions = np.arange(n)
    normalized = positions / float(n - 1)
    weights = decay ** (1.0 - normalized)
    return weights * n / weights.sum()


def load_data():
    train_df = pd.read_parquet(
        Config.TRAIN_PATH,
        columns=Config.FEATURES + [Config.LABEL_COLUMN]
    ).reset_index(drop=True)
    test_df = pd.read_parquet(
        Config.TEST_PATH,
        columns=Config.FEATURES
    ).reset_index(drop=True)
    submission_df = pd.read_csv(Config.SUBMISSION_PATH)
    print(f"Loaded train: {train_df.shape}, test: {test_df.shape}, submission: {submission_df.shape}")
    return train_df, test_df, submission_df



############################
# MAIN
############################

train_df, test_df, submission_df = load_data()
n_samples = len(train_df)
# set slice cutoffs
MODEL_SLICES[1]["cutoff"] = int(0.25 * n_samples)
MODEL_SLICES[2]["cutoff"] = int(0.50 * n_samples)

# prepare storage for OOF and test preds
oof_preds = {
    learner["name"]: {sl["name"]: np.zeros(n_samples) for sl in MODEL_SLICES}
    for learner in LEARNERS
}
test_preds = {
    learner["name"]: {sl["name"]: np.zeros(len(test_df)) for sl in MODEL_SLICES}
    for learner in LEARNERS
}

full_weights = create_time_decay_weights(n_samples)
kf = KFold(n_splits=Config.N_FOLDS, shuffle=False)

# cross-validation
for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df), start=1):
    print(f"\n--- Fold {fold}/{Config.N_FOLDS} ---")
    X_valid = train_df.iloc[valid_idx][Config.FEATURES]
    y_valid = train_df.iloc[valid_idx][Config.LABEL_COLUMN]

    for sl in MODEL_SLICES:
        slice_name = sl["name"]
        cutoff     = sl["cutoff"]
        subset     = train_df.iloc[cutoff:].reset_index(drop=True)
        rel_idx    = train_idx[train_idx >= cutoff] - cutoff

        X_train = subset.iloc[rel_idx][Config.FEATURES]
        y_train = subset.iloc[rel_idx][Config.LABEL_COLUMN]

        # sample weights
        if cutoff == 0:
            sw = full_weights[train_idx]
        else:
            sw_total = create_time_decay_weights(len(subset))
            sw = sw_total[rel_idx]

        for learner in LEARNERS:
            name      = learner["name"]
            Estimator = learner["Estimator"]
            params    = learner["params"]

            model = Estimator(**params)
            model.fit(X_train, y_train, sample_weight=sw,
                      eval_set=[(X_valid, y_valid)])

            # OOF predictions
            mask = valid_idx >= cutoff
            if mask.any():
                idxs = valid_idx[mask]
                oof_preds[name][slice_name][idxs] = model.predict(
                    train_df.iloc[idxs][Config.FEATURES])
            if cutoff > 0 and (~mask).any():
                oof_preds[name][slice_name][valid_idx[~mask]] = (
                    oof_preds[name]["full_data"][valid_idx[~mask]])

            # test predictions
            test_preds[name][slice_name] += model.predict(test_df[Config.FEATURES])

# average test preds
for name in test_preds:
    for slice_name in test_preds[name]:
        test_preds[name][slice_name] /= Config.N_FOLDS

# compute Pearson scores per learner and slice
pearson_scores = {
    name: {slice_name: pearsonr(train_df[Config.LABEL_COLUMN], preds)[0]
           for slice_name, preds in slices.items()}
    for name, slices in oof_preds.items()
}
print("\nPearson scores by learner and slice:")
print(pearson_scores)

# -- Ensemble per learner across slices --
learner_ensembles = {}
for learner_name, slice_scores in pearson_scores.items():
    # simple ensemble
    oof_simple = np.mean(list(oof_preds[learner_name].values()), axis=0)
    test_simple = np.mean(list(test_preds[learner_name].values()), axis=0)
    score_simple = pearsonr(train_df[Config.LABEL_COLUMN], oof_simple)[0]

    # weighted ensemble
    total_score = sum(slice_scores.values())
    slice_weights = {sn: sc/total_score for sn, sc in slice_scores.items()}
    oof_weighted = sum(slice_weights[sn] * oof_preds[learner_name][sn]
                       for sn in slice_weights)
    test_weighted = sum(slice_weights[sn] * test_preds[learner_name][sn]
                        for sn in slice_weights)
    score_weighted = pearsonr(train_df[Config.LABEL_COLUMN], oof_weighted)[0]

    print(f"\n{learner_name.upper()} Simple ensemble Pearson:   {score_simple:.4f}")
    print(f"{learner_name.upper()} Weighted ensemble Pearson: {score_weighted:.4f}")

    learner_ensembles[learner_name] = {
        "oof_simple": oof_simple,
        "test_simple": test_simple
    }

# -- Final ensemble across learners (simple) --
final_oof = np.mean([le["oof_simple"] for le in learner_ensembles.values()], axis=0)
final_test = np.mean([le["test_simple"] for le in learner_ensembles.values()], axis=0)
final_score = pearsonr(train_df[Config.LABEL_COLUMN], final_oof)[0]
print(f"\nFINAL ensemble across learners Pearson: {final_score:.4f}")

# save submission
submission_df["prediction"] = final_test
submission_df.to_csv("submission.csv", index=False)
print("Wrote submission.csv")

Loaded train: (525887, 26), test: (538150, 25), submission: (538150, 2)

--- Fold 1/3 ---
[0]	validation_0-rmse:1.00220
[1]	validation_0-rmse:1.00204
[2]	validation_0-rmse:1.00196
[3]	validation_0-rmse:1.00197
[4]	validation_0-rmse:1.00190
[5]	validation_0-rmse:1.00179
[6]	validation_0-rmse:1.00157
[7]	validation_0-rmse:1.00145
[8]	validation_0-rmse:1.00144
[9]	validation_0-rmse:1.00132
[10]	validation_0-rmse:1.00128
[11]	validation_0-rmse:1.00133
[12]	validation_0-rmse:1.00123
[13]	validation_0-rmse:1.00114
[14]	validation_0-rmse:1.00109
[15]	validation_0-rmse:1.00100
[16]	validation_0-rmse:1.00090
[17]	validation_0-rmse:1.00072
[18]	validation_0-rmse:1.00072
[19]	validation_0-rmse:1.00060
[20]	validation_0-rmse:1.00049
[21]	validation_0-rmse:1.00038
[22]	validation_0-rmse:1.00035
[23]	validation_0-rmse:1.00015
[24]	validation_0-rmse:1.00003
[25]	validation_0-rmse:0.99992
[26]	validation_0-rmse:0.99987
[27]	validation_0-rmse:0.99978
[28]	validation_0-rmse:0.99973
[29]	validation_0-rms



[0]	validation_0-rmse:1.00220
[1]	validation_0-rmse:1.00204
[2]	validation_0-rmse:1.00196
[3]	validation_0-rmse:1.00196
[4]	validation_0-rmse:1.00190
[5]	validation_0-rmse:1.00178
[6]	validation_0-rmse:1.00156
[7]	validation_0-rmse:1.00145
[8]	validation_0-rmse:1.00143
[9]	validation_0-rmse:1.00131
[10]	validation_0-rmse:1.00126
[11]	validation_0-rmse:1.00131
[12]	validation_0-rmse:1.00120
[13]	validation_0-rmse:1.00111
[14]	validation_0-rmse:1.00106
[15]	validation_0-rmse:1.00100
[16]	validation_0-rmse:1.00100
[17]	validation_0-rmse:1.00088
[18]	validation_0-rmse:1.00081
[19]	validation_0-rmse:1.00069
[20]	validation_0-rmse:1.00064
[21]	validation_0-rmse:1.00059
[22]	validation_0-rmse:1.00055
[23]	validation_0-rmse:1.00049
[24]	validation_0-rmse:1.00039
[25]	validation_0-rmse:1.00025
[26]	validation_0-rmse:1.00016
[27]	validation_0-rmse:1.00006
[28]	validation_0-rmse:0.99998
[29]	validation_0-rmse:0.99989
[30]	validation_0-rmse:0.99988
[31]	validation_0-rmse:0.99973
[32]	validation_0-