In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install catboost[gpu]
!pip install optuna
!pip install dask[dataframe]

Collecting catboost[gpu]
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 k

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from tqdm import tqdm
from joblib import dump
from optuna import create_study

In [None]:
# Load and preprocess data
df_train = pd.read_csv('/content/drive/MyDrive/Xin/train_new_together.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Xin/test_new_together.csv')
df_test.set_index('id', inplace=True)

X = df_train.drop(columns=['price'], axis=1)
y = df_train['price']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Hyperparameter optimization for base models
def optimize_base_model(trial, model_name):
    """Objective function for optimizing base model hyperparameters."""
    if model_name == "CatBoost":
        params = {
            "depth": trial.suggest_int("depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
            "iterations": 2000,
            "random_state": 42,
            "task_type": "GPU",
            "devices": "0",
            "verbose": 0
        }
        model = CatBoostClassifier(**params)
    elif model_name == "LightGBM":
        params = {
            "min_gain_to_split": 0.1,
            "min_data_in_leaf": 20,
            "num_leaves": trial.suggest_int("num_leaves", 31, 256),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "random_state": 42
        }
        model = LGBMClassifier(**params)
    elif model_name == "XGBoost":
        params = {
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "random_state": 42,
            "use_label_encoder": False,
            "eval_metric": "logloss"
        }
        model = XGBClassifier(**params)
    else:
        raise ValueError("Unknown model name!")

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, predictions))



In [None]:
# Optimize each base model
optimized_models = {}
for model_name in ["CatBoost", "LightGBM", "XGBoost"]:
    print(f"Optimizing {model_name} with Optuna...")
    with tqdm(total=200, desc=f"{model_name} Optimization Progress") as pbar:
        def callback(study, trial):
            pbar.update(1)

        study = create_study(direction="minimize")
        study.optimize(lambda trial: optimize_base_model(trial, model_name), n_trials=100, callbacks=[callback])
        optimized_models[model_name] = study.best_params
        print(f"Best parameters for {model_name}: {study.best_params}")



In [None]:
# Train optimized base models with K-Fold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
train_meta = np.zeros((X_train.shape[0], len(optimized_models)))
test_meta = np.zeros((X_test.shape[0], len(optimized_models)))

In [9]:
print("\nTraining optimized base models for stacking...")
for idx, (model_name, best_params) in enumerate(optimized_models.items()):
    print(f"Training {model_name} with optimized parameters...")
    test_meta_fold = np.zeros((n_splits, X_test.shape[0]))

    if model_name == "CatBoost":
        best_params.update({"random_state": 42, "task_type": "GPU", "devices": "0", "verbose": 0})
        model = CatBoostClassifier(**best_params)
    elif model_name == "LightGBM":
        best_params.update({"random_state": 42})
        model = LGBMClassifier(**best_params)
    elif model_name == "XGBoost":
        best_params.update({"random_state": 42, "use_label_encoder": False, "eval_metric": "logloss"})
        model = XGBClassifier(**best_params)

    with tqdm(total=n_splits, desc=f"{model_name} Training Progress") as pbar:
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
            X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

            model.fit(X_train_fold, y_train_fold)
            # Flatten the prediction to match the shape of train_meta
            train_meta[val_idx, idx] = model.predict(X_val_fold).flatten()
            test_meta_fold[fold, :] = model.predict(X_test).flatten()
            pbar.update(1)

    test_meta[:, idx] = test_meta_fold.mean(axis=0)




Training optimized base models for stacking...
Training CatBoost with optimized parameters...


CatBoost Training Progress: 100%|██████████| 5/5 [05:39<00:00, 67.86s/it]


Training LightGBM with optimized parameters...


LightGBM Training Progress:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006232 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26449
[LightGBM] [Info] Number of data points in the train set: 10044, number of used features: 162
[LightGBM] [Info] Start training from score -1.757699
[LightGBM] [Info] Start training from score -1.776936
[LightGBM] [Info] Start training from score -1.814721
[LightGBM] [Info] Start training from score -1.721970
[LightGBM] [Info] Start training from score -1.886950
[LightGBM] [Info] Start training from score -1.800158


LightGBM Training Progress:  20%|██        | 1/5 [00:08<00:34,  8.63s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003890 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26450
[LightGBM] [Info] Number of data points in the train set: 10045, number of used features: 162
[LightGBM] [Info] Start training from score -1.758954
[LightGBM] [Info] Start training from score -1.764751
[LightGBM] [Info] Start training from score -1.814210
[LightGBM] [Info] Start training from score -1.746317
[LightGBM] [Info] Start training from score -1.886393
[LightGBM] [Info] Start training from score -1.786497


LightGBM Training Progress:  40%|████      | 2/5 [00:17<00:26,  8.74s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26443
[LightGBM] [Info] Number of data points in the train set: 10045, number of used features: 162
[LightGBM] [Info] Start training from score -1.757221
[LightGBM] [Info] Start training from score -1.776447
[LightGBM] [Info] Start training from score -1.822799
[LightGBM] [Info] Start training from score -1.702759
[LightGBM] [Info] Start training from score -1.902944
[LightGBM] [Info] Start training from score -1.799655


LightGBM Training Progress:  60%|██████    | 3/5 [00:26<00:17,  8.76s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006191 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26445
[LightGBM] [Info] Number of data points in the train set: 10045, number of used features: 162
[LightGBM] [Info] Start training from score -1.761268
[LightGBM] [Info] Start training from score -1.760110
[LightGBM] [Info] Start training from score -1.832706
[LightGBM] [Info] Start training from score -1.742330
[LightGBM] [Info] Start training from score -1.868175
[LightGBM] [Info] Start training from score -1.791859


LightGBM Training Progress:  80%|████████  | 4/5 [00:34<00:08,  8.64s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003763 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26430
[LightGBM] [Info] Number of data points in the train set: 10045, number of used features: 162
[LightGBM] [Info] Start training from score -1.774684
[LightGBM] [Info] Start training from score -1.754916
[LightGBM] [Info] Start training from score -1.821567
[LightGBM] [Info] Start training from score -1.734966
[LightGBM] [Info] Start training from score -1.881807
[LightGBM] [Info] Start training from score -1.789472


LightGBM Training Progress: 100%|██████████| 5/5 [00:43<00:00,  8.68s/it]


Training XGBoost with optimized parameters...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

XGBoost Training Progress: 100%|██████████| 5/5 [01:25<00:00, 17.05s/it]


In [None]:
# Optimize meta model
print("\nOptimizing meta model with Optuna...")
def optimize_meta_model(trial):
    params = {
        "depth": trial.suggest_int("depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "iterations": 2000,
        "random_state": 42,
        "task_type": "GPU",
        "devices": "0",
        "verbose": 0
    }
    meta_model = CatBoostClassifier(**params)
    meta_model.fit(train_meta, y_train)
    meta_predictions = meta_model.predict(test_meta)
    return np.sqrt(mean_squared_error(y_test, meta_predictions))

with tqdm(total=50, desc="Meta Model Optimization Progress") as pbar:
    def callback_meta(study, trial):
        pbar.update(1)

    study_meta = create_study(direction="minimize")
    study_meta.optimize(optimize_meta_model, n_trials=50, callbacks=[callback_meta])
best_meta_params = study_meta.best_params
print(f"Best parameters for meta model: {best_meta_params}")

# Train final meta model
final_meta_model = CatBoostClassifier(
    **best_meta_params, random_state=42, task_type="GPU", devices="0", verbose=0
)
final_meta_model.fit(train_meta, y_train)





Optimizing meta model with Optuna...


Meta Model Optimization Progress:   0%|          | 0/50 [00:00<?, ?it/s][I 2024-11-21 03:10:39,276] A new study created in memory with name: no-name-7494125b-e898-4d88-b30c-5c133c8ccd6f


In [None]:
# Evaluate final meta model
meta_predictions = final_meta_model.predict(test_meta)
print(meta_predictions)
meta_predictions = np.array(meta_predictions).flatten()
print(len(meta_predictions))

stacking_rmse = np.sqrt(mean_squared_error(y_test, meta_predictions))
print(f"\nFinal Stacking Model RMSE: {stacking_rmse:.4f}")


[[1]
 [4]
 [3]
 ...
 [1]
 [0]
 [4]]
3140

Final Stacking Model RMSE: 0.8139


In [None]:
test_prediction = final_meta_model.predict(df_test)
test_prediction = np.array(test_prediction).flatten()
print(test_prediction)

[0 2 0 ... 2 0 0]


In [None]:

print("Generating test_meta features using optimized base models...")
for idx, (model_name, best_params) in enumerate(optimized_models.items()):
    print(f"Predicting {model_name} for df_test...")
    if model_name == "CatBoost":
        best_params.update({"random_state": 42, "task_type": "GPU", "devices": "0", "verbose": 0})
        model = CatBoostClassifier(**best_params)
    elif model_name == "LightGBM":
        best_params.update({"random_state": 42})
        model = LGBMClassifier(**best_params)
    elif model_name == "XGBoost":
        best_params.update({"random_state": 42, "use_label_encoder": False, "eval_metric": "logloss"})
        model = XGBClassifier(**best_params)

    # 训练基础模型
    model.fit(X_train, y_train)

    # 对 df_test 进行预测，存储结果
    testing_meta[:, idx] = model.predict(df_test).flatten()

# 使用最终的元模型对 df_test 进行预测
print("Predicting final prices for df_test using final_meta_model...")
meta_predictions = final_meta_model.predict(test_meta)
meta_predictions = np.array(meta_predictions).flatten()


Generating test_meta features using optimized base models...
Predicting CatBoost for df_test...
Predicting LightGBM for df_test...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007612 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26638
[LightGBM] [Info] Number of data points in the train set: 12556, number of used features: 162
[LightGBM] [Info] Start training from score -1.761944
[LightGBM] [Info] Start training from score -1.766593
[LightGBM] [Info] Start training from score -1.821178
[LightGBM] [Info] Start training from score -1.729543
[LightGBM] [Info] Start training from score -1.885192
[LightGBM] [Info] Start training from score -1.793513
Predicting XGBoost for df_test...


Parameters: { "use_label_encoder" } are not used.



Predicting final prices for df_test using final_meta_model...


In [None]:
meta_predictions[:7]

array([4, 0, 3, 1, 3, 5, 4])

In [None]:

output_path_stacking = '/content/drive/MyDrive/Xin/test_predict_stacking.csv'
pd.DataFrame({'id': df_test.index, 'price': meta_predictions}).to_csv(output_path_stacking, index=False)
print(f"Test predictions saved to: {os.path.abspath(output_path_stacking)}")

Test predictions saved to: /content/drive/MyDrive/Xin/test_predict_stacking.csv
