In [1]:
!pip install catboost --quiet
!pip install XGBoost --quiet
!pip install LightGBM --quiet

In [2]:
import pandas as pd

# --- Load training and test data ---
train_df = pd.read_csv("train.csv")   # replace with correct path if needed
test_df  = pd.read_csv("test.csv")    # replace with correct path if needed

# --- Define target and ID columns ---
target_col = "price"       # target variable
id_col     = "sample_id"   # unique ID column

print("✅ Data loaded successfully!")
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Target column: {target_col}")
print(f"ID column: {id_col}")


✅ Data loaded successfully!
Train shape: (75000, 4)
Test shape: (75000, 3)
Target column: price
ID column: sample_id


In [3]:
# ===================================================================
# K-FOLD ENSEMBLE: XGBoost + LightGBM + CatBoost + Tweedie-LGBM
# (log-target training, SMAPE computed on original price scale)
# ===================================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import time

# -----------------------------
# Ensure these variables exist:
# train_df, test_df, target_col, id_col
# Also provide paths to embedding files:
train_emb_file = "train_e5_embeddings.npy"  # <-- update path if needed
test_emb_file  = "test_e5_embeddings.npy"   # <-- update path if needed
# -----------------------------

# Load embeddings from files
train_embeddings = np.load(train_emb_file)
test_embeddings  = np.load(test_emb_file)

# Create embedding DataFrames
train_embedding_df = pd.DataFrame(train_embeddings, columns=[f'e5_emb_{i}' for i in range(train_embeddings.shape[1])])
test_embedding_df  = pd.DataFrame(test_embeddings,  columns=[f'e5_emb_{i}' for i in range(test_embeddings.shape[1])])

# Combine embeddings with your text-stat features (same feature selection you used)
feature_cols = [col for col in train_df.columns if any(
    suffix in col for suffix in ['_char_count', '_word_count', '_avg_word_length', 
                                 '_sentence_count', '_uppercase_ratio', '_digit_ratio',
                                 '_special_char_ratio', '_unique_word_ratio']
)]

train_features = pd.concat([train_embedding_df, train_df[feature_cols].reset_index(drop=True)], axis=1)
test_features  = pd.concat([test_embedding_df,  test_df[feature_cols].reset_index(drop=True)], axis=1)

# Fill missing and scale
train_features = train_features.fillna(0)
test_features  = test_features.fillna(0)

scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled  = scaler.transform(test_features)

print(f"Features prepared. Shape: {train_features_scaled.shape}")

# -----------------------------
# Target: use log1p for training stability
# -----------------------------
y_orig = train_df[target_col].values.astype(float)    # original prices
y_log  = np.log1p(y_orig)                             # log-transformed target used for training

# -----------------------------
# Define models (kept as requested)
# -----------------------------
models = {
    "XGBoost": XGBRegressor(
        n_estimators=2000,
        learning_rate=0.03,
        max_depth=6,
        min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=2.0,
        reg_lambda=4.0,
        gamma=0.2,
        tree_method="hist",
        random_state=42,
        n_jobs=-1,
        verbosity=0
    ),

    "LightGBM": LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.03,
        num_leaves=128,
        max_depth=8,
        min_data_in_leaf=40,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        lambda_l1=1.0,
        lambda_l2=2.0,
        objective="regression_l1",
        metric="rmse",
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ),

    "CatBoost": CatBoostRegressor(
        iterations=2000,
        depth=10,
        learning_rate=0.02,
        l2_leaf_reg=6.0,
        random_seed=42,
        loss_function="RMSE",
        bootstrap_type="Bernoulli",
        subsample=0.7,
        rsm=0.8,
        grow_policy="Lossguide",
        eval_metric="RMSE",
        verbose=0
    ),

    "Tweedie": LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.03,
        num_leaves=128,
        max_depth=8,
        min_data_in_leaf=40,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        lambda_l1=1.0,
        lambda_l2=2.0,
        objective="tweedie",
        tweedie_variance_power=1.5,
        metric="rmse",
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
}

# -----------------------------
# SMAPE computed on original scale
# -----------------------------
def smape_original_scale(y_true_orig, y_pred_orig):
    denom = (np.abs(y_true_orig) + np.abs(y_pred_orig))
    diff = np.abs(y_true_orig - y_pred_orig) / np.where(denom == 0, 1, denom)
    return 100.0 * np.mean(diff)

# -----------------------------
# K-Fold training
# -----------------------------
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds_log = np.zeros(len(train_features_scaled))
test_preds_log = np.zeros((len(test_features_scaled), len(models)))
fold_scores = []

start_time = time.time()
for fold, (tr_idx, val_idx) in enumerate(kf.split(train_features_scaled), 1):
    t0 = time.time()
    print(f"\n==== Fold {fold}/{n_splits} ====")
    X_tr, X_val = train_features_scaled[tr_idx], train_features_scaled[val_idx]
    y_tr_log, y_val_log = y_log[tr_idx], y_log[val_idx]
    y_val_orig = y_orig[val_idx]

    fold_model_val_preds_log = np.zeros((len(val_idx), len(models)))
    
    for m_idx, (name, model) in enumerate(models.items()):
        print(f"Training {name} ...", end=' ')
        try:
            if name == "XGBoost":
                model.fit(
                    X_tr, y_tr_log,
                    eval_set=[(X_val, y_val_log)],
                    eval_metric="rmse",
                    early_stopping_rounds=100,
                    verbose=False
                )
            elif name in ("LightGBM", "Tweedie"):
                model.fit(
                    X_tr, y_tr_log,
                    eval_set=[(X_val, y_val_log)],
                    eval_metric="rmse",
                    early_stopping_rounds=100,
                    verbose=False
                )
            elif name == "CatBoost":
                model.fit(
                    X_tr, y_tr_log,
                    eval_set=(X_val, y_val_log),
                    use_best_model=True,
                    early_stopping_rounds=100,
                    verbose=False
                )
            else:
                model.fit(X_tr, y_tr_log)
        except Exception as e:
            print(f"(eval fit failed: {e}) using basic fit.")
            model.fit(X_tr, y_tr_log)
        
        val_pred_log = model.predict(X_val)
        fold_model_val_preds_log[:, m_idx] = val_pred_log
        
        val_pred_orig = np.expm1(val_pred_log)
        model_smape = smape_original_scale(y_val_orig, val_pred_orig)
        print(f"done. SMAPE (orig scale) = {model_smape:.4f}")
    
    val_pred_mean_log = np.mean(fold_model_val_preds_log, axis=1)
    oof_preds_log[val_idx] = val_pred_mean_log
    
    val_pred_mean_orig = np.expm1(val_pred_mean_log)
    fold_smape = smape_original_scale(y_val_orig, val_pred_mean_orig)
    fold_scores.append(fold_smape)
    print(f"Fold {fold} ensemble SMAPE (orig scale): {fold_smape:.4f} (time: {time.time()-t0:.1f}s)")

    test_fold_preds = np.column_stack([model.predict(test_features_scaled) for model in models.values()])
    test_preds_log += test_fold_preds / n_splits

total_time = time.time() - start_time

oof_preds_orig = np.expm1(oof_preds_log)
final_oof_smape = smape_original_scale(y_orig, oof_preds_orig)

print("\n" + "="*60)
print(f"Final OOF SMAPE (orig price scale): {final_oof_smape:.4f}")
print(f"Fold SMAPEs mean: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
print(f"Total training time: {total_time:.1f}s")
print("="*60)

test_pred_final_orig = np.expm1(np.mean(test_preds_log, axis=1))



submission = pd.DataFrame({
    "sample_id": test_df[id_col],
    "price": test_pred_final_orig
})[["sample_id", "price"]]

out_fname = "submission_kfold_logblend.csv"
submission.to_csv(out_fname, index=False)

print(f"\n✅ Submission file saved as '{out_fname}'")
print("📄 First few rows of submission:")
display(submission.head())


Features prepared. Shape: (75000, 1024)

==== Fold 1/5 ====
Training XGBoost ... (eval fit failed: XGBModel.fit() got an unexpected keyword argument 'eval_metric') using basic fit.
done. SMAPE (orig scale) = 28.3861
Training LightGBM ... (eval fit failed: LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds') using basic fit.
done. SMAPE (orig scale) = 29.0680
Training CatBoost ... done. SMAPE (orig scale) = 29.6246
Training Tweedie ... (eval fit failed: LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds') using basic fit.
done. SMAPE (orig scale) = 27.9978
Fold 1 ensemble SMAPE (orig scale): 28.5439 (time: 3185.9s)

==== Fold 2/5 ====
Training XGBoost ... (eval fit failed: XGBModel.fit() got an unexpected keyword argument 'eval_metric') using basic fit.
done. SMAPE (orig scale) = 28.2032
Training LightGBM ... (eval fit failed: LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds') using basic fit.
done. SMAPE

Unnamed: 0,sample_id,price
0,100179,13.815932
1,245611,11.742197
2,146263,25.263652
3,95658,10.819246
4,36806,19.624214
