In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# ============================================================
# üöÄ Phase 11 ‚Äî Encoded + Transformed Feature Ensemble
# ============================================================

import numpy as np, pandas as pd
import lightgbm as lgb, xgboost as xgb
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.optimize import minimize

# ------------------------------------------------------------
# üì• Load data
# ------------------------------------------------------------
df = pd.read_csv("/kaggle/input/ensembles/train_hardcore_nlp_features.csv")
print(f"‚úÖ Data Loaded | Shape: {df.shape}")

# ------------------------------------------------------------
# üéØ Target (log-transformed)
# ------------------------------------------------------------
y = np.log1p(df["price"])
X = df.drop(columns=["price"]).copy()

# ------------------------------------------------------------
# üßπ Fill NA, enforce consistent types
# ------------------------------------------------------------
X = X.fillna(0)
for col in ["brand_name", "category", "unit"]:
    X[col] = X[col].astype(str)

# ============================================================
# ‚ú® PHASE 11 ADDITIONS ‚Äî Encoding & Transformations
# ============================================================

# 1Ô∏è‚É£ Frequency encoding
for col in ["brand_name", "category", "unit"]:
    freq_map = X[col].value_counts().to_dict()
    X[col + "_freq"] = X[col].map(freq_map)

# 2Ô∏è‚É£ Log transform skewed numeric features
for col in ["desc_char_count", "desc_word_count", "total_text_length", "flesch_grade"]:
    X[col + "_log"] = np.log1p(X[col])

# 3Ô∏è‚É£ Target encoding (per category & brand)
for col in ["brand_name", "category"]:
    mean_map = df.groupby(col)["price"].mean().to_dict()
    X[col + "_te"] = X[col].map(mean_map)

# 4Ô∏è‚É£ Ratio / interaction features
X["word_char_ratio"] = (X["desc_word_count"] + 1) / (X["desc_char_count"] + 1)
X["words_per_bullet"] = (X["desc_word_count"] + 1) / (X["bullet_count"] + 1)
X["text_density"] = (X["total_text_length"] + 1) / (X["flesch_grade"] + 2)

# 5Ô∏è‚É£ Brand-category combination
X["brand_category"] = X["brand_name"] + "_" + X["category"]
le = LabelEncoder()
X["brand_category"] = le.fit_transform(X["brand_category"])

# 6Ô∏è‚É£ Scale continuous features
scale_cols = [c for c in X.select_dtypes(include=[np.number]).columns if X[c].nunique() > 10]
scaler = StandardScaler()
X[scale_cols] = scaler.fit_transform(X[scale_cols])

# ============================================================
# ‚úÇÔ∏è Train-validation split
# ============================================================
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# ============================================================
# ‚öôÔ∏è SMAPE Metric
# ============================================================
def smape(y_true, y_pred):
    return np.mean(200 * np.abs(y_pred - y_true) /
                   (np.abs(y_true) + np.abs(y_pred) + 1e-8))

# ============================================================
# ‚úÖ LightGBM
# ============================================================
print("\nüöÄ Training LightGBM ...")
lgb_params = dict(
    objective="regression",
    device="gpu",
    learning_rate=0.05,
    n_estimators=2000,
    num_leaves=256,
    max_depth=14,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    reg_alpha=0.3,
    reg_lambda=1.0,
    min_child_samples=20,
    random_state=42,
    verbosity=-1,
)
non_obj = [c for c in X_train.columns if X_train[c].dtype != "object"]

lgb_model = lgb.LGBMRegressor(**lgb_params)
lgb_model.fit(X_train[non_obj], y_train,
              eval_set=[(X_val[non_obj], y_val)],
              eval_metric="l1",
              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)])

lgb_pred = np.expm1(lgb_model.predict(X_val[non_obj]))
smape_lgb = smape(np.expm1(y_val), lgb_pred)
print(f"‚úÖ LightGBM SMAPE: {smape_lgb:.2f}%")

# ============================================================
# ‚úÖ XGBoost
# ============================================================
print("\nüöÄ Training XGBoost ...")
xgb_params = dict(
    tree_method="gpu_hist",
    objective="reg:squarederror",
    eval_metric="mae",
    learning_rate=0.05,
    max_depth=12,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=1.2,
    min_child_weight=3,
    gamma=0.2,
    seed=42,
)
xgb_model = xgb.XGBRegressor(**xgb_params, n_estimators=2000)
xgb_model.fit(X_train[non_obj], y_train,
              eval_set=[(X_val[non_obj], y_val)],
              early_stopping_rounds=100,
              verbose=200)
xgb_pred = np.expm1(xgb_model.predict(X_val[non_obj]))
smape_xgb = smape(np.expm1(y_val), xgb_pred)
print(f"‚úÖ XGBoost SMAPE: {smape_xgb:.2f}%")

# ============================================================
# ‚úÖ CatBoost (Robust Version ‚Äî final fix)
# ============================================================
print("\nüöÄ Training CatBoost ...")

# Explicit lists
cat_features = ["brand_name", "category", "unit"]
text_features = ["item_name", "bullet_points", "product_description"]

# Ensure text columns are strings
for col in text_features:
    if col in X_train.columns:
        X_train[col] = X_train[col].astype(str)
        X_val[col] = X_val[col].astype(str)

# Drop encoded/text-mixed object columns (if any)
non_declared_objs = [
    c for c in X_train.columns
    if (X_train[c].dtype == "object") and (c not in cat_features + text_features)
]
if non_declared_objs:
    print(f"üßπ Dropping non-declared object columns: {non_declared_objs}")
    X_train = X_train.drop(columns=non_declared_objs)
    X_val = X_val.drop(columns=non_declared_objs)

# Create Pools cleanly
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_features,
    text_features=text_features
)
val_pool = Pool(
    data=X_val,
    label=y_val,
    cat_features=cat_features,
    text_features=text_features
)

cat_model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.05,
    depth=12,
    l2_leaf_reg=6,
    loss_function="MAE",
    eval_metric="MAE",
    task_type="GPU",
    random_seed=42,
    verbose=200
)

cat_model.fit(train_pool, eval_set=val_pool)

cat_pred = np.expm1(cat_model.predict(X_val))
smape_cat = smape(np.expm1(y_val), cat_pred)
print(f"‚úÖ CatBoost SMAPE: {smape_cat:.2f}%")


# ============================================================
# üéØ Optimized Ensemble
# ============================================================
print("\nüîÆ Optimizing blend weights ...")
stack = np.vstack([lgb_pred, xgb_pred, cat_pred])

def smape_loss(w):
    pred = np.dot(w, stack)
    return smape(np.expm1(y_val), pred)

cons = {"type": "eq", "fun": lambda w: np.sum(w) - 1}
bounds = [(0,1)] * 3
res = minimize(smape_loss, [0.33,0.33,0.34], bounds=bounds, constraints=cons)
best_w = res.x / np.sum(res.x)
blend_pred = np.dot(best_w, stack)
blend_smape = smape(np.expm1(y_val), blend_pred)
print(f"üéØ Optimal Weights: {best_w.round(3)}")
print(f"üèÜ Optimized Blend SMAPE: {blend_smape:.2f}%")

# ============================================================
# üèÅ Comparison Table
# ============================================================
summary = pd.DataFrame({
    "Model": ["LightGBM", "XGBoost", "CatBoost", "Optimized Blend"],
    "Validation_SMAPE": [smape_lgb, smape_xgb, smape_cat, blend_smape],
}).sort_values("Validation_SMAPE")

print("\nüèÅ Final SMAPE Comparison:")
display(summary)


‚úÖ Data Loaded | Shape: (75000, 53)


  result = getattr(ufunc, method)(*inputs, **kwargs)



üöÄ Training LightGBM ...




Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 0.298867	valid_0's l2: 0.241563
[400]	valid_0's l1: 0.29821	valid_0's l2: 0.239823
Early stopping, best iteration is:
[374]	valid_0's l1: 0.298048	valid_0's l2: 0.239699
‚úÖ LightGBM SMAPE: 30.57%

üöÄ Training XGBoost ...



    E.g. tree_method = "hist", device = "cuda"



[0]	validation_0-mae:0.74623
[200]	validation_0-mae:0.30376
[400]	validation_0-mae:0.30337
[600]	validation_0-mae:0.30316
[799]	validation_0-mae:0.30306
‚úÖ XGBoost SMAPE: 31.04%

üöÄ Training CatBoost ...
üßπ Dropping non-declared object columns: ['product_type', 'item_name_clean', 'bullet_points_clean']



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.7471094	test: 0.7593415	best: 0.7593415 (0)	total: 308ms	remaining: 7m 41s
200:	learn: 0.2788822	test: 0.3000354	best: 0.3000354 (200)	total: 16.4s	remaining: 1m 45s
400:	learn: 0.2557142	test: 0.2946588	best: 0.2946588 (400)	total: 31.2s	remaining: 1m 25s
600:	learn: 0.2404301	test: 0.2932966	best: 0.2932966 (600)	total: 45.7s	remaining: 1m 8s
800:	learn: 0.2152614	test: 0.2939593	best: 0.2929553 (670)	total: 1m 5s	remaining: 56.7s
1000:	learn: 0.1941021	test: 0.2956187	best: 0.2929553 (670)	total: 1m 24s	remaining: 42.3s
1200:	learn: 0.1777014	test: 0.2967512	best: 0.2929553 (670)	total: 1m 43s	remaining: 25.9s
1400:	learn: 0.1637891	test: 0.2979410	best: 0.2929553 (670)	total: 2m 2s	remaining: 8.69s
1499:	learn: 0.1571803	test: 0.2984485	best: 0.2929553 (670)	total: 2m 12s	remaining: 0us
bestTest = 0.2929553385
bestIteration = 670
Shrink model to first 671 iterations.
‚úÖ CatBoost SMAPE: 29.99%

üîÆ Optimizing blend weights ...
üéØ Optimal Weights: [0.295 0.114 0.591]


Unnamed: 0,Model,Validation_SMAPE
3,Optimized Blend,29.53676
2,CatBoost,29.99476
0,LightGBM,30.571158
1,XGBoost,31.037553
