In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/e5-embedd/train_e5_embeddings.npy
/kaggle/input/e5-embedd/test_e5_embeddings.npy
/kaggle/input/e5-embedd/train.csv
/kaggle/input/e5-embedd/test.csv


In [5]:
!pip install catboost --quiet
!pip install XGBoost --quiet
!pip install LightGBM --quiet
!pip install optuna -q


In [3]:
import pandas as pd

# --- Load training and test data ---
train_df = pd.read_csv("/kaggle/input/e5-embedd/train.csv")   # replace with correct path if needed
test_df  = pd.read_csv("/kaggle/input/e5-embedd/test.csv")    # replace with correct path if needed

# --- Define target and ID columns ---
target_col = "price"       # target variable
id_col     = "sample_id"   # unique ID column

print("✅ Data loaded successfully!")
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Target column: {target_col}")
print(f"ID column: {id_col}")


✅ Data loaded successfully!
Train shape: (75000, 4)
Test shape: (75000, 3)
Target column: price
ID column: sample_id


In [11]:
# ===================================================================
# MAXIMUM SMAPE REDUCTION: ADVANCED FEATURE ENGINEERING +
# QUANTILE REGRESSION + PSEUDO-LABELING + STACKING
# OPTIMIZED VERSION: 80/20 TRAIN-TEST SPLIT (No K-Fold)
# Target: Reduce SMAPE from 56 to ~40
# ===================================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Ridge, QuantileRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.stats import skew, kurtosis
import time
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("🚀 ADVANCED PRICE PREDICTION PIPELINE - MAXIMUM SMAPE REDUCTION")
print("⚡ FAST MODE: 80/20 TRAIN-TEST SPLIT (No K-Fold)")
print("="*80)

# -----------------------------
# Define pseudo-labeling function BEFORE use
# -----------------------------
def pseudo_label_test_data(train_X, train_y, test_X, confidence_threshold=0.80):
    """Generate high-confidence pseudo-labels for test data"""
    
    quick_models = [
        LGBMRegressor(n_estimators=500, learning_rate=0.05, verbose=-1, random_state=42),
        XGBRegressor(n_estimators=500, learning_rate=0.05, verbosity=0, random_state=42)
    ]
    
    test_preds = []
    for model in quick_models:
        model.fit(train_X, train_y)
        test_preds.append(model.predict(test_X))
    
    test_pred_mean = np.mean(test_preds, axis=0)
    test_pred_std = np.std(test_preds, axis=0)
    
    confidence = 1 / (1 + test_pred_std)
    high_conf_mask = confidence > np.quantile(confidence, confidence_threshold)
    
    pseudo_X = test_X[high_conf_mask]
    pseudo_y = test_pred_mean[high_conf_mask]
    
    print(f"  ✓ Generated {len(pseudo_y)} pseudo-labels ({len(pseudo_y)/len(test_X)*100:.1f}% of test data)")
    print(f"  ✓ Mean confidence: {confidence[high_conf_mask].mean():.3f}")
    
    return pseudo_X, pseudo_y

# -----------------------------
# 1. LOAD AND PREPARE BASE FEATURES
# -----------------------------
print("\n[Step 1/6] Loading embeddings and base features...")
train_emb_file = "/kaggle/input/e5-embedd/train_e5_embeddings.npy"
test_emb_file  = "/kaggle/input/e5-embedd/test_e5_embeddings.npy"

train_embeddings = np.load(train_emb_file)
test_embeddings  = np.load(test_emb_file)

train_embedding_df = pd.DataFrame(train_embeddings, columns=[f'e5_emb_{i}' for i in range(train_embeddings.shape[1])])
test_embedding_df  = pd.DataFrame(test_embeddings,  columns=[f'e5_emb_{i}' for i in range(test_embeddings.shape[1])])

feature_cols = [col for col in train_df.columns if any(
    suffix in col for suffix in ['_char_count', '_word_count', '_avg_word_length', 
                                 '_sentence_count', '_uppercase_ratio', '_digit_ratio',
                                 '_special_char_ratio', '_unique_word_ratio']
)]

print(f"✓ Loaded {train_embeddings.shape[1]} embedding dimensions")
print(f"✓ Loaded {len(feature_cols)} text statistics features")

# -----------------------------
# 2. ADVANCED FEATURE ENGINEERING
# -----------------------------
print("\n[Step 2/6] Creating advanced engineered features...")

def create_advanced_features(df, embedding_df, feature_cols):
    features_list = []
    features_list.append(embedding_df)
    base_features = df[feature_cols].copy()
    features_list.append(base_features)
    
    emb_array = embedding_df.values
    stat_features = pd.DataFrame({
        'emb_mean': np.mean(emb_array, axis=1),
        'emb_std': np.std(emb_array, axis=1),
        'emb_max': np.max(emb_array, axis=1),
        'emb_min': np.min(emb_array, axis=1),
        'emb_median': np.median(emb_array, axis=1),
        'emb_range': np.max(emb_array, axis=1) - np.min(emb_array, axis=1),
        'emb_skew': [skew(row) for row in emb_array],
        'emb_kurtosis': [kurtosis(row) for row in emb_array]
    })
    features_list.append(stat_features)
    
    if len(feature_cols) > 0:
        text_array = base_features.values
        interaction_features = pd.DataFrame({
            'text_char_word_ratio': df[feature_cols[0]] / (df[feature_cols[1]] + 1) if len(feature_cols) > 1 else 0,
            'text_complexity_score': df[feature_cols[2]] * df[feature_cols[7]] if len(feature_cols) > 7 else 0,
            'text_feature_sum': np.sum(text_array, axis=1),
            'text_feature_mean': np.mean(text_array, axis=1),
            'text_feature_std': np.std(text_array, axis=1)
        })
        features_list.append(interaction_features)
    
    if len(feature_cols) > 0:
        poly_features = pd.DataFrame({
            'char_count_squared': df[feature_cols[0]] ** 2 if len(feature_cols) > 0 else 0,
            'word_count_squared': df[feature_cols[1]] ** 2 if len(feature_cols) > 1 else 0,
            'char_word_interaction': df[feature_cols[0]] * df[feature_cols[1]] if len(feature_cols) > 1 else 0
        })
        features_list.append(poly_features)
    
    emb_pca_proxy = pd.DataFrame({
        'emb_first_10_mean': np.mean(emb_array[:, :10], axis=1),
        'emb_last_10_mean': np.mean(emb_array[:, -10:], axis=1),
        'emb_first_last_diff': np.mean(emb_array[:, :10], axis=1) - np.mean(emb_array[:, -10:], axis=1)
    })
    features_list.append(emb_pca_proxy)
    
    combined_features = pd.concat(features_list, axis=1)
    return combined_features

train_features = create_advanced_features(train_df, train_embedding_df, feature_cols)
test_features = create_advanced_features(test_df, test_embedding_df, feature_cols)

print(f"✓ Created {train_features.shape[1]} total features")

train_features = train_features.fillna(0).replace([np.inf, -np.inf], 0)
test_features = test_features.fillna(0).replace([np.inf, -np.inf], 0)

scaler = RobustScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features)

# -----------------------------
# 3. TARGET PREPARATION
# -----------------------------
y_orig = train_df[target_col].values.astype(float)
y_log = np.log1p(y_orig)

def smape_original_scale(y_true_orig, y_pred_orig):
    denom = (np.abs(y_true_orig) + np.abs(y_pred_orig))
    diff = np.abs(y_true_orig - y_pred_orig) / np.where(denom == 0, 1, denom)
    return 100.0 * np.mean(diff)

# -----------------------------
# 4. OPTIMIZED MODEL PARAMETERS
# -----------------------------
print("\n[Step 3/6] Initializing optimized models...")

xgb_params = {
    'n_estimators': 3000,
    'learning_rate': 0.04,
    'max_depth': 7,
    'min_child_weight': 3,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'reg_alpha': 2.0,
    'reg_lambda': 4.0,
    'gamma': 0.15,
    'tree_method': 'hist',
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 0
}

lgb_params = {
    'n_estimators': 3000,
    'learning_rate': 0.04,
    'num_leaves': 120,
    'max_depth': 9,
    'min_data_in_leaf': 25,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'lambda_l1': 1.5,
    'lambda_l2': 3.0,
    'objective': 'regression_l1',
    'metric': 'rmse',
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1
}

cat_params = {
    'iterations': 3000,
    'depth': 9,
    'learning_rate': 0.035,
    'l2_leaf_reg': 6.0,
    'subsample': 0.85,
    'rsm': 0.85,
    'random_seed': 42,
    'loss_function': 'RMSE',
    'bootstrap_type': 'Bernoulli',
    'grow_policy': 'Lossguide',
    'eval_metric': 'RMSE',
    'verbose': 0
}

print("✓ Models configured with aggressive hyperparameters")

# -----------------------------
# 5. SPLIT TRAINING DATA (80/20)
# -----------------------------
print("\n[Step 4/6] Splitting training data (80/20)...")
X_train, X_val, y_train_log, y_val_log, y_train_orig, y_val_orig = train_test_split(
    train_features_scaled, y_log, y_orig, test_size=0.20, random_state=42
)
print(f"✓ Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

# -----------------------------
# 6. PSEUDO-LABELING ON TEST DATA
# -----------------------------
print("\n[Step 5/6] Applying pseudo-labeling for semi-supervised learning...")
pseudo_X, pseudo_y = pseudo_label_test_data(X_train, y_train_log, test_features_scaled, confidence_threshold=0.75)

# Augment training data
train_features_augmented = np.vstack([X_train, pseudo_X])
y_log_augmented = np.concatenate([y_train_log, pseudo_y])
print(f"✓ Augmented training size: {len(train_features_augmented)} (from {len(X_train)})")

# -----------------------------
# 7. TRAIN BASE MODELS AND STACKING META-LEARNERS
# -----------------------------
print("\n[Step 6/6] Training base models and stacking meta-learners...")

base_models = [
    ('XGBoost', XGBRegressor(**xgb_params)),
    ('LightGBM', LGBMRegressor(**lgb_params)),
    ('CatBoost', CatBoostRegressor(**cat_params))
]

meta_train = np.zeros((len(train_features_augmented), len(base_models)))
meta_val = np.zeros((len(X_val), len(base_models)))
meta_test = np.zeros((len(test_features_scaled), len(base_models)))

for i, (name, model) in enumerate(base_models):
    print(f"  🔧 Training {name}...", end=' ')
    model_start = time.time()
    try:
        model.fit(train_features_augmented, y_log_augmented,
                  eval_set=[(X_val, y_val_log)], early_stopping_rounds=200, verbose=False)
    except:
        model.fit(train_features_augmented, y_log_augmented)
    meta_train[:, i] = model.predict(train_features_augmented)
    meta_val[:, i] = model.predict(X_val)
    meta_test[:, i] = model.predict(test_features_scaled)
    
    val_pred_orig = np.expm1(meta_val[:, i])
    model_smape = smape_original_scale(y_val_orig, val_pred_orig)
    print(f"SMAPE: {model_smape:.4f} ({time.time()-model_start:.1f}s)")

print(f"  🎯 Training Quantile Regression Meta-Learner...", end=' ')
quantile_model = QuantileRegressor(quantile=0.5, alpha=1.0, solver='highs')
quantile_model.fit(meta_train, y_log_augmented)

ridge_model = Ridge(alpha=1.0, random_state=42)
ridge_model.fit(meta_train, y_log_augmented)
print("✓")

# Blend meta predictions on validation and test sets
qr_val_pred = quantile_model.predict(meta_val)
ridge_val_pred = ridge_model.predict(meta_val)
final_val_pred_log = 0.6 * qr_val_pred + 0.4 * ridge_val_pred
val_pred_orig_final = np.expm1(final_val_pred_log)
final_val_smape = smape_original_scale(y_val_orig, val_pred_orig_final)

print(f"\n🎉 Final Validation SMAPE after stacking: {final_val_smape:.4f}")

test_pred_log = 0.6 * quantile_model.predict(meta_test) + 0.4 * ridge_model.predict(meta_test)
test_pred_orig = np.expm1(test_pred_log)
test_pred_orig = np.clip(test_pred_orig, np.percentile(y_orig, 1), np.percentile(y_orig, 99))

# -----------------------------
# 8. SAVE SUBMISSION
# -----------------------------
submission = pd.DataFrame({
    "sample_id": test_df[id_col],
    "price": test_pred_orig
})[["sample_id", "price"]]

out_fname = "submission_advanced_optimized_train_test_split.csv"
submission.to_csv(out_fname, index=False)

print(f"\n✅ Submission saved as '{out_fname}'")
print("\n📄 Sample predictions:")
print(submission)


🚀 ADVANCED PRICE PREDICTION PIPELINE - MAXIMUM SMAPE REDUCTION
⚡ FAST MODE: 80/20 TRAIN-TEST SPLIT (No K-Fold)

[Step 1/6] Loading embeddings and base features...
✓ Loaded 1024 embedding dimensions
✓ Loaded 0 text statistics features

[Step 2/6] Creating advanced engineered features...
✓ Created 1035 total features

[Step 3/6] Initializing optimized models...
✓ Models configured with aggressive hyperparameters

[Step 4/6] Splitting training data (80/20)...
✓ Training samples: 60000, Validation samples: 15000

[Step 5/6] Applying pseudo-labeling for semi-supervised learning...
  ✓ Generated 18750 pseudo-labels (25.0% of test data)
  ✓ Mean confidence: 0.990
✓ Augmented training size: 78750 (from 60000)

[Step 6/6] Training base models and stacking meta-learners...
  🔧 Training XGBoost... SMAPE: 28.2934 (1514.1s)
  🔧 Training LightGBM... SMAPE: 29.6171 (1182.1s)
  🔧 Training CatBoost... SMAPE: 28.7048 (2613.4s)
  🎯 Training Quantile Regression Meta-Learner... ✓

🎉 Final Validation SMAPE 