# In [None]:
import os
import re
import gc
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_absolute_error
from scipy.stats import skew, kurtosis
from scipy.optimize import nnls

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from lightgbm import early_stopping, log_evaluation
from catboost import CatBoostRegressor

# =============================================================================
# CONFIG
# =============================================================================
# Root folder for every input and output file of this script.
BASE_PATH = "/content/drive/MyDrive/images"
TRAIN_CSV = f"{BASE_PATH}/local_datasets/train_local.csv"
TEST_CSV = f"{BASE_PATH}/local_datasets/test_local.csv"

# Precomputed text embeddings (one row per CSV row is assumed - TODO confirm).
# NOTE(review): this comment used to read "Nomic embeddings (768 dims)", but the
# file names below end in "clip512"; confirm which model/dimension is on disk.
TRAIN_TEXT_EMB = f"{BASE_PATH}/text_embeddings/train_text_embeddings_clip512.npy"
TEST_TEXT_EMB = f"{BASE_PATH}/text_embeddings/test_text_embeddings_clip512.npy"

# CLIP image embeddings
TRAIN_IMG_EMB = f"{BASE_PATH}/image_embeddings/train_image_embeddings.npy"
TEST_IMG_EMB = f"{BASE_PATH}/image_embeddings/test_image_embeddings.npy"

# Destination of the final submission file.
OUTPUT_CSV = f"{BASE_PATH}/predictions/submission_final.csv"

SEED = 42  # random_state shared by the KFold splits, PCA/SVD and the boosters
N_FOLDS = 7  # number of cross-validation folds used for model training
USE_GPU = True  # switches the boosters between their GPU and CPU settings

print("="*80)
print("Amazon ML Challenge - Advanced Pipeline")
print("="*80)

# =============================================================================
# 1. LOAD DATA
# =============================================================================
print("\n[1/12] Loading data...")
# Tabular data: both frames carry 'catalog_content'; train also carries 'price'.
train_df, test_df = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

print(f"Train: {train_df.shape}, Test: {test_df.shape}")
print(f"Price stats - Min: ${train_df['price'].min():.2f}, Max: ${train_df['price'].max():.2f}, "
      f"Median: ${train_df['price'].median():.2f}")

# Precomputed embedding matrices, loaded in a fixed order and cast to float32.
X_text_train, X_text_test, X_img_train, X_img_test = (
    np.load(emb_path).astype(np.float32)
    for emb_path in (TRAIN_TEXT_EMB, TEST_TEXT_EMB, TRAIN_IMG_EMB, TEST_IMG_EMB)
)

print(f"Text embeddings: {X_text_train.shape}")
print(f"Image embeddings: {X_img_train.shape}")

# =============================================================================
# 2. ADVANCED CATALOG PARSING
# =============================================================================
print("\n[2/12] Advanced catalog parsing...")

# "Value: <number>" on one line followed by "Unit: <word>" on the next one.
_VALUE_UNIT_RE = re.compile(r'Value:\s*([\d\.]+)\s*\n\s*Unit:\s*(\w+)', re.IGNORECASE)


def extract_value_and_unit(text):
    """Extract the "Value:" / "Unit:" pair from a catalog entry.

    Args:
        text: Catalog text; any object is accepted and passed through ``str``.

    Returns:
        ``(value, unit)`` where ``value`` is a float and ``unit`` the
        lower-cased unit word, or ``(None, None)`` when the pair is absent or
        the value is not a valid number (e.g. "1.2.3").
    """
    match = _VALUE_UNIT_RE.search(str(text))
    if match is None:
        return None, None
    try:
        return float(match.group(1)), match.group(2).lower()
    except ValueError:
        # The character class also admits strings such as "1.2.3" or ".";
        # treat those as "no value" rather than hiding every possible error.
        return None, None

# Pack-count phrasings: "12 pack", "12-pack", "pack of 12", "12 count", "12 ct".
_PACK_PATTERNS = (
    r'(\d+)[\s-]*pack',
    r'pack\s+of\s+(\d+)',
    r'(\d+)[\s-]*count',
    r'(\d+)\s*ct\b',
)

# Multipack: "6 x 500ml", "12 x 16oz", "6 x 2 lb". The trailing \b stops a
# bare "l" or "g" from matching the first letter of "lb", "large", "gallon"...
_MULTIPACK_RE = re.compile(
    r'(\d+)\s*[x×]\s*(\d+\.?\d*)\s*'
    r'(ml|oz|ounce|kg|lb|pound|gram|gr|gm|g|liter|litre|l)s?\b'
)

# Spellings accepted in a multipack, mapped to the short 'unit_type' code.
_MULTIPACK_UNIT_CODES = {
    'ml': 'ml',
    'oz': 'oz', 'ounce': 'oz',
    'kg': 'kg',
    'lb': 'lb', 'pound': 'lb',
    'gram': 'g', 'gr': 'g', 'gm': 'g', 'g': 'g',
    'liter': 'l', 'litre': 'l', 'l': 'l',
}

_SINGLE_UNIT_RE = re.compile(
    r'(\d+\.?\d*)\s*(oz|ounce|ml|liter|litre|gram|kg|lb|pound)s?\b'
)

# Grams per unit, keyed by every 'unit_type' code this function can emit
# (single units are truncated to two letters, hence 'ou', 'gr', 'li', 'po').
# Millilitres are approximated as grams.
_GRAMS_PER_UNIT = {
    'oz': 28.35, 'ou': 28.35,
    'ml': 1.0,
    'g': 1.0, 'gr': 1.0,
    'kg': 1000.0,
    'l': 1000.0, 'li': 1000.0,
    'lb': 453.592, 'po': 453.592,
}


def extract_quantity_info(text):
    """Extract pack size, multipack layout and an estimated total weight.

    Args:
        text: Catalog text; any object is accepted and passed through ``str``.

    Returns:
        dict with the keys ``pack_size``, ``is_multipack``,
        ``multipack_count``, ``unit_size``, ``unit_type`` and
        ``total_grams_estimate`` (0 when no convertible unit was found).
    """
    text_lower = str(text).lower()
    features = {}

    # Largest count mentioned by any of the pack phrasings (1 if none match).
    pack_size = 1
    for pattern in _PACK_PATTERNS:
        match = re.search(pattern, text_lower)
        if match:
            pack_size = max(pack_size, int(match.group(1)))
    features['pack_size'] = pack_size

    multipack = _MULTIPACK_RE.search(text_lower)
    if multipack:
        features['is_multipack'] = 1
        features['multipack_count'] = int(multipack.group(1))
        features['unit_size'] = float(multipack.group(2))
        features['unit_type'] = _MULTIPACK_UNIT_CODES[multipack.group(3)]
    else:
        features['is_multipack'] = 0
        features['multipack_count'] = 1

        # Single unit extraction
        unit_match = _SINGLE_UNIT_RE.search(text_lower)
        if unit_match:
            features['unit_size'] = float(unit_match.group(1))
            features['unit_type'] = unit_match.group(2)[:2]  # two-letter code
        else:
            features['unit_size'] = 0
            features['unit_type'] = 'count'

    # Convert everything to grams so different units become comparable.
    grams_per_unit = _GRAMS_PER_UNIT.get(features['unit_type'])
    if grams_per_unit is None:
        total_grams = 0
    else:
        total_grams = (features['unit_size'] * grams_per_unit
                       * features['multipack_count'] * pack_size)
    features['total_grams_estimate'] = total_grams

    return features

def parse_catalog_comprehensive(text):
    """Turn one catalog entry into a flat dict of hand-crafted features.

    The result holds text statistics, structural counts, the parsed
    Value/Unit pair, the quantity features from ``extract_quantity_info``,
    a cleaned ``brand`` string, keyword flags and statistics over every
    number found in the text. Keys are always inserted in the same order.
    """
    raw = str(text)
    lowered = raw.lower()

    def mentions_any(words):
        """Return 1 if any of ``words`` occurs as a substring, else 0."""
        return int(any(word in lowered for word in words))

    # Basic text stats followed by structural markers.
    features = {
        'text_len': len(raw),
        'word_count': len(raw.split()),
        'digit_count': sum(ch.isdigit() for ch in raw),
        'upper_count': sum(ch.isupper() for ch in raw),
        'special_count': sum(not (ch.isalnum() or ch.isspace()) for ch in raw),
        'bullet_count': lowered.count('bullet point'),
        'has_description': int('product description:' in lowered),
        'has_brand': int('brand:' in lowered),
        'newline_count': raw.count('\n'),
        'colon_count': raw.count(':'),
    }

    # Value / Unit pair; unknown units are encoded as 0.
    unit_codes = {'kilogram': 1, 'pound': 2, 'ounce': 3, 'gram': 4,
                  'ton': 5, 'count': 6, 'milliliter': 7, 'liter': 8}
    value, unit = extract_value_and_unit(raw)
    found_value = value is not None
    features['value_extracted'] = value if found_value else 0
    features['has_value'] = int(found_value)
    features['unit_encoded'] = unit_codes.get(unit, 0) if found_value else 0

    # Quantity / pack information.
    features.update(extract_quantity_info(raw))

    # Brand: explicit "Brand:" field first, else first word of "Item Name:".
    brand_hit = re.search(r'Brand:\s*([^\n]+)', raw, re.IGNORECASE)
    if brand_hit:
        brand_raw = brand_hit.group(1).strip()[:30]
    else:
        name_hit = re.search(r'Item Name:\s*([^\n]+)', raw, re.IGNORECASE)
        name_tokens = name_hit.group(1).strip().split() if name_hit else []
        brand_raw = name_tokens[0] if name_tokens else 'unknown'
    brand_clean = re.sub(r'[^a-zA-Z0-9]', '', brand_raw.lower())
    features['brand'] = brand_clean or 'unknown'

    # Category signals (plain substring matches on the lower-cased text).
    category_keywords = {
        'food': ['food', 'snack', 'candy', 'chocolate', 'chip', 'cookie', 'beverage'],
        'health': ['vitamin', 'supplement', 'protein', 'organic', 'health'],
        'beauty': ['beauty', 'cosmetic', 'shampoo', 'soap', 'lotion', 'cream'],
        'household': ['cleaner', 'detergent', 'paper towel', 'tissue', 'trash'],
        'baby': ['baby', 'infant', 'diaper', 'wipe', 'formula'],
        'pet': ['pet', 'dog', 'cat', 'animal']
    }
    for category, keywords in category_keywords.items():
        features[f'cat_{category}'] = mentions_any(keywords)

    # Premium / quality signals.
    features['premium'] = mentions_any(['premium', 'luxury', 'gourmet', 'deluxe'])
    features['organic'] = mentions_any(['organic', 'natural', 'non-gmo'])
    features['bulk'] = mentions_any(['bulk', 'wholesale', 'case'])

    # Statistics over every number that appears in the text.
    numbers = [float(token) for token in re.findall(r'\d+\.?\d*', raw)]
    features['num_count'] = len(numbers)
    if numbers:
        features['num_max'] = max(numbers)
        features['num_mean'] = np.mean(numbers)
        features['num_std'] = np.std(numbers) if len(numbers) > 1 else 0
    else:
        features['num_max'] = 0
        features['num_mean'] = 0
        features['num_std'] = 0

    return features

# One feature dict per catalog entry, stacked into a frame with a fresh index.
print("Parsing train catalogs...")
cat_train = pd.DataFrame(list(map(parse_catalog_comprehensive, train_df['catalog_content'])))
print("Parsing test catalogs...")
cat_test = pd.DataFrame(list(map(parse_catalog_comprehensive, test_df['catalog_content'])))

# =============================================================================
# 3. TARGET ENCODING FOR BRANDS
# =============================================================================
print("\n[3/12] Target encoding brands...")

def target_encode_kfold(train_series, target, test_series, n_folds=5):
    """K-fold target encoding of a categorical column.

    Train rows receive the target mean and standard deviation of their
    category computed on the other folds only, so a row's own target never
    contributes to its own features. Test rows use statistics from the whole
    training set. Category counts do not depend on the target and are taken
    from the whole training set for both train and test.

    Args:
        train_series: pandas Series of training categories.
        target: pandas Series of training targets, positionally aligned with
            ``train_series``.
        test_series: pandas Series of test categories.
        n_folds: number of folds used for the out-of-fold train encoding.

    Returns:
        ``(train_mean, train_std, train_count, test_mean, test_std,
        test_count)``. ``train_mean`` is a numpy array; the others are pandas
        Series. Unseen categories get the global mean, std 0 and count 0.
    """
    global_mean = target.mean()

    # Positional frame: immune to index mismatches between the two inputs.
    frame = pd.DataFrame({
        'cat': np.asarray(train_series),
        'target': np.asarray(target)
    })

    n_rows = len(frame)
    train_encoded = np.zeros(n_rows)
    train_std_oof = np.zeros(n_rows)
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=SEED)

    for tr_idx, val_idx in kf.split(frame):
        # Statistics from the fitting part of the fold only.
        fold_stats = frame.iloc[tr_idx].groupby('cat')['target'].agg(['mean', 'std'])

        # Encode the held-out part of the fold.
        val_cats = frame['cat'].iloc[val_idx]
        train_encoded[val_idx] = val_cats.map(fold_stats['mean']).fillna(global_mean).values
        # Previously the std came from full-data statistics, which let each
        # row's own target leak into its feature.
        train_std_oof[val_idx] = val_cats.map(fold_stats['std']).fillna(0).values

    # Test encoding (use full train)
    full_stats = frame.groupby('cat')['target'].agg(['mean', 'std', 'count'])

    test_encoded = test_series.map(full_stats['mean']).fillna(global_mean)
    test_std = test_series.map(full_stats['std']).fillna(0)
    test_count = test_series.map(full_stats['count']).fillna(0)

    train_std = pd.Series(train_std_oof, index=train_series.index)
    train_count = train_series.map(full_stats['count']).fillna(0)

    return train_encoded, train_std, train_count, test_encoded, test_std, test_count

# Target-encode the brand string, then replace it with numeric columns.
brand_enc_tr, brand_std_tr, brand_cnt_tr, brand_enc_te, brand_std_te, brand_cnt_te = \
    target_encode_kfold(cat_train['brand'], train_df['price'], cat_test['brand'])

cat_train['brand_target_mean'] = brand_enc_tr
cat_train['brand_target_std'] = brand_std_tr
cat_train['brand_count'] = brand_cnt_tr

cat_test['brand_target_mean'] = brand_enc_te
cat_test['brand_target_std'] = brand_std_te
cat_test['brand_count'] = brand_cnt_te

# Drop string column
cat_train = cat_train.drop(columns=['brand'])
cat_test = cat_test.drop(columns=['brand'])

# Encode unit_type to numeric codes in order of first appearance in train.
if 'unit_type' in cat_train.columns:
    unit_mapping = {unit: code for code, unit in enumerate(cat_train['unit_type'].unique())}
    # Units missing from the mapping get -1; filling with 0 would silently
    # merge them with whichever unit happened to receive code 0.
    cat_train['unit_type'] = cat_train['unit_type'].map(unit_mapping).fillna(-1)
    cat_test['unit_type'] = cat_test['unit_type'].map(unit_mapping).fillna(-1)

# =============================================================================
# 4. EMBEDDING STATISTICAL FEATURES
# =============================================================================
print("\n[4/12] Creating embedding statistical features...")

def create_emb_stats(emb):
    """Summarise each embedding row with ten scalar statistics.

    Args:
        emb: 2-D array; every statistic is taken along axis 1.

    Returns:
        DataFrame with one row per input row and the columns ``mean``,
        ``std``, ``min``, ``max``, ``median``, ``q25``, ``q75``, ``norm``,
        ``skew`` and ``kurt``.
    """
    # Both quartiles in a single pass over the data.
    q25, q75 = np.percentile(emb, [25, 75], axis=1)
    stats = pd.DataFrame({
        'mean': emb.mean(axis=1),
        'std': emb.std(axis=1),
        'min': emb.min(axis=1),
        'max': emb.max(axis=1),
        'median': np.median(emb, axis=1),
        'q25': q25,
        'q75': q75,
        'norm': np.linalg.norm(emb, axis=1),
        # axis=1 keeps the work inside scipy instead of one Python call per row.
        'skew': skew(emb, axis=1),
        'kurt': kurtosis(emb, axis=1)
    })
    return stats

# Row-wise embedding statistics, with column names prefixed by modality.
text_stats_tr = create_emb_stats(X_text_train).add_prefix('text_')
text_stats_te = create_emb_stats(X_text_test).add_prefix('text_')
img_stats_tr = create_emb_stats(X_img_train).add_prefix('img_')
img_stats_te = create_emb_stats(X_img_test).add_prefix('img_')

# =============================================================================
# 5. CROSS-MODAL FEATURES (CRITICAL!)
# =============================================================================
print("\n[5/12] Computing cross-modal text-image features...")

def compute_cross_modal(text_emb, img_emb):
    """Row-wise interaction features between text and image embeddings.

    Args:
        text_emb: 2-D array of text embeddings.
        img_emb: 2-D array of image embeddings with the same shape.

    Returns:
        DataFrame with one row per input row and the columns ``cosine_sim``,
        ``euclidean_dist``, ``manhattan_dist``, ``product_mean/std/max/min``,
        ``diff_mean/std/max``, ``norm_ratio`` and ``correlation``. NaN and
        infinite values are replaced by 0.

    Raises:
        ValueError: if the two arrays do not have the same shape.
    """
    text_emb = np.asarray(text_emb)
    img_emb = np.asarray(img_emb)
    if text_emb.shape != img_emb.shape:
        raise ValueError(
            f"text and image embeddings must share a shape, "
            f"got {text_emb.shape} and {img_emb.shape}"
        )

    # Row norms are needed for both the cosine similarity and the norm ratio.
    text_len = np.linalg.norm(text_emb, axis=1)
    img_len = np.linalg.norm(img_emb, axis=1)
    text_unit = text_emb / (text_len[:, None] + 1e-8)
    img_unit = img_emb / (img_len[:, None] + 1e-8)

    # Each elementwise intermediate is materialised once and reused.
    product = text_emb * img_emb
    diff = text_emb - img_emb
    abs_diff = np.abs(diff)

    # Row-wise Pearson correlation. Constant rows give 0/0 = NaN, which the
    # final fillna turns into 0.
    text_centered = text_emb - text_emb.mean(axis=1, keepdims=True)
    img_centered = img_emb - img_emb.mean(axis=1, keepdims=True)
    with np.errstate(divide='ignore', invalid='ignore'):
        correlation = (text_centered * img_centered).sum(axis=1) / np.sqrt(
            (text_centered ** 2).sum(axis=1) * (img_centered ** 2).sum(axis=1)
        )
    # Rounding can push |r| marginally above 1.
    correlation = np.clip(correlation, -1.0, 1.0)

    features = pd.DataFrame({
        # Cosine similarity
        'cosine_sim': (text_unit * img_unit).sum(axis=1),

        # Euclidean distance
        'euclidean_dist': np.linalg.norm(diff, axis=1),

        # Manhattan distance
        'manhattan_dist': abs_diff.sum(axis=1),

        # Element-wise product stats
        'product_mean': product.mean(axis=1),
        'product_std': product.std(axis=1),
        'product_max': product.max(axis=1),
        'product_min': product.min(axis=1),

        # Difference stats
        'diff_mean': abs_diff.mean(axis=1),
        'diff_std': abs_diff.std(axis=1),
        'diff_max': abs_diff.max(axis=1),

        # Norm ratios
        'norm_ratio': text_len / (img_len + 1e-8),

        # Correlation (row-wise)
        'correlation': correlation
    })

    return features.fillna(0).replace([np.inf, -np.inf], 0)

# Text/image interaction features: train pair first, then test pair.
cross_modal_tr, cross_modal_te = (
    compute_cross_modal(text_part, img_part)
    for text_part, img_part in ((X_text_train, X_img_train), (X_text_test, X_img_test))
)

# =============================================================================
# 6. PCA ON EMBEDDINGS (KEEP MORE DIMS!)
# =============================================================================
print("\n[6/12] PCA on embeddings (keeping 256 dims)...")


def _pca_frame(components, prefix):
    """Wrap a PCA score matrix in a DataFrame with columns <prefix>0..<prefix>N-1."""
    names = [f'{prefix}{k}' for k in range(components.shape[1])]
    return pd.DataFrame(components, columns=names)


# Text embeddings -> 256 components, fitted on train and applied to test.
pca_text = PCA(n_components=256, random_state=SEED)
text_pca_tr = pca_text.fit_transform(X_text_train)
text_pca_te = pca_text.transform(X_text_test)
print(f"Text PCA explained variance: {pca_text.explained_variance_ratio_.sum():.3f}")

# Image embeddings -> 128 components, fitted on train and applied to test.
pca_img = PCA(n_components=128, random_state=SEED)
img_pca_tr = pca_img.fit_transform(X_img_train)
img_pca_te = pca_img.transform(X_img_test)
print(f"Image PCA explained variance: {pca_img.explained_variance_ratio_.sum():.3f}")

text_pca_df_tr = _pca_frame(text_pca_tr, 'text_pca')
text_pca_df_te = _pca_frame(text_pca_te, 'text_pca')

img_pca_df_tr = _pca_frame(img_pca_tr, 'img_pca')
img_pca_df_te = _pca_frame(img_pca_te, 'img_pca')

# =============================================================================
# 7. KNN PRICE FEATURES
# =============================================================================
print("\n[7/12] Computing KNN neighborhood price features...")

# Use combined embedding space
combined_tr = np.hstack([text_pca_tr[:, :64], img_pca_tr[:, :64]])  # 128 dims for KNN
combined_te = np.hstack([text_pca_te[:, :64], img_pca_te[:, :64]])

K = 20
# K+1 neighbours are fetched so each training row can drop itself.
nbrs = NearestNeighbors(n_neighbors=K+1, n_jobs=-1)
nbrs.fit(combined_tr)

train_prices = train_df['price'].to_numpy()


def _knn_price_frame(neighbour_prices, neighbour_dist):
    """Aggregate per-row neighbour prices and distances into KNN features."""
    price_min = neighbour_prices.min(axis=1)
    price_max = neighbour_prices.max(axis=1)
    return pd.DataFrame({
        'knn_price_mean': neighbour_prices.mean(axis=1),
        'knn_price_median': np.median(neighbour_prices, axis=1),
        'knn_price_std': neighbour_prices.std(axis=1),
        'knn_price_min': price_min,
        'knn_price_max': price_max,
        'knn_price_range': price_max - price_min,
        'knn_dist_mean': neighbour_dist.mean(axis=1),
        'knn_dist_min': neighbour_dist.min(axis=1)
    })


# Train KNN: remove each row from its own neighbour list. With duplicate
# embeddings the row itself is not guaranteed to come back in column 0, so it
# is located by index; if it is absent altogether the farthest neighbour is
# dropped instead, which keeps exactly K neighbours per row.
# NOTE(review): neighbours may belong to any CV fold, so these features still
# use targets from rows outside the fitting folds.
dist_tr, idx_tr = nbrs.kneighbors(combined_tr)
is_self = idx_tr == np.arange(len(idx_tr))[:, None]
is_self[~is_self.any(axis=1), -1] = True
keep = ~is_self
nbr_idx_tr = idx_tr[keep].reshape(len(idx_tr), K)
nbr_dist_tr = dist_tr[keep].reshape(len(idx_tr), K)
knn_prices_tr = train_prices[nbr_idx_tr]

knn_tr = _knn_price_frame(knn_prices_tr, nbr_dist_tr)

# Test KNN: ask for exactly K neighbours so the aggregates are computed over
# the same neighbourhood size as for the training rows (previously K+1).
dist_te, idx_te = nbrs.kneighbors(combined_te, n_neighbors=K)
knn_prices_te = train_prices[idx_te]

knn_te = _knn_price_frame(knn_prices_te, dist_te)

# =============================================================================
# 8. TFIDF FEATURES
# =============================================================================
print("\n[8/12] TF-IDF features...")

# Missing catalog text is treated as an empty document.
train_docs = train_df['catalog_content'].fillna('')
test_docs = test_df['catalog_content'].fillna('')

# Uni- and bigram TF-IDF, vocabulary learnt on train only.
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), min_df=3)
tfidf_tr = tfidf.fit_transform(train_docs)
tfidf_te = tfidf.transform(test_docs)

# Compress the sparse matrix to 50 dense components.
svd = TruncatedSVD(n_components=50, random_state=SEED)
tfidf_svd_tr = svd.fit_transform(tfidf_tr)
tfidf_svd_te = svd.transform(tfidf_te)

tfidf_columns = [f'tfidf{k}' for k in range(50)]
tfidf_df_tr = pd.DataFrame(tfidf_svd_tr, columns=tfidf_columns)
tfidf_df_te = pd.DataFrame(tfidf_svd_te, columns=tfidf_columns)

# =============================================================================
# 9. COMBINE ALL FEATURES
# =============================================================================
print("\n[9/12] Combining all features...")

# Feature groups are concatenated column-wise in this fixed order.
train_parts = [cat_train, text_stats_tr, img_stats_tr, cross_modal_tr,
               text_pca_df_tr, img_pca_df_tr, knn_tr, tfidf_df_tr]
test_parts = [cat_test, text_stats_te, img_stats_te, cross_modal_te,
              text_pca_df_te, img_pca_df_te, knn_te, tfidf_df_te]

X_train = pd.concat([part.reset_index(drop=True) for part in train_parts], axis=1)
X_test = pd.concat([part.reset_index(drop=True) for part in test_parts], axis=1)

print(f"Feature matrix: Train {X_train.shape}, Test {X_test.shape}")

# Clean data: infinities and missing values both become 0.
X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0)
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)

# Align columns: add train-only columns to test as zeros, then adopt the
# train column order (test-only columns are dropped by the selection).
missing_in_test = [col for col in X_train.columns if col not in X_test.columns]
for col in missing_in_test:
    X_test[col] = 0
X_test = X_test[X_train.columns]

y = train_df['price'].values

# Light outlier removal: keep prices between the 0.5th and 99.5th percentile.
q01, q99 = np.percentile(y, [0.5, 99.5])
mask = (y >= q01) & (y <= q99)
n_removed = int((~mask).sum())
X_train = X_train[mask].reset_index(drop=True)
y = y[mask]
print(f"Removed {n_removed} outliers, keeping {len(y)} samples")

# =============================================================================
# 10. SCALE FEATURES
# =============================================================================
print("\n[10/12] Scaling features...")

# Scaling statistics come from the training matrix only and are reused for test.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =============================================================================
# 11. TRAIN MODELS - FIXED VERSION
# =============================================================================
print("\n[11/12] Training ensemble models...")

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (0-200).

    Args:
        y_true: array-like of true values (arrays, lists or scalars).
        y_pred: array-like of predictions with the same shape.

    Returns:
        ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``.
        Positions where both values are 0 contribute 0 to the mean.
    """
    # Coerce first so plain Python lists work (list - list raises TypeError).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.abs(y_true) + np.abs(y_pred)
    # Where both values are 0 the numerator is 0 too; avoid the 0/0.
    denom = np.where(denom == 0, 1.0, denom)
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / denom)

# One splitter shared by all three model loops.
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

oof_preds = {}
test_preds = {}

# XGBoost
print("\nTraining XGBoost...")
oof_preds['xgb'] = np.zeros(len(X_train_scaled))
test_preds['xgb'] = np.zeros(len(X_test_scaled))
# All models are trained on log1p(price).
y_log = np.log1p(y)

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train_scaled), 1):
    Xtr, Xval = X_train_scaled[tr_idx], X_train_scaled[val_idx]
    ytr, yval = y_log[tr_idx], y_log[val_idx]

    xgb = XGBRegressor(
        n_estimators=1200,
        max_depth=7,
        learning_rate=0.04,
        subsample=0.75,
        colsample_bytree=0.75,
        reg_alpha=0.5,
        reg_lambda=2.0,
        min_child_weight=3,
        # 'gpu_hist' is deprecated since XGBoost 2.0; the GPU is selected
        # through `device` together with the 'hist' tree method.
        tree_method='hist',
        device='cuda' if USE_GPU else 'cpu',
        random_state=SEED,
        verbosity=0,
        early_stopping_rounds=100
    )
    xgb.fit(Xtr, ytr, eval_set=[(Xval, yval)], verbose=False)

    # XGB predictions stay in log space here; they are converted back to
    # prices later, before blending.
    val_pred_log = xgb.predict(Xval)
    oof_preds['xgb'][val_idx] = val_pred_log
    test_preds['xgb'] += xgb.predict(X_test_scaled) / N_FOLDS

    val_smape = smape(np.expm1(yval), np.expm1(val_pred_log))
    print(f"  Fold {fold} SMAPE: {val_smape:.2f}%")

gc.collect()

# =============================================================================
# LightGBM with LOG TRANSFORM (FIXED!)
# =============================================================================
print("\nTraining LightGBM...")
oof_preds['lgb'] = np.zeros(len(X_train_scaled))
test_preds['lgb'] = np.zeros(len(X_test_scaled))

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train_scaled), 1):
    Xtr, Xval = X_train_scaled[tr_idx], X_train_scaled[val_idx]
    ytr, yval = y_log[tr_idx], y_log[val_idx]  # targets are log1p(price)

    lgb = LGBMRegressor(
        n_estimators=1500,
        max_depth=8,
        learning_rate=0.04,
        subsample=0.75,
        # Row subsampling only takes effect when the bagging frequency is
        # non-zero; without it `subsample` is silently ignored.
        subsample_freq=1,
        colsample_bytree=0.75,
        reg_alpha=0.5,
        reg_lambda=2.0,
        min_child_samples=20,
        objective='regression',  # plain L2 regression on the log target
        metric='l2',
        device='gpu' if USE_GPU else 'cpu',
        random_state=SEED,
        verbose=-1
    )

    lgb.fit(
        Xtr, ytr,
        eval_set=[(Xval, yval)],
        callbacks=[early_stopping(stopping_rounds=100),
                  log_evaluation(period=0)]
    )

    # Predict in log space, then convert back to prices.
    val_pred = np.expm1(lgb.predict(Xval))
    oof_preds['lgb'][val_idx] = val_pred
    test_preds['lgb'] += np.expm1(lgb.predict(X_test_scaled)) / N_FOLDS

    val_smape = smape(np.expm1(yval), val_pred)
    print(f"  Fold {fold} SMAPE: {val_smape:.2f}%")

gc.collect()

# =============================================================================
# CatBoost with LOG TRANSFORM (FIXED!)
# =============================================================================
print("\nTraining CatBoost...")
oof_preds['cat'] = np.zeros(len(X_train_scaled))
test_preds['cat'] = np.zeros(len(X_test_scaled))

# Hyper-parameters shared by every fold's model.
catboost_params = dict(
    iterations=1500,
    depth=6,
    learning_rate=0.04,
    l2_leaf_reg=3,
    loss_function='RMSE',
    eval_metric='RMSE',
    task_type='GPU' if USE_GPU else 'CPU',
    random_state=SEED,
    verbose=False,
)

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train_scaled), 1):
    # Features and log1p(price) targets for this fold.
    Xtr, ytr = X_train_scaled[tr_idx], y_log[tr_idx]
    Xval, yval = X_train_scaled[val_idx], y_log[val_idx]

    cat = CatBoostRegressor(**catboost_params)
    cat.fit(Xtr, ytr, eval_set=(Xval, yval), early_stopping_rounds=100)

    # The model predicts in log space; expm1 converts back to prices.
    val_pred = np.expm1(cat.predict(Xval))
    oof_preds['cat'][val_idx] = val_pred
    test_preds['cat'] += np.expm1(cat.predict(X_test_scaled)) / N_FOLDS

    val_smape = smape(np.expm1(yval), val_pred)
    print(f"  Fold {fold} SMAPE: {val_smape:.2f}%")

gc.collect()

# =============================================================================
# 12. META ENSEMBLE
# =============================================================================
print("\n[12/12] Meta ensemble with SMAPE-optimized blending...")

# The XGB entries still hold log-space values; bring both back to prices.
for pred_store in (oof_preds, test_preds):
    pred_store['xgb'] = np.expm1(pred_store['xgb'])

# Check individual model scores
print("\nIndividual model OOF SMAPE:")
for name in ['xgb', 'lgb', 'cat']:
    print(f"  {name.upper()}: {smape(y, oof_preds[name]):.2f}%")

# Custom SMAPE-optimized blending
from scipy.optimize import minimize

BLEND_ORDER = ('xgb', 'lgb', 'cat')


def _normalise_weights(raw_weights):
    """Map arbitrary reals to non-negative weights that sum to (almost) 1."""
    magnitudes = np.abs(raw_weights)
    return magnitudes / (magnitudes.sum() + 1e-10)


def _blend(pred_store, weights):
    """Weighted sum of the per-model predictions, in BLEND_ORDER."""
    return sum(w * pred_store[name] for w, name in zip(weights, BLEND_ORDER))


def smape_loss(weights):
    """OOF SMAPE of the blend obtained from the normalised ``weights``."""
    return smape(y, _blend(oof_preds, _normalise_weights(weights)))


# Derivative-free search starting from a near-uniform blend.
result = minimize(smape_loss, x0=[0.33, 0.33, 0.34], method='Nelder-Mead',
                 options={'maxiter': 1000})
optimal_weights = _normalise_weights(result.x)

print(f"\nOptimal blend weights: XGB={optimal_weights[0]:.3f}, "
      f"LGB={optimal_weights[1]:.3f}, CAT={optimal_weights[2]:.3f}")

# Final predictions
final_oof = _blend(oof_preds, optimal_weights)
final_test = _blend(test_preds, optimal_weights)

# Post-processing: keep test predictions within a band around the train prices.
final_test = np.clip(final_test, y.min() * 0.8, y.max() * 1.1)

final_smape = smape(y, final_oof)
print(f"\n{'='*80}")
print(f"FINAL OOF SMAPE: {final_smape:.2f}%")
print(f"{'='*80}")

# =============================================================================
# SAVE PREDICTIONS
# =============================================================================
print(f"\nSaving predictions to: {OUTPUT_CSV}")
# to_csv does not create missing folders; make sure the target one exists.
output_dir = os.path.dirname(OUTPUT_CSV)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_test
})
# Rows are written in ascending sample_id order.
submission = submission.sort_values('sample_id').reset_index(drop=True)
submission.to_csv(OUTPUT_CSV, index=False)

print(f"\n✅ DONE!")
print(f"Predicted price range: ${submission['price'].min():.2f} - ${submission['price'].max():.2f}")
print(f"Mean predicted price: ${submission['price'].mean():.2f}")
print(f"Median predicted price: ${submission['price'].median():.2f}")