# In [None]:
import pandas as pd
import numpy as np
import os
import gc
import re
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error
import warnings
warnings.filterwarnings('ignore')

# Optional: Install optuna for hyperparameter tuning
# !pip install optuna -q
import optuna
from optuna.pruners import MedianPruner


def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, returned in percent.

    Each absolute error is divided by the average of ``|y_true|`` and
    ``|y_pred|``; a small epsilon is added to that average so the ratio stays
    finite when both values are zero.
    """
    abs_error = np.abs(y_pred - y_true)
    scale = (np.abs(y_true) + np.abs(y_pred)) / 2
    per_sample = abs_error / (scale + 1e-8)
    return np.mean(per_sample) * 100

def smape_objective(y_true_log, y_pred_log):
    """Custom LightGBM objective: gradient of per-sample SMAPE in log space.

    Both arguments are on the ``log1p`` scale. They are mapped back with
    ``expm1`` and the per-sample loss ``2 * |p - t| / (|t| + |p|)`` is
    differentiated with respect to the raw (log-scale) prediction.

    Returns:
        (grad, hess): ``grad`` is the first derivative per sample; ``hess`` is
        a vector of ones (a constant stand-in, not the true second derivative).
    """
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)

    diff = y_pred - y_true
    scale = np.abs(y_true) + np.abs(y_pred)

    # d/dp [2|p - t| / (|t| + |p|)]
    #   = 2 * (sign(p - t) * (|t| + |p|) - |p - t| * sign(p)) / (|t| + |p|)^2
    # The first term needs sign(p - t), not the raw difference.
    grad_numerator = 2 * (np.sign(diff) * scale - np.abs(diff) * np.sign(y_pred))
    grad = grad_numerator / (scale ** 2 + 1e-8)

    # Chain rule: p = expm1(x)  =>  dp/dx = exp(x) = p + 1
    grad = grad * (y_pred + 1.0)

    hess = np.ones_like(y_true)
    return grad, hess

def smape_lgbm_eval(y_true_log, y_pred_log):
    """Evaluation metric for LightGBM that reports SMAPE during training.

    Inputs are on the ``log1p`` scale and are mapped back with ``expm1``
    before scoring. Returns ``(name, value, is_higher_better)``.
    """
    prices_true, prices_pred = np.expm1(y_true_log), np.expm1(y_pred_log)
    # The trailing False means a lower score is better
    return 'smape_eval', smape(prices_true, prices_pred), False


# ============================================================================
# CONFIGURATION
# ============================================================================
# Root folder; every input and output path below is derived from it.
BASE_PATH = "/content/drive/MyDrive/images"

# Input tables
TRAIN_CSV_PATH = BASE_PATH + "/local_datasets/train_local.csv"
TEST_CSV_PATH = BASE_PATH + "/local_datasets/test_local.csv"

# Precomputed embedding matrices
TEXT_EMB_PATH = BASE_PATH + "/text_embeddings/train_text_embeddings.npy"
IMG_EMB_PATH = BASE_PATH + "/image_embeddings/train_image_embeddings.npy"
TEST_IMG_EMB_PATH = BASE_PATH + "/image_embeddings/test_image_embeddings.npy"

# Destination of the prediction CSV
OUTPUT_PATH = BASE_PATH + "/predictions/test_out_TIER1_SMAPE_FIX.csv"

# ============================================================================
# STEP 1: LOAD AND PREPARE DATA
# ============================================================================
print("Loading training data...")
df_train = pd.read_csv(TRAIN_CSV_PATH)
df_test = pd.read_csv(TEST_CSV_PATH)

print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")

# Load the precomputed embedding matrices from disk
print("Loading embeddings...")
text_embeddings = np.load(TEXT_EMB_PATH)
image_embeddings = np.load(IMG_EMB_PATH)
test_image_embeddings = np.load(TEST_IMG_EMB_PATH)

# The test text embeddings are also loaded from disk; nothing is generated here.
TEST_TEXT_EMB_PATH = f"{BASE_PATH}/text_embeddings/test_text_embeddings.npy"
test_text_embeddings = np.load(TEST_TEXT_EMB_PATH)

# Fail fast when an embedding matrix does not have one row per CSV row:
# the feature matrix below pairs them up purely by position.
for _name, _frame, _emb in (
    ("train text", df_train, text_embeddings),
    ("train image", df_train, image_embeddings),
    ("test text", df_test, test_text_embeddings),
    ("test image", df_test, test_image_embeddings),
):
    if _emb.shape[0] != len(_frame):
        raise ValueError(
            f"{_name} embeddings have {_emb.shape[0]} rows "
            f"but the CSV has {len(_frame)} rows"
        )


# ============================================================================
# STEP 2: ADVANCED FEATURE ENGINEERING (ENHANCED)
# ============================================================================
print("\nEngineering advanced features (Enhanced)...")

# Patterns for the labelled fields of a catalog entry, compiled once.
_QUANTITY_RE = re.compile(r"Value:\s*([\d\.]+)")
_PACK_SIZE_RE = re.compile(
    r'(?:pack of|case of|pk of)\s*\(?(\d{1,3})\)?|(\d{1,3})\s*(?:-pack|-count|ct|units)'
)
_UNIT_RE = re.compile(r"Unit:\s*(\w+)")
_BULLET_POINT_RE = re.compile(r"Bullet Point \d+:")
_DESCRIPTION_RE = re.compile(r"Product Description:\s*(.*)", re.DOTALL)
_ITEM_NAME_RE = re.compile(
    r"Item Name:\s*(.*?)(?:Bullet Point 1:|Product Description:|Value:|$)",
    re.DOTALL,
)

# Ordered (pattern, label) rules; the first pattern matching the START of the
# lower-cased unit token wins. The token is captured with \w+, so it never
# contains a space.
_UNIT_RULES = (
    (re.compile(r'o(u)?nce|oz|fl oz'), 'Ounce'),
    (re.compile(r'count|ct|each|piece|units|tea'), 'Count'),
    (re.compile(r'pound|lb'), 'Pound'),
    (re.compile(r'fl|fluid|ml|millilitre|liters|ltr|volume|gallon'), 'Volume'),
    (re.compile(r'gram|gr|kg'), 'Gram'),
    (re.compile(r'case|pack|box|bag|jar|bottle|pouch|bucket|can'), 'Container'),
)

# Leading words that are too generic to serve as a brand on their own.
_GENERIC_LEAD_WORDS = frozenset(
    ['the', 'a', 'deluxe', 'premium', 'original', 'gourmet', 'best', 'traditional', 'goya']
)

# Checked in order, first hit wins. 'canister' precedes 'can' because every
# text containing 'canister' also contains 'can'.
_PACKAGING_KEYWORDS = (
    'bag', 'box', 'canister', 'can', 'jar', 'bottle', 'bowl', 'pouch', 'tube', 'case',
)

# feature name -> keywords; the flag is 1 when any keyword is a substring of
# the lower-cased text.
_CONTENT_FLAGS = (
    ('is_premium', ('premium', 'gourmet', 'deluxe', 'artisanal', 'handcrafted', 'superior', 'luxury')),
    ('is_health_focused', ('organic', 'natural', 'healthy', 'non-gmo', 'usda organic')),
    ('has_dietary_claim', ('gluten-free', 'gluten free', 'sugar-free', 'vegan', 'keto', 'paleo')),
    ('form_powder', ('powder', 'mix')),
    ('form_liquid', ('liquid', 'juice', 'sauce', 'drink')),
    ('form_solid', ('bar', 'solid', 'cookies', 'crackers')),
    ('form_capsule', ('capsule', 'pod', 'k-cup', 'k cup')),
    ('is_for_gifting', ('gift', 'basket', 'assortment', 'variety pack', 'sampler', 'housewarming')),
    ('is_bulk', ('bulk', 'food service', 'industrial')),
    ('made_in_usa', ('made in usa', 'made in the usa')),
    ('is_imported', ('imported', 'product of')),
    ('is_resealable', ('resealable', 'reseal')),
)

_KEYWORD_FLAGS = (
    ('has_pack', ('pack', 'case')),
    ('has_bundle', ('bundle', 'assortment')),
    ('has_discount', ('discount', 'sale')),
    ('has_professional', ('professional', 'bulk')),
    ('has_gluten_free', ('gluten-free', 'gluten free')),
    ('has_kosher', ('kosher', 'dairy certified')),
    ('has_organic', ('organic',)),
    ('has_sugar_free', ('sugar-free', 'no sugar added')),
    ('has_natural', ('natural', 'all natural')),
    ('has_vegan', ('vegan', 'plant-based')),
    ('has_certified', ('certified', 'gmp', 'fda', 'fair trade')),
)


def _contains_any(text, keywords):
    """Return 1 if any keyword occurs as a substring of ``text``, else 0."""
    return int(any(keyword in text for keyword in keywords))


def _standardize_unit(unit_raw):
    """Map a lower-cased raw unit token to its standard unit label."""
    for pattern, label in _UNIT_RULES:
        if pattern.match(unit_raw):
            return label
    return 'Other'


def extract_catalog_features(catalog_text):
    """Extract structured features from one catalog entry.

    Args:
        catalog_text: Raw catalog text. Any non-string value (e.g. NaN or
            None) is treated as an empty string.

    Returns:
        dict mapping feature name to value. Most values are numeric;
        ``unit_std``, ``brand_proxy`` and ``packaging_type`` are strings.
    """
    features = {}

    if not isinstance(catalog_text, str):
        catalog_text = ""

    # Drop anything that cannot be encoded as UTF-8
    catalog_text = catalog_text.encode('utf-8', 'ignore').decode('utf-8')
    catalog_text_lower = catalog_text.lower()

    # --- Basic Features (Extended) ---
    features['catalog_length'] = len(catalog_text)
    features['word_count'] = len(catalog_text.split())
    features['has_quantity'] = int('Value:' in catalog_text)
    features['has_unit'] = int('Unit:' in catalog_text)
    features['has_description'] = int('Product Description:' in catalog_text)
    features['uppercase_ratio'] = sum(1 for c in catalog_text if c.isupper()) / max(len(catalog_text), 1)
    features['digit_count'] = sum(1 for c in catalog_text if c.isdigit())
    features['special_char_count'] = sum(1 for c in catalog_text if not c.isalnum() and not c.isspace())

    # --- Quantity Value ---
    quantity_value = 0
    quantity_match = _QUANTITY_RE.search(catalog_text)
    if quantity_match:
        try:
            quantity_value = float(quantity_match.group(1))
        except ValueError:
            # The pattern also accepts strings such as "1.2.3" or "..."
            quantity_value = 0
    features['quantity_value'] = quantity_value

    # --- Pack Size Extraction ---
    pack_size = 1
    pack_match = _PACK_SIZE_RE.search(catalog_text_lower)
    if pack_match:
        # Exactly one of the two groups is set, and it holds 1-3 digits,
        # so int() cannot fail here.
        pack_size = int(pack_match.group(1) or pack_match.group(2))
    features['pack_size'] = pack_size

    # --- Total Quantity: interaction between quantity and pack size ---
    features['total_quantity'] = features['quantity_value'] * pack_size

    # --- Unit Standardization ---
    unit_match = _UNIT_RE.search(catalog_text)
    unit_raw = unit_match.group(1).strip().lower() if unit_match else 'none'
    features['unit_std'] = _standardize_unit(unit_raw)

    # --- Bullet Point Count and Description Length ---
    features['bullet_point_count'] = len(_BULLET_POINT_RE.findall(catalog_text))

    desc_match = _DESCRIPTION_RE.search(catalog_text)
    desc_text = desc_match.group(1).strip() if desc_match else ''
    features['description_length'] = len(desc_text)
    features['description_word_count'] = len(desc_text.split())

    # --- Item Name / Brand Proxy Extraction ---
    item_name_match = _ITEM_NAME_RE.search(catalog_text)
    words = item_name_match.group(1).strip().split() if item_name_match else []

    brand_proxy = words[0].lower() if words else 'none'
    if brand_proxy in _GENERIC_LEAD_WORDS:
        # A generic first word is extended to the first two words
        brand_proxy = " ".join(words[:2]).lower() if len(words) >= 2 else 'none'
    features['brand_proxy'] = brand_proxy

    # --- Quality, form, purpose, origin flags ---
    for flag_name, keywords in _CONTENT_FLAGS:
        features[flag_name] = _contains_any(catalog_text_lower, keywords)

    # --- Packaging type: first keyword found, else 'other' ---
    features['packaging_type'] = next(
        (pkg for pkg in _PACKAGING_KEYWORDS if pkg in catalog_text_lower),
        'other',
    )

    # --- Additional Keywords ---
    for flag_name, keywords in _KEYWORD_FLAGS:
        features[flag_name] = _contains_any(catalog_text_lower, keywords)

    return features

def extract_catalog_features_df(df):
    """Run the catalog extraction over ``df['catalog_content']``.

    Returns a new dataframe with one row of extracted features per input row,
    plus a ``sample_id`` column copied from ``df``.
    """
    print("Applying catalog feature extraction...")
    per_row_features = list(map(extract_catalog_features, df['catalog_content']))
    features_df = pd.DataFrame(per_row_features)
    features_df['sample_id'] = df['sample_id'].values
    return features_df

# --- 2a. Extract base catalog features for both splits ---
cat_train_df = extract_catalog_features_df(df_train)
cat_test_df = extract_catalog_features_df(df_test)

# --- 2b. Categorical Feature Processing (Unit/Brand/Packaging OHE) ---

# Brand vocabulary comes from the training split only: brands seen more than
# 5 times, most frequent first, capped at N_BRAND_CATEGORIES entries.
N_BRAND_CATEGORIES = 20
brand_counts = cat_train_df['brand_proxy'].value_counts()
top_brands = brand_counts[brand_counts.gt(5)].index[:N_BRAND_CATEGORIES].tolist()
print(f"Using {len(top_brands)} top brands for OHE.")

def process_categorical_features(df, top_brands):
    """One-hot encode the unit, brand and packaging columns of ``df``.

    Args:
        df: Dataframe with ``unit_std``, ``brand_proxy`` and
            ``packaging_type`` columns.
        top_brands: Brands that get their own indicator column; every other
            brand is folded into a single ``brand_Other`` column.

    Returns:
        Dataframe holding only the indicator columns, in the order
        ``unit_*``, ``brand_*``, ``pkg_*``, indexed like ``df``.
    """
    # Brands outside the supplied vocabulary collapse into one bucket
    brand_bucket = df['brand_proxy'].map(
        lambda brand: brand if brand in top_brands else 'Other'
    )

    # The result is assembled from the dummy blocks alone. The raw columns
    # never enter it, so a dummy whose name equals a raw column name (brand
    # 'proxy' -> 'brand_proxy') is not dropped by mistake.
    encoded_blocks = [
        pd.get_dummies(df['unit_std'], prefix='unit', dummy_na=False),
        pd.get_dummies(brand_bucket, prefix='brand', dummy_na=False),
        pd.get_dummies(df['packaging_type'], prefix='pkg', dummy_na=False),
    ]
    return pd.concat(encoded_blocks, axis=1)

cat_train_ohe = process_categorical_features(cat_train_df, top_brands)
cat_test_ohe = process_categorical_features(cat_test_df, top_brands)

# Give the test matrix exactly the training columns, in training order:
# indicators missing from test are added as zeros, extras are discarded.
cat_test_ohe = cat_test_ohe.reindex(columns=cat_train_ohe.columns, fill_value=0)
print(f"Train OHE shape: {cat_train_ohe.shape}, Test OHE shape: {cat_test_ohe.shape}")

# The raw categoricals are replaced by their one-hot encodings
raw_categoricals = ['unit_std', 'brand_proxy', 'packaging_type']
cat_train_df = cat_train_df.drop(columns=raw_categoricals)
cat_test_df = cat_test_df.drop(columns=raw_categoricals)

# Numeric catalog features followed by the indicator columns
cat_train_final = pd.concat([cat_train_df, cat_train_ohe], axis=1)
cat_test_final = pd.concat([cat_test_df, cat_test_ohe], axis=1)


def create_feature_matrix(df, text_emb, img_emb, catalog_features_df):
    """Assemble the model feature matrix for one data split.

    Columns, in order: ``sample_id``, raw text embedding dimensions, raw
    image embedding dimensions, per-row embedding statistics and
    interactions, the catalog features (prefixed ``cat_``), and ratio /
    product features combining the two.

    Args:
        df: Dataframe providing the ``sample_id`` column.
        text_emb: 2-D array of text embeddings, one row per row of ``df``.
        img_emb: 2-D array of image embeddings, one row per row of ``df``.
        catalog_features_df: Catalog feature dataframe, row-aligned with
            ``df``; its ``sample_id`` column, if any, is skipped.

    Returns:
        pandas.DataFrame with one row per row of ``df``.

    Raises:
        ValueError: If any input does not have as many rows as ``df``.
    """
    sample_ids = df['sample_id'].values
    n_rows = len(sample_ids)

    # Everything is paired by position, so the row counts must agree.
    for label, block in (
        ('text_emb', text_emb),
        ('img_emb', img_emb),
        ('catalog_features_df', catalog_features_df),
    ):
        if len(block) != n_rows:
            raise ValueError(f"{label} has {len(block)} rows but df has {n_rows}")

    # ===== Raw embedding dimensions =====
    # Built as whole frames rather than inserted one column at a time, which
    # would fragment the dataframe.
    text_block = pd.DataFrame(
        text_emb, columns=[f'text_emb_{i}' for i in range(text_emb.shape[1])]
    )
    img_block = pd.DataFrame(
        img_emb, columns=[f'img_emb_{i}' for i in range(img_emb.shape[1])]
    )

    cols = {}

    # ===== Per-row statistics of the embeddings =====
    cols['text_emb_mean'] = text_emb.mean(axis=1)
    cols['text_emb_std'] = text_emb.std(axis=1)
    cols['text_emb_max'] = text_emb.max(axis=1)
    cols['text_emb_min'] = text_emb.min(axis=1)
    cols['text_emb_range'] = cols['text_emb_max'] - cols['text_emb_min']
    cols['text_emb_median'] = np.median(text_emb, axis=1)
    cols['text_emb_skew'] = text_block.skew(axis=1).values
    cols['text_emb_kurtosis'] = text_block.kurtosis(axis=1).values

    cols['img_emb_mean'] = img_emb.mean(axis=1)
    cols['img_emb_std'] = img_emb.std(axis=1)
    cols['img_emb_max'] = img_emb.max(axis=1)
    cols['img_emb_min'] = img_emb.min(axis=1)
    cols['img_emb_range'] = cols['img_emb_max'] - cols['img_emb_min']
    cols['img_emb_median'] = np.median(img_emb, axis=1)

    # L2 norms
    cols['text_emb_l2'] = np.sqrt((text_emb ** 2).sum(axis=1))
    cols['img_emb_l2'] = np.sqrt((img_emb ** 2).sum(axis=1))

    # ===== Text/image interactions =====
    cols['emb_interaction_mean'] = cols['text_emb_mean'] * cols['img_emb_mean']
    cols['emb_interaction_std'] = cols['text_emb_std'] * cols['img_emb_std']
    # Despite the name, this is the SUM of the two norms, not a distance
    cols['emb_l2_dist'] = cols['text_emb_l2'] + cols['img_emb_l2']
    cols['emb_mean_ratio'] = cols['text_emb_mean'] / (cols['img_emb_mean'] + 1e-8)

    # ===== Catalog content features =====
    for col in catalog_features_df.columns:
        if col != 'sample_id':
            cols[f'cat_{col}'] = catalog_features_df[col].values

    # ===== Combined features (the epsilon avoids division by zero) =====
    cols['emb_text_to_catalog_ratio'] = cols['text_emb_l2'] / (cols['cat_catalog_length'] + 1e-8)
    cols['emb_img_to_quantity_ratio'] = cols['img_emb_l2'] / (cols['cat_quantity_value'] + 1e-8)
    cols['cat_description_to_catalog_ratio'] = cols['cat_description_length'] / (cols['cat_catalog_length'] + 1e-8)
    cols['cat_words_per_bp'] = cols['cat_word_count'] / (cols['cat_bullet_point_count'] + 1e-8)

    # ===== Price-per-unit proxy and total quantity interaction =====
    cols['price_unit_proxy'] = cols['img_emb_l2'] / (cols['cat_total_quantity'] + 1e-8)
    cols['quantity_complexity'] = cols['cat_total_quantity'] * cols['cat_bullet_point_count']

    return pd.concat(
        [pd.DataFrame({'sample_id': sample_ids}), text_block, img_block, pd.DataFrame(cols)],
        axis=1,
    )

# Build the train and test feature matrices with the same recipe
X_train = create_feature_matrix(
    df=df_train,
    text_emb=text_embeddings,
    img_emb=image_embeddings,
    catalog_features_df=cat_train_final,
)
X_test = create_feature_matrix(
    df=df_test,
    text_emb=test_text_embeddings,
    img_emb=test_image_embeddings,
    catalog_features_df=cat_test_final,
)

# Regression target
y_train = df_train['price'].to_numpy()

print(f"Feature matrix shape: {X_train.shape}")
print(f"Test feature matrix shape: {X_test.shape}")


# ============================================================================
# STEP 3: DATA PREPROCESSING & OUTLIER HANDLING
# ============================================================================
print("\nPreprocessing data...")

# Keep only rows whose price lies within 3.5 standard deviations of the mean
price_mean = y_train.mean()
price_std = y_train.std()
price_band = 3.5 * price_std
mask = (y_train >= price_mean - price_band) & (y_train <= price_mean + price_band)

X_train_clean = X_train[mask].reset_index(drop=True)
y_train_clean = y_train[mask]

n_removed = len(y_train) - len(y_train_clean)
print(f"Removed {n_removed} outliers")
print(f"Price distribution: min={y_train_clean.min():.2f}, max={y_train_clean.max():.2f}, mean={y_train_clean.mean():.2f}")

# Missing values: both splits are filled with TRAINING medians
X_train_clean = X_train_clean.fillna(X_train_clean.median())
train_medians = X_train_clean.median()
X_test = X_test.fillna(train_medians)

# Feature scaling: fitted on train only, then applied to test
feature_cols = [col for col in X_train_clean.columns if col != 'sample_id']
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_clean[feature_cols])
X_test_scaled = scaler.transform(X_test[feature_cols])

# The models are trained on log1p(price); predictions are mapped back with expm1
print("\nApplying log transformation...")
y_train_log = np.log1p(y_train_clean)
print(f"Original price - mean: {y_train_clean.mean():.2f}, std: {y_train_clean.std():.2f}")
print(f"Log price - mean: {y_train_log.mean():.2f}, std: {y_train_log.std():.2f}")

# ============================================================================
# STEP 4: HYPERPARAMETER TUNING WITH OPTUNA
# ============================================================================
print("\nTuning hyperparameters with Optuna...")

def objective(trial):
    """Optuna objective: mean 3-fold CV SMAPE of a LightGBM regressor.

    Only LightGBM hyperparameters are tuned. Each fold trains on log prices
    with early stopping and is scored with SMAPE on the original price scale.
    After every fold the running mean is reported to the trial so that the
    study's pruner can stop unpromising trials early.

    Args:
        trial: The Optuna trial supplying hyperparameter suggestions.

    Returns:
        Mean SMAPE (percent) across the folds; lower is better.

    Raises:
        optuna.TrialPruned: When the pruner decides to stop the trial.
    """
    params = {
        'lgb_lr': trial.suggest_float('lgb_lr', 0.01, 0.1),
        'lgb_depth': trial.suggest_int('lgb_depth', 5, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
    }

    n_splits = 3
    # Fixed seed: every trial is scored on the same folds, which keeps the
    # per-fold values reported below comparable across trials.
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_train_scaled)):
        X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_tr, y_val = y_train_log[train_idx], y_train_log[val_idx]

        lgb_model = LGBMRegressor(
            objective='regression_l1',
            n_estimators=2000,
            learning_rate=params['lgb_lr'],
            max_depth=params['lgb_depth'],
            num_leaves=params['num_leaves'],
            feature_fraction=params['feature_fraction'],
            bagging_fraction=params['bagging_fraction'],
            bagging_freq=1,
            random_state=42,
            n_jobs=-1,
            verbose=-1
        )
        lgb_model.fit(X_tr, y_tr,
                      eval_set=[(X_val, y_val)],
                      eval_metric=smape_lgbm_eval,
                      callbacks=[lgb.early_stopping(100, verbose=False)])

        lgb_pred = lgb_model.predict(X_val)

        # Score on the original price scale
        pred_price = np.expm1(lgb_pred)
        y_val_original = np.expm1(y_val)
        cv_scores.append(smape(y_val_original, pred_price))

        # Without report()/should_prune() a configured pruner has nothing to
        # act on. Pruning is only checked while folds remain.
        trial.report(float(np.mean(cv_scores)), step=fold_idx)
        if fold_idx < n_splits - 1 and trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(cv_scores)

# Run the search; if it fails as a whole, fall back to fixed defaults.
try:
    study = optuna.create_study(
        direction='minimize',
        pruner=MedianPruner(),
        study_name="price_tuning",
    )
    # catch=(Exception,) lets the study continue past a failing trial
    study.optimize(objective, n_trials=50, show_progress_bar=True, catch=(Exception,))

    best_params = study.best_params
    print("\nBest parameters found:")
    for param_name, param_value in best_params.items():
        print(f"  {param_name}: {param_value}")
    print(f"Best CV SMAPE: {study.best_value:.4f}%")
except Exception as exc:
    print(f"\nOptuna error: {exc}. Using default parameters.")
    best_params = dict(
        xgb_lr=0.05, xgb_depth=7,
        lgb_lr=0.05, lgb_depth=7,
        cat_lr=0.05, cat_depth=6,
        gb_lr=0.05, gb_depth=6,
    )

# ============================================================================
# STEP 5: TRAIN FINAL MODELS WITH TUNED HYPERPARAMETERS
# ============================================================================
print("\nTraining final models with tuned hyperparameters...")

# Base learners. Only the LightGBM entries of best_params are produced by the
# Optuna search above; the other models fall back to the defaults given in
# their .get() calls.
models = {
    'xgb': XGBRegressor(
        n_estimators=400,
        max_depth=int(best_params.get('xgb_depth', 7)),
        learning_rate=best_params.get('xgb_lr', 0.05),
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=1.0,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        verbosity=0
    ),
    'lgb': LGBMRegressor(
        n_estimators=400,
        max_depth=int(best_params.get('lgb_depth', 7)),
        learning_rate=best_params.get('lgb_lr', 0.05),
        # Use every tuned LightGBM parameter, not just depth and learning rate
        num_leaves=int(best_params.get('num_leaves', 31)),
        subsample=best_params.get('bagging_fraction', 0.85),
        # Row subsampling only happens when the bagging frequency is non-zero
        subsample_freq=1,
        colsample_bytree=best_params.get('feature_fraction', 0.85),
        reg_alpha=1.0,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ),
    'catboost': CatBoostRegressor(
        iterations=400,
        depth=int(best_params.get('cat_depth', 6)),
        learning_rate=best_params.get('cat_lr', 0.05),
        subsample=0.85,
        verbose=0,
        random_state=42,
        thread_count=-1
    ),
    'gb': GradientBoostingRegressor(
        n_estimators=400,
        max_depth=int(best_params.get('gb_depth', 6)),
        learning_rate=best_params.get('gb_lr', 0.05),
        subsample=0.85,
        random_state=42
    ),
    'histgb': HistGradientBoostingRegressor(
        max_iter=400,
        max_depth=8,
        learning_rate=0.05,
        random_state=42,
        loss='squared_error'
    ),
    'ridge': Ridge(alpha=0.5),
    'lasso': Lasso(alpha=0.05),
    'elasticnet': ElasticNet(alpha=0.05, l1_ratio=0.5),
}

# Fit every model on the full cleaned training set (log-price target) and
# keep its predictions for both splits.
train_preds = {}
test_preds = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_train_log)
    train_preds[name] = model.predict(X_train_scaled)
    test_preds[name] = model.predict(X_test_scaled)
    gc.collect()

# ============================================================================
# STEP 6: ADVANCED STACKING WITH MULTIPLE META-LEARNERS
# ============================================================================
print("\nTraining meta-learners...")

# One column per base model, in the iteration order of `models`
X_meta_train = np.column_stack([train_preds[name] for name in models])
X_meta_test = np.column_stack([test_preds[name] for name in models])

# NOTE(review): X_meta_train holds predictions on the same rows the base
# models were fitted on, so the meta-learners see in-sample predictions.
meta_models = {
    'meta_ridge': Ridge(alpha=0.1),
    'meta_ridge_strong': Ridge(alpha=1.0),
    'meta_lasso': Lasso(alpha=0.01),
}

meta_preds_train = {}
meta_preds_test = {}

for meta_name, meta_model in meta_models.items():
    fitted_meta = meta_model.fit(X_meta_train, y_train_log)
    meta_preds_train[meta_name] = fitted_meta.predict(X_meta_train)
    meta_preds_test[meta_name] = fitted_meta.predict(X_meta_test)

# ============================================================================
# STEP 7: FINAL ENSEMBLE WITH WEIGHTED AVERAGING
# ============================================================================
print("Creating final ensemble...")

# Blend weights: absolute Ridge coefficients of the first meta-learner,
# normalised to sum to one (the sign of each coefficient is discarded).
meta_model_final = meta_models['meta_ridge']
abs_coefs = np.abs(meta_model_final.coef_)
weights = abs_coefs / abs_coefs.sum()
print(f"Model weights: {dict(zip(models.keys(), np.round(weights, 3)))}")

# Weighted average of the base-model predictions, still on the log scale
ensemble_train_log = np.average(X_meta_train, weights=weights, axis=1)
ensemble_test_log = np.average(X_meta_test, weights=weights, axis=1)

# Undo the log1p transform to get prices
ensemble_train = np.expm1(ensemble_train_log)
ensemble_test = np.expm1(ensemble_test_log)

# ============================================================================
# STEP 8: EVALUATE ON TRAINING SET
# ============================================================================
def smape(actual, predicted):
    """Symmetric mean absolute percentage error, returned in percent.

    Uses the same epsilon-guarded formula as the metric defined at the top of
    this file, so a row where both ``actual`` and ``predicted`` are zero
    contributes 0 instead of turning the whole score into NaN.
    """
    numerator = np.abs(predicted - actual)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2
    return np.mean(numerator / (denominator + 1e-8)) * 100

# In-sample score: the ensemble is evaluated on the rows it was trained on
train_smape = smape(y_train_clean, ensemble_train)
print(f"\nTraining SMAPE: {train_smape:.4f}%")

# Cross-validation evaluation
# NOTE(review): the blend `weights` were derived from models fitted on the
# full training set; they are reused unchanged inside every fold.
print("Running 5-fold cross-validation...")
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_scaled), start=1):
    X_tr, y_tr = X_train_scaled[train_idx], y_train_log[train_idx]
    X_val, y_val = X_train_scaled[val_idx], y_train_log[val_idx]

    fold_preds = []
    for model in models.values():
        # Fresh, unfitted copy with the same constructor parameters
        fresh_params = {k: v for k, v in model.get_params().items() if not k.startswith('_')}
        m = type(model)(**fresh_params)
        m.fit(X_tr, y_tr)
        fold_preds.append(m.predict(X_val))

    # Blend on the log scale, then score on the price scale
    X_meta_val = np.column_stack(fold_preds)
    fold_ensemble_log = np.average(X_meta_val, weights=weights, axis=1)
    fold_ensemble = np.expm1(fold_ensemble_log)
    y_val_original = np.expm1(y_val)

    fold_smape = smape(y_val_original, fold_ensemble)
    cv_scores.append(fold_smape)
    print(f"  Fold {fold}: {fold_smape:.4f}%")

print(f"Cross-validation SMAPE: {np.mean(cv_scores):.4f}% ± {np.std(cv_scores):.4f}%")

# ============================================================================
# STEP 9: PREPARE OUTPUT
# ============================================================================
print("\nPreparing output...")

# Floor the predictions at 0.1 so every price is strictly positive
ensemble_test = np.clip(ensemble_test, 0.1, None)

# One row per test sample, ordered by sample_id
output_df = (
    pd.DataFrame({'sample_id': df_test['sample_id'].values, 'price': ensemble_test})
    .sort_values('sample_id')
    .reset_index(drop=True)
)

# Quick sanity report
predicted_prices = output_df['price']
print(f"Output shape: {output_df.shape}")
print(f"Price range: [{predicted_prices.min():.2f}, {predicted_prices.max():.2f}]")
print(f"Mean price: {predicted_prices.mean():.2f}")
print("\nFirst 10 predictions:")
print(output_df.head(10))

# ============================================================================
# STEP 10: SAVE PREDICTIONS
# ============================================================================
# Create the destination folder if needed, then write the CSV without the index
output_dir = os.path.dirname(OUTPUT_PATH)
os.makedirs(output_dir, exist_ok=True)
output_df.to_csv(OUTPUT_PATH, index=False)
print(f"\n✅ Predictions saved to: {OUTPUT_PATH}")