In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
"""
ENHANCED PRICE PREDICTION V2
Target: <45% SMAPE
Key improvements: Better embeddings usage, price bucketing, advanced ensembling
"""

import os
import pickle
import numpy as np
import pandas as pd
import re
import warnings
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.cluster import KMeans
import xgboost as xgb
import lightgbm as lgb
from scipy.sparse import hstack, csr_matrix
from sklearn.linear_model import Ridge
from scipy.stats import skew, kurtosis

warnings.filterwarnings('ignore')

# Banner printed once at the start of the run.
print("=" * 80)
print("ENHANCED PRICE PREDICTION V2 - TARGET <45% SMAPE")
print("=" * 80)

# ==================== CONFIGURATION ====================
# INPUT_PATH is the dataset root every input file below is resolved against;
# OUTPUT_PATH is the directory the submission CSV is written to.
INPUT_PATH = '/kaggle/input/combined2/Combined'
OUTPUT_PATH = '/kaggle/working/'

# Tabular inputs. Later code reads the 'sample_id' and 'catalog_content' columns
# from both files and the 'price' column from the training file.
TRAIN_CSV = os.path.join(INPUT_PATH, 'train.csv')
TEST_CSV = os.path.join(INPUT_PATH, 'test.csv')
# Pickled text embeddings; read through the keys 'train_embeddings',
# 'test_embeddings' and, when present, 'metadata' -> 'train_sample_ids'.
TEXT_EMB_PKL = os.path.join(INPUT_PATH, 'Temp/text_embeddings.pkl')
# Pickled image embeddings (one file per split); read through the key
# 'embeddings' and, when present, 'ids'.
TRAIN_IMG_EMB_PKL = os.path.join(INPUT_PATH, 'embeddings/train_image_embeddings.pkl')
TEST_IMG_EMB_PKL = os.path.join(INPUT_PATH, 'embeddings/test_image_embeddings.pkl')

# ==================== UTILITY FUNCTIONS ====================

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, returned as a fraction.

    Each element contributes ``|pred - true| / ((|true| + |pred|) / 2)``, which
    lies in [0, 2]; the result is the mean of those terms (format it with
    ``:.2%`` to display a percentage).

    Args:
        y_true: array-like of ground-truth values.
        y_pred: array-like of predictions, same shape as ``y_true``.

    Returns:
        The mean SMAPE term as a float (NaN for empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    # When truth and prediction are both exactly 0 the term is 0/0. That is a
    # perfect prediction, so count it as 0 error instead of letting a NaN
    # poison the whole mean.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(terms)

def clean_text(text):
    """Normalise raw catalog text for vectorisation.

    Missing values (anything ``pd.isna`` reports as NA) become the empty
    string. Everything else is converted to ``str``, lower-cased, stripped of
    ``http...`` / ``www...`` tokens, and has each whitespace run collapsed to a
    single space with leading/trailing whitespace removed.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    without_urls = re.sub(r'http\S+|www\S+', '', lowered)
    return re.sub(r'\s+', ' ', without_urls).strip()

# Unsigned number with an optional decimal part, shared by the volume patterns.
_NUMBER = r'(\d+(?:\.\d+)?)'

# Pack-quantity patterns, tried in order against the lower-cased text.
_PACK_PATTERNS = tuple(re.compile(p) for p in (
    r'(\d+)\s*(?:pack|count|ct|piece|pcs|pc)\b',
    r'pack\s+of\s+(\d+)',
    r'(\d+)\s*-\s*pack',
    r'case\s+of\s+(\d+)',
    # "x 12" / "6x12" / "6 × 12". The look-behind rejects an x that is the last
    # letter of a word ("box 12", "max 500"), which is not a multiplier.
    r'(?<![^\W\d_])[x×]\s*(\d+)',
    r'set\s+of\s+(\d+)',
))

# (unit regex, factor) pairs, tried in order. The factors convert to
# millilitres for volume units and grams for weight units. Every unit ends on
# a word boundary so that "5 lb" or "2 large" is not read as litres and
# "5 gallon" is not read as grams.
_VOLUME_PATTERNS = tuple(
    (re.compile(_NUMBER + r'\s*' + unit), factor)
    for unit, factor in (
        (r'(?:ml|milliliters?|millilitres?)\b', 1.0),
        (r'(?:l|liters?|litres?)\b', 1000.0),
        (r'(?:fl\.?\s*oz|fluid\s*ounces?)\b', 29.5735),
        (r'(?:oz|ounces?)\b(?!\s*fl)', 28.3495),
        (r'(?:g|grams?)\b', 1.0),
        (r'(?:kgs?|kilograms?)\b', 1000.0),
        (r'(?:lbs?|pounds?)\b', 453.592),
    )
)

# "Value: <number>" / "Unit: <word>" fields (matched case-sensitively on the
# raw text). The value pattern only accepts a well-formed number so float()
# can never raise on inputs such as "Value: 1.2.3" or "Value: .".
_IPQ_VALUE_RE = re.compile(r'Value:\s*(\d+(?:\.\d+)?|\.\d+)')
_IPQ_UNIT_RE = re.compile(r'Unit:\s*(\w+)')

# The word "value" used as a marketing term, i.e. not the "Value:" field label.
_VALUE_WORD_RE = re.compile(r'value(?!\s*:)')

# Binary category flags: feature name -> substrings that switch it on.
_CATEGORY_KEYWORDS = (
    ('is_food', ('food', 'snack', 'drink', 'beverage', 'nutrition', 'candy', 'chocolate', 'meal')),
    ('is_health', ('vitamin', 'supplement', 'health', 'wellness', 'medicine', 'protein')),
    ('is_beauty', ('beauty', 'skincare', 'lotion', 'cream', 'cosmetic', 'shampoo', 'soap')),
    ('is_household', ('cleaner', 'detergent', 'paper', 'tissue', 'towel', 'trash', 'laundry')),
    ('is_baby', ('baby', 'infant', 'diaper', 'wipes', 'formula')),
    ('is_organic', ('organic', 'natural', 'non-gmo', 'gluten-free')),
)


def extract_comprehensive_features(text):
    """Extract hand-crafted features from one catalog text.

    Args:
        text: the raw catalog string (must be a ``str``; pass ``''`` for
            missing values).

    Returns:
        A dict with 42 entries in a fixed order: pack quantity, volume/weight,
        the ``Value:``/``Unit:`` fields, text statistics, statistics of the
        numbers in the text, keyword flags and a few interaction terms. All
        values are numeric except ``'unit'``, which is a string
        (``'UNKNOWN'`` when no ``Unit:`` field is found).
    """
    text_lower = text.lower()
    features = {}

    # ========== QUANTITY PATTERNS ==========
    pack_qty = 1.0
    for pattern in _PACK_PATTERNS:
        match = pattern.search(text_lower)
        if match:
            pack_qty = float(match.group(1))
            break

    features['pack_quantity'] = pack_qty
    features['log_pack_qty'] = np.log1p(pack_qty)

    # ========== VOLUME/WEIGHT PATTERNS ==========
    volume_value = 0.0
    for pattern, multiplier in _VOLUME_PATTERNS:
        match = pattern.search(text_lower)
        if match:
            volume_value = float(match.group(1)) * multiplier
            break

    features['volume_value'] = volume_value
    features['log_volume'] = np.log1p(volume_value)
    features['total_volume'] = volume_value * pack_qty
    features['log_total_volume'] = np.log1p(volume_value * pack_qty)

    # ========== IPQ EXTRACTION ==========
    ipq_match = _IPQ_VALUE_RE.search(text)
    ipq_val = float(ipq_match.group(1)) if ipq_match else 1.0
    features['ipq_value'] = ipq_val
    features['log_ipq'] = np.log1p(ipq_val)

    unit_match = _IPQ_UNIT_RE.search(text)
    features['unit'] = unit_match.group(1) if unit_match else 'UNKNOWN'

    # ========== TEXT STATISTICS ==========
    words = text.split()
    features['char_count'] = len(text)
    features['word_count'] = len(words)
    features['line_count'] = len(text.split('\n'))
    features['digit_count'] = sum(1 for c in text if c.isdigit())
    features['upper_count'] = sum(1 for c in text if c.isupper())
    features['comma_count'] = text.count(',')
    features['newline_count'] = text.count('\n')
    features['avg_word_len'] = np.mean([len(w) for w in words]) if words else 0

    # ========== NUMERIC PATTERNS ==========
    numbers = re.findall(r'\d+\.?\d*', text)
    if numbers:
        nums = [float(n) for n in numbers]
        features['num_count'] = len(nums)
        features['num_max'] = max(nums)
        features['num_min'] = min(nums)
        features['num_mean'] = np.mean(nums)
        features['num_sum'] = sum(nums)
        features['num_std'] = np.std(nums) if len(nums) > 1 else 0
        features['num_range'] = max(nums) - min(nums)
        # Skewness is undefined for constant data (SciPy may return NaN there),
        # so report 0 instead of leaking a NaN into the feature matrix.
        if len(nums) > 2 and features['num_std'] > 0:
            features['num_skew'] = float(skew(nums))
        else:
            features['num_skew'] = 0
    else:
        features.update({
            'num_count': 0, 'num_max': 0, 'num_min': 0,
            'num_mean': 0, 'num_sum': 0, 'num_std': 0,
            'num_range': 0, 'num_skew': 0
        })

    # ========== CATEGORY INDICATORS ==========
    for flag_name, keywords in _CATEGORY_KEYWORDS:
        features[flag_name] = int(any(kw in text_lower for kw in keywords))

    # ========== PRICE INDICATORS ==========
    # 'value' must not fire on the "Value:" field label parsed above, otherwise
    # the flag is switched on by the label rather than by the product wording.
    features['has_value'] = int(
        _VALUE_WORD_RE.search(text_lower) is not None
        or any(kw in text_lower for kw in ('economy', 'saver'))
    )
    features['has_premium'] = int(any(kw in text_lower for kw in ['premium', 'deluxe', 'gourmet', 'luxury']))
    features['has_bulk'] = int(any(kw in text_lower for kw in ['bulk', 'wholesale', 'family size']))

    # ========== INTERACTION FEATURES ==========
    features['value_per_pack'] = ipq_val / max(pack_qty, 1)
    features['total_quantity'] = ipq_val * pack_qty
    features['log_total_qty'] = np.log1p(ipq_val * pack_qty)
    features['words_per_line'] = features['word_count'] / max(features['line_count'], 1)
    features['density'] = features['char_count'] / max(features['word_count'], 1)
    features['digit_ratio'] = features['digit_count'] / max(features['char_count'], 1)
    features['upper_ratio'] = features['upper_count'] / max(features['char_count'], 1)

    # Volume per pack (0 when either quantity is unknown/zero)
    if volume_value > 0 and pack_qty > 0:
        features['volume_per_pack'] = volume_value / pack_qty
    else:
        features['volume_per_pack'] = 0

    return features

# ==================== LOAD DATA ====================

print("\n" + "=" * 80, "LOADING DATA", "=" * 80, sep="\n")

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# ==================== OUTLIER REMOVAL ====================

print("\n" + "=" * 80, "HANDLING OUTLIERS", "=" * 80, sep="\n")

# Tukey-style fences, but built from the 1st and 99th percentiles of price
# (despite the Q1/Q3 names these are not quartiles), so only far-out rows go.
original_len = len(train_df)
Q1 = train_df['price'].quantile(0.01)
Q3 = train_df['price'].quantile(0.99)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep rows whose price lies inside the fences (both ends inclusive) and
# renumber the index from 0.
price_in_range = train_df['price'].between(lower_bound, upper_bound)
train_df = train_df.loc[price_in_range].reset_index(drop=True)

print(f"Outliers removed: {original_len} → {len(train_df)} ({original_len - len(train_df)} removed)")

# Regression target: log(1 + price).
train_df['log_price'] = np.log1p(train_df['price'])

# Price deciles (fewer when bin edges coincide), used to stratify the CV folds.
train_df['price_bucket'] = pd.qcut(train_df['price'], q=10, labels=False, duplicates='drop')

# ==================== LOAD EMBEDDINGS ====================

print("\n" + "=" * 80)
print("LOADING EMBEDDINGS")
print("=" * 80)


def align_embeddings_to_ids(embeddings, emb_ids, target_ids, fill='mean'):
    """Reorder embedding rows so that row i belongs to ``target_ids[i]``.

    A boolean ``np.isin`` mask only selects rows and keeps them in the
    embedding file's order; this function looks every target id up so the
    result follows the order of ``target_ids`` exactly.

    Args:
        embeddings: 2-D array-like, one row per entry of ``emb_ids``.
        emb_ids: ids stored with the embeddings (first occurrence wins when an
            id is duplicated).
        target_ids: ids in the order the output rows must follow.
        fill: ``'mean'`` fills rows of ids that have no embedding with the mean
            of the matched rows; any other value fills them with zeros.

    Returns:
        ``(aligned, n_missing)``: an array with ``len(target_ids)`` rows and
        the number of target ids that had no embedding.

    Raises:
        ValueError: if ``emb_ids`` and ``embeddings`` differ in length.
    """
    embeddings = np.asarray(embeddings)
    emb_ids = np.asarray(emb_ids).ravel()
    target_ids = np.asarray(target_ids).ravel()
    if len(emb_ids) != len(embeddings):
        raise ValueError(
            f"{len(emb_ids)} ids for {len(embeddings)} embedding rows")

    # Ids stored as text on one side and integers on the other are compared by
    # their string form.
    if emb_ids.dtype.kind != target_ids.dtype.kind:
        emb_ids = emb_ids.astype(str)
        target_ids = target_ids.astype(str)

    first_row = {}
    for row, sample_id in enumerate(emb_ids.tolist()):
        first_row.setdefault(sample_id, row)

    rows = np.array([first_row.get(sample_id, -1) for sample_id in target_ids.tolist()],
                    dtype=np.int64)
    found = rows >= 0

    if fill == 'mean' and found.any():
        fill_row = embeddings[rows[found]].mean(axis=0)
    else:
        fill_row = np.zeros(embeddings.shape[1], dtype=embeddings.dtype)

    aligned = np.repeat(fill_row[None, :], len(rows), axis=0)
    aligned[found] = embeddings[rows[found]]
    return aligned, int((~found).sum())


def resize_embeddings_without_ids(embeddings, n_rows, fill='mean'):
    """Length-only fallback for embedding files that carry no ids.

    Truncates to the first ``n_rows`` rows, or pads at the end with the mean
    row (``fill='mean'``) or zero rows (anything else). Row order is taken on
    trust because there is nothing to align against.
    """
    embeddings = np.asarray(embeddings)
    if len(embeddings) >= n_rows:
        return embeddings[:n_rows]
    if fill == 'mean':
        pad_row = np.mean(embeddings, axis=0, keepdims=True)
    else:
        pad_row = np.zeros((1, embeddings.shape[1]))
    padding = np.repeat(pad_row, n_rows - len(embeddings), axis=0)
    return np.vstack([embeddings, padding])


X_train_img = None
X_test_img = None
X_train_text_emb = None
X_test_text_emb = None

# SECURITY NOTE: pickle.load can execute arbitrary code; only load embedding
# files that come from a source you trust.

# Load image embeddings
try:
    print("Loading image embeddings...")
    with open(TRAIN_IMG_EMB_PKL, 'rb') as f:
        train_img_data = pickle.load(f)
    with open(TEST_IMG_EMB_PKL, 'rb') as f:
        test_img_data = pickle.load(f)

    X_train_img_raw = np.asarray(train_img_data['embeddings'])
    X_test_img_raw = np.asarray(test_img_data['embeddings'])
    train_sample_ids = train_df['sample_id'].values
    test_sample_ids = test_df['sample_id'].values

    if 'ids' in train_img_data:
        train_img_aligned, n_missing = align_embeddings_to_ids(
            X_train_img_raw, train_img_data['ids'], train_sample_ids, fill='mean')
        if n_missing:
            print(f"  {n_missing} train rows have no image embedding (filled with the mean embedding)")
    else:
        if len(X_train_img_raw) != len(train_df):
            print("  No ids stored with the train image embeddings; row alignment cannot be verified")
        train_img_aligned = resize_embeddings_without_ids(
            X_train_img_raw, len(train_df), fill='mean')

    # Align the test split the same way when the file carries ids.
    if 'ids' in test_img_data and test_img_data['ids'] is not None:
        test_img_aligned, n_missing = align_embeddings_to_ids(
            X_test_img_raw, test_img_data['ids'], test_sample_ids, fill='mean')
        if n_missing:
            print(f"  {n_missing} test rows have no image embedding (filled with the mean embedding)")
    else:
        test_img_aligned = X_test_img_raw
    if len(test_img_aligned) != len(test_df):
        raise ValueError(
            f"test image embeddings have {len(test_img_aligned)} rows, expected {len(test_df)}")

    # Publish both splits only after everything validated, so a failure can
    # never leave one split set and the other missing.
    X_train_img, X_test_img = train_img_aligned, test_img_aligned
    print(f"✓ Image embeddings loaded: Train {X_train_img.shape}, Test {X_test_img.shape}")
except Exception as e:
    # Best effort: the pipeline continues without this modality.
    print(f"⚠️ Image embeddings not loaded: {e}")

# Load text embeddings
try:
    print("Loading text embeddings...")
    with open(TEXT_EMB_PKL, 'rb') as f:
        text_emb_data = pickle.load(f)

    X_train_text_emb_raw = np.asarray(text_emb_data['train_embeddings'])
    X_test_text_emb_raw = np.asarray(text_emb_data['test_embeddings'])
    train_sample_ids = train_df['sample_id'].values
    test_sample_ids = test_df['sample_id'].values
    text_metadata = text_emb_data['metadata'] if 'metadata' in text_emb_data else {}

    if 'train_sample_ids' in text_metadata:
        train_text_aligned, n_missing = align_embeddings_to_ids(
            X_train_text_emb_raw, text_metadata['train_sample_ids'], train_sample_ids, fill='zeros')
        if n_missing:
            print(f"  {n_missing} train rows have no text embedding (filled with zeros)")
    else:
        if len(X_train_text_emb_raw) != len(train_df):
            print("  No ids stored with the train text embeddings; row alignment cannot be verified")
        train_text_aligned = resize_embeddings_without_ids(
            X_train_text_emb_raw, len(train_df), fill='zeros')

    # NOTE(review): 'test_sample_ids' is assumed by symmetry with
    # 'train_sample_ids'; when the key is absent the test rows are used as stored.
    if 'test_sample_ids' in text_metadata and text_metadata['test_sample_ids'] is not None:
        test_text_aligned, n_missing = align_embeddings_to_ids(
            X_test_text_emb_raw, text_metadata['test_sample_ids'], test_sample_ids, fill='zeros')
        if n_missing:
            print(f"  {n_missing} test rows have no text embedding (filled with zeros)")
    else:
        test_text_aligned = X_test_text_emb_raw
    if len(test_text_aligned) != len(test_df):
        raise ValueError(
            f"test text embeddings have {len(test_text_aligned)} rows, expected {len(test_df)}")

    X_train_text_emb, X_test_text_emb = train_text_aligned, test_text_aligned
    print(f"✓ Text embeddings loaded: Train {X_train_text_emb.shape}, Test {X_test_text_emb.shape}")
except Exception as e:
    # Best effort: the pipeline continues without this modality.
    print(f"⚠️ Text embeddings not loaded: {e}")

# ==================== FEATURE ENGINEERING ====================

print("\n" + "=" * 80)
print("COMPREHENSIVE FEATURE ENGINEERING")
print("=" * 80)

print("Extracting comprehensive features...")
# Missing descriptions become '' and everything is coerced to str, so the text
# helpers never receive NaN or a non-string value.
train_raw_text = train_df['catalog_content'].fillna('').astype(str)
test_raw_text = test_df['catalog_content'].fillna('').astype(str)

train_df['clean_text'] = train_raw_text.apply(clean_text)
test_df['clean_text'] = test_raw_text.apply(clean_text)

# Build each feature frame once from a list of dicts. Wrapping every row in its
# own pd.Series inside .apply() yields the same columns but is far slower on
# tens of thousands of rows.
train_features = pd.DataFrame(
    [extract_comprehensive_features(text) for text in train_raw_text],
    index=train_df.index,
)
test_features = pd.DataFrame(
    [extract_comprehensive_features(text) for text in test_raw_text],
    index=test_df.index,
)

print(f"✓ Extracted {len(train_features.columns)} engineered features")

# TF-IDF with optimized parameters (vocabulary is fitted on the training text only)
print("Creating TF-IDF features...")
tfidf = TfidfVectorizer(
    max_features=20000,  # Increased
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.90,
    sublinear_tf=True,
    strip_accents='unicode'
)

X_tfidf_train = tfidf.fit_transform(train_df['clean_text'])
X_tfidf_test = tfidf.transform(test_df['clean_text'])

# SVD reduction of the sparse TF-IDF matrix to 200 dense components
print("Applying SVD dimensionality reduction...")
svd = TruncatedSVD(n_components=200, random_state=42)  # Increased
X_svd_train = svd.fit_transform(X_tfidf_train)
X_svd_test = svd.transform(X_tfidf_test)

print(f"✓ TF-IDF → SVD: {X_tfidf_train.shape} → {X_svd_train.shape}")

# ==================== EMBEDDING CLUSTERING ====================

print("\n" + "=" * 80, "CREATING EMBEDDING-BASED CLUSTERS", "=" * 80, sep="\n")

# Each available embedding modality is clustered with K-Means fitted on the
# training rows; the cluster ids are then one-hot encoded by indexing an
# identity matrix. A modality that was not loaded gets None for both splits.

if X_train_text_emb is None:
    train_text_cluster_onehot = None
    test_text_cluster_onehot = None
else:
    print("Clustering text embeddings...")
    n_clusters = 50
    kmeans_text = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    train_text_clusters = kmeans_text.fit_predict(X_train_text_emb)
    test_text_clusters = kmeans_text.predict(X_test_text_emb)

    # Row k of the identity matrix is the one-hot vector for cluster k.
    train_text_cluster_onehot = np.eye(n_clusters)[train_text_clusters]
    test_text_cluster_onehot = np.eye(n_clusters)[test_text_clusters]
    print(f"✓ Text embedding clusters: {n_clusters}")

if X_train_img is None:
    train_img_cluster_onehot = None
    test_img_cluster_onehot = None
else:
    print("Clustering image embeddings...")
    n_clusters = 30
    kmeans_img = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    train_img_clusters = kmeans_img.fit_predict(X_train_img)
    test_img_clusters = kmeans_img.predict(X_test_img)

    train_img_cluster_onehot = np.eye(n_clusters)[train_img_clusters]
    test_img_cluster_onehot = np.eye(n_clusters)[test_img_clusters]
    print(f"✓ Image embedding clusters: {n_clusters}")

# ==================== COMBINE ALL FEATURES ====================

print("\n" + "=" * 80, "COMBINING FEATURES", "=" * 80, sep="\n")

# Every engineered column except 'unit' goes into the numeric matrix; missing
# values are replaced by 0 first.
numeric_cols = [name for name in train_features.columns if name != 'unit']
X_numeric_train = train_features[numeric_cols].fillna(0).values
X_numeric_test = test_features[numeric_cols].fillna(0).values

# Map each numeric feature onto a normal distribution via its quantiles; the
# transformer is fitted on the training rows and reused for the test rows.
scaler = QuantileTransformer(n_quantiles=1000, output_distribution='normal', random_state=42)
X_numeric_train_scaled = scaler.fit_transform(X_numeric_train)
X_numeric_test_scaled = scaler.transform(X_numeric_test)

# Feature groups are collected in parallel lists (train / test / label) and
# concatenated column-wise at the end.
feature_list_train = [X_svd_train, X_numeric_train_scaled]
feature_list_test = [X_svd_test, X_numeric_test_scaled]
feature_names = ["TF-IDF+SVD", "Numeric"]

if X_train_img is not None:
    # Image embeddings reduced to 100 principal components.
    pca_img = PCA(n_components=100, random_state=42)  # Increased
    X_img_reduced_train = pca_img.fit_transform(X_train_img)
    X_img_reduced_test = pca_img.transform(X_test_img)
    feature_list_train += [X_img_reduced_train]
    feature_list_test += [X_img_reduced_test]
    feature_names += ["Image(PCA)"]

    if train_img_cluster_onehot is not None:
        feature_list_train += [train_img_cluster_onehot]
        feature_list_test += [test_img_cluster_onehot]
        feature_names += ["ImgClusters"]

if X_train_text_emb is not None:
    # Text embeddings reduced to 100 principal components.
    pca_text = PCA(n_components=100, random_state=42)  # Increased
    X_text_reduced_train = pca_text.fit_transform(X_train_text_emb)
    X_text_reduced_test = pca_text.transform(X_test_text_emb)
    feature_list_train += [X_text_reduced_train]
    feature_list_test += [X_text_reduced_test]
    feature_names += ["TextEmb(PCA)"]

    if train_text_cluster_onehot is not None:
        feature_list_train += [train_text_cluster_onehot]
        feature_list_test += [test_text_cluster_onehot]
        feature_names += ["TextClusters"]

X_train_combined = np.hstack(feature_list_train)
X_test_combined = np.hstack(feature_list_test)

print(f"Feature groups: {' + '.join(feature_names)}")
print(f"Final shape: Train {X_train_combined.shape}, Test {X_test_combined.shape}")

# Target (log1p of price) and the price buckets used for stratified splitting.
y_train = train_df['log_price'].values
price_buckets = train_df['price_bucket'].values

# ==================== ENSEMBLE TRAINING ====================

print("\n" + "=" * 80)
print("TRAINING ENHANCED ENSEMBLE")
print("=" * 80)

# Number of CV folds and the early-stopping patience shared by both boosters.
N_FOLDS = 7
EARLY_STOPPING_ROUNDS = 150

# Folds are stratified on the price buckets so every fold sees the full price range
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)  # Increased folds

# Out-of-fold predictions (log-price space), one slot per training row
lgb_oof = np.zeros(len(X_train_combined))
xgb_oof = np.zeros(len(X_train_combined))
ridge_oof = np.zeros(len(X_train_combined))

# One test-set prediction vector per fold and model, averaged after the loop
lgb_test_preds = []
xgb_test_preds = []
ridge_test_preds = []

# Enhanced LightGBM parameters
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'n_estimators': 5000,  # Increased
    'learning_rate': 0.02,  # Decreased for more iterations
    'max_depth': 10,  # Increased
    'num_leaves': 255,  # Increased
    'min_child_samples': 10,  # Decreased
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.05,
    'reg_lambda': 0.05,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1,
    'extra_trees': True  # Added for regularization
}

# Enhanced XGBoost parameters
xgb_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'n_estimators': 4000,  # Increased (upper bound; early stopping usually ends sooner)
    'learning_rate': 0.02,  # Decreased
    'max_depth': 9,  # Increased
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.05,
    'reg_lambda': 0.05,
    'gamma': 0.1,  # Added
    'random_state': 42,
    # Without this the eval_set passed to fit() is only monitored and all 4000
    # trees are built in every fold. Needs XGBoost >= 1.6, where
    # early_stopping_rounds is a constructor argument.
    'early_stopping_rounds': EARLY_STOPPING_ROUNDS
}

# Ridge regression parameters
ridge_params = {
    'alpha': 10.0,
    'random_state': 42
}

print(f"\nTraining {N_FOLDS}-Fold Stratified Cross-Validation Ensemble...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_combined, price_buckets)):
    print(f"\nFold {fold + 1}/{N_FOLDS}")

    X_tr, X_val = X_train_combined[train_idx], X_train_combined[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    # Train LightGBM (stops once validation RMSE has not improved for the patience window)
    print("  Training LightGBM...")
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)]
    )
    lgb_oof[val_idx] = lgb_model.predict(X_val)
    lgb_test_preds.append(lgb_model.predict(X_test_combined))

    # Train XGBoost (early stopping is configured through xgb_params)
    print("  Training XGBoost...")
    xgb_model = xgb.XGBRegressor(**xgb_params)
    xgb_model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    xgb_oof[val_idx] = xgb_model.predict(X_val)
    xgb_test_preds.append(xgb_model.predict(X_test_combined))

    # Train Ridge
    print("  Training Ridge...")
    ridge_model = Ridge(**ridge_params)
    ridge_model.fit(X_tr, y_tr)
    ridge_oof[val_idx] = ridge_model.predict(X_val)
    ridge_test_preds.append(ridge_model.predict(X_test_combined))

    # Fold metrics, computed in price space (predictions are in log1p space)
    lgb_fold_smape = smape(np.expm1(y_val), np.expm1(lgb_oof[val_idx]))
    xgb_fold_smape = smape(np.expm1(y_val), np.expm1(xgb_oof[val_idx]))
    ridge_fold_smape = smape(np.expm1(y_val), np.expm1(ridge_oof[val_idx]))

    print(f"  LGB: {lgb_fold_smape:.4%}, XGB: {xgb_fold_smape:.4%}, Ridge: {ridge_fold_smape:.4%}")

# Average test predictions across folds
lgb_test_pred = np.mean(lgb_test_preds, axis=0)
xgb_test_pred = np.mean(xgb_test_preds, axis=0)
ridge_test_pred = np.mean(ridge_test_preds, axis=0)

# ==================== STACKING ====================

print("\n" + "=" * 80, "STACKING LAYER", "=" * 80, sep="\n")

# Level-2 inputs: one column per base model, holding its out-of-fold
# predictions for training and its fold-averaged predictions for test.
X_stack_train = np.column_stack([lgb_oof, xgb_oof, ridge_oof])
X_stack_test = np.column_stack([lgb_test_pred, xgb_test_pred, ridge_test_pred])

# Ridge meta-learner that blends the three base models in log-price space.
print("Training meta-learner (Ridge)...")
meta_model = Ridge(alpha=1.0, random_state=42)
meta_model.fit(X_stack_train, y_train)

final_test_pred = meta_model.predict(X_stack_test)
final_oof = meta_model.predict(X_stack_train)

# NOTE(review): this score is computed on the same rows the meta-learner was
# fitted on, so it is in-sample with respect to the meta-learner.
stack_smape = smape(np.expm1(y_train), np.expm1(final_oof))
print(f"✓ Stacked OOF SMAPE: {stack_smape:.4%}")

# Blend coefficients, in the column order LGB, XGB, Ridge.
print(f"Meta-model weights: LGB={meta_model.coef_[0]:.3f}, XGB={meta_model.coef_[1]:.3f}, Ridge={meta_model.coef_[2]:.3f}")

# ==================== GENERATE SUBMISSION ====================

print("\n" + "=" * 80, "GENERATING SUBMISSION", "=" * 80, sep="\n")

# Invert the log1p target transform and floor the predicted prices at 0.01.
final_test_pred = np.maximum(np.expm1(final_test_pred), 0.01)

submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_test_pred
})

# Write the submission CSV without the index column.
output_file = os.path.join(OUTPUT_PATH, 'submission_v2.csv')
submission.to_csv(output_file, index=False)

# Summary of what was written.
print(f"\n✓ Submission saved to '{output_file}'")
print(f"  Rows: {len(submission)}")
print(f"  Price range: ${submission['price'].min():.2f} - ${submission['price'].max():.2f}")
print(f"  Mean: ${submission['price'].mean():.2f}, Median: ${submission['price'].median():.2f}")

print("\n" + "=" * 80, f"FINAL OOF SMAPE: {stack_smape:.4%}", "=" * 80, sep="\n")

print("\nFirst 20 predictions:")
print(submission.head(20))

ENHANCED PRICE PREDICTION V2 - TARGET <45% SMAPE

LOADING DATA
Train shape: (75000, 4)
Test shape: (75000, 3)

HANDLING OUTLIERS
Outliers removed: 75000 → 74943 (57 removed)

LOADING EMBEDDINGS
Loading image embeddings...
✓ Image embeddings loaded: Train (74943, 512), Test (75000, 512)
Loading text embeddings...
✓ Text embeddings loaded: Train (74943, 384), Test (75000, 384)

COMPREHENSIVE FEATURE ENGINEERING
Extracting comprehensive features...
✓ Extracted 42 engineered features
Creating TF-IDF features...
Applying SVD dimensionality reduction...
✓ TF-IDF → SVD: (74943, 20000) → (74943, 200)

CREATING EMBEDDING-BASED CLUSTERS
Clustering text embeddings...
✓ Text embedding clusters: 50
Clustering image embeddings...
✓ Image embedding clusters: 30

COMBINING FEATURES
Feature groups: TF-IDF+SVD + Numeric + Image(PCA) + ImgClusters + TextEmb(PCA) + TextClusters
Final shape: Train (74943, 521), Test (75000, 521)

TRAINING ENHANCED ENSEMBLE

Training 7-Fold Stratified Cross-Validation Ensem