# Smart Product Pricing - Enhanced Solution with CLIP
## Advanced Multimodal Price Prediction

**Key Enhancements:**
- CLIP-ViT-B/32 for joint image-text embeddings
- GroupKFold by brand to prevent data leakage
- Advanced post-processing with per-brand bias correction
- Dual modeling: Neural Network + LightGBM ensemble
- Box-Cox and log1p target transformations

## 1. Setup and Configuration

In [None]:
# Install the notebook's dependencies (CLIP itself is loaded through `transformers`)
!pip install -q torch torchvision sentence-transformers pillow pandas numpy scikit-learn lightgbm tqdm requests transformers

In [None]:
import pandas as pd
import numpy as np
import re
import os
from pathlib import Path
import warnings
import hashlib
warnings.filterwarnings('ignore')

# ML Libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy import stats
import lightgbm as lgb
from tqdm.auto import tqdm

# Seed every RNG the notebook relies on so repeated runs are comparable
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Choose the compute device first; CUDA generators are seeded only when CUDA is used
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    torch.cuda.manual_seed_all(SEED)
print(f"Using device: {device}")

In [None]:
# Enhanced Configuration: input/output locations
DATA_DIR = Path('dataset')
IMAGE_DIR = Path('images')
IMAGE_DIR.mkdir(exist_ok=True)

# All tunables live in one dict; key order is the order they are printed below.
CONFIG = dict(
    # --- CLIP ---
    use_clip=True,
    clip_model='openai/clip-vit-base-patch32',  # CLIP-ViT-B/32
    freeze_clip=True,  # keep CLIP weights frozen for efficiency

    # --- Alternative text model (if the CLIP text encoder is not used) ---
    text_model='sentence-transformers/all-MiniLM-L6-v2',

    # --- Training ---
    batch_size=64,
    epochs=30,
    learning_rate=0.001,
    weight_decay=1e-5,
    n_folds=5,
    early_stopping_rounds=5,

    # --- Target transformation: 'log1p' or 'boxcox' ---
    target_transform='log1p',

    # --- Post-processing ---
    use_brand_calibration=True,
    use_median_scaling=True,
    min_price=0.01,

    # --- Ensemble ---
    use_lgb=True,
    nn_weight=0.7,
    lgb_weight=0.3,
)

# Echo the effective settings so the run log is self-describing
print("Configuration:")
for k in CONFIG:
    v = CONFIG[k]
    print(f"  {k}: {v}")

## 2. Data Loading and Group Creation

In [None]:
# Read the train and test tables from DATA_DIR
print("Loading datasets...")
train_df, test_df = (
    pd.read_csv(DATA_DIR / csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Quick sanity output: table sizes and the distribution of the target column
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nPrice statistics:")
print(train_df['price'].describe())

In [None]:
def create_groups(df, method='brand', n_hash_groups=1000):
    """
    Create group identifiers for GroupKFold to prevent data leakage.

    Args:
        df: DataFrame with a ``catalog_content`` column.
        method: ``'brand'`` groups rows by a brand string taken from the first
            three whitespace-separated words (cut at the first ',' or '-');
            ``'hash'`` groups rows by an MD5 hash of the first 50 characters.
        n_hash_groups: Number of buckets used by the ``'hash'`` method.

    Returns:
        Array of integer group identifiers, one per row of ``df``.

    Raises:
        ValueError: If ``method`` is not ``'brand'`` or ``'hash'``.
    """
    # Validate up front: an unknown method used to fall through both branches
    # and fail on the final ``return`` with an UnboundLocalError.
    if method not in ('brand', 'hash'):
        raise ValueError(f"Unknown grouping method {method!r}; expected 'brand' or 'hash'")

    def extract_brand(text):
        # Missing or empty text collapses into a single 'unknown' group
        if pd.isna(text):
            return 'unknown'
        words = str(text).split()[:3]
        brand = ' '.join(words).split(',')[0].split('-')[0].strip()
        return brand.lower() if brand else 'unknown'

    def hash_title(text):
        # Missing text is assigned to bucket 0
        if pd.isna(text):
            return 0
        # Take first 50 chars as title proxy
        title = str(text)[:50]
        # Hash, then take the modulo to limit the number of groups
        hash_val = int(hashlib.md5(title.encode()).hexdigest(), 16)
        return hash_val % n_hash_groups

    if method == 'brand':
        brands = df['catalog_content'].apply(extract_brand)

        # Encode brand strings as consecutive integers
        brand_encoder = LabelEncoder()
        groups = brand_encoder.fit_transform(brands)

        print(f"Created {len(np.unique(groups))} brand groups")
    else:
        groups = df['catalog_content'].apply(hash_title).values
        print(f"Created {len(np.unique(groups))} hash-based groups")

    return groups

# Build the GroupKFold group ids for the training rows
train_groups = create_groups(train_df, method='brand')

# Samples per group, counted once and reused for the three summary lines
group_sizes = np.bincount(train_groups)
print(f"\nGroup distribution:")
print(f"  Min samples per group: {group_sizes.min()}")
print(f"  Max samples per group: {group_sizes.max()}")
print(f"  Mean samples per group: {group_sizes.mean():.1f}")

## 3. Image Download (via `download_images_parallel` from the local `utils` module)

In [None]:
# Make the notebook's own directory importable, then pull in the download helper
import sys
sys.path.append('.')
from utils import download_images_parallel

# Fetch images for both splits into IMAGE_DIR/<split>
print("Downloading images...")
for split_name, split_df in (('train', train_df), ('test', test_df)):
    download_images_parallel(split_df, IMAGE_DIR / split_name, max_workers=20)

## 4. Enhanced Feature Engineering with Brand Extraction

In [None]:
class EnhancedStructuredFeatureExtractor:
    """Enhanced structured feature extraction with brand handling.

    Parses the free-text ``catalog_content`` column into numeric features
    (quantity, weight, text length, word count, keyword flags) plus a
    label-encoded brand. ``fit_transform`` learns the brand vocabulary and,
    when prices are given, per-brand price statistics; ``transform`` reuses
    that vocabulary and encodes unseen brands as -1.
    """

    # Quantity patterns, tried in order against the LOWER-CASED text; the first
    # match wins. They must therefore be written in lower case (the previous
    # upper-case 'IPQ' pattern could never match).
    _QUANTITY_PATTERNS = tuple(re.compile(p) for p in (
        r'pack of (\d+)',
        r'(\d+)[\s-]*pack',
        r'(\d+)[\s-]*count',
        r'(\d+)\s*x\s*\d+',
        r'ipq[:\s]*(\d+)',
        r'quantity[:\s]*(\d+)',
    ))

    # (pattern, multiplier) pairs converting the matched number to grams/ml;
    # tried in order against the lower-cased text, first match wins.
    _WEIGHT_PATTERNS = tuple((re.compile(p), m) for p, m in (
        (r'(\d+\.?\d*)\s*kg', 1000),
        (r'(\d+\.?\d*)\s*g\b', 1),
        (r'(\d+\.?\d*)\s*lb', 453.592),
        (r'(\d+\.?\d*)\s*oz', 28.3495),
        (r'(\d+\.?\d*)\s*ml', 1),
        (r'(\d+\.?\d*)\s*l\b', 1000),
    ))

    # Flag name -> substrings (lower case); the flag is 1 if any substring occurs.
    _FLAG_KEYWORDS = {
        'is_organic': ('organic',),
        'is_vegan': ('vegan',),
        'is_gluten_free': ('gluten free', 'gluten-free'),
        'is_combo': ('combo', 'bundle'),
        'is_refill': ('refill',),
        'is_premium': ('premium', 'deluxe', 'luxury'),
    }

    def __init__(self):
        self.brand_encoder = LabelEncoder()
        self.fitted = False
        self.brand_stats = {}  # Per-brand price statistics, filled by fit_transform

    def extract_quantity(self, text):
        """Return the first pack/count quantity found in ``text``, else 1.0."""
        if pd.isna(text):
            return 1.0

        text_lower = str(text).lower()
        for pattern in self._QUANTITY_PATTERNS:
            match = pattern.search(text_lower)
            if match:
                # The capture group is digits only, so float() cannot fail
                return float(match.group(1))
        return 1.0

    def extract_weight(self, text):
        """Return the first weight/volume found in ``text`` in grams/ml, else 0.0."""
        if pd.isna(text):
            return 0.0

        text_lower = str(text).lower()
        for pattern, multiplier in self._WEIGHT_PATTERNS:
            match = pattern.search(text_lower)
            if match:
                return float(match.group(1)) * multiplier
        return 0.0

    def extract_brand(self, text):
        """Return a lower-cased brand guess from the first three words of ``text``.

        The guess is cut at the first ',' or '-'; missing or empty text yields
        ``'unknown'``.
        """
        if pd.isna(text):
            return 'unknown'

        words = str(text).split()[:3]
        brand = ' '.join(words).split(',')[0].split('-')[0].strip()
        return brand.lower() if brand else 'unknown'

    def extract_binary_flags(self, text):
        """Return a dict of 0/1 keyword flags for ``text``.

        Missing text yields every flag set to 0. It previously yielded an empty
        dict, which turned into NaN flag columns for those rows.
        """
        text_lower = '' if pd.isna(text) else str(text).lower()
        return {
            flag: int(any(keyword in text_lower for keyword in keywords))
            for flag, keywords in self._FLAG_KEYWORDS.items()
        }

    def _text_features(self, df):
        """Build the quantity/weight/length columns shared by fit and transform."""
        content = df['catalog_content']
        filled = content.fillna('')
        return {
            'quantity': content.apply(self.extract_quantity),
            'weight': content.apply(self.extract_weight),
            'text_length': filled.apply(len),
            'word_count': filled.apply(lambda x: len(x.split())),
        }

    def _flag_features(self, df):
        """Build one column per binary keyword flag."""
        flags_df = pd.DataFrame(df['catalog_content'].apply(self.extract_binary_flags).tolist())
        return {col: flags_df[col].values for col in flags_df.columns}

    def fit_transform(self, df, prices=None):
        """Extract features, fit the brand encoder and compute brand statistics.

        Args:
            df: DataFrame with a ``catalog_content`` column.
            prices: Optional per-row prices; when given, mean/median/std/count
                per brand are stored in ``self.brand_stats``.

        Returns:
            DataFrame of features, including the raw ``brand_name`` column.
        """
        features = self._text_features(df)

        # Brand encoding
        brands = df['catalog_content'].apply(self.extract_brand)
        features['brand'] = self.brand_encoder.fit_transform(brands)
        features['brand_name'] = brands.values  # Keep for later

        # Compute per-brand price statistics (for calibration)
        if prices is not None:
            # Pair brands and prices by position so a differently indexed
            # ``prices`` cannot be misaligned
            brand_df = pd.DataFrame({
                'brand': brands.values,
                'price': np.asarray(prices),
            })
            self.brand_stats = brand_df.groupby('brand')['price'].agg(['mean', 'median', 'std', 'count']).to_dict('index')
            print(f"Computed statistics for {len(self.brand_stats)} brands")

        # Binary flags
        features.update(self._flag_features(df))

        self.fitted = True
        return pd.DataFrame(features)

    def transform(self, df):
        """Extract features for new data using the already fitted brand encoder.

        Brands not seen during ``fit_transform`` are encoded as -1.
        """
        features = self._text_features(df)

        # Build the lookup once instead of scanning classes_ and calling
        # LabelEncoder.transform for every row
        brand_to_code = {name: code for code, name in enumerate(self.brand_encoder.classes_)}
        brands = df['catalog_content'].apply(self.extract_brand)
        features['brand'] = [brand_to_code.get(b, -1) for b in brands]
        features['brand_name'] = brands.values

        # Binary flags
        features.update(self._flag_features(df))

        return pd.DataFrame(features)

# Fit the structured extractor on train (with prices for brand stats), reuse it on test
print("Extracting structured features...")
struct_extractor = EnhancedStructuredFeatureExtractor()
train_prices = train_df['price'].values
train_struct = struct_extractor.fit_transform(train_df, train_prices)
test_struct = struct_extractor.transform(test_df)

# 'brand_name' is kept in the frame but is not a model feature
print(f"Features: {[name for name in train_struct.columns if name != 'brand_name']}")

## 5. CLIP Feature Extraction

In [None]:
class CLIPFeatureExtractor:
    """Extract image and text embeddings with a (by default frozen) CLIP model."""

    def __init__(self, model_name='openai/clip-vit-base-patch32', freeze=True):
        """Load the CLIP model and processor and move the model to ``device``.

        Args:
            model_name: Hugging Face model id passed to ``from_pretrained``.
            freeze: When True, ``requires_grad`` is disabled on all parameters.
        """
        print(f"Loading CLIP model: {model_name}")
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)

        if freeze:
            # Freeze all CLIP parameters
            for param in self.model.parameters():
                param.requires_grad = False
            print("CLIP weights frozen")

        self.model.eval()
        self.model.to(device)

    @staticmethod
    def _load_image(img_path):
        """Return the image at ``img_path`` as RGB, or a black 224x224 placeholder."""
        try:
            # The context manager closes the underlying file deterministically
            with Image.open(img_path) as img:
                return img.convert('RGB')
        except Exception:
            # Best effort: a missing or corrupt image must not abort the run.
            # Unlike the previous bare ``except:``, this still lets
            # KeyboardInterrupt / SystemExit propagate.
            return Image.new('RGB', (224, 224), color='black')

    def extract_image_features(self, sample_ids, image_dir, batch_size=64):
        """Extract image features using the CLIP vision encoder.

        Args:
            sample_ids: Sequence of ids; each image is read from
                ``<image_dir>/<sample_id>.jpg``.
            image_dir: Directory containing the images.
            batch_size: Number of images encoded per forward pass.

        Returns:
            Array with one row of image features per sample id, in input order.
            Unreadable images are replaced by a black placeholder.
        """
        features_list = []
        image_root = Path(image_dir)

        with torch.no_grad():
            for i in tqdm(range(0, len(sample_ids), batch_size), desc="CLIP image features"):
                batch_ids = sample_ids[i:i+batch_size]
                batch_images = [
                    self._load_image(image_root / f"{sample_id}.jpg")
                    for sample_id in batch_ids
                ]

                # Process with CLIP
                inputs = self.processor(images=batch_images, return_tensors="pt", padding=True)
                inputs = {k: v.to(device) for k, v in inputs.items()}

                # Get image features
                image_features = self.model.get_image_features(**inputs)
                features_list.append(image_features.cpu().numpy())

        return np.vstack(features_list)

    def extract_text_features(self, texts, batch_size=64):
        """Extract text features using the CLIP text encoder.

        Missing texts are encoded as the empty string. Each text is cut to its
        first 500 characters and then truncated by the processor.

        Returns:
            Array with one row of text features per input text, in input order.
        """
        texts = [str(t) if not pd.isna(t) else '' for t in texts]
        features_list = []

        with torch.no_grad():
            for i in tqdm(range(0, len(texts), batch_size), desc="CLIP text features"):
                # Pre-cut long texts (CLIP has a 77 token limit; the processor's
                # truncation=True enforces the token limit itself)
                batch_texts = [t[:500] for t in texts[i:i+batch_size]]

                inputs = self.processor(text=batch_texts, return_tensors="pt", padding=True, truncation=True)
                inputs = {k: v.to(device) for k, v in inputs.items()}

                # Get text features
                text_features = self.model.get_text_features(**inputs)
                features_list.append(text_features.cpu().numpy())

        return np.vstack(features_list)

if CONFIG['use_clip']:
    print("\nExtracting CLIP features...")
    clip_extractor = CLIPFeatureExtractor(CONFIG['clip_model'], freeze=CONFIG['freeze_clip'])

    # Extract image features
    train_image = clip_extractor.extract_image_features(train_df['sample_id'].values, IMAGE_DIR / 'train')
    test_image = clip_extractor.extract_image_features(test_df['sample_id'].values, IMAGE_DIR / 'test')
    print(f"Image features: {train_image.shape}")

    # Extract text features
    train_text = clip_extractor.extract_text_features(train_df['catalog_content'].values)
    test_text = clip_extractor.extract_text_features(test_df['catalog_content'].values)
    print(f"Text features: {train_text.shape}")

    # NOTE: get_image_features / get_text_features return the projected
    # embeddings as-is; they are NOT L2-normalized at this point.
else:
    # The separate-encoder fallback is not implemented in this notebook. Fail
    # here rather than letting later cells hit a NameError on train_text /
    # train_image / test_text / test_image.
    raise NotImplementedError(
        "CONFIG['use_clip'] is False, but the separate text/image encoder "
        "fallback is not implemented; set CONFIG['use_clip'] = True."
    )

In [None]:
# Standardise the numeric structured features (fit on train only)
print("\nScaling structured features...")
struct_cols = [name for name in train_struct.columns if name != 'brand_name']
struct_scaler = StandardScaler()
struct_scaler.fit(train_struct[struct_cols])
train_struct_scaled = struct_scaler.transform(train_struct[struct_cols])
test_struct_scaled = struct_scaler.transform(test_struct[struct_cols])

print(f"Structured features: {train_struct_scaled.shape}")

## 6. Enhanced Target Transformation

In [None]:
class EnhancedTargetTransformer:
    """Enhanced target transformation with Box-Cox option.

    Supported methods: ``'log1p'`` (inverse ``expm1``) and ``'boxcox'`` (lambda
    is fitted in ``fit_transform``). Any other method is a pass-through.
    """

    def __init__(self, method='log1p'):
        self.method = method
        self.boxcox_lambda = None  # Set by fit_transform when method == 'boxcox'
        self.shift = 1e-6  # Added before Box-Cox, which needs strictly positive data

    def fit_transform(self, y):
        """Fit the transformation (Box-Cox only) and return the transformed target."""
        if self.method == 'log1p':
            return np.log1p(y)
        elif self.method == 'boxcox':
            y_shifted = np.asarray(y, dtype=float) + self.shift
            # Find optimal lambda
            transformed, self.boxcox_lambda = stats.boxcox(y_shifted)
            print(f"Box-Cox lambda: {self.boxcox_lambda:.4f}")
            return transformed
        else:
            return y

    def inverse_transform(self, y_transformed):
        """Map transformed values back to the original scale.

        Raises:
            ValueError: If the method is ``'boxcox'`` and ``fit_transform`` has
                not been called yet (no lambda available).
        """
        if self.method == 'log1p':
            return np.expm1(y_transformed)
        elif self.method == 'boxcox':
            if self.boxcox_lambda is None:
                raise ValueError("Box-Cox lambda is not fitted; call fit_transform first")
            # inv_boxcox lives in scipy.special; scipy.stats has no such
            # function, so the previous stats.inv_boxcox call always failed.
            from scipy.special import inv_boxcox
            y_original = inv_boxcox(y_transformed, self.boxcox_lambda)
            return y_original - self.shift
        else:
            return y_transformed

# Move the training prices into the space the models are trained in
target_method = CONFIG['target_transform']
print(f"\nApplying {target_method} transformation...")
target_transformer = EnhancedTargetTransformer(method=target_method)
train_target = target_transformer.fit_transform(train_df['price'].values)

# Show the value range before and after the transformation
raw_prices = train_df['price']
print(f"Original range: [{raw_prices.min():.2f}, {raw_prices.max():.2f}]")
print(f"Transformed range: [{train_target.min():.2f}, {train_target.max():.2f}]")

## 7. Compact Multimodal Fusion MLP

In [None]:
class CompactMultimodalMLP(nn.Module):
    """Compact fusion MLP: concatenate text + image + structured features.

    Args:
        text_dim: Width of the text feature vector.
        image_dim: Width of the image feature vector.
        structured_dim: Width of the structured feature vector.
        hidden_dims: Sizes of the hidden layers, in order.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, text_dim, image_dim, structured_dim, hidden_dims=(256, 128, 64), dropout=0.3):
        super().__init__()

        # All features are concatenated directly into one input vector
        input_dim = text_dim + image_dim + structured_dim

        # Build MLP: Linear -> ReLU -> Dropout per hidden layer
        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = hidden_dim

        # Output layer: a single regression value per sample
        layers.append(nn.Linear(prev_dim, 1))

        self.mlp = nn.Sequential(*layers)

    def forward(self, text_features, image_features, structured_features):
        """Return one prediction per row as a 1-D tensor of shape (batch,)."""
        # Simple concatenation
        combined = torch.cat([text_features, image_features, structured_features], dim=1)
        output = self.mlp(combined)
        # Drop only the trailing size-1 dimension. A bare squeeze() also removed
        # the batch dimension for a batch of one, returning a 0-d tensor.
        return output.squeeze(-1)

## 8. Dataset and SMAPE Metric

In [None]:
class PriceDataset(Dataset):
    """Dataset over pre-computed text, image and structured feature arrays.

    Items are ``(text, image, structured)`` tuples, with the target appended
    as a fourth element when targets were supplied.
    """

    def __init__(self, text_features, image_features, structured_features, targets=None):
        self.text_features = torch.FloatTensor(text_features)
        self.image_features = torch.FloatTensor(image_features)
        self.structured_features = torch.FloatTensor(structured_features)
        # Targets are optional so the same class serves inference
        if targets is None:
            self.targets = None
        else:
            self.targets = torch.FloatTensor(targets)

    def __len__(self):
        # The text features define the number of samples
        return len(self.text_features)

    def __getitem__(self, idx):
        sample = (
            self.text_features[idx],
            self.image_features[idx],
            self.structured_features[idx],
        )
        if self.targets is None:
            return sample
        return sample + (self.targets[idx],)

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each absolute error is divided by the mean of ``|y_true|`` and
    ``|y_pred|``; a zero divisor is replaced by 1e-8.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    abs_error = np.abs(actual - forecast)
    scale = (np.abs(actual) + np.abs(forecast)) / 2.0
    # Guard the division where both values are exactly zero
    safe_scale = np.where(scale == 0, 1e-8, scale)
    return np.mean(abs_error / safe_scale) * 100

## 9. Training with GroupKFold

In [None]:
def train_nn_fold(model, train_loader, val_loader, epochs, lr, target_transformer, patience=5):
    """Train the neural network for one fold with early stopping on SMAPE.

    Args:
        model: Network called as ``model(text, image, structured)``.
        train_loader: Loader yielding ``(text, image, structured, target)`` batches.
        val_loader: Loader with the same batch layout, used for validation.
        epochs: Maximum number of epochs.
        lr: Learning rate for AdamW.
        target_transformer: Object whose ``inverse_transform`` maps model
            outputs and targets back to the original price scale.
        patience: Epochs without SMAPE improvement before stopping.

    Returns:
        ``(model, best_val_smape)`` with the best-epoch weights loaded.
    """
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=CONFIG['weight_decay'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

    best_val_smape = float('inf')
    patience_counter = 0
    best_state = None

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0

        for batch in train_loader:
            text_feat, image_feat, struct_feat, targets = batch
            text_feat = text_feat.to(device)
            image_feat = image_feat.to(device)
            struct_feat = struct_feat.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            # Flatten both sides so a batch of one (possibly a 0-d output)
            # still has matching 1-D shapes for the loss
            outputs = model(text_feat, image_feat, struct_feat).reshape(-1)
            loss = criterion(outputs, targets.reshape(-1))
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # max(..., 1) avoids a ZeroDivisionError on an empty loader
        train_loss /= max(len(train_loader), 1)

        # Validation
        model.eval()
        val_preds = []
        val_targets = []

        with torch.no_grad():
            for batch in val_loader:
                text_feat, image_feat, struct_feat, targets = batch
                text_feat = text_feat.to(device)
                image_feat = image_feat.to(device)
                struct_feat = struct_feat.to(device)

                outputs = model(text_feat, image_feat, struct_feat)
                # reshape(-1) keeps extend() working when the output is 0-d
                val_preds.extend(outputs.reshape(-1).cpu().numpy())
                val_targets.extend(targets.reshape(-1).numpy())

        # Calculate SMAPE on original scale
        val_preds_orig = target_transformer.inverse_transform(np.array(val_preds))
        val_targets_orig = target_transformer.inverse_transform(np.array(val_targets))
        val_smape = smape(val_targets_orig, val_preds_orig)

        scheduler.step(val_smape)

        if epoch % 5 == 0:
            print(f"  Epoch {epoch+1}/{epochs} - Loss: {train_loss:.4f} - SMAPE: {val_smape:.2f}%")

        # Early stopping
        if val_smape < best_val_smape:
            best_val_smape = val_smape
            # Clone every tensor: state_dict().copy() is a shallow copy whose
            # tensors alias the live parameters, so later epochs would
            # silently overwrite the "best" snapshot.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"  Early stopping at epoch {epoch+1}")
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_val_smape

In [None]:
# GroupKFold Cross-Validation
# For every fold this cell trains one neural network and, when
# CONFIG['use_lgb'] is set, one LightGBM model on the same split. The fitted
# models and their validation SMAPE are collected in the lists below, which
# the prediction and summary cells read.
print(f"\n{'='*60}")
print(f"GroupKFold Cross-Validation (n_splits={CONFIG['n_folds']})")
print(f"{'='*60}\n")

gkf = GroupKFold(n_splits=CONFIG['n_folds'])
nn_models = []   # one trained network per fold
nn_scores = []   # best validation SMAPE of each network
lgb_models = []  # one booster per fold (stays empty when use_lgb is False)
lgb_scores = []  # validation SMAPE of each booster

# Folds are numbered from 1; rows sharing a value in train_groups stay together
for fold, (train_idx, val_idx) in enumerate(gkf.split(train_df, groups=train_groups), 1):
    print(f"\n{'='*60}")
    print(f"Fold {fold}/{CONFIG['n_folds']}")
    print(f"{'='*60}")
    print(f"Train: {len(train_idx)} samples, Val: {len(val_idx)} samples")

    # Split data: slice every feature matrix and the transformed target by fold indices
    X_train_text = train_text[train_idx]
    X_val_text = train_text[val_idx]
    X_train_image = train_image[train_idx]
    X_val_image = train_image[val_idx]
    X_train_struct = train_struct_scaled[train_idx]
    X_val_struct = train_struct_scaled[val_idx]
    y_train = train_target[train_idx]
    y_val = train_target[val_idx]

    # === Neural Network ===
    print("\nTraining Neural Network...")
    train_dataset = PriceDataset(X_train_text, X_train_image, X_train_struct, y_train)
    val_dataset = PriceDataset(X_val_text, X_val_image, X_val_struct, y_val)
    # Only the training loader is shuffled
    train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

    # A fresh model per fold, sized from the feature matrices
    nn_model = CompactMultimodalMLP(
        text_dim=train_text.shape[1],
        image_dim=train_image.shape[1],
        structured_dim=train_struct_scaled.shape[1]
    ).to(device)

    # train_nn_fold returns the model and its best validation SMAPE
    nn_model, nn_smape = train_nn_fold(
        nn_model, train_loader, val_loader,
        epochs=CONFIG['epochs'],
        lr=CONFIG['learning_rate'],
        target_transformer=target_transformer,
        patience=CONFIG['early_stopping_rounds']
    )

    nn_models.append(nn_model)
    nn_scores.append(nn_smape)
    print(f"NN SMAPE: {nn_smape:.2f}%")

    # === LightGBM (optional) ===
    if CONFIG['use_lgb']:
        print("\nTraining LightGBM...")
        # LightGBM takes one flat matrix: text | image | structured, in that order.
        # The test-time matrix in the prediction cell uses the same column order.
        X_train_all = np.hstack([X_train_text, X_train_image, X_train_struct])
        X_val_all = np.hstack([X_val_text, X_val_image, X_val_struct])

        # Parameters are identical for every fold; only the data changes
        lgb_params = {
            'objective': 'regression',
            'metric': 'mse',
            'learning_rate': 0.05,
            'num_leaves': 31,
            'max_depth': -1,
            'min_data_in_leaf': 20,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1,
            'seed': SEED
        }

        train_data = lgb.Dataset(X_train_all, label=y_train)
        val_data = lgb.Dataset(X_val_all, label=y_val, reference=train_data)

        # Up to 1000 rounds, with early stopping (50 rounds) on the validation set
        lgb_model = lgb.train(
            lgb_params,
            train_data,
            num_boost_round=1000,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
        )

        # Validate LGB: predictions and targets are mapped back to the price
        # scale before SMAPE is computed
        lgb_preds = lgb_model.predict(X_val_all)
        lgb_preds_orig = target_transformer.inverse_transform(lgb_preds)
        y_val_orig = target_transformer.inverse_transform(y_val)
        lgb_smape = smape(y_val_orig, lgb_preds_orig)

        lgb_models.append(lgb_model)
        lgb_scores.append(lgb_smape)
        print(f"LGB SMAPE: {lgb_smape:.2f}%")

# Aggregate the per-fold scores
print(f"\n{'='*60}")
print("Cross-Validation Results")
print(f"{'='*60}")
print(f"NN Mean SMAPE: {np.mean(nn_scores):.2f}% (+/- {np.std(nn_scores):.2f}%)")
if CONFIG['use_lgb']:
    print(f"LGB Mean SMAPE: {np.mean(lgb_scores):.2f}% (+/- {np.std(lgb_scores):.2f}%)")

## 10. Generate Test Predictions

In [None]:
# Predict the test set with every fold model, average per model family,
# blend the two families, then map back to the price scale.
print("\nGenerating test predictions...")
test_dataset = PriceDataset(test_text, test_image, test_struct_scaled)
test_loader = DataLoader(test_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

# Neural Network predictions
nn_predictions_all = []
for fold_idx, model in enumerate(nn_models, 1):
    print(f"  NN Fold {fold_idx}...")
    model.eval()
    fold_preds = []

    with torch.no_grad():
        for batch in test_loader:
            text_feat, image_feat, struct_feat = batch
            text_feat = text_feat.to(device)
            image_feat = image_feat.to(device)
            struct_feat = struct_feat.to(device)

            outputs = model(text_feat, image_feat, struct_feat)
            # reshape(-1) guards against a 0-d output for a final batch of one
            # row, which would make extend() raise "iteration over a 0-d array"
            fold_preds.extend(outputs.reshape(-1).cpu().numpy())

    nn_predictions_all.append(fold_preds)

# Average across folds (still in the transformed target space)
nn_predictions = np.mean(nn_predictions_all, axis=0)

# LightGBM predictions
if CONFIG['use_lgb']:
    print("\nGenerating LightGBM predictions...")
    # Same column order as during training: text | image | structured
    X_test_all = np.hstack([test_text, test_image, test_struct_scaled])
    lgb_predictions_all = [model.predict(X_test_all) for model in lgb_models]
    lgb_predictions = np.mean(lgb_predictions_all, axis=0)

    # Ensemble: weighted blend in the transformed target space
    print(f"\nEnsembling: {CONFIG['nn_weight']*100:.0f}% NN + {CONFIG['lgb_weight']*100:.0f}% LGB")
    final_predictions = (CONFIG['nn_weight'] * nn_predictions +
                         CONFIG['lgb_weight'] * lgb_predictions)
else:
    final_predictions = nn_predictions

# Inverse transform back to the original price scale
final_predictions = target_transformer.inverse_transform(final_predictions)

## 11. Advanced Post-Processing

In [None]:
def clip_predictions(predictions, min_val=0.01):
    """Raise every prediction below ``min_val`` to ``min_val``; no upper bound."""
    floored = np.clip(predictions, a_min=min_val, a_max=None)
    return floored

def median_scaling(predictions, train_prices, max_adjustment=0.1):
    """Rescale predictions so their median moves towards the training median.

    Args:
        predictions: Predicted prices.
        train_prices: Training prices that provide the reference median.
        max_adjustment: The scale factor is limited to
            ``[1 - max_adjustment, 1 + max_adjustment]``.

    Returns:
        ``(scaled_predictions, scale_factor)``. When no meaningful factor can
        be computed (empty input, or a non-positive / non-finite median on
        either side) the predictions are returned unscaled with a factor of 1.0.
    """
    predictions = np.asarray(predictions, dtype=float)
    train_prices = np.asarray(train_prices, dtype=float)
    if predictions.size == 0 or train_prices.size == 0:
        return predictions, 1.0

    pred_median = np.median(predictions)
    train_median = np.median(train_prices)

    # A zero prediction median used to give an inf (or, for 0/0, NaN) factor;
    # NaN survives np.clip and would turn every prediction into NaN.
    if not (np.isfinite(pred_median) and np.isfinite(train_median)) or pred_median <= 0 or train_median <= 0:
        return predictions, 1.0

    scale_factor = train_median / pred_median

    # Limit adjustment
    scale_factor = np.clip(scale_factor, 1-max_adjustment, 1+max_adjustment)

    return predictions * scale_factor, scale_factor

def per_brand_calibration(predictions, brand_names, brand_stats, alpha=0.5, min_count=10):
    """Blend each prediction with the mean price of its brand.

    Args:
        predictions: Predicted prices, one per sample.
        brand_names: Brand key for each sample, aligned with ``predictions``.
        brand_stats: Mapping ``brand -> {'mean': ..., 'count': ...}``.
        alpha: Weight of the model prediction; ``1 - alpha`` goes to the brand mean.
        min_count: Brands with fewer samples than this are left uncalibrated.

    Returns:
        A new float array; the input is not modified.
    """
    # Always work on a float copy: with an integer input array the blended
    # values used to be truncated on assignment, and a plain list has no .copy()
    # that supports the arithmetic below.
    calibrated = np.array(predictions, dtype=float)

    for i, brand in enumerate(brand_names):
        # Named brand_info rather than `stats` to avoid shadowing the
        # scipy.stats module imported at the top of the file
        brand_info = brand_stats.get(brand)
        if brand_info is None:
            continue
        if brand_info['count'] >= min_count:  # Only calibrate if enough samples
            # Blend prediction with brand mean
            calibrated[i] = alpha * calibrated[i] + (1 - alpha) * brand_info['mean']

    return calibrated

# Apply post-processing
print("\nApplying post-processing...")

# 1. Clip negatives (done first so the median used for scaling is positive)
final_predictions = clip_predictions(final_predictions, min_val=CONFIG['min_price'])
print(f"✓ Clipped predictions to ≥ {CONFIG['min_price']}")

# 2. Median scaling (optional)
if CONFIG['use_median_scaling']:
    final_predictions, scale_factor = median_scaling(final_predictions, train_df['price'].values)
    print(f"✓ Applied median scaling (factor: {scale_factor:.4f})")

# 3. Per-brand calibration (optional)
if CONFIG['use_brand_calibration'] and struct_extractor.brand_stats:
    final_predictions = per_brand_calibration(
        final_predictions,
        test_struct['brand_name'].values,
        struct_extractor.brand_stats,
        alpha=0.9  # 90% model, 10% brand prior
    )
    print(f"✓ Applied per-brand calibration")

# Re-apply the floor: median scaling can multiply by as little as 0.9, which
# would otherwise push values clipped in step 1 back below min_price.
final_predictions = clip_predictions(final_predictions, min_val=CONFIG['min_price'])

print(f"\nFinal predictions:")
print(f"  Min: ${final_predictions.min():.2f}")
print(f"  Max: ${final_predictions.max():.2f}")
print(f"  Mean: ${final_predictions.mean():.2f}")
print(f"  Median: ${np.median(final_predictions):.2f}")

## 12. Create Submission

In [None]:
# Two-column submission table: the test ids plus the post-processed prices
submission = pd.DataFrame(
    {
        'sample_id': test_df['sample_id'],
        'price': final_predictions,
    }
)

# Preview and basic sanity checks before writing the file
n_missing = submission.isnull().sum().sum()
n_negative = (submission['price'] < 0).sum()
print("\nSubmission format:")
print(submission.head(10))
print(f"\nShape: {submission.shape}")
print(f"Missing: {n_missing}")
print(f"Negative: {n_negative}")

output_name = 'test_out.csv'
submission.to_csv(output_name, index=False)
print("\n✓ Submission saved: test_out.csv")

## 13. Final Summary

In [None]:
# Closing recap: feature sizes, cross-validation scores and post-processing settings
print(f"\n{'='*60}")
print("ENHANCED SOLUTION COMPLETE")
print(f"{'='*60}\n")

print("✓ Features:")
print(f"  - CLIP Text: {train_text.shape[1]}D")
print(f"  - CLIP Image: {train_image.shape[1]}D")
print(f"  - Structured: {train_struct_scaled.shape[1]} features")

print(f"\n✓ Validation (GroupKFold by brand):")
nn_mean, nn_std = np.mean(nn_scores), np.std(nn_scores)
print(f"  - NN Mean SMAPE: {nn_mean:.2f}% (+/- {nn_std:.2f}%)")
if CONFIG['use_lgb']:
    lgb_mean, lgb_std = np.mean(lgb_scores), np.std(lgb_scores)
    print(f"  - LGB Mean SMAPE: {lgb_mean:.2f}% (+/- {lgb_std:.2f}%)")

print(f"\n✓ Post-Processing:")
print(f"  - Target transform: {CONFIG['target_transform']}")
print(f"  - Clipping: predictions ≥ ${CONFIG['min_price']}")
# Optional steps are listed only when their switch is on
for switch, line in (
    ('use_median_scaling', "  - Median scaling: applied"),
    ('use_brand_calibration', "  - Brand calibration: applied"),
):
    if CONFIG[switch]:
        print(line)

print(f"\n✓ Submission: test_out.csv ({len(submission)} predictions)")
print(f"\n{'='*60}")