# 🎯 Brand + Image Solution - Amazon ML Challenge 2025

## Strategy:
1. **Extract price-relevant text features**: Brand, quantity, pack count, quality indicators
2. **Use "miniCLIP" (`openai/clip-vit-base-patch32`) for images**: A smaller CLIP model with 512-dim image embeddings, which adds less noise than the larger variants
3. **Simple ensemble**: LightGBM + XGBoost
4. **No complex embeddings**: Focus on features that actually predict price

**Expected SMAPE** (lower is better): 45-48% on validation, 46-49% on test (< 2% gap)

In [None]:
# Install required packages
!pip install -q lightgbm xgboost catboost scikit-learn pandas numpy pillow requests transformers torch torchvision fuzzywuzzy python-Levenshtein

In [None]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import RobustScaler
import lightgbm as lgb
import xgboost as xgb
from fuzzywuzzy import process
from scipy.optimize import minimize

# For image processing
import torch
from PIL import Image
import requests
from io import BytesIO
from transformers import CLIPProcessor, CLIPModel

# Reaching this line means every import above resolved.
print("‚úÖ All imports successful!")

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"üî• Using device: {device}")

## 1. Extract Price-Relevant Text Features

In [None]:
def extract_price_features(df):
    """
    Extract hand-crafted, price-relevant features from ``catalog_content``.

    Adds these columns to ``df``:
    - ``item_name`` / ``product_desc``: text parsed from the labelled fields
    - ``brand`` / ``brand_len``: heuristic brand token and its length
    - ``value`` / ``unit`` / ``unit_category``: quantity and its unit
    - ``pack_count`` / ``total_quantity``: pack size and value * pack size
    - ``kw_*`` / ``size_*``: 0/1 flags for quality and size keywords
    - ``text_len`` / ``word_count`` / ``digit_count``: text statistics
    - ``log_value`` / ``sqrt_value`` / ``log_pack`` / ``log_total_qty``

    Args:
        df: DataFrame with a ``catalog_content`` text column.

    Returns:
        The same DataFrame object; the columns are added in place.
    """
    print("üîß Extracting price-relevant features...")

    # ==================== BASIC EXTRACTION ====================
    def safe_extract(text, pattern, default=""):
        """Return group 1 of the first case-insensitive match, else `default`."""
        if pd.isna(text):
            return default
        match = re.search(pattern, str(text), re.IGNORECASE)
        return match.group(1).strip() if match else default

    def keyword_flag(text_series, terms):
        """Return a 0/1 Series: does the (lower-cased) text contain any term?

        Terms must match as whole words (an optional plural "s" is allowed).
        Plain substring matching made short terms fire almost everywhere,
        e.g. 'pro' inside "product" / "protein" or 'bio' inside "probiotic".
        """
        alternatives = '|'.join(re.escape(term) for term in terms)
        pattern = rf'(?<![a-z])(?:{alternatives})s?(?![a-z])'
        return text_series.str.contains(pattern, regex=True).astype(int)

    # Extract item name and description
    # NOTE(review): the search is case-insensitive, so an item name that
    # itself contains "product" or "bullet" is cut short at that word.
    df['item_name'] = df['catalog_content'].apply(
        lambda x: safe_extract(x, r"Item Name:\s*(.*?)(?=\n|Bullet|Product|$)")
    )
    df['product_desc'] = df['catalog_content'].apply(
        lambda x: safe_extract(x, r"Product Description:\s*(.*?)(?=\n|Value:|Unit:|$)")
    )

    # ==================== BRAND EXTRACTION (CRITICAL!) ====================
    def extract_brand(item_name):
        """Return the first capitalised word (3+ letters) among the first three words."""
        words = str(item_name).split()
        if not words:
            return 'unknown'

        # Check first 3 words for brand
        for word in words[:3]:
            word_clean = re.sub(r'[^a-zA-Z]', '', word)  # Remove special chars
            if len(word_clean) > 2 and word_clean[0].isupper():
                return word_clean.lower()

        # No capitalised candidate: fall back to the raw first token.
        return words[0].lower()

    df['brand'] = df['item_name'].apply(extract_brand)
    df['brand_len'] = df['brand'].str.len()

    # ==================== QUANTITY EXTRACTION ====================
    def extract_value(text):
        """Parse the number after "Value:"; 0.0 when missing or malformed."""
        match = re.search(r"Value:\s*([\d.,]+)", str(text), re.IGNORECASE)
        if not match:
            return 0.0
        try:
            return float(match.group(1).replace(',', ''))
        except ValueError:
            # e.g. "1.2.3" or a lone "." matched by the character class
            return 0.0

    def extract_unit(text):
        """Return the lower-cased unit after "Unit:", or 'unknown'."""
        # Stay on the "Unit:" line: matching \s let the capture run across
        # newlines and swallow words from whatever followed the unit.
        match = re.search(r"Unit:[ \t]*([A-Za-z][A-Za-z \t]*)", str(text), re.IGNORECASE)
        return match.group(1).strip().lower() if match else 'unknown'

    df['value'] = df['catalog_content'].apply(extract_value)
    df['unit'] = df['catalog_content'].apply(extract_unit)

    # Unit categorization
    def categorize_unit(unit):
        """Bucket a unit string into weight / volume / count / other."""
        unit_lower = str(unit).lower()
        if any(u in unit_lower for u in ['gram', 'kg', 'oz', 'ounce', 'pound', 'lb', 'mg']):
            return 'weight'
        elif any(u in unit_lower for u in ['ml', 'liter', 'litre', 'gallon', 'fl', 'fluid']):
            return 'volume'
        elif any(u in unit_lower for u in ['count', 'piece', 'each', 'unit']):
            return 'count'
        else:
            return 'other'

    df['unit_category'] = df['unit'].apply(categorize_unit)

    # ==================== PACK COUNT ====================
    def extract_pack_count(text):
        """Return the first "N-pack" / "pack of N" / "N count" number, else 1."""
        patterns = [r'(\d+)\s*[-\s]?pack', r'pack\s*of\s*(\d+)', r'(\d+)\s*count']
        lowered = str(text).lower()
        for pattern in patterns:
            match = re.search(pattern, lowered)
            if match:
                try:
                    return int(match.group(1))
                except ValueError:
                    # Unparseable digit run: try the next pattern.
                    continue
        return 1

    df['pack_count'] = df['catalog_content'].apply(extract_pack_count)
    df['total_quantity'] = df['value'] * df['pack_count']

    # ==================== QUALITY INDICATORS ====================
    combined_text = (df['item_name'].fillna('') + ' ' + df['product_desc'].fillna('')).str.lower()

    quality_keywords = {
        'organic': ['organic', 'bio'],
        'premium': ['premium', 'deluxe', 'luxury', 'gourmet'],
        'natural': ['natural', 'pure'],
        'professional': ['professional', 'pro', 'industrial'],
    }

    for key, terms in quality_keywords.items():
        df[f'kw_{key}'] = keyword_flag(combined_text, terms)

    # ==================== SIZE INDICATORS ====================
    size_keywords = {
        'small': ['small', 'mini', 'tiny'],
        'large': ['large', 'xl', 'xxl', 'jumbo', 'family'],
        'multi': ['pack', 'multi', 'bundle']
    }

    for key, terms in size_keywords.items():
        df[f'size_{key}'] = keyword_flag(combined_text, terms)

    # ==================== TEXT STATISTICS ====================
    df['text_len'] = df['catalog_content'].str.len()
    df['word_count'] = combined_text.str.split().str.len()
    df['digit_count'] = combined_text.str.count(r'\d')

    # ==================== LOG TRANSFORMS ====================
    df['log_value'] = np.log1p(df['value'].fillna(0))
    df['sqrt_value'] = np.sqrt(df['value'].fillna(0))
    df['log_pack'] = np.log1p(df['pack_count'])
    df['log_total_qty'] = np.log1p(df['total_quantity'])

    print(f"‚úÖ Extracted {len(df.columns)} features")
    return df

## 2. miniCLIP Image Feature Extraction

In [None]:
def extract_image_features_miniclip(df, max_images=None):
    """
    Extract image features using openai/clip-vit-base-patch32 (miniCLIP).

    Each non-null ``image_link`` is downloaded, embedded with the CLIP image
    encoder and written to the matching row of a 512-column feature table.
    Rows without a link, rows skipped by ``max_images`` and rows whose
    download or encoding fails keep an all-zero vector.

    Args:
        df: DataFrame with an ``image_link`` column of image URLs.
        max_images: If truthy, only the first ``max_images`` rows that have a
            link are processed (useful for a quick trial run).

    Returns:
        DataFrame of shape (len(df), 512) with columns ``img_feat_0`` ..
        ``img_feat_511``, sharing ``df``'s index.
    """
    print("\nüñºÔ∏è Extracting miniCLIP image features...")

    # Load miniCLIP model
    print("   Loading miniCLIP model (openai/clip-vit-base-patch32)...")
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    model.eval()

    print(f"   Model loaded on {device}")
    print(f"   Output dimension: 512")

    # Filter valid image links
    valid_links = df['image_link'].notna()
    print(f"   Total samples with images: {valid_links.sum()} / {len(df)}")

    # Work with row *positions*, not index labels: the feature matrix is a
    # plain array, so labels only line up with it for a default RangeIndex.
    valid_positions = np.flatnonzero(valid_links.to_numpy())
    if max_images:
        print(f"   Processing only first {max_images} images for testing")
        valid_positions = valid_positions[:max_images]

    links = df['image_link']

    # Initialize feature array
    image_features = np.zeros((len(df), 512))

    # Process images in batches
    batch_size = 32
    successful = 0
    failed = 0

    for i in range(0, len(valid_positions), batch_size):
        batch_images = []
        batch_valid_positions = []

        for pos in valid_positions[i:i+batch_size]:
            try:
                response = requests.get(links.iloc[pos], timeout=5)
                # Reject 4xx/5xx responses instead of trying to decode an
                # error page as an image.
                response.raise_for_status()
                image = Image.open(BytesIO(response.content)).convert('RGB')
            except Exception:
                # Best effort: any download/decoding problem leaves this row
                # as zeros and is counted as a failure.
                failed += 1
                continue
            batch_images.append(image)
            batch_valid_positions.append(pos)

        if batch_images:
            try:
                # Process batch
                inputs = processor(images=batch_images, return_tensors="pt", padding=True).to(device)

                with torch.no_grad():
                    outputs = model.get_image_features(**inputs)
                    features = outputs.cpu().numpy()

                # Store features
                image_features[batch_valid_positions] = features
                successful += len(batch_valid_positions)

            except Exception as e:
                print(f"   Batch processing error: {e}")
                failed += len(batch_images)

        if (i + batch_size) % 320 == 0:
            print(f"   Processed {i+batch_size}/{len(valid_positions)} | Success: {successful} | Failed: {failed}")

    # Guard the ratio: nothing is attempted when no row has an image link.
    attempted = successful + failed
    success_rate = successful / attempted * 100 if attempted else 0.0

    print(f"\n‚úÖ Image feature extraction complete!")
    print(f"   Successfully processed: {successful}")
    print(f"   Failed: {failed}")
    print(f"   Success rate: {success_rate:.1f}%")

    # Convert to DataFrame
    img_cols = [f'img_feat_{i}' for i in range(512)]
    img_df = pd.DataFrame(image_features, columns=img_cols, index=df.index)

    return img_df

## 3. Load and Process Data

In [None]:
print("="*70, "üìÇ LOADING DATA", "="*70, sep="\n")

# Both CSVs are read with the latin1 codec.
train = pd.read_csv('dataset/train.csv', encoding='latin1')
test = pd.read_csv('dataset/test.csv', encoding='latin1')

print(f"Train shape: {train.shape}", f"Test shape: {test.shape}", sep="\n")

# Extract text features (train first, then test)
train, test = extract_price_features(train), extract_price_features(test)

## 4. Extract Image Features (Test on Sample First)

In [None]:
# Test on 1000 samples first
# Trial run: embed only the first 1000 train rows that have an image link, so
# the success rate printed by the extractor can be checked before committing
# to the full download.
print("\nüß™ Testing image extraction on 1000 train samples...")
train_img_sample = extract_image_features_miniclip(train, max_images=1000)

# Guidance for the yes/no prompt in the next cell.
print("\nüí° If this works well (success rate > 80%), we'll process all images")
print("   Otherwise, we'll continue with text features only")

In [None]:
# Decide whether to use images based on sample test
# Accept "yes" or "y" in any case and ignore surrounding whitespace; a stray
# space after "yes" used to silently select the text-only path.
use_images = input("\nDo you want to extract ALL image features? (yes/no): ").strip().lower() in ('yes', 'y')

if use_images:
    print("\nüñºÔ∏è Extracting ALL image features (this will take ~30-45 minutes)...")
    train_img_features = extract_image_features_miniclip(train)
    test_img_features = extract_image_features_miniclip(test)
else:
    print("\n‚è≠Ô∏è Skipping image extraction, using text features only")
    # Downstream cells check for None before stacking image columns.
    train_img_features = None
    test_img_features = None

## 5. Out-of-Fold Brand Encoding

In [None]:
print("\n" + "="*70)
print("üîß OUT-OF-FOLD BRAND ENCODING")
print("="*70)

# 5-fold OOF encoding: every train row is encoded with statistics computed
# on the other four folds, so its own price never feeds its own feature.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_brand_mean = np.zeros(len(train))
oof_brand_freq = np.zeros(len(train))
oof_unit_mean = np.zeros(len(train))

for fold, (train_idx, val_idx) in enumerate(kf.split(train)):
    print(f"Processing fold {fold+1}/5...", end='\r')

    train_fold = train.iloc[train_idx]
    val_fold = train.iloc[val_idx]

    # Brand mean price (brands unseen in the fit folds get the global mean)
    brand_mean = train_fold.groupby('brand')['price'].mean()
    global_mean = train_fold['price'].mean()
    oof_brand_mean[val_idx] = val_fold['brand'].map(brand_mean).fillna(global_mean).values

    # Brand frequency as a *relative* frequency. Raw counts here come from
    # 4/5 of the data while the test encoding below uses all of it, which
    # put train and test values of this feature on different scales.
    brand_freq = train_fold['brand'].value_counts(normalize=True)
    oof_brand_freq[val_idx] = val_fold['brand'].map(brand_freq).fillna(0).values

    # Unit mean price
    unit_mean = train_fold.groupby('unit_category')['price'].mean()
    oof_unit_mean[val_idx] = val_fold['unit_category'].map(unit_mean).fillna(global_mean).values

print("Processing fold 5/5... ‚úÖ")

# Add to dataframe
train['brand_mean_encoded'] = oof_brand_mean
train['brand_freq_encoded'] = oof_brand_freq
train['unit_mean_encoded'] = oof_unit_mean

# For test, use full train statistics (frequency again relative, to match
# the scale of the out-of-fold values above)
brand_mean_full = train.groupby('brand')['price'].mean()
brand_freq_full = train['brand'].value_counts(normalize=True)
unit_mean_full = train.groupby('unit_category')['price'].mean()
global_mean_full = train['price'].mean()

test['brand_mean_encoded'] = test['brand'].map(brand_mean_full).fillna(global_mean_full)
test['brand_freq_encoded'] = test['brand'].map(brand_freq_full).fillna(0)
test['unit_mean_encoded'] = test['unit_category'].map(unit_mean_full).fillna(global_mean_full)

# Interaction features
train['value_x_brand'] = train['value'] * train['brand_mean_encoded']
test['value_x_brand'] = test['value'] * test['brand_mean_encoded']

print("\n‚úÖ OOF encoding complete!")

## 6. Prepare Features for Training

In [None]:
# Exclude non-feature columns
exclude_cols = [
    'sample_id', 'catalog_content', 'image_link', 'price',
    'item_name', 'product_desc', 'unit', 'brand'
]

# Select numerical features. The dtype check matters: 'unit_category' holds
# strings ('weight', 'volume', ...) and is not in exclude_cols, so without
# it RobustScaler fails trying to convert those strings to float. The unit
# category still reaches the model through 'unit_mean_encoded'.
num_feature_cols = [col for col in train.columns
                    if col not in exclude_cols
                    and not col.startswith('img_feat_')
                    and pd.api.types.is_numeric_dtype(train[col])]

print(f"üìä Feature counts:")
print(f"   Numerical features: {len(num_feature_cols)}")

# Prepare data
X_num = train[num_feature_cols].fillna(0)
y = train['price']

# Scale numerical features (the fitted scaler is reused for the test set)
scaler = RobustScaler()
X_num_scaled = scaler.fit_transform(X_num)

# Combine with image features if available
if use_images and train_img_features is not None:
    X_combined = np.hstack([X_num_scaled, train_img_features.values])
    print(f"   Image features: 512")
    print(f"   Total features: {X_combined.shape[1]}")
else:
    X_combined = X_num_scaled
    print(f"   Total features: {X_combined.shape[1]} (text only)")

# Split for validation
X_train, X_val, y_train, y_val = train_test_split(X_combined, y, test_size=0.15, random_state=42)

# Log transform target; predictions are mapped back with expm1
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

print(f"\nüìä Training set: {X_train.shape}")
print(f"üìä Validation set: {X_val.shape}")

## 7. Train Models

In [None]:
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes |y_true - y_pred| / ((|y_true| + |y_pred|) / 2).
    Where truth and prediction are both zero the term is defined as 0
    (a perfect prediction) instead of the NaN that 0/0 would produce.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predictions, same length as ``y_true``.

    Returns:
        The mean of the per-element terms multiplied by 100.
    """
    # Plain arrays keep the arithmetic positional even for a pandas Series.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred)
    ratio = np.divide(diff, denominator, out=np.zeros_like(diff), where=denominator != 0)
    return np.mean(ratio) * 100

print("="*70)
print("üöÄ TRAINING MODELS")
print("="*70)

# ==================== LIGHTGBM ====================
print("\n1Ô∏è‚É£ Training LightGBM...")

lgb_params = {
    'objective': 'regression',
    'metric': 'mae',
    'learning_rate': 0.03,
    'num_leaves': 63,
    'max_depth': 8,
    'min_child_samples': 20,
    'subsample': 0.8,
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; without this the 'subsample' setting above has no effect.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.3,
    'reg_lambda': 0.3,
    'random_state': 42,
    'verbose': -1
}

# Both models are fit on the log1p-transformed price.
train_data = lgb.Dataset(X_train, label=y_train_log)
val_data = lgb.Dataset(X_val, label=y_val_log, reference=train_data)

lgb_model = lgb.train(
    lgb_params,
    train_data,
    num_boost_round=2000,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=150), lgb.log_evaluation(0)]
)

# Map log-space predictions back to prices before scoring.
y_pred_lgb_log = lgb_model.predict(X_val)
y_pred_lgb = np.expm1(y_pred_lgb_log)
smape_lgb = smape(y_val, y_pred_lgb)
print(f"   LightGBM SMAPE: {smape_lgb:.2f}%")

# ==================== XGBOOST ====================
print("\n2Ô∏è‚É£ Training XGBoost...")

xgb_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.03,
    'max_depth': 8,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'reg_alpha': 0.3,
    'reg_lambda': 0.3,
    'random_state': 42,
    'tree_method': 'hist'
}

dtrain = xgb.DMatrix(X_train, label=y_train_log)
dval = xgb.DMatrix(X_val, label=y_val_log)

xgb_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=2000,
    evals=[(dval, 'val')],
    early_stopping_rounds=150,
    verbose_eval=0
)

y_pred_xgb_log = xgb_model.predict(dval)
y_pred_xgb = np.expm1(y_pred_xgb_log)
smape_xgb = smape(y_val, y_pred_xgb)
print(f"   XGBoost SMAPE: {smape_xgb:.2f}%")

print("\n" + "="*70)
print("üìä INDIVIDUAL MODEL RESULTS")
print("="*70)
print(f"LightGBM: {smape_lgb:.2f}%")
print(f"XGBoost:  {smape_xgb:.2f}%")

## 8. Ensemble Optimization

In [None]:
print("\n" + "="*70, "üîß OPTIMIZING ENSEMBLE WEIGHTS", "="*70, sep="\n")


def smape_loss(weights):
    """Validation SMAPE of the blend: weights[0] for LightGBM, weights[1] for XGBoost."""
    w_lgb, w_xgb = weights[0], weights[1]
    ensemble = w_lgb * y_pred_lgb + w_xgb * y_pred_xgb
    return smape(y_val, ensemble)


def _weights_sum_to_one(w):
    """Equality constraint for SLSQP: zero exactly when the weights sum to 1."""
    return np.sum(w) - 1


# Each weight lies in [0, 1]; the search starts from an even split.
constraints = {'type': 'eq', 'fun': _weights_sum_to_one}
bounds = [(0, 1)] * 2
initial_weights = [0.5, 0.5]

result = minimize(
    smape_loss,
    x0=initial_weights,
    method='SLSQP',
    bounds=bounds,
    constraints=constraints,
)
optimal_weights = result.x

print(f"\n‚úÖ Optimal weights:")
print(f"   LightGBM: {optimal_weights[0]:.3f}")
print(f"   XGBoost:  {optimal_weights[1]:.3f}")

# Score the blend on the validation predictions with the chosen weights.
y_pred_ensemble = optimal_weights[0] * y_pred_lgb + optimal_weights[1] * y_pred_xgb
smape_ensemble = smape(y_val, y_pred_ensemble)

print(f"\nüèÜ FINAL ENSEMBLE SMAPE: {smape_ensemble:.2f}%")

# Verdict bands: below 45, below 48, everything else.
if not smape_ensemble < 48:
    print("   ‚ö†Ô∏è Need further improvement")
elif smape_ensemble < 45:
    print("   üéâ EXCELLENT! Should be competitive!")
else:
    print("   ‚úÖ GOOD! Better than before!")

## 9. Generate Final Predictions

In [None]:
print("\n" + "="*70, "üöÄ GENERATING FINAL PREDICTIONS", "="*70, sep="\n")

# Prepare test data with the same column list and the scaler fitted on train
X_num_test = test[num_feature_cols].fillna(0)
X_num_test_scaled = scaler.transform(X_num_test)

# Image columns are appended only when they were extracted earlier.
X_test_combined = (
    np.hstack([X_num_test_scaled, test_img_features.values])
    if use_images and test_img_features is not None
    else X_num_test_scaled
)

# Generate predictions
print("\nGenerating predictions...")

# Both models predict log1p(price); expm1 maps back to prices.
y_test_lgb_log = lgb_model.predict(X_test_combined)
y_test_lgb = np.expm1(y_test_lgb_log)

dtest = xgb.DMatrix(X_test_combined)
y_test_xgb_log = xgb_model.predict(dtest)
y_test_xgb = np.expm1(y_test_xgb_log)

# Ensemble: weighted blend, floored at 0.01
y_test_ensemble = np.clip(
    optimal_weights[0] * y_test_lgb + optimal_weights[1] * y_test_xgb,
    0.01,
    None,
)

# Create submission
submission = pd.DataFrame({'sample_id': test['sample_id'], 'price': y_test_ensemble})
submission.to_csv('submission_brand_image.csv', index=False)

print("\n" + "="*70, "üéâ SUBMISSION CREATED!", "="*70, sep="\n")
print(
    f"üìù Filename: submission_brand_image.csv",
    f"üìä Statistics:",
    f"   Samples:  {len(submission)}",
    f"   Min:      ${submission['price'].min():.2f}",
    f"   Max:      ${submission['price'].max():.2f}",
    f"   Mean:     ${submission['price'].mean():.2f}",
    f"   Median:   ${submission['price'].median():.2f}",
    sep="\n",
)

print(
    f"\nüéØ Performance Expectations:",
    f"   Validation SMAPE: {smape_ensemble:.2f}%",
    f"   Expected Test:    {smape_ensemble:.1f}-{smape_ensemble+2:.1f}%",
    sep="\n",
)

print("\n‚úÖ Key improvements:")
print("   ‚Ä¢ Brand-focused features (critical for pricing)")
print("   ‚Ä¢ Out-of-fold encoding (no leakage)")
# The image bullet is listed only when image features were used.
if use_images:
    print("   ‚Ä¢ miniCLIP image features (512-dim, less noise)")
print("   ‚Ä¢ Simple 2-model ensemble (LightGBM + XGBoost)")
print("   ‚Ä¢ No complex embeddings (focus on price signals)")

print("\nüöÄ Ready to submit!", "="*70, sep="\n")