In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
pip install transformers==4.44.2


Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers==4.44.2)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.44.2)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.35.3-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.3/564.3 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [3]:
"""
Smart Product Pricing Challenge - Complete Kaggle Solution
A comprehensive multimodal ML solution combining text and image features for price prediction
Optimized for Kaggle environment with GPU support
"""

# ============================================================================
# IMPORTS AND SETUP
# ============================================================================

import os
import sys
import warnings
import logging
import time
import gc
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Any, Union

# Data manipulation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge

# Deep learning and transformers
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader

# ML models
import xgboost as xgb
import lightgbm as lgb

# Image processing
from PIL import Image, ImageFile
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import cv2

# Utilities
import re
import joblib
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Suppress warnings
warnings.filterwarnings('ignore')
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [4]:
# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    """Configuration settings optimized for Kaggle"""

    # Paths (Kaggle structure)
    INPUT_DIR = Path('/kaggle/input/amazon-ml-2025-dataset')
    WORKING_DIR = Path('/kaggle/working')

    # Data files
    TRAIN_CSV = INPUT_DIR / 'train.csv'
    TEST_CSV = INPUT_DIR / 'test.csv'

    # Model parameters
    BERT_MODEL_NAME = "distilbert-base-uncased"
    MAX_TEXT_LENGTH = 256  # Reduced for Kaggle
    IMAGE_SIZE = (224, 224)
    BATCH_SIZE = 16  # Optimized for Kaggle GPU
    NUM_WORKERS = 2

    # Training parameters
    VALIDATION_SPLIT = 0.2
    RANDOM_STATE = 42

    # XGBoost parameters
    XGBOOST_PARAMS = {
        'objective': 'reg:squarederror',
        'n_estimators': 500,  # Reduced for Kaggle time limits
        'max_depth': 6,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': RANDOM_STATE,
        'tree_method': 'gpu_hist',  # GPU acceleration
        'gpu_id': 0
    }

    # LightGBM parameters
    LIGHTGBM_PARAMS = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'random_state': RANDOM_STATE,
        'device': 'gpu',  # GPU acceleration
        'gpu_platform_id': 0,
        'gpu_device_id': 0
    }

    # Ensemble weights
    ENSEMBLE_WEIGHTS = {
        'xgboost': 0.4,
        'lightgbm': 0.35,
        'ridge': 0.25
    }

config = Config()

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [5]:
# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def symmetric_mean_absolute_percentage_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Calculate SMAPE metric"""
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    mask = denominator != 0
    smape_values = np.zeros_like(numerator)
    smape_values[mask] = numerator[mask] / denominator[mask]

    return np.mean(smape_values) * 100

def clean_memory():
    """Clean up memory"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [6]:
# ============================================================================
# IMAGE UTILITIES
# ============================================================================

class ImageProcessor:
    """Handle image downloading and preprocessing"""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # Setup retry strategy
        retry_strategy = Retry(
            total=2,  # Reduced retries for Kaggle
            backoff_factor=0.3,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        # Image transforms
        self.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def download_image(self, url: str, sample_id: str, max_retries: int = 2) -> Optional[str]:
        """Download single image with retry logic"""
        if not url or pd.isna(url):
            return None

        try:
            response = self.session.get(url, timeout=5)
            response.raise_for_status()

            if len(response.content) < 1000:
                return None

            # Save to working directory
            filepath = config.WORKING_DIR / f"{sample_id}.jpg"
            with open(filepath, 'wb') as f:
                f.write(response.content)

            # Validate image
            try:
                with Image.open(filepath) as img:
                    img.verify()
                return str(filepath)
            except:
                if filepath.exists():
                    filepath.unlink()
                return None

        except Exception as e:
            print(f"Failed to download {url}: {e}")
            return None

    def download_images_batch(self, df: pd.DataFrame, max_workers: int = 4) -> List[Optional[str]]:
        """Download images in parallel"""
        df_reset = df.reset_index(drop=True)
        results = [None] * len(df_reset)
        
        future_to_idx = {}
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for idx, (_, row) in enumerate(df_reset.iterrows()):
                future = executor.submit(self.download_image, row['image_link'], str(row['sample_id']))
                future_to_idx[future] = idx
    
            for future in tqdm(as_completed(future_to_idx), total=len(future_to_idx), desc="Downloading images"):
                idx = future_to_idx[future]
                try:
                    results[idx] = future.result()
                except Exception as e:
                    print(f"Error processing image at index {idx}: {e}")
                    results[idx] = None
    
        return results
        # return results

    def preprocess_image(self, image_path: str) -> Optional[torch.Tensor]:
        """Preprocess single image"""
        try:
            if not image_path or not os.path.exists(image_path):
                return None

            image = Image.open(image_path).convert('RGB')
            return self.transform(image)
        except Exception as e:
            print(f"Error preprocessing image {image_path}: {e}")
            return None

    def create_default_tensor(self) -> torch.Tensor:
        """Create default tensor for missing images"""
        default_image = Image.new('RGB', config.IMAGE_SIZE, color=(128, 128, 128))
        return self.transform(default_image)

In [7]:
# ============================================================================
# TEXT FEATURE EXTRACTION
# ============================================================================

class TextFeatureExtractor:
    """Extract comprehensive text features using BERT and engineering"""

    def __init__(self):
        self.device = device
        print("Loading BERT model...")

        # Initialize BERT
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(config.BERT_MODEL_NAME)
            self.model = AutoModel.from_pretrained(config.BERT_MODEL_NAME)
            self.model.to(self.device)
            self.model.eval()
        except Exception as e:
            print(f"Failed to load BERT: {e}. Using fallback.")
            self.tokenizer = None
            self.model = None

        # Initialize other components
        self.tfidf = TfidfVectorizer(max_features=500, stop_words='english')  # Reduced for memory
        self.pca = PCA(n_components=30, random_state=config.RANDOM_STATE)  # Reduced components

        print("Text feature extractor initialized")

    def extract_bert_features(self, texts: List[str], batch_size: int = 8) -> np.ndarray:
        """Extract BERT embeddings"""
        all_embeddings = []

        for i in tqdm(range(0, len(texts), batch_size), desc="Extracting BERT features"):
            batch_texts = texts[i:i + batch_size]

            inputs = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=config.MAX_TEXT_LENGTH,
                return_tensors='pt'
            )

            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model(**inputs)
                embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                all_embeddings.append(embeddings)

        return np.vstack(all_embeddings)

    def extract_manual_features(self, df: pd.DataFrame) -> np.ndarray:
        """Extract manually engineered text features"""
        features = []

        for _, row in df.iterrows():
            text = str(row['catalog_content']).lower()

            feature_dict = {
                # Basic statistics
                'char_count': len(text),
                'word_count': len(text.split()),
                'avg_word_length': np.mean([len(word) for word in text.split()]) if text.split() else 0,

                # Price indicators
                'has_price': int(bool(re.search(r'\$[0-9,]+\.?[0-9]*|price|cost|dollar', text))),
                'has_discount': int(bool(re.search(r'discount|sale|off|deal|save', text))),
                'has_quantity': int(bool(re.search(r'pack|quantity|count|piece|item|unit', text))),

                # Quality indicators
                'has_premium': int(bool(re.search(r'premium|luxury|deluxe|professional|high.quality', text))),
                'has_brand': int(bool(re.search(r'brand|branded|authentic|original', text))),
                'has_warranty': int(bool(re.search(r'warranty|guarantee|return|refund', text))),

                # Technical specs
                'has_size': int(bool(re.search(r'size|dimension|inch|cm|mm|foot|yard', text))),
                'has_weight': int(bool(re.search(r'weight|pound|kg|gram|ounce|lb|oz', text))),
                'has_material': int(bool(re.search(r'cotton|plastic|metal|wood|leather|fabric', text))),

                # Sentiment
                'positive_words': len(re.findall(r'excellent|great|amazing|perfect|best|good', text)),
                'negative_words': len(re.findall(r'bad|poor|worst|terrible|awful|cheap', text)),

                # Formatting
                'exclamation_count': text.count('!'),
                'uppercase_ratio': sum(1 for c in text if c.isupper()) / len(text) if text else 0,
                'digit_count': sum(1 for c in text if c.isdigit()),
            }

            features.append(list(feature_dict.values()))

        return np.array(features)

    def extract_tfidf_features(self, texts: List[str], fit: bool = True) -> np.ndarray:
        """Extract TF-IDF features"""
        if fit:
            tfidf_features = self.tfidf.fit_transform(texts)
            tfidf_dense = tfidf_features.toarray()
            reduced_features = self.pca.fit_transform(tfidf_dense)
        else:
            tfidf_features = self.tfidf.transform(texts)
            tfidf_dense = tfidf_features.toarray()
            reduced_features = self.pca.transform(tfidf_dense)

        return reduced_features

    def extract_all_features(self, df: pd.DataFrame, fit: bool = True) -> np.ndarray:
        """Extract all text features"""
        texts = df['catalog_content'].fillna('').astype(str).tolist()

        # BERT features
        bert_features = self.extract_bert_features(texts)

        # Manual features
        manual_features = self.extract_manual_features(df)

        # TF-IDF features
        tfidf_features = self.extract_tfidf_features(texts, fit=fit)

        # Combine all features
        combined = np.hstack([bert_features, manual_features, tfidf_features])

        print(f"Text features shape: {combined.shape}")
        return combined

In [8]:
# ============================================================================
# IMAGE FEATURE EXTRACTION
# ============================================================================

class ImageFeatureExtractor:
    """Extract CNN features from product images"""

    def __init__(self):
        self.device = device
        print("Loading CNN model...")

        # Load ResNet50
        self.model = models.resnet50(pretrained=True)
        self.model = nn.Sequential(*list(self.model.children())[:-1])  # Remove final layer
        self.model.to(self.device)
        self.model.eval()

        self.image_processor = ImageProcessor()

        print("Image feature extractor initialized")

    def extract_cnn_features(self, image_paths: List[Optional[str]], batch_size: int = 16) -> np.ndarray:
        """Extract CNN features from images"""
        all_features = []
        default_tensor = self.image_processor.create_default_tensor()

        for i in tqdm(range(0, len(image_paths), batch_size), desc="Extracting CNN features"):
            batch_paths = image_paths[i:i + batch_size]
            batch_tensors = []

            for image_path in batch_paths:
                if image_path:
                    tensor = self.image_processor.preprocess_image(image_path)
                    if tensor is not None:
                        batch_tensors.append(tensor)
                    else:
                        batch_tensors.append(default_tensor)
                else:
                    batch_tensors.append(default_tensor)

            if batch_tensors:
                batch_tensor = torch.stack(batch_tensors).to(self.device)

                with torch.no_grad():
                    features = self.model(batch_tensor)
                    features = features.view(features.size(0), -1).cpu().numpy()
                    all_features.append(features)

        return np.vstack(all_features) if all_features else np.zeros((len(image_paths), 2048))

    def extract_statistical_features(self, image_paths: List[Optional[str]]) -> np.ndarray:
        """Extract statistical image features"""
        features = []

        for image_path in image_paths:
            try:
                if not image_path or not os.path.exists(image_path):
                    features.append([0] * 10)  # Default features
                    continue

                image = Image.open(image_path).convert('RGB')
                img_array = np.array(image)

                # Color statistics
                mean_rgb = img_array.mean(axis=(0, 1))
                std_rgb = img_array.std(axis=(0, 1))

                # Brightness and contrast
                gray = np.array(image.convert('L'))
                brightness = gray.mean()
                contrast = gray.std()

                # Dimensions
                height, width = gray.shape
                aspect_ratio = width / height if height > 0 else 1

                stat_features = [
                    *mean_rgb[:3],  # RGB means
                    brightness, contrast, aspect_ratio,
                    width, height, width * height
                ]

                features.append(stat_features)

            except Exception as e:
                print(f"Error extracting stats from {image_path}: {e}")
                features.append([0] * 10)

        return np.array(features)

    def extract_all_features(self, image_paths: List[Optional[str]]) -> np.ndarray:
        """Extract all image features"""
        # CNN features
        cnn_features = self.extract_cnn_features(image_paths)

        # Statistical features
        stat_features = self.extract_statistical_features(image_paths)

        # Combine features
        combined = np.hstack([cnn_features, stat_features])

        print(f"Image features shape: {combined.shape}")
        return combined

In [9]:
# ============================================================================
# MODEL TRAINING
# ============================================================================

class ModelTrainer:
    """Train multiple ML models"""

    def __init__(self):
        self.models = {}

    def train_xgboost(self, X_train: np.ndarray, y_train: np.ndarray, 
                     X_val: np.ndarray, y_val: np.ndarray) -> xgb.XGBRegressor:
        """Train XGBoost model"""
        print("Training XGBoost...")

        model = xgb.XGBRegressor(**config.XGBOOST_PARAMS)

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=30,
            verbose=False
        )

        self.models['xgboost'] = model
        return model

    def train_lightgbm(self, X_train: np.ndarray, y_train: np.ndarray,
                      X_val: np.ndarray, y_val: np.ndarray) -> lgb.LGBMRegressor:
        """Train LightGBM model"""
        print("Training LightGBM...")

        model = lgb.LGBMRegressor(**config.LIGHTGBM_PARAMS)

        # model.fit(
        #     X_train, y_train,
        #     eval_set=[(X_val, y_val)],
        #     early_stopping_rounds=30,
        #     verbose=False
        # )
        try:
            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                callbacks=[
                    lgb.early_stopping(30, verbose=False),  # ← Correct callback syntax
                    lgb.log_evaluation(0)  # Silent training
                ]
            )
        except:
            try:
                # Level 2: Simple training (no callbacks)
                model.fit(X_train, y_train)
            except Exception as e:
                print("Failed to train LightGBM model:",e)
                

        self.models['lightgbm'] = model
        return model

    def train_ridge(self, X_train: np.ndarray, y_train: np.ndarray) -> Ridge:
        """Train Ridge regression"""
        print("Training Ridge regression...")

        model = Ridge(alpha=1.0, random_state=config.RANDOM_STATE)
        model.fit(X_train, y_train)

        self.models['ridge'] = model
        return model

    def train_all_models(self, X_train: np.ndarray, y_train: np.ndarray,
                        X_val: np.ndarray, y_val: np.ndarray) -> Dict[str, Any]:
        """Train all models"""
        self.train_xgboost(X_train, y_train, X_val, y_val)
        self.train_lightgbm(X_train, y_train, X_val, y_val)
        self.train_ridge(X_train, y_train)

        return self.models

    def predict_all(self, X: np.ndarray) -> Dict[str, np.ndarray]:
        """Get predictions from all models"""
        predictions = {}
        for model_name, model in self.models.items():
            predictions[model_name] = model.predict(X)
        return predictions

    def create_ensemble_prediction(self, predictions: Dict[str, np.ndarray]) -> np.ndarray:
        """Create weighted ensemble prediction"""
        ensemble_pred = (
            config.ENSEMBLE_WEIGHTS['xgboost'] * predictions['xgboost'] +
            config.ENSEMBLE_WEIGHTS['lightgbm'] * predictions['lightgbm'] +
            config.ENSEMBLE_WEIGHTS['ridge'] * predictions['ridge']
        )
        return ensemble_pred

In [10]:
print("Starting Smart Product Pricing Challenge Pipeline")
print("=" * 60)

start_time = time.time()

try:
        # 1. Load Data
    print("Step 1: Loading data...")
    train_df = pd.read_csv(config.TRAIN_CSV)
    test_df = pd.read_csv(config.TEST_CSV)

    print(f"Training samples: {len(train_df)}")
    print(f"Test samples: {len(test_df)}")
except Exception as e:
    print(f"Pipeline failed: {e}")
    raise

Starting Smart Product Pricing Challenge Pipeline
Step 1: Loading data...
Training samples: 75000
Test samples: 75000


In [11]:
try:
# Basic preprocessing
    train_df['catalog_content'] = train_df['catalog_content'].fillna('')
    test_df['catalog_content'] = test_df['catalog_content'].fillna('')

    # Create train/validation splits
    train_df, val_df = train_test_split(
        train_df, 
        test_size=config.VALIDATION_SPLIT,
        random_state=config.RANDOM_STATE,
        stratify=pd.qcut(train_df['price'], q=5, duplicates='drop')
    )

    print(f"Final train: {len(train_df)}, val: {len(val_df)}")
except Exception as e:
    print(f"Pipeline failed: {e}")
    raise

Final train: 60000, val: 15000


In [12]:
try:    # 2. Download Images (sample for Kaggle time limits)
    print("Step 2: Downloading sample images...")

    image_processor = ImageProcessor()

    # Download subset for demonstration (adjust based on Kaggle time limits)
    sample_size = min(1000, len(train_df))  # Limit for Kaggle
    train_sample = train_df.head(sample_size).copy()
    val_sample = val_df.head(min(200, len(val_df))).copy()
    test_sample = test_df.head(min(500, len(test_df))).copy()

    train_image_paths = image_processor.download_images_batch(train_sample)
    val_image_paths = image_processor.download_images_batch(val_sample)
    test_image_paths = image_processor.download_images_batch(test_sample)

    # Update dataframes with paths
    train_sample['image_path'] = train_image_paths
    val_sample['image_path'] = val_image_paths
    test_sample['image_path'] = test_image_paths
    clean_memory()
except Exception as e:
    print(f"Pipeline failed: {e}")
    raise
    

Step 2: Downloading sample images...


Downloading images: 100%|██████████| 1000/1000 [00:04<00:00, 246.82it/s]
Downloading images: 100%|██████████| 200/200 [00:00<00:00, 273.16it/s]
Downloading images: 100%|██████████| 500/500 [00:01<00:00, 253.23it/s]


In [13]:
try:    # 3. Extract Text Features
    print("Step 3: Extracting text features...")

    text_extractor = TextFeatureExtractor()

    train_text_features = text_extractor.extract_all_features(train_sample, fit=True)
    val_text_features = text_extractor.extract_all_features(val_sample, fit=False)
    test_text_features = text_extractor.extract_all_features(test_sample, fit=False)

    clean_memory()
except Exception as e:
    print(f"Pipeline failed: {e}")
    raise

Step 3: Extracting text features...
Loading BERT model...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Text feature extractor initialized


Extracting BERT features: 100%|██████████| 125/125 [00:04<00:00, 27.01it/s]


Text features shape: (1000, 815)


Extracting BERT features: 100%|██████████| 25/25 [00:00<00:00, 28.63it/s]


Text features shape: (200, 815)


Extracting BERT features: 100%|██████████| 63/63 [00:02<00:00, 29.55it/s]


Text features shape: (500, 815)


In [14]:
try:    # 4. Extract Image Features
    print("Step 4: Extracting image features...")

    image_extractor = ImageFeatureExtractor()

    train_image_features = image_extractor.extract_all_features(train_image_paths)
    val_image_features = image_extractor.extract_all_features(val_image_paths)
    test_image_features = image_extractor.extract_all_features(test_image_paths)

    clean_memory()
except Exception as e:
    print(f"Pipeline failed: {e}")
    raise

Step 4: Extracting image features...
Loading CNN model...


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 223MB/s]


Image feature extractor initialized


Extracting CNN features: 100%|██████████| 63/63 [00:38<00:00,  1.66it/s]


Image features shape: (1000, 2057)


Extracting CNN features: 100%|██████████| 13/13 [00:07<00:00,  1.74it/s]


Image features shape: (200, 2057)


Extracting CNN features: 100%|██████████| 32/32 [00:19<00:00,  1.66it/s]


Image features shape: (500, 2057)


In [15]:
    
try:
    # 5. Combine Features
    print("Step 5: Combining features...")

    X_train = np.hstack([train_text_features, train_image_features])
    X_val = np.hstack([val_text_features, val_image_features])
    X_test = np.hstack([test_text_features, test_image_features])

    y_train = train_sample['price'].values
    y_val = val_sample['price'].values

    print(f"Final feature dimensions: {X_train.shape}")
except Exception as e:
    print(f"Pipeline failed: {e}")
    raise
  

Step 5: Combining features...
Final feature dimensions: (1000, 2872)


In [16]:
  
try:
    # 6. Train Models
    print("Step 6: Training models...")

    trainer = ModelTrainer()
    models = trainer.train_all_models(X_train, y_train, X_val, y_val)
except Exception as e:
    print(f"Pipeline failed: {e}")
    raise

Step 6: Training models...
Training XGBoost...
Training LightGBM...




Training Ridge regression...


In [17]:
    
try:
    # 7. Evaluate Models
    print("Step 7: Evaluating models...")

    val_predictions = trainer.predict_all(X_val)

    for model_name, pred in val_predictions.items():
        smape = symmetric_mean_absolute_percentage_error(y_val, pred)
        rmse = np.sqrt(mean_squared_error(y_val, pred))
        print(f"{model_name} - SMAPE: {smape:.2f}%, RMSE: {rmse:.2f}")
except Exception as e:
    print(f"Pipeline failed: {e}")
    raise

Step 7: Evaluating models...
xgboost - SMAPE: 79.19%, RMSE: 43.80
lightgbm - SMAPE: 78.93%, RMSE: 43.65
ridge - SMAPE: 113.42%, RMSE: 51.65


In [18]:
try:
    # 8. Create Ensemble Predictions
    print("Step 8: Creating ensemble predictions...")

    ensemble_pred = trainer.create_ensemble_prediction(val_predictions)
    ensemble_smape = symmetric_mean_absolute_percentage_error(y_val, ensemble_pred)
    print(f"Ensemble SMAPE: {ensemble_smape:.2f}%")
except Exception as e:
    print(f"Pipeline failed: {e}")
    raise

Step 8: Creating ensemble predictions...
Ensemble SMAPE: 80.93%


In [19]:
try:
    # 9. Generate Test Predictions
    print("Step 9: Generating test predictions...")

    test_predictions = trainer.predict_all(X_test)
    final_predictions = trainer.create_ensemble_prediction(test_predictions)

    # Ensure positive predictions
    final_predictions = np.maximum(final_predictions, 0.01)
except Exception as e:
    print(f"Pipeline failed: {e}")
    raise

Step 9: Generating test predictions...


In [20]:
try:
    # 10. Create Submission
    print("Step 10: Creating submission...")

    submission_df = pd.DataFrame({
        'sample_id': test_sample['sample_id'],
        'price': final_predictions
    })

    submission_df.to_csv('submission.csv', index=False)

    # Final Summary
    elapsed_time = time.time() - start_time
    print("=" * 60)
    print("PIPELINE COMPLETED SUCCESSFULLY!")
    print("=" * 60)
    print(f"Total execution time: {elapsed_time:.2f} seconds")
    print(f"Best single model SMAPE: {min([symmetric_mean_absolute_percentage_error(y_val, pred) for pred in val_predictions.values()]):.2f}%")
    print(f"Ensemble SMAPE: {ensemble_smape:.2f}%")
    print(f"Submission shape: {submission_df.shape}")
    print(f"Price range: ${final_predictions.min():.2f} - ${final_predictions.max():.2f}")
    print("Submission saved as 'submission.csv'")

except Exception as e:
    print(f"Pipeline failed: {e}")
    raise

# if __name__ == "__main__":
#     submission = main()
#     print("\nPipeline completed! Check submission.csv for results.")

Step 10: Creating submission...
PIPELINE COMPLETED SUCCESSFULLY!
Total execution time: 586.17 seconds
Best single model SMAPE: 78.93%
Ensemble SMAPE: 80.93%
Submission shape: (500, 2)
Price range: $0.01 - $46.11
Submission saved as 'submission.csv'
