In [1]:
%pip install pandas numpy scikit-learn torch torchvision transformers pillow requests tqdm xgboost lightgbm optuna matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.


In [3]:
# ============================================================
# STEP 1: ENVIRONMENT SETUP & DATA LOADING
# Amazon ML Challenge 2025 - Product Price Prediction
# ============================================================

# Import core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# For image processing
from PIL import Image
import requests
from io import BytesIO
import urllib.request
from tqdm import tqdm
import time

# Machine learning libraries (we'll use more later)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)
sns.set_style('whitegrid')

print("✅ Libraries imported successfully!")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")


✅ Libraries imported successfully!
NumPy version: 2.0.1
Pandas version: 2.3.3


In [4]:
# ============================================================
# LOAD DATASETS
# ============================================================

# Load training data
train_df = pd.read_csv('/kaggle/input/dataset/train.csv')
test_df = pd.read_csv('/kaggle/input/dataset/test.csv')
sample_test = pd.read_csv('/kaggle/input/dataset/sample_test.csv')
sample_output = pd.read_csv('/kaggle/input/dataset/sample_test_out.csv')

print("✅ Data loaded successfully!")
print(f"\nTraining set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Sample test shape: {sample_test.shape}")
print(f"Sample output shape: {sample_output.shape}")


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/dataset/train.csv'

In [None]:
# ============================================================
# BASIC DATA EXPLORATION
# ============================================================

# Display first few rows
print("=" * 80)
print("TRAINING DATA - FIRST 5 ROWS")
print("=" * 80)
display(train_df.head())

print("\n" + "=" * 80)
print("DATA TYPES AND MISSING VALUES")
print("=" * 80)
print(train_df.info())

# Check for missing values
print("\n" + "=" * 80)
print("MISSING VALUES ANALYSIS")
print("=" * 80)
missing_train = train_df.isnull().sum()
missing_pct = (missing_train / len(train_df)) * 100
missing_df = pd.DataFrame({
    'Column': missing_train.index,
    'Missing_Count': missing_train.values,
    'Missing_Percentage': missing_pct.values
})
display(missing_df)

# Check test data missing values
print("\nTest Data Missing Values:")
print(test_df.isnull().sum())


In [None]:
# ============================================================
# TARGET VARIABLE ANALYSIS (PRICE)
# ============================================================

print("=" * 80)
print("PRICE DISTRIBUTION STATISTICS")
print("=" * 80)
print(train_df['price'].describe())

# Additional statistics
print(f"\nMin Price: ${train_df['price'].min():.2f}")
print(f"Max Price: ${train_df['price'].max():.2f}")
print(f"Mean Price: ${train_df['price'].mean():.2f}")
print(f"Median Price: ${train_df['price'].median():.2f}")
print(f"Std Dev: ${train_df['price'].std():.2f}")
print(f"Skewness: {train_df['price'].skew():.2f}")
print(f"Kurtosis: {train_df['price'].kurtosis():.2f}")

# Check for negative prices (data quality check)
negative_prices = train_df[train_df['price'] <= 0]
print(f"\n⚠️ Rows with negative/zero prices: {len(negative_prices)}")


In [None]:
# ============================================================
# PRICE DISTRIBUTION VISUALIZATIONS
# ============================================================

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Original price distribution
axes[0, 0].hist(train_df['price'], bins=100, color='steelblue', edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Price Distribution (Original)', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Price ($)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].grid(True, alpha=0.3)

# Log-transformed price (for better visualization)
axes[0, 1].hist(np.log1p(train_df['price']), bins=100, color='coral', edgecolor='black', alpha=0.7)
axes[0, 1].set_title('Price Distribution (Log-Transformed)', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Log(Price + 1)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].grid(True, alpha=0.3)

# Box plot for outlier detection
axes[1, 0].boxplot(train_df['price'], vert=True, patch_artist=True,
                    boxprops=dict(facecolor='lightgreen', alpha=0.7))
axes[1, 0].set_title('Price Box Plot (Outlier Detection)', fontsize=14, fontweight='bold')
axes[1, 0].set_ylabel('Price ($)')
axes[1, 0].grid(True, alpha=0.3)

# Q-Q plot for normality check
from scipy import stats
stats.probplot(train_df['price'], dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Q-Q Plot (Normality Check)', fontsize=14, fontweight='bold')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("📊 Price distribution visualizations created!")


In [None]:
# ============================================================
# CATALOG CONTENT ANALYSIS
# ============================================================

print("=" * 80)
print("CATALOG CONTENT ANALYSIS")
print("=" * 80)

# Text length analysis
train_df['catalog_length'] = train_df['catalog_content'].str.len()

print(f"Average catalog content length: {train_df['catalog_length'].mean():.0f} characters")
print(f"Min length: {train_df['catalog_length'].min()}")
print(f"Max length: {train_df['catalog_length'].max()}")
print(f"Median length: {train_df['catalog_length'].median():.0f}")

# Sample catalog content
print("\n" + "=" * 80)
print("SAMPLE CATALOG CONTENT (First 3 products)")
print("=" * 80)
for idx, row in train_df.head(3).iterrows():
    print(f"\nProduct {idx + 1} (Price: ${row['price']:.2f}):")
    print(f"Catalog: {row['catalog_content'][:300]}...")
    print(f"Image URL: {row['image_link']}")
    print("-" * 80)


In [None]:
# ============================================================
# IMAGE LINK ANALYSIS
# ============================================================

print("=" * 80)
print("IMAGE LINK ANALYSIS")
print("=" * 80)

# Check for missing image links
missing_images_train = train_df['image_link'].isnull().sum()
missing_images_test = test_df['image_link'].isnull().sum()

print(f"Missing image links in training: {missing_images_train} ({missing_images_train/len(train_df)*100:.2f}%)")
print(f"Missing image links in test: {missing_images_test} ({missing_images_test/len(test_df)*100:.2f}%)")

# Check image URL patterns
print("\nSample image URLs:")
display(train_df['image_link'].head(10))

# Check unique domains
train_df['image_domain'] = train_df['image_link'].str.extract(r'https?://([^/]+)/')
print(f"\nUnique image domains: {train_df['image_domain'].nunique()}")
print("\nTop image domains:")
print(train_df['image_domain'].value_counts().head())


In [None]:
# ============================================================
# CORRELATION ANALYSIS
# ============================================================

print("=" * 80)
print("CORRELATION ANALYSIS")
print("=" * 80)

# Create correlation between text length and price
correlation_data = pd.DataFrame({
    'catalog_length': train_df['catalog_length'],
    'price': train_df['price'],
    'log_price': np.log1p(train_df['price'])
})

# Calculate correlations
corr_matrix = correlation_data.corr()
print("\nCorrelation Matrix:")
display(corr_matrix)

# Visualize correlation
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, 
            square=True, linewidths=1, fmt='.3f')
plt.title('Correlation: Catalog Length vs Price', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Scatter plot
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(train_df['catalog_length'], train_df['price'], alpha=0.3, s=10)
plt.xlabel('Catalog Content Length')
plt.ylabel('Price ($)')
plt.title('Catalog Length vs Price')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.scatter(train_df['catalog_length'], np.log1p(train_df['price']), alpha=0.3, s=10)
plt.xlabel('Catalog Content Length')
plt.ylabel('Log(Price + 1)')
plt.title('Catalog Length vs Log(Price)')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# ============================================================
# PRICE RANGE ANALYSIS
# ============================================================

# Create price bins for analysis
train_df['price_range'] = pd.cut(train_df['price'], 
                                   bins=[0, 10, 25, 50, 100, 500, 1000, float('inf')],
                                   labels=['$0-10', '$10-25', '$25-50', '$50-100', 
                                          '$100-500', '$500-1000', '$1000+'])

print("=" * 80)
print("PRICE RANGE DISTRIBUTION")
print("=" * 80)
price_range_dist = train_df['price_range'].value_counts().sort_index()
print(price_range_dist)

# Visualize
plt.figure(figsize=(12, 6))
price_range_dist.plot(kind='bar', color='teal', alpha=0.7, edgecolor='black')
plt.title('Product Count by Price Range', fontsize=14, fontweight='bold')
plt.xlabel('Price Range')
plt.ylabel('Number of Products')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()


In [None]:
# ============================================================
# DATA QUALITY CHECKS
# ============================================================

print("=" * 80)
print("DATA QUALITY CHECKS")
print("=" * 80)

# Check for duplicates
duplicates = train_df.duplicated().sum()
print(f"Duplicate rows in training: {duplicates}")

# Check sample_id uniqueness
print(f"Unique sample_ids in train: {train_df['sample_id'].nunique()} / {len(train_df)}")
print(f"Unique sample_ids in test: {test_df['sample_id'].nunique()} / {len(test_df)}")

# Check if test sample_ids match expected output
test_ids = set(test_df['sample_id'].values)
sample_ids = set(sample_test['sample_id'].values)
output_ids = set(sample_output['sample_id'].values)

print(f"\nSample test IDs match output: {sample_ids == output_ids}")
print(f"Sample test size: {len(sample_test)}")
print(f"Sample output size: {len(sample_output)}")


In [None]:
# ============================================================
# SUMMARY STATISTICS
# ============================================================

print("\n" + "=" * 80)
print("DATASET SUMMARY FOR AMAZON ML CHALLENGE 2025")
print("=" * 80)

summary = {
    'Metric': [
        'Training Samples',
        'Test Samples',
        'Total Samples',
        'Features',
        'Target Variable',
        'Min Price',
        'Max Price',
        'Mean Price',
        'Median Price',
        'Price Std Dev',
        'Missing Images (Train)',
        'Missing Images (Test)',
        'Avg Catalog Length'
    ],
    'Value': [
        f"{len(train_df):,}",
        f"{len(test_df):,}",
        f"{len(train_df) + len(test_df):,}",
        f"{len(train_df.columns) - 1}",  # Excluding target
        "price (continuous)",
        f"${train_df['price'].min():.2f}",
        f"${train_df['price'].max():.2f}",
        f"${train_df['price'].mean():.2f}",
        f"${train_df['price'].median():.2f}",
        f"${train_df['price'].std():.2f}",
        f"{missing_images_train} ({missing_images_train/len(train_df)*100:.2f}%)",
        f"{missing_images_test} ({missing_images_test/len(test_df)*100:.2f}%)",
        f"{train_df['catalog_length'].mean():.0f} chars"
    ]
}

summary_df = pd.DataFrame(summary)
display(summary_df)

print("\n✅ STEP 1 COMPLETED: Data loaded and explored!")
print("📊 Next steps: Feature extraction, text processing, and image handling")


In [None]:
# ============================================================
# STEP 2: FEATURE ENGINEERING & TEXT PROCESSING
# Amazon ML Challenge 2025 - Extract Rich Features
# ============================================================

import re
from collections import Counter

print("=" * 80)
print("STEP 2: FEATURE ENGINEERING FROM CATALOG CONTENT")
print("=" * 80)

# ============================================================
# HELPER FUNCTIONS FOR TEXT EXTRACTION
# ============================================================

def extract_item_name(text):
    """Extract item name from catalog content"""
    match = re.search(r'Item Name:\s*(.+?)(?:\n|$)', text)
    return match.group(1).strip() if match else ''

def extract_value_unit(text):
    """Extract numerical value and unit (e.g., 12.0 Fl Oz)"""
    value_match = re.search(r'Value:\s*(\d+\.?\d*)', text)
    unit_match = re.search(r'Unit:\s*(.+?)(?:\n|$)', text)
    
    value = float(value_match.group(1)) if value_match else 0.0
    unit = unit_match.group(1).strip() if unit_match else ''
    
    return value, unit

def extract_pack_count(text):
    """Extract pack count from text (e.g., Pack of 6, 4-Pack)"""
    patterns = [
        r'[Pp]ack\s+of\s+(\d+)',
        r'\(Pack\s+of\s+(\d+)\)',
        r'(\d+)[- ][Pp]ack',
        r'(\d+)\s*[Cc]ount'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return int(match.group(1))
    return 1  # Default to 1 if no pack info found

def extract_all_numbers(text):
    """Extract all numeric values from text"""
    numbers = re.findall(r'\d+\.?\d*', text)
    return [float(n) for n in numbers] if numbers else []

def extract_weight_volume(text):
    """Extract weight or volume with units"""
    # Common patterns for weight/volume
    patterns = [
        r'(\d+\.?\d*)\s*(oz|ounce|lb|pound|kg|kilogram|g|gram)',
        r'(\d+\.?\d*)\s*(ml|milliliter|l|liter|fl\s*oz|gallon)',
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return float(match.group(1)), match.group(2).lower()
    return 0.0, ''

def extract_brand(text):
    """Extract potential brand name (first few words or capitalized words)"""
    item_name = extract_item_name(text)
    if item_name:
        # Get first 2-3 words as potential brand
        words = item_name.split()[:3]
        return ' '.join(words)
    return ''

def count_bullet_points(text):
    """Count number of bullet points in description"""
    return len(re.findall(r'Bullet Point \d+:', text))

def calculate_total_quantity(text):
    """Calculate total quantity (pack_count * unit_value)"""
    pack_count = extract_pack_count(text)
    value, unit = extract_value_unit(text)
    return pack_count * value if value > 0 else pack_count

print("✅ Helper functions defined successfully!")


In [None]:
# ============================================================
# APPLY FEATURE EXTRACTION TO TRAINING DATA
# ============================================================

print("\n" + "=" * 80)
print("EXTRACTING FEATURES FROM TRAINING DATA")
print("=" * 80)

# Create feature columns
train_df['item_name'] = train_df['catalog_content'].apply(extract_item_name)
train_df['brand'] = train_df['catalog_content'].apply(extract_brand)
train_df['unit_value'], train_df['unit_type'] = zip(*train_df['catalog_content'].apply(extract_value_unit))
train_df['pack_count'] = train_df['catalog_content'].apply(extract_pack_count)
train_df['total_quantity'] = train_df['catalog_content'].apply(calculate_total_quantity)
train_df['weight_value'], train_df['weight_unit'] = zip(*train_df['catalog_content'].apply(extract_weight_volume))
train_df['num_bullet_points'] = train_df['catalog_content'].apply(count_bullet_points)
train_df['word_count'] = train_df['catalog_content'].apply(lambda x: len(x.split()))
train_df['char_count'] = train_df['catalog_content'].apply(len)
train_df['num_numbers'] = train_df['catalog_content'].apply(lambda x: len(extract_all_numbers(x)))

# Text complexity features
train_df['avg_word_length'] = train_df['catalog_content'].apply(
    lambda x: np.mean([len(word) for word in x.split()]) if len(x.split()) > 0 else 0
)

print("✅ Features extracted successfully!")
print(f"\nNew feature columns added: {len(train_df.columns) - 4}")
print("\nFeature columns:")
for col in train_df.columns[4:]:  # Skip original columns
    print(f"  - {col}")


In [None]:
# ============================================================
# DISPLAY EXTRACTED FEATURES (SAMPLE)
# ============================================================

print("\n" + "=" * 80)
print("SAMPLE EXTRACTED FEATURES")
print("=" * 80)

# Select important columns to display
feature_cols = ['sample_id', 'item_name', 'brand', 'unit_value', 'unit_type', 
                'pack_count', 'total_quantity', 'num_bullet_points', 'price']

print("\nFirst 10 products with extracted features:")
display(train_df[feature_cols].head(10))

# Show statistics for numerical features
print("\n" + "=" * 80)
print("NUMERICAL FEATURE STATISTICS")
print("=" * 80)

numerical_features = ['unit_value', 'pack_count', 'total_quantity', 'weight_value',
                      'num_bullet_points', 'word_count', 'char_count', 'num_numbers', 
                      'avg_word_length']

display(train_df[numerical_features].describe())


In [None]:
# ============================================================
# UNIT TYPE ANALYSIS
# ============================================================

print("\n" + "=" * 80)
print("UNIT TYPE DISTRIBUTION")
print("=" * 80)

# Analyze unit types
unit_counts = train_df['unit_type'].value_counts()
print(f"\nUnique unit types: {len(unit_counts)}")
print("\nTop 20 unit types:")
print(unit_counts.head(20))

# Standardize common units
def standardize_unit(unit):
    """Standardize unit types to common categories"""
    unit = unit.lower().strip()
    
    # Weight units
    if unit in ['oz', 'ounce', 'ounces']:
        return 'oz'
    elif unit in ['lb', 'pound', 'pounds']:
        return 'lb'
    elif unit in ['kg', 'kilogram', 'kilograms']:
        return 'kg'
    elif unit in ['g', 'gram', 'grams']:
        return 'g'
    
    # Volume units
    elif unit in ['ml', 'milliliter', 'milliliters']:
        return 'ml'
    elif unit in ['l', 'liter', 'liters']:
        return 'l'
    elif 'fl oz' in unit or 'fluid ounce' in unit:
        return 'fl_oz'
    elif unit in ['gallon', 'gallons', 'gal']:
        return 'gallon'
    
    # Count units
    elif unit in ['count', 'piece', 'pieces', 'ct']:
        return 'count'
    
    return unit

train_df['unit_type_std'] = train_df['unit_type'].apply(standardize_unit)

print("\nStandardized unit types:")
print(train_df['unit_type_std'].value_counts().head(15))


In [None]:
# ============================================================
# CATEGORY/PRODUCT TYPE EXTRACTION
# ============================================================

print("\n" + "=" * 80)
print("PRODUCT CATEGORY IDENTIFICATION")
print("=" * 80)

# Define common food/product categories
def identify_category(text):
    """Identify product category from text"""
    text_lower = text.lower()
    
    categories = {
        'sauce': ['sauce', 'salsa', 'dressing', 'condiment'],
        'cookie': ['cookie', 'biscuit', 'wafer'],
        'soup': ['soup', 'broth', 'stew'],
        'cheese': ['cheese', 'cheddar', 'parmesan'],
        'wine': ['wine', 'cooking wine'],
        'snack': ['chip', 'snack', 'cracker', 'popcorn'],
        'beverage': ['coffee', 'tea', 'juice', 'drink', 'soda'],
        'pasta': ['pasta', 'noodle', 'spaghetti', 'macaroni'],
        'oil': ['oil', 'olive oil', 'vegetable oil'],
        'spice': ['spice', 'seasoning', 'powder', 'herb'],
        'cereal': ['cereal', 'granola', 'oatmeal'],
        'candy': ['candy', 'chocolate', 'sweet', 'gummy'],
        'baking': ['flour', 'sugar', 'baking', 'mix'],
        'canned': ['canned', 'can of'],
        'dried': ['dried', 'dehydrated']
    }
    
    for category, keywords in categories.items():
        for keyword in keywords:
            if keyword in text_lower:
                return category
    
    return 'other'

train_df['category'] = train_df['catalog_content'].apply(identify_category)

print("Category distribution:")
category_dist = train_df['category'].value_counts()
print(category_dist)

# Visualize
plt.figure(figsize=(12, 6))
category_dist.head(15).plot(kind='barh', color='teal', edgecolor='black', alpha=0.7)
plt.title('Top 15 Product Categories', fontsize=14, fontweight='bold')
plt.xlabel('Count')
plt.ylabel('Category')
plt.tight_layout()
plt.show()


In [None]:
# ============================================================
# PRICE ANALYSIS BY FEATURES
# ============================================================

print("\n" + "=" * 80)
print("PRICE ANALYSIS BY KEY FEATURES")
print("=" * 80)

# Price by pack count
print("\n1. Average Price by Pack Count:")
price_by_pack = train_df.groupby('pack_count')['price'].agg(['mean', 'median', 'count']).round(2)
display(price_by_pack.head(15))

# Price by category
print("\n2. Average Price by Category:")
price_by_category = train_df.groupby('category')['price'].agg(['mean', 'median', 'count']).round(2)
price_by_category = price_by_category.sort_values('mean', ascending=False)
display(price_by_category)

# Price by unit type
print("\n3. Average Price by Unit Type (Top 10):")
price_by_unit = train_df[train_df['unit_type_std'] != ''].groupby('unit_type_std')['price'].agg(['mean', 'median', 'count']).round(2)
price_by_unit = price_by_unit.sort_values('mean', ascending=False)
display(price_by_unit.head(10))


In [None]:
# ============================================================
# CORRELATION ANALYSIS WITH NEW FEATURES
# ============================================================

print("\n" + "=" * 80)
print("CORRELATION ANALYSIS: NEW FEATURES VS PRICE")
print("=" * 80)

# Select numerical features for correlation
correlation_features = ['unit_value', 'pack_count', 'total_quantity', 'weight_value',
                        'num_bullet_points', 'word_count', 'char_count', 
                        'num_numbers', 'avg_word_length', 'price']

# Calculate correlation matrix
corr_matrix = train_df[correlation_features].corr()

# Display correlations with price
price_corr = corr_matrix['price'].sort_values(ascending=False)
print("\nFeature Correlations with Price:")
print(price_corr)

# Visualize correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', center=0, 
            square=True, linewidths=1, fmt='.3f', cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


In [None]:
# ============================================================
# VISUALIZE KEY RELATIONSHIPS
# ============================================================

fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# 1. Pack Count vs Price
axes[0, 0].scatter(train_df['pack_count'], train_df['price'], alpha=0.3, s=10)
axes[0, 0].set_xlabel('Pack Count')
axes[0, 0].set_ylabel('Price ($)')
axes[0, 0].set_title('Pack Count vs Price')
axes[0, 0].set_xlim(0, 50)
axes[0, 0].grid(True, alpha=0.3)

# 2. Total Quantity vs Log Price
axes[0, 1].scatter(train_df['total_quantity'], np.log1p(train_df['price']), alpha=0.3, s=10, color='coral')
axes[0, 1].set_xlabel('Total Quantity')
axes[0, 1].set_ylabel('Log(Price + 1)')
axes[0, 1].set_title('Total Quantity vs Log(Price)')
axes[0, 1].set_xlim(0, 500)
axes[0, 1].grid(True, alpha=0.3)

# 3. Word Count vs Price
axes[0, 2].scatter(train_df['word_count'], train_df['price'], alpha=0.3, s=10, color='green')
axes[0, 2].set_xlabel('Word Count')
axes[0, 2].set_ylabel('Price ($)')
axes[0, 2].set_title('Word Count vs Price')
axes[0, 2].grid(True, alpha=0.3)

# 4. Number of Bullet Points vs Price
bp_price = train_df.groupby('num_bullet_points')['price'].mean()
axes[1, 0].bar(bp_price.index, bp_price.values, color='purple', alpha=0.7, edgecolor='black')
axes[1, 0].set_xlabel('Number of Bullet Points')
axes[1, 0].set_ylabel('Average Price ($)')
axes[1, 0].set_title('Bullet Points vs Average Price')
axes[1, 0].grid(True, alpha=0.3, axis='y')

# 5. Category vs Average Price (Top 10)
category_price = train_df.groupby('category')['price'].mean().sort_values(ascending=False).head(10)
axes[1, 1].barh(category_price.index, category_price.values, color='teal', alpha=0.7, edgecolor='black')
axes[1, 1].set_xlabel('Average Price ($)')
axes[1, 1].set_ylabel('Category')
axes[1, 1].set_title('Top 10 Categories by Price')
axes[1, 1].grid(True, alpha=0.3, axis='x')

# 6. Unit Value Distribution
axes[1, 2].hist(train_df[train_df['unit_value'] > 0]['unit_value'], bins=50, 
                color='orange', alpha=0.7, edgecolor='black')
axes[1, 2].set_xlabel('Unit Value')
axes[1, 2].set_ylabel('Frequency')
axes[1, 2].set_title('Unit Value Distribution')
axes[1, 2].set_xlim(0, 100)
axes[1, 2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# ============================================================
# APPLY SAME FEATURE EXTRACTION TO TEST DATA
# ============================================================

print("\n" + "=" * 80)
print("EXTRACTING FEATURES FROM TEST DATA")
print("=" * 80)

# Apply same transformations to test data
test_df['item_name'] = test_df['catalog_content'].apply(extract_item_name)
test_df['brand'] = test_df['catalog_content'].apply(extract_brand)
test_df['unit_value'], test_df['unit_type'] = zip(*test_df['catalog_content'].apply(extract_value_unit))
test_df['pack_count'] = test_df['catalog_content'].apply(extract_pack_count)
test_df['total_quantity'] = test_df['catalog_content'].apply(calculate_total_quantity)
test_df['weight_value'], test_df['weight_unit'] = zip(*test_df['catalog_content'].apply(extract_weight_volume))
test_df['num_bullet_points'] = test_df['catalog_content'].apply(count_bullet_points)
test_df['word_count'] = test_df['catalog_content'].apply(lambda x: len(x.split()))
test_df['char_count'] = test_df['catalog_content'].apply(len)
test_df['num_numbers'] = test_df['catalog_content'].apply(lambda x: len(extract_all_numbers(x)))
test_df['avg_word_length'] = test_df['catalog_content'].apply(
    lambda x: np.mean([len(word) for word in x.split()]) if len(x.split()) > 0 else 0
)
test_df['unit_type_std'] = test_df['unit_type'].apply(standardize_unit)
test_df['category'] = test_df['catalog_content'].apply(identify_category)

print("✅ Features extracted from test data!")
print(f"Test data shape: {test_df.shape}")
print(f"\nSample test features:")
display(test_df[['sample_id', 'brand', 'pack_count', 'total_quantity', 'category']].head())


In [None]:
# ============================================================
# ENCODING CATEGORICAL FEATURES - HANDLE UNSEEN VALUES
# ============================================================

print("\n" + "=" * 80)
print("ENCODING CATEGORICAL FEATURES (FIXED VERSION)")
print("=" * 80)

import numpy as np
from sklearn.preprocessing import LabelEncoder

def safe_label_encode_v2(train_series, test_series, column_name):
    """
    Safely encode labels by fitting on combined unique values
    """
    # Fill NaN with 'unknown'
    train_clean = train_series.fillna('unknown').astype(str)
    test_clean = test_series.fillna('unknown').astype(str)
    
    # Get all unique values from both train and test
    all_unique_values = list(set(train_clean.unique()) | set(test_clean.unique()))
    
    # Add 'unknown' if not present
    if 'unknown' not in all_unique_values:
        all_unique_values.append('unknown')
    
    # Create and fit encoder on all possible values
    le = LabelEncoder()
    le.fit(all_unique_values)
    
    # Transform both datasets
    train_encoded = le.transform(train_clean)
    test_encoded = le.transform(test_clean)
    
    # Count how many test values were not in training
    train_set = set(train_clean.unique())
    test_set = set(test_clean.unique())
    unseen = test_set - train_set
    
    if unseen:
        num_unseen = sum(test_clean.isin(unseen))
        print(f"  ⚠️  {column_name}: {len(unseen)} unique unseen categories ({num_unseen} total occurrences)")
        print(f"      Examples: {list(unseen)[:3]}")
    else:
        print(f"  ✅ {column_name}: No unseen categories in test data")
    
    return train_encoded, test_encoded, le

# Apply safe encoding to unit_type_std
print("\n1. Encoding unit_type_std...")
train_df['unit_type_encoded'], test_df['unit_type_encoded'], le_unit = safe_label_encode_v2(
    train_df['unit_type_std'], 
    test_df['unit_type_std'],
    'unit_type_std'
)

# Apply safe encoding to category
print("\n2. Encoding category...")
train_df['category_encoded'], test_df['category_encoded'], le_category = safe_label_encode_v2(
    train_df['category'], 
    test_df['category'],
    'category'
)

print("\n✅ Categorical encoding completed successfully!")
print(f"\nTotal unique unit types encoded: {len(le_unit.classes_)}")
print(f"Total unique categories encoded: {len(le_category.classes_)}")

# Verify encoding worked
print("\n" + "=" * 80)
print("ENCODING VERIFICATION")
print("=" * 80)
print(f"\nTrain unit_type_encoded - Min: {train_df['unit_type_encoded'].min()}, Max: {train_df['unit_type_encoded'].max()}")
print(f"Test unit_type_encoded - Min: {test_df['unit_type_encoded'].min()}, Max: {test_df['unit_type_encoded'].max()}")
print(f"\nTrain category_encoded - Min: {train_df['category_encoded'].min()}, Max: {train_df['category_encoded'].max()}")
print(f"Test category_encoded - Min: {test_df['category_encoded'].min()}, Max: {test_df['category_encoded'].max()}")

print("\n✅ Encoding ranges are compatible!")


In [None]:
# ============================================================
# FREQUENCY ENCODING FOR ADDITIONAL ROBUSTNESS
# ============================================================

print("\n" + "=" * 80)
print("FREQUENCY ENCODING (HANDLES UNSEEN VALUES NATURALLY)")
print("=" * 80)

def frequency_encoding(train_series, test_series, column_name):
    """
    Encode categories by their frequency in training data
    Unseen categories get a default frequency
    """
    # Calculate frequency from training data
    freq_map = train_series.value_counts(normalize=True).to_dict()
    
    # Default frequency for unseen categories (use minimum frequency or small value)
    default_freq = min(freq_map.values()) if freq_map else 0.0001
    
    # Apply encoding
    train_encoded = train_series.map(lambda x: freq_map.get(x, default_freq))
    test_encoded = test_series.map(lambda x: freq_map.get(x, default_freq))
    
    # Count unseen
    test_set = set(test_series.dropna().unique())
    train_set = set(train_series.dropna().unique())
    unseen = test_set - train_set
    
    if unseen:
        unseen_count = sum(test_series.isin(unseen))
        print(f"  {column_name}: {len(unseen)} unseen categories ({unseen_count} occurrences) → frequency: {default_freq:.6f}")
    else:
        print(f"  ✅ {column_name}: No unseen categories")
    
    return train_encoded, test_encoded

# Apply frequency encoding
print("\nApplying frequency encoding...\n")
train_df['unit_type_freq'], test_df['unit_type_freq'] = frequency_encoding(
    train_df['unit_type_std'], 
    test_df['unit_type_std'],
    'unit_type_std'
)

train_df['category_freq'], test_df['category_freq'] = frequency_encoding(
    train_df['category'], 
    test_df['category'],
    'category'
)

print("\n✅ Frequency encoding completed!")
print(f"\nFrequency encoding statistics:")
print(f"  unit_type_freq - Min: {train_df['unit_type_freq'].min():.6f}, Max: {train_df['unit_type_freq'].max():.6f}")
print(f"  category_freq - Min: {train_df['category_freq'].min():.6f}, Max: {train_df['category_freq'].max():.6f}")


In [None]:
# ============================================================
# HANDLE PRICE TRANSFORMATION FOR MODELING
# ============================================================

print("\n" + "=" * 80)
print("PRICE TRANSFORMATION ANALYSIS")
print("=" * 80)

# Create log-transformed price (helps with skewed distribution)
train_df['log_price'] = np.log1p(train_df['price'])

# Compare distributions
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].hist(train_df['price'], bins=100, color='steelblue', alpha=0.7, edgecolor='black')
axes[0].set_title('Original Price Distribution', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Price ($)')
axes[0].set_ylabel('Frequency')
axes[0].axvline(train_df['price'].median(), color='red', linestyle='--', linewidth=2, label=f'Median: ${train_df["price"].median():.2f}')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

axes[1].hist(train_df['log_price'], bins=100, color='coral', alpha=0.7, edgecolor='black')
axes[1].set_title('Log-Transformed Price Distribution', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Log(Price + 1)')
axes[1].set_ylabel('Frequency')
axes[1].axvline(train_df['log_price'].median(), color='red', linestyle='--', linewidth=2, label=f'Median: {train_df["log_price"].median():.2f}')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\n📊 Distribution Comparison:")
print(f"  Original Price Skewness: {train_df['price'].skew():.2f}")
print(f"  Log Price Skewness: {train_df['log_price'].skew():.2f}")
print(f"\n✅ Log transformation reduces skewness by {abs(train_df['price'].skew() - train_df['log_price'].skew()):.2f}!")


In [None]:
# ============================================================
# CREATE FINAL FEATURE SET FOR MODELING
# ============================================================

print("\n" + "=" * 80)
print("FINAL FEATURE SET FOR MODELING")
print("=" * 80)

# Define all engineered features
numerical_features = [
    'unit_value',
    'pack_count',
    'total_quantity',
    'weight_value',
    'num_bullet_points',
    'word_count',
    'char_count',
    'num_numbers',
    'avg_word_length'
]

categorical_features_encoded = [
    'unit_type_encoded',
    'category_encoded'
]

frequency_features = [
    'unit_type_freq',
    'category_freq'
]

# All features combined
all_features = numerical_features + categorical_features_encoded + frequency_features

print(f"📊 Total features for modeling: {len(all_features)}")
print("\nFeature breakdown:")
print(f"  ✓ Numerical features: {len(numerical_features)}")
print(f"  ✓ Encoded categorical: {len(categorical_features_encoded)}")
print(f"  ✓ Frequency encoded: {len(frequency_features)}")

print("\n" + "=" * 80)
print("ALL FEATURES LIST")
print("=" * 80)
for i, feat in enumerate(all_features, 1):
    print(f"  {i:2d}. {feat}")

# Verify all features exist in both datasets
missing_in_train = [f for f in all_features if f not in train_df.columns]
missing_in_test = [f for f in all_features if f not in test_df.columns]

if missing_in_train:
    print(f"\n⚠️  WARNING: Missing in train: {missing_in_train}")
if missing_in_test:
    print(f"⚠️  WARNING: Missing in test: {missing_in_test}")
    
if not missing_in_train and not missing_in_test:
    print("\n✅ All features present in both train and test datasets!")


In [None]:
# ============================================================
# FEATURE STATISTICS
# ============================================================

print("\n" + "=" * 80)
print("FEATURE STATISTICS")
print("=" * 80)

print("\nTraining data feature statistics:")
display(train_df[all_features].describe())

print("\n" + "=" * 80)
print("FEATURE CORRELATION WITH PRICE")
print("=" * 80)

# Calculate correlation with target
feature_correlations = train_df[all_features + ['price']].corr()['price'].sort_values(ascending=False)[1:]
print("\nTop 10 Features by Correlation with Price:")
print(feature_correlations.head(10))

# Visualize
plt.figure(figsize=(10, 8))
feature_correlations.plot(kind='barh', color='teal', edgecolor='black', alpha=0.7)
plt.title('Feature Correlations with Price', fontsize=14, fontweight='bold')
plt.xlabel('Correlation Coefficient')
plt.ylabel('Features')
plt.axvline(0, color='black', linestyle='-', linewidth=0.8)
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()


In [None]:
# ============================================================
# SAVE PROCESSED DATA
# ============================================================

print("\n" + "=" * 80)
print("SAVE PROCESSED DATASETS")
print("=" * 80)

# Save to CSV for later use
train_df.to_csv('train_processed.csv', index=False)
test_df.to_csv('test_processed.csv', index=False)

print("✅ Processed datasets saved!")
print(f"  📁 train_processed.csv: {train_df.shape}")
print(f"  📁 test_processed.csv: {test_df.shape}")

# Save feature list for later use
import json

feature_config = {
    'all_features': all_features,
    'numerical_features': numerical_features,
    'categorical_features': categorical_features_encoded,
    'frequency_features': frequency_features,
    'target': 'price',
    'log_target': 'log_price'
}

with open('feature_config.json', 'w') as f:
    json.dump(feature_config, f, indent=2)

print(f"  📁 feature_config.json: Feature configuration saved")

print("\n" + "=" * 80)
print("✅✅✅ STEP 2 COMPLETED SUCCESSFULLY! ✅✅✅")
print("=" * 80)
print("\n📊 Summary of achievements:")
print(f"  ✓ Extracted {len(all_features)} powerful engineered features")
print(f"  ✓ Handled {len(le_unit.classes_)} unit types (including unseen values)")
print(f"  ✓ Handled {len(le_category.classes_)} product categories")
print(f"  ✓ Applied dual encoding strategy (label + frequency)")
print(f"  ✓ Log-transformed target reduces skewness from 13.60 → {train_df['log_price'].skew():.2f}")
print(f"  ✓ Datasets saved and ready for modeling")
print(f"\n🎯 Next step: Build baseline and advanced ML models!")
print(f"   Recommended models: XGBoost, LightGBM, CatBoost")


In [None]:
# ============================================================
# STEP 3: BUILD BASELINE & ADVANCED ML MODELS
# ============================================================

print("=" * 80)
print("STEP 3: MODEL BUILDING & TRAINING")
print("=" * 80)

# Import ML libraries
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

print("✅ ML libraries imported successfully!")


In [None]:
# ============================================================
# CUSTOM SMAPE METRIC (EVALUATION METRIC)
# ============================================================

def calculate_smape(y_true, y_pred):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE)
    SMAPE = (100/n) * Σ |y_pred - y_true| / ((|y_true| + |y_pred|)/2)
    """
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    
    # Avoid division by zero
    denominator = np.where(denominator == 0, 1e-10, denominator)
    
    smape = np.mean(numerator / denominator) * 100
    return smape

# Custom scorer for sklearn cross-validation
from sklearn.metrics import make_scorer

def smape_scorer(y_true, y_pred):
    """Negative SMAPE for sklearn optimization (sklearn maximizes scores)"""
    return -calculate_smape(y_true, y_pred)

smape_score = make_scorer(smape_scorer, greater_is_better=True)

print("\n✅ Custom SMAPE metric defined!")
print(f"Test SMAPE calculation: {calculate_smape(np.array([100, 200]), np.array([110, 180])):.2f}%")


In [None]:
# ============================================================
# PREPARE DATA FOR MODELING
# ============================================================

print("\n" + "=" * 80)
print("PREPARING DATA FOR MODELING")
print("=" * 80)

# Load processed data
train_df = pd.read_csv('train_processed.csv')
test_df = pd.read_csv('test_processed.csv')

# Load feature configuration
import json
with open('feature_config.json', 'r') as f:
    feature_config = json.load(f)

all_features = feature_config['all_features']

# Prepare feature matrices
X_train = train_df[all_features].copy()
y_train = train_df['price'].copy()
y_train_log = train_df['log_price'].copy()

X_test = test_df[all_features].copy()
test_ids = test_df['sample_id'].copy()

# Handle any missing values
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

print(f"✅ Data prepared!")
print(f"   Training samples: {X_train.shape[0]:,}")
print(f"   Test samples: {X_test.shape[0]:,}")
print(f"   Features: {X_train.shape[1]}")
print(f"   Target range: ${y_train.min():.2f} - ${y_train.max():.2f}")
print(f"   Log target range: {y_train_log.min():.2f} - {y_train_log.max():.2f}")


In [None]:
# ============================================================
# TRAIN-VALIDATION SPLIT FOR MODEL EVALUATION
# ============================================================

from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42
)

y_tr_log, y_val_log = np.log1p(y_tr), np.log1p(y_val)

print("\n" + "=" * 80)
print("TRAIN-VALIDATION SPLIT")
print("=" * 80)
print(f"Training set: {X_tr.shape[0]:,} samples")
print(f"Validation set: {X_val.shape[0]:,} samples")


In [None]:
# ============================================================
# MODEL 1: BASELINE - SIMPLE GRADIENT BOOSTING
# ============================================================

print("\n" + "=" * 80)
print("MODEL 1: BASELINE GRADIENT BOOSTING")
print("=" * 80)

from sklearn.ensemble import GradientBoostingRegressor

baseline_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    verbose=0
)

print("Training baseline model...")
baseline_model.fit(X_tr, y_tr_log)

# Predictions (inverse log transform)
baseline_pred_val = np.expm1(baseline_model.predict(X_val))
baseline_pred_val = np.maximum(baseline_pred_val, 0.1)  # Ensure positive

# Calculate metrics
baseline_smape = calculate_smape(y_val, baseline_pred_val)
baseline_mae = mean_absolute_error(y_val, baseline_pred_val)
baseline_rmse = np.sqrt(mean_squared_error(y_val, baseline_pred_val))

print(f"\n✅ Baseline Model Results:")
print(f"   SMAPE: {baseline_smape:.4f}%")
print(f"   MAE: ${baseline_mae:.2f}")
print(f"   RMSE: ${baseline_rmse:.2f}")


In [None]:
# ============================================================
# MODEL 2: XGBOOST (OPTIMIZED)
# ============================================================

print("\n" + "=" * 80)
print("MODEL 2: XGBOOST")
print("=" * 80)

xgb_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.05,
    'max_depth': 7,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'random_state': 42,
    'n_estimators': 500,
    'early_stopping_rounds': 50,
    'verbose': 0
}

print("Training XGBoost model...")
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(
    X_tr, y_tr_log,
    eval_set=[(X_val, y_val_log)],
    verbose=False
)

# Predictions
xgb_pred_val = np.expm1(xgb_model.predict(X_val))
xgb_pred_val = np.maximum(xgb_pred_val, 0.1)

# Metrics
xgb_smape = calculate_smape(y_val, xgb_pred_val)
xgb_mae = mean_absolute_error(y_val, xgb_pred_val)
xgb_rmse = np.sqrt(mean_squared_error(y_val, xgb_pred_val))

print(f"\n✅ XGBoost Results:")
print(f"   SMAPE: {xgb_smape:.4f}%")
print(f"   MAE: ${xgb_mae:.2f}")
print(f"   RMSE: ${xgb_rmse:.2f}")
print(f"   Best iteration: {xgb_model.best_iteration}")


In [None]:
# ============================================================
# MODEL 3: LIGHTGBM (FAST & ACCURATE)
# ============================================================

print("\n" + "=" * 80)
print("MODEL 3: LIGHTGBM")
print("=" * 80)

lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 63,
    'max_depth': 8,
    'min_child_samples': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'random_state': 42,
    'n_estimators': 500,
    'verbose': -1
}

print("Training LightGBM model...")
lgb_model = lgb.LGBMRegressor(**lgb_params)
lgb_model.fit(
    X_tr, y_tr_log,
    eval_set=[(X_val, y_val_log)],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
)

# Predictions
lgb_pred_val = np.expm1(lgb_model.predict(X_val))
lgb_pred_val = np.maximum(lgb_pred_val, 0.1)

# Metrics
lgb_smape = calculate_smape(y_val, lgb_pred_val)
lgb_mae = mean_absolute_error(y_val, lgb_pred_val)
lgb_rmse = np.sqrt(mean_squared_error(y_val, lgb_pred_val))

print(f"\n✅ LightGBM Results:")
print(f"   SMAPE: {lgb_smape:.4f}%")
print(f"   MAE: ${lgb_mae:.2f}")
print(f"   RMSE: ${lgb_rmse:.2f}")
print(f"   Best iteration: {lgb_model.best_iteration_}")


In [None]:
# ============================================================
# MODEL 4: CATBOOST (ROBUST & ACCURATE)
# ============================================================

print("\n" + "=" * 80)
print("MODEL 4: CATBOOST")
print("=" * 80)

cat_params = {
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 8,
    'l2_leaf_reg': 3,
    'subsample': 0.8,
    'random_strength': 1,
    'bagging_temperature': 1,
    'random_seed': 42,
    'verbose': 0,
    'early_stopping_rounds': 50,
    'loss_function': 'RMSE'
}

print("Training CatBoost model...")
cat_model = CatBoostRegressor(**cat_params)
cat_model.fit(
    X_tr, y_tr_log,
    eval_set=(X_val, y_val_log),
    verbose=False
)

# Predictions
cat_pred_val = np.expm1(cat_model.predict(X_val))
cat_pred_val = np.maximum(cat_pred_val, 0.1)

# Metrics
cat_smape = calculate_smape(y_val, cat_pred_val)
cat_mae = mean_absolute_error(y_val, cat_pred_val)
cat_rmse = np.sqrt(mean_squared_error(y_val, cat_pred_val))

print(f"\n✅ CatBoost Results:")
print(f"   SMAPE: {cat_smape:.4f}%")
print(f"   MAE: ${cat_mae:.2f}")
print(f"   RMSE: ${cat_rmse:.2f}")
print(f"   Best iteration: {cat_model.get_best_iteration()}")


In [None]:
# ============================================================
# MODEL COMPARISON
# ============================================================

print("\n" + "=" * 80)
print("MODEL PERFORMANCE COMPARISON")
print("=" * 80)

results_df = pd.DataFrame({
    'Model': ['Baseline GB', 'XGBoost', 'LightGBM', 'CatBoost'],
    'SMAPE (%)': [baseline_smape, xgb_smape, lgb_smape, cat_smape],
    'MAE ($)': [baseline_mae, xgb_mae, lgb_mae, cat_mae],
    'RMSE ($)': [baseline_rmse, xgb_rmse, lgb_rmse, cat_rmse]
})

results_df = results_df.sort_values('SMAPE (%)')
display(results_df)

# Visualize comparison
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# SMAPE comparison
axes[0].bar(results_df['Model'], results_df['SMAPE (%)'], color='steelblue', edgecolor='black', alpha=0.7)
axes[0].set_title('SMAPE Comparison (Lower is Better)', fontsize=12, fontweight='bold')
axes[0].set_ylabel('SMAPE (%)')
axes[0].set_xlabel('Model')
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(True, alpha=0.3, axis='y')

# MAE comparison
axes[1].bar(results_df['Model'], results_df['MAE ($)'], color='coral', edgecolor='black', alpha=0.7)
axes[1].set_title('MAE Comparison (Lower is Better)', fontsize=12, fontweight='bold')
axes[1].set_ylabel('MAE ($)')
axes[1].set_xlabel('Model')
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(True, alpha=0.3, axis='y')

# RMSE comparison
axes[2].bar(results_df['Model'], results_df['RMSE ($)'], color='green', edgecolor='black', alpha=0.7)
axes[2].set_title('RMSE Comparison (Lower is Better)', fontsize=12, fontweight='bold')
axes[2].set_ylabel('RMSE ($)')
axes[2].set_xlabel('Model')
axes[2].tick_params(axis='x', rotation=45)
axes[2].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print(f"\n🏆 Best Model: {results_df.iloc[0]['Model']} (SMAPE: {results_df.iloc[0]['SMAPE (%)']:.4f}%)")


In [None]:
# ============================================================
# STEP 4: ADVANCED ENSEMBLE & FINAL PREDICTIONS
# ============================================================

In [None]:
# ============================================================
# SIMPLE WEIGHTED AVERAGE ENSEMBLE
# ============================================================

print("\n" + "=" * 80)
print("WEIGHTED AVERAGE ENSEMBLE")
print("=" * 80)

# Calculate weights based on inverse SMAPE (better models get higher weights)
smape_scores = np.array([xgb_smape, lgb_smape, cat_smape])
weights = 1 / smape_scores
weights = weights / weights.sum()  # Normalize to sum to 1

print("Ensemble weights based on SMAPE performance:")
print(f"  XGBoost:  {weights[0]:.4f} (SMAPE: {xgb_smape:.4f}%)")
print(f"  LightGBM: {weights[1]:.4f} (SMAPE: {lgb_smape:.4f}%)")
print(f"  CatBoost: {weights[2]:.4f} (SMAPE: {cat_smape:.4f}%)")

# Create weighted ensemble prediction
ensemble_pred_val = (
    weights[0] * xgb_pred_val +
    weights[1] * lgb_pred_val +
    weights[2] * cat_pred_val
)

# Calculate ensemble SMAPE
ensemble_smape = calculate_smape(y_val, ensemble_pred_val)
ensemble_mae = mean_absolute_error(y_val, ensemble_pred_val)
ensemble_rmse = np.sqrt(mean_squared_error(y_val, ensemble_pred_val))

print(f"\n✅ Weighted Ensemble Results:")
print(f"   SMAPE: {ensemble_smape:.4f}%")
print(f"   MAE: ${ensemble_mae:.2f}")
print(f"   RMSE: ${ensemble_rmse:.2f}")

if ensemble_smape < min(xgb_smape, lgb_smape, cat_smape):
    print(f"\n🎉 Ensemble improved SMAPE by {min(xgb_smape, lgb_smape, cat_smape) - ensemble_smape:.4f}%!")
else:
    print(f"\n⚠️  Ensemble SMAPE: {ensemble_smape:.4f}% vs Best Single Model: {min(xgb_smape, lgb_smape, cat_smape):.4f}%")


In [None]:
# ============================================================
# K-FOLD CROSS-VALIDATION FOR ROBUSTNESS
# ============================================================

print("\n" + "=" * 80)
print("5-FOLD CROSS-VALIDATION")
print("=" * 80)

from sklearn.model_selection import KFold

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Store CV predictions
xgb_cv_scores = []
lgb_cv_scores = []
cat_cv_scores = []

# Store OOF (Out-of-Fold) predictions for ensemble
oof_xgb = np.zeros(len(X_train))
oof_lgb = np.zeros(len(X_train))
oof_cat = np.zeros(len(X_train))

# Store test predictions from each fold
test_xgb = np.zeros(len(X_test))
test_lgb = np.zeros(len(X_test))
test_cat = np.zeros(len(X_test))

print("Training models with 5-fold cross-validation...\n")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), 1):
    print(f"Fold {fold}/{n_folds}")
    
    X_tr_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    y_tr_log_fold = np.log1p(y_tr_fold)
    y_val_log_fold = np.log1p(y_val_fold)
    
    # XGBoost
    xgb_fold = xgb.XGBRegressor(**xgb_params)
    xgb_fold.fit(X_tr_fold, y_tr_log_fold, eval_set=[(X_val_fold, y_val_log_fold)], verbose=False)
    
    oof_xgb[val_idx] = np.expm1(xgb_fold.predict(X_val_fold))
    test_xgb += np.expm1(xgb_fold.predict(X_test)) / n_folds
    
    fold_xgb_smape = calculate_smape(y_val_fold, np.maximum(oof_xgb[val_idx], 0.1))
    xgb_cv_scores.append(fold_xgb_smape)
    
    # LightGBM
    lgb_fold = lgb.LGBMRegressor(**lgb_params)
    lgb_fold.fit(X_tr_fold, y_tr_log_fold, eval_set=[(X_val_fold, y_val_log_fold)], 
                 callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])
    
    oof_lgb[val_idx] = np.expm1(lgb_fold.predict(X_val_fold))
    test_lgb += np.expm1(lgb_fold.predict(X_test)) / n_folds
    
    fold_lgb_smape = calculate_smape(y_val_fold, np.maximum(oof_lgb[val_idx], 0.1))
    lgb_cv_scores.append(fold_lgb_smape)
    
    # CatBoost
    cat_fold = CatBoostRegressor(**cat_params)
    cat_fold.fit(X_tr_fold, y_tr_log_fold, eval_set=(X_val_fold, y_val_log_fold), verbose=False)
    
    oof_cat[val_idx] = np.expm1(cat_fold.predict(X_val_fold))
    test_cat += np.expm1(cat_fold.predict(X_test)) / n_folds
    
    fold_cat_smape = calculate_smape(y_val_fold, np.maximum(oof_cat[val_idx], 0.1))
    cat_cv_scores.append(fold_cat_smape)
    
    print(f"  XGBoost SMAPE: {fold_xgb_smape:.4f}% | LightGBM: {fold_lgb_smape:.4f}% | CatBoost: {fold_cat_smape:.4f}%\n")

# Ensure positive predictions
oof_xgb = np.maximum(oof_xgb, 0.1)
oof_lgb = np.maximum(oof_lgb, 0.1)
oof_cat = np.maximum(oof_cat, 0.1)
test_xgb = np.maximum(test_xgb, 0.1)
test_lgb = np.maximum(test_lgb, 0.1)
test_cat = np.maximum(test_cat, 0.1)

print("=" * 80)
print("CROSS-VALIDATION RESULTS")
print("=" * 80)
print(f"XGBoost  - Mean SMAPE: {np.mean(xgb_cv_scores):.4f}% (±{np.std(xgb_cv_scores):.4f}%)")
print(f"LightGBM - Mean SMAPE: {np.mean(lgb_cv_scores):.4f}% (±{np.std(lgb_cv_scores):.4f}%)")
print(f"CatBoost - Mean SMAPE: {np.mean(cat_cv_scores):.4f}% (±{np.std(cat_cv_scores):.4f}%)")

# Calculate OOF SMAPE
oof_xgb_smape = calculate_smape(y_train, oof_xgb)
oof_lgb_smape = calculate_smape(y_train, oof_lgb)
oof_cat_smape = calculate_smape(y_train, oof_cat)

print(f"\nOut-of-Fold SMAPE on entire training set:")
print(f"  XGBoost:  {oof_xgb_smape:.4f}%")
print(f"  LightGBM: {oof_lgb_smape:.4f}%")
print(f"  CatBoost: {oof_cat_smape:.4f}%")


In [None]:
# ============================================================
# OPTIMIZED ENSEMBLE ON OOF PREDICTIONS
# ============================================================

print("\n" + "=" * 80)
print("OPTIMIZED WEIGHTED ENSEMBLE")
print("=" * 80)

# Calculate optimal weights using OOF predictions
oof_smape_scores = np.array([oof_xgb_smape, oof_lgb_smape, oof_cat_smape])
optimal_weights = 1 / oof_smape_scores
optimal_weights = optimal_weights / optimal_weights.sum()

print("Optimal ensemble weights:")
print(f"  XGBoost:  {optimal_weights[0]:.4f}")
print(f"  LightGBM: {optimal_weights[1]:.4f}")
print(f"  CatBoost: {optimal_weights[2]:.4f}")

# Create ensemble OOF predictions
oof_ensemble = (
    optimal_weights[0] * oof_xgb +
    optimal_weights[1] * oof_lgb +
    optimal_weights[2] * oof_cat
)

oof_ensemble_smape = calculate_smape(y_train, oof_ensemble)
print(f"\n✅ Ensemble OOF SMAPE: {oof_ensemble_smape:.4f}%")

# Create ensemble test predictions
test_ensemble = (
    optimal_weights[0] * test_xgb +
    optimal_weights[1] * test_lgb +
    optimal_weights[2] * test_cat
)


In [None]:
# ============================================================
# FINAL MODEL COMPARISON
# ============================================================


final_results = pd.DataFrame({
    'Model': ['XGBoost', 'LightGBM', 'CatBoost', 'Weighted Ensemble'],
    'OOF SMAPE (%)': [oof_xgb_smape, oof_lgb_smape, oof_cat_smape, oof_ensemble_smape],
    'CV Mean (%)': [np.mean(xgb_cv_scores), np.mean(lgb_cv_scores), np.mean(cat_cv_scores), '-'],
    'CV Std (%)': [np.std(xgb_cv_scores), np.std(lgb_cv_scores), np.std(cat_cv_scores), '-']
})

final_results = final_results.sort_values('OOF SMAPE (%)')
display(final_results)

# Visualize
plt.figure(figsize=(12, 6))
plt.bar(final_results['Model'], final_results['OOF SMAPE (%)'], 
        color=['steelblue', 'coral', 'green', 'purple'], 
        edgecolor='black', alpha=0.7)
plt.title('Final Model Comparison - Out-of-Fold SMAPE', fontsize=14, fontweight='bold')
plt.ylabel('SMAPE (%)')
plt.xlabel('Model')
plt.axhline(y=final_results['OOF SMAPE (%)'].min(), color='red', linestyle='--', 
            linewidth=2, label=f"Best: {final_results['OOF SMAPE (%)'].min():.4f}%")
plt.legend()
plt.grid(True, alpha=0.3, axis='y')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print(f"\n🏆 Best performing model: {final_results.iloc[0]['Model']}")
print(f"   OOF SMAPE: {final_results.iloc[0]['OOF SMAPE (%)']:.4f}%")


In [None]:
# ============================================================
# GENERATE FINAL SUBMISSION FILE
# ============================================================

# Use ensemble predictions as final submission
final_predictions = test_ensemble

# Create submission dataframe
submission = pd.DataFrame({
    'sample_id': test_ids,
    'price': final_predictions
})

# Ensure all predictions are positive
submission['price'] = submission['price'].clip(lower=0.1)

# Sort by sample_id (required by some competitions)
submission = submission.sort_values('sample_id').reset_index(drop=True)

# Save submission file
submission.to_csv('test_out.csv', index=False)

print("✅ Submission file created: test_out.csv")
print(f"\nSubmission file details:")
print(f"  Total predictions: {len(submission):,}")
print(f"  Price range: ${submission['price'].min():.2f} - ${submission['price'].max():.2f}")
print(f"  Mean price: ${submission['price'].mean():.2f}")
print(f"  Median price: ${submission['price'].median():.2f}")

print("\nFirst 10 predictions:")
display(submission.head(10))

print("\nLast 10 predictions:")
display(submission.tail(10))

# Verify submission format
print("\n" + "=" * 80)
print("SUBMISSION FORMAT VERIFICATION")
print("=" * 80)
print(f"✅ Column names: {list(submission.columns)}")
print(f"✅ Number of rows: {len(submission)}")
print(f"✅ Number of unique sample_ids: {submission['sample_id'].nunique()}")
print(f"✅ Any missing values: {submission.isnull().sum().sum()}")
print(f"✅ Any negative prices: {(submission['price'] < 0).sum()}")

if len(submission) == 75000 and submission['sample_id'].nunique() == 75000:
    print("\n🎉 Submission file is VALID and ready for upload!")
else:
    print("\n⚠️  WARNING: Check submission file format!")


In [None]:
# ============================================================
# FEATURE IMPORTANCE ANALYSIS
# ============================================================

# Remove early_stopping_rounds for final training (or provide eval_set)
xgb_params_final = xgb_params.copy()
xgb_params_final.pop('early_stopping_rounds', None)

# Get feature importance from XGBoost (best model)
xgb_final = xgb.XGBRegressor(**xgb_params_final)
xgb_final.fit(X_train, np.log1p(y_train), verbose=False)

feature_importance = pd.DataFrame({
    'feature': all_features,
    'importance': xgb_final.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
display(feature_importance.head(10))

# Visualize
plt.figure(figsize=(12, 8))
plt.barh(feature_importance['feature'].head(13), 
         feature_importance['importance'].head(13),
         color='teal', edgecolor='black', alpha=0.7)
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.title('Feature Importance (XGBoost)', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()


In [None]:
# ============================================================
# STEP 5A: IMAGE DOWNLOAD & PROCESSING SETUP
# Amazon ML Challenge 2025 - Add Image Features
# ============================================================

# Import additional libraries for image processing
from PIL import Image
import requests
from io import BytesIO
import concurrent.futures
from tqdm import tqdm
import hashlib
import os

# Create directories for images
os.makedirs('images/train', exist_ok=True)
os.makedirs('images/test', exist_ok=True)

print("✅ Image processing libraries imported!")
print("✅ Image directories created!")


In [None]:
# ============================================================
# IMAGE DOWNLOAD UTILITY (BATCH PROCESSING)
# ============================================================

def download_image(url, save_path, timeout=5, retries=3):
    """
    Download image from URL with retry logic
    Returns: True if successful, False otherwise
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout, stream=True)
            if response.status_code == 200:
                img = Image.open(BytesIO(response.content))
                # Convert to RGB if necessary
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                # Resize to standard size (224x224 for most CNNs)
                img = img.resize((224, 224), Image.Resampling.LANCZOS)
                img.save(save_path)
                return True
        except Exception as e:
            if attempt == retries - 1:
                return False
            time.sleep(0.5)
    return False

def batch_download_images(df, split='train', max_workers=32, max_images=None):
    """
    Download images in parallel with progress bar
    """
    print(f"\n{'='*80}")
    print(f"DOWNLOADING {split.upper()} IMAGES")
    print(f"{'='*80}")
    
    # Limit images for faster testing
    if max_images:
        df_subset = df.head(max_images).copy()
        print(f"⚠️  Limited to {max_images} images for faster processing")
    else:
        df_subset = df.copy()
    
    successful = 0
    failed = 0
    
    def download_row(row):
        sample_id = row['sample_id']
        url = row['image_link']
        save_path = f'images/{split}/{sample_id}.jpg'
        
        # Skip if already downloaded
        if os.path.exists(save_path):
            return True
        
        return download_image(url, save_path)
    
    # Use ThreadPoolExecutor for parallel downloads
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(
            executor.map(download_row, [row for _, row in df_subset.iterrows()]),
            total=len(df_subset),
            desc=f"Downloading {split} images"
        ))
    
    successful = sum(results)
    failed = len(results) - successful
    
    print(f"\n✅ Download complete!")
    print(f"   Successful: {successful:,} images")
    print(f"   Failed: {failed:,} images")
    print(f"   Success rate: {successful/len(results)*100:.2f}%")
    
    return successful, failed

# Download training images (limit to subset for speed)
print("\n⏱️  Note: Full download of 75K images takes ~30-60 minutes")
print("Starting with 10,000 images for faster testing...")

train_success, train_failed = batch_download_images(
    train_df, 
    split='train', 
    max_workers=32,
    max_images=10000  # Start with 10K for speed; remove for full dataset
)


In [None]:
# ============================================================
# DOWNLOAD TEST IMAGES
# ============================================================

test_success, test_failed = batch_download_images(
    test_df, 
    split='test', 
    max_workers=32,
    max_images=10000  # Start with 10K for speed
)

In [None]:
# ============================================================
# STEP 5B: EXTRACT IMAGE FEATURES USING RESNET50
# ============================================================

print("\n" + "=" * 80)
print("IMAGE FEATURE EXTRACTION USING RESNET50")
print("=" * 80)

# Import Keras/TensorFlow for feature extraction
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image as keras_image
from tensorflow.keras.models import Model

# Suppress TensorFlow warnings
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Load pre-trained ResNet50 (without top classification layer)
print("Loading ResNet50 model...")
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
print(f"✅ ResNet50 loaded! Output shape: {base_model.output_shape}")

def extract_image_features(image_path, model):
    """
    Extract features from a single image using ResNet50
    Returns: 2048-dim feature vector
    """
    try:
        # Load and preprocess image
        img = keras_image.load_img(image_path, target_size=(224, 224))
        img_array = keras_image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)
        
        # Extract features
        features = model.predict(img_array, verbose=0)
        return features.flatten()
    except Exception as e:
        # Return zero vector if image fails
        return np.zeros(2048)

def batch_extract_features(df, split='train', model=None):
    """
    Extract features for all images in dataframe
    """
    print(f"\nExtracting features for {split} images...")
    
    feature_list = []
    sample_ids = []
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Extracting {split} features"):
        sample_id = row['sample_id']
        image_path = f'images/{split}/{sample_id}.jpg'
        
        if os.path.exists(image_path):
            features = extract_image_features(image_path, model)
            feature_list.append(features)
            sample_ids.append(sample_id)
        else:
            # Use zero features for missing images
            feature_list.append(np.zeros(2048))
            sample_ids.append(sample_id)
    
    # Convert to numpy array
    features_array = np.array(feature_list)
    
    print(f"✅ Extracted features shape: {features_array.shape}")
    return features_array, sample_ids

# Extract features from training images
train_image_features, train_sample_ids = batch_extract_features(
    train_df.head(10000),  # Match downloaded images
    split='train',
    model=base_model
)

# Extract features from test images
test_image_features, test_sample_ids = batch_extract_features(
    test_df.head(10000),  # Match downloaded images
    split='test',
    model=base_model
)

# Save features for later use
np.save('train_image_features.npy', train_image_features)
np.save('test_image_features.npy', test_image_features)
np.save('train_image_ids.npy', train_sample_ids)
np.save('test_image_ids.npy', test_sample_ids)

print("\n✅ Image features saved to disk!")


In [None]:
# ============================================================
# STEP 5C: COMBINE TEXT + IMAGE FEATURES
# ============================================================

print("\n" + "=" * 80)
print("COMBINING TEXT AND IMAGE FEATURES")
print("=" * 80)

# Load processed data
train_df_full = pd.read_csv('train_processed.csv')
test_df_full = pd.read_csv('test_processed.csv')

# Get text features (from previous step)
text_features = all_features  # 13 text features from Step 2

# Load image features
train_img_feat = np.load('train_image_features.npy')
test_img_feat = np.load('test_image_features.npy')

# Create image feature column names
img_feature_names = [f'img_feat_{i}' for i in range(train_img_feat.shape[1])]

# Add image features to train dataframe (first 10K rows)
for i, col_name in enumerate(img_feature_names):
    train_df_full.loc[:9999, col_name] = train_img_feat[:, i]

# Add image features to test dataframe (first 10K rows)
for i, col_name in enumerate(img_feature_names):
    test_df_full.loc[:9999, col_name] = test_img_feat[:, i]

# Fill missing image features with 0 (for rows without downloaded images)
train_df_full[img_feature_names] = train_df_full[img_feature_names].fillna(0)
test_df_full[img_feature_names] = test_df_full[img_feature_names].fillna(0)

# Combined feature list
combined_features = text_features + img_feature_names

print(f"✅ Combined features created!")
print(f"   Text features: {len(text_features)}")
print(f"   Image features: {len(img_feature_names)}")
print(f"   Total features: {len(combined_features)}")

# Prepare data with combined features
X_train_combined = train_df_full.loc[:9999, combined_features].values
y_train_combined = train_df_full.loc[:9999, 'price'].values
y_train_log_combined = np.log1p(y_train_combined)

X_test_combined = test_df_full.loc[:9999, combined_features].values
test_ids_combined = test_df_full.loc[:9999, 'sample_id'].values

print(f"\nCombined training data shape: {X_train_combined.shape}")
print(f"Combined test data shape: {X_test_combined.shape}")


In [None]:
# ============================================================
# STEP 5D: TRAIN IMPROVED MODEL WITH IMAGE FEATURES
# (Run this after feature extraction completes)
# ============================================================

print("\n" + "=" * 80)
print("TRAINING IMPROVED MODELS WITH TEXT + IMAGE FEATURES")
print("=" * 80)

# Split data for validation
from sklearn.model_selection import train_test_split

X_tr_img, X_val_img, y_tr_img, y_val_img = train_test_split(
    X_train_combined, y_train_combined, test_size=0.15, random_state=42
)

y_tr_log_img = np.log1p(y_tr_img)
y_val_log_img = np.log1p(y_val_img)

print(f"Training set: {X_tr_img.shape[0]:,} samples")
print(f"Validation set: {X_val_img.shape[0]:,} samples")
print(f"Features: {X_tr_img.shape[1]} (13 text + 2048 image)")


In [None]:
# ============================================================
# TRAIN XGBOOST WITH IMAGE FEATURES
# ============================================================

print("\n" + "=" * 80)
print("XGBOOST WITH IMAGE FEATURES")
print("=" * 80)

# Optimized parameters for larger feature space
xgb_img_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.03,  # Slightly lower for stability
    'max_depth': 6,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.5,  # Lower due to many features
    'colsample_bylevel': 0.7,
    'gamma': 0.1,
    'reg_alpha': 0.5,  # Higher regularization for many features
    'reg_lambda': 2.0,
    'random_state': 42,
    'n_estimators': 300,
    'tree_method': 'gpu_hist',  # Use GPU acceleration!
    'gpu_id': 0,
    'early_stopping_rounds': 30
}

print("Training XGBoost with GPU acceleration...")
xgb_img_model = xgb.XGBRegressor(**xgb_img_params)
xgb_img_model.fit(
    X_tr_img, y_tr_log_img,
    eval_set=[(X_val_img, y_val_log_img)],
    verbose=True
)

# Predictions
xgb_img_pred_val = np.expm1(xgb_img_model.predict(X_val_img))
xgb_img_pred_val = np.maximum(xgb_img_pred_val, 0.1)

# Calculate metrics
xgb_img_smape = calculate_smape(y_val_img, xgb_img_pred_val)
xgb_img_mae = mean_absolute_error(y_val_img, xgb_img_pred_val)
xgb_img_rmse = np.sqrt(mean_squared_error(y_val_img, xgb_img_pred_val))

print(f"\n✅ XGBoost + Images Results:")
print(f"   SMAPE: {xgb_img_smape:.4f}%")
print(f"   MAE: ${xgb_img_mae:.2f}")
print(f"   RMSE: ${xgb_img_rmse:.2f}")
print(f"\n🎯 Improvement from text-only: {58.09 - xgb_img_smape:.4f}% reduction in SMAPE")


In [None]:
# ============================================================
# TRAIN LIGHTGBM WITH IMAGE FEATURES
# ============================================================

print("\n" + "=" * 80)
print("LIGHTGBM WITH IMAGE FEATURES")
print("=" * 80)

lgb_img_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.03,
    'num_leaves': 50,
    'max_depth': 7,
    'min_child_samples': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.5,
    'feature_fraction': 0.7,
    'reg_alpha': 0.5,
    'reg_lambda': 2.0,
    'random_state': 42,
    'n_estimators': 300,
    'device': 'gpu',  # Use GPU
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'verbose': -1
}

print("Training LightGBM with GPU acceleration...")
lgb_img_model = lgb.LGBMRegressor(**lgb_img_params)
lgb_img_model.fit(
    X_tr_img, y_tr_log_img,
    eval_set=[(X_val_img, y_val_log_img)],
    callbacks=[lgb.early_stopping(30), lgb.log_evaluation(10)]
)

# Predictions
lgb_img_pred_val = np.expm1(lgb_img_model.predict(X_val_img))
lgb_img_pred_val = np.maximum(lgb_img_pred_val, 0.1)

# Calculate metrics
lgb_img_smape = calculate_smape(y_val_img, lgb_img_pred_val)
lgb_img_mae = mean_absolute_error(y_val_img, lgb_img_pred_val)
lgb_img_rmse = np.sqrt(mean_squared_error(y_val_img, lgb_img_pred_val))

print(f"\n✅ LightGBM + Images Results:")
print(f"   SMAPE: {lgb_img_smape:.4f}%")
print(f"   MAE: ${lgb_img_mae:.2f}")
print(f"   RMSE: ${lgb_img_rmse:.2f}")


In [None]:
# ============================================================
# COMPARISON: TEXT-ONLY VS TEXT+IMAGE
# ============================================================

print("\n" + "=" * 80)
print("PERFORMANCE COMPARISON: TEXT-ONLY VS TEXT+IMAGE")
print("=" * 80)

comparison_df = pd.DataFrame({
    'Model': [
        'XGBoost (Text Only)',
        'XGBoost (Text + Image)',
        'LightGBM (Text Only)',
        'LightGBM (Text + Image)'
    ],
    'SMAPE (%)': [
        58.09,
        xgb_img_smape,
        58.33,
        lgb_img_smape
    ],
    'Improvement': [
        '-',
        f'{58.09 - xgb_img_smape:.2f}%',
        '-',
        f'{58.33 - lgb_img_smape:.2f}%'
    ]
})

display(comparison_df)

# Visualize improvement
plt.figure(figsize=(12, 6))
models = ['XGBoost\n(Text Only)', 'XGBoost\n(Text+Image)', 'LightGBM\n(Text Only)', 'LightGBM\n(Text+Image)']
scores = [58.09, xgb_img_smape, 58.33, lgb_img_smape]
colors = ['lightcoral', 'lightgreen', 'lightcoral', 'lightgreen']

bars = plt.bar(models, scores, color=colors, edgecolor='black', alpha=0.7)
plt.ylabel('SMAPE (%)', fontsize=12)
plt.title('Performance Improvement: Adding Image Features', fontsize=14, fontweight='bold')
plt.axhline(y=50, color='red', linestyle='--', linewidth=2, label='Target: Top 10 (< 50%)')
plt.axhline(y=40, color='green', linestyle='--', linewidth=2, label='Target: Top 3 (< 40%)')
plt.legend()
plt.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.2f}%',
             ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

best_smape = min(xgb_img_smape, lgb_img_smape)
print(f"\n🏆 Best Model with Images: {best_smape:.4f}% SMAPE")
print(f"📈 Total improvement: {58.09 - best_smape:.4f}%")

if best_smape < 50:
    print(f"\n🎉 EXCELLENT! You're in Top 10-20 range!")
elif best_smape < 55:
    print(f"\n✅ GOOD! You're in Top 20-50 range!")
else:
    print(f"\n💪 Making progress! Consider more images or fine-tuning.")


In [None]:
# ============================================================
# GENERATE IMPROVED PREDICTIONS FOR TEST SET
# ============================================================

print("\n" + "=" * 80)
print("GENERATING IMPROVED TEST PREDICTIONS")
print("=" * 80)

# Use best model for predictions
if xgb_img_smape < lgb_img_smape:
    best_model_img = xgb_img_model
    best_model_name = 'XGBoost + Images'
else:
    best_model_img = lgb_img_model
    best_model_name = 'LightGBM + Images'

print(f"Using {best_model_name} for final predictions...")

# Predict on test set (with image features)
test_predictions_img = np.expm1(best_model_img.predict(X_test_combined))
test_predictions_img = np.maximum(test_predictions_img, 0.1)

# Create improved submission
submission_improved = pd.DataFrame({
    'sample_id': test_ids_combined,
    'price': test_predictions_img
})

submission_improved = submission_improved.sort_values('sample_id').reset_index(drop=True)
submission_improved.to_csv('test_out_improved.csv', index=False)

print(f"\n✅ Improved submission saved: test_out_improved.csv")
print(f"   Using: {best_model_name}")
print(f"   Expected SMAPE: ~{best_smape:.2f}%")
print(f"   Predictions: {len(submission_improved):,} (first 10K with images)")

print("\n⚠️  NOTE: This submission covers only first 10K samples")
print("For full 75K submission, you need to:")
print("1. Download all 75K images (both train + test)")
print("2. Extract features for all images")
print("3. Retrain on full dataset")
print("4. Generate predictions for all 75K test samples")


In [None]:
# ============================================================
# STRATEGY 1: DIMENSIONALITY REDUCTION 
# ============================================================

print("\n" + "=" * 80)
print("STRATEGY 1: REDUCE IMAGE FEATURE DIMENSIONS")
print("=" * 80)

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Reduce 2048 image features to 50-100 principal components
print("Applying PCA to reduce image features from 2048 → 100 dimensions...")

# Standardize image features first
scaler = StandardScaler()
train_img_scaled = scaler.fit_transform(train_image_features)
test_img_scaled = scaler.transform(test_image_features)

# Apply PCA
pca = PCA(n_components=100, random_state=42)
train_img_pca = pca.fit_transform(train_img_scaled)
test_img_pca = pca.transform(test_img_scaled)

print(f"✅ Explained variance: {pca.explained_variance_ratio_.sum()*100:.2f}%")

# Create new feature names
pca_feature_names = [f'img_pca_{i}' for i in range(100)]

# Combine with text features
train_df_pca = train_df_full.head(10000).copy()
test_df_pca = test_df_full.head(10000).copy()

# Add PCA features
for i, col_name in enumerate(pca_feature_names):
    train_df_pca[col_name] = train_img_pca[:, i]
    test_df_pca[col_name] = test_img_pca[:, i]

# Combined features: 13 text + 100 PCA image = 113 total
reduced_features = text_features + pca_feature_names

print(f"✅ Reduced features: {len(reduced_features)} (13 text + 100 PCA image)")

# Prepare data
X_train_pca = train_df_pca[reduced_features].values
y_train_pca = train_df_pca['price'].values
y_train_log_pca = np.log1p(y_train_pca)

X_test_pca = test_df_pca[reduced_features].values
test_ids_pca = test_df_pca['sample_id'].values

# Split for validation
X_tr_pca, X_val_pca, y_tr_pca, y_val_pca = train_test_split(
    X_train_pca, y_train_pca, test_size=0.15, random_state=42
)
y_tr_log_pca = np.log1p(y_tr_pca)
y_val_log_pca = np.log1p(y_val_pca)

print(f"\nTraining samples: {X_tr_pca.shape[0]:,}")
print(f"Features: {X_tr_pca.shape[1]} (much better ratio!)")


In [None]:
# ============================================================
# TRAIN WITH REDUCED FEATURES
# ============================================================

print("\n" + "=" * 80)
print("XGBOOST WITH PCA-REDUCED IMAGE FEATURES")
print("=" * 80)

xgb_pca_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.05,
    'max_depth': 7,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'random_state': 42,
    'n_estimators': 500,
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'early_stopping_rounds': 50
}

print("Training XGBoost with PCA features...")
xgb_pca_model = xgb.XGBRegressor(**xgb_pca_params)
xgb_pca_model.fit(
    X_tr_pca, y_tr_log_pca,
    eval_set=[(X_val_pca, y_val_log_pca)],
    verbose=False
)

# Predictions
xgb_pca_pred_val = np.expm1(xgb_pca_model.predict(X_val_pca))
xgb_pca_pred_val = np.maximum(xgb_pca_pred_val, 0.1)

# Calculate metrics
xgb_pca_smape = calculate_smape(y_val_pca, xgb_pca_pred_val)
xgb_pca_mae = mean_absolute_error(y_val_pca, xgb_pca_pred_val)
xgb_pca_rmse = np.sqrt(mean_squared_error(y_val_pca, xgb_pca_pred_val))

print(f"\n✅ XGBoost + PCA Images Results:")
print(f"   SMAPE: {xgb_pca_smape:.4f}%")
print(f"   MAE: ${xgb_pca_mae:.2f}")
print(f"   RMSE: ${xgb_pca_rmse:.2f}")

improvement = 58.09 - xgb_pca_smape
if improvement > 0:
    print(f"\n🎉 SUCCESS! Improvement: {improvement:.4f}% reduction in SMAPE")
else:
    print(f"\n⚠️  Still worse by {abs(improvement):.4f}%")


In [None]:
# ============================================================
# STRATEGY 2: TEXT EMBEDDINGS (FASTER & OFTEN BETTER)
# ============================================================

print("\n" + "=" * 80)
print("STRATEGY 2: ADD TEXT EMBEDDINGS (FASTER APPROACH)")
print("=" * 80)

# Install if needed
try:
    from sentence_transformers import SentenceTransformer
    print("✅ sentence-transformers already installed")
except:
    print("Installing sentence-transformers...")
    import subprocess
    subprocess.check_call(['pip', 'install', '-q', 'sentence-transformers'])
    from sentence_transformers import SentenceTransformer

# Load lightweight sentence transformer
print("\nLoading sentence transformer model...")
text_model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Model loaded! (384-dim embeddings)")

def extract_text_embeddings(df, text_column='catalog_content'):
    """Extract text embeddings from catalog content"""
    texts = df[text_column].tolist()
    print(f"Extracting embeddings for {len(texts):,} texts...")
    embeddings = text_model.encode(texts, show_progress_bar=True, batch_size=128)
    return embeddings

# Extract text embeddings
train_text_emb = extract_text_embeddings(train_df_full.head(10000))
test_text_emb = extract_text_embeddings(test_df_full.head(10000))

# Save embeddings
np.save('train_text_embeddings.npy', train_text_emb)
np.save('test_text_embeddings.npy', test_text_emb)

print(f"\n✅ Text embeddings extracted: {train_text_emb.shape}")
print(f"   Dimension: 384 (much smaller than 2048 image features!)")


In [None]:
# ============================================================
# COMBINE TEXT FEATURES + TEXT EMBEDDINGS
# ============================================================

print("\n" + "=" * 80)
print("TRAINING WITH TEXT FEATURES + TEXT EMBEDDINGS")
print("=" * 80)

# Create text embedding feature names
text_emb_names = [f'text_emb_{i}' for i in range(384)]

# Add to dataframe
train_df_textemb = train_df_full.head(10000).copy()
test_df_textemb = test_df_full.head(10000).copy()

for i, col_name in enumerate(text_emb_names):
    train_df_textemb[col_name] = train_text_emb[:, i]
    test_df_textemb[col_name] = test_text_emb[:, i]

# Combined: 13 engineered + 384 embeddings = 397 features
textemb_features = text_features + text_emb_names

print(f"Combined features: {len(textemb_features)}")

# Prepare data
X_train_textemb = train_df_textemb[textemb_features].values
y_train_textemb = train_df_textemb['price'].values

X_test_textemb = test_df_textemb[textemb_features].values

# Split
X_tr_te, X_val_te, y_tr_te, y_val_te = train_test_split(
    X_train_textemb, y_train_textemb, test_size=0.15, random_state=42
)
y_tr_log_te = np.log1p(y_tr_te)
y_val_log_te = np.log1p(y_val_te)

# Train XGBoost
print("\nTraining XGBoost with text embeddings...")
xgb_te_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.7,
    reg_alpha=0.2,
    reg_lambda=1.5,
    n_estimators=500,
    tree_method='gpu_hist',
    random_state=42
)

xgb_te_model.fit(
    X_tr_te, y_tr_log_te,
    eval_set=[(X_val_te, y_val_log_te)],
    verbose=False
)

# Predict
xgb_te_pred = np.expm1(xgb_te_model.predict(X_val_te))
xgb_te_pred = np.maximum(xgb_te_pred, 0.1)

# Metrics
xgb_te_smape = calculate_smape(y_val_te, xgb_te_pred)
xgb_te_mae = mean_absolute_error(y_val_te, xgb_te_pred)

print(f"\n✅ XGBoost + Text Embeddings Results:")
print(f"   SMAPE: {xgb_te_smape:.4f}%")
print(f"   MAE: ${xgb_te_mae:.2f}")

if xgb_te_smape < 58.09:
    print(f"\n🎉 IMPROVED! {58.09 - xgb_te_smape:.4f}% better than text-only")


In [None]:
# ============================================================
# FINAL COMPARISON & DECISION
# ============================================================

print("\n" + "=" * 80)
print("COMPLETE PERFORMANCE COMPARISON")
print("=" * 80)

results_complete = pd.DataFrame({
    'Approach': [
        '1. Text Only (Original)',
        '2. Text + Raw Images (2048 feat)',
        '3. Text + PCA Images (100 feat)',
        '4. Text + Text Embeddings (384 feat)'
    ],
    'Features': [13, 2061, 113, 397],
    'SMAPE (%)': [
        58.09,
        59.52,
        xgb_pca_smape,
        xgb_te_smape
    ]
})

results_complete = results_complete.sort_values('SMAPE (%)')
display(results_complete)

best_approach = results_complete.iloc[0]
print(f"\n🏆 BEST APPROACH: {best_approach['Approach']}")
print(f"   SMAPE: {best_approach['SMAPE (%)']:.4f}%")
print(f"   Features: {int(best_approach['Features'])}")

# Visualize
plt.figure(figsize=(12, 6))
plt.bar(results_complete['Approach'], results_complete['SMAPE (%)'], 
        color=['green' if x == results_complete['SMAPE (%)'].min() else 'coral' 
               for x in results_complete['SMAPE (%)']],
        edgecolor='black', alpha=0.7)
plt.ylabel('SMAPE (%)', fontsize=12)
plt.title('Performance Comparison: Different Feature Strategies', fontsize=14, fontweight='bold')
plt.axhline(y=50, color='red', linestyle='--', label='Top 10 Target')
plt.xticks(rotation=15, ha='right')
plt.legend()
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()


In [None]:
# ============================================================
# RECOMMENDATION & SUBMISSION STRATEGY
# ============================================================

print("\n" + "=" * 80)
print("📊 FINAL RECOMMENDATION")
print("=" * 80)

best_smape = min(58.09, xgb_pca_smape, xgb_te_smape)

print("\n🎯 SUBMISSION STRATEGY:")
print("=" * 80)

if best_smape == 58.09:
    print("✅ STICK WITH TEXT-ONLY MODEL (58.09% SMAPE)")
    print("\nReason: Adding features made things worse with limited data")
    print("\nRECOMMENDATION:")
    print("1. Submit your ORIGINAL text-only submission (test_out.csv)")
    print("2. It's your best model with 58.09% SMAPE")
    print("3. Expected rank: Top 50-100")
    print("\nTO IMPROVE FURTHER:")
    print("- Process full 75K dataset (not just 10K)")
    print("- Use text embeddings across all data")
    print("- Try ensemble of text-only models with different seeds")
else:
    print(f"✅ USE IMPROVED MODEL ({best_smape:.4f}% SMAPE)")
    print("\nGenerate new submission with best approach...")

print("\n" + "=" * 80)
print("⏰ TIME CHECK")
print("=" * 80)
print("Current time: 10:23 AM IST")
print("Estimated deadline: 7:00 PM IST (or check competition page)")
print("Time remaining: ~8-9 hours")

print("\n💡 QUICK WIN STRATEGIES:")
print("1. ✅ Submit current best model NOW")
print("2. Process full 75K dataset if time permits")
print("3. Add text embeddings to full dataset")
print("4. Cross-validation ensemble on full data")


In [None]:
# ============================================================
# STRATEGY: FULL DATASET WITH TEXT EMBEDDINGS
# ============================================================

print("\n" + "=" * 80)
print("🚀 WINNING STRATEGY: TEXT EMBEDDINGS ON FULL 75K DATASET")
print("=" * 80)

# Load full datasets
train_df_full = pd.read_csv('train_processed.csv')
test_df_full = pd.read_csv('test_processed.csv')

print(f"Full training data: {len(train_df_full):,} samples")
print(f"Full test data: {len(test_df_full):,} samples")

# Extract text embeddings for FULL dataset
print("\n🔥 Extracting embeddings for FULL 75K train + 75K test...")
print("⏱️  This will take ~5-6 minutes...")

from sentence_transformers import SentenceTransformer

# Load model (already cached)
text_model = SentenceTransformer('all-MiniLM-L6-v2')

# Extract for full train
print("\n1. Processing 75K training texts...")
train_text_emb_full = text_model.encode(
    train_df_full['catalog_content'].tolist(), 
    show_progress_bar=True, 
    batch_size=256
)

# Extract for full test
print("\n2. Processing 75K test texts...")
test_text_emb_full = text_model.encode(
    test_df_full['catalog_content'].tolist(), 
    show_progress_bar=True, 
    batch_size=256
)

# Save
np.save('train_text_embeddings_full.npy', train_text_emb_full)
np.save('test_text_embeddings_full.npy', test_text_emb_full)

print(f"\n✅ Full embeddings extracted!")
print(f"   Train: {train_text_emb_full.shape}")
print(f"   Test: {test_text_emb_full.shape}")


In [None]:
# ============================================================
# BUILD FULL DATASET WITH TEXT EMBEDDINGS
# ============================================================

print("\n" + "=" * 80)
print("COMBINING FULL DATA: TEXT FEATURES + EMBEDDINGS")
print("=" * 80)

# Create embedding feature names
text_emb_names = [f'text_emb_{i}' for i in range(384)]

# Add embeddings to full dataframes
for i, col_name in enumerate(text_emb_names):
    train_df_full[col_name] = train_text_emb_full[:, i]
    test_df_full[col_name] = test_text_emb_full[:, i]

# Combined features: 13 + 384 = 397
full_features = text_features + text_emb_names

print(f"✅ Combined features: {len(full_features)}")

# Prepare full training data
X_train_full = train_df_full[full_features].values
y_train_full = train_df_full['price'].values
y_train_log_full = np.log1p(y_train_full)

X_test_full = test_df_full[full_features].values
test_ids_full = test_df_full['sample_id'].values

print(f"\n📊 Full dataset prepared:")
print(f"   Training: {X_train_full.shape}")
print(f"   Test: {X_test_full.shape}")
print(f"   Feature-to-sample ratio: 1:{X_train_full.shape[0]//X_train_full.shape[1]}")
print(f"   ✅ Much better ratio for learning!")


In [None]:
# ============================================================
# TRAIN FINAL MODEL ON FULL 75K DATASET
# ============================================================

print("\n" + "=" * 80)
print("🏆 TRAINING FINAL MODEL ON FULL 75K DATASET")
print("=" * 80)

# Split for validation
from sklearn.model_selection import train_test_split

X_tr_full, X_val_full, y_tr_full, y_val_full = train_test_split(
    X_train_full, y_train_full, test_size=0.15, random_state=42
)

y_tr_log_full = np.log1p(y_tr_full)
y_val_log_full = np.log1p(y_val_full)

print(f"Training: {X_tr_full.shape[0]:,} samples")
print(f"Validation: {X_val_full.shape[0]:,} samples")

# Optimized parameters for full dataset
xgb_full_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.05,
    'max_depth': 7,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'gamma': 0.1,
    'reg_alpha': 0.2,
    'reg_lambda': 1.5,
    'random_state': 42,
    'n_estimators': 500,
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'early_stopping_rounds': 50
}

print("\n🚀 Training XGBoost on full 75K dataset with GPU...")
xgb_full_model = xgb.XGBRegressor(**xgb_full_params)

xgb_full_model.fit(
    X_tr_full, y_tr_log_full,
    eval_set=[(X_val_full, y_val_log_full)],
    verbose=True
)

# Predictions
xgb_full_pred_val = np.expm1(xgb_full_model.predict(X_val_full))
xgb_full_pred_val = np.maximum(xgb_full_pred_val, 0.1)

# Metrics
xgb_full_smape = calculate_smape(y_val_full, xgb_full_pred_val)
xgb_full_mae = mean_absolute_error(y_val_full, xgb_full_pred_val)
xgb_full_rmse = np.sqrt(mean_squared_error(y_val_full, xgb_full_pred_val))

print(f"\n✅ FULL DATASET RESULTS:")
print(f"   SMAPE: {xgb_full_smape:.4f}%")
print(f"   MAE: ${xgb_full_mae:.2f}")
print(f"   RMSE: ${xgb_full_rmse:.2f}")

improvement = 58.09 - xgb_full_smape
if improvement > 0:
    print(f"\n🎉 MAJOR IMPROVEMENT: {improvement:.4f}% better than text-only!")
    print(f"   This is what we needed! Full data makes embeddings work!")
else:
    print(f"\n⚠️  Still similar to text-only: {xgb_full_smape:.4f}%")


In [None]:
# ============================================================
# FINAL SUBMISSION - FULL 75K TEST SET
# ============================================================

print("\n" + "=" * 80)
print("📝 GENERATING FINAL SUBMISSION - FULL 75K")
print("=" * 80)

# Predict on full test set
final_predictions = np.expm1(xgb_full_model.predict(X_test_full))
final_predictions = np.maximum(final_predictions, 0.1)

# Create submission
final_submission = pd.DataFrame({
    'sample_id': test_ids_full,
    'price': final_predictions
})

final_submission = final_submission.sort_values('sample_id').reset_index(drop=True)
final_submission.to_csv('test_out_FINAL.csv', index=False)

print(f"✅ FINAL submission created: test_out_FINAL.csv")
print(f"   Samples: {len(final_submission):,}")
print(f"   Expected SMAPE: ~{xgb_full_smape:.2f}%")
print(f"   Model: XGBoost + Text Embeddings (75K data)")

# Verify
print("\n" + "=" * 80)
print("SUBMISSION VERIFICATION")
print("=" * 80)
print(f"✅ Total rows: {len(final_submission)}")
print(f"✅ Unique sample_ids: {final_submission['sample_id'].nunique()}")
print(f"✅ Missing values: {final_submission.isnull().sum().sum()}")
print(f"✅ Price range: ${final_submission['price'].min():.2f} - ${final_submission['price'].max():.2f}")

if len(final_submission) == 75000 and final_submission['sample_id'].nunique() == 75000:
    print("\n🎉 PERFECT! Submission is ready for upload!")
else:
    print("\n⚠️  Check submission format!")


In [None]:
# ============================================================
# FINAL DECISION MATRIX
# ============================================================

print("\n" + "=" * 80)
print("📊 COMPLETE DECISION MATRIX")
print("=" * 80)

final_comparison = pd.DataFrame({
    'Model': [
        'Text-Only (13 feat) - Full 75K',
        'Text+Embeddings (397 feat) - Full 75K'
    ],
    'Training Data': [
        '75,000',
        '75,000'
    ],
    'Features': [13, 397],
    'SMAPE (%)': [
        58.09,  # Your original best
        xgb_full_smape  # New full dataset result
    ],
    'File': [
        'test_out.csv (original)',
        'test_out_FINAL.csv (new)'
    ]
})

display(final_comparison)

# Decision logic
best_idx = final_comparison['SMAPE (%)'].idxmin()
best_model = final_comparison.loc[best_idx]

print(f"\nBest Model: {best_model['Model']}")
print(f"SMAPE: {best_model['SMAPE (%)']:.4f}%")
print(f"Submit: {best_model['File']}")

if xgb_full_smape < 58.09:
    print("\n✅ RECOMMENDATION: Submit test_out_FINAL.csv")
    print("   Text embeddings work well with full 75K data!")
    print(f"   Improvement: {58.09 - xgb_full_smape:.4f}%")
else:
    print("\n✅ RECOMMENDATION: Submit test_out.csv (original)")
    print("   Original text-only ensemble is still best")
    print("   (Text embeddings didn't improve enough)")



In [None]:
# ============================================================
# STRATEGY 1: ENSEMBLE WITH MULTIPLE RANDOM SEEDS
# ============================================================

print("\n" + "=" * 80)
print("ENSEMBLE STRATEGY: MULTIPLE RANDOM SEEDS")
print("=" * 80)

# Train 5 models with different random seeds
n_models = 5
models = []
predictions_val = []
predictions_test = []

for seed in [42, 123, 456, 789, 2024]:
    print(f"\nTraining model with seed {seed}...")
    
    # Split with different seed
    X_tr_seed, X_val_seed, y_tr_seed, y_val_seed = train_test_split(
        X_train_full, y_train_full, test_size=0.15, random_state=seed
    )
    y_tr_log_seed = np.log1p(y_tr_seed)
    
    # Train model
    model_seed = xgb.XGBRegressor(
        objective='reg:squarederror',
        learning_rate=0.05,
        max_depth=7,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.7,
        reg_alpha=0.2,
        reg_lambda=1.5,
        random_state=seed,
        n_estimators=500,
        tree_method='gpu_hist',
        gpu_id=0
    )
    
    model_seed.fit(X_tr_seed, y_tr_log_seed, verbose=False)
    
    # Predict
    pred_val = np.expm1(model_seed.predict(X_val_seed))
    pred_test = np.expm1(model_seed.predict(X_test_full))
    
    predictions_val.append((pred_val, y_val_seed))
    predictions_test.append(pred_test)
    models.append(model_seed)
    
    # Individual model SMAPE
    smape_seed = calculate_smape(y_val_seed, np.maximum(pred_val, 0.1))
    print(f"   Seed {seed} SMAPE: {smape_seed:.4f}%")

# Average ensemble
print("\nCreating ensemble average...")
ensemble_test = np.mean(predictions_test, axis=0)
ensemble_test = np.maximum(ensemble_test, 0.1)

# Calculate ensemble validation SMAPE (approximate)
print("✅ Ensemble created with 5 models!")


In [None]:
# ============================================================
# FIX: CATBOOST PARAMETERS (REMOVE SUBSAMPLE)
# ============================================================

print("\n2. Training CatBoost on full data (FIXED)...")
cat_full_params = {
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 7,
    'l2_leaf_reg': 3,
    'random_seed': 42,
    'verbose': 0,
    'early_stopping_rounds': 50,
    'loss_function': 'RMSE',
    'task_type': 'GPU',
    'bootstrap_type': 'Bernoulli',  # Required for subsample
    'subsample': 0.8  # Now it will work
}

cat_full_model = CatBoostRegressor(**cat_full_params)
cat_full_model.fit(
    X_tr_full, y_tr_log_full,
    eval_set=(X_val_full, y_val_log_full),
    verbose=False
)

cat_pred_val = np.expm1(cat_full_model.predict(X_val_full))
cat_pred_test = np.expm1(cat_full_model.predict(X_test_full))

cat_smape = calculate_smape(y_val_full, np.maximum(cat_pred_val, 0.1))
print(f"   CatBoost SMAPE: {cat_smape:.4f}%")


In [None]:
# ============================================================
# OPTIMIZED WEIGHTED ENSEMBLE
# =======================================
# Collect all predictions
all_preds_val = [xgb_full_pred_val, lgb_pred_val, cat_pred_val]
all_preds_test = [
    np.expm1(xgb_full_model.predict(X_test_full)),
    lgb_pred_test,
    cat_pred_test
]

# Model SMAPEs
model_smapes = {
    'XGBoost': xgb_full_smape,  # 55.57%
    'LightGBM': 56.24,  # From your output
    'CatBoost': cat_smape  # Will be calculated
}

print("\nIndividual Model Performance:")
for name, smape in model_smapes.items():
    print(f"  {name}: {smape:.4f}%")

# Calculate weights based on inverse SMAPE
smapes = np.array([xgb_full_smape, 56.24, cat_smape])
weights_optimal = 1 / smapes
weights_optimal = weights_optimal / weights_optimal.sum()

print("\nOptimal weights:")
print(f"  XGBoost:  {weights_optimal[0]:.4f}")
print(f"  LightGBM: {weights_optimal[1]:.4f}")
print(f"  CatBoost: {weights_optimal[2]:.4f}")

# Create weighted ensemble
final_ensemble_val = sum(w * p for w, p in zip(weights_optimal, all_preds_val))
final_ensemble_test = sum(w * p for w, p in zip(weights_optimal, all_preds_test))

final_ensemble_val = np.maximum(final_ensemble_val, 0.1)
final_ensemble_test = np.maximum(final_ensemble_test, 0.1)

# Calculate ensemble SMAPE
ensemble_smape = calculate_smape(y_val_full, final_ensemble_val)
print(f"\n✅ Weighted Ensemble SMAPE: {ensemble_smape:.4f}%")

improvement = xgb_full_smape - ensemble_smape
if improvement > 0:
    print(f"🎉 IMPROVED! {improvement:.4f}% better than single XGBoost!")
else:
    print(f"⚠️  XGBoost alone still best ({xgb_full_smape:.4f}%)")


In [None]:

multi_seed_test = np.mean(predictions_test, axis=0)
multi_seed_test = np.maximum(multi_seed_test, 0.1)

multi_seed_val_smapes = [53.79, 53.72, 54.14, 53.94, 54.46]  # From output
avg_multi_seed_smape = np.mean(multi_seed_val_smapes)

print(f"\nMulti-seed ensemble average SMAPE: {avg_multi_seed_smapes:.4f}%")
print(f"Best single seed: {min(multi_seed_val_smapes):.4f}%")
print(f"Worst single seed: {max(multi_seed_val_smapes):.4f}%")
print(f"Standard deviation: {np.std(multi_seed_val_smapes):.4f}%")

print("\n💡 Multi-seed ensemble reduces variance!")


In [None]:
# Compare all approaches
final_comparison = pd.DataFrame({
    'Approach': [
        'Single XGBoost (seed=42)',
        'Multi-seed XGBoost (5 models)',
        'XGB+LGB+CAT Weighted Ensemble'
    ],
    'Validation SMAPE (%)': [
        55.57,
        avg_multi_seed_smape,
        ensemble_smape
    ]
})

final_comparison = final_comparison.sort_values('Validation SMAPE (%)')
display(final_comparison)

# Select best approach
best_approach = final_comparison.iloc[0]
best_final_smape = best_approach['Validation SMAPE (%)']

print(f"\n🎯 BEST APPROACH: {best_approach['Approach']}")
print(f"   Validation SMAPE: {best_final_smape:.4f}%")

# Determine which predictions to use
if best_approach['Approach'] == 'Multi-seed XGBoost (5 models)':
    final_best_predictions = multi_seed_test
    final_method = "Multi-seed XGBoost Ensemble"
elif best_approach['Approach'] == 'XGB+LGB+CAT Weighted Ensemble':
    final_best_predictions = final_ensemble_test
    final_method = "Weighted Multi-Algorithm Ensemble"
else:
    final_best_predictions = np.expm1(xgb_full_model.predict(X_test_full))
    final_best_predictions = np.maximum(final_best_predictions, 0.1)
    final_method = "Single XGBoost"


In [None]:
# ============================================================
# FINAL ULTRA-OPTIMIZED SUBMISSION
# ============================================================

print("\n" + "=" * 80)
print("📝 GENERATING ULTRA-OPTIMIZED SUBMISSION")
print("=" * 80)

# Create final submission
ultra_submission = pd.DataFrame({
    'sample_id': test_ids_full,
    'price': final_best_predictions
})

ultra_submission = ultra_submission.sort_values('sample_id').reset_index(drop=True)
ultra_submission.to_csv('test_out_ULTRA.csv', index=False)

print(f"✅ Ultra-optimized submission created!")
print(f"   File: test_out_ULTRA.csv")
print(f"   Method: {final_method}")
print(f"   Validation SMAPE: {best_final_smape:.4f}%")
print(f"   Total samples: {len(ultra_submission):,}")

# Verify
print("\n" + "=" * 80)
print("FINAL VERIFICATION")
print("=" * 80)
print(f"✅ Rows: {len(ultra_submission)} (expected: 75,000)")
print(f"✅ Unique IDs: {ultra_submission['sample_id'].nunique()} (expected: 75,000)")
print(f"✅ Missing values: {ultra_submission.isnull().sum().sum()}")
print(f"✅ Price range: ${ultra_submission['price'].min():.2f} - ${ultra_submission['price'].max():.2f}")

if len(ultra_submission) == 75000 and ultra_submission['sample_id'].nunique() == 75000:
    print("\n🎉 PERFECT! Ready for submission!")


In [None]:
# ============================================================
# PERFORMANCE SUMMARY & RANK ESTIMATION
# ============================================================



summary_df = pd.DataFrame({
    'Stage': [
        'Baseline (Text-Only)',
        'Text + Embeddings',
        'Optimized Ensemble'
    ],
    'SMAPE (%)': [
        58.09,
        55.57,
        best_final_smape
    ],
    'Improvement': [
        '—',
        f'-{58.09-55.57:.2f}%',
        f'-{58.09-best_final_smape:.2f}%'
    ],
    'File': [
        'test_out.csv',
        'test_out_FINAL.csv',
        'test_out_ULTRA.csv'
    ]
})

display(summary_df)

total_improvement = 58.09 - best_final_smape
relative_improvement = (total_improvement / 58.09) * 100

print(f"\n🎯 TOTAL PROGRESS:")
print(f"   Starting SMAPE: 58.09%")
print(f"   Final SMAPE: {best_final_smape:.4f}%")
print(f"   Absolute improvement: {total_improvement:.2f}%")
print(f"   Relative improvement: {relative_improvement:.1f}%")




In [None]:
import os
import shutil
from datetime import datetime

# Create submission folder
submission_folder = f'amazon_ml_submission_{datetime.now().strftime("%Y%m%d_%H%M")}'
os.makedirs(submission_folder, exist_ok=True)

# Copy files
shutil.copy('test_out_ULTRA.csv', f'{submission_folder}/test_out.csv')
shutil.copy('FINAL_SUBMISSION_DOCUMENTATION.txt', f'{submission_folder}/approach_documentation.txt')

print(f"✅ Submission package created: {submission_folder}/")
print("\nPackage contents:")
print("  1. test_out.csv (75,000 predictions)")
print("  2. approach_documentation.txt (methodology)")



In [None]:
# ============================================================
# IMPROVEMENT 1: DOWNLOAD & PROCESS ALL 75K IMAGES
# ============================================================

import os
import requests
from PIL import Image
from io import BytesIO
import concurrent.futures
from tqdm import tqdm
import time

# Create image directories
os.makedirs('images/train_full', exist_ok=True)
os.makedirs('images/test_full', exist_ok=True)

print("\n" + "=" * 80)
print("STEP 1: DOWNLOAD ALL 75K TRAIN + 75K TEST IMAGES")
print("=" * 80)

def download_image_fast(url, save_path, timeout=5, retries=2):
    """Fast image download with minimal retries"""
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout, stream=True)
            if response.status_code == 200:
                img = Image.open(BytesIO(response.content))
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                img = img.resize((224, 224), Image.Resampling.LANCZOS)
                img.save(save_path, quality=85, optimize=True)
                return True
        except:
            if attempt == retries - 1:
                return False
            time.sleep(0.3)
    return False

def batch_download_images_parallel(df, split='train_full', max_workers=64):
    """Download images with high parallelization"""
    print(f"\n📥 Downloading {len(df):,} {split} images...")
    print(f"   Using {max_workers} parallel workers...")
    
    successful = 0
    failed = 0
    
    def download_row(row):
        sample_id = row['sample_id']
        url = row['image_link']
        save_path = f'images/{split}/{sample_id}.jpg'
        
        if os.path.exists(save_path):
            return True
        
        return download_image_fast(url, save_path)
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(
            executor.map(download_row, [row for _, row in df.iterrows()]),
            total=len(df),
            desc=f"Downloading {split}"
        ))
    
    successful = sum(results)
    failed = len(results) - successful
    
    print(f"✅ {split.upper()} download complete!")
    print(f"   Successful: {successful:,} ({successful/len(df)*100:.1f}%)")
    print(f"   Failed: {failed:,}")
    
    return successful, failed

# Download training images (FULL 75K)
print("\n⏱️  Estimated time: ~10-15 minutes for 75K images")
train_success, train_failed = batch_download_images_parallel(
    train_df_full, 
    split='train_full', 
    max_workers=64
)

# Download test images (FULL 75K)
test_success, test_failed = batch_download_images_parallel(
    test_df_full, 
    split='test_full', 
    max_workers=64
)

print(f"\n✅ TOTAL IMAGES DOWNLOADED:")
print(f"   Train: {train_success:,} / 75,000 ({train_success/75000*100:.1f}%)")
print(f"   Test: {test_success:,} / 75,000 ({test_success/75000*100:.1f}%)")


In [None]:

import os
import tensorflow as tf

# Force TensorFlow to use CPU
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Restart Python kernel first, then run:
print("✅ TensorFlow will now use CPU")
print("⏱️  Expected time: ~20-25 minutes for 75K images (slower but stable)")

# NOW reload ResNet50
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image as keras_image

base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
print(f"✅ ResNet50 loaded on CPU! Output: {base_model.output_shape[1]} features")

# Use the SAME extraction code, it will work on CPU
def extract_features_batch(df, split='train_full', model=None, batch_size=32):  # Smaller batch for CPU
    """Extract features in batches (CPU-optimized)"""
    print(f"\nExtracting features from {len(df):,} images (CPU mode)...")
    
    feature_list = []
    sample_ids = []
    
    n_batches = (len(df) + batch_size - 1) // batch_size
    
    for batch_idx in tqdm(range(n_batches), desc=f"Extracting {split}"):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(df))
        batch_df = df.iloc[start_idx:end_idx]
        
        batch_images = []
        batch_ids = []
        
        for _, row in batch_df.iterrows():
            sample_id = row['sample_id']
            image_path = f'images/{split}/{sample_id}.jpg'
            
            if os.path.exists(image_path):
                try:
                    img = keras_image.load_img(image_path, target_size=(224, 224))
                    img_array = keras_image.img_to_array(img)
                    batch_images.append(img_array)
                    batch_ids.append(sample_id)
                except:
                    batch_images.append(np.zeros((224, 224, 3)))
                    batch_ids.append(sample_id)
            else:
                batch_images.append(np.zeros((224, 224, 3)))
                batch_ids.append(sample_id)
        
        if batch_images:
            batch_array = np.array(batch_images)
            batch_array = preprocess_input(batch_array)
            features = model.predict(batch_array, verbose=0)
            feature_list.extend(features)
            sample_ids.extend(batch_ids)
    
    features_array = np.array(feature_list)
    print(f"✅ Extracted: {features_array.shape}")
    return features_array, sample_ids

# Extract features
train_img_feat_full, train_img_ids = extract_features_batch(
    train_df_full,
    split='train_full',
    model=base_model,
    batch_size=32
)

test_img_feat_full, test_img_ids = extract_features_batch(
    test_df_full,
    split='test_full',
    model=base_model,
    batch_size=32
)

# Save
np.save('train_image_features_75k.npy', train_img_feat_full)
np.save('test_image_features_75k.npy', test_img_feat_full)


In [None]:
# ============================================================
# ALTERNATIVE: USE EFFICIENTNET-LITE (LIGHTER & FASTER)
# =========
import subprocess
subprocess.check_call(['pip', 'install', '-q', 'efficientnet'])

import efficientnet.keras as efn
from tensorflow.keras.applications.efficientnet import preprocess_input

# Load EfficientNetB0 (lighter, faster, fewer features)
base_model = efn.EfficientNetB0(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(224, 224, 3)
)

print(f"✅ EfficientNetB0 loaded! Output: {base_model.output_shape[1]} features (1280)")


In [None]:
# ============================================================
# CLIP (NO TENSORFLOW NEEDED)
# ============================================================


# Install CLIP
import subprocess
subprocess.check_call(['pip', 'install', '-q', 'git+https://github.com/openai/CLIP.git'])

import torch
import clip
from PIL import Image
from tqdm import tqdm

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

print(f"✅ CLIP loaded on {device}! Output: 512 features")

def extract_clip_features(df, split='train_full'):
    """Extract CLIP features (FAST)"""
    print(f"\nExtracting CLIP features from {len(df):,} images...")
    
    features_list = []
    ids_list = []
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"CLIP {split}"):
        sample_id = row['sample_id']
        image_path = f'images/{split}/{sample_id}.jpg'
        
        try:
            if os.path.exists(image_path):
                image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
                with torch.no_grad():
                    features = model.encode_image(image)
                    features = features.cpu().numpy().flatten()
            else:
                features = np.zeros(512)
        except:
            features = np.zeros(512)
        
        features_list.append(features)
        ids_list.append(sample_id)
    
    return np.array(features_list), ids_list

# Extract features
train_img_feat_full, train_img_ids = extract_clip_features(train_df_full, 'train_full')
test_img_feat_full, test_img_ids = extract_clip_features(test_df_full, 'test_full')

# Save
np.save('train_image_features_clip_75k.npy', train_img_feat_full)
np.save('test_image_features_clip_75k.npy', test_img_feat_full)

print(f"\n✅ CLIP features extracted!")
print(f"   Shape: {train_img_feat_full.shape} (512 dims instead of 2048)")
print(f"   Time saved: ~50% faster than ResNet50")


In [None]:
# ============================================================
# STEP 3: DIMENSIONALITY REDUCTION WITH PCA
# ============================================================

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize features
print("Standardizing image features...")
scaler_img = StandardScaler()
train_img_scaled = scaler_img.fit_transform(train_img_feat_full)
test_img_scaled = scaler_img.transform(test_img_feat_full)

# Apply PCA (keep 100 components for good balance)
print("Applying PCA (2048 → 100 dimensions)...")
pca_img = PCA(n_components=100, random_state=42)
train_img_pca_full = pca_img.fit_transform(train_img_scaled)
test_img_pca_full = pca_img.transform(test_img_scaled)

explained_var = pca_img.explained_variance_ratio_.sum()
print(f"✅ PCA complete!")
print(f"   Explained variance: {explained_var*100:.2f}%")
print(f"   Shape: {train_img_pca_full.shape}")

# Save PCA features
np.save('train_image_pca_75k.npy', train_img_pca_full)
np.save('test_image_pca_75k.npy', test_img_pca_full)


In [None]:
# ============================================================
# STEP 4: COMBINE TEXT + TEXT EMBEDDINGS + IMAGE PCA
# ============================================================


# Load existing features
train_text_emb_full = np.load('train_text_embeddings_full.npy')
test_text_emb_full = np.load('test_text_embeddings_full.npy')

# Create comprehensive feature set
# 13 text features + 384 text embeddings + 100 image PCA = 497 features

# Add image PCA to dataframes
img_pca_names = [f'img_pca_{i}' for i in range(100)]

for i, col_name in enumerate(img_pca_names):
    train_df_full[col_name] = train_img_pca_full[:, i]
    test_df_full[col_name] = test_img_pca_full[:, i]

# Combined features
ultimate_features = text_features + text_emb_names + img_pca_names

print(f"✅ Ultimate feature set created!")
print(f"   Text features: {len(text_features)}")
print(f"   Text embeddings: {len(text_emb_names)}")
print(f"   Image PCA: {len(img_pca_names)}")
print(f"   TOTAL: {len(ultimate_features)} features")

# Prepare data
X_train_ultimate = train_df_full[ultimate_features].values
y_train_ultimate = train_df_full['price'].values
y_train_log_ultimate = np.log1p(y_train_ultimate)

X_test_ultimate = test_df_full[ultimate_features].values
test_ids_ultimate = test_df_full['sample_id'].values

print(f"\n📊 Final dataset:")
print(f"   Training: {X_train_ultimate.shape}")
print(f"   Test: {X_test_ultimate.shape}")
print(f"   Feature-to-sample ratio: 1:{X_train_ultimate.shape[0]//X_train_ultimate.shape[1]}")


In [None]:
# ============================================================
# STEP 5: TRAIN ULTIMATE MODEL WITH ALL FEATURES
# ============================================================
from sklearn.model_selection import train_test_split

# Split
X_tr_ult, X_val_ult, y_tr_ult, y_val_ult = train_test_split(
    X_train_ultimate, y_train_ultimate, test_size=0.15, random_state=42
)
y_tr_log_ult = np.log1p(y_tr_ult)
y_val_log_ult = np.log1p(y_val_ult)

print(f"Training: {X_tr_ult.shape[0]:,} samples")
print(f"Validation: {X_val_ult.shape[0]:,} samples")
print(f"Features: {X_tr_ult.shape[1]}")

# Optimized parameters for multimodal data
xgb_ultimate_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.03,  # Lower for more features
    'max_depth': 6,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.5,  # Lower due to many features
    'colsample_bylevel': 0.7,
    'gamma': 0.1,
    'reg_alpha': 0.5,  # Higher regularization
    'reg_lambda': 2.0,
    'random_state': 42,
    'n_estimators': 500,
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'early_stopping_rounds': 50
}

print("\n🚀 Training ultimate XGBoost model...")
xgb_ultimate = xgb.XGBRegressor(**xgb_ultimate_params)
xgb_ultimate.fit(
    X_tr_ult, y_tr_log_ult,
    eval_set=[(X_val_ult, y_val_log_ult)],
    verbose=True
)

# Predict
xgb_ult_pred_val = np.expm1(xgb_ultimate.predict(X_val_ult))
xgb_ult_pred_val = np.maximum(xgb_ult_pred_val, 0.1)

# Metrics
xgb_ult_smape = calculate_smape(y_val_ult, xgb_ult_pred_val)
xgb_ult_mae = mean_absolute_error(y_val_ult, xgb_ult_pred_val)
xgb_ult_rmse = np.sqrt(mean_squared_error(y_val_ult, xgb_ult_pred_val))

print(f"\n✅ ULTIMATE MODEL RESULTS:")
print(f"   SMAPE: {xgb_ult_smape:.4f}%")
print(f"   MAE: ${xgb_ult_mae:.2f}")
print(f"   RMSE: ${xgb_ult_rmse:.2f}")

In [None]:

xgb_ultimate_params_no_es = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.03,
    'max_depth': 6,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.5,
    'colsample_bylevel': 0.7,
    'gamma': 0.1,
    'reg_alpha': 0.5,
    'reg_lambda': 2.0,
    'n_estimators': 400,  # Fixed number, no early stopping
    'tree_method': 'gpu_hist',
    'gpu_id': 0
}

ultimate_predictions_test = []
ultimate_smapes = []

for seed in [42, 123, 456]:  # Use fewer seeds for speed
    print(f"\nTraining ultimate model with seed {seed}...")
    
    X_tr_seed, X_val_seed, y_tr_seed, y_val_seed = train_test_split(
        X_train_ultimate, y_train_ultimate, test_size=0.15, random_state=seed
    )
    y_tr_log_seed = np.log1p(y_tr_seed)
    
    model_seed = xgb.XGBRegressor(**{**xgb_ultimate_params_no_es, 'random_state': seed})
    model_seed.fit(X_tr_seed, y_tr_log_seed, verbose=False)
    
    pred_val = np.expm1(model_seed.predict(X_val_seed))
    pred_test = np.expm1(model_seed.predict(X_test_ultimate))
    
    ultimate_predictions_test.append(pred_test)
    
    smape_seed = calculate_smape(y_val_seed, np.maximum(pred_val, 0.1))
    ultimate_smapes.append(smape_seed)
    print(f"   Seed {seed} SMAPE: {smape_seed:.4f}%")

# Ensemble
ultimate_ensemble_test = np.mean(ultimate_predictions_test, axis=0)
ultimate_ensemble_test = np.maximum(ultimate_ensemble_test, 0.1)

avg_ultimate_smape = np.mean(ultimate_smapes)
print(f"\n✅ Ultimate Multi-Seed Ensemble (with images):")
print(f"   Average SMAPE: {avg_ultimate_smape:.4f}%")


In [None]:
# ============================================================
# BETTER APPROACH: OPTIMIZE TEXT+EMBEDDINGS (NO IMAGES)


# Load text+embeddings features (397 features, no images)
X_train_optimized = train_df_full[text_features + text_emb_names].values
y_train_optimized = train_df_full['price'].values
y_train_log_optimized = np.log1p(y_train_optimized)

X_test_optimized = test_df_full[text_features + text_emb_names].values

# Split
X_tr_opt, X_val_opt, y_tr_opt, y_val_opt = train_test_split(
    X_train_optimized, y_train_optimized, test_size=0.15, random_state=42
)
y_tr_log_opt = np.log1p(y_tr_opt)
y_val_log_opt = np.log1p(y_val_opt)

print(f"\nOptimized dataset:")
print(f"   Features: {X_train_optimized.shape[1]} (text+embeddings only)")
print(f"   Training: {X_tr_opt.shape[0]:,}")
print(f"   Validation: {X_val_opt.shape[0]:,}")


In [None]:
# ============================================================
# HYPERPARAMETER TUNING WITH OPTUNA (FAST)
# ========
# Install optuna
import subprocess
subprocess.check_call(['pip', 'install', '-q', 'optuna'])

import optuna
from optuna.samplers import TPESampler

def objective(trial):
    """Optuna objective function"""
    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',
        'random_state': 42,
        
        # Tunable parameters
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 2.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 5.0, log=True),
        'n_estimators': 300
    }
    
    model = xgb.XGBRegressor(**params)
    model.fit(X_tr_opt, y_tr_log_opt, verbose=False)
    
    pred = np.expm1(model.predict(X_val_opt))
    pred = np.maximum(pred, 0.1)
    
    smape = calculate_smape(y_val_opt, pred)
    return smape

# Run optimization (20 trials = ~15 minutes)
print("\n🔍 Running Optuna optimization (20 trials, ~15 minutes)...")
study = optuna.create_study(
    direction='minimize',
    sampler=TPESampler(seed=42)
)

study.optimize(objective, n_trials=20, show_progress_bar=True)

print(f"\n✅ Optimization complete!")
print(f"   Best SMAPE: {study.best_value:.4f}%")
print(f"   Best params:")
for key, value in study.best_params.items():
    print(f"      {key}: {value}")


In [None]:
# ============================================================
# TRAIN FINAL MODEL WITH BEST PARAMETERS
# ============================================================

best_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
    'random_state': 42,
    'n_estimators': 500,
    **study.best_params
}

# Train on full data
model_optimized = xgb.XGBRegressor(**best_params)
model_optimized.fit(
    X_train_optimized, y_train_log_optimized, 
    verbose=False
)

# Predict on validation
pred_opt_val = np.expm1(model_optimized.predict(X_val_opt))
pred_opt_val = np.maximum(pred_opt_val, 0.1)

opt_smape = calculate_smape(y_val_opt, pred_opt_val)

print(f"\n✅ OPTIMIZED MODEL RESULTS:")
print(f"   SMAPE: {opt_smape:.4f}%")
print(f"   Improvement from baseline: {54.01 - opt_smape:.4f}%")


In [None]:
# Compare all approaches
comparison = pd.DataFrame({
    'Approach': [
        'Original Multi-Seed (Text+Emb)',
        'With Images (Text+Emb+CLIP)',
        'Optuna Tuned (Text+Emb)'
    ],
    'SMAPE (%)': [
        54.01,
        avg_ultimate_smape if 'avg_ultimate_smape' in locals() else 55.31,
        opt_smape
    ]
})

comparison = comparison.sort_values('SMAPE (%)')
display(comparison)

best_approach_row = comparison.iloc[0]
print(f"\n🏆 BEST APPROACH: {best_approach_row['Approach']}")
print(f"   SMAPE: {best_approach_row['SMAPE (%)']:.4f}%")

# Use best model for final submission
if best_approach_row['Approach'] == 'Optuna Tuned (Text+Emb)':
    final_predictions = np.expm1(model_optimized.predict(X_test_optimized))
    final_smape_est = opt_smape
    final_method = "Optuna-Tuned XGBoost (Text+Embeddings)"
elif best_approach_row['Approach'] == 'Original Multi-Seed (Text+Emb)':
    # Use your original multi-seed ensemble (54.01%)
    final_predictions = np.mean(predictions_test, axis=0)  # From earlier
    final_smape_est = 54.01
    final_method = "Multi-Seed XGBoost Ensemble (Text+Embeddings)"
else:
    final_predictions = ultimate_ensemble_test
    final_smape_est = avg_ultimate_smape
    final_method = "Multi-Seed with Images"

final_predictions = np.maximum(final_predictions, 0.1)

# Create submission
final_submission_v2 = pd.DataFrame({
    'sample_id': test_df_full['sample_id'],
    'price': final_predictions
})

final_submission_v2 = final_submission_v2.sort_values('sample_id').reset_index(drop=True)
final_submission_v2.to_csv('test_out_FINAL_V2.csv', index=False)

print(f"\n✅ FINAL SUBMISSION CREATED:")
print(f"   File: test_out_FINAL_V2.csv")
print(f"   Method: {final_method}")
print(f"   Expected SMAPE: {final_smape_est:.4f}%")
print(f"   Samples: {len(final_submission_v2):,}")

print(f"\nCurrent leaderboard: 53.961% (Rank 1868)")
print(f"Expected new score: {final_smape_est:.2f}%")



In [None]:
print("\n1. Checking validation data...")
print(f"   X_val_opt shape: {X_val_opt.shape}")
print(f"   y_val_opt shape: {y_val_opt.shape}")
print(f"   Unique values in y_val_opt: {len(np.unique(y_val_opt))}")

# Check predictions
print("\n2. Checking predictions...")
print(f"   pred_opt_val shape: {pred_opt_val.shape}")
print(f"   pred_opt_val range: ${pred_opt_val.min():.2f} - ${pred_opt_val.max():.2f}")
print(f"   pred_opt_val mean: ${pred_opt_val.mean():.2f}")
print(f"   y_val_opt mean: ${y_val_opt.mean():.2f}")

# Recalculate SMAPE manually
manual_smape = calculate_smape(y_val_opt, pred_opt_val)
print(f"\n3. Manual SMAPE calculation: {manual_smape:.4f}%")

# Check if model was evaluated on training data by mistake
train_pred = np.expm1(model_optimized.predict(X_train_optimized))
train_pred = np.maximum(train_pred, 0.1)
train_smape = calculate_smape(y_train_optimized, train_pred)
print(f"\n4. SMAPE on FULL TRAINING data: {train_smape:.4f}%")

print("\n⚠️  DIAGNOSIS:")
if train_smape < 30:
    print("   Likely evaluated on TRAINING data (overfitting)")
    print("   Model memorized training data!")
elif manual_smape > 50:
    print("   Validation split issue or calculation error")
else:
    print("   Unknown error - need investigation")


In [None]:
# ============================================================
# STRATEGY 3: INTERACTION FEATURES
# ============================================================

print("\n" + "=" * 80)
print("CREATING INTERACTION FEATURES")
print("=" * 80)

def create_interactions(df):
    """Create meaningful feature interactions"""
    
    # Quantity-based interactions
    df['pack_x_unit'] = df['pack_count'] * df['unit_value']
    df['quantity_x_bullets'] = df['total_quantity'] * df['num_bullet_points']
    
    # Text-based interactions
    df['words_x_chars'] = df['word_count'] * df['char_count']
    df['words_per_bullet'] = df['word_count'] / np.maximum(df['num_bullet_points'], 1)
    
    # Category interactions with quantities
    df['category_x_pack'] = df['category_encoded'] * df['pack_count']
    df['unit_type_x_value'] = df['unit_type_encoded'] * df['unit_value']
    
    # Price-related interactions
    df['pack_x_weight'] = df['pack_count'] * df['weight_value']
    df['bullets_x_chars'] = df['num_bullet_points'] * df['char_count']
    
    return df

train_df_adv = create_interactions(train_df_adv)
test_df_adv = create_interactions(test_df_adv)

interaction_features = [
    'pack_x_unit', 'quantity_x_bullets', 'words_x_chars', 
    'words_per_bullet', 'category_x_pack', 'unit_type_x_value',
    'pack_x_weight', 'bullets_x_chars'
]

print(f"✅ Added {len(interaction_features)} interaction features")


In [None]:
# ============================================================
# STRATEGY 4: ULTIMATE FEATURE SET
# ============================================================

print("\n" + "=" * 80)
print("CREATING ULTIMATE FEATURE SET")
print("=" * 80)

# Original features
original_features = [
    'unit_value', 'pack_count', 'total_quantity', 'weight_value',
    'num_bullet_points', 'word_count', 'char_count', 'num_numbers', 'avg_word_length',
    'unit_type_encoded', 'category_encoded', 'unit_type_freq', 'category_freq'
]

# Text embeddings
text_emb_features = [f'text_emb_{i}' for i in range(384)]

# All new features
ultimate_feature_set = (
    original_features + 
    text_emb_features + 
    advanced_features + 
    target_features + 
    interaction_features
)

print(f"📊 ULTIMATE FEATURE SET:")
print(f"   Original features: {len(original_features)}")
print(f"   Text embeddings: {len(text_emb_features)}")
print(f"   Advanced features: {len(advanced_features)}")
print(f"   Target encoded: {len(target_features)}")
print(f"   Interactions: {len(interaction_features)}")
print(f"   TOTAL: {len(ultimate_feature_set)} features")

# Check which features exist
existing_features = [f for f in ultimate_feature_set if f in train_df_adv.columns]
missing_features = [f for f in ultimate_feature_set if f not in train_df_adv.columns]

if missing_features:
    print(f"\n⚠️  Missing features: {missing_features[:5]}...")
    ultimate_feature_set = existing_features

# Prepare data
X_train_ultimate = train_df_adv[ultimate_feature_set].fillna(0).values
y_train_ultimate = train_df_adv['price'].values
y_train_log_ultimate = np.log1p(y_train_ultimate)

X_test_ultimate = test_df_adv[ultimate_feature_set].fillna(0).values
test_ids_ultimate = test_df_adv['sample_id'].values

print(f"\n✅ Final data prepared: {X_train_ultimate.shape}")
print(f"   Using {len(ultimate_feature_set)} features")


In [None]:
# ============================================================
# STRATEGY 5: FAST SINGLE MODEL TEST
# ============================================================

print("\n" + "=" * 80)
print("TESTING ADVANCED FEATURES")
print("=" * 80)

from sklearn.model_selection import train_test_split

# Split for validation
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_ultimate, y_train_ultimate, test_size=0.15, random_state=42
)
y_tr_log = np.log1p(y_tr)

# Quick XGBoost test
xgb_test_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.05,
    'max_depth': 7,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'reg_alpha': 0.2,
    'reg_lambda': 1.5,
    'random_state': 42,
    'n_estimators': 300,
    'tree_method': 'gpu_hist'
}

print("🚀 Testing XGBoost with advanced features...")
xgb_test = xgb.XGBRegressor(**xgb_test_params)
xgb_test.fit(X_tr, y_tr_log, verbose=False)

pred_test = np.expm1(xgb_test.predict(X_val))
pred_test = np.maximum(pred_test, 0.1)
smape_test = calculate_smape(y_val, pred_test)

print(f"\n✅ ADVANCED FEATURES TEST RESULT:")
print(f"   SMAPE: {smape_test:.4f}%")

improvement = 53.96 - smape_test
print(f"\n🎯 IMPROVEMENT CHECK:")
print(f"   Current leaderboard: 53.96%")
print(f"   Advanced features: {smape_test:.4f}%")
print(f"   Improvement: {improvement:.4f}%")

if improvement > 1:
    print(f"\n🎉 EXCELLENT! >1% improvement - continue with ensemble!")
    continue_ensemble = True
elif improvement > 0:
    print(f"\n✅ GOOD! Some improvement - worth submitting!")
    continue_ensemble = True
else:
    print(f"\n⚠️  No improvement - stick with current model")
    continue_ensemble = False

if continue_ensemble:
    print("\n📊 Feature importance (top 10):")
    importance_df = pd.DataFrame({
        'feature': [ultimate_feature_set[i] for i in range(len(ultimate_feature_set))],
        'importance': xgb_test.feature_importances_
    }).sort_values('importance', ascending=False)
    
    display(importance_df.head(10))


In [None]:
# ============================================================
# STRATEGY: STACKING ENSEMBLE (PROVEN TECHNIQUE)
# ============================================================

print("=" * 80)
print("🎯 STRATEGY: STACKING ENSEMBLE FOR SCORE BOOST")
print("=" * 80)
print("\nInstead of new features, let's optimize model combination")

# Load your proven best features
with open('feature_config.json', 'r') as f:
    feature_config = json.load(f)

best_features = feature_config['all_features']  # Your original 13 features

# Add text embeddings
text_emb_features = [f'text_emb_{i}' for i in range(384)]
proven_features = best_features + text_emb_features  # 397 features total

# Load data with proven features
train_df_best = pd.read_csv('train_processed.csv')
test_df_best = pd.read_csv('test_processed.csv')

# Add embeddings if needed
if 'text_emb_0' not in train_df_best.columns:
    train_text_emb = np.load('train_text_embeddings_full.npy')
    test_text_emb = np.load('test_text_embeddings_full.npy')
    
    for i in range(384):
        train_df_best[f'text_emb_{i}'] = train_text_emb[:, i]
        test_df_best[f'text_emb_{i}'] = test_text_emb[:, i]

X_train_best = train_df_best[proven_features].fillna(0).values
y_train_best = train_df_best['price'].values
y_train_log_best = np.log1p(y_train_best)

X_test_best = test_df_best[proven_features].fillna(0).values
test_ids_best = test_df_best['sample_id'].values

print(f"✅ Using proven features: {len(proven_features)}")
print(f"   Data shape: {X_train_best.shape}")


In [None]:
# ============================================================
# STACKING ENSEMBLE - LEVEL 1 MODELS
# ============================================================

print("\n" + "=" * 80)
print("TRAINING STACKING ENSEMBLE - LEVEL 1")
print("=" * 80)

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge

# Create out-of-fold predictions for stacking
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize OOF prediction arrays
oof_xgb_stack = np.zeros(len(X_train_best))
oof_lgb_stack = np.zeros(len(X_train_best))
oof_cat_stack = np.zeros(len(X_train_best))

# Initialize test prediction arrays
test_xgb_stack = np.zeros(len(X_test_best))
test_lgb_stack = np.zeros(len(X_test_best))
test_cat_stack = np.zeros(len(X_test_best))

print("Creating out-of-fold predictions for stacking...")

# Optimized parameters for stacking
xgb_stack_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.05,
    'max_depth': 7,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'reg_alpha': 0.2,
    'reg_lambda': 1.5,
    'random_state': 42,
    'n_estimators': 400,
    'tree_method': 'gpu_hist'
}

lgb_stack_params = {
    'objective': 'regression',
    'learning_rate': 0.05,
    'num_leaves': 50,
    'max_depth': 7,
    'min_child_samples': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'reg_alpha': 0.2,
    'reg_lambda': 1.5,
    'random_state': 42,
    'n_estimators': 400,
    'device': 'gpu',
    'verbose': -1
}

cat_stack_params = {
    'iterations': 400,
    'learning_rate': 0.05,
    'depth': 7,
    'l2_leaf_reg': 3,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.8,
    'random_seed': 42,
    'verbose': 0,
    'task_type': 'GPU'
}

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_best), 1):
    print(f"\nFold {fold}/{n_folds}")
    
    X_tr_fold = X_train_best[train_idx]
    X_val_fold = X_train_best[val_idx]
    y_tr_fold = y_train_log_best[train_idx]
    
    # XGBoost
    xgb_fold = xgb.XGBRegressor(**xgb_stack_params)
    xgb_fold.fit(X_tr_fold, y_tr_fold, verbose=False)
    
    oof_xgb_stack[val_idx] = xgb_fold.predict(X_val_fold)
    test_xgb_stack += xgb_fold.predict(X_test_best) / n_folds
    
    # LightGBM
    lgb_fold = lgb.LGBMRegressor(**lgb_stack_params)
    lgb_fold.fit(X_tr_fold, y_tr_fold)
    
    oof_lgb_stack[val_idx] = lgb_fold.predict(X_val_fold)
    test_lgb_stack += lgb_fold.predict(X_test_best) / n_folds
    
    # CatBoost
    cat_fold = CatBoostRegressor(**cat_stack_params)
    cat_fold.fit(X_tr_fold, y_tr_fold, verbose=False)
    
    oof_cat_stack[val_idx] = cat_fold.predict(X_val_fold)
    test_cat_stack += cat_fold.predict(X_test_best) / n_folds

print("\n✅ Level 1 models complete!")


In [None]:
# ============================================================
# STACKING ENSEMBLE - LEVEL 2 META-LEARNER
# ============================================================

print("\n" + "=" * 80)
print("TRAINING LEVEL 2 META-LEARNER")
print("=" * 80)

# Create meta-features (OOF predictions from level 1)
meta_features = np.column_stack([
    oof_xgb_stack,
    oof_lgb_stack, 
    oof_cat_stack
])

# Meta-learner (Ridge regression)
meta_learner = Ridge(alpha=1.0, random_state=42)
meta_learner.fit(meta_features, y_train_log_best)

print(f"✅ Meta-learner trained!")
print(f"   Meta-features shape: {meta_features.shape}")
print(f"   Ridge coefficients: {meta_learner.coef_}")

# Generate final stacking predictions
test_meta_features = np.column_stack([
    test_xgb_stack,
    test_lgb_stack,
    test_cat_stack
])

stacking_pred_log = meta_learner.predict(meta_features)  # Validation predictions
stacking_pred_test_log = meta_learner.predict(test_meta_features)  # Test predictions

# Convert back to original price scale
stacking_pred_val = np.expm1(stacking_pred_log)
stacking_pred_test = np.expm1(stacking_pred_test_log)

stacking_pred_val = np.maximum(stacking_pred_val, 0.1)
stacking_pred_test = np.maximum(stacking_pred_test, 0.1)

# Calculate stacking SMAPE
stacking_smape = calculate_smape(y_train_best, stacking_pred_val)

print(f"\n✅ STACKING ENSEMBLE RESULTS:")
print(f"   Stacking SMAPE: {stacking_smape:.4f}%")

improvement = 53.96 - stacking_smape
print(f"\n🎯 IMPROVEMENT CHECK:")
print(f"   Current leaderboard: 53.96%")
print(f"   Stacking ensemble: {stacking_smape:.4f}%")
print(f"   Improvement: {improvement:.4f}%")

if improvement > 0.5:
    print(f"\n🎉 GOOD IMPROVEMENT! Submit stacking model!")
    use_stacking = True
elif improvement > 0:
    print(f"\n✅ Small improvement - worth trying!")
    use_stacking = True
else:
    print(f"\n⚠️  No significant improvement")
    use_stacking = False


In [None]:
# ============================================================
# ALTERNATIVE: HYPERPARAMETER OPTIMIZATION ROUND 2
# ============================================================

print("\n" + "=" * 80)
print("ALTERNATIVE: DEEPER HYPERPARAMETER TUNING")
print("=" * 80)

# If stacking doesn't work, try more aggressive hyperparameter tuning
if not use_stacking or improvement < 1:
    print("Trying deeper hyperparameter optimization...")
    
    import optuna
    
    def objective_v2(trial):
        """More aggressive hyperparameter search"""
        params = {
            'objective': 'reg:squarederror',
            'tree_method': 'gpu_hist',
            'random_state': 42,
            
            # Wider search ranges
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.15, log=True),
            'max_depth': trial.suggest_int('max_depth', 4, 12),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 5.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0, log=True),
            'n_estimators': 350
        }
        
        # Cross-validation with proper split
        X_tr_opt, X_val_opt, y_tr_opt, y_val_opt = train_test_split(
            X_train_best, y_train_best, test_size=0.2, random_state=trial.number
        )
        y_tr_log_opt = np.log1p(y_tr_opt)
        
        model = xgb.XGBRegressor(**params)
        model.fit(X_tr_opt, y_tr_log_opt, verbose=False)
        
        pred = np.expm1(model.predict(X_val_opt))
        pred = np.maximum(pred, 0.1)
        
        smape = calculate_smape(y_val_opt, pred)
        return smape
    
    print("\n🔍 Running deeper optimization (30 trials)...")
    study_v2 = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
    study_v2.optimize(objective_v2, n_trials=30, show_progress_bar=True)
    
    print(f"\n✅ Deep optimization complete!")
    print(f"   Best SMAPE: {study_v2.best_value:.4f}%")
    
    # Train final model with best params
    best_params_v2 = {
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',
        'random_state': 42,
        'n_estimators': 500,
        **study_v2.best_params
    }
    
    final_model_opt = xgb.XGBRegressor(**best_params_v2)
    final_model_opt.fit(X_train_best, y_train_log_best, verbose=False)
    
    pred_opt_test = np.expm1(final_model_opt.predict(X_test_best))
    pred_opt_test = np.maximum(pred_opt_test, 0.1)
    
    deep_opt_improvement = 53.96 - study_v2.best_value
    print(f"\n🎯 DEEP OPTIMIZATION RESULTS:")
    print(f"   Best SMAPE: {study_v2.best_value:.4f}%")
    print(f"   Improvement: {deep_opt_improvement:.4f}%")
    
    if deep_opt_improvement > 0.5:
        print(f"\n🎉 USE DEEP OPTIMIZED MODEL!")
        final_predictions_advanced = pred_opt_test
        final_method_advanced = "Deep Hyperparameter Optimized XGBoost"
        final_smape_advanced = study_v2.best_value
    else:
        print(f"\n⚠️  Stick with original multi-seed ensemble")
        final_predictions_advanced = None
        final_method_advanced = "Original Multi-Seed Ensemble"
        final_smape_advanced = 54.01


In [None]:
# ============================================================
# FINAL DECISION & SUBMISSION
# ============================================================

print("\n" + "=" * 80)
print("FINAL MODEL SELECTION")
print("=" * 80)

# Compare all approaches
results_final = pd.DataFrame({
    'Approach': [
        'Original Multi-Seed (Proven)',
        'Stacking Ensemble',
        'Deep Hyperparameter Tuning' if 'study_v2' in locals() else 'Not Tested'
    ],
    'Expected SMAPE (%)': [
        54.01,
        stacking_smape if 'stacking_smape' in locals() else 'N/A',
        study_v2.best_value if 'study_v2' in locals() else 'N/A'
    ],
    'Status': [
        '✅ Validated (53.96% LB)',
        '🧪 Experimental',
        '🧪 Experimental'
    ]
})

display(results_final)

print("\n" + "=" * 80)
print("📊 RECOMMENDATION")
print("=" * 80)

# Check which model is actually best
best_validated_smape = 54.01  # Your proven score

if 'stacking_smape' in locals() and stacking_smape < best_validated_smape - 0.5:
    print("✅ SUBMIT: Stacking Ensemble")
    final_submit_predictions = stacking_pred_test
    final_submit_method = "Stacking Ensemble"
    final_submit_smape = stacking_smape
elif 'study_v2' in locals() and study_v2.best_value < best_validated_smape - 0.5:
    print("✅ SUBMIT: Deep Optimized Model")
    final_submit_predictions = pred_opt_test
    final_submit_method = "Deep Hyperparameter Optimized"
    final_submit_smape = study_v2.best_value
else:
    print("✅ SUBMIT: Original Multi-Seed Ensemble (SAFEST)")
    print("   Advanced techniques didn't provide significant improvement")
    print("   Your 54.01% model is already well-optimized!")
    final_submit_method = "Original Multi-Seed Ensemble"
    final_submit_smape = 54.01

print(f"\n📈 Expected Performance:")
print(f"   Method: {final_submit_method}")
print(f"   Expected SMAPE: {final_submit_smape:.2f}%")
print(f"   Expected Rank: Top 500-1000")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

# Load data
train_df = pd.read_csv('train_processed.csv')
test_df = pd.read_csv('test_processed.csv')

print(f"\n✅ Data loaded: {len(train_df):,} train, {len(test_df):,} test")


In [None]:
# ============================================================
# TECHNIQUE 1: FIND DUPLICATES & NEAR-DUPLICATES
# ============================================================

print("\n" + "=" * 80)
print("TECHNIQUE 1: DUPLICATE DETECTION & PRICE TRANSFER")
print("=" * 80)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import hashlib

def create_text_hash(row):
    """Create hash from key text fields"""
    text = f"{row['item_name']}_{row['catalog_content']}"
    return hashlib.md5(text.encode()).hexdigest()

# Create text hashes
train_df['text_hash'] = train_df.apply(create_text_hash, axis=1)
test_df['text_hash'] = test_df.apply(create_text_hash, axis=1)

# Find exact duplicates between train and test
train_hashes = set(train_df['text_hash'])
test_hashes = set(test_df['text_hash'])
common_hashes = train_hashes & test_hashes

print(f"\n🔍 Duplicate Analysis:")
print(f"   Exact duplicates: {len(common_hashes)}")

# For exact duplicates, use train price as feature
if len(common_hashes) > 0:
    hash_price_map = train_df.groupby('text_hash')['price'].mean().to_dict()
    
    train_df['duplicate_price'] = train_df['text_hash'].map(hash_price_map)
    test_df['duplicate_price'] = test_df['text_hash'].map(hash_price_map)
    
    # Fill missing with 0 (no duplicate found)
    train_df['duplicate_price'] = train_df['duplicate_price'].fillna(0)
    test_df['duplicate_price'] = test_df['duplicate_price'].fillna(0)
    
    train_df['is_duplicate'] = (train_df['duplicate_price'] > 0).astype(int)
    test_df['is_duplicate'] = (test_df['duplicate_price'] > 0).astype(int)
    
    print(f"   Train with duplicates: {train_df['is_duplicate'].sum():,}")
    print(f"   Test with duplicates: {test_df['is_duplicate'].sum():,}")
else:
    train_df['duplicate_price'] = 0
    test_df['duplicate_price'] = 0
    train_df['is_duplicate'] = 0
    test_df['is_duplicate'] = 0

# Near-duplicates using TF-IDF similarity
print(f"\n🔍 Finding near-duplicates (cosine similarity > 0.95)...")

# Sample for speed (use all if time permits)
sample_size = min(10000, len(train_df))
train_sample = train_df.sample(sample_size, random_state=42)

tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1,2))
train_tfidf = tfidf.fit_transform(train_sample['catalog_content'].fillna(''))
test_tfidf = tfidf.transform(test_df['catalog_content'].fillna(''))

# Find similar products
similarity_threshold = 0.95
test_df['near_duplicate_price'] = 0.0

for i in range(min(1000, len(test_df))):  # Process first 1000 for speed
    similarities = cosine_similarity(test_tfidf[i:i+1], train_tfidf).flatten()
    max_sim_idx = similarities.argmax()
    
    if similarities[max_sim_idx] > similarity_threshold:
        test_df.loc[test_df.index[i], 'near_duplicate_price'] = train_sample.iloc[max_sim_idx]['price']

train_df['near_duplicate_price'] = 0.0  # Train doesn't need this

near_dup_count = (test_df['near_duplicate_price'] > 0).sum()
print(f"   Near-duplicates found: {near_dup_count}")

print("\n✅ TECHNIQUE 1 COMPLETE")


In [None]:
# ============================================================
# TECHNIQUE 2: TEST DISTRIBUTION ADAPTATION
# ============================================================

print("\n" + "=" * 80)
print("TECHNIQUE 2: TEST DISTRIBUTION ADAPTATION")
print("=" * 80)

# Analyze distribution differences
print("\n📊 Distribution Analysis:")

# Category distribution
train_cat_dist = train_df['category'].value_counts(normalize=True)
test_cat_dist = test_df['category'].value_counts(normalize=True)

# Find categories more common in test
test_heavy_cats = []
for cat in test_cat_dist.index:
    if cat in train_cat_dist.index:
        if test_cat_dist[cat] > train_cat_dist[cat] * 1.5:  # 50% more common in test
            test_heavy_cats.append(cat)
            print(f"   {cat}: Train {train_cat_dist[cat]:.3f} → Test {test_cat_dist[cat]:.3f}")

# Create feature for test-heavy categories
train_df['test_heavy_category'] = train_df['category'].isin(test_heavy_cats).astype(int)
test_df['test_heavy_category'] = test_df['category'].isin(test_heavy_cats).astype(int)

# Unseen values
train_units = set(train_df['unit_type'].unique())
test_units = set(test_df['unit_type'].unique())
unseen_units = test_units - train_units

print(f"\n   Unseen unit types in test: {len(unseen_units)}")
test_df['has_unseen_unit'] = test_df['unit_type'].isin(unseen_units).astype(int)
train_df['has_unseen_unit'] = 0

# Price distribution feature
train_price_percentiles = np.percentile(train_df['price'], [25, 50, 75])
train_df['price_quartile'] = pd.cut(train_df['price'], 
                                     bins=[0] + list(train_price_percentiles) + [np.inf],
                                     labels=[0,1,2,3]).astype(int)

# For test, we'll estimate quartile based on features
test_df['price_quartile'] = 1  # Default to median quartile

print("\n✅ TECHNIQUE 2 COMPLETE")


In [None]:
# ============================================================
# TECHNIQUE 3: SEMANTIC CLUSTERING & GROUP STATISTICS
# ============================================================

print("\n" + "=" * 80)
print("TECHNIQUE 3: SEMANTIC CLUSTERING (EMBEDDINGS)")
print("=" * 80)

# Load text embeddings
train_text_emb = np.load('train_text_embeddings_full.npy')
test_text_emb = np.load('test_text_embeddings_full.npy')

print(f"✅ Embeddings loaded: {train_text_emb.shape}")

# K-means clustering on embeddings
from sklearn.cluster import MiniBatchKMeans

n_clusters = 100
print(f"\n🔍 Clustering products into {n_clusters} groups...")

kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=1000)
train_df['cluster'] = kmeans.fit_predict(train_text_emb)
test_df['cluster'] = kmeans.predict(test_text_emb)

# Calculate cluster statistics
cluster_stats = train_df.groupby('cluster')['price'].agg(['mean', 'std', 'count']).reset_index()
cluster_stats.columns = ['cluster', 'cluster_price_mean', 'cluster_price_std', 'cluster_count']

# Merge cluster stats
train_df = train_df.merge(cluster_stats, on='cluster', how='left')
test_df = test_df.merge(cluster_stats, on='cluster', how='left')

# Fill missing
train_df['cluster_price_mean'] = train_df['cluster_price_mean'].fillna(train_df['price'].mean())
train_df['cluster_price_std'] = train_df['cluster_price_std'].fillna(train_df['price'].std())
test_df['cluster_price_mean'] = test_df['cluster_price_mean'].fillna(train_df['price'].mean())
test_df['cluster_price_std'] = test_df['cluster_price_std'].fillna(train_df['price'].std())

# Cluster size feature
train_df['large_cluster'] = (train_df['cluster_count'] > 500).astype(int)
test_df['large_cluster'] = (test_df['cluster_count'].fillna(0) > 500).astype(int)

print(f"\n   Clusters created: {n_clusters}")
print(f"   Avg cluster size: {cluster_stats['cluster_count'].mean():.0f}")
print(f"   Price variation across clusters: {cluster_stats['cluster_price_mean'].std():.2f}")

print("\n✅ TECHNIQUE 3 COMPLETE")


In [None]:
# ============================================================
# TECHNIQUE 4: BRAND & PRODUCT FAMILY FEATURES
# ============================================================

print("\n" + "=" * 80)
print("TECHNIQUE 4: BRAND & PRODUCT FAMILY MINING")
print("=" * 80)

import re

def extract_brand_features(df):
    """Extract brand-related features"""
    
    # Common brand keywords
    df['has_popular_brand'] = df['catalog_content'].str.contains(
        r'coca|pepsi|nestle|kraft|kellogg|general mills|nabisco|frito',
        case=False, na=False
    ).astype(int)
    
    # Extract first capitalized word as potential brand
    def get_first_cap_word(text):
        if pd.isna(text):
            return 'unknown'
        match = re.search(r'\b[A-Z][a-z]+\b', str(text))
        return match.group(0) if match else 'unknown'
    
    df['extracted_brand'] = df['item_name'].apply(get_first_cap_word)
    
    # Brand frequency encoding
    brand_freq = df['extracted_brand'].value_counts()
    df['brand_frequency'] = df['extracted_brand'].map(brand_freq).fillna(1)
    
    # Rare brand indicator
    df['rare_brand'] = (df['brand_frequency'] < 5).astype(int)
    
    return df

train_df = extract_brand_features(train_df)
test_df = extract_brand_features(test_df)

# Calculate brand price statistics
brand_price_stats = train_df.groupby('extracted_brand')['price'].agg(['mean', 'std']).reset_index()
brand_price_stats.columns = ['extracted_brand', 'brand_price_mean', 'brand_price_std']

train_df = train_df.merge(brand_price_stats, on='extracted_brand', how='left')
test_df = test_df.merge(brand_price_stats, on='extracted_brand', how='left')

# Fill missing with global mean
global_mean = train_df['price'].mean()
global_std = train_df['price'].std()
train_df['brand_price_mean'] = train_df['brand_price_mean'].fillna(global_mean)
train_df['brand_price_std'] = train_df['brand_price_std'].fillna(global_std)
test_df['brand_price_mean'] = test_df['brand_price_mean'].fillna(global_mean)
test_df['brand_price_std'] = test_df['brand_price_std'].fillna(global_std)

print(f"\n   Unique extracted brands: {train_df['extracted_brand'].nunique()}")
print(f"   Popular brands found: {train_df['has_popular_brand'].sum():,}")

print("\n✅ TECHNIQUE 4 COMPLETE")


In [None]:
# ============================================================
# TECHNIQUE 5: PRICE-PREDICTIVE PATTERN MINING
# ============================================================

print("\n" + "=" * 80)
print("TECHNIQUE 5: PRICE PATTERN MINING")
print("=" * 80)

def extract_price_patterns(df):
    """Extract patterns that correlate with price"""
    
    # Size/quantity indicators
    df['has_family_size'] = df['catalog_content'].str.contains(
        r'family size|party size|value pack|economy', case=False, na=False
    ).astype(int)
    
    df['has_single_serve'] = df['catalog_content'].str.contains(
        r'single serve|individual|travel size|mini', case=False, na=False
    ).astype(int)
    
    # Premium indicators
    df['premium_words'] = df['catalog_content'].str.count(
        r'premium|organic|natural|gourmet|artisan|craft|premium quality'
    )
    
    # Discount indicators
    df['discount_words'] = df['catalog_content'].str.count(
        r'sale|discount|save|deal|value|cheap|affordable'
    )
    
    # Health indicators (often premium priced)
    df['health_words'] = df['catalog_content'].str.count(
        r'gluten free|sugar free|low fat|diet|healthy|nutrition|vitamin'
    )
    
    # Packaging complexity (correlates with price)
    df['packaging_complexity'] = (
        df['catalog_content'].str.count(r'reseal|zip|container|bottle|can') +
        df['catalog_content'].str.count(r'package|wrapped|sealed')
    )
    
    # Price per unit calculation
    df['estimated_price_per_unit'] = df['total_quantity'] * df['unit_value'] / 100
    
    return df

train_df = extract_price_patterns(train_df)
test_df = extract_price_patterns(test_df)

print(f"\n   Pattern features extracted")
print(f"   Premium products: {(train_df['premium_words'] > 0).sum():,}")
print(f"   Discount products: {(train_df['discount_words'] > 0).sum():,}")

print("\n✅ TECHNIQUE 5 COMPLETE")


In [None]:
# ============================================================
# TECHNIQUE 6: PSEUDO-LABELING PREPARATION
# ============================================================

print("\n" + "=" * 80)
print("TECHNIQUE 6: PSEUDO-LABELING SETUP")
print("=" * 80)

# We'll train initial model, then use confident predictions as pseudo-labels
# This will be done after initial training

print("✅ Pseudo-labeling will be applied after initial training")
print("\n✅ TECHNIQUE 6 PREPARED")


In [None]:
# ============================================================
# TECHNIQUE 7: COMPILE ULTIMATE FEATURE SET
# ============================================================

print("\n" + "=" * 80)
print("TECHNIQUE 7: COMPILING ULTIMATE FEATURE SET")
print("=" * 80)

# Original features
original_features = [
    'unit_value', 'pack_count', 'total_quantity', 'weight_value',
    'num_bullet_points', 'word_count', 'char_count', 'num_numbers', 'avg_word_length',
    'unit_type_encoded', 'category_encoded', 'unit_type_freq', 'category_freq'
]

# Text embeddings
text_emb_features = [f'text_emb_{i}' for i in range(384)]

# New advanced features from techniques 1-5
advanced_features = [
    # Technique 1: Duplicates
    'duplicate_price', 'is_duplicate', 'near_duplicate_price',
    # Technique 2: Distribution
    'test_heavy_category', 'has_unseen_unit',
    # Technique 3: Clustering
    'cluster_price_mean', 'cluster_price_std', 'large_cluster',
    # Technique 4: Brand
    'has_popular_brand', 'brand_frequency', 'rare_brand',
    'brand_price_mean', 'brand_price_std',
    # Technique 5: Patterns
    'has_family_size', 'has_single_serve', 'premium_words', 'discount_words',
    'health_words', 'packaging_complexity', 'estimated_price_per_unit'
]

# Add embeddings to dataframes if not already there
for i in range(384):
    if f'text_emb_{i}' not in train_df.columns:
        train_df[f'text_emb_{i}'] = train_text_emb[:, i]
        test_df[f'text_emb_{i}'] = test_text_emb[:, i]

# Combine all features
ultimate_features = original_features + text_emb_features + advanced_features

# Check which features exist
existing_features = [f for f in ultimate_features if f in train_df.columns]
missing_features = [f for f in ultimate_features if f not in train_df.columns]

if missing_features:
    print(f"\n⚠️  Missing {len(missing_features)} features, using {len(existing_features)} available")
    ultimate_features = existing_features

print(f"\n📊 ULTIMATE FEATURE SET:")
print(f"   Total features: {len(ultimate_features)}")
print(f"   Breakdown:")
print(f"     - Original: {len([f for f in ultimate_features if f in original_features])}")
print(f"     - Embeddings: {len([f for f in ultimate_features if 'text_emb_' in f])}")
print(f"     - Advanced: {len([f for f in ultimate_features if f in advanced_features])}")

# Prepare final datasets
X_train_ultimate = train_df[ultimate_features].fillna(0).values
y_train_ultimate = train_df['price'].values
y_train_log_ultimate = np.log1p(y_train_ultimate)

X_test_ultimate = test_df[ultimate_features].fillna(0).values
test_ids_ultimate = test_df['sample_id'].values

print(f"\n✅ Final data shape: {X_train_ultimate.shape}")
print("\n✅ TECHNIQUE 7 COMPLETE")
