# 🔍 DIAGNOSTIC ANALYSIS - Find What Actually Matters

## Goal: Understand what features ACTUALLY predict price

**Questions to Answer:**
1. Are sentence embeddings helping at all?
2. What's the brand overlap between train/test?
3. Which features have highest importance?
4. Why is the validation-test gap 5.7%?
5. Are we ignoring images (the secret weapon)?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import lightgbm as lgb

print("‚úÖ Imports successful!")

## 1. Load Data and Extract BASIC Features

In [None]:
# Read both splits with the same decoding; train comes first, then test.
train, test = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in ('dataset/train.csv', 'dataset/test.csv')
)

# Quick sanity report: sizes, the available columns, and the target's spread.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"\nTrain columns: {train.columns.tolist()}")
print(f"\nPrice statistics:")
print(train.loc[:, 'price'].describe())

## 2. Extract Simple Features (No Embeddings)

In [None]:
def extract_simple_features(df):
    """Add simple, hand-crafted features parsed from ``catalog_content``.

    The frame is modified in place and also returned, so
    ``df = extract_simple_features(df)`` keeps working.

    Columns added:
        value:      number following ``Value:`` with commas removed; NaN when
                    the field is absent or is not a parseable number.
        unit:       lower-cased text following ``Unit:`` on the same line.
        brand:      the first of the first three item-name words that is
                    longer than two characters and starts with an uppercase
                    letter, lower-cased; ``'unknown'`` when none qualifies.
        text_len:   character length of ``catalog_content``.
        pack_count: number captured by the first matching pattern among
                    "N pack", "pack of N", "N count" (tried in that order);
                    1 when none matches.
        log_value:  ``log1p`` of ``value`` with missing values treated as 0.
        log_pack:   ``log1p`` of ``pack_count``.

    Args:
        df: DataFrame with a ``catalog_content`` text column.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    content = df['catalog_content']

    # Value (the NUMBER in the product).
    # expand=False yields a Series; the default (a one-column DataFrame) has no
    # `.str` accessor. Fragments such as "." or "1.2.3" become NaN rather than
    # raising during the float conversion.
    raw_value = content.str.extract(r'Value:\s*([\d.,]+)', expand=False)
    df['value'] = pd.to_numeric(
        raw_value.str.replace(',', '', regex=False), errors='coerce'
    ).astype(float)

    # Unit. Only spaces/tabs are allowed inside the unit so the match stops at
    # the end of the line instead of running into the following field.
    df['unit'] = (
        content.str.extract(r'Unit:\s*([A-Za-z \t]+)', flags=re.IGNORECASE, expand=False)
        .str.strip()
        .str.lower()
    )

    # Brand (first capitalized word in item name)
    def extract_brand(text):
        item_name = re.search(r'Item Name:\s*(.*?)(?=\n|Product|$)', str(text), re.IGNORECASE)
        if item_name:
            for word in item_name.group(1).split()[:3]:
                if len(word) > 2 and word[0].isupper():
                    return word.lower()
        return 'unknown'

    df['brand'] = content.apply(extract_brand)

    # Text length
    df['text_len'] = content.str.len()

    # Pack count
    pack_patterns = (r'(\d+)\s*[-\s]?pack', r'pack\s*of\s*(\d+)', r'(\d+)\s*count')

    def extract_pack(text):
        lowered = str(text).lower()  # lower-case once, not once per pattern
        for pattern in pack_patterns:
            match = re.search(pattern, lowered)
            if match:
                return int(match.group(1))
        return 1

    df['pack_count'] = content.apply(extract_pack)

    # Log transforms
    df['log_value'] = np.log1p(df['value'].fillna(0))
    df['log_pack'] = np.log1p(df['pack_count'])

    return df

# Run the same extraction over both splits (train first, then test) so their
# feature columns line up.
train, test = (extract_simple_features(split) for split in (train, test))

print("‚úÖ Simple features extracted!")
print(f"\nFeatures: value, unit, brand, text_len, pack_count, log_value, log_pack")

## 3. CRITICAL: Check Brand Overlap

In [None]:
# Brand overlap analysis: which extracted brand tokens occur in each split
train_brands = set(train['brand'].unique())
test_brands = set(test['brand'].unique())

common_brands = train_brands.intersection(test_brands)
test_only_brands = test_brands.difference(train_brands)
train_only_brands = train_brands.difference(test_brands)

print("üîç BRAND OVERLAP ANALYSIS")
print("="*50)
print(f"Total train brands: {len(train_brands)}")
print(f"Total test brands: {len(test_brands)}")
print(f"Common brands: {len(common_brands)} ({len(common_brands)/len(test_brands)*100:.1f}% of test)")
print(f"Test-only brands: {len(test_only_brands)} ({len(test_only_brands)/len(test_brands)*100:.1f}% of test)")

# Row-level view: test rows whose brand never occurs in train
brand_seen_in_train = test['brand'].isin(train_brands)
test_unseen_brand = test.loc[~brand_seen_in_train]
print(f"\nTest samples with unseen brands: {len(test_unseen_brand)} ({len(test_unseen_brand)/len(test)*100:.1f}%)")

if len(test_unseen_brand) > 10_000:
    for warning_line in (
        "\n‚ö†Ô∏è WARNING: > 10K test samples have unseen brands!",
        "   This explains the validation-test gap!",
    ):
        print(warning_line)

## 4. Distribution Comparison (Train vs. Test Histograms)

In [None]:
# Overlay train and test histograms per feature; the last panel shows the target
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
value_ax, pack_ax = axes[0, 0], axes[0, 1]
text_ax, price_ax = axes[1, 0], axes[1, 1]

# Value distribution (rows without a parsed value are left out)
value_ax.hist(train['value'].dropna(), bins=50, alpha=0.5, label='Train', density=True)
value_ax.hist(test['value'].dropna(), bins=50, alpha=0.5, label='Test', density=True)
value_ax.set_title('Value Distribution')
value_ax.set_xlabel('Value')
value_ax.set_xlim(0, 1000)  # Zoom to see differences
value_ax.legend()

# Pack count and text length share one layout; only column, bins and labels differ
for panel, column, n_bins, axis_label, panel_title in (
    (pack_ax, 'pack_count', 20, 'Pack Count', 'Pack Count Distribution'),
    (text_ax, 'text_len', 50, 'Text Length', 'Text Length Distribution'),
):
    panel.hist(train[column], bins=n_bins, alpha=0.5, label='Train', density=True)
    panel.hist(test[column], bins=n_bins, alpha=0.5, label='Test', density=True)
    panel.set_xlabel(axis_label)
    panel.set_title(panel_title)
    panel.legend()

# Price distribution (train only), with the median marked
price_ax.hist(train['price'], bins=50, alpha=0.7, color='blue')
price_ax.set_title('Price Distribution (Train Only)')
price_ax.set_xlabel('Price')
price_ax.axvline(train['price'].median(), color='red', linestyle='--', label=f'Median: ${train["price"].median():.2f}')
price_ax.legend()

plt.tight_layout()
plt.show()

# Numeric companion to the plots: per-split means of the same three features
print("\nüìä Distribution Statistics:")
print(f"Train value mean: {train['value'].mean():.2f}, Test value mean: {test['value'].mean():.2f}")
print(f"Train pack mean: {train['pack_count'].mean():.2f}, Test pack mean: {test['pack_count'].mean():.2f}")
print(f"Train text len mean: {train['text_len'].mean():.0f}, Test text len mean: {test['text_len'].mean():.0f}")

## 5. Baseline Model (ONLY Simple Features)

In [None]:
# Baseline inputs: the numeric features only, with missing values replaced by 0
feature_cols = ['value', 'text_len', 'pack_count', 'log_value', 'log_pack']
X = train.loc[:, feature_cols].fillna(0)
y = np.log1p(train['price'])  # the model is fitted on log1p(price)

# Hold out 15% of the rows for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# LightGBM settings shared by the models in this notebook
params = dict(
    objective='regression',
    metric='mae',
    learning_rate=0.05,
    num_leaves=31,
    verbose=-1,
)

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# At most 500 boosting rounds, with early stopping (50) on the validation set
# and per-round evaluation logging switched off
model = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
)

# Score the hold-out, then undo log1p on both predictions and targets
y_pred_log = model.predict(X_val)
y_val_actual = np.expm1(y_val)
y_pred = np.expm1(y_pred_log)
# SMAPE
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``.
    An element where both values are 0 is a perfect prediction and contributes
    0, instead of the NaN that 0/0 would otherwise spread into the mean.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The mean of the per-element terms multiplied by 100; NaN for empty
        input.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.size == 0:
        return float('nan')

    abs_error = np.abs(actual - predicted)
    scale = (np.abs(actual) + np.abs(predicted)) / 2
    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0 they were initialised with.
    terms = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return float(np.mean(terms) * 100)

# Score the baseline on the hold-out, in price space
baseline_smape = smape(y_val_actual, y_pred)

print("\n" + "="*50)
print("üéØ BASELINE MODEL (ONLY 5 simple features)")
print("="*50)
print(f"Validation SMAPE: {baseline_smape:.2f}%")
print(f"\nFeature Importance:")
baseline_importance = model.feature_importance()
for position, feat in enumerate(feature_cols):
    imp = baseline_importance[position]
    print(f"  {feat:15s}: {imp:6.0f}")

# Pick the two-line verdict from the 55% threshold
print("\nüí° KEY INSIGHT:")
if baseline_smape < 55:
    insight_lines = (
        "   Simple features alone get < 55% SMAPE!",
        "   This means our 768-dim embeddings are NOT adding much value!",
    )
else:
    insight_lines = (
        "   Simple features get > 55% SMAPE",
        "   We need more sophisticated features (brand, category, images)",
    )
for insight_line in insight_lines:
    print(insight_line)

## 6. Add Brand Encoding

In [None]:
# Brand encoding (frequency + mean price), fitted so the validation score is
# not inflated by target leakage.
#
# `brand_mean_price` is a target encoding. Fitting it on all of `train` before
# splitting lets every row, validation rows included, see its own price (a
# brand that occurs once is encoded with exactly that row's price). The
# encodings are therefore fitted on the training fold only, and training rows
# receive out-of-fold means.

# Split row positions with the same size and seed the notebook uses for its
# other splits; the shuffle depends only on the number of rows and the seed,
# so the selected rows are the ones train_test_split(X_brand, y, ...) picks.
row_positions = np.arange(len(train))
train_pos, val_pos = train_test_split(row_positions, test_size=0.15, random_state=42)
fit_rows = train.iloc[train_pos]

# Frequency uses no target information, so training-fold counts serve every row.
fold_brand_freq = fit_rows['brand'].value_counts()
train['brand_freq'] = train['brand'].map(fold_brand_freq).fillna(0)

# Starting point for every row (and the final value for validation rows):
# training-fold brand means, falling back to the training-fold overall mean.
fold_brand_mean = fit_rows.groupby('brand')['price'].mean()
encoded_mean = (
    train['brand']
    .map(fold_brand_mean)
    .fillna(fit_rows['price'].mean())
    .to_numpy(dtype=float, copy=True)
)

# Training rows: overwrite with means computed from the other inner folds, so
# no training row is encoded with its own price.
n_inner_folds = 5
inner_fold = np.random.RandomState(42).randint(0, n_inner_folds, size=len(train_pos))
for fold in range(n_inner_folds):
    held_out_pos = train_pos[inner_fold == fold]
    other_rows = train.iloc[train_pos[inner_fold != fold]]
    other_brand_mean = other_rows.groupby('brand')['price'].mean()
    encoded_mean[held_out_pos] = (
        train['brand']
        .iloc[held_out_pos]
        .map(other_brand_mean)
        .fillna(other_rows['price'].mean())
        .to_numpy(dtype=float)
    )
train['brand_mean_price'] = encoded_mean

# Test rows carry no labels that could leak, so they use statistics from all of
# train. NOTE: these counts are on a slightly larger scale than the
# training-fold counts the model is fitted on.
brand_freq = train['brand'].value_counts()
brand_mean_price = train.groupby('brand')['price'].mean()
global_mean = train['price'].mean()
test['brand_freq'] = test['brand'].map(brand_freq).fillna(0)
test['brand_mean_price'] = test['brand'].map(brand_mean_price).fillna(global_mean)

# New feature set
feature_cols_with_brand = feature_cols + ['brand_freq', 'brand_mean_price']
X_brand = train[feature_cols_with_brand].fillna(0)

X_train, X_val = X_brand.iloc[train_pos], X_brand.iloc[val_pos]
y_train, y_val = y.iloc[train_pos], y.iloc[val_pos]

# Train
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

model_brand = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
)

# Score the brand model on the hold-out and undo log1p on both sides
y_pred_log = model_brand.predict(X_val)
y_val_actual = np.expm1(y_val)
y_pred = np.expm1(y_pred_log)

brand_smape = smape(y_val_actual, y_pred)

print("\n" + "="*50)
print("üéØ WITH BRAND FEATURES")
print("="*50)
print(f"Validation SMAPE: {brand_smape:.2f}%")
print(f"Improvement: {baseline_smape - brand_smape:.2f}%")
print(f"\nFeature Importance:")
brand_importance = model_brand.feature_importance()
for position, feat in enumerate(feature_cols_with_brand):
    imp = brand_importance[position]
    print(f"  {feat:20s}: {imp:6.0f}")

# Brand features count as helpful only if they cut SMAPE by more than 2 points
print("\nüí° KEY INSIGHT:")
if brand_smape < baseline_smape - 2:
    insight_lines = (
        "   Brand features help significantly!",
        "   Brand extraction and encoding is CRITICAL",
    )
else:
    insight_lines = (
        "   Brand features don't help much",
        "   Either: (1) Brand extraction is bad, or (2) Brand doesn't matter for this dataset",
    )
for insight_line in insight_lines:
    print(insight_line)

## 7. Image Availability Check (counts image links; nothing is downloaded yet)

In [None]:
# Count how many rows in each split carry an image link
train_with_image = train['image_link'].notna().sum()
test_with_image = test['image_link'].notna().sum()

print("\nüñºÔ∏è IMAGE ANALYSIS")
print("="*50)
print(f"Train samples with image links: {train_with_image} ({train_with_image/len(train)*100:.1f}%)")
print(f"Test samples with image links: {test_with_image} ({test_with_image/len(test)*100:.1f}%)")

for next_step_line in (
    "\nüí° NEXT STEP: Download images and extract features using ResNet50",
    "   Expected: If images matter, they should reduce SMAPE by 3-5%",
    "   This is likely the SECRET WEAPON of top teams!",
):
    print(next_step_line)

## 8. Summary and Recommendations

In [None]:
# Final report: scores from the two experiments, brand overlap, and next steps
summary_rule = "="*70
print("\n" + summary_rule)
print("üìä DIAGNOSTIC SUMMARY")
print(summary_rule)

print(f"\n1. BASELINE (5 simple features): {baseline_smape:.2f}% SMAPE")
print(f"2. WITH BRAND FEATURES:          {brand_smape:.2f}% SMAPE")
print(f"3. CURRENT BEST (embeddings):    52.33% SMAPE (XGBoost)")

print("\nüîç KEY FINDINGS:")
print(f"   ‚Ä¢ Brand overlap: {len(common_brands)/len(test_brands)*100:.1f}% of test brands seen in train")
print(f"   ‚Ä¢ Test samples with unseen brands: {len(test_unseen_brand)/len(test)*100:.1f}%")

# Recommendation text depends on whether the baseline beats 54% SMAPE
print("\nüí° RECOMMENDATIONS:")
if baseline_smape < 54:
    recommendation_lines = (
        "   ‚úÖ Simple features are strong! Focus on:",
        "      1. Better brand extraction (fuzzy matching)",
        "      2. Quantity normalization (all to same unit)",
        "      3. IMAGE FEATURES (likely the missing piece!)",
    )
else:
    recommendation_lines = (
        "   ‚ö†Ô∏è Simple features are weak. Need:",
        "      1. Better feature engineering",
        "      2. Category extraction (food, electronics, etc.)",
        "      3. IMAGE FEATURES (definitely needed!)",
    )
for recommendation_line in recommendation_lines:
    print(recommendation_line)

if len(test_unseen_brand) > 10_000:
    for critical_line in (
        "\n   ‚ö†Ô∏è CRITICAL: Many test brands not in train!",
        "      This explains the validation-test gap!",
        "      Solution: Use fallback features (category, value, images)",
    ):
        print(critical_line)

print("\nüéØ NEXT ACTIONS:")
for action_line in (
    "   1. Implement image feature extraction (ResNet50)",
    "   2. Better brand extraction with fuzzy matching",
    "   3. Quantity normalization to standard units",
    "   4. Category classification (keyword-based)",
    "   5. Re-train with these features (drop embeddings if they don't help)",
):
    print(action_line)