In [None]:
# ============================================================================
# LOCAL-ONLY TRAINING SCRIPT - NO HUGGINGFACE HUB DEPENDENCIES
# ============================================================================
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm.auto import tqdm
import warnings
from sklearn.preprocessing import LabelEncoder
import gc

warnings.filterwarnings('ignore')

# Set environment to avoid HF Hub calls
os.environ['HF_HUB_OFFLINE'] = '1'
os.environ['TRANSFORMERS_OFFLINE'] = '1'
os.environ['HF_DATASETS_OFFLINE'] = '1'

print("="*80)
print("LOCAL-ONLY TRAINING - NO HUGGINGFACE HUB ACCESS REQUIRED")
print("="*80)

# --- CONFIGURATION ---
V4_OUTPUT_PATH = '/kaggle/input/aml_new/keras/default/1/output_v4_advanced'
FINAL_OUTPUT_DIR = '/kaggle/working/output_v8_hierarchical'
TRAIN_CSV_PATH = "/kaggle/input/aml-csv/train.csv"
TEST_CSV_PATH = "/kaggle/input/aml-csv/test.csv"
IMAGE_DIR = "/kaggle/input/aml-train/AMAZON_ML_TRAIN"
TEST_IMAGE_DIR = "/kaggle/input/amazon-ml-test/AMAZON_ML_TEST"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 64
LEARNING_RATE = 2e-5
EPOCHS = 100
PATIENCE = 15

# LOCAL MODEL PATHS - These must be Kaggle datasets you've added
TEXT_MODEL_PATH = '/kaggle/input/deberta-v3-base/transformers/default/1/deberta-v3-base'
IMAGE_MODEL_PATH = '/kaggle/input/dinov2/pytorch/large/1'

# Alternative paths to check (if primary path fails)
ALT_IMAGE_PATHS = [
    '/kaggle/input/dinov2/pytorch/large/1',
    '/kaggle/input/dinov2/pytorch/base/1',
    '/kaggle/input/dinov2',
]

os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)

print(f"Device: {DEVICE}")
print(f"PyTorch version: {torch.__version__}")

# --- IMPORT TRANSFORMERS AFTER SETTING ENV VARS ---
from transformers import (
    AutoModel, 
    AutoTokenizer, 
    AutoImageProcessor,
    AutoConfig
)

# --- LOSS FUNCTIONS ---
def smape_loss(y_pred, y_true, eps=1e-9):
    numerator = torch.abs(y_pred - y_true)
    denominator = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0
    return torch.mean(numerator / (denominator + eps))

def smape_safe(y_true, y_pred, eps=1e-9):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(numerator / (denominator + eps))

def info_nce_loss(text_embeds, image_embeds, temperature=0.07):
    text_embeds = F.normalize(text_embeds, p=2, dim=-1)
    image_embeds = F.normalize(image_embeds, p=2, dim=-1)
    logits = torch.matmul(text_embeds, image_embeds.T) / temperature
    labels = torch.arange(len(logits), device=logits.device)
    loss_i = F.cross_entropy(logits, labels)
    loss_t = F.cross_entropy(logits.T, labels)
    return (loss_i + loss_t) / 2.0

# --- DATA PREPARATION ---
print("\n✓ Loading and preparing data...")
train_df_full = pd.read_csv(TRAIN_CSV_PATH)

if not os.path.exists(f'{V4_OUTPUT_PATH}/train_indices.csv'):
    from sklearn.model_selection import train_test_split
    train_indices, val_indices = train_test_split(
        train_df_full.index, test_size=0.2, random_state=42
    )
    print("  -> Created new train/val split")
else:
    train_indices = pd.read_csv(f'{V4_OUTPUT_PATH}/train_indices.csv')['train_idx'].values
    val_indices = pd.read_csv(f'{V4_OUTPUT_PATH}/val_indices.csv')['val_idx'].values
    print("  -> Loaded existing train/val split")

train_df = train_df_full.iloc[train_indices].reset_index(drop=True)
val_df = train_df_full.iloc[val_indices].reset_index(drop=True)
y_val_price_linear = val_df['price'].values

def extract_category_simple(text):
    text = str(text).lower()
    if 'electronic' in text: return 'electronics'
    if 'book' in text: return 'books'
    return 'other'

def extract_brand_simple(text):
    return str(text).split()[0] if text else 'unknown'

train_df['category'] = train_df['catalog_content'].apply(extract_category_simple)
val_df['category'] = val_df['catalog_content'].apply(extract_category_simple)
train_df['brand'] = train_df['catalog_content'].apply(extract_brand_simple)
val_df['brand'] = val_df['catalog_content'].apply(extract_brand_simple)

cat_encoder = LabelEncoder().fit(pd.concat([train_df['category'], val_df['category']]))
brand_encoder = LabelEncoder().fit(pd.concat([train_df['brand'], val_df['brand']]))
num_categories = len(cat_encoder.classes_)
num_brands = len(brand_encoder.classes_)

print(f"  -> Train samples: {len(train_df)}, Val samples: {len(val_df)}")
print(f"  -> Categories: {num_categories}, Brands: {num_brands}")

# --- MODEL ARCHITECTURE ---
class HierarchicalFusionLayer(nn.Module):
    def __init__(self, dim, num_heads, num_queries=8):
        super().__init__()
        self.query_tokens = nn.Parameter(torch.randn(1, num_queries, dim))
        self.attn_layer = nn.TransformerEncoderLayer(
            d_model=dim, nhead=num_heads, dim_feedforward=dim*2,
            dropout=0.1, activation='gelu', batch_first=True
        )
        self.attn = nn.TransformerEncoder(self.attn_layer, num_layers=2)
    
    def forward(self, text_emb, image_emb):
        queries = self.query_tokens.expand(text_emb.shape[0], -1, -1)
        transformer_input = torch.cat([queries, text_emb.unsqueeze(1), image_emb.unsqueeze(1)], dim=1)
        output = self.attn(transformer_input)
        return output[:, :self.query_tokens.size(1)].mean(dim=1)

class HierarchicalFusionNet(nn.Module):
    def __init__(self, text_dim=768, image_dim=768, num_levels=3, fusion_dim=512,
                 num_categories=10, num_brands=101):
        super().__init__()
        self.num_levels = num_levels
        
        self.text_projs = nn.ModuleList([nn.Linear(text_dim, fusion_dim) for _ in range(num_levels)])
        self.image_projs = nn.ModuleList([nn.Linear(image_dim, fusion_dim) for _ in range(num_levels)])
        self.fusion_layers = nn.ModuleList([HierarchicalFusionLayer(fusion_dim, 8) for _ in range(num_levels)])
        
        self.final_aggregator = nn.Sequential(
            nn.LayerNorm(fusion_dim * num_levels),
            nn.Linear(fusion_dim * num_levels, fusion_dim),
            nn.GELU(),
            nn.Dropout(0.1)
        )
        
        self.price_head = nn.Linear(fusion_dim, 1)
        self.category_head = nn.Linear(fusion_dim, num_categories)
        self.brand_head = nn.Linear(fusion_dim, num_brands)
    
    def forward(self, text_layers, image_layers):
        fused_level_outputs = []
        for i in range(self.num_levels):
            text_proj = self.text_projs[i](text_layers[:, i, :])
            image_proj = self.image_projs[i](image_layers[:, i, :])
            fused = self.fusion_layers[i](text_proj, image_proj)
            fused_level_outputs.append(fused)
        
        hierarchical_fused = torch.cat(fused_level_outputs, dim=1)
        final_embedding = self.final_aggregator(hierarchical_fused)
        
        price_log = self.price_head(final_embedding).squeeze(-1)
        category_logits = self.category_head(final_embedding)
        brand_logits = self.brand_head(final_embedding)
        
        top_text_proj = self.text_projs[-1](text_layers[:, -1, :])
        top_image_proj = self.image_projs[-1](image_layers[:, -1, :])
        
        return price_log, category_logits, brand_logits, top_text_proj, top_image_proj

print("✓ Hierarchical Fusion Network architecture defined.")

# --- DATASET ---
class EndToEndDataset(Dataset):
    def __init__(self, df, image_dir):
        self.df = df.copy()
        self.image_dir = image_dir
        self.df['cat_label'] = cat_encoder.transform(self.df['category'])
        self.df['brand_label'] = brand_encoder.transform(self.df['brand'])

    def __len__(self): 
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return (
            row['catalog_content'], 
            os.path.join(self.image_dir, f"{row['sample_id']}.jpg"),
            np.log1p(row['price']), 
            row['cat_label'], 
            row['brand_label']
        )

def collate_fn(batch, tokenizer, processor):
    texts, paths, prices, cats, brands = zip(*batch)
    text_inputs = tokenizer(
        list(texts), padding="max_length", truncation=True, 
        max_length=256, return_tensors='pt'
    )
    
    images = []
    for p in paths:
        try:
            if os.path.exists(p):
                images.append(Image.open(p).convert('RGB'))
            else:
                images.append(Image.new('RGB', (224, 224)))
        except:
            images.append(Image.new('RGB', (224, 224)))
    
    image_inputs = processor(images, return_tensors='pt')
    
    return (
        text_inputs, image_inputs, 
        torch.tensor(prices, dtype=torch.float), 
        torch.tensor(cats, dtype=torch.long), 
        torch.tensor(brands, dtype=torch.long)
    )

# --- LOAD MODELS FROM LOCAL FILES ONLY ---
print("\n✓ Loading models from local files...")

def find_image_model_path():
    """Try to find the image model in possible locations"""
    for path in ALT_IMAGE_PATHS:
        if os.path.exists(path):
            print(f"  -> Found image model at: {path}")
            return path
    # Default fallback
    return IMAGE_MODEL_PATH

try:
    print(f"  -> Loading text model from: {TEXT_MODEL_PATH}")
    
    # Check if path exists
    if not os.path.exists(TEXT_MODEL_PATH):
        raise FileNotFoundError(
            f"Text model path not found: {TEXT_MODEL_PATH}\n"
            "Please add the 'deberta-v3-base' dataset to your Kaggle notebook."
        )
    
    text_model = AutoModel.from_pretrained(
        TEXT_MODEL_PATH, 
        local_files_only=True
    ).to(DEVICE)
    
    tokenizer = AutoTokenizer.from_pretrained(
        TEXT_MODEL_PATH, 
        local_files_only=True
    )
    
    print("  -> ✓ Text model loaded successfully")
    
    # Try to find and load image model
    actual_image_path = find_image_model_path()
    
    if not os.path.exists(actual_image_path):
        print(f"\n  -> WARNING: Image model not found at {actual_image_path}")
        print("  -> You need to add the DINOv2 model as a Kaggle dataset")
        print("  -> Attempting to download from HuggingFace (requires internet)...")
        
        image_model = AutoModel.from_pretrained(
            'facebook/dinov2-base'
        ).to(DEVICE)
        
        processor = AutoImageProcessor.from_pretrained(
            'facebook/dinov2-base'
        )
    else:
        print(f"  -> Loading image model from: {actual_image_path}")
        image_model = AutoModel.from_pretrained(
            actual_image_path, 
            local_files_only=True
        ).to(DEVICE)
        
        processor = AutoImageProcessor.from_pretrained(
            actual_image_path, 
            local_files_only=True
        )
    
    print("  -> ✓ Image model loaded successfully")
    
except Exception as e:
    print("\n" + "="*80)
    print("ERROR LOADING MODELS")
    print("="*80)
    print(f"Error: {e}")
    print("\nPlease ensure you have added these Kaggle datasets:")
    print("1. deberta-v3-base")
    print("2. dinov2-base or facebook/dinov2-base")
    print("\nTo add a dataset:")
    print("  1. Go to Add Data > Search for 'deberta-v3-base'")
    print("  2. Add it to your notebook")
    print("  3. Repeat for 'dinov2-base'")
    print("="*80)
    raise

# Freeze models
for param in text_model.parameters(): 
    param.requires_grad = False
for param in image_model.parameters(): 
    param.requires_grad = False

text_model.eval()
image_model.eval()

print(f"  -> Text model hidden size: {text_model.config.hidden_size}")
print(f"  -> Image model hidden size: {image_model.config.hidden_size}")

# --- CREATE DATALOADERS ---
print("\n✓ Creating data loaders...")
train_dataset = EndToEndDataset(train_df, IMAGE_DIR)
val_dataset = EndToEndDataset(val_df, IMAGE_DIR)

train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, 
    num_workers=0, collate_fn=lambda b: collate_fn(b, tokenizer, processor)
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE*2, shuffle=False, 
    num_workers=0, collate_fn=lambda b: collate_fn(b, tokenizer, processor)
)

print(f"  -> Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

# --- INITIALIZE MODEL & OPTIMIZER ---
print("\n✓ Initializing hierarchical fusion model...")
hierarchical_model = HierarchicalFusionNet(
    text_dim=text_model.config.hidden_size, 
    image_dim=image_model.config.hidden_size,
    num_categories=num_categories, 
    num_brands=num_brands
).to(DEVICE)

optimizer = torch.optim.AdamW(hierarchical_model.parameters(), lr=LEARNING_RATE)
price_mse = nn.MSELoss()
cat_ce = nn.CrossEntropyLoss()
brand_ce = nn.CrossEntropyLoss()

total_params = sum(p.numel() for p in hierarchical_model.parameters() if p.requires_grad)
print(f"  -> Trainable parameters: {total_params:,}")

# --- TRAINING LOOP ---
print("\n" + "="*80)
print("STARTING TRAINING")
print("="*80)

best_val_smape = float('inf')
patience_counter = 0

for epoch in range(EPOCHS):
    hierarchical_model.train()
    epoch_loss = 0
    num_batches = 0
    
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    for text_in, image_in, price_t, cat_t, brand_t in pbar:
        text_in = {k: v.to(DEVICE) for k, v in text_in.items()}
        image_in = {k: v.to(DEVICE) for k, v in image_in.items()}
        price_t = price_t.to(DEVICE)
        cat_t = cat_t.to(DEVICE)
        brand_t = brand_t.to(DEVICE)
        
        # Extract features
        with torch.no_grad():
            text_hs = text_model(**text_in, output_hidden_states=True).hidden_states
            image_hs = image_model(**image_in, output_hidden_states=True).hidden_states
            text_layers = torch.stack([text_hs[i][:,0,:] for i in [4, 8, 12]], dim=1)
            image_layers = torch.stack([image_hs[i][:,0,:] for i in [4, 8, 12]], dim=1)

        # Forward pass
        optimizer.zero_grad()
        price_p, cat_p, brand_p, top_text, top_image = hierarchical_model(text_layers, image_layers)
        
        # Combined loss
        loss = (0.5 * smape_loss(torch.expm1(price_p), torch.expm1(price_t)) + 
                0.1 * price_mse(price_p, price_t)) + \
               (0.15 * cat_ce(cat_p, cat_t)) + \
               (0.15 * brand_ce(brand_p, brand_t)) + \
               (0.1 * info_nce_loss(top_text, top_image))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        num_batches += 1
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    avg_train_loss = epoch_loss / num_batches

    # --- Validation ---
    hierarchical_model.eval()
    val_preds_log = []
    
    with torch.no_grad():
        for text_in, image_in, _, _, _ in tqdm(val_loader, desc="Validation", leave=False):
            text_in = {k: v.to(DEVICE) for k, v in text_in.items()}
            image_in = {k: v.to(DEVICE) for k, v in image_in.items()}
            
            text_hs = text_model(**text_in, output_hidden_states=True).hidden_states
            image_hs = image_model(**image_in, output_hidden_states=True).hidden_states
            text_layers = torch.stack([text_hs[i][:,0,:] for i in [4, 8, 12]], dim=1)
            image_layers = torch.stack([image_hs[i][:,0,:] for i in [4, 8, 12]], dim=1)
            
            price_p, _, _, _, _ = hierarchical_model(text_layers, image_layers)
            val_preds_log.append(price_p.cpu().numpy())
            
    val_preds_log = np.concatenate(val_preds_log)
    val_preds_linear = np.expm1(val_preds_log)
    val_smape = smape_safe(y_val_price_linear, val_preds_linear)
    
    print(f"\nEpoch {epoch+1}/{EPOCHS} | Train Loss: {avg_train_loss:.4f} | Val SMAPE: {val_smape:.4f}")
    
    if val_smape < best_val_smape:
        best_val_smape = val_smape
        patience_counter = 0
        torch.save(
            hierarchical_model.state_dict(), 
            f'{FINAL_OUTPUT_DIR}/best_hierarchical_model.pth'
        )
        print(f"  -> ✓ New best model saved! Val SMAPE: {best_val_smape:.4f}")
    else:
        patience_counter += 1
        print(f"  -> No improvement (patience: {patience_counter}/{PATIENCE})")
        if patience_counter >= PATIENCE:
            print("  -> Early stopping triggered.")
            break

print("\n" + "="*80)
print(f"TRAINING COMPLETE - Best Val SMAPE: {best_val_smape:.4f}")
print("="*80)

# Cleanup
del train_loader, val_loader
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# --- FINAL PREDICTION ---
print("\n" + "="*80)
print("GENERATING FINAL PREDICTIONS")
print("="*80)

# Load best model
final_model = HierarchicalFusionNet(
    text_dim=text_model.config.hidden_size, 
    image_dim=image_model.config.hidden_size,
    num_categories=num_categories, 
    num_brands=num_brands
).to(DEVICE)

final_model.load_state_dict(
    torch.load(f'{FINAL_OUTPUT_DIR}/best_hierarchical_model.pth')
)
final_model.eval()
print("✓ Best model loaded")

# Prepare test data
print("✓ Loading test data...")
test_df = pd.read_csv(TEST_CSV_PATH)
test_df['price'] = 0
test_df['category'] = 'other'
test_df['brand'] = 'unknown'

test_dataset = EndToEndDataset(test_df, TEST_IMAGE_DIR)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE*2, shuffle=False, 
    num_workers=0, collate_fn=lambda b: collate_fn(b, tokenizer, processor)
)

# Inference
print("✓ Running inference...")
test_preds_log = []

with torch.no_grad():
    for text_in, image_in, _, _, _ in tqdm(test_loader, desc="Predicting"):
        text_in = {k: v.to(DEVICE) for k, v in text_in.items()}
        image_in = {k: v.to(DEVICE) for k, v in image_in.items()}
        
        text_hs = text_model(**text_in, output_hidden_states=True).hidden_states
        image_hs = image_model(**image_in, output_hidden_states=True).hidden_states
        text_layers = torch.stack([text_hs[i][:,0,:] for i in [4, 8, 12]], dim=1)
        image_layers = torch.stack([image_hs[i][:,0,:] for i in [4, 8, 12]], dim=1)
        
        price_p, _, _, _, _ = final_model(text_layers, image_layers)
        test_preds_log.append(price_p.cpu().numpy())

test_preds_log = np.concatenate(test_preds_log)
final_test_preds = np.expm1(test_preds_log)

# Create submission
if 'sample_id' not in test_df.columns:
    test_df['sample_id'] = test_df.index

submission = pd.DataFrame({
    'sample_id': test_df['sample_id'], 
    'price': final_test_preds
})

submission_path = f'{FINAL_OUTPUT_DIR}/submission.csv'
submission.to_csv(submission_path, index=False)

print("\n" + "="*80)
print("SUBMISSION COMPLETE")
print("="*80)
print(f"✓ Best Validation SMAPE: {best_val_smape:.4f}")
print(f"✓ Test predictions: {len(final_test_preds):,}")
print(f"✓ Price range: [{final_test_preds.min():.2f}, {final_test_preds.max():.2f}]")
print(f"✓ Submission file: {submission_path}")
print("="*80)

LOCAL-ONLY TRAINING - NO HUGGINGFACE HUB ACCESS REQUIRED
Device: cuda
PyTorch version: 2.6.0+cu124

✓ Loading and preparing data...
  -> Loaded existing train/val split
  -> Train samples: 60000, Val samples: 15000
  -> Categories: 3, Brands: 1
✓ Hierarchical Fusion Network architecture defined.

✓ Loading models from local files...
  -> Loading text model from: /kaggle/input/deberta-v3-base/transformers/default/1/deberta-v3-base
  -> ✓ Text model loaded successfully
  -> Found image model at: /kaggle/input/dinov2/pytorch/large/1
  -> Loading image model from: /kaggle/input/dinov2/pytorch/large/1


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


  -> ✓ Image model loaded successfully
  -> Text model hidden size: 768
  -> Image model hidden size: 1024

✓ Creating data loaders...
  -> Train batches: 938, Val batches: 118

✓ Initializing hierarchical fusion model...
  -> Trainable parameters: 22,485,509

STARTING TRAINING


Epoch 1/100:   0%|          | 0/938 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

In [None]:
import kagglehub

kagglehub.login()

# Replace with path to directory containing model files.
LOCAL_MODEL_DIR = '/kaggle/working/'

MODEL_SLUG = 'aml_hirercial' # Replace with model slug.

# Learn more about naming model variations at
# https://www.kaggle.com/docs/models#name-model.
VARIATION_SLUG = 'default' # Replace with variation slug.

kagglehub.model_upload(
  handle = f"kartikgarg74/{MODEL_SLUG}/keras/{VARIATION_SLUG}",
  local_model_dir = LOCAL_MODEL_DIR,
  version_notes = 'Update 2025-10-12')

In [1]:
# ============================================================================
# HIERARCHICAL FUSION WITH PRE-COMPUTED EMBEDDINGS - FAST VERSION
# ============================================================================
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm.auto import tqdm
import warnings
from sklearn.preprocessing import LabelEncoder
import gc

warnings.filterwarnings('ignore')

print("="*80)
print("HIERARCHICAL FUSION - USING PRE-COMPUTED EMBEDDINGS")
print("="*80)

# --- CONFIGURATION ---
EMBEDDINGS_PATH = '/kaggle/input/aml_new/keras/default/1/output_v4_advanced/embeddings'
V4_OUTPUT_PATH = '/kaggle/input/aml_new/keras/default/1/output_v4_advanced'
FINAL_OUTPUT_DIR = '/kaggle/working/output_v9_hierarchical_fast'
TRAIN_CSV_PATH = "/kaggle/input/aml-csv/train.csv"
TEST_CSV_PATH = "/kaggle/input/aml-csv/test.csv"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 256  # Much larger since we're just using embeddings
LEARNING_RATE = 3e-4  # Can use higher LR with pre-computed features
EPOCHS = 50
PATIENCE = 10

os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)

print(f"Device: {DEVICE}")
print(f"PyTorch version: {torch.__version__}")

# --- LOSS FUNCTIONS ---
def smape_loss(y_pred, y_true, eps=1e-9):
    numerator = torch.abs(y_pred - y_true)
    denominator = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0
    return torch.mean(numerator / (denominator + eps))

def smape_safe(y_true, y_pred, eps=1e-9):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(numerator / (denominator + eps))

def info_nce_loss(text_embeds, image_embeds, temperature=0.07):
    text_embeds = F.normalize(text_embeds, p=2, dim=-1)
    image_embeds = F.normalize(image_embeds, p=2, dim=-1)
    logits = torch.matmul(text_embeds, image_embeds.T) / temperature
    labels = torch.arange(len(logits), device=logits.device)
    loss_i = F.cross_entropy(logits, labels)
    loss_t = F.cross_entropy(logits.T, labels)
    return (loss_i + loss_t) / 2.0

# --- LOAD PRE-COMPUTED EMBEDDINGS ---
print("\n✓ Loading pre-computed embeddings...")
print(f"  -> From: {EMBEDDINGS_PATH}")

X_text_train = np.load(f'{EMBEDDINGS_PATH}/text_train.npy')
X_text_val = np.load(f'{EMBEDDINGS_PATH}/text_val.npy')
X_text_test = np.load(f'{EMBEDDINGS_PATH}/text_test.npy')

X_image_train = np.load(f'{EMBEDDINGS_PATH}/image_train.npy')
X_image_val = np.load(f'{EMBEDDINGS_PATH}/image_val.npy')
X_image_test = np.load(f'{EMBEDDINGS_PATH}/image_test.npy')

print(f"  -> Text embeddings: Train {X_text_train.shape}, Val {X_text_val.shape}, Test {X_text_test.shape}")
print(f"  -> Image embeddings: Train {X_image_train.shape}, Val {X_image_val.shape}, Test {X_image_test.shape}")

# --- LOAD LABELS ---
print("\n✓ Loading labels and indices...")
train_indices = pd.read_csv(f'{V4_OUTPUT_PATH}/train_indices.csv')['train_idx'].values
val_indices = pd.read_csv(f'{V4_OUTPUT_PATH}/val_indices.csv')['val_idx'].values

train_df_full = pd.read_csv(TRAIN_CSV_PATH)
train_df = train_df_full.iloc[train_indices].reset_index(drop=True)
val_df = train_df_full.iloc[val_indices].reset_index(drop=True)

y_train_price = train_df['price'].values
y_val_price = val_df['price'].values
y_train_price_log = np.log1p(y_train_price)
y_val_price_log = np.log1p(y_val_price)

# Create category and brand labels
def extract_category_simple(text):
    text = str(text).lower()
    if 'electronic' in text: return 'electronics'
    if 'book' in text: return 'books'
    return 'other'

def extract_brand_simple(text):
    return str(text).split()[0] if text else 'unknown'

train_df['category'] = train_df['catalog_content'].apply(extract_category_simple)
val_df['category'] = val_df['catalog_content'].apply(extract_category_simple)
train_df['brand'] = train_df['catalog_content'].apply(extract_brand_simple)
val_df['brand'] = val_df['catalog_content'].apply(extract_brand_simple)

cat_encoder = LabelEncoder().fit(pd.concat([train_df['category'], val_df['category']]))
brand_encoder = LabelEncoder().fit(pd.concat([train_df['brand'], val_df['brand']]))

y_train_cat = cat_encoder.transform(train_df['category'])
y_val_cat = cat_encoder.transform(val_df['category'])
y_train_brand = brand_encoder.transform(train_df['brand'])
y_val_brand = brand_encoder.transform(val_df['brand'])

num_categories = len(cat_encoder.classes_)
num_brands = len(brand_encoder.classes_)

print(f"  -> Train: {len(train_df)} samples")
print(f"  -> Val: {len(val_df)} samples")
print(f"  -> Categories: {num_categories}, Brands: {num_brands}")

# --- MODEL ARCHITECTURE ---
class HierarchicalFusionLayer(nn.Module):
    """Single fusion block with attention"""
    def __init__(self, dim, num_heads, num_queries=8):
        super().__init__()
        self.query_tokens = nn.Parameter(torch.randn(1, num_queries, dim))
        attn_layer = nn.TransformerEncoderLayer(
            d_model=dim, nhead=num_heads, dim_feedforward=dim*2,
            dropout=0.1, activation='gelu', batch_first=True
        )
        self.attn = nn.TransformerEncoder(attn_layer, num_layers=2)
    
    def forward(self, text_emb, image_emb):
        batch_size = text_emb.shape[0]
        queries = self.query_tokens.expand(batch_size, -1, -1)
        transformer_input = torch.cat([queries, text_emb.unsqueeze(1), image_emb.unsqueeze(1)], dim=1)
        output = self.attn(transformer_input)
        return output[:, :self.query_tokens.size(1)].mean(dim=1)

class SimpleFusionNet(nn.Module):
    """Simplified fusion using single-layer embeddings with hierarchical-style fusion"""
    def __init__(self, text_dim=768, image_dim=1024, fusion_dim=512,
                 num_categories=10, num_brands=101):
        super().__init__()
        
        # Project to common dimension
        self.text_proj = nn.Linear(text_dim, fusion_dim)
        self.image_proj = nn.Linear(image_dim, fusion_dim)
        
        # Fusion layer with attention
        self.fusion_layer = HierarchicalFusionLayer(fusion_dim, num_heads=8)
        
        # Additional processing
        self.final_mlp = nn.Sequential(
            nn.LayerNorm(fusion_dim),
            nn.Linear(fusion_dim, fusion_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(fusion_dim, fusion_dim // 2),
            nn.GELU()
        )
        
        # Prediction heads
        self.price_head = nn.Linear(fusion_dim // 2, 1)
        self.category_head = nn.Linear(fusion_dim // 2, num_categories)
        self.brand_head = nn.Linear(fusion_dim // 2, num_brands)
    
    def forward(self, text_emb, image_emb):
        # Project to common space
        text_proj = self.text_proj(text_emb)
        image_proj = self.image_proj(image_emb)
        
        # Fuse with attention
        fused = self.fusion_layer(text_proj, image_proj)
        
        # Final processing
        final_embedding = self.final_mlp(fused)
        
        # Predictions
        price_log = self.price_head(final_embedding).squeeze(-1)
        category_logits = self.category_head(final_embedding)
        brand_logits = self.brand_head(final_embedding)
        
        return price_log, category_logits, brand_logits, text_proj, image_proj

print("\n✓ Hierarchical Fusion Network architecture defined.")

# --- CREATE DATASETS ---
train_dataset = TensorDataset(
    torch.from_numpy(X_text_train).float(),
    torch.from_numpy(X_image_train).float(),
    torch.from_numpy(y_train_price_log).float(),
    torch.from_numpy(y_train_cat).long(),
    torch.from_numpy(y_train_brand).long()
)

val_dataset = TensorDataset(
    torch.from_numpy(X_text_val).float(),
    torch.from_numpy(X_image_val).float(),
    torch.from_numpy(y_val_price_log).float(),
    torch.from_numpy(y_val_cat).long(),
    torch.from_numpy(y_val_brand).long()
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=2)

print(f"\n✓ Data loaders created")
print(f"  -> Train batches: {len(train_loader)}")
print(f"  -> Val batches: {len(val_loader)}")

# --- INITIALIZE MODEL ---
model = SimpleFusionNet(
    text_dim=X_text_train.shape[1],
    image_dim=X_image_train.shape[1],
    num_categories=num_categories,
    num_brands=num_brands
).to(DEVICE)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

price_mse = nn.MSELoss()
cat_ce = nn.CrossEntropyLoss()
brand_ce = nn.CrossEntropyLoss()

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\n✓ Model initialized")
print(f"  -> Trainable parameters: {total_params:,}")

# --- TRAINING LOOP ---
print("\n" + "="*80)
print("STARTING FAST TRAINING")
print("="*80)

best_val_smape = float('inf')
patience_counter = 0

for epoch in range(EPOCHS):
    # Training
    model.train()
    epoch_loss = 0
    num_batches = 0
    
    for text_emb, image_emb, price_t, cat_t, brand_t in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        text_emb = text_emb.to(DEVICE)
        image_emb = image_emb.to(DEVICE)
        price_t = price_t.to(DEVICE)
        cat_t = cat_t.to(DEVICE)
        brand_t = brand_t.to(DEVICE)
        
        optimizer.zero_grad()
        
        price_p, cat_p, brand_p, text_proj, image_proj = model(text_emb, image_emb)
        
        # Combined loss
        loss = (0.5 * smape_loss(torch.expm1(price_p), torch.expm1(price_t)) + 
                0.1 * price_mse(price_p, price_t)) + \
               (0.15 * cat_ce(cat_p, cat_t)) + \
               (0.15 * brand_ce(brand_p, brand_t)) + \
               (0.1 * info_nce_loss(text_proj, image_proj))
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        epoch_loss += loss.item()
        num_batches += 1
    
    avg_train_loss = epoch_loss / num_batches
    
    # Validation
    model.eval()
    val_preds_log = []
    
    with torch.no_grad():
        for text_emb, image_emb, _, _, _ in val_loader:
            text_emb = text_emb.to(DEVICE)
            image_emb = image_emb.to(DEVICE)
            
            price_p, _, _, _, _ = model(text_emb, image_emb)
            val_preds_log.append(price_p.cpu().numpy())
    
    val_preds_log = np.concatenate(val_preds_log)
    val_preds_linear = np.expm1(val_preds_log)
    val_smape = smape_safe(y_val_price, val_preds_linear)
    
    scheduler.step(val_smape)
    
    print(f"\nEpoch {epoch+1}/{EPOCHS} | Train Loss: {avg_train_loss:.4f} | Val SMAPE: {val_smape:.4f}")
    
    if val_smape < best_val_smape:
        best_val_smape = val_smape
        patience_counter = 0
        torch.save(model.state_dict(), f'{FINAL_OUTPUT_DIR}/best_model.pth')
        print(f"  -> ✓ New best model saved! Val SMAPE: {best_val_smape:.4f}")
    else:
        patience_counter += 1
        print(f"  -> No improvement (patience: {patience_counter}/{PATIENCE})")
        if patience_counter >= PATIENCE:
            print("  -> Early stopping triggered.")
            break

print("\n" + "="*80)
print(f"TRAINING COMPLETE - Best Val SMAPE: {best_val_smape:.4f}")
print("="*80)

# --- FINAL PREDICTION ---
print("\n✓ Generating final predictions...")

# Load best model
model.load_state_dict(torch.load(f'{FINAL_OUTPUT_DIR}/best_model.pth'))
model.eval()

# Test predictions
test_dataset = TensorDataset(torch.from_numpy(X_text_test).float(), torch.from_numpy(X_image_test).float())
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE*2, shuffle=False)

test_preds_log = []
with torch.no_grad():
    for text_emb, image_emb in tqdm(test_loader, desc="Predicting"):
        text_emb = text_emb.to(DEVICE)
        image_emb = image_emb.to(DEVICE)
        
        price_p, _, _, _, _ = model(text_emb, image_emb)
        test_preds_log.append(price_p.cpu().numpy())

test_preds_log = np.concatenate(test_preds_log)
final_test_preds = np.expm1(test_preds_log)

# Create submission
test_df = pd.read_csv(TEST_CSV_PATH)
if 'sample_id' not in test_df.columns:
    test_df['sample_id'] = test_df.index

submission = pd.DataFrame({
    'sample_id': test_df['sample_id'], 
    'price': final_test_preds
})

submission_path = f'{FINAL_OUTPUT_DIR}/submission.csv'
submission.to_csv(submission_path, index=False)

print("\n" + "="*80)
print("SUBMISSION COMPLETE")
print("="*80)
print(f"✓ Best Validation SMAPE: {best_val_smape:.4f}")
print(f"✓ Test predictions: {len(final_test_preds):,}")
print(f"✓ Price range: [{final_test_preds.min():.2f}, {final_test_preds.max():.2f}]")
print(f"✓ Submission file: {submission_path}")
print("="*80)


HIERARCHICAL FUSION - USING PRE-COMPUTED EMBEDDINGS
Device: cuda
PyTorch version: 2.6.0+cu124

✓ Loading pre-computed embeddings...
  -> From: /kaggle/input/aml_new/keras/default/1/output_v4_advanced/embeddings
  -> Text embeddings: Train (60000, 768), Val (15000, 768), Test (75000, 768)
  -> Image embeddings: Train (60000, 1024), Val (15000, 1024), Test (75000, 1024)

✓ Loading labels and indices...
  -> Train: 60000 samples
  -> Val: 15000 samples
  -> Categories: 3, Brands: 1

✓ Hierarchical Fusion Network architecture defined.

✓ Data loaders created
  -> Train batches: 235
  -> Val batches: 30

✓ Model initialized
  -> Trainable parameters: 5,524,485

STARTING FAST TRAINING


Epoch 1/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__

Traceback (most recent call last):
      File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
self._shutdown_workers()    
self._shutdown_workers()  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():    
if w.is_alive(): 
            ^ ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
        assert self.


Epoch 1/50 | Train Loss: 0.9923 | Val SMAPE: 0.7450
  -> ✓ New best model saved! Val SMAPE: 0.7450


Epoch 2/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
Traceback (most recent call last):
^^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^    self._shutdown_workers()^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
^    ^if w.is_alive():^^
 ^
   File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
       assert self._parent_pid == os.getpid(), 'can only test a child process' 
  ^ ^ ^ ^ ^ ^ ^^ ^ ^^  ^^
^^  File "/


Epoch 2/50 | Train Loss: 0.9152 | Val SMAPE: 0.6525
  -> ✓ New best model saved! Val SMAPE: 0.6525


Epoch 3/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
^^Traceback (most recent call last):
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^    ^self._shutdown_workers()^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

    if w.is_alive():  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive

     assert self._parent_pid == os.getpid(), 'can only test a child process' 
          ^ ^ ^^^ ^ ^ ^ ^^^^^^^^
^  File 


Epoch 3/50 | Train Loss: 0.8255 | Val SMAPE: 0.6223
  -> ✓ New best model saved! Val SMAPE: 0.6223


Epoch 4/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
Exception ignored in:     <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>assert self._parent_pid == os.getpid(), 'can only test a child process'

 Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
      self._shutdown_workers() 
   File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
       if w.is_alive(): 
    ^ ^ ^^ ^  ^^^^^^^^^^^^^^^^^^^^


Epoch 4/50 | Train Loss: 0.7962 | Val SMAPE: 0.6128
  -> ✓ New best model saved! Val SMAPE: 0.6128


Epoch 5/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
Traceback (most recent call last):
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^    ^self._shutdown_workers()

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        assert self._parent_pid == os.getpid(), 'can only test a child process'if w.is_alive():

               ^ ^ ^ ^^^^^^^^^^^^^^^^^
^  F


Epoch 5/50 | Train Loss: 0.7745 | Val SMAPE: 0.6098
  -> ✓ New best model saved! Val SMAPE: 0.6098


Epoch 6/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        if w.is_alive():if w.is_alive():

             ^ ^^^^^^^^^^^Exception ignored in: Exception ignored in: ^^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
^
^Traceba


Epoch 6/50 | Train Loss: 0.7596 | Val SMAPE: 0.6027
  -> ✓ New best model saved! Val SMAPE: 0.6027


Epoch 7/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       if w.is_alive(): 
     ^^ ^ ^  ^ ^^^^^^^^^^^^^
^  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
Exception ignored in:     <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^assert self


Epoch 7/50 | Train Loss: 0.7467 | Val SMAPE: 0.6133
  -> No improvement (patience: 1/10)


Epoch 8/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        if w.is_alive():
 
if w.is_alive():            ^^ ^^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>Exception ignored in: ^^
^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>Traceback (mo


Epoch 8/50 | Train Loss: 0.7343 | Val SMAPE: 0.6003
  -> ✓ New best model saved! Val SMAPE: 0.6003


Epoch 9/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()    
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
self._shutdown_workers()
    if w.is_alive():  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

     if w.is_alive(): 
          ^ ^ ^^^^^^^^^^^^^^Exception ignored in: ^^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>Exception ignored in: ^^^
<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^^
T


Epoch 9/50 | Train Loss: 0.7249 | Val SMAPE: 0.5972
  -> ✓ New best model saved! Val SMAPE: 0.5972


Exception ignored in: 

Epoch 10/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        if w.is_alive():if w.is_alive():

              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
        assert self._parent_pid == os.getp


Epoch 10/50 | Train Loss: 0.7144 | Val SMAPE: 0.6024
  -> No improvement (patience: 1/10)


Epoch 11/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        if w.is_alive():if w.is_alive():

             ^ ^^^^^^^^^^^^^^^^^^^^^^^

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
        assert self.


Epoch 11/50 | Train Loss: 0.7071 | Val SMAPE: 0.5888
  -> ✓ New best model saved! Val SMAPE: 0.5888


Epoch 12/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
Exception ignored in:   File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
    Traceback (most recent call last):
self._shutdown_workers()  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
self._shutdown_workers()    
if w.is_alive():  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

       if w.is_alive(): 
       ^ ^ ^ ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
^    ^assert self._parent_pid == os.getpid(), 'can only test a child process'^

   File "/usr/lib/pytho


Epoch 12/50 | Train Loss: 0.7014 | Val SMAPE: 0.5927
  -> No improvement (patience: 1/10)


Epoch 13/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^Exception ignored in: ^^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
^Traceback (most recent call last):
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^    ^
self._shutdown_workers()  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive

      File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
assert self._parent_pid == os.getpid(), 'can only test a child process'    
if w.is_alive(): 
              ^ ^^ ^ ^^^^^^^^^^^^^^^^^^
^ 


Epoch 13/50 | Train Loss: 0.6958 | Val SMAPE: 0.5907
  -> No improvement (patience: 2/10)


Epoch 14/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^^
Traceback (most recent call last):
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^    ^self._shutdown_workers()^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
^^    
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
if w.is_alive():
    assert self._parent_pid == os.getpid(), 'can only test a child process' 
            ^^ ^ ^ ^ ^^ ^^^^^^^^^^
  File 


Epoch 14/50 | Train Loss: 0.6918 | Val SMAPE: 0.5868
  -> ✓ New best model saved! Val SMAPE: 0.5868


Epoch 15/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
^^Traceback (most recent call last):
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^    ^self._shutdown_workers()^
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
        if w.is_alive():assert self._parent_pid == os.getpid(), 'can only test a child process'

              ^ ^ ^ ^ ^^^^^^^^^^^^^^^^
^  F


Epoch 15/50 | Train Loss: 0.6855 | Val SMAPE: 0.5964
  -> No improvement (patience: 1/10)


Epoch 16/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
^Traceback (most recent call last):

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
        self._shutdown_workers()assert self._parent_pid == os.getpid(), 'can only test a child process'

   File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
       if w.is_alive(): 
           ^ ^ ^^^^^^^^^^^^^^^^^^^^^^^


Epoch 16/50 | Train Loss: 0.6806 | Val SMAPE: 0.5873
  -> No improvement (patience: 2/10)


Epoch 17/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
^Traceback (most recent call last):
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^    ^self._shutdown_workers()

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        assert self._parent_pid == os.getpid(), 'can only test a child process'if w.is_alive():

                 ^ ^^^^^^^^^^^^^^^^^^^^^^
 


Epoch 17/50 | Train Loss: 0.6788 | Val SMAPE: 0.5894
  -> No improvement (patience: 3/10)


Epoch 18/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        if w.is_alive():if w.is_alive():

             ^ ^^^^^^^^^^^^^^^^^^^^^^
^Exception ignored in:   File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    
assert self._parent_pid == os.getpid(), 'can only test a child process'  


Epoch 18/50 | Train Loss: 0.6744 | Val SMAPE: 0.5838
  -> ✓ New best model saved! Val SMAPE: 0.5838


Epoch 19/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()
self._shutdown_workers()  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

    if w.is_alive():  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
 
      if w.is_alive():  
      ^  ^ ^^^^^^^^^^^^^^^Exception ignored in: ^^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
^^
^Traceback (most recent call last):
Exception ignored in: ^  File "/usr/lib/python3.11/mult


Epoch 19/50 | Train Loss: 0.6683 | Val SMAPE: 0.5841
  -> No improvement (patience: 1/10)


Epoch 20/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Exception ignored in: Traceback (most recent call last):
<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__

    Traceback (most recent call last):
self._shutdown_workers()  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__

      File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
self._shutdown_workers()
    if w.is_alive():  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

     if w.is_alive(): 
            ^^^^^^^^^^^^Exception ignored in: ^^Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
^
^^Trac


Epoch 20/50 | Train Loss: 0.6676 | Val SMAPE: 0.5951
  -> No improvement (patience: 2/10)


Epoch 21/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__

    Traceback (most recent call last):
self._shutdown_workers()  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__

      File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
self._shutdown_workers()    if w.is_alive():

   File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
         
if w.is_alive(): ^^^ ^ ^ ^  ^^ ^Exception ignored in:  ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
^Exception ignored in: ^^Traceback (most recent call last):
^<function _MultiProcessingDataLoaderIter.__


Epoch 21/50 | Train Loss: 0.6621 | Val SMAPE: 0.5869
  -> No improvement (patience: 3/10)


Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__


Epoch 22/50:   0%|          | 0/235 [00:00<?, ?it/s]

        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        if w.is_alive():if w.is_alive():

              ^^^^^^^^^^^^^^^^^^^^^^^^

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
      File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
assert self._parent_pid == os.getpid(), 'can only test a child process'
     assert self._parent_pid == os.getpid(), 'can only test a child process' 
                 ^ ^ ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError^: can only test a child process
^^
Exception ignored in: AssertionError<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
: Traceback (most recent call last):
can only test a child process  File "/usr/local/lib


Epoch 22/50 | Train Loss: 0.6620 | Val SMAPE: 0.5936
  -> No improvement (patience: 4/10)


Epoch 23/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
 
 Traceback (most recent call last):
   File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
       self._shutdown_workers()
 ^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
^^^    ^if w.is_alive():^
^^ ^ ^ ^^ 
   File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
     assert self._parent_pid == os.getpid(), 'can only test a child process' ^
^ ^ ^^ ^ ^^  ^ ^^ ^ 
   File "/usr/


Epoch 23/50 | Train Loss: 0.6424 | Val SMAPE: 0.5816
  -> ✓ New best model saved! Val SMAPE: 0.5816


Epoch 24/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
^Traceback (most recent call last):
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
        self._shutdown_workers()
assert self._parent_pid == os.getpid(), 'can only test a child process'
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
      if w.is_alive(): 
              ^ ^^^^^^^^^^^^^^^^^^^^^^
^


Epoch 24/50 | Train Loss: 0.6379 | Val SMAPE: 0.5804
  -> ✓ New best model saved! Val SMAPE: 0.5804


Epoch 25/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
^Traceback (most recent call last):
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
        self._shutdown_workers()
assert self._parent_pid == os.getpid(), 'can only test a child process'  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

      if w.is_alive(): 
               ^^^^^^^^^^^^^^^^^^^^^^^^^


Epoch 25/50 | Train Loss: 0.6337 | Val SMAPE: 0.5838
  -> No improvement (patience: 1/10)


Epoch 26/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
^Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^    ^^self._shutdown_workers()

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    assert self._parent_pid == os.getpid(), 'can only test a child process'    
if w.is_alive(): 
                ^^ ^^^^^^^^^^^^^^^^^^^^
^ 


Epoch 26/50 | Train Loss: 0.6315 | Val SMAPE: 0.5903
  -> No improvement (patience: 2/10)


Epoch 27/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
^Traceback (most recent call last):
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^    ^self._shutdown_workers()^
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

      File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    if w.is_alive():assert self._parent_pid == os.getpid(), 'can only test a child process'

               ^ ^^ ^ ^^^^^^^^^^^^^^^^^^
^ 


Epoch 27/50 | Train Loss: 0.6295 | Val SMAPE: 0.5863
  -> No improvement (patience: 3/10)


Epoch 28/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^Exception ignored in: ^^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^
^Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^
      File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    self._shutdown_workers()assert self._parent_pid == os.getpid(), 'can only test a child process'

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
       if w.is_alive(): 
              ^^^^^^^^^^^^^^^^^^^^^^^^^


Epoch 28/50 | Train Loss: 0.6265 | Val SMAPE: 0.5844
  -> No improvement (patience: 4/10)


Epoch 29/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^^^
^Traceback (most recent call last):

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
        self._shutdown_workers()
assert self._parent_pid == os.getpid(), 'can only test a child process'
   File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
     if w.is_alive():  
              ^^^^^^^^^^^^^^^^^^^^^^
^ 


Epoch 29/50 | Train Loss: 0.6142 | Val SMAPE: 0.5842
  -> No improvement (patience: 5/10)


Epoch 30/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        if w.is_alive():
if w.is_alive():
            ^ ^ ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    assert self._par


Epoch 30/50 | Train Loss: 0.6115 | Val SMAPE: 0.5792
  -> ✓ New best model saved! Val SMAPE: 0.5792


Epoch 31/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
      File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
self._shutdown_workers()
      File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
self._shutdown_workers()    
if w.is_alive():  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

     if w.is_alive():
             ^^^^^^^^^^^^^^^^^^^^^^^

^  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
Exception ignored in:       File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_


Epoch 31/50 | Train Loss: 0.6086 | Val SMAPE: 0.5809
  -> No improvement (patience: 1/10)


Epoch 32/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        if w.is_alive():
if w.is_alive():
           ^ ^^ ^ Exception ignored in: ^^^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^^
^^^Traceback (most recent call last):
^Exception ignored in: ^^  File "/usr/local/lib/python3.11/dist-pac


Epoch 32/50 | Train Loss: 0.6072 | Val SMAPE: 0.5818
  -> No improvement (patience: 2/10)


Epoch 33/50:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60><function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()    
self._shutdown_workers()  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        if w.is_alive():
if w.is_alive():  
       ^^ ^^  ^  ^^^^^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e13196cd60>^^
Exception ignored in: ^^Traceback (most recent call last):
<function _MultiProcessingDataLoaderIt


Epoch 40/50 | Train Loss: 0.5886 | Val SMAPE: 0.5814
  -> No improvement (patience: 10/10)
  -> Early stopping triggered.

TRAINING COMPLETE - Best Val SMAPE: 0.5792

✓ Generating final predictions...


Predicting:   0%|          | 0/147 [00:00<?, ?it/s]


SUBMISSION COMPLETE
✓ Best Validation SMAPE: 0.5792
✓ Test predictions: 75,000
✓ Price range: [2.09, 122.72]
✓ Submission file: /kaggle/working/output_v9_hierarchical_fast/submission.csv


In [4]:
import kagglehub

kagglehub.login()

# Replace with path to directory containing model files.
LOCAL_MODEL_DIR = '/kaggle/working/'

MODEL_SLUG = 'aml_hirercial' # Replace with model slug.

# Learn more about naming model variations at
# https://www.kaggle.com/docs/models#name-model.
VARIATION_SLUG = 'default' # Replace with variation slug.

kagglehub.model_upload(
  handle = f"kartikgarg74/{MODEL_SLUG}/keras/{VARIATION_SLUG}",
  local_model_dir = LOCAL_MODEL_DIR,
  version_notes = 'Update 2025-10-12')

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Uploading Model https://www.kaggle.com/models/kartikgarg74/aml_hirercial/keras/default ...
Starting upload for file /kaggle/working/output_v9_hierarchical_fast/best_model.pth


Uploading: 100%|██████████| 22.1M/22.1M [00:00<00:00, 37.5MB/s]

Upload successful: /kaggle/working/output_v9_hierarchical_fast/best_model.pth (21MB)
Starting upload for file /kaggle/working/output_v9_hierarchical_fast/submission.csv



Uploading: 100%|██████████| 1.22M/1.22M [00:00<00:00, 2.95MB/s]

Upload successful: /kaggle/working/output_v9_hierarchical_fast/submission.csv (1MB)





Your model instance version has been created.
Files are being processed...
See at: https://www.kaggle.com/models/kartikgarg74/aml_hirercial/keras/default


In [5]:
# ============================================================================
# COMPREHENSIVE FUSION COMPARISON WITH HYPERPARAMETER TUNING
# ============================================================================
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm.auto import tqdm
import warnings
from sklearn.preprocessing import LabelEncoder
import json
import gc

warnings.filterwarnings('ignore')

print("="*80)
print("COMPREHENSIVE FUSION COMPARISON - 3 METHODS")
print("="*80)

# --- CONFIGURATION ---
EMBEDDINGS_PATH = '/kaggle/input/aml_new/keras/default/1/output_v4_advanced/embeddings'
V4_OUTPUT_PATH = '/kaggle/input/aml_new/keras/default/1/output_v4_advanced'
FINAL_OUTPUT_DIR = '/kaggle/working/output_v10_fusion_comparison'
TRAIN_CSV_PATH = "/kaggle/input/aml-csv/train.csv"
TEST_CSV_PATH = "/kaggle/input/aml-csv/test.csv"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 256
BASE_LR = 3e-4
EPOCHS = 60
PATIENCE = 12

os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)

print(f"Device: {DEVICE}")

# --- LOSS FUNCTIONS ---
def smape_loss(y_pred, y_true, eps=1e-9):
    numerator = torch.abs(y_pred - y_true)
    denominator = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0
    return torch.mean(numerator / (denominator + eps))

def smape_safe(y_true, y_pred, eps=1e-9):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(numerator / (denominator + eps)) * 100

def info_nce_loss(text_embeds, image_embeds, temperature=0.07):
    text_embeds = F.normalize(text_embeds, p=2, dim=-1)
    image_embeds = F.normalize(image_embeds, p=2, dim=-1)
    logits = torch.matmul(text_embeds, image_embeds.T) / temperature
    labels = torch.arange(len(logits), device=logits.device)
    loss_i = F.cross_entropy(logits, labels)
    loss_t = F.cross_entropy(logits.T, labels)
    return (loss_i + loss_t) / 2.0

# --- LOAD DATA ---
print("\n✓ Loading pre-computed embeddings...")
X_text_train = np.load(f'{EMBEDDINGS_PATH}/text_train.npy')
X_text_val = np.load(f'{EMBEDDINGS_PATH}/text_val.npy')
X_text_test = np.load(f'{EMBEDDINGS_PATH}/text_test.npy')
X_image_train = np.load(f'{EMBEDDINGS_PATH}/image_train.npy')
X_image_val = np.load(f'{EMBEDDINGS_PATH}/image_val.npy')
X_image_test = np.load(f'{EMBEDDINGS_PATH}/image_test.npy')

train_indices = pd.read_csv(f'{V4_OUTPUT_PATH}/train_indices.csv')['train_idx'].values
val_indices = pd.read_csv(f'{V4_OUTPUT_PATH}/val_indices.csv')['val_idx'].values

train_df_full = pd.read_csv(TRAIN_CSV_PATH)
train_df = train_df_full.iloc[train_indices].reset_index(drop=True)
val_df = train_df_full.iloc[val_indices].reset_index(drop=True)

y_train_price = train_df['price'].values
y_val_price = val_df['price'].values
y_train_price_log = np.log1p(y_train_price)
y_val_price_log = np.log1p(y_val_price)

def extract_category_simple(text):
    text = str(text).lower()
    if 'electronic' in text: return 'electronics'
    if 'book' in text: return 'books'
    return 'other'

def extract_brand_simple(text):
    return str(text).split()[0] if text else 'unknown'

train_df['category'] = train_df['catalog_content'].apply(extract_category_simple)
val_df['category'] = val_df['catalog_content'].apply(extract_category_simple)
train_df['brand'] = train_df['catalog_content'].apply(extract_brand_simple)
val_df['brand'] = val_df['catalog_content'].apply(extract_brand_simple)

cat_encoder = LabelEncoder().fit(pd.concat([train_df['category'], val_df['category']]))
brand_encoder = LabelEncoder().fit(pd.concat([train_df['brand'], val_df['brand']]))

y_train_cat = cat_encoder.transform(train_df['category'])
y_val_cat = cat_encoder.transform(val_df['category'])
y_train_brand = brand_encoder.transform(train_df['brand'])
y_val_brand = brand_encoder.transform(val_df['brand'])

num_categories = len(cat_encoder.classes_)
num_brands = len(brand_encoder.classes_)

print(f"  -> Train: {len(train_df)}, Val: {len(val_df)}")
print(f"  -> Text dim: {X_text_train.shape[1]}, Image dim: {X_image_train.shape[1]}")

# --- FUSION ARCHITECTURES ---

# 1. MULTIPLICATIVE FUSION
class MultiplicativeFusion(nn.Module):
    """Concat + Element-wise Product Fusion"""
    def __init__(self, text_dim, image_dim, fusion_dim=512, num_categories=10, num_brands=101):
        super().__init__()
        # Project to same dimension for multiplication
        self.text_proj = nn.Linear(text_dim, fusion_dim)
        self.image_proj = nn.Linear(image_dim, fusion_dim)
        
        # Fusion MLP (concat + product)
        self.fusion_mlp = nn.Sequential(
            nn.LayerNorm(fusion_dim * 3),  # text + image + product
            nn.Linear(fusion_dim * 3, fusion_dim * 2),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(fusion_dim * 2, fusion_dim),
            nn.GELU(),
            nn.Dropout(0.1)
        )
        
        # Prediction heads
        self.price_head = nn.Linear(fusion_dim, 1)
        self.category_head = nn.Linear(fusion_dim, num_categories)
        self.brand_head = nn.Linear(fusion_dim, num_brands)
    
    def forward(self, text_emb, image_emb):
        t_proj = self.text_proj(text_emb)
        i_proj = self.image_proj(image_emb)
        
        # Multiplicative interaction
        mult = t_proj * i_proj
        
        # Concat all three
        fused_input = torch.cat([t_proj, i_proj, mult], dim=1)
        fused = self.fusion_mlp(fused_input)
        
        price_log = self.price_head(fused).squeeze(-1)
        category_logits = self.category_head(fused)
        brand_logits = self.brand_head(fused)
        
        return price_log, category_logits, brand_logits, t_proj, i_proj

# 2. GATED FUSION
class GatedFusion(nn.Module):
    """Learnable gating mechanism"""
    def __init__(self, text_dim, image_dim, fusion_dim=512, num_categories=10, num_brands=101):
        super().__init__()
        self.text_proj = nn.Linear(text_dim, fusion_dim)
        self.image_proj = nn.Linear(image_dim, fusion_dim)
        
        # Gating network
        self.gate_net = nn.Sequential(
            nn.Linear(fusion_dim * 2, fusion_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(fusion_dim, fusion_dim),
            nn.GELU(),
            nn.Linear(fusion_dim, 1),
            nn.Sigmoid()
        )
        
        # Additional processing
        self.post_fusion = nn.Sequential(
            nn.LayerNorm(fusion_dim),
            nn.Linear(fusion_dim, fusion_dim),
            nn.GELU(),
            nn.Dropout(0.1)
        )
        
        self.price_head = nn.Linear(fusion_dim, 1)
        self.category_head = nn.Linear(fusion_dim, num_categories)
        self.brand_head = nn.Linear(fusion_dim, num_brands)
    
    def forward(self, text_emb, image_emb):
        t_proj = self.text_proj(text_emb)
        i_proj = self.image_proj(image_emb)
        
        # Compute gate
        concat = torch.cat([t_proj, i_proj], dim=1)
        gate = self.gate_net(concat)  # (B, 1)
        
        # Gated fusion
        fused = gate * t_proj + (1 - gate) * i_proj
        fused = self.post_fusion(fused)
        
        price_log = self.price_head(fused).squeeze(-1)
        category_logits = self.category_head(fused)
        brand_logits = self.brand_head(fused)
        
        return price_log, category_logits, brand_logits, t_proj, i_proj

# 3. FiLM FUSION
class FiLMFusion(nn.Module):
    """Feature-wise Linear Modulation"""
    def __init__(self, text_dim, image_dim, fusion_dim=512, num_categories=10, num_brands=101):
        super().__init__()
        self.text_encoder = nn.Linear(text_dim, fusion_dim)
        self.image_encoder = nn.Linear(image_dim, fusion_dim)
        
        # FiLM parameters: text generates gamma and beta for image
        self.gamma_net = nn.Sequential(
            nn.Linear(fusion_dim, fusion_dim),
            nn.GELU(),
            nn.Linear(fusion_dim, fusion_dim)
        )
        self.beta_net = nn.Sequential(
            nn.Linear(fusion_dim, fusion_dim),
            nn.GELU(),
            nn.Linear(fusion_dim, fusion_dim)
        )
        
        # Post-modulation processing
        self.post_film = nn.Sequential(
            nn.LayerNorm(fusion_dim * 2),
            nn.Linear(fusion_dim * 2, fusion_dim),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(fusion_dim, fusion_dim),
            nn.GELU()
        )
        
        self.price_head = nn.Linear(fusion_dim, 1)
        self.category_head = nn.Linear(fusion_dim, num_categories)
        self.brand_head = nn.Linear(fusion_dim, num_brands)
    
    def forward(self, text_emb, image_emb):
        t_enc = self.text_encoder(text_emb)
        i_enc = self.image_encoder(image_emb)
        
        # Generate FiLM parameters from text
        gamma = self.gamma_net(t_enc)
        beta = self.beta_net(t_enc)
        
        # Modulate image with text
        modulated_image = gamma * i_enc + beta
        
        # Combine modulated image with text
        fused_input = torch.cat([t_enc, modulated_image], dim=1)
        fused = self.post_film(fused_input)
        
        price_log = self.price_head(fused).squeeze(-1)
        category_logits = self.category_head(fused)
        brand_logits = self.brand_head(fused)
        
        return price_log, category_logits, brand_logits, t_enc, i_enc

# --- TRAINING FUNCTION ---
def train_model(model, model_name, train_loader, val_loader, lr=BASE_LR, epochs=EPOCHS):
    print(f"\n{'='*80}")
    print(f"TRAINING: {model_name}")
    print(f"{'='*80}")
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)
    
    price_mse = nn.MSELoss()
    cat_ce = nn.CrossEntropyLoss()
    brand_ce = nn.CrossEntropyLoss()
    
    best_val_smape = float('inf')
    patience_counter = 0
    history = {'train_loss': [], 'val_smape': []}
    
    for epoch in range(epochs):
        # Training
        model.train()
        epoch_loss = 0
        num_batches = 0
        
        for text_emb, image_emb, price_t, cat_t, brand_t in train_loader:
            text_emb = text_emb.to(DEVICE)
            image_emb = image_emb.to(DEVICE)
            price_t = price_t.to(DEVICE)
            cat_t = cat_t.to(DEVICE)
            brand_t = brand_t.to(DEVICE)
            
            optimizer.zero_grad()
            
            price_p, cat_p, brand_p, t_proj, i_proj = model(text_emb, image_emb)
            
            # Multi-task loss
            loss = (0.5 * smape_loss(torch.expm1(price_p), torch.expm1(price_t)) + 
                    0.1 * price_mse(price_p, price_t)) + \
                   (0.15 * cat_ce(cat_p, cat_t)) + \
                   (0.15 * brand_ce(brand_p, brand_t)) + \
                   (0.1 * info_nce_loss(t_proj, i_proj))
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            
            epoch_loss += loss.item()
            num_batches += 1
        
        avg_train_loss = epoch_loss / num_batches
        history['train_loss'].append(avg_train_loss)
        
        # Validation
        model.eval()
        val_preds_log = []
        
        with torch.no_grad():
            for text_emb, image_emb, _, _, _ in val_loader:
                text_emb = text_emb.to(DEVICE)
                image_emb = image_emb.to(DEVICE)
                
                price_p, _, _, _, _ = model(text_emb, image_emb)
                val_preds_log.append(price_p.cpu().numpy())
        
        val_preds_log = np.concatenate(val_preds_log)
        val_preds_linear = np.expm1(val_preds_log)
        val_smape = smape_safe(y_val_price, val_preds_linear)
        history['val_smape'].append(val_smape)
        
        scheduler.step()
        
        if epoch % 5 == 0 or val_smape < best_val_smape:
            print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_train_loss:.4f} | Val SMAPE: {val_smape:.2f}%")
        
        if val_smape < best_val_smape:
            best_val_smape = val_smape
            patience_counter = 0
            torch.save(model.state_dict(), f'{FINAL_OUTPUT_DIR}/{model_name}_best.pth')
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(f"  -> Early stopping at epoch {epoch+1}")
                break
    
    print(f"\n{'='*40}")
    print(f"{model_name} FINAL RESULT")
    print(f"{'='*40}")
    print(f"Best Val SMAPE: {best_val_smape:.2f}%")
    print(f"{'='*40}\n")
    
    return best_val_smape, history

# --- CREATE DATASETS ---
train_dataset = TensorDataset(
    torch.from_numpy(X_text_train).float(),
    torch.from_numpy(X_image_train).float(),
    torch.from_numpy(y_train_price_log).float(),
    torch.from_numpy(y_train_cat).long(),
    torch.from_numpy(y_train_brand).long()
)

val_dataset = TensorDataset(
    torch.from_numpy(X_text_val).float(),
    torch.from_numpy(X_image_val).float(),
    torch.from_numpy(y_val_price_log).float(),
    torch.from_numpy(y_val_cat).long(),
    torch.from_numpy(y_val_brand).long()
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=2)

# --- TRAIN ALL 3 MODELS ---
results = {}

# 1. Multiplicative Fusion
model1 = MultiplicativeFusion(
    X_text_train.shape[1], X_image_train.shape[1],
    num_categories=num_categories, num_brands=num_brands
).to(DEVICE)
smape1, hist1 = train_model(model1, "MultiplicativeFusion", train_loader, val_loader)
results['MultiplicativeFusion'] = {'smape': smape1, 'history': hist1}
del model1; gc.collect(); torch.cuda.empty_cache()

# 2. Gated Fusion
model2 = GatedFusion(
    X_text_train.shape[1], X_image_train.shape[1],
    num_categories=num_categories, num_brands=num_brands
).to(DEVICE)
smape2, hist2 = train_model(model2, "GatedFusion", train_loader, val_loader)
results['GatedFusion'] = {'smape': smape2, 'history': hist2}
del model2; gc.collect(); torch.cuda.empty_cache()

# 3. FiLM Fusion
model3 = FiLMFusion(
    X_text_train.shape[1], X_image_train.shape[1],
    num_categories=num_categories, num_brands=num_brands
).to(DEVICE)
smape3, hist3 = train_model(model3, "FiLMFusion", train_loader, val_loader)
results['FiLMFusion'] = {'smape': smape3, 'history': hist3}
del model3; gc.collect(); torch.cuda.empty_cache()

# --- FINAL COMPARISON ---
print("\n" + "="*80)
print("FINAL COMPARISON OF ALL 3 FUSION METHODS")
print("="*80)
print(f"{'Method':<25} {'Val SMAPE':<15}")
print("-" * 80)
for name, res in results.items():
    print(f"{name:<25} {res['smape']:<15.2f}%")
print("="*80)

# Save results
with open(f'{FINAL_OUTPUT_DIR}/comparison_results.json', 'w') as f:
    json.dump({k: {'smape': float(v['smape'])} for k, v in results.items()}, f, indent=2)

# Find best model
best_method = min(results.items(), key=lambda x: x[1]['smape'])
print(f"\n🏆 WINNER: {best_method[0]} with {best_method[1]['smape']:.2f}% SMAPE")

# --- GENERATE FINAL PREDICTIONS WITH BEST MODEL ---
print(f"\n✓ Generating final predictions with {best_method[0]}...")

# Reload best model
if best_method[0] == 'MultiplicativeFusion':
    final_model = MultiplicativeFusion(X_text_train.shape[1], X_image_train.shape[1], num_categories=num_categories, num_brands=num_brands).to(DEVICE)
elif best_method[0] == 'GatedFusion':
    final_model = GatedFusion(X_text_train.shape[1], X_image_train.shape[1], num_categories=num_categories, num_brands=num_brands).to(DEVICE)
else:
    final_model = FiLMFusion(X_text_train.shape[1], X_image_train.shape[1], num_categories=num_categories, num_brands=num_brands).to(DEVICE)

final_model.load_state_dict(torch.load(f'{FINAL_OUTPUT_DIR}/{best_method[0]}_best.pth'))
final_model.eval()

test_dataset = TensorDataset(torch.from_numpy(X_text_test).float(), torch.from_numpy(X_image_test).float())
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE*2, shuffle=False)

test_preds_log = []
with torch.no_grad():
    for text_emb, image_emb in tqdm(test_loader, desc="Predicting"):
        text_emb = text_emb.to(DEVICE)
        image_emb = image_emb.to(DEVICE)
        price_p, _, _, _, _ = final_model(text_emb, image_emb)
        test_preds_log.append(price_p.cpu().numpy())

final_test_preds = np.expm1(np.concatenate(test_preds_log))

test_df = pd.read_csv(TEST_CSV_PATH)
if 'sample_id' not in test_df.columns:
    test_df['sample_id'] = test_df.index

submission = pd.DataFrame({'sample_id': test_df['sample_id'], 'price': final_test_preds})
submission.to_csv(f'{FINAL_OUTPUT_DIR}/submission_best.csv', index=False)

print(f"\n✓ Submission saved: {FINAL_OUTPUT_DIR}/submission_best.csv")
print(f"✓ Best method: {best_method[0]} ({best_method[1]['smape']:.2f}% SMAPE)")


COMPREHENSIVE FUSION COMPARISON - 3 METHODS
Device: cuda

✓ Loading pre-computed embeddings...
  -> Train: 60000, Val: 15000
  -> Text dim: 768, Image dim: 1024

TRAINING: MultiplicativeFusion
Epoch 1/60 | Loss: 0.9437 | Val SMAPE: 60.42%
Epoch 2/60 | Loss: 0.8530 | Val SMAPE: 60.10%
Epoch 3/60 | Loss: 0.8076 | Val SMAPE: 59.49%
Epoch 4/60 | Loss: 0.7685 | Val SMAPE: 57.49%
Epoch 5/60 | Loss: 0.7304 | Val SMAPE: 56.84%
Epoch 6/60 | Loss: 0.6942 | Val SMAPE: 56.65%
Epoch 7/60 | Loss: 0.6606 | Val SMAPE: 56.08%
Epoch 8/60 | Loss: 0.6317 | Val SMAPE: 55.79%
Epoch 9/60 | Loss: 0.6110 | Val SMAPE: 55.64%
Epoch 11/60 | Loss: 0.6592 | Val SMAPE: 58.41%
Epoch 16/60 | Loss: 0.5882 | Val SMAPE: 57.55%
Epoch 21/60 | Loss: 0.5194 | Val SMAPE: 56.00%
  -> Early stopping at epoch 21

MultiplicativeFusion FINAL RESULT
Best Val SMAPE: 55.64%


TRAINING: GatedFusion
Epoch 1/60 | Loss: 0.9296 | Val SMAPE: 60.80%
Epoch 2/60 | Loss: 0.8562 | Val SMAPE: 58.46%
Epoch 3/60 | Loss: 0.8110 | Val SMAPE: 58.40%


Predicting:   0%|          | 0/147 [00:00<?, ?it/s]


✓ Submission saved: /kaggle/working/output_v10_fusion_comparison/submission_best.csv
✓ Best method: GatedFusion (55.64% SMAPE)


In [6]:
import kagglehub

kagglehub.login()

# Replace with path to directory containing model files.
LOCAL_MODEL_DIR = '/kaggle/working/'

MODEL_SLUG = 'aml_all_new' # Replace with model slug.

# Learn more about naming model variations at
# https://www.kaggle.com/docs/models#name-model.
VARIATION_SLUG = 'default' # Replace with variation slug.

kagglehub.model_upload(
  handle = f"kartikgarg74/{MODEL_SLUG}/keras/{VARIATION_SLUG}",
  local_model_dir = LOCAL_MODEL_DIR,
  version_notes = 'Update 2025-10-13')

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Uploading Model https://www.kaggle.com/models/kartikgarg74/aml_all_new/keras/default ...
Model 'aml_all_new' does not exist or access is forbidden for user 'kartikgarg74'. Creating or handling Model...
Model 'aml_all_new' Created.
Starting upload for file /kaggle/working/output_v10_fusion_comparison/submission_best.csv


Uploading: 100%|██████████| 1.22M/1.22M [00:00<00:00, 2.85MB/s]

Upload successful: /kaggle/working/output_v10_fusion_comparison/submission_best.csv (1MB)
Starting upload for file /kaggle/working/output_v10_fusion_comparison/MultiplicativeFusion_best.pth



Uploading: 100%|██████████| 12.1M/12.1M [00:00<00:00, 23.4MB/s]

Upload successful: /kaggle/working/output_v10_fusion_comparison/MultiplicativeFusion_best.pth (12MB)
Starting upload for file /kaggle/working/output_v10_fusion_comparison/FiLMFusion_best.pth



Uploading: 100%|██████████| 11.1M/11.1M [00:00<00:00, 21.1MB/s]

Upload successful: /kaggle/working/output_v10_fusion_comparison/FiLMFusion_best.pth (11MB)
Starting upload for file /kaggle/working/output_v10_fusion_comparison/GatedFusion_best.pth



Uploading: 100%|██████████| 7.90M/7.90M [00:00<00:00, 16.3MB/s]

Upload successful: /kaggle/working/output_v10_fusion_comparison/GatedFusion_best.pth (8MB)
Starting upload for file /kaggle/working/output_v10_fusion_comparison/comparison_results.json



Uploading: 100%|██████████| 177/177 [00:00<00:00, 430B/s]

Upload successful: /kaggle/working/output_v10_fusion_comparison/comparison_results.json (177B)
Starting upload for file /kaggle/working/output_v9_hierarchical_fast/best_model.pth



Uploading: 100%|██████████| 22.1M/22.1M [00:00<00:00, 34.2MB/s]

Upload successful: /kaggle/working/output_v9_hierarchical_fast/best_model.pth (21MB)
Starting upload for file /kaggle/working/output_v9_hierarchical_fast/submission.csv



Uploading: 100%|██████████| 1.22M/1.22M [00:00<00:00, 2.84MB/s]

Upload successful: /kaggle/working/output_v9_hierarchical_fast/submission.csv (1MB)





Your model instance has been created.
Files are being processed...
See at: https://www.kaggle.com/models/kartikgarg74/aml_all_new/keras/default


In [9]:
# ============================================================================
# OPTIMIZED GATED FUSION - FIXED (NO MULTIPROCESSING ERRORS)
# ============================================================================
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.swa_utils import AveragedModel, SWALR
from tqdm.auto import tqdm
import warnings
from sklearn.preprocessing import LabelEncoder
import gc

warnings.filterwarnings('ignore')

print("="*80)
print("OPTIMIZED GATED FUSION - CLEAN VERSION")
print("="*80)

# --- CONFIGURATION ---
EMBEDDINGS_PATH = '/kaggle/input/aml_new/keras/default/1/output_v4_advanced/embeddings'
V4_OUTPUT_PATH = '/kaggle/input/aml_new/keras/default/1/output_v4_advanced'
FINAL_OUTPUT_DIR = '/kaggle/working/output_v11_gated_clean'
TRAIN_CSV_PATH = "/kaggle/input/aml-csv/train.csv"
TEST_CSV_PATH = "/kaggle/input/aml-csv/test.csv"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 256
BASE_LR = 3e-4
WARMUP_EPOCHS = 5
EPOCHS = 80
PATIENCE = 15
SWA_START = 50
MIXUP_ALPHA = 0.2

os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
print(f"Device: {DEVICE}")

# --- LOSS FUNCTIONS ---
def smape_loss(y_pred, y_true, eps=1e-9):
    numerator = torch.abs(y_pred - y_true)
    denominator = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0
    return torch.mean(numerator / (denominator + eps))

def smape_safe(y_true, y_pred, eps=1e-9):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(numerator / (denominator + eps)) * 100

def info_nce_loss(text_embeds, image_embeds, temperature=0.07):
    text_embeds = F.normalize(text_embeds, p=2, dim=-1)
    image_embeds = F.normalize(image_embeds, p=2, dim=-1)
    logits = torch.matmul(text_embeds, image_embeds.T) / temperature
    labels = torch.arange(len(logits), device=logits.device)
    loss_i = F.cross_entropy(logits, labels)
    loss_t = F.cross_entropy(logits.T, labels)
    return (loss_i + loss_t) / 2.0

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
    
    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()

def mixup_data(text_emb, image_emb, price, cat, brand, alpha=0.2):
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1
    
    batch_size = text_emb.size(0)
    index = torch.randperm(batch_size).to(DEVICE)
    
    mixed_text = lam * text_emb + (1 - lam) * text_emb[index]
    mixed_image = lam * image_emb + (1 - lam) * image_emb[index]
    mixed_price = lam * price + (1 - lam) * price[index]
    
    return mixed_text, mixed_image, mixed_price, cat, brand

# --- LOAD DATA ---
print("\n✓ Loading embeddings...")
X_text_train = np.load(f'{EMBEDDINGS_PATH}/text_train.npy')
X_text_val = np.load(f'{EMBEDDINGS_PATH}/text_val.npy')
X_text_test = np.load(f'{EMBEDDINGS_PATH}/text_test.npy')
X_image_train = np.load(f'{EMBEDDINGS_PATH}/image_train.npy')
X_image_val = np.load(f'{EMBEDDINGS_PATH}/image_val.npy')
X_image_test = np.load(f'{EMBEDDINGS_PATH}/image_test.npy')

train_indices = pd.read_csv(f'{V4_OUTPUT_PATH}/train_indices.csv')['train_idx'].values
val_indices = pd.read_csv(f'{V4_OUTPUT_PATH}/val_indices.csv')['val_idx'].values
train_df_full = pd.read_csv(TRAIN_CSV_PATH)
train_df = train_df_full.iloc[train_indices].reset_index(drop=True)
val_df = train_df_full.iloc[val_indices].reset_index(drop=True)

y_train_price = train_df['price'].values
y_val_price = val_df['price'].values
y_train_price_log = np.log1p(y_train_price)
y_val_price_log = np.log1p(y_val_price)

def extract_category_simple(text):
    text = str(text).lower()
    if 'electronic' in text: return 'electronics'
    if 'book' in text: return 'books'
    return 'other'

def extract_brand_simple(text):
    return str(text).split()[0] if text else 'unknown'

train_df['category'] = train_df['catalog_content'].apply(extract_category_simple)
val_df['category'] = val_df['catalog_content'].apply(extract_category_simple)
train_df['brand'] = train_df['catalog_content'].apply(extract_brand_simple)
val_df['brand'] = val_df['catalog_content'].apply(extract_brand_simple)

cat_encoder = LabelEncoder().fit(pd.concat([train_df['category'], val_df['category']]))
brand_encoder = LabelEncoder().fit(pd.concat([train_df['brand'], val_df['brand']]))

y_train_cat = cat_encoder.transform(train_df['category'])
y_val_cat = cat_encoder.transform(val_df['category'])
y_train_brand = brand_encoder.transform(train_df['brand'])
y_val_brand = brand_encoder.transform(val_df['brand'])

num_categories = len(cat_encoder.classes_)
num_brands = len(brand_encoder.classes_)
print(f"  -> Train: {len(train_df)}, Val: {len(val_df)}")

# --- MODEL ---
class OptimizedGatedFusion(nn.Module):
    def __init__(self, text_dim, image_dim, fusion_dim=512, num_categories=10, num_brands=101):
        super().__init__()
        self.text_proj = nn.Sequential(
            nn.Linear(text_dim, fusion_dim),
            nn.LayerNorm(fusion_dim),
            nn.GELU(),
            nn.Dropout(0.15)
        )
        self.image_proj = nn.Sequential(
            nn.Linear(image_dim, fusion_dim),
            nn.LayerNorm(fusion_dim),
            nn.GELU(),
            nn.Dropout(0.15)
        )
        self.gate_net = nn.Sequential(
            nn.Linear(fusion_dim * 2, fusion_dim),
            nn.LayerNorm(fusion_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(fusion_dim, fusion_dim // 2),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(fusion_dim // 2, 1),
            nn.Sigmoid()
        )
        self.residual_weight = nn.Parameter(torch.tensor(0.5))
        self.post_fusion = nn.Sequential(
            nn.LayerNorm(fusion_dim),
            nn.Linear(fusion_dim, fusion_dim * 2),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(fusion_dim * 2, fusion_dim),
            nn.GELU(),
            nn.Dropout(0.1)
        )
        self.price_head = nn.Sequential(
            nn.LayerNorm(fusion_dim),
            nn.Linear(fusion_dim, fusion_dim // 2),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(fusion_dim // 2, 1)
        )
        self.category_head = nn.Sequential(nn.LayerNorm(fusion_dim), nn.Linear(fusion_dim, num_categories))
        self.brand_head = nn.Sequential(nn.LayerNorm(fusion_dim), nn.Linear(fusion_dim, num_brands))
    
    def forward(self, text_emb, image_emb):
        t_proj = self.text_proj(text_emb)
        i_proj = self.image_proj(image_emb)
        gate = self.gate_net(torch.cat([t_proj, i_proj], dim=1))
        fused = gate * t_proj + (1 - gate) * i_proj
        residual = torch.sigmoid(self.residual_weight) * (t_proj + i_proj) / 2
        fused = fused + residual
        fused = self.post_fusion(fused)
        return self.price_head(fused).squeeze(-1), self.category_head(fused), self.brand_head(fused), t_proj, i_proj

# --- DATASETS (NO MULTIPROCESSING) ---
train_dataset = TensorDataset(
    torch.from_numpy(X_text_train).float(), torch.from_numpy(X_image_train).float(),
    torch.from_numpy(y_train_price_log).float(), torch.from_numpy(y_train_cat).long(), torch.from_numpy(y_train_brand).long()
)
val_dataset = TensorDataset(
    torch.from_numpy(X_text_val).float(), torch.from_numpy(X_image_val).float(),
    torch.from_numpy(y_val_price_log).float(), torch.from_numpy(y_val_cat).long(), torch.from_numpy(y_val_brand).long()
)

# *** FIX: num_workers=0 to avoid multiprocessing errors ***
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, pin_memory=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=0, pin_memory=False)

print(f"  -> Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

# --- TRAINING ---
model = OptimizedGatedFusion(X_text_train.shape[1], X_image_train.shape[1], num_categories=num_categories, num_brands=num_brands).to(DEVICE)
swa_model = AveragedModel(model)
optimizer = torch.optim.AdamW(model.parameters(), lr=BASE_LR, weight_decay=0.02)

total_steps = len(train_loader) * EPOCHS
warmup_steps = len(train_loader) * WARMUP_EPOCHS

def get_lr(step):
    if step < warmup_steps:
        return float(step) / float(max(1, warmup_steps))
    progress = float(step - warmup_steps) / float(max(1, total_steps - warmup_steps))
    return max(0.0, 0.5 * (1.0 + np.cos(np.pi * progress)))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, get_lr)
swa_scheduler = SWALR(optimizer, swa_lr=BASE_LR * 0.5)

price_mse = nn.MSELoss()
cat_ce = nn.CrossEntropyLoss(label_smoothing=0.1)
brand_ce = FocalLoss()

print(f"\n✓ Training started\n")
best_val_smape = float('inf')
patience_counter = 0

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    
    for text_emb, image_emb, price_t, cat_t, brand_t in train_loader:
        text_emb, image_emb, price_t, cat_t, brand_t = text_emb.to(DEVICE), image_emb.to(DEVICE), price_t.to(DEVICE), cat_t.to(DEVICE), brand_t.to(DEVICE)
        
        if np.random.rand() < 0.5 and epoch > WARMUP_EPOCHS:
            text_emb, image_emb, price_t, cat_t, brand_t = mixup_data(text_emb, image_emb, price_t, cat_t, brand_t, MIXUP_ALPHA)
        
        optimizer.zero_grad()
        price_p, cat_p, brand_p, t_proj, i_proj = model(text_emb, image_emb)
        
        loss = (0.6 * smape_loss(torch.expm1(price_p), torch.expm1(price_t)) + 0.05 * price_mse(price_p, price_t)) + \
               (0.1 * cat_ce(cat_p, cat_t)) + (0.15 * brand_ce(brand_p, brand_t)) + (0.1 * info_nce_loss(t_proj, i_proj))
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.item()
    
    if epoch >= SWA_START:
        swa_model.update_parameters(model)
        swa_scheduler.step()
    
    eval_model = swa_model if epoch >= SWA_START else model
    eval_model.eval()
    val_preds_log = []
    
    with torch.no_grad():
        for text_emb, image_emb, _, _, _ in val_loader:
            price_p, _, _, _, _ = eval_model(text_emb.to(DEVICE), image_emb.to(DEVICE))
            val_preds_log.append(price_p.cpu().numpy())
    
    val_smape = smape_safe(y_val_price, np.expm1(np.concatenate(val_preds_log)))
    swa_flag = " [SWA]" if epoch >= SWA_START else ""
    
    print(f"Epoch {epoch+1}/{EPOCHS}{swa_flag} | Loss: {epoch_loss/len(train_loader):.4f} | Val SMAPE: {val_smape:.2f}%")
    
    if val_smape < best_val_smape:
        best_val_smape = val_smape
        patience_counter = 0
        torch.save((swa_model if epoch >= SWA_START else model).state_dict(), f'{FINAL_OUTPUT_DIR}/best_model.pth')
        print(f"  -> ✓ Saved! Best: {best_val_smape:.2f}%")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print("  -> Early stop"); break

print(f"\n{'='*80}\nBest Val SMAPE: {best_val_smape:.2f}%\n{'='*80}")

# --- PREDICTION ---
final_model = OptimizedGatedFusion(X_text_train.shape[1], X_image_train.shape[1], num_categories=num_categories, num_brands=num_brands).to(DEVICE)
final_model.load_state_dict(torch.load(f'{FINAL_OUTPUT_DIR}/best_model.pth'))
final_model.eval()

test_loader = DataLoader(TensorDataset(torch.from_numpy(X_text_test).float(), torch.from_numpy(X_image_test).float()), batch_size=512, shuffle=False, num_workers=0)
test_preds = []
with torch.no_grad():
    for text_emb, image_emb in test_loader:
        test_preds.append(final_model(text_emb.to(DEVICE), image_emb.to(DEVICE))[0].cpu().numpy())

test_df = pd.read_csv(TEST_CSV_PATH)
pd.DataFrame({'sample_id': test_df.get('sample_id', test_df.index), 'price': np.expm1(np.concatenate(test_preds))}).to_csv(f'{FINAL_OUTPUT_DIR}/submission.csv', index=False)
print(f"✓ Submission saved: {FINAL_OUTPUT_DIR}/submission.csv")


OPTIMIZED GATED FUSION - CLEAN VERSION
Device: cuda

✓ Loading embeddings...
  -> Train: 60000, Val: 15000
  -> Train batches: 235, Val batches: 30

✓ Training started

Epoch 1/80 | Loss: 1.2162 | Val SMAPE: 65.16%
  -> ✓ Saved! Best: 65.16%
Epoch 2/80 | Loss: 0.9778 | Val SMAPE: 65.59%
Epoch 3/80 | Loss: 0.9468 | Val SMAPE: 60.91%
  -> ✓ Saved! Best: 60.91%
Epoch 4/80 | Loss: 0.9141 | Val SMAPE: 60.27%
  -> ✓ Saved! Best: 60.27%
Epoch 5/80 | Loss: 0.8855 | Val SMAPE: 62.20%
Epoch 6/80 | Loss: 0.8567 | Val SMAPE: 57.70%
  -> ✓ Saved! Best: 57.70%
Epoch 7/80 | Loss: 0.8229 | Val SMAPE: 57.89%
Epoch 8/80 | Loss: 0.8031 | Val SMAPE: 57.16%
  -> ✓ Saved! Best: 57.16%
Epoch 9/80 | Loss: 0.7860 | Val SMAPE: 57.78%
Epoch 10/80 | Loss: 0.7681 | Val SMAPE: 56.37%
  -> ✓ Saved! Best: 56.37%
Epoch 11/80 | Loss: 0.7565 | Val SMAPE: 55.57%
  -> ✓ Saved! Best: 55.57%
Epoch 12/80 | Loss: 0.7417 | Val SMAPE: 55.68%
Epoch 13/80 | Loss: 0.7291 | Val SMAPE: 56.23%
Epoch 14/80 | Loss: 0.7196 | Val SMAPE: 

In [1]:
# ============================================================================
# ULTIMATE ENSEMBLE PIPELINE - ALL 3 STRATEGIES COMBINED
# ============================================================================
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.swa_utils import AveragedModel, SWALR
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from PIL import Image
import warnings
import gc
import json

warnings.filterwarnings('ignore')

print("="*80)
print("ULTIMATE ENSEMBLE PIPELINE")
print("Step 1: Feature Engineering")
print("Step 2: Train 5x GatedFusion Models")
print("Step 3: Train Optimized LightGBM")
print("Step 4: Meta-Ensemble & Final Prediction")
print("="*80)

# --- CONFIGURATION ---
EMBEDDINGS_PATH = '/kaggle/input/aml_new/keras/default/1/output_v4_advanced/embeddings'
V4_OUTPUT_PATH = '/kaggle/input/aml_new/keras/default/1/output_v4_advanced'
IMAGE_DIR = "/kaggle/input/aml-train/AMAZON_ML_TRAIN"
TEST_IMAGE_DIR = "/kaggle/input/amazon-ml-test/AMAZON_ML_TEST"
TRAIN_CSV_PATH = "/kaggle/input/aml-csv/train.csv"
TEST_CSV_PATH = "/kaggle/input/aml-csv/test.csv"
FINAL_OUTPUT_DIR = '/kaggle/working/output_ultimate_ensemble'

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 256
BASE_LR = 3e-4
EPOCHS = 60
PATIENCE = 12
NUM_MODELS = 5  # Ensemble 5 models
RANDOM_SEEDS = [42, 123, 456, 789, 2024]

os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)

# --- HELPER FUNCTIONS ---
def smape_loss(y_pred, y_true, eps=1e-9):
    numerator = torch.abs(y_pred - y_true)
    denominator = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0
    return torch.mean(numerator / (denominator + eps))

def smape_safe(y_true, y_pred, eps=1e-9):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(numerator / (denominator + eps)) * 100

def info_nce_loss(text_embeds, image_embeds, temperature=0.07):
    text_embeds = F.normalize(text_embeds, p=2, dim=-1)
    image_embeds = F.normalize(image_embeds, p=2, dim=-1)
    logits = torch.matmul(text_embeds, image_embeds.T) / temperature
    labels = torch.arange(len(logits), device=logits.device)
    loss_i = F.cross_entropy(logits, labels)
    loss_t = F.cross_entropy(logits.T, labels)
    return (loss_i + loss_t) / 2.0

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
    
    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        return torch.mean(self.alpha * (1 - pt) ** self.gamma * ce_loss)

def mixup_data(text_emb, image_emb, price, cat, brand, alpha=0.2):
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1
    batch_size = text_emb.size(0)
    index = torch.randperm(batch_size).to(DEVICE)
    return (lam * text_emb + (1 - lam) * text_emb[index],
            lam * image_emb + (1 - lam) * image_emb[index],
            lam * price + (1 - lam) * price[index], cat, brand)

# ============================================================================
# STEP 1: FEATURE ENGINEERING
# ============================================================================
print("\n" + "="*80)
print("STEP 1: FEATURE ENGINEERING")
print("="*80)

X_text_train = np.load(f'{EMBEDDINGS_PATH}/text_train.npy')
X_text_val = np.load(f'{EMBEDDINGS_PATH}/text_val.npy')
X_text_test = np.load(f'{EMBEDDINGS_PATH}/text_test.npy')
X_image_train = np.load(f'{EMBEDDINGS_PATH}/image_train.npy')
X_image_val = np.load(f'{EMBEDDINGS_PATH}/image_val.npy')
X_image_test = np.load(f'{EMBEDDINGS_PATH}/image_test.npy')

train_indices = pd.read_csv(f'{V4_OUTPUT_PATH}/train_indices.csv')['train_idx'].values
val_indices = pd.read_csv(f'{V4_OUTPUT_PATH}/val_indices.csv')['val_idx'].values
train_df_full = pd.read_csv(TRAIN_CSV_PATH)
train_df = train_df_full.iloc[train_indices].reset_index(drop=True)
val_df = train_df_full.iloc[val_indices].reset_index(drop=True)
test_df = pd.read_csv(TEST_CSV_PATH)

y_train_price = train_df['price'].values
y_val_price = val_df['price'].values
y_train_price_log = np.log1p(y_train_price)
y_val_price_log = np.log1p(y_val_price)

# Extract handcrafted features
def extract_text_features(text):
    text = str(text).lower()
    return {
        'text_len': len(text),
        'word_count': len(text.split()),
        'has_brand': int(any(brand in text for brand in ['apple', 'samsung', 'sony', 'nike', 'adidas'])),
        'has_price_keywords': int(any(kw in text for kw in ['price', 'cost', 'cheap', 'expensive', 'value'])),
        'has_numbers': int(any(c.isdigit() for c in text)),
        'avg_word_len': np.mean([len(w) for w in text.split()]) if text.split() else 0
    }

def extract_image_features(image_path):
    try:
        img = Image.open(image_path).convert('RGB')
        img_array = np.array(img)
        return {
            'aspect_ratio': img.width / img.height if img.height > 0 else 1.0,
            'brightness': np.mean(img_array),
            'color_variance': np.var(img_array)
        }
    except:
        return {'aspect_ratio': 1.0, 'brightness': 128.0, 'color_variance': 1000.0}

print("  -> Extracting text features...")
train_text_feats = pd.DataFrame([extract_text_features(t) for t in train_df['catalog_content']])
val_text_feats = pd.DataFrame([extract_text_features(t) for t in val_df['catalog_content']])
test_text_feats = pd.DataFrame([extract_text_features(t) for t in test_df['catalog_content']])

print("  -> Extracting image features...")
train_image_feats = pd.DataFrame([extract_image_features(os.path.join(IMAGE_DIR, f"{sid}.jpg")) for sid in train_df['sample_id']])
val_image_feats = pd.DataFrame([extract_image_features(os.path.join(IMAGE_DIR, f"{sid}.jpg")) for sid in val_df['sample_id']])
test_image_feats = pd.DataFrame([extract_image_features(os.path.join(TEST_IMAGE_DIR, f"{sid}.jpg")) for sid in test_df['sample_id']])

# Normalize features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_handcrafted = scaler.fit_transform(pd.concat([train_text_feats, train_image_feats], axis=1))
val_handcrafted = scaler.transform(pd.concat([val_text_feats, val_image_feats], axis=1))
test_handcrafted = scaler.transform(pd.concat([test_text_feats, test_image_feats], axis=1))

print(f"  -> ✓ Handcrafted features: {train_handcrafted.shape[1]} dimensions")

# Category/Brand encoding
def extract_category_simple(text):
    text = str(text).lower()
    if 'electronic' in text: return 'electronics'
    if 'book' in text: return 'books'
    return 'other'

def extract_brand_simple(text):
    return str(text).split()[0] if text else 'unknown'

train_df['category'] = train_df['catalog_content'].apply(extract_category_simple)
val_df['category'] = val_df['catalog_content'].apply(extract_category_simple)
test_df['category'] = test_df['catalog_content'].apply(extract_category_simple)
train_df['brand'] = train_df['catalog_content'].apply(extract_brand_simple)
val_df['brand'] = val_df['catalog_content'].apply(extract_brand_simple)
test_df['brand'] = test_df['catalog_content'].apply(extract_brand_simple)

cat_encoder = LabelEncoder().fit(pd.concat([train_df['category'], val_df['category'], test_df['category']]))
brand_encoder = LabelEncoder().fit(pd.concat([train_df['brand'], val_df['brand'], test_df['brand']]))

y_train_cat = cat_encoder.transform(train_df['category'])
y_val_cat = cat_encoder.transform(val_df['category'])
y_train_brand = brand_encoder.transform(train_df['brand'])
y_val_brand = brand_encoder.transform(val_df['brand'])

num_categories = len(cat_encoder.classes_)
num_brands = len(brand_encoder.classes_)

# Concatenate embeddings + handcrafted features
X_train_combined = np.concatenate([X_text_train, X_image_train, train_handcrafted], axis=1)
X_val_combined = np.concatenate([X_text_val, X_image_val, val_handcrafted], axis=1)
X_test_combined = np.concatenate([X_text_test, X_image_test, test_handcrafted], axis=1)

print(f"  -> ✓ Combined feature dim: {X_train_combined.shape[1]}")

# ============================================================================
# STEP 2: TRAIN 5x GATED FUSION MODELS (ENSEMBLE)
# ============================================================================
print("\n" + "="*80)
print("STEP 2: TRAINING 5x GATED FUSION MODELS")
print("="*80)

class OptimizedGatedFusion(nn.Module):
    def __init__(self, input_dim, fusion_dim=512, num_categories=10, num_brands=101):
        super().__init__()
        self.input_proj = nn.Sequential(
            nn.Linear(input_dim, fusion_dim),
            nn.LayerNorm(fusion_dim),
            nn.GELU(),
            nn.Dropout(0.15)
        )
        self.gate_net = nn.Sequential(
            nn.Linear(fusion_dim, fusion_dim // 2),
            nn.LayerNorm(fusion_dim // 2),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(fusion_dim // 2, 1),
            nn.Sigmoid()
        )
        self.post_fusion = nn.Sequential(
            nn.LayerNorm(fusion_dim),
            nn.Linear(fusion_dim, fusion_dim * 2),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(fusion_dim * 2, fusion_dim),
            nn.GELU(),
            nn.Dropout(0.1)
        )
        self.price_head = nn.Sequential(
            nn.LayerNorm(fusion_dim),
            nn.Linear(fusion_dim, fusion_dim // 2),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(fusion_dim // 2, 1)
        )
        self.category_head = nn.Sequential(nn.LayerNorm(fusion_dim), nn.Linear(fusion_dim, num_categories))
        self.brand_head = nn.Sequential(nn.LayerNorm(fusion_dim), nn.Linear(fusion_dim, num_brands))
    
    def forward(self, x):
        proj = self.input_proj(x)
        gate = self.gate_net(proj)
        fused = gate * proj + (1 - gate) * proj  # Simplified gating
        fused = self.post_fusion(fused)
        return self.price_head(fused).squeeze(-1), self.category_head(fused), self.brand_head(fused), proj

def train_single_model(seed, X_train, X_val, y_train_log, y_val_log, y_train_cat, y_val_cat, y_train_brand, y_val_brand):
    torch.manual_seed(seed)
    np.random.seed(seed)
    
    train_dataset = TensorDataset(
        torch.from_numpy(X_train).float(),
        torch.from_numpy(y_train_log).float(),
        torch.from_numpy(y_train_cat).long(),
        torch.from_numpy(y_train_brand).long()
    )
    val_dataset = TensorDataset(
        torch.from_numpy(X_val).float(),
        torch.from_numpy(y_val_log).float(),
        torch.from_numpy(y_val_cat).long(),
        torch.from_numpy(y_val_brand).long()
    )
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=0)
    
    model = OptimizedGatedFusion(X_train.shape[1], num_categories=num_categories, num_brands=num_brands).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=BASE_LR, weight_decay=0.02)
    
    price_mse = nn.MSELoss()
    cat_ce = nn.CrossEntropyLoss(label_smoothing=0.1)
    brand_ce = FocalLoss()
    
    best_smape = float('inf')
    patience = 0
    
    for epoch in range(EPOCHS):
        model.train()
        for x, price_t, cat_t, brand_t in train_loader:
            x, price_t, cat_t, brand_t = x.to(DEVICE), price_t.to(DEVICE), cat_t.to(DEVICE), brand_t.to(DEVICE)
            
            optimizer.zero_grad()
            price_p, cat_p, brand_p, proj = model(x)
            loss = (0.7 * smape_loss(torch.expm1(price_p), torch.expm1(price_t)) + 0.05 * price_mse(price_p, price_t)) + \
                   (0.1 * cat_ce(cat_p, cat_t)) + (0.15 * brand_ce(brand_p, brand_t))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
        
        model.eval()
        val_preds = []
        with torch.no_grad():
            for x, _, _, _ in val_loader:
                val_preds.append(model(x.to(DEVICE))[0].cpu().numpy())
        
        val_smape = smape_safe(y_val_price, np.expm1(np.concatenate(val_preds)))
        
        if val_smape < best_smape:
            best_smape = val_smape
            patience = 0
            torch.save(model.state_dict(), f'{FINAL_OUTPUT_DIR}/model_seed{seed}.pth')
        else:
            patience += 1
            if patience >= PATIENCE:
                break
    
    print(f"  -> Model {seed}: Best Val SMAPE = {best_smape:.2f}%")
    return best_smape

# Train ensemble
ensemble_scores = []
for i, seed in enumerate(RANDOM_SEEDS):
    print(f"\n  Training Model {i+1}/{NUM_MODELS} (seed={seed})...")
    score = train_single_model(seed, X_train_combined, X_val_combined, y_train_price_log, y_val_price_log,
                                y_train_cat, y_val_cat, y_train_brand, y_val_brand)
    ensemble_scores.append(score)
    gc.collect()
    torch.cuda.empty_cache()

print(f"\n  ✓ Ensemble Average Val SMAPE: {np.mean(ensemble_scores):.2f}%")

# ============================================================================
# STEP 3: TRAIN OPTIMIZED LIGHTGBM
# ============================================================================
print("\n" + "="*80)
print("STEP 3: TRAINING OPTIMIZED LIGHTGBM")
print("="*80)

lgb_params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 63,
    'learning_rate': 0.03,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'max_depth': 10,
    'min_child_samples': 20,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'verbose': -1
}

lgb_train = lgb.Dataset(X_train_combined, y_train_price_log)
lgb_val = lgb.Dataset(X_val_combined, y_val_price_log, reference=lgb_train)

print("  -> Training LightGBM...")
lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_val],
    callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=0)]
)

lgb_val_preds = np.expm1(lgb_model.predict(X_val_combined))
lgb_smape = smape_safe(y_val_price, lgb_val_preds)
print(f"  -> ✓ LightGBM Val SMAPE: {lgb_smape:.2f}%")

# ============================================================================
# STEP 4: META-ENSEMBLE & FINAL PREDICTION
# ============================================================================
print("\n" + "="*80)
print("STEP 4: META-ENSEMBLE & FINAL PREDICTION")
print("="*80)

# Load all neural network models and get validation predictions
nn_val_preds = []
nn_test_preds = []

test_dataset = TensorDataset(torch.from_numpy(X_test_combined).float())
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=0)

for seed in RANDOM_SEEDS:
    model = OptimizedGatedFusion(X_train_combined.shape[1], num_categories=num_categories, num_brands=num_brands).to(DEVICE)
    model.load_state_dict(torch.load(f'{FINAL_OUTPUT_DIR}/model_seed{seed}.pth'))
    model.eval()
    
    # Validation predictions
    val_loader_single = DataLoader(TensorDataset(torch.from_numpy(X_val_combined).float()), batch_size=512, shuffle=False, num_workers=0)
    val_preds = []
    with torch.no_grad():
        for (x,) in val_loader_single:
            val_preds.append(model(x.to(DEVICE))[0].cpu().numpy())
    nn_val_preds.append(np.expm1(np.concatenate(val_preds)))
    
    # Test predictions
    test_preds = []
    with torch.no_grad():
        for (x,) in test_loader:
            test_preds.append(model(x.to(DEVICE))[0].cpu().numpy())
    nn_test_preds.append(np.expm1(np.concatenate(test_preds)))

# Average neural network predictions
nn_val_avg = np.mean(nn_val_preds, axis=0)
nn_test_avg = np.mean(nn_test_preds, axis=0)

# Get LightGBM test predictions
lgb_test_preds = np.expm1(lgb_model.predict(X_test_combined))

# Find optimal blending weight
print("  -> Finding optimal ensemble weights...")
best_blend_smape = float('inf')
best_weight = 0.5

for weight in np.linspace(0.3, 0.7, 21):
    blended_val = weight * nn_val_avg + (1 - weight) * lgb_val_preds
    blend_smape = smape_safe(y_val_price, blended_val)
    if blend_smape < best_blend_smape:
        best_blend_smape = blend_smape
        best_weight = weight

print(f"  -> Optimal weight: NN={best_weight:.2f}, LGB={1-best_weight:.2f}")
print(f"  -> ✓ Final Ensemble Val SMAPE: {best_blend_smape:.2f}%")

# Final test predictions
final_test_preds = best_weight * nn_test_avg + (1 - best_weight) * lgb_test_preds

# Save submission
if 'sample_id' not in test_df.columns:
    test_df['sample_id'] = test_df.index

submission = pd.DataFrame({'sample_id': test_df['sample_id'], 'price': final_test_preds})
submission.to_csv(f'{FINAL_OUTPUT_DIR}/submission_ultimate.csv', index=False)

# Save results summary
results = {
    'individual_nn_models': {f'seed_{seed}': float(score) for seed, score in zip(RANDOM_SEEDS, ensemble_scores)},
    'nn_ensemble_avg': float(smape_safe(y_val_price, nn_val_avg)),
    'lightgbm': float(lgb_smape),
    'final_ensemble': float(best_blend_smape),
    'optimal_weight': {'nn': float(best_weight), 'lgb': float(1-best_weight)}
}

with open(f'{FINAL_OUTPUT_DIR}/results.json', 'w') as f:
    json.dump(results, f, indent=2)

print("\n" + "="*80)
print("ULTIMATE ENSEMBLE COMPLETE")
print("="*80)
print(f"✓ Final Val SMAPE: {best_blend_smape:.2f}%")
print(f"✓ Improvement: {57.9 - best_blend_smape:.2f}% from baseline")
print(f"✓ Submission: {FINAL_OUTPUT_DIR}/submission_ultimate.csv")
print("="*80)


ULTIMATE ENSEMBLE PIPELINE
Step 1: Feature Engineering
Step 2: Train 5x GatedFusion Models
Step 3: Train Optimized LightGBM
Step 4: Meta-Ensemble & Final Prediction

STEP 1: FEATURE ENGINEERING
  -> Extracting text features...
  -> Extracting image features...
  -> ✓ Handcrafted features: 9 dimensions
  -> ✓ Combined feature dim: 1801

STEP 2: TRAINING 5x GATED FUSION MODELS

  Training Model 1/5 (seed=42)...
  -> Model 42: Best Val SMAPE = 55.39%

  Training Model 2/5 (seed=123)...
  -> Model 123: Best Val SMAPE = 55.94%

  Training Model 3/5 (seed=456)...
  -> Model 456: Best Val SMAPE = 55.92%

  Training Model 4/5 (seed=789)...
  -> Model 789: Best Val SMAPE = 55.66%

  Training Model 5/5 (seed=2024)...
  -> Model 2024: Best Val SMAPE = 55.86%

  ✓ Ensemble Average Val SMAPE: 55.75%

STEP 3: TRAINING OPTIMIZED LIGHTGBM
  -> Training LightGBM...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 0.567022

In [2]:
import kagglehub

kagglehub.login()

# Replace with path to directory containing model files.
LOCAL_MODEL_DIR = '/kaggle/working/'

MODEL_SLUG = 'aml_multi_combo' # Replace with model slug.

# Learn more about naming model variations at
# https://www.kaggle.com/docs/models#name-model.
VARIATION_SLUG = 'default' # Replace with variation slug.

kagglehub.model_upload(
  handle = f"kartikgarg74/{MODEL_SLUG}/keras/{VARIATION_SLUG}",
  local_model_dir = LOCAL_MODEL_DIR,
  version_notes = 'Update 2025-10-13')

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Uploading Model https://www.kaggle.com/models/kartikgarg74/aml_multi_combo/keras/default ...
Model 'aml_multi_combo' does not exist or access is forbidden for user 'kartikgarg74'. Creating or handling Model...
Model 'aml_multi_combo' Created.
Starting upload for file /kaggle/working/output_ultimate_ensemble/model_seed789.pth


Uploading: 100%|██████████| 8.99M/8.99M [00:00<00:00, 17.7MB/s]

Upload successful: /kaggle/working/output_ultimate_ensemble/model_seed789.pth (9MB)
Starting upload for file /kaggle/working/output_ultimate_ensemble/model_seed123.pth



Uploading: 100%|██████████| 8.99M/8.99M [00:00<00:00, 19.0MB/s]

Upload successful: /kaggle/working/output_ultimate_ensemble/model_seed123.pth (9MB)
Starting upload for file /kaggle/working/output_ultimate_ensemble/model_seed42.pth



Uploading: 100%|██████████| 8.99M/8.99M [00:00<00:00, 18.2MB/s]

Upload successful: /kaggle/working/output_ultimate_ensemble/model_seed42.pth (9MB)
Starting upload for file /kaggle/working/output_ultimate_ensemble/submission_ultimate.csv



Uploading: 100%|██████████| 1.88M/1.88M [00:00<00:00, 4.41MB/s]

Upload successful: /kaggle/working/output_ultimate_ensemble/submission_ultimate.csv (2MB)
Starting upload for file /kaggle/working/output_ultimate_ensemble/model_seed2024.pth



Uploading: 100%|██████████| 8.99M/8.99M [00:00<00:00, 18.2MB/s]

Upload successful: /kaggle/working/output_ultimate_ensemble/model_seed2024.pth (9MB)
Starting upload for file /kaggle/working/output_ultimate_ensemble/model_seed456.pth



Uploading: 100%|██████████| 8.99M/8.99M [00:00<00:00, 18.4MB/s]

Upload successful: /kaggle/working/output_ultimate_ensemble/model_seed456.pth (9MB)
Starting upload for file /kaggle/working/output_ultimate_ensemble/results.json



Uploading: 100%|██████████| 395/395 [00:00<00:00, 968B/s]

Upload successful: /kaggle/working/output_ultimate_ensemble/results.json (395B)





Your model instance has been created.
Files are being processed...
See at: https://www.kaggle.com/models/kartikgarg74/aml_multi_combo/keras/default


## new embeddings 

In [3]:
# ============================================================================
# ULTRA-OPTIMIZED: GPU TEXT + CPU IMAGES PARALLEL EXTRACTION
# ============================================================================
# STEP 0: RESTART KERNEL FIRST!
# In Kaggle: Click "Restart & Clear Output" button (top right)
# ============================================================================

import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
from PIL import Image
from tqdm.auto import tqdm
import gc
import warnings
import threading
from queue import Queue
warnings.filterwarnings('ignore')

print("="*80)
print("ULTRA-FAST PARALLEL EXTRACTION: GPU TEXT + CPU IMAGES")
print("="*80)

# --- CONFIG ---
TRAIN_CSV_PATH = "/kaggle/input/aml-csv/train.csv"
TEST_CSV_PATH = "/kaggle/input/aml-csv/test.csv"
TRAIN_IMAGE_DIR = "/kaggle/input/aml-train/AMAZON_ML_TRAIN"
TEST_IMAGE_DIR = "/kaggle/input/amazon-ml-test/AMAZON_ML_TEST"
OUTPUT_DIR = "/kaggle/working/"

TEXT_MODEL_PATH = '/kaggle/input/qwen-3-embedding/transformers/0.6b/1'
IMAGE_MODEL_PATH = '/kaggle/input/google-siglip-so400m-patch14-384/transformers/default/1'

DEVICE_GPU = torch.device("cuda")
DEVICE_CPU = torch.device("cpu")
BATCH_SIZE_TEXT = 24
BATCH_SIZE_IMAGE = 12

# --- CLEAR GPU ---
torch.cuda.empty_cache()
gc.collect()

print(f"GPU Free: {torch.cuda.mem_get_info()[0]/1e9:.2f}GB / {torch.cuda.mem_get_info()[1]/1e9:.2f}GB")

# --- LOAD DATA ---
print("\n✓ Loading data...")
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)
print(f"  Train: {len(train_df)}, Test: {len(test_df)}")

# ============================================================================
# PHASE 1: TEXT ON GPU (FP16)
# ============================================================================
print("\n" + "="*80)
print("PHASE 1: TEXT EMBEDDINGS ON GPU (FP16)")
print("="*80)

print("\n✓ Loading text model (FP16)...")
text_model = AutoModel.from_pretrained(
    TEXT_MODEL_PATH, 
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(DEVICE_GPU)
text_tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_PATH, trust_remote_code=True)
text_model.eval()

print(f"  -> GPU allocated: {torch.cuda.memory_allocated()/1e9:.2f}GB")

def extract_text_fp16(texts, batch_size):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Text GPU"):
        batch = texts[i:i+batch_size]
        inputs = text_tokenizer(
            batch, padding=True, truncation=True, 
            max_length=256, return_tensors='pt'
        ).to(DEVICE_GPU)
        
        with torch.no_grad(), torch.cuda.amp.autocast():
            outputs = text_model(**inputs)
            emb = outputs.last_hidden_state[:, 0, :] if hasattr(outputs, 'last_hidden_state') else outputs.pooler_output
            emb = F.normalize(emb.float(), p=2, dim=1)
            embeddings.append(emb.cpu().numpy())
        
        del inputs, outputs, emb
        if i % 200 == 0:
            torch.cuda.empty_cache()
    
    return np.vstack(embeddings)

# Extract train text
print("\n1. Train text...")
train_texts = train_df['catalog_content'].fillna('').tolist()
train_text_emb = extract_text_fp16(train_texts, BATCH_SIZE_TEXT)
np.save(f'{OUTPUT_DIR}/train_text_normalized.npy', train_text_emb)
print(f"   ✓ Saved: {train_text_emb.shape}, L2={np.linalg.norm(train_text_emb[0]):.4f}")
del train_text_emb, train_texts

# Extract test text
print("\n2. Test text...")
test_texts = test_df['catalog_content'].fillna('').tolist()
test_text_emb = extract_text_fp16(test_texts, BATCH_SIZE_TEXT)
np.save(f'{OUTPUT_DIR}/test_text_normalized.npy', test_text_emb)
print(f"   ✓ Saved: {test_text_emb.shape}, L2={np.linalg.norm(test_text_emb[0]):.4f}")
del test_text_emb, test_texts

# CRITICAL: Unload text model
del text_model, text_tokenizer
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()

print(f"\n✓ Text done. GPU freed: {torch.cuda.mem_get_info()[0]/1e9:.2f}GB")

# ============================================================================
# PHASE 2: IMAGES ON CPU (PARALLEL LOADING)
# ============================================================================
print("\n" + "="*80)
print("PHASE 2: IMAGE EMBEDDINGS ON CPU (MULTI-THREADED)")
print("="*80)

print("\n✓ Loading image model on CPU...")
image_model = AutoModel.from_pretrained(
    IMAGE_MODEL_PATH,
    torch_dtype=torch.float32  # FP32 on CPU is fine
).to(DEVICE_CPU)
image_processor = AutoImageProcessor.from_pretrained(IMAGE_MODEL_PATH)
image_model.eval()

def load_images_parallel(paths, num_threads=4):
    """Parallel image loading"""
    images = [None] * len(paths)
    queue = Queue()
    
    for idx, path in enumerate(paths):
        queue.put((idx, path))
    
    def worker():
        while not queue.empty():
            try:
                idx, path = queue.get(timeout=1)
                try:
                    images[idx] = Image.open(path).convert('RGB')
                except:
                    images[idx] = Image.new('RGB', (384, 384))
                queue.task_done()
            except:
                break
    
    threads = [threading.Thread(target=worker) for _ in range(num_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    
    return images

def extract_image_cpu(paths, batch_size):
    embeddings = []
    
    for i in tqdm(range(0, len(paths), batch_size), desc="Image CPU"):
        batch_paths = paths[i:i+batch_size]
        
        # Parallel image loading (4 threads)
        images = load_images_parallel(batch_paths, num_threads=4)
        
        # Process batch
        inputs = image_processor(images, return_tensors='pt').to(DEVICE_CPU)
        
        with torch.no_grad():
            outputs = image_model(**inputs)
            emb = outputs.image_embeds if hasattr(outputs, 'image_embeds') else outputs.pooler_output
            emb = F.normalize(emb, p=2, dim=1)
            embeddings.append(emb.numpy())
        
        del inputs, outputs, emb, images
        if i % 200 == 0:
            gc.collect()
    
    return np.vstack(embeddings)

# Extract train images
print("\n1. Train images...")
train_paths = [os.path.join(TRAIN_IMAGE_DIR, f"{sid}.jpg") for sid in train_df['sample_id']]
train_image_emb = extract_image_cpu(train_paths, BATCH_SIZE_IMAGE)
np.save(f'{OUTPUT_DIR}/train_image_normalized.npy', train_image_emb)
print(f"   ✓ Saved: {train_image_emb.shape}, L2={np.linalg.norm(train_image_emb[0]):.4f}")
del train_image_emb, train_paths

# Extract test images
print("\n2. Test images...")
test_paths = [os.path.join(TEST_IMAGE_DIR, f"{sid}.jpg") for sid in test_df['sample_id']]
test_image_emb = extract_image_cpu(test_paths, BATCH_SIZE_IMAGE)
np.save(f'{OUTPUT_DIR}/test_image_normalized.npy', test_image_emb)
print(f"   ✓ Saved: {test_image_emb.shape}, L2={np.linalg.norm(test_image_emb[0]):.4f}")
del test_image_emb, test_paths

del image_model, image_processor
gc.collect()

# ============================================================================
# PHASE 3: VERIFY & UPLOAD
# ============================================================================
print("\n" + "="*80)
print("PHASE 3: VERIFICATION & UPLOAD")
print("="*80)

# Verify files
print("\n✓ Verifying saved files...")
files = {
    'train_text_normalized.npy': None,
    'train_image_normalized.npy': None,
    'test_text_normalized.npy': None,
    'test_image_normalized.npy': None
}

for fname in files:
    path = f'{OUTPUT_DIR}/{fname}'
    if os.path.exists(path):
        arr = np.load(path)
        size_mb = os.path.getsize(path) / (1024**2)
        files[fname] = arr.shape
        print(f"  -> {fname}: {arr.shape}, {size_mb:.2f}MB, L2 norm={np.linalg.norm(arr[0]):.4f}")
    else:
        print(f"  -> ERROR: {fname} not found!")

# Upload to Kaggle
print("\n✓ Uploading to Kaggle...")
try:
    import kagglehub
    kagglehub.login()
    
    print("  -> Starting upload...")
    kagglehub.model_upload(
        handle="kartikgarg74/aml-embed-siglip-qwen3-normalized/keras/default",
        local_model_dir='/kaggle/working/',
        version_notes='SigLIP-So400m + Qwen3-0.6B | L2 Normalized | GPU+CPU Optimized | 2025-10-13'
    )
    
    print("\n" + "="*80)
    print("✓ UPLOAD SUCCESSFUL!")
    print("="*80)
    print("\nDataset Path for Part 2:")
    print("  /kaggle/input/aml-embed-siglip-qwen3-normalized/keras/default/1/")
    print("\nFiles uploaded:")
    for fname, shape in files.items():
        print(f"  - {fname}: {shape}")
    
except Exception as e:
    print(f"\n  -> Upload error: {e}")
    print("  -> Files saved locally in /kaggle/working/")
    print("  -> You can manually upload them as a Kaggle dataset")

print("\n" + "="*80)
print("PART 1 COMPLETE - ALL EMBEDDINGS EXTRACTED!")
print("="*80)
print("\nTime saved with CPU parallel loading: ~30-40%")
print("Ready for Part 2: 5-Fold Training")
print("\nNext steps:")
print("1. Note the dataset path above")
print("2. Create new notebook")
print("3. Run Part 2 training code")
print("="*80)


ULTRA-FAST PARALLEL EXTRACTION: GPU TEXT + CPU IMAGES
GPU Free: 0.01GB / 15.83GB

✓ Loading data...
  Train: 75000, Test: 75000

PHASE 1: TEXT EMBEDDINGS ON GPU (FP16)

✓ Loading text model (FP16)...
  -> GPU allocated: 6.65GB

1. Train text...


Text GPU:   0%|          | 0/3125 [00:00<?, ?it/s]

   ✓ Saved: (75000, 1024), L2=1.0000

2. Test text...


Text GPU:   0%|          | 0/3125 [00:00<?, ?it/s]

   ✓ Saved: (75000, 1024), L2=1.0000

✓ Text done. GPU freed: 9.39GB

PHASE 2: IMAGE EMBEDDINGS ON CPU (MULTI-THREADED)

✓ Loading image model on CPU...

1. Train images...


Image CPU:   0%|          | 0/6250 [00:00<?, ?it/s]

ValueError: You have to specify input_ids

In [4]:
# ============================================================================
# IMAGE EMBEDDINGS ONLY - FIXED FOR SIGLIP VISION ENCODER
# ============================================================================
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoImageProcessor
from PIL import Image
from tqdm.auto import tqdm
import threading
from queue import Queue
import gc
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("IMAGE EMBEDDINGS EXTRACTION - SIGLIP VISION")
print("="*80)

# --- CONFIG ---
TRAIN_CSV_PATH = "/kaggle/input/aml-csv/train.csv"
TEST_CSV_PATH = "/kaggle/input/aml-csv/test.csv"
TRAIN_IMAGE_DIR = "/kaggle/input/aml-train/AMAZON_ML_TRAIN"
TEST_IMAGE_DIR = "/kaggle/input/amazon-ml-test/AMAZON_ML_TEST"
OUTPUT_DIR = "/kaggle/working/"

IMAGE_MODEL_PATH = '/kaggle/input/google-siglip-so400m-patch14-384/transformers/default/1'

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 32  # GPU has 9GB free, we can use it now!

print(f"Device: {DEVICE}")
if torch.cuda.is_available():
    print(f"GPU Free: {torch.cuda.mem_get_info()[0]/1e9:.2f}GB")

# --- LOAD DATA ---
print("\n✓ Loading data...")
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)
print(f"  Train: {len(train_df)}, Test: {len(test_df)}")

# ============================================================================
# LOAD SIGLIP VISION MODEL (VISION ENCODER ONLY!)
# ============================================================================
print("\n✓ Loading SigLIP VISION encoder...")

# Load full model first
full_model = AutoModel.from_pretrained(
    IMAGE_MODEL_PATH,
    torch_dtype=torch.float16  # FP16 for speed
).to(DEVICE)

# Extract ONLY the vision encoder (not the full vision-language model!)
vision_model = full_model.vision_model
vision_model.eval()

del full_model  # Free memory
gc.collect()
torch.cuda.empty_cache()

image_processor = AutoImageProcessor.from_pretrained(IMAGE_MODEL_PATH)

print(f"  -> Vision encoder loaded: {torch.cuda.memory_allocated()/1e9:.2f}GB")

# ============================================================================
# PARALLEL IMAGE LOADING
# ============================================================================
def load_images_parallel(paths, num_threads=4):
    """Multi-threaded image loading"""
    images = [None] * len(paths)
    queue = Queue()
    
    for idx, path in enumerate(paths):
        queue.put((idx, path))
    
    def worker():
        while not queue.empty():
            try:
                idx, path = queue.get(timeout=1)
                try:
                    images[idx] = Image.open(path).convert('RGB')
                except:
                    # Fallback to blank image if file corrupted
                    images[idx] = Image.new('RGB', (384, 384))
                queue.task_done()
            except:
                break
    
    threads = [threading.Thread(target=worker) for _ in range(num_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    
    return images

# ============================================================================
# IMAGE EMBEDDING EXTRACTION
# ============================================================================
def extract_image_embeddings(paths, batch_size):
    """Extract L2-normalized image embeddings"""
    embeddings = []
    
    for i in tqdm(range(0, len(paths), batch_size), desc="Image embeddings"):
        batch_paths = paths[i:i+batch_size]
        
        # Parallel load images
        images = load_images_parallel(batch_paths, num_threads=4)
        
        # Process batch
        inputs = image_processor(images, return_tensors='pt')
        pixel_values = inputs['pixel_values'].to(DEVICE)
        
        with torch.no_grad(), torch.cuda.amp.autocast():
            # Pass only pixel_values to VISION encoder
            vision_outputs = vision_model(pixel_values=pixel_values)
            
            # Get pooled image features
            image_embeds = vision_outputs.pooler_output  # [batch_size, 1152]
            
            # L2 normalize
            image_embeds_normalized = F.normalize(image_embeds.float(), p=2, dim=1)
            embeddings.append(image_embeds_normalized.cpu().numpy())
        
        del inputs, pixel_values, vision_outputs, image_embeds, images
        
        # Clear cache periodically
        if i % 200 == 0:
            torch.cuda.empty_cache()
    
    return np.vstack(embeddings)

# ============================================================================
# EXTRACT TRAIN IMAGES
# ============================================================================
print("\n" + "="*80)
print("EXTRACTING TRAIN IMAGES")
print("="*80)

train_paths = [os.path.join(TRAIN_IMAGE_DIR, f"{sid}.jpg") for sid in train_df['sample_id']]
train_image_emb = extract_image_embeddings(train_paths, BATCH_SIZE)

# Save
save_path_train = f'{OUTPUT_DIR}/train_image_normalized.npy'
np.save(save_path_train, train_image_emb)

print(f"\n✓ Train images saved:")
print(f"  -> Path: {save_path_train}")
print(f"  -> Shape: {train_image_emb.shape}")
print(f"  -> L2 norm: {np.linalg.norm(train_image_emb[0]):.4f}")
print(f"  -> Size: {os.path.getsize(save_path_train)/(1024**2):.2f} MB")

del train_image_emb, train_paths
gc.collect()

# ============================================================================
# EXTRACT TEST IMAGES
# ============================================================================
print("\n" + "="*80)
print("EXTRACTING TEST IMAGES")
print("="*80)

test_paths = [os.path.join(TEST_IMAGE_DIR, f"{sid}.jpg") for sid in test_df['sample_id']]
test_image_emb = extract_image_embeddings(test_paths, BATCH_SIZE)

# Save
save_path_test = f'{OUTPUT_DIR}/test_image_normalized.npy'
np.save(save_path_test, test_image_emb)

print(f"\n✓ Test images saved:")
print(f"  -> Path: {save_path_test}")
print(f"  -> Shape: {test_image_emb.shape}")
print(f"  -> L2 norm: {np.linalg.norm(test_image_emb[0]):.4f}")
print(f"  -> Size: {os.path.getsize(save_path_test)/(1024**2):.2f} MB")

del test_image_emb, test_paths, vision_model, image_processor
gc.collect()
torch.cuda.empty_cache()

# ============================================================================
# VERIFICATION
# ============================================================================
print("\n" + "="*80)
print("VERIFICATION - ALL EMBEDDINGS")
print("="*80)

files_to_check = [
    'train_text_normalized.npy',
    'train_image_normalized.npy',
    'test_text_normalized.npy',
    'test_image_normalized.npy'
]

all_good = True
for fname in files_to_check:
    path = f'{OUTPUT_DIR}/{fname}'
    if os.path.exists(path):
        arr = np.load(path)
        size_mb = os.path.getsize(path) / (1024**2)
        l2_norm = np.linalg.norm(arr[0])
        print(f"✓ {fname}")
        print(f"  Shape: {arr.shape}, Size: {size_mb:.2f}MB, L2: {l2_norm:.4f}")
    else:
        print(f"✗ {fname} - NOT FOUND!")
        all_good = False

if all_good:
    print("\n" + "="*80)
    print("✓ ALL EMBEDDINGS READY FOR UPLOAD!")
    print("="*80)
    print("\nNow run the upload code to save to Kaggle dataset")
else:
    print("\n✗ Some files missing. Check errors above.")

# ============================================================================
# UPLOAD TO KAGGLE
# ============================================================================
print("\n" + "="*80)
print("UPLOADING TO KAGGLE")
print("="*80)

try:
    import kagglehub
    kagglehub.login()
    
    print("\n  -> Starting upload...")
    kagglehub.model_upload(
        handle="kartikgarg74/aml-embed-siglip-qwen3-normalized/keras/default",
        local_model_dir='/kaggle/working/',
        version_notes='SigLIP-So400m + Qwen3-0.6B | L2 Normalized | Complete | 2025-10-13'
    )
    
    print("\n" + "="*80)
    print("✓ UPLOAD COMPLETE!")
    print("="*80)
    print("\nDataset path for Part 2:")
    print("  /kaggle/input/aml-embed-siglip-qwen3-normalized/keras/default/1/")
    print("\nReady to proceed with 5-Fold training!")
    
except Exception as e:
    print(f"\n  Upload error: {e}")
    print("  Files are saved in /kaggle/working/")
    print("  You can manually create a Kaggle dataset and upload them")

print("\n" + "="*80)
print("COMPLETE! ALL EMBEDDINGS EXTRACTED & SAVED")
print("="*80)


IMAGE EMBEDDINGS EXTRACTION - SIGLIP VISION
Device: cuda
GPU Free: 9.39GB

✓ Loading data...
  Train: 75000, Test: 75000

✓ Loading SigLIP VISION encoder...
  -> Vision encoder loaded: 0.88GB

EXTRACTING TRAIN IMAGES


Image embeddings:   0%|          | 0/2344 [00:00<?, ?it/s]


✓ Train images saved:
  -> Path: /kaggle/working//train_image_normalized.npy
  -> Shape: (75000, 1152)
  -> L2 norm: 1.0000
  -> Size: 329.59 MB

EXTRACTING TEST IMAGES


Image embeddings:   0%|          | 0/2344 [00:00<?, ?it/s]


✓ Test images saved:
  -> Path: /kaggle/working//test_image_normalized.npy
  -> Shape: (75000, 1152)
  -> L2 norm: 1.0000
  -> Size: 329.59 MB

VERIFICATION - ALL EMBEDDINGS
✓ train_text_normalized.npy
  Shape: (75000, 1024), Size: 292.97MB, L2: 1.0000
✓ train_image_normalized.npy
  Shape: (75000, 1152), Size: 329.59MB, L2: 1.0000
✓ test_text_normalized.npy
  Shape: (75000, 1024), Size: 292.97MB, L2: 1.0000
✓ test_image_normalized.npy
  Shape: (75000, 1152), Size: 329.59MB, L2: 1.0000

✓ ALL EMBEDDINGS READY FOR UPLOAD!

Now run the upload code to save to Kaggle dataset

UPLOADING TO KAGGLE


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…


  -> Starting upload...
Uploading Model https://www.kaggle.com/models/kartikgarg74/aml-embed-siglip-qwen3-normalized/keras/default ...
Model 'aml-embed-siglip-qwen3-normalized' does not exist or access is forbidden for user 'kartikgarg74'. Creating or handling Model...
Model 'aml-embed-siglip-qwen3-normalized' Created.
Starting upload for file /kaggle/working/test_image_normalized.npy


Uploading: 100%|██████████| 346M/346M [00:02<00:00, 119MB/s]  

Upload successful: /kaggle/working/test_image_normalized.npy (330MB)
Starting upload for file /kaggle/working/test_text_normalized.npy



Uploading: 100%|██████████| 307M/307M [00:02<00:00, 125MB/s]  

Upload successful: /kaggle/working/test_text_normalized.npy (293MB)
Starting upload for file /kaggle/working/train_text_normalized.npy



Uploading: 100%|██████████| 307M/307M [00:02<00:00, 120MB/s]  

Upload successful: /kaggle/working/train_text_normalized.npy (293MB)
Starting upload for file /kaggle/working/train_image_normalized.npy



Uploading: 100%|██████████| 346M/346M [00:02<00:00, 129MB/s] 

Upload successful: /kaggle/working/train_image_normalized.npy (330MB)





Your model instance has been created.
Files are being processed...
See at: https://www.kaggle.com/models/kartikgarg74/aml-embed-siglip-qwen3-normalized/keras/default

✓ UPLOAD COMPLETE!

Dataset path for Part 2:
  /kaggle/input/aml-embed-siglip-qwen3-normalized/keras/default/1/

Ready to proceed with 5-Fold training!

COMPLETE! ALL EMBEDDINGS EXTRACTED & SAVED


In [6]:
# ============================================================================
# PART 2: 5-FOLD CV TRAINING WITH NORMALIZED EMBEDDINGS
# ============================================================================
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold
from tqdm.auto import tqdm
import gc
import json
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("PART 2: 5-FOLD CV TRAINING")
print("="*80)

# --- CONFIGURATION ---
EMBEDDINGS_PATH = '/kaggle/working/'  # Update after upload
TRAIN_CSV_PATH = "/kaggle/input/aml-csv/train.csv"
TEST_CSV_PATH = "/kaggle/input/aml-csv/test.csv"
OUTPUT_DIR = '/kaggle/working/output_final'

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 256
LEARNING_RATE = 5e-4
EPOCHS_PER_FOLD = 20
NUM_FOLDS = 5

os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- LOSS FUNCTIONS ---
def smape_loss(y_pred, y_true, eps=1e-9):
    numerator = torch.abs(y_pred - y_true)
    denominator = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0
    return torch.mean(numerator / (denominator + eps))

def smape_metric(y_true, y_pred, eps=1e-9):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(numerator / (denominator + eps)) * 100

# --- LOAD EMBEDDINGS ---
print("\n✓ Loading normalized embeddings...")
train_text = np.load(f'{EMBEDDINGS_PATH}/train_text_normalized.npy')
train_image = np.load(f'{EMBEDDINGS_PATH}/train_image_normalized.npy')
test_text = np.load(f'{EMBEDDINGS_PATH}/test_text_normalized.npy')
test_image = np.load(f'{EMBEDDINGS_PATH}/test_image_normalized.npy')

# Concatenate
train_fused = np.concatenate([train_text, train_image], axis=1)
test_fused = np.concatenate([test_text, test_image], axis=1)

print(f"  -> Train fused: {train_fused.shape}")
print(f"  -> Test fused: {test_fused.shape}")

# Load labels
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)
y_train = np.log1p(train_df['price'].values)

# --- MODEL ARCHITECTURE ---
class SimpleFusionMLP(nn.Module):
    """Simple MLP for concatenated embeddings - following friend's approach"""
    def __init__(self, input_dim, hidden_dim=1024):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.LayerNorm(input_dim),
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(hidden_dim // 2, hidden_dim // 4),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim // 4, 1)
        )
    
    def forward(self, x):
        return self.mlp(x).squeeze(-1)

# --- 5-FOLD TRAINING ---
print("\n" + "="*80)
print("STARTING 5-FOLD CROSS-VALIDATION")
print("="*80)

kfold = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
fold_scores = []
fold_models = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(train_fused)):
    print(f"\n{'='*80}")
    print(f"FOLD {fold+1}/{NUM_FOLDS}")
    print(f"{'='*80}")
    
    # Split data
    X_train_fold = train_fused[train_idx]
    X_val_fold = train_fused[val_idx]
    y_train_fold = y_train[train_idx]
    y_val_fold = y_train[val_idx]
    y_val_price = np.expm1(y_val_fold)
    
    # Create datasets
    train_dataset = TensorDataset(
        torch.from_numpy(X_train_fold).float(),
        torch.from_numpy(y_train_fold).float()
    )
    val_dataset = TensorDataset(
        torch.from_numpy(X_val_fold).float(),
        torch.from_numpy(y_val_fold).float()
    )
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=0)
    
    # Initialize model
    model = SimpleFusionMLP(input_dim=train_fused.shape[1]).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS_PER_FOLD)
    
    best_smape = float('inf')
    
    # Training loop
    for epoch in range(EPOCHS_PER_FOLD):
        model.train()
        train_loss = 0
        
        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            
            optimizer.zero_grad()
            pred = model(x)
            loss = 0.8 * smape_loss(torch.expm1(pred), torch.expm1(y)) + 0.2 * F.mse_loss(pred, y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            
            train_loss += loss.item()
        
        scheduler.step()
        
        # Validation
        model.eval()
        val_preds = []
        with torch.no_grad():
            for x, _ in val_loader:
                val_preds.append(model(x.to(DEVICE)).cpu().numpy())
        
        val_preds = np.concatenate(val_preds)
        val_smape = smape_metric(y_val_price, np.expm1(val_preds))
        
        if val_smape < best_smape:
            best_smape = val_smape
            torch.save(model.state_dict(), f'{OUTPUT_DIR}/model_fold{fold}.pth')
        
        if epoch % 5 == 0:
            print(f"  Epoch {epoch+1}/{EPOCHS_PER_FOLD} | Loss: {train_loss/len(train_loader):.4f} | Val SMAPE: {val_smape:.2f}%")
    
    print(f"\n  ✓ Fold {fold+1} Best SMAPE: {best_smape:.2f}%")
    fold_scores.append(best_smape)
    
    # Cleanup
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

print(f"\n{'='*80}")
print(f"5-FOLD CV COMPLETE")
print(f"{'='*80}")
print(f"Fold Scores: {fold_scores}")
print(f"Average: {np.mean(fold_scores):.2f}% ± {np.std(fold_scores):.2f}%")

# --- ENSEMBLE PREDICTION ---
print(f"\n{'='*80}")
print("GENERATING ENSEMBLE PREDICTIONS")
print(f"{'='*80}")

test_dataset = TensorDataset(torch.from_numpy(test_fused).float())
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=0)

all_fold_preds = []

for fold in range(NUM_FOLDS):
    model = SimpleFusionMLP(input_dim=train_fused.shape[1]).to(DEVICE)
    model.load_state_dict(torch.load(f'{OUTPUT_DIR}/model_fold{fold}.pth'))
    model.eval()
    
    fold_preds = []
    with torch.no_grad():
        for (x,) in test_loader:
            fold_preds.append(model(x.to(DEVICE)).cpu().numpy())
    
    all_fold_preds.append(np.concatenate(fold_preds))
    print(f"  -> Fold {fold+1} predictions generated")

# Average predictions
final_preds_log = np.mean(all_fold_preds, axis=0)
final_preds = np.expm1(final_preds_log)

# Create submission
if 'sample_id' not in test_df.columns:
    test_df['sample_id'] = test_df.index

submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_preds
})

submission.to_csv(f'{OUTPUT_DIR}/submission_final.csv', index=False)

# Save results
results = {
    'fold_scores': [float(s) for s in fold_scores],
    'mean_cv_score': float(np.mean(fold_scores)),
    'std_cv_score': float(np.std(fold_scores)),
    'models': 'SigLIP-So400m + Qwen3-0.6B',
    'fusion': 'Simple Concat + MLP',
    'normalization': 'L2',
    'folds': NUM_FOLDS,
    'epochs_per_fold': EPOCHS_PER_FOLD
}

with open(f'{OUTPUT_DIR}/results.json', 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n{'='*80}")
print("FINAL RESULTS")
print(f"{'='*80}")
print(f"✓ CV Score: {np.mean(fold_scores):.2f}% ± {np.std(fold_scores):.2f}%")
print(f"✓ Test predictions: {len(final_preds):,}")
print(f"✓ Price range: [{final_preds.min():.2f}, {final_preds.max():.2f}]")
print(f"✓ Submission: {OUTPUT_DIR}/submission_final.csv")
print(f"{'='*80}")


PART 2: 5-FOLD CV TRAINING

✓ Loading normalized embeddings...
  -> Train fused: (75000, 2176)
  -> Test fused: (75000, 2176)

STARTING 5-FOLD CROSS-VALIDATION

FOLD 1/5
  Epoch 1/20 | Loss: 0.6909 | Val SMAPE: 57.62%
  Epoch 6/20 | Loss: 0.5046 | Val SMAPE: 53.61%
  Epoch 11/20 | Loss: 0.4340 | Val SMAPE: 52.37%
  Epoch 16/20 | Loss: 0.3762 | Val SMAPE: 51.62%

  ✓ Fold 1 Best SMAPE: 51.29%

FOLD 2/5
  Epoch 1/20 | Loss: 0.6779 | Val SMAPE: 56.67%
  Epoch 6/20 | Loss: 0.5091 | Val SMAPE: 52.85%
  Epoch 11/20 | Loss: 0.4398 | Val SMAPE: 51.03%
  Epoch 16/20 | Loss: 0.3807 | Val SMAPE: 50.38%

  ✓ Fold 2 Best SMAPE: 50.36%

FOLD 3/5
  Epoch 1/20 | Loss: 0.6865 | Val SMAPE: 56.89%
  Epoch 6/20 | Loss: 0.5051 | Val SMAPE: 53.10%
  Epoch 11/20 | Loss: 0.4354 | Val SMAPE: 51.61%
  Epoch 16/20 | Loss: 0.3785 | Val SMAPE: 51.32%

  ✓ Fold 3 Best SMAPE: 51.20%

FOLD 4/5
  Epoch 1/20 | Loss: 0.6884 | Val SMAPE: 55.59%
  Epoch 6/20 | Loss: 0.5098 | Val SMAPE: 51.35%
  Epoch 11/20 | Loss: 0.4405 

In [7]:
import kagglehub

kagglehub.login()

# Replace with path to directory containing model files.
LOCAL_MODEL_DIR = '/kaggle/working/'

MODEL_SLUG = 'aml_hirercial_best' # Replace with model slug.

# Learn more about naming model variations at
# https://www.kaggle.com/docs/models#name-model.
VARIATION_SLUG = 'default' # Replace with variation slug.

kagglehub.model_upload(
  handle = f"kartikgarg74/{MODEL_SLUG}/keras/{VARIATION_SLUG}",
  local_model_dir = LOCAL_MODEL_DIR,
  version_notes = 'Update 2025-10-12')

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Uploading Model https://www.kaggle.com/models/kartikgarg74/aml_hirercial_best/keras/default ...
Model 'aml_hirercial_best' does not exist or access is forbidden for user 'kartikgarg74'. Creating or handling Model...
Model 'aml_hirercial_best' Created.
Starting upload for file /kaggle/working/test_image_normalized.npy


Uploading: 100%|██████████| 346M/346M [00:02<00:00, 133MB/s] 

Upload successful: /kaggle/working/test_image_normalized.npy (330MB)
Starting upload for file /kaggle/working/test_text_normalized.npy



Uploading: 100%|██████████| 307M/307M [00:02<00:00, 117MB/s] 

Upload successful: /kaggle/working/test_text_normalized.npy (293MB)
Starting upload for file /kaggle/working/train_text_normalized.npy



Uploading: 100%|██████████| 307M/307M [00:03<00:00, 99.2MB/s] 

Upload successful: /kaggle/working/train_text_normalized.npy (293MB)
Starting upload for file /kaggle/working/train_image_normalized.npy



Uploading: 100%|██████████| 346M/346M [00:02<00:00, 128MB/s]  

Upload successful: /kaggle/working/train_image_normalized.npy (330MB)
Starting upload for file /kaggle/working/output_final/model_fold3.pth



Uploading: 100%|██████████| 11.6M/11.6M [00:00<00:00, 22.1MB/s]

Upload successful: /kaggle/working/output_final/model_fold3.pth (11MB)
Starting upload for file /kaggle/working/output_final/model_fold2.pth



Uploading: 100%|██████████| 11.6M/11.6M [00:00<00:00, 22.1MB/s]

Upload successful: /kaggle/working/output_final/model_fold2.pth (11MB)
Starting upload for file /kaggle/working/output_final/results.json



Uploading: 100%|██████████| 356/356 [00:00<00:00, 847B/s]

Upload successful: /kaggle/working/output_final/results.json (356B)
Starting upload for file /kaggle/working/output_final/model_fold1.pth



Uploading: 100%|██████████| 11.6M/11.6M [00:00<00:00, 22.8MB/s]

Upload successful: /kaggle/working/output_final/model_fold1.pth (11MB)
Starting upload for file /kaggle/working/output_final/model_fold0.pth



Uploading: 100%|██████████| 11.6M/11.6M [00:00<00:00, 18.2MB/s]

Upload successful: /kaggle/working/output_final/model_fold0.pth (11MB)
Starting upload for file /kaggle/working/output_final/model_fold4.pth



Uploading: 100%|██████████| 11.6M/11.6M [00:00<00:00, 22.6MB/s]

Upload successful: /kaggle/working/output_final/model_fold4.pth (11MB)
Starting upload for file /kaggle/working/output_final/submission_final.csv



Uploading: 100%|██████████| 1.22M/1.22M [00:00<00:00, 2.95MB/s]

Upload successful: /kaggle/working/output_final/submission_final.csv (1MB)





Your model instance has been created.
Files are being processed...
See at: https://www.kaggle.com/models/kartikgarg74/aml_hirercial_best/keras/default


## Train 6 different NN architectures → 5-Fold each → Ensemble best models

In [8]:
# ============================================================================
# MULTI-ARCHITECTURE NEURAL NETWORK PIPELINE
# Train 6 different NN architectures → 5-Fold each → Ensemble best models
# ============================================================================
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold
from tqdm.auto import tqdm
import gc
import json
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("MULTI-ARCHITECTURE NN PIPELINE WITH ENSEMBLE")
print("="*80)

# --- CONFIG ---
EMBEDDINGS_PATH = '/kaggle/input/aml-embed-siglip-qwen3-normalized/keras/default/1'
TRAIN_CSV_PATH = "/kaggle/input/aml-csv/train.csv"
TEST_CSV_PATH = "/kaggle/input/aml-csv/test.csv"
OUTPUT_DIR = '/kaggle/working/output_multi_arch'

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 256
LEARNING_RATE = 5e-4
EPOCHS_PER_FOLD = 15
NUM_FOLDS = 5
PATIENCE = 5

os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- LOSS & METRICS ---
def smape_loss(y_pred, y_true, eps=1e-9):
    numerator = torch.abs(y_pred - y_true)
    denominator = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0
    return torch.mean(numerator / (denominator + eps))

def smape_metric(y_true, y_pred, eps=1e-9):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(numerator / (denominator + eps)) * 100

# --- LOAD EMBEDDINGS ---
print("\n✓ Loading embeddings...")
train_text = np.load(f'{EMBEDDINGS_PATH}/train_text_normalized.npy')
train_image = np.load(f'{EMBEDDINGS_PATH}/train_image_normalized.npy')
test_text = np.load(f'{EMBEDDINGS_PATH}/test_text_normalized.npy')
test_image = np.load(f'{EMBEDDINGS_PATH}/test_image_normalized.npy')

train_fused = np.concatenate([train_text, train_image], axis=1)
test_fused = np.concatenate([test_text, test_image], axis=1)

train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)
y_train = np.log1p(train_df['price'].values)

print(f"  -> Train: {train_fused.shape}, Test: {test_fused.shape}")

# ============================================================================
# DEFINE 6 DIFFERENT ARCHITECTURES
# ============================================================================

# 1. SIMPLE MLP (Baseline)
class SimpleMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.LayerNorm(input_dim),
            nn.Linear(input_dim, 512),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(256, 1)
        )
    
    def forward(self, x):
        return self.net(x).squeeze(-1)

# 2. DEEP RESIDUAL MLP
class ResidualMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, 512)
        
        self.block1 = nn.Sequential(
            nn.LayerNorm(512),
            nn.Linear(512, 512),
            nn.GELU(),
            nn.Dropout(0.15)
        )
        self.block2 = nn.Sequential(
            nn.LayerNorm(512),
            nn.Linear(512, 512),
            nn.GELU(),
            nn.Dropout(0.15)
        )
        self.output = nn.Sequential(
            nn.LayerNorm(512),
            nn.Linear(512, 256),
            nn.GELU(),
            nn.Linear(256, 1)
        )
    
    def forward(self, x):
        x = self.input_proj(x)
        x = x + self.block1(x)  # Residual
        x = x + self.block2(x)  # Residual
        return self.output(x).squeeze(-1)

# 3. WIDE & DEEP
class WideDeep(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        # Wide part
        self.wide = nn.Linear(input_dim, 1)
        
        # Deep part
        self.deep = nn.Sequential(
            nn.Linear(input_dim, 768),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(768, 384),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(384, 1)
        )
    
    def forward(self, x):
        wide_out = self.wide(x)
        deep_out = self.deep(x)
        return (wide_out + deep_out).squeeze(-1)

# 4. ATTENTION-BASED
class AttentionNet(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.proj = nn.Linear(input_dim, 512)
        self.attn = nn.MultiheadAttention(512, num_heads=8, batch_first=True)
        self.mlp = nn.Sequential(
            nn.LayerNorm(512),
            nn.Linear(512, 256),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(256, 1)
        )
    
    def forward(self, x):
        x = self.proj(x).unsqueeze(1)  # [B, 1, 512]
        x, _ = self.attn(x, x, x)
        x = x.squeeze(1)
        return self.mlp(x).squeeze(-1)

# 5. GATED LINEAR UNIT (GLU)
class GLUNet(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.gate_proj = nn.Linear(input_dim, 512)
        self.value_proj = nn.Linear(input_dim, 512)
        
        self.mlp = nn.Sequential(
            nn.LayerNorm(512),
            nn.Linear(512, 256),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(256, 1)
        )
    
    def forward(self, x):
        gate = torch.sigmoid(self.gate_proj(x))
        value = self.value_proj(x)
        gated = gate * value
        return self.mlp(gated).squeeze(-1)

# 6. MIXTURE OF EXPERTS (MoE)
class MixtureOfExperts(nn.Module):
    def __init__(self, input_dim, num_experts=4):
        super().__init__()
        self.num_experts = num_experts
        
        # Gating network
        self.gate = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, num_experts),
            nn.Softmax(dim=1)
        )
        
        # Experts
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, 256),
                nn.GELU(),
                nn.Dropout(0.15),
                nn.Linear(256, 1)
            ) for _ in range(num_experts)
        ])
    
    def forward(self, x):
        gates = self.gate(x)  # [B, num_experts]
        expert_outputs = torch.stack([expert(x).squeeze(-1) for expert in self.experts], dim=1)  # [B, num_experts]
        return (gates * expert_outputs).sum(dim=1)

# ============================================================================
# TRAINING FUNCTION
# ============================================================================
def train_architecture(model_class, model_name, input_dim, X, y):
    print(f"\n{'='*80}")
    print(f"TRAINING: {model_name}")
    print(f"{'='*80}")
    
    kfold = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
    fold_scores = []
    fold_predictions = []  # Store OOF predictions
    test_predictions = []
    
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
        print(f"\n--- Fold {fold+1}/{NUM_FOLDS} ---")
        
        X_train_fold = X[train_idx]
        X_val_fold = X[val_idx]
        y_train_fold = y[train_idx]
        y_val_fold = y[val_idx]
        y_val_price = np.expm1(y_val_fold)
        
        # Datasets
        train_dataset = TensorDataset(torch.from_numpy(X_train_fold).float(), torch.from_numpy(y_train_fold).float())
        val_dataset = TensorDataset(torch.from_numpy(X_val_fold).float(), torch.from_numpy(y_val_fold).float())
        
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=0)
        
        # Model
        model = model_class(input_dim).to(DEVICE)
        optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS_PER_FOLD)
        
        best_smape = float('inf')
        patience_counter = 0
        
        for epoch in range(EPOCHS_PER_FOLD):
            # Train
            model.train()
            for x, y_batch in train_loader:
                x, y_batch = x.to(DEVICE), y_batch.to(DEVICE)
                optimizer.zero_grad()
                pred = model(x)
                loss = 0.8 * smape_loss(torch.expm1(pred), torch.expm1(y_batch)) + 0.2 * F.mse_loss(pred, y_batch)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
            
            scheduler.step()
            
            # Validate
            model.eval()
            val_preds = []
            with torch.no_grad():
                for x, _ in val_loader:
                    val_preds.append(model(x.to(DEVICE)).cpu().numpy())
            
            val_preds = np.concatenate(val_preds)
            val_smape = smape_metric(y_val_price, np.expm1(val_preds))
            
            if val_smape < best_smape:
                best_smape = val_smape
                patience_counter = 0
                torch.save(model.state_dict(), f'{OUTPUT_DIR}/{model_name}_fold{fold}.pth')
                best_val_preds = val_preds.copy()
            else:
                patience_counter += 1
                if patience_counter >= PATIENCE:
                    break
            
            if epoch % 3 == 0:
                print(f"  Epoch {epoch+1}/{EPOCHS_PER_FOLD} | Val SMAPE: {val_smape:.2f}%")
        
        print(f"  ✓ Fold {fold+1} Best SMAPE: {best_smape:.2f}%")
        fold_scores.append(best_smape)
        fold_predictions.append((val_idx, best_val_preds))
        
        # Test predictions
        model.load_state_dict(torch.load(f'{OUTPUT_DIR}/{model_name}_fold{fold}.pth'))
        model.eval()
        test_loader = DataLoader(TensorDataset(torch.from_numpy(test_fused).float()), batch_size=512, shuffle=False)
        test_preds = []
        with torch.no_grad():
            for (x,) in test_loader:
                test_preds.append(model(x.to(DEVICE)).cpu().numpy())
        test_predictions.append(np.concatenate(test_preds))
        
        del model, optimizer
        gc.collect()
        torch.cuda.empty_cache()
    
    # Aggregate results
    avg_score = np.mean(fold_scores)
    std_score = np.std(fold_scores)
    test_pred_avg = np.mean(test_predictions, axis=0)
    
    # Compute OOF score
    oof_preds = np.zeros(len(y))
    for val_idx, preds in fold_predictions:
        oof_preds[val_idx] = preds
    oof_smape = smape_metric(np.expm1(y), np.expm1(oof_preds))
    
    print(f"\n{'='*40}")
    print(f"{model_name} RESULTS")
    print(f"{'='*40}")
    print(f"CV Score: {avg_score:.2f}% ± {std_score:.2f}%")
    print(f"OOF Score: {oof_smape:.2f}%")
    print(f"{'='*40}")
    
    return {
        'model_name': model_name,
        'cv_score': avg_score,
        'std_score': std_score,
        'oof_score': oof_smape,
        'fold_scores': fold_scores,
        'test_predictions': test_pred_avg,
        'oof_predictions': oof_preds
    }

# ============================================================================
# TRAIN ALL ARCHITECTURES
# ============================================================================
print("\n" + "="*80)
print("PHASE 1: TRAINING ALL ARCHITECTURES")
print("="*80)

architectures = [
    (SimpleMLP, "SimpleMLP"),
    (ResidualMLP, "ResidualMLP"),
    (WideDeep, "WideDeep"),
    (AttentionNet, "AttentionNet"),
    (GLUNet, "GLUNet"),
    (MixtureOfExperts, "MixtureOfExperts")
]

results = []
for model_class, model_name in architectures:
    result = train_architecture(model_class, model_name, train_fused.shape[1], train_fused, y_train)
    results.append(result)

# ============================================================================
# ENSEMBLE BEST MODELS
# ============================================================================
print("\n" + "="*80)
print("PHASE 2: ENSEMBLE BEST MODELS")
print("="*80)

# Sort by OOF score
results_sorted = sorted(results, key=lambda x: x['oof_score'])

print("\nRanking by OOF Score:")
for i, res in enumerate(results_sorted):
    print(f"{i+1}. {res['model_name']}: {res['oof_score']:.2f}% (CV: {res['cv_score']:.2f}% ± {res['std_score']:.2f}%)")

# Ensemble top 3 models
top3 = results_sorted[:3]
print(f"\n✓ Ensembling top 3: {[r['model_name'] for r in top3]}")

# Weighted average (better models get higher weight)
weights = [1.0, 0.8, 0.6]
weights = np.array(weights) / sum(weights)

ensemble_test = sum(w * res['test_predictions'] for w, res in zip(weights, top3))
ensemble_oof = sum(w * res['oof_predictions'] for w, res in zip(weights, top3))

ensemble_oof_smape = smape_metric(np.expm1(y_train), np.expm1(ensemble_oof))

print(f"\n{'='*40}")
print(f"ENSEMBLE RESULTS")
print(f"{'='*40}")
print(f"Ensemble OOF SMAPE: {ensemble_oof_smape:.2f}%")
print(f"Improvement: {results_sorted[0]['oof_score'] - ensemble_oof_smape:.2f}%")
print(f"{'='*40}")

# ============================================================================
# SAVE RESULTS
# ============================================================================
submission = pd.DataFrame({
    'sample_id': test_df['sample_id'] if 'sample_id' in test_df else test_df.index,
    'price': np.expm1(ensemble_test)
})
submission.to_csv(f'{OUTPUT_DIR}/submission_ensemble.csv', index=False)

summary = {
    'individual_models': [
        {
            'name': r['model_name'],
            'cv_score': float(r['cv_score']),
            'oof_score': float(r['oof_score']),
            'fold_scores': [float(s) for s in r['fold_scores']]
        } for r in results
    ],
    'ensemble': {
        'models': [r['model_name'] for r in top3],
        'weights': [float(w) for w in weights],
        'oof_score': float(ensemble_oof_smape)
    }
}

with open(f'{OUTPUT_DIR}/results.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"\n✓ Submission saved: {OUTPUT_DIR}/submission_ensemble.csv")
print(f"✓ Results saved: {OUTPUT_DIR}/results.json")
print("\n" + "="*80)
print("COMPLETE!")
print("="*80)


MULTI-ARCHITECTURE NN PIPELINE WITH ENSEMBLE

✓ Loading embeddings...
  -> Train: (75000, 2176), Test: (75000, 2176)

PHASE 1: TRAINING ALL ARCHITECTURES

TRAINING: SimpleMLP

--- Fold 1/5 ---
  Epoch 1/15 | Val SMAPE: 58.16%
  Epoch 4/15 | Val SMAPE: 57.11%
  Epoch 7/15 | Val SMAPE: 53.70%
  Epoch 10/15 | Val SMAPE: 52.77%
  Epoch 13/15 | Val SMAPE: 52.19%
  ✓ Fold 1 Best SMAPE: 52.10%

--- Fold 2/5 ---
  Epoch 1/15 | Val SMAPE: 59.71%
  Epoch 4/15 | Val SMAPE: 54.95%
  Epoch 7/15 | Val SMAPE: 52.81%
  Epoch 10/15 | Val SMAPE: 51.38%
  Epoch 13/15 | Val SMAPE: 51.23%
  ✓ Fold 2 Best SMAPE: 50.96%

--- Fold 3/5 ---
  Epoch 1/15 | Val SMAPE: 57.61%
  Epoch 4/15 | Val SMAPE: 54.14%
  Epoch 7/15 | Val SMAPE: 52.69%
  Epoch 10/15 | Val SMAPE: 52.03%
  Epoch 13/15 | Val SMAPE: 51.78%
  ✓ Fold 3 Best SMAPE: 51.60%

--- Fold 4/5 ---
  Epoch 1/15 | Val SMAPE: 60.35%
  Epoch 4/15 | Val SMAPE: 55.85%
  Epoch 7/15 | Val SMAPE: 52.68%
  Epoch 10/15 | Val SMAPE: 51.02%
  Epoch 13/15 | Val SMAPE: 50

In [9]:
import kagglehub

kagglehub.login()

# Replace with path to directory containing model files.
LOCAL_MODEL_DIR = '/kaggle/working/'

MODEL_SLUG = 'aml_hirercial_best' # Replace with model slug.

# Learn more about naming model variations at
# https://www.kaggle.com/docs/models#name-model.
VARIATION_SLUG = 'default' # Replace with variation slug.

kagglehub.model_upload(
  handle = f"kartikgarg74/{MODEL_SLUG}/keras/{VARIATION_SLUG}",
  local_model_dir = LOCAL_MODEL_DIR,
  version_notes = 'Update 2025-10-12')

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Uploading Model https://www.kaggle.com/models/kartikgarg74/aml_hirercial_best/keras/default ...
Starting upload for file /kaggle/working/test_image_normalized.npy


Uploading: 100%|██████████| 346M/346M [00:03<00:00, 103MB/s]  

Upload successful: /kaggle/working/test_image_normalized.npy (330MB)
Starting upload for file /kaggle/working/test_text_normalized.npy



Uploading: 100%|██████████| 307M/307M [00:02<00:00, 104MB/s]  

Upload successful: /kaggle/working/test_text_normalized.npy (293MB)
Starting upload for file /kaggle/working/train_text_normalized.npy



Uploading: 100%|██████████| 307M/307M [00:02<00:00, 128MB/s]  

Upload successful: /kaggle/working/train_text_normalized.npy (293MB)
Starting upload for file /kaggle/working/train_image_normalized.npy



Uploading: 100%|██████████| 346M/346M [00:02<00:00, 119MB/s] 

Upload successful: /kaggle/working/train_image_normalized.npy (330MB)
Starting upload for file /kaggle/working/output_final/model_fold3.pth



Uploading: 100%|██████████| 11.6M/11.6M [00:00<00:00, 23.8MB/s]

Upload successful: /kaggle/working/output_final/model_fold3.pth (11MB)
Starting upload for file /kaggle/working/output_final/model_fold2.pth



Uploading: 100%|██████████| 11.6M/11.6M [00:00<00:00, 20.9MB/s]

Upload successful: /kaggle/working/output_final/model_fold2.pth (11MB)
Starting upload for file /kaggle/working/output_final/results.json



Uploading: 100%|██████████| 356/356 [00:00<00:00, 850B/s]

Upload successful: /kaggle/working/output_final/results.json (356B)
Starting upload for file /kaggle/working/output_final/model_fold1.pth



Uploading: 100%|██████████| 11.6M/11.6M [00:00<00:00, 22.7MB/s]

Upload successful: /kaggle/working/output_final/model_fold1.pth (11MB)
Starting upload for file /kaggle/working/output_final/model_fold0.pth



Uploading: 100%|██████████| 11.6M/11.6M [00:00<00:00, 22.9MB/s]

Upload successful: /kaggle/working/output_final/model_fold0.pth (11MB)
Starting upload for file /kaggle/working/output_final/model_fold4.pth



Uploading: 100%|██████████| 11.6M/11.6M [00:00<00:00, 21.8MB/s]

Upload successful: /kaggle/working/output_final/model_fold4.pth (11MB)
Starting upload for file /kaggle/working/output_final/submission_final.csv



Uploading: 100%|██████████| 1.22M/1.22M [00:00<00:00, 2.80MB/s]

Upload successful: /kaggle/working/output_final/submission_final.csv (1MB)
Starting upload for file /kaggle/working/output_multi_arch/AttentionNet_fold4.pth



Uploading: 100%|██████████| 9.20M/9.20M [00:00<00:00, 17.0MB/s]

Upload successful: /kaggle/working/output_multi_arch/AttentionNet_fold4.pth (9MB)
Starting upload for file /kaggle/working/output_multi_arch/GLUNet_fold2.pth



Uploading: 100%|██████████| 9.45M/9.45M [00:00<00:00, 17.8MB/s]

Upload successful: /kaggle/working/output_multi_arch/GLUNet_fold2.pth (9MB)
Starting upload for file /kaggle/working/output_multi_arch/ResidualMLP_fold3.pth



Uploading: 100%|██████████| 7.10M/7.10M [00:00<00:00, 14.3MB/s]

Upload successful: /kaggle/working/output_multi_arch/ResidualMLP_fold3.pth (7MB)
Starting upload for file /kaggle/working/output_multi_arch/AttentionNet_fold1.pth



Uploading: 100%|██████████| 9.20M/9.20M [00:00<00:00, 18.2MB/s]

Upload successful: /kaggle/working/output_multi_arch/AttentionNet_fold1.pth (9MB)
Starting upload for file /kaggle/working/output_multi_arch/WideDeep_fold2.pth



Uploading: 100%|██████████| 7.88M/7.88M [00:00<00:00, 14.9MB/s]

Upload successful: /kaggle/working/output_multi_arch/WideDeep_fold2.pth (8MB)
Starting upload for file /kaggle/working/output_multi_arch/SimpleMLP_fold3.pth



Uploading: 100%|██████████| 5.01M/5.01M [00:00<00:00, 10.3MB/s]

Upload successful: /kaggle/working/output_multi_arch/SimpleMLP_fold3.pth (5MB)
Starting upload for file /kaggle/working/output_multi_arch/GLUNet_fold3.pth



Uploading: 100%|██████████| 9.45M/9.45M [00:00<00:00, 18.8MB/s]

Upload successful: /kaggle/working/output_multi_arch/GLUNet_fold3.pth (9MB)
Starting upload for file /kaggle/working/output_multi_arch/ResidualMLP_fold2.pth



Uploading: 100%|██████████| 7.10M/7.10M [00:00<00:00, 12.3MB/s]

Upload successful: /kaggle/working/output_multi_arch/ResidualMLP_fold2.pth (7MB)
Starting upload for file /kaggle/working/output_multi_arch/MixtureOfExperts_fold2.pth



Uploading: 100%|██████████| 10.0M/10.0M [00:00<00:00, 19.4MB/s]

Upload successful: /kaggle/working/output_multi_arch/MixtureOfExperts_fold2.pth (10MB)
Starting upload for file /kaggle/working/output_multi_arch/MixtureOfExperts_fold1.pth



Uploading: 100%|██████████| 10.0M/10.0M [00:00<00:00, 18.7MB/s]

Upload successful: /kaggle/working/output_multi_arch/MixtureOfExperts_fold1.pth (10MB)
Starting upload for file /kaggle/working/output_multi_arch/ResidualMLP_fold1.pth



Uploading: 100%|██████████| 7.10M/7.10M [00:00<00:00, 14.2MB/s]

Upload successful: /kaggle/working/output_multi_arch/ResidualMLP_fold1.pth (7MB)
Starting upload for file /kaggle/working/output_multi_arch/SimpleMLP_fold1.pth



Uploading: 100%|██████████| 5.01M/5.01M [00:00<00:00, 9.91MB/s]

Upload successful: /kaggle/working/output_multi_arch/SimpleMLP_fold1.pth (5MB)
Starting upload for file /kaggle/working/output_multi_arch/GLUNet_fold4.pth



Uploading: 100%|██████████| 9.45M/9.45M [00:00<00:00, 18.5MB/s]

Upload successful: /kaggle/working/output_multi_arch/GLUNet_fold4.pth (9MB)
Starting upload for file /kaggle/working/output_multi_arch/WideDeep_fold0.pth



Uploading: 100%|██████████| 7.88M/7.88M [00:00<00:00, 15.6MB/s]

Upload successful: /kaggle/working/output_multi_arch/WideDeep_fold0.pth (8MB)
Starting upload for file /kaggle/working/output_multi_arch/WideDeep_fold4.pth



Uploading: 100%|██████████| 7.88M/7.88M [00:00<00:00, 16.3MB/s]

Upload successful: /kaggle/working/output_multi_arch/WideDeep_fold4.pth (8MB)
Starting upload for file /kaggle/working/output_multi_arch/results.json



Uploading: 100%|██████████| 1.95k/1.95k [00:00<00:00, 4.72kB/s]

Upload successful: /kaggle/working/output_multi_arch/results.json (2KB)
Starting upload for file /kaggle/working/output_multi_arch/MixtureOfExperts_fold4.pth



Uploading: 100%|██████████| 10.0M/10.0M [00:00<00:00, 18.7MB/s]

Upload successful: /kaggle/working/output_multi_arch/MixtureOfExperts_fold4.pth (10MB)
Starting upload for file /kaggle/working/output_multi_arch/GLUNet_fold0.pth



Uploading: 100%|██████████| 9.45M/9.45M [00:00<00:00, 18.0MB/s]

Upload successful: /kaggle/working/output_multi_arch/GLUNet_fold0.pth (9MB)
Starting upload for file /kaggle/working/output_multi_arch/WideDeep_fold1.pth



Uploading: 100%|██████████| 7.88M/7.88M [00:00<00:00, 15.3MB/s]

Upload successful: /kaggle/working/output_multi_arch/WideDeep_fold1.pth (8MB)
Starting upload for file /kaggle/working/output_multi_arch/ResidualMLP_fold4.pth



Uploading: 100%|██████████| 7.10M/7.10M [00:00<00:00, 14.1MB/s]

Upload successful: /kaggle/working/output_multi_arch/ResidualMLP_fold4.pth (7MB)
Starting upload for file /kaggle/working/output_multi_arch/SimpleMLP_fold4.pth



Uploading: 100%|██████████| 5.01M/5.01M [00:00<00:00, 9.46MB/s]

Upload successful: /kaggle/working/output_multi_arch/SimpleMLP_fold4.pth (5MB)
Starting upload for file /kaggle/working/output_multi_arch/ResidualMLP_fold0.pth



Uploading: 100%|██████████| 7.10M/7.10M [00:00<00:00, 15.0MB/s]

Upload successful: /kaggle/working/output_multi_arch/ResidualMLP_fold0.pth (7MB)
Starting upload for file /kaggle/working/output_multi_arch/submission_ensemble.csv



Uploading: 100%|██████████| 1.22M/1.22M [00:00<00:00, 2.75MB/s]

Upload successful: /kaggle/working/output_multi_arch/submission_ensemble.csv (1MB)
Starting upload for file /kaggle/working/output_multi_arch/SimpleMLP_fold2.pth



Uploading: 100%|██████████| 5.01M/5.01M [00:00<00:00, 10.0MB/s]

Upload successful: /kaggle/working/output_multi_arch/SimpleMLP_fold2.pth (5MB)
Starting upload for file /kaggle/working/output_multi_arch/GLUNet_fold1.pth



Uploading: 100%|██████████| 9.45M/9.45M [00:00<00:00, 18.1MB/s]

Upload successful: /kaggle/working/output_multi_arch/GLUNet_fold1.pth (9MB)
Starting upload for file /kaggle/working/output_multi_arch/AttentionNet_fold3.pth



Uploading: 100%|██████████| 9.20M/9.20M [00:00<00:00, 17.1MB/s]

Upload successful: /kaggle/working/output_multi_arch/AttentionNet_fold3.pth (9MB)
Starting upload for file /kaggle/working/output_multi_arch/MixtureOfExperts_fold3.pth



Uploading: 100%|██████████| 10.0M/10.0M [00:00<00:00, 19.6MB/s]

Upload successful: /kaggle/working/output_multi_arch/MixtureOfExperts_fold3.pth (10MB)
Starting upload for file /kaggle/working/output_multi_arch/MixtureOfExperts_fold0.pth



Uploading: 100%|██████████| 10.0M/10.0M [00:00<00:00, 20.6MB/s]

Upload successful: /kaggle/working/output_multi_arch/MixtureOfExperts_fold0.pth (10MB)
Starting upload for file /kaggle/working/output_multi_arch/SimpleMLP_fold0.pth



Uploading: 100%|██████████| 5.01M/5.01M [00:00<00:00, 10.5MB/s]

Upload successful: /kaggle/working/output_multi_arch/SimpleMLP_fold0.pth (5MB)
Starting upload for file /kaggle/working/output_multi_arch/WideDeep_fold3.pth



Uploading: 100%|██████████| 7.88M/7.88M [00:00<00:00, 16.7MB/s]

Upload successful: /kaggle/working/output_multi_arch/WideDeep_fold3.pth (8MB)
Starting upload for file /kaggle/working/output_multi_arch/AttentionNet_fold0.pth



Uploading: 100%|██████████| 9.20M/9.20M [00:00<00:00, 18.2MB/s]

Upload successful: /kaggle/working/output_multi_arch/AttentionNet_fold0.pth (9MB)
Starting upload for file /kaggle/working/output_multi_arch/AttentionNet_fold2.pth



Uploading: 100%|██████████| 9.20M/9.20M [00:00<00:00, 14.4MB/s]

Upload successful: /kaggle/working/output_multi_arch/AttentionNet_fold2.pth (9MB)





Your model instance version has been created.
Files are being processed...
See at: https://www.kaggle.com/models/kartikgarg74/aml_hirercial_best/keras/default


In [None]:
# ======================================================================================
# Qwen3 (text) + average(SigLIP, DINOv2) (image)
# Robust version with automatic fallback if DINOv2 count mismatches
# ======================================================================================

import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# -------------------------
# PATHS (match your files exactly)
# -------------------------
SIGLIP_QWEN_DIR = "/kaggle/input/aml-embed-siglip-qwen3-normalized/keras/default/1"
DINO_DIR        = "/kaggle/input/aml_new/keras/default/1/output_v4_advanced/embeddings"

TRAIN_CSV_PATH  = "/kaggle/input/aml-csv/train.csv"
TEST_CSV_PATH   = "/kaggle/input/aml-csv/test.csv"
OUT_DIR         = "/kaggle/working/output_fusion_qwen_siglip_dinov2"

os.makedirs(OUT_DIR, exist_ok=True)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 40
BATCH_SIZE = 256
LR = 3e-4
PATIENCE = 8
FUSION_DIM = 512
DROPOUT = 0.1

# Loss weights
INFO_NCE_WEIGHT   = 0.10
PRICE_SMAPE_WEIGHT= 0.50
PRICE_MSE_WEIGHT  = 0.10
CAT_CE_WEIGHT     = 0.15
BRAND_CE_WEIGHT   = 0.15

print(f"Device: {DEVICE}")

# -------------------------
# UTILS
# -------------------------
def l2_normalize(x, axis=1, eps=1e-9):
    return x / (np.sqrt((x * x).sum(axis=axis, keepdims=True)) + eps)

def smape_torch(y_pred, y_true, eps=1e-9):
    num = torch.abs(y_pred - y_true)
    den = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0
    return torch.mean(num / (den + eps))

def smape_np(y_true, y_pred, eps=1e-9):
    num = np.abs(y_pred - y_true)
    den = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(num / (den + eps))

def info_nce(text_embeds, image_embeds, temperature=0.07):
    t = F.normalize(text_embeds, p=2, dim=-1)
    v = F.normalize(image_embeds, p=2, dim=-1)
    logits = (t @ v.T) / temperature
    labels = torch.arange(logits.size(0), device=logits.device)
    return (F.cross_entropy(logits, labels) + F.cross_entropy(logits.T, labels)) / 2.0

def extract_category_simple(text):
    s = str(text).lower()
    if "electronic" in s: return "electronics"
    if "book" in s: return "books"
    if any(k in s for k in ["fashion","clothing","wear"]): return "fashion"
    if any(k in s for k in ["home","kitchen"]): return "home"
    return "other"

def extract_brand_simple(text):
    s = str(text).strip().split()
    return s[0].lower() if len(s) else "unknown"

# -------------------------
# LOAD DATA
# -------------------------
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df  = pd.read_csv(TEST_CSV_PATH)

train_df["category"] = train_df["catalog_content"].apply(extract_category_simple)
train_df["brand"]    = train_df["catalog_content"].apply(extract_brand_simple)

cat_le = LabelEncoder().fit(train_df["category"])
brand_le = LabelEncoder().fit(train_df["brand"])

y_price_log = np.log1p(train_df["price"].astype(np.float32).values)
y_cat   = cat_le.transform(train_df["category"]).astype(np.int64)
y_brand = brand_le.transform(train_df["brand"]).astype(np.int64)

num_categories = len(cat_le.classes_)
num_brands     = len(brand_le.classes_)
print(f"Train: {len(train_df)} | Test: {len(test_df)} | Cats: {num_categories}, Brands: {num_brands}")

# -------------------------
# LOAD EMBEDDINGS
# -------------------------
# Qwen3 text (normalized)
q_train = np.load(os.path.join(SIGLIP_QWEN_DIR, "train_text_normalized.npy"))
q_test  = np.load(os.path.join(SIGLIP_QWEN_DIR, "test_text_normalized.npy"))

# SigLIP image (normalized)
sig_train = np.load(os.path.join(SIGLIP_QWEN_DIR, "train_image_normalized.npy"))
sig_test  = np.load(os.path.join(SIGLIP_QWEN_DIR, "test_image_normalized.npy"))

print("Loaded shapes:")
print("  Qwen3 :", q_train.shape, q_test.shape)
print("  SigLIP:", sig_train.shape, sig_test.shape)

# DINOv2 image
try:
    dino_train = np.load(os.path.join(DINO_DIR, "image_train.npy"))
    dino_test  = np.load(os.path.join(DINO_DIR, "image_test.npy"))
    print(f"  DINOv2: {dino_train.shape}, {dino_test.shape}")

    if dino_train.shape[0] != len(train_df) or dino_test.shape[0] != len(test_df):
        print("⚠️ DINOv2 count mismatch detected → using SigLIP only for image embeddings.")
        dino_train = sig_train.copy()
        dino_test  = sig_test.copy()
    else:
        dino_train = l2_normalize(dino_train)
        dino_test  = l2_normalize(dino_test)
except Exception as e:
    print(f"⚠️ Could not load DINOv2 ({e}) → using SigLIP only.")
    dino_train = sig_train.copy()
    dino_test  = sig_test.copy()

# Normalize Qwen3 and SigLIP
q_train, q_test = l2_normalize(q_train), l2_normalize(q_test)
sig_train, sig_test = l2_normalize(sig_train), l2_normalize(sig_test)

# Average SigLIP + DINOv2 → single image embedding
img_train = l2_normalize((sig_train + dino_train) / 2.0)
img_test  = l2_normalize((sig_test  + dino_test)  / 2.0)

print("Final image (avg) dims:", img_train.shape, img_test.shape)

# -------------------------
# TRAIN/VAL SPLIT (stratify by category)
# -------------------------
tr_idx, val_idx = train_test_split(
    np.arange(len(train_df)),
    test_size=0.2,
    random_state=42,
    stratify=y_cat
)

Xq_tr, Xi_tr = q_train[tr_idx], img_train[tr_idx]
Xq_val, Xi_val = q_train[val_idx], img_train[val_idx]
ypl_tr, ypl_val = y_price_log[tr_idx], y_price_log[val_idx]
ycl_tr, ycl_val = y_cat[tr_idx], y_cat[val_idx]
ybl_tr, ybl_val = y_brand[tr_idx], y_brand[val_idx]

print("Shapes → text:", Xq_tr.shape, Xq_val.shape, "| image:", Xi_tr.shape, Xi_val.shape)

# -------------------------
# DATA LOADERS
# -------------------------
train_ds = TensorDataset(
    torch.from_numpy(Xq_tr).float(),
    torch.from_numpy(Xi_tr).float(),
    torch.from_numpy(ypl_tr).float(),
    torch.from_numpy(ycl_tr).long(),
    torch.from_numpy(ybl_tr).long(),
)
val_ds = TensorDataset(
    torch.from_numpy(Xq_val).float(),
    torch.from_numpy(Xi_val).float(),
    torch.from_numpy(ypl_val).float(),
    torch.from_numpy(ycl_val).long(),
    torch.from_numpy(ybl_val).long(),
)
test_ds = TensorDataset(
    torch.from_numpy(q_test).float(),
    torch.from_numpy(img_test).float()
)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=2)

# -------------------------
# MODEL
# -------------------------
class Projection(nn.Module):
    def __init__(self, in_dim, out_dim, p=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.LayerNorm(in_dim),
            nn.Linear(in_dim, out_dim),
            nn.GELU(),
            nn.Dropout(p),
        )
    def forward(self, x): return self.net(x)

class FusionNet(nn.Module):
    def __init__(self, text_dim, image_dim, fusion_dim, n_cat, n_brand):
        super().__init__()
        self.text_proj  = Projection(text_dim, fusion_dim, DROPOUT)
        self.image_proj = Projection(image_dim, fusion_dim, DROPOUT)
        self.fusion = nn.Sequential(
            nn.LayerNorm(fusion_dim*2),
            nn.Linear(fusion_dim*2, fusion_dim),
            nn.GELU(),
            nn.Dropout(DROPOUT),
            nn.Linear(fusion_dim, fusion_dim//2),
            nn.GELU(),
            nn.Dropout(DROPOUT),
        )
        self.price_head = nn.Linear(fusion_dim//2, 1)
        self.cat_head   = nn.Linear(fusion_dim//2, n_cat)
        self.brand_head = nn.Linear(fusion_dim//2, n_brand)

    def forward(self, t_inp, v_inp):
        t = self.text_proj(t_inp)
        v = self.image_proj(v_inp)
        h = self.fusion(torch.cat([t, v], dim=1))
        return (self.price_head(h).squeeze(-1), self.cat_head(h), self.brand_head(h), t, v)

model = FusionNet(
    text_dim=Xq_tr.shape[1],
    image_dim=Xi_tr.shape[1],
    fusion_dim=FUSION_DIM,
    n_cat=num_categories,
    n_brand=num_brands
).to(DEVICE)

opt   = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-2)
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=3, verbose=True)
mse, ce = nn.MSELoss(), nn.CrossEntropyLoss()

print(f"Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

# -------------------------
# TRAIN
# -------------------------
best_smape = float("inf")
pat = 0
BEST = os.path.join(OUT_DIR, "best_model.pth")

for epoch in range(1, EPOCHS+1):
    model.train()
    total = 0.0
    for tq, ti, ypl, ycl, ybl in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}"):
        tq, ti, ypl, ycl, ybl = tq.to(DEVICE), ti.to(DEVICE), ypl.to(DEVICE), ycl.to(DEVICE), ybl.to(DEVICE)
        opt.zero_grad()
        price_log, cat_logits, brand_logits, tp, vp = model(tq, ti)
        loss = (
            PRICE_SMAPE_WEIGHT * smape_torch(torch.expm1(price_log), torch.expm1(ypl)) +
            PRICE_MSE_WEIGHT   * mse(price_log, ypl) +
            CAT_CE_WEIGHT      * ce(cat_logits, ycl) +
            BRAND_CE_WEIGHT    * ce(brand_logits, ybl) +
            INFO_NCE_WEIGHT    * info_nce(tp, vp)
        )
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        total += loss.item()
    train_loss = total / len(train_loader)

    # Validation
    model.eval()
    val_preds = []
    with torch.no_grad():
        for tq, ti, _, _, _ in val_loader:
            price_log, _, _, _, _ = model(tq.to(DEVICE), ti.to(DEVICE))
            val_preds.append(price_log.cpu().numpy())
    val_preds = np.concatenate(val_preds)
    val_smape = smape_np(np.expm1(ypl_val), np.expm1(val_preds))
    sched.step(val_smape)

    print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_SMAPE={val_smape:.4f}")
    if val_smape < best_smape:
        best_smape, pat = val_smape, 0
        torch.save(model.state_dict(), BEST)
        print(f"  ✓ New best saved: {BEST}")
    else:
        pat += 1
        print(f"  No improvement (patience {pat}/{PATIENCE})")
        if pat >= PATIENCE:
            print("  → Early stopping.")
            break

print(f"\nBest Validation SMAPE: {best_smape:.4f}")

# -------------------------
# PREDICT TEST
# -------------------------
model.load_state_dict(torch.load(BEST, map_location=DEVICE))
model.eval()

test_logits = []
with torch.no_grad():
    for tq, ti in tqdm(test_loader, desc="Predicting"):
        price_log, _, _, _, _ = model(tq.to(DEVICE), ti.to(DEVICE))
        test_logits.append(price_log.cpu().numpy())
test_logits = np.concatenate(test_logits)
test_price = np.expm1(test_logits)

sub = pd.DataFrame({
    "sample_id": test_df.get("sample_id", pd.Series(range(len(test_df)))),
    "price": test_price
})
sub_path = os.path.join(OUT_DIR, "submission.csv")
sub.to_csv(sub_path, index=False)
print(f"✓ Submission saved: {sub_path}")


Device: cuda
Train: 75000 | Test: 75000 | Cats: 5, Brands: 1
Loaded shapes:
  Qwen3 : (75000, 1024) (75000, 1024)
  SigLIP: (75000, 1152) (75000, 1152)
  DINOv2: (60000, 1024), (75000, 1024)
⚠️ DINOv2 count mismatch detected → using SigLIP only for image embeddings.
Final image (avg) dims: (75000, 1152) (75000, 1152)
Shapes → text: (60000, 1024) (15000, 1024) | image: (60000, 1152) (15000, 1152)
Trainable params: 1,779,463




Epoch 1/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>Traceback (most recent call last):

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()    
self._shutdown_workers()  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
if w.is_alive():
     if w.is_alive():  
       ^  ^^ ^ ^^^^^^^^^^^^^^^^
^  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
^    ^assert self._parent_pid == os.getpid(), 'can only test a child process'^

  File "/usr/lib/python

Epoch 01 | train_loss=1.0068 | val_SMAPE=0.5624
  ✓ New best saved: /kaggle/working/output_fusion_qwen_siglip_dinov2/best_model.pth


Epoch 2/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()Exception ignored in: 
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
    Traceback (most recent call last):
if w.is_alive():  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__

       self._shutdown_workers()
   File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
      if w.is_alive(): 
^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^
^  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
^    ^assert self._parent_pid == os.getpid(), 'can only test a child process'^
 ^  ^ ^ 
    File "/usr/lib

Epoch 02 | train_loss=0.9481 | val_SMAPE=0.5826
  No improvement (patience 1/8)


Epoch 3/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>^^
^Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^^    self._shutdown_workers()^
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
^^    ^if w.is_alive():^

   File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
      assert self._parent_pid == os.getpid(), 'can only test a child process' 
      ^ ^ ^^  ^ ^^ ^  ^^^^^^^^^
^^  Fil

Epoch 03 | train_loss=0.9223 | val_SMAPE=0.5186
  ✓ New best saved: /kaggle/working/output_fusion_qwen_siglip_dinov2/best_model.pth


Epoch 4/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    Exception ignored in: assert self._parent_pid == os.getpid(), 'can only test a child process'<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
Exception ignored in: 
<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60> Traceback (most recent call last):

   File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
Traceback (most recent call last):
   File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py

Epoch 04 | train_loss=0.8964 | val_SMAPE=0.5182
  ✓ New best saved: /kaggle/working/output_fusion_qwen_siglip_dinov2/best_model.pth


Epoch 5/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60><function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():    
 if w.is_alive():
          ^ ^ ^^ ^^^Exception ignored in: ^Exception ignored in: ^^<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>^
<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>^^Traceback (m

Epoch 05 | train_loss=0.8727 | val_SMAPE=0.5177
  ✓ New best saved: /kaggle/working/output_fusion_qwen_siglip_dinov2/best_model.pth


Exception ignored in: 

Epoch 6/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60><function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():      File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
if w.is_alive():

              ^^^^^^^^^^Exception ignored in: ^^<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>^Exception ignored in: ^^
^^<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>Traceback (most recent call la

Epoch 06 | train_loss=0.8516 | val_SMAPE=0.5131
  ✓ New best saved: /kaggle/working/output_fusion_qwen_siglip_dinov2/best_model.pth


Epoch 7/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60><function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
Traceback (most recent call last):

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        if w.is_alive():if w.is_alive():
 
       Exception ignored in:   <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>   ^ 
^Traceback (most recent call last):
^^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader

Epoch 07 | train_loss=0.8333 | val_SMAPE=0.5108
  ✓ New best saved: /kaggle/working/output_fusion_qwen_siglip_dinov2/best_model.pth


Epoch 8/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 16

Epoch 08 | train_loss=0.8148 | val_SMAPE=0.5115
  No improvement (patience 1/8)


Epoch 9/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
Exception ignored in:     <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
Traceback (most recent call last):
self._shutdown_workers()  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__

      File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    self._shutdown_workers()if w.is_alive():

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
       if w.is_alive(): 
       ^^  ^^ ^^^^^^^^^^^^^^^^^
^  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
^    ^
assert self._parent_pid == os.getpid(), 'can only test a child process'  File "/usr/lib/python3

Epoch 09 | train_loss=0.8001 | val_SMAPE=0.5389
  No improvement (patience 2/8)


Epoch 10/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
Exception ignored in:     <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>self._shutdown_workers()

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    if w.is_alive():
     self._shutdown_workers() 
   File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
      if w.is_alive(): 
  ^ ^ ^ ^^  ^^ ^^^^^^^^^
^^  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    ^^assert self._parent_pid == os.getpid(), 'can only test a child process'^
^ ^ ^ 
    File "/usr/lib/p

Epoch 10 | train_loss=0.7849 | val_SMAPE=0.5163
  No improvement (patience 3/8)


Epoch 11/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>^^
^Traceback (most recent call last):
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^^    self._shutdown_workers()

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        if w.is_alive():assert self._parent_pid == os.getpid(), 'can only test a child process'

               ^ ^^ ^ ^^^^^^^^^^^^^^^^
^  F

Epoch 11 | train_loss=0.7753 | val_SMAPE=0.5243
  No improvement (patience 4/8)


Epoch 12/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>    
self._shutdown_workers()Traceback (most recent call last):

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
        self._shutdown_workers()if w.is_alive():

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
      if w.is_alive():  
     ^ ^ ^^  ^ ^^^^^^^^^^^^^^
^^  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
^    assert self._parent_pid == os.getpid(), 'can only test a child process'^
^
   File "/usr/lib/pytho

Epoch 12 | train_loss=0.7507 | val_SMAPE=0.5123
  No improvement (patience 5/8)


Epoch 13/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60><function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

    if w.is_alive():  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

      if w.is_alive(): 
         ^  ^^^^^^^^^^^^^^^^^^^^^Exception ignored in: ^^
<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>

  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
Traceback

Epoch 13 | train_loss=0.7380 | val_SMAPE=0.5188
  No improvement (patience 6/8)


Epoch 14/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>Exception ignored in: 
<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>Traceback (most recent call last):

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
Traceback (most recent call last):
      File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
self._shutdown_workers()    
self._shutdown_workers()  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    
    if w.is_alive():if w.is_alive():
             ^^ ^^^^Exception ignored in: ^^<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>^^
^Traceback (most recent call last):
^^^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/

Epoch 14 | train_loss=0.7302 | val_SMAPE=0.5336
  No improvement (patience 7/8)


Epoch 15/40:   0%|          | 0/235 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60><function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():    
if w.is_alive():  
         ^ ^ ^ ^^^^^Exception ignored in: ^^<function _MultiProcessingDataLoaderIter.__del__ at 0x78160b63ba60>^Exception ignored in: ^^
^Traceback (most recent call last):
<function _MultiProcessingDataLoaderIter.__

## qwen_3

In [5]:
# ======================================================================================
# End-to-End Multimodal Pipeline (Re-extract + Train + Predict)
# Qwen3 (text) + average(SigLIP, DINO) (image)  |  Kaggle-ready, robust fallbacks
# Saves ALL outputs under /kaggle/working/
# ======================================================================================

import os, gc, math, json, threading
from queue import Queue

import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# -------------------------
# PATHS (EDIT ONLY IF NEEDED)
# -------------------------
TRAIN_CSV_PATH = "/kaggle/input/aml-csv/train.csv"
TEST_CSV_PATH  = "/kaggle/input/aml-csv/test.csv"

IMAGES_TRAIN = "/kaggle/input/aml-train/AMAZON_ML_TRAIN"
IMAGES_TEST  = "/kaggle/input/amazon-ml-test/AMAZON_ML_TEST"

# Local HF model datasets in Kaggle (change if your folders differ)
QWEN3_MODEL_PATH  = "/kaggle/input/qwen-3-embedding/transformers/0.6b/1"        # Qwen3 0.6B Text Embeddings
SIGLIP_MODEL_PATH = "/kaggle/input/google-siglip-so400m-patch14-384/transformers/default/1"  # SigLIP-So400m

# Try DINOv3 first, then DINOv2; if none load, we fall back to SigLIP-only image embeddings
DINO_MODEL_CANDIDATES = [
    # Add your DINOv3 Kaggle dataset path here if you have one:
    # "/kaggle/input/dinov3-large-.../transformers/default/1",
    # Common DINOv2 Kaggle packs (example — update if you have another):
    "/kaggle/input/facebook-dinov2-base/transformers/default/1",
    "/kaggle/input/facebook-dinov2-large/transformers/default/1",
]

OUT_DIR = "/kaggle/working/"
os.makedirs(OUT_DIR, exist_ok=True)

# -------------------------
# RUNTIME CONFIG
# -------------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE_CPU = torch.device("cpu")
TORCH_DTYPE_HALF = torch.float16 if torch.cuda.is_available() else torch.float32

TEXT_MAXLEN = 256
BATCH_TEXT  = 24
BATCH_IMG   = 24

EPOCHS = 40
BATCH_TRAIN = 256
LR = 3e-4
PATIENCE = 8
FUSION_DIM = 512
DROPOUT = 0.10
WEIGHT_DECAY = 1e-2

# Loss weights
INFO_NCE_W   = 0.10
PRICE_SMAPE_W= 0.50
PRICE_MSE_W  = 0.10
CAT_CE_W     = 0.15
BRAND_CE_W   = 0.15

print(f"Device: {DEVICE}")

# -------------------------
# HELPERS
# -------------------------
def l2_normalize(x, axis=1, eps=1e-9):
    return x / (np.sqrt((x * x).sum(axis=axis, keepdims=True)) + eps)

def smape_torch(y_pred, y_true, eps=1e-9):
    num = torch.abs(y_pred - y_true)
    den = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0
    return torch.mean(num / (den + eps))

def smape_np(y_true, y_pred, eps=1e-9):
    num = np.abs(y_pred - y_true)
    den = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(num / (den + eps))

def info_nce(t, v, temperature=0.07):
    t = F.normalize(t, p=2, dim=-1)
    v = F.normalize(v, p=2, dim=-1)
    logits = (t @ v.T) / temperature
    labels = torch.arange(logits.size(0), device=logits.device)
    return (F.cross_entropy(logits, labels) + F.cross_entropy(logits.T, labels)) / 2.0

def extract_category_simple(text):
    s = str(text).lower()
    if "electronic" in s: return "electronics"
    if "book" in s: return "books"
    if any(k in s for k in ["fashion","clothing","wear"]): return "fashion"
    if any(k in s for k in ["home","kitchen"]): return "home"
    return "other"

def extract_brand_simple(text):
    s = str(text).strip().split()
    return s[0].lower() if len(s) else "unknown"

def load_image_paths(df, root):
    return [os.path.join(root, f"{sid}.jpg") for sid in df["sample_id"]]

def parallel_load_images(paths, num_threads=4):
    images = [None]*len(paths)
    q = Queue()
    for i,p in enumerate(paths): q.put((i,p))

    def worker():
        while not q.empty():
            try:
                i, p = q.get_nowait()
                try:
                    im = Image.open(p).convert("RGB")
                except:
                    im = Image.new("RGB", (384, 384))
                images[i] = im
                q.task_done()
            except:
                break

    thrs = [threading.Thread(target=worker) for _ in range(num_threads)]
    [t.start() for t in thrs]; [t.join() for t in thrs]
    return images

# -------------------------
# LOAD CSVs
# -------------------------
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df  = pd.read_csv(TEST_CSV_PATH)
if "sample_id" not in train_df.columns:
    train_df["sample_id"] = np.arange(len(train_df))
if "sample_id" not in test_df.columns:
    test_df["sample_id"] = np.arange(len(test_df))

# Simple aux labels
train_df["category"] = train_df["catalog_content"].apply(extract_category_simple)
train_df["brand"]    = train_df["catalog_content"].apply(extract_brand_simple)

cat_le = LabelEncoder().fit(train_df["category"])
brand_le = LabelEncoder().fit(train_df["brand"])

y_price_log = np.log1p(train_df["price"].astype(np.float32).values)
y_cat   = cat_le.transform(train_df["category"]).astype(np.int64)
y_brand = brand_le.transform(train_df["brand"]).astype(np.int64)

num_categories = len(cat_le.classes_)
num_brands = len(brand_le.classes_)

print(f"Train: {len(train_df)}, Test: {len(test_df)} | Cats: {num_categories}, Brands: {num_brands}")

# -------------------------
# EXTRACT TEXT (Qwen3)
# -------------------------
print("\n=== TEXT (Qwen3) ===")
from transformers import AutoModel, AutoTokenizer

text_model = AutoModel.from_pretrained(
    QWEN3_MODEL_PATH,
    trust_remote_code=True,
    torch_dtype=TORCH_DTYPE_HALF
).to(DEVICE).eval()
text_tok = AutoTokenizer.from_pretrained(QWEN3_MODEL_PATH, trust_remote_code=True)

def encode_text(texts, bs=24):
    out = []
    for i in tqdm(range(0, len(texts), bs), desc="Qwen3 Text"):
        batch = texts[i:i+bs]
        inputs = text_tok(batch, padding=True, truncation=True, max_length=TEXT_MAXLEN, return_tensors="pt")
        inputs = {k: v.to(DEVICE) for k,v in inputs.items()}
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=(DEVICE.type=="cuda")):
                model_out = text_model(**inputs)
                if hasattr(model_out, "last_hidden_state"):
                    emb = model_out.last_hidden_state[:,0,:]
                else:
                    emb = model_out.pooler_output
                emb = F.normalize(emb.float(), p=2, dim=1)
        out.append(emb.cpu().numpy())
        del inputs, model_out, emb
        if DEVICE.type=="cuda" and i%200==0:
            torch.cuda.empty_cache()
    return np.vstack(out)

train_texts = train_df["catalog_content"].fillna("").tolist()
test_texts  = test_df["catalog_content"].fillna("").tolist()

q_train = encode_text(train_texts, BATCH_TEXT)
q_test  = encode_text(test_texts,  BATCH_TEXT)

np.save(os.path.join(OUT_DIR, "qwen3_train.npy"), q_train)
np.save(os.path.join(OUT_DIR, "qwen3_test.npy"),  q_test)
print("Qwen3:", q_train.shape, q_test.shape)

del text_model, text_tok; gc.collect()
if DEVICE.type=="cuda":
    torch.cuda.empty_cache(); torch.cuda.synchronize()

# -------------------------
# EXTRACT IMAGES (SigLIP)
# -------------------------
print("\n=== IMAGES (SigLIP) ===")
from transformers import AutoImageProcessor

sig_model = AutoModel.from_pretrained(
    SIGLIP_MODEL_PATH,
    torch_dtype=TORCH_DTYPE_HALF
).to(DEVICE).eval()
sig_proc = AutoImageProcessor.from_pretrained(SIGLIP_MODEL_PATH)

train_img_paths = load_image_paths(train_df, IMAGES_TRAIN)
test_img_paths  = load_image_paths(test_df,  IMAGES_TEST)

def encode_images(model, processor, paths, bs=24, tag="SigLIP", device=DEVICE):
    out = []
    for i in tqdm(range(0, len(paths), bs), desc=f"{tag} Images"):
        batch_paths = paths[i:i+bs]
        images = parallel_load_images(batch_paths, num_threads=4)
        inputs = processor(images, return_tensors="pt").to(device)
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=(device.type=="cuda")):
                mo = model(**inputs)
                if hasattr(mo, "image_embeds"):
                    emb = mo.image_embeds
                elif hasattr(mo, "last_hidden_state"):
                    emb = mo.last_hidden_state.mean(dim=1)
                else:
                    emb = mo.pooler_output
                emb = F.normalize(emb.float(), p=2, dim=1)
        out.append(emb.cpu().numpy())
        del images, inputs, mo, emb
        if device.type=="cuda" and i%200==0:
            torch.cuda.empty_cache()
    return np.vstack(out)

sig_train = encode_images(sig_model, sig_proc, train_img_paths, BATCH_IMG, "SigLIP")
sig_test  = encode_images(sig_model, sig_proc, test_img_paths,  BATCH_IMG, "SigLIP")

np.save(os.path.join(OUT_DIR, "siglip_train.npy"), sig_train)
np.save(os.path.join(OUT_DIR, "siglip_test.npy"),  sig_test)
print("SigLIP:", sig_train.shape, sig_test.shape)

del sig_model, sig_proc; gc.collect()
if DEVICE.type=="cuda":
    torch.cuda.empty_cache(); torch.cuda.synchronize()

# -------------------------
# EXTRACT IMAGES (DINO: try v3, then v2; else fallback)
# -------------------------
print("\n=== IMAGES (DINO) ===")
dino_loaded = False
dino_train = None; dino_test = None

for dino_path in DINO_MODEL_CANDIDATES:
    if not os.path.exists(dino_path): 
        continue
    try:
        print(f"Trying DINO at: {dino_path}")
        dino_model = AutoModel.from_pretrained(dino_path, torch_dtype=TORCH_DTYPE_HALF).to(DEVICE).eval()
        dino_proc  = AutoImageProcessor.from_pretrained(dino_path)
        dino_train = encode_images(dino_model, dino_proc, train_img_paths, BATCH_IMG, "DINO", device=DEVICE)
        dino_test  = encode_images(dino_model, dino_proc, test_img_paths,  BATCH_IMG, "DINO", device=DEVICE)
        np.save(os.path.join(OUT_DIR, "dino_train.npy"), dino_train)
        np.save(os.path.join(OUT_DIR, "dino_test.npy"),  dino_test)
        print("DINO:", dino_train.shape, dino_test.shape)
        dino_loaded = True
        del dino_model, dino_proc; gc.collect()
        if DEVICE.type=="cuda":
            torch.cuda.empty_cache(); torch.cuda.synchronize()
        break
    except Exception as e:
        print(f"  -> Failed to use {dino_path}: {e}")

if not dino_loaded:
    print("⚠️ No DINO model available → using SigLIP-only for image embeddings.")
    dino_train = sig_train.copy()
    dino_test  = sig_test.copy()

# -------------------------
# AVERAGE SigLIP + DINO (with count check)
# -------------------------
if dino_train.shape[0] != len(train_df) or dino_test.shape[0] != len(test_df):
    print("⚠️ DINO count mismatch → using SigLIP-only image embeddings.")
    dino_train = sig_train.copy()
    dino_test  = sig_test.copy()

img_train = l2_normalize((sig_train + dino_train) / 2.0)
img_test  = l2_normalize((sig_test  + dino_test)  / 2.0)

# Save averaged image embeddings + text
np.save(os.path.join(OUT_DIR, "image_train_avg.npy"), img_train)
np.save(os.path.join(OUT_DIR, "image_test_avg.npy"),  img_test)
np.save(os.path.join(OUT_DIR, "text_train.npy"), q_train)
np.save(os.path.join(OUT_DIR, "text_test.npy"),  q_test)

print("\nFinal embeddings:")
print("  Text :", q_train.shape, q_test.shape)
print("  Image:", img_train.shape, img_test.shape)

# -------------------------
# TRAIN/VAL SPLIT
# -------------------------
tr_idx, val_idx = train_test_split(
    np.arange(len(train_df)),
    test_size=0.2,
    random_state=42,
    stratify=y_cat
)

Xq_tr, Xi_tr = q_train[tr_idx], img_train[tr_idx]
Xq_val, Xi_val = q_train[val_idx], img_train[val_idx]
ypl_tr, ypl_val = y_price_log[tr_idx], y_price_log[val_idx]
ycl_tr, ycl_val = y_cat[tr_idx], y_cat[val_idx]
ybl_tr, ybl_val = y_brand[tr_idx], y_brand[val_idx]

# -------------------------
# DATA LOADERS
# -------------------------
train_ds = TensorDataset(
    torch.from_numpy(Xq_tr).float(),
    torch.from_numpy(Xi_tr).float(),
    torch.from_numpy(ypl_tr).float(),
    torch.from_numpy(ycl_tr).long(),
    torch.from_numpy(ybl_tr).long(),
)
val_ds = TensorDataset(
    torch.from_numpy(Xq_val).float(),
    torch.from_numpy(Xi_val).float(),
    torch.from_numpy(ypl_val).float(),
    torch.from_numpy(ycl_val).long(),
    torch.from_numpy(ybl_val).long(),
)
test_ds = TensorDataset(
    torch.from_numpy(q_test).float(),
    torch.from_numpy(img_test).float()
)

train_loader = DataLoader(train_ds, batch_size=BATCH_TRAIN, shuffle=True,  num_workers=2, pin_memory=True)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_TRAIN*2, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_TRAIN*2, shuffle=False, num_workers=2)

# -------------------------
# MODEL
# -------------------------
class Projection(nn.Module):
    def __init__(self, in_dim, out_dim, p=0.1):
        super().__init__()
        self.seq = nn.Sequential(
            nn.LayerNorm(in_dim),
            nn.Linear(in_dim, out_dim),
            nn.GELU(),
            nn.Dropout(p)
        )
    def forward(self, x): return self.seq(x)

class FusionNet(nn.Module):
    def __init__(self, text_dim, image_dim, fusion_dim, n_cat, n_brand, use_brand_head=True):
        super().__init__()
        self.text_proj  = Projection(text_dim, fusion_dim, DROPOUT)
        self.image_proj = Projection(image_dim, fusion_dim, DROPOUT)
        self.fusion = nn.Sequential(
            nn.LayerNorm(fusion_dim*2),
            nn.Linear(fusion_dim*2, fusion_dim),
            nn.GELU(),
            nn.Dropout(DROPOUT),
            nn.Linear(fusion_dim, fusion_dim//2),
            nn.GELU(),
            nn.Dropout(DROPOUT),
        )
        self.price_head = nn.Linear(fusion_dim//2, 1)
        self.cat_head   = nn.Linear(fusion_dim//2, n_cat)
        self.use_brand_head = use_brand_head
        if use_brand_head:
            self.brand_head = nn.Linear(fusion_dim//2, n_brand)

    def forward(self, t_inp, v_inp):
        t = self.text_proj(t_inp)
        v = self.image_proj(v_inp)
        h = self.fusion(torch.cat([t, v], dim=1))
        price_log = self.price_head(h).squeeze(-1)
        cat_logits = self.cat_head(h)
        if self.use_brand_head:
            brand_logits = self.brand_head(h)
        else:
            brand_logits = None
        return price_log, cat_logits, brand_logits, t, v

use_brand = (num_brands >= 2)
model = FusionNet(
    text_dim=Xq_tr.shape[1],
    image_dim=Xi_tr.shape[1],
    fusion_dim=FUSION_DIM,
    n_cat=num_categories,
    n_brand=num_brands,
    use_brand_head=use_brand
).to(DEVICE)

opt   = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=3, verbose=True)
mse, ce = nn.MSELoss(), nn.CrossEntropyLoss()

print(f"Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

# -------------------------
# TRAINING
# -------------------------
best_smape = float("inf")
pat = 0
BEST_PATH = os.path.join(OUT_DIR, "fusion_best.pth")

for epoch in range(1, EPOCHS+1):
    model.train()
    train_loss = 0.0

    for tq, ti, ypl, ycl, ybl in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}"):
        tq, ti = tq.to(DEVICE), ti.to(DEVICE)
        ypl, ycl, ybl = ypl.to(DEVICE), ycl.to(DEVICE), ybl.to(DEVICE)

        opt.zero_grad()
        price_log, cat_logits, brand_logits, tp, vp = model(tq, ti)

        loss = (
            PRICE_SMAPE_W * smape_torch(torch.expm1(price_log), torch.expm1(ypl))
            + PRICE_MSE_W * mse(price_log, ypl)
            + CAT_CE_W * ce(cat_logits, ycl)
            + INFO_NCE_W * info_nce(tp, vp)
        )
        if use_brand and brand_logits is not None:
            loss = loss + BRAND_CE_W * ce(brand_logits, ybl)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        train_loss += loss.item()

    train_loss /= max(len(train_loader), 1)

    # validation
    model.eval()
    val_logits = []
    with torch.no_grad():
        for tq, ti, _, _, _ in val_loader:
            price_log, _, _, _, _ = model(tq.to(DEVICE), ti.to(DEVICE))
            val_logits.append(price_log.detach().cpu().numpy())
    val_logits = np.concatenate(val_logits)
    val_smape = smape_np(np.expm1(ypl_val), np.expm1(val_logits))
    sched.step(val_smape)

    print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_SMAPE={val_smape:.4f}")
    if val_smape < best_smape:
        best_smape, pat = val_smape, 0
        torch.save(model.state_dict(), BEST_PATH)
        print(f"  ✓ New best saved: {BEST_PATH}")
    else:
        pat += 1
        print(f"  No improvement (patience {pat}/{PATIENCE})")
        if pat >= PATIENCE:
            print("  → Early stopping.")
            break

print(f"\nBest Validation SMAPE: {best_smape:.4f}")

# -------------------------
# TEST INFERENCE + SAVE
# -------------------------
model.load_state_dict(torch.load(BEST_PATH, map_location=DEVICE))
model.eval()

test_logits = []
with torch.no_grad():
    for tq, ti in tqdm(test_loader, desc="Predicting"):
        price_log, _, _, _, _ = model(tq.to(DEVICE), ti.to(DEVICE))
        test_logits.append(price_log.detach().cpu().numpy())
test_logits = np.concatenate(test_logits)
test_price  = np.expm1(test_logits)

submission = pd.DataFrame({
    "sample_id": test_df["sample_id"],
    "price": test_price
})
sub_path = os.path.join(OUT_DIR, "submission.csv")
submission.to_csv(sub_path, index=False)

# Save a compact run manifest
manifest = {
    "text_model": QWEN3_MODEL_PATH,
    "siglip_model": SIGLIP_MODEL_PATH,
    "dino_candidates": DINO_MODEL_CANDIDATES,
    "used_brand_head": use_brand,
    "best_val_smape": float(best_smape),
    "embedding_shapes": {
        "qwen3_train": list(q_train.shape),
        "siglip_train": list(sig_train.shape),
        "dino_train": list(dino_train.shape),
        "image_avg_train": list(img_train.shape)
    }
}
with open(os.path.join(OUT_DIR, "run_manifest.json"), "w") as f:
    json.dump(manifest, f, indent=2)

print("="*80)
print(f"✓ All done. Submission: {sub_path}")
print("✓ Embeddings and checkpoints saved in:", OUT_DIR)
print("="*80)


Device: cuda
Train: 75000, Test: 75000 | Cats: 5, Brands: 1

=== TEXT (Qwen3) ===


2025-10-21 08:45:12.328529: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761036312.728873      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761036312.831431      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Qwen3 Text:   0%|          | 0/3125 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=(DEVICE.type=="cuda")):


Qwen3 Text:   0%|          | 0/3125 [00:00<?, ?it/s]

Qwen3: (75000, 1024) (75000, 1024)

=== IMAGES (SigLIP) ===


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


SigLIP Images:   0%|          | 0/3125 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=(device.type=="cuda")):


ValueError: You have to specify input_ids

## rest

In [7]:
# ======================================================================================
# SigLIP IMAGE EMBEDDING EXTRACTION (corrected version)
# Uses only the vision encoder (no input_ids error)
# Outputs: siglip_train.npy, siglip_test.npy in /kaggle/working/
# ======================================================================================

import os, gc, threading
from queue import Queue
import numpy as np
from tqdm.auto import tqdm
from PIL import Image
import torch
from transformers import AutoImageProcessor, SiglipVisionModel

# -------------------------
# PATHS
# -------------------------
IMAGES_TRAIN = "/kaggle/input/aml-train/AMAZON_ML_TRAIN"
IMAGES_TEST  = "/kaggle/input/amazon-ml-test/AMAZON_ML_TEST"
SIGLIP_MODEL_PATH = "/kaggle/input/google-siglip-so400m-patch14-384/transformers/default/1"
OUT_DIR = "/kaggle/working/"
os.makedirs(OUT_DIR, exist_ok=True)

# -------------------------
# CONFIG
# -------------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DTYPE  = torch.float16 if torch.cuda.is_available() else torch.float32
BATCH_SIZE = 24
NUM_THREADS = 4
print(f"Device: {DEVICE}")

# -------------------------
# HELPERS
# -------------------------
def parallel_load_images(paths, num_threads=4):
    """Load images concurrently using threads for speed."""
    images = [None]*len(paths)
    q = Queue()
    for i, p in enumerate(paths):
        q.put((i, p))

    def worker():
        while not q.empty():
            try:
                i, p = q.get_nowait()
                try:
                    im = Image.open(p).convert("RGB")
                except:
                    im = Image.new("RGB", (384, 384))
                images[i] = im
                q.task_done()
            except:
                break

    threads = [threading.Thread(target=worker) for _ in range(num_threads)]
    [t.start() for t in threads]
    [t.join() for t in threads]
    return images

def l2_normalize(x, axis=1, eps=1e-9):
    return x / (np.sqrt((x * x).sum(axis=axis, keepdims=True)) + eps)

def encode_images(model, processor, image_paths, batch_size=24, device=DEVICE, tag="SigLIP"):
    """Encode all images into embeddings using the vision-only SigLIP encoder."""
    all_embeddings = []
    for i in tqdm(range(0, len(image_paths), batch_size), desc=f"{tag} Images"):
        batch_paths = image_paths[i:i+batch_size]
        imgs = parallel_load_images(batch_paths, num_threads=NUM_THREADS)
        inputs = processor(imgs, return_tensors="pt").to(device)
        with torch.no_grad(), torch.amp.autocast("cuda", enabled=(device.type=="cuda")):
            out = model(**inputs)
            emb = out.pooler_output  # (batch, hidden_size)
            emb = torch.nn.functional.normalize(emb.float(), p=2, dim=1)
        all_embeddings.append(emb.cpu().numpy())
        del imgs, inputs, out, emb
        if device.type == "cuda" and i % 200 == 0:
            torch.cuda.empty_cache()
    return np.vstack(all_embeddings)

# -------------------------
# LOAD MODEL
# -------------------------
print("\n=== Loading SigLIP Vision Encoder ===")
model = SiglipVisionModel.from_pretrained(SIGLIP_MODEL_PATH, torch_dtype=DTYPE).to(DEVICE).eval()
processor = AutoImageProcessor.from_pretrained(SIGLIP_MODEL_PATH)

# -------------------------
# GET IMAGE PATHS
# -------------------------
train_paths = sorted([os.path.join(IMAGES_TRAIN, x) for x in os.listdir(IMAGES_TRAIN) if x.lower().endswith(".jpg")])
test_paths  = sorted([os.path.join(IMAGES_TEST,  x) for x in os.listdir(IMAGES_TEST)  if x.lower().endswith(".jpg")])

print(f"Found {len(train_paths)} training and {len(test_paths)} testing images.")

# -------------------------
# EXTRACT EMBEDDINGS
# -------------------------
train_embs = encode_images(model, processor, train_paths, batch_size=BATCH_SIZE, tag="SigLIP-Train")
test_embs  = encode_images(model, processor, test_paths,  batch_size=BATCH_SIZE, tag="SigLIP-Test")

train_embs = l2_normalize(train_embs)
test_embs  = l2_normalize(test_embs)

# -------------------------
# SAVE
# -------------------------
np.save(os.path.join(OUT_DIR, "siglip_train.npy"), train_embs)
np.save(os.path.join(OUT_DIR, "siglip_test.npy"),  test_embs)

print("\n✅ SigLIP embeddings extracted successfully.")
print(f"Train embeddings: {train_embs.shape} | Test embeddings: {test_embs.shape}")
print(f"Saved to: {OUT_DIR}")


Device: cuda

=== Loading SigLIP Vision Encoder ===
Found 74999 training and 74999 testing images.


SigLIP-Train Images:   0%|          | 0/3125 [00:00<?, ?it/s]

SigLIP-Test Images:   0%|          | 0/3125 [00:00<?, ?it/s]


✅ SigLIP embeddings extracted successfully.
Train embeddings: (74999, 1152) | Test embeddings: (74999, 1152)
Saved to: /kaggle/working/


In [9]:
import kagglehub

kagglehub.login()

# Replace with path to directory containing model files.
LOCAL_MODEL_DIR = '/kaggle/working/'

MODEL_SLUG = 'aml_last_shit' # Replace with model slug.

# Learn more about naming model variations at
# https://www.kaggle.com/docs/models#name-model.
VARIATION_SLUG = 'default' # Replace with variation slug.

kagglehub.model_upload(
  handle = f"kartikgarg74/{MODEL_SLUG}/keras/{VARIATION_SLUG}",
  local_model_dir = LOCAL_MODEL_DIR,
  version_notes = 'Update 2025-10-21')

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Uploading Model https://www.kaggle.com/models/kartikgarg74/aml_last_shit/keras/default ...
Model 'aml_last_shit' does not exist or access is forbidden for user 'kartikgarg74'. Creating or handling Model...
Model 'aml_last_shit' Created.
Starting upload for file /kaggle/working/siglip_train.npy


Uploading: 100%|██████████| 346M/346M [00:03<00:00, 103MB/s] 

Upload successful: /kaggle/working/siglip_train.npy (330MB)
Starting upload for file /kaggle/working/qwen3_train.npy



Uploading: 100%|██████████| 307M/307M [00:02<00:00, 105MB/s]  

Upload successful: /kaggle/working/qwen3_train.npy (293MB)
Starting upload for file /kaggle/working/siglip_test.npy



Uploading: 100%|██████████| 346M/346M [00:03<00:00, 109MB/s] 

Upload successful: /kaggle/working/siglip_test.npy (330MB)
Starting upload for file /kaggle/working/qwen3_test.npy



Uploading: 100%|██████████| 307M/307M [00:02<00:00, 107MB/s]  

Upload successful: /kaggle/working/qwen3_test.npy (293MB)
Starting upload for file /kaggle/working/output_fusion_qwen_siglip_dinov2/submission.csv



Uploading: 100%|██████████| 1.22M/1.22M [00:00<00:00, 2.40MB/s]

Upload successful: /kaggle/working/output_fusion_qwen_siglip_dinov2/submission.csv (1MB)
Starting upload for file /kaggle/working/output_fusion_qwen_siglip_dinov2/best_model.pth



Uploading: 100%|██████████| 7.13M/7.13M [00:00<00:00, 15.2MB/s]

Upload successful: /kaggle/working/output_fusion_qwen_siglip_dinov2/best_model.pth (7MB)





Your model instance has been created.
Files are being processed...
See at: https://www.kaggle.com/models/kartikgarg74/aml_last_shit/keras/default


In [8]:
# ======================================================================================
# DINOv3 IMAGE EMBEDDING EXTRACTION (robust, same error handling as SigLIP)
# Outputs: dinov3_train.npy, dinov3_test.npy in /kaggle/working/
# ======================================================================================

import os, gc, threading
from queue import Queue
import numpy as np
from tqdm.auto import tqdm
from PIL import Image
import torch
from transformers import AutoImageProcessor, Dinov3Model

# -------------------------
# PATHS
# -------------------------
IMAGES_TRAIN = "/kaggle/input/aml-train/AMAZON_ML_TRAIN"
IMAGES_TEST  = "/kaggle/input/amazon-ml-test/AMAZON_ML_TEST"
DINO_MODEL_PATH = "/kaggle/input/facebook-dinov3-large-224/transformers/default/1"  # <-- change if needed
OUT_DIR = "/kaggle/working/"
SIGLIP_TRAIN = os.path.join(OUT_DIR, "siglip_train.npy")
SIGLIP_TEST  = os.path.join(OUT_DIR, "siglip_test.npy")

os.makedirs(OUT_DIR, exist_ok=True)

# -------------------------
# CONFIG
# -------------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DTYPE  = torch.float16 if torch.cuda.is_available() else torch.float32
BATCH_SIZE = 24
NUM_THREADS = 4
print(f"Device: {DEVICE}")

# -------------------------
# HELPERS
# -------------------------
def parallel_load_images(paths, num_threads=4):
    """Load images concurrently using threads for speed."""
    images = [None]*len(paths)
    q = Queue()
    for i, p in enumerate(paths):
        q.put((i, p))

    def worker():
        while not q.empty():
            try:
                i, p = q.get_nowait()
                try:
                    im = Image.open(p).convert("RGB")
                except:
                    im = Image.new("RGB", (224, 224))
                images[i] = im
                q.task_done()
            except:
                break

    threads = [threading.Thread(target=worker) for _ in range(num_threads)]
    [t.start() for t in threads]
    [t.join() for t in threads]
    return images

def l2_normalize(x, axis=1, eps=1e-9):
    return x / (np.sqrt((x * x).sum(axis=axis, keepdims=True)) + eps)

def encode_images(model, processor, image_paths, batch_size=24, device=DEVICE, tag="DINOv3"):
    """Encode all images into embeddings using the DINOv3 vision encoder."""
    all_embeddings = []
    for i in tqdm(range(0, len(image_paths), batch_size), desc=f"{tag} Images"):
        batch_paths = image_paths[i:i+batch_size]
        imgs = parallel_load_images(batch_paths, num_threads=NUM_THREADS)
        inputs = processor(imgs, return_tensors="pt").to(device)
        with torch.no_grad(), torch.amp.autocast("cuda", enabled=(device.type=="cuda")):
            out = model(**inputs)
            if hasattr(out, "pooler_output"):
                emb = out.pooler_output
            elif hasattr(out, "last_hidden_state"):
                emb = out.last_hidden_state.mean(dim=1)
            else:
                emb = list(out.values())[0]
            emb = torch.nn.functional.normalize(emb.float(), p=2, dim=1)
        all_embeddings.append(emb.cpu().numpy())
        del imgs, inputs, out, emb
        if device.type == "cuda" and i % 200 == 0:
            torch.cuda.empty_cache()
    return np.vstack(all_embeddings)

# -------------------------
# LOAD MODEL
# -------------------------
print("\n=== Loading DINOv3 Vision Encoder ===")
try:
    model = Dinov3Model.from_pretrained(DINO_MODEL_PATH, torch_dtype=DTYPE).to(DEVICE).eval()
    processor = AutoImageProcessor.from_pretrained(DINO_MODEL_PATH)
    dino_available = True
except Exception as e:
    print(f"⚠️ Failed to load DINOv3 model: {e}")
    dino_available = False

# -------------------------
# GET IMAGE PATHS
# -------------------------
train_paths = sorted([os.path.join(IMAGES_TRAIN, x) for x in os.listdir(IMAGES_TRAIN) if x.lower().endswith(".jpg")])
test_paths  = sorted([os.path.join(IMAGES_TEST,  x) for x in os.listdir(IMAGES_TEST)  if x.lower().endswith(".jpg")])

print(f"Found {len(train_paths)} training and {len(test_paths)} testing images.")

# -------------------------
# EXTRACT EMBEDDINGS
# -------------------------
if dino_available:
    try:
        train_embs = encode_images(model, processor, train_paths, batch_size=BATCH_SIZE, tag="DINOv3-Train")
        test_embs  = encode_images(model, processor, test_paths,  batch_size=BATCH_SIZE, tag="DINOv3-Test")

        train_embs = l2_normalize(train_embs)
        test_embs  = l2_normalize(test_embs)

        np.save(os.path.join(OUT_DIR, "dinov3_train.npy"), train_embs)
        np.save(os.path.join(OUT_DIR, "dinov3_test.npy"),  test_embs)

        print("\n✅ DINOv3 embeddings extracted successfully.")
        print(f"Train embeddings: {train_embs.shape} | Test embeddings: {test_embs.shape}")
        print(f"Saved to: {OUT_DIR}")

    except Exception as e:
        print(f"⚠️ Error during DINOv3 embedding extraction: {e}")
        dino_available = False

# -------------------------
# FALLBACK (if DINOv3 failed)
# -------------------------
if not dino_available:
    print("\n⚠️ Using SigLIP embeddings as fallback for DINOv3.")
    if os.path.exists(SIGLIP_TRAIN) and os.path.exists(SIGLIP_TEST):
        train_embs = np.load(SIGLIP_TRAIN)
        test_embs  = np.load(SIGLIP_TEST)
        np.save(os.path.join(OUT_DIR, "dinov3_train.npy"), train_embs)
        np.save(os.path.join(OUT_DIR, "dinov3_test.npy"),  test_embs)
        print(f"✓ Fallback used: copied SigLIP embeddings as dinov3_train/test.npy → {train_embs.shape}")
    else:
        raise FileNotFoundError("❌ No SigLIP fallback found! Please run SigLIP extraction first.")

print("\nAll done.")


ImportError: cannot import name 'Dinov3Model' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)

In [None]:
# ======================================================================================
# FINAL MULTIMODAL TRAINING: Qwen3 + SigLIP + DINOv3
# Based on RudrakshSJoshi/amlc-multimodal-mlp
# ======================================================================================

import os, gc
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

# -------------------------
# CONFIG
# -------------------------
DATA_DIR = "/kaggle/input/aml-csv"
OUT_DIR = "/kaggle/working/"
os.makedirs(OUT_DIR, exist_ok=True)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

EPOCHS = 30
BATCH_SIZE = 128
LR = 2e-4
PATIENCE = 5
print(f"Device: {DEVICE}")

# -------------------------
# LOAD DATA
# -------------------------
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

y_train = np.log1p(train_df["price"].values).astype(np.float32)

# -------------------------
# LOAD EMBEDDINGS
# -------------------------
print("\n=== Loading precomputed embeddings ===")
q_train = np.load("/kaggle/working/qwen3_train.npy")
q_test  = np.load("/kaggle/working/qwen3_test.npy")

sig_train = np.load("/kaggle/working/siglip_train.npy")
sig_test  = np.load("/kaggle/working/siglip_test.npy")

dino_train = np.load("/kaggle/working/dinov3_train.npy")
dino_test  = np.load("/kaggle/working/dinov3_test.npy")

# Ensure consistent dimensions
if dino_train.shape[0] != q_train.shape[0]:
    print("⚠️ DINOv3 train count mismatch → using SigLIP only")
    dino_train = sig_train.copy()
if dino_test.shape[0] != q_test.shape[0]:
    print("⚠️ DINOv3 test count mismatch → using SigLIP only")
    dino_test = sig_test.copy()

# Average image embeddings (SigLIP + DINOv3)
img_train = (sig_train + dino_train) / 2.0
img_test  = (sig_test  + dino_test)  / 2.0

# Normalize embeddings
def l2_normalize(x, axis=1, eps=1e-9):
    return x / (np.sqrt((x * x).sum(axis=axis, keepdims=True)) + eps)

q_train, q_test = l2_normalize(q_train), l2_normalize(q_test)
img_train, img_test = l2_normalize(img_train), l2_normalize(img_test)

# Fuse text + image embeddings
train_fused = np.concatenate([q_train, img_train], axis=1)
test_fused  = np.concatenate([q_test,  img_test], axis=1)
print(f"Train fused: {train_fused.shape} | Test fused: {test_fused.shape}")

# -------------------------
# SPLIT DATA
# -------------------------
X_tr, X_val, y_tr, y_val = train_test_split(train_fused, y_train, test_size=0.1, random_state=42)
print(f"Train/Val Split: {X_tr.shape}, {X_val.shape}")

train_ds = TensorDataset(torch.from_numpy(X_tr).float(), torch.from_numpy(y_tr).float())
val_ds   = TensorDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).float())
test_ds  = TensorDataset(torch.from_numpy(test_fused).float())

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE*2, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE*2, shuffle=False, num_workers=2)

# -------------------------
# MODEL
# -------------------------
class MultimodalMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.LayerNorm(input_dim),
            nn.Linear(input_dim, 1024),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 512),
            nn.GELU(),
            nn.Dropout(0.15),
            nn.Linear(512, 256),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(256, 1),
        )
    def forward(self, x):
        return self.net(x).squeeze(-1)

def smape_torch(y_pred, y_true, eps=1e-9):
    num = torch.abs(y_pred - y_true)
    den = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0
    return torch.mean(num / (den + eps))

def smape_np(y_true, y_pred, eps=1e-9):
    num = np.abs(y_pred - y_true)
    den = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(num / (den + eps))

# -------------------------
# TRAINING
# -------------------------
model = MultimodalMLP(train_fused.shape[1]).to(DEVICE)
opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-2)
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2, verbose=True)

best_smape = float("inf")
pat = 0
best_path = os.path.join(OUT_DIR, "best_multimodal_model.pth")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0
    for xb, yb in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}"):
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        opt.zero_grad()
        pred = model(xb)
        loss = 0.8 * smape_torch(torch.expm1(pred), torch.expm1(yb)) + 0.2 * F.mse_loss(pred, yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        total_loss += loss.item()
    avg_train = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_preds = []
    with torch.no_grad():
        for xb, _ in val_loader:
            xb = xb.to(DEVICE)
            pred = model(xb)
            val_preds.append(pred.cpu().numpy())
    val_preds = np.concatenate(val_preds)
    val_smape = smape_np(np.expm1(y_val), np.expm1(val_preds))
    sched.step(val_smape)

    print(f"Epoch {epoch:02d} | train_loss={avg_train:.4f} | val_SMAPE={val_smape:.4f}")

    if val_smape < best_smape:
        best_smape, pat = val_smape, 0
        torch.save(model.state_dict(), best_path)
        print(f"  ✓ New best saved ({best_path})")
    else:
        pat += 1
        print(f"  No improvement ({pat}/{PATIENCE})")
        if pat >= PATIENCE:
            print("  → Early stopping.")
            break

print(f"\nBest Validation SMAPE: {best_smape:.4f}")

# -------------------------
# INFERENCE
# -------------------------
model.load_state_dict(torch.load(best_path, map_location=DEVICE))
model.eval()

test_preds = []
with torch.no_grad():
    for (xb,) in tqdm(test_loader, desc="Predicting"):
        xb = xb.to(DEVICE)
        pred = model(xb)
        test_preds.append(pred.cpu().numpy())
test_preds = np.concatenate(test_preds)
test_prices = np.expm1(test_preds)

# -------------------------
# SAVE SUBMISSION
# -------------------------
submission = pd.DataFrame({
    "sample_id": test_df["sample_id"] if "sample_id" in test_df else np.arange(len(test_df)),
    "price": test_prices
})
sub_path = os.path.join(OUT_DIR, "submission_final.csv")
submission.to_csv(sub_path, index=False)
print(f"\n✅ Final submission saved to: {sub_path}")
