In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon-ml-2025/train.csv


In [2]:
!pip install transformers peft torch torchvision tqdm pillow

Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import requests
from io import BytesIO
from transformers import CLIPModel, CLIPProcessor
from peft import LoraConfig, get_peft_model
from tqdm import tqdm
import warnings
import re
warnings.filterwarnings('ignore')

class ProductDataset(Dataset):
    """Dataset that turns one catalogue row into CLIP-ready tensors.

    For each row the text is built with ``augment_text(row)`` and the image is
    downloaded from the row's ``image_link`` column. Both are passed through
    ``processor`` and returned as un-batched tensors.

    Args:
        df: DataFrame with at least ``catalog_content`` (read by
            ``augment_text``) and ``image_link`` columns.
        processor: Callable invoked as ``processor(text=..., images=...,
            return_tensors="pt", padding="max_length", truncation=True,
            max_length=77)`` whose result exposes ``input_ids``,
            ``attention_mask`` and ``pixel_values``.
        augment: Stored on the instance for callers to inspect.
            NOTE(review): nothing in this class reads it; text is always
            produced by ``augment_text``.
    """

    def __init__(self, df, processor, augment=True):
        self.df = df
        self.processor = processor
        self.augment = augment

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    @staticmethod
    def _load_image(url):
        """Download ``url`` as an RGB image, or return a blank 224x224 image on any failure."""
        try:
            response = requests.get(url, timeout=10)
            # Treat HTTP error pages as failures instead of handing them to PIL.
            response.raise_for_status()
            # Decode from the fully read (and content-decoded) body rather than
            # the raw socket stream, which may still be transfer-compressed.
            return Image.open(BytesIO(response.content)).convert("RGB")
        except Exception:
            # Best-effort by design: a missing or broken image must not abort
            # training. ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
            return Image.new("RGB", (224, 224))

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``pixel_values`` for row ``idx``."""
        row = self.df.iloc[idx]
        text = augment_text(row)
        image = self._load_image(row["image_link"])

        # Process a single sample; fixed-length padding keeps every item the
        # same size so the samples can be stacked into a batch later.
        inputs = self.processor(
            text=text,  # single string, not a list
            images=image,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=77  # CLIP's maximum sequence length
        )
        # Drop the leading batch dimension added by return_tensors="pt".
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'pixel_values': inputs['pixel_values'].squeeze(0)
        }

# Lookup table from lower-case unit spellings to the canonical unit token that
# parse_value_unit returns (and augment_text appends to the text).
# Key order is significant for any consumer that scans the table in insertion
# order: more specific spellings ("fl oz") must come before shorter ones they
# contain ("oz").
# NOTE(review): "g" precedes "kg" and is a substring of it, so a lookup that
# tests plain substring containment in this order will resolve "kg" to "g".
UNIT_MAP = {
    "fl oz": "fl_oz", "ounce": "oz", "oz": "oz",
    "lb": "lb", "pound": "lb", "g": "g", "kg": "kg",
    "count": "count", "pack": "count", "ct": "count"
}

def clean_text(s):
    """Strip markup from a catalogue string and normalise its whitespace.

    Non-string input (e.g. ``None`` or a float NaN) yields ``""``. Otherwise
    anything shaped like ``<...>`` is replaced by a space, the two ampersand
    entities ``&#38;`` and ``&amp;`` become ``&``, and every run of whitespace
    is collapsed to one space with the ends trimmed.
    """
    if not isinstance(s, str):
        return ""

    without_tags = re.sub(r"<[^>]+>", " ", s)
    # Same replacement order as always: numeric entity first, then the named one.
    for entity in ("&#38;", "&amp;"):
        without_tags = without_tags.replace(entity, "&")

    collapsed = re.sub(r"\s+", " ", without_tags)
    return collapsed.strip()

def parse_value_unit(text):
    """Extract the ``Value: <number>`` and ``Unit: <name>`` fields from ``text``.

    Args:
        text: Catalogue text (already cleaned) that may contain the two
            fields, matched case-insensitively.

    Returns:
        ``(value, unit)`` where ``value`` is a float or ``None`` and ``unit``
        is a string or ``None``. The unit is lower-cased and, when it matches
        an entry of ``UNIT_MAP``, replaced by that entry's canonical token.
        A unit that captures only whitespace is reported as ``None``.
    """
    value, unit = None, None

    value_match = re.search(r"Value\s*:\s*([0-9]+(?:\.[0-9]+)?)", text, re.I)
    if value_match:
        value = float(value_match.group(1))

    unit_match = re.search(r"Unit\s*:\s*([A-Za-z0-9 _\./]+)", text, re.I)
    if unit_match:
        # An all-whitespace capture strips to "", which is not a usable unit.
        unit = unit_match.group(1).strip().lower() or None

    if unit:
        # Prefer a whole-word hit (optionally plural). Trying longer spellings
        # first means "fl oz" beats "oz" and, crucially, "kg" is no longer
        # swallowed by the single-letter key "g".
        spellings = sorted(UNIT_MAP, key=len, reverse=True)
        whole_word = re.search(
            r"\b(" + "|".join(re.escape(k) for k in spellings) + r")s?\b",
            unit,
        )
        if whole_word:
            unit = UNIT_MAP[whole_word.group(1)]
        else:
            # Legacy fallback: first key (in table order) contained anywhere
            # in the unit, so spellings such as "grams" still map to "g".
            # NOTE(review): this can still mis-map units that merely contain
            # a key (e.g. "mg" -> "g"); extend UNIT_MAP to cover them.
            for k, v in UNIT_MAP.items():
                if k in unit:
                    unit = v
                    break
    return value, unit

def augment_text(row):
    """Build the text fed to CLIP for one catalogue row.

    The row's ``catalog_content`` is cleaned with ``clean_text``; the value and
    unit found by ``parse_value_unit`` are then appended as
    `` <|VALUE|> <value>`` and `` <|UNIT|> <unit>`` (each only when not None).

    NOTE(review): the tags are appended at the very end of the text, while the
    processor calls in this file truncate to 77 tokens, so on long descriptions
    the tags may be cut off before reaching the model.
    """
    cleaned = clean_text(row["catalog_content"])
    value, unit = parse_value_unit(cleaned)

    pieces = [cleaned]
    if value is not None:
        pieces.append(f" <|VALUE|> {value}")
    if unit is not None:
        pieces.append(f" <|UNIT|> {unit}")
    return "".join(pieces)

def load_and_preprocess_data(test_mode=False, test_samples=100,
                             csv_path='/kaggle/input/amazon-ml-2025/train.csv'):
    """Load the training CSV and add an ``augmented_text`` column.

    Args:
        test_mode: When true, keep only the first ``test_samples`` rows.
        test_samples: Number of leading rows to keep in test mode.
        csv_path: Location of the training CSV. Defaults to the Kaggle input
            path this notebook was written against.

    Returns:
        DataFrame with every original column plus ``augmented_text``, computed
        row by row with ``augment_text``.
    """
    df_train = pd.read_csv(csv_path)

    # For testing, use only the first N samples.
    if test_mode:
        # .copy() detaches the subset from the full frame, so the column
        # assignment below writes to an independent DataFrame rather than to a
        # slice (the chained-assignment hazard pandas warns about).
        df_train = df_train.head(test_samples).copy()
        print(f"üß™ Test mode: Using only {len(df_train)} samples")

    df_train['augmented_text'] = df_train.apply(augment_text, axis=1)
    return df_train


def setup_clip_model():
    """Load pretrained CLIP and leave only its projection layers and logit scale trainable.

    The checkpoint ``openai/clip-vit-base-patch32`` is loaded and moved to CUDA
    when available, otherwise CPU. Every parameter whose name contains neither
    ``projection`` nor ``logit_scale`` is frozen, and a summary of trainable
    versus total parameter counts is printed.

    LoRA is currently switched off while the pipeline is being tested. The
    configuration that was in use, kept here for when it is re-enabled:

        LoraConfig(r=16, lora_alpha=32,
                   target_modules=["q_proj", "v_proj", "k_proj", "out_proj"],
                   lora_dropout=0.05, bias="none",
                   task_type="FEATURE_EXTRACTION")
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

    Returns:
        ``(model, processor, device)`` where ``device`` is ``"cuda"`` or ``"cpu"``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_name = "openai/clip-vit-base-patch32"
    model = CLIPModel.from_pretrained(model_name).to(device)
    processor = CLIPProcessor.from_pretrained(model_name)

    # Freeze everything except parameters named like a projection or the logit scale.
    keep_trainable = ("projection", "logit_scale")
    for param_name, param in model.named_parameters():
        if not any(marker in param_name for marker in keep_trainable):
            param.requires_grad = False

    # Tally both counts in a single pass over the parameters.
    trainable_params = 0
    total_params = 0
    for param in model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            trainable_params += size
    print(f"‚úÖ Trainable parameters: {trainable_params:,} || Total: {total_params:,} || Trainable%: {100 * trainable_params / total_params:.4f}")

    return model, processor, device

def custom_collate_fn(batch, max_length=77):
    """Stack per-sample dicts into one batch dict, forcing a fixed text length.

    Args:
        batch: Non-empty list of dicts that share the keys of ``batch[0]``.
            ``input_ids`` and ``attention_mask`` are expected to be 1-D
            tensors; every other value must already be stackable as-is.
        max_length: Length every ``input_ids`` / ``attention_mask`` tensor is
            padded or truncated to. Defaults to 77, the length used by the
            processor calls in this file.

    Returns:
        Dict with the same keys, each holding the stacked tensor for the batch.

    NOTE(review): both text tensors are padded with 0. For ``attention_mask``
    that is the usual "ignore" value; for ``input_ids`` confirm that 0 is an
    acceptable padding id for the tokenizer in use.
    """
    text_keys = ('input_ids', 'attention_mask')

    result = {}
    for key in batch[0].keys():
        if key in text_keys:
            result[key] = torch.stack(
                [_fit_to_length(item[key], max_length) for item in batch]
            )
        else:
            # pixel_values and any other tensors are stacked unchanged.
            result[key] = torch.stack([item[key] for item in batch])

    return result


def _fit_to_length(tensor, length):
    """Right-pad with zeros, or truncate, a 1-D tensor to exactly ``length`` elements."""
    current = len(tensor)
    if current < length:
        # new_zeros keeps the padding on the same dtype and device as the data.
        return torch.cat([tensor, tensor.new_zeros(length - current)])
    return tensor[:length]

def fine_tune_clip(df_train, model, processor, device, epochs=2):
    """Fine-tune ``model`` on image/text pairs with a symmetric contrastive loss.

    Rows of ``df_train`` are served through ``ProductDataset`` in shuffled
    batches of 4. For each batch the normalised image and text embeddings are
    compared, scaled by ``model.logit_scale.exp()``, and the loss is the mean
    of the image-to-text and text-to-image cross-entropies against the
    diagonal. A batch that raises ``RuntimeError`` or ``TypeError`` is reported
    and skipped. After the last epoch the model's state dict is written to
    ``clip_lora_finetuned.pth``.

    Args:
        df_train: DataFrame consumed by ``ProductDataset``.
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``pixel_values``; must expose ``logit_scale``.
        processor: Passed through to ``ProductDataset``.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over the data.
    """
    # The custom collate function enforces a common text length per batch.
    train_dl = DataLoader(
        ProductDataset(df_train, processor),
        batch_size=4,
        shuffle=True,
        num_workers=0,
        collate_fn=custom_collate_fn,
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    for epoch in range(epochs):
        model.train()
        total_loss, batch_count = 0, 0

        for batch in tqdm(train_dl, desc=f"Epoch {epoch+1}"):
            try:
                # Pass only the three inputs the model needs, by keyword.
                outs = model(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                    pixel_values=batch['pixel_values'].to(device),
                    return_dict=True,
                )

                img_emb = F.normalize(outs.image_embeds, dim=-1)
                txt_emb = F.normalize(outs.text_embeds, dim=-1)

                # The temperature is read from the model itself, not from the output.
                logits = img_emb @ txt_emb.t() * model.logit_scale.exp()

                # Sample i's image matches sample i's text: targets are the diagonal.
                targets = torch.arange(len(logits), device=device)
                image_to_text = F.cross_entropy(logits, targets)
                text_to_image = F.cross_entropy(logits.t(), targets)
                loss = (image_to_text + text_to_image) / 2

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                batch_count += 1
            except (RuntimeError, TypeError) as e:
                print(f"‚ö†Ô∏è Skipping batch due to error: {e}")
                continue

        # max(..., 1) avoids dividing by zero when every batch was skipped.
        avg_loss = total_loss / max(batch_count, 1)
        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f}")

    torch.save(model.state_dict(), "clip_lora_finetuned.pth")
    print("‚úÖ LoRA fine-tuning complete")

def generate_embeddings(df, model, processor, device, prefix="train"):
    """Compute text and image embeddings for every row and save them to disk.

    Each row is processed on its own: the text comes from ``augment_text`` and
    the image is downloaded from ``image_link`` (a blank 224x224 image is used
    when the download or decoding fails). The stacked arrays are written to
    ``{prefix}_txt_emb.npy`` and ``{prefix}_img_emb.npy``.

    Args:
        df: DataFrame with the columns ``augment_text`` reads plus ``image_link``.
        model: Model whose output exposes ``text_embeds`` and ``image_embeds``.
        processor: Processor producing ``input_ids``, ``attention_mask`` and
            ``pixel_values``.
        device: Device the inputs are moved to.
        prefix: File-name prefix for the saved arrays.

    Returns:
        ``(txt_embs, img_embs)`` -- text embeddings FIRST, image embeddings
        second -- each a NumPy array with one row per input row.
    """
    model.eval()
    txt_embs, img_embs = [], []
    with torch.no_grad():
        for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Generating {prefix} embeddings"):
            text = augment_text(row)
            image = _fetch_image_or_blank(row["image_link"])

            # Same fixed-length padding and truncation as used for training.
            inputs = processor(
                text=[text],
                images=image,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=77
            )

            # Explicit keyword arguments, as in the training loop.
            outs = model(
                input_ids=inputs['input_ids'].to(device),
                attention_mask=inputs['attention_mask'].to(device),
                pixel_values=inputs['pixel_values'].to(device),
                return_dict=True
            )

            txt_embs.append(outs.text_embeds.cpu().numpy())
            img_embs.append(outs.image_embeds.cpu().numpy())

    txt_embs = np.vstack(txt_embs)
    img_embs = np.vstack(img_embs)
    np.save(f"{prefix}_txt_emb.npy", txt_embs)
    np.save(f"{prefix}_img_emb.npy", img_embs)
    print(f"‚úÖ Saved embeddings: {prefix}")
    return txt_embs, img_embs


def _fetch_image_or_blank(url):
    """Download ``url`` as an RGB image, or return a blank 224x224 image on any failure."""
    try:
        response = requests.get(url, timeout=10)
        # Treat HTTP error pages as failures instead of handing them to PIL.
        response.raise_for_status()
        # Decode from the fully read (and content-decoded) body rather than
        # the raw socket stream, which may still be transfer-compressed.
        return Image.open(BytesIO(response.content)).convert("RGB")
    except Exception:
        # Best-effort by design; ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt stop a long embedding run.
        return Image.new("RGB", (224, 224))

class Regressor(nn.Module):
    """Three-layer MLP mapping a feature vector to a single scalar.

    Layout: ``in_dim -> 512 -> 128 -> 1`` with ReLU after the first two linear
    layers and dropout (p=0.2) after the first activation. The layers live in
    ``self.net`` so state-dict keys are ``net.0.*``, ``net.3.*`` and ``net.5.*``.
    """

    def __init__(self, in_dim):
        super().__init__()
        wide, narrow = 512, 128
        layers = [
            nn.Linear(in_dim, wide),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(wide, narrow),
            nn.ReLU(),
            nn.Linear(narrow, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return predictions of shape ``(..., 1)`` for inputs of shape ``(..., in_dim)``."""
        prediction = self.net(x)
        return prediction

def train_price_regressor(img_embs, txt_embs, prices, device, epochs=10):
    """Fit a ``Regressor`` on concatenated image and text embeddings.

    The two embedding arrays are joined column-wise (image columns first) and
    the whole set is used as one batch per epoch, optimising mean absolute
    error with AdamW (lr=1e-4). The per-epoch loss is printed and the final
    weights are saved to ``regressor.pth``.

    Args:
        img_embs: 2-D array of image embeddings, one row per sample.
        txt_embs: 2-D array of text embeddings, one row per sample.
        prices: Target values, one per sample.
        device: Device used for the tensors and the model.
        epochs: Number of full-batch optimisation steps.

    Returns:
        The trained ``Regressor`` instance.
    """
    features = np.concatenate([img_embs, txt_embs], axis=1)
    X = torch.tensor(features, dtype=torch.float32).to(device)
    # Targets as a column vector so they line up with the (N, 1) predictions.
    y = torch.tensor(prices, dtype=torch.float32).to(device).view(-1, 1)

    reg = Regressor(X.shape[1]).to(device)
    opt = optim.AdamW(reg.parameters(), lr=1e-4)
    loss_fn = nn.L1Loss()

    for epoch in range(epochs):
        reg.train()
        opt.zero_grad()
        loss = loss_fn(reg(X), y)
        loss.backward()
        opt.step()
        print(f"Epoch {epoch+1} | MAE: {loss.item():.4f}")

    torch.save(reg.state_dict(), "regressor.pth")
    return reg

def main(test_mode=True):
    """Run the full pipeline: load data, tune CLIP, embed, train the regressor.

    Args:
        test_mode: When true, only the first 100 rows are used, CLIP is tuned
            for 1 epoch and the regressor for 5; otherwise 2 and 10 epochs on
            the full data.
    """
    print("üöÄ Starting CLIP-based Product Pricing Pipeline")

    # 1. Load and preprocess data
    print("üìä Loading training data...")
    df_train = load_and_preprocess_data(test_mode=test_mode, test_samples=100)
    print(f"Loaded {len(df_train)} training samples")

    # 2. Setup CLIP model
    print("ü§ñ Setting up CLIP model with LoRA...")
    model, processor, device = setup_clip_model()

    # 3. Fine-tune CLIP
    print("üîß Fine-tuning CLIP on product data...")
    fine_tune_clip(df_train, model, processor, device, epochs=1 if test_mode else 2)

    # 4. Generate embeddings
    print("üéØ Generating embeddings...")
    # generate_embeddings returns (text, image) in that order; unpack to match
    # so the names below really hold what they say.
    txt_embs, img_embs = generate_embeddings(df_train, model, processor, device)

    # 5. Train price regressor
    print("üí∞ Training price regressor...")
    regressor = train_price_regressor(img_embs, txt_embs, df_train["price"].values, device, epochs=5 if test_mode else 10)

    print("‚úÖ Training pipeline complete!")
    print("Saved models: clip_lora_finetuned.pth, regressor.pth")
    print("Saved embeddings: train_txt_emb.npy, train_img_emb.npy")

# Entry point: runs the pipeline when this cell (or script) is executed directly.
if __name__ == "__main__":
    # test_mode=True  -> quick smoke run on the first 100 rows with fewer epochs
    # test_mode=False -> full training run on the whole training CSV
    main(test_mode=True)


2025-10-11 12:21:06.575801: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760185266.792885      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760185266.859508      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


üöÄ Starting CLIP-based Product Pricing Pipeline
üìä Loading training data...
üß™ Test mode: Using only 100 samples
Loaded 100 training samples
ü§ñ Setting up CLIP model with LoRA...


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

‚úÖ Trainable parameters: 655,361 || Total: 151,277,313 || Trainable%: 0.4332
üîß Fine-tuning CLIP on product data...


Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [00:30<00:00,  1.22s/it]


Epoch 1 | Loss: 0.0391
‚úÖ LoRA fine-tuning complete
üéØ Generating embeddings...


Generating train embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:32<00:00,  3.10it/s]

‚úÖ Saved embeddings: train
üí∞ Training price regressor...
Epoch 1 | MAE: 29.3274
Epoch 2 | MAE: 29.3209
Epoch 3 | MAE: 29.3154
Epoch 4 | MAE: 29.3090
Epoch 5 | MAE: 29.3037
‚úÖ Training pipeline complete!
Saved models: clip_lora_finetuned.pth, regressor.pth
Saved embeddings: train_txt_emb.npy, train_img_emb.npy



