# **ML Challenge 2025: Smart Product Pricing Solution**

#### Team Name: Algorithmic Minds
#### Team Members: Yash Handa, Suhani Kumari, Vishakha Gupta
#### Submission Date: 13/10/2025

In [1]:
import os
import re
import time
import urllib
from pathlib import Path
from functools import partial
from tqdm import tqdm
import numpy as np
import pandas as pd
from google.colab import files
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import models, transforms
from sklearn.model_selection import KFold

In [2]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
BATCH_SIZE = 64
NUM_EPOCHS = 100
LR = 1e-4
WEIGHT_DECAY = 1e-5
PATIENCE = 6
MODEL_SAVE_PATH = 'models/multimodal_model.pth'
SEED = 42
NUM_DOWNLOAD_WORKERS = 8
IMAGE_SIZE = (300, 300)
# Switch to all-mpnet-base-v2 for text embeddings
TEXT_EMB_MODEL_NAME = 'all-mpnet-base-v2'
ACCUMULATION_STEPS = 2 # gradient accumulation
NUM_FOLDS = 5

In [4]:
torch.manual_seed(SEED)
np.random.seed(SEED)
os.makedirs('models', exist_ok=True)
os.makedirs('features', exist_ok=True)

In [5]:
# upload train.csv from datasets folder
uploaded = files.upload()

Saving train.csv to train.csv


In [6]:
# upload test.csv from datasets folder
uploaded = files.upload()

Saving test.csv to test.csv


In [7]:
train_df = pd.read_csv('train.csv')

In [8]:
def clean_text(text):
    """Lightweight cleaning used before sending to sentence-transformers."""
    if isinstance(text, str):
        text = re.sub(r'<.*?>', ' ', text)
        text = re.sub(r'[^a-zA-Z0-9., ]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text.lower()
    return ""

In [9]:
def smape(y_true, y_pred, eps=1e-8):
    num = np.abs(y_pred - y_true)
    den = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return float(np.mean(num / (den + eps)) * 100.0)

In [10]:
def create_blank_image(save_path, size=IMAGE_SIZE):
    img = Image.new('RGB', size, (0, 0, 0))
    img.save(save_path)

In [11]:
def download_image(image_link, savefolder, max_retries=3, timeout=10):
    """Download single image with retries. Returns path or None."""
    try:
        if not isinstance(image_link, str) or image_link.strip() == "":
            return None
        filename = Path(image_link).name
        image_save_path = os.path.join(savefolder, filename)
        if os.path.exists(image_save_path):
            return image_save_path
        for attempt in range(max_retries):
            try:
                urllib.request.urlretrieve(image_link, image_save_path)
                return image_save_path
            except Exception as e:
                if attempt + 1 == max_retries:
                    # create placeholder
                    try:
                        create_blank_image(image_save_path)
                        return image_save_path
                    except Exception:
                        return None
                time.sleep(1 + attempt)
    except Exception:
        return None

In [12]:
def download_images_parallel(image_links, download_folder, workers=NUM_DOWNLOAD_WORKERS):
    os.makedirs(download_folder, exist_ok=True)
    results = []
    download_partial = partial(download_image, savefolder=download_folder)
    # simple threaded approach using map for portability
    from multiprocessing.pool import ThreadPool
    with ThreadPool(workers) as pool:
        for path in tqdm(pool.imap(download_partial, image_links), total=len(image_links)):
            results.append(path)
    return results

In [13]:
def compute_text_embeddings(df, text_col='catalog_content', out_pkl='features/train_text_embeddings.pkl', model_name=TEXT_EMB_MODEL_NAME):
    """
    Compute or load text embeddings.
    Output: pandas DataFrame with columns: sample_id, text_0, text_1, ..., text_{D-1}
    """
    if os.path.exists(out_pkl):
        print("Loading text embeddings from", out_pkl)
        return pd.read_pickle(out_pkl)

    if SentenceTransformer is None:
        raise RuntimeError("sentence-transformers not available. Install sentence-transformers.")

    print("Cleaning text...")
    df['clean_content'] = df[text_col].astype(str).apply(clean_text)

    print("Loading sentence-transformer:", model_name)
    model = SentenceTransformer(model_name, device=DEVICE)

    texts = df['clean_content'].tolist()
    print("Encoding texts:", len(texts))
    embeddings = model.encode(texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True, device=DEVICE)
    print("Embeddings shape:", embeddings.shape)

    emb_df = pd.DataFrame(embeddings, columns=[f"text_{i}" for i in range(embeddings.shape[1])])
    emb_df.insert(0, 'sample_id', df['sample_id'].values)
    emb_df.to_pickle(out_pkl)
    print("Saved text embeddings to", out_pkl)
    return emb_df

In [14]:
def compute_image_features_from_folder(df, image_folder, out_pkl='features/train_image_features.pkl'):
    """
    Extract image features with EfficientNet-B3 pretrained (remove classifier).
    Output: pandas DataFrame with columns: sample_id, img_0, img_1, ..., img_{M-1}
    """
    if os.path.exists(out_pkl):
        print("Loading image features from", out_pkl)
        return pd.read_pickle(out_pkl)

    device = DEVICE
    preprocess = transforms.Compose([
        transforms.Resize(IMAGE_SIZE),
        transforms.CenterCrop(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    try:
        eff = models.efficientnet_b3(weights=models.EfficientNet_B3_Weights.IMAGENET1K_V1)
    except Exception:
        # fallback: try efficientnet_b4
        eff = models.efficientnet_b4(weights=models.EfficientNet_B4_Weights.IMAGENET1K_V1)
    # remove classifier
    if hasattr(eff, 'classifier'):
        eff.classifier = nn.Identity()
    eff = eff.to(device)
    eff.eval()

    # determine feature dim by a dummy forward
    with torch.no_grad():
        dummy = torch.zeros((1, 3, IMAGE_SIZE[0], IMAGE_SIZE[1])).to(device)
        try:
            feat_dim = eff(dummy).cpu().numpy().flatten().shape[0]
        except Exception:
            # fallback: run a single real image shape through
            feat_dim = 1536

    features = []
    sample_ids = []
    print("Extracting image features for", len(df), "samples")
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        sample_id = row['sample_id']
        image_url = row.get('image_link', "")
        filename = os.path.basename(str(image_url))
        img_path = os.path.join(image_folder, filename)

        try:
            img = Image.open(img_path).convert('RGB')
            tensor = preprocess(img).unsqueeze(0).to(device)
            with torch.no_grad():
                feat = eff(tensor).cpu().numpy().flatten()
        except Exception as e:
            # fallback zero vector with expected dim
            # print(f"Warning: failed to extract image for sample {sample_id}. Using zeros. Err:", e)
            feat = np.zeros((feat_dim,), dtype=np.float32)

        features.append(feat)
        sample_ids.append(sample_id)

    feat_arr = np.vstack(features)
    img_df = pd.DataFrame(feat_arr, columns=[f"img_{i}" for i in range(feat_arr.shape[1])])
    img_df.insert(0, 'sample_id', sample_ids)
    img_df.to_pickle(out_pkl)
    print("Saved image features to", out_pkl)
    return img_df

In [15]:
def load_and_merge_features(text_pkl, img_pkl, csv_with_price=None):
    """
    Loads text_pkl and img_pkl (pandas), optionally merges price column from csv_with_price.
    Returns merged dataframe with columns: sample_id, text_*, img_*, price (if available)
    """
    text_df = pd.read_pickle(text_pkl)
    img_df = pd.read_pickle(img_pkl)
    merged = text_df.merge(img_df, on='sample_id', how='inner')

    if csv_with_price is not None:
        clean_df = pd.read_csv(csv_with_price)[['sample_id', 'price']]
        merged = merged.merge(clean_df, on='sample_id', how='inner')
    return merged

In [16]:
class TrainMultimodalDataset(Dataset):
    def __init__(self, df, target_col='price'):
        self.df = df.reset_index(drop=True)
        self.sample_ids = self.df['sample_id'].values
        self.y = self.df[target_col].values.astype(np.float32)
        # Drop identifier and target -> float32 features
        self.X = self.df.drop(columns=['sample_id', target_col]).values.astype(np.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx]), torch.tensor(self.y[idx], dtype=torch.float32)

In [17]:
class TestMultimodalDataset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)
        self.sample_ids = self.df['sample_id'].values
        self.X = self.df.drop(columns=['sample_id']).values.astype(np.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx]), self.sample_ids[idx]

In [18]:
class MultimodalRegressor(nn.Module):
    def __init__(self, text_dim, img_dim, hidden_dim=256, dropout=0.3):
        super().__init__()
        self.text_dim = text_dim
        self.img_dim = img_dim

        # simple encoders
        self.text_fc = nn.Sequential(
            nn.Linear(self.text_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout)
        )
        self.img_fc = nn.Sequential(
            nn.Linear(self.img_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout)
        )

        # cross-attention / gating improvement: compute attention gate from concatenated features
        self.attn_gate = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim // 2),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim // 2, hidden_dim),
            nn.Sigmoid()
        )

        self.fusion = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(inplace=True),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 64),
            nn.ReLU(inplace=True),
            nn.LayerNorm(64),
            nn.Dropout(dropout),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        # x: [B, text_dim + img_dim]
        text = x[:, :self.text_dim]
        img = x[:, self.text_dim:self.text_dim + self.img_dim]
        t = self.text_fc(text)
        v = self.img_fc(img)
        # attention gate
        gate = self.attn_gate(torch.cat([t, v], dim=1))
        t = t * gate
        v = v * gate
        fused = torch.cat([t, v], dim=1)
        out = self.fusion(fused)
        return out.view(-1)

In [19]:
def train_loop(train_loader, val_loader, model, num_epochs=NUM_EPOCHS, accum_steps=ACCUMULATION_STEPS):
    # use Huber-like loss (Smooth L1) as requested
    criterion = nn.SmoothL1Loss()
    optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

    best_val_smape = float('inf')
    epochs_no_improve = 0

    scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=='cuda'))

    for epoch in range(1, num_epochs + 1):
        model.train()
        train_losses = []
        optimizer.zero_grad()
        for step, (X_batch, y_batch) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch} train", leave=False)):
            X_batch = X_batch.to(DEVICE)
            y_batch = y_batch.to(DEVICE)

            y_log = torch.log1p(y_batch)  # train on log target
            with torch.cuda.amp.autocast(enabled=(DEVICE=='cuda')):
                preds_log = model(X_batch)
                loss = criterion(preds_log, y_log)
                loss = loss / accum_steps

            scaler.scale(loss).backward()

            if (step + 1) % accum_steps == 0 or (step + 1) == len(train_loader):
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 3.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            train_losses.append(loss.item() * accum_steps)

        avg_train_loss = float(np.mean(train_losses)) if train_losses else 0.0

        # evaluate
        val_preds, val_trues = evaluate(model, val_loader)
        val_preds = np.clip(val_preds, 0.0, None)
        val_smape = smape(val_trues, val_preds)

        print(f"Epoch {epoch} | TrainLoss(logHuber): {avg_train_loss:.6f} | Val SMAPE: {val_smape:.4f}%")
        scheduler.step(val_smape)

        if val_smape < best_val_smape - 1e-8:
            best_val_smape = val_smape
            epochs_no_improve = 0
            torch.save({
                'model_state_dict': model.state_dict(),
                'text_dim': model.text_dim,
                'img_dim': model.img_dim,
                'epoch': epoch,
                'val_smape': val_smape
            }, MODEL_SAVE_PATH)
            print(f"--> Best model saved (SMAPE {val_smape:.4f}%) at epoch {epoch}")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print("Early stopping triggered.")
                break

    print(f"Training finished. Best val SMAPE: {best_val_smape:.4f}%")

In [20]:
def evaluate(model, loader):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(DEVICE)
            y_batch = y_batch.to(DEVICE)
            out_log = model(X_batch)
            pred = torch.expm1(out_log).cpu().numpy()
            preds.append(pred)
            trues.append(y_batch.cpu().numpy())
    if len(preds) == 0:
        return np.array([]), np.array([])
    return np.concatenate(preds), np.concatenate(trues)

In [21]:
def predict_test_and_save(model_path, test_merged_df, out_csv='submission.csv'):
    ckpt = torch.load(model_path, map_location=DEVICE)
    text_dim = ckpt['text_dim']
    img_dim = ckpt['img_dim']
    model = MultimodalRegressor(text_dim=text_dim, img_dim=img_dim).to(DEVICE)
    model.load_state_dict(ckpt['model_state_dict'])
    model.eval()

    # ensure ordering of columns: sample_id then text_* then img_*
    cols = list(test_merged_df.columns)
    assert 'sample_id' in cols
    feature_cols = [c for c in cols if c != 'sample_id']

    test_ds = TestMultimodalDataset(test_merged_df[['sample_id'] + feature_cols])
    test_loader = DataLoader(test_ds, batch_size=256, shuffle=False, num_workers=2)

    sample_ids = []
    all_preds = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Predicting test"):
            X_batch, ids = batch
            X_batch = X_batch.to(DEVICE)
            out_log = model(X_batch)
            pred = torch.expm1(out_log).cpu().numpy()
            all_preds.extend(pred.tolist())
            sample_ids.extend(ids)

    preds = np.clip(np.array(all_preds), 0.0, None)
    out_df = pd.DataFrame({'sample_id': sample_ids, 'price': preds})
    # Ensure sample_id order matches test file order if provided
    out_df.to_csv(out_csv, index=False)
    print("Saved predictions to", out_csv)

In [22]:
if __name__ == "__main__":
    TRAIN_CSV = 'train.csv'
    TEST_CSV = 'test.csv'
    IMAGE_FOLDER_TRAIN = 'dataset/train_images'
    IMAGE_FOLDER_TEST = 'dataset/test_images'
    TEXT_PKL_TRAIN = 'features/train_text_embeddings.pkl'
    IMG_PKL_TRAIN = 'features/train_image_features.pkl'
    TEXT_PKL_TEST = 'features/test_text_embeddings.pkl'
    IMG_PKL_TEST = 'features/test_image_features.pkl'

    # --- Load CSVs ---
    train_df = pd.read_csv(TRAIN_CSV)
    test_df = pd.read_csv(TEST_CSV)
    print("Train samples:", len(train_df), "Test samples:", len(test_df))

    # --- Compute / Load text embeddings ---
    text_train_df = compute_text_embeddings(train_df, out_pkl=TEXT_PKL_TRAIN)
    text_test_df = compute_text_embeddings(test_df, out_pkl=TEXT_PKL_TEST)

    # --- Download images (train + test) to local folder ---
    train_image_links = train_df['image_link'].fillna("").astype(str).tolist()
    os.makedirs(IMAGE_FOLDER_TRAIN, exist_ok=True)
    print("Downloading train images...")
    download_images_parallel(train_image_links, IMAGE_FOLDER_TRAIN, workers=NUM_DOWNLOAD_WORKERS)

    test_image_links = test_df['image_link'].fillna("").astype(str).tolist()
    os.makedirs(IMAGE_FOLDER_TEST, exist_ok=True)
    print("Downloading test images...")
    download_images_parallel(test_image_links, IMAGE_FOLDER_TEST, workers=NUM_DOWNLOAD_WORKERS)

    # --- Compute image features (from downloaded folder) ---
    img_train_df = compute_image_features_from_folder(train_df, IMAGE_FOLDER_TRAIN, out_pkl=IMG_PKL_TRAIN)
    img_test_df = compute_image_features_from_folder(test_df, IMAGE_FOLDER_TEST, out_pkl=IMG_PKL_TEST)

    # --- Merge features with price (train) and without price (test) ---
    train_merged = text_train_df.merge(img_train_df, on='sample_id', how='inner').merge(train_df[['sample_id', 'price']], on='sample_id', how='inner')
    test_merged = text_test_df.merge(img_test_df, on='sample_id', how='inner')  # no price

    print("Train merged shape:", train_merged.shape)
    print("Test merged shape:", test_merged.shape)

    # --- K-Fold cross-validation ---
    feature_cols = [c for c in train_merged.columns if c not in ('sample_id', 'price')]
    text_dim = len([c for c in feature_cols if c.startswith('text_')])
    img_dim = len([c for c in feature_cols if c.startswith('img_')])
    print("Detected text_dim:", text_dim, "img_dim:", img_dim)

    kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
    fold = 0
    val_smape_scores = []
    for train_idx, val_idx in kf.split(train_merged):
        fold += 1
        print(f"Starting fold {fold}/{NUM_FOLDS}")
        train_part = train_merged.iloc[train_idx].reset_index(drop=True)
        val_part = train_merged.iloc[val_idx].reset_index(drop=True)

        train_dataset = TrainMultimodalDataset(train_part)
        val_dataset = TrainMultimodalDataset(val_part)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

        # build model for this fold
        model = MultimodalRegressor(text_dim=text_dim, img_dim=img_dim, hidden_dim=256, dropout=0.3).to(DEVICE)
        print(model)

        train_loop(train_loader, val_loader, model, num_epochs=NUM_EPOCHS, accum_steps=ACCUMULATION_STEPS)

        # after training load best saved model and evaluate on val
        if os.path.exists(MODEL_SAVE_PATH):
            ckpt = torch.load(MODEL_SAVE_PATH, map_location=DEVICE)
            model.load_state_dict(ckpt['model_state_dict'])
            val_preds, val_trues = evaluate(model, val_loader)
            val_preds = np.clip(val_preds, 0.0, None)
            val_smape = smape(val_trues, val_preds)
            val_smape_scores.append(val_smape)
            print(f"Fold {fold} val SMAPE: {val_smape:.4f}%")

    if len(val_smape_scores) > 0:
        print("K-Fold CV mean SMAPE:", np.mean(val_smape_scores))

    # --- predict on test and save CSV ---
    predict_test_and_save(MODEL_SAVE_PATH, test_merged, out_csv='submission.csv')

    print("All done. Submission: submission.csv")


Train samples: 75000 Test samples: 75000
Cleaning text...
Loading sentence-transformer: all-mpnet-base-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding texts: 75000


Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Embeddings shape: (75000, 768)
Saved text embeddings to features/train_text_embeddings.pkl
Cleaning text...
Loading sentence-transformer: all-mpnet-base-v2
Encoding texts: 75000


Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Embeddings shape: (75000, 768)
Saved text embeddings to features/test_text_embeddings.pkl
Downloading train images...


100%|██████████| 75000/75000 [23:21<00:00, 53.51it/s] 


Downloading test images...


100%|██████████| 75000/75000 [19:28<00:00, 64.18it/s]


Downloading: "https://download.pytorch.org/models/efficientnet_b3_rwightman-b3899882.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b3_rwightman-b3899882.pth


100%|██████████| 47.2M/47.2M [00:00<00:00, 134MB/s]


Extracting image features for 75000 samples


100%|██████████| 75000/75000 [1:14:48<00:00, 16.71it/s]


Saved image features to features/train_image_features.pkl
Extracting image features for 75000 samples


100%|██████████| 75000/75000 [1:13:53<00:00, 16.92it/s]


Saved image features to features/test_image_features.pkl
Train merged shape: (75000, 2306)
Test merged shape: (75000, 2305)
Detected text_dim: 768 img_dim: 1536
Starting fold 1/5


  scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=='cuda'))


MultimodalRegressor(
  (text_fc): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
  )
  (img_fc): Sequential(
    (0): Linear(in_features=1536, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
  )
  (attn_gate): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): Sigmoid()
  )
  (fusion): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=64, bias=True)
    (5): ReLU(inplace=True)
    (6): LayerNorm((64,), eps=

  with torch.cuda.amp.autocast(enabled=(DEVICE=='cuda')):


Epoch 1 | TrainLoss(logHuber): 0.381640 | Val SMAPE: 61.2165%
--> Best model saved (SMAPE 61.2165%) at epoch 1




Epoch 2 | TrainLoss(logHuber): 0.306088 | Val SMAPE: 59.8276%
--> Best model saved (SMAPE 59.8276%) at epoch 2




Epoch 3 | TrainLoss(logHuber): 0.285142 | Val SMAPE: 59.8496%




Epoch 4 | TrainLoss(logHuber): 0.272037 | Val SMAPE: 57.0454%
--> Best model saved (SMAPE 57.0454%) at epoch 4




Epoch 5 | TrainLoss(logHuber): 0.258697 | Val SMAPE: 56.7215%
--> Best model saved (SMAPE 56.7215%) at epoch 5




Epoch 6 | TrainLoss(logHuber): 0.246963 | Val SMAPE: 56.0469%
--> Best model saved (SMAPE 56.0469%) at epoch 6




Epoch 7 | TrainLoss(logHuber): 0.237092 | Val SMAPE: 55.6171%
--> Best model saved (SMAPE 55.6171%) at epoch 7




Epoch 8 | TrainLoss(logHuber): 0.225345 | Val SMAPE: 55.6660%




Epoch 9 | TrainLoss(logHuber): 0.214295 | Val SMAPE: 55.1846%
--> Best model saved (SMAPE 55.1846%) at epoch 9




Epoch 10 | TrainLoss(logHuber): 0.206720 | Val SMAPE: 54.8657%
--> Best model saved (SMAPE 54.8657%) at epoch 10




Epoch 11 | TrainLoss(logHuber): 0.195670 | Val SMAPE: 55.6432%




Epoch 12 | TrainLoss(logHuber): 0.187411 | Val SMAPE: 55.0145%




Epoch 13 | TrainLoss(logHuber): 0.179850 | Val SMAPE: 54.9545%




Epoch 14 | TrainLoss(logHuber): 0.173035 | Val SMAPE: 54.6125%
--> Best model saved (SMAPE 54.6125%) at epoch 14




Epoch 15 | TrainLoss(logHuber): 0.165182 | Val SMAPE: 54.3408%
--> Best model saved (SMAPE 54.3408%) at epoch 15




Epoch 16 | TrainLoss(logHuber): 0.158567 | Val SMAPE: 54.3087%
--> Best model saved (SMAPE 54.3087%) at epoch 16




Epoch 17 | TrainLoss(logHuber): 0.152170 | Val SMAPE: 54.8101%




Epoch 18 | TrainLoss(logHuber): 0.147241 | Val SMAPE: 54.7158%




Epoch 19 | TrainLoss(logHuber): 0.143239 | Val SMAPE: 54.9617%




Epoch 20 | TrainLoss(logHuber): 0.137518 | Val SMAPE: 54.2922%
--> Best model saved (SMAPE 54.2922%) at epoch 20




Epoch 21 | TrainLoss(logHuber): 0.132432 | Val SMAPE: 54.1924%
--> Best model saved (SMAPE 54.1924%) at epoch 21




Epoch 22 | TrainLoss(logHuber): 0.129716 | Val SMAPE: 54.2564%




Epoch 23 | TrainLoss(logHuber): 0.126061 | Val SMAPE: 54.5573%




Epoch 24 | TrainLoss(logHuber): 0.122391 | Val SMAPE: 54.3917%




Epoch 25 | TrainLoss(logHuber): 0.119316 | Val SMAPE: 54.0458%
--> Best model saved (SMAPE 54.0458%) at epoch 25




Epoch 26 | TrainLoss(logHuber): 0.116391 | Val SMAPE: 54.2111%




Epoch 27 | TrainLoss(logHuber): 0.112709 | Val SMAPE: 54.0489%




Epoch 28 | TrainLoss(logHuber): 0.109600 | Val SMAPE: 53.7998%
--> Best model saved (SMAPE 53.7998%) at epoch 28




Epoch 29 | TrainLoss(logHuber): 0.108394 | Val SMAPE: 53.8749%




Epoch 30 | TrainLoss(logHuber): 0.106153 | Val SMAPE: 54.0273%




Epoch 31 | TrainLoss(logHuber): 0.103679 | Val SMAPE: 54.3541%




Epoch 32 | TrainLoss(logHuber): 0.101778 | Val SMAPE: 53.9099%




Epoch 33 | TrainLoss(logHuber): 0.093876 | Val SMAPE: 53.7251%
--> Best model saved (SMAPE 53.7251%) at epoch 33




Epoch 34 | TrainLoss(logHuber): 0.089965 | Val SMAPE: 53.7965%




Epoch 35 | TrainLoss(logHuber): 0.087132 | Val SMAPE: 53.9188%




Epoch 36 | TrainLoss(logHuber): 0.086209 | Val SMAPE: 53.4406%
--> Best model saved (SMAPE 53.4406%) at epoch 36




Epoch 37 | TrainLoss(logHuber): 0.085182 | Val SMAPE: 53.6911%




Epoch 38 | TrainLoss(logHuber): 0.083524 | Val SMAPE: 53.2726%
--> Best model saved (SMAPE 53.2726%) at epoch 38




Epoch 39 | TrainLoss(logHuber): 0.082779 | Val SMAPE: 53.4936%




Epoch 40 | TrainLoss(logHuber): 0.081833 | Val SMAPE: 53.6175%




Epoch 41 | TrainLoss(logHuber): 0.081244 | Val SMAPE: 53.7011%




Epoch 42 | TrainLoss(logHuber): 0.080444 | Val SMAPE: 53.4888%




Epoch 43 | TrainLoss(logHuber): 0.076104 | Val SMAPE: 53.3279%




Epoch 44 | TrainLoss(logHuber): 0.075264 | Val SMAPE: 53.3058%
Early stopping triggered.
Training finished. Best val SMAPE: 53.2726%
Fold 1 val SMAPE: 53.2726%
Starting fold 2/5


  scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=='cuda'))


MultimodalRegressor(
  (text_fc): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
  )
  (img_fc): Sequential(
    (0): Linear(in_features=1536, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
  )
  (attn_gate): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): Sigmoid()
  )
  (fusion): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=64, bias=True)
    (5): ReLU(inplace=True)
    (6): LayerNorm((64,), eps=

  with torch.cuda.amp.autocast(enabled=(DEVICE=='cuda')):


Epoch 1 | TrainLoss(logHuber): 0.371077 | Val SMAPE: 62.3610%
--> Best model saved (SMAPE 62.3610%) at epoch 1




Epoch 2 | TrainLoss(logHuber): 0.302685 | Val SMAPE: 58.7174%
--> Best model saved (SMAPE 58.7174%) at epoch 2




Epoch 3 | TrainLoss(logHuber): 0.284561 | Val SMAPE: 58.0519%
--> Best model saved (SMAPE 58.0519%) at epoch 3




Epoch 4 | TrainLoss(logHuber): 0.269416 | Val SMAPE: 56.3063%
--> Best model saved (SMAPE 56.3063%) at epoch 4




Epoch 5 | TrainLoss(logHuber): 0.258096 | Val SMAPE: 56.1859%
--> Best model saved (SMAPE 56.1859%) at epoch 5




Epoch 6 | TrainLoss(logHuber): 0.245256 | Val SMAPE: 55.2185%
--> Best model saved (SMAPE 55.2185%) at epoch 6




Epoch 7 | TrainLoss(logHuber): 0.233877 | Val SMAPE: 54.6743%
--> Best model saved (SMAPE 54.6743%) at epoch 7




Epoch 8 | TrainLoss(logHuber): 0.224213 | Val SMAPE: 54.1193%
--> Best model saved (SMAPE 54.1193%) at epoch 8




Epoch 9 | TrainLoss(logHuber): 0.213279 | Val SMAPE: 53.9952%
--> Best model saved (SMAPE 53.9952%) at epoch 9




Epoch 10 | TrainLoss(logHuber): 0.203030 | Val SMAPE: 53.9546%
--> Best model saved (SMAPE 53.9546%) at epoch 10




Epoch 11 | TrainLoss(logHuber): 0.195022 | Val SMAPE: 53.6335%
--> Best model saved (SMAPE 53.6335%) at epoch 11




Epoch 12 | TrainLoss(logHuber): 0.185040 | Val SMAPE: 54.0308%




Epoch 13 | TrainLoss(logHuber): 0.176652 | Val SMAPE: 53.1727%
--> Best model saved (SMAPE 53.1727%) at epoch 13




Epoch 14 | TrainLoss(logHuber): 0.168047 | Val SMAPE: 53.3944%




Epoch 15 | TrainLoss(logHuber): 0.161406 | Val SMAPE: 53.8602%




Epoch 16 | TrainLoss(logHuber): 0.154216 | Val SMAPE: 53.1229%
--> Best model saved (SMAPE 53.1229%) at epoch 16




Epoch 17 | TrainLoss(logHuber): 0.149221 | Val SMAPE: 53.4516%




Epoch 18 | TrainLoss(logHuber): 0.143748 | Val SMAPE: 53.4558%




Epoch 19 | TrainLoss(logHuber): 0.138334 | Val SMAPE: 53.1272%




Epoch 20 | TrainLoss(logHuber): 0.133310 | Val SMAPE: 52.9691%
--> Best model saved (SMAPE 52.9691%) at epoch 20




Epoch 21 | TrainLoss(logHuber): 0.130458 | Val SMAPE: 53.1432%




Epoch 22 | TrainLoss(logHuber): 0.126003 | Val SMAPE: 53.7619%




Epoch 23 | TrainLoss(logHuber): 0.122251 | Val SMAPE: 52.9018%
--> Best model saved (SMAPE 52.9018%) at epoch 23




Epoch 24 | TrainLoss(logHuber): 0.118054 | Val SMAPE: 53.1031%




Epoch 25 | TrainLoss(logHuber): 0.116027 | Val SMAPE: 52.9049%




Epoch 26 | TrainLoss(logHuber): 0.113026 | Val SMAPE: 53.0155%




Epoch 27 | TrainLoss(logHuber): 0.109751 | Val SMAPE: 53.2169%




Epoch 28 | TrainLoss(logHuber): 0.101030 | Val SMAPE: 52.6844%
--> Best model saved (SMAPE 52.6844%) at epoch 28




Epoch 29 | TrainLoss(logHuber): 0.095399 | Val SMAPE: 52.9264%




Epoch 30 | TrainLoss(logHuber): 0.094737 | Val SMAPE: 52.7187%




Epoch 31 | TrainLoss(logHuber): 0.091392 | Val SMAPE: 52.8357%




Epoch 32 | TrainLoss(logHuber): 0.091148 | Val SMAPE: 52.6385%
--> Best model saved (SMAPE 52.6385%) at epoch 32




Epoch 33 | TrainLoss(logHuber): 0.089567 | Val SMAPE: 52.6598%




Epoch 34 | TrainLoss(logHuber): 0.088605 | Val SMAPE: 52.5499%
--> Best model saved (SMAPE 52.5499%) at epoch 34




Epoch 35 | TrainLoss(logHuber): 0.086743 | Val SMAPE: 52.5899%




Epoch 36 | TrainLoss(logHuber): 0.085581 | Val SMAPE: 52.8258%




Epoch 37 | TrainLoss(logHuber): 0.084730 | Val SMAPE: 52.4511%
--> Best model saved (SMAPE 52.4511%) at epoch 37




Epoch 38 | TrainLoss(logHuber): 0.083361 | Val SMAPE: 52.6447%




Epoch 39 | TrainLoss(logHuber): 0.082207 | Val SMAPE: 52.8249%




Epoch 40 | TrainLoss(logHuber): 0.081121 | Val SMAPE: 52.7718%




Epoch 41 | TrainLoss(logHuber): 0.080808 | Val SMAPE: 52.7022%




Epoch 42 | TrainLoss(logHuber): 0.076794 | Val SMAPE: 52.6197%




Epoch 43 | TrainLoss(logHuber): 0.075796 | Val SMAPE: 52.5856%
Early stopping triggered.
Training finished. Best val SMAPE: 52.4511%
Fold 2 val SMAPE: 52.4511%
Starting fold 3/5


  scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=='cuda'))


MultimodalRegressor(
  (text_fc): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
  )
  (img_fc): Sequential(
    (0): Linear(in_features=1536, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
  )
  (attn_gate): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): Sigmoid()
  )
  (fusion): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=64, bias=True)
    (5): ReLU(inplace=True)
    (6): LayerNorm((64,), eps=

  with torch.cuda.amp.autocast(enabled=(DEVICE=='cuda')):


Epoch 1 | TrainLoss(logHuber): 0.389589 | Val SMAPE: 61.0422%
--> Best model saved (SMAPE 61.0422%) at epoch 1




Epoch 2 | TrainLoss(logHuber): 0.310511 | Val SMAPE: 59.2568%
--> Best model saved (SMAPE 59.2568%) at epoch 2




Epoch 3 | TrainLoss(logHuber): 0.288268 | Val SMAPE: 57.1625%
--> Best model saved (SMAPE 57.1625%) at epoch 3




Epoch 4 | TrainLoss(logHuber): 0.277800 | Val SMAPE: 56.3021%
--> Best model saved (SMAPE 56.3021%) at epoch 4




Epoch 5 | TrainLoss(logHuber): 0.263979 | Val SMAPE: 56.1446%
--> Best model saved (SMAPE 56.1446%) at epoch 5




Epoch 6 | TrainLoss(logHuber): 0.254006 | Val SMAPE: 56.4466%




Epoch 7 | TrainLoss(logHuber): 0.242035 | Val SMAPE: 55.0240%
--> Best model saved (SMAPE 55.0240%) at epoch 7




Epoch 8 | TrainLoss(logHuber): 0.231174 | Val SMAPE: 54.5891%
--> Best model saved (SMAPE 54.5891%) at epoch 8




Epoch 9 | TrainLoss(logHuber): 0.220702 | Val SMAPE: 54.2681%
--> Best model saved (SMAPE 54.2681%) at epoch 9




Epoch 10 | TrainLoss(logHuber): 0.208669 | Val SMAPE: 54.0060%
--> Best model saved (SMAPE 54.0060%) at epoch 10




Epoch 11 | TrainLoss(logHuber): 0.202579 | Val SMAPE: 54.2133%




Epoch 12 | TrainLoss(logHuber): 0.191545 | Val SMAPE: 53.9222%
--> Best model saved (SMAPE 53.9222%) at epoch 12




Epoch 13 | TrainLoss(logHuber): 0.183752 | Val SMAPE: 53.9308%




Epoch 14 | TrainLoss(logHuber): 0.175654 | Val SMAPE: 53.8146%
--> Best model saved (SMAPE 53.8146%) at epoch 14




Epoch 15 | TrainLoss(logHuber): 0.167981 | Val SMAPE: 53.9784%




Epoch 16 | TrainLoss(logHuber): 0.161920 | Val SMAPE: 53.9602%




Epoch 17 | TrainLoss(logHuber): 0.155771 | Val SMAPE: 53.6622%
--> Best model saved (SMAPE 53.6622%) at epoch 17




Epoch 18 | TrainLoss(logHuber): 0.149687 | Val SMAPE: 53.7943%




Epoch 19 | TrainLoss(logHuber): 0.143717 | Val SMAPE: 54.0613%




Epoch 20 | TrainLoss(logHuber): 0.140759 | Val SMAPE: 53.7223%




Epoch 21 | TrainLoss(logHuber): 0.136423 | Val SMAPE: 53.6241%
--> Best model saved (SMAPE 53.6241%) at epoch 21




Epoch 22 | TrainLoss(logHuber): 0.132237 | Val SMAPE: 53.5421%
--> Best model saved (SMAPE 53.5421%) at epoch 22




Epoch 23 | TrainLoss(logHuber): 0.128988 | Val SMAPE: 53.4172%
--> Best model saved (SMAPE 53.4172%) at epoch 23




Epoch 24 | TrainLoss(logHuber): 0.124101 | Val SMAPE: 53.8099%




Epoch 25 | TrainLoss(logHuber): 0.121650 | Val SMAPE: 53.6536%




Epoch 26 | TrainLoss(logHuber): 0.117693 | Val SMAPE: 53.6741%




Epoch 27 | TrainLoss(logHuber): 0.115741 | Val SMAPE: 53.2289%
--> Best model saved (SMAPE 53.2289%) at epoch 27




Epoch 28 | TrainLoss(logHuber): 0.112768 | Val SMAPE: 53.7330%




Epoch 29 | TrainLoss(logHuber): 0.110037 | Val SMAPE: 53.5923%




Epoch 30 | TrainLoss(logHuber): 0.107810 | Val SMAPE: 53.3266%




Epoch 31 | TrainLoss(logHuber): 0.105416 | Val SMAPE: 53.3108%




Epoch 32 | TrainLoss(logHuber): 0.096292 | Val SMAPE: 53.1613%
--> Best model saved (SMAPE 53.1613%) at epoch 32




Epoch 33 | TrainLoss(logHuber): 0.091884 | Val SMAPE: 53.1901%




Epoch 34 | TrainLoss(logHuber): 0.090330 | Val SMAPE: 53.4164%




Epoch 35 | TrainLoss(logHuber): 0.090356 | Val SMAPE: 53.2362%




Epoch 36 | TrainLoss(logHuber): 0.088222 | Val SMAPE: 53.1395%
--> Best model saved (SMAPE 53.1395%) at epoch 36




Epoch 37 | TrainLoss(logHuber): 0.087175 | Val SMAPE: 53.2142%




Epoch 38 | TrainLoss(logHuber): 0.086164 | Val SMAPE: 53.2377%




Epoch 39 | TrainLoss(logHuber): 0.084476 | Val SMAPE: 53.0311%
--> Best model saved (SMAPE 53.0311%) at epoch 39




Epoch 40 | TrainLoss(logHuber): 0.083770 | Val SMAPE: 53.0314%




Epoch 41 | TrainLoss(logHuber): 0.082502 | Val SMAPE: 53.2564%




Epoch 42 | TrainLoss(logHuber): 0.081167 | Val SMAPE: 52.9746%
--> Best model saved (SMAPE 52.9746%) at epoch 42




Epoch 43 | TrainLoss(logHuber): 0.080710 | Val SMAPE: 53.2901%




Epoch 44 | TrainLoss(logHuber): 0.079108 | Val SMAPE: 52.9426%
--> Best model saved (SMAPE 52.9426%) at epoch 44




Epoch 45 | TrainLoss(logHuber): 0.078849 | Val SMAPE: 52.9139%
--> Best model saved (SMAPE 52.9139%) at epoch 45




Epoch 46 | TrainLoss(logHuber): 0.077756 | Val SMAPE: 53.0710%




Epoch 47 | TrainLoss(logHuber): 0.076226 | Val SMAPE: 53.0328%




Epoch 48 | TrainLoss(logHuber): 0.076680 | Val SMAPE: 52.8871%
--> Best model saved (SMAPE 52.8871%) at epoch 48




Epoch 49 | TrainLoss(logHuber): 0.075086 | Val SMAPE: 53.1758%




Epoch 50 | TrainLoss(logHuber): 0.074834 | Val SMAPE: 52.9081%




Epoch 51 | TrainLoss(logHuber): 0.073794 | Val SMAPE: 52.7971%
--> Best model saved (SMAPE 52.7971%) at epoch 51




Epoch 52 | TrainLoss(logHuber): 0.073912 | Val SMAPE: 53.2466%




Epoch 53 | TrainLoss(logHuber): 0.072964 | Val SMAPE: 53.1290%




Epoch 54 | TrainLoss(logHuber): 0.072219 | Val SMAPE: 52.9254%




Epoch 55 | TrainLoss(logHuber): 0.071860 | Val SMAPE: 52.8449%




Epoch 56 | TrainLoss(logHuber): 0.068377 | Val SMAPE: 53.0522%




Epoch 57 | TrainLoss(logHuber): 0.066690 | Val SMAPE: 52.9007%
Early stopping triggered.
Training finished. Best val SMAPE: 52.7971%
Fold 3 val SMAPE: 52.7971%
Starting fold 4/5


  scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=='cuda'))


MultimodalRegressor(
  (text_fc): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
  )
  (img_fc): Sequential(
    (0): Linear(in_features=1536, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
  )
  (attn_gate): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): Sigmoid()
  )
  (fusion): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=64, bias=True)
    (5): ReLU(inplace=True)
    (6): LayerNorm((64,), eps=

  with torch.cuda.amp.autocast(enabled=(DEVICE=='cuda')):


Epoch 1 | TrainLoss(logHuber): 0.385241 | Val SMAPE: 59.6294%
--> Best model saved (SMAPE 59.6294%) at epoch 1




Epoch 2 | TrainLoss(logHuber): 0.308989 | Val SMAPE: 58.3829%
--> Best model saved (SMAPE 58.3829%) at epoch 2




Epoch 3 | TrainLoss(logHuber): 0.289845 | Val SMAPE: 57.5078%
--> Best model saved (SMAPE 57.5078%) at epoch 3




Epoch 4 | TrainLoss(logHuber): 0.275933 | Val SMAPE: 56.1101%
--> Best model saved (SMAPE 56.1101%) at epoch 4




Epoch 5 | TrainLoss(logHuber): 0.263473 | Val SMAPE: 55.2910%
--> Best model saved (SMAPE 55.2910%) at epoch 5




Epoch 6 | TrainLoss(logHuber): 0.251371 | Val SMAPE: 54.8662%
--> Best model saved (SMAPE 54.8662%) at epoch 6




Epoch 7 | TrainLoss(logHuber): 0.240816 | Val SMAPE: 53.8924%
--> Best model saved (SMAPE 53.8924%) at epoch 7




Epoch 8 | TrainLoss(logHuber): 0.228950 | Val SMAPE: 54.7280%




Epoch 9 | TrainLoss(logHuber): 0.218932 | Val SMAPE: 53.8959%




Epoch 10 | TrainLoss(logHuber): 0.209944 | Val SMAPE: 53.2185%
--> Best model saved (SMAPE 53.2185%) at epoch 10




Epoch 11 | TrainLoss(logHuber): 0.199140 | Val SMAPE: 53.1443%
--> Best model saved (SMAPE 53.1443%) at epoch 11




Epoch 12 | TrainLoss(logHuber): 0.191429 | Val SMAPE: 53.0283%
--> Best model saved (SMAPE 53.0283%) at epoch 12




Epoch 13 | TrainLoss(logHuber): 0.182473 | Val SMAPE: 53.1976%




Epoch 14 | TrainLoss(logHuber): 0.176112 | Val SMAPE: 53.2508%




Epoch 15 | TrainLoss(logHuber): 0.167641 | Val SMAPE: 53.0606%




Epoch 16 | TrainLoss(logHuber): 0.161855 | Val SMAPE: 52.9047%
--> Best model saved (SMAPE 52.9047%) at epoch 16




Epoch 17 | TrainLoss(logHuber): 0.156231 | Val SMAPE: 52.8216%
--> Best model saved (SMAPE 52.8216%) at epoch 17




Epoch 18 | TrainLoss(logHuber): 0.149612 | Val SMAPE: 52.8859%




Epoch 19 | TrainLoss(logHuber): 0.145056 | Val SMAPE: 52.9185%




Epoch 20 | TrainLoss(logHuber): 0.140162 | Val SMAPE: 53.7320%




Epoch 21 | TrainLoss(logHuber): 0.135519 | Val SMAPE: 53.0410%




Epoch 22 | TrainLoss(logHuber): 0.123413 | Val SMAPE: 52.5860%
--> Best model saved (SMAPE 52.5860%) at epoch 22




Epoch 23 | TrainLoss(logHuber): 0.119619 | Val SMAPE: 52.6637%




Epoch 24 | TrainLoss(logHuber): 0.116137 | Val SMAPE: 52.5156%
--> Best model saved (SMAPE 52.5156%) at epoch 24




Epoch 25 | TrainLoss(logHuber): 0.113281 | Val SMAPE: 52.8228%




Epoch 26 | TrainLoss(logHuber): 0.110804 | Val SMAPE: 52.6277%




Epoch 27 | TrainLoss(logHuber): 0.108183 | Val SMAPE: 52.6233%




Epoch 28 | TrainLoss(logHuber): 0.106652 | Val SMAPE: 52.4918%
--> Best model saved (SMAPE 52.4918%) at epoch 28




Epoch 29 | TrainLoss(logHuber): 0.103628 | Val SMAPE: 52.4872%
--> Best model saved (SMAPE 52.4872%) at epoch 29




Epoch 30 | TrainLoss(logHuber): 0.103498 | Val SMAPE: 52.4677%
--> Best model saved (SMAPE 52.4677%) at epoch 30




Epoch 31 | TrainLoss(logHuber): 0.101157 | Val SMAPE: 52.5782%




Epoch 32 | TrainLoss(logHuber): 0.100238 | Val SMAPE: 52.5551%




Epoch 33 | TrainLoss(logHuber): 0.098974 | Val SMAPE: 52.6986%




Epoch 34 | TrainLoss(logHuber): 0.096705 | Val SMAPE: 52.8225%




Epoch 35 | TrainLoss(logHuber): 0.092320 | Val SMAPE: 52.4863%




Epoch 36 | TrainLoss(logHuber): 0.090350 | Val SMAPE: 52.3084%
--> Best model saved (SMAPE 52.3084%) at epoch 36




Epoch 37 | TrainLoss(logHuber): 0.089445 | Val SMAPE: 52.4784%




Epoch 38 | TrainLoss(logHuber): 0.088507 | Val SMAPE: 52.4599%




Epoch 39 | TrainLoss(logHuber): 0.087028 | Val SMAPE: 52.4067%




Epoch 40 | TrainLoss(logHuber): 0.086914 | Val SMAPE: 52.4769%




Epoch 41 | TrainLoss(logHuber): 0.083392 | Val SMAPE: 52.4501%




Epoch 42 | TrainLoss(logHuber): 0.083183 | Val SMAPE: 52.3981%
Early stopping triggered.
Training finished. Best val SMAPE: 52.3084%
Fold 4 val SMAPE: 52.3084%
Starting fold 5/5


  scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=='cuda'))


MultimodalRegressor(
  (text_fc): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
  )
  (img_fc): Sequential(
    (0): Linear(in_features=1536, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
  )
  (attn_gate): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): Sigmoid()
  )
  (fusion): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=64, bias=True)
    (5): ReLU(inplace=True)
    (6): LayerNorm((64,), eps=

  with torch.cuda.amp.autocast(enabled=(DEVICE=='cuda')):


Epoch 1 | TrainLoss(logHuber): 0.370249 | Val SMAPE: 60.6468%
--> Best model saved (SMAPE 60.6468%) at epoch 1




Epoch 2 | TrainLoss(logHuber): 0.301027 | Val SMAPE: 58.2692%
--> Best model saved (SMAPE 58.2692%) at epoch 2




Epoch 3 | TrainLoss(logHuber): 0.281943 | Val SMAPE: 57.0579%
--> Best model saved (SMAPE 57.0579%) at epoch 3




Epoch 4 | TrainLoss(logHuber): 0.269328 | Val SMAPE: 56.5123%
--> Best model saved (SMAPE 56.5123%) at epoch 4




Epoch 5 | TrainLoss(logHuber): 0.257593 | Val SMAPE: 55.7574%
--> Best model saved (SMAPE 55.7574%) at epoch 5




Epoch 6 | TrainLoss(logHuber): 0.246605 | Val SMAPE: 55.1922%
--> Best model saved (SMAPE 55.1922%) at epoch 6




Epoch 7 | TrainLoss(logHuber): 0.235420 | Val SMAPE: 54.7395%
--> Best model saved (SMAPE 54.7395%) at epoch 7




Epoch 8 | TrainLoss(logHuber): 0.224569 | Val SMAPE: 54.5371%
--> Best model saved (SMAPE 54.5371%) at epoch 8




Epoch 9 | TrainLoss(logHuber): 0.214341 | Val SMAPE: 54.5464%




Epoch 10 | TrainLoss(logHuber): 0.203407 | Val SMAPE: 54.4137%
--> Best model saved (SMAPE 54.4137%) at epoch 10




Epoch 11 | TrainLoss(logHuber): 0.194414 | Val SMAPE: 54.1992%
--> Best model saved (SMAPE 54.1992%) at epoch 11




Epoch 12 | TrainLoss(logHuber): 0.185639 | Val SMAPE: 53.9809%
--> Best model saved (SMAPE 53.9809%) at epoch 12




Epoch 13 | TrainLoss(logHuber): 0.178124 | Val SMAPE: 54.2154%




Epoch 14 | TrainLoss(logHuber): 0.170875 | Val SMAPE: 53.9629%
--> Best model saved (SMAPE 53.9629%) at epoch 14




Epoch 15 | TrainLoss(logHuber): 0.163091 | Val SMAPE: 53.8011%
--> Best model saved (SMAPE 53.8011%) at epoch 15




Epoch 16 | TrainLoss(logHuber): 0.158189 | Val SMAPE: 53.9229%




Epoch 17 | TrainLoss(logHuber): 0.151543 | Val SMAPE: 53.8119%




Epoch 18 | TrainLoss(logHuber): 0.146037 | Val SMAPE: 54.4627%




Epoch 19 | TrainLoss(logHuber): 0.141116 | Val SMAPE: 53.7379%
--> Best model saved (SMAPE 53.7379%) at epoch 19




Epoch 20 | TrainLoss(logHuber): 0.135025 | Val SMAPE: 53.6084%
--> Best model saved (SMAPE 53.6084%) at epoch 20




Epoch 21 | TrainLoss(logHuber): 0.130806 | Val SMAPE: 53.5152%
--> Best model saved (SMAPE 53.5152%) at epoch 21




Epoch 22 | TrainLoss(logHuber): 0.127157 | Val SMAPE: 53.7265%




Epoch 23 | TrainLoss(logHuber): 0.123731 | Val SMAPE: 53.8599%




Epoch 24 | TrainLoss(logHuber): 0.119530 | Val SMAPE: 53.4601%
--> Best model saved (SMAPE 53.4601%) at epoch 24




Epoch 25 | TrainLoss(logHuber): 0.116686 | Val SMAPE: 53.4560%
--> Best model saved (SMAPE 53.4560%) at epoch 25




Epoch 26 | TrainLoss(logHuber): 0.113975 | Val SMAPE: 53.5766%




Epoch 27 | TrainLoss(logHuber): 0.111617 | Val SMAPE: 53.8272%




Epoch 28 | TrainLoss(logHuber): 0.107855 | Val SMAPE: 53.5858%




Epoch 29 | TrainLoss(logHuber): 0.099323 | Val SMAPE: 53.2559%
--> Best model saved (SMAPE 53.2559%) at epoch 29




Epoch 30 | TrainLoss(logHuber): 0.095444 | Val SMAPE: 53.1910%
--> Best model saved (SMAPE 53.1910%) at epoch 30




Epoch 31 | TrainLoss(logHuber): 0.092298 | Val SMAPE: 53.4966%




Epoch 32 | TrainLoss(logHuber): 0.090380 | Val SMAPE: 53.1816%
--> Best model saved (SMAPE 53.1816%) at epoch 32




Epoch 33 | TrainLoss(logHuber): 0.089302 | Val SMAPE: 53.2895%




Epoch 34 | TrainLoss(logHuber): 0.089645 | Val SMAPE: 53.1217%
--> Best model saved (SMAPE 53.1217%) at epoch 34




Epoch 35 | TrainLoss(logHuber): 0.087063 | Val SMAPE: 53.4029%




Epoch 36 | TrainLoss(logHuber): 0.086156 | Val SMAPE: 53.4574%




Epoch 37 | TrainLoss(logHuber): 0.083766 | Val SMAPE: 53.0093%
--> Best model saved (SMAPE 53.0093%) at epoch 37




Epoch 38 | TrainLoss(logHuber): 0.083326 | Val SMAPE: 53.3972%




Epoch 39 | TrainLoss(logHuber): 0.082046 | Val SMAPE: 53.0922%




Epoch 40 | TrainLoss(logHuber): 0.080994 | Val SMAPE: 53.1742%




Epoch 41 | TrainLoss(logHuber): 0.080263 | Val SMAPE: 53.3170%




Epoch 42 | TrainLoss(logHuber): 0.077298 | Val SMAPE: 53.1633%




Epoch 43 | TrainLoss(logHuber): 0.074894 | Val SMAPE: 53.0818%
Early stopping triggered.
Training finished. Best val SMAPE: 53.0093%
Fold 5 val SMAPE: 53.0093%
K-Fold CV mean SMAPE: 52.76771545410156


Predicting test: 100%|██████████| 293/293 [00:04<00:00, 72.16it/s]


Saved predictions to submission.csv
All done. Submission: submission.csv


In [25]:
df_sub = pd.read_csv('submission.csv')
df_sub.shape

(75000, 2)

In [26]:
df_sub.head()

Unnamed: 0,sample_id,price
0,tensor(100179),12.246477
1,tensor(245611),23.463348
2,tensor(146263),22.017868
3,tensor(95658),2.920306
4,tensor(36806),16.240362


In [27]:
df_sub['sample_id'] = df_sub['sample_id'].astype(str).str.extract(r'tensor\((\d+)\)')[0].astype(int)
df_sub.to_csv('test_out.csv', index=False)

df_sub.head()

Unnamed: 0,sample_id,price
0,100179,12.246477
1,245611,23.463348
2,146263,22.017868
3,95658,2.920306
4,36806,16.240362
