In [70]:

# Auto-reload external modules when their source changes
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
import src.train as train
import src.models as models
import utils.helpers as helpers
import pandas as pd
from glob import glob
import seaborn as sns
import matplotlib.pyplot as plt

import os
import math
import random
import argparse
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import timm


import warnings
import sklearn.exceptions
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

# Path variables
BASE_PATH = "src/inputs/"
TRAIN_PATH = BASE_PATH + "train.csv"
TEST_PATH = BASE_PATH + "test.csv"
train_jpg = glob(BASE_PATH + "train/*.jpg")
test_jpg = glob(BASE_PATH + "test/*.jpg")



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
helpers.set_seeds()

In [4]:
train_df = pd.read_csv(BASE_PATH + "train.csv")
test_df =  pd.read_csv(BASE_PATH + "test.csv")


In [5]:
train_df.head()

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72


In [6]:
#path column added to access the images path
train_df['path'] = train_df['Id'].map(lambda x:str(BASE_PATH+'train/'+x)+'.jpg')
train_df = train_df.drop(columns=['Id'])
train_df = train_df.sample(frac=1).reset_index(drop=True) #shuffle dataframe
train_df.head()

Unnamed: 0,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,path
0,0,1,1,1,0,0,0,0,0,0,0,0,40,src/inputs/train/48d53aeabee4f92f77eee3a323343c77.jpg
1,0,1,1,0,0,0,0,1,0,0,0,0,64,src/inputs/train/b60e82fd313066b801fa4431d1ce4f4e.jpg
2,0,1,1,1,0,1,0,0,0,0,0,0,26,src/inputs/train/09ae71fc4eda1e0ae05680d1950bc009.jpg
3,0,0,1,1,0,0,1,0,0,0,0,0,73,src/inputs/train/263879abce68de4af02ef5f7ef873d24.jpg
4,0,1,1,1,0,0,1,0,0,0,0,0,39,src/inputs/train/4ea0587a137c7983ca92851b6cb36ca1.jpg


In [7]:
#normalizing 
train_df['norm_score'] = train_df['Pawpularity']/100
train_df.head()

Unnamed: 0,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,path,norm_score
0,0,1,1,1,0,0,0,0,0,0,0,0,40,src/inputs/train/48d53aeabee4f92f77eee3a323343c77.jpg,0.4
1,0,1,1,0,0,0,0,1,0,0,0,0,64,src/inputs/train/b60e82fd313066b801fa4431d1ce4f4e.jpg,0.64
2,0,1,1,1,0,1,0,0,0,0,0,0,26,src/inputs/train/09ae71fc4eda1e0ae05680d1950bc009.jpg,0.26
3,0,0,1,1,0,0,1,0,0,0,0,0,73,src/inputs/train/263879abce68de4af02ef5f7ef873d24.jpg,0.73
4,0,1,1,1,0,0,1,0,0,0,0,0,39,src/inputs/train/4ea0587a137c7983ca92851b6cb36ca1.jpg,0.39


In [68]:

def rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

In [69]:

def make_stratified_folds(df: pd.DataFrame, target_col: str, n_folds: int, seed: int) -> pd.DataFrame:
    num_bins = 10
    bins = pd.cut(df[target_col], bins=num_bins, labels=False, include_lowest=True)
    df = df.copy()
    df["bins"] = bins.values
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    df["fold"] = -1
    for fold, (_, val_idx) in enumerate(skf.split(df, df["bins"])):
        df.loc[df.index[val_idx], "fold"] = fold
    return df.drop(columns=["bins"])


In [72]:

# -----------------------------
# Dataset
# -----------------------------
class PetDataset(Dataset):
    def __init__(self, df: pd.DataFrame, data_dir: str, img_dirname: str, meta_cols, target_col, scaler: StandardScaler, transforms, label_scale: float):
        self.df = df.reset_index(drop=True)
        self.data_dir = data_dir
        self.img_dir = os.path.join(data_dir, img_dirname)
        self.meta_cols = list(meta_cols)
        self.target_col = target_col
        self.transforms = transforms
        self.scaler = scaler
        self.label_scale = label_scale

        self.meta = self.scaler.transform(self.df[self.meta_cols].astype(float).values)
        self.targets = (self.df[self.target_col].values.astype(np.float32) / self.label_scale).reshape(-1, 1)
        self.ids = self.df["Id"].values

    def __len__(self):
        return len(self.df)

    def _read_image(self, img_id):
        path = os.path.join(self.img_dir, f"{img_id}.jpg")
        img = cv2.imread(path, cv2.IMREAD_COLOR)
        if img is None:
            raise FileNotFoundError(f"Image not found: {path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return img

    def __getitem__(self, idx):
        img_id = self.ids[idx]
        image = self._read_image(img_id)
        meta = self.meta[idx].astype(np.float32)
        target = self.targets[idx].astype(np.float32)

        if self.transforms:
            out = self.transforms(image=image)
            image = out["image"]

        meta_tensor = torch.tensor(meta, dtype=torch.float32)
        target_tensor = torch.tensor(target, dtype=torch.float32).squeeze(-1)
        return image, meta_tensor, target_tensor, img_id


In [73]:

def get_transforms(image_size: int, is_train: bool):
    if is_train:
        return A.Compose([
            A.RandomResizedCrop(image_size, image_size, scale=(0.8, 1.0), ratio=(0.8, 1.25), p=1.0),
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.2),
            A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=20, border_mode=cv2.BORDER_REFLECT_101, p=0.7),
            A.ColorJitter(brightness=0.25, contrast=0.25, saturation=0.2, hue=0.05, p=0.7),
            A.OneOf([
                A.CoarseDropout(max_holes=8, max_height=int(0.1*image_size), max_width=int(0.1*image_size), p=1.0),
                A.Cutout(num_holes=6, max_h_size=int(0.1*image_size), max_w_size=int(0.1*image_size), p=1.0),
            ], p=0.5),
            A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
            ToTensorV2(),
        ])
    else:
        return A.Compose([
            A.LongestMaxSize(max_size=image_size, interpolation=cv2.INTER_CUBIC),
            A.PadIfNeeded(image_size, image_size, border_mode=cv2.BORDER_REFLECT_101),
            A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
            ToTensorV2(),
        ])


In [74]:

# -----------------------------
# Model: Image Encoder + Cross-Attention Fusion + Regressor
# -----------------------------
class ImageBackbone(nn.Module):
    """ConvNeXtV2 backbone via timm; outputs a global embedding."""
    def __init__(self, model_name: str, pretrained: bool = True):
        super().__init__()
        self.backbone = timm.create_model(model_name, pretrained=pretrained, num_classes=0, global_pool='avg')
        self.out_dim = self.backbone.num_features

    def forward(self, x):
        return self.backbone(x)  # (B, C)


In [75]:

class CrossAttentionFusion(nn.Module):
    """
    Cross-attention between image tokens and metadata tokens.
    - Image embedding -> n_img_tokens via linear projection.
    - Metadata scalars -> tokens via linear + feature embeddings.
    """
    def __init__(self, d_img: int, n_meta: int, d_model: int = 512, n_heads: int = 8, n_img_tokens: int = 4, dropout: float = 0.1):
        super().__init__()
        self.n_meta = n_meta
        self.d_model = d_model
        self.n_img_tokens = n_img_tokens

        self.img_proj = nn.Linear(d_img, d_model * n_img_tokens)
        self.img_ln = nn.LayerNorm(d_model)

        self.meta_scalar_proj = nn.Linear(1, d_model)
        self.meta_feature_embed = nn.Embedding(n_meta, d_model)

        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True, dropout=dropout)
        self.attn_ln = nn.LayerNorm(d_model)

        self.img_align = nn.Linear(d_img, d_model)
        self.gate = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.SiLU(),
            nn.Linear(d_model, d_model),
            nn.Sigmoid()
        )

        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model * 4, d_model),
        )
        self.ffn_ln = nn.LayerNorm(d_model)

    def forward(self, img_vec: torch.Tensor, meta_vec: torch.Tensor):
        # img_vec: (B, d_img); meta_vec: (B, n_meta)
        B, n_meta = meta_vec.shape
        assert n_meta == self.n_meta

        img_tokens = self.img_proj(img_vec).view(B, self.n_img_tokens, self.d_model)
        img_tokens = self.img_ln(img_tokens)

        feature_ids = torch.arange(n_meta, device=meta_vec.device).unsqueeze(0).expand(B, -1)
        meta_tokens = self.meta_scalar_proj(meta_vec.unsqueeze(-1)) + self.meta_feature_embed(feature_ids)

        attn_out, _ = self.attn(query=meta_tokens, key=img_tokens, value=img_tokens, need_weights=False)
        x = self.attn_ln(meta_tokens + attn_out)
        pooled_meta = x.mean(dim=1)

        img_aligned = self.img_align(img_vec)
        g = self.gate(pooled_meta)
        fused = pooled_meta + g * img_aligned
        fused = self.ffn_ln(fused + self.ffn(fused))
        return fused  # (B, d_model)


In [76]:

class PawpularityModel(nn.Module):
    def __init__(self, backbone_name: str, n_meta: int, d_model: int, n_heads: int, n_img_tokens: int, dropout: float, pretrained: bool = True):
        super().__init__()
        self.backbone = ImageBackbone(backbone_name, pretrained=pretrained)
        self.fusion = CrossAttentionFusion(
            d_img=self.backbone.out_dim,
            n_meta=n_meta,
            d_model=d_model,
            n_heads=n_heads,
            n_img_tokens=n_img_tokens,
            dropout=dropout
        )
        self.head = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, 1)
        )

    def forward(self, images, meta):
        img_vec = self.backbone(images)
        fused = self.fusion(img_vec, meta)
        out = self.head(fused).squeeze(1)
        return out


In [77]:

# -----------------------------
# Training helpers
# -----------------------------
class EarlyStopping:
    def __init__(self, patience: int = 5, min_delta: float = 0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = None
        self.counter = 0
        self.should_stop = False

    def step(self, metric: float) -> bool:
        if self.best is None or (self.best - metric) > self.min_delta:
            self.best = metric
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.should_stop = True
        return self.should_stop


In [78]:

def train_one_epoch(model, loader, optimizer, scaler, device, loss_fn, max_grad_norm=None):
    model.train()
    total_loss = 0.0
    for images, meta, targets, _ in loader:
        images = images.to(device)
        meta = meta.to(device)
        targets = targets.to(device).float()

        optimizer.zero_grad(set_to_none=True)
        if scaler is not None:
            with torch.cuda.amp.autocast():
                preds = model(images, meta)
                loss = loss_fn(preds, targets)
            scaler.scale(loss).backward()
            if max_grad_norm is not None:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
        else:
            preds = model(images, meta)
            loss = loss_fn(preds, targets)
            loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

        total_loss += loss.item() * images.size(0)
    return total_loss / len(loader.dataset)

@torch.no_grad()
def validate(model, loader, device, loss_fn, label_scale: float):
    model.eval()
    total_loss = 0.0
    all_preds, all_targets, all_ids = [], [], []
    for images, meta, targets, ids in loader:
        images = images.to(device)
        meta = meta.to(device)
        targets = targets.to(device).float()
        preds = model(images, meta)
        loss = loss_fn(preds, targets)
        total_loss += loss.item() * images.size(0)

        preds_np = (preds.detach().cpu().numpy() * label_scale).clip(0, label_scale)
        targs_np = (targets.detach().cpu().numpy() * label_scale)
        all_preds.append(preds_np)
        all_targets.append(targs_np)
        all_ids.extend(ids)

    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)
    val_rmse = rmse(all_targets, all_preds)
    avg_loss = total_loss / len(loader.dataset)
    return avg_loss, val_rmse, all_preds, all_targets, all_ids


In [79]:
import utils.helpers as helpers
# Main (K-Fold)
# -----------------------------

parser = argparse.ArgumentParser()
parser.add_argument("--data_dir", type=str, required=True)
parser.add_argument("--out_dir", type=str, default="./outputs")
parser.add_argument("--img_dirname", type=str, default="train")
parser.add_argument("--csv_name", type=str, default="train.csv")

# Model & training hyperparams
parser.add_argument("--image_size", type=int, default=384)
parser.add_argument("--backbone", type=str, default="convnextv2_base.fcmae_ft_in22k_in1k_384")
parser.add_argument("--pretrained", action="store_true", default=True)
parser.add_argument("--n_folds", type=int, default=5)
parser.add_argument("--epochs", type=int, default=20)
parser.add_argument("--batch_size", type=int, default=16)
parser.add_argument("--num_workers", type=int, default=4)
parser.add_argument("--lr", type=float, default=2e-4)
parser.add_argument("--min_lr", type=float, default=1e-6)
parser.add_argument("--weight_decay", type=float, default=0.05)
parser.add_argument("--patience", type=int, default=5)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--no_amp", action="store_true", help="Disable mixed precision")

# Fusion hyperparams
parser.add_argument("--d_model", type=int, default=512)
parser.add_argument("--n_heads", type=int, default=8)
parser.add_argument("--n_img_tokens", type=int, default=4)
parser.add_argument("--dropout", type=float, default=0.1)

args = parser.parse_args()

# Simple config variables (no dataclass)
data_dir = args.data_dir
out_dir = args.out_dir
img_dirname = args.img_dirname
csv_name = args.csv_name

image_size = args.image_size
backbone = args.backbone
pretrained = args.pretrained
n_folds = args.n_folds
epochs = args.epochs
batch_size = args.batch_size
num_workers = args.num_workers
lr = args.lr
min_lr = args.min_lr
weight_decay = args.weight_decay
patience = args.patience
seed = args.seed
mixed_precision = (not args.no_amp)
grad_clip_norm = 1.0
label_scale = 100.0  # normalize targets to [0,1] during training

# Fusion params
d_model = args.d_model
n_heads = args.n_heads
n_img_tokens = args.n_img_tokens
dropout = args.dropout

# PetFinder metadata columns (12)
meta_cols = (
    "Subject Focus","Eyes","Face","Near","Action","Accessory",
    "Group","Collage","Human","Occlusion","Info","Blur"
)
target_col = "Pawpularity"

os.makedirs(out_dir, exist_ok=True)
helpers.set_seeds(seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load data
csv_path = os.path.join(data_dir, csv_name)
df = pd.read_csv(csv_path)

# Check required columns
needed_cols = ["Id", target_col, *meta_cols]
for c in needed_cols:
    if c not in df.columns:
        raise ValueError(f"Column '{c}' not found in {csv_path}. Expected PetFinder format.")

# Make folds
df_folds = make_stratified_folds(df, target_col, n_folds, seed)

# Fit meta scaler on all rows for stability (then used in each split)
scaler = StandardScaler().fit(df_folds[list(meta_cols)].astype(float).values)

fold_metrics = []

print("===== SIMPLE CONFIG =====")
print({
    "data_dir": data_dir, "out_dir": out_dir, "img_dirname": img_dirname, "csv_name": csv_name,
    "image_size": image_size, "backbone": backbone, "n_folds": n_folds, "epochs": epochs,
    "batch_size": batch_size, "lr": lr, "weight_decay": weight_decay, "min_lr": min_lr,
    "patience": patience, "seed": seed, "d_model": d_model, "n_heads": n_heads,
    "n_img_tokens": n_img_tokens, "dropout": dropout, "mixed_precision": mixed_precision
})

for fold in range(n_folds):
    print(f"\n========== Fold {fold} / {n_folds} ==========")
    trn_df = df_folds[df_folds["fold"] != fold].reset_index(drop=True)
    val_df = df_folds[df_folds["fold"] == fold].reset_index(drop=True)

    train_ds = PetDataset(trn_df, data_dir, img_dirname, meta_cols, target_col, scaler,
                            transforms=get_transforms(image_size, is_train=True), label_scale=label_scale)
    val_ds   = PetDataset(val_df, data_dir, img_dirname, meta_cols, target_col, scaler,
                            transforms=get_transforms(image_size, is_train=False), label_scale=label_scale)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
                                num_workers=num_workers, pin_memory=True, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size*2, shuffle=False,
                            num_workers=num_workers, pin_memory=True, drop_last=False)

    # Model
    model = PawpularityModel(
        backbone_name=backbone,
        n_meta=len(meta_cols),
        d_model=d_model,
        n_heads=n_heads,
        n_img_tokens=n_img_tokens,
        dropout=dropout,
        pretrained=pretrained
    ).to(device)

    # Optimizer & Cosine LR (simple)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max(1, epochs), eta_min=min_lr)
    loss_fn = nn.SmoothL1Loss(beta=0.5)  # Huber loss
    scaler_amp = torch.cuda.amp.GradScaler() if (mixed_precision and device.type == "cuda") else None
    early_stopper = EarlyStopping(patience=patience, min_delta=0.0)

    # Checkpointing
    fold_dir = os.path.join(out_dir, f"fold_{fold}")
    os.makedirs(fold_dir, exist_ok=True)
    best_ckpt_path = os.path.join(fold_dir, "best_model.pt")

    best_rmse = float("inf")
    history = []

    for epoch in range(epochs):
        tr_loss = train_one_epoch(model, train_loader, optimizer, scaler_amp, device, loss_fn, max_grad_norm=1.0)
        val_loss, val_rmse, val_preds, val_targs, val_ids = validate(model, val_loader, device, loss_fn, label_scale)

        scheduler.step()

        history.append({
            "epoch": epoch,
            "train_loss": tr_loss,
            "val_loss": val_loss,
            "val_rmse": val_rmse,
            "lr": optimizer.param_groups[0]["lr"]
        })
        print(f"Epoch {epoch:02d} | TrainLoss {tr_loss:.4f} | ValLoss {val_loss:.4f} | ValRMSE {val_rmse:.4f}")

        if val_rmse < best_rmse:
            best_rmse = val_rmse
            torch.save({
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
                "cfg": {
                    "backbone": backbone, "image_size": image_size, "meta_cols": meta_cols,
                    "d_model": d_model, "n_heads": n_heads, "n_img_tokens": n_img_tokens,
                    "dropout": dropout, "label_scale": label_scale
                },
                "fold": fold,
                "epoch": epoch,
                "best_val_rmse": best_rmse,
                "scaler_mean": scaler.mean_,
                "scaler_scale": scaler.scale_,
            }, best_ckpt_path)

        if early_stopper.step(val_rmse):
            print(f"Early stopping at epoch {epoch}. Best Val RMSE so far: {best_rmse:.4f}")
            break

    # Load best and re-evaluate on validation for final fold RMSE
    best = torch.load(best_ckpt_path, map_location=device)
    model.load_state_dict(best["model_state"])
    _, final_val_rmse, val_preds, val_targs, val_ids = validate(model, val_loader, device, loss_fn, label_scale)

    print(f"[FOLD {fold}] Best Val RMSE: {final_val_rmse:.4f}")
    fold_metrics.append(final_val_rmse)

    # Save logs and out-of-fold preds
    pd.DataFrame(history).to_csv(os.path.join(fold_dir, "training_log.csv"), index=False)
    pd.DataFrame({
        "Id": val_ids,
        "target": val_targs.astype(np.float32),
        "pred": val_preds.astype(np.float32),
        "fold": fold
    }).to_csv(os.path.join(fold_dir, "oof_val_preds.csv"), index=False)

print("\n===== CV SUMMARY =====")
for f, m in enumerate(fold_metrics):
    print(f"Fold {f}: Val RMSE = {m:.4f}")
print(f"Mean RMSE: {np.mean(fold_metrics):.4f} | Std: {np.std(fold_metrics):.4f}")

pd.DataFrame({"fold": list(range(n_folds)), "val_rmse": fold_metrics}).to_csv(os.path.join(out_dir, "cv_summary.csv"), index=False)




usage: ipykernel_launcher.py [-h] --data_dir DATA_DIR [--out_dir OUT_DIR]
                             [--img_dirname IMG_DIRNAME] [--csv_name CSV_NAME]
                             [--image_size IMAGE_SIZE] [--backbone BACKBONE]
                             [--pretrained] [--n_folds N_FOLDS]
                             [--epochs EPOCHS] [--batch_size BATCH_SIZE]
                             [--num_workers NUM_WORKERS] [--lr LR]
                             [--min_lr MIN_LR] [--weight_decay WEIGHT_DECAY]
                             [--patience PATIENCE] [--seed SEED] [--no_amp]
                             [--d_model D_MODEL] [--n_heads N_HEADS]
                             [--n_img_tokens N_IMG_TOKENS] [--dropout DROPOUT]
ipykernel_launcher.py: error: the following arguments are required: --data_dir


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
