# DL Lab 1: TOP1 Pipeline (Clean Train + Group-aware CV + Ensemble)

Этот ноутбук делает полный production-like пайплайн для соревнования:
- использует очищенный `train` (`strict/aggressive`) и **не трогает** test-файлы;
- делает валидацию с учетом структуры `class + plu`;
- обучает несколько архитектур по фолдам;
- выбирает модели по OOF-метрикам;
- строит weighted-ensemble и готовит submission.

## Основной принцип
Финальный выбор моделей/весов делается по локальной OOF/CV, а не по случайному росту public leaderboard.

## 0) Перед запуском

1. Проверьте пути в `CFG`.
2. Убедитесь, что установлены пакеты (`torch`, `timm`, `albumentations`, `scikit-learn`, `pandas`, `numpy`, `pillow`, `tqdm`).
3. Запускайте обучение сначала на `strict` cleaned train.
4. `test`-директорию не меняем, не чистим, не дополняем.

In [None]:
# Если пакетов нет, запустите в отдельной ячейке (локально или на Kaggle):
# !pip install -U torch torchvision timm albumentations scikit-learn pandas numpy pillow tqdm

In [None]:
from __future__ import annotations

import os
import gc
import json
import math
import random
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

import albumentations as A
from albumentations.pytorch import ToTensorV2

import timm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [None]:
# Backbones that take part in the ensemble. Every entry names the timm model
# and carries its own input size and dropout / stochastic-depth rates.
_DEFAULT_MODEL_ZOO: Tuple[Dict, ...] = (
    dict(
        alias='convnext_small',
        timm_name='convnext_small.fb_in22k_ft_in1k',
        img_size=224,
        drop_rate=0.2,
        drop_path_rate=0.2,
    ),
    dict(
        alias='effnetv2_s',
        timm_name='tf_efficientnetv2_s.in21k_ft_in1k',
        img_size=224,
        drop_rate=0.2,
        drop_path_rate=0.2,
    ),
    dict(
        alias='resnet50',
        timm_name='resnet50.a1_in1k',
        img_size=224,
        drop_rate=0.1,
        drop_path_rate=0.1,
    ),
)


@dataclass
class CFG:
    """Single source of truth for paths, CV, training, augmentation and ensembling."""

    seed: int = 42

    # --- Paths -------------------------------------------------------------
    root: str = '/kaggle/input/dl-lab-1-image-classification'  # example Kaggle path
    local_root: str = '/Users/fedorgracev/Desktop/NeuralNetwork/dl_lab1/unzipped'  # example local path
    use_local_paths: bool = True

    # --- Cleaned train variant ---------------------------------------------
    # 'strict'     -> drop only obvious noise
    # 'aggressive' -> additionally excludes quarantine rows
    # 'raw'        -> untouched train.csv
    clean_variant: str = 'strict'

    # --- Cross-validation --------------------------------------------------
    n_folds: int = 5
    fold_seed: int = 42

    # --- Training ----------------------------------------------------------
    num_workers: int = 0
    batch_size: int = 48
    epochs: int = 14
    warmup_epochs: int = 2
    lr: float = 3e-4
    weight_decay: float = 1e-4
    label_smoothing: float = 0.05

    # --- Augmentation ------------------------------------------------------
    img_size: int = 224
    use_mixup: bool = True
    mixup_alpha: float = 0.2
    mixup_prob: float = 0.3

    # --- Runtime flags -----------------------------------------------------
    run_training: bool = True
    run_inference: bool = False

    # --- Ensemble weight search --------------------------------------------
    ensemble_trials: int = 5000

    # --- Output ------------------------------------------------------------
    out_dir: str = '/Users/fedorgracev/Desktop/NeuralNetwork/dl_lab1/outputs_top1_mps'

    # --- Model zoo for the ensemble ----------------------------------------
    model_configs: Tuple[Dict, ...] = _DEFAULT_MODEL_ZOO

# Rebind the name to a live instance: from here on `CFG` is the settings
# object, and the class is no longer reachable under this name.
CFG = CFG()

# Integer settings that can be overridden through environment variables, so
# smoke/full runs do not require editing the notebook.
for _env_name, _field in (
    ('BATCH_SIZE', 'batch_size'),
    ('EPOCHS', 'epochs'),
    ('N_FOLDS', 'n_folds'),
    ('NUM_WORKERS', 'num_workers'),
    ('ENSEMBLE_TRIALS', 'ensemble_trials'),
):
    setattr(CFG, _field, int(os.getenv(_env_name, getattr(CFG, _field))))

# Boolean switches: only the literal value '1' turns a flag on; any other
# value that is set turns it off; an unset variable keeps the default.
for _env_name, _field in (
    ('RUN_TRAINING', 'run_training'),
    ('RUN_INFERENCE', 'run_inference'),
):
    _raw_flag = os.getenv(_env_name)
    if _raw_flag is not None:
        setattr(CFG, _field, _raw_flag == '1')

_variant_override = os.getenv('CLEAN_VARIANT')
if _variant_override is not None:
    CFG.clean_variant = _variant_override

# MAX_MODELS > 0 keeps only the first N entries of the model zoo.
max_models = int(os.getenv('MAX_MODELS', '0'))
if max_models > 0:
    CFG.model_configs = CFG.model_configs[:max_models]

print(asdict(CFG))


In [None]:
def resolve_paths(cfg: CFG) -> Dict[str, Path]:
    """Build the dataset locations used by the notebook.

    The base directory is ``cfg.local_root`` when ``cfg.use_local_paths`` is
    set, otherwise ``cfg.root``. The train CSV depends on
    ``cfg.clean_variant``: the 'strict' and 'aggressive' variants live under
    ``cleaning/``; any other value selects the raw ``train.csv``.

    Returns:
        Mapping from a short name to a ``Path`` (no existence check is done).
    """
    base = Path(cfg.local_root) if cfg.use_local_paths else Path(cfg.root)
    cleaning_dir = base / 'cleaning'

    cleaned_csv_names = {
        'strict': 'train_clean_strict.csv',
        'aggressive': 'train_clean_aggressive.csv',
    }
    variant = cfg.clean_variant
    if variant in cleaned_csv_names:
        train_csv = cleaning_dir / cleaned_csv_names[variant]
    else:
        train_csv = base / 'train.csv'

    return {
        'base': base,
        'train_csv': train_csv,
        'test_csv': base / 'test.csv',
        'sample_submission': base / 'sample_submission.csv',
        'train_images_dir': base / 'train' / 'train',
        'test_images_dir': base / 'test_images' / 'test_images',
        'clean_manifest': cleaning_dir / 'train_clean_manifest.csv',
    }

# Resolve every location once and report which of them exist on disk.
paths = resolve_paths(CFG)
for _path_name, _location in paths.items():
    print(f'{_path_name}: {_location} | exists={_location.exists()}')

In [None]:
def seed_everything(seed: int = 42):
    """Seed the Python, NumPy and PyTorch RNGs and ask cuDNN for deterministic kernels.

    Also exports ``PYTHONHASHSEED`` with the same value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # One seeding call per RNG family used by the notebook.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

seed_everything(CFG.seed)
torch.set_float32_matmul_precision('high')

# Pick an accelerator: Apple MPS first (the original target of this notebook),
# then CUDA. The training code already carries CUDA-specific branches (AMP,
# pinned memory), so a CUDA machine such as Kaggle should not be rejected just
# because MPS is absent. `getattr` guards against torch builds that ship
# without the MPS backend module.
_mps_backend = getattr(torch.backends, 'mps', None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    # Still fail fast when no accelerator is visible.
    raise RuntimeError(
        'Neither MPS nor CUDA is available in current process. '
        'Run notebook outside sandbox / with full system access.'
    )

print('device:', device)


In [None]:
def load_train_df(paths: Dict[str, Path]) -> pd.DataFrame:
    """Read the train CSV and derive ``class_name`` / ``plu`` / ``file_name``.

    ``image_id`` is split on '/' and its first three components become the
    three derived columns. ``label`` is cast to ``int``.

    Raises:
        FileNotFoundError: If any of the first 2000 ``image_id`` entries is
            missing under ``paths['train_images_dir']``.
    """
    frame = pd.read_csv(paths['train_csv'])
    frame['label'] = frame['label'].astype(int)

    id_parts = frame['image_id'].str.split('/', expand=True)
    for position, column in enumerate(('class_name', 'plu', 'file_name')):
        frame[column] = id_parts[position]

    # Cheap sanity check: only the head of the table is probed on disk.
    images_root = paths['train_images_dir']
    absent = [rel for rel in frame['image_id'].head(2000) if not (images_root / rel).exists()]
    if absent:
        raise FileNotFoundError(f'Found missing train files, e.g. {absent[:3]}')

    return frame


def load_test_df(paths: Dict[str, Path]) -> pd.DataFrame:
    """Read the test CSV as-is after probing its first rows on disk.

    Raises:
        FileNotFoundError: If any of the first 2000 ``image_id`` entries is
            missing under ``paths['test_images_dir']``.
    """
    frame = pd.read_csv(paths['test_csv'])

    images_root = paths['test_images_dir']
    absent = [rel for rel in frame['image_id'].head(2000) if not (images_root / rel).exists()]
    if absent:
        raise FileNotFoundError(f'Found missing test files, e.g. {absent[:3]}')

    return frame

# Load both tables once; they are shared by every later cell.
train_df = load_train_df(paths)
test_df = load_test_df(paths)

# Quick overview of what was loaded.
print(f'train rows: {len(train_df)}')
print(f'test rows : {len(test_df)}')
print(f"classes   : {train_df['label'].nunique()}")
print(train_df.head(3))

In [None]:
# Label id -> class name lookup, plus per-class weights for CrossEntropyLoss.
_label_names = train_df[['label', 'class_name']].drop_duplicates().sort_values('label')
label_to_class = _label_names.set_index('label')['class_name'].to_dict()

label_counts = train_df['label'].value_counts().sort_index()
num_classes = len(label_counts)
N = len(train_df)

# Inverse-frequency weights N / (num_classes * count), then rescaled so that
# the largest weight (the rarest class) equals 1.
class_weights = {
    label: N / (num_classes * count)
    for label, count in label_counts.to_dict().items()
}
max_w = max(class_weights.values())
class_weights = {label: weight / max_w for label, weight in class_weights.items()}

# Dense tensor indexed by label id; requires labels to be exactly 0..num_classes-1.
class_weights_tensor = torch.tensor(
    [class_weights[class_idx] for class_idx in range(num_classes)],
    dtype=torch.float32,
)

print('label_to_class:', label_to_class)
print('class_counts:')
print(label_counts)
print('class_weights (normalized):', class_weights)

## 1) Fold strategy

Используем стратификацию по `label + plu` (ключ вида `"11_4314547"`).
Это учитывает идею преподавателя: не только класс, но и подгруппа внутри класса.

Дополнительно сохраняем folds на диск для воспроизводимости.

In [None]:
def make_folds(df: pd.DataFrame, n_splits: int, seed: int) -> pd.DataFrame:
    """Return a copy of ``df`` with ``fold`` and ``strat_key`` columns added.

    Rows are assigned to folds by a shuffled ``StratifiedKFold`` that
    stratifies on the combined key ``"<label>_<plu>"``. The input frame is
    not modified.
    """
    folded = df.copy()
    folded['fold'] = -1
    folded['strat_key'] = folded['label'].astype(str) + '_' + folded['plu'].astype(str)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    split_iter = splitter.split(folded, folded['strat_key'])
    for fold_id, (_, holdout_positions) in enumerate(split_iter):
        # The splitter yields positional indices; translate them to index labels.
        holdout_labels = folded.index[holdout_positions]
        folded.loc[holdout_labels, 'fold'] = fold_id

    # Every row must have landed in exactly one validation fold.
    assert (folded['fold'] >= 0).all()
    return folded

fold_df = make_folds(train_df, CFG.n_folds, CFG.fold_seed)

# Persist the assignment so later runs can be compared on identical folds.
out_dir = Path(CFG.out_dir)
out_dir.mkdir(parents=True, exist_ok=True)
folds_path = out_dir / 'folds_{}_{}f.csv'.format(CFG.clean_variant, CFG.n_folds)
fold_df.to_csv(folds_path, index=False)
print(f'saved: {folds_path}')

# Rows per fold.
_rows_per_fold = fold_df.groupby('fold')['label'].count()
print(_rows_per_fold)

In [None]:
def check_fold_balance(df: pd.DataFrame):
    """Summarise class balance per fold.

    Returns:
        DataFrame with one row per fold: ``fold``, number of rows ``n`` and
        the min / max / std of the per-class sample counts inside the fold.
    """
    def _summarise(fold_id):
        fold_rows = df[df['fold'] == fold_id]
        per_class = fold_rows['label'].value_counts().sort_index()
        return {
            'fold': fold_id,
            'n': len(fold_rows),
            'min_cls': per_class.min(),
            'max_cls': per_class.max(),
            'std_cls': per_class.std(),
        }

    records = [_summarise(fold_id) for fold_id in sorted(df['fold'].unique())]
    return pd.DataFrame(records, columns=['fold', 'n', 'min_cls', 'max_cls', 'std_cls'])

# Per-fold sanity report: row count plus min/max/std of per-class sample counts.
balance_df = check_fold_balance(fold_df)
print(balance_df)

## 2) Augmentations

Подход:
- умеренно сильные геометрические и цветовые искажения;
- без слишком агрессивной деформации формы;
- `CoarseDropout` для устойчивости к окклюзии;
- валидация без stochastic augmentations.

In [None]:
def build_train_tfms(img_size: int) -> A.Compose:
    """Build the stochastic training pipeline ending in a normalised tensor.

    Stages, in order: random resized crop to ``img_size``, horizontal flip,
    shift/scale/rotate, one colour perturbation, one quality degradation,
    coarse dropout, normalisation, tensor conversion.
    """
    crop = A.RandomResizedCrop(
        size=(img_size, img_size),
        scale=(0.65, 1.0),
        ratio=(0.8, 1.25),
        p=1.0,
    )
    flip = A.HorizontalFlip(p=0.5)
    affine = A.ShiftScaleRotate(
        shift_limit=0.05,
        scale_limit=0.15,
        rotate_limit=20,
        border_mode=0,
        p=0.35,
    )

    # Exactly one colour transform is applied when this group fires.
    colour = A.OneOf(
        [
            A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=1.0),
            A.HueSaturationValue(hue_shift_limit=12, sat_shift_limit=20, val_shift_limit=20, p=1.0),
            A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=1.0),
        ],
        p=0.6,
    )

    # Exactly one quality degradation (blur / noise / JPEG) when this group fires.
    degrade = A.OneOf(
        [
            A.GaussianBlur(blur_limit=(3, 5), p=1.0),
            A.GaussNoise(std_range=(0.02, 0.08), p=1.0),
            A.ImageCompression(quality_range=(70, 100), p=1.0),
        ],
        p=0.3,
    )

    # Random rectangular holes for robustness to occlusion.
    occlude = A.CoarseDropout(
        num_holes_range=(1, 6),
        hole_height_range=(0.04, 0.12),
        hole_width_range=(0.04, 0.12),
        fill=0,
        p=0.2,
    )

    normalise = A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    to_tensor = ToTensorV2()

    return A.Compose([crop, flip, affine, colour, degrade, occlude, normalise, to_tensor])


def build_valid_tfms(img_size: int) -> A.Compose:
    """Build the deterministic validation/test pipeline.

    Resizes the shorter side to ``img_size``, takes a centre crop of
    ``img_size`` x ``img_size``, normalises and converts to a tensor.
    """
    channel_mean = (0.485, 0.456, 0.406)
    channel_std = (0.229, 0.224, 0.225)

    steps = [
        A.SmallestMaxSize(max_size=img_size),
        A.CenterCrop(height=img_size, width=img_size),
        A.Normalize(mean=channel_mean, std=channel_std),
        ToTensorV2(),
    ]
    return A.Compose(steps)


In [None]:
class FruitDataset(Dataset):
    """Image dataset backed by a DataFrame with an ``image_id`` column.

    ``image_id`` is a path relative to ``img_dir``. In test mode an item is
    ``(image, image_id)``; otherwise it is ``(image, label)`` with the label
    taken from the ``label`` column.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        img_dir: Path,
        transform: Optional[A.Compose] = None,
        is_test: bool = False,
    ):
        # A fresh 0..n-1 index keeps positional access independent of the caller's index.
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx: int):
        record = self.df.iloc[idx]
        image_id = record['image_id']

        # Decode to an RGB array; the transform (if any) is called with it as `image=`.
        pil_image = Image.open(self.img_dir / image_id).convert('RGB')
        image = np.array(pil_image)

        if self.transform is not None:
            image = self.transform(image=image)['image']

        if self.is_test:
            return image, image_id
        return image, int(record['label'])

In [None]:
def mixup_data(x, y, alpha=0.2):
    """Blend each sample of a batch with a randomly chosen partner.

    Returns ``(mixed_x, y_a, y_b, lam)`` where ``lam`` is drawn from
    ``Beta(alpha, alpha)``, ``y_a`` are the original targets and ``y_b`` the
    targets of the partners. With ``alpha <= 0`` the batch is returned
    unchanged with ``lam = 1.0``.
    """
    if alpha <= 0:
        return x, y, y, 1.0

    lam = np.random.beta(alpha, alpha)
    # One permutation pairs every sample with its partner.
    partner = torch.randperm(x.size(0), device=x.device)
    blended = lam * x + (1 - lam) * x[partner, :]
    return blended, y, y[partner], lam


def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Return the ``lam``-weighted mix of the loss against both target sets."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

## 3) Model, scheduler, train/eval loops

In [None]:
def create_model(timm_name: str, num_classes: int, drop_rate: float = 0.2, drop_path_rate: float = 0.2):
    """Create a pretrained timm model with a ``num_classes``-way head.

    Args:
        timm_name: timm model identifier to try first.
        num_classes: Size of the classification head.
        drop_rate: Forwarded to ``timm.create_model``.
        drop_path_rate: Forwarded to ``timm.create_model``.

    Returns:
        The model built from ``timm_name``, or from its fallback name if the
        first attempt failed and a fallback is registered.

    Raises:
        Exception: The original error when ``timm_name`` fails and has no
            fallback name; the fallback's error when both attempts fail.
    """
    # Untagged names kept for portability when the tagged name cannot be
    # built. NOTE(review): the untagged name may resolve to different
    # pretrained weights than the tagged one - confirm against the timm docs.
    fallback = {
        'convnext_small.fb_in22k_ft_in1k': 'convnext_small',
        'tf_efficientnetv2_s.in21k_ft_in1k': 'tf_efficientnetv2_s',
        'resnet50.a1_in1k': 'resnet50',
    }
    build_kwargs = dict(
        pretrained=True,
        num_classes=num_classes,
        drop_rate=drop_rate,
        drop_path_rate=drop_path_rate,
    )

    try:
        return timm.create_model(timm_name, **build_kwargs)
    except Exception as err:
        fallback_name = fallback.get(timm_name)
        if fallback_name is None:
            # No alternative name is known, so surface the original error
            # instead of repeating the identical call.
            raise
        # Make the substitution visible: a silent swap would change what the
        # experiment actually trained.
        print(
            f"[create_model] '{timm_name}' failed ({type(err).__name__}: {err}); "
            f"falling back to '{fallback_name}'"
        )
        return timm.create_model(fallback_name, **build_kwargs)


class WarmupCosineScheduler(torch.optim.lr_scheduler._LRScheduler):
    """Per-epoch schedule: linear warm-up, then cosine decay towards ``min_lr``.

    Epochs are counted from 1 (``last_epoch + 1``). For epochs up to
    ``warmup_epochs`` the rate is ``base_lr * epoch / warmup_epochs``; after
    that it follows a half cosine that reaches ``min_lr`` at ``total_epochs``.
    """

    def __init__(self, optimizer, total_epochs, warmup_epochs=2, min_lr=1e-6, last_epoch=-1):
        # These must exist before the base constructor runs, because it
        # already performs an initial step that calls get_lr().
        self.total_epochs = total_epochs
        self.warmup_epochs = warmup_epochs
        self.min_lr = min_lr
        super().__init__(optimizer, last_epoch)

    def _rate_at(self, base_lr, epoch):
        """Learning rate for one parameter group at the given 1-based epoch."""
        if epoch <= self.warmup_epochs:
            return base_lr * epoch / max(1, self.warmup_epochs)

        decay_span = max(1, self.total_epochs - self.warmup_epochs)
        progress = (epoch - self.warmup_epochs) / decay_span
        cosine = 0.5 * (1 + math.cos(math.pi * progress))
        return self.min_lr + (base_lr - self.min_lr) * cosine

    def get_lr(self):
        epoch = self.last_epoch + 1
        return [self._rate_at(base_lr, epoch) for base_lr in self.base_lrs]

In [None]:
@torch.no_grad()
def predict_logits(model, loader, device, tta: bool = False):
    """Run ``model`` over ``loader`` and collect logits together with sample ids.

    ``loader`` must yield ``(images, ids)`` batches. With ``tta=True`` the
    logits of each batch are averaged with those of its horizontally flipped
    copy.

    Returns:
        ``(logits, ids)`` - a NumPy array of all logits stacked along axis 0
        and a flat list of the ids in the same order.
    """
    model.eval()
    logit_chunks, sample_ids = [], []

    for images, batch_ids in tqdm(loader, leave=False):
        images = images.to(device)
        batch_logits = model(images)

        if tta:
            # Flip along dim 3, i.e. the last axis of the 4-D batch (horizontal flip).
            mirrored = torch.flip(images, dims=[3])
            batch_logits = 0.5 * (batch_logits + model(mirrored))

        logit_chunks.append(batch_logits.detach().cpu())
        sample_ids.extend(batch_ids)

    stacked = torch.cat(logit_chunks, dim=0)
    return stacked.numpy(), sample_ids


def evaluate_logits(y_true: np.ndarray, logits: np.ndarray) -> Dict[str, float]:
    """Score arg-max predictions: accuracy (``acc``) and macro-F1 (``f1_macro``)."""
    predicted = logits.argmax(1)
    accuracy = accuracy_score(y_true, predicted)
    macro_f1 = f1_score(y_true, predicted, average='macro')
    return {'acc': float(accuracy), 'f1_macro': float(macro_f1)}

In [None]:
def train_one_fold(
    fold: int,
    model_cfg: Dict,
    df: pd.DataFrame,
    cfg: CFG,
    paths: Dict[str, Path],
    class_weights_tensor: torch.Tensor,
    device: torch.device,
):
    """Train one model with ``fold`` held out and checkpoint the best epoch.

    Rows with ``df['fold'] == fold`` form the validation split; all other
    rows are used for training. After every epoch the model is evaluated on
    the validation split, and the weights with the lowest validation loss are
    saved to ``<cfg.out_dir>/<alias>/fold_<fold>/best_by_val_loss.pt``.

    Args:
        fold: Index of the held-out fold.
        model_cfg: Model description; reads ``alias``, ``timm_name``,
            ``img_size`` and the optional ``drop_rate`` / ``drop_path_rate``.
        df: Training table with ``image_id``, ``label`` and ``fold`` columns.
        cfg: Run configuration (batch size, epochs, LR, mixup, output dir).
        paths: Resolved locations; only ``train_images_dir`` is read here.
        class_weights_tensor: One weight per class, assumed to be indexed by
            label id (0..C-1). It drives the loss weights, the sampling
            weights and the size of the model head.
        device: Device the model and the batches are moved to.

    Returns:
        Path of the checkpoint written for the best epoch.

    Raises:
        RuntimeError: If no epoch produced a checkpoint (zero epochs, or the
            validation loss never compared below infinity, e.g. NaN).
    """
    train_part = df[df['fold'] != fold].reset_index(drop=True)
    val_part = df[df['fold'] == fold].reset_index(drop=True)

    img_size = model_cfg['img_size']
    train_ds = FruitDataset(train_part, paths['train_images_dir'], transform=build_train_tfms(img_size), is_test=False)
    val_ds = FruitDataset(val_part, paths['train_images_dir'], transform=build_valid_tfms(img_size), is_test=False)

    # Everything class-related is derived from the argument rather than from
    # module-level state, so the sampler, the loss and the model head always
    # agree with the weights the caller passed in.
    per_class_weight = class_weights_tensor.detach().cpu().numpy().astype(np.float32)
    n_classes = int(per_class_weight.shape[0])

    # Weighted sampler for imbalance: each row gets the weight of its class.
    sample_weights = per_class_weight[train_part['label'].to_numpy()]
    sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

    train_loader = DataLoader(
        train_ds,
        batch_size=cfg.batch_size,
        sampler=sampler,
        num_workers=cfg.num_workers,
        pin_memory=(device.type == 'cuda'),
        drop_last=True,
    )
    val_loader = DataLoader(
        val_ds,
        batch_size=cfg.batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        pin_memory=(device.type == 'cuda'),
    )

    model = create_model(
        timm_name=model_cfg['timm_name'],
        num_classes=n_classes,
        drop_rate=model_cfg.get('drop_rate', 0.2),
        drop_path_rate=model_cfg.get('drop_path_rate', 0.2),
    ).to(device)

    # NOTE(review): class weights are applied in the loss AND in the sampler,
    # so class imbalance is compensated twice - confirm this is intended.
    criterion = nn.CrossEntropyLoss(
        weight=class_weights_tensor.to(device),
        label_smoothing=cfg.label_smoothing,
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    scheduler = WarmupCosineScheduler(
        optimizer,
        total_epochs=cfg.epochs,
        warmup_epochs=cfg.warmup_epochs,
        min_lr=cfg.lr * 0.03,
    )

    # Mixed precision is only enabled on CUDA; elsewhere the scaler is a no-op.
    scaler = torch.cuda.amp.GradScaler(enabled=(device.type == 'cuda'))

    fold_dir = Path(cfg.out_dir) / model_cfg['alias'] / f'fold_{fold}'
    fold_dir.mkdir(parents=True, exist_ok=True)

    best_val_loss = float('inf')
    best_path = fold_dir / 'best_by_val_loss.pt'
    # Tracks whether THIS run wrote the checkpoint; a file left over from an
    # earlier run in the same directory must not be mistaken for a result.
    checkpoint_written = False

    for epoch in range(cfg.epochs):
        model.train()
        train_losses = []

        for x, y in tqdm(train_loader, desc=f'{model_cfg["alias"]} f{fold} e{epoch+1}', leave=False):
            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad(set_to_none=True)

            # Mixup is applied to a random subset of batches.
            use_mix = cfg.use_mixup and (random.random() < cfg.mixup_prob)

            if use_mix:
                x, y_a, y_b, lam = mixup_data(x, y, alpha=cfg.mixup_alpha)

            with torch.autocast(device_type='cuda', enabled=(device.type == 'cuda')):
                logits = model(x)
                if use_mix:
                    loss = mixup_criterion(criterion, logits, y_a, y_b, lam)
                else:
                    loss = criterion(logits, y)

            if device.type == 'cuda':
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()

            train_losses.append(loss.item())

        # The schedule is stepped once per epoch.
        scheduler.step()

        # Validation
        model.eval()
        val_losses = []
        all_logits = []
        all_targets = []

        with torch.no_grad():
            for x, y in val_loader:
                x = x.to(device)
                y = y.to(device)

                logits = model(x)
                loss = criterion(logits, y)

                val_losses.append(loss.item())
                all_logits.append(logits.detach().cpu())
                all_targets.append(y.detach().cpu())

        val_logits = torch.cat(all_logits).numpy()
        val_targets = torch.cat(all_targets).numpy()
        val_metrics = evaluate_logits(val_targets, val_logits)
        # Mean of per-batch losses (a smaller last batch counts as much as a full one).
        val_loss = float(np.mean(val_losses))
        tr_loss = float(np.mean(train_losses))

        print(
            f"[fold={fold}][{model_cfg['alias']}] epoch={epoch+1}/{cfg.epochs} "
            f"train_loss={tr_loss:.4f} val_loss={val_loss:.4f} "
            f"val_acc={val_metrics['acc']:.4f} val_f1m={val_metrics['f1_macro']:.4f}"
        )

        # Save best by val_loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    'epoch': epoch + 1,
                    'val_loss': val_loss,
                    'val_acc': val_metrics['acc'],
                    'model_cfg': model_cfg,
                    'cfg': asdict(cfg),
                },
                best_path,
            )
            checkpoint_written = True

    del model, train_loader, val_loader, train_ds, val_ds
    gc.collect()
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    if not checkpoint_written:
        # Returning the path anyway would make the caller load a missing or
        # stale file; fail here with a message that names the real cause.
        raise RuntimeError(
            f"No checkpoint was saved for {model_cfg['alias']} fold {fold}: "
            f"validation loss never improved over {cfg.epochs} epoch(s)."
        )

    return best_path

In [None]:
def run_cv_for_model(model_cfg: Dict, df: pd.DataFrame, cfg: CFG, paths: Dict[str, Path], device: torch.device):
    """Train ``model_cfg`` on every fold and save its OOF and test logits.

    For each fold the best checkpoint from ``train_one_fold`` is reloaded,
    used to fill the out-of-fold logits of that fold's validation rows and to
    predict the test set. Test logits are averaged over folds. Under
    ``<cfg.out_dir>/<alias>/`` the function writes ``oof_logits.npy``,
    ``oof_targets.npy``, ``test_logits.npy`` and ``metrics.json``.

    Besides its arguments the function reads the module-level ``test_df``,
    ``num_classes`` and ``class_weights_tensor``.

    NOTE(review): OOF logits come from a plain forward pass while test logits
    use flip TTA, so weights tuned on OOF are applied to slightly differently
    produced test logits - confirm this is intended.

    Args:
        model_cfg: Model description (``alias``, ``timm_name``, ``img_size``, ...).
        df: Training table with a ``fold`` column.
        cfg: Run configuration.
        paths: Resolved dataset locations.
        device: Device used for training and inference.

    Returns:
        OOF metrics as produced by ``evaluate_logits``.
    """
    model_alias = model_cfg['alias']
    print(f'\n=== Running CV for: {model_alias} ===')

    oof_logits = np.zeros((len(df), num_classes), dtype=np.float32)
    oof_targets = df['label'].values.astype(np.int64)

    test_ds = FruitDataset(
        test_df,
        paths['test_images_dir'],
        transform=build_valid_tfms(model_cfg['img_size']),
        is_test=True,
    )
    # Loader settings come from the `cfg` argument, like the validation
    # loader below, rather than from the module-level CFG object.
    test_loader = DataLoader(
        test_ds,
        batch_size=cfg.batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        pin_memory=(device.type == 'cuda'),
    )
    test_logits_folds = []

    for fold in range(cfg.n_folds):
        best_path = train_one_fold(
            fold=fold,
            model_cfg=model_cfg,
            df=df,
            cfg=cfg,
            paths=paths,
            class_weights_tensor=class_weights_tensor,
            device=device,
        )

        # Load best checkpoint for fold inference
        model = create_model(
            timm_name=model_cfg['timm_name'],
            num_classes=num_classes,
            drop_rate=model_cfg.get('drop_rate', 0.2),
            drop_path_rate=model_cfg.get('drop_path_rate', 0.2),
        ).to(device)

        ckpt = torch.load(best_path, map_location=device)
        model.load_state_dict(ckpt['model_state_dict'])
        model.eval()

        # OOF logits
        val_part = df[df['fold'] == fold].reset_index(drop=True)
        val_ds = FruitDataset(
            val_part,
            paths['train_images_dir'],
            transform=build_valid_tfms(model_cfg['img_size']),
            is_test=False,
        )
        val_loader = DataLoader(
            val_ds,
            batch_size=cfg.batch_size,
            shuffle=False,
            num_workers=cfg.num_workers,
            pin_memory=(device.type == 'cuda'),
        )

        with torch.no_grad():
            fold_logits = []
            for x, _ in tqdm(val_loader, desc=f'OOF {model_alias} fold{fold}', leave=False):
                x = x.to(device)
                logits = model(x)
                fold_logits.append(logits.detach().cpu())
            fold_logits = torch.cat(fold_logits).numpy()

        # `val_part` keeps the row order of `df`, so a boolean mask places
        # every prediction back at its original position.
        oof_logits[df['fold'].values == fold] = fold_logits

        # Test logits with TTA
        fold_test_logits, _ = predict_logits(model, test_loader, device, tta=True)
        test_logits_folds.append(fold_test_logits)

        del model, val_ds, val_loader
        gc.collect()
        if device.type == 'cuda':
            torch.cuda.empty_cache()

    # Aggregate test logits across folds
    test_logits = np.mean(np.stack(test_logits_folds, axis=0), axis=0)

    # Save artifacts
    model_dir = Path(cfg.out_dir) / model_alias
    model_dir.mkdir(parents=True, exist_ok=True)

    np.save(model_dir / 'oof_logits.npy', oof_logits)
    np.save(model_dir / 'oof_targets.npy', oof_targets)
    np.save(model_dir / 'test_logits.npy', test_logits)

    metrics = evaluate_logits(oof_targets, oof_logits)
    print(f"OOF metrics [{model_alias}] -> acc={metrics['acc']:.5f}, f1_macro={metrics['f1_macro']:.5f}")

    with open(model_dir / 'metrics.json', 'w', encoding='utf-8') as f:
        json.dump(metrics, f, ensure_ascii=False, indent=2)

    return metrics

## 4) Запуск обучения (много часов)

Сначала запустите 1 модель и проверьте, что пайплайн корректен.
Потом масштабируйте до нескольких архитектур.

In [None]:
if not CFG.run_training:
    print('CFG.run_training=False -> skipping training.')
else:
    # One full CV run per configured model, keyed by its alias.
    all_metrics = {}
    for mcfg in CFG.model_configs:
        all_metrics[mcfg['alias']] = run_cv_for_model(mcfg, fold_df, CFG, paths, device)

    # The same JSON text goes to disk and to the console.
    _metrics_report = json.dumps(all_metrics, ensure_ascii=False, indent=2)
    (Path(CFG.out_dir) / 'all_model_oof_metrics.json').write_text(_metrics_report, encoding='utf-8')

    print('Training done. Metrics:')
    print(_metrics_report)

## 5) Ensemble по OOF (веса ищутся на train OOF)

Идея: подбираем веса моделей так, чтобы максимизировать OOF accuracy.
Это обычно надежнее, чем выбирать веса по public leaderboard.

In [None]:
def load_model_artifacts(out_dir: Path, model_aliases: List[str]):
    """Load the saved OOF and test logits of every model.

    Reads ``<out_dir>/<alias>/oof_logits.npy`` and ``test_logits.npy`` for
    each alias, in the given order.

    Returns:
        ``(oof_logits_list, test_logits_list)`` - two lists aligned with
        ``model_aliases``.
    """
    model_dirs = (out_dir / alias for alias in model_aliases)
    loaded = [
        (np.load(model_dir / 'oof_logits.npy'), np.load(model_dir / 'test_logits.npy'))
        for model_dir in model_dirs
    ]

    oof_logits_list = [oof for oof, _ in loaded]
    test_logits_list = [test for _, test in loaded]
    return oof_logits_list, test_logits_list


def score_weights(oof_logits_list: List[np.ndarray], y_true: np.ndarray, w: np.ndarray) -> float:
    """Return the accuracy of the weighted logit blend.

    ``w`` is rescaled to sum to one (with a small epsilon in the denominator)
    before the per-model logits are combined.
    """
    normalized = w / (w.sum() + 1e-12)

    # Accumulate in place into a buffer shaped and typed like the first model's logits.
    blended = np.zeros_like(oof_logits_list[0])
    for weight, model_logits in zip(normalized, oof_logits_list):
        np.add(blended, weight * model_logits, out=blended)

    predicted = blended.argmax(1)
    return accuracy_score(y_true, predicted)


def search_best_weights(oof_logits_list: List[np.ndarray], y_true: np.ndarray, trials: int = 5000, seed: int = 42):
    """Random search for blend weights that maximise OOF accuracy.

    The uniform blend is the starting point. Each trial draws one weight
    vector from a flat Dirichlet and replaces the incumbent only when its
    score is strictly higher.

    Returns:
        ``(best_w, best_score)``.
    """
    rng = np.random.default_rng(seed)
    n_models = len(oof_logits_list)

    # Baseline: every model weighted equally.
    best_w = np.ones(n_models, dtype=np.float64) / n_models
    best_score = score_weights(oof_logits_list, y_true, best_w)

    for _ in tqdm(range(trials), desc='weight search'):
        candidate = rng.dirichlet(np.ones(n_models))
        candidate_score = score_weights(oof_logits_list, y_true, candidate)
        if candidate_score > best_score:
            best_w, best_score = candidate, candidate_score

    return best_w, best_score

In [None]:
model_aliases = [model_cfg['alias'] for model_cfg in CFG.model_configs]
out_dir = Path(CFG.out_dir)

# The ensemble needs the saved OOF logits of every configured model.
missing = [alias for alias in model_aliases if not (out_dir / alias / 'oof_logits.npy').exists()]
if not missing:
    oof_logits_list, test_logits_list = load_model_artifacts(out_dir, model_aliases)
    # Targets are read from the first model's directory.
    y_true = np.load(out_dir / model_aliases[0] / 'oof_targets.npy')

    # Individual model scores
    print('Individual OOF accuracy:')
    for alias, model_oof in zip(model_aliases, oof_logits_list):
        print(alias, accuracy_score(y_true, model_oof.argmax(1)))

    best_w, best_oof = search_best_weights(
        oof_logits_list,
        y_true,
        trials=CFG.ensemble_trials,
        seed=CFG.seed,
    )

    print('Best OOF accuracy:', best_oof)
    print('Best weights:')
    for alias, weight in zip(model_aliases, best_w):
        print(f'  {alias}: {weight:.4f}')

    # Blend test logits with the normalised weights.
    best_w = best_w / best_w.sum()
    test_blend = np.zeros_like(test_logits_list[0])
    for weight, model_test in zip(best_w, test_logits_list):
        np.add(test_blend, weight * model_test, out=test_blend)

    test_pred = test_blend.argmax(1)

    # NOTE(review): predictions are assigned positionally, which assumes the
    # sample submission lists rows in the same order as the test CSV - verify.
    sub = pd.read_csv(paths['sample_submission'])
    sub['label'] = test_pred

    sub_path = out_dir / 'submission_ensemble_oof_optimized.csv'
    sub.to_csv(sub_path, index=False)
    print('Saved submission:', sub_path)

    _weights_by_alias = {alias: float(weight) for alias, weight in zip(model_aliases, best_w)}
    (out_dir / 'ensemble_weights.json').write_text(
        json.dumps(_weights_by_alias, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )
else:
    print('Missing model artifacts:', missing)
    print('Train models first, then run ensemble.')

## 6) Анти-overfit правила (обязательные)

1. Не выбирайте финальный сабмит по одному public score.
2. Делайте shortlist по OOF: высокий mean + низкий std между фолдами.
3. Отправляйте на leaderboard только top-k кандидатов по OOF (например, 2-3 сабмита в день).
4. Не меняйте data cleaning и folds в середине серии сравнений.
5. Логируйте все эксперименты (`model`, `seed`, `img_size`, `aug`, `loss`, `OOF`).

## 7) Что улучшать дальше

- Stage-2 fine-tune (последние эпохи на большем `img_size`, меньшем LR).
- EMA весов модели.
- Псевдолейблы только с очень высоким порогом уверенности (и только после сильного OOF baseline).
- Дополнительные архитектуры в ансамбль (например, `coatnet`, `beit`, `swin`) после проверки OOF-дельты.