In [1]:
from google.colab import drive
drive.mount('/content/drive')
base_path = '/content/drive/My Drive/bach'

Mounted at /content/drive


In [2]:
!pip install wandb



In [3]:
import pandas as pd
import numpy as np

import os
import random
from glob import glob
from queue import Queue
from threading import Thread
from typing import Dict, List, Tuple, Optional

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision.tv_tensors._image import Image

from dataclasses import dataclass

import wandb
from tqdm.auto import tqdm




In [4]:
PROCESSED_IMAGES_PATH = base_path + "/processed_images"
HIERARCHY_FILE_PATH = base_path + "/hierarchy.csv"

In [5]:
class Hierarchy():
    def __init__(self, path: Optional[str] = None):
        if path is None:
            path = HIERARCHY_FILE_PATH
        self.hierarchy: pd.DataFrame = pd.read_csv(path)

    def get_root_id(self):
        root_node = self.hierarchy[self.hierarchy["parent_id"].isnull()]

        return root_node["id"].values[0]

    def get_children(self, parent_id: str):

        children = self.hierarchy[self.hierarchy["parent_id"] == parent_id]

        return children["id"].tolist()

    def get_leaf_nodes(self, root_id: str) -> List[str]:
        """
        Recursively get all leaf nodes ids under a given root node
        """

        children = self.get_children(root_id)

        if not children:
            return [root_id]

        leaf_nodes = []

        for child in children:
            leaf_nodes.extend(self.get_leaf_nodes(child))

        return leaf_nodes


In [6]:
class ImageDataset(Dataset):
    def __init__(self, root_dir: str, categories: Dict[str, List[str]], split: str = 'train',
                 train_ratio: float = 0.70, val_ratio: float = 0.15):

        self.root_dir = os.path.normpath(root_dir)
        self.categories = categories
        self.split = split

        # Create category to index mapping
        self.cat_mapping = {cat: idx for idx,
                            cat in enumerate(categories.keys())}

        self.train_augmentations = v2.Compose([
            v2.RandomApply([
                v2.ColorJitter(
                    brightness=0.3,
                    contrast=0.3,
                    saturation=0.3,
                    hue=0.15
                )
            ], p=0.5),
            v2.RandomApply([
                v2.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 2.0))
            ], p=0.4),
            v2.RandomApply([
                v2.RandomAdjustSharpness(sharpness_factor=2)
            ], p=0.4),
            v2.RandomErasing(p=0.5, scale=(0.02, 0.2), ratio=(0.3, 3.3)),
            v2.RandomApply([
                v2.RandomPerspective(distortion_scale=0.3)
            ], p=0.3),
            v2.RandomApply([
                v2.RandomRotation(degrees=15)
            ], p=0.3),
            v2.RandomHorizontalFlip(p=0.5),
            v2.RandomAutocontrast(p=0.3),
        ])

        self.rebalancing_augmentations = [
            'flip_h',         # Horizontal flip
            'flip_v',         # Vertical flip
            'rot90',          # 90 degree rotation
            'rot180',         # 180 degree rotation
            'rot270',         # 270 degree rotation
            'flip_h_rot90',   # Combine flip and rotation
            'flip_v_rot90',   # Another combination
            'identity'        # No change
        ]

        # Collect all file paths and their categories
        self.samples = []

        n_samples = {}
        for cat, leaves in categories.items():
            n_samples[cat] = 0
            for leaf in leaves:
                cat_dir = os.path.join(self.root_dir, leaf)
                tensor_files = glob(os.path.join(cat_dir, '*.pt'))

                # Generate deterministic train/val/test split
                n_files = len(tensor_files)
                indices = list(range(n_files))

                # Seed random number generator for reproducibility
                random.Random(42).shuffle(indices)

                n_train = int(n_files * train_ratio)
                n_val = int(n_files * val_ratio)

                if split == 'train':
                    selected_indices = indices[:n_train]
                elif split == 'val':
                    selected_indices = indices[n_train:n_train + n_val]
                else:  # test
                    selected_indices = indices[n_train + n_val:]

                n_samples[cat] += len(selected_indices)
                for idx in selected_indices:
                    self.samples.append({
                        'path': tensor_files[idx],
                        'category': cat,
                        'label': self.cat_mapping[cat]
                    })

        # print sample distribution
        # percentage of samples in each category

        total_samples = sum(n_samples.values())

        for cat, n in n_samples.items():
            print(f"{cat}: {n} samples ({n/total_samples:.2%})")

        # Only balance classes for training data
        if split != 'train':
            return

        # Balance classes by creating transformations of underrepresented classes
        max_cat = max(n_samples, key=n_samples.get)
        max_cat_samples = max(n_samples.values())

        # create tmp path for storing transformed tensors
        tmp_dir = os.path.join(self.root_dir, 'tmp')
        os.makedirs(tmp_dir, exist_ok=True)

        # Balance classes by creating transformations
        for cat in self.categories.keys():
            if cat == max_cat:
                print(
                    f"Skipping balanced class {cat}, already has {max_cat_samples} samples")
                continue

            cat_samples = [
                sample for sample in self.samples if sample['category'] == cat]
            n_samples_cat = n_samples[cat]
            while n_samples_cat < max_cat_samples:
                # Select a random sample from the class
                sample = random.choice(cat_samples)
                # Create a random transformation
                tensor = torch.load(sample['path'], weights_only=True)

                transformation_type = random.choice(
                    self.rebalancing_augmentations)
                if transformation_type == 'flip_h':
                    transformed_tensor = torch.flip(
                        tensor, [2])  # Horizontal flip
                elif transformation_type == 'flip_v':
                    transformed_tensor = torch.flip(
                        tensor, [1])  # Vertical flip
                elif transformation_type == 'rot90':
                    transformed_tensor = torch.rot90(tensor, k=1, dims=(1, 2))
                elif transformation_type == 'rot180':
                    transformed_tensor = torch.rot90(tensor, k=2, dims=(1, 2))
                elif transformation_type == 'rot270':
                    transformed_tensor = torch.rot90(tensor, k=3, dims=(1, 2))
                elif transformation_type == 'flip_h_rot90':
                    transformed_tensor = torch.rot90(
                        torch.flip(tensor, [2]), k=1, dims=(1, 2))
                elif transformation_type == 'flip_v_rot90':
                    transformed_tensor = torch.rot90(
                        torch.flip(tensor, [1]), k=1, dims=(1, 2))
                else:  # identity
                    transformed_tensor = tensor

                # Save the transformed tensor to a temporary path
                tmp_path = os.path.join(
                    tmp_dir, f'{n_samples_cat}_{cat}.pt')

                assert transformed_tensor.shape == tensor.shape, \
                    f"Transformed tensor shape {transformed_tensor.shape} does not match original tensor shape {tensor.shape}"

                torch.save(transformed_tensor, tmp_path)

                self.samples.append({
                    'path': tmp_path,
                    'category': cat,
                    'label': self.cat_mapping[cat]
                })

                n_samples_cat += 1

            print(f"Balanced class {cat} to {n_samples_cat} samples")

    def __len__(self) -> int:
        return len(self.samples)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        sample = self.samples[idx]
        # Load preprocessed tensor
        tensor = torch.load(sample['path'], weights_only=True)

        if self.split == 'train':
            old_shape = tensor.shape
            tensor = self.train_augmentations(tensor)

            assert tensor.shape == old_shape, \
                f"Augmented tensor shape {tensor.shape} does not match original tensor shape {old_shape}"

        return tensor, torch.tensor(sample['label'], dtype=torch.long)


class PrefetchLoader:
    def __init__(self, loader: DataLoader, buffer_size: int = 2, device: torch.device = None, root_dir: str = None):
        self.loader = loader
        self.buffer_size = buffer_size
        self.device = device or torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.buffer = Queue(maxsize=buffer_size)
        self.stop_event = None
        self.prefetch_thread = None
        self.root_dir = root_dir
        self._active = False

    def prefetch_worker(self):
        try:
            for batch in self.loader:
                if self.stop_event.is_set():
                    break

                # Move batch to device in the background thread
                if isinstance(batch, (tuple, list)):
                    batch = tuple(t.to(self.device, non_blocking=True)
                                  if isinstance(t, torch.Tensor) else t
                                  for t in batch)
                elif isinstance(batch, dict):
                    batch = {k: v.to(self.device, non_blocking=True)
                             if isinstance(v, torch.Tensor) else v
                             for k, v in batch.items()}
                elif isinstance(batch, torch.Tensor):
                    batch = batch.to(self.device, non_blocking=True)

                self.buffer.put(batch)
        except Exception as e:
            print(f"Prefetch worker error: {e}")
            self.stop_event.set()
        finally:
            self.buffer.put(None)  # Signal end of data
            self._active = False

    def __iter__(self):
        # Clean up previous iteration if necessary
        if self._active:
            self.stop_event.set()
            if self.prefetch_thread is not None:
                self.prefetch_thread.join()
            while not self.buffer.empty():
                self.buffer.get()

        # Start new iteration
        self._active = True
        self.stop_event = torch.multiprocessing.Event()
        self.prefetch_thread = Thread(
            target=self.prefetch_worker,
            daemon=True
        )
        self.prefetch_thread.start()

        while True:
            if self.stop_event.is_set():
                raise RuntimeError("Prefetch worker encountered an error")

            batch = self.buffer.get()
            if batch is None:
                break
            yield batch

    def __len__(self):
        return len(self.loader)

    def __del__(self):
        if self._active:
            self.stop_event.set()
            if self.prefetch_thread is not None:
                self.prefetch_thread.join()

    def remove_tmp_folder(self):
        tmp_dir = os.path.join(self.root_dir, 'tmp')
        for file in os.listdir(tmp_dir):
            os.remove(os.path.join(tmp_dir, file))

        os.rmdir(tmp_dir)


def create_images_dataloader(
    categories: Dict[str, List[str]],
    batch_size: int = 32,
    split: str = 'train',
    num_workers: int = 4,
    n_prefetch_batches: int = 3,
    device: torch.device = None
) -> PrefetchLoader:
    """
    Creates a dataloader with prefetching for the specified categories.

    Args:
        root_dir: Path to processed data directory
        categories: List of category names to include
        batch_size: Batch size
        split: One of 'train', 'val', 'test'
        num_workers: Number of worker processes
        prefetch_factor: Number of batches to prefetch
    """

    dataset = ImageDataset(PROCESSED_IMAGES_PATH, categories, split)

    # Create base DataLoader
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=(split == 'train'),
        num_workers=num_workers,
        drop_last=True,
    )

    # Wrap with prefetching
    return PrefetchLoader(loader, buffer_size=n_prefetch_batches, device=device, root_dir=PROCESSED_IMAGES_PATH)



In [7]:
import torch.nn as nn


class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, dropout_rate=0.2):
        super().__init__()

        self.dropout = nn.Dropout2d(dropout_rate)

        self.bn1 = nn.BatchNorm2d(in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_channels, out_channels,
                               kernel_size=3, stride=stride, padding=1)

        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels,
                               kernel_size=3, padding=1)

        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels,
                          kernel_size=1, stride=stride)
            )
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        # Pre-activation pattern
        out = self.bn1(x)
        out = self.relu(out)
        out = self.conv1(out)

        out = self.bn2(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.conv2(out)

        out += self.shortcut(x)

        return out


class HierarchyNodeModel(nn.Module):
    def __init__(self, num_classes, input_channels=3):
        super().__init__()


        self.conv1 = nn.Sequential(
            nn.Conv2d(input_channels, 16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.1),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Wider but keep block count
        self.layer1 = nn.Sequential(
            ResidualBlock(16, 48, dropout_rate=0.1),
            ResidualBlock(48, 48, dropout_rate=0.1),
            ResidualBlock(48, 48, dropout_rate=0.1)
        )

        self.layer2 = nn.Sequential(
            ResidualBlock(48, 96, stride=2, dropout_rate=0.15),
            ResidualBlock(96, 96, dropout_rate=0.15),
            ResidualBlock(96, 96, dropout_rate=0.15)
        )

        self.layer3 = nn.Sequential(
            ResidualBlock(96, 192, stride=2, dropout_rate=0.2),
            ResidualBlock(192, 192, dropout_rate=0.2)
        )

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Slightly modified head
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(0.3),
            nn.Linear(192, 96),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(96, num_classes)
        )

        # Weight initialization
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(
                m.weight, mode='fan_in', nonlinearity='relu')
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, 0, 0.01)
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.conv1(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.avgpool(x)
        x = self.head(x)
        return x


In [8]:
@dataclass
class TrainConfig:
    # train parameters
    epochs: int
    batch_size: int

    # lr parameters
    max_lr: float
    div_factor: float
    final_div_factor: float
    pct_start: float

    # regularization parameters
    grad_clip_value: float
    weight_decay: float
    early_stopping_patience: int
    label_smoothing: float

    optimizer: str

    @property
    def initial_lr(self) -> float:
        return self.max_lr / self.div_factor

    @property
    def min_lr(self) -> float:
        return self.max_lr / self.final_div_factor


In [9]:
def __get_data_loaders(hierarchy: Hierarchy, node_id: str, train_config: TrainConfig) -> Tuple[DataLoader, DataLoader, int]:
    children = hierarchy.get_children(node_id)
    categories_dict = {child: hierarchy.get_leaf_nodes(
        child) for child in children}

    train_loader = create_images_dataloader(
        categories_dict, split='train', batch_size=train_config.batch_size)
    val_loader = create_images_dataloader(
        categories_dict, split='val', batch_size=train_config.batch_size)

    return train_loader, val_loader, len(children)


def __create_optimizer(model: nn.Module, train_config: TrainConfig) -> torch.optim.Optimizer:
    if train_config.optimizer == 'adamw':
        return torch.optim.AdamW(
            model.parameters(),
            lr=train_config.initial_lr,
            weight_decay=train_config.weight_decay
        )
    elif train_config.optimizer == 'adam':
        return torch.optim.Adam(
            model.parameters(),
            lr=train_config.initial_lr
        )
    else:
        raise ValueError(f'Invalid optimizer: {train_config.optimizer}')


class MetricSmoother:
    def __init__(self, window_size: int = 3):
        self.window_size = window_size
        self.values = []

    def update(self, value: float) -> float:
        self.values.append(value)
        if len(self.values) > self.window_size:
            self.values.pop(0)
        return self.get_smoothed()

    def get_smoothed(self) -> float:
        return sum(self.values) / len(self.values)


def get_mixup_alpha(train_acc, val_acc, epoch, total_epochs, grad_norm):
    """Dynamically adjust mixup alpha based on overfitting gap"""
    gap = train_acc - val_acc
# Adjust alpha based on gradient norm
    if grad_norm < 0.01:  # Very small gradients
        base_alpha = 0.4  # Less mixing to allow bigger steps
    elif grad_norm > 2.0:  # Large gradients
        base_alpha = 0.6  # More mixing for stability
    elif grad_norm > 1.5:
        base_alpha = 0.7
    elif grad_norm > 1.0:
        base_alpha = 0.8
    else:
        base_alpha = 0.6

    # Adjust for accuracy gap
    if gap > 0.15:
        base_alpha *= 1.2

    # Early training adjustment
    if epoch < total_epochs * 0.1:
        return min(1.0, base_alpha * 1.1)
    return base_alpha


def mixup_data(x, y, alpha):
    """Performs mixup on the input and target"""
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]
    index = torch.randperm(batch_size)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


def get_gradient_norm(model):
    total_norm = 0
    for p in model.parameters():
        if p.grad is not None:
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item() ** 2
    return total_norm ** 0.5


In [10]:
def __train_epoch(
    model: nn.Module,
    loader: PrefetchLoader,
    optimizer: torch.optim.Optimizer,
    scheduler: torch.optim.lr_scheduler.OneCycleLR,
    grad_clip_value: float = 0.5,
    label_smoothing: float = 0.1,
    prev_train_acc: float = 0.0,
    prev_val_acc: float = 0.0,
    epoch: int = 0,
    total_epochs: int = 100,
    avg_grad_norm: float = 0.0
) -> Tuple[float, float, float, float]:

    model.train()
    total_loss = 0
    correct = 0
    total_samples = 0

    alpha = get_mixup_alpha(prev_train_acc, prev_val_acc,
                            epoch, total_epochs, avg_grad_norm)
    print(f"Mixup alpha: {alpha}")

    progress_bar = tqdm(loader, desc='Training')
    grad_norms = []

    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)

    for batch_idx, (X_batch, y_batch) in enumerate(progress_bar):
        optimizer.zero_grad()

        mixed_X, y_a, y_b, lam = mixup_data(X_batch, y_batch, alpha)

        y_pred = model(mixed_X)

        loss = lam * criterion(y_pred, y_a) + (1 - lam) * \
            criterion(y_pred, y_b)

        loss.backward()

        grad_norm = get_gradient_norm(model)
        grad_norms.append(grad_norm)

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(
            model.parameters(), grad_clip_value, norm_type=2)

        optimizer.step()
        scheduler.step()

        # Update metrics
        model.eval()
        with torch.no_grad():
            original_outputs = model(X_batch)
            pred = original_outputs.argmax(dim=1)
            correct += (pred == y_batch).sum().item()
        model.train()

        total_samples += y_batch.size(0)
        total_loss += loss.item()

        # Update progress bar
        progress_bar.set_postfix(
            loss=f"{loss.item():.4f}",
            acc=f"{100.0 * correct / total_samples:.2f}%"
        )

    epoch_loss = total_loss / len(loader)
    epoch_acc = correct / total_samples

    avg_grad_norm = np.mean(grad_norms)

    return epoch_loss, epoch_acc, alpha, avg_grad_norm





In [11]:
@torch.no_grad()
def __validate(model: nn.Module, loader: PrefetchLoader) -> Tuple[float, float]:
    model.eval()
    total_loss = 0
    correct = 0
    total_samples = 0

    criterion = nn.CrossEntropyLoss()
    progress_bar = tqdm(
        loader, desc='Validation')
    for X_batch, y_batch in progress_bar:
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)

        total_loss += loss.item()
        correct += (y_pred.argmax(dim=1) == y_batch).sum().item()
        total_samples += y_batch.size(0)

        progress_bar.set_postfix(
            loss=f"{loss.item():.4f}",
            acc=f"{100.0 * correct / total_samples:.2f}%"
        )

    epoch_loss = total_loss / len(loader)
    epoch_acc = correct / total_samples
    return epoch_loss, epoch_acc



In [12]:
from datetime import datetime
from torchvision.transforms import v2


In [13]:
def train_singular_model(hierarchy: Hierarchy, node_id: str, train_config: TrainConfig, device: str) -> None:

    # init wandb run
    run_name = f"node_{node_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    wandb.init(
        project="bachelor-resnet-icaam",
        name=run_name,
        config={
            "architecture": "HierarchyNodeModel",
            "epochs": train_config.epochs,
            "batch_size": train_config.batch_size,
            "optimizer": train_config.optimizer,
            "weight_decay": train_config.weight_decay,
            "grad_clip_value": train_config.grad_clip_value,
            "label_smoothing": train_config.label_smoothing,
            "early_stopping_patience": train_config.early_stopping_patience,
            "pct_start": train_config.pct_start,
            "div_factor": train_config.div_factor,
            "final_div_factor": train_config.final_div_factor,
            "max_lr": train_config.max_lr,
            "node_id": node_id,
            "num_classes": None  # Will be set after loading data
        }
    )
    # Setup data
    train_loader, val_loader, num_classes = __get_data_loaders(
        hierarchy, node_id, train_config)

    # Update wandb config
    wandb.config.update({"num_classes": num_classes}, allow_val_change=True)

    # Setup logging
    #logger = create_file_logger(f'train_{node_id}.log')
    #logger.info(f'Training model for node {node_id}')

    # Create model
    model = HierarchyNodeModel(num_classes=num_classes)
    model.to(device)

    wandb.watch(model, log="all", log_freq=10)

    # Setup training
    optimizer = __create_optimizer(model, train_config)

    # Setup schedulers
    num_steps = len(train_loader) * train_config.epochs
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=train_config.max_lr,
        total_steps=num_steps,
        pct_start=train_config.pct_start,  # 20% warmup
        anneal_strategy='cos',
        div_factor=train_config.div_factor,
        final_div_factor=train_config.final_div_factor,
        three_phase=True
    )
    # Setup validation loss smoother
    val_smoother = MetricSmoother(window_size=5)

    # Training loop
    best_val_loss = float('inf')
    patience = 0

    train_acc = 0.0
    val_acc = 0.0
    avg_grad_norm = 0.0

    for epoch in range(train_config.epochs):
        print(f'\nEpoch {epoch + 1}/{train_config.epochs}')

        # Training phase
        train_loss, train_acc, mixup_alpha, avg_grad_norm = __train_epoch(
            model=model,
            loader=train_loader,
            optimizer=optimizer,
            scheduler=scheduler,
            grad_clip_value=train_config.grad_clip_value,
            label_smoothing=train_config.label_smoothing,
            prev_train_acc=train_acc,
            prev_val_acc=val_acc,
            epoch=epoch,
            total_epochs=train_config.epochs,
            avg_grad_norm=avg_grad_norm
        )

        # Validation phase
        val_loss, val_acc = __validate(model, val_loader)
        smoothed_val_loss = val_smoother.update(val_loss)

        # Log metrics to wandb
        wandb.log({
            "epoch": epoch + 1,
            "train/loss": train_loss,
            "train/accuracy": train_acc,
            "val/loss": val_loss,
            "val/accuracy": val_acc,
            "val/smoothed_loss": smoothed_val_loss,
            "metrics/acc_gap": train_acc - val_acc,
            "learning_rate": optimizer.param_groups[0]["lr"],
            "mixup_alpha": mixup_alpha,
            "avg_grad_norm": avg_grad_norm
        })

        # Logging
        print(
            f'Epoch {epoch + 1}: '
            f'Train Loss: {train_loss:.4f}, '
            f'Train Acc: {train_acc:.4f}, '
            f'Val Loss: {val_loss:.4f}, '
            f'Val Acc: {val_acc:.4f}, '
            f"Smoothed Val Loss: {smoothed_val_loss:.4f}, "
            f'Gap: {(train_acc - val_acc):.4f}, '
            f'LR: {optimizer.param_groups[0]["lr"]:.6f}'
        )

        # Model checkpointing
        if smoothed_val_loss < best_val_loss:
            best_val_loss = smoothed_val_loss
            patience = 0
            torch.save(model.state_dict(), f'model_{node_id}.pth')
            print(f'Saved new best model with val_loss: {val_loss:.4f}')

            wandb.save(f'model_{node_id}.pth')
            wandb.run.summary["best_val_loss"] = best_val_loss
            wandb.run.summary["best_val_acc"] = val_acc
        else:
            patience += 1
            if patience >= train_config.early_stopping_patience:
                print('Early stopping triggered')
                wandb.run.summary["early_stopping_epoch"] = epoch + 1
                break

    print('Training completed')
    wandb.finish()



In [14]:
config = TrainConfig(
        epochs=150,
        batch_size=16,
        max_lr=1.2e-3,
        div_factor=15.0,
        final_div_factor=100.0,
        pct_start=0.4,
        weight_decay=0.1,    # Increased for more regularization
        grad_clip_value=0.4,
        label_smoothing=0.1,
        early_stopping_patience=30,
        optimizer="adamw",
    )

In [15]:
hierarchy = Hierarchy()

In [16]:
torch.serialization.add_safe_globals([Image])

In [20]:
root_id = "n03862676"

In [21]:
train_singular_model(hierarchy, root_id, config, "cuda")

[34m[1mwandb[0m: Currently logged in as: [33mhedrekao[0m ([33mhedrekao-via-university-college[0m). Use [1m`wandb login --relogin`[0m to force relogin


n03259280: 337 samples (51.06%)
n04111531: 323 samples (48.94%)
Skipping balanced class n03259280, already has 337 samples
Balanced class n04111531 to 337 samples
n03259280: 72 samples (51.06%)
n04111531: 69 samples (48.94%)

Epoch 1/150
Mixup alpha: 0.44000000000000006


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 1: Train Loss: 0.6927, Train Acc: 0.5327, Val Loss: 0.6891, Val Acc: 0.5469, Smoothed Val Loss: 0.6891, Gap: -0.0141, LR: 0.000081
Saved new best model with val_loss: 0.6891

Epoch 2/150
Mixup alpha: 0.66


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 2: Train Loss: 0.6926, Train Acc: 0.5640, Val Loss: 0.6876, Val Acc: 0.5391, Smoothed Val Loss: 0.6883, Gap: 0.0249, LR: 0.000083
Saved new best model with val_loss: 0.6876

Epoch 3/150
Mixup alpha: 0.8800000000000001


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 3: Train Loss: 0.6870, Train Acc: 0.5818, Val Loss: 0.6854, Val Acc: 0.5391, Smoothed Val Loss: 0.6873, Gap: 0.0428, LR: 0.000087
Saved new best model with val_loss: 0.6854

Epoch 4/150
Mixup alpha: 0.8800000000000001


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 4: Train Loss: 0.6872, Train Acc: 0.5789, Val Loss: 0.6946, Val Acc: 0.5469, Smoothed Val Loss: 0.6892, Gap: 0.0320, LR: 0.000092

Epoch 5/150
Mixup alpha: 0.8800000000000001


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 5: Train Loss: 0.6860, Train Acc: 0.5997, Val Loss: 0.6745, Val Acc: 0.5625, Smoothed Val Loss: 0.6862, Gap: 0.0372, LR: 0.000099
Saved new best model with val_loss: 0.6745

Epoch 6/150
Mixup alpha: 0.8800000000000001


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 6: Train Loss: 0.6845, Train Acc: 0.6220, Val Loss: 0.6714, Val Acc: 0.5469, Smoothed Val Loss: 0.6827, Gap: 0.0751, LR: 0.000107
Saved new best model with val_loss: 0.6714

Epoch 7/150
Mixup alpha: 0.8800000000000001


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 7: Train Loss: 0.6825, Train Acc: 0.6190, Val Loss: 0.6812, Val Acc: 0.5625, Smoothed Val Loss: 0.6814, Gap: 0.0565, LR: 0.000117
Saved new best model with val_loss: 0.6812

Epoch 8/150
Mixup alpha: 0.77


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 8: Train Loss: 0.6816, Train Acc: 0.6384, Val Loss: 0.6701, Val Acc: 0.5625, Smoothed Val Loss: 0.6784, Gap: 0.0759, LR: 0.000128
Saved new best model with val_loss: 0.6701

Epoch 9/150
Mixup alpha: 0.77


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 9: Train Loss: 0.6862, Train Acc: 0.6369, Val Loss: 0.6484, Val Acc: 0.5938, Smoothed Val Loss: 0.6691, Gap: 0.0432, LR: 0.000141
Saved new best model with val_loss: 0.6484

Epoch 10/150
Mixup alpha: 0.8800000000000001


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 10: Train Loss: 0.6871, Train Acc: 0.6116, Val Loss: 0.6652, Val Acc: 0.5391, Smoothed Val Loss: 0.6673, Gap: 0.0725, LR: 0.000155
Saved new best model with val_loss: 0.6652

Epoch 11/150
Mixup alpha: 0.8800000000000001


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 11: Train Loss: 0.6849, Train Acc: 0.6235, Val Loss: 0.6712, Val Acc: 0.5781, Smoothed Val Loss: 0.6673, Gap: 0.0454, LR: 0.000170
Saved new best model with val_loss: 0.6712

Epoch 12/150
Mixup alpha: 0.8800000000000001


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 12: Train Loss: 0.6753, Train Acc: 0.6473, Val Loss: 0.6447, Val Acc: 0.6016, Smoothed Val Loss: 0.6600, Gap: 0.0458, LR: 0.000187
Saved new best model with val_loss: 0.6447

Epoch 13/150
Mixup alpha: 0.8800000000000001


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 13: Train Loss: 0.6804, Train Acc: 0.6384, Val Loss: 0.6471, Val Acc: 0.6016, Smoothed Val Loss: 0.6553, Gap: 0.0368, LR: 0.000205
Saved new best model with val_loss: 0.6471

Epoch 14/150
Mixup alpha: 0.77


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 14: Train Loss: 0.6847, Train Acc: 0.6012, Val Loss: 0.6897, Val Acc: 0.5312, Smoothed Val Loss: 0.6636, Gap: 0.0699, LR: 0.000224

Epoch 15/150
Mixup alpha: 0.8800000000000001


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 15: Train Loss: 0.6768, Train Acc: 0.6562, Val Loss: 0.6404, Val Acc: 0.5938, Smoothed Val Loss: 0.6586, Gap: 0.0625, LR: 0.000244

Epoch 16/150
Mixup alpha: 0.8


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 16: Train Loss: 0.6729, Train Acc: 0.6771, Val Loss: 0.6500, Val Acc: 0.6016, Smoothed Val Loss: 0.6544, Gap: 0.0755, LR: 0.000265
Saved new best model with val_loss: 0.6500

Epoch 17/150
Mixup alpha: 0.7


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 17: Train Loss: 0.6720, Train Acc: 0.6429, Val Loss: 0.6235, Val Acc: 0.6250, Smoothed Val Loss: 0.6501, Gap: 0.0179, LR: 0.000288
Saved new best model with val_loss: 0.6235

Epoch 18/150
Mixup alpha: 0.7


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 18: Train Loss: 0.6752, Train Acc: 0.6548, Val Loss: 0.7076, Val Acc: 0.5078, Smoothed Val Loss: 0.6622, Gap: 0.1469, LR: 0.000311

Epoch 19/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 19: Train Loss: 0.6744, Train Acc: 0.6399, Val Loss: 0.6399, Val Acc: 0.5859, Smoothed Val Loss: 0.6523, Gap: 0.0539, LR: 0.000335

Epoch 20/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 20: Train Loss: 0.6650, Train Acc: 0.6756, Val Loss: 0.7213, Val Acc: 0.5547, Smoothed Val Loss: 0.6685, Gap: 0.1209, LR: 0.000360

Epoch 21/150
Mixup alpha: 0.7


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 21: Train Loss: 0.6791, Train Acc: 0.6354, Val Loss: 0.6751, Val Acc: 0.5938, Smoothed Val Loss: 0.6735, Gap: 0.0417, LR: 0.000386

Epoch 22/150
Mixup alpha: 0.7


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 22: Train Loss: 0.6741, Train Acc: 0.6964, Val Loss: 0.6450, Val Acc: 0.5234, Smoothed Val Loss: 0.6778, Gap: 0.1730, LR: 0.000412

Epoch 23/150
Mixup alpha: 0.72


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 23: Train Loss: 0.6700, Train Acc: 0.6592, Val Loss: 0.6556, Val Acc: 0.6172, Smoothed Val Loss: 0.6674, Gap: 0.0420, LR: 0.000440

Epoch 24/150
Mixup alpha: 0.7


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 24: Train Loss: 0.6717, Train Acc: 0.6815, Val Loss: 0.6485, Val Acc: 0.6172, Smoothed Val Loss: 0.6691, Gap: 0.0644, LR: 0.000467

Epoch 25/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 25: Train Loss: 0.6724, Train Acc: 0.6637, Val Loss: 0.6443, Val Acc: 0.6328, Smoothed Val Loss: 0.6537, Gap: 0.0309, LR: 0.000495

Epoch 26/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 26: Train Loss: 0.6524, Train Acc: 0.7054, Val Loss: 0.6659, Val Acc: 0.6016, Smoothed Val Loss: 0.6519, Gap: 0.1038, LR: 0.000524

Epoch 27/150
Mixup alpha: 0.7


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 27: Train Loss: 0.6694, Train Acc: 0.6741, Val Loss: 0.6453, Val Acc: 0.6094, Smoothed Val Loss: 0.6519, Gap: 0.0647, LR: 0.000553

Epoch 28/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 28: Train Loss: 0.6618, Train Acc: 0.6726, Val Loss: 0.6229, Val Acc: 0.6094, Smoothed Val Loss: 0.6454, Gap: 0.0632, LR: 0.000582
Saved new best model with val_loss: 0.6229

Epoch 29/150
Mixup alpha: 0.7


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 29: Train Loss: 0.6750, Train Acc: 0.7009, Val Loss: 0.6286, Val Acc: 0.5938, Smoothed Val Loss: 0.6414, Gap: 0.1071, LR: 0.000611
Saved new best model with val_loss: 0.6286

Epoch 30/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 30: Train Loss: 0.6678, Train Acc: 0.6801, Val Loss: 0.6211, Val Acc: 0.6484, Smoothed Val Loss: 0.6368, Gap: 0.0316, LR: 0.000640
Saved new best model with val_loss: 0.6211

Epoch 31/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 31: Train Loss: 0.6653, Train Acc: 0.6756, Val Loss: 0.6614, Val Acc: 0.5859, Smoothed Val Loss: 0.6358, Gap: 0.0897, LR: 0.000670
Saved new best model with val_loss: 0.6614

Epoch 32/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 32: Train Loss: 0.6589, Train Acc: 0.7232, Val Loss: 0.6467, Val Acc: 0.5938, Smoothed Val Loss: 0.6361, Gap: 0.1295, LR: 0.000699

Epoch 33/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 33: Train Loss: 0.6741, Train Acc: 0.6696, Val Loss: 0.6223, Val Acc: 0.6484, Smoothed Val Loss: 0.6360, Gap: 0.0212, LR: 0.000728

Epoch 34/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 34: Train Loss: 0.6637, Train Acc: 0.6771, Val Loss: 0.6089, Val Acc: 0.6406, Smoothed Val Loss: 0.6321, Gap: 0.0365, LR: 0.000757
Saved new best model with val_loss: 0.6089

Epoch 35/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 35: Train Loss: 0.6630, Train Acc: 0.6637, Val Loss: 0.7706, Val Acc: 0.6094, Smoothed Val Loss: 0.6620, Gap: 0.0543, LR: 0.000785

Epoch 36/150
Mixup alpha: 0.7


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 36: Train Loss: 0.6525, Train Acc: 0.6920, Val Loss: 0.7731, Val Acc: 0.5469, Smoothed Val Loss: 0.6843, Gap: 0.1451, LR: 0.000813

Epoch 37/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 37: Train Loss: 0.6837, Train Acc: 0.6726, Val Loss: 0.6290, Val Acc: 0.6562, Smoothed Val Loss: 0.6808, Gap: 0.0164, LR: 0.000841

Epoch 38/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 38: Train Loss: 0.6706, Train Acc: 0.6741, Val Loss: 0.6224, Val Acc: 0.6172, Smoothed Val Loss: 0.6808, Gap: 0.0569, LR: 0.000868

Epoch 39/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 39: Train Loss: 0.6573, Train Acc: 0.7054, Val Loss: 0.5973, Val Acc: 0.6641, Smoothed Val Loss: 0.6785, Gap: 0.0413, LR: 0.000895

Epoch 40/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 40: Train Loss: 0.6560, Train Acc: 0.6994, Val Loss: 0.6025, Val Acc: 0.6719, Smoothed Val Loss: 0.6448, Gap: 0.0275, LR: 0.000920

Epoch 41/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 41: Train Loss: 0.6550, Train Acc: 0.6964, Val Loss: 0.6200, Val Acc: 0.6250, Smoothed Val Loss: 0.6142, Gap: 0.0714, LR: 0.000945
Saved new best model with val_loss: 0.6200

Epoch 42/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 42: Train Loss: 0.6695, Train Acc: 0.6488, Val Loss: 0.6306, Val Acc: 0.5859, Smoothed Val Loss: 0.6145, Gap: 0.0629, LR: 0.000970

Epoch 43/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 43: Train Loss: 0.6530, Train Acc: 0.6875, Val Loss: 0.6735, Val Acc: 0.6562, Smoothed Val Loss: 0.6248, Gap: 0.0312, LR: 0.000993

Epoch 44/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 44: Train Loss: 0.6591, Train Acc: 0.6771, Val Loss: 0.6382, Val Acc: 0.6562, Smoothed Val Loss: 0.6329, Gap: 0.0208, LR: 0.001015

Epoch 45/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 45: Train Loss: 0.6599, Train Acc: 0.6949, Val Loss: 0.6175, Val Acc: 0.6719, Smoothed Val Loss: 0.6359, Gap: 0.0231, LR: 0.001036

Epoch 46/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 46: Train Loss: 0.6744, Train Acc: 0.6949, Val Loss: 0.6271, Val Acc: 0.6094, Smoothed Val Loss: 0.6374, Gap: 0.0856, LR: 0.001057

Epoch 47/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 47: Train Loss: 0.6579, Train Acc: 0.7054, Val Loss: 0.6213, Val Acc: 0.6328, Smoothed Val Loss: 0.6355, Gap: 0.0725, LR: 0.001076

Epoch 48/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 48: Train Loss: 0.6606, Train Acc: 0.6801, Val Loss: 0.7675, Val Acc: 0.6094, Smoothed Val Loss: 0.6543, Gap: 0.0707, LR: 0.001093

Epoch 49/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 49: Train Loss: 0.6878, Train Acc: 0.6845, Val Loss: 0.6060, Val Acc: 0.6484, Smoothed Val Loss: 0.6479, Gap: 0.0361, LR: 0.001110

Epoch 50/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 50: Train Loss: 0.6556, Train Acc: 0.7247, Val Loss: 0.6247, Val Acc: 0.6484, Smoothed Val Loss: 0.6493, Gap: 0.0763, LR: 0.001125

Epoch 51/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 51: Train Loss: 0.6627, Train Acc: 0.6786, Val Loss: 0.6129, Val Acc: 0.6719, Smoothed Val Loss: 0.6465, Gap: 0.0067, LR: 0.001139

Epoch 52/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 52: Train Loss: 0.6584, Train Acc: 0.6994, Val Loss: 0.5881, Val Acc: 0.6406, Smoothed Val Loss: 0.6398, Gap: 0.0588, LR: 0.001152

Epoch 53/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 53: Train Loss: 0.6607, Train Acc: 0.7321, Val Loss: 0.6313, Val Acc: 0.6328, Smoothed Val Loss: 0.6126, Gap: 0.0993, LR: 0.001163
Saved new best model with val_loss: 0.6313

Epoch 54/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 54: Train Loss: 0.6618, Train Acc: 0.6994, Val Loss: 0.5777, Val Acc: 0.7031, Smoothed Val Loss: 0.6069, Gap: -0.0037, LR: 0.001173
Saved new best model with val_loss: 0.5777

Epoch 55/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 55: Train Loss: 0.6463, Train Acc: 0.7247, Val Loss: 0.5959, Val Acc: 0.6797, Smoothed Val Loss: 0.6012, Gap: 0.0450, LR: 0.001181
Saved new best model with val_loss: 0.5959

Epoch 56/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 56: Train Loss: 0.6626, Train Acc: 0.6741, Val Loss: 0.6202, Val Acc: 0.6328, Smoothed Val Loss: 0.6026, Gap: 0.0413, LR: 0.001188

Epoch 57/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 57: Train Loss: 0.6483, Train Acc: 0.7232, Val Loss: 0.5853, Val Acc: 0.6562, Smoothed Val Loss: 0.6021, Gap: 0.0670, LR: 0.001193

Epoch 58/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 58: Train Loss: 0.6553, Train Acc: 0.7009, Val Loss: 0.5835, Val Acc: 0.6406, Smoothed Val Loss: 0.5925, Gap: 0.0603, LR: 0.001197
Saved new best model with val_loss: 0.5835

Epoch 59/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 59: Train Loss: 0.6705, Train Acc: 0.7113, Val Loss: 0.6313, Val Acc: 0.6641, Smoothed Val Loss: 0.6032, Gap: 0.0472, LR: 0.001199

Epoch 60/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 60: Train Loss: 0.6552, Train Acc: 0.7188, Val Loss: 0.5573, Val Acc: 0.7266, Smoothed Val Loss: 0.5955, Gap: -0.0078, LR: 0.001200

Epoch 61/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 61: Train Loss: 0.6593, Train Acc: 0.6949, Val Loss: 0.6605, Val Acc: 0.5859, Smoothed Val Loss: 0.6036, Gap: 0.1090, LR: 0.001199

Epoch 62/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 62: Train Loss: 0.6608, Train Acc: 0.7068, Val Loss: 0.5624, Val Acc: 0.6406, Smoothed Val Loss: 0.5990, Gap: 0.0662, LR: 0.001197

Epoch 63/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 63: Train Loss: 0.6467, Train Acc: 0.7292, Val Loss: 0.5653, Val Acc: 0.6719, Smoothed Val Loss: 0.5954, Gap: 0.0573, LR: 0.001193

Epoch 64/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 64: Train Loss: 0.6479, Train Acc: 0.6935, Val Loss: 0.6641, Val Acc: 0.6250, Smoothed Val Loss: 0.6019, Gap: 0.0685, LR: 0.001188

Epoch 65/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 65: Train Loss: 0.6706, Train Acc: 0.7202, Val Loss: 0.5738, Val Acc: 0.6641, Smoothed Val Loss: 0.6052, Gap: 0.0562, LR: 0.001181

Epoch 66/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 66: Train Loss: 0.6444, Train Acc: 0.7158, Val Loss: 0.5865, Val Acc: 0.6562, Smoothed Val Loss: 0.5904, Gap: 0.0595, LR: 0.001172
Saved new best model with val_loss: 0.5865

Epoch 67/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 67: Train Loss: 0.6430, Train Acc: 0.7470, Val Loss: 0.7302, Val Acc: 0.6172, Smoothed Val Loss: 0.6240, Gap: 0.1298, LR: 0.001163

Epoch 68/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 68: Train Loss: 0.6584, Train Acc: 0.7247, Val Loss: 0.6038, Val Acc: 0.6406, Smoothed Val Loss: 0.6317, Gap: 0.0841, LR: 0.001151

Epoch 69/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 69: Train Loss: 0.6492, Train Acc: 0.7336, Val Loss: 0.7853, Val Acc: 0.6328, Smoothed Val Loss: 0.6559, Gap: 0.1008, LR: 0.001139

Epoch 70/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 70: Train Loss: 0.6521, Train Acc: 0.7247, Val Loss: 0.5748, Val Acc: 0.7031, Smoothed Val Loss: 0.6561, Gap: 0.0216, LR: 0.001125

Epoch 71/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 71: Train Loss: 0.6432, Train Acc: 0.7381, Val Loss: 0.5688, Val Acc: 0.6953, Smoothed Val Loss: 0.6526, Gap: 0.0428, LR: 0.001109

Epoch 72/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 72: Train Loss: 0.6297, Train Acc: 0.7396, Val Loss: 0.6000, Val Acc: 0.6719, Smoothed Val Loss: 0.6266, Gap: 0.0677, LR: 0.001093

Epoch 73/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 73: Train Loss: 0.6523, Train Acc: 0.7262, Val Loss: 0.5555, Val Acc: 0.6719, Smoothed Val Loss: 0.6169, Gap: 0.0543, LR: 0.001075

Epoch 74/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 74: Train Loss: 0.6376, Train Acc: 0.7634, Val Loss: 0.5578, Val Acc: 0.7266, Smoothed Val Loss: 0.5714, Gap: 0.0368, LR: 0.001056
Saved new best model with val_loss: 0.5578

Epoch 75/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 75: Train Loss: 0.6304, Train Acc: 0.7515, Val Loss: 0.5555, Val Acc: 0.6797, Smoothed Val Loss: 0.5675, Gap: 0.0718, LR: 0.001035
Saved new best model with val_loss: 0.5555

Epoch 76/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 76: Train Loss: 0.6582, Train Acc: 0.7292, Val Loss: 0.5723, Val Acc: 0.6875, Smoothed Val Loss: 0.5682, Gap: 0.0417, LR: 0.001014

Epoch 77/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 77: Train Loss: 0.6528, Train Acc: 0.7411, Val Loss: 0.5943, Val Acc: 0.6641, Smoothed Val Loss: 0.5671, Gap: 0.0770, LR: 0.000992
Saved new best model with val_loss: 0.5943

Epoch 78/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 78: Train Loss: 0.6415, Train Acc: 0.7470, Val Loss: 0.5560, Val Acc: 0.7031, Smoothed Val Loss: 0.5672, Gap: 0.0439, LR: 0.000968

Epoch 79/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 79: Train Loss: 0.6375, Train Acc: 0.7455, Val Loss: 0.5864, Val Acc: 0.6953, Smoothed Val Loss: 0.5729, Gap: 0.0502, LR: 0.000944

Epoch 80/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 80: Train Loss: 0.6325, Train Acc: 0.7708, Val Loss: 0.5641, Val Acc: 0.6875, Smoothed Val Loss: 0.5746, Gap: 0.0833, LR: 0.000919

Epoch 81/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 81: Train Loss: 0.6161, Train Acc: 0.7440, Val Loss: 0.5320, Val Acc: 0.7500, Smoothed Val Loss: 0.5666, Gap: -0.0060, LR: 0.000893
Saved new best model with val_loss: 0.5320

Epoch 82/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 82: Train Loss: 0.6427, Train Acc: 0.7560, Val Loss: 0.5497, Val Acc: 0.6719, Smoothed Val Loss: 0.5577, Gap: 0.0841, LR: 0.000867
Saved new best model with val_loss: 0.5497

Epoch 83/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 83: Train Loss: 0.6230, Train Acc: 0.7560, Val Loss: 0.5912, Val Acc: 0.6719, Smoothed Val Loss: 0.5647, Gap: 0.0841, LR: 0.000840

Epoch 84/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 84: Train Loss: 0.6469, Train Acc: 0.7530, Val Loss: 0.5560, Val Acc: 0.7109, Smoothed Val Loss: 0.5586, Gap: 0.0420, LR: 0.000812

Epoch 85/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 85: Train Loss: 0.6232, Train Acc: 0.7500, Val Loss: 0.5266, Val Acc: 0.7266, Smoothed Val Loss: 0.5511, Gap: 0.0234, LR: 0.000784
Saved new best model with val_loss: 0.5266

Epoch 86/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 86: Train Loss: 0.6253, Train Acc: 0.7366, Val Loss: 0.5721, Val Acc: 0.7109, Smoothed Val Loss: 0.5591, Gap: 0.0257, LR: 0.000755

Epoch 87/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 87: Train Loss: 0.6279, Train Acc: 0.7649, Val Loss: 0.5615, Val Acc: 0.7031, Smoothed Val Loss: 0.5615, Gap: 0.0618, LR: 0.000727

Epoch 88/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 88: Train Loss: 0.6226, Train Acc: 0.7307, Val Loss: 0.6371, Val Acc: 0.6484, Smoothed Val Loss: 0.5707, Gap: 0.0822, LR: 0.000698

Epoch 89/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 89: Train Loss: 0.6232, Train Acc: 0.7500, Val Loss: 0.5571, Val Acc: 0.6875, Smoothed Val Loss: 0.5709, Gap: 0.0625, LR: 0.000668

Epoch 90/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 90: Train Loss: 0.6343, Train Acc: 0.7946, Val Loss: 0.5118, Val Acc: 0.7422, Smoothed Val Loss: 0.5679, Gap: 0.0525, LR: 0.000639

Epoch 91/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 91: Train Loss: 0.6184, Train Acc: 0.7500, Val Loss: 0.5298, Val Acc: 0.6875, Smoothed Val Loss: 0.5595, Gap: 0.0625, LR: 0.000610

Epoch 92/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 92: Train Loss: 0.6390, Train Acc: 0.7842, Val Loss: 0.5758, Val Acc: 0.6484, Smoothed Val Loss: 0.5623, Gap: 0.1358, LR: 0.000580

Epoch 93/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 93: Train Loss: 0.6352, Train Acc: 0.7708, Val Loss: 0.5416, Val Acc: 0.7109, Smoothed Val Loss: 0.5432, Gap: 0.0599, LR: 0.000551
Saved new best model with val_loss: 0.5416

Epoch 94/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 94: Train Loss: 0.6171, Train Acc: 0.7693, Val Loss: 0.5252, Val Acc: 0.7188, Smoothed Val Loss: 0.5368, Gap: 0.0506, LR: 0.000522
Saved new best model with val_loss: 0.5252

Epoch 95/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 95: Train Loss: 0.6170, Train Acc: 0.7857, Val Loss: 0.5277, Val Acc: 0.7500, Smoothed Val Loss: 0.5400, Gap: 0.0357, LR: 0.000494

Epoch 96/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 96: Train Loss: 0.6172, Train Acc: 0.7812, Val Loss: 0.5259, Val Acc: 0.7500, Smoothed Val Loss: 0.5392, Gap: 0.0312, LR: 0.000466

Epoch 97/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 97: Train Loss: 0.6364, Train Acc: 0.7649, Val Loss: 0.5323, Val Acc: 0.7344, Smoothed Val Loss: 0.5305, Gap: 0.0305, LR: 0.000438
Saved new best model with val_loss: 0.5323

Epoch 98/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 98: Train Loss: 0.6157, Train Acc: 0.7946, Val Loss: 0.5340, Val Acc: 0.7031, Smoothed Val Loss: 0.5290, Gap: 0.0915, LR: 0.000411
Saved new best model with val_loss: 0.5340

Epoch 99/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 99: Train Loss: 0.6122, Train Acc: 0.7708, Val Loss: 0.5320, Val Acc: 0.6875, Smoothed Val Loss: 0.5304, Gap: 0.0833, LR: 0.000385

Epoch 100/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 100: Train Loss: 0.6161, Train Acc: 0.7723, Val Loss: 0.5487, Val Acc: 0.6875, Smoothed Val Loss: 0.5346, Gap: 0.0848, LR: 0.000359

Epoch 101/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 101: Train Loss: 0.6090, Train Acc: 0.7932, Val Loss: 0.5521, Val Acc: 0.7500, Smoothed Val Loss: 0.5398, Gap: 0.0432, LR: 0.000334

Epoch 102/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 102: Train Loss: 0.6088, Train Acc: 0.7783, Val Loss: 0.5250, Val Acc: 0.7188, Smoothed Val Loss: 0.5384, Gap: 0.0595, LR: 0.000310

Epoch 103/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 103: Train Loss: 0.6039, Train Acc: 0.7976, Val Loss: 0.5224, Val Acc: 0.7266, Smoothed Val Loss: 0.5360, Gap: 0.0711, LR: 0.000287

Epoch 104/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 104: Train Loss: 0.6310, Train Acc: 0.7946, Val Loss: 0.5288, Val Acc: 0.7266, Smoothed Val Loss: 0.5354, Gap: 0.0681, LR: 0.000264

Epoch 105/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 105: Train Loss: 0.5877, Train Acc: 0.8065, Val Loss: 0.5163, Val Acc: 0.7500, Smoothed Val Loss: 0.5289, Gap: 0.0565, LR: 0.000243
Saved new best model with val_loss: 0.5163

Epoch 106/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 106: Train Loss: 0.6130, Train Acc: 0.7857, Val Loss: 0.5168, Val Acc: 0.7422, Smoothed Val Loss: 0.5219, Gap: 0.0435, LR: 0.000223
Saved new best model with val_loss: 0.5168

Epoch 107/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 107: Train Loss: 0.6125, Train Acc: 0.7812, Val Loss: 0.5066, Val Acc: 0.7422, Smoothed Val Loss: 0.5182, Gap: 0.0391, LR: 0.000204
Saved new best model with val_loss: 0.5066

Epoch 108/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 108: Train Loss: 0.6040, Train Acc: 0.8036, Val Loss: 0.5127, Val Acc: 0.7578, Smoothed Val Loss: 0.5162, Gap: 0.0458, LR: 0.000186
Saved new best model with val_loss: 0.5127

Epoch 109/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 109: Train Loss: 0.6068, Train Acc: 0.7798, Val Loss: 0.5040, Val Acc: 0.7578, Smoothed Val Loss: 0.5113, Gap: 0.0219, LR: 0.000170
Saved new best model with val_loss: 0.5040

Epoch 110/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 110: Train Loss: 0.5938, Train Acc: 0.8185, Val Loss: 0.5041, Val Acc: 0.7578, Smoothed Val Loss: 0.5088, Gap: 0.0606, LR: 0.000154
Saved new best model with val_loss: 0.5041

Epoch 111/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 111: Train Loss: 0.5874, Train Acc: 0.8065, Val Loss: 0.5070, Val Acc: 0.7656, Smoothed Val Loss: 0.5069, Gap: 0.0409, LR: 0.000140
Saved new best model with val_loss: 0.5070

Epoch 112/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 112: Train Loss: 0.6049, Train Acc: 0.8304, Val Loss: 0.5109, Val Acc: 0.7500, Smoothed Val Loss: 0.5077, Gap: 0.0804, LR: 0.000128

Epoch 113/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 113: Train Loss: 0.5987, Train Acc: 0.7991, Val Loss: 0.5251, Val Acc: 0.7422, Smoothed Val Loss: 0.5102, Gap: 0.0569, LR: 0.000117

Epoch 114/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 114: Train Loss: 0.5874, Train Acc: 0.8051, Val Loss: 0.5110, Val Acc: 0.7578, Smoothed Val Loss: 0.5116, Gap: 0.0472, LR: 0.000107

Epoch 115/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 115: Train Loss: 0.6003, Train Acc: 0.8259, Val Loss: 0.5044, Val Acc: 0.7656, Smoothed Val Loss: 0.5117, Gap: 0.0603, LR: 0.000099

Epoch 116/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 116: Train Loss: 0.5856, Train Acc: 0.8244, Val Loss: 0.4973, Val Acc: 0.7734, Smoothed Val Loss: 0.5097, Gap: 0.0510, LR: 0.000092

Epoch 117/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 117: Train Loss: 0.5919, Train Acc: 0.8259, Val Loss: 0.5054, Val Acc: 0.7422, Smoothed Val Loss: 0.5086, Gap: 0.0837, LR: 0.000087

Epoch 118/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 118: Train Loss: 0.5814, Train Acc: 0.8244, Val Loss: 0.4984, Val Acc: 0.7734, Smoothed Val Loss: 0.5033, Gap: 0.0510, LR: 0.000083
Saved new best model with val_loss: 0.4984

Epoch 119/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 119: Train Loss: 0.5967, Train Acc: 0.8185, Val Loss: 0.4948, Val Acc: 0.7734, Smoothed Val Loss: 0.5001, Gap: 0.0450, LR: 0.000081
Saved new best model with val_loss: 0.4948

Epoch 120/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 120: Train Loss: 0.6129, Train Acc: 0.8378, Val Loss: 0.5019, Val Acc: 0.7578, Smoothed Val Loss: 0.4995, Gap: 0.0800, LR: 0.000080
Saved new best model with val_loss: 0.5019

Epoch 121/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 121: Train Loss: 0.5832, Train Acc: 0.8095, Val Loss: 0.4951, Val Acc: 0.7578, Smoothed Val Loss: 0.4991, Gap: 0.0517, LR: 0.000080
Saved new best model with val_loss: 0.4951

Epoch 122/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 122: Train Loss: 0.5863, Train Acc: 0.8185, Val Loss: 0.4972, Val Acc: 0.7734, Smoothed Val Loss: 0.4975, Gap: 0.0450, LR: 0.000079
Saved new best model with val_loss: 0.4972

Epoch 123/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 123: Train Loss: 0.5860, Train Acc: 0.8199, Val Loss: 0.4924, Val Acc: 0.7656, Smoothed Val Loss: 0.4963, Gap: 0.0543, LR: 0.000078
Saved new best model with val_loss: 0.4924

Epoch 124/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 124: Train Loss: 0.5835, Train Acc: 0.8274, Val Loss: 0.4915, Val Acc: 0.7734, Smoothed Val Loss: 0.4956, Gap: 0.0539, LR: 0.000077
Saved new best model with val_loss: 0.4915

Epoch 125/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 125: Train Loss: 0.5960, Train Acc: 0.8051, Val Loss: 0.4863, Val Acc: 0.7812, Smoothed Val Loss: 0.4925, Gap: 0.0238, LR: 0.000075
Saved new best model with val_loss: 0.4863

Epoch 126/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 126: Train Loss: 0.5863, Train Acc: 0.8274, Val Loss: 0.4838, Val Acc: 0.7578, Smoothed Val Loss: 0.4903, Gap: 0.0696, LR: 0.000072
Saved new best model with val_loss: 0.4838

Epoch 127/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 127: Train Loss: 0.5710, Train Acc: 0.8229, Val Loss: 0.4916, Val Acc: 0.7734, Smoothed Val Loss: 0.4891, Gap: 0.0495, LR: 0.000070
Saved new best model with val_loss: 0.4916

Epoch 128/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 128: Train Loss: 0.5961, Train Acc: 0.8318, Val Loss: 0.4940, Val Acc: 0.7578, Smoothed Val Loss: 0.4895, Gap: 0.0740, LR: 0.000067

Epoch 129/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 129: Train Loss: 0.5623, Train Acc: 0.8259, Val Loss: 0.4844, Val Acc: 0.7656, Smoothed Val Loss: 0.4880, Gap: 0.0603, LR: 0.000064
Saved new best model with val_loss: 0.4844

Epoch 130/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 130: Train Loss: 0.5909, Train Acc: 0.8155, Val Loss: 0.4879, Val Acc: 0.7422, Smoothed Val Loss: 0.4883, Gap: 0.0733, LR: 0.000060

Epoch 131/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 131: Train Loss: 0.5825, Train Acc: 0.8348, Val Loss: 0.4846, Val Acc: 0.7656, Smoothed Val Loss: 0.4885, Gap: 0.0692, LR: 0.000056

Epoch 132/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 132: Train Loss: 0.5999, Train Acc: 0.8363, Val Loss: 0.4872, Val Acc: 0.7500, Smoothed Val Loss: 0.4876, Gap: 0.0863, LR: 0.000052
Saved new best model with val_loss: 0.4872

Epoch 133/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 133: Train Loss: 0.5670, Train Acc: 0.8155, Val Loss: 0.5019, Val Acc: 0.7500, Smoothed Val Loss: 0.4892, Gap: 0.0655, LR: 0.000048

Epoch 134/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 134: Train Loss: 0.6011, Train Acc: 0.8155, Val Loss: 0.4855, Val Acc: 0.7734, Smoothed Val Loss: 0.4894, Gap: 0.0420, LR: 0.000044

Epoch 135/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 135: Train Loss: 0.5937, Train Acc: 0.8318, Val Loss: 0.4878, Val Acc: 0.7734, Smoothed Val Loss: 0.4894, Gap: 0.0584, LR: 0.000040

Epoch 136/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 136: Train Loss: 0.5921, Train Acc: 0.8348, Val Loss: 0.4825, Val Acc: 0.7734, Smoothed Val Loss: 0.4890, Gap: 0.0614, LR: 0.000036

Epoch 137/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 137: Train Loss: 0.5866, Train Acc: 0.8378, Val Loss: 0.4811, Val Acc: 0.7656, Smoothed Val Loss: 0.4878, Gap: 0.0722, LR: 0.000032

Epoch 138/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 138: Train Loss: 0.5793, Train Acc: 0.8125, Val Loss: 0.4799, Val Acc: 0.7734, Smoothed Val Loss: 0.4834, Gap: 0.0391, LR: 0.000028
Saved new best model with val_loss: 0.4799

Epoch 139/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 139: Train Loss: 0.5815, Train Acc: 0.8229, Val Loss: 0.4804, Val Acc: 0.7656, Smoothed Val Loss: 0.4823, Gap: 0.0573, LR: 0.000024
Saved new best model with val_loss: 0.4804

Epoch 140/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 140: Train Loss: 0.5803, Train Acc: 0.8378, Val Loss: 0.4803, Val Acc: 0.7734, Smoothed Val Loss: 0.4808, Gap: 0.0644, LR: 0.000020
Saved new best model with val_loss: 0.4803

Epoch 141/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 141: Train Loss: 0.5794, Train Acc: 0.8095, Val Loss: 0.4785, Val Acc: 0.7812, Smoothed Val Loss: 0.4800, Gap: 0.0283, LR: 0.000017
Saved new best model with val_loss: 0.4785

Epoch 142/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 142: Train Loss: 0.5834, Train Acc: 0.8393, Val Loss: 0.4764, Val Acc: 0.7656, Smoothed Val Loss: 0.4791, Gap: 0.0737, LR: 0.000014
Saved new best model with val_loss: 0.4764

Epoch 143/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 143: Train Loss: 0.5988, Train Acc: 0.8363, Val Loss: 0.4769, Val Acc: 0.7734, Smoothed Val Loss: 0.4785, Gap: 0.0629, LR: 0.000011
Saved new best model with val_loss: 0.4769

Epoch 144/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 144: Train Loss: 0.5979, Train Acc: 0.8185, Val Loss: 0.4777, Val Acc: 0.7656, Smoothed Val Loss: 0.4780, Gap: 0.0528, LR: 0.000008
Saved new best model with val_loss: 0.4777

Epoch 145/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 145: Train Loss: 0.5704, Train Acc: 0.8185, Val Loss: 0.4807, Val Acc: 0.7734, Smoothed Val Loss: 0.4781, Gap: 0.0450, LR: 0.000006

Epoch 146/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 146: Train Loss: 0.5823, Train Acc: 0.8259, Val Loss: 0.4760, Val Acc: 0.7656, Smoothed Val Loss: 0.4775, Gap: 0.0603, LR: 0.000004
Saved new best model with val_loss: 0.4760

Epoch 147/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 147: Train Loss: 0.5921, Train Acc: 0.8259, Val Loss: 0.4779, Val Acc: 0.7812, Smoothed Val Loss: 0.4778, Gap: 0.0446, LR: 0.000003

Epoch 148/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 148: Train Loss: 0.5802, Train Acc: 0.8304, Val Loss: 0.4792, Val Acc: 0.7734, Smoothed Val Loss: 0.4783, Gap: 0.0569, LR: 0.000002

Epoch 149/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 149: Train Loss: 0.5645, Train Acc: 0.8452, Val Loss: 0.4790, Val Acc: 0.7578, Smoothed Val Loss: 0.4785, Gap: 0.0874, LR: 0.000001

Epoch 150/150
Mixup alpha: 0.6


Training:   0%|          | 0/42 [00:00<?, ?it/s]

Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Epoch 150: Train Loss: 0.5798, Train Acc: 0.8036, Val Loss: 0.4783, Val Acc: 0.7734, Smoothed Val Loss: 0.4781, Gap: 0.0301, LR: 0.000001
Training completed


0,1
avg_grad_norm,▁▂▁▂▂▂▂▃▄▃▃▆▃▆▅▅▆▆██▄▆▅▆▆▆▅▅▅▅▆▅▅▆▅▅▅▅▄▄
epoch,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
learning_rate,▂▂▂▂▂▃▄▄▄▅▆▇▇▇█████▇▆▅▅▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁
metrics/acc_gap,▃▃▄▆▄▆▄▃▅█▄▂▃▃▇▂▅▁█▆█▃▆▄▆▁▇▆▅▃▆▅▄▅▃▅▄▃▅▄
mixup_alpha,█▅▅█▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/accuracy,▁▂▂▂▁▃▂▃▄▄▃▃▄▃▄▄▅▅▃▅▅▅▅▅▆▆▆▅▇▆▆▇▇▇▆█▇███
train/loss,█████▇▆▇▇▆▆▇▆▆▇▆▆▆▆▅▄▄▄▅▄▃▂▄▃▂▃▂▂▃▂▂▁▂▂▂
val/accuracy,▁▁▂▃▁▂▃▃▃▄▅▄▅▃▄▅▃▅▃▅▆▆▆▆▆▇▆▇█▇█████▇████
val/loss,▆▆▆▅▆▆▅▄█▅▄▅▄▄▃▇▃▃▄▃▃▂▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
val/smoothed_loss,██▇▇▇▇▆▆▆█▆▅▅▅▅▇▇▄▄▄▄▄▄▄▄▃▃▃▃▃▂▁▁▁▁▁▁▁▁▁

0,1
avg_grad_norm,2.50995
best_val_acc,0.76562
best_val_loss,0.47754
epoch,150.0
learning_rate,0.0
metrics/acc_gap,0.03013
mixup_alpha,0.6
train/accuracy,0.80357
train/loss,0.57978
val/accuracy,0.77344


In [22]:
state_dict = torch.load(f'model_{root_id}.pth')

children = hierarchy.get_children(root_id)
model = HierarchyNodeModel(len(children))

model.load_state_dict(state_dict)
model.to("cuda")

categories_dict = {child: hierarchy.get_leaf_nodes(
        child) for child in children}

test_loader = create_images_dataloader(
        categories_dict, split='test', batch_size=config.batch_size)

test_loss, test_acc = __validate(model, test_loader)

print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")

n03259280: 73 samples (51.05%)
n04111531: 70 samples (48.95%)


  state_dict = torch.load(f'model_{root_id}.pth')


Validation:   0%|          | 0/8 [00:00<?, ?it/s]

Test Loss: 0.3976, Test Acc: 0.8516
