# Лабораторная работа №7 (Проведение исследований моделями семантической сегментации)

## Создание бейзлайна и оценка качества

Загрузка библиотек, если не установлено

In [1]:
%pip install segmentation-models-pytorch

Defaulting to user installation because normal site-packages is not writeable
Collecting segmentation-models-pytorch
  Downloading segmentation_models_pytorch-0.5.0-py3-none-any.whl.metadata (17 kB)
Collecting huggingface-hub>=0.24 (from segmentation-models-pytorch)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.3.1 (from segmentation-models-pytorch)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting timm>=0.9 (from segmentation-models-pytorch)
  Downloading timm-1.0.15-py3-none-any.whl.metadata (52 kB)
Downloading segmentation_models_pytorch-0.5.0-py3-none-any.whl (154 kB)
Downloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (471 kB)
Downloading timm-1.0.15-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m45.7 MB/s[0m

In [72]:
%pip install einops

Defaulting to user installation because normal site-packages is not writeable
Collecting einops
  Downloading einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Downloading einops-0.8.1-py3-none-any.whl (64 kB)
Installing collected packages: einops
Successfully installed einops-0.8.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


Импорт библиотек

In [1]:
import cv2
import numpy as np
import torch
import torch.nn as nn
import torchvision
from segmentation_models_pytorch import Segformer, Unet
from sklearn.metrics import f1_score, hamming_loss
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.datasets import VOCSegmentation

import albumentations as A
from albumentations.pytorch import ToTensorV2

from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange

Создадим функции обучения и оценки

In [34]:
def train_one_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for images, masks in train_loader:
        images = images.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, masks)

        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Train Loss: {epoch_loss:.4f}")

In [11]:
def evaluate(model, val_loader, device):
    model.eval()
    
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, masks in val_loader:
            images = images.to(device)
            masks = masks.to(device)

            outputs = model(images)
            preds = torch.argmax(outputs, dim=1).cpu().numpy().flatten()
            masks = masks.cpu().numpy().flatten()

            all_preds.extend(preds)
            all_labels.extend(masks)

    # Вычисляем метрики
    # Пропускаем игнорируемые индексы
    idx = (np.array(all_labels) != 255)
    all_preds = np.array(all_preds)[idx]
    all_labels = np.array(all_labels)[idx]
    
    f1 = f1_score(all_labels, all_preds, average='weighted')
    h_loss = hamming_loss(all_labels, all_preds)

    # mAP как IoU среднее по всем классам
    mAP = np.mean([np.sum((all_preds == i) & (all_labels == i)) / np.sum((all_preds == i) | (all_labels == i)) 
                    for i in range(NUM_CLASSES) if np.sum(all_labels == i) > 0])

    print(f"Hamming Loss: {h_loss:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"mAP (mean IoU): {mAP:.4f}")

Зададим параметры

In [5]:
VOC_CLASSES = [
    "background",
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "potted plant",
    "sheep",
    "sofa",
    "train",
    "tv/monitor",
]

LEARNING_RATE = 0.0001
BATCH_SIZE = 10
NUM_CLASSES = len(VOC_CLASSES)
INPUT_DIM = (320, 320)

Загрузка датасетов

In [35]:
transform = transforms.Compose(
    [
        transforms.Resize(INPUT_DIM),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ]
)
target_transform = transforms.Compose(
    [
        transforms.Resize(
            INPUT_DIM, interpolation=transforms.InterpolationMode.NEAREST
        ),
        transforms.Lambda(lambda t: torch.as_tensor(np.array(t), dtype=torch.int64)),
    ]
)

train_dataset = VOCSegmentation(
    root="./data",
    year="2007",
    image_set="train",
    transform=transform,
    target_transform=target_transform,
)
test_dataset = VOCSegmentation(
    root="./data",
    year="2007",
    image_set="val",
    transform=transform,
    target_transform=target_transform,
)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

### Обучение моделей

Инициализация моделей Unet и Segformer

In [7]:
device = torch.device("cpu") # torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_unet = Unet(
    encoder_name="resnet34",
    encoder_weights="imagenet",
    classes=NUM_CLASSES,
    activation=None
)
model_unet.to(device)
criterion_unet = nn.CrossEntropyLoss(ignore_index=255)
optimizer_unet = torch.optim.Adam(model_unet.parameters(), lr=LEARNING_RATE)

model_segformer = Segformer(
    encoder_name="resnet34",
    encoder_weights="imagenet",
    classes=NUM_CLASSES,
)
model_segformer.to(device)
criterion_segformer = nn.CrossEntropyLoss(ignore_index=255)
optimizer_segformer = torch.optim.Adam(model_segformer.parameters(), lr=LEARNING_RATE)

Обучение модели Unet

In [10]:
train_one_epoch(
    model=model_unet,
    train_loader=train_loader,
    criterion=criterion_unet,
    optimizer=optimizer_unet,
    device=device,
)

Train Loss: 2.9815


Обучение модели Segformer

In [41]:
train_one_epoch(
    model=model_segformer,
    train_loader=train_loader,
    criterion=criterion_segformer,
    optimizer=optimizer_segformer,
    device=device,
)

Train Loss: 3.8079


### Оценка качества модели

Оценка модели Unet

In [17]:
evaluate(model=model_unet, val_loader=test_loader, device=device)

Hamming Loss: 0.7594
F1 Score: 0.3290
mAP (mean IoU): 0.0290


Оценка модели Segformer

In [42]:
evaluate(model=model_segformer, val_loader=test_loader, device=device)

Hamming Loss: 0.9767
F1 Score: 0.0161
mAP (mean IoU): 0.0175


## Улучшение бейзлайна

### Формулирование гипотез

#### Гипотеза 1: Аугментация данных

Цель: Повышение генерализирующей способности модели путем создания более разнообразного набора данных.

#### Гипотеза 2: Подбор гиперпараметров

Цель: Найти оптимальные гиперпараметры для улучшения обучения и обобщения.

### Проверка гипотез

#### Аугментация данных

Настройка аугментации и создание датасета

In [42]:
# Класс для аугментации данных и преобразования в тензоры
class VOCDataset(Dataset):
    def __init__(self, voc_dataset, augmentations=None):
        self.voc_dataset = voc_dataset
        self.augmentations = augmentations

    def __len__(self):
        return len(self.voc_dataset)

    def __getitem__(self, idx):
        img, mask = self.voc_dataset[idx]
        img = np.array(img)
        mask = np.array(mask)

        if self.augmentations:
            augmented = self.augmentations(image=img, mask=mask)
            img = augmented['image']
            mask = augmented['mask']

        # конвертация маски обратно в long тип для CrossEntropyLoss
        mask = mask.clone().detach().type(torch.long)

        return img, mask

# Аугментация
augmentations = A.Compose([
    A.Resize(*INPUT_DIM),
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.1),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2(transpose_mask=True),
])

# Создание датасета с аугментацией
train_dataset_aug_orig = VOCSegmentation(
    root="./data",
    year="2007",
    image_set="train",
)

train_dataset_aug = VOCDataset(train_dataset_aug_orig, augmentations=augmentations)

train_loader_aug = DataLoader(
    dataset=train_dataset_aug,
    batch_size=BATCH_SIZE,
    shuffle=True
)

Обучение и оценка модели Unet

In [30]:
model_unet = Unet(
    encoder_name="resnet34",
    encoder_weights="imagenet",
    classes=NUM_CLASSES,
    activation=None
)
model_unet.to(device)

train_one_epoch(
    model=model_unet,
    train_loader=train_loader_aug,
    criterion=criterion_unet,
    optimizer=optimizer_unet,
    device=device,
)

evaluate(model=model_unet, val_loader=test_loader, device=device)

Train Loss: 3.4658
Hamming Loss: 0.9861
F1 Score: 0.0100
mAP (mean IoU): 0.0075


Обучение и оценка модели Segformer

In [44]:
model_segformer = Segformer(
    encoder_name="resnet34",
    encoder_weights="imagenet",
    classes=NUM_CLASSES,
)
model_segformer.to(device)

train_one_epoch(
    model=model_segformer,
    train_loader=train_loader_aug,
    criterion=criterion_segformer,
    optimizer=optimizer_segformer,
    device=device,
)

evaluate(model=model_segformer, val_loader=test_loader, device=device)

Train Loss: 3.5348
Hamming Loss: 0.9725
F1 Score: 0.0433
mAP (mean IoU): 0.0052


#### Подбор гиперпараметров

Попробуем увелечить количество эпох и уменьшить learning_rate

In [37]:
EPOCHS=10
LEARNING_RATE=0.00001

Обучение и оценка модели Unet

In [38]:
model_unet = Unet(
    encoder_name="resnet34",
    encoder_weights="imagenet",
    classes=NUM_CLASSES,
    activation=None
)
model_unet.to(device)
criterion_unet = nn.CrossEntropyLoss(ignore_index=255)
optimizer_unet = torch.optim.Adam(model_unet.parameters(), lr=LEARNING_RATE)

for i in range(EPOCHS):
    print(f"EPOCH {i+1}")
    train_one_epoch(
        model=model_unet,
        train_loader=train_loader,
        criterion=criterion_unet,
        optimizer=optimizer_unet,
        device=device,
    )

evaluate(model=model_unet, val_loader=test_loader, device=device)

EPOCH 1
Train Loss: 2.9824
EPOCH 2
Train Loss: 2.9054
EPOCH 3
Train Loss: 2.8424
EPOCH 4
Train Loss: 2.7843
EPOCH 5
Train Loss: 2.7265
EPOCH 6
Train Loss: 2.6712
EPOCH 7
Train Loss: 2.6189
EPOCH 8
Train Loss: 2.5713
EPOCH 9
Train Loss: 2.5276
EPOCH 10
Train Loss: 2.4873
Hamming Loss: 0.5363
F1 Score: 0.5508
mAP (mean IoU): 0.0493


Обучение и оценка модели Segformer

In [51]:
model_segformer = Segformer(
    encoder_name="resnet34",
    encoder_weights="imagenet",
    classes=NUM_CLASSES,
)
model_segformer.to(device)
criterion_segformer = nn.CrossEntropyLoss(ignore_index=255)
optimizer_segformer = torch.optim.Adam(model_unet.parameters(), lr=LEARNING_RATE)

for i in range(EPOCHS):
    print(f"EPOCH {i+1}")
    train_one_epoch(
        model=model_segformer,
        train_loader=train_loader,
        criterion=criterion_segformer,
        optimizer=optimizer_segformer,
        device=device,
    )

evaluate(model=model_segformer, val_loader=test_loader, device=device)

EPOCH 1
Train Loss: 3.1200
EPOCH 2
Train Loss: 3.1246
EPOCH 3
Train Loss: 3.1145
EPOCH 4
Train Loss: 3.1172
EPOCH 5
Train Loss: 3.1169
EPOCH 6
Train Loss: 3.1220
EPOCH 7
Train Loss: 3.1169
EPOCH 8
Train Loss: 3.1207
EPOCH 9
Train Loss: 3.1251
EPOCH 10
Train Loss: 3.1106
Hamming Loss: 0.9125
F1 Score: 0.1435
mAP (mean IoU): 0.0109


### Формирование улучшенного бейзлайна

Сформируем датасет с аугментацией

In [52]:
train_dataset_orig = VOCSegmentation(
    root="./data",
    year="2007",
    image_set="train",
)

train_dataset = VOCDataset(train_dataset_orig, augmentations=augmentations)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

Сформируем бейзлайн для Unet

In [57]:
EPOCHS=10
LEARNING_RATE=0.00001

model_unet = Unet(
    encoder_name="resnet34",
    encoder_weights="imagenet",
    classes=NUM_CLASSES,
    activation=None
)
model_unet.to(device)
criterion_unet = nn.CrossEntropyLoss(ignore_index=255)
optimizer_unet = torch.optim.Adam(model_unet.parameters(), lr=LEARNING_RATE)

Сформируем бейзлайн для Segformer

In [58]:
model_segformer = Segformer(
    encoder_name="resnet34",
    encoder_weights="imagenet",
    classes=NUM_CLASSES,
)
model_segformer.to(device)
criterion_segformer = nn.CrossEntropyLoss(ignore_index=255)
optimizer_segformer = torch.optim.Adam(model_unet.parameters(), lr=LEARNING_RATE)

### Обучение моделей

Обучение Unet

In [59]:
for i in range(EPOCHS):
    print(f"EPOCH {i+1}")
    train_one_epoch(
        model=model_unet,
        train_loader=train_loader,
        criterion=criterion_unet,
        optimizer=optimizer_unet,
        device=device,
    )

EPOCH 1
Train Loss: 3.0118
EPOCH 2
Train Loss: 2.9388
EPOCH 3
Train Loss: 2.8761
EPOCH 4
Train Loss: 2.8160
EPOCH 5
Train Loss: 2.7545
EPOCH 6
Train Loss: 2.7000
EPOCH 7
Train Loss: 2.6372
EPOCH 8
Train Loss: 2.5788
EPOCH 9
Train Loss: 2.5146
EPOCH 10
Train Loss: 2.4750


Обучение Segformer

In [60]:
for i in range(EPOCHS):
    print(f"EPOCH {i+1}")
    train_one_epoch(
        model=model_segformer,
        train_loader=train_loader,
        criterion=criterion_segformer,
        optimizer=optimizer_segformer,
        device=device,
    )

EPOCH 1
Train Loss: 3.2782
EPOCH 2
Train Loss: 3.2932
EPOCH 3
Train Loss: 3.2859
EPOCH 4
Train Loss: 3.2951
EPOCH 5
Train Loss: 3.2790
EPOCH 6
Train Loss: 3.2915
EPOCH 7
Train Loss: 3.2887
EPOCH 8
Train Loss: 3.2860
EPOCH 9
Train Loss: 3.2871
EPOCH 10
Train Loss: 3.2818


### Оценка моделей

Оценка Unet

In [61]:
evaluate(model=model_unet, val_loader=test_loader, device=device)

Hamming Loss: 0.6631
F1 Score: 0.4430
mAP (mean IoU): 0.0606


Оценка Segformer

In [62]:
evaluate(model=model_segformer, val_loader=test_loader, device=device)

Hamming Loss: 0.9582
F1 Score: 0.0705
mAP (mean IoU): 0.0057


### Сравнение результатов

|              | Unet (base)  | Unet (upgrade) | Segformer (base)    | Segformer (upgrade) |
|--------------|:------------:|:--------------:|:-------------------:|:-------------------:|
| Hamming Loss | 0.7594       | 0.6631         | 0.9767              | 0.9582              |
| F1 Score     | 0.3290       | 0.4430         | 0.0161              | 0.0705              |
| mAP          | 0.0290       | 0.0606         | 0.0175              | 0.0057              |

### Выводы

Метрики для Unet стали немного лучше. Возможно, следовало бы увеличить количество эпох -- тогда бы результат стал ещё лучше. Метрики Segformer остались плохими. Нужно строить и проверять другие гипотезы.

## Имплементация алгоритма машинного обучения 

### Имплементации моделей

Напишем упрощенную имплементацию модели Unet

In [2]:
class UNet(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(UNet, self).__init__()
        self.encoder = nn.Sequential(
            self.conv_block(in_channels, 64),
            self.conv_block(64, 128),
        )
        
        self.decoder = nn.Sequential(
            self.conv_block(128, 64),
            nn.Conv2d(64, out_channels, kernel_size=1)
        )
        
    def conv_block(self, in_channels, out_channels):
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU()
        )
    
    def forward(self, x):
        x1 = self.encoder(x)
        x2 = self.decoder(x1)
        return x2

Напишем упрощенную имлементацию Vision Transformer

In [32]:
class SimpleViT(nn.Module):
    def __init__(self, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels=3):
        super().__init__()

        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_size // patch_size) ** 2
        patch_dim = channels * patch_size ** 2
        
        self.patch_size = patch_size

        # Переход на патчей
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_size, p2=patch_size),
            nn.Linear(patch_dim, dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches, dim))

        encoder_layer = nn.TransformerEncoderLayer(d_model=dim, nhead=heads, dim_feedforward=mlp_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=depth)

        # Генерация выхода
        self.to_out = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        x += self.pos_embedding

        x = self.transformer_encoder(x)

        x = self.to_out(x)  # (B, num_patches, num_classes)

        # Рассчитываем размер перед изменением формы
        n_patches = img.shape[2] // self.patch_size
        x = x.permute(0, 2, 1).reshape(img.shape[0], -1, n_patches, n_patches)
        
        # Интерполяция до исходного размера изображения
        x = nn.functional.interpolate(x, size=(img.shape[2], img.shape[3]), mode='bilinear', align_corners=False)

        return x

### Обучение моделей

Подготовка датасета

In [8]:
transform = transforms.Compose(
    [
        transforms.Resize(INPUT_DIM),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ]
)
target_transform = transforms.Compose(
    [
        transforms.Resize(
            INPUT_DIM, interpolation=transforms.InterpolationMode.NEAREST
        ),
        transforms.Lambda(lambda t: torch.as_tensor(np.array(t), dtype=torch.int64)),
    ]
)

train_dataset = VOCSegmentation(
    root="./data",
    year="2007",
    image_set="train",
    transform=transform,
    target_transform=target_transform,
)
test_dataset = VOCSegmentation(
    root="./data",
    year="2007",
    image_set="val",
    transform=transform,
    target_transform=target_transform,
)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

Обучение сверточной модели

In [12]:
LEARNING_RATE = 0.0001

in_channels = 3
out_channels = NUM_CLASSES
    
model_unet_simple = UNet(in_channels, out_channels).to(device)
criterion_unet_simple = nn.CrossEntropyLoss(ignore_index=255)
optimizer_unet_simple = torch.optim.Adam(model_unet_simple.parameters(), lr=LEARNING_RATE)

train_one_epoch(
    model=model_unet_simple,
    train_loader=train_loader,
    criterion=criterion_unet_simple,
    optimizer=optimizer_unet_simple,
    device=device,
)

Train Loss: 2.9463


Обучение трансформерной модели

In [35]:
LEARNING_RATE = 0.0001
image_size = INPUT_DIM[0]
patch_size = 16
num_classes = NUM_CLASSES
dim = 64
depth = 6
heads = 8
mlp_dim = 128

model_vit = SimpleViT(image_size=image_size, patch_size=patch_size, num_classes=num_classes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim).to(device)
criterion_vit = nn.CrossEntropyLoss(ignore_index=255)
optimizer_vit = torch.optim.Adam(model_vit.parameters(), lr=LEARNING_RATE)

train_one_epoch(
    model=model_vit,
    train_loader=train_loader,
    criterion=criterion_vit,
    optimizer=optimizer_vit,
    device=device,
)

Train Loss: 2.7387


### Оценка моделей

Оценка сверточной модели

In [37]:
evaluate(model=model_unet_simple, val_loader=test_loader, device=device)

Hamming Loss: 0.3307
F1 Score: 0.6025
mAP (mean IoU): 0.0341


Оценка трансформерной модели

In [38]:
evaluate(model=model_vit, val_loader=test_loader, device=device)

Hamming Loss: 0.2622
F1 Score: 0.6268
mAP (mean IoU): 0.0352


### Сравнение результатов с базовыми моделями

|              | Unet (base)  | Simple Unet | Segformer (base) | Simple ViT          |
|--------------|:------------:|:-----------:|:----------------:|:-------------------:|
| Hamming Loss | 0.7594       | 0.3307      | 0.9767           | 0.2622              |
| F1 Score     | 0.3290       | 0.6025      | 0.0161           | 0.6268              |
| mAP          | 0.0290       | 0.0341      | 0.0175           | 0.0352              |

### Выводы

Имплементированные модели могли оказаться лучше базовых, потому что они были оптимизированы для конкретных данных, что повысило их эффективность и точность. Либо базовые модели были плохо настроены изначально.

### Добавление техник из улучшенного бейзлайна

Формирование датасета

In [43]:
train_dataset_orig = VOCSegmentation(
    root="./data",
    year="2007",
    image_set="train",
)

train_dataset = VOCDataset(train_dataset_orig, augmentations=augmentations)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

Улучшение сверточной модели

In [45]:
EPOCHS = 10
LEARNING_RATE = 0.0001

model_unet_simple = UNet(in_channels, out_channels).to(device)
criterion_unet_simple = nn.CrossEntropyLoss(ignore_index=255)
optimizer_unet_simple = torch.optim.Adam(model_unet_simple.parameters(), lr=LEARNING_RATE)

Улучшение трансформерной модели

In [46]:
model_vit = SimpleViT(image_size=image_size, patch_size=patch_size, num_classes=num_classes, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim).to(device)
criterion_vit = nn.CrossEntropyLoss(ignore_index=255)
optimizer_vit = torch.optim.Adam(model_vit.parameters(), lr=LEARNING_RATE)

### Обучение улучшенных моделей

Обучение сверточной модели

In [47]:
for i in range(EPOCHS):
    print(f"EPOCH {i+1}")
    train_one_epoch(
        model=model_unet_simple,
        train_loader=train_loader,
        criterion=criterion_unet_simple,
        optimizer=optimizer_unet_simple,
        device=device,
    )

EPOCH 1
Train Loss: 2.7845
EPOCH 2
Train Loss: 1.7260
EPOCH 3
Train Loss: 1.5469
EPOCH 4
Train Loss: 1.4593
EPOCH 5
Train Loss: 1.3986
EPOCH 6
Train Loss: 1.3675
EPOCH 7
Train Loss: 1.3531
EPOCH 8
Train Loss: 1.3353
EPOCH 9
Train Loss: 1.3103
EPOCH 10
Train Loss: 1.2957


Обучение трансформерной модели

In [48]:
for i in range(EPOCHS):
    print(f"EPOCH {i+1}")
    train_one_epoch(
        model=model_vit,
        train_loader=train_loader,
        criterion=criterion_vit,
        optimizer=optimizer_vit,
        device=device,
    )

EPOCH 1
Train Loss: 2.5208
EPOCH 2
Train Loss: 1.9067
EPOCH 3
Train Loss: 1.5717
EPOCH 4
Train Loss: 1.4242
EPOCH 5
Train Loss: 1.3585
EPOCH 6
Train Loss: 1.3353
EPOCH 7
Train Loss: 1.3225
EPOCH 8
Train Loss: 1.3163
EPOCH 9
Train Loss: 1.3105
EPOCH 10
Train Loss: 1.3068


### Оценка улучшенных моделей

Оценка сверточной модели

In [49]:
evaluate(model=model_unet_simple, val_loader=test_loader, device=device)

Hamming Loss: 0.2618
F1 Score: 0.6270
mAP (mean IoU): 0.0352


Оценка трансформерной модели

In [50]:
evaluate(model=model_vit, val_loader=test_loader, device=device)

Hamming Loss: 0.2618
F1 Score: 0.6270
mAP (mean IoU): 0.0352


### Сравнение результатов с улучшенным бейзлайном

|              |Unet (upgrade)| Simple Unet (upgrade) |Segformer (upgrade)| Simple ViT (upgrade) |
|--------------|:------------:|:---------------------:|:-----------------:|:--------------------:|
| Hamming Loss | 0.6631       | 0.2618                | 0.9582            | 0.2618               |
| F1 Score     | 0.4430       | 0.6270                | 0.0705            | 0.6270               |
| mAP          | 0.0606       | 0.0352                | 0.0057            | 0.0352               |


### Выводы

Имплементированные модели могли оказаться лучше базовых , потому что они были оптимизированы для конкретных данных, что повысило их эффективность и точность. Либо базовые модели были плохо настроены изначально.