# Modelovanie a vyhodnotenie DETR

**Poznámka:** Aby sme si uľahčili prácu so súbormi údajov, obrázkami a modelmi, vytvoríme konštantu `HOME`.

In [1]:
import os
# Remember the directory the notebook was started from; the training cell
# later changes back to it with `%cd {HOME}`.
HOME = os.getcwd()
print(HOME)

/content


## Nastavenie prostredia

In [None]:
# Install the notebook's dependencies.
# NOTE(review): supervision is installed from the TestPyPI index instead of
# the default package index — confirm this is intentional. Only supervision
# is version-pinned; the remaining packages resolve to whatever is current.
!pip install -i https://test.pypi.org/simple/ supervision==0.3.0
!pip install -q transformers
!pip install -q pytorch-lightning
!pip install -q timm

In [None]:
import torch
import cv2
!nvcc --version
TORCH_VERSION = ".".join(torch.__version__.split(".")[:2])
CUDA_VERSION = torch.__version__.split("+")[-1]
print("torch: ", TORCH_VERSION, "; cuda: ", CUDA_VERSION)

import supervision
import transformers
import pytorch_lightning

print(
    "; supervision:", supervision.__version__,
    "; transformers:", transformers.__version__,
    "; pytorch_lightning:", pytorch_lightning.__version__
)

### Načítanie predtrénovaného modelu DETR z kontrolného bodu `CHECKPOINT`

In [None]:
import torch
from transformers import DetrForObjectDetection, DetrImageProcessor
import supervision as sv

# settings
# Use the first CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Name of the pretrained checkpoint handed to `from_pretrained` below.
CHECKPOINT = 'facebook/detr-resnet-50'
# Score threshold passed to post-processing in the inference cell.
CONFIDENCE_TRESHOLD = 0.5
# NOTE(review): not referenced anywhere in the visible notebook — the
# inference cell passes a literal threshold to `with_nms`; confirm intent.
IOU_TRESHOLD = 0.8

# Load the image processor and the pretrained detector, then move the
# detector to the selected device.
image_processor = DetrImageProcessor.from_pretrained(CHECKPOINT)
model = DetrForObjectDetection.from_pretrained(CHECKPOINT)
model.to(DEVICE)

In [30]:
# Root directory of the dataset; the "train", "valid" and "test" split
# directories are resolved relative to it in the next cell.
dataset_location = 'Meteory_format_COCO_aug'

### Vytváranie dátových loaderov (DataLoader) na trénovanie, validáciu a testovanie modelu

In [None]:
import os
import torchvision
from torch.utils.data import DataLoader

# settings
# File name of the COCO annotation file expected inside every split directory.
ANNOTATION_FILE_NAME = "_annotations.coco.json"
# One directory per split, all located under `dataset_location`.
TRAIN_DIRECTORY = os.path.join(dataset_location, "train")
VAL_DIRECTORY = os.path.join(dataset_location, "valid")
TEST_DIRECTORY = os.path.join(dataset_location, "test")


class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO-format detection dataset whose samples are run through an image processor.

    The annotation file is expected at
    ``<image_directory_path>/<ANNOTATION_FILE_NAME>``.

    Args:
        image_directory_path: Directory holding the images and the annotation file.
        image_processor: Callable invoked as
            ``image_processor(images=..., annotations=..., return_tensors="pt")``;
            its result must expose ``"pixel_values"`` and ``"labels"``.
        train: Accepted for call-site symmetry between splits; this class
            does not read it.
    """

    def __init__(
        self,
        image_directory_path: str,
        image_processor,
        train: bool = True
    ):
        annotation_file_path = os.path.join(image_directory_path, ANNOTATION_FILE_NAME)
        super().__init__(image_directory_path, annotation_file_path)
        self.image_processor = image_processor

    def __getitem__(self, idx):
        """Return ``(pixel_values, target)`` for the sample at position ``idx``."""
        images, annotations = super().__getitem__(idx)
        image_id = self.ids[idx]
        annotations = {'image_id': image_id, 'annotations': annotations}
        encoding = self.image_processor(images=images, annotations=annotations, return_tensors="pt")
        # Drop only the leading batch axis added by the processor. A bare
        # `.squeeze()` would also remove any other axis of size 1.
        pixel_values = encoding["pixel_values"].squeeze(0)
        target = encoding["labels"][0]

        return pixel_values, target


# One dataset per split; all three share the same image processor and differ
# only in the directory they read and in the `train` flag they are given.
TRAIN_DATASET = CocoDetection(TRAIN_DIRECTORY, image_processor, train=True)
VAL_DATASET = CocoDetection(VAL_DIRECTORY, image_processor, train=False)
TEST_DATASET = CocoDetection(TEST_DIRECTORY, image_processor, train=False)

def collate_fn(batch):
    """Assemble a list of ``(pixel_values, target)`` samples into one batch dict.

    The per-sample pixel tensors are handed to ``image_processor.pad``; its
    ``pixel_values`` and ``pixel_mask`` entries are returned together with the
    per-sample targets, which stay a plain list under ``labels``.
    """
    images = [sample[0] for sample in batch]
    padded = image_processor.pad(images, return_tensors="pt")
    targets = [sample[1] for sample in batch]
    return dict(
        pixel_values=padded['pixel_values'],
        pixel_mask=padded['pixel_mask'],
        labels=targets,
    )

# Batch loaders for the three splits (16 samples per batch); only the
# training loader shuffles its samples.
TRAIN_DATALOADER = DataLoader(dataset=TRAIN_DATASET, collate_fn=collate_fn, batch_size=16, shuffle=True)
VAL_DATALOADER = DataLoader(dataset=VAL_DATASET, collate_fn=collate_fn, batch_size=16)
TEST_DATALOADER = DataLoader(dataset=TEST_DATASET, collate_fn=collate_fn, batch_size=16)

# Category table of the test split, reduced to a {category id: name} mapping.
categories = TEST_DATASET.coco.cats
id2label = {k: v['name'] for k,v in categories.items()}
box_annotator = sv.BoxAnnotator()

# Report the size of each split.
print("Number of training examples:", len(TRAIN_DATASET))
print("Number of validation examples:", len(VAL_DATASET))
print("Number of test examples:", len(TEST_DATASET))

## Tréning modelu pomocou PyTorch Lightning

### DETR implementovaný pomocou PyTorch Lightning bez Optuna


In [34]:
import pytorch_lightning as pl
from transformers import DetrForObjectDetection
import torch

# Lightning wrapper around the pretrained DETR detector (baseline, no hyper-parameter search)
class Detr(pl.LightningModule):
    """Fine-tunes ``DetrForObjectDetection`` loaded from ``CHECKPOINT``.

    Args:
        lr: Learning rate for every parameter outside the backbone.
        lr_backbone: Learning rate for parameters whose name contains "backbone".
        weight_decay: Weight decay handed to AdamW.
    """

    def __init__(self, lr, lr_backbone, weight_decay):
        super().__init__()
        # The head is sized from the dataset's label map.
        # `ignore_mismatched_sizes` concerns checkpoint weights whose shape
        # differs from the model being built (not image sizes); presumably it
        # is needed here because `num_labels` differs from the checkpoint's.
        self.model = DetrForObjectDetection.from_pretrained(
            pretrained_model_name_or_path=CHECKPOINT,
            num_labels=len(id2label),
            ignore_mismatched_sizes=True,
        )

        # Optimiser settings, read back in `configure_optimizers`.
        self.lr = lr
        self.lr_backbone = lr_backbone
        self.weight_decay = weight_decay

    def forward(self, pixel_values, pixel_mask):
        """Run the wrapped detector without labels and return its output."""
        return self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    def common_step(self, batch, batch_idx):
        """Compute ``(loss, loss_dict)`` for one batch; shared by training and validation."""
        # Every tensor of every per-image target is moved to the module's device.
        targets = []
        for target in batch["labels"]:
            targets.append({name: tensor.to(self.device) for name, tensor in target.items()})

        result = self.model(
            pixel_values=batch["pixel_values"],
            pixel_mask=batch["pixel_mask"],
            labels=targets,
        )
        return result.loss, result.loss_dict

    def training_step(self, batch, batch_idx):
        """Log the total loss and each loss component, then return the total loss."""
        loss, loss_dict = self.common_step(batch, batch_idx)
        self.log("training_loss", loss)
        for name, value in loss_dict.items():
            self.log(f"train_{name}", value.item())

        return loss

    def validation_step(self, batch, batch_idx):
        """Same as ``training_step`` but logging under the validation keys."""
        loss, loss_dict = self.common_step(batch, batch_idx)
        self.log("validation/loss", loss)
        for name, value in loss_dict.items():
            self.log(f"validation_{name}", value.item())

        return loss

    def configure_optimizers(self):
        """Build AdamW with a separate learning rate for the backbone parameters."""
        backbone_params = []
        other_params = []
        for name, param in self.named_parameters():
            if not param.requires_grad:
                continue
            if "backbone" in name:
                backbone_params.append(param)
            else:
                other_params.append(param)

        param_groups = [
            {"params": other_params},
            {"params": backbone_params, "lr": self.lr_backbone},
        ]
        return torch.optim.AdamW(param_groups, lr=self.lr, weight_decay=self.weight_decay)

    def train_dataloader(self):
        """Return the notebook-level training loader."""
        return TRAIN_DATALOADER

    def val_dataloader(self):
        """Return the notebook-level validation loader."""
        return VAL_DATALOADER

# Instantiate the Lightning module; this rebinds the notebook-level `model`.
model = Detr(lr=1e-4, lr_backbone=1e-5, weight_decay=1e-4)

# Sanity check: push one training batch through the model and display the
# shape of the returned logits.
batch = next(iter(TRAIN_DATALOADER))
outputs = model(pixel_values=batch['pixel_values'], pixel_mask=batch['pixel_mask'])
outputs.logits.shape

In [None]:
from pytorch_lightning import Trainer

%cd {HOME}

MAX_EPOCHS = 100

trainer = Trainer(devices=1, accelerator="gpu", max_epochs=MAX_EPOCHS, gradient_clip_val=0.1, accumulate_grad_batches=8, log_every_n_steps=5)

trainer.fit(model)

### DETR implementovaný pomocou PyTorch Lightning s Optuna

In [None]:
# Extra dependencies for the hyper-parameter search: `coco_eval` provides the
# CocoEvaluator imported in the next cell, `optuna` drives the search.
!pip install coco_eval
!pip install optuna

In [50]:
import pytorch_lightning as pl
from transformers import DetrForObjectDetection
import torch
from coco_eval import CocoEvaluator
from tqdm.notebook import tqdm
import numpy as np

# Lightning wrapper around DETR with a configurable number of encoder/decoder layers
class Detr(pl.LightningModule):
    """Fine-tunes ``DetrForObjectDetection`` loaded from ``CHECKPOINT``.

    Args:
        lr: Learning rate for every parameter outside the backbone.
        lr_backbone: Learning rate for parameters whose name contains "backbone".
        weight_decay: Weight decay handed to AdamW.
        num_encoder_layers: Number of transformer encoder layers to build.
        num_decoder_layers: Number of transformer decoder layers to build.
    """

    def __init__(self, lr, lr_backbone, weight_decay, num_encoder_layers, num_decoder_layers):
        super().__init__()
        # The layer counts must be given to `from_pretrained`, which applies
        # them to the configuration before the network is constructed.
        # Assigning them to `model.config` afterwards (as was done before)
        # only edits the config object and leaves the built network unchanged.
        # `ignore_mismatched_sizes` concerns checkpoint weights whose shape
        # differs from the model being built (not image sizes).
        self.model = DetrForObjectDetection.from_pretrained(
            pretrained_model_name_or_path=CHECKPOINT,
            num_labels=len(id2label),
            encoder_layers=num_encoder_layers,
            decoder_layers=num_decoder_layers,
            ignore_mismatched_sizes=True,
        )

        # Optimiser settings and the requested architecture, kept for reference.
        self.lr = lr
        self.lr_backbone = lr_backbone
        self.weight_decay = weight_decay
        self.num_encoder_layers = num_encoder_layers
        self.num_decoder_layers = num_decoder_layers

    def forward(self, pixel_values, pixel_mask):
        """Run the wrapped detector without labels and return its output."""
        return self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    def common_step(self, batch, batch_idx):
        """Compute ``(loss, loss_dict)`` for one batch; shared by training and validation."""
        pixel_values = batch["pixel_values"]
        pixel_mask = batch["pixel_mask"]
        # Every tensor of every per-image target is moved to the module's device.
        labels = [{k: v.to(self.device) for k, v in t.items()} for t in batch["labels"]]

        outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
        loss = outputs.loss
        loss_dict = outputs.loss_dict

        return loss, loss_dict

    def training_step(self, batch, batch_idx):
        """Log the total loss and each loss component, then return the total loss."""
        loss, loss_dict = self.common_step(batch, batch_idx)
        self.log("training_loss", loss)
        for k, v in loss_dict.items():
            self.log("train_" + k, v.item())

        return loss

    def validation_step(self, batch, batch_idx):
        """Same as ``training_step`` but logging under the validation keys."""
        loss, loss_dict = self.common_step(batch, batch_idx)
        self.log("validation/loss", loss)
        for k, v in loss_dict.items():
            self.log("validation_" + k, v.item())

        return loss

    def configure_optimizers(self):
        """Build AdamW with a separate learning rate for the backbone parameters."""
        param_dicts = [
            {
                "params": [p for n, p in self.named_parameters() if "backbone" not in n and p.requires_grad]
            },
            {
                "params": [p for n, p in self.named_parameters() if "backbone" in n and p.requires_grad],
                "lr": self.lr_backbone,
            },
        ]
        return torch.optim.AdamW(param_dicts, lr=self.lr, weight_decay=self.weight_decay)

    def train_dataloader(self):
        """Return the notebook-level training loader."""
        return TRAIN_DATALOADER

    def val_dataloader(self):
        """Return the notebook-level validation loader."""
        return VAL_DATALOADER

# Convert box coordinates from (xmin, ymin, xmax, ymax) to (xmin, ymin, width, height)
def convert_to_xywh(boxes):
    """Return ``boxes`` with the far corner replaced by width and height.

    ``boxes`` is split into four columns along dimension 1, so it must have
    exactly four entries there.
    """
    left, top, right, bottom = boxes.unbind(dim=1)
    width = right - left
    height = bottom - top
    return torch.stack([left, top, width, height], dim=1)

# Flatten per-image detector output into the record list consumed by the COCO evaluator
def prepare_for_coco_detection(predictions):
    """Turn ``{image_id: prediction}`` into a flat list of detection records.

    Each prediction supplies ``"boxes"``, ``"scores"`` and ``"labels"``; one
    record with ``image_id``, ``category_id``, ``bbox`` (converted with
    ``convert_to_xywh``) and ``score`` is produced per box.
    """
    coco_results = []
    for original_id, prediction in predictions.items():
        if len(prediction) == 0:
            continue

        xywh_boxes = convert_to_xywh(prediction["boxes"]).tolist()
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()

        for position, box in enumerate(xywh_boxes):
            coco_results.append(
                {
                    "image_id": original_id,
                    "category_id": labels[position],
                    "bbox": box,
                    "score": scores[position],
                }
            )
    return coco_results

# Evaluate a detector on the validation split with COCO bounding-box metrics
def eva(detector=None):
    """Run COCO bbox evaluation on ``VAL_DATALOADER`` and return AP50.

    Args:
        detector: Model to evaluate. It is called with ``pixel_values`` and
            ``pixel_mask`` and must already live on ``DEVICE``. When omitted,
            the notebook-level ``model`` is used, so plain ``eva()`` calls
            keep working.

    Returns:
        The second entry of the evaluator's bbox statistics, printed here as AP50.
    """
    if detector is None:
        detector = model

    evaluator = CocoEvaluator(coco_gt=VAL_DATASET.coco, iou_types=["bbox"])

    print("Running evaluation...")

    # Predict in eval mode so training-only behaviour (e.g. dropout) does not
    # perturb the metrics. The previous mode is restored afterwards because
    # the caller may continue training the same object.
    was_training = detector.training
    detector.eval()
    try:
        for batch in tqdm(VAL_DATALOADER):
            pixel_values = batch["pixel_values"].to(DEVICE)
            pixel_mask = batch["pixel_mask"].to(DEVICE)
            labels = [{k: v.to(DEVICE) for k, v in t.items()} for t in batch["labels"]]

            with torch.no_grad():
                outputs = detector(pixel_values=pixel_values, pixel_mask=pixel_mask)

            # Boxes are rescaled to each image's original size before scoring.
            orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
            results = image_processor.post_process_object_detection(outputs, target_sizes=orig_target_sizes)

            predictions = {target['image_id'].item(): output for target, output in zip(labels, results)}
            predictions = prepare_for_coco_detection(predictions)
            evaluator.update(predictions)
    finally:
        detector.train(was_training)

    evaluator.synchronize_between_processes()
    evaluator.accumulate()
    evaluator.summarize()
    coco_eval = evaluator.coco_eval['bbox']
    ap50 = coco_eval.stats[1]  # AP at IoU=0.5 (AP50)
    print(f"AP50: {ap50:.4f}")
    return ap50


In [None]:
import optuna
from optuna.samplers import GridSampler

# Search space for the hyper-parameter grid handed to the grid sampler
search_space = {
    "lr": [0.0001, 0.0005],  # Learning rate
    "num_encoder_layers": [4, 6],  # Number of encoder layers
    "num_decoder_layers": [4, 6]   # Number of decoder layers
}

# Objective function for the Optuna study
def objective(trial):
    """Train one DETR configuration and return the AP50 reported by ``eva()``.

    Args:
        trial: Optuna trial used to pick ``lr``, ``num_encoder_layers`` and
            ``num_decoder_layers``.

    Returns:
        The value returned by ``eva()`` for the model trained in this trial.
    """
    # `eva()` evaluates the notebook-level `model`. Without rebinding that
    # name, every trial would score the same unrelated model instead of the
    # one trained here.
    global model

    # Take the choices from `search_space` so they cannot drift apart from
    # the grid given to the sampler.
    lr = trial.suggest_categorical('lr', search_space['lr'])
    num_encoder_layers = trial.suggest_categorical('num_encoder_layers', search_space['num_encoder_layers'])
    num_decoder_layers = trial.suggest_categorical('num_decoder_layers', search_space['num_decoder_layers'])

    # Build the DETR module for this combination of hyper-parameters.
    trial_model = Detr(
        lr=lr,
        lr_backbone=1e-5,
        weight_decay=1e-5,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers
    )
    trial_model = trial_model.to(DEVICE)

    trainer = Trainer(
        devices=1,  # Use a single device of the selected accelerator type
        accelerator="gpu" if torch.cuda.is_available() else "cpu",
        max_epochs=100,
        gradient_clip_val=0.1,
        accumulate_grad_batches=8,
        log_every_n_steps=5
    )

    trainer.fit(trial_model)

    # Make sure the trained model is on DEVICE before evaluating it.
    trial_model.to(DEVICE)

    # Expose this trial's model as `model` only while it is being evaluated,
    # then restore the previous binding so later cells are unaffected.
    previous_model = model
    model = trial_model
    try:
        ap = eva()
    finally:
        model = previous_model

    return ap

# Create a study that maximises the objective, sampling from `search_space`
study = optuna.create_study(direction='maximize', sampler=GridSampler(search_space))
# NOTE(review): the grid has 2 * 2 * 2 = 8 combinations while n_trials is 10 —
# confirm how the sampler treats the surplus trials.
study.optimize(objective, n_trials=10)  # Run the optimisation for up to 10 trials
print("best params: ", study.best_params)  # Print the best hyper-parameters found
print("best AP: ", study.best_value)  # Print the best objective value


## Inferencia na testovacom súbore údajov

In [None]:
# Make sure the model used by the following inference cells is on DEVICE.
model.to(DEVICE)

In [None]:
import os
import cv2
import torch
import random
import numpy as np

# pomocné funkcie
categories = TEST_DATASET.coco.cats
id2label = {k: v['name'] for k, v in categories.items()}  # Mapovanie ID tried na názvy tried
box_annotator = sv.BoxAnnotator()  # Inicializácia anotátora pre ohraničujúce boxy

# Prechádzanie všetkých obrázkov v testovacej sade
for image_id in TEST_DATASET.coco.getImgIds():
    print('Spracovanie obrázka #{}'.format(image_id))

    # Načítanie obrázka a anotácií
    image_info = TEST_DATASET.coco.loadImgs(image_id)[0]
    annotations = TEST_DATASET.coco.imgToAnns[image_id]
    image_path = os.path.join(TEST_DATASET.root, image_info['file_name'])
    image = cv2.imread(image_path)

    # Anotácia ground truth
    detections = sv.Detections.from_coco_annotations(coco_annotation=annotations)
    labels = [f"{id2label[class_id]}" for _, _, class_id, _ in detections]
    frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels)

    print('Ground Truth pre obrázok #{}'.format(image_id))
    %matplotlib inline
    sv.show_frame_in_notebook(frame, (16, 16))

    # Inferencia
    with torch.no_grad():

        # Načítanie obrázka a predikcia
        inputs = image_processor(images=image, return_tensors='pt').to(DEVICE)
        outputs = model(**inputs)

        # Post-processing
        target_sizes = torch.tensor([image.shape[:2]]).to(DEVICE)
        results = image_processor.post_process_object_detection(
            outputs=outputs,
            threshold=CONFIDENCE_TRESHOLD,
            target_sizes=target_sizes
        )[0]

    # Anotácia detekcií
    detections = sv.Detections.from_transformers(transformers_results=results).with_nms(threshold=0.5)
    labels = [f"{id2label[class_id]} {confidence:.2f}" for _, confidence, class_id, _ in detections]
    frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels)

    print('Detekcie pre obrázok #{}'.format(image_id))
    %matplotlib inline
    sv.show_frame_in_notebook(frame, (16, 16))


## Vyhodnotenie na testovacom súbore údajov

In [None]:
# `coco_eval` provides the CocoEvaluator imported in the next cell.
!pip install -q coco_eval

  Preparing metadata (setup.py) ... done
  Building wheel for coco_eval (setup.py) ... done


In [None]:
from coco_eval import CocoEvaluator
from tqdm.notebook import tqdm

import numpy as np

def convert_to_xywh(boxes):
    """Convert (xmin, ymin, xmax, ymax) boxes to (xmin, ymin, width, height).

    ``boxes`` is split into four columns along dimension 1, so it must have
    exactly four entries there.
    """
    x_start, y_start, x_end, y_end = boxes.unbind(dim=1)
    columns = [x_start, y_start, x_end - x_start, y_end - y_start]
    return torch.stack(columns, dim=1)

def prepare_for_coco_detection(predictions):
    """Turn ``{image_id: prediction}`` into a flat list of detection records.

    Each prediction supplies ``"boxes"``, ``"scores"`` and ``"labels"``; one
    record with ``image_id``, ``category_id``, ``bbox`` (converted with
    ``convert_to_xywh``) and ``score`` is produced per box.
    """
    coco_results = []
    for original_id, prediction in predictions.items():
        if len(prediction) == 0:
            continue

        xywh_boxes = convert_to_xywh(prediction["boxes"]).tolist()
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()

        # One record per predicted box, in box order.
        for index, box in enumerate(xywh_boxes):
            record = {
                "image_id": original_id,
                "category_id": labels[index],
                "bbox": box,
                "score": scores[index],
            }
            coco_results.append(record)
    return coco_results

# COCO bounding-box evaluation of the notebook-level `model` on the test split.
evaluator = CocoEvaluator(coco_gt=TEST_DATASET.coco, iou_types=["bbox"])

print("Running evaluation...")

# Predict in eval mode so training-only behaviour (e.g. dropout) does not
# perturb the reported metrics.
model.eval()

for batch in tqdm(TEST_DATALOADER):
    pixel_values = batch["pixel_values"].to(DEVICE)
    pixel_mask = batch["pixel_mask"].to(DEVICE)
    labels = [{k: v.to(DEVICE) for k, v in t.items()} for t in batch["labels"]]

    with torch.no_grad():
        outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    # Boxes are rescaled to each image's original size before scoring.
    orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
    results = image_processor.post_process_object_detection(outputs, target_sizes=orig_target_sizes)

    predictions = {target['image_id'].item(): output for target, output in zip(labels, results)}
    predictions = prepare_for_coco_detection(predictions)
    evaluator.update(predictions)

# Aggregate the per-batch results and print the COCO summary table.
evaluator.synchronize_between_processes()
evaluator.accumulate()
evaluator.summarize()