# DETA

## Version 1.0

Experiment mit standard DETA

In [None]:
import os
import sys
import requests
import datetime
import pandas as pd
import numpy as np
import torch, torchvision, torchaudio
import pytorch_lightning as pl
import wandb
import json
import copy
import shutil
import matplotlib.pyplot as plt
import time

from torchmetrics.detection import MeanAveragePrecision
from torchvision.transforms import v2
from torchvision import datasets, tv_tensors
from torchvision.io import read_image
from coco_eval import CocoEvaluator
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from pytorch_lightning.loggers import WandbLogger
from transformers import AutoImageProcessor, DetaForObjectDetection
from transformers import DetaConfig, DetaImageProcessor
from pytorch_lightning import Trainer
from PIL import Image, ImageDraw
from mean_average_precision import MetricBuilder

# Notebook mode switches: run a single training, or a W&B hyperparameter sweep.
Training = False
HyperparameterSweep = False

# The two modes are mutually exclusive; refuse to continue if both are on.
if Training and HyperparameterSweep:
    raise ValueError("Its not advised to use both at the same time.")

# Training

## Loads Data

In [None]:


class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO detection dataset whose samples are run through an image processor.

    Args:
        img_folder: Directory containing the images and the annotation file.
        processor: Object called as
            ``processor(images=..., annotations=..., return_tensors="pt")``;
            the result must provide ``"pixel_values"`` and ``"labels"``.
        image_name: File name of the COCO annotation JSON inside ``img_folder``.
        transforms: Optional joint ``(image, target)`` transform forwarded to
            the torchvision base class.
    """

    def __init__(self, img_folder, processor, image_name, transforms=None):
        ann_file = os.path.join(img_folder, image_name)
        # Pass by keyword: the third positional parameter of the torchvision
        # base class is ``transform`` (image only), not ``transforms``.
        super().__init__(img_folder, ann_file, transforms=transforms)
        self.processor = processor

    def __getitem__(self, idx):
        """Return ``(pixel_values, target)`` for the sample at position ``idx``."""
        # read in PIL image and target in COCO format
        # feel free to add data augmentation here before passing them to the next step
        img, target = super().__getitem__(idx)

        # preprocess image and target (the processor converts the target and
        # resizes + normalizes both image and target)
        image_id = self.ids[idx]
        target = {'image_id': image_id, 'annotations': target}
        encoding = self.processor(images=img, annotations=target, return_tensors="pt")

        # Drop only the leading batch dimension; a bare squeeze() would also
        # remove any other axis that happens to have size 1.
        pixel_values = encoding["pixel_values"].squeeze(0)
        target = encoding["labels"][0]  # remove batch dimension

        return pixel_values, target

# Image processor matching the pretrained DETA checkpoint.
processor = DetaImageProcessor.from_pretrained("jozhang97/deta-resnet-50")

# Use the GPU when one is available.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Create Train and Validation Dataset
# Exactly one of the three train/val variants below should be active.

# Minimal Dataset
# train_dataset = CocoDetection(img_folder='../../../old/BAA/Data/train', processor=processor, image_name="coco_train.json")
# val_dataset = CocoDetection(img_folder='../../../old/BAA/Data/val', processor=processor, image_name="coco_val.json")

# Maximal Dataset
# train_dataset = CocoDetection(img_folder='../../../old/BAA/Data/train_max', processor=processor, image_name="coco_train_max.json")
# val_dataset = CocoDetection(img_folder='../../../old/BAA/Data/val_max', processor=processor, image_name="coco_val_max.json")

# Dataset with Augmentations
train_dataset = CocoDetection(
    img_folder='../../../old/BAA/Data/train_combined',
    processor=processor,
    image_name="coco_Combined.json",
)
val_dataset = CocoDetection(
    img_folder='../../../old/BAA/Data/val_combined',
    processor=processor,
    image_name="coco_Combined.json",
)

test_dataset = CocoDetection(
    img_folder='../../../old/BAA/Data/test',
    processor=processor,
    image_name="coco_test.json",
)

print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of validation examples: {len(val_dataset)}")

# Map every COCO category id of the training set to its category name.
cats = train_dataset.coco.cats
id2label = {}
for category_id, category in cats.items():
    id2label[category_id] = category['name']

def collate_fn(batch):
    """Collate ``(pixel_values, target)`` samples into one batch dict.

    The images are padded with ``processor.pad``; the targets are passed
    through unchanged as a list under ``"labels"``.
    """
    images = []
    targets = []
    for sample in batch:
        images.append(sample[0])
        targets.append(sample[1])

    padded = processor.pad(images, return_tensors="pt")
    return {
        'pixel_values': padded['pixel_values'],
        'pixel_mask': padded['pixel_mask'],
        'labels': targets,
    }

# One sample per batch for every split; only the training loader shuffles.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
)
val_dataloader = DataLoader(val_dataset, batch_size=1, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn)

# Fetch one training batch up front (handy for quick shape checks).
_train_iterator = iter(train_dataloader)
batch = next(_train_iterator)

## define DETA

The commented-out lines reflect a first attempt with torchmetrics; the goal was to compute the AP at runtime.

In [None]:
class Deta(pl.LightningModule):
    """LightningModule that fine-tunes the "jozhang97/deta-resnet-50" checkpoint.

    Wraps ``DetaForObjectDetection`` and logs the total loss plus every entry
    of the model's ``loss_dict`` during training and validation. The
    commented-out lines and the bare triple-quoted string blocks are disabled
    earlier attempts at computing AP during training; they are not executed.
    """

    def __init__(self, lr, lr_backbone, weight_decay):
        """Load the pretrained model and remember the optimizer settings.

        Args:
            lr: Learning rate for parameters whose name does not contain
                "backbone".
            lr_backbone: Learning rate for parameters whose name contains
                "backbone".
            weight_decay: Weight decay handed to AdamW.
        """
        super().__init__()
        # replace COCO classification head with custom head:
        # num_labels is taken from the dataset's categories (module-level
        # id2label), and ignore_mismatched_sizes=True lets the checkpoint load
        # although its head was trained for a different number of classes
        self.model = DetaForObjectDetection.from_pretrained("jozhang97/deta-resnet-50",
                                                            num_labels=len(id2label),
                                                            auxiliary_loss=True,
                                                            ignore_mismatched_sizes=True)
        # see https://github.com/PyTorchLightning/pytorch-lightning/pull/1896
        #self.processor = DetaImageProcessor.from_pretrained("jozhang97/deta-resnet-50")
        self.save_hyperparameters()
        #self.val_epoch_count = 0 # to not log at the first run
        self.lr = lr
        self.lr_backbone = lr_backbone
        self.weight_decay = weight_decay
        #self.val_ap = []
        #self.train_ap = []
        #self.train_metric_fn = MetricBuilder.build_evaluation_metric("map_2d", async_mode=True, num_classes=9)
        #self.val_metric_fn = MetricBuilder.build_evaluation_metric("map_2d", async_mode=True, num_classes=9)

    def forward(self, pixel_values, pixel_mask):
        """Run the wrapped model without labels and return its outputs."""
        outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)
        return outputs

    def common_step(self, batch, batch_idx, t_v):
        """Compute the loss for one batch (shared by training and validation).

        Args:
            batch: Dict with "pixel_values", "pixel_mask" and "labels"
                (a list of per-image dicts of tensors).
            batch_idx: Index of the batch; not used here.
            t_v: True when called from ``training_step``, False when called
                from ``validation_step``; only the disabled code below uses it.

        Returns:
            Tuple ``(loss, loss_dict)`` taken from the model outputs.
        """
        pixel_values = batch["pixel_values"]
        pixel_mask = batch["pixel_mask"]
        # move every label tensor to the module's device
        labels = [{k: v.to(self.device) for k, v in t.items()} for t in batch["labels"]]

        # passing labels makes the model return loss and loss_dict
        outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)

        loss = outputs.loss
        loss_dict = outputs.loss_dict

        

        # --- disabled: conversion of predictions / ground truth for an AP metric ---
        # turn into a list of dictionaries (one item for each example in the batch)
        #orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
        #results = processor.post_process_object_detection(outputs, target_sizes=orig_target_sizes, threshold=0.02)
        
        #gt = []
        #size = batch["labels"][0]["class_labels"].size()
        #for i in range(0, size[0]):
        #    n = []
        #    n.extend(batch["labels"][0]["boxes"][i])
        #    n.extend([batch["labels"][0]["class_labels"][i]])
        #    n.extend([0]) # difficulty
        #    n.extend([batch["labels"][0]["iscrowd"][i]])
        #    gt.append(n)

        #preds = []
        #sizep = len(results[0]["labels"])
        #for i in range(0, sizep):
        #    m = []
        #    m.extend(results[0]["boxes"][i])
        #    m.extend([results[0]["labels"][i]])
        #    m.extend([results[0]["scores"][i]])
        #    preds.append(m)

        #gt = torch.tensor(gt)
        #preds = torch.tensor(preds)
        #if t_v:
        #    self.data_prep_train(pixel_values, pixel_mask, labels)
        #else:
        #    self.data_prep_val(pixel_values, pixel_mask, labels)

        return loss, loss_dict #, preds, gt

    def training_step(self, batch, batch_idx):
        """Log "training_loss" and each "train_<name>" loss part; return the loss."""
        # add: , preds, gt
        loss, loss_dict = self.common_step(batch, batch_idx, True)
        # logs metrics for each training_step,
        # and the average across the epoch
        #self.train_metric_fn.add(preds, gt)
        self.log("training_loss", loss)
        for k,v in loss_dict.items():
            self.log("train_" + k, v.item())
        return loss

    # disabled: per-epoch training AP via mean_average_precision (MetricBuilder)
    """def on_train_epoch_end(self) -> None:
        metrics = self.train_metric_fn.value(iou_thresholds=np.arange(0.5, 1.0, 0.05), recall_thresholds=np.arange(0., 1.01, 0.01), mpolicy='soft')
        ap = metrics["mAP"]
        print(ap)
        self.train_ap.append(ap)
        #wandb.log({"train_epoch_AP": ap})
        self.train_metric_fn = MetricBuilder.build_evaluation_metric("map_2d", async_mode=True, num_classes=9)"""

    # disabled: per-epoch training AP via torchmetrics MeanAveragePrecision
    """def on_train_epoch_end(self) -> None:
        print("onTrainEpoch")
        t1 = time.time()
        self.cmetric = MeanAveragePrecision(iou_type="bbox")
        self.cmetric.update(self.training_step_pred, self.training_step_targ)
        result = self.cmetric.compute()
        counter = 0
        for k, v in result.items():
            counter += 1
            if counter < 15:
                wandb.log({"train_epoch_" + k: v.item()})
                print({"train_" + k: v.item()})
        self.training_step_pred = []
        self.training_step_targ = []
        t2 = time.time()
        print(round(t2 - t1, 4))"""

    def validation_step(self, batch, batch_idx):
        """Log "validation_loss" and each "validation_<name>" loss part; return the loss."""
        # add: , preds, gt
        loss, loss_dict = self.common_step(batch, batch_idx, False)
        #self.val_metric_fn.add(preds, gt)
        self.log("validation_loss", loss)
        for k,v in loss_dict.items():
            self.log("validation_" + k, v.item())
        return loss

    # disabled: per-epoch validation AP via mean_average_precision (MetricBuilder)
    """def on_validation_epoch_end(self) -> None:
        vmetrics = self.val_metric_fn.value(iou_thresholds=np.arange(0.5, 1.0, 0.05), recall_thresholds=np.arange(0., 1.01, 0.01), mpolicy='soft')
        ap = vmetrics["mAP"]
        print(ap)
        self.val_ap.append(ap)
        wandb.log({"val_epoch_AP": ap})
        self.val_metric_fn = MetricBuilder.build_evaluation_metric("map_2d", async_mode=True, num_classes=9)"""

    # disabled: per-epoch validation AP via torchmetrics MeanAveragePrecision
    """def on_validation_epoch_end(self) -> None:
        if self.val_epoch_count == 1:
            print("onValEpoch")
            t1 = time.time()
            self.cmetric = MeanAveragePrecision(iou_type="bbox")
            self.cmetric.update(self.training_step_pred, self.training_step_targ)
            result = self.cmetric.compute()
            counter = 0
            for k, v in result.items():
                counter += 1
                if counter < 15:
                    wandb.log({"validation_epoch_" + k: v.item()})
                    print({"validation_epoch_" + k: v.item()})
            self.validation_step_pred = []
            self.validation_step_targ = []
            t2 = time.time()
            print(round(t2 - t1, 4))
        else:
            self.val_epoch_count = 1"""

    def configure_optimizers(self):
        """Build AdamW with a separate learning rate for the backbone parameters."""
        # two parameter groups, split by whether the parameter name contains
        # "backbone"; the first group uses the optimizer-wide lr (self.lr)
        param_dicts = [
                {"params": [p for n, p in self.named_parameters() if "backbone" not in n and p.requires_grad]},
                {
                    "params": [p for n, p in self.named_parameters() if "backbone" in n and p.requires_grad],
                    "lr": self.lr_backbone,
                },
        ]
        optimizer = torch.optim.AdamW(param_dicts, lr=self.lr,
                                    weight_decay=self.weight_decay)

        return optimizer

    def train_dataloader(self):
        """Return the module-level ``train_dataloader``."""
        return train_dataloader

    def val_dataloader(self):
        """Return the module-level ``val_dataloader``."""
        return val_dataloader

    # disabled: helpers that collected predictions/targets for the torchmetrics attempt
    """def data_prep_train(self, pixel_values, pixel_mask, labels):

        with torch.no_grad():
            outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

        orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
        results = self.processor.post_process_object_detection(outputs, target_sizes=orig_target_sizes, threshold=0)
        for n in results:
            self.training_step_pred.append(n)
        for i in labels:
            self.training_step_targ.append({"boxes":i["boxes"], "labels":i["class_labels"]})

    def data_prep_val(self, pixel_values, pixel_mask, labels):

        with torch.no_grad():
            outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

        orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
        results = self.processor.post_process_object_detection(outputs, target_sizes=orig_target_sizes, threshold=0)
        for n in results:
            self.validation_step_pred.append(n)
        for i in labels:
            self.validation_step_targ.append({"boxes":i["boxes"], "labels":i["class_labels"]})"""

## define Weights & Biases Sweep

In [None]:
if HyperparameterSweep:
    import pprint

    # The sweep minimizes the validation loss logged by the model.
    metric = {"name": "validation_loss", "goal": "minimize"}

    # All three hyperparameters are drawn uniformly from the same interval;
    # each one gets its own dict instance.
    parameters_dict = {
        hyperparameter: {"distribution": "uniform", "min": 0.000001, "max": 0.001}
        for hyperparameter in (
            "learning_rate_backbone",
            "learning_rate_transformer",
            "weight_decay",
        )
    }

    # Random search over the parameters above.
    sweep_config = {
        "name": "Sweeps_DETA",
        "method": "random",
        "metric": metric,
        "parameters": parameters_dict,
    }

    pprint.pprint(sweep_config)

    sweep_id = wandb.sweep(sweep_config, project="BAA_Book_Damage_Detection_Sweeps")
    sweep_id


## Training

In [None]:
# Training
if HyperparameterSweep:
    def train(config=None):
        """Train one DETA model with the hyperparameters of the current sweep run."""
        # login to weights and biases, to relogin: wandb.login(key="YOUR KEY", relogin=True)
        wandb.login()

        with wandb.init(config=config):
            # Inside the run, wandb.config carries the values chosen for this trial.
            config = wandb.config

            model = Deta(
                lr=config.learning_rate_transformer,  # 1e-4 original
                lr_backbone=config.learning_rate_backbone,  # 1e-5 original
                weight_decay=config.weight_decay,  # 1e-4 original
            )
            model.to(device)

            # outputs = model(pixel_values=batch['pixel_values'].to(device), pixel_mask=batch['pixel_mask'].to(device))
            # print(outputs.logits.shape)

            step_budget = 5500
            sweep_logger = WandbLogger(
                name="DETA",
                save_dir="../models/",
                project="BAA_Book_Damage_Detection_Sweeps",
                log_model=True,
                checkpoint_name=f"DETA_Sweep_{step_budget}_Steps",
            )
            sweep_trainer = Trainer(
                max_steps=step_budget,
                gradient_clip_val=0.1,
                logger=sweep_logger,
            )
            sweep_trainer.fit(model)

            wandb.finish()

In [None]:
if Training:
    # login to weights and biases, to relogin: wandb.login(key="YOUR KEY", relogin=True)
    wandb.login()

    # The run is named after its start time.
    date = datetime.datetime.now()
    run_name = f"Run at the {date}"

    # Hyperparameters for this run (trailing comments: the earlier defaults).
    lr = 2.12e-4  # 1e-4
    lr_backbone = 4.87e-5  # 1e-5
    weight_decay = 3.33e-5  # 1e-4

    project = "BAA_Book_Damage_Detection"

    run_config = {
        "learning_rate_transformer": lr,
        "learning_rate_backbone": lr_backbone,
        "weight_decay": weight_decay,
        "architecture": "DETA",
        "dataset": "test_normal_set",
        "train_set_size": len(train_dataset),
    }
    wandb.init(project=project, name=run_name, config=run_config)

    model = Deta(lr=lr, lr_backbone=lr_backbone, weight_decay=weight_decay)
    model.to(device)

    #outputs = model(pixel_values=batch['pixel_values'].to(device), pixel_mask=batch['pixel_mask'].to(device))
    #print(outputs.logits.shape)

    max_steps_var = 34368

    wandb_logger = WandbLogger(
        name="Optimize_Logging",
        save_dir="../models/",
        project=project,
        log_model=True,
        checkpoint_name=f"DETA_{max_steps_var}_Steps",
    )
    trainer = Trainer(
        max_steps=max_steps_var,
        gradient_clip_val=0.1,
        logger=wandb_logger,
    )
    trainer.fit(model)

    wandb.finish()

## Start Sweep

In [None]:
if HyperparameterSweep:
    # Let the agent call train() for the trials of the registered sweep.
    wandb.agent(function=train, sweep_id=sweep_id)

    wandb.finish()
    wandb.teardown()

# load the checkpoint

# Download the model artifact of the regular training project ...
run = wandb.init()
artifact = run.use_artifact(
    'damaged-books-detection-ml/BAA_Book_Damage_Detection/DETA_300_Steps:v0',
    type='model',
)
artifact_dir = artifact.download()

# ... and restore the LightningModule from the checkpoint file inside it.
model = Deta.load_from_checkpoint(f"{artifact_dir}/model.ckpt")

In [None]:
# load the checkpoint

# Download the model artifact produced by the sweep project.
run = wandb.init()
artifact = run.use_artifact(
    'damaged-books-detection-ml/BAA_Book_Damage_Detection_Sweeps/DETA_Sweep_6000_Steps:v7',
    type='model',
)
# artifact = run.use_artifact("../models/ut10b6wq/epoch=108-step=93631.ckpt")
artifact_dir = artifact.download()

# Restore the LightningModule from the checkpoint file inside the artifact.
model = Deta.load_from_checkpoint(f"{artifact_dir}/model.ckpt")

In [None]:
# Pick the device first, then switch the model to inference mode and move it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.eval()
model.to(device)

## Evaluation

In [None]:
def convert_to_xywh(boxes):
    """Convert ``(xmin, ymin, xmax, ymax)`` rows into ``(xmin, ymin, width, height)`` rows."""
    left, top, right, bottom = boxes.unbind(1)
    widths = right - left
    heights = bottom - top
    return torch.stack([left, top, widths, heights], dim=1)

def prepare_for_coco_detection(predictions):
    """Flatten per-image predictions into a list of COCO result dicts.

    Args:
        predictions: Mapping from image id to a prediction holding "boxes"
            (xmin, ymin, xmax, ymax), "scores" and "labels".

    Returns:
        One dict per box with the keys "image_id", "category_id", "bbox"
        (converted with ``convert_to_xywh``) and "score".
    """
    coco_results = []
    for original_id, prediction in predictions.items():
        # Nothing to export for an empty prediction.
        if len(prediction) == 0:
            continue

        xywh_boxes = convert_to_xywh(prediction["boxes"]).tolist()
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()

        for position, xywh in enumerate(xywh_boxes):
            entry = {
                "image_id": original_id,
                "category_id": labels[position],
                "bbox": xywh,
                "score": scores[position],
            }
            coco_results.append(entry)
    return coco_results

# initialize evaluator with ground truth (gt)
evaluator = CocoEvaluator(coco_gt=test_dataset.coco, iou_types=["bbox"])

print("Running evaluation...")
for idx, batch in enumerate(tqdm(test_dataloader)):
    # Move the inputs of this batch to the evaluation device.
    pixel_values = batch["pixel_values"].to(device)
    pixel_mask = batch["pixel_mask"].to(device)
    # Targets are in the processor's format (resized + normalized).
    labels = [
        {key: tensor.to(device) for key, tensor in target.items()}
        for target in batch["labels"]
    ]

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    # Rescale the predictions to the original image sizes: one result dict per
    # image. threshold=0.0 keeps every detection.
    orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
    results = processor.post_process_object_detection(
        outputs,
        target_sizes=orig_target_sizes,
        threshold=0.0,
    )

    # The evaluator expects a flat list of dicts with the keys image_id,
    # category_id, bbox and score, so key the results by image id first.
    predictions = {
        target['image_id'].item(): output
        for target, output in zip(labels, results)
    }
    predictions = prepare_for_coco_detection(predictions)
    evaluator.update(predictions)

evaluator.synchronize_between_processes()
evaluator.accumulate()
evaluator.summarize()

## Visualisierung

In [None]:
# Visualize

# colors for visualization
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
          [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]


def plot_results(pil_img, scores, labels, boxes, anno):
    """Show ``pil_img`` with predicted and annotated boxes; print per-class counts.

    Args:
        pil_img: Image to display.
        scores: Tensor with the confidence score of each predicted box.
        labels: Tensor with the label id of each predicted box (looked up in
            the module-level ``id2label``).
        boxes: Tensor of predicted boxes as (xmin, ymin, xmax, ymax).
        anno: COCO annotation dicts of the image; ``bbox`` is (x, y, w, h).
    """
    plt.figure(figsize=(16, 10))
    plt.imshow(pil_img)
    ax = plt.gca()

    # Predicted boxes: colors are cycled per detection (not per class, see the
    # TODO below). Indexing with modulo never drops detections, unlike zipping
    # against a fixed-length color list.
    types = {}
    detections = zip(scores.tolist(), labels.tolist(), boxes.tolist())
    for i, (score, label, (xmin, ymin, xmax, ymax)) in enumerate(detections):
        c = COLORS[i % len(COLORS)]
        ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                   fill=False, color=c, linewidth=3))
        name = id2label[label]
        types[name] = types.get(name, 0) + 1
        ax.text(xmin, ymin, f'{name}: {score:0.2f}', fontsize=10,
                bbox=dict(facecolor='yellow', alpha=0.5))
    print(types)

    # draw annotated (ground-truth) boxes in black
    typesAnno = {}
    color = [0.000, 0.000, 0.000]
    # TODO Resize bbox
    for annot in anno:
        (x, y, w, h) = annot["bbox"]
        ax.add_patch(plt.Rectangle((x, y), w, h,
                                   fill=False, color=color, linewidth=3))
        text = f'{id2label[annot["category_id"]]}'
        typesAnno[text] = typesAnno.get(text, 0) + 1
        # NOTE(review): fontsize=0 and alpha=0.0 make this label invisible;
        # kept as in the original, presumably on purpose to avoid clutter.
        ax.text(x, y, text, fontsize=0,
                bbox=dict(facecolor='black', alpha=0.0))
    print(typesAnno)
    plt.axis('off')
    plt.show()


# TODO for a specific label use one specific color
for z in range(6):
    pixel_values, target = test_dataset[z]
    pixel_values = pixel_values.unsqueeze(0).to(device)

    with torch.no_grad():
        # forward pass to get class logits and bounding boxes
        outputs = model(pixel_values=pixel_values, pixel_mask=None)

    # load image and its annotations based on ID
    image_id = target['image_id'].item()
    image = test_dataset.coco.loadImgs(image_id)[0]
    anno = test_dataset.coco.imgToAnns[image_id]
    image = Image.open(os.path.join('../../../old/BAA/Data/test', image['file_name']))

    # postprocess model outputs; PIL reports (width, height) while the
    # processor expects (height, width)
    width, height = image.size
    postprocessed_outputs = processor.post_process_object_detection(outputs,
                                                                    target_sizes=[(height, width)],
                                                                    threshold=0.18)
    results = postprocessed_outputs[0]
    print(results)
    plot_results(image, results['scores'], results['labels'], results['boxes'], anno)
    

## Testing AP computing

In [None]:
# Hand-pasted sample of a batch's "labels" entry for one image (image_id 20443,
# 17 annotations) used to experiment with the AP computation. Box values lie in
# [0, 1]; presumably this was copied from a dataloader batch.
# The tensors are created on 'cuda:0', so this cell needs a CUDA device.
x = {'labels': [{'size': torch.tensor([ 800, 1296], device='cuda:0'), 'image_id': torch.tensor([20443], device='cuda:0'), 'class_labels': torch.tensor([3, 3, 4, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3], device='cuda:0'), 'boxes': torch.tensor([[0.2096, 0.4859, 0.0399, 0.0215],
        [0.2438, 0.4840, 0.0172, 0.0364],
        [0.2883, 0.3713, 0.0383, 0.2544],
        [0.3304, 0.4504, 0.0321, 0.1073],
        [0.4138, 0.5002, 0.0445, 0.0279],
        [0.3943, 0.4495, 0.0798, 0.1226],
        [0.6195, 0.4441, 0.0096, 0.0776],
        [0.6457, 0.4289, 0.0181, 0.1400],
        [0.6899, 0.2837, 0.0076, 0.0479],
        [0.6951, 0.3741, 0.0092, 0.1248],
        [0.1875, 0.7129, 0.0057, 0.0093],
        [0.5074, 0.6600, 0.0050, 0.0074],
        [0.6039, 0.7395, 0.0078, 0.0371],
        [0.6086, 0.6238, 0.0057, 0.0160],
        [0.7040, 0.6147, 0.0073, 0.1277],
        [0.5725, 0.4416, 0.0511, 0.1027],
        [0.2317, 0.6155, 0.0192, 0.0892]], device='cuda:0'), 'area': torch.tensor([ 847.6823,  419.4721, 9632.8975, 2344.4541,  324.3682, 6766.8066,
         560.3406, 2288.1863,  183.3673,  661.5347,   36.9824,   29.0828,
         236.8991,   67.3009,  397.1856, 4885.4731, 1037.4435],
       device='cuda:0'), 'iscrowd': torch.tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0'), 'orig_size': torch.tensor([1200, 1944], device='cuda:0')}]}

In [None]:
# Spot check: class label of the third annotation in the sample target.
x["labels"][0]["class_labels"][2]

In [None]:
# Scratch check: range(0, 0) is empty, so the body never runs and nothing is
# printed (x is not rebound either).
for x in range(0, 0):
    print("djalkfj")

In [None]:
# One-element tensor used by the following scratch cells.
y = torch.tensor([0])

In [None]:
# Inspect what size() returns for the one-element tensor.
y.size()

In [None]:
# Keep the size object of y in z for inspection.
z = y.size()

In [None]:
# Rebind z to a single box (four values) and look at its first element.
z = torch.tensor([0.2438, 0.4840, 0.0172, 0.0364])
z[0]

In [None]:
# Test for an empty tensor via its element count. Comparing against
# torch.tensor([]) with == is element-wise and fails for a tensor whose shape
# cannot be broadcast against an empty one (such as the 4-element z).
if z.numel() == 0:
    print("hi")

In [None]:
# First element of z.
z[0]

In [None]:
# Hand-pasted sample prediction for one image: 100 scores, labels and boxes
# (xmin, ymin, xmax, ymax in pixels). Presumably copied from the output of
# post_process_object_detection; used to experiment with the AP computation.
# The tensors are created on 'cuda:0', so this cell needs a CUDA device.
n = [{'scores': torch.tensor([0.6273, 0.6238, 0.6178, 0.6165, 0.6139, 0.6118, 0.6084, 0.6084, 0.6073,
        0.6072, 0.6066, 0.6059, 0.6058, 0.6054, 0.6054, 0.6052, 0.6050, 0.6043,
        0.6025, 0.6020, 0.6017, 0.6010, 0.6002, 0.5995, 0.5993, 0.5990, 0.5989,
        0.5988, 0.5964, 0.5962, 0.5960, 0.5960, 0.5941, 0.5933, 0.5930, 0.5920,
        0.5916, 0.5912, 0.5880, 0.5872, 0.5858, 0.5846, 0.5838, 0.5837, 0.5831,
        0.5831, 0.5828, 0.5819, 0.5809, 0.5807, 0.5802, 0.5801, 0.5799, 0.5775,
        0.5769, 0.5766, 0.5758, 0.5749, 0.5746, 0.5742, 0.5740, 0.5734, 0.5717,
        0.5704, 0.5696, 0.5695, 0.5689, 0.5686, 0.5684, 0.5684, 0.5683, 0.5669,
        0.5665, 0.5662, 0.5658, 0.5651, 0.5646, 0.5643, 0.5640, 0.5640, 0.5635,
        0.5632, 0.5631, 0.5629, 0.5628, 0.5628, 0.5623, 0.5614, 0.5613, 0.5612,
        0.5612, 0.5609, 0.5606, 0.5605, 0.5604, 0.5600, 0.5600, 0.5594, 0.5592,
        0.5590], device='cuda:0'), 'labels': torch.tensor([2, 2, 4, 9, 4, 2, 2, 2, 4, 9, 9, 9, 4, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        4, 2, 2, 9, 9, 4, 4, 9, 4, 9, 9, 4, 9, 4, 4, 2, 9, 4, 9, 4, 9, 9, 2, 4,
        2, 2, 2, 2, 9, 2, 9, 9, 9, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 9, 9, 2, 9,
        9, 4, 0, 9, 4, 4, 9, 4, 4, 4, 4, 4, 0, 4, 9, 4, 4, 4, 4, 9, 4, 4, 4, 4,
        4, 4, 0, 4], device='cuda:0'), 'boxes': torch.tensor([[ 676.2388,  277.1254,  850.7404,  625.0176],
        [ 491.5782,  608.8383,  664.5103,  935.7572],
        [ 245.1959,   30.2981, 1908.2277, 1173.9956],
        [ 862.0541, 1072.7997,  989.9567, 1112.1863],
        [ 317.2450,   22.0593, 1450.0930,  330.5922],
        [ 388.6223,    5.3880,  637.9678,  199.8394],
        [ 606.6195,  338.1898,  677.1177,  614.6801],
        [ 832.0270,  614.6859,  973.1371,  932.7524],
        [ 288.2072,   10.3697, 1837.5369,  447.6088],
        [ 764.4576,  160.3179,  911.5189,  198.4788],
        [ 701.6495,  159.8817,  943.4083,  194.7039],
        [ 793.9598,  158.5940,  946.8500,  197.9476],
        [ 271.4940,   72.6817, 1910.3503,  611.1729],
        [ 970.0051,  106.4567, 1097.7736,  142.6047],
        [ 843.1724,  106.4497,  961.6071,  141.5147],
        [ 984.1047,  154.2677, 1203.7831,  187.1603],
        [ 889.5905,  106.8443, 1006.0449,  140.3231],
        [ 748.9282,  158.6208,  997.1929,  191.0836],
        [ 898.5062,  106.3222, 1101.9955,  144.9065],
        [ 761.7839,  156.8068, 1050.6472,  187.2452],
        [ 856.7856,  161.5986, 1078.5540,  198.9951],
        [ 629.3372,   72.2920,  706.5010,  130.5994],
        [ 831.1205,  105.8166,  992.0811,  136.3994],
        [ 735.0187,  104.6248,  998.1118,  135.4265],
        [ 362.8256,    7.4454, 1430.8937,  251.0522],
        [1062.8165,  287.1315, 1233.4741,  603.4262],
        [ 407.9402,  606.9929,  644.2662,  940.0078],
        [ 783.8613,  153.9332, 1066.2512,  181.5553],
        [ 823.7746,  105.5136, 1064.1918,  135.8379],
        [ 204.1723,  598.1271, 1878.0404, 1177.5947],
        [ 288.9090,  200.1757, 1437.0602,  973.5871],
        [ 855.7324,  163.4534, 1164.6575,  198.6117],
        [ 270.7104,   91.4500, 1456.1766,  700.7239],
        [ 920.4334,  106.2333, 1166.8995,  135.2824],
        [ 879.5668,  105.8654, 1106.7937,  135.1730],
        [ 233.2137,  387.9187, 1509.7170, 1175.8739],
        [ 710.6248,   79.1035,  947.1807,  110.2706],
        [ 287.1901,  245.8605, 1421.9178,  622.1282],
        [ 744.9787,   46.2645, 1857.5127, 1153.2631],
        [ 444.7609,  288.1731,  637.1275,  608.7245],
        [ 841.8706,  217.1818,  986.1880,  246.9627],
        [ 264.0420,  216.1142, 1428.3958,  332.6553],
        [ 796.8151,  217.5832,  891.8583,  247.7887],
        [ 398.1796,   59.0355, 1419.2971,  211.7155],
        [1066.6624,  211.2775, 1304.0228,  244.4843],
        [1087.7819,  201.9277, 1307.7440,  235.8141],
        [ 612.2525,  621.8962,  757.0402,  935.7299],
        [ -19.4005,  976.6158, 1795.9170, 1188.5854],
        [ 400.7997,    4.5456,  568.1109,  196.2637],
        [ 321.0241,  605.9488,  547.1797,  936.0553],
        [ 428.1929,   38.3956,  554.8650,  199.8660],
        [ 746.6036,  615.0887,  971.1607,  935.0892],
        [1102.4276,  207.2869, 1372.0725,  242.2401],
        [1072.1843,  268.3727, 1380.2273,  940.6075],
        [ 946.0789,  212.6116, 1191.3138,  244.1134],
        [ 855.3904,  214.8240, 1148.2393,  244.2358],
        [ 526.0251,   84.6113,  956.9733,  159.2933],
        [1602.9233, 1065.2534, 1829.8510, 1178.2430],
        [-242.1230, -158.3021,  591.4835,  859.7520],
        [1035.1334,   24.6038, 1912.5560, 1176.9390],
        [ 335.6111,  284.0186,  677.6199,  943.9149],
        [ 135.1845,  549.4729, 1398.5144, 1165.3425],
        [-271.7565, -259.6675,  620.7223,  518.3907],
        [ 266.8770,  979.6051, 1503.4895, 1183.5458],
        [ 413.4843,   28.3230, 1088.3103,  201.7365],
        [ 120.5829,  922.5416, 1641.0110, 1187.8499],
        [1040.3624,  269.0762, 1410.5273,  947.1584],
        [ 336.1257,  986.8702, 1197.7725, 1164.8531],
        [ 771.3053,  614.1293,  871.9078,  646.2388],
        [ 808.9615,  917.7413, 1042.6924,  969.2343],
        [ 361.3281,  288.9001,  601.4477,  612.9789],
        [1798.4371,  -47.2978, 2046.5907,   95.5865],
        [ 346.3434,  216.3328,  481.6689,  238.2121],
        [ 605.1952,  703.0776, 1852.5463, 1195.7296],
        [ 346.3434,  216.3328,  481.6689,  238.2121],
        [ 559.0224,   68.1338, 1055.6260,  148.4330],
        [1664.7755, 1051.6240, 1876.0095, 1191.6969],
        [ 214.1127,  209.4996, 1016.6656, 1164.9806],
        [ 511.7601,   85.6599, 1151.7854,  191.8203],
        [1834.2233, 1116.4379, 1945.0957, 1196.0771],
        [-205.5402, -105.0520,  916.1711,  466.4155],
        [ 254.4914, 1122.7365,  511.3465, 1189.0867],
        [ 257.3645,  217.7309, 1414.2401,  250.7157],
        [1281.4097,   21.8314, 1939.2560, 1181.8925],
        [1331.4860, -106.3575, 2310.1462,  391.7843],
        [1646.0857,   41.0714, 1847.3046,  135.3579],
        [ 356.8601,  865.6786,  858.8425, 1001.6323],
        [1797.8419,  112.0865, 1922.6658,  372.2739],
        [ 511.7601,   85.6599, 1151.7854,  191.8203],
        [  21.5554,  859.8798, 1493.3877, 1180.0155],
        [ 263.6172,  212.8943, 1442.7341,  240.9092],
        [ 516.8245,   82.9084,  857.6564,  198.6778],
        [1579.3196,  865.5213, 1845.1697, 1170.8385],
        [ 998.0509,  -69.6900, 2263.0579,  420.4548],
        [1000.9614,  593.9443, 1878.0334, 1170.8518],
        [ 875.6599, 1072.2188, 1000.1871, 1112.0150],
        [ 303.8547,  262.9725,  969.4770,  947.5837],
        [  52.9637,  755.6959, 1336.9678, 1213.2312],
        [ 998.0509,  -69.6900, 2263.0579,  420.4548],
        [ -46.3515,  570.4091,  991.9885, 1162.0546]], device='cuda:0')}]

## Export predictions of new images

In [None]:
def writeJson(jsonObject, filename):
    """Write the string ``jsonObject`` to ``filename``.

    ".json" is appended to ``filename`` when the text after its last dot is
    not "json".
    """
    extension = filename.rsplit(".", 1)[-1]
    target = filename if extension == "json" else f"{filename}.json"
    with open(target, "w") as handle:
        handle.write(jsonObject)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

path_new_images = "../new_images/"
path_predictions = "../predicted_images"
input_file = "coco_new.json"
pred_coco_filename = "coco_predictions.json"
# Minimum confidence score (0.2) a detection needs in order to be exported.
threshold_postprocessing = 0.2

# Image files currently waiting in the input folder. Informational only: the
# loop below is driven by the COCO file so that the predicted image and the
# moved file are always the same one.
list_paths = [os.path.join(path_new_images, filename) for filename in os.listdir(path_new_images) if
                            filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))]


# Use the same processor checkpoint as for training and evaluation.
processor = DetaImageProcessor.from_pretrained("jozhang97/deta-resnet-50")

new_dataset = CocoDetection(img_folder=path_new_images, processor=processor, image_name=input_file)

print("Number of new images:", len(new_dataset))

# Start from the existing predictions file if there is one, otherwise from a
# copy of the input COCO file.
pred_path = os.path.join(path_predictions, pred_coco_filename)
os.makedirs(path_predictions, exist_ok=True)

with open(os.path.join(path_new_images, input_file), "r") as f:
    coco_tmp = json.load(f)

if os.path.exists(pred_path):
    with open(pred_path, "r") as f:
        coco_predict = json.load(f)

    # append image infos that the predictions file does not know yet
    known_image_ids = {image_info["id"] for image_info in coco_predict["images"]}
    for image_info in coco_tmp["images"]:
        if image_info["id"] not in known_image_ids:
            coco_predict["images"].append(image_info)
            known_image_ids.add(image_info["id"])
else:
    # Deep copy: coco_tmp["images"] is pruned below, and that must not remove
    # the same entries from coco_predict.
    coco_predict = copy.deepcopy(coco_tmp)

# start id for new annotations: one past the largest id already in use
existing_annotations = coco_predict.setdefault("annotations", [])
id_count = max((annotation["id"] for annotation in existing_annotations), default=-1) + 1

for index, image_id in enumerate(new_dataset.ids):
    file_name = new_dataset.coco.loadImgs(image_id)[0]["file_name"]
    source_path = os.path.join(path_new_images, file_name)
    if not os.path.exists(source_path):
        # listed in the COCO file but not (or no longer) in the input folder
        continue

    pixel_values, target = new_dataset[index]
    pixel_values = pixel_values.unsqueeze(0).to(device)

    with torch.no_grad():
        # forward pass to get class logits and bounding boxes
        outputs = model(pixel_values=pixel_values, pixel_mask=None)

    # postprocess model outputs; "orig_size" is stored as (height, width)
    height, width = target["orig_size"].tolist()
    p_postprocessed_outputs = processor.post_process_object_detection(outputs,
                                                                    target_sizes=[(height, width)],
                                                                    threshold=threshold_postprocessing)
    detections = p_postprocessed_outputs[0]

    # must run only once per image, otherwise duplicate entries are written
    for label, box in zip(detections["labels"].tolist(), detections["boxes"].tolist()):
        # boxes == [xmin, ymin, xmax, ymax]
        xmin, ymin, xmax, ymax = box
        dict_new_pred = {"id": id_count,
                         "image_id": image_id,
                         "category_id": int(label),
                         "segmentation": [],
                         # NOTE(review): stored as [xmin, ymin, xmax, ymax] as
                         # before; COCO's own bbox convention is [x, y, w, h]
                         # - confirm what the consumers of this file expect.
                         "bbox": box,
                         "ignore": 0,
                         "iscrowd": 0,
                         "area": (xmax - xmin) * (ymax - ymin)}
        coco_predict["annotations"].append(dict_new_pred)
        id_count += 1

    # The image is handled: drop it from the pending input list. Building a
    # new list avoids deleting from a list while indexing over it.
    coco_tmp["images"] = [image_info for image_info in coco_tmp["images"]
                          if image_info["id"] != image_id]

    # move image to new location
    shutil.move(source_path, os.path.join(path_predictions, os.path.basename(file_name)))


# uncomment to run
# writeJson(json.dumps(coco_tmp, indent=4), os.path.join(path_new_images, input_file))
# writeJson(json.dumps(coco_predict, indent=4), pred_path)
    