In [184]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import torchvision.transforms as T 
from PIL import Image
from torchvision.transforms import functional as F
import os
import json

# Json File

In [185]:
train = json.load(open('./new_annotations/EMOTIC_train_x1y1x2y2.json'))
train_anno = train['annotations'] # dictionnary of annotations
train_img = train['images'] # dictionnary of images
train_img


[{'database': 'EMOTIC',
  'file_name': 'COCO_val2014_000000562243.jpg',
  'folder': 'mscoco/images',
  'name': 'mscoco',
  'height': 640,
  'width': 640,
  'id': 0,
  'coco_ids': {'image_id': 562243, 'annotations_id': 448867}},
 {'database': 'EMOTIC',
  'file_name': 'COCO_train2014_000000288841.jpg',
  'folder': 'mscoco/images',
  'name': 'mscoco',
  'height': 480,
  'width': 640,
  'id': 1,
  'coco_ids': {'image_id': 288841, 'annotations_id': 1750456}},
 {'database': 'EMOTIC',
  'file_name': 'COCO_val2014_000000558171.jpg',
  'folder': 'mscoco/images',
  'name': 'mscoco',
  'height': 480,
  'width': 640,
  'id': 2,
  'coco_ids': {'image_id': 558171, 'annotations_id': 467799}},
 {'database': 'EMOTIC',
  'file_name': 'COCO_train2014_000000369575.jpg',
  'folder': 'mscoco/images',
  'name': 'mscoco',
  'height': 640,
  'width': 480,
  'id': 3,
  'coco_ids': {'image_id': 369575, 'annotations_id': 192459}},
 {'database': 'EMOTIC',
  'file_name': 'COCO_train2014_000000213009.jpg',
  'folder

# Images dictionnary


In [186]:
# Importing model 
model = torch.hub.load('facebookresearch/detr:main', 'detr_resnet50', pretrained=True)
model.eval()

Using cache found in C:\Users\JALAL/.cache\torch\hub\facebookresearch_detr_main


DETR(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerDecoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, ou

In [187]:

# standard PyTorch mean-std input image normalization
transform = T.Compose([
    T.Resize(800),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return b

def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=1)

In [188]:
def model_results(img):
    img = transform(img).unsqueeze(0)
    outputs = model(img)
    # keep only predictions with 0.9+ confidence and labeled as "person"
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = (probas.max(-1).values > 0.9) & (probas.argmax(-1) == 1)  # Filter for "person" class
    # convert boxes from [0; 1] to image scales
    # Correcting the line causing TypeError
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], img.size()[2:])
    bboxes_scaled = bboxes_scaled.tolist()
    return bboxes_scaled , probas[keep]

In [189]:
original_path = "EMOTIC (1)/EMOTIC/PAMI/emotic"
# liste d'appairement des images et des annotations en dictionnaires
list_appair = []
i = 0
for image in train_img:
    # image est le dictionnaire d'information d'une image
    # train image est le dictionnaire d'information de toutes les images
    if i<5:
        file_name = image['file_name']
        folder = image['folder']
        img_path = original_path + '/' + folder + '/' + file_name
        img = Image.open(img_path)
        bboxes , probas = model_results(img)
        list_appair.append({'id': image['id'], 'bboxes': bboxes})
    i+=1
print(list_appair)

[{'id': 0, 'bboxes': [[104.82051086425781, 79.68885803222656, 705.2249145507812, 796.5303344726562]]}, {'id': 1, 'bboxes': [[558.7202758789062, 853.2990112304688, 680.6359252929688, 1065.699462890625], [756.3778686523438, 333.6680603027344, 793.0950317382812, 502.22509765625], [743.842041015625, 297.0343322753906, 800.0303344726562, 839.23779296875], [188.13223266601562, 364.7883605957031, 262.90679931640625, 667.5501098632812], [200.64874267578125, 211.2132568359375, 420.8241882324219, 1059.3511962890625], [757.4473266601562, 337.278564453125, 783.6183471679688, 415.2267761230469], [338.9683837890625, 161.04525756835938, 541.1212158203125, 1009.454833984375], [501.7662658691406, 223.66307067871094, 574.97802734375, 698.0384521484375], [604.4768676757812, 332.0246276855469, 760.1392211914062, 1048.1500244140625]]}, {'id': 2, 'bboxes': [[619.4775390625, 829.3400268554688, 800.140869140625, 1060.1058349609375], [12.161705017089844, 704.91845703125, 23.83639144897461, 745.393798828125], [

# Annotations dictionnary 

In [190]:
def iou(box1, box2):
    x1, y1, x2, y2 = box1
    x1g, y1g, x2g, y2g = box2
    # determine the coordinates of the intersection rectangle
    xA = max(x1, x1g)
    yA = max(y1, y1g)
    xB = min(x2, x2g)
    yB = min(y2, y2g)
    # compute the area of intersection rectangle
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = (x2 - x1 + 1) * (y2 - y1 + 1)
    boxBArea = (x2g - x1g + 1) * (y2g - y1g + 1)
    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    iou = interArea / float(boxAArea + boxBArea - interArea)
    # return the intersection over union value
    return iou

def get_iou(bbox1, bbox2, thresh, new_annots = []):
    iou_score = iou(bbox1, bbox2)
    if iou_score < thresh:
        new_annots.append(bbox1)
        new_annots.append(bbox2)
    else:
        pass
    return new_annots

def remove_duplicates(lst):
    return [list(t) for t in {tuple(item) for item in lst}]

In [191]:
print(train_anno[0]['bbox'])
train_new_annots = train_anno

[86, 58, 564, 628]


In [211]:
for i, anno in enumerate(train_new_annots):
    if i < 5:
        img_id = anno['image_id']
        anno_id = anno['id']
        bbox = anno['bbox']
        
        # Check if bbox is not already a list of lists
        if not isinstance(bbox[0], list):
            bbox = [bbox]
        
        print(img_id, anno_id, bbox)
        
        for appair in list_appair:
            if appair['id'] == img_id:
                # Initialize new annotations list for managing bboxes for each image
                new_annots = []
                
                for single_bbox in bbox:  # Use single_bbox to avoid confusion with the outer bbox
                    for bbox2 in appair['bboxes']:
                        new_annots = get_iou(single_bbox, bbox2, 0.99, new_annots)
                
                anno['bbox'] = remove_duplicates(new_annots)
        
        if len(train_anno[i]['bbox']) > len(train_new_annots[i]['annotations_categories']):
            train_new_annots[i]['annotations_categories'].extend([None] * (len(train_anno[i]['bbox']) - len(train_new_annots[i]['annotations_categories'])))


0 0 [[104.82051086425781, 79.68885803222656, 705.2249145507812, 796.5303344726562], [86, 58, 564, 628]]
1 1 [[338.9683837890625, 161.04525756835938, 541.1212158203125, 1009.454833984375], [200.64874267578125, 211.2132568359375, 420.8241882324219, 1059.3511962890625], [485, 149, 605, 473], [604.4768676757812, 332.0246276855469, 760.1392211914062, 1048.1500244140625], [757.4473266601562, 337.278564453125, 783.6183471679688, 415.2267761230469], [558.7202758789062, 853.2990112304688, 680.6359252929688, 1065.699462890625], [501.7662658691406, 223.66307067871094, 574.97802734375, 698.0384521484375], [756.3778686523438, 333.6680603027344, 793.0950317382812, 502.22509765625], [743.842041015625, 297.0343322753906, 800.0303344726562, 839.23779296875], [188.13223266601562, 364.7883605957031, 262.90679931640625, 667.5501098632812]]
2 2 [[12.161705017089844, 704.91845703125, 23.83639144897461, 745.393798828125], [386.17138671875, 207.0525665283203, 576.8663940429688, 1018.915283203125], [305, 92, 4

In [203]:
# Saving the new annotations 
train_new_annots[0]['bbox']   

[[104.82051086425781, 79.68885803222656, 705.2249145507812, 796.5303344726562],
 [86, 58, 564, 628]]

In [215]:

# Create a dictionary with the images and annotations
mixed_data = {'images': train_img, 'annotations': train_new_annots}

# Save the mixed data as a JSON file
with open('./newest_annots.json', 'w') as f:
    json.dump(mixed_data, f)
