# Dataset analysis

In [None]:
import json
import pandas as pd
# file = open('/home/manuela/Downloads/refcocog-20230409T170358Z-001/refcocog/annotations/refs(umd).p', 'rb')
# obj = pickle.load(file)

# Root directory of the RefCOCOg data, relative to this notebook.
path = '../data/raw/refcocog/'

# NOTE(review): read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load annotation files that come from a trusted source.
obj = pd.read_pickle(path + 'annotations/refs(umd).p')
# Round-trip through JSON; presumably this normalises the unpickled object to
# plain lists/dicts -- TODO confirm that nothing is lost in the conversion.
refs = json.loads(json.dumps(obj))

# JSON text is UTF-8; pass the encoding explicitly so the read does not depend
# on the platform's locale default.
with open(path + 'annotations/instances.json', 'r', encoding='utf-8') as file:
    inst = json.load(file)


In [None]:
# images = pd.DataFrame(inst['images'])
# print(images.head())

# Print one sample record from each annotation structure to see its layout.
print(refs[1])
print(inst['info'])
print(inst['images'][0])
print(inst['annotations'][0])
print(inst['categories'])
print(inst['licenses'])

# Distinct split names used by the references.
splits = [ref['split'] for ref in refs]
print(set(splits))

# Top-level keys of the instances file.
for key in inst:
    print(key)


In [None]:
# Check number of images in images, annotations and references
# Distinct image ids declared in the image list.
images = {image['id'] for image in inst['images']}

# Distinct image ids that have at least one annotation.
img_ann = {ann['image_id'] for ann in inst['annotations']}

# NOTE(review): image ids of the references are not counted yet. `refs` is
# indexed like a list of dicts in this notebook, so the DataFrame-style line
# below would not work as written.
# img_refs = set(refs['image_id'].unique())

print(len(images), len(img_ann),  len(inst['annotations']))


In [None]:
import torch
from torchvision.utils import draw_bounding_boxes
from torchvision.io import read_image
from torchvision import transforms as t
from torchvision.ops import box_convert
# Draw the annotated bounding box of a single reference on top of its image.
sample = refs[157]

# Bounding box of the annotation this reference points to (first match).
matches = [a['bbox'] for a in inst['annotations'] if a['id'] == sample['ann_id']]
xywh = matches[0]

# Per the original note, the reference file name ends with the annotation id;
# drop that last "_"-separated piece to obtain the image file name.
name_parts = sample['file_name'].split('_')
jpg_name = '_'.join(name_parts[0:-1]) + '.jpg'
print(jpg_name)
print(sample['split'])

# draw_bounding_boxes is given corner coordinates, so convert from xywh.
xyxy = box_convert(torch.tensor([xywh]), in_fmt='xywh', out_fmt='xyxy')
print(sample['sentences'])

picture = read_image(path + 'images/' + jpg_name)
annotated = draw_bounding_boxes(picture, boxes=xyxy, colors='red', width=5)
to_pil = t.ToPILImage()
to_pil(annotated)


In [None]:
# Possible class structure for dataset sample
class Example:
    """Container for one dataset sample.

    Holds the reference id, its split, the referring sentences, the image
    id and file name, the object category and the bounding box. Values are
    stored exactly as passed in; nothing is validated or converted.
    """

    def __init__(
        self,
        id: str,
        split: str,
        sentences: list,
        image_id: str,
        file_name: str,
        category: dict,
        bbox: torch.Tensor,
    ) -> None:
        self.id, self.split, self.sentences = id, split, sentences
        self.image_id, self.file_name = image_id, file_name
        self.category, self.bbox = category, bbox

    def __str__(self):
        # Multi-line summary; image_id and bbox are intentionally not shown,
        # and only the category name (category['name']) is printed.
        return f'''
[
    Id: {self.id},
    Split: {self.split},
    Sentences: {self.sentences},
    Image: {self.file_name},
    Category: {self.category['name']}
]'''

# example:Example = Example()
# Build Example objects for the first 100 references.
examples = []

# Index categories and annotation boxes by id once, instead of rescanning the
# full lists for every reference. setdefault keeps the first entry for an id,
# which matches taking the first match of a linear scan.
category_by_id = {}
for cat in inst['categories']:
    category_by_id.setdefault(cat['id'], cat)

bbox_by_ann_id = {}
for ann in inst['annotations']:
    bbox_by_ann_id.setdefault(ann['id'], ann['bbox'])

for ref in refs[0:100]:
    # Keep only the tokens and the raw text of each sentence.
    sentences = [
        {'tokens': sentence['tokens'], 'sent': sentence['sent']}
        for sentence in ref['sentences']
    ]
    # Drop the last "_"-separated piece of the reference file name to obtain
    # the image file name.
    file_name = '_'.join(ref['file_name'].split('_')[0:-1]) + '.jpg'

    # A fresh dict per example so that examples never share a mutable category.
    # An unknown category or annotation id raises KeyError here.
    cat = category_by_id[ref['category_id']]
    category = {'category_id': cat['id'], 'name': cat['name']}
    bbox = bbox_by_ann_id[ref['ann_id']]
    box = box_convert(torch.tensor([bbox]), in_fmt='xywh', out_fmt='xyxy')

    e = Example(ref['ref_id'],
                ref['split'],
                sentences,
                ref['image_id'],
                file_name,
                category,
                box)
    examples.append(e)



In [None]:
# Check image widths and heights to choose a good target size when resizing for batches

from collections import Counter

# Gather the declared width and height of every image.
widths = [img['width'] for img in inst['images']]
heights = [img['height'] for img in inst['images']]

print(f'Max width: {max(widths)}, Min width: {min(widths)}, Avg width: {sum(widths)/len(widths)}')
print(f'Max height: {max(heights)}, Min height: {min(heights)}, Avg height: {sum(heights)/len(heights)}')

# Number of references in each split.
splits = [ref['split'] for ref in refs]
split_counts = Counter(splits)
print(split_counts)


In [None]:
# Parsing with dependency graphs
# First sentence of each of the first 200 references.
sents = [entry['sentences'][0]['sent'] for entry in refs[0:200]]

In [None]:
def get_subject_phrase(doc):
    """Pick the part of ``doc`` that most likely names the subject.

    Three heuristics are tried in order; the first one that yields a span
    longer than one token wins:

    1. a token whose dependency label contains "acl" (mainly -ing verbs):
       everything before the first token of that token's subtree;
    2. the same rule for labels containing "relcl"
       (subject which/that something);
    3. a token whose label contains "subj": the span from the first to the
       last token of its subtree.

    If none of them produces such a span, ``doc`` is returned unchanged.
    """
    # Clause passes: the candidate is whatever precedes the clause.
    for marker in ("acl", "relcl"):
        for token in doc:
            if marker not in token.dep_:
                continue
            clause_start = list(token.subtree)[0].i
            prefix = doc[0:clause_start]
            if len(prefix) > 1:
                return prefix

    # Subject pass: the candidate is the subject's own subtree.
    for token in doc:
        if "subj" not in token.dep_:
            continue
        members = list(token.subtree)
        phrase = doc[members[0].i:members[-1].i + 1]
        if len(phrase) > 1:
            return phrase

    return doc

In [None]:
# Parsing with noun chunks
import spacy

# Load the large English pipeline once; `nlp` is reused further down.
nlp = spacy.load('en_core_web_lg')

# Print each collected sentence followed by the noun chunks found in it.
for text in sents:
    parsed = nlp(text)
    print(text)
    for noun_chunk in parsed.noun_chunks:
        print(noun_chunk.text)
    print('----------------')


In [None]:
from spacy import displacy

# Render the dependency tree of a sample phrase, then list every token with
# its dependency label and lemma.
sample_doc = nlp("a lady pouring wine in the bigger glass")
displacy.render(sample_doc, style='dep')

for tok in sample_doc:
    print(tok.text, tok.dep_, tok.lemma_)

In [None]:
# Test for image blurring
import torch
from torchvision.io import read_image
from torchvision import transforms as T
from torchvision.ops import box_convert
import matplotlib.pyplot as plt

# Blur the whole image, then paste the original pixels back inside the box.
target = refs[51]

# Bounding box of the annotation this reference points to (first match).
candidates = [a['bbox'] for a in inst['annotations'] if a['id'] == target['ann_id']]
xywh = candidates[0]

# Per the original note, the reference file name ends with the annotation id;
# drop that last "_"-separated piece to obtain the image file name.
stem = '_'.join(target['file_name'].split('_')[0:-1])
jpg_name = stem + '.jpg'
print(jpg_name)

xyxy = box_convert(torch.tensor([xywh]), in_fmt='xywh', out_fmt='xyxy')
print(target['sentences'][0]['sent'])
original = read_image(path + 'images/' + jpg_name)

blur = T.GaussianBlur(25, 20)
blurred = blur(original)

# Integer corner coordinates are needed for slicing.
left, top, right, bottom = xyxy.squeeze(0).numpy().astype(int)
blurred[:, top:bottom, left:right] = original[:, top:bottom, left:right]
print(blurred.shape)
plt.imshow(blurred.permute(1,2,0))


In [None]:
# Test for bounding boxes resizing
from typing import List, Tuple
import numpy as np
from torch import Tensor
import torchvision.transforms as T
from PIL import Image
from torchvision.ops import box_convert 
import torch


def transform_sample(image: Image.Image , box: List[int], target_size: int = 224)-> Tuple[Tensor, Tensor]:
    """Resize an image to a square and rescale its bounding box to match.

    Args:
        image: PIL image to resize.
        box: Bounding box as ``[x, y, width, height]`` in pixels of the
            original image. NOTE(review): annotated as ints, but the values
            presumably come straight from the annotation file and may be
            floats -- TODO confirm.
        target_size: Side length, in pixels, of the square output image.

    Returns:
        A pair ``(image_tensor, bbox_tensor)``: the resized image converted
        with ``PILToTensor``, and a tensor of shape ``(1, 4)`` holding the
        integer ``[xmin, ymin, xmax, ymax]`` corners of the box in the
        resized image.
    """
    # PIL reports size as (width, height).
    width, height = image.size
    x_scale = target_size / width
    y_scale = target_size / height

    # Resizing to an explicit (height, width) already yields exactly
    # target_size x target_size, so no additional crop is needed.
    trans = T.Compose(transforms=[
        T.Resize((target_size, target_size)),
        T.PILToTensor()
    ])
    image_tensor: Tensor = trans(image)

    # Convert to corner format, then scale x coordinates by x_scale and
    # y coordinates by y_scale in a single tensor operation.
    box_xyxy = box_convert(torch.tensor([box]), in_fmt='xywh', out_fmt='xyxy')
    scale = torch.tensor([x_scale, y_scale, x_scale, y_scale])
    bbox_tensor: Tensor = torch.round(box_xyxy * scale).to(torch.int64)
    return image_tensor, bbox_tensor



In [None]:
from typing import List
from PIL import Image
import matplotlib.pyplot as plt
from torchvision.utils import draw_bounding_boxes
# Visual check: resize one sample and draw its rescaled bounding box.
sample = refs[157]

# Drop the last "_"-separated piece of the reference file name to obtain the
# image file name.
jpg_name = '_'.join(sample['file_name'].split('_')[0:-1]) + '.jpg'
picture = Image.open(path + 'images/' + jpg_name)

# Bounding box of the annotation this reference points to (first match).
hits = [a['bbox'] for a in inst['annotations'] if a['id'] == sample['ann_id']]
xywh = hits[0]

resized, resized_box = transform_sample(picture, xywh, target_size=224)
overlay = draw_bounding_boxes(resized, resized_box, width=2, colors=['red'])
# Reorder the axes with permute(1,2,0) before handing the tensor to imshow.
plt.imshow(overlay.permute(1,2,0))
print(sample['sentences'][0]['sent'])

