In [74]:
from transformers import AutoModelForSequenceClassification, OFATokenizer, utils
from PIL import Image
from torchvision import transforms
from transformers import OFATokenizer, OFAModel
from transformers.models.ofa.generate import sequence_generator
import torch

In [93]:
import pandas as pd
from PIL import Image
import io
import base64

# Load the tab-separated example table. The file has no header row, so columns
# are addressed by position.
data = pd.read_csv('dataset/vision_language_examples.tsv', delimiter='\t', header=None)

# Pick the base64-encoded image of the example at row index 7 (the eighth row).
# NOTE(review): assumes column 1 holds the image payload - confirm against the TSV schema.
image_data = data.iloc[7, 1]

# Decode the base64 text and let PIL parse the raw bytes from an in-memory buffer.
image_bytes = base64.b64decode(image_data)
image_buffer = io.BytesIO(image_bytes)
image = Image.open(image_buffer)

# JPEG cannot store an alpha channel or a palette, so saving e.g. an RGBA or P
# image raises OSError. Convert only those modes the JPEG writer rejects; images
# already in a JPEG-compatible mode are written unchanged.
if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
    image = image.convert("RGB")

# Write the example image to disk as a JPEG.
image.save('examples/vision_language_examples/teacher.jpg', 'JPEG')

In [76]:
# Fetch the caption of the same example: row index 7, third column of the table.
caption = data.iat[7, 2]
caption

'ewa makai teacher walking around classroom while giving a lecture to her students.'

In [77]:
from transformers import AutoModel

# Load the OFA VQA checkpoint, with the attention and hidden-state output flags
# switched on through the keyword arguments below.
model = AutoModel.from_pretrained(
    "OFA-Sys/ofa-huge-vqa",
    output_attentions=True,
    output_hidden_states=True,
)

# The tokenizer is loaded from the same checkpoint name as the model.
tokenizer = OFATokenizer.from_pretrained("OFA-Sys/ofa-huge-vqa")

OFA-Sys/ofa-huge-vqa
<super: <class 'OFATokenizer'>, <OFATokenizer object>>


In [78]:
# Per-channel normalisation statistics and the square side length of the model input.
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
resolution = 480

# Image preprocessing pipeline, applied in order:
#   1. force RGB mode,
#   2. resize to resolution x resolution with bicubic resampling,
#   3. convert to a tensor,
#   4. normalise with the mean/std defined above.
patch_resize_transform = transforms.Compose(
    [
        lambda img: img.convert("RGB"),
        transforms.Resize(size=(resolution, resolution), interpolation=Image.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]
)

In [163]:
# Image side of the query: open the picture, run it through the preprocessing
# pipeline and add a leading batch dimension.
image = Image.open('examples/vcr_3880.jpg')
patch_img = patch_resize_transform(image).unsqueeze(0)

# Text side of the query. Prompts tried previously, kept for reference:
#   "what is the woman in the centre doing?"
#   " what is this?"
#   " what city are the giants from?"
#   "what number is the small hand on?"
#   "what sort of vehicle uses this item?"
#   "why is [person1] pointing a gun at [person2]?"
txt = 'why is the woman pointing a gun at the man?'

# Tokenise the single question as a batch of one and keep only the id tensor.
inputs = tokenizer([txt], return_tensors="pt").input_ids

In [164]:
# Build the sequence generator used for decoding (beam width 5).
# NOTE(review): max_len_b / min_len presumably bound the output length as in
# fairseq's generator - confirm against the sequence_generator module.
generator = sequence_generator.SequenceGenerator(
    tokenizer=tokenizer,
    beam_size=5,
    min_len=0,
    max_len_b=16,
    temperature=0.5,
    no_repeat_ngram_size=3,
)

In [165]:
# Assemble the batch in the {"net_input": {...}} layout handed to generator.generate.
# It gets its own name (`sample`) so the `data` DataFrame loaded from the TSV is
# not overwritten; otherwise re-running the caption cell would fail on a dict.
sample = {
    "net_input": {
        "input_ids": inputs,
        'patch_images': patch_img,
        # One True flag per image in the batch (a single flag for this batch of one).
        'patch_masks': torch.tensor([True] * patch_img.shape[0]),
    }
}
gen_output = generator.generate([model], sample)

# Keep the token ids of the first hypothesis for every batch item.
# NOTE(review): index 0 is presumably the best-scoring beam - confirm.
gen = [hypotheses[0]["tokens"] for hypotheses in gen_output]

Type of input_ids: <class 'torch.Tensor'>
Shape of input_ids: torch.Size([1, 13])


In [166]:
# Map the generated token ids back to text, leaving out special tokens.
decoded_output = tokenizer.batch_decode(
    gen,
    skip_special_tokens=True,
)
print(decoded_output)

[' no']


In [167]:
# Ask the same question through the model's own generate() method
# (beam width 3 here) and print the decoded answer.
gen = model.generate(
    inputs,
    patch_images=patch_img,
    no_repeat_ngram_size=3,
    num_beams=3,
    temperature=0.5,
)

print(tokenizer.batch_decode(gen, skip_special_tokens=True))

Type of input_ids: <class 'torch.Tensor'>
Shape of input_ids: torch.Size([1, 13])
[" not sure why this isn't obvious to anyone else but it's not! she doesn't"]
