# Prepping for real run. 

Todo: 
* Define `max_source_length` and `max_target_length` for the model (otherwise truncated).
padding token should be replaced with -100, which is the 'ignore_index' of `CrossEntorpyLoss` in PT and TF. For Flax, use `decoder_attention_mask`. 
Attention_mask. ensures madding tokens of inputs are ignored. 

* Install apex. "model will automatically use apex.normalization.FusedRMSNorm instead of T5LayerNorm." The former uses an optimized fused kernel which is several times faster than the latter.

A note on model sizes: 
T5-11B (original, not v1.1) weights in float32 are 45.2GB. 
See this post for using huggingface endpoints on SINGLE GPU for cheap inference: https://www.philschmid.de/deploy-t5-11b
Uses mixed precision and sharding, and LLM.int8(). 

In [2]:
import torch
import clip
import os
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5EncoderModel, T5Model, T5Config, AutoModelWithLMHead
import accelerate
# import wandb
from tqdm import tqdm
import lovely_tensors as lt
import math
from PIL import Image
lt.monkey_patch()
# !wandb login  -- reactivate later
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [19]:
# Split VAL2014 into train and test datasets 

train_set = []
test_set = []

for i, img_name in enumerate(os.listdir("/home/kastan/thesis/video-pretrained-transformer/vqa/val2014")):
    if i >= 39000:
        test_set.append(os.path.join("/home/kastan/thesis/video-pretrained-transformer/vqa/val2014", img_name))
    elif i <= 5000:
        train_set.append(os.path.join("/home/kastan/thesis/video-pretrained-transformer/vqa/val2014", img_name))

print(len(train_set))
print(len(test_set))

5001
1504


In [20]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("device: ", device)

# Initialize preprocessing models for collate_fn 
print("Initializing clip and scene graph models...")
clip_model, clip_preprocess = clip.load('ViT-L/14@336px', device)

t5_tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-base", return_special_tokens_mask=True)


device:  cuda
Initializing clip and scene graph models...


In [21]:
def collate_fn(batch):
    input_imgs = []
    input_questions = []
    answers = []
    scene_strs = []
    question_ids = []
    for elt in batch:
        img, question, answer, scene_graph_str, question_id = elt
        input_imgs.append(clip_preprocess(Image.fromarray(img)).unsqueeze(0))
        input_questions.append(question)
        answers.append(answer)
        scene_strs.append(scene_graph_str)
        question_ids.append(question_id)

    image_input = torch.cat(input_imgs).to(device)
    text_input = clip.tokenize(input_questions, truncate=True).to(device)
    sg_input = clip.tokenize(scene_strs, truncate=True).to(device)

    with torch.inference_mode(): # even faster than no_grad()
        # image_features = torch.unsqueeze(clip_model.encode_image(image_input), dim=1)
        # text_features = torch.unsqueeze(clip_model.encode_text(text_input), dim=1)
        # sg_features = torch.unsqueeze(clip_model.encode_text(sg_input), dim=1)
        image_features = clip_model.encode_image(image_input)
        text_features = clip_model.encode_text(text_input)
        sg_features = clip_model.encode_text(sg_input)

    # labels = t5_tokenizer(answers, padding="longest", max_length=128, truncation=True, return_tensors="pt").input_ids
    # labels[labels == t5_tokenizer.pad_token_id] = -100

    # print("answers: ", answers)
    labels = t5_tokenizer(answers, return_tensors="pt").input_ids

    # print("labels: ", labels)

    return image_features, text_features, sg_features, labels, question_ids

# Moc Dataset

In [30]:
import json

class VQA(torch.utils.data.Dataset):
    def __init__(self, annotations, img_path, pseudo_mode="val", mode="train", img_set=None):
        self.annotations = [] 
        self.questions = None
        self.img_path = img_path
        self.qid_to_question = {}
        self.mode = mode
        self.pseudo_mode = pseudo_mode

        self.img_to_scene_graph = {}
        self.question_to_answer = {}

        print(f"Constructing {self.mode} dataset...")
        with open(annotations[f"{self.mode}_annotations"]) as f:
            all_annotations = json.load(f)['annotations']
            print("Stored annotations...")

            for annotation in tqdm(all_annotations):
                image_id = annotation["image_id"]
                padded_image_id = "".join((12 - len(str(image_id)))*["0"]) + str(image_id)
                img_path = os.path.join(self.img_path, f"COCO_{self.pseudo_mode}2014_{padded_image_id}.jpg")

                if img_set is not None:
                    if img_path in img_set:
                        self.annotations.append(annotation)
                else:
                    self.annotations.append(annotation)

        self.construct_question_to_answer_dict()

        with open(annotations[f"scene_graph_json"]) as f:
            print("Processing scene graph json...")
            for obj in f:
                json_obj = json.loads(eval(eval(obj)))
                if str(json_obj["input_img_path"]) not in self.img_to_scene_graph:
                    self.img_to_scene_graph[str(json_obj["input_img_path"])] = str(json_obj["scene_graph_string"])
                else:
                    print("Duplicate scene graphs exist!")

            print("Created img name to scene graph mapping...")

       
        with open(annotations[f"{self.mode}_questions"]) as f:
            self.questions = json.load(f)["questions"]
            print(f"Stored questions...")
            for question in self.questions:
                self.qid_to_question[question["question_id"]] = question["question"]
            print("Created question ID to question mapping...")
            
        

        print(f"{self.mode} dataset contains {len(self.annotations)} annotations")

    
    def construct_question_to_answer_dict(self):
        print("Constructing question to answer dictionary...")
        for annotation in tqdm(self.annotations):
            self.question_to_answer[annotation["question_id"]] = annotation
            
        print("Constructed question to answer dictionary...")

    def get_question_to_answer_dict(self):
        return self.question_to_answer

    def __getitem__(self, idx):
        curr_annotation = self.annotations[idx]

        question_id = curr_annotation["question_id"]
        image_id = curr_annotation["image_id"]
        answers = curr_annotation["answers"]

        # Don't need these for now but may need them for future ablations
        # question_type = curr_annotation["question_type"]
        # answer_type = curr_annotation["answers"]
        # multiple_choice_answer = curr_annotation["multiple_choice_answer"]

        # VQA has multiple possible answers, can modify this later to use other answers
        answer_choice = answers[0]["answer"]
        
        img = None

        question = self.qid_to_question.get(question_id, None)

        assert question is not None

        padded_image_id = "".join((12 - len(str(image_id)))*["0"]) + str(image_id)

        img_path = os.path.join(self.img_path, f"COCO_{self.pseudo_mode}2014_{padded_image_id}.jpg")

        assert os.path.exists(img_path)
            
        img = np.asarray(Image.open(img_path))

        scene_graph_str = self.img_to_scene_graph.get(img_path, None)
       
        assert scene_graph_str is not None
        
        return img, question, answer_choice, scene_graph_str, question_id

    def __len__(self):
        return len(self.annotations)


# we use the original val set as our test set since we have the annotations available (test annotations not available)
# this will be useful for error analysis later on

annotations = {
    "train_questions": "/home/kastan/thesis/video-pretrained-transformer/vqa/v2_OpenEnded_mscoco_val2014_questions.json",
    "test_questions": "/home/kastan/thesis/video-pretrained-transformer/vqa/v2_OpenEnded_mscoco_val2014_questions.json",
    "train_annotations": "/home/kastan/thesis/video-pretrained-transformer/vqa/v2_mscoco_val2014_annotations.json",
    "test_annotations": "/home/kastan/thesis/video-pretrained-transformer/vqa/v2_mscoco_val2014_annotations.json",
    "scene_graph_json": "/home/kastan/thesis/video-pretrained-transformer/vqa/val_scene_graph.json"
}

train_dataset = VQA(annotations, "/home/kastan/thesis/video-pretrained-transformer/vqa/val2014", mode="train", img_set=train_set)
test_dataset = VQA(annotations, "/home/kastan/thesis/video-pretrained-transformer/vqa/val2014", mode="test", img_set=test_set)

Constructing train dataset...
Stored annotations...


100%|██████████| 214354/214354 [00:12<00:00, 17220.43it/s]


Constructing question to answer dictionary...


100%|██████████| 26297/26297 [00:00<00:00, 2049379.64it/s]


Constructed question to answer dictionary...
Processing scene graph json...
Created img name to scene graph mapping...
Stored questions...
Created question ID to question mapping...
train dataset contains 26297 annotations
Constructing test dataset...
Stored annotations...


100%|██████████| 214354/214354 [00:04<00:00, 51146.58it/s]


Constructing question to answer dictionary...


100%|██████████| 8134/8134 [00:00<00:00, 2225035.46it/s]


Constructed question to answer dictionary...
Processing scene graph json...
Created img name to scene graph mapping...
Stored questions...
Created question ID to question mapping...
test dataset contains 8134 annotations


In [31]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

NUM_EPOCHS = 1
MODEL_NAME = "google/t5-v1_1-base"
RUN_NAME = "all_modalities"
BATCH_SIZE = 1

print("Initializing t5 model...")
# config = T5Config.from_pretrained(MODEL_NAME)
t5 = T5ForConditionalGeneration.from_pretrained("/home/kastan/thesis/video-pretrained-transformer/model/yt_pretrain_vqa_val_finetune", torch_dtype=torch.float32, low_cpu_mem_usage=True).to(device)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, return_special_tokens_mask=True)

t5.train()

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

optimizer = torch.optim.Adam(params=t5.parameters(), lr=1e-4)

iter_losses = []

one_input_shape = [BATCH_SIZE, 768, 768]
att_mask_shape = [BATCH_SIZE, 768]

input_embeds_arr = torch.zeros(one_input_shape).to(device) # .astype(np.float16)
attn_mask_arr    = torch.zeros(att_mask_shape).to(device)
attn_mask_arr[0, 0] = 1
attn_mask_arr[0, 1] = 1
attn_mask_arr[0, 2] = 1

input_embeds_arr = torch.zeros(one_input_shape).to(device) 

for epoch in range(NUM_EPOCHS):
    for i, batch in enumerate(tqdm(train_loader)):
        if i % 500 == 0:
            t5.save_pretrained(f"/home/kastan/thesis/video-pretrained-transformer/vqa/model_ckpts/{RUN_NAME}_iter{i}")
            
        optimizer.zero_grad()

        question_embed, img_embed, scene_graph_embed, labels, _ = batch

        question_embed = question_embed.to(device)
        img_embed = img_embed.to(device)
        scene_graph_embed = scene_graph_embed.to(device)
        labels = labels.to(device)

        input_embeds_arr[0, 0, :] = img_embed
        input_embeds_arr[0, 1, :] = question_embed
        input_embeds_arr[0, 2, :] = scene_graph_embed

        # labels = t5_tokenizer("hi my name is pranav", return_tensors="pt").input_ids.to(device)
        # print("labels shaep: ", labels.shape)

        outputs = t5.forward(inputs_embeds=input_embeds_arr, attention_mask=attn_mask_arr, labels=labels)

        loss = outputs[0]

        assert not torch.isnan(loss).any()

        iter_losses.append(loss)

        loss.sum().backward()

        # torch.nn.utils.clip_grad_norm_(t5.parameters(), 1e-2)

        optimizer.step()
        # print("✅ Successful training iteration")

        if i % 100 == 0:
            print("Loss: ", loss.item())


    print(f"Epoch {epoch} done.")
        

Initializing t5 model...


  0%|          | 0/26297 [00:01<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 10.92 GiB total capacity; 2.79 GiB already allocated; 8.50 MiB free; 2.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [13]:
t5.save_pretrained(f"/home/kastan/thesis/video-pretrained-transformer/vqa/model_ckpts/{RUN_NAME}_iter{19401}")

In [20]:
print(len(iter_losses))

new_iter_losses = [str(t.item()) for t in iter_losses]

# print(new_iter_losses)
file1 = open('/home/kastan/thesis/video-pretrained-transformer/vqa/model_ckpts/iter_losses.txt', 'w')
file1.writelines(",".join(new_iter_losses))
file1.close()

19756


In [28]:
# Load pretrained model
BATCH_SIZE = 1
MODEL_STR = "google/t5-v1_1-base"
# t5_eval = T5ForConditionalGeneration.from_pretrained("/home/kastan/thesis/video-pretrained-transformer/vqa/model_ckpts/all_modalities_iter19401", torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device)
t5_eval = T5ForConditionalGeneration.from_pretrained("/home/kastan/thesis/video-pretrained-transformer/vqa/model_ckpts/all_modalities_iter5000", torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device)
tokenizer = T5Tokenizer.from_pretrained(MODEL_STR, return_special_tokens_mask=True)

question_to_answer = test_dataset.get_question_to_answer_dict()

t5_eval.eval()

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

iter_accuracies = []

one_input_shape = [BATCH_SIZE, 768, 768]
att_mask_shape = [BATCH_SIZE, 768]

input_embeds_arr = torch.zeros(one_input_shape).to(device) # .astype(np.float16)
attn_mask_arr    = torch.zeros(att_mask_shape).to(device)
attn_mask_arr[0, 0] = 1
attn_mask_arr[0, 1] = 1
attn_mask_arr[0, 2] = 1

for i, batch in enumerate(tqdm(test_loader)):
    question_embed, img_embed, scene_graph_embed, labels, question_ids = batch

    answers = []
    for question_id in question_ids:
        answers.append([answer_obj["answer"] for answer_obj in question_to_answer[question_id]["answers"]])

    # To view the image, question and possible answers, uncomment
    # image_id = question_to_answer[question_id]["image_id"]
    # padded_image_id = "".join((12 - len(str(image_id)))*["0"]) + str(image_id)
    # img_path = os.path.join(self.img_path, f"COCO_{self.pseudo_mode}2014_{padded_image_id}.jpg")
    # print(img_path)
    # print("question: ", test_dataset.qid_to_question[question_ids[0]])
    # print(answers)

    question_embed = question_embed.to(device)
    img_embed = img_embed.to(device)
    scene_graph_embed = scene_graph_embed.to(device)
    labels = labels.to(device)

    input_embeds_arr[0, 0, :] = img_embed
    input_embeds_arr[0, 1, :] = question_embed
    input_embeds_arr[0, 2, :] = scene_graph_embed

    # input_embeds_arr = torch.cat((question_embed, img_embed, scene_graph_embed), dim=1)

    # outputs = t5.forward(inputs_embeds=input_embeds_arr, labels=labels)
    # output_sequences = t5_eval.generate(inputs_embeds=input_embeds_arr,  attention_mask=attn_mask_arr, do_sample=False)
    
    # outputs = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
    outputs = ['no']
    if i % 100 == 0:
        print("question: ", test_dataset.qid_to_question[question_ids[0]])
        print("answers: ", answers)
        print("outputs: ", outputs)

    for j, output in enumerate(outputs):
        curr_answers = answers[j]

        # evaluation metric for VQA: https://visualqa.org/evaluation.html
        iter_accuracies.append(min(curr_answers.count(str(output)) / 3.0, 1.0))


print("Accuracy: ", sum(iter_accuracies)/len(iter_accuracies))
    

  0%|          | 2/8134 [00:00<12:26, 10.89it/s]

question:  What is the man doing in the street?
answers:  [['crossing it', 'walking', 'walking', 'crossing', 'crossing road', 'walking', 'crossing', 'walking', 'crossing', 'walking']]
outputs:  ['no']


  1%|▏         | 102/8134 [00:08<11:00, 12.17it/s]

question:  What is the player wearing around his head?
answers:  [['hat', 'cap', 'hat', 'cap', 'cap', 'hat', 'hat', 'hat', 'hat', 'hat']]
outputs:  ['no']


  2%|▏         | 202/8134 [00:16<10:57, 12.06it/s]

question:  Where would luggage go?
answers:  [['in back of train', 'on top', 'inside', 'under seats', 'roof rack', 'top', 'top', 'roof rack', 'on top of bus', 'on roof']]
outputs:  ['no']


  4%|▎         | 302/8134 [00:25<11:07, 11.74it/s]

question:  What does the red sign with the yellow M mean?
answers:  [["mcdonald's", "mcdonald's", 'mcdonalds', 'mcdonalds', "mcdonald's", "mcdonald's", 'mcdonald', "mcdonald's", "mcdonald's restaurant ahead", "mcdonald's"]]
outputs:  ['no']


  5%|▍         | 402/8134 [00:33<11:05, 11.61it/s]

question:  Are the motorcycles parked legally?
answers:  [['yes', 'yes', 'yes', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'yes']]
outputs:  ['no']


  6%|▌         | 502/8134 [00:41<10:28, 12.14it/s]

question:  What is the player's number?
answers:  [['46', '46', '46', '46', '46', '46', '46', '46', '46', '46']]
outputs:  ['no']


  7%|▋         | 602/8134 [00:49<10:11, 12.31it/s]

question:  What type of place is this?
answers:  [['school', 'school', 'school', 'outdoors', 'school', 'outdoors', 'school', 'school', "boy's school", 'school']]
outputs:  ['no']


  9%|▊         | 702/8134 [00:58<10:23, 11.93it/s]

question:  How many pastries are in this picture?
answers:  [['0', '0', '47', '30', '0', '63', '50', 'lot', '20', '0']]
outputs:  ['no']


 10%|▉         | 802/8134 [01:06<09:58, 12.25it/s]

question:  What creature is depicted on the yellow sign?
answers:  [['dragon', 'dragon', 'snake', 'dragon', 'dragon', 'dragon', 'dragon', 'dragon', 'snake', 'dragon']]
outputs:  ['no']


 11%|█         | 902/8134 [01:14<09:59, 12.07it/s]

question:  How many people are in this picture?
answers:  [['5', '5', '5', '5', '4', '5', '5', '5', '5', '5']]
outputs:  ['no']


 12%|█▏        | 1002/8134 [01:23<10:03, 11.82it/s]

question:  How large is this living space?
answers:  [['small hospital room', 'small', 'small', 'small', 'small', 'fifty square feet', 'short', 'small', '10x12', 'small']]
outputs:  ['no']


 14%|█▎        | 1102/8134 [01:31<09:50, 11.91it/s]

question:  What is the person in red holding?
answers:  [['plate', 'plate of food', 'plate', 'plate of food', 'plate', 'her dinner plate', 'plate', 'plate', 'plate of food', 'food']]
outputs:  ['no']


 15%|█▍        | 1202/8134 [01:40<09:41, 11.92it/s]

question:  Do the tires look matching?
answers:  [['yes', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no']]
outputs:  ['no']


 16%|█▌        | 1302/8134 [01:48<09:31, 11.94it/s]

question:  What is on the ground?
answers:  [['leaves', 'leaves', 'fire hydrant', 'fire hydrant', 'leaves', 'leaves', 'leaves', 'leaves', 'fire hydrant', 'leaves']]
outputs:  ['no']


 17%|█▋        | 1402/8134 [01:56<09:25, 11.91it/s]

question:  Is there a glare on the glass?
answers:  [['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes']]
outputs:  ['no']


 18%|█▊        | 1502/8134 [02:05<09:12, 12.00it/s]

question:  What fruit is hanging from the wall?
answers:  [['0', '0', '0', '0', '0', '0', 'curtain', 'banana', '0', 'pizza']]
outputs:  ['no']


 20%|█▉        | 1602/8134 [02:13<09:14, 11.79it/s]

question:  Is the person holding the umbrella real?
answers:  [['no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no']]
outputs:  ['no']


 21%|██        | 1702/8134 [02:22<08:57, 11.97it/s]

question:  Is this belt in motion?
answers:  [['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes']]
outputs:  ['no']


 22%|██▏       | 1802/8134 [02:30<08:43, 12.09it/s]

question:  What is this person doing?
answers:  [['taking picture', 'talking', 'taking pic', 'taking picture', 'taking picture', 'looking at cell phone', 'looking at their phone', 'taking photo', 'taking photo', 'taking picture']]
outputs:  ['no']


 23%|██▎       | 1902/8134 [02:39<08:36, 12.08it/s]

question:  What color are the stripes?
answers:  [['white', 'white', 'white', 'white', 'white', 'white', 'blue', 'white', 'black and white', 'white']]
outputs:  ['no']


 24%|██▍       | 1983/8134 [02:46<08:34, 11.95it/s]


KeyboardInterrupt: 

In [29]:
sum(iter_accuracies)/len(iter_accuracies)

0.2390317700453855