# Prepping for real run. 

Todo: 
* Define `max_source_length` and `max_target_length` for the model (otherwise truncated).
padding token should be replaced with -100, which is the 'ignore_index' of `CrossEntorpyLoss` in PT and TF. For Flax, use `decoder_attention_mask`. 
Attention_mask. ensures madding tokens of inputs are ignored. 

* Install apex. "model will automatically use apex.normalization.FusedRMSNorm instead of T5LayerNorm." The former uses an optimized fused kernel which is several times faster than the latter.

A note on model sizes: 
T5-11B (original, not v1.1) weights in float32 are 45.2GB. 
See this post for using huggingface endpoints on SINGLE GPU for cheap inference: https://www.philschmid.de/deploy-t5-11b
Uses mixed precision and sharding, and LLM.int8(). 

In [1]:
import torch
import clip
import os
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5EncoderModel, T5Model, T5Config, AutoModelWithLMHead
import accelerate
# import wandb
from tqdm import tqdm
import lovely_tensors as lt
import math
from PIL import Image
lt.monkey_patch()
# !wandb login  -- reactivate later
device = 'cuda' if torch.cuda.is_available() else 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
''' GLOBALS '''
NUM_EPOCHS = 1
MODEL_NAME = "google/t5-v1_1-base"
RUN_NAME = "all_modalities"
BATCH_SIZE = 1
BASE_DIR = '/scratch/bbki/kastanday/whisper'
use_scene_graph = False


In [3]:
'''
# Split VAL2014 into train and test datasets 

BASE_DIR = '/scratch/bbki/kastanday/whisper'

train_set = []
test_set = []

for i, img_name in enumerate(os.listdir(f"{BASE_DIR}/vqa/val2014")):
    if i >= 39000:
        test_set.append(os.path.join(f"{BASE_DIR}/vqa/val2014", img_name))
    elif i <= 5000:
        train_set.append(os.path.join(f"{BASE_DIR}/vqa/val2014", img_name))

print(len(train_set))
print(len(test_set))
'''

'\n# Split VAL2014 into train and test datasets \n\nBASE_DIR = \'/scratch/bbki/kastanday/whisper\'\n\ntrain_set = []\ntest_set = []\n\nfor i, img_name in enumerate(os.listdir(f"{BASE_DIR}/vqa/val2014")):\n    if i >= 39000:\n        test_set.append(os.path.join(f"{BASE_DIR}/vqa/val2014", img_name))\n    elif i <= 5000:\n        train_set.append(os.path.join(f"{BASE_DIR}/vqa/val2014", img_name))\n\nprint(len(train_set))\nprint(len(test_set))\n'

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("device: ", device)

# Initialize preprocessing models for collate_fn 
print("Initializing clip and scene graph models...")
clip_model, clip_preprocess = clip.load('ViT-L/14@336px', device)

t5_tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-base", return_special_tokens_mask=True)


device:  cpu
Initializing clip and scene graph models...


In [5]:
def collate_fn(batch):
    input_imgs = []
    input_questions = []
    answers = []
    scene_strs = []
    question_ids = []
    for elt in batch:
        img, question, answer, scene_graph_str, question_id = elt
        input_imgs.append(clip_preprocess(Image.fromarray(img)).unsqueeze(0))
        input_questions.append(question)
        answers.append(answer)
        scene_strs.append(scene_graph_str)
        question_ids.append(question_id)

    image_input = torch.cat(input_imgs).to(device)
    text_input = clip.tokenize(input_questions, truncate=True).to(device)
    sg_input = clip.tokenize(scene_strs, truncate=True).to(device)

    with torch.inference_mode(): # even faster than no_grad()
        # image_features = torch.unsqueeze(clip_model.encode_image(image_input), dim=1)
        # text_features = torch.unsqueeze(clip_model.encode_text(text_input), dim=1)
        # sg_features = torch.unsqueeze(clip_model.encode_text(sg_input), dim=1)
        image_features = clip_model.encode_image(image_input)
        text_features = clip_model.encode_text(text_input)
        sg_features = clip_model.encode_text(sg_input)

    # labels = t5_tokenizer(answers, padding="longest", max_length=128, truncation=True, return_tensors="pt").input_ids
    # labels[labels == t5_tokenizer.pad_token_id] = -100

    # print("answers: ", answers)
    labels = t5_tokenizer(answers, return_tensors="pt").input_ids

    # print("labels: ", labels)

    return image_features, text_features, sg_features, labels, question_ids

# Moc Dataset

In [6]:
import json

class VQA(torch.utils.data.Dataset):
    def __init__(self, annotations, img_path, mode="train"):
        self.annotations = [] 
        self.questions = None
        self.img_path = img_path
        self.qid_to_question = {}
        self.mode = mode

        self.img_to_scene_graph = {}
        self.question_to_answer = {}

        print(f"Constructing {self.mode} dataset...")
        with open(annotations[f"{self.mode}_annotations"]) as f:
            all_annotations = json.load(f)['annotations']
            print("Stored annotations...")

            self.annotations = all_annotations

        self.construct_question_to_answer_dict()

        with open(annotations[f"{self.mode}_scene_graph_json"]) as f:
            print("Processing scene graph json...")
            for obj in f:
                json_obj = json.loads(eval(eval(obj)))
                if str(json_obj["input_img_path"]) not in self.img_to_scene_graph:
                    self.img_to_scene_graph[str(json_obj["input_img_path"])] = str(json_obj["scene_graph_string"])
                else:
                    print("Duplicate scene graphs exist!")

            print("Created img name to scene graph mapping...")

       
        with open(annotations[f"{self.mode}_questions"]) as f:
            self.questions = json.load(f)["questions"]
            print(f"Stored questions...")
            for question in self.questions:
                self.qid_to_question[question["question_id"]] = question["question"]
            print("Created question ID to question mapping...")
            
        

        print(f"{self.mode} dataset contains {len(self.annotations)} annotations")

    
    def construct_question_to_answer_dict(self):
        print("Constructing question to answer dictionary...")
        for annotation in tqdm(self.annotations):
            self.question_to_answer[annotation["question_id"]] = annotation
            
        print("Constructed question to answer dictionary...")

    def get_question_to_answer_dict(self):
        return self.question_to_answer

    def __getitem__(self, idx):
        curr_annotation = self.annotations[idx]

        question_id = curr_annotation["question_id"]
        image_id = curr_annotation["image_id"]
        answers = curr_annotation["answers"]

        # Don't need these for now but may need them for future ablations
        # question_type = curr_annotation["question_type"]
        # answer_type = curr_annotation["answers"]
        # multiple_choice_answer = curr_annotation["multiple_choice_answer"]

        # VQA has multiple possible answers, can modify this later to use other answers
        answer_choice = answers[0]["answer"]
        
        img = None

        question = self.qid_to_question.get(question_id, None)

        assert question is not None

        padded_image_id = "".join((12 - len(str(image_id)))*["0"]) + str(image_id)

        img_path = os.path.join(self.img_path, f"COCO_{self.mode}2014_{padded_image_id}.jpg")

        assert os.path.exists(img_path)
            
        img = np.asarray(Image.open(img_path))

        scene_graph_str = self.img_to_scene_graph.get(img_path, None)
       
        assert scene_graph_str is not None
        
        return img, question, answer_choice, scene_graph_str, question_id

    def __len__(self):
        return len(self.annotations)


# we use the original val set as our test set since we have the annotations available (test annotations not available)
# this will be useful for error analysis later on

annotations = {
    "train_questions": f"{BASE_DIR}/vqa/v2_OpenEnded_mscoco_train2014_questions.json",
    "test_questions": f"{BASE_DIR}/vqa/v2_OpenEnded_mscoco_test2014_questions.json",
    "train_annotations": f"{BASE_DIR}/vqa/v2_mscoco_train2014_annotations.json",
    "test_annotations": f"{BASE_DIR}/vqa/v2_mscoco_test2014_annotations.json",
    "train_scene_graph_json": f"{BASE_DIR}/vqa/train_scene_graph.json",
    "test_scene_graph_json": f"{BASE_DIR}/vqa/test_scene_graph.json"
}

train_dataset = VQA(annotations, f"{BASE_DIR}/vqa/train2014", mode="train")
# test_dataset = VQA(annotations, f"{BASE_DIR}/vqa/test2014", mode="test")

Constructing train dataset...
Stored annotations...
Constructing question to answer dictionary...


100%|██████████| 443757/443757 [00:00<00:00, 2707980.35it/s]


Constructed question to answer dictionary...
Processing scene graph json...
Created img name to scene graph mapping...
Stored questions...
Created question ID to question mapping...
train dataset contains 443757 annotations


In [7]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print("Initializing t5 model...")
# config = T5Config.from_pretrained(MODEL_NAME)
t5 = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float32, low_cpu_mem_usage=True).to(device)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, return_special_tokens_mask=True)

t5.train()

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

optimizer = torch.optim.Adam(params=t5.parameters(), lr=1e-4)

iter_losses = []

one_input_shape = [BATCH_SIZE, 768, 768]
att_mask_shape = [BATCH_SIZE, 768]

input_embeds_arr = torch.zeros(one_input_shape).to(device) # .astype(np.float16)
attn_mask_arr    = torch.zeros(att_mask_shape).to(device)
attn_mask_arr[0, 0] = 1
attn_mask_arr[0, 1] = 1

if use_scene_graph:
    attn_mask_arr[0, 2] = 1

input_embeds_arr = torch.zeros(one_input_shape).to(device) 

for epoch in range(NUM_EPOCHS):
    for i, batch in enumerate(tqdm(train_loader)):
        if i % 500 == 0:
            t5.save_pretrained(f"{BASE_DIR}/vqa/model_ckpts/{RUN_NAME}_iter{i}")
            
        optimizer.zero_grad()

        question_embed, img_embed, scene_graph_embed, labels, _ = batch

        question_embed = question_embed.to(device)
        img_embed = img_embed.to(device)
        scene_graph_embed = scene_graph_embed.to(device)
        labels = labels.to(device)

        input_embeds_arr[0, 0, :] = img_embed
        input_embeds_arr[0, 1, :] = question_embed
        if use_scene_graph:
            print("THIS SHOULD NOT BE HAPPENING")
            input_embeds_arr[0, 2, :] = scene_graph_embed

        # labels = t5_tokenizer("hi my name is pranav", return_tensors="pt").input_ids.to(device)
        # print("labels shaep: ", labels.shape)

        outputs = t5.forward(inputs_embeds=input_embeds_arr, attention_mask=attn_mask_arr, labels=labels)

        loss = outputs[0]

        assert not torch.isnan(loss).any()

        iter_losses.append(loss)

        loss.sum().backward()

        # torch.nn.utils.clip_grad_norm_(t5.parameters(), 1e-2)

        optimizer.step()
        # print("✅ Successful training iteration")

        if i % 100 == 0:
            print("Loss: ", loss.item())


    print(f"Epoch {epoch} done.")
        

Initializing t5 model...


  0%|          | 0/443757 [00:00<?, ?it/s]


AssertionError: 

In [None]:
t5.save_pretrained(f"{BASE_DIR}/vqa/model_ckpts/{RUN_NAME}_iter{19401}")

In [None]:
print(len(iter_losses))

new_iter_losses = [str(t.item()) for t in iter_losses]

# print(new_iter_losses)
file1 = open('/home/kastan/thesis/video-pretrained-transformer/vqa/model_ckpts/iter_losses.txt', 'w')
file1.writelines(",".join(new_iter_losses))
file1.close()

In [None]:
# Load pretrained model
BATCH_SIZE = 1
MODEL_STR = "google/t5-v1_1-base"
# t5_eval = T5ForConditionalGeneration.from_pretrained(f"{BASE_DIR}/vqa/model_ckpts/all_modalities_iter19401", torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device)
t5_eval = T5ForConditionalGeneration.from_pretrained(f"{BASE_DIR}/vqa/model_ckpts/all_modalities_iter5000", torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device)
tokenizer = T5Tokenizer.from_pretrained(MODEL_STR, return_special_tokens_mask=True)

question_to_answer = test_dataset.get_question_to_answer_dict()

t5_eval.eval()

# TOOD: add in val dataset here. and check check for if use_scenegraph
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

iter_accuracies = []

one_input_shape = [BATCH_SIZE, 768, 768]
att_mask_shape = [BATCH_SIZE, 768]

input_embeds_arr = torch.zeros(one_input_shape).to(device) # .astype(np.float16)
attn_mask_arr    = torch.zeros(att_mask_shape).to(device)
attn_mask_arr[0, 0] = 1
attn_mask_arr[0, 1] = 1
attn_mask_arr[0, 2] = 1

for i, batch in enumerate(tqdm(test_loader)):
    question_embed, img_embed, scene_graph_embed, labels, question_ids = batch

    answers = []
    for question_id in question_ids:
        answers.append([answer_obj["answer"] for answer_obj in question_to_answer[question_id]["answers"]])

    # To view the image, question and possible answers, uncomment
    # image_id = question_to_answer[question_id]["image_id"]
    # padded_image_id = "".join((12 - len(str(image_id)))*["0"]) + str(image_id)
    # img_path = os.path.join(self.img_path, f"COCO_{self.pseudo_mode}2014_{padded_image_id}.jpg")
    # print(img_path)
    # print("question: ", test_dataset.qid_to_question[question_ids[0]])
    # print(answers)

    question_embed = question_embed.to(device)
    img_embed = img_embed.to(device)
    scene_graph_embed = scene_graph_embed.to(device)
    labels = labels.to(device)

    input_embeds_arr[0, 0, :] = img_embed
    input_embeds_arr[0, 1, :] = question_embed
    input_embeds_arr[0, 2, :] = scene_graph_embed

    # input_embeds_arr = torch.cat((question_embed, img_embed, scene_graph_embed), dim=1)

    # outputs = t5.forward(inputs_embeds=input_embeds_arr, labels=labels)
    # output_sequences = t5_eval.generate(inputs_embeds=input_embeds_arr,  attention_mask=attn_mask_arr, do_sample=False)
    
    # outputs = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
    # outputs = ['no']
    if i % 100 == 0:
        print("question: ", test_dataset.qid_to_question[question_ids[0]])
        print("answers: ", answers)
        print("outputs: ", outputs)

    for j, output in enumerate(outputs):
        curr_answers = answers[j]

        # evaluation metric for VQA: https://visualqa.org/evaluation.html
        iter_accuracies.append(min(curr_answers.count(str(output)) / 3.0, 1.0))

print("Accuracy: ", sum(iter_accuracies)/len(iter_accuracies))

In [None]:
sum(iter_accuracies)/len(iter_accuracies)