# Prepping for real run. 

Todo: 
* Define `max_source_length` and `max_target_length` for the model (otherwise truncated).
padding token should be replaced with -100, which is the 'ignore_index' of `CrossEntorpyLoss` in PT and TF. For Flax, use `decoder_attention_mask`. 
Attention_mask. ensures madding tokens of inputs are ignored. 

* Install apex. "model will automatically use apex.normalization.FusedRMSNorm instead of T5LayerNorm." The former uses an optimized fused kernel which is several times faster than the latter.

A note on model sizes: 
T5-11B (original, not v1.1) weights in float32 are 45.2GB. 
See this post for using huggingface endpoints on SINGLE GPU for cheap inference: https://www.philschmid.de/deploy-t5-11b
Uses mixed precision and sharding, and LLM.int8(). 

In [5]:
import torch
import os
import numpy as np
import tensorflow as tf
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Model, T5Config, AutoModelWithLMHead
import accelerate
import wandb
import lovely_tensors as lt
import math
from PIL import Image
lt.monkey_patch()
# !wandb login  -- reactivate later
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
'''
MODEL SELECTION

T5 V1.1 --  https://huggingface.co/docs/transformers/model_doc/t5v1.1 && https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511
small - base - large - 3b/xl - 11b/xxl

OG: t5-small

'google/t5-base-lm-adapt' # largest on my server (without float16)
'google/t5-xl-lm-adapt'

google/t5-v1_1-large
'''

# MODEL_SIZE = "t5-base"
MODEL_NAME = "google/t5-small-lm-adapt"
# config = T5Config.from_pretrained(MODEL_NAME)
t5 = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, return_special_tokens_mask=True)
# low_cpu_mem_usage(bool, optional) — Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. experimental.

# Moc Dataset

In [27]:
import json

class VQA(torch.utils.data.Dataset):
    def __init__(self, annotations, img_path, mode="train"):
        self.annotations = None 
        self.questions = None
        self.img_path = img_path
        self.qid_to_question = {}
        self.mode = mode

        print(f"Constructing {self.mode} dataset...")
        with open(annotations[f"{self.mode}_annotations"]) as f:
            self.annotations = json.load(f)['annotations']
            print("Stored annotations...")

        with open(annotations[f"{self.mode}_questions"]) as f:
            self.questions = json.load(f)["questions"]
            print("Stored questions...")
            for question in self.questions:
                self.qid_to_question[question["question_id"]] = question["question"]
            print("Created question ID to question mapping...")

        self.tokenizer = T5Tokenizer.from_pretrained("google/t5-small-lm-adapt", return_special_tokens_mask=True)

    def __getitem__(self, idx):
        curr_annotation = self.annotations[idx]

        question_id = curr_annotation["question_id"]
        image_id = curr_annotation["image_id"]
        question_type = curr_annotation["question_type"]
        answer_type = curr_annotation["answers"]
        multiple_choice_answer = curr_annotation["multiple_choice_answer"]
        answers = curr_annotation["answers"]

        img = None
        scene_graph = None 

        question = self.qid_to_question.get(question_id, None)
        if question == None:
            print("question doesn't exist")
            return question, img, answers

        padded_image_id = "".join((12 - len(str(image_id)))*["0"]) + str(image_id)
        print("image id: ", padded_image_id)
        pseudo_mode = "train" if self.mode == "train" else "val"

        if not os.path.exists(os.path.join(self.img_path, f"COCO_{pseudo_mode}2014_{padded_image_id}.jpg")):
            print("image doesn't exist")
            return question, img, answers
            
        img = Image.open(os.path.join(self.img_path, f"COCO_{pseudo_mode}2014_{padded_image_id}.jpg"))
        img = np.asarray(img)

        # TODO: Add in scene graph extraction code, random for now
        one_input_shape = [1, 512]
        scene_graph = np.random.rand( *one_input_shape ).astype(np.float16) 
        scene_graph = torch.from_numpy(scene_graph)

        # TODO: Add in clip preprocessing code, random for now
        one_input_shape = [1, 512]
        clip_img = np.random.rand( *one_input_shape ).astype(np.float16) 
        clip_img = torch.from_numpy(clip_img)

        clip_text = np.random.rand( *one_input_shape ).astype(np.float16) 
        clip_text = torch.from_numpy(clip_text)


        # print("real answers: ", answers)
        
        return clip_text, clip_img, answers[0]["answer"], scene_graph

    def __len__(self):
        return len(self.annotations)


# we use the original val set as our test set since we have the annotations available (test annotations not available)
# this will be useful for error analysis later on

annotations = {
    "train_questions": "/home/kastan/thesis/video-pretrained-transformer/vqa/v2_OpenEnded_mscoco_train2014_questions.json",
    "test_questions": "/home/kastan/thesis/video-pretrained-transformer/vqa/v2_OpenEnded_mscoco_val2014_questions.json",
    "train_annotations": "/home/kastan/thesis/video-pretrained-transformer/vqa/v2_mscoco_train2014_annotations.json",
    "test_annotations": "/home/kastan/thesis/video-pretrained-transformer/vqa/v2_mscoco_val2014_annotations.json"
}

train_dataset = VQA(annotations, "/home/kastan/thesis/video-pretrained-transformer/vqa/train2014", mode="train")
test_dataset = VQA(annotations, "/home/kastan/thesis/video-pretrained-transformer/vqa/val2014", mode="test")

Constructing train dataset...
Stored annotations...
Stored questions...
Created question ID to question mapping...
Constructing test dataset...
Stored annotations...
Stored questions...
Created question ID to question mapping...


In [30]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

NUM_EPOCHS = 100
MODEL_NAME = "google/t5-small-lm-adapt"

# config = T5Config.from_pretrained(MODEL_NAME)
t5 = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(device)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, return_special_tokens_mask=True)

t5.train()

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1)

optimizer = torch.optim.Adam(params=t5.parameters(), lr=1e-4)

for epoch in range(100):
    for batch in train_loader:
        optimizer.zero_grad()

        question_embed, img_embed, answers, scene_graph_embed = batch
        # input_ids = batch['input_ids'].to(device)
        # attention_mask = batch['attention_mask'].to(device)
        # labels = batch['labels'].to(device)

        # print("question shape: ", question.shape)
        # print("img shape: ", img.shape)
        # print("scene graph shape; ", scene_graph.shape)
        # print("answers shape: ", answers.shape)

        question_embed = question_embed.to(device)
        img_embed = img_embed.to(device)
        scene_graph_embed = scene_graph_embed.to(device)
        

        # TODO: Clean up answer pickng and tokenization
        labels = tokenizer(list(answers), padding="longest", max_length=128, truncation=True, return_tensors="pt").input_ids
        labels[labels == tokenizer.pad_token_id] = -100

        labels = labels.to(device)

        input_embeds_arr = torch.cat((question_embed, img_embed, scene_graph_embed), dim=1)
        print("input embeds arr: ", input_embeds_arr.shape)
        # print("answers: ", answers)
        print("labels shape: ", labels.shape)

        # outputs = t5.forward(inputs_embeds=input_embeds_arr, attention_mask=attn_mask_arr, decoder_inputs_embeds=input_embeds_arr)
        outputs = t5.forward(inputs_embeds=input_embeds_arr, labels=labels)
        # outputs = t5.forward(inputs_embeds=input_embeds_arr, labels=labels)
        loss = outputs[0]
        loss.shape
        print(loss)

        loss.sum().backward()
        optimizer.step()
        print("✅ Successful training iteration")
        1/0

        

image id:  000000080818
image id:  000000461181
image id:  000000042297
image id:  000000571780
image id:  000000509482
image id:  000000558678
image id:  000000259687
image id:  000000260035
image id:  000000296809
image id:  000000094416
image id:  000000189975
image id:  000000470532
image id:  000000393661
image id:  000000306722
image id:  000000317756
image id:  000000182373
input embeds arr:  torch.Size([16, 3, 512])
labels shape:  torch.Size([16, 4])
tensor f16 grad NllLossBackward0 cuda:0 9.734
✅ Successful training iteration


ZeroDivisionError: division by zero

In [31]:
t5.eval()

for batch in test_loader:

    question_embed, img_embed, answers, scene_graph_embed = batch
    # input_ids = batch['input_ids'].to(device)
    # attention_mask = batch['attention_mask'].to(device)
    # labels = batch['labels'].to(device)

    # print("question shape: ", question.shape)
    # print("img shape: ", img.shape)
    # print("scene graph shape; ", scene_graph.shape)
    # print("answers shape: ", answers.shape)

    question_embed = question_embed.to(device)
    img_embed = img_embed.to(device)
    scene_graph_embed = scene_graph_embed.to(device)
    

    # TODO: Clean up answer pickng and tokenization
    labels = tokenizer(list(answers), padding="longest", max_length=128, truncation=True, return_tensors="pt").input_ids
    labels[labels == tokenizer.pad_token_id] = -100

    labels = labels.to(device)

    input_embeds_arr = torch.cat((question_embed, img_embed, scene_graph_embed), dim=1)
    print("input embeds arr: ", input_embeds_arr.shape)
    # print("answers: ", answers)
    print("labels shape: ", labels.shape)

    # outputs = t5.forward(inputs_embeds=input_embeds_arr, attention_mask=attn_mask_arr, decoder_inputs_embeds=input_embeds_arr)
    # outputs = t5.forward(inputs_embeds=input_embeds_arr, labels=labels)
    output_sequences = t5.generate(inputs_embeds=input_embeds_arr, do_sample=False)

    print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))
    # outputs = t5.forward(inputs_embeds=input_embeds_arr, labels=labels)
    # loss = outputs[0]
    # loss.shape
    # print(loss)

    # loss.sum().backward()
    # optimizer.step()
    # print("✅ Successful training iteration")
    1/0

    

image id:  000000262148
input embeds arr:  torch.Size([1, 3, 512])
labels shape:  torch.Size([1, 2])




['']


ZeroDivisionError: division by zero

In [7]:
''' PREP EMBEDDING INPUTS '''
# shape = (batch_size, 'words', embedding_dim) -- here 'words' == each of our embeddings, like clip and language.
# one_input_shape = [6, 512, 512]
one_input_shape = [1, 512, 512]
att_mask_shape = [1, 512]

decoder_input_embeds_arr = np.random.rand( *one_input_shape ).astype(np.float16) # need fp32
decoder_input_embeds_arr = torch.from_numpy(decoder_input_embeds_arr).to(device)
input_embeds_arr = np.random.rand( *one_input_shape ).astype(np.float16)
input_embeds_arr = torch.from_numpy(input_embeds_arr).to(device)
attn_mask_arr = np.ones( att_mask_shape )
attn_mask_arr = torch.from_numpy(attn_mask_arr).to(device)

print(decoder_input_embeds_arr)
print(input_embeds_arr)
print(attn_mask_arr)

''' Decoder gets the tokenized caption. Shape is (batch_size, max_caption_length). Use padding to make it fit. '''
# WORKING example, but easier with numpy.
# import torch.nn.functional as F
# decoder_input_ids = tokenizer("This is the target output sentence, aka the video caption. I like tacos because they are so delicious.", return_tensors="pt").input_ids.to(device)
# decoder_input_ids = F.pad(decoder_input_ids, (0, (512-decoder_input_ids.shape[1])), value=tokenizer.pad_token_id)
# print(decoder_input_ids.shape)
# decoder_input_ids

labels = tokenizer("The cute dog did the things", return_tensors="pt").input_ids.to(device)
# labels = torch.from_numpy(np.random.randint(1, 10_000, size=(one_input_shape[0], one_input_shape[2]))).to(device)
print(labels)
# labels = torch.cat((labels, torch.ones((1, 512-7), dtype=int).to(device)), dim=1)
# print(labels)

# labels = torch.from_numpy(np.random.randint(1, 10_000, size=(one_input_shape[0], one_input_shape[2]))).to(device)

tensor[1, 512, 512] f16 n=262144 x∈[5.901e-06, 1.000] μ=0.499 σ=inf cuda:0
tensor[1, 512, 512] f16 n=262144 x∈[1.252e-06, 1.000] μ=0.500 σ=inf cuda:0
tensor[1, 512, 512] f64 n=262144 x∈[1.000, 1.000] μ=1.000 σ=0. cuda:0
tensor[1, 7] i64 x∈[1, 5295] μ=1.130e+03 σ=1.940e+03 cuda:0 [[37, 5295, 1782, 410, 8, 378, 1]]


## Train function

T5 forward() docs: https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForConditionalGeneration.forward

Todo: investigate difference between decoder `decoder_input_ids` and `lm_labels`.
For example: 
```
outputs = t5(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, lm_labels=lm_labels)
```

I think `loss.sum()` is for multi-iteration loss. I was inadverdently using it 6 batches.
https://discuss.pytorch.org/t/loss-backward-raises-error-grad-can-be-implicitly-created-only-for-scalar-outputs/12152 
loss.backward() # T5 RuntimeError: grad can be implicitly created only for scalar outputs


In [8]:
t5.train()


# outputs = t5.forward(inputs_embeds=input_embeds_arr, attention_mask=attn_mask_arr, decoder_inputs_embeds=input_embeds_arr)
outputs = t5.forward(inputs_embeds=input_embeds_arr, attention_mask=attn_mask_arr, labels=labels)
# outputs = t5.forward(inputs_embeds=input_embeds_arr, labels=labels)
loss = outputs[0]
loss.shape
print(loss)

----------------------------------------------------------------------------------------------------
In transformer stack layer: 
input embeds:  torch.Size([1, 512, 512])
batch size, seq length:  1 512
mask seq length:  512
past key values:  [None, None, None, None, None, None, None, None]
extended attention mask:  tensor[1, 1, 512, 512] f16 [38;2;127;127;127mall_zeros[0m cuda:0
head_mask:  [None, None, None, None, None, None, None, None]
cross_attn_head_mask:  [None, None, None, None, None, None, None, None]
present key value states:  None
all hidden states:  None
all attentions:  None
all cross attentions:  None
hidden states:  tensor[1, 512, 512] f16 n=262144 x∈[0., 1.111] μ=0.500 σ=inf cuda:0
layer moduel:  T5Block(
  (layer): ModuleList(
    (0): T5LayerSelfAttention(
      (SelfAttention): T5Attention(
        (q): Linear(in_features=512, out_features=384, bias=False)
        (k): Linear(in_features=512, out_features=384, bias=False)
        (v): Linear(in_features=512, out_fea

RuntimeError: The size of tensor a (7) must match the size of tensor b (512) at non-singleton dimension 2

In [15]:
''' backwards pass '''
optimizer = torch.optim.Adam(params =  t5.parameters(), lr=1e-4)
optimizer.zero_grad()
loss.sum().backward()
optimizer.step()
print("✅ Successful training iteration")

✅ Successful training iteration


In [None]:
# Creating the training function. This will be called in the main function. It is run depending on the epoch value.
# The model is put into train mode and then we enumerate over the training loader and passed to the defined network 

def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, lm_labels=lm_labels)
        loss = outputs[0]
        
        if _%10 == 0:
            wandb.log({"Training Loss": loss.item()})

        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # xm.optimizer_step(optimizer)  # FOR TPU
        # xm.mark_step()                # FOR TPU

In [None]:
def main():
    # WandB – Initialize a new run
    wandb.init(project="transformers_tutorials_summarization")

    # WandB – Config is a variable that holds and saves hyperparameters and inputs
    # Defining some key variables that will be used later on in the training  
    config = wandb.config          # Initialize config
    config.TRAIN_BATCH_SIZE = 2    # input batch size for training (default: 64)
    config.VALID_BATCH_SIZE = 2    # input batch size for testing (default: 1000)
    config.TRAIN_EPOCHS = 2        # number of epochs to train (default: 10)
    config.VAL_EPOCHS = 1 
    config.LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
    config.SEED = 42               # random seed (default: 42)
    config.MAX_LEN = 512
    config.SUMMARY_LEN = 150 

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(config.SEED) # pytorch random seed
    np.random.seed(config.SEED) # numpy random seed
    torch.backends.cudnn.deterministic = True

    # tokenzier for encoding the text
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    

    # Importing and Pre-Processing the domain data
    # Selecting the needed columns only. 
    # Adding the summarzie text in front of the text. This is to format the dataset similar to how T5 model was trained for summarization task. 
    df = pd.read_csv('./data/news_summary.csv',encoding='latin-1')
    df = df[['text','ctext']]
    df.ctext = 'summarize: ' + df.ctext
    print(df.head())

    
    # Creation of Dataset and Dataloader
    # Defining the train size. So 80% of the data will be used for training and the rest will be used for validation. 
    train_size = 0.8
    train_dataset=df.sample(frac=train_size,random_state = config.SEED)
    val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(val_dataset.shape))


    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
    val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

    # Defining the parameters for creation of dataloaders
    train_params = {
        'batch_size': config.TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
        }

    val_params = {
        'batch_size': config.VALID_BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0
        }

    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)


    
    # Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
    # Further this model is sent to device (GPU/TPU) for using the hardware.
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    model = model.to(device)

    # Defining the optimizer that will be used to tune the weights of the network in the training session. 
    optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

    # Log metrics with wandb
    wandb.watch(model, log="all")
    # Training loop
    print('Initiating Fine-Tuning for the model on our dataset')

    for epoch in range(config.TRAIN_EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, optimizer)


    # Validation loop and saving the resulting file with predictions and acutals in a dataframe.
    # Saving the dataframe as predictions.csv
    print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
    for epoch in range(config.VAL_EPOCHS):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
        final_df.to_csv('./models/predictions.csv')
        print('Output Files generated for review')

if __name__ == '__main__':
    main()