## UM T5

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from torch import optim, nn
from torchvision import models, transforms
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import pandas as pd
from PIL import Image

In [None]:
from transformers import AdamW, Adafactor
import os, sys
sys.path.append('/workout/early-stopping-pytorch')
from pytorchtools import EarlyStopping
from tqdm import tqdm,trange
import time

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
# Drop any cached GPU allocations left over from earlier cells, then pick
# the compute device used by the rest of the notebook.
torch.cuda.empty_cache()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the pretrained "t5-base" tokenizer and conditional-generation model.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
# return_dict=True makes forward passes return an output object; the training
# loop later reads the loss from it via `.loss`.
model = T5ForConditionalGeneration.from_pretrained("t5-base",
                                             return_dict=True)
# Move the model to the device selected above (CUDA when available, else CPU).
model.to(device)

In [None]:
class MemeMQACorpus(torch.utils.data.Dataset):
    """Serve text-only question-answering samples for memes from a JSON file.

    Each item is a dictionary of plain strings: the meme image file name, the
    encoder prompt (question, options and OCR context) and the decoder target
    (answer entity plus explanation). No image is opened by this class; the
    resize transform is only stored on the instance.
    """

    # Columns that must be non-null for a row to be kept.
    _REQUIRED_COLUMNS = ("meme_image", "ocr", "entity", "explanation")

    def __init__(
        self,
        data_path,
        img_dir,
        mode=None,
        balance=False,
        dev_limit=None,
        random_state=0,
    ):
        """Load and filter the samples.

        Args:
            data_path: Path passed to ``pandas.read_json``.
            img_dir: Directory prefix used to build the ``image`` path column.
            mode: When ``"test"``, rows without a second reference
                explanation (``explanation1``) are dropped as well.
            balance: Accepted for interface compatibility; not used.
            dev_limit: Accepted for interface compatibility; not used.
            random_state: Accepted for interface compatibility; not used.
        """
        frame = pd.read_json(data_path)

        for column in self._REQUIRED_COLUMNS:
            frame = frame[frame[column].notnull()]
        if mode == "test":
            frame = frame[frame["explanation1"].notnull()]

        frame = frame.reset_index(drop=True)
        # Item assignment is required to create a column; attribute
        # assignment (``frame.image = ...``) only sets a Python attribute on
        # the DataFrame object and the column would never exist.
        frame["image"] = img_dir + "/" + frame["meme_image"].astype(str)

        self.samples_frame = frame
        self.image_transform = Resize((256, 256))

    def __len__(self):
        """Return the number of samples left after filtering."""
        return len(self.samples_frame)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dictionary of strings.

        Keys are ``img_name``, ``text_inputs`` and ``decoder_text``;
        ``decoder_text1`` is added only when a usable second explanation
        (``explanation1``) exists for the row.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        frame = self.samples_frame
        entity = frame.loc[idx, "entity"]

        text_inputs = (
            frame.loc[idx, "question"]
            + "\n Options: " + frame.loc[idx, "optC"]
            + "\nContext: " + frame.loc[idx, "ocr"]
        )
        # The end-of-sequence marker is appended explicitly to the target.
        decoder_text = (
            "Answer: " + entity
            + " BECAUSE " + frame.loc[idx, "explanation"] + '</s>'
        )
        sample = {
            "img_name": frame.loc[idx, "meme_image"],
            "text_inputs": text_inputs,
            "decoder_text": decoder_text,
        }

        try:
            sample["decoder_text1"] = (
                "Answer: " + entity
                + " BECAUSE " + frame.loc[idx, "explanation1"]
            )
        except (KeyError, TypeError):
            # KeyError: the file has no "explanation1" column.
            # TypeError: the value is missing (NaN/None) for this row.
            # Anything else is a real error and must not be swallowed.
            pass
        return sample

In [None]:
# Mini-batch size used by the DataLoaders built in this notebook.
BS = 4
# NOTE: the three paths below are placeholder values ("ANONYMISED") in this
# copy of the notebook and must point at real data before the cell can run.
train_path = "ANONYMISED"
dev_path = "ANONYMISED"
data_dir = "ANONYMISED"
# Training split: reshuffled by the DataLoader, loaded in the main process
# (num_workers=0).
hm_dataset_train = MemeMQACorpus(train_path, data_dir)
dataloader_train = DataLoader(hm_dataset_train, batch_size=BS,
                        shuffle=True, num_workers=0)
# Validation split. NOTE(review): shuffle=True is also set here, so the
# validation batches are not visited in a fixed order.
hm_dataset_val = MemeMQACorpus(dev_path, data_dir)
dataloader_val = DataLoader(hm_dataset_val, batch_size=BS,
                        shuffle=True, num_workers=0)

In [None]:
# Sanity check: display the first training sample (a dict of strings).
hm_dataset_train[0]

In [None]:
class AverageMeter(object):
    """Track the latest value and the running weighted average of a metric.

    Attributes (all reset to 0 by ``reset``):
        val: The most recent value passed to ``update``.
        sum: Weighted sum of all values seen so far.
        count: Total weight (sum of ``n``) seen so far.
        avg: ``sum / count``, or 0 while ``count`` is 0.
    """

    def __init__(self, name, fmt=':f'):
        """Create a meter labelled ``name`` whose values print with ``fmt``.

        ``fmt`` is a format-spec suffix including the leading colon,
        e.g. ``':6.3f'``.
        """
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. update(x, n=0) on a fresh meter) used to
        # raise ZeroDivisionError; report an average of 0 instead.
        self.avg = self.sum / self.count if self.count else 0

    def __str__(self):
        """Render as ``"<name> <val> (<avg>)"`` using the configured format."""
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


# Module-level meter labelled 'Data', printed with three decimals.
data_time = AverageMeter('Data', ':6.3f')

In [None]:
from prettytable import PrettyTable

def count_parameters(model):
    """Print a per-parameter size table for ``model`` and return the total.

    Only parameters with ``requires_grad`` set are listed and counted.

    Returns:
        The total number of trainable parameter elements.
    """
    trainable_sizes = [
        (param_name, tensor.numel())
        for param_name, tensor in model.named_parameters()
        if tensor.requires_grad
    ]

    table = PrettyTable(["Modules", "Parameters"])
    for param_name, size in trainable_sizes:
        table.add_row([param_name, size])

    total_params = sum(size for _, size in trainable_sizes)
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
    
# Print the trainable-parameter table for the loaded model; as the last
# expression of the cell, the returned total is displayed as well.
count_parameters(model)

In [None]:
from pathlib import Path
def _encode_batch(data, max_length=400):
    """Tokenize one collated batch into model-ready tensors on ``device``.

    Returns:
        ``(input_ids, attention_mask, labels)``. Padded label positions are
        set to -100 so the model's loss ignores them.
    """
    source = tokenizer.batch_encode_plus(
        list(data['text_inputs']),
        padding=True,
        truncation=True,  # without this, max_length is not enforced
        max_length=max_length,
        return_tensors='pt',
    )
    target = tokenizer.batch_encode_plus(
        list(data['decoder_text']),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    labels = target.input_ids.clone()
    labels[labels == tokenizer.pad_token_id] = -100
    return (
        source.input_ids.to(device),
        source.attention_mask.to(device),
        labels.to(device),
    )


def train_model(model, n_epochs):
    """Fine-tune ``model`` for ``n_epochs`` epochs and validate after each one.

    Reads these module-level names: ``tokenizer``, ``optimizer``,
    ``dataloader_train``, ``dataloader_val``, ``device``, ``exp_path`` and
    ``exp_name``. After every epoch the weights are saved to
    ``<exp_path>/epoch<i>final.pt`` and one summary line is appended to
    ``<exp_path>/<exp_name>_base_exp_results.txt``.

    Args:
        model: Sequence-to-sequence model whose forward pass returns ``.loss``.
        n_epochs: Number of passes over the training data.

    Returns:
        ``(model, train_loss_list, val_loss_list, i)`` where the lists hold
        the per-epoch loss summed over batches (as Python floats) and ``i``
        is the index of the last epoch run (-1 when ``n_epochs`` is 0).
    """
    train_loss_list = []
    val_loss_list = []
    Path(exp_path).mkdir(parents=True, exist_ok=True)
    results_file = os.path.join(exp_path, exp_name + '_base_exp_results.txt')

    # Keeps the returned epoch index defined even when no epoch runs.
    i = -1

    model.train()
    for i in range(n_epochs):
        print(f"******************************EPOCH - {i}****************************************")
        train_loss = 0.0
        val_loss = 0.0

        for data in tqdm(dataloader_train, total=len(dataloader_train), desc="Mini-batch progress"):
            input_ids, attention_mask, labels = _encode_batch(data)
            optimizer.zero_grad()
            # The attention mask stops the encoder attending to padding.
            loss = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            ).loss
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        with torch.no_grad():
            for data in dataloader_val:
                input_ids, attention_mask, labels = _encode_batch(data)
                batch_loss = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                ).loss
                # .item() keeps the running total a float instead of a
                # device tensor.
                val_loss += batch_loss.item()

        print("Saving model...")
        torch.save(model.state_dict(), os.path.join(exp_path, "epoch" + str(i) + "final.pt"))
        train_loss_list.append(train_loss)
        val_loss_list.append(val_loss)

        summary = f'Epoch {i+1}: train_loss: {train_loss:.4f} | val_loss: {val_loss:.4f}'
        print(summary)
        with open(results_file, 'a+') as of:
            # One line per epoch; the newline was previously missing.
            of.write(summary + '\n')

        # Back to training mode for the next epoch (and for the caller).
        model.train()
        torch.cuda.empty_cache()

    return model, train_loss_list, val_loss_list, i


In [None]:
# NOTE(review): `code_prof` is not referenced anywhere else in the visible
# notebook.
code_prof = False

# Experiment name and output directory; train_model writes checkpoints and
# the results text file under exp_path.
exp_name = "UM_TEXT_T5_Role"
exp_path = "testing/"+exp_name

# NOTE(review): `lr` is never passed to the optimizer below, which is built
# with a hard-coded lr=1e-3 - confirm which learning rate is intended.
lr=0.0001
# NOTE(review): `criterion` is not used by train_model, which reads the loss
# from the model output instead.
criterion = nn.CrossEntropyLoss()
# Adafactor with a fixed, externally supplied learning rate: relative_step,
# scale_parameter and warmup_init are all disabled.
optimizer = Adafactor(model.parameters(),lr=1e-3,
                      eps=(1e-30, 1e-3),
                      clip_threshold=1.0,
                      decay_rate=-0.8,
                      beta1=None,
                      weight_decay=0.0,
                      relative_step=False,
                      scale_parameter=False,
                      warmup_init=False)
n_epochs = 10

# Run training; `i` receives the last value train_model returns (its epoch
# loop index).
model, train_loss_list, val_loss_list, i = train_model(model, n_epochs)

In [None]:
def test_model(model, max_gen_length=128):
    """Generate answers for every batch of the test loader.

    Reads the module-level ``dataloader_test``, ``tokenizer`` and ``device``.

    Args:
        model: Model exposing ``generate``.
        max_gen_length: Upper bound on generated sequence length, passed to
            ``generate`` as ``max_length``. Without an explicit bound the
            library's short default length applies and cuts the
            explanations off.

    Returns:
        ``(generated, exp1, exp2, ques)``: four equally long lists holding
        the decoded predictions, the first reference (``decoder_text``), the
        second reference (``decoder_text1``) and the input prompts.

    Raises:
        KeyError: If a batch has no ``decoder_text1`` entry.
    """
    generated = []
    exp1 = []
    exp2 = []
    ques = []

    model.eval()
    with torch.no_grad():
        for data in dataloader_test:
            input_tokens = tokenizer.batch_encode_plus(
                data['text_inputs'],
                padding=True,
                truncation=True,  # without this, max_length is not enforced
                max_length=400,
                return_tensors='pt',
            ).to(device)
            # Batches are padded, so the attention mask is needed to keep
            # the padding from influencing generation.
            outputs = model.generate(
                input_ids=input_tokens.input_ids,
                attention_mask=input_tokens.attention_mask,
                max_length=max_gen_length,
            )
            generated.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
            exp1.extend(data["decoder_text"])
            exp2.extend(data["decoder_text1"])
            ques.extend(data["text_inputs"])

    torch.cuda.empty_cache()

    return generated, exp1, exp2, ques


In [None]:
# Test split. mode="test" additionally drops rows that lack the second
# reference explanation ("explanation1").
test_path = "../data/data_test_role.json"

# `HarmemeMemesDatasetAug` is not defined or imported in this notebook;
# MemeMQACorpus is the dataset class defined here that accepts `mode`.
hm_dataset_test = MemeMQACorpus(test_path, data_dir, mode="test")
# shuffle=False keeps predictions aligned with the order of the file.
dataloader_test = DataLoader(hm_dataset_test, batch_size=BS,
                             shuffle=False, num_workers=0)

In [None]:
# Run inference over the test loader: predictions, the two reference texts
# and the input prompts, as returned by test_model.
generated_result, ref1, ref2, ques = test_model(model)

In [None]:
# Sanity check: display how many predictions were generated.
len(generated_result)

In [None]:
# Gather predictions, both references and the prompts into one table and
# write it to "<exp_name>.csv". The mapping is not named `dict`, so the
# builtin stays usable in later cells.
results = {"hyp": generated_result, "ref1": ref1, "ref2": ref2, "ques": ques}
df1 = pd.DataFrame(results)
df1.to_csv(exp_name + ".csv")