In [1]:
%%capture
!pip install transformers 

In [1]:
import numpy as np
import os
from tqdm import tqdm
import copy

import torch
from torch.utils.data import DataLoader, Dataset

import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration

# model optim
from torch.optim import AdamW, SGD

# lr schedulers
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

In [2]:
class LangData(Dataset):
    def __init__(self, base_dir, lang, split):
        
        self.split = split
        
        x_file = os.path.join(base_dir, lang+"." + self.split + ".src")
        self.x = self.__read__(x_file)
        
        if self.split != 'test':
            y_file = os.path.join(base_dir, lang+"." + self.split + ".tgt")
            self.y = self.__read__(y_file)
    
    def __read__(self, file_dir):
        
        data = []
        with open(file_dir, 'r', encoding='utf-8') as f:
            for line in f:
                data.append(line.strip())
        return data        
    

    def __len__(self):
        return len(self.x)
    
    
    def __getitem__(self, idx):
        if self.split == "test":
            return self.x[idx]
        return self.x[idx], self.y[idx]

In [3]:
model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')
tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')

  return self.fget.__get__(instance, owner)()


In [4]:
def __save_model(model_dir, model, model_type='latest'):

    if model_type == "latest":
        saved_name = 'latest_model.pt'
    else:
        saved_name = 'best_model.pt'

    root_model_path = os.path.join(model_dir, saved_name)
    state_dict = {'weights': model.state_dict(), 
                  'optimizer': model.optimizer.state_dict(), 
                  'scheduler': model.scheduler.state_dict()}
    torch.save(state_dict, root_model_path)
        
        

# Loads the experiment data if exists to resume training from last saved checkpoint.
def __load_experiment(model_dir, model, model_type='latest'):
    
    if model_type == "latest":
        saved_name = 'latest_model.pt'
    else:
        saved_name = 'best_model.pt'

    if os.path.exists(os.path.join(model_dir, 'train.log')):
        # get current epoch
        current_epoch = 0
        with open(os.path.join(model_dir, 'train.log')) as f:
            for line in f:
                current_epoch += 1
        # get the latest model
        state_dict = torch.load(os.path.join(model_dir, saved_name), map_location=device.type)
        model.load_state_dict(state_dict['weights'])
        model.optimizer.load_state_dict(state_dict['optimizer'])
        model.scheduler.load_state_dict(state_dict['scheduler'])
    else:
        current_epoch = 0

    return model, current_epoch



def log(output_dir, log_str, file_name=None):
    if file_name is None:
        file_name = "all.log"
    output_file = os.path.join(output_dir, file_name)
    with open(output_file, 'a') as f:
        f.write(log_str + '\n')


def get_optimizer(model, opt_name, lr, eps): 
    if opt_name == 'Adam':
        return AdamW(model.parameters(), lr=lr, eps=eps)
    elif opt_name == 'SGD':
        return SGD(model.parameters(), lr=lr, eps=eps)
    
    
def get_scheduler(model, scheduler, n_batches, n_epochs, warmup_portion=0.1):
    train_steps = n_epochs*n_batches
    warm_step = int(train_steps*warmup_portion)
    if scheduler == "linear": 
        return get_linear_schedule_with_warmup(model.optimizer, num_warmup_steps=warm_step,num_training_steps=train_steps)
    elif scheduler == "cosine":
        return get_cosine_schedule_with_warmup(model.optimizer, num_warmup_steps=warm_step,num_training_steps=train_steps)


In [5]:

# training loop
def train(model, dataloader_train, n_epochs, model_dir, log_file):

    model.train() # put to train mode
    
    # load current model if exist
    model, current_epoch = __load_experiment(model_dir, model)
    
    # model has 3 special tokens so shift utf encoding
    
    all_losses = []
    
    for e in range(current_epoch, n_epochs):

        losses = 0
        for step, data in tqdm(enumerate(dataloader_train), total=len(dataloader_train)):
            
            model_inputs = tokenizer(data[0], padding="longest", return_tensors="pt")
            labels = tokenizer(data[1], padding="longest", return_tensors="pt").input_ids

            outputs = model(**model_inputs.to(device), labels=labels.to(device))

            loss = outputs[0]

            model.optimizer.zero_grad() # clear loss
            loss.backward()
            model.optimizer.step()  # backprop to update the weights

            if model.scheduler is not None:
                model.scheduler.step()  # update learning rate schedule 

            # log losses
            loss /= len(dataloader_train) # already average across batch in nn.CrossEntropy
            losses += loss.item()
            
        # output stats
        print(f"Epoch {e}; loss {losses}")
        log(model_dir, "Epoch " + str(e+1) + "; loss " + str(losses), log_file)
        all_losses.append(losses)
        # save model
        __save_model(model_dir, model) # save latest
        if (e > current_epoch and losses < all_losses[-1]):
            __save_model(model_dir, model, model_type='best') # save best model        

In [6]:
def validation(model, dataloader_test, model_dir, log_file):
    
    model, e = __load_experiment(model_dir, model, model_type='latest')
    
    model.eval()
    
    losses = 0
    for step, data in tqdm(enumerate(dataloader_test), total=len(dataloader_test)):

        model_inputs = tokenizer(data[0], padding="longest", return_tensors="pt")
        labels = tokenizer(data[1], padding="longest", return_tensors="pt").input_ids

        outputs = model(**model_inputs.to(device), labels=labels.to(device))

        loss = outputs[0]

        # log losses
        loss /= len(dataloader_test)
        losses += loss.item()
        
    # output stats
    print(f"Validation loss {losses}")
    log(model_dir, "Validation loss " + str(losses), log_file)

In [7]:
# specify device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lang_set = 'shp'

batch_size = 4

checkpoint = 'byt5_small'
n_epochs = 10
lr = 1e-4
weight_decay = 1e-5
optim = "Adam"
scheduler = "cosine"
log_file = 'train.log'

In [8]:
# model state_dict
model_dir = f"{lang_set}_{checkpoint}_e{n_epochs}_lr{lr}_eps{weight_decay}_{optim}_{scheduler}_batch{batch_size}"
if not os.path.isdir(model_dir):
    os.mkdir(model_dir)

In [9]:
data = LangData('miniproj1-dataset', lang_set, 'train')
dataloader_train = DataLoader(data, batch_size=batch_size)

In [10]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(384, 1472)
  (encoder): T5Stack(
    (embed_tokens): Embedding(384, 1472)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1472, out_features=384, bias=False)
              (k): Linear(in_features=1472, out_features=384, bias=False)
              (v): Linear(in_features=1472, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=1472, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1472, out_features=3584, bias=False)
              (wi_1): Linear(in_features=1472, out_features=3584, bias=False)
              (w

In [11]:
# model optimizer
model.optimizer = get_optimizer(model, optim, lr, weight_decay)

# learning rate scheduler
model.scheduler = get_scheduler(model, scheduler, len(dataloader_train), n_epochs)

In [12]:
type(model.scheduler)

torch.optim.lr_scheduler.LambdaLR

In [13]:
train(model, dataloader_train, n_epochs, model_dir, log_file)

100%|█████████████████████████████████████████████████████████████████████████████████| 219/219 [00:23<00:00,  9.42it/s]


Epoch 0; loss 20.42374260723591


100%|█████████████████████████████████████████████████████████████████████████████████| 219/219 [00:22<00:00,  9.63it/s]


Epoch 1; loss 4.660034565720707


100%|█████████████████████████████████████████████████████████████████████████████████| 219/219 [00:22<00:00,  9.67it/s]


Epoch 2; loss 1.364316575229168


100%|█████████████████████████████████████████████████████████████████████████████████| 219/219 [00:22<00:00,  9.70it/s]


Epoch 3; loss 0.8130061974516138


100%|█████████████████████████████████████████████████████████████████████████████████| 219/219 [00:22<00:00,  9.72it/s]


Epoch 4; loss 0.6384269639384001


100%|█████████████████████████████████████████████████████████████████████████████████| 219/219 [00:22<00:00,  9.61it/s]


Epoch 5; loss 0.5502342128311284


100%|█████████████████████████████████████████████████████████████████████████████████| 219/219 [00:22<00:00,  9.72it/s]


Epoch 6; loss 0.5051389230065979


100%|█████████████████████████████████████████████████████████████████████████████████| 219/219 [00:22<00:00,  9.69it/s]


Epoch 7; loss 0.4808352754916996


100%|█████████████████████████████████████████████████████████████████████████████████| 219/219 [00:22<00:00,  9.71it/s]


Epoch 8; loss 0.46712471288628876


100%|█████████████████████████████████████████████████████████████████████████████████| 219/219 [00:22<00:00,  9.70it/s]


Epoch 9; loss 0.4561161154997535


In [14]:
data_val = LangData('miniproj1-dataset', lang_set, 'dev')
dataloader_test = DataLoader(data_val, batch_size=batch_size)

In [15]:
validation(model, dataloader_test, model_dir, log_file)

100%|███████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 46.00it/s]

Validation loss 0.24250429961830378





In [25]:
# -----------load trained model -----------------
    
# load latest model with weights computed during training
model, current_epoch = __load_experiment(model_dir, model, model_type='latest')

In [26]:
data = LangData('miniproj1-dataset', lang_set, 'dev')
dataloader_test = DataLoader(data, batch_size=batch_size)

In [28]:
# evaluate the model
model.eval()

# validation outputs
with open(f'pred_{lang_set}.dev.tgt', 'w', encoding='utf-8') as f:
    for i in range(len(data_val)):
        inputs = tokenizer(data[i], return_tensors='pt', padding="longest") # validation set need padding 
        input_ids = inputs['input_ids'].to(device)

        # generate questions with context 
        outputs = model.generate(input_ids, max_new_tokens=20)

        # decode output (note that question and answer to that question is generated)
        question_answer = tokenizer.decode(outputs[0], skip_special_tokens=False)
        question_answer = question_answer.replace(tokenizer.pad_token, "").replace(tokenizer.eos_token, "")
        output = question_answer.split(tokenizer.sep_token)
#         print(" ".join(output))
        f.write(" ".join(output))
        f.write('\n')

In [None]:
data = LangData('miniproj1-dataset', lang_set, 'test')
dataloader_test = DataLoader(data, batch_size=batch_size)

In [22]:
with open(f'pred_{lang_set}.test.tgt', 'w', encoding='utf-8') as f:
    for i in range(len(data)):
        inputs = tokenizer(data[i], return_tensors='pt')
        input_ids = inputs['input_ids'].to(device)

        # generate questions with context 
        outputs = model.generate(input_ids, max_new_tokens=20)

        # decode output (note that question and answer to that question is generated)
        question_answer = tokenizer.decode(outputs[0], skip_special_tokens=False)
        question_answer = question_answer.replace(tokenizer.pad_token, "").replace(tokenizer.eos_token, "")
        output = question_answer.split(tokenizer.sep_token)
#         print(" ".join(output))
        f.write(" ".join(output))
        f.write('\n')