In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
from torch.utils.data import Dataset
import pandas as pd
import torch.nn as nn
import torch

train_dataset_name = 'ArithOpsTrain.xlsx'
df = pd.read_excel(train_dataset_name)
df = df.drop('Table 1',axis=1)
df = df.rename(columns=df.iloc[0]).loc[1:]

train_df , valid_df = train_test_split(df,test_size=0.1,random_state=0)

In [4]:
train_df.head()

Unnamed: 0,Description,Question,Equation,Input Numbers,Output
373,mrs. hilt is baking bread . she needs number0 ...,how much flour will she need to make number2 l...,/ number0 number1,5 2 1,2.5
901,robin had number0 songs on her number1 player ...,how many songs does she have on her number4 pl...,+ - number0 number2 number3,30 3 8 10 3,32.0
254,there are number0 more sections that are undev...,what is the total area of the undeveloped land ?,* number0 number1,3 2435,7305.0
468,mom made number0 chocolate chip cookies . it t...,how many cookies were left ?,- number0 number3,32 24 16 9,23.0
333,number0 children are taking a bus to the zoo ....,how many seats will the children need in all ?,/ number0 number1,58 2,29.0


In [3]:
class T5Dataset(Dataset):
    def __init__(
        self,
        data  : pd.DataFrame,
        tokenizer : T5Tokenizer,
        text_max_token_length = 512,
        output_max_token_length = 128
    ):
        
        super().__init__()
        self.tokenizer = tokenizer
        self.data = data 
        self.text_max_token_length = text_max_token_length,
        self.output_max_token_length = output_max_token_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        
        data_row = self.data.iloc[index]

        input_text = data_row["Description"]
        input_question = data_row["Question"]

        in_text = input_text + " [SEP] " + input_question
        
        input_text_encoding = self.tokenizer(
            in_text,
            max_length=100,
            padding = "max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        

        output_text = data_row["Equation"]        
        output_text_encoding = self.tokenizer(
            output_text,
            max_length=100,
            padding = "max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        return dict(
            input_text = input_text,
            output_text = output_text,
            input_text_ids = input_text_encoding['input_ids'].flatten(),
            input_attention_mask = input_text_encoding['attention_mask'].flatten(),
            output_text_ids = output_text_encoding['input_ids'].flatten(),
            output_attention_mask = output_text_encoding['attention_mask'].flatten()
        )   

In [4]:
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
special_tokens_dict = {'additional_special_tokens' : ['[SEP]']}
num_added_tokens = t5_tokenizer.add_special_tokens(special_tokens_dict)



t5_model =T5ForConditionalGeneration.from_pretrained("t5-small")
dataset = T5Dataset(train_df,t5_tokenizer)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
t5_tokenizer("[SEP] The greatest integer function")

{'input_ids': [32100, 37, 4016, 30278, 1681, 1], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [6]:
output  = dataset.__getitem__(0)

In [7]:
from torch.utils.data import DataLoader

class T5DataSetModule(pl.LightningDataModule):

    def __init__(
        self,
        train_df : pd.DataFrame,
        valid_df : pd.DataFrame,
        tokenizer : T5Tokenizer,
        batch_size = 32,
        text_max_token_length = 512,
        output_max_token_length = 128
    ):
        super().__init__()
        self.train_df = train_df
        self.valid_df = valid_df
        self.tokenizer = tokenizer,
        self.batch_size = batch_size,
        self.text_max_token_length = text_max_token_length
        self.output_max_token_length = output_max_token_length

    
    def setup(self, stage = None):
        
        self.train_dataset = T5Dataset(self.train_df,self.tokenizer,self.text_max_token_length,self.output_max_token_length)
        self.valid_dataset = T5Dataset(self.valid_df,self.tokenizer,self.text_max_token_length,self.output_max_token_length)
    
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            self.batch_size,
            shuffle=True,
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.valid_dataset,
            self.batch_size,
            shuffle=False
        )
#data_module = T5DataSetModule(train_df,valid_df,t5_tokenizer,BATCH_SIZE)       

In [8]:
def postfix_evaluation(batch_data,input_values):

    arith_symbols = set(['+','-','*','/','%'])
    output_values = []
    
    for i in range(len(batch_data)):
        flag = True
        current_input = batch_data[i].split(' ')
        current_input.reverse()
        input_value = input_values[i]

        stack = []
        for symbol in current_input:
            if symbol in arith_symbols:
                if len(stack)<2:
                    flag = False
                    break
                in1 = stack.pop(-1)
                in2 = stack.pop(-1)

                res = 0
                if symbol=='+':
                    res = in1+in2
                elif symbol=='-':
                    res = in1 - in2 
                elif symbol == '*':
                    res = in1 * in2
                elif symbol=='/':
                    res = in1/in2
                else:
                    res = in1 % in2
                stack.append(res)


            else:
                if "number" in symbol:
                    index = int(symbol[6])
                    stack.append(input_value[index])

        if flag==False or len(stack)!=1:
            output_values.append(0)
        else:
            output_values.append(stack.pop(-1))
    
    return output_values

ans = postfix_evaluation(["+ - number0 number1 number2","+ / - number0 number2 number1 number3"],[[1,4,6],[5,6,7,8]])

In [22]:
import math

class PositionalEncoding(nn.Module):

    def __init__(self,dim_model,dropout_p,max_len) -> None:
        super().__init__()
        self.dropout =  nn.Dropout(dropout_p)

        pos_encoding = torch.zeros(max_len,dim_model)
        
        positions_list = torch.arange(0, max_len, dtype=torch.float).view(-1, 1) 
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0)) / dim_model) 
        
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)
        
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pos_encoding",pos_encoding)

        
    def forward(self, token_embedding: torch.tensor) -> torch.tensor:

        return self.dropout(token_embedding + self.pos_encoding[:token_embedding.size(0), :])

In [26]:
class TransformerModel(nn.Module):

    def __init__(
        self,
        num_tokens,
        dim_model,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        dim_feedforward,
        dropout_p
    ):
        super().__init__()

        self.positional_encoder = PositionalEncoding(
            dim_model=dim_model,
            dropout_p= dropout_p,
            max_len=5000
        )

        self.embedding = nn.Embedding(num_tokens,dim_model)
        self.dim_model = dim_model

        self.transformer = nn.Transformer(
            d_model=dim_model,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout= dropout_p,
            batch_first=True
        )

        self.out = nn.Linear(self.dim_model,num_tokens)

    
    def forward(self, src, trg):

        src = self.embedding(src) * math.sqrt(self.dim_model)
        #target = self.embedding(trg) * math.sqrt(self.dim_model)

        src = self.positional_encoder(src)
        #trg = self.positional_encoder(trg)
        
        print(src.shape)
    
        
    

In [27]:
transformermodel = TransformerModel(30,20,2,1,1,512,0.01)
r = torch.tensor([[1,5,6],[6,1,2]])
transformermodel(r,None)

torch.Size([2, 3, 20])


In [9]:
import torch.optim as optim
class T5ArithTranslator(pl.LightningModule):

    def __init__(self):
        super().__init__()
        self.t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")


    def forward(self, input_ids, input_attention_mask, decoder_attention_mask, labels):

        outs = self.t5_model(input_ids=input_ids,attention_mask = input_attention_mask,labels = labels)        
        return outs.loss ,  outs.logits

        
    def training_step(self, batch, batch_idx) :
        
        input_text_ids = batch["input_text_ids"]
        input_attention_mask = batch["input_attention_mask"]
        output_text_ids = batch["output_text_ids"]
        output_attention_mask = batch["output_attention_mask"]

        loss, outs = self(
            input_text_ids,
            input_attention_mask,
            output_attention_mask,
            output_text_ids
        )

        self.log("train_loss" , loss, prog_bar=True,logger=True)
        return loss 
    
    def validation_step(self, batch, batch_idx):
        
        input_text_ids = batch["input_text_ids"]
        input_attention_mask = batch["input_attention_mask"]
        output_text_ids = batch["output_text_ids"]
        output_attention_mask = batch["output_attention_mask"]

        loss, outs = self(
            input_text_ids,
            input_attention_mask,
            output_attention_mask,
            output_text_ids
        )

        self.log("valid_loss" , loss, prog_bar=True,logger=True)
        return loss 

    def configure_optimizers(self):
        return optim.Adam(self.parameters(),lr = 0.0001)


In [10]:
N_EPOCHS = 50
BATCH_SIZE = 32

model = T5ArithTranslator()

checkpoint_callback = ModelCheckpoint(
    dirpath = "checkpoints",
    filename="best-checkpoint",
    save_top_k = 1,
    verbose = True,
    monitor="valid_loss",
    mode = "min"
)

logger = TensorBoardLogger("lightning_logs",name="translator")

trainer = pl.Trainer(
    logger = logger,
    callbacks =  checkpoint_callback,
    max_epochs=N_EPOCHS,
    log_every_n_steps=5,
    gpus=1,
    accelerator='cpu'
)

  rank_zero_deprecation(
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


In [11]:
train_dataset = T5Dataset(train_df,t5_tokenizer)
valid_dataset = T5Dataset(valid_df,t5_tokenizer)

train_dataloader = DataLoader(train_dataset,32,True)
valid_dataloader = DataLoader(valid_dataset,32,shuffle=False)

In [12]:
trainer.fit(model,train_dataloader,valid_dataloader)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name     | Type                       | Params
--------------------------------------------------------
0 | t5_model | T5ForConditionalGeneration | 60.5 M
--------------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0: 100%|██████████| 32/32 [02:08<00:00,  4.02s/it, loss=1.98, v_num=1, train_loss=0.938, valid_loss=0.483]

Epoch 0, global step 28: 'valid_loss' reached 0.48270 (best 0.48270), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 1: 100%|██████████| 32/32 [02:09<00:00,  4.03s/it, loss=0.521, v_num=1, train_loss=0.402, valid_loss=0.373]

Epoch 1, global step 56: 'valid_loss' reached 0.37264 (best 0.37264), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 2: 100%|██████████| 32/32 [02:09<00:00,  4.06s/it, loss=0.371, v_num=1, train_loss=0.322, valid_loss=0.301]

Epoch 2, global step 84: 'valid_loss' reached 0.30099 (best 0.30099), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 3: 100%|██████████| 32/32 [02:10<00:00,  4.06s/it, loss=0.311, v_num=1, train_loss=0.248, valid_loss=0.245]

Epoch 3, global step 112: 'valid_loss' reached 0.24466 (best 0.24466), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 4: 100%|██████████| 32/32 [02:10<00:00,  4.08s/it, loss=0.265, v_num=1, train_loss=0.245, valid_loss=0.203]

Epoch 4, global step 140: 'valid_loss' reached 0.20330 (best 0.20330), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 5: 100%|██████████| 32/32 [02:10<00:00,  4.09s/it, loss=0.213, v_num=1, train_loss=0.186, valid_loss=0.156]

Epoch 5, global step 168: 'valid_loss' reached 0.15589 (best 0.15589), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 6: 100%|██████████| 32/32 [02:11<00:00,  4.11s/it, loss=0.162, v_num=1, train_loss=0.145, valid_loss=0.102]

Epoch 6, global step 196: 'valid_loss' reached 0.10170 (best 0.10170), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 7: 100%|██████████| 32/32 [02:11<00:00,  4.10s/it, loss=0.116, v_num=1, train_loss=0.114, valid_loss=0.0677]

Epoch 7, global step 224: 'valid_loss' reached 0.06766 (best 0.06766), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 8: 100%|██████████| 32/32 [02:12<00:00,  4.14s/it, loss=0.0933, v_num=1, train_loss=0.0722, valid_loss=0.0579]

Epoch 8, global step 252: 'valid_loss' reached 0.05789 (best 0.05789), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 9: 100%|██████████| 32/32 [02:11<00:00,  4.12s/it, loss=0.0729, v_num=1, train_loss=0.086, valid_loss=0.0509] 

Epoch 9, global step 280: 'valid_loss' reached 0.05091 (best 0.05091), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 10: 100%|██████████| 32/32 [02:11<00:00,  4.10s/it, loss=0.0645, v_num=1, train_loss=0.069, valid_loss=0.0437] 

Epoch 10, global step 308: 'valid_loss' reached 0.04367 (best 0.04367), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 11: 100%|██████████| 32/32 [02:11<00:00,  4.11s/it, loss=0.0575, v_num=1, train_loss=0.0447, valid_loss=0.0369]

Epoch 11, global step 336: 'valid_loss' reached 0.03694 (best 0.03694), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 12: 100%|██████████| 32/32 [02:11<00:00,  4.10s/it, loss=0.0492, v_num=1, train_loss=0.0479, valid_loss=0.0336]

Epoch 12, global step 364: 'valid_loss' reached 0.03358 (best 0.03358), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 13: 100%|██████████| 32/32 [02:07<00:00,  3.99s/it, loss=0.0461, v_num=1, train_loss=0.0419, valid_loss=0.0336]

Epoch 13, global step 392: 'valid_loss' was not in top 1


Epoch 14: 100%|██████████| 32/32 [02:11<00:00,  4.11s/it, loss=0.0423, v_num=1, train_loss=0.0432, valid_loss=0.0331]

Epoch 14, global step 420: 'valid_loss' reached 0.03313 (best 0.03313), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 15: 100%|██████████| 32/32 [02:12<00:00,  4.13s/it, loss=0.0406, v_num=1, train_loss=0.037, valid_loss=0.0321] 

Epoch 15, global step 448: 'valid_loss' reached 0.03214 (best 0.03214), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 16: 100%|██████████| 32/32 [02:09<00:00,  4.04s/it, loss=0.0409, v_num=1, train_loss=0.0514, valid_loss=0.0311]

Epoch 16, global step 476: 'valid_loss' reached 0.03107 (best 0.03107), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 17: 100%|██████████| 32/32 [03:02<00:00,  5.72s/it, loss=0.0388, v_num=1, train_loss=0.0369, valid_loss=0.0306]

Epoch 17, global step 504: 'valid_loss' reached 0.03060 (best 0.03060), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 18: 100%|██████████| 32/32 [02:11<00:00,  4.11s/it, loss=0.0374, v_num=1, train_loss=0.0336, valid_loss=0.030] 

Epoch 18, global step 532: 'valid_loss' reached 0.02997 (best 0.02997), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 19: 100%|██████████| 32/32 [02:10<00:00,  4.07s/it, loss=0.0359, v_num=1, train_loss=0.0397, valid_loss=0.0295]

Epoch 19, global step 560: 'valid_loss' reached 0.02954 (best 0.02954), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 20: 100%|██████████| 32/32 [02:12<00:00,  4.15s/it, loss=0.0359, v_num=1, train_loss=0.0376, valid_loss=0.0288]

Epoch 20, global step 588: 'valid_loss' reached 0.02883 (best 0.02883), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 21: 100%|██████████| 32/32 [02:11<00:00,  4.11s/it, loss=0.0365, v_num=1, train_loss=0.0345, valid_loss=0.0287]

Epoch 21, global step 616: 'valid_loss' reached 0.02870 (best 0.02870), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 22: 100%|██████████| 32/32 [02:10<00:00,  4.06s/it, loss=0.0331, v_num=1, train_loss=0.0397, valid_loss=0.0285]

Epoch 22, global step 644: 'valid_loss' reached 0.02855 (best 0.02855), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 23: 100%|██████████| 32/32 [02:13<00:00,  4.17s/it, loss=0.0332, v_num=1, train_loss=0.0356, valid_loss=0.0274]

Epoch 23, global step 672: 'valid_loss' reached 0.02744 (best 0.02744), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 24: 100%|██████████| 32/32 [02:13<00:00,  4.18s/it, loss=0.0316, v_num=1, train_loss=0.0253, valid_loss=0.0272]

Epoch 24, global step 700: 'valid_loss' reached 0.02719 (best 0.02719), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 25: 100%|██████████| 32/32 [02:13<00:00,  4.17s/it, loss=0.0324, v_num=1, train_loss=0.029, valid_loss=0.0272] 

Epoch 25, global step 728: 'valid_loss' reached 0.02718 (best 0.02718), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 26: 100%|██████████| 32/32 [02:10<00:00,  4.09s/it, loss=0.0326, v_num=1, train_loss=0.0284, valid_loss=0.0269]

Epoch 26, global step 756: 'valid_loss' reached 0.02689 (best 0.02689), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 27: 100%|██████████| 32/32 [02:11<00:00,  4.11s/it, loss=0.032, v_num=1, train_loss=0.0295, valid_loss=0.0264] 

Epoch 27, global step 784: 'valid_loss' reached 0.02643 (best 0.02643), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 28: 100%|██████████| 32/32 [02:15<00:00,  4.23s/it, loss=0.0314, v_num=1, train_loss=0.0349, valid_loss=0.0261]

Epoch 28, global step 812: 'valid_loss' reached 0.02613 (best 0.02613), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 29: 100%|██████████| 32/32 [02:11<00:00,  4.10s/it, loss=0.0317, v_num=1, train_loss=0.0335, valid_loss=0.0258]

Epoch 29, global step 840: 'valid_loss' reached 0.02581 (best 0.02581), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 30: 100%|██████████| 32/32 [02:37<00:00,  4.92s/it, loss=0.0296, v_num=1, train_loss=0.0315, valid_loss=0.0252]

Epoch 30, global step 868: 'valid_loss' reached 0.02522 (best 0.02522), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 31: 100%|██████████| 32/32 [02:12<00:00,  4.15s/it, loss=0.0296, v_num=1, train_loss=0.0382, valid_loss=0.0242]

Epoch 31, global step 896: 'valid_loss' reached 0.02421 (best 0.02421), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 32: 100%|██████████| 32/32 [02:17<00:00,  4.30s/it, loss=0.0293, v_num=1, train_loss=0.0289, valid_loss=0.0239]

Epoch 32, global step 924: 'valid_loss' reached 0.02388 (best 0.02388), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 33: 100%|██████████| 32/32 [02:10<00:00,  4.09s/it, loss=0.0306, v_num=1, train_loss=0.026, valid_loss=0.0237] 

Epoch 33, global step 952: 'valid_loss' reached 0.02372 (best 0.02372), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 34: 100%|██████████| 32/32 [02:11<00:00,  4.10s/it, loss=0.0268, v_num=1, train_loss=0.0264, valid_loss=0.0233]

Epoch 34, global step 980: 'valid_loss' reached 0.02331 (best 0.02331), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 35: 100%|██████████| 32/32 [02:11<00:00,  4.10s/it, loss=0.0287, v_num=1, train_loss=0.0231, valid_loss=0.0228]

Epoch 35, global step 1008: 'valid_loss' reached 0.02280 (best 0.02280), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 36: 100%|██████████| 32/32 [02:14<00:00,  4.20s/it, loss=0.0267, v_num=1, train_loss=0.0239, valid_loss=0.0219]

Epoch 36, global step 1036: 'valid_loss' reached 0.02188 (best 0.02188), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 37: 100%|██████████| 32/32 [02:04<00:00,  3.88s/it, loss=0.0263, v_num=1, train_loss=0.0224, valid_loss=0.0219]

Epoch 37, global step 1064: 'valid_loss' was not in top 1


Epoch 38: 100%|██████████| 32/32 [02:16<00:00,  4.26s/it, loss=0.0259, v_num=1, train_loss=0.0335, valid_loss=0.0216]

Epoch 38, global step 1092: 'valid_loss' reached 0.02156 (best 0.02156), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 39: 100%|██████████| 32/32 [02:14<00:00,  4.21s/it, loss=0.0256, v_num=1, train_loss=0.0168, valid_loss=0.021] 

Epoch 39, global step 1120: 'valid_loss' reached 0.02095 (best 0.02095), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 40: 100%|██████████| 32/32 [02:06<00:00,  3.96s/it, loss=0.0255, v_num=1, train_loss=0.0315, valid_loss=0.0207]

Epoch 40, global step 1148: 'valid_loss' reached 0.02072 (best 0.02072), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 41: 100%|██████████| 32/32 [02:10<00:00,  4.09s/it, loss=0.0248, v_num=1, train_loss=0.0281, valid_loss=0.0203]

Epoch 41, global step 1176: 'valid_loss' reached 0.02029 (best 0.02029), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 42: 100%|██████████| 32/32 [02:07<00:00,  4.00s/it, loss=0.0252, v_num=1, train_loss=0.0272, valid_loss=0.0204]

Epoch 42, global step 1204: 'valid_loss' was not in top 1


Epoch 43: 100%|██████████| 32/32 [02:11<00:00,  4.12s/it, loss=0.0236, v_num=1, train_loss=0.0217, valid_loss=0.0199]

Epoch 43, global step 1232: 'valid_loss' reached 0.01995 (best 0.01995), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 44: 100%|██████████| 32/32 [02:03<00:00,  3.86s/it, loss=0.0241, v_num=1, train_loss=0.0213, valid_loss=0.0195]

Epoch 44, global step 1260: 'valid_loss' reached 0.01953 (best 0.01953), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 45: 100%|██████████| 32/32 [02:12<00:00,  4.14s/it, loss=0.0233, v_num=1, train_loss=0.0211, valid_loss=0.020] 

Epoch 45, global step 1288: 'valid_loss' was not in top 1


Epoch 46: 100%|██████████| 32/32 [02:12<00:00,  4.15s/it, loss=0.0229, v_num=1, train_loss=0.0201, valid_loss=0.0195]

Epoch 46, global step 1316: 'valid_loss' reached 0.01952 (best 0.01952), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 47: 100%|██████████| 32/32 [02:12<00:00,  4.15s/it, loss=0.023, v_num=1, train_loss=0.0284, valid_loss=0.0192] 

Epoch 47, global step 1344: 'valid_loss' reached 0.01919 (best 0.01919), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 48: 100%|██████████| 32/32 [02:14<00:00,  4.19s/it, loss=0.0227, v_num=1, train_loss=0.0216, valid_loss=0.0191]

Epoch 48, global step 1372: 'valid_loss' reached 0.01909 (best 0.01909), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1


Epoch 49: 100%|██████████| 32/32 [02:12<00:00,  4.13s/it, loss=0.0214, v_num=1, train_loss=0.0213, valid_loss=0.019] 

Epoch 49, global step 1400: 'valid_loss' reached 0.01902 (best 0.01902), saving model to '/Users/depressedcoder/DLNLP/Assignment5/partb/checkpoints/best-checkpoint-v1.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 32/32 [02:12<00:00,  4.15s/it, loss=0.0214, v_num=1, train_loss=0.0213, valid_loss=0.019]


### Inference Model

In [13]:
test_model = T5ArithTranslator.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path
)
test_model.freeze()

t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
special_tokens_dict = {'additional_special_tokens' : ['[SEP]']}
num_added_tokens = t5_tokenizer.add_special_tokens(special_tokens_dict)


In [19]:
test_input_ids = t5_tokenizer("last stop in their field trip was the aquarium . penny identified number0 species of sharks number1 species of eels and number2 different species of whales . [SEP] how many species was penny able to identify ?",return_tensors='pt').input_ids

In [20]:
outputs = test_model.t5_model.generate(test_input_ids)
print(t5_tokenizer.decode(outputs[0], skip_special_tokens=True))

+ + number0 number1 number2
