In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split


from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from pathlib import Path

from tqdm import tqdm
import math
import warnings

In [2]:
class Embeddings(nn.Module):
    def __init__(self, vocab_size,d_model):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size 
        self.embedding = nn.Embedding(vocab_size, d_model) #initializes embeddings to be learned to training process
    def forward(self,x):
        return self.embedding(x) * math.sqrt(self.d_model) #normalize the variance of the embeddings



#if feature=2i, sin(pos/1000^(2i/d_model), if feature=2i+k, cos(pos/1000^(2i/d_model)
class Positional_Encoding(nn.Module):
    def __init__(self,seq_length,d_model,Pdrop):
        super().__init__()
        self.dropout = nn.Dropout(Pdrop)
        #if you add self before, then it gets learned as a parameter for the model
        pos = torch.arange(0,seq_length).unsqueeze(1) #pos --> seq_length_pos x 1, adds an extra dimension
        div_term = torch.exp(torch.arange(0,d_model,2)*-np.log(1000)/d_model) #div term --> 1 x div_term_i
        #so there is a different pe value for each token (pos) and feature (2i,2i+1)
        pe = torch.zeros((seq_length, d_model))
        pe[:,0::2] = torch.sin(pos*div_term)
        pe[:,1::2] = torch.cos(pos*div_term)
        pe = pe.unsqueeze(0) # --> (1, seq, d_model)
        self.register_buffer('positional_encoder',pe) #self.positional_encoder, buffer is not a learnable parameter

    def forward(self,x):
        x = x + (self.positional_encoder[:, :x.shape[1], :]).requires_grad_(False)
        return self.dropout(x)



class LayerNormalization(nn.Module):
    def __init__(self):
        super().__init__()
        self.epsilon = 1**-10 #so not divide by 0
        self.a = nn.Parameter(torch.ones(1)) #1 dimension
        self.bias = nn.Parameter(torch.zeros(1)) #1 dimension

    def forward(self,x):
        mean = x.mean(dim=-1, keepdim=True) #keep all dimensions
        std = x.std(dim=-1, keepdim=True)

        return self.a*(x-mean)/(std+self.epsilon)+self.bias


#add non linearity
class FeedForwardNetwork(nn.Module):
    def __init__(self,d_model,Pdrop, d_ff=None):
        if d_ff == None:
            d_ff = d_model*4
        super().__init__()
        self.linear1 = nn.Linear(d_model,d_ff)
        self.dropout= nn.Dropout(Pdrop)
        self.linear2 = nn.Linear(d_ff, d_model)
    
    def forward(self,x):
        x = self.dropout(F.relu(self.linear1(x)))
        return self.linear2(x)


class AddAndNorm(nn.Module):
    def __init__(self,LayerNormalization, Pdrop):
        super(AddAndNorm, self).__init__()
        self.dropout = nn.Dropout(Pdrop)
        self.LayerNormalization = LayerNormalization()
    def forward(self,x,sublayer):
        return self.LayerNormalization(x+self.dropout(sublayer))


In [3]:
#same computation overhead as self attention but more semantic representations

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, h, Pdrop):
        super().__init__()
        if d_model % h != 0:
            raise ValueError()

        d_k=d_v=d_model/h
        self.h = h #num_heads
        self.d_model = d_model #length of embeddings
        self.d_k = self.d_v = int(self.d_model / self.h)

        self.dropout= nn.Dropout(Pdrop)

        self.W_Q = nn.Linear(d_model,d_model)
        self.W_K = nn.Linear(d_model,d_model)
        self.W_V = nn.Linear(d_model,d_model)

        self.W_O = nn.Linear(d_model,d_model)
        
    
        
    def forward(self,Q,K,V, mask):
        batch_size, q_seq_length, d_model = Q.size()
        batch_size, kv_seq_length, d_model = K.size()

        queries = self.W_Q(Q)
        keys = self.W_K(K)
        values = self.W_V(V)

        queries = queries.reshape(batch_size, q_seq_length, self.h, self.d_k) #(batch size, seq_length, d_model) ==> (batch size, seq_length, h, d_k)
        keys = keys.reshape(batch_size, kv_seq_length, self.h, self.d_k)
        values = values.reshape(batch_size, kv_seq_length, self.h, self.d_v)

        queries = queries.transpose(1,2) #(batch size, h, seq_length, d_k), flips the dims 1 and 2
        keys = keys.transpose(1,2)
        values = values.transpose(1,2)

        scores = torch.matmul(queries, keys.transpose(-2,-1))/self.d_k**.5

        scores = scores.masked_fill_(mask==0, -1*10**10) #mask values that are equal to 0 with -10^10 bc e^-inf=0 in softmax will be equal to 0
        attention = F.softmax(scores, dim=-1)
        
        attention = self.dropout(attention) #dropout after activation function
            
        weighted = torch.matmul(attention, values)
        concat = weighted.reshape(batch_size, q_seq_length, d_model)
        out = self.W_O(concat) #W_O
        return out
        
x = torch.tensor([
        [[1., 2., 3., 4.],
         [2., 3., 4., 5.],
         [3., 4., 5., 6.]],

        [[4., 3., 2., 1.],
         [5., 4., 3., 2.],
         [6., 5., 4., 3.]]
])
def causal_mask(size):
    mask = torch.triu(torch.ones(1,size,size),diagonal=1).type(torch.int) #diagonal mask so can't see tokens after it when making prediction
    return mask == 0
mha = MultiHeadAttention(d_model=4,h=2,Pdrop=.1)
Q,K,V = x,x,x
mha.forward(Q,K,V,causal_mask(3))

tensor([[[-0.7581, -0.5262, -0.5710,  0.8651],
         [-1.0470, -1.1174, -0.2363, -1.7936],
         [ 1.0244, -0.3452,  0.5874, -0.5634]],

        [[-2.3001, -1.5569, -1.1608, -0.3298],
         [-1.2177, -1.5458, -0.7360, -1.8968],
         [ 2.9878,  0.5898,  1.5703, -0.2662]]], grad_fn=<ViewBackward0>)

In [4]:
class EncoderBlock(nn.Module):
    def __init__(self,d_model,h,Pdrop):
        super().__init__()
        self.MHA1 = MultiHeadAttention(d_model,h,Pdrop)
        self.FFN = FeedForwardNetwork(d_model, Pdrop, d_ff=d_model*4)
        self.residual = nn.ModuleList([AddAndNorm(LayerNormalization, Pdrop) for x in range(2)])
                                      
    def forward(self,x,mask):
       x = self.residual[0](x,self.MHA1(x,x,x,mask))
       x = self.residual[1](x,self.FFN(x))
       return x

class Encoder(nn.Module):
    def __init__(self,N,d_model,h,Pdrop):
        super().__init__()
        self.layers = nn.ModuleList([EncoderBlock(d_model,h,Pdrop) for x in range(N)])
    def forward(self,x,mask):
        for layer in self.layers:
            x = layer(x,mask)
        return x



In [5]:
class DecoderBlock(nn.Module):
    def __init__(self,d_model,h,Pdrop):
        super().__init__()
        self.MHA2 = MultiHeadAttention(d_model,h,Pdrop)
        self.MHA3 = MultiHeadAttention(d_model,h,Pdrop)
        self.FFN = FeedForwardNetwork(d_model, Pdrop, d_ff=d_model*4)
        self.residual = nn.ModuleList([AddAndNorm(LayerNormalization, Pdrop) for x in range(3)])
    def forward(self,x,encoder_output,decoder_mask, encoder_mask):
        x = self.residual[0](x,self.MHA2(x,x,x,decoder_mask))
        x = self.residual[1](x,self.MHA3(x,encoder_output,encoder_output,encoder_mask))
        x = self.residual[1](x,self.FFN(x))
        return x


class Decoder(nn.Module):
    def __init__(self,N,d_model,h,Pdrop):
        super().__init__()
        self.layers = nn.ModuleList([DecoderBlock(d_model,h,Pdrop) for x in range(N)])
    def forward(self,x,encoder_output,decoder_mask, encoder_mask):
        for layer in self.layers:
            x = layer(x,encoder_output,decoder_mask, encoder_mask)
        return x
        
        
    

In [6]:
class ProjectionLayer(nn.Module):
    def __init__(self,vocab_size,d_model):
        super(ProjectionLayer, self).__init__()
        self.proj = nn.Linear(d_model,vocab_size)
    def forward(self,x):
        return torch.log_softmax(self.proj(x),dim=-1) #softmax along the vocab size dim --> batch_size x seq_length, vocab_size (vocab_probabilities)
        


In [7]:
size = 10
torch.triu(torch.ones(1,size,size),diagonal=1)

tensor([[[0., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 0., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])

In [8]:
class Transformer(nn.Module):
    def __init__(self,N,src_vocab_size,tgt_vocab_size, src_seq_length, tgt_seq_length, d_model,h,Pdrop):
        super().__init__()
        self.encoder = Encoder(N,d_model,h,Pdrop) #this is a nn.module so it runs forward
        self.decoder = Decoder(N,d_model,h,Pdrop)
        self.src_embeddings = Embeddings(src_vocab_size, d_model)
        self.tgt_embeddings = Embeddings(tgt_vocab_size, d_model)
        self.src_positional_encoding = Positional_Encoding(src_seq_length,d_model,Pdrop)
        self.tgt_positional_encoding = Positional_Encoding(tgt_seq_length,d_model,Pdrop)
        self.projection_layer = ProjectionLayer(tgt_vocab_size,d_model)

    def encode(self,encoder_input, encoder_mask):
        #encoder_input is tokens
        x = self.src_embeddings(encoder_input)
        x = self.src_positional_encoding(x)
        x = self.encoder(x, encoder_mask)
        return x
        
    def decode(self,decoder_input, encoder_output, decoder_mask, encoder_mask):
        #decoder_input is tokens
        x = self.tgt_embeddings(decoder_input)
        x = self.tgt_positional_encoding(x)
        x = self.decoder(x,encoder_output, decoder_mask, encoder_mask)
        return x
        
    def project(self,x):
        return self.projection_layer(x)


# t = Transformer()
encoder_input = torch.tensor([2, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

decoder_input = torch.tensor([[2]])

In [9]:
#process dataset
class ProcessDataset(Dataset):
    def __init__(self,data, tokenizer_src, tokenizer_tgt,seq_length, lang1, lang2):
        super().__init__()
        
        self.data = data
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.lang1 = lang1
        self.lang2 = lang2
        
        self.seq_length = seq_length
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)
        
    def __len__(self):
        return len(self.data)
    
        
    # Using the index to retrive source and target texts
    def __getitem__(self, index):
        pair = self.data[index]
        src_txt = pair['translation'][self.lang1]
        tgt_txt = pair['translation'][self.lang2]
        
        encoder_input_tokens = self.tokenizer_src.encode(src_txt).ids
        decoder_input_tokens = self.tokenizer_tgt.encode(tgt_txt).ids
        
        
        encoder_num_padding_tokens = self.seq_length - len(encoder_input_tokens) - 2 # Subtracting the two '[EOS]' and '[SOS]' special tokens
        decoder_num_padding_tokens = self.seq_length - len(decoder_input_tokens) - 1 # Subtracting the '[SOS]' special token
        
        if encoder_num_padding_tokens < 0 or decoder_num_padding_tokens < 0: #check if sentence is longer than seq_length - special tokens
            raise ValueError('Sentence is too long')
         
        #get encoder input w/ special tokens
        encoder_input = torch.cat(
            [
            self.sos_token, # inserting the '[SOS]' token
            torch.tensor(encoder_input_tokens, dtype = torch.int64), # Inserting the tokenized source text
            self.eos_token, # Inserting the '[EOS]' token
            torch.tensor([self.pad_token] * encoder_num_padding_tokens, dtype = torch.int64) # Addind padding tokens
            ]
        )
        
        # get decoder input w/ special token SOS
        decoder_input = torch.cat(
            [
                self.sos_token, # inserting the '[SOS]' token 
                torch.tensor(decoder_input_tokens, dtype = torch.int64), # Inserting the tokenized target text
                torch.tensor([self.pad_token] * decoder_num_padding_tokens, dtype = torch.int64) # Addind padding tokens
            ]
        
        )
        
        # create a label, which is just the correct tgt output (decoder_input_tokens + EOS + padding)
        #different than decoder_input because no SOS and has EOS before padding
        label = torch.cat(
            [
                torch.tensor(decoder_input_tokens, dtype = torch.int64), # Inserting the tokenized target text
                self.eos_token, # Inserting the '[EOS]' token 
                torch.tensor([self.pad_token] * decoder_num_padding_tokens, dtype = torch.int64) # adding padding tokens (same # as decoder input because replace SOS with EOS)
                
            ]
        )
        
        
        
        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input, 
            #unsqueeze twice, makes it 1,1,seq_length, which makes sense because it gives different mask for each token in the sequence which is in a batch
            'encoder_mask': (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), #unmask everything that is not a padding token, ie. the entire encoder_input is unmasked
            'decoder_mask': (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int() & causal_mask(decoder_input.size(0)), #mask everything that is a padding token and also use causal mask
            'label': label,
            'src_text': src_txt,
            'tgt_text': tgt_txt
        }    


In [63]:
class Validation():
    def run_prediction(self,encoder_input, encoder_mask):
        max_length=10
        sos_token = self.tokenizer_tgt.token_to_id('[SOS]') #sos token
        eos_token = self.tokenizer_tgt.token_to_id('[EOS]')
        encoder_output = self.model.encode(encoder_input,encoder_mask)
        decoder_input = torch.empty(1,1).fill_(sos_token).type_as(encoder_input).to(device)
        while True:
            if decoder_input.size(1) == max_length: break
            decoder_mask = causal_mask(decoder_input.size(1))
            decoder_output = self.model.decode(decoder_input, encoder_output,decoder_mask, encoder_mask) #seq_length x d_model
            probabilities = self.model.project(decoder_output[:,-1]) #batch_size x seq_length, vocab_size
            max_probability, next_word = torch.max(probabilities, dim=1) #
            decoder_input = torch.cat([decoder_input, torch.empty(1,1). type_as(encoder_input).fill_(next_word.item()).to(device)], dim=1)

            if next_word == eos_token:
                break
            
        return decoder_input.squeeze(0)

    def run_validation(self):
        self.model.eval()
        with torch.no_grad():
            batch_iterator = tqdm(self.val_dataloader, desc = f'Validation')
            for batch in batch_iterator:
                encoder_input = batch['encoder_input']
                encoder_mask = batch['encoder_mask']
                result = self.run_prediction(encoder_input, encoder_mask)
                label = batch['label']
                src_text = batch['src_txt']
                tgt_txt = batch['tgt_text']
                predicted_txt = self.tokenizer_tgt.decode(result.detach().cpu().numpy())
                print(src_text, tgt_txt,predicted_txt)

    def predict(self, src_txt):
        sos_token = torch.tensor([self.tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        eos_token = torch.tensor([self.tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        pad_token = torch.tensor([self.tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)
        
        self.model.eval()
        with torch.no_grad():
            encoder_input_tokens = self.tokenizer_src.encode(src_txt).ids
            encoder_num_padding_tokens = self.config["src_seq_length"] - len(encoder_input_tokens) - 2 # Subtracting the two '[EOS]' and '[SOS]' special tokens
            #get encoder input w/ special tokens
            encoder_input = torch.cat(
                [
                sos_token,
                torch.tensor(encoder_input_tokens, dtype = torch.int64), 
                eos_token,
                torch.tensor([pad_token] * encoder_num_padding_tokens, dtype = torch.int64) 
                ]
            ).unsqueeze(0)
            encoder_mask = (encoder_input != pad_token).unsqueeze(0).unsqueeze(0).int()
            result = self.run_prediction(encoder_input, encoder_mask)
            predicted_txt = self.tokenizer_tgt.decode(result.detach().cpu().numpy())
            print(f"input: {src_txt}, output: {predicted_txt}")
            return predicted_txt
            
            
    


In [64]:
class Data():
    def __init__(self,config):
        self.tokenizer_file = "tokenizer {}.json"
        self.config = config
        self.train_dataloader, self.val_dataloader, self.tokenizer_src, self.tokenizer_tgt = self.get_data()
    def get_sentences(self,data,lang):
        for pair in data:
            yield pair['translation'][lang] #generator function, executes until yield, state is saved, then resumes from same spot
            #allows to process data without loading all to memory
            
    def build_tokenizer(self,data,lang):
        tokenizer_path = Path(self.tokenizer_file.format(lang))
        newTokenizer = True #not Path.exists(tokenizer_path)
        if newTokenizer: 
            tokenizer = Tokenizer(WordLevel(unk_token = '[UNK]')) #word level tokenizer
            tokenizer.pre_tokenizer = Whitespace() #based on whitespace
            trainer = WordLevelTrainer(special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency = 2) #only keep tokens that show up at least twice in the vocab
            tokenizer.train_from_iterator(self.get_sentences(data, lang), trainer = trainer)
            tokenizer.save(str(tokenizer_path))
        else: 
            tokenizer = Tokenizer.from_file(str(tokenizer_path))
        return tokenizer

    def get_data(self):
        seq_length = self.config["src_seq_length"] #same for both languages
        lang1 = self.config["lang1"]
        lang2 = self.config["lang2"]
        
        data = load_dataset('opus_books', f'{lang1}-{lang2}', split = 'train') 
        #build tokenizers
        tokenizer_src = self.build_tokenizer(data, lang1)
        tokenizer_tgt = self.build_tokenizer(data, lang2)

        #split for train and validation
        train_data_size = int(0.9 * len(data)) 
        val_data_size = len(data) - train_data_size 
        train_data_raw, val_data_raw = random_split(data, [train_data_size, val_data_size]) #randomly split .9 for training, .1 for validation
    
        #process dataset using Dataset class from pytorch
        train_data = ProcessDataset(train_data_raw, tokenizer_src, tokenizer_tgt, seq_length,lang1, lang2)
        val_data = ProcessDataset(val_data_raw, tokenizer_src, tokenizer_tgt, seq_length, lang1, lang2)
                                        
        # Dataloaders iterate in batches
        train_dataloader = DataLoader(train_data, batch_size = self.config['batch_size'], shuffle = True) 
        val_dataloader = DataLoader(val_data, batch_size = 1, shuffle = True)
        
        return train_dataloader,val_dataloader, tokenizer_src, tokenizer_tgt # Returning the DataLoader objects and tokenizers

In [75]:
class BuildTransformer(Validation):
    def __init__(self,data, config):
        self.data = data
        self.train_dataloader, self.val_dataloader, self.tokenizer_src, self.tokenizer_tgt = self.data.train_dataloader, self.data.val_dataloader, self.data.tokenizer_src, self.data.tokenizer_tgt
        self.config = config
        self.seq_length = self.config['src_seq_length'] #for src and tgt
        
        self.get_model()
        #self.run_validation()

    
    def get_transformer(self):
        N = self.config['N']
        src_vocab_size = self.config['src_vocab_size']
        tgt_vocab_size = self.config['tgt_vocab_size']
        src_seq_length = self.config['src_seq_length']
        tgt_seq_length = self.config['tgt_seq_length']
        d_model = self.config['d_model']
        h = self.config['h']
        Pdrop = self.config['Pdrop']
        
        transformer = Transformer(N, src_vocab_size, tgt_vocab_size, src_seq_length, tgt_seq_length, d_model, h, Pdrop)
       
        for p in transformer.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
                
            return transformer

    
        
    def get_model(self):
        Path('model_weights').mkdir(parents=True, exist_ok=True)
        
        self.config["src_vocab_size"] = self.tokenizer_src.get_vocab_size()
        self.config["tgt_vocab_size"] = self.tokenizer_tgt.get_vocab_size()
        self.model = self.get_transformer().to(device)
    def computeLoss(self, proj_output, label, loss_fn,optimizer):
        loss = loss_fn(proj_output.view(-1, self.tokenizer_tgt.get_vocab_size()), label.view(-1))
        loss.backward() # backpropagation
        optimizer.step() # update parameters based on the gradients
        optimizer.zero_grad() # clear gradients for net batch
        return loss
        
    def train(self):
        
        optimizer = torch.optim.Adam(self.model.parameters(), lr=config['lr'], eps = 1e-9)
        initial_epoch = 0
        global_step = 0 # how many batches
    
        # check if model already pretrained, if so load model and state
        if self.config['preload']:
            epoch = self.config["preload_epoch"]
            model_filename =  f"model_weights/weights_{epoch}.pt"
            state = torch.load(model_filename) 
            initial_epoch = state['epoch'] + 1
            optimizer.load_state_dict(state['optimizer_state_dict'])
            global_step = state['global_step']
        
        # cross entropy loss (L = - Σ_(for x in X) [y_x * log(p_x)])
        # label smoothing, ignore padding when computing loss
        loss_fn = nn.CrossEntropyLoss(ignore_index = self.tokenizer_src.token_to_id('[PAD]'), label_smoothing = 0.1).to(device)
        
        for epoch in range(initial_epoch, self.config['num_epochs']):
            self.model.train() #put into training mode (from model.eval())
            n_seconds = 5
            total_batches = len(self.train_dataloader)
            print(f"Number of Batches {total_batches}")
            batch_iterator = tqdm(transformer.train_dataloader, desc = f'epoch {epoch}',position=0, leave=True, mininterval=n_seconds)
            i = 0
            for batch in batch_iterator:
                encoder_input = batch['encoder_input'].to(device)  #load data to device (GPU if available)
                decoder_input = batch['decoder_input'].to(device)
                encoder_mask = batch['encoder_mask'].to(device)
                decoder_mask = batch['decoder_mask'].to(device)
            
                encoder_output = self.model.encode(encoder_input, encoder_mask)
                decoder_output = self.model.decode(decoder_input,encoder_output, decoder_mask,encoder_mask)
                proj_output = self.model.project(decoder_output)
                

               
                label = batch['label'].to(device)  # load target labels to device
                loss = self.computeLoss(proj_output,label, loss_fn,optimizer)
                print(f"Loss {loss}")
                batch_iterator.set_postfix({f"loss": f"{loss.item():6.3f}"})  #update progress bar
                
                global_step += 1
                
                # if i % 5 == 0: 
                #     batch_iterator.update(5)
                i += 1
            model_filename = f"model_weights/weights_{epoch}.pt"
          
            torch.save({
                'epoch': epoch, 
                'model_state_dict': self.model.state_dict(),# Current model state
                'optimizer_state_dict': optimizer.state_dict(), # Current optimizer state
                'global_step': global_step # Current global step 
            }, model_filename)
            
        

        

In [76]:
config = {
            "N": 6,
            "src_seq_length": 350,
            "tgt_seq_length": 350,
            "d_model": 512,
            "h": 8,
            "Pdrop": .1,
            "batch_size": 8, #size 20 is 1455 batches
            "num_epochs": 1,
            "lr": 10**-4,
            "lang1":"en",
            "lang2": "it",
            "preload":False,
            "preload_epoch":0,
            "num_batches":1
        }
# data = Data(config)

In [77]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [78]:
from tqdm import tqdm


transformer = BuildTransformer(data,config)


In [79]:
transformer.train()

Number of Batches 3638


epoch 0:   0%|                 | 1/3638 [00:14<14:52:31, 14.72s/it, loss=10.018]

Loss 10.017631530761719


epoch 0:   0%|                  | 2/3638 [00:28<14:28:27, 14.33s/it, loss=9.992]

Loss 9.992072105407715


epoch 0:   0%|                  | 3/3638 [00:41<13:41:41, 13.56s/it, loss=9.952]

Loss 9.951912879943848


epoch 0:   0%|                  | 4/3638 [00:53<13:12:29, 13.08s/it, loss=9.904]

Loss 9.903790473937988


epoch 0:   0%|                  | 4/3638 [01:05<16:27:24, 16.30s/it, loss=9.904]


KeyboardInterrupt: 

In [71]:
transformer.predict("hello")

input: hello, output: manico convenire civili cantate pentiti torte scelti Potrei Potrei


'manico convenire civili cantate pentiti torte scelti Potrei Potrei'

In [740]:
transformer.tokenizer_src.encode("what").ids

[0]

In [766]:
[1,2][:True]

[1]

In [38]:
list(getattr(tqdm, '_instances'))

for instance in list(tqdm._instances):
    tqdm._decr_instances(instance)

In [73]:
from tqdm import tqdm
import time
for i in tqdm(transformer.train_dataloader, desc = 'tqdm() Progress Bar'):
    time.sleep(0.5)

tqdm() Progress Bar:   0%|                     | 5/3638 [00:03<36:30,  1.66it/s]


KeyboardInterrupt: 