<a href="https://colab.research.google.com/github/Kshitij-Ambilduke/NLP/blob/master/seq2seqWattenntion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import spacy
import numpy as np
import time

In [2]:
!python -m spacy download en --quiet
!python -m spacy download de --quiet

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[K     |████████████████████████████████| 14.9MB 12.7MB/s 
[?25h  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [3]:
spacy_german = spacy.load("de")
spacy_english = spacy.load("en")


In [4]:
device = 'cuda'

In [5]:
def en_tokenizer(sen):
    tokens = []
    for token in spacy_english.tokenizer(sen):
        tokens.append(token.text)
    return tokens

def de_tokenizer(sen):
    tokens = []
    for token in spacy_german.tokenizer(sen):
        tokens.append(token.text)
    return tokens
      

In [6]:
SOURCE_Field = Field(eos_token = '<src_eos>', init_token = '<src_sos>', lower = True, tokenize = de_tokenizer)
TARGET_Field = Field(eos_token = '<trg_eos>', init_token = '<trg_sos>', lower = True, tokenize = en_tokenizer)

train_data, valid_data, test_data = Multi30k.splits(exts = (".de", ".en"),fields=(SOURCE_Field, TARGET_Field))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 907kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 279kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 269kB/s]


In [32]:
print(vars(train_data[6])['src'])

['ein', 'mann', 'lächelt', 'einen', 'ausgestopften', 'löwen', 'an', '.']


In [8]:
SOURCE_Field.build_vocab(train_data, min_freq=3)
TARGET_Field.build_vocab(train_data, min_freq=3)

In [9]:
BATCH_SIZE = 128
 
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, device = device)

In [10]:
class Encoder(nn.Module):
    def __init__(self, source_vocab_len, embeddingsize, hiddensize, batchsize):
        super().__init__()
        self.batch_size = batchsize
        self.embed = nn.Embedding(source_vocab_len, embeddingsize)
        self.lstm = nn.LSTM(embeddingsize, hiddensize,bidirectional = True,num_layers=2)
        self.fc1 = nn.Linear(2*hiddensize,hiddensize)
        self.fc2 = nn.Linear(2*hiddensize,hiddensize)
        self.fc3 = nn.Linear(2*hiddensize,hiddensize)
        self.dp = nn.Dropout(0.3)
        
    
    def forward(self, x):
        x = self.embed(x)              # input = (seqlen, batch), output = (seqlen, batch, embedding_dim)
        op , (h,c) = self.lstm(x)      # input(x)=(seqlen,batch,embedding_dim), 
        h = torch.tanh(h)              # h = (1, batch, hidden_size)
        c = torch.tanh(c)
        # print("h = ",h.shape)
        # print("c = ",c.shape)
        # h = torch.cat((h[0,:,:],h[1,:,:]),dim=1)
        # c = torch.cat((c[0,:,:],c[1,:,:]),dim=1)
        h = h.view(2,h.shape[1],-1)
        c = c.view(2,c.shape[1],-1)
        h = self.fc1(h)
        c = self.fc2(c)
        op = self.dp(op)
        op = self.fc3(op)

        # print("op = ",op.shape)
        
        return op,h,c

In [11]:
class Decoder(nn.Module):
    def __init__(self, target_vocab_len, embeddingsize, hiddensize):
        super().__init__()

        self.opsize = target_vocab_len
        self.embed = nn.Embedding(target_vocab_len,embeddingsize)
        self.lstm = nn.LSTM(embeddingsize, hiddensize,num_layers=2)
        self.fc = nn.Linear(2*hiddensize, self.opsize)
        self.smax = nn.Softmax(dim=0)
    
    def forward(self, x, h0, c0, encoder_op):
      # x = 1, batch;
      # h = c = 2*2, batch, hidden_size;
      # encoder_op = seq_len, batch, embedding_size

      x = self.embed(x)  # input    = [1, batch], output = [1, batch, embedding_dim]
      op, (h,c) = self.lstm(x,(h0,c0)) #op = [1, batch, hidden_dim] 
      op = torch.tanh(op)
      at = torch.mul(encoder_op,op)   #(seq_len, batch_size, Hidden)
      at = torch.sum(at, dim=2)       #(seq_len, batch_size)
      
      at = self.smax(at) 
      at = at.unsqueeze(2)
      at = torch.mul(at,encoder_op)  
      at = torch.sum(at,dim=0)
      at = at.unsqueeze(0)  #at = [1,128,128]
      op = torch.cat((at,op),dim=2)
      op = self.fc(op)
      return op,(h,c)

In [12]:
# class Decoder(nn.Module):
#     def __init__(self, target_vocab_len, embeddingsize, hiddensize):
#         super().__init__()

#         self.opsize = target_vocab_len
#         self.embed = nn.Embedding(target_vocab_len,embeddingsize)
#         self.lstm = nn.LSTM(embeddingsize, hiddensize,num_layers=2)
#         self.fc = nn.Linear(2*hiddensize, self.opsize)
#         self.smax = nn.Softmax(dim=0)
#         self.dp = nn.Dropout(0.3)

#     def forward(self, x, h0, c0, encoder_op):

#         x = self.embed(x)                 # input    = [1, batch], output = [1, batch, embedding_dim]      
#         op, (h,c) = self.lstm(x,(h0,c0))  # input(x) = [1, batch, embedding_dim], op = [1, batch, hidden_dim] 
#         op = torch.tanh(op) 
#         # print(encoder_op.shape)
#         # print(op.shape)
#         at = torch.mul(encoder_op,op )
#         print("1",at.shape) # seqlen,batch,hidden
#         at = torch.sum(at, dim=2)
#         print("2",at.shape) #seqlen, batch
        
        
#         at = self.smax(at)
#         at = torch.mul(at,op) 
#         at = torch.sum(at,dim=0) #at = [128,128]
#         at = at.unsqueeze(0)  #at = [1,128,128]
#         # print(at.shape)
#         # print(op.shape)
#         op = torch.cat((at,op),dim=2)
#         op = torch.tanh(op)
#         op = self.dp(op)
#         # print(op.shape)
#         op = self.fc(op)    
#         # print(op.shape)              # op = [1, batch, vocabsize] (vocabsize==output_size)

#         return op,(h,c)

In [13]:
class Translator(nn.Module):

    def __init__(self,encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target):
        
        #source = [seq len, batch size]
        #target = [seq len, batch size]

        target_len = target.shape[0]
        seq_len = target.shape[0]
        vocab_len = self.decoder.opsize
        batch_size = source.shape[-1]

        enc_op, hid_state, cell_state = self.encoder(source) 

        inp = target[0,:]                 #As input to the decoder is start token  #here, inp = [batch_size]
        inp = inp.unsqueeze(0)                                                     #here, inp = [1,batch_size]
        prediction = torch.zeros(target_len, batch_size, vocab_len).to(device)
        for i in range(1, target_len):
            batch_size = source.shape[-1]
            output,state = self.decoder(inp, hid_state, cell_state, enc_op) #output = [1, batch, vocabsize]
            # print(output.shape)
            hid_state, cell_state = state
            prediction[i] = output.view(batch_size,self.decoder.opsize)
            inp = target[i].unsqueeze(0)
        # print("modelreturn=",prediction.shape)            
        return prediction




In [14]:
source_vocab_len = len(SOURCE_Field.vocab)
target_vocab_len = len(TARGET_Field.vocab)
encoder_embedding_dim = 128
decoder_embedding_dim = 128
hiddensize_encoder = 512
hiddensize_decoder = 512
enc = Encoder(source_vocab_len, encoder_embedding_dim, hiddensize_encoder, 128)
dec = Decoder(target_vocab_len, decoder_embedding_dim, hiddensize_decoder )
model = Translator(enc,dec).to(device)
optimizer = optim.Adam(model.parameters(),lr=0.006)

In [15]:
print(model)
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

Translator(
  (encoder): Encoder(
    (embed): Embedding(5376, 128)
    (lstm): LSTM(128, 512, num_layers=2, bidirectional=True)
    (fc1): Linear(in_features=1024, out_features=512, bias=True)
    (fc2): Linear(in_features=1024, out_features=512, bias=True)
    (fc3): Linear(in_features=1024, out_features=512, bias=True)
    (dp): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (embed): Embedding(4556, 128)
    (lstm): LSTM(128, 512, num_layers=2)
    (fc): Linear(in_features=1024, out_features=4556, bias=True)
    (smax): Softmax(dim=0)
  )
)
The model has 19,860,940 trainable parameters


In [16]:
#Our loss function calculates the average loss per token, however by passing the index of the <pad> token as 
#the ignore_index argument we ignore the loss whenever the target token is a padding token.
target_padding_index = TARGET_Field.vocab.stoi[TARGET_Field.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = target_padding_index)

In [23]:
def train(model, train_iterator, optimizer, criterion):
  model.train()                               #just tells pytorch that we are in training phase
  epoch_loss = 0
  for i, batch in enumerate(train_iterator):
      source = batch.src
      target = batch.trg
      # print("source=",source.shape,"target=",target.shape)
      # print("t=",target.shape)
      optimizer.zero_grad()
      output = model.forward(source, target)  #target = [trg len, batch size]
                                              #output = [trg len, batch size, output dim]
      output_dim = output.shape[-1]
      # print("op=",output.shape)
      output = output[1:].view(-1,output_dim)
      target = target[1:].view(-1)
      # print("op2=",output.shape)
      # print("t=",target.shape)
      loss = criterion(output, target)
      
      loss.backward()
      optimizer.step()
      epoch_loss += loss.item()

  return epoch_loss / len(train_iterator)

In [24]:
def eval(model, iterator, optimizer, criterion):
    model.eval()
    valos = 0
    for i,batch in enumerate(iterator):
      source = batch.src
      target = batch.trg
      # print("source=",source.shape,"target=",target.shape)
      # print("t=",target.shape)
      optimizer.zero_grad()
      output = model.forward(source, target)  #target = [trg len, batch size]
                                              #output = [trg len, batch size, output dim]
      output_dim = output.shape[-1]
      # print("op=",output.shape)
      output = output[1:].view(-1,output_dim)
      target = target[1:].view(-1)
      # print("op2=",output.shape)
      # print("t=",target.shape)
      loss = criterion(output, target)
      valos+=loss.item()
    return valos/len(iterator)

In [28]:
for epoch in range(15):
    start = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion)
    valos = eval(model, valid_iterator, optimizer, criterion)
    end = time.time()
    print()
    print("train loss = ",train_loss,end='|')
    print(" time taken = ",end-start) 
    print("train loss = ",valos,end='|')
    # print(" time taken = ",end-start) 
    print()
    if epoch%10==0:
      sen = ['ein', 'schickes', 'mädchen', 'spricht', 'mit', 'dem', 'handy', 'während', 'sie', 'langsam', 'die', 'straße', 'entlangschwebt', '.']
      output = converter(sen, SOURCE_Field, TARGET_Field, model)
      print(output)


train loss =  2.2387475111411006| time taken =  45.34497380256653
train loss =  1.9939342439174652|
['a', 'large', 'girl', 'is', 'talking', 'to', 'the', 'phone', 'as', 'they', 'are', 'walking', 'down', 'the', 'street', '.', '<trg_eos>']

train loss =  1.8063019088711507| time taken =  45.9885413646698
train loss =  1.8294150084257126|

train loss =  1.524774088208371| time taken =  46.07008385658264
train loss =  1.7655114531517029|

train loss =  1.3155297272005795| time taken =  45.98650813102722
train loss =  1.7696469128131866|

train loss =  1.1571056593357203| time taken =  45.759459257125854
train loss =  1.7717639058828354|

train loss =  1.022540029450135| time taken =  45.978429555892944
train loss =  1.7869855761528015|


KeyboardInterrupt: ignored

In [26]:
def ipTensor(sentence, src_field):
    if isinstance(sentence, list):
        tokens = [src_field.init_token] + [token.lower() for token in sentence] + [src_field.eos_token]
    else:
        tokens = [src_field.init_token] + de_tokenizer(sentence) + [src_field.eos_token]
    seq_len = len(tokens)
    ip_tensor = torch.LongTensor([src_field.vocab.stoi[token] for token in tokens])
    return ip_tensor.view(seq_len, 1)

In [27]:
def converter(source_sen, source_field, target_field, model):
    input_tensor = ipTensor(source_sen, source_field)
    input_tensor=input_tensor.to(device)
    with torch.no_grad():
        op,h,c = model.encoder(input_tensor)
        states = (h,c)
    sos_loc = target_field.vocab.stoi[target_field.init_token]
    eos_loc = target_field.vocab.stoi[target_field.eos_token]
    predicts = [sos_loc]
    sen_len =1
    while sen_len < 50:
        inp = torch.LongTensor([predicts[-1]]).view(1,-1)
        with torch.no_grad():
            h,c=states
            inp = inp.to(device)
            h = h.to(device)
            c = c.to(device)
            op = op.to(device)
            output, states = model.decoder(inp, h,c,op)
        output = output.squeeze()
        output = output.view(-1,model.decoder.opsize)
        predicts.append(output.argmax(-1).item())
        sen_len+=1
        if predicts[-1]==eos_loc:
            break
    sentence = [target_field.vocab.itos[it] for it in predicts[1:]]
    return sentence

In [30]:
sen =['ein', 'mann', 'lächelt', 'einen', 'ausgestopften', 'löwen', 'an', '.']
output = converter(sen, SOURCE_Field, TARGET_Field, model)

In [31]:
print(output)

['a', 'man', 'is', 'smiling', 'at', 'a', 'stuffed', 'lion', 'cart', '.', '<trg_eos>']


In [None]:
['a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'sitting', 'on', 'a', 'street', '.', '<trg_eos>']
['a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'standing', 'on', 'a', 'sidewalk', '.', '<trg_eos>']
['a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'walking', 'down', 'the', 'street', '.', '<trg_eos>']
['a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'sitting', 'on', 'a', 'bench', 'in', 'front', 'of', 'a', 'building', '.', '<trg_eos>']
['a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'holding', 'a', 'baby', 'while', 'wearing', 'a', 'blue', 'shirt', '.', '<trg_eos>']

In [None]:
a = torch.rand(1,2,3)
a = torch.sum(a,dim=2)
print(a.shape)


torch.Size([1, 2])


In [None]:
a = torch.arange(0,16).view(2,1,8)
# a = torch.rand(2,2,4)
b = torch.arange(0,2).view(2,1,1)
# b = torch.ones(1,2,1)
print(a)
print(b)
print(torch.mul(a,b))

tensor([[[ 0,  1,  2,  3,  4,  5,  6,  7]],

        [[ 8,  9, 10, 11, 12, 13, 14, 15]]])
tensor([[[0]],

        [[1]]])
tensor([[[ 0,  0,  0,  0,  0,  0,  0,  0]],

        [[ 8,  9, 10, 11, 12, 13, 14, 15]]])


In [36]:
for n,i in enumerate(test_data):
    src = vars(train_data[n])['src']
    trg = vars(train_data[n])['trg']
    output = converter(src, SOURCE_Field, TARGET_Field, model)
    print(trg)
    print(output)
    print()

['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']
['two', 'young', ',', 'men', 'are', 'outside', 'near', 'many', 'bushes', '.', '<trg_eos>']

['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']
['several', 'men', 'in', 'hard', 'hats', 'are', 'inspecting', 'a', 'wheelbarrow', '.', '<trg_eos>']

['a', 'little', 'girl', 'climbing', 'into', 'a', 'wooden', 'playhouse', '.']
['a', 'little', 'girl', 'climbs', 'into', 'a', 'wooden', 'playhouse', '.', '<trg_eos>']

['a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'standing', 'on', 'a', 'ladder', 'cleaning', 'a', 'window', '.']
['a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'standing', 'on', 'a', 'ladder', 'cleaning', 'a', 'window', '.', '<trg_eos>']

['two', 'men', 'are', 'at', 'the', 'stove', 'preparing', 'food', '.']
['two', 'men', 'are', 'standing', 'at', 'the', 'stove', 'preparing', 'food', '.', '<trg_eos>']

['a', 'man', 'in', 'green', 'holds', 'a', 'guita