<a href="https://colab.research.google.com/github/Kshitij-Ambilduke/NLP/blob/master/seq2seqWithattenntion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import spacy
import numpy as np
import time

In [3]:
!python -m spacy download en --quiet
!python -m spacy download de --quiet

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[K     |████████████████████████████████| 14.9MB 5.1MB/s 
[?25h  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [4]:
spacy_german = spacy.load("de")
spacy_english = spacy.load("en")


In [8]:
device = 'cuda'

In [9]:
def en_tokenizer(sen):
    tokens = []
    for token in spacy_english.tokenizer(sen):
        tokens.append(token.text)
    return tokens

def de_tokenizer(sen):
    tokens = []
    for token in spacy_german.tokenizer(sen):
        tokens.append(token.text)
    return tokens
      

In [11]:
SOURCE_Field = Field(eos_token = '<src_eos>', init_token = '<src_sos>', lower = True, tokenize = de_tokenizer)
TARGET_Field = Field(eos_token = '<trg_eos>', init_token = '<trg_sos>', lower = True, tokenize = en_tokenizer)

train_data, valid_data, test_data = Multi30k.splits(exts = (".de", ".en"),fields=(SOURCE_Field, TARGET_Field))

In [121]:
print(vars(train_data[0]))

{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [12]:
SOURCE_Field.build_vocab(train_data, min_freq=2)
TARGET_Field.build_vocab(train_data, min_freq=2)

In [13]:
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, device = device)

In [61]:
class Encoder(nn.Module):
    def __init__(self, source_vocab_len, embeddingsize, hiddensize, batchsize):
        super().__init__()
        self.batch_size = batchsize
        self.embed = nn.Embedding(source_vocab_len, embeddingsize)
        self.lstm = nn.LSTM(embeddingsize, hiddensize,num_layers=4,bidirectional = True)
        self.fc = nn.Linear(hiddensize,hiddensize)
        
    
    def forward(self, x):
        x = self.embed(x)              # input = (seqlen, batch), output = (seqlen, batch, embedding_dim)
        op , (h,c) = self.lstm(x)      # input(x)=(seqlen,batch,embedding_dim), 
        h = torch.tanh(h)              # h = (1, batch, hidden_size)
        c = torch.tanh(c)
        op = self.fc(op)
        
        return op,h,c

In [62]:
class Decoder(nn.Module):
    def __init__(self, target_vocab_len, embeddingsize, hiddensize):
        super().__init__()

        self.opsize = target_vocab_len
        self.embed = nn.Embedding(target_vocab_len,embeddingsize)
        self.lstm = nn.LSTM(embeddingsize, hiddensize,num_layers=4)
        self.fc = nn.Linear(2*hiddensize, self.opsize)
        self.smax = nn.Softmax()

    def forward(self, x, h0, c0, encoder_op):

        x = self.embed(x)                 # input    = [1, batch], output = [1, batch, embedding_dim]      
        op, (h,c) = self.lstm(x,(h0,c0))  # input(x) = [1, batch, embedding_dim], op = [1, batch, hidden_dim] 
        op = torch.tanh(op)
        # print(encoder_op.shape)
        # print(op.shape)
        at = torch.mul(encoder_op,op )
        at = torch.sum(op, dim=1)
        at = at.squeeze()
        # print(at.shape)
        at = self.smax(at)
        at = torch.mul(at,op)
        at = torch.sum(at,dim=0) #at = [128,128]
        at = at.unsqueeze(0)  #at = [1,128,128]
        # print(at.shape)
        # print(op.shape)
        op = torch.cat((at,op),dim=2)
        op = self.fc(op)                  # op = [1, batch, vocabsize] (vocabsize==output_size)

        return op,(h,c)

In [22]:
class Translator(nn.Module):

    def __init__(self,encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target):
        
        #source = [seq len, batch size]
        #target = [seq len, batch size]

        target_len = target.shape[0]
        seq_len = target.shape[0]
        vocab_len = self.decoder.opsize
        batch_size = source.shape[-1]

        enc_op, hid_state, cell_state = self.encoder(source) 

        inp = target[0,:]                 #As input to the decoder is start token  #here, inp = [batch_size]
        inp = inp.unsqueeze(0)                                                     #here, inp = [1,batch_size]
        prediction = torch.zeros(target_len, batch_size, vocab_len).to(device)
        for i in range(1, target_len):
            batch_size = source.shape[-1]
            output,state = self.decoder(inp, hid_state, cell_state, enc_op) #output = [1, batch, vocabsize]
            # print(output.shape)
            hid_state, cell_state = state
            prediction[i] = output.view(batch_size,self.decoder.opsize)
            inp = target[i].unsqueeze(0)
        # print("modelreturn=",prediction.shape)            
        return prediction




In [63]:
source_vocab_len = len(SOURCE_Field.vocab)
target_vocab_len = len(TARGET_Field.vocab)
encoder_embedding_dim = 64
decoder_embedding_dim = 64
hiddensize_encoder = 128
hiddensize_decoder = 128
enc = Encoder(source_vocab_len, encoder_embedding_dim, hiddensize_encoder, 128)
dec = Decoder(target_vocab_len, decoder_embedding_dim, hiddensize_decoder )
model = Translator(enc,dec).to(device)
optimizer = optim.Adam(model.parameters(),lr=0.001)

In [64]:
print(model)
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

Translator(
  (encoder): Encoder(
    (embed): Embedding(7855, 64)
    (lstm): LSTM(64, 128, num_layers=4)
    (fc): Linear(in_features=128, out_features=128, bias=True)
  )
  (decoder): Decoder(
    (embed): Embedding(5893, 64)
    (lstm): LSTM(64, 128, num_layers=4)
    (fc): Linear(in_features=256, out_features=5893, bias=True)
    (smax): Softmax(dim=None)
  )
)
The model has 3,402,117 trainable parameters


In [65]:
#Our loss function calculates the average loss per token, however by passing the index of the <pad> token as 
#the ignore_index argument we ignore the loss whenever the target token is a padding token.
target_padding_index = TARGET_Field.vocab.stoi[TARGET_Field.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = target_padding_index)

In [26]:
def train(model, iterator, optimizer, criterion):
  model.train()                               #just tells pytorch that we are in training phase
  epoch_loss = 0
  for i, batch in enumerate(train_iterator):
      source = batch.src
      target = batch.trg
      # print("source=",source.shape,"target=",target.shape)
      # print("t=",target.shape)
      optimizer.zero_grad()
      output = model.forward(source, target)  #target = [trg len, batch size]
                                              #output = [trg len, batch size, output dim]
      output_dim = output.shape[-1]
      # print("op=",output.shape)
      output = output[1:].view(-1,output_dim)
      target = target[1:].view(-1)
      # print("op2=",output.shape)
      # print("t=",target.shape)
      loss = criterion(output, target)
      
      loss.backward()
      optimizer.step()
      epoch_loss += loss.item()

  return epoch_loss / len(iterator)

In [75]:
for epoch in range(10):
    start = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion)
    end = time.time()
    print(train_loss,end='|')
    print(" time taken = ",end-start)



2.823803711567681| time taken =  25.852331399917603
2.783943460901403| time taken =  25.938648223876953
2.7444552198905776| time taken =  25.701021909713745
2.7068162090452756| time taken =  25.7673499584198
2.669777913240609| time taken =  25.681989192962646
2.635799615918802| time taken =  25.744008541107178
2.6033190357527545| time taken =  25.741931915283203
2.5701955625139146| time taken =  25.73134708404541
2.5383616313010062| time taken =  25.887011289596558
2.508759267529727| time taken =  25.52130389213562


In [67]:
def ipTensor(sentence, src_field):
    if isinstance(sentence, list):
        tokens = [src_field.init_token] + [token.lower() for token in sentence] + [src_field.eos_token]
    else:
        tokens = [src_field.init_token] + de_tokenizer(sentence) + [src_field.eos_token]
    seq_len = len(tokens)
    ip_tensor = torch.LongTensor([src_field.vocab.stoi[token] for token in tokens])
    return ip_tensor.view(seq_len, 1)

In [68]:
def converter(source_sen, source_field, target_field, model):
    input_tensor = ipTensor(source_sen, source_field)
    input_tensor=input_tensor.to(device)
    with torch.no_grad():
        op,h,c = model.encoder(input_tensor)
        states = (h,c)
    sos_loc = target_field.vocab.stoi[target_field.init_token]
    eos_loc = target_field.vocab.stoi[target_field.eos_token]
    predicts = [sos_loc]
    sen_len =1
    while sen_len < 50:
        inp = torch.LongTensor([predicts[-1]]).view(1,-1)
        with torch.no_grad():
            h,c=states
            inp = inp.to(device)
            h = h.to(device)
            c = c.to(device)
            op = op.to(device)
            output, states = model.decoder(inp, h,c,op)
        output = output.squeeze()
        output = output.view(-1,model.decoder.opsize)
        predicts.append(output.argmax(-1).item())
        sen_len+=1
        if predicts[-1]==eos_loc:
            break
    sentence = [target_field.vocab.itos[it] for it in predicts[1:]]
    return sentence

In [78]:
sen = ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']
output = converter(sen, SOURCE_Field, TARGET_Field, model)



In [79]:
print(output)

['two', 'young', 'young', 'a', 'white', 'soccer', 'players', 'are', 'running', 'in', 'the', 'grass', '.', '<trg_eos>']


In [None]:
['two', 'men', 'are', 'sitting', 'on', 'a', 'street', '.', '<trg_eos>']
['two', 'young', 'men', 'are', 'walking', 'in', 'the', 'water', '.', '<trg_eos>']
['two', 'young', 'young', 'a', 'white', 'soccer', 'players', 'are', 'running', 'in', 'the', 'grass', '.', '<trg_eos>']