<a href="https://colab.research.google.com/github/Kshitij-Ambilduke/NLP/blob/master/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import spacy
import numpy as np
import time

In [None]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU, and show the choice.
# NOTE(review): no later cell visible in this notebook moves the model or batches to `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [None]:
!python -m spacy download en --quiet
!python -m spacy download de --quiet

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[K     |████████████████████████████████| 14.9MB 792kB/s 
[?25h  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [None]:
# Load the pipelines by their full package names rather than the "de"/"en" shortcut links.
# The download output above reports both models as loadable under these names, and the
# shortcut links are not available in newer spaCy releases.
spacy_german = spacy.load("de_core_news_sm")
spacy_english = spacy.load("en_core_web_sm")


In [None]:
vars(spacy_german)

{'_meta': {'accuracy': {'ents_f': 82.998702191,
   'ents_p': 83.4885634359,
   'ents_r': 82.5145558435,
   'las': 88.5644840323,
   'tags_acc': 96.2969806963,
   'token_acc': 95.8813352983,
   'uas': 90.713095072},
  'author': 'Explosion',
  'description': 'German multi-task CNN trained on the TIGER and WikiNER corpus. Assigns context-specific token vectors, POS tags, dependency parse and named entities. Supports identification of PER, LOC, ORG and MISC entities.',
  'email': 'contact@explosion.ai',
  'factories': {'ner': 'ner', 'parser': 'parser', 'tagger': 'tagger'},
  'labels': OrderedDict([('tagger',
                ['$(',
                 '$,',
                 '$.',
                 'ADJA',
                 'ADJD',
                 'ADV',
                 'APPO',
                 'APPR',
                 'APPRART',
                 'APZR',
                 'ART',
                 'CARD',
                 'FM',
                 'ITJ',
                 'KOKOM',
                 'KO

In [None]:
def en_tokenizer(sen):
    """Split an English sentence into a list of token strings using the spaCy English tokenizer."""
    return [piece.text for piece in spacy_english.tokenizer(sen)]

def de_tokenizer(sen):
    """Split a German sentence into a list of token strings using the spaCy German tokenizer."""
    return [piece.text for piece in spacy_german.tokenizer(sen)]
      

In [None]:
# Source (German) and target (English) fields. Both are configured with lower=True, their own
# start/end-of-sequence tokens, and the matching spaCy-based tokenizer defined above.
SOURCE_Field = Field(eos_token = '<src_eos>', init_token = '<src_sos>', lower = True, tokenize = de_tokenizer)
TARGET_Field = Field(eos_token = '<trg_eos>', init_token = '<trg_sos>', lower = True, tokenize = en_tokenizer)



In [None]:
vars(SOURCE_Field)

{'batch_first': False,
 'dtype': torch.int64,
 'eos_token': '<src_eos>',
 'fix_length': None,
 'include_lengths': False,
 'init_token': '<src_sos>',
 'is_target': False,
 'lower': True,
 'pad_first': False,
 'pad_token': '<pad>',
 'postprocessing': None,
 'preprocessing': None,
 'sequential': True,
 'stop_words': None,
 'tokenize': <function __main__.de_tokenizer>,
 'truncate_first': False,
 'unk_token': '<unk>',
 'use_vocab': True,
 'vocab': <torchtext.vocab.Vocab at 0x7f0c29ed60b8>}

In [None]:
# Load the Multi30k train/validation/test splits; the ".de" side is processed by SOURCE_Field
# and the ".en" side by TARGET_Field (exts and fields are given in the same order).
train_data, valid_data, test_data = Multi30k.splits(exts = (".de", ".en"),fields=(SOURCE_Field, TARGET_Field))


downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 602kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 170kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 163kB/s]


In [None]:
vars(test_data[0])

{'src': ['ein',
  'mann',
  'mit',
  'einem',
  'orangefarbenen',
  'hut',
  ',',
  'der',
  'etwas',
  'anstarrt',
  '.'],
 'trg': ['a',
  'man',
  'in',
  'an',
  'orange',
  'hat',
  'starring',
  'at',
  'something',
  '.']}

In [None]:
# Build both vocabularies from the training split only, passing min_freq=2 as the minimum
# token frequency for inclusion in the vocabulary.
SOURCE_Field.build_vocab(train_data, min_freq=2)
TARGET_Field.build_vocab(train_data, min_freq=2)

In [None]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [None]:
print(vars(train_data.examples[0]))

{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [None]:
# Number of sentence pairs per batch, shared by all three iterators below.
BATCH_SIZE = 128

# One iterator per split. No `device` argument is passed here.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE)

In [None]:
class Encoder(nn.Module):
    """Embed a source token sequence and encode it with a stacked LSTM.

    Args:
        source_vocab_len: Number of rows in the source embedding table.
        embeddingsize: Dimension of each token embedding.
        hiddensize: Hidden size of every LSTM layer.
        batchsize: Stored as ``self.batch_size``; ``forward`` does not read it.
        num_layers: Number of stacked LSTM layers. Defaults to 4, the value
            that used to be hard-coded.
    """

    def __init__(self, source_vocab_len, embeddingsize, hiddensize, batchsize, num_layers=4):
        super().__init__()
        self.batch_size = batchsize
        self.num_layers = num_layers
        self.embed = nn.Embedding(source_vocab_len, embeddingsize)
        self.lstm = nn.LSTM(embeddingsize, hiddensize, num_layers=num_layers)

    def forward(self, x):
        """Encode ``x`` and return the tanh-squashed final LSTM states.

        Args:
            x: Token indices laid out as (seqlen, batch).

        Returns:
            Tuple ``(h, c)``, each of shape (num_layers, batch, hiddensize).
        """
        embedded = self.embed(x)          # (seqlen, batch) -> (seqlen, batch, embedding_dim)
        _, (h, c) = self.lstm(embedded)   # h, c = (num_layers, batch, hidden_size)
        # Both final states are passed through tanh before being handed to the decoder.
        h = torch.tanh(h)
        c = torch.tanh(c)
        return h, c

In [None]:
class Decoder(nn.Module):
    """Single-step decoder: embedding, stacked LSTM, then a linear projection to the vocabulary.

    Args:
        target_vocab_len: Size of the target vocabulary; also the output dimension (``self.opsize``).
        embeddingsize: Dimension of each token embedding.
        hiddensize: Hidden size of every LSTM layer.
        num_layers: Number of stacked LSTM layers. Defaults to 4, the value
            that used to be hard-coded.
    """

    def __init__(self, target_vocab_len, embeddingsize, hiddensize, num_layers=4):
        super().__init__()

        self.opsize = target_vocab_len
        self.num_layers = num_layers
        self.embed = nn.Embedding(target_vocab_len, embeddingsize)
        self.lstm = nn.LSTM(embeddingsize, hiddensize, num_layers=num_layers)
        self.fc = nn.Linear(hiddensize, self.opsize)

    def forward(self, x, h0, c0):
        """Run the decoder on ``x`` starting from the LSTM state ``(h0, c0)``.

        Args:
            x: Token indices laid out as (steps, batch); the callers in this notebook pass one step.
            h0: Initial hidden state, (num_layers, batch, hiddensize).
            c0: Initial cell state, (num_layers, batch, hiddensize).

        Returns:
            ``(op, (h, c))`` where ``op`` holds the vocabulary scores, shaped
            (steps, batch, opsize), and ``(h, c)`` is the updated LSTM state.
        """
        embedded = self.embed(x)                    # (steps, batch) -> (steps, batch, embedding_dim)
        op, (h, c) = self.lstm(embedded, (h0, c0))  # op = (steps, batch, hidden_dim)
        op = torch.tanh(op)                         # squash LSTM outputs before the projection
        op = self.fc(op)                            # op = (steps, batch, opsize)

        return op, (h, c)


In [None]:
class Translator(nn.Module):
    """Sequence-to-sequence wrapper that feeds the encoder's final state into the decoder.

    Args:
        encoder: Module whose call returns ``(hidden_state, cell_state)`` for a source batch.
        decoder: Module exposing ``opsize`` and returning ``(output, (h, c))`` for one input step.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target):
        """Decode ``target`` step by step, always feeding the ground-truth previous token.

        Args:
            source: Source token indices, [src len, batch size].
            target: Target token indices, [trg len, batch size]; row 0 is the start token.

        Returns:
            Tensor [trg len, batch size, vocab size]. Row 0 is left as zeros because
            nothing is predicted for the start token.
        """
        target_len = target.shape[0]
        batch_size = source.shape[-1]
        vocab_len = self.decoder.opsize

        hid_state, cell_state = self.encoder(source)

        # Allocate the result on the same device as the inputs so the per-step
        # assignment below does not mix CPU and GPU tensors.
        prediction = torch.zeros(target_len, batch_size, vocab_len, device=source.device)

        inp = target[0, :].unsqueeze(0)   # start tokens, [1, batch size]
        for i in range(1, target_len):
            output, (hid_state, cell_state) = self.decoder(inp, hid_state, cell_state)  # [1, batch, vocab]
            prediction[i] = output.view(batch_size, vocab_len)
            inp = target[i].unsqueeze(0)  # next input is the ground-truth token at step i
        return prediction




In [None]:
# Vocabulary sizes come from the vocabularies built on the training split.
source_vocab_len = len(SOURCE_Field.vocab)
target_vocab_len = len(TARGET_Field.vocab)
encoder_embedding_dim = 64
decoder_embedding_dim = 64
hiddensize_encoder = 128
hiddensize_decoder = 128
# The encoder's final LSTM state is passed directly to the decoder LSTM, so the sizes must agree.
assert hiddensize_encoder == hiddensize_decoder, "encoder and decoder hidden sizes must match"
# Reuse BATCH_SIZE instead of repeating the literal 128 so the two cannot drift apart.
enc = Encoder(source_vocab_len, encoder_embedding_dim, hiddensize_encoder, BATCH_SIZE)
dec = Decoder(target_vocab_len, decoder_embedding_dim, hiddensize_decoder )
model = Translator(enc,dec)
optimizer = optim.Adam(model.parameters(),lr=0.001)

In [None]:
print(model)
def count_parameters(model):
    """Return the total element count of the parameters of ``model`` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

Translator(
  (encoder): Encoder(
    (embed): Embedding(7855, 64)
    (lstm): LSTM(64, 128, num_layers=4)
  )
  (decoder): Decoder(
    (embed): Embedding(5893, 64)
    (lstm): LSTM(64, 128, num_layers=4)
    (fc): Linear(in_features=128, out_features=5893, bias=True)
  )
)
The model has 2,631,301 trainable parameters


In [None]:
# The loss is cross-entropy averaged over target tokens. Passing the index of the <pad> token
# as `ignore_index` excludes every position whose target is padding from that average.
target_padding_index = TARGET_Field.vocab.stoi[TARGET_Field.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = target_padding_index)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over ``iterator`` and return the mean batch loss.

    Args:
        model: Module called as ``model(source, target)`` and returning scores
            shaped [trg len, batch size, output dim].
        iterator: Iterable of batches exposing ``.src`` and ``.trg`` tensors.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss taking (scores [N, output dim], targets [N]).

    Returns:
        Sum of the per-batch losses divided by the number of batches seen,
        or 0.0 when ``iterator`` yields no batches.
    """
    model.train()  # put the model in training mode
    epoch_loss = 0.0
    num_batches = 0
    # Iterate the iterator that was passed in, not a notebook-level global.
    for batch in iterator:
        source = batch.src
        target = batch.trg                 # [trg len, batch size]

        optimizer.zero_grad()
        output = model(source, target)     # [trg len, batch size, output dim]
        output_dim = output.shape[-1]

        # Position 0 is the start token and has no prediction, so drop it from both
        # tensors. reshape() is used because a slice is not guaranteed to be contiguous.
        output = output[1:].reshape(-1, output_dim)
        target = target[1:].reshape(-1)

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        return 0.0
    return epoch_loss / num_batches


In [None]:
# Train for 10 epochs; after each one print the value returned by train() followed by
# the wall-clock seconds the epoch took.
for epoch in range(10):
    start = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion)
    end = time.time()
    print(train_loss,end='|')
    print(" time taken = ",end-start )

3.5261150152147605| time taken =  380.70202469825745
3.4479502476259474| time taken =  380.0669410228729
3.3693812019499387| time taken =  388.0589597225189
3.293200131554961| time taken =  383.5241780281067
3.2251068547958845| time taken =  392.76274013519287
3.16307433271198| time taken =  384.28492760658264


In [None]:
def ipTensor(sentence, src_field):
    """Convert a source sentence into a column tensor of vocabulary indices.

    Args:
        sentence: Either a list of token strings or a raw string. A raw string
            is tokenized with ``src_field.tokenize``.
        src_field: Field-like object providing ``init_token``, ``eos_token``,
            ``tokenize``, ``vocab.stoi`` and, optionally, ``lower``.

    Returns:
        LongTensor of shape (number of tokens + 2, 1): the start token, the
        sentence tokens and the end token, each mapped through ``vocab.stoi``.
    """
    if isinstance(sentence, list):
        words = list(sentence)
    else:
        # Use the field's own tokenizer so the text is split the same way as the training data.
        words = list(src_field.tokenize(sentence))

    # Lowercase on both paths when the field lowercases its data; otherwise
    # raw-string input would miss vocabulary entries that were built lowercased.
    if getattr(src_field, "lower", True):
        words = [word.lower() for word in words]

    tokens = [src_field.init_token] + words + [src_field.eos_token]
    indices = [src_field.vocab.stoi[token] for token in tokens]
    return torch.LongTensor(indices).view(len(tokens), 1)

In [None]:
def converter(source_sen, source_field, target_field, model, max_len=50):
    """Greedily translate one source sentence with ``model``.

    Args:
        source_sen: Source sentence, as accepted by ``ipTensor`` (token list or raw string).
        source_field: Field used to index the source tokens.
        target_field: Field whose vocabulary maps predicted indices back to tokens.
        model: Object exposing ``encoder`` and ``decoder`` as used by ``Translator``.
        max_len: Upper bound on the decoded length, counting the start token.
            Defaults to 50, the value that used to be hard-coded.

    Returns:
        List of predicted target tokens without the start token. The end token
        is included when it was produced before ``max_len`` was reached.
    """
    # Build the inputs on the device that holds the model's parameters.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    input_tensor = ipTensor(source_sen, source_field).to(model_device)
    sos_loc = target_field.vocab.stoi[target_field.init_token]
    eos_loc = target_field.vocab.stoi[target_field.eos_token]
    predicts = [sos_loc]

    # Decode in eval mode, then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            h, c = model.encoder(input_tensor)
            while len(predicts) < max_len:
                inp = torch.tensor([[predicts[-1]]], dtype=torch.long, device=model_device)  # [1, 1]
                output, (h, c) = model.decoder(inp, h, c)
                next_token = output.view(-1, model.decoder.opsize).argmax(-1).item()
                predicts.append(next_token)
                if next_token == eos_loc:
                    break
    finally:
        model.train(was_training)

    return [target_field.vocab.itos[it] for it in predicts[1:]]

In [None]:
sen = ['ein', 'einzelner', 'mann', 'steht', 'abends', 'auf', 'einer', 'brücke', '.']
output = converter(sen, SOURCE_Field, TARGET_Field, model)

NameError: ignored

In [None]:
print(output)

['a', 'man', 'is', 'standing', 'on', 'the', 'sidewalk', 'next', 'to', 'a', 'woman', '.', '<trg_eos>']


In [None]:
print(output)

['two', 'men', 'in', 'a', 'blue', 'shirt', 'and', 'a', 'woman', 'are', 'sitting', 'on', 'a', 'bench', '.', '<trg_eos>']


In [None]:
print(output)

['a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'standing', 'on', 'a', 'bench', '.', '<trg_eos>']


In [None]:
# Debugging aid: print the source and target tensor shapes of every batch in train_iterator.
# NOTE(review): this emits one entry per batch, which floods the saved notebook output.
for i in train_iterator:
    print(i.src.shape)
    print(i.trg.shape)
    print()

torch.Size([28, 128])
torch.Size([28, 128])

torch.Size([24, 128])
torch.Size([25, 128])

torch.Size([29, 128])
torch.Size([34, 128])

torch.Size([33, 128])
torch.Size([35, 128])

torch.Size([30, 128])
torch.Size([28, 128])

torch.Size([31, 128])
torch.Size([27, 128])

torch.Size([46, 128])
torch.Size([37, 128])

torch.Size([26, 128])
torch.Size([27, 128])

torch.Size([32, 128])
torch.Size([32, 128])

torch.Size([30, 128])
torch.Size([26, 128])

torch.Size([34, 128])
torch.Size([35, 128])

torch.Size([33, 128])
torch.Size([35, 128])

torch.Size([24, 128])
torch.Size([28, 128])

torch.Size([26, 128])
torch.Size([34, 128])

torch.Size([26, 128])
torch.Size([27, 128])

torch.Size([29, 128])
torch.Size([33, 128])

torch.Size([26, 128])
torch.Size([25, 128])

torch.Size([33, 128])
torch.Size([29, 128])

torch.Size([29, 128])
torch.Size([28, 128])

torch.Size([36, 128])
torch.Size([37, 128])

torch.Size([30, 128])
torch.Size([27, 128])

torch.Size([33, 128])
torch.Size([27, 128])

torch.Size