In [0]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np
import random
import math
import time


In [0]:
! python -m spacy download de

Collecting de_core_news_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.1.0/de_core_news_sm-2.1.0.tar.gz (11.1MB)
[K     |████████████████████████████████| 11.1MB 717kB/s 
[?25hBuilding wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.1.0-cp36-none-any.whl size=11073065 sha256=a0359fd3e8fa8661f5fd169667147c02014040af13849db7afcb9e900d89ab92
  Stored in directory: /tmp/pip-ephem-wheel-cache-0lmft77d/wheels/b4/8b/5e/d2ce5d2756ca95de22f50f68299708009a4aafda2aea79c4e4
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/

In [0]:
spacy_ger=spacy.load('de')
spacy_eng=spacy.load('en')


In [0]:
def tokenize_ger(text):

  return [tok.text for tok in spacy_ger.tokenizer(text)]

def tokenize_eng(text):

  return[ tok.text for tok in spacy_eng.tokenizer(text)]


In [0]:
##Creating the Field

src=Field(tokenize=tokenize_ger,
          init_token='<sos>',
          eos_token='<eos>',
          lower=True)

trg=Field(tokenize=tokenize_eng,
          init_token='<sos>',
          eos_token='<eos>',
          lower=True)

In [0]:
train_data,valid_data,test_data=Multi30k.splits(exts=('.de','.en'),fields=(src,trg))


In [0]:
print(vars(train_data.examples[1]))

{'src': ['mehrere', 'männer', 'mit', 'schutzhelmen', 'bedienen', 'ein', 'antriebsradsystem', '.'], 'trg': ['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']}


In [0]:
##Tokens apprearing less than 2 times
src.build_vocab(train_data,min_freq=2)
trg.build_vocab(train_data,min_freq=2)

In [0]:
src.vocab.stoi['<eos>']

3

In [0]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [0]:
BATCH_SIZE=128

train_iterator,valid_iterator,test_iterator=BucketIterator.splits((train_data,valid_data,test_data),batch_size=BATCH_SIZE,device=device)

In [0]:
class StackedGRUEncoder(nn.Module):

  def __init__(self,input_dim,emb_dim,enc_hid_size=512,dropout=0.3,n_layers=1,bidirectional=False):
    super(StackedGRUEncoder,self).__init__()
    self.input_dim=input_dim
    self.emb_dim=emb_dim

    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.dropout = nn.Dropout(dropout)
    ##self.src_word_emb=src_emb
    self.emb_dim=emb_dim
    self.enc_hid_dim=enc_hid_size
    
    self.bigru=nn.GRU(emb_dim,hidden_size=self.enc_hid_dim,
                      num_layers=n_layers,bias=True,
                      dropout=dropout,bidirectional=bidirectional)
    
  def forward(self,src):
    
    k=self.embedding(src)
    
    embedded = self.dropout(k)
    ##if(src.dim()==3):
      ##xs_e=xs
    ##else: 
    xs_e=embedded
    ##self.bigru.flatten_parameters()
    outputs,hidden=self.bigru(xs_e)
    
    return hidden
    ##*xs_mask[:,:,None]



In [0]:
class Encoder(nn.Module):
    
    #Bidirectional 
    def __init__(self, input_dim, emb_dim, enc_hid_dim,dec_hid_dim, dropout=0.3,bidirectional=True):
        super().__init__()
        
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim=dec_hid_dim
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim,bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout)
        self.fc=nn.Linear(enc_hid_dim*2,dec_hid_dim)
    
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        
        outputs, hidden = self.rnn(embedded) #no cell state!
        hidden=torch.tanh(self.fc(torch.cat((hidden[-1,:,:],hidden[-2,:,:]),dim=1)))

        return hidden

In [0]:
class Attention(nn.Module):
  def __init__(self,enc_hid_dim,dec_hid_dim):
    super().__init__()
    self.attn=nn.Linear((enc_hid_dim*2)+dec_hidden_state,dec_hid_dim)
    self.v=nn.Parameter(torch.rand(dec_hid_dim))


  def forward(self,hidden,encoder_outputs):

    batch_size=encoder_outputs.shape[1]
    src_len=encoder_outputs.shape[0]

    #Hidden here is the prev hidden state of decoder
    hidden=hidden.unsqueeze(1)
    hidden=hidden.repeat(1,src_len,1)

    encoder_outputs=encoder_output.permute(1,0,2)

    e=torch.tanh(self.attn(torch.cat((hidden,encoder_outputs),dim=2)))
    #energy matrix = [batch size, src len, dec hid dim]
    e=e.permute(0,2,1)
    #energy matrix = [batch size, dec hid dim, src len]
    
    v=self.v.repeat(batch_size,1).unsqueeze(1)
    #v=[batch_size,1,dec_hid_dim]

    attention=torch.bmm(v,energy).squeeze(1)
    #bmm is batch matrix multiply

    return F.softmax(attention,dim=1)

In [0]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)
        self.fc_out = nn.Linear(emb_dim + hid_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, context):
        
        input = input.unsqueeze(0)
        
        embedded = self.dropout(self.embedding(input))
        
        
        
        ##embedded=embedded.repeat(4,1,1)
        emb_con = torch.cat((embedded, context), dim = 2)
            
        output, hidden = self.rnn(emb_con, hidden)
        
        output = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), 
                           dim = 1)
       
        prediction = self.fc_out(output)
        
        return prediction, hidden

In [0]:
class Seq2Seq(nn.Module):
  def __init__(self,encoder,decoder,device):
    super().__init__()

    self.encoder=encoder
    self.decoder=decoder
    self.device=device


  def forward(self, src,trg,teacher_forcing_ratio=0.5):
    
    batch_size=src.shape[1]
    trg_len=trg.shape[0]
    trg_vocab_size=self.decoder.output_dim


    outputs=torch.zeros(trg_len,batch_size,trg_vocab_size).to(self.device)
    context=self.encoder(src)

    hidden=context

    input=trg[0,:]

    for t in range(1,trg_len):

      output,hidden=self.decoder(input,hidden,context)
      outputs[t]=output

      top1=output.argmax(1)

      if(random.random()<teacher_forcing_ratio):
        input=trg[t]
      else:
        input=top1

    return outputs  

In [0]:
Input_dim=len(src.vocab)
Output_dim=len(trg.vocab)

ENC_EMB_DIM=256
DEC_EMB_DIM=256
HID_DIM=512
ENC_DROPOUT=0.5
DEC_DROPOUT=0.5

enc=StackedGRUEncoder(Input_dim,ENC_EMB_DIM,HID_DIM,ENC_DROPOUT)

dec=Decoder(Output_dim,DEC_EMB_DIM,512,DEC_DROPOUT)

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model=Seq2Seq(enc,dec,device).to(device)


  "num_layers={}".format(dropout, num_layers))


In [0]:
def init_weights(m):
  for name, param in m.named_parameters():
    nn.init.normal_(param.data,mean=0,std=0.01)

model.apply(init_weights)


Seq2Seq(
  (encoder): StackedGRUEncoder(
    (embedding): Embedding(4, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (bigru): GRU(256, 512, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(4, 256)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=4, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [0]:
optimizer=optim.Adam(model.parameters())


In [0]:
for i, batch in enumerate(train_iterator):
  ##src=batch.src
  ##trg=batch.trg
  print(batch)
  break


[torchtext.data.batch.Batch of size 128 from MULTI30K]
	[.src]:[torch.cuda.LongTensor of size 31x128 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 29x128 (GPU 0)]


In [0]:
TRG_PAD_IDX=trg.vocab.stoi[trg.pad_token]
criterion=nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [0]:
def train(model,iterator,optimizer,criterion,clip):
  model.train()

  epoch_loss=0

  for i, batch in enumerate(iterator):
    src=batch.src
    trg=batch.trg

    optimizer.zero_grad()
    output=model(src,trg)

    output_dim=output.shape[-1]

    output=output[1:].view(-1,output_dim)
    trg=trg[1:].view(-1)

    loss=criterion(output,trg)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(),clip)
    optimizer.step()

    epoch_loss+=loss.item()

  return epoch_loss/len(iterator)

In [0]:
def evaluate(model,iterator,criterion):
  
  model.eval()

  epoch_loss=0

  with torch.no_grad():
    for i, batch in enumerate(iterator):
      src=batch.src
      trg=batch.trg

      
      output=model(src,trg,0)

      output_dim=output.shape[-1]

      output=output[1:].view(-1,output_dim)
      trg=trg[1:].view(-1)

      loss=criterion(output,trg)
      
      epoch_loss+=loss.item()

  return epoch_loss/len(iterator)

In [0]:
N_EPOCHS=10
CLIP=1

best_valid_loss=float('inf')

for epoch in range(N_EPOCHS):

  train_loss=train(model,train_iterator,optimizer,criterion,CLIP)
  valid_loss=evaluate(model,valid_iterator,criterion)

  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss

  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

	Train Loss: 0.223 | Train PPL:   1.250
	 Val. Loss: 0.332 |  Val. PPL:   1.394
	Train Loss: 0.176 | Train PPL:   1.193
	 Val. Loss: 0.222 |  Val. PPL:   1.248
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 0.150 |  Val. PPL:   1.162
	Train Loss: 0.153 | Train PPL:   1.165
	 Val. Loss: 0.148 |  Val. PPL:   1.159
	Train Loss: 0.151 | Train PPL:   1.163
	 Val. Loss: 0.146 |  Val. PPL:   1.157
	Train Loss: 0.150 | Train PPL:   1.162
	 Val. Loss: 0.147 |  Val. PPL:   1.159
	Train Loss: 0.150 | Train PPL:   1.162
	 Val. Loss: 0.151 |  Val. PPL:   1.164
	Train Loss: 0.150 | Train PPL:   1.162
	 Val. Loss: 0.146 |  Val. PPL:   1.157
	Train Loss: 0.150 | Train PPL:   1.162
	 Val. Loss: 0.147 |  Val. PPL:   1.159
	Train Loss: 0.149 | Train PPL:   1.161
	 Val. Loss: 0.155 |  Val. PPL:   1.168
