In [1]:
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [2]:
# Start of Sentence and End of Sentence Token
SOS_token = 0
EOS_token = 1

In [3]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2
        
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)
            
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [9]:
def normalizeString(s):
    
    s = s.lower().strip()
    
    s = ''.join(
        char for char in unicodedata.normalize('NFD', s)
        if unicodedata.category(char) != "Mn")
    
    s = re.sub(r"([.!?])",r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    
    return s

In [10]:
def readLangs(lang1, lang2, reverse=False):
    
    print("Reading lines...")
    
    lines = open('datasets/data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
                read().strip().split('\n')
    
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    
    if reverse:
        pair = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
        
    return input_lang, output_lang, pairs

In [15]:
# http://www.manythings.org/anki/

In [12]:
MAX_LENGTH = 10

In [13]:
eng_prefixes = ("I am ", "I m", "he is", "he s ", "she is", "she s ", "you are", "we are", "we re ", "they are", "they re ")

In [38]:
def filterPairs(pairs):
#     return [p for p in pairs
#            if
#            len(p[0].split(' ')) < MAX_LENGTH and
#            len(p[1].split(' ')) < MAX_LENGTH and
    return [[p[0], p[1]] for p in pairs
           if
           len(p[0].split(' ')) < MAX_LENGTH and
           len(p[1].split(' ')) < MAX_LENGTH]

In [39]:
def prepareData(lang1, lang2, reverse=False):
    
    input_lang, output_lang, pairs = readLangs(lang1,lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
        
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    
    return input_lang, output_lang, pairs

In [40]:
input_lang, output_lang, pairs = prepareData('deu', 'eng', False)

print(random.choice(pairs))

Reading lines...
Read 208486 sentence pairs
Trimmed to 158319 sentence pairs
Counted words:
deu 12996
eng 25815
['he came a little after noon .', 'er kam kurz nach mittag .']


In [42]:
class EncoderRNN(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        
        #We are feeding this hidden layer to the decoder
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, input, hidden):
        
        # Changing the dimension for the GRU cell from [1, hidden_size] --> [1, 1, hidden_size]
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        
        output, hidden = self.gru(output, hidden)
        
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

In [51]:
class DecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        
        self.gru = nn.GRU(hidden_size, hidden_size)
        
        self.out = nn.Linear(hidden_size, output_size)
        
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        
        output, hidden = self.gru(output, hidden)
        
        # The output must be reshaped so it can be fed into the the Linear network [1, 1, hidden_size] --> [1, hidden_size]
        output = self.softmax(self.out(output[0]))
        
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

In [44]:
def tensorFromSentence(lang, sentence):
    indexes = [lang.word2index[word] for word in sentence.split(' ')]
    
    indexes.append(EOS_token)
    # Sentence size is converted to is [sentence_length, 1] tensor
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)

In [45]:
def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [46]:
teaching_forcing_ratio = 0.5

In [47]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    encoder_hidden = encoder.initHidden()
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    loss = 0
    
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
    
    decoder_input = torch.tensor([[SOS_token]])
    
    decoder_hidden = encoder_hidden
    
    user_teacher_forcing = True if random.random() < teaching_forcing_ratio else False
    
    if user_teacher_forcing:
        
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_input)
            
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]
    
    else:
        
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            
            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break
            
    loss.backward()
    
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.item() / target_length
    

In [48]:
plot_losses = []
print_loss_total = 0
plot_loss_total = 0

In [49]:
hidden_size = 256

In [52]:
encoder1 = EncoderRNN(input_lang.n_words, hidden_size)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words)

In [53]:
encoder_optimizer = optim.SGD(encoder1.parameters(), lr=0.01)
decoder_optimizer = optim.SGD(decoder1.parameters(), lr=0.01)

training_pairs = [tensorsFromPair(random.choice(pairs))
                 for i in range(30000)]

criterion = nn.NLLLoss()

In [55]:
for iter in range(1, 30001):
    
    training_pair = training_pairs[iter -1]
    input_tensor = training_pair[0]
    target_tensor = training_pair[1]
    
    loss = train(input_tensor, target_tensor, encoder1, decoder1, encoder_optimizer, decoder_optimizer, criterion)
    
    print_loss_total += loss
    plot_loss_total += loss
    
    if iter % 1000 == 0:
        print_loss_avg = print_loss_total / 100
        print_loss_total = 0
        print('iteration -d %d loss - %.4f' % (iter, print_loss_avg))
        
    if iter % 100 == 0:
        plot_loss_avg = plot_loss_total / 100
        plot_losses.append(plot_loss_avg)
        plot_loss_total = 0
        
fig, ax = plt.subplots(figsize=(15,8))
loc = ticker.MultipleLocator(base=0.2)
ax.yaxis.set_major_locator(loc)
plt.plot(plot_losses)

RuntimeError: Expected hidden size (1, 1, 256), got (1, 1)

In [None]:
def evaluate (encoder, decoder, sentence):
    
    with torch.no_grade():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size(0)
        
        encoder_hidden = encoder.initHidden()
        
        for ei in range(input_length):
            encoder_oput, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            
        decoder_input = torch.tensor([[SOS_token]])
        decoder_hidden = encoder_hidden
        
        decoded_words = []
        
        for di in range(MAX_LENGTH):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            
            topv, topi = decoder_output.data.topk(1)
            
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])
                
            decoder_input = topi.squeeze().detach()
            
        return decoded_words

In [None]:
for i in range(10):
    
    pair = random.choice(pairs)
    
    print('>', pair[0])
    print('=', pair[1])
    
    output_words = evaluate(encoder1, decoder1, pair[0])
    output_sentence = ' '.join(output_words)
    
    print('<', output_sentence)
    print('')

In [None]:
input_sentence = 'es tut mir sehr leid'

In [None]:
output_words = evaluate(encoder1, decoder1, input_sentence)

In [None]:
print('input =', input_sentece)
print('output =', ' '.join(output_words))