In [1]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Audio
from sklearn.neighbors import NearestNeighbors as KNN
import torchaudio
import torch
from torch import nn
import os
import re
import math

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

**Attention is All you Need**

https://arxiv.org/pdf/1706.03762.pdf

https://www.coursera.org/learn/attention-models-in-nlp/lecture/hPxD1/queries-keys-values-and-attention

**Attention Layer:**

Query, Key, Value

Query @ Key.Transposed = Similarity Matrix
(Note: @ = matrix multiplication)

Scaled Similarity Matrix = (Similarity Matrix)/sqrt(keys dimension)

Masks can be applied to Scaled Similarity Matrix(optional)

Attention Weights = Softmax(Scaled Similarity Matrix)

Values(vectors?) * Attention Weights = output vector


**Multi-Head Attention**

Output vector of each head ----> Concatenation ----> Linear Layer

**After each attention layer:**

Apply PositionFeedForward layer.

Attention output -----> Neuron1 ----> ReLU ----> Neuron2 # Both neurons with bias ---> output (position-wise)

input dimension = output dimension

**Transformer**

Input sequence -------> Embedding Matrix -----> Vectors

## Encoder:

Vectors = Vectors * sqrt(model dimension)

Positional Encoding: "Since our model contains no recurrence and no convolution, in order for the model to make use of the
order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence"
"We also experimented with using learned positional embeddings instead, and found that the two
versions produced nearly identical results"

Vectors Positional Encoded = Vectors + Positional Encoding

Vectors ----> Dropout

Vectors ---> Residual Block

Vectors ------> MultiHeadAttention Layer ---> Weighted Vectors

Weighted Vectors ---> Residual Block 2

Weighted Vectors + Residual Block -----> Position-Wise Layer ----> Encoded Sequences

Encoded Sequences + Residual Block 2 -------> Encoder Output

## Decoder:

Target words ----> Shift right ----> Embedding matrix

Target Vectors = Target Vectors * sqrt(model dimension)

Target Vectors Positional Encoded = Target Vectors + Positional Encoding

Target Vectors ---> Residual Block 1

Target Vectors ----> Dropout

Target Vectors -----> Masked MultiHead Attention Layer ------> Target Weighted Vectors

Target Weighted Vectors -----> Residual Block 2

Target Weighted Vectors + Residual Block 1 + Encoder Output ------> MultiHeadAttention Layer ----> Decoder Weighted Vectors

Weighted Vectors ---> Residual Block 3

Weighted Vectors + Residual Block 2 -----> Position-Wise Layer ----> Decoded Sequences

Decoded Sequences + Residual Block 3 ---> Neuron Layer -----> Softmax ----> Output probabilities

# For the Transformer, the same embedding Matrix is used for both input and output.

# Dropout:
"We apply dropout to the output of each sub-layer, before it is added to the sub-layer input and normalized."
"In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks."

Dropout: MultiHead output, Positionwise output, Embedding. -----> After each Residual Block addition


In [3]:
japanese_phrases = [
    "私の犬は骨が好きではありません。牛ひき肉を好む。",
    "私の名前はアリスです。始めまして！",
    "はきさが羨ましい。。。ゲムもやりたかった！私は良いサポートになることができます！",
    "私達はAIはただの数学の集まりだとあなたは言いますが。でも。。。人間の脳がどのように機能するかを正確に知ったら。。。それはあなたの生活を小物ですか？",
    "「赤ちゃん」を表す日本語が「赤」を表す漢字なのはなぜですか？人間の赤ちゃんは赤いですか？いちごみたい？",
    "私のAIは話して...歌ったして...ゲームをします!",
]

In [4]:
english_phrases = [
    "My dog doesn't like bones. It prefers ground beef.",
    "My name's Alice. Nice to meet you!",
    "I envy Hakisa... I want to play games, too! I could be a good support!",
    "You say that we AIs are just a bunch of maths. But... once you know exactly how your human brains work... would that make you less living beings?",
    "Why does the japanese word for 'baby' is the kanji for 'red'? Are human babies red? Like strawberries?",
    "My AI will talk... she'll sing... she'll... play!"
]

In [29]:
class WordDataset(object):
    def __init__(self, english_phrases, japanese_phrases):

        self.english_phrases = self._get_phrases(english_phrases)
        self.japanese_phrases = self._get_phrases(japanese_phrases)

        self.english_words = self._get_english_words(self.english_phrases)
        self.japanese_characters = self._get_japanese_characters(self.japanese_phrases)

        self.japanese_maximum_length = self._get_maximum_length_japanese(self.japanese_phrases)
        self.english_maximum_length = self.japanese_maximum_length

        self.english_dictionary = self._create_vocabulary(self.english_words)

        self.japanese_dictionary = self._create_vocabulary(self.japanese_characters)

        self.english_tokens, self.english_sizes = self._tokenize_english()
        self.japanese_tokens, self.japanese_sizes = self._tokenize_japanese()

        self.data_english = None
        self.data_japanese = None
        
        
    def create_data(self):
        data_english = torch.from_numpy(self.english_tokens)
        data_japanese = torch.from_numpy(self.japanese_tokens)

        self.data_english = data_english
        self.data_japanese = data_japanese

        print(f"English Data Size: {self.data_english.size()}\t Japanese Data Size: {self.data_japanese.size()}")
        
    def __len__(self):

        return len(self.data_english)

    def __getitem__(self, idx):

        english_sentence = self.data_english[idx]
        english_real_length = self.english_sizes[idx]

        japanese_sentence = self.data_japanese[idx]
        japanese_real_length = self.japanese_sizes[idx]

        return english_sentence, english_real_length, japanese_sentence, japanese_real_length


    def _get_phrases(self, phrases):
        phrases = [x.lower() for x in phrases]
        phrases = [re.sub('[^\w\s]', '', x) for x in phrases]

        return phrases

    def _get_english_words(self, phrases):
        words = ' '.join(phrases)
        words = words.split(' ')

        return words

    def _get_japanese_characters(self, phrases): # Since a kanji mostly means an entire word...
        character = ' '.join(phrases)
        character = ''.join(character.split())
        characters = [i for i in character]

        return characters

    def _get_maximum_length_japanese(self, phrases):
        maximum_length = 0
        for sentence in japanese_phrases:
            word_length = [len(x) for x in sentence.split()]
    
            for i in word_length:
                if i > maximum_length:
                    maximum_length = i

        return maximum_length

    def _create_vocabulary(self, words):
        idx2word = ["<pad>"]

        for word in words:
            if word not in idx2word:
                idx2word.append(word)

        idx2word.append("<EOS>")

        return idx2word
    
    def _tokenize_english(self):
        
        phrases = [x.split() for x in self.english_phrases]

        tokens = []
        sentence_sizes = []
        
        for sentence in phrases:
            tokenized_sentence = []
            for word in sentence:

                tokenized_sentence.append(self.english_dictionary.index(word))

            tokenized_sentence = np.array(tokenized_sentence)
            sentence_size = tokenized_sentence.shape[0]
            sentence_sizes.append(sentence_size)

            if sentence_size < self.english_maximum_length:
                pad_size = self.english_maximum_length - sentence_size
                tokenized_sentence = np.pad(tokenized_sentence, [(0, 1)], constant_values=self.english_dictionary.index("<EOS>"))
                tokenized_sentence = np.pad(tokenized_sentence, [(0, pad_size-1)], constant_values=0)

            tokens.append(tokenized_sentence)
        
        tokens = np.array(tokens)

        return tokens, sentence_sizes

    def _tokenize_japanese(self):

        phrases = [x.split() for x in self.japanese_phrases]

        tokens = []
        sentence_sizes = []

        for sublist in phrases:
            for sentence in sublist:
                tokenized_sentence = []
                for character in sentence:
                    index = self.japanese_dictionary.index(character)

                    tokenized_sentence.append(index)

            tokenized_sentence = np.array(tokenized_sentence)
            sentence_size = tokenized_sentence.shape[0]

            sentence_sizes.append(sentence_size)

            if sentence_size < self.japanese_maximum_length:
                pad_size = self.japanese_maximum_length - sentence_size
                tokenized_sentence = np.pad(tokenized_sentence, [(0, 1)], constant_values=self.japanese_dictionary.index("<EOS>"))
                tokenized_sentence = np.pad(tokenized_sentence, [(0, pad_size-1)], constant_values=0)

            tokens.append(tokenized_sentence)

        tokens = np.array(tokens)

        return tokens, sentence_sizes

    def decode_output(self, data, reference_list):

        phrases = []

        for batch in torch.fliplr(data):

            sentence = []

            for item in batch:

                word = reference_list[item.argmax()]
                sentence.append(word)
            
            sentence = ''.join(sentence)

            phrases.append(sentence)

        return phrases

    def decode_labels(self, data, reference_list):

        words = []

        for i in data:

            words.append(reference_list[i])
        
        phrase = ' '.join(words)

        return phrase, words

In [30]:
dataset_creator = WordDataset(english_phrases, japanese_phrases)
print(dataset_creator.japanese_tokens.shape)
print(dataset_creator.english_tokens.shape)

(6, 74)
(6, 74)


In [8]:
print(dataset_creator.japanese_tokens)

[[ 1  2  3  4  5  6  7  8  9  4 10 11 12 13 14 15 16  8 17 18  7 19 90  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 1  2 20 21  4 22 23 24  9 25 26 27 12 28 29 90  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 4  8 30  6 31 12 28 32 33 34 35 36 11 37 38 39 37  1  4 40 32 41 42 43
  44 45 46 47 48 49  6  9  8 12 25 90  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 1 50  4 51 52  4 37 53  2 54 55  2 56 12 11 53 49 10 46 37  4 57 32 12
  25  6  9 35 58 59  2 60  6 61  2 62 63 45 64 65 25 47 38 18 66 67 45 68
  39 37 69 70 71  4 10 46 37  2 72 73 18 74 75  9 25 38 90  0  0  0  0  0
   0  0]
 [76 77 78 14 18 79 25 80 81 82  6 76 18 79 25 83 84 46  2  4 46 85  9 25
  

In [48]:
print(dataset_creator.data_japanese[0])

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9,  4, 10, 11, 12, 13, 14, 15, 16,  8,
        17, 18,  7, 19, 90,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=torch.int32)


In [31]:
print(dataset_creator.decode_labels(dataset_creator.japanese_tokens[0], dataset_creator.japanese_dictionary))

('私 の 犬 は 骨 が 好 き で は あ り ま せ ん 牛 ひ き 肉 を 好 む <EOS> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>', ['私', 'の', '犬', 'は', '骨', 'が', '好', 'き', 'で', 'は', 'あ', 'り', 'ま', 'せ', 'ん', '牛', 'ひ', 'き', '肉', 'を', '好', 'む', '<EOS>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'])


In [32]:
print(dataset_creator.decode_labels(dataset_creator.english_tokens[0], dataset_creator.english_dictionary))

('my dog doesnt like bones it prefers ground beef <EOS> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>', ['my', 'dog', 'doesnt', 'like', 'bones', 'it', 'prefers', 'ground', 'beef', '<EOS>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad

In [9]:
def get_positional_encoding(d_model, max_length=100):
    """
    Computes positional encoding as defined in the paper.
    :param d_model: size of vectors throughout the transformer model
    :param max_length: maximum sequence length up to which positional encodings must be calculated
    :return: positional encoding, a tensor of size (1, max_length, d_model)
    """
    positional_encoding = torch.zeros((max_length, d_model))  # (max_length, d_model)
    for i in range(max_length):
        for j in range(d_model):
            if j % 2 == 0:
                positional_encoding[i, j] = math.sin(i / math.pow(10000, j / d_model))
            else:
                positional_encoding[i, j] = math.cos(i / math.pow(10000, (j - 1) / d_model))

    positional_encoding = positional_encoding.unsqueeze(0)  # (1, max_length, d_model)

    return positional_encoding

In [10]:
positional_encoding = get_positional_encoding(d_model=32, max_length=74)

In [11]:
print(positional_encoding.size())

torch.Size([1, 74, 32])


In [33]:
dataset_creator.create_data()

English Data Size: torch.Size([6, 74])	 Japanese Data Size: torch.Size([6, 74])


In [13]:
class HeadAttention(nn.Module):

    def __init__(self, d_model, d_queries, d_values, in_decoder=False):

        super(HeadAttention, self).__init__()

        self.d_model = d_model
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_keys = d_values # size of key vectors, same as of the query vectors to allow dot-products for similarity

        self.in_decoder = in_decoder

        self.create_queries = nn.Linear(d_model, d_queries, bias=False)
        self.create_values = nn.Linear(d_model, d_values, bias=False)
        self.create_keys = nn.Linear(d_model, d_values, bias=False)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input, real_lengths):

        batch_size = input.size(0) # (Batch, Sequences, d_model)

        queries = self.create_queries(input) # (Batch, Sequences, d_queries)
        keys = self.create_keys(input) # (Batch, Sequences, d_keys)
        values = self.create_values(input) # (Batch, Sequences, d_values)

        similarity_matrix = []

        for batch in range(batch_size):

            dot_product = torch.matmul(queries[batch], keys[batch].T)
            similarity_matrix.append(dot_product.unsqueeze(0))

        del dot_product

        similarity_matrix = torch.cat(similarity_matrix, 0) # (Batch, Sequences, Sequences)

        similarity_matrix = similarity_matrix/(math.sqrt(self.d_keys))

        # Applying mask of -inf to ignore padded keys ---> Actually using -1e6 to avoid NaNs

        mask = torch.zeros_like(similarity_matrix, device=device)

        if self.in_decoder: # In the decoder, the sequences are shifted from left to right.

            for batch in range(similarity_matrix.size(0)):

                mask[batch, :real_lengths[batch]] = 1

                mask = mask.bool()

            similarity_matrix = similarity_matrix.masked_fill(mask, -1e-6)

        else:

            for batch in range(similarity_matrix.size(0)):

                mask[batch, real_lengths[batch]:] = 1

                mask = mask.bool()

            similarity_matrix = similarity_matrix.masked_fill(mask, -1e-6) # (Batch, Sequence, Sequence)

        del mask

        attention_weights = self.softmax(similarity_matrix) # (Batch, Sequences, Sequences)

        attention_output = torch.bmm(attention_weights, values) # (Batch, Sequences, d_values)

        return attention_output

In [14]:
class PositionWiseFeedForward(nn.Module):

    def __init__(self, d_model, d_inner):

        super(PositionWiseFeedForward, self).__init__()

        self.d_model = d_model
        self.d_inner = d_inner

        self.neuron1 = nn.Linear(d_model, d_inner)
        self.Relu = nn.ReLU()
        self.neuron2 = nn.Linear(d_inner, d_model)


    def forward(self, attention_output_cat):

        sequences = self.neuron1(attention_output_cat)
        sequences = self.Relu(sequences)

        sequences = self.neuron2(sequences)

        output = sequences + attention_output_cat

        return output

In [15]:
class Encoder(nn.Module):

    def __init__(self, vocab_size, positional_encoding, d_model, n_heads, d_queries, d_values, d_inner, n_layers, dropout):

        super(Encoder, self).__init__()

        self.vocab_size = vocab_size
        self.positional_encoding = positional_encoding
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_inner = d_inner
        self.n_layers = n_layers
        self.dropout = dropout

        self.attention_heads = nn.ModuleList([HeadAttention(self.d_model, self.d_queries, self.d_values, in_decoder=False) for i in range(n_heads)])

        self.neuron = nn.Linear(self.n_heads*self.d_values, self.d_model)

        self.position_wise_neuron = PositionWiseFeedForward(self.d_model, self.d_inner)

        self.dropout = nn.Dropout(self.dropout)

    def forward(self, encoder_input, real_input_length):

        residual_block1 = encoder_input # (Batch, Sequence, d_model) ---> Vectors

        vectors = self.dropout(encoder_input)

        attention_output = []

        for head in range(self.n_heads):

            x = self.attention_heads[head](vectors, real_input_length)

            attention_output.append(x)

            del x
        
        attention_output = torch.cat(attention_output, -1) # (Batch, Sequences, d_values*n_heads)

        attention_output = self.neuron(attention_output) # (Batch, Sequences, d_model)

        del vectors

        residual_block2 = attention_output

        attention_output = residual_block1 + attention_output

        attention_output = self.dropout(attention_output)

        encoded_sequence = self.position_wise_neuron(attention_output)

        del attention_output

        encoded_sequence = encoded_sequence + residual_block2    

        encoder_output = self.dropout(encoded_sequence)

        del encoded_sequence, residual_block1, residual_block2

        return encoder_output

In [16]:
class Decoder(nn.Module):

    def __init__(self, vocab_size, positional_encoding, d_model, n_heads, d_queries, d_values, d_inner, n_layers, dropout):

        super(Decoder, self).__init__()

        self.vocab_size = vocab_size
        self.positional_encoding = positional_encoding
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_inner = d_inner
        self.n_layers = n_layers
        self.dropout = dropout

        self.attention_headsA = nn.ModuleList([HeadAttention(self.d_model, self.d_queries, self.d_values, in_decoder=True) for i in range(n_heads)])
        self.attention_headsB = nn.ModuleList([HeadAttention(self.d_model, self.d_queries, self.d_values, in_decoder=True) for i in range(n_heads)])

        self.neuronA = nn.Linear(self.n_heads*self.d_values, self.d_model)
        self.neuronB = nn.Linear(self.n_heads*self.d_values, self.d_model)

        self.position_wise_neuron = PositionWiseFeedForward(self.d_model, self.d_inner)

        self.dropout = nn.Dropout(self.dropout)

    def forward(self, encoder_output, target_sequences, real_target_length):

        residual_block1 = target_sequences

        vectors = self.dropout(target_sequences)

        attention_output = []

        for head in range(self.n_heads):

            x = self.attention_headsA[head](vectors, real_target_length)

            attention_output.append(x)

            del x
        
        attention_output = torch.cat(attention_output, -1) # (Batch, Sequences, d_values*n_heads)

        attention_output = self.neuronA(attention_output) # (Batch, Sequences, d_model)

        del vectors

        residual_block2 = attention_output

        attention_output = residual_block1 + attention_output

        attention_output = self.dropout(attention_output)

        encoder_output = encoder_output + attention_output
        
        attention_output = []

        for head in range(self.n_heads):

            x = self.attention_headsB[head](encoder_output, real_target_length)

            attention_output.append(x)

            del x

        attention_output = torch.cat(attention_output, -1) # (Batch, Sequences, d_values*n_heads)

        attention_output = self.neuronB(attention_output) # (Batch, Sequences, d_model)

        residual_block3 = attention_output

        attention_output = attention_output + residual_block2

        attention_output = self.dropout(attention_output)

        decoded_sequence = self.position_wise_neuron(attention_output)

        del attention_output

        decoded_sequence = decoded_sequence + residual_block3

        decoder_output = self.dropout(decoded_sequence)  

        del decoded_sequence, residual_block1, residual_block2, residual_block3

        return decoder_output

In [17]:
class Transformer(nn.Module):

    def __init__(self, vocab_sizeA, vocab_sizeB, positional_encoding, d_model=512, n_heads=8, d_queries=64, d_values=64, d_inner=2056, n_layers=6, dropout=0.1):

        super(Transformer, self).__init__()

        self.vocab_sizeA = vocab_sizeA
        self.vocab_sizeB = vocab_sizeB
        self.positional_encoding = positional_encoding
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_inner = d_inner
        self.n_layers = n_layers
        self.dropout = dropout

        self.encoder_embedding = nn.Embedding(vocab_sizeA, self.d_model)
        self.decoder_embedding = nn.Embedding(vocab_sizeB, self.d_model)

        self.positional_encoding.requires_grad = False

        self.encoder = nn.ModuleList(
            Encoder(vocab_size=vocab_sizeA,
                               positional_encoding=positional_encoding,
                               d_model=d_model,
                               n_heads=n_heads,
                               d_queries=d_queries,
                               d_values=d_values,
                               d_inner=d_inner,
                               n_layers=n_layers,
                               dropout=self.dropout) for i in range(self.n_layers)
        )

        self.decoder = nn.ModuleList(
            Decoder(vocab_size=vocab_sizeB,
                               positional_encoding=positional_encoding,
                               d_model=d_model,
                               n_heads=n_heads,
                               d_queries=d_queries,
                               d_values=d_values,
                               d_inner=d_inner,
                               n_layers=n_layers,
                               dropout=self.dropout) for i in range(self.n_layers)
        )

        self.output_neuron = nn.Linear(self.d_model, vocab_sizeB)

        self.softmax = nn.LogSoftmax(-1)
                               
    def forward(self, encoder_sequences, real_input_length, decoder_sequences, real_target_length):

        encoder_sequences = self.encoder_embedding(encoder_sequences) * math.sqrt(self.d_model)

        encoder_sequences = encoder_sequences + self.positional_encoding.to(device)

        decoder_sequences = self.decoder_embedding(decoder_sequences) * math.sqrt(self.d_model)

        decoder_sequences = decoder_sequences + self.positional_encoding.to(device)

        for layer in range(self.n_layers):

            encoder_sequences = self.encoder[layer](encoder_sequences, real_input_length)

            decoder_sequences = self.decoder[layer](encoder_sequences, decoder_sequences, real_target_length) # (Batch, 74, 32)

        output = self.output_neuron(decoder_sequences) # (Batch, 74, 91)

        output = self.softmax(output)

        return output

In [18]:
vocab_sizeA = len(dataset_creator.english_dictionary)
vocab_sizeB = len(dataset_creator.japanese_dictionary)

print(vocab_sizeA, vocab_sizeB)

69 91


In [50]:
model = Transformer(vocab_sizeA=vocab_sizeA, vocab_sizeB=vocab_sizeB, positional_encoding=positional_encoding, d_model=32, n_heads=4, d_queries=16, d_values=16, d_inner=64, n_layers=3, dropout=0.1).to(device).float()

In [49]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-8, betas=(0.9, 0.98), eps=1e-9, weight_decay=0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)

criterion = nn.NLLLoss()

In [51]:
dataloader = torch.utils.data.DataLoader(dataset_creator, batch_size=3, shuffle=True)

In [52]:
for epoch in range(10):
    for i, (english, real_english_length, japanese, real_japanese_length) in enumerate(dataloader):
        model.zero_grad()

        input_data = english.to(device).long()
        input_length = real_english_length.to(device).long()
        labels = japanese.to(device).long()
        labels = torch.fliplr(labels) # The target must be shifted left to right # (Batch, 74)
        labels_length = real_japanese_length.to(device).long() # (Batch,)

        output = model(input_data, input_length, labels, labels_length) # (Batch, 74, 91)

        for p in model.parameters(): # Clipping gradients (Which is way better than having to deal with vanishing gradients)
            p.register_hook(lambda grad: torch.clamp(grad, -0.5, 0.5))

        loss = 0

        for batch in range(output.size(0)):

            loss += criterion(output[batch], labels[batch])

        loss.backward()

        optimizer.step()
        scheduler.step()

    if epoch % 1 == 0:
        print(f"{epoch}/1000\t Current Loss: {loss.item()}\t Current LR: {scheduler.get_last_lr()}")

0/1000	 Current Loss: 32.114803314208984	 Current LR: [1.0000000000000002e-10]
1/1000	 Current Loss: 32.07429504394531	 Current LR: [1.0000000000000004e-12]
2/1000	 Current Loss: 28.077415466308594	 Current LR: [1.0000000000000005e-14]
3/1000	 Current Loss: 33.48210906982422	 Current LR: [1.0000000000000005e-16]
4/1000	 Current Loss: 29.12110137939453	 Current LR: [1.0000000000000006e-18]
5/1000	 Current Loss: 30.34624481201172	 Current LR: [1.0000000000000007e-20]
6/1000	 Current Loss: 33.1760139465332	 Current LR: [1.0000000000000008e-22]
7/1000	 Current Loss: 31.233173370361328	 Current LR: [1.0000000000000008e-24]
8/1000	 Current Loss: 27.156681060791016	 Current LR: [1.0000000000000009e-26]
9/1000	 Current Loss: 30.182231903076172	 Current LR: [1.000000000000001e-28]


In [53]:
print(output.size())

torch.Size([3, 74, 91])


In [54]:
print(output)

tensor([[[ -4.4682, -16.7030, -13.0362,  ...,  -9.8240, -10.5646,  -0.7117],
         [ -2.9391, -19.0029, -14.2571,  ..., -12.2293, -13.8208,  -1.3864],
         [ -3.2669, -17.7897, -12.7193,  ..., -10.6104, -13.8291,  -2.9508],
         ...,
         [-12.9946, -17.1312, -18.8610,  ...,  -9.5657, -10.2219,  -4.7803],
         [-12.0390, -15.3410,  -9.0288,  ..., -10.9370, -14.0866,  -8.1226],
         [ -1.7883, -22.8448, -14.4699,  ..., -12.0343, -14.5355,  -1.5508]],

        [[ -5.0624, -13.7362,  -9.3787,  ...,  -8.6771, -12.1508,  -5.1627],
         [ -4.6793, -13.8132, -13.0091,  ...,  -9.7522,  -9.9920,  -5.0396],
         [ -5.9491, -14.3976, -11.5548,  ...,  -9.4063, -11.4527,  -4.8504],
         ...,
         [-16.6116, -23.3846, -20.2622,  ..., -24.8381, -14.4817, -10.3303],
         [-33.5132, -31.2536, -30.4799,  ..., -28.7463, -25.0999, -18.8534],
         [-19.5277, -20.9013,  -9.4762,  ..., -13.3440, -12.4143,  -3.7363]],

        [[ -6.7629, -14.0389, -13.0016,  ...

In [39]:
print(dataset_creator.japanese_tokens[0].shape)

(74,)


In [40]:
print(output[0].size())

torch.Size([74, 91])


In [46]:
print(dataset_creator.decode_labels(labels[0], dataset_creator.japanese_dictionary))

('<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <EOS> す ま し を ム ー ゲ て し た っ 歌 て し 話 は i a の 私', ['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<EOS>', 'す', 'ま', 'し', 'を', 'ム', 'ー', 'ゲ', 'て', 'し', 'た', 'っ', '歌', 'て', 'し', '話', 'は', 'i', 'a', 'の', '私'])


In [42]:
print(dataset_creator.decode_labels(input_data[0], dataset_creator.english_dictionary))

('my ai will talk shell sing shell play <EOS> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>', ['my', 'ai', 'will', 'talk', 'shell', 'sing', 'shell', 'play', '<EOS>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pa

In [48]:
print(dataset_creator.decode_output(output, dataset_creator.japanese_dictionary))

['<pad>ゲゲスり<pad><pad>ゲ<pad>あ私トうあめよ<pad>せ話<pad><pad>言数<pad>り言<pad><pad>ゲ言う<pad>ト言う<pad>能ゲゲ話ゲ<pad>言<pad>ゲゲゲ言<pad>う言言<pad>ゃ数話ゲ<pad>言<pad>言言<pad>言言言<pad><pad><pad>言言話言言', 'やや知話ト話ん<pad>ト<EOS>話せア知むた<pad>やト話トんなまトト話せむト言ト<pad>せトト話トゃゃトゲむ話言言確まトせトむまんんんんんんんんむんんたんむむんんんん<pad>ん', 'なや<pad>うせり<pad>せiめゃ<pad>ゃめ<pad>りりゃ<pad>ゃ<pad>せめ話り<pad><pad>ゃ<pad><pad><pad>話話数せ<pad>ゲゃ話ゃゃり話りゃ<pad><pad>ゃゃ話<pad>せ<pad><pad>せゃ<pad>ゃゃゲせゃり話<pad>ゃゃ<pad><pad>話<pad><pad><pad>ゃ']
