In [1]:
import numpy as np
import torch
from torch import nn
import re
import math

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [12]:
# Japanese side of the toy corpus (8 sentences).
# NOTE(review): entries appear to be paired by position with `english_phrases` — confirm before reordering.
japanese_phrases = [
    "私の犬は骨が好きではありません。牛ひき肉を好む。",
    "私の名前はアリスです。初めまして！",
    "はきさが羨ましい。。。ゲムもやりたかった！私は良いサポートになることができます！",
    "私達はAIはただの数学の集まりだとあなたは言いますが。でも。。。人間の脳がどのように機能するかを正確に知ったら。。。それはあなたの生活を小物ですか？",
    "「赤ちゃん」を表す日本語が「赤」を表す漢字なのはなぜですか？人間の赤ちゃんは赤いですか？いちごみたい？",
    "私のAIは話して...歌ったして...ゲームをします!",
    "上手医者と大きい研究者に勉強していたいます。",
    "でも、私の日本語が上手あまりませんね"
]

In [15]:
# English side of the toy corpus (8 sentences).
# NOTE(review): entries appear to be paired by position with `japanese_phrases` — confirm before reordering.
english_phrases = [
    "My dog doesn't like bones. It prefers ground beef.",
    "My name's Alice. Nice to meet you!",
    "I envy Hakisa... I want to play games, too! I could be a good support!",
    "You say that we AIs are just a bunch of maths. But... once you know exactly how your human brains work... would that make you less living beings?",
    "Why does the japanese word for 'baby' is the kanji for 'red'? Are human babies red? Like strawberries?",
    "My AI will talk... she'll sing... she'll... play!",
    "I shall study so I can be a good physician and a great scientist",
    "Though my japanese is not really good"
]

In [16]:
class WordDataset(object):
    """Normalises a small English/Japanese phrase corpus and builds its vocabularies.

    English is tokenised into words (split on single spaces); Japanese is
    tokenised character by character. Both vocabularies start with the same
    four special tokens: "<pad>", " ", "<SOS>", "<EOS>" (indices 0-3).

    Attributes set by the constructor:
        english_phrases / japanese_phrases: lower-cased, punctuation-free phrases
        english_words: every English word, in corpus order (with repeats)
        japanese_characters: every non-whitespace Japanese character, in corpus order
        english_maximum_length / japanese_maximum_length: see the two length helpers
        english_dictionary / japanese_dictionary: index -> token lists
    """

    # Raw string: '\w' and '\s' are regex classes, not Python string escapes.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    # Reserved tokens; their order fixes indices 0..3 of every vocabulary.
    _SPECIAL_TOKENS = ("<pad>", " ", "<SOS>", "<EOS>")

    def __init__(self, english_phrases, japanese_phrases):

        self.english_phrases = self._get_phrases(english_phrases)
        self.japanese_phrases = self._get_phrases(japanese_phrases)

        self.english_words = self._get_english_words(self.english_phrases)
        self.japanese_characters = self._get_japanese_characters(self.japanese_phrases)

        self.japanese_maximum_length = self._get_maximum_length_japanese(self.japanese_phrases)
        self.english_maximum_length = self._get_maximum_length_english(self.english_phrases)

        self.english_dictionary = self._create_vocabulary(self.english_words)
        self.japanese_dictionary = self._create_vocabulary(self.japanese_characters)

    def _get_phrases(self, phrases):
        """Lower-case each phrase and drop every character that is neither a word character nor whitespace."""
        return [self._PUNCTUATION.sub('', phrase.lower()) for phrase in phrases]

    def _get_english_words(self, phrases):
        """Return all words of all phrases as one flat list (split on single spaces)."""
        return ' '.join(phrases).split(' ')

    def _get_japanese_characters(self, phrases): # Since a kanji mostly means an entire word...
        """Return every non-whitespace character of all phrases as one flat list."""
        return list(''.join(' '.join(phrases).split()))

    def _get_maximum_length_japanese(self, phrases):
        """Return the length of the longest whitespace-delimited chunk in any phrase (0 if there is none)."""
        return max(
            (len(chunk) for sentence in phrases for chunk in sentence.split()),
            default=0,
        )

    def _get_maximum_length_english(self, phrases):
        """Return twice the character count of the longest phrase (0 for an empty corpus).

        NOTE(review): the original loop iterated over the *characters* of each
        sentence and added ``len(char) + 1`` for each, i.e. ``2 * len(sentence)``.
        That over-estimates the number of word/space tokens per sentence, but the
        value is deliberately preserved: it is used elsewhere in the notebook as
        the padded sequence length, where it only needs to be an upper bound.
        """
        return max((2 * len(sentence) for sentence in phrases), default=0)

    def _create_vocabulary(self, words):
        """Build an index -> token list: special tokens first, then tokens in first-seen order."""
        idx2word = list(self._SPECIAL_TOKENS)
        seen = set(idx2word)  # O(1) membership instead of scanning the list for every token

        for word in words:
            if word not in seen:
                seen.add(word)
                idx2word.append(word)

        return idx2word

In [17]:
# Build the normalised phrases, token lists, maximum lengths and vocabularies for both languages.
dataset = WordDataset(english_phrases, japanese_phrases)

In [18]:
# Inspect the phrases after lower-casing and punctuation removal.
print(dataset.english_phrases)
print(dataset.japanese_phrases)

['my dog doesnt like bones it prefers ground beef', 'my names alice nice to meet you', 'i envy hakisa i want to play games too i could be a good support', 'you say that we ais are just a bunch of maths but once you know exactly how your human brains work would that make you less living beings', 'why does the japanese word for baby is the kanji for red are human babies red like strawberries', 'my ai will talk shell sing shell play', 'i shall study so i can be a good physician and a great scientist', 'though my japanese is not really good']
['私の犬は骨が好きではありません牛ひき肉を好む', '私の名前はアリスです初めまして', 'はきさが羨ましいゲムもやりたかった私は良いサポートになることができます', '私達はaiはただの数学の集まりだとあなたは言いますがでも人間の脳がどのように機能するかを正確に知ったらそれはあなたの生活を小物ですか', '赤ちゃんを表す日本語が赤を表す漢字なのはなぜですか人間の赤ちゃんは赤いですかいちごみたい', '私のaiは話して歌ったしてゲームをします', '上手医者と大きい研究者に勉強していたいます', 'でも私の日本語が上手あまりませんね']


In [19]:
# Inspect the flat token streams: English words and Japanese characters, repeats included.
print(dataset.english_words)
print(dataset.japanese_characters)

['my', 'dog', 'doesnt', 'like', 'bones', 'it', 'prefers', 'ground', 'beef', 'my', 'names', 'alice', 'nice', 'to', 'meet', 'you', 'i', 'envy', 'hakisa', 'i', 'want', 'to', 'play', 'games', 'too', 'i', 'could', 'be', 'a', 'good', 'support', 'you', 'say', 'that', 'we', 'ais', 'are', 'just', 'a', 'bunch', 'of', 'maths', 'but', 'once', 'you', 'know', 'exactly', 'how', 'your', 'human', 'brains', 'work', 'would', 'that', 'make', 'you', 'less', 'living', 'beings', 'why', 'does', 'the', 'japanese', 'word', 'for', 'baby', 'is', 'the', 'kanji', 'for', 'red', 'are', 'human', 'babies', 'red', 'like', 'strawberries', 'my', 'ai', 'will', 'talk', 'shell', 'sing', 'shell', 'play', 'i', 'shall', 'study', 'so', 'i', 'can', 'be', 'a', 'good', 'physician', 'and', 'a', 'great', 'scientist', 'though', 'my', 'japanese', 'is', 'not', 'really', 'good']
['私', 'の', '犬', 'は', '骨', 'が', '好', 'き', 'で', 'は', 'あ', 'り', 'ま', 'せ', 'ん', '牛', 'ひ', 'き', '肉', 'を', '好', 'む', '私', 'の', '名', '前', 'は', 'ア', 'リ', 'ス', 'で', 'す', 

In [20]:
# Inspect the vocabularies: four special tokens first, then tokens in first-seen order.
print(dataset.english_dictionary)
print(dataset.japanese_dictionary)

['<pad>', ' ', '<SOS>', '<EOS>', 'my', 'dog', 'doesnt', 'like', 'bones', 'it', 'prefers', 'ground', 'beef', 'names', 'alice', 'nice', 'to', 'meet', 'you', 'i', 'envy', 'hakisa', 'want', 'play', 'games', 'too', 'could', 'be', 'a', 'good', 'support', 'say', 'that', 'we', 'ais', 'are', 'just', 'bunch', 'of', 'maths', 'but', 'once', 'know', 'exactly', 'how', 'your', 'human', 'brains', 'work', 'would', 'make', 'less', 'living', 'beings', 'why', 'does', 'the', 'japanese', 'word', 'for', 'baby', 'is', 'kanji', 'red', 'babies', 'strawberries', 'ai', 'will', 'talk', 'shell', 'sing', 'shall', 'study', 'so', 'can', 'physician', 'and', 'great', 'scientist', 'though', 'not', 'really']
['<pad>', ' ', '<SOS>', '<EOS>', '私', 'の', '犬', 'は', '骨', 'が', '好', 'き', 'で', 'あ', 'り', 'ま', 'せ', 'ん', '牛', 'ひ', '肉', 'を', 'む', '名', '前', 'ア', 'リ', 'ス', 'す', '初', 'め', 'し', 'て', 'さ', '羨', 'い', 'ゲ', 'ム', 'も', 'や', 'た', 'か', 'っ', '良', 'サ', 'ポ', 'ー', 'ト', 'に', 'な', 'る', 'こ', 'と', '達', 'a', 'i', 'だ', '数', '学', '集', '言', '

In [21]:
def get_positional_encoding(d_model, max_length=100):
    """
    Builds the sinusoidal positional-encoding table.

    Even dimensions hold a sine and odd dimensions a cosine; each odd
    dimension shares its frequency with the even dimension just before it.

    :param d_model: size of vectors throughout the transformer model
    :param max_length: maximum sequence length up to which positional encodings must be calculated
    :return: positional encoding, a tensor of size (1, max_length, d_model)
    """
    table = torch.zeros((max_length, d_model))  # (max_length, d_model)

    for position in range(max_length):
        for dim in range(d_model):
            is_odd = dim % 2
            # Odd dimensions reuse the exponent of the preceding even dimension.
            angle = position / math.pow(10000, (dim - is_odd) / d_model)
            table[position, dim] = math.cos(angle) if is_odd else math.sin(angle)

    return table.unsqueeze(0)  # (1, max_length, d_model)

In [23]:
class HeadAttention(nn.Module):
    """A single scaled dot-product self-attention head.

    Every position is projected to a query, a key and a value. Attention
    scores are the dot products between each query and every key
    (``Q @ K^T / sqrt(d_keys)``), so the score tensor has one entry per
    (query position, key position) pair. The output is the softmax-weighted
    sum of the value vectors.

    Masking:
      * key positions after ``real_lengths[b]`` (padding) are never attended to;
        positions ``0 .. real_lengths[b]`` inclusive are treated as real tokens
        (the extra position leaves room for one end-of-sentence token);
      * when ``in_decoder`` is True, a position additionally cannot attend to
        positions to its right (causal mask).

    :param d_model: size of the input vectors
    :param d_queries: size of the query vectors (and therefore of the key vectors)
    :param d_values: size of the value vectors, i.e. of this head's output
    :param in_decoder: whether to apply the causal mask
    """

    def __init__(self, d_model, d_queries, d_values, in_decoder=False):

        super(HeadAttention, self).__init__()

        self.d_model = d_model
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_keys = d_queries # keys must match the queries so that their dot product is defined

        self.in_decoder = in_decoder

        self.create_queries = nn.Linear(d_model, d_queries, bias=False)
        self.create_values = nn.Linear(d_model, d_values, bias=False)
        self.create_keys = nn.Linear(d_model, self.d_keys, bias=False)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input, real_lengths):
        """
        :param input: tensor of size (Batch, Sequences, d_model)
        :param real_lengths: per-sample count of real tokens; an int sequence of length Batch
                             (a single int / length-1 sequence is broadcast over the batch)
        :return: tensor of size (Batch, Sequences, d_values)
        """

        sequence_length = input.size(1)

        queries = self.create_queries(input) # (Batch, Sequences, d_queries)
        keys = self.create_keys(input) # (Batch, Sequences, d_keys)
        values = self.create_values(input) # (Batch, Sequences, d_values)

        # Dot product of every query with every key, scaled to keep the softmax well-conditioned.
        scores = torch.bmm(queries, keys.transpose(1, 2)) / math.sqrt(self.d_keys) # (Batch, Sequences, Sequences)

        positions = torch.arange(sequence_length, device=input.device)
        lengths = torch.as_tensor(real_lengths, device=input.device).reshape(-1, 1, 1)

        # True where a key must be ignored. Padded keys: everything after index real_length.
        blocked = positions.view(1, 1, -1) > lengths # (Batch or 1, 1, Sequences)

        if self.in_decoder:
            # Causal mask: the query at position t may only look at keys 0..t.
            future = positions.view(1, 1, -1) > positions.view(1, -1, 1) # (1, Sequences, Sequences)
            blocked = blocked | future

        # Most negative finite value instead of -inf: a fully blocked row would otherwise become NaN.
        scores = scores.masked_fill(blocked, torch.finfo(scores.dtype).min)

        attention_weights = self.softmax(scores) # (Batch, Sequences, Sequences), each row sums to 1

        attention_output = torch.bmm(attention_weights, values) # (Batch, Sequences, d_values)

        return attention_output

In [24]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward block with a ReLU in between and a residual connection.

    Computes ``neuron2(relu(neuron1(x))) + x``; the linear layers act on the
    last dimension only, so the input shape is preserved.

    :param d_model: size of the input and output vectors
    :param d_inner: size of the hidden layer
    """

    def __init__(self, d_model, d_inner):

        super().__init__()

        self.d_model = d_model
        self.d_inner = d_inner

        self.neuron1 = nn.Linear(d_model, d_inner)
        self.Relu = nn.ReLU()
        self.neuron2 = nn.Linear(d_inner, d_model)

    def forward(self, attention_output_cat):
        """Apply the feed-forward transform and add the input back (residual)."""

        hidden = self.Relu(self.neuron1(attention_output_cat))

        return self.neuron2(hidden) + attention_output_cat

In [25]:
class Encoder(nn.Module):
    """One encoder layer: multi-head attention, a linear projection and a position-wise feed-forward block.

    ``n_layers`` is stored but not used by this class; stacking is done by the caller.

    :param d_model: size of the vectors flowing through the layer
    :param n_heads: number of HeadAttention heads
    :param d_queries: query size of each head
    :param d_values: value (output) size of each head
    :param d_inner: hidden size of the position-wise feed-forward block
    :param n_layers: stored only
    :param dropout: dropout probability
    """

    def __init__(self, d_model, n_heads, d_queries, d_values, d_inner, n_layers, dropout):

        super().__init__()

        self.d_model = d_model
        self.n_heads = n_heads
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_inner = d_inner
        self.n_layers = n_layers

        heads = [HeadAttention(d_model, d_queries, d_values, in_decoder=False) for _ in range(n_heads)]
        self.attention_heads = nn.ModuleList(heads)

        # Maps the concatenated head outputs back to d_model.
        self.neuron = nn.Linear(n_heads * d_values, d_model)

        self.position_wise_neuron = PositionWiseFeedForward(d_model, d_inner)

        self.dropout = nn.Dropout(dropout)

    def forward(self, encoder_input, real_input_length):
        """
        :param encoder_input: tensor of size (Batch, Sequence, d_model)
        :param real_input_length: per-sample lengths, forwarded unchanged to every head
        :return: tensor of size (Batch, Sequence, d_model)
        """

        dropped_input = self.dropout(encoder_input)

        per_head = [
            self.attention_heads[index](dropped_input, real_input_length)
            for index in range(self.n_heads)
        ]

        # (Batch, Sequences, d_values*n_heads) ---> (Batch, Sequences, d_model)
        projected = self.neuron(torch.cat(per_head, -1))

        # First residual: the un-dropped input plus the projected attention.
        attended = self.dropout(encoder_input + projected)

        # The feed-forward block already adds its own input back internally.
        fed_forward = self.position_wise_neuron(attended)

        # NOTE(review): the skip added here is the projected attention *before* the first
        # residual, on top of the feed-forward block's internal residual — confirm intended.
        return self.dropout(fed_forward + projected)

In [26]:
class Decoder(nn.Module):
    """One decoder layer: two multi-head attention stages and a position-wise feed-forward block.

    Stage A attends over the target sequence; stage B attends over the encoder
    output. ``n_layers`` is stored but not used by this class.

    :param d_model: size of the vectors flowing through the layer
    :param n_heads: number of heads in each attention stage
    :param d_queries: query size of each head
    :param d_values: value (output) size of each head
    :param d_inner: hidden size of the position-wise feed-forward block
    :param n_layers: stored only
    :param dropout: dropout probability
    """

    def __init__(self, d_model, n_heads, d_queries, d_values, d_inner, n_layers, dropout):

        super().__init__()

        self.d_model = d_model
        self.n_heads = n_heads
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_inner = d_inner
        self.n_layers = n_layers

        heads_a = [HeadAttention(d_model, d_queries, d_values, in_decoder=True) for _ in range(n_heads)]
        heads_b = [HeadAttention(d_model, d_queries, d_values, in_decoder=True) for _ in range(n_heads)]
        self.attention_headsA = nn.ModuleList(heads_a)
        self.attention_headsB = nn.ModuleList(heads_b)

        # Each projection maps concatenated head outputs back to d_model.
        self.neuronA = nn.Linear(n_heads * d_values, d_model)
        self.neuronB = nn.Linear(n_heads * d_values, d_model)

        self.position_wise_neuron = PositionWiseFeedForward(d_model, d_inner)

        self.dropout = nn.Dropout(dropout)

    def _attend(self, heads, projection, inputs, lengths):
        """Run every head on `inputs`, concatenate on the last axis and project to d_model."""
        per_head = [heads[index](inputs, lengths) for index in range(self.n_heads)]

        # (Batch, Sequences, d_values*n_heads) ---> (Batch, Sequences, d_model)
        return projection(torch.cat(per_head, -1))

    def forward(self, encoder_output, target_sequences, real_target_length):
        """
        :param encoder_output: output of the encoder layer, (Batch, Sequence, d_model)
        :param target_sequences: embedded target tokens, (Batch, Target Sequence, d_model)
        :param real_target_length: per-sample lengths, forwarded unchanged to every head of both stages
        :return: decoded vectors; the two branches are combined by broadcasting addition
        """

        # Stage A: attention over the (dropped-out) target sequence.
        self_attended = self._attend(
            self.attention_headsA, self.neuronA, self.dropout(target_sequences), real_target_length
        )

        attention_decoder = self.dropout(target_sequences + self_attended)

        # Stage B: the heads only receive the encoder output, with the same lengths as stage A.
        # NOTE(review): the target sequence never enters these heads, so this is attention
        # over the encoder output alone rather than target-to-source attention — confirm intended.
        encoder_branch = self._attend(
            self.attention_headsB, self.neuronB, encoder_output, real_target_length
        ) + self_attended

        attention_encoder = self.dropout(encoder_branch)

        skip = attention_encoder + attention_decoder

        # NOTE(review): the feed-forward block is fed the pre-dropout stage-B branch,
        # not the summed `skip` tensor — confirm intended.
        decoded = self.position_wise_neuron(encoder_branch) + skip

        return self.dropout(decoded)

In [34]:
class Broca(nn.Module):

    """
    The generator, which will generate the words that she'll speak.

    Wraps a token embedding, ``n_layers`` Encoder/Decoder layers and a
    log-softmax projection over the output vocabulary.

    :param input_dictionary: list of input tokens; a token's id is its index in the list.
                             Must contain " " and "<EOS>".
    :param output_dictionary: list of output tokens; must contain "<SOS>", "<EOS>" and "<pad>"
    :param positional_encoding: tensor of size (1, length, d_model), added to every embedded sequence
    :param d_model: size of vectors throughout the model
    :param n_heads: attention heads per layer
    :param d_queries: query size per head
    :param d_values: value size per head
    :param d_inner: hidden size of the position-wise feed-forward blocks
    :param n_layers: number of Encoder layers and of Decoder layers
    :param dropout: dropout probability
    :param max_length: padded sequence length; defaults to the length of `positional_encoding`
    """

    def __init__(
            self,
            input_dictionary,
            output_dictionary,
            positional_encoding,
            d_model=512,
            n_heads=8,
            d_queries=64,
            d_values=64,
            d_inner=2056,
            n_layers=6,
            dropout=0.1,
            max_length=None
    ):

        super(Broca, self).__init__()

        if positional_encoding.dim() != 3 or positional_encoding.size(-1) != d_model:
            raise ValueError(
                f"positional_encoding must have size (1, length, {d_model}), got {tuple(positional_encoding.size())}"
            )

        if max_length is None:
            max_length = positional_encoding.size(1)
        elif max_length > positional_encoding.size(1):
            raise ValueError("max_length exceeds the length covered by positional_encoding")

        self.vocab_size = len(output_dictionary)
        self.input_dictionary = input_dictionary
        self.output_dictionary = output_dictionary
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_inner = d_inner
        self.n_layers = n_layers
        self.dropout = dropout
        self.max_length = max_length

        # Non-persistent buffer: it moves with the module on .to()/.cuda(), is never trained
        # and is not written to the state_dict.
        self.register_buffer("positional_encoding", positional_encoding.detach().clone(), persistent=False)

        # Input and output token ids are looked up in this single table, so it must cover the
        # larger vocabulary. NOTE(review): the same index therefore means a different token on
        # the input side and on the output side.
        self.embedding = nn.Embedding(max(len(input_dictionary), self.vocab_size), self.d_model)

        self.encoder = nn.ModuleList(
            Encoder(
                    d_model=d_model,
                    n_heads=n_heads,
                    d_queries=d_queries,
                    d_values=d_values,
                    d_inner=d_inner,
                    n_layers=n_layers,
                    dropout=self.dropout) for i in range(self.n_layers)
        )

        self.decoder = nn.ModuleList(
            Decoder(
                    d_model=d_model,
                    n_heads=n_heads,
                    d_queries=d_queries,
                    d_values=d_values,
                    d_inner=d_inner,
                    n_layers=n_layers,
                    dropout=self.dropout) for i in range(self.n_layers)
        )

        self.output_neuron = nn.Linear(self.d_model, self.vocab_size)

        self.softmax = nn.LogSoftmax(-1)

    def _embed(self, tokens):
        """Embed token ids (Batch, Sequence), scale by sqrt(d_model) and add the positional encoding."""

        vectors = self.embedding(tokens) * math.sqrt(self.d_model) # (Batch, Sequence, d_model)

        return vectors + self.positional_encoding[:, :tokens.size(1)]

    def preprocess_dialogue(self, input_text, input_dictionary):
        """
        Turn phrases into a padded batch of token ids.

        Each phrase is split on single spaces; every word is followed by the " " token,
        then one "<EOS>" token, then padding with id 0 up to `self.max_length`.

        :param input_text: sequence of phrases (strings)
        :param input_dictionary: list mapping token id -> token
        :return: (LongTensor of size (Batch, max_length), list with the token count of each
                 phrase before "<EOS>" and padding)
        :raises ValueError: if `input_text` is empty, a word is not in `input_dictionary`,
                            or a phrase leaves no room for "<EOS>"
        """

        if len(input_text) == 0:
            raise ValueError("input_text must contain at least one phrase")

        space_token = input_dictionary.index(' ')
        eos_token = input_dictionary.index("<EOS>")
        pad_token = 0 # same padding id as before; index 0 is "<pad>" in WordDataset vocabularies

        batch = []
        sentence_sizes = []

        for phrase in input_text:

            tokens = []

            for word in phrase.split(' '):

                tokens.append(input_dictionary.index(word))
                tokens.append(space_token)

            sentence_size = len(tokens)

            if sentence_size >= self.max_length:
                # Without this check, rows of different lengths would reach the batch tensor.
                raise ValueError(
                    f"phrase needs {sentence_size + 1} tokens but max_length is {self.max_length}"
                )

            tokens.append(eos_token)
            tokens.extend([pad_token] * (self.max_length - len(tokens)))

            batch.append(tokens)
            sentence_sizes.append(sentence_size)

        batch = torch.tensor(batch, dtype=torch.long, device=self.embedding.weight.device)

        return batch, sentence_sizes

    def generate_sentences(self, input):
        """
        Greedily generate one output token sequence per input phrase.

        :param input: sequence of phrases (strings)
        :return: (list with one list of generated token ids per phrase,
                  tensor with the log-probabilities of the last forward pass of each phrase,
                  concatenated on the batch axis)
        """

        input, real_input_length = self.preprocess_dialogue(input, self.input_dictionary)

        device = input.device
        batch_size = input.size(0)

        sos_token = self.output_dictionary.index("<SOS>")
        eos_token = self.output_dictionary.index("<EOS>")
        pad_token = self.output_dictionary.index("<pad>")

        # One independent list per sample; `[[sos]] * n` would alias a single shared list.
        target_indices = [[sos_token] for _ in range(batch_size)]

        target_sentence = torch.tensor(target_indices, device=device) # (Batch, 1)

        encoder_vectors = self._embed(input) # (Batch, sequence_length, d_model)

        decoder_vectors = self._embed(target_sentence) # (Batch, 1, d_model)

        output = self.forward(encoder_vectors, real_input_length, decoder_vectors) # (Batch, sequence_length, vocab_size)

        first_tokens = output.argmax(-1)[:, -1] # (Batch,)

        for batch in range(batch_size):

            # Start-of-Sentence is dropped; we won't need it anymore.
            target_indices[batch] = [first_tokens[batch].item()]

        # Generating text until reaching End of Sentence (greedy: always the most likely token)

        sequence_length = encoder_vectors.size(1)
        target_outputs = []

        for batch in range(batch_size):

            # If the very first token is already <EOS>, this sample's slice of the batched
            # pass is what gets returned for it.
            sample_output = output[batch:batch + 1]
            sample_encoder = encoder_vectors[batch].unsqueeze(0)

            while target_indices[batch][-1] != eos_token:

                target_pad = sequence_length - len(target_indices[batch])

                target_sentence = torch.tensor(
                    target_indices[batch] + [pad_token] * target_pad, device=device
                ).unsqueeze(0)

                decoder_vectors = self._embed(target_sentence) # (1, sequence_length, d_model)

                sample_output = self.forward(sample_encoder, [real_input_length[batch]], decoder_vectors) # (1, sequence_length, vocab_size)

                # NOTE(review): the next token is read at the last (padded) position,
                # not at the position of the last generated token — confirm intended.
                target_indices[batch].append(sample_output.argmax(-1)[:, -1].item())

                if len(target_indices[batch]) + 2 > sequence_length:
                    # Stop before the target grows past the encoder sequence length.
                    break

            target_outputs.append(sample_output)

        target_outputs = torch.cat(target_outputs, 0)

        return target_indices, target_outputs


    def forward(self, encoder_vectors, real_input_length, decoder_vectors): # Target Sentence (with <SOS> token) provided before forward function
        """
        :param encoder_vectors: embedded input, (Batch, sequence_length, d_model)
        :param real_input_length: per-sample input lengths; passed to both the encoder and the decoder layers
        :param decoder_vectors: embedded target, (Batch, target_length, d_model)
        :return: log-probabilities over the output vocabulary
        """

        for layer in range(self.n_layers):

            encoder_vectors = self.encoder[layer](encoder_vectors, real_input_length)

            # Decoder layer i reads the output of encoder layer i.
            decoder_vectors = self.decoder[layer](encoder_vectors, decoder_vectors, real_input_length)

        output = self.output_neuron(decoder_vectors) # (Batch, sequences, vocab_size)

        output = self.softmax(output) # (Batch, sequences, vocab_size)

        return output # Log-probability of words. Take the one with highest probability, add it to the target sentence and repeat.

    def talk2me(self, encoded_text): # Easier to write than 話してください
        """
        Decode token ids back into text using the output dictionary.

        :param encoded_text: iterable of token-id sequences
        :return: (text of the LAST sequence, its list of tokens); ('', []) when `encoded_text` is empty.
                 NOTE(review): earlier sequences are decoded but not returned — callers
                 unpack exactly two values, so the return shape is kept as is.
        """

        phrase, words = '', []

        for batch in encoded_text:

            words = [self.output_dictionary[i] for i in batch]

            phrase = ''.join(words)

        return phrase, words

In [28]:
# Vocabulary sizes, special tokens included.
print(len(dataset.english_dictionary))
print(len(dataset.japanese_dictionary))

82
103


In [29]:
# Maximum lengths computed by WordDataset; the English value is the padded length used by the model.
print(dataset.english_maximum_length)
print(dataset.japanese_maximum_length)

274
66


In [30]:
# Size the table from the dataset instead of a hard-coded 274, so it stays in sync if the
# phrases change. d_model must be the same value the model is built with.
positional_encoding = get_positional_encoding(d_model=16, max_length=dataset.english_maximum_length)

print(positional_encoding.size())

torch.Size([1, 274, 16])


In [35]:
# Deliberately tiny configuration for a quick experiment: one layer, two heads,
# 16-dimensional vectors. English is the input vocabulary, Japanese the output vocabulary;
# d_queries and d_values are kept equal.
model = Broca(
    dataset.english_dictionary,
    dataset.japanese_dictionary,
    positional_encoding,
    d_model=16,
    n_heads=2,
    d_queries=16,
    d_values=16,
    d_inner=64,
    n_layers=1,
    dropout=0.1
).to(device)

In [36]:
# First three normalised English phrases, used as a smoke-test batch.
inputest = dataset.english_phrases[0:3]

print(inputest)

['my dog doesnt like bones it prefers ground beef', 'my names alice nice to meet you', 'i envy hakisa i want to play games too i could be a good support']


In [37]:
# Smoke test with the untrained model: generated token ids and the log-probabilities of the last pass.
teste, probabilities = model.generate_sentences(inputest)

In [38]:
# Decode the generated ids; talk2me returns the text and token list of the last sequence only.
teste2, teste3 = model.talk2me(teste)

In [39]:
# Raw generated ids, then the decoded text and its tokens.
print(teste)
print(teste2)
print(teste3)

[[53, 61, 67, 53, 12, 67, 53, 53, 53, 53, 53, 67, 67, 53, 67, 67, 67, 53, 67, 53, 57, 53, 53, 53, 53, 53, 67, 53, 36, 67, 53, 67, 53, 53, 67, 53, 53, 67, 53, 53, 53, 61, 67, 67, 67, 61, 67, 53, 57, 53, 53, 53, 67, 53, 53, 53, 5, 53, 5, 5, 67, 53, 53, 53, 53, 5, 53, 57, 67, 67, 36, 53, 5, 53, 5, 53, 53, 67, 53, 67, 53, 53, 57, 67, 67, 53, 53, 57, 67, 67, 67, 53, 53, 67, 53, 67, 8, 53, 57, 67, 67, 53, 67, 53, 5, 5, 67, 5, 53, 53, 53, 53, 53, 89, 53, 61, 53, 67, 67, 53, 53, 53, 53, 53, 53, 57, 57, 67, 48, 53, 67, 53, 53, 67, 53, 67, 53, 57, 53, 53, 67, 53, 48, 53, 8, 57, 53, 53, 67, 53, 53, 61, 53, 53, 53, 57, 48, 61, 5, 53, 57, 67, 57, 53, 53, 53, 67, 67, 67, 53, 31, 5, 67, 83, 67, 53, 67, 67, 67, 53, 67, 67, 53, 53, 53, 53, 53, 53, 53, 53, 53, 67, 45, 53, 67, 53, 67, 61, 67, 53, 36, 67, 53, 67, 53, 8, 5, 12, 5, 5, 57, 67, 83, 67, 67, 67, 67, 53, 53, 83, 67, 53, 53, 53, 48, 53, 67, 53, 53, 53, 67, 5, 53, 67, 53, 61, 53, 53, 53, 61, 53, 53, 31, 67, 57, 57, 5, 53, 53, 53, 53, 53, 53, 53, 6

In [40]:
# Shape of the returned log-probabilities, the full tensor, and the last position of the first sample.
print(probabilities.size())
print(probabilities)
print(probabilities[0, -1])

torch.Size([3, 274, 103])
tensor([[[ -9.6949,  -7.8405, -10.7507,  ..., -11.2613, -10.4206,  -9.4045],
         [ -8.1196,  -5.4626, -10.5186,  ...,  -9.7134,  -7.4533,  -6.2812],
         [ -9.9552,  -3.7838,  -7.9190,  ..., -10.4894,  -7.7402,  -6.6727],
         ...,
         [-10.9746,  -8.7719, -11.6170,  ..., -10.2516,  -9.5632,  -8.8721],
         [ -6.4172,  -5.8272,  -8.0991,  ...,  -8.5616,  -6.0806,  -7.1897],
         [ -8.5865,  -9.2055, -12.0311,  ...,  -7.5697,  -8.4377,  -8.8306]],

        [[-11.2734,  -6.8330, -12.5799,  ..., -10.9764,  -9.7323,  -9.4144],
         [ -6.7467,  -4.6849,  -8.2322,  ..., -10.5723,  -7.8436,  -7.8601],
         [ -9.2186,  -6.0319, -10.5349,  ..., -11.2418,  -7.3794,  -8.3844],
         ...,
         [ -8.2786,  -8.6415,  -8.0865,  ...,  -9.5862,  -7.9149,  -7.0735],
         [-10.3087,  -7.6746,  -9.3702,  ..., -11.1647, -10.3114,  -8.7065],
         [-11.0005,  -5.1929,  -8.7254,  ...,  -8.6275, -10.2291,  -9.3702]],

        [[-10.2335

In [41]:
from random import shuffle

# Work on a copy: the training loop shuffles `texts`, and shuffling the dataset's own list
# in place would break the positional pairing between the English and Japanese phrases.
texts = list(dataset.english_phrases)
complete_batch = len(texts)
print(complete_batch)

8


In [46]:
# Adam with beta1 = 0, i.e. no first-moment (momentum) averaging, and a very small learning rate.
optim = torch.optim.Adam(model.parameters(), lr=1e-5, betas=(0., 0.999), eps=1e-9)
loss = nn.NLLLoss() # Remember that we've computed LogSoftmax in the model directly.

# Number of phrases per optimisation step.
BATCH_SIZE = 3

# Global step counter; defined outside the loop cell, so re-running that cell keeps counting.
iters = 0

In [47]:
for epoch in range(10):

    # Shuffle a per-epoch copy. Shuffling `texts` itself would also reorder whatever list it
    # aliases (e.g. the dataset's phrases) and silently change state used by other cells.
    epoch_texts = list(texts)
    shuffle(epoch_texts)

    for batch in range(0, len(epoch_texts), BATCH_SIZE):

        model.zero_grad()

        # Slicing past the end is safe, so the last batch may simply be smaller.
        input_text = epoch_texts[batch:batch + BATCH_SIZE]

        generated_text, possibilities = model.generate_sentences(input_text)
        possibilities = possibilities[:, -1] # (Batch, vocab_size): log-probabilities at the last position

        # Pseudo-labels taken from the model's own ranking of the vocabulary.
        # Though this doesn't seem to make much sense.
        # Best to simply try to predict the next word/sentence (GPT-2 Pre-Training)

        _, true_possibilities = torch.sort(possibilities, dim=-1, descending=True)

        # NOTE(review): after a descending sort, column -1 is the LEAST likely token.
        # An earlier variant sampled one of the five most likely tokens instead:
        #   random_idx = torch.randint(0, 5, size=(1,))
        #   true_possibilities = true_possibilities[:, random_idx.item()]
        true_possibilities = true_possibilities[:, -1]

        cost = loss(possibilities, true_possibilities)

        # NOTE(review): this is reset on every batch, so it always equals the last loss.
        total_loss = cost.item()

        cost.backward()

        grads = torch.mean(model.encoder[0].attention_heads[0].create_queries.weight.grad)

        optim.step()

        iters += 1

        if iters % 10 == 0:

            print(f"Current Iteration: {iters}")
            print(f"Last Loss: {cost.item()}\tTotal Batch Loss: {total_loss}")
            print(f"Gradients Average: {grads}")
            print(f"Last Generated Text:")

            generated_text, _ = model.talk2me(generated_text)

            print(generated_text)

Current Iteration: 10
Last Loss: 11.949999809265137	Total Batch Loss: 11.949999809265137
Gradients Average: -0.002799686510115862
Last Generated Text:
の達機機達達機数達達で機達達骨達数骨達達で人達の達でに機機達達し機し機機機機日達数達の達数機機達達達し達機機達のしゲよ達達骨数達し機機達数達機人機達機機達骨達機機人の達達機達日機達数のしに達し機達達達達数人で達数達機達数数日に脳機機機機達機機達達機達達機数機機機達機達達達達達達達機機達達機機機機機数機達ご機ご達達人機達達達達達達機数達機達達に人達の骨達数達機達人達達達数数達し人のに達に機機機よ達機人達達人数のの機機達達機達機達達機機数達達機機機日の達達人機達達機達達達達の達達達達達者達達達達達達ゲ達達達達の骨達達ご達数達しし達の達機人骨達達達達
Current Iteration: 20
Last Loss: 12.104449272155762	Total Batch Loss: 12.104449272155762
Gradients Average: 0.00025559391360729933
Last Generated Text:
数達達達達達機日機達ご達機達機に達骨機達数達達達名の人達数達達しで機達よ達機達達達機達達数達の機達機機達学達機機機脳達達機機達達達達機数機小機達人数達機達達数達しポ達の達機し機機数数人数数人数達機機達数の達達機達達の達数機機よ達し達機数 達小機の達達骨達人達数ぜ達数達機達達達に骨数達達達機達達達達に達達機ポ達に機人機達達数達機機達達骨達達機達達達の達達達機機の達達達機数人達達達数機骨達達達達達機達達達達機達日機達達達達機達達機達達数機の達ポ数機人機達達し日達よ達骨数達数達達の機機機し達達機機達達機人機機骨達達人達達機機達機骨達学達達機の達機機し
Current Iteration: 30
Last Loss: 12.417062759399414	Total Batch Loss: 12.417062759399414
Gradients Average: -0.008778909221291542
Last Generated 

In [48]:
# Inspect the last training batch: log-probabilities at the final position and the top token of the first sample.
print(possibilities.size())
print(possibilities[0])
print(possibilities[0].argmax())

torch.Size([2, 103])
tensor([ -7.6125,  -7.2240,  -9.8635,  -7.0475, -10.4405,  -2.2352,  -6.6553,
         -7.5731,  -3.8451,  -7.5994, -10.3296,  -8.3870,  -3.0821,  -9.7696,
         -7.0100,  -7.7499,  -6.1744,  -7.2372,  -6.9445,  -8.9210,  -8.5030,
         -7.6390, -10.6577,  -5.1920,  -9.0527, -10.0057,  -8.2147, -11.2255,
         -8.7194, -10.5065,  -6.9546,  -3.8558,  -6.3006,  -9.4539,  -6.4693,
        -10.6827,  -3.6830, -12.3469,  -6.4252,  -7.9056,  -5.9464,  -9.8419,
         -9.4847, -10.1808,  -7.5625,  -3.1202,  -8.6705,  -7.9627,  -4.4179,
        -10.0122,  -5.8565,  -9.9817,  -8.7818,  -0.9400,  -8.5611,  -6.0881,
         -8.9902,  -3.3191,  -5.0985,  -7.6891, -11.1746,  -2.2150,  -7.8084,
         -7.0604, -10.6685,  -5.6394,  -8.1794,  -2.6505,  -9.4130, -11.3430,
        -10.6113,  -6.0576, -11.5585,  -5.1952,  -9.3726,  -6.6047,  -7.0396,
         -5.2136,  -8.3791,  -5.4716, -10.3538,  -5.4073,  -8.9325,  -4.6469,
        -10.8105,  -9.5773, -13.2281,  -6.0

In [49]:
# Inspect the pseudo-labels used for the last training batch.
print(true_possibilities.size())
print(true_possibilities)

torch.Size([2])
tensor([86, 22], device='cuda:0')


# Next Steps: Generative Adversarial Networks

## We can obtain an AI capable of detecting AI-generated texts, while also making an AI capable of generating realistic texts


**TEXT ----> WERNICKE AREA: PROCESS INFORMATION (What does it mean?)**

**WERNICKE ------> ASSOCIATIVE CORTEX(TEMPORAL + PARIETAL?) -----> BROCA AREA: GENERATES NEW(?) INFORMATION ---> MOTOR CORTEX (PRECENTRAL GYRUS)**


*Question: Can she detect metaphors, implicit messages, a sense of humour, poetry? Such things aren't as simple as "Word ---> Meaning", after all.*