In [1]:
import torch
from torch import nn
import torchaudio
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Audio
from librosa import display
import re
import os
import math
from collections import Counter

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
class TextDataset(torch.utils.data.Dataset):

    def __init__(self, text_path):

        self.text_path = text_path

        self.phrases = self._get_phrases()
        self.words = self._get_words(self.phrases)

        self.vocabulary = self._create_vocabulary(self.words)

        self.max_length, self.real_length = self._get_text_length()

    
    def _get_phrases(self):

        phrases = []

        with open(self.text_path, 'r') as f:

            for i in f:

                phrases.append(i)

            f.close()

        #phrases = [i.replace('\n', '') for i in phrases] # Not really necessary here
        phrases = [i.replace('"', ' " ').replace("'ve", " 've").replace("'d", " 'd").replace("'t", " 't").replace("'re", " 're").replace("'s", " 's") for i in phrases]
        phrases = [i.replace(',', ' ,').replace(".", " .").replace("!", " !").replace("?", " ?").replace("(", " ( ").replace(")", " ) ") for i in phrases]
        phrases = [i.replace(':', ' :').replace(";", " ;").replace("-", " - ") for i in phrases]
        
        return phrases
    
    def _get_words(self, phrases):
        words = ' '.join(phrases)
        words = words.split(' ')

        return words
    
    def _get_letters(self, words):
        letters = ' '.join(words)
        letters = ''.join(letters.split())
        letters = [i for i in letters]

        return letters
    
    def _create_vocabulary(self, words):
        idx2word = ["<pad>", " ", ",", "'", ".", ":", ";", "!", "?", "(", ")", 're', 'll', 've', 'd', 'm', 't', 's', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '"', "-"]

        for word in words:
            word = re.sub("[\s]", '', word)
            if word not in idx2word:
                idx2word.append(word)

        idx2word.append("<EOS>")
        idx2word.append("<SOS>")

        return idx2word
    
    def _get_text_length(self):

        maximum_length = 0

        real_length = []
        
        for sentence in self.phrases:
            word_length = 0

            for character in sentence:
                word_length += len(character) # Including spaces between words

            sentence_length = word_length
            real_length.append(sentence_length)

            if sentence_length > maximum_length:
                maximum_length = sentence_length
        
        maximum_length += 2 # In order to include an extra space and "<EOS>"

        return maximum_length, real_length

In [4]:
FILE_PATH = 'C:/Users/giova/OneDrive/Área de Trabalho/Faster than the Flame.txt'

In [4]:
FILE_PATH = 'C:/Users/giova/OneDrive/Área de Trabalho/Texte.txt'

In [5]:
dataset = TextDataset(FILE_PATH)

In [6]:
print(dataset.vocabulary)

['<pad>', ' ', ',', "'", '.', ':', ';', '!', '?', '(', ')', 're', 'll', 've', 'd', 'm', 't', 's', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '"', '-', 'Faster', 'faster', 'than', 'the', 'flame', 'Fists', 'up', 'in', 'air', 'tonight', 'Leave', 'sane', 'unleash', 'wild', 'This', 'is', 'our', 'time', 'this', 'fate', 'Pyres', 'will', 'inflame', 'night', 'Restless', 'world', 'alight', 'war', 'last', 'crusade', 'Incendere', '', 'cendere', 'Inflammatum', 'flammatum', 'Illuminatum', 'And', 'we', 'all', 'rise', 'against', 'damned', 'Stand', 'for', 'heaven', 'Hold', 'pastor', 'together', 'by', 'chain', 'at', "'re", 'going', 'When', 'set', 'on', 'fire', 'Rolling', 'Send', 'us', 'to', 'Be', 'prepared', 'sacrifice', 'raid', 'Embers', 'left', 'those', 'alive', 'Madness', 'raised', 'ignite', 'let', 'reign', 'Flame', 'burning', 'heavens', 'name', 'from', 'Lord', 'came', 'Armageddon', 'proclaim', 'now', 'shall', 'lights', 'sky', 'and', 'no', 'sin', 'can', 'deny', 'We', 'burn', '<EOS>', '<SOS>']


In [7]:
print(dataset.phrases[0:100])

['Faster , faster , faster than the flame\n', 'Faster , faster , faster than the flame\n', 'Fists up in the air tonight\n', 'Leave the sane , unleash the wild\n', 'This is our time , this is our fate\n', 'Pyres will inflame the night\n', 'Restless is the world alight\n', 'This is our war , the last crusade\n', 'Incendere  ( cendere ) \n', 'Inflammatum  ( flammatum ) \n', 'Illuminatum\n', 'And we all rise against the damned\n', 'Stand up faster for heaven , faster than the flame\n', 'Hold the pastor together , pastor by the chain\n', "And at night we 're going wild\n", 'When we set the world on fire\n', 'Rolling faster , faster , faster than the flame\n', 'Send us all to war tonight\n', 'Be prepared for sacrifice\n', 'This is our time , this is our raid\n', 'Embers left to those alive\n', 'Madness raised , we all ignite\n', 'This is our night , let fire reign\n', 'Incendere  ( cendere ) \n', 'Inflammatum  ( flammatum ) \n', 'Illuminatum\n', 'And we all rise against the damned\n', 'Stand

In [8]:
print(len(dataset.vocabulary))

126


In [9]:
print(dataset.max_length)

53


In [11]:
class HeadAttention(nn.Module):

    def __init__(self, d_model, d_queries, d_values, dropout, in_decoder=False):

        super(HeadAttention, self).__init__()

        self.d_model = d_model
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_keys = d_values # size of key vectors, same as of the query vectors to allow dot-products for similarity

        self.in_decoder = in_decoder

        self.create_queries = nn.Linear(d_model, d_queries, bias=True)
        self.create_values = nn.Linear(d_model, d_values, bias=True)
        self.create_keys = nn.Linear(d_model, d_values, bias=True)

        self.dropout = nn.Dropout(dropout)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input, real_lengths):

        batch_size = input.size(0) # (Batch, Sequences, d_model)

        queries = self.relu(self.create_queries(input)) # (Batch, Sequences, d_queries)
        keys = self.relu(self.create_keys(input)) # (Batch, Sequences, d_keys)
        values = self.relu(self.create_values(input)) # (Batch, Sequences, d_values)

        similarity_matrix = []

        for batch in range(batch_size):

            dot_product = torch.matmul(queries[batch], keys[batch].T)
            similarity_matrix.append(dot_product.unsqueeze(0))

        del dot_product

        similarity_matrix = torch.cat(similarity_matrix, 0) # (Batch, Sequences, Sequences)

        similarity_matrix = similarity_matrix/(math.sqrt(self.d_keys))

        # Applying mask of -inf to ignore padded keys ---> Actually using -1e6 to avoid NaNs

        if self.in_decoder:

            # In the decoder, masks are shifted left to right:
            # <Start-Of-Sentence> <PAD> <PAD> <PAD> ----> <Start-Of-Sentence> <prediction1> <prediction2> ... <End-Of-Sentence>
            # <Start-of-Sentence> [prediction1] [prediction2] [prediction3] ... <End-of-Sentence>

            mask = torch.zeros_like(similarity_matrix, device=device)

            for batch in range(similarity_matrix.size(0)):

                #mask[batch, :real_lengths[batch]] = 1
                mask[batch, real_lengths+1:] = 1 # For Batch = 1

                mask = mask.bool()

            similarity_matrix = similarity_matrix.masked_fill(mask, -1e6)

        else:

            mask = torch.zeros_like(similarity_matrix, device=device)

            for batch in range(similarity_matrix.size(0)):

                #mask[batch, real_lengths[batch]:] = 1
                mask[batch, real_lengths:] = 1 # For Batch = 1

                mask = mask.bool()

            similarity_matrix = similarity_matrix.masked_fill(mask, -1e6) # (Batch, Sequence, Sequence)

        del mask

        attention_weights = self.softmax(similarity_matrix) # (Batch, Sequences, Sequences)

        attention_weights = self.dropout(attention_weights)

        attention_output = torch.bmm(attention_weights, values) # (Batch, Sequences, d_values)

        return attention_output

In [12]:
class PositionWiseFeedForward(nn.Module):

    def __init__(self, d_model, d_inner, dropout):

        super(PositionWiseFeedForward, self).__init__()

        self.d_model = d_model
        self.d_inner = d_inner

        self.neuron1 = nn.Linear(d_model, d_inner)
        self.Relu = nn.ReLU()
        self.neuron2 = nn.Linear(d_inner, d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, attention_output_cat):

        sequences = self.neuron1(attention_output_cat)
        sequences = self.Relu(sequences)

        sequences = self.neuron2(self.dropout(sequences))

        output = sequences + attention_output_cat

        return output

In [22]:
class Encoder(nn.Module):

    def __init__(self, d_model, n_heads, d_queries, d_values, d_inner, dropout, masks=False, text_length=None):

        super(Encoder, self).__init__()

        self.n_heads = n_heads

        self.attention_heads = nn.ModuleList([HeadAttention(d_model, d_queries, d_values, dropout, masks) for i in range(n_heads)])

        self.neuron = nn.Linear(self.n_heads*d_values, d_model)

        self.position_wise_neuron = PositionWiseFeedForward(d_model, d_inner, dropout)

        self.dropout = nn.Dropout(dropout)

        # Adding scaling parameter for residual blocks: https://aclanthology.org/2020.emnlp-main.463.pdf

        self.layer_normA = nn.LayerNorm(d_model)
        self.residual_scalingA = nn.Parameter(torch.ones((1, text_length, d_model)))
        self.layer_normB = nn.LayerNorm(d_model)
        self.residual_scalingB = nn.Parameter(torch.ones((1, text_length, d_model)))

    def forward(self, encoder_input, real_input_length):

        residual_block1 = encoder_input # (Batch, Sequence, d_model) ---> Vectors

        vectors = self.dropout(encoder_input)

        attention_output = []

        for head in range(self.n_heads):

            x = self.attention_heads[head](vectors, real_input_length)

            attention_output.append(x)

            del x
        
        attention_output = torch.cat(attention_output, -1) # (Batch, Sequences, d_values*n_heads)

        attention_output = self.neuron(attention_output) # (Batch, Sequences, d_model)

        del vectors

        residual_block2 = attention_output

        #attention_output = residual_block1 + attention_output
        attention_output = residual_block1 + (attention_output * self.residual_scalingA)

        attention_output = self.dropout(attention_output)

        attention_output = self.layer_normA(attention_output)

        encoded_sequence = self.position_wise_neuron(attention_output)

        del attention_output

        #encoded_sequence = encoded_sequence + residual_block2
        encoded_sequence = (encoded_sequence * self.residual_scalingB) + residual_block2

        encoder_output = self.dropout(encoded_sequence)

        encoder_output = self.layer_normB(encoder_output)

        #print(residual_block1.var())
        #print(residual_block2.var())

        del encoded_sequence, residual_block1, residual_block2

        return encoder_output # (Batch, Sequences, d_model)

In [23]:
class Decoder(nn.Module):

    def __init__(self, d_model, n_heads, d_queries, d_values, d_inner, dropout, text_length=None):

        super(Decoder, self).__init__()

        self.n_heads = n_heads

        self.attention_headsA = nn.ModuleList([HeadAttention(d_model, d_queries, d_values, dropout, in_decoder=True) for i in range(n_heads)])
        self.attention_headsB = nn.ModuleList([HeadAttention(d_model, d_queries, d_values, dropout, in_decoder=False) for i in range(n_heads)])

        self.neuronA = nn.Linear(self.n_heads*d_values, d_model)
        self.neuronB = nn.Linear(self.n_heads*d_values, d_model)

        self.position_wise_neuron = PositionWiseFeedForward(d_model, d_inner, dropout)

        self.layer_normA = nn.LayerNorm(d_model)
        self.residual_scalingA = nn.Parameter(torch.ones((1, text_length, d_model)))
        self.layer_normB = nn.LayerNorm(d_model)
        self.residual_scalingB = nn.Parameter(torch.ones((1, text_length, d_model)))
        self.layer_normC = nn.LayerNorm(d_model)
        self.residual_scalingC = nn.Parameter(torch.ones((1, text_length, d_model)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, encoder_output, target_sequences, real_target_length):

        residual_block1 = target_sequences

        vectors = self.dropout(target_sequences)

        attention_output = []

        for head in range(self.n_heads):

            x = self.attention_headsA[head](vectors, real_target_length)

            attention_output.append(x)

            del x
        
        attention_output = torch.cat(attention_output, -1) # (Batch, Sequences, d_values*n_heads)

        attention_output = self.neuronA(attention_output) # (Batch, Sequences, d_model)

        del vectors

        residual_block2 = attention_output

        #attention_output = residual_block1 + attention_output
        attention_output = residual_block1 + (attention_output * self.residual_scalingA)

        attention_decoder = self.dropout(attention_output)

        attention_decoder = self.layer_normA(attention_decoder)
        
        attention_output = []

        for head in range(self.n_heads):

            x = self.attention_headsB[head](encoder_output, real_target_length)

            attention_output.append(x)

            del x

        attention_output = torch.cat(attention_output, -1) # (Batch, Sequences, d_values*n_heads)

        attention_output = self.neuronB(attention_output) # (Batch, Sequences, d_model)

        #attention_output = attention_output + residual_block2
        attention_output = residual_block2 + (attention_output * self.residual_scalingB)

        attention_encoder = self.dropout(attention_output)

        attention_encoder = self.layer_normB(attention_encoder)

        decoded_sequence = attention_encoder + attention_decoder

        residual_block3 = decoded_sequence

        decoded_sequence = self.position_wise_neuron(attention_output)

        del attention_output, attention_encoder, attention_decoder

        #decoded_sequence = decoded_sequence + residual_block3
        decoded_sequence = (decoded_sequence * self.residual_scalingC) + residual_block3

        decoder_output = self.dropout(decoded_sequence)

        decoder_output = self.layer_normC(decoder_output)

        #print(residual_block1.var())
        #print(residual_block2.var())
        #print(residual_block3.var())

        del decoded_sequence, residual_block1, residual_block2, residual_block3

        return decoder_output

In [16]:
def get_positional_encoding(d_model, max_length=100):
    """
    Computes positional encoding as defined in the paper.
    :param d_model: size of vectors throughout the transformer model
    :param max_length: maximum sequence length up to which positional encodings must be calculated
    :return: positional encoding, a tensor of size (1, max_length, d_model)
    """
    positional_encoding = torch.zeros((max_length, d_model))  # (max_length, d_model)
    for i in range(max_length):
        for j in range(d_model):
            if j % 2 == 0:
                positional_encoding[i, j] = math.sin(i / math.pow(10000, j / d_model))
            else:
                positional_encoding[i, j] = math.cos(i / math.pow(10000, (j - 1) / d_model))

    positional_encoding = positional_encoding.unsqueeze(0)  # (1, max_length, d_model)

    return positional_encoding

In [24]:
class Broca(nn.Module):

    """
    The generator, which will generate the words that she'll speak.
    """

    def __init__(self, text_length, vocab_size, positional_encoding, d_model=512, n_heads=8, d_queries=64, d_values=64, d_inner=2056, n_layers=6, dropout=0.1):

        super(Broca, self).__init__()

        self.vocab_size = vocab_size
        self.positional_encoding = positional_encoding.to(device)
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers

        self.positional_encoding.requires_grad = False

        self.embedding = nn.Embedding(vocab_size, self.d_model)

        #self.image_encoder = ImageEncoder(d_model)

        self.encoder = nn.ModuleList(
            Encoder(
                    d_model=d_model,
                    n_heads=n_heads,
                    d_queries=d_queries,
                    d_values=d_values,
                    d_inner=d_inner,
                    dropout=dropout,
                    text_length=text_length) for i in range(self.n_layers)
        )

        self.decoder = nn.ModuleList(
            Decoder(
                    d_model=d_model,
                    n_heads=n_heads,
                    d_queries=d_queries,
                    d_values=d_values,
                    d_inner=d_inner,
                    dropout=dropout,
                    text_length=text_length) for i in range(self.n_layers)
        )

        self.output_neuron = nn.Linear(self.d_model, vocab_size)

        #self.softmax = nn.LogSoftmax(-1)
        #self.softmax = nn.Softmax(-1) # Not really necessary.

    def preprocess_dialogue(self, input_text, target_text=False):

        if re.findall(',', input_text):

            input_text = input_text.replace(',', ' ,')

        if re.findall(';', input_text):

            input_text = input_text.replace(';', ' ;')
        
        if re.findall("'", input_text):

            input_text = input_text.replace("'", " ' ")

        if re.findall('.', input_text):

            input_text = input_text.replace('.', ' .')

        if re.findall(':', input_text):

            input_text = input_text.replace(':', ' :')

        if re.findall('!', input_text):

            input_text = input_text.replace('!', ' !')

        if re.findall('\?', input_text):

            input_text = input_text.replace('?', ' ?')

        if re.findall('\(', input_text):

            input_text = input_text.replace('(', ' ( ')

        if re.findall('\)', input_text):

            input_text = input_text.replace(')', ' ) ')

        if re.findall('"', input_text):

            input_text = input_text.replace('"', ' " ')

        if re.findall('-', input_text):

            input_text = input_text.replace('-', ' - ')

        input_text = input_text.split(' ')
        #print(input_text)

        for i in range(input_text.count('')):

            input_text.remove('')

        tokens = []

        if target_text:

            tokens.append(dataset.vocabulary.index("<SOS>"))
        
        for word in input_text:

            word = re.sub("[\s]", '', word) # \n

            value = dataset.vocabulary.index(word)
            tokens.append(value)
            tokens.append(dataset.vocabulary.index(' '))

        tokens = np.array(tokens)
        sentence_size = tokens.shape[0]

        if sentence_size < dataset.max_length:

            pad_size = dataset.max_length - sentence_size
            tokens = np.pad(tokens, [(0, 1)], constant_values=dataset.vocabulary.index("<EOS>"))
            tokens = np.pad(tokens, [(0, pad_size-1)], constant_values=dataset.vocabulary.index("<pad>"))

        tokens = torch.from_numpy(tokens)
        tokens = tokens.unsqueeze(0).to(device)

        return tokens, sentence_size


    def forward(self, input, target_sentence): # Target Sentence (with <SOS> token) provided before forward function

        if type(input) == str: # Dialogue

            input, real_input_length = self.preprocess_dialogue(input)
            target_sentence, target_input_length = self.preprocess_dialogue(target_sentence, target_text=True)

            encoder_vectors = self.embedding(input) * math.sqrt(self.d_model) + self.positional_encoding.to(device)

            decoder_vectors = self.embedding(target_sentence) * math.sqrt(self.d_model) + self.positional_encoding.to(device) # Embed out = (Batch, d_model)

        else: # Image reaction ---> Input is already the image encoded into vectors ---> For Alice.

            encoder_vectors = input.view(input.size(0), 1, -1)

            encoder_vectors = encoder_vectors * math.sqrt(self.d_model)
            
            decoder_vectors = encoder_vectors

        for layer in range(self.n_layers):

            encoder_vectors = self.encoder[layer](encoder_vectors, real_input_length)

            decoder_vectors = self.decoder[layer](encoder_vectors, decoder_vectors, target_input_length)

        output = self.output_neuron(decoder_vectors) # (Batch, sequence, vocab_size)

        #output = self.softmax(output) # (Batch, sequence, vocab_size)

        return output # Probability of words. Take the one with highest probability, add it to the target sentence and repeat.

In [18]:
print(len(dataset.vocabulary), dataset.max_length)

126 53


In [19]:
positional_encoding = get_positional_encoding(d_model=64, max_length=53)

print(positional_encoding.size())

torch.Size([1, 53, 64])


In [173]:
model = Broca(
    text_length=53,
    vocab_size=len(dataset.vocabulary),
    positional_encoding=positional_encoding,
    d_model=64,
    n_heads=8,
    d_queries=16,
    d_values=16,
    d_inner=128,
    n_layers=2,
    dropout=0.1
).to(device)

In [15]:
# Initialization --> Not necessary if your model is small and simple

for p in model.parameters():
    # Glorot initialization needs at least two dimensions on the tensor
    if p.dim() > 1:
        nn.init.xavier_uniform_(p, gain=1.)

nn.init.normal_(model.embedding.weight, 0, math.pow(512, -0.5))

with torch.no_grad():
    model.output_neuron.weight = model.embedding.weight

In [16]:
def get_lr(step, d_model, warmup_steps):
    """
    The LR schedule. This version below is twice the definition in the paper, as used in the official T2T repository.
    :param step: training step number
    :param d_model: size of vectors throughout the transformer model
    :param warmup_steps: number of warmup steps where learning rate is increased linearly; twice the value in the paper, as in the official T2T repo
    :return: updated learning rate
    """
    lr = 2. * (d_model ** -0.5) * min((step ** -0.5), step * (warmup_steps** -1.5))

    return lr

In [26]:
# Alternative LR schedule for warmup, based on gaussian distribution
# The LR begins near zero, ascends towards a peak(LR = 5~10) and then decreases towards zero.

def get_lr_gaussian(step, mean_step=5000, std=5000):
    '''
    Alternative LR schedule for warmup phase, based on gaussian distribution
    The LR begins near zero, ascends towards a peak(LR = 5~10) and then decreases towards zero.

    :param step: the current step. The X coordinate in the graphic of a bell curve.
    :param mean_step: the step at which the learning rate will reach its peak.
    :param std: the standard deviation of the gaussian distribution. Here, it's just a hyperparameter.
    '''

    x = step
    mean = mean_step

    A = ((x - mean)**2) / (2*(std**2))

    num = math.e ** - A
    den = std * math.sqrt(math.pi*2)

    y = (num/den) * (x/mean) * 1e5 # x/mean and 1e5 are scaling factors.

    return y

### Warmup Phase isn't really necessary if your model is small and simple.

In [16]:
print(get_lr(1, 64, 10000))
print(get_lr(10000, 64, 10000))
print(get_lr(20000, 64, 10000))
print(get_lr(25000, 64, 10000))

2.5e-07
0.0025
0.0017677669529663688
0.0015811388300841897


In [28]:
print(get_lr_gaussian(1, 5000, 5000))
print(get_lr_gaussian(1000, 5000, 5000))
print(get_lr_gaussian(5000, 5000, 5000))
print(get_lr_gaussian(10000, 5000, 5000))
print(get_lr_gaussian(20000, 5000, 5000))
print(get_lr_gaussian(30000, 5000, 5000))

0.0009680764746536077
1.1587662110459311
7.978845608028654
9.678828980765735
0.35454787295504064
0.00017840634176811586


In [174]:
#optimizer = torch.optim.Adam(model.parameters(), lr=get_lr(1, 512, 2000), betas=(0.9, 0.98), eps=1e-9)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocabulary.index("<pad>"), label_smoothing=0.1)
EPOCHS = 10000 # Using steps rather than epochs is more convenient here.
schedule_sampling = 0.9999 # Decaying factor. 0.9 decays faster than 0.999

In [143]:
def schedule_sampler(sampling_rate, output, target_encoded):

    '''
    Used to avoid exposure bias, which is caused by the use of target sentence during training,
    compromising the evaluation performance.
    https://arxiv.org/pdf/1906.07651.pdf

    For Transformer, perform 2 iterations per step, the first on target sentence, the second
    on mixed sentences(target + generated on first iteration).
    The sampling rate should begin low(only target sentences) and increase over time.

    Less efficient than Reinforcement Learning(ain't that right, ChatGPT?)
    '''

    mixed_target = target_encoded.clone()

    for i in range(len(target_encoded)):

        if torch.rand((1,)) > sampling_rate:

            one_hot = torch.zeros_like(target_encoded)
            one_hot[i] = output[i].argmax(-1)

            mixed_target[i] = one_hot[i]

    mixed_target = [dataset.vocabulary[i] for i in target_encoded]
    mixed_target = " ".join(mixed_target)
    mixed_target = mixed_target.replace("<SOS>", "").replace("<EOS>", "").replace("<pad>", "")
    # Removing double spaces
    mixed_target = "".join(mixed_target)
    mixed_target = " ".join(mixed_target.split())

    return mixed_target

In [163]:
teste = dataset.phrases[1]
target = dataset.phrases[2]

In [89]:
out = model(teste, target)

In [90]:
target, target_length = model.preprocess_dialogue(target, target_text=True)

In [75]:
print(target)

tensor([[125,  35,   1,  36,   1,  37,   1,  33,   1,  38,   1,  39,   1, 124,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]],
       device='cuda:0', dtype=torch.int32)


In [160]:
def generate_sentence(input):

    target_indices = [dataset.vocabulary.index("<SOS>")]

    target_sentence = ""

    with torch.no_grad():

        for i in range(100):

            try:

                output = model(input, target_sentence)

            except RuntimeError: # Triggered when encoded target sentence > dataset max length

                break
            
            output = output[0, :len(target_indices)]

            next_token = output[-1].argmax(-1).item()

            if next_token == dataset.vocabulary.index("<EOS>"):
                break

            target_indices.append(next_token)

            target_sentence = [dataset.vocabulary[target_indices[i]] for i in range(len(target_indices))]
            target_sentence = " ".join(target_sentence)
            target_sentence = target_sentence.replace("<SOS>", "").replace("<EOS>", "").replace("<pad>", "")
            target_sentence = "".join(target_sentence)
            target_sentence = " ".join(target_sentence.split())

            #target_sentence += dataset.vocabulary[next_token]

    #target_sentence = target_sentence.replace("<SOS>", "").replace("<EOS>", "").replace("<pad>", "")

    return target_sentence

In [167]:
generated_text = generate_sentence(teste)

print(teste)
print(generated_text)

Faster , faster , faster than the flame

Illuminatum Illuminatum prepared


In [40]:
print(len(dataset.phrases))

40


In [None]:
step = 0

for epoch in range(1, EPOCHS+1):

    epoch_loss = 0

    batches = torch.randperm(len(dataset.phrases)-1)

    for i, idx in enumerate(batches, start=1):

        inputext = dataset.phrases[idx]
        targetext = dataset.phrases[idx+1]

        input, input_length = model.preprocess_dialogue(inputext)
        target, target_length = model.preprocess_dialogue(targetext, target_text=True)

        model.zero_grad()

        possibilities = model(inputext, targetext) # (Batch, sequences, vocab_size)
        possibilities_nopad = possibilities[0, :target_length] # (Sequences, vocab_size)
        target = target.squeeze(0)

        first_loss = 0.

        for item in range(len(possibilities_nopad)):

            first_loss += criterion(possibilities_nopad[item], target[item].long())

        epoch_loss += first_loss.item()

        first_loss.backward()

        optimizer.step()

        model.zero_grad()

        sampling_rate = 1 - (schedule_sampling ** step)

        new_targetext = schedule_sampler(sampling_rate, possibilities.squeeze(0), target)
        new_target, _ = model.preprocess_dialogue(new_targetext, target_text=True)
        new_target = new_target.squeeze(0)


        possibilities = model(inputext, new_targetext)
        possibilities_nopad = possibilities[0, :target_length]

        second_loss = 0.

        for item in range(len(possibilities_nopad)):

            second_loss += criterion(possibilities_nopad[item], target[item].long())

        epoch_loss += second_loss.item()

        second_loss.backward()

        optimizer.step()

        step += 1

        grads = torch.mean(model.encoder[0].attention_heads[0].create_queries.weight.grad)
        grads2 = torch.mean(model.output_neuron.weight.grad)

        if step % 100 == 0:

            print(f"{epoch}/{EPOCHS}\tCurrent Step: {step}")
            print(f"Current Sampling Rate: {sampling_rate}")
            print(f"Last First Iter Loss: {first_loss.item()}\tSecond Iter Loss: {second_loss.item()}")
            print(f"Total Epoch Loss: {epoch_loss/(len(batches))}")
            print(f"First Layer Gradients Average: {grads}")
            print(f"Final Layer Gradients Average: {grads2}")

            decoded_out = [dataset.vocabulary[possibilities_nopad[i].argmax(-1)] for i in range(len(possibilities_nopad))]
            decoded_out = " ".join(decoded_out)
            decoded_out = decoded_out.replace("<SOS>", "").replace("<EOS>", "").replace("<pad>", "")
            decoded_out = "".join(decoded_out)
            decoded_out = " ".join(decoded_out.split())

            print(f"Input Text: {inputext}")
            print(f"Output: {decoded_out}")
            print(f"Target Text: {targetext}")

            generated_text = generate_sentence(inputext)

            print(f"Evaluation Output: {generated_text}")

In [179]:
print(f"{epoch}/{EPOCHS}\tCurrent Step: {step}")
print(f"Current Sampling Rate: {sampling_rate}")
print(f"Last First Iter Loss: {first_loss.item()}\tSecond Iter Loss: {second_loss.item()}")
print(f"Total Epoch Loss: {epoch_loss/(len(batches))}")
print(f"First Layer Gradients Average: {grads}")
print(f"Final Layer Gradients Average: {grads2}")

decoded_out = [dataset.vocabulary[possibilities_nopad[i].argmax(-1)] for i in range(len(possibilities_nopad))]
decoded_out = " ".join(decoded_out)
decoded_out = decoded_out.replace("<SOS>", "").replace("<EOS>", "").replace("<pad>", "")
decoded_out = "".join(decoded_out)
decoded_out = " ".join(decoded_out.split())

print(f"Input Text: {inputext}")
print(f"Output: {decoded_out}")
print(f"Target Text: {targetext}")

generated_text = generate_sentence(inputext)

print(f"Evaluation Output: {generated_text}")

1/10000	Current Step: 0
Current Sampling Rate: 0.8325699054762894
Last First Iter Loss: 9.208990097045898	Second Iter Loss: 9.04998779296875
Total Epoch Loss: 0.0
First Layer Gradients Average: -2.696774856758566e-07
Final Layer Gradients Average: -4.1392117511307447e-10
Input Text: Hold the pastor together , pastor by the chain

Output: Restless is the world alight
Target Text: And at night we 're going wild

Evaluation Output: Illuminatum at Hold lights ' re going no sin at can deny no at going at we at deny at at no no no we we we
