<a href="https://colab.research.google.com/github/MessiNN/chatbot-transformer-Early_Stage-/blob/master/Transformer_Architecture_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR


import random
import re
import os
import unicodedata
import itertools
import pandas as pd


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_cpu = torch.device("cpu")

save_dir = os.path.join("/content/drive/MyDrive", "data", "save")

PAD_token = 0
SOS_token = 1
EOS_token = 2

MAX_LENGTH = 20
MIN_COUNT = 3

Mounted at /content/drive


In [39]:
class Library:
    def __init__(self):
        self.name = "Dataset"
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)


        print(len(keep_words), len(self.word2index), len(keep_words), len(self.word2index))
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3

        for word in keep_words:
            self.addWord(word)

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

def readVocs(datafile):
    pairs = []
    df = pd.read_parquet(datafile)
    questions = df['question'].tolist()
    responses = df['response'].tolist()
    for question, response in zip(questions, responses):
      question = normalizeString(question)
      response = normalizeString(response)
      pair = [question, response]
      pairs.append(pair)
    return pairs

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH - 1 and len(p[1].split(' ')) < MAX_LENGTH - 1

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

def loadPrepareData(datafile, save_dir):
    libra = Library()
    pairs = readVocs(datafile)
    pairs = filterPairs(pairs)
    for pair in pairs:
        libra.addSentence(pair[0])
        libra.addSentence(pair[1])
    return libra, pairs


def trimRareWords(libra, pairs, MIN_COUNT):
    libra.trim(MIN_COUNT)
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        for word in input_sentence.split(' '):
            if word not in libra.word2index:
                keep_input = False
                break
        for word in output_sentence.split(' '):
            if word not in libra.word2index:
                keep_output = False
                break

        if keep_input and keep_output:
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs

def indexesFromSentence(libra, sentence):
    return [SOS_token] + [libra.word2index[word] for word in sentence.split(' ') if word in libra.word2index] + [EOS_token]

def create_padding_mask(inputs):
    padding_mask = (inputs != 0)
    return padding_mask

def create_look_ahead_mask(size):
    mask = (1 - torch.triu(torch.ones(size, size), diagonal=1)).bool()
    return mask

def zeroPadding(l, fillvalue=PAD_token):
    padded_list = []
    for sequence in l:
        padded_sequence = list(sequence) + [fillvalue] * ((MAX_LENGTH) - len(sequence))
        padded_list.append(padded_sequence)
    return padded_list

def binaryMatrix(l, value=PAD_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask

def batch2TrainData(voc, pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    output, mask = outputVar(output_batch, voc)
    return inp, output, lengths, mask

In [101]:
#Own code implementation (By me)

class PositionalEncoding(nn.Module):
    def __init__(self, position, d_model):
        super(PositionalEncoding, self).__init__()
        self.register_buffer('pos_encoding', self.positional_encoding(position, d_model))

    def get_angles(self, position, i, d_model):
        angles = 1 / torch.pow(10000, (2 * (i // 2)) / torch.FloatTensor([d_model]))
        return position * angles

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(
            position=torch.arange(position).unsqueeze(1),
            i=torch.arange(d_model).unsqueeze(0),
            d_model=d_model)

        # apply sin to even indices in the array; 2i
        sines = torch.sin(angle_rads[:, 0::2])
        # apply cos to odd indices in the array; 2i+1
        cosines = torch.cos(angle_rads[:, 1::2])

        pos_encoding = torch.cat([sines, cosines], dim=-1)
        pos_encoding = pos_encoding.unsqueeze(0)
        return pos_encoding

    def forward(self, inputs):
        return inputs + self.pos_encoding[:, :inputs.size(1), :]



class Normalize(nn.Module):
    def __init__(self, scale: float, shift: float, epsilon: float = 1e-8):
        super(Normalize, self).__init__()
        self.scale = scale
        self.shift = shift
        self.epsilon = epsilon

    def forward(self, x):
        mean = torch.mean(x)
        deviation = torch.std(x) + self.epsilon
        x = (x - mean) / deviation
        x = x * self.scale
        x = x + self.shift
        return x


#---- Self Made Multi Head Attention Implementation ----
class Attention(nn.Module):
    def __init__(self, embedding_size, head_num):
        super(Attention, self).__init__()

        self.embedding_size = embedding_size
        self.num_heads = head_num
        self.head_size = embedding_size // self.num_heads


        self.attn = nn.Linear(embedding_size, embedding_size)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        energy = torch.sum(hidden * energy, dim=2)
        return energy

    def forward(self, hidden, encoder_outputs):
        attn_energies = self.general_score(hidden, encoder_outputs)

        attn_energies = attn_energies.t()

        attn_energies = F.softmax(attn_energies, dim=1).unsqueeze(1)

        return attn_energies


#----Self Made Layer Encoder Implementation----
class Encoder_Layer(nn.Module):
    def __init__(self, embedding_size, head_num, batch_size, dropout):
        super(Encoder_Layer, self).__init__()

        self.embedding_size = embedding_size

        self.fc1 = nn.Linear(embedding_size*2, embedding_size)

        self.norm = Normalize(0.4, 0.4)
        self.dropout = nn.Dropout(dropout)

    def forward(self, rnn_normalized):

        #-----First Sub Layer Attention Mechanism-----#

        rnn_normalized = self.dropout(rnn_normalized)
        # each word will independently have a chance to re-initilise it's representation ( [0.1, 0.3, -0.1, 0.2] ) with all 0s

        attentionNormalized = self.norm(rnn_normalized)
        # It should be normalizing, scaling and shifting

        #-----First Sub Layer Attention Mechanism-----#

        return attentionNormalized


#----Self Made Encoder Implementation----
class Encoder(nn.Module):
    def __init__(self, embedding, embedding_size, head_num, batch_size, dropout, n_layers, vocab_size):
        super(Encoder, self).__init__()

        self.num_layers = n_layers
        self.positional_encoding = PositionalEncoding(vocab_size, embedding_size)
        self.encoder_layer = Encoder_Layer(embedding_size, head_num, batch_size, dropout)

        self.embedding = embedding
        self.embedding_size = embedding_size

        self.gru = nn.GRU(embedding_size, embedding_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True, batch_first=True)
        self.fc1 = nn.Linear(embedding_size*2, embedding_size)

        self.dropout = nn.Dropout(dropout)
        self.norm = Normalize(0.4, 0.4)

    def forward(self, source_tensor, lengths, hidden = None):
        # shape: batch / length

        source_embedding = self.embedding(source_tensor).to(device)
        # shape: batch / length / embedding_size
        source_embedding *= torch.sqrt(torch.tensor(self.embedding_size, dtype=torch.float32))
        source_PosEmbedding = self.positional_encoding(source_embedding).to(device)

        packed = nn.utils.rnn.pack_padded_sequence(source_PosEmbedding, lengths, batch_first=True) # Hides paddded sequences for effiecinet computations
        rnn_outputs, hidden = self.gru(packed, hidden)
        rnn_outputs, _ = nn.utils.rnn.pad_packed_sequence(rnn_outputs) # Show padded sequences again

        # we add pos to x's high dimensional space as context for each word position (cannot be seen as 1 more dimension)
        # shape: batch, length / embedding_size / --high-->  pos
        # "What": [0.1, 0.3, -0.1, 0.2] + Position 1: [0.1, 0.2, -0.1, -0.2] = [0.2, 0.5, -0.2, 0.0]

        rnn_outputs = self.dropout(rnn_outputs).to(device)
        rnn_normalized = self.norm(rnn_outputs).to(device)
        # each word will independently have a chance to re-initilise it's representation ( [0.1, 0.3, -0.1, 0.2] ) with all 0s

        rnn_normalized = self.encoder_layer(rnn_normalized)
        # for each encoder_layer generate an output quite simple.

        dense = self.fc1(rnn_normalized)

        return dense, hidden

#----Self Made Layer Decoder Implementation----
class Decoder_Layer(nn.Module):
    def __init__(self, embedding_size, head_num, batch_size, dropout):
        super(Decoder_Layer, self).__init__()

        self.embedding_size = embedding_size

        self.fc1 = nn.Linear(embedding_size, embedding_size, bias=False)
        self.fc2 = nn.Linear(embedding_size, embedding_size, bias=False)

        self.attention = Attention(embedding_size, head_num)


        self.norm = Normalize(0.4, 0.4)
        self.dropout = nn.Dropout(dropout)

    def forward(self, encoder_output, hidden):

        #-----First Sub Layer Attention Mechanism-----#

        attn_weights = self.attention(hidden, encoder_output)

        attn_weights = self.dropout(attn_weights)

        attn_weights = self.norm(attn_weights)
        # It should be normalizing, scaling and shifting

        #-----First Sub Layer Attention Mechanism-----#

        return attn_weights


#----Self Made Decoder Implementation----
class Decoder(nn.Module):
    def __init__(self, embedding, embedding_size, head_num, batch_size, dropout, n_layers, vocab_size):
        super(Decoder, self).__init__()
        self.embedding = embedding
        self.embedding_size = embedding_size
        self.num_layers = n_layers
        self.vocab_size = vocab_size

        self.positional_encoding = PositionalEncoding(vocab_size, embedding_size)

        self.fc = nn.Linear(embedding_size, vocab_size, bias=False)
        self.fc2 = nn.Linear(vocab_size, embedding_size, bias=False)

        self.gru = nn.GRU(embedding_size, embedding_size, n_layers, dropout=(0 if n_layers == 1 else dropout))

        self.decoder_layer = Decoder_Layer(embedding_size, head_num, batch_size, dropout)
        self.concat = nn.Linear(embedding_size * 2, embedding_size)
        self.out = nn.Linear(embedding_size, vocab_size)

        self.norm = Normalize(0.4, 0.4)
        self.dropout = nn.Dropout(dropout)

    def forward(self, decoder_input, encoder_output, hidden_inf):
        #INPUTS
        # decoder_input shape: batch / length
        # enc_output shape: batch / length / embedding_size

        #MASKS
        # ahead shape: length / length
        # mask shape: batch / length / embedding_size

        inputEmbedding = self.embedding(decoder_input).to(device)
        # shape: batch / length / embedding_size
        inputEmbedding *= torch.sqrt(torch.tensor(self.embedding_size, dtype=torch.float32))
        input_PosEmbedding = self.positional_encoding(inputEmbedding).to(device)

        rnn_output, hidden = self.gru(input_PosEmbedding, hidden_inf)

        rnn_output = self.dropout(rnn_output)

        rnn_output = self.norm(rnn_output)

        attn_weights = self.decoder_layer(encoder_output, rnn_output)

        context = attn_weights.bmm(encoder_output.transpose(0, 1))

        context = context.squeeze(1)

        rnn_output = rnn_output.squeeze(0)

        concat_input = torch.cat((rnn_output, context), 1)

        concat_output = torch.tanh(self.concat(concat_input))

        output = self.out(concat_output)

        output = F.softmax(output, dim=1)

        return output, hidden


def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [92]:
def train(input_variable, target_variable, vocab_size, decoder, encoder, clip,
          encoder_optimizer, decoder_optimizer, embedding_size,head_num, batch_size, lengths, mask, max_length=MAX_LENGTH):

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss = 0
    print_losses = []
    n_totals = 0


    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]]).to(device)

    encoder_output, hidden = encoder(input_variable, lengths)
    decoder_hidden = hidden[:decoder.num_layers]

    choice = random.random()


    if choice > 0.5:
      use_teacher_forcing = True
    else:
      use_teacher_forcing = False

    target_variable = target_variable.t()
    mask = mask.t()

    if use_teacher_forcing:
        for t in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, encoder_output, decoder_hidden
            )
            decoder_input = target_variable[t].view(1, -1)

            mask_loss, n_total = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * n_total)
            n_totals += n_total

    elif use_teacher_forcing == False:
        for t in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, encoder_output, decoder_hidden
            )
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)

            mask_loss, n_total = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * n_total)
            n_totals += n_total

    loss.backward()

    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [88]:
def trainIters(model_name, libra, pairs, save_dir, n_iteration, batch_size, checkpoint, clip,
               print_every, save_every, loadFilename, vocab_size, decoder, encoder, head_num, dropout,
               decoder_optimizer, encoder_optimizer, embedding, embedding_size):

    print("Creating the training batches...")
    training_pairs = [batch2TrainData(libra,[random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    start_iteration = 1
    tries = 0

    if loadFilename:
        tries = checkpoint['time']


    print("Initializing Training...")
    print()
    for iteration in range(start_iteration, n_iteration + 1):
        training_pair = training_pairs[iteration - 1]

        input_variable, target_variable, lengths, mask = training_pair
        # batch / length

        input_variable = input_variable.to(device)
        target_variable = target_variable.to(device)

        sum_loss = train(input_variable, target_variable, vocab_size, decoder, encoder, clip,
          encoder_optimizer, decoder_optimizer, embedding_size, head_num, batch_size, lengths, mask)

        if iteration % print_every == 0:
            print("Epoch [{}/{}]\tLoss: {:.3f}".format(iteration, n_iteration,  sum_loss))


        if (iteration % save_every == 0):
                    tries += save_every
                    directory = os.path.join(save_dir, model_name, '{}-{}_{}'.format(embedding_size, head_num, vocab_size))
                    if not os.path.exists(directory):
                        os.makedirs(directory)
                    torch.save({
                        'iteration': iteration,
                        'time': tries,
                        'en': encoder.state_dict(),
                        'de': decoder.state_dict(),
                        'en_opt': encoder_optimizer.state_dict(),
                        'de_opt': decoder_optimizer.state_dict(),
                        'loss': sum_loss,
                        'voc_dict': libra.__dict__,
                        'embedding': embedding.state_dict()
                    }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [6]:
def evaluate(input_sentence, decoder, encoder, libra, embedding_size):
    # Tokenize the sentence
    indexedSequence = [indexesFromSentence(libra, input_sentence)]
    paddedSequence = zeroPadding(indexedSequence)
    padded_tensor = torch.LongTensor(paddedSequence)

    padding_mask = create_padding_mask(padded_tensor)

    # Convert to tensor and add batch dimension
    sentence_tensor = torch.tensor(padded_tensor, dtype=torch.long)

    # Initialize output tensor with start token
    decoder_input = torch.LongTensor([SOS_token for _ in range(1)])
    decoder_input = decoder_input.t()

    encoder_output = encoder(sentence_tensor, padding_mask)

    for i in range(MAX_LENGTH):
        with torch.no_grad():
            decoder_output = decoder(decoder_input, encoder_output, padding_mask)

        predicted_id = torch.argmax(decoder_output, axis=-1)

        decoder_input = predicted_id

    return decoder_input.squeeze(0)


def predict(input_sentence, decoder, encoder, libra, embedding_size):
    prediction = evaluate(input_sentence, decoder, encoder, libra, embedding_size)
    predicted_sentence = [libra.index2word[index.item()] for index in prediction if index.item() < libra.num_words]
    return predicted_sentence


def evaluateInput(decoder, encoder, libra, embedding_size):
    input_sentence = ''
    while(1):
        try:
            input_sentence = input('User > ')
            if input_sentence == 'q' or input_sentence == 'quit': break
            input_sentence = normalizeString(input_sentence)
            output_words = predict(input_sentence, decoder, encoder, libra, embedding_size)
            print('Cleopatra:', ' '.join(output_words))
        except KeyError:
            print("Error: Encountered unknown word.")

In [7]:
parquet_path = "/content/drive/MyDrive/movie-corpus/movie-corpus/0000.parquet"
libra, pairs = loadPrepareData(parquet_path, save_dir)
pairs = trimRareWords(libra, pairs, MIN_COUNT)

2435 10293 2435 10293
keep_words 2435 / 10293 = 0.2366
Trimmed from 2661 pairs to 340, 0.1278 of total


In [100]:
model_name = 'Cleopatra_model'
checkpoint=None
start_model = "no"
loadFilename = None if start_model == "no" else "/content/drive/MyDrive/data/save/Cleopatra_model/256-4_5442/20000_checkpoint.tar"

if loadFilename:
    print("Set to: 'trained model'")
    checkpoint = torch.load(loadFilename, map_location=device)
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    libra.__dict__ = checkpoint['voc_dict']
    print("Loss: ",checkpoint["loss"])
    print("Time: ",checkpoint["time"])
else:
    print("Set to: 'new model'")

encoder_n_layers = 2
decoder_n_layers = 4
embedding_size = 512
head_num = 8

if embedding_size % head_num != 0:
    raise ValueError("embedding_size / head_num must result in an integer")

dropout = 0.05
batch_size = 10
learning_rate = 0.0001
vocab_size = libra.num_words

task = "train"


embedding = nn.Embedding(vocab_size, embedding_size)
decoder = Decoder(embedding, embedding_size, head_num, batch_size, dropout, decoder_n_layers, vocab_size)
encoder = Encoder(embedding, embedding_size, head_num, batch_size, dropout, encoder_n_layers, vocab_size)


if loadFilename:
    embedding.load_state_dict(embedding_sd)

if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

embedding = embedding.to(device)
encoder = encoder.to(device)
decoder = decoder.to(device)

if task == "train":
    encoder.train()
    decoder.train()
else:
    encoder.eval()
    decoder.eval()

decoder_optimizer = optim.AdamW(decoder.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-8)
encoder_optimizer = optim.AdamW(encoder.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-8)

if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

clip = 5.0
n_iteration = 1000
print_every = 1
save_every = 10000

if task == "train":
    trainIters(model_name, libra, pairs, save_dir, n_iteration, batch_size, checkpoint, clip,
               print_every, save_every, loadFilename, vocab_size, decoder, encoder, head_num, dropout,
               decoder_optimizer, encoder_optimizer, embedding, embedding_size)

if task == "test":
    evaluateInput(decoder, encoder, libra, embedding_size)

Set to: 'new model'
Creating the training batches...
Initializing Training...

Epoch [1/1000]	Loss: nan
Epoch [2/1000]	Loss: nan
Epoch [3/1000]	Loss: nan
Epoch [4/1000]	Loss: nan
Epoch [5/1000]	Loss: 7.669
Epoch [6/1000]	Loss: 7.642
Epoch [7/1000]	Loss: 7.610
Epoch [8/1000]	Loss: nan
Epoch [9/1000]	Loss: nan
Epoch [10/1000]	Loss: 7.382
Epoch [11/1000]	Loss: 7.243
Epoch [12/1000]	Loss: 7.349
Epoch [13/1000]	Loss: nan
Epoch [14/1000]	Loss: nan
Epoch [15/1000]	Loss: nan
Epoch [16/1000]	Loss: nan
Epoch [17/1000]	Loss: nan
Epoch [18/1000]	Loss: 6.550
Epoch [19/1000]	Loss: 6.440
Epoch [20/1000]	Loss: 6.409
Epoch [21/1000]	Loss: 6.480
Epoch [22/1000]	Loss: nan
Epoch [23/1000]	Loss: nan
Epoch [24/1000]	Loss: nan
Epoch [25/1000]	Loss: nan
Epoch [26/1000]	Loss: 6.317
Epoch [27/1000]	Loss: nan
Epoch [28/1000]	Loss: nan
Epoch [29/1000]	Loss: 5.662
Epoch [30/1000]	Loss: nan
Epoch [31/1000]	Loss: 6.170
Epoch [32/1000]	Loss: 5.954


KeyboardInterrupt: 