<a href="https://colab.research.google.com/github/MessiNN/chatbot-transformer-Early_Stage-/blob/master/Transformer_Architecture_v1_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import random
import re
import os
import yaml
import csv
import unicodedata
import itertools
import pandas as pd


# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Explicit CPU device, used by the search decoders and by the non-training
# branch of the driver script.
device_cpu = torch.device("cpu")

# Directory on the mounted Google Drive under which checkpoints are written.
save_dir = os.path.join("/content/drive/MyDrive", "data", "save")

# Reserved vocabulary indices for the special tokens.
PAD_token = 0  # filler appended by Padding()
SOS_token = 1  # prepended to every sentence; first decoder input
EOS_token = 2  # appended to every sentence

# Length every tokenized sentence is padded to (SOS and EOS included).
MAX_LENGTH = 40

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
class Library:
    """Vocabulary: word <-> index lookup tables plus per-word counts.

    The three special tokens (PAD, SOS, EOS) are pre-registered in both
    lookup tables but have no entry in ``word2count``.
    """

    def __init__(self):
        self.name = "Dataset"
        self.trimmed = False
        specials = (("PAD", PAD_token), ("SOS", SOS_token), ("EOS", EOS_token))
        self.word2index = {label: index for label, index in specials}
        self.word2count = {}
        self.index2word = {index: label for label, index in specials}
        # Next free index; the three special tokens are already counted.
        self.num_words = 3

    def addSentence(self, sentence):
        """Register every space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Bump the count of a known word, or assign a new word the next index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.num_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.num_words = new_index + 1

# unicodeToAscii decomposes each character (NFD) and drops the combining accent marks, so "é" becomes "e".
# On its own it does NOT remove non-Latin text such as "你好，안녕하세요，こんにちは"; those characters are stripped later by the regular expressions in normalizeString, which keep only the letters A-Z.
def unicodeToAscii(string):
    """Return ``string`` with combining marks (Unicode category 'Mn') removed.

    The text is first decomposed with NFD so that accented letters split into
    a base letter followed by its combining mark(s).
    """
    decomposed = unicodedata.normalize('NFD', string)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)


def normalizeString(sentence):
    """Lower-case ``sentence`` and reduce it to space-separated a-z words.

    A dict argument is first converted to its string form. Accents are
    stripped, '.', '!' and '?' become spaces, every other run of non-letter
    characters becomes a single space, and surrounding whitespace is removed.
    """
    text = f"{sentence}" if isinstance(sentence, dict) else sentence
    text = unicodeToAscii(text.lower().strip())
    substitutions = (
        (r"([.!?])", r" "),           # sentence punctuation -> space
        (r"[^a-zA-Z.!?]+", r" "),     # any other non-letter run -> space
        (r"[^\x00-\x7F]", r""),       # drop anything still outside ASCII
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return re.sub(r"\s+", r" ", text).strip()

def removePair(pairs):
    """Keep only pairs whose first two fields are both non-empty.

    Each returned pair is a fresh two-element list ``[left, right]``; fields
    beyond the second are ignored. Entries with fewer than two fields are
    skipped instead of raising, so one malformed input line cannot abort the
    whole load.
    """
    return [
        [pair[0], pair[1]]
        for pair in pairs
        if len(pair) >= 2 and pair[0] and pair[1]
    ]

def readCsv(datafile):
    """Read question/response pairs from columns 1 and 2 of a CSV file.

    The first row is treated as a header and skipped. Rows with fewer than
    three columns are ignored. Both fields are normalized and pairs with an
    empty side are dropped.
    """
    pairs = []
    # newline='' lets the csv module handle line breaks inside quoted fields.
    with open(datafile, 'r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader, None)  # skip the header; tolerate an empty file
        for row in csv_reader:
            # row[2] is read below, so at least three columns are required.
            if len(row) >= 3:
                pairs.append([normalizeString(row[1]), normalizeString(row[2])])
    return removePair(pairs)


def readPanda(datafile):
    """Load a parquet file and return its normalized, non-empty pairs.

    The file must provide 'question' and 'response' columns; rows are paired
    positionally.
    """
    frame = pd.read_parquet(datafile)
    question_column = frame['question'].tolist()
    response_column = frame['response'].tolist()
    normalized = [
        [normalizeString(question), normalizeString(response)]
        for question, response in zip(question_column, response_column)
    ]
    return removePair(normalized)

def readTxt(datafile):
    """Read tab-separated sentence pairs, one pair per line, from a UTF-8 file.

    Every field is normalized. Lines that contain no tab (and therefore
    cannot form a pair) are skipped, and pairs with an empty side are
    dropped.
    """
    # The context manager closes the file handle, which the previous
    # open(...).read() chain never did.
    with open(datafile, encoding='utf-8') as handle:
        lines = handle.read().strip().split('\n')
    pairs = []
    for line in lines:
        fields = line.split('\t')
        if len(fields) < 2:
            continue
        pairs.append([normalizeString(field) for field in fields])
    return removePair(pairs)

def readYml(datafile):
    """Read conversations from a YAML file and return [question, answer] pairs.

    Each entry under the top-level "conversations" key is a list whose first
    item is the question and whose remaining items are answers; one pair is
    produced per answer. Entries with no answer are skipped. Pairs with an
    empty side after normalization are dropped, matching the other readers.
    """
    with open(datafile, "r") as file:
        data = yaml.safe_load(file.read())
    # safe_load returns None for an empty document, and the key may be
    # present with no value.
    conversations = (data or {}).get("conversations") or []
    pairs = []
    for sentences in conversations:
        if not sentences or len(sentences) < 2:
            continue
        question = normalizeString(sentences[0])
        # Slice instead of pop(0) so the loaded data is not mutated.
        for answer in sentences[1:]:
            pairs.append([question, normalizeString(answer)])
    return removePair(pairs)

def customData(datafile):
    """Read pairs from a UTF-8 text file with one "source\\ttarget" per line.

    The separator is the two-character sequence backslash + 't' written
    literally in the file, not a real tab character. Lines without the
    separator are skipped. Both sides are normalized and pairs with an empty
    side are dropped.
    """
    with open(datafile, encoding='utf-8') as handle:
        lines = handle.read().strip().split("\n")
    pairs = []
    for line in lines:
        fields = line.split("\\t")
        if len(fields) < 2:
            # No separator on this line: there is no target to pair with.
            continue
        pairs.append([normalizeString(fields[0]), normalizeString(fields[1])])
    return removePair(pairs)

def filterPair(p):
    """Return True when both sentences of ``p`` have fewer than MAX_LENGTH - 1 words.

    The limit leaves room for the SOS and EOS tokens that SentenceToNum adds.
    """
    word_limit = MAX_LENGTH - 1
    for side in (0, 1):
        if len(p[side].split(' ')) >= word_limit:
            return False
    return True

def filterPairs(pairs):
    """Return the pairs accepted by filterPair, in their original order."""
    return list(filter(filterPair, pairs))

def loadPrepareData(libra, option):
    """Load sentence pairs, register their words in ``libra``, filter by length.

    ``option`` selects the source: 1 parquet, 2 csv, 3 txt, 4 custom text,
    5 yml. Any other value loads all five sources and concatenates them in
    the order txt, csv, parquet, custom, yml.

    The vocabulary is built from every loaded pair before the length filter
    runs, so words that occur only in pairs dropped by the filter still
    receive an index.
    """
    sources = (
        (3, readTxt, "/content/drive/MyDrive/movie-corpus/movie-corpus/formatted_movie_lines.txt"),
        (2, readCsv, "/content/drive/MyDrive/movie-corpus/movie-corpus/Conversation.csv"),
        (1, readPanda, "/content/drive/MyDrive/movie-corpus/movie-corpus/0000.parquet"),
        (4, customData, "/content/drive/MyDrive/movie-corpus/movie-corpus/customdata.txt"),
        (5, readYml, "/content/drive/MyDrive/movie-corpus/movie-corpus/Topics/full/alltopics.yml"),
    )

    selected = [(reader, path) for key, reader, path in sources if option == key]
    if not selected:
        # Unknown option: fall back to merging every source.
        selected = [(reader, path) for _, reader, path in sources]

    pairs = []
    for reader, path in selected:
        pairs.extend(reader(path))

    for pair in pairs:
        for sentence in (pair[0], pair[1]):
            libra.addSentence(sentence)

    return filterPairs(pairs)




def SentenceToNum(libra, sentence):
    """Convert ``sentence`` to vocabulary indices wrapped in SOS ... EOS.

    Words missing from ``libra.word2index`` are silently skipped.
    """
    indices = [SOS_token]
    for word in sentence.split(' '):
        if word in libra.word2index:
            indices.append(libra.word2index[word])
    indices.append(EOS_token)
    return indices

def Padding(batch):
    """Right-pad every sequence in ``batch`` with PAD_token up to MAX_LENGTH.

    Sequences already at or beyond MAX_LENGTH are copied unchanged (they are
    not truncated).
    """
    return [
        list(sequence) + [PAD_token] * (MAX_LENGTH - len(sequence))
        for sequence in batch
    ]


def BinaryMask(batch):
    """Return a same-shaped nested list with 0 at PAD positions and 1 elsewhere."""
    return [
        [0 if token == PAD_token else 1 for token in sequence]
        for sequence in batch
    ]

def inputVar(batch, libra):
    """Tokenize and pad a batch of source sentences.

    Returns the padded index tensor (one row per sentence) and a tensor with
    each sentence's unpadded token count (SOS/EOS included).
    """
    token_ids = [SentenceToNum(libra, sentence) for sentence in batch]
    lengths = torch.tensor([len(ids) for ids in token_ids])
    padded = torch.LongTensor(Padding(token_ids))
    return padded, lengths

# Builds the decoder-side tensors for a batch of target sentences.
def outputVar(batch, libra):
    """Tokenize and pad a batch of target sentences.

    Returns a tuple of: the padded index tensor, a boolean mask that is True
    at non-PAD positions, and the longest unpadded target length in the batch.
    """
    token_ids = [SentenceToNum(libra, sentence) for sentence in batch]
    longest = max(len(ids) for ids in token_ids)
    padded = Padding(token_ids)
    mask = torch.BoolTensor(BinaryMask(padded))
    return torch.LongTensor(padded), mask, longest

def batch2TrainData(libra, pair_batch):
    """Turn a list of [source, target] sentence pairs into training tensors.

    Returns ``(inp, output, lengths, mask, max_target_len)`` as produced by
    inputVar (source side) and outputVar (target side).
    """
    sentences = [(pair[0], pair[1]) for pair in pair_batch]
    sources = [source for source, _ in sentences]
    targets = [target for _, target in sentences]
    inp, lengths = inputVar(sources, libra)
    output, mask, max_target_len = outputVar(targets, libra)
    return inp, output, lengths, mask, max_target_len

In [None]:
# Normalization Module
class Normalization(nn.Module):
    """Standardize a tensor with its global mean/std, then scale and shift.

    The statistics are taken over every element of the input (not per
    feature or per sample). ``scale`` and ``shift`` are fixed floats, not
    learned parameters; ``epsilon`` is added to the standard deviation to
    avoid division by zero.
    """

    def __init__(self, scale: float, shift: float, epsilon: float = 1e-8):
        super().__init__()
        self.scale, self.shift, self.epsilon = scale, shift, epsilon

    def forward(self, x):
        centered = x - torch.mean(x)
        spread = torch.std(x) + self.epsilon
        return (centered / spread) * self.scale + self.shift

# Special neurons. TODO: increase the number of neurons.
class DecoderNeurons(nn.Module):
    """Two-layer head mapping a 2*embedding_size vector to vocabulary scores.

    Layout: Linear(2E -> E), tanh, Linear(E -> vocab_size). The ``sig``
    module is created but not used by ``forward``.
    """

    def __init__(self, embedding_size: int, vocab_size: int):
        super().__init__()
        self.fc1 = nn.Linear(embedding_size * 2, embedding_size)
        self.fc2 = nn.Linear(embedding_size, vocab_size)
        self.tnh = nn.Tanh()
        self.sig = nn.Sigmoid()

    def forward(self, x):
        hidden = self.fc1(x)
        hidden = self.tnh(hidden)
        return self.fc2(hidden)

class EncoderNeurons(nn.Module):
    """Two-layer projection from 2*embedding_size down to embedding_size.

    Layout: Linear(2E -> E), sigmoid, Linear(E -> E).
    """

    def __init__(self, embedding_size: int):
        super().__init__()
        self.fc1 = nn.Linear(embedding_size * 2, embedding_size)
        self.fc2 = nn.Linear(embedding_size, embedding_size)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        activated = self.sig(self.fc1(x))
        return self.fc2(activated)

# Attention Module
class Attention(nn.Module):
    """Attention weights from a "general" (bilinear) score.

    Each encoder output is passed through a linear layer, multiplied
    element-wise with the decoder hidden state and summed over the last
    dimension. The scores are transposed, soft-maxed along dim 1 and given an
    extra middle dimension.
    """

    def __init__(self, embedding_size: int):
        super().__init__()
        self.attn = nn.Linear(embedding_size, embedding_size)

    def general_score(self, hidden, encoder_output):
        projected = self.attn(encoder_output)
        return (hidden * projected).sum(dim=2)

    def forward(self, hidden, encoder_outputs):
        scores = self.general_score(hidden, encoder_outputs).t()
        weights = F.softmax(scores, dim=1)
        return weights.unsqueeze(1)

# Encoder Layer
class EncoderLayer(nn.Module):
    """Post-processing applied to the encoder LSTM output.

    Pipeline: dropout -> Normalization(scale=0.4, shift=0.4) -> EncoderNeurons.
    """

    def __init__(self, embedding_size: int, dropout: float):
        super().__init__()
        self.network = EncoderNeurons(embedding_size)
        self.norm = Normalization(0.4, 0.4)
        self.dropout = nn.Dropout(dropout)

    def forward(self, rnn_output):
        normalized = self.norm(self.dropout(rnn_output))
        return self.network(normalized)

# Encoder
class Encoder(nn.Module):
    """Bidirectional LSTM encoder followed by an EncoderLayer projection.

    Args:
        embedding: embedding module applied to the source token indices.
        embedding_size: size of the embedding and of each LSTM direction.
        dropout: dropout probability (embedding dropout and inter-layer LSTM
            dropout).
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, embedding, embedding_size: int, dropout: float, n_layers: int):
        super(Encoder, self).__init__()
        self.num_layers = n_layers
        self.encoder_layer = EncoderLayer(embedding_size, dropout)
        self.embedding = embedding
        self.embedding_size = embedding_size
        self.lstm = nn.LSTM(embedding_size, embedding_size, n_layers,
                             dropout=dropout, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, source_tensor, lengths):
        """Encode ``source_tensor`` and return ``(output, hidden)``.

        ``output`` is the LSTM output passed through the encoder layer and
        ``hidden`` is the LSTM's ``(h_n, c_n)`` tuple. ``lengths`` is accepted
        for interface compatibility but is not used: padded positions are fed
        to the LSTM like any other token.
        """
        # The embeddings are left on whatever device the module and its input
        # already live on. Forcing them onto the global `device` broke
        # inference when the model had been moved to the CPU while CUDA was
        # available.
        source_embedding = self.dropout(self.embedding(source_tensor))
        rnn_output, hidden = self.lstm(source_embedding)
        output = self.encoder_layer(rnn_output)
        return output, hidden

# Decoder Layer
class DecoderLayer(nn.Module):
    """Output head of the decoder.

    Pipeline: dropout -> Normalization(scale=0.4, shift=0.4) -> DecoderNeurons,
    which produces ``prediction_size`` scores per input row.
    """

    def __init__(self, embedding_size: int, prediction_size: int, dropout: float):
        super().__init__()
        self.norm = Normalization(0.4, 0.4)
        self.dropout = nn.Dropout(dropout)
        self.network = DecoderNeurons(embedding_size, prediction_size)

    def forward(self, concat_input):
        normalized = self.norm(self.dropout(concat_input))
        return self.network(normalized)

# Decoder
class Decoder(nn.Module):
    """Single-step attention decoder built around a unidirectional LSTM.

    ``forward`` consumes one time step of input tokens and returns a
    probability distribution (softmax output, not logits) over
    ``prediction_size`` classes together with the updated LSTM state.
    """

    def __init__(self, embedding, embedding_size: int, dropout: float, n_layers: int, prediction_size: int):
        super().__init__()

        self.num_layers = n_layers

        self.attention = Attention(embedding_size)
        self.decoder_layer = DecoderLayer(embedding_size, prediction_size, dropout)

        self.lstm = nn.LSTM(embedding_size, embedding_size, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

        self.embedding = embedding

    def forward(self, decoder_input, encoder_output, hidden_inf):
        # assumes decoder_input is (1, batch) and encoder_output is
        # (src_len, batch, embedding_size) - TODO confirm against callers
        embedded = self.dropout(self.embedding(decoder_input))
        rnn_output, hidden = self.lstm(embedded, hidden_inf)

        # Weighted sum of the encoder outputs using the attention weights.
        attn_weights = self.attention(rnn_output, encoder_output)
        context = attn_weights.bmm(encoder_output.transpose(0, 1)).squeeze(1)

        # Join the LSTM output and the context vector along the feature axis.
        features = torch.cat((rnn_output.squeeze(0), context), 1)
        scores = self.decoder_layer(features)
        return F.softmax(scores, dim=1), hidden

# Loss Function
def maskNLLLoss(inp, target, mask):
    """Mean negative log-likelihood of ``target`` over the unmasked rows.

    Args:
        inp: (batch, classes) tensor of probabilities (softmax output).
        target: (batch,) tensor of class indices.
        mask: (batch,) boolean tensor; True marks rows that count.

    Returns:
        ``(loss, n_total)``: ``loss`` is a scalar tensor on the same device
        as ``inp`` and ``n_total`` is the number of True mask entries as a
        Python int. When the mask selects nothing the loss is a zero that is
        still attached to the autograd graph, instead of NaN.
    """
    nTotal = mask.sum().item()
    if nTotal == 0:
        # The mean of an empty selection is NaN, which would poison the
        # summed loss.
        return inp.sum() * 0.0, 0
    picked = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    # Clamp so that a probability that underflowed to 0 yields a large finite
    # loss rather than inf (and NaN gradients).
    crossEntropy = -torch.log(torch.clamp(picked, min=1e-10))
    loss = crossEntropy.masked_select(mask).mean()
    return loss, nTotal

In [None]:
def train(input_variable, target_variable, decoder, encoder, clip, libra,
          encoder_optimizer, decoder_optimizer, batch_size, lengths, mask, max_length, iteration):
    """Run one optimization step on a single batch and return its average loss.

    With probability 0.5 the decoder is fed the ground-truth token at every
    step (teacher forcing); otherwise it is fed its own most likely
    prediction. The summed per-step loss is back-propagated, gradients of
    both models are clipped to ``clip`` and both optimizers take a step.

    Reads the module-level ``print_every`` and ``n_iteration`` to decide when
    to print progress.

    Returns:
        The token-weighted average loss of this batch. It is always defined,
        whether or not the iteration is one that prints.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss = 0
    print_losses = []
    n_totals = 0

    # Every sequence in the batch starts decoding from SOS.
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]]).to(device)

    encoder_output, (hidden_state, cell_state) = encoder(input_variable.t(), lengths)
    # Seed the decoder with the first `decoder.num_layers` encoder states.
    decoder_hidden = (hidden_state[:decoder.num_layers], cell_state[:decoder.num_layers])

    use_teacher_forcing = random.random() > 0.5

    target_variable = target_variable.t()
    mask = mask.t()

    for t in range(max_length):
        decoder_output, decoder_hidden = decoder(
            decoder_input, encoder_output, decoder_hidden
        )

        if use_teacher_forcing:
            # Next input is the ground-truth token.
            decoder_input = target_variable[t].view(1, -1)
        else:
            # Next input is the decoder's own best guess, cut off from the graph.
            _, topi = decoder_output.topk(1)
            decoder_input = topi.reshape(1, -1).detach()

        mask_loss, n_total = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * n_total)
        n_totals += n_total

    loss.backward()

    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    encoder_optimizer.step()
    decoder_optimizer.step()

    # Computed unconditionally: previously this was assigned only on printing
    # iterations, so every other iteration hit an unbound `loss_avg` at return.
    loss_avg = sum(print_losses) / n_totals if n_totals else 0.0

    if iteration % print_every == 0:
      print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, loss_avg))

    return loss_avg

In [None]:
def trainIters(model_name, libra, save_dir, n_iteration, batch_size, checkpoint, clip,
               print_every, save_every, loadFilename, prediction_size, decoder, encoder, dropout,
               decoder_optimizer, encoder_optimizer, embedding, embedding_size, pairs, times):
    """Train on ``n_iteration`` randomly sampled batches, repeated ``times`` times.

    All batches are sampled once up front and reused on every pass. Each
    batch is trained on repeatedly until train() reports a loss of at most
    0.05. Every ``save_every`` iterations a checkpoint is written under
    ``save_dir/model_name/Words-<prediction_size>``; the file name depends
    only on the iteration number, so later passes overwrite earlier files.
    """
    training_pairs = [
        batch2TrainData(libra, [random.choice(pairs) for _ in range(batch_size)])
        for _ in range(n_iteration)
    ]

    start_iteration = 1
    # Running count of trained iterations, carried over from a loaded checkpoint.
    tries = checkpoint['time'] if loadFilename else 0

    print("Initializing Training...")
    print()
    for _ in range(times):
        for iteration in range(start_iteration, n_iteration + 1):
            input_variable, target_variable, lengths, mask, max_target_len = training_pairs[iteration - 1]

            input_variable = input_variable.to(device)
            target_variable = target_variable.to(device)
            mask = mask.to(device)

            # Keep fitting this batch until its loss is small enough.
            while True:
                loss = train(input_variable, target_variable, decoder, encoder, clip, libra,
                             encoder_optimizer, decoder_optimizer, batch_size, lengths, mask,
                             max_target_len, iteration)
                if loss <= 0.05:
                    break

            if iteration % save_every != 0:
                continue

            tries += save_every
            directory = os.path.join(save_dir, model_name, 'Words-{}'.format(prediction_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            state = {
                'iteration': iteration,
                'time': tries,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': libra.__dict__,
                'embedding': embedding.state_dict()
            }
            torch.save(state, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [None]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoder: at every step take the single most probable token.

    Always generates exactly MAX_LENGTH tokens; decoding does not stop early
    at EOS (the caller strips the EOS/SOS words).
    """

    def __init__(self, encoder, decoder, libra):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.libra = libra

    def forward(self, input_sentence):
        """Return the decoded words for an already normalized ``input_sentence``.

        Raises:
            KeyError: if a predicted index has no entry in
                ``self.libra.index2word``.
        """
        all_tokens = torch.zeros([0], device=device_cpu, dtype=torch.long)

        indexedSequence = [SentenceToNum(self.libra, input_sentence)]
        lengths = torch.tensor([len(indexes) for indexes in indexedSequence])

        # Pad to MAX_LENGTH, then transpose to (seq_len, batch=1).
        sentence_tensor = torch.LongTensor(Padding(indexedSequence)).t()

        # Decoding starts from the SOS token, shaped (1, 1).
        decoder_input = torch.LongTensor([[SOS_token]])

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            encoder_output, (hidden_state, cell_state) = self.encoder(
                sentence_tensor.to(device_cpu), lengths.to(device_cpu)
            )
            n_layers = self.decoder.num_layers
            decoder_hidden = (hidden_state[:n_layers], cell_state[:n_layers])

            for _ in range(MAX_LENGTH):
                decoder_output, decoder_hidden = self.decoder(
                    decoder_input, encoder_output, decoder_hidden
                )
                _, decoder_input = torch.max(decoder_output, dim=1)
                all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
                # Restore the (1, batch) shape expected by the decoder.
                decoder_input = torch.unsqueeze(decoder_input, 0)

        # Use the vocabulary handed to this instance, not a module-level global.
        return [self.libra.index2word[token.item()] for token in all_tokens]

class BeamSearch(nn.Module):
    """Beam-search decoder with temperature and a repetition penalty.

    Args:
        beam_width: number of hypotheses kept after every step.
        encoder, decoder: the trained models.
        libra: vocabulary used to tokenize the input and decode the output.
        temp: temperature (> 0) applied to the decoder distribution before
            scoring; values below 1 sharpen it.
        penalty: weight of the repetition penalty subtracted from a
            hypothesis score.
    """

    def __init__(self, beam_width, encoder, decoder, libra, temp= 0.5, penalty = 0.2):
        super(BeamSearch, self).__init__()
        self.beam_width = beam_width
        self.libra = libra
        self.encoder = encoder
        self.decoder = decoder
        self.temperature = temp
        self.rep_penalty = penalty

    def repetition(self, sequence):
        """Return the weighted repetition penalty of ``sequence``.

        Every repeated occurrence of a token adds the number of times that
        token has already been seen, so repeating the same token gets
        progressively more expensive.
        """
        token_counts = {}
        penalty = 0
        for token in sequence:
            if token in token_counts:
                penalty += token_counts[token]
                token_counts[token] += 1
            else:
                token_counts[token] = 1
        return penalty * self.rep_penalty

    def forward(self, input_sentence):
        """Return the words of the best-scoring hypothesis (SOS/EOS included).

        Each hypothesis carries its own decoder state and is extended from its
        own last token. Scores are sums of temperature-scaled log
        probabilities minus the repetition penalty. Hypotheses that produced
        EOS are carried forward unchanged.
        """
        indexedSequence = [SentenceToNum(self.libra, input_sentence)]
        lengths = torch.tensor([len(indexes) for indexes in indexedSequence])
        sentence_tensor = torch.LongTensor(Padding(indexedSequence)).t()

        with torch.no_grad():
            encoder_output, (hidden_state, cell_state) = self.encoder(
                sentence_tensor.to(device_cpu), lengths.to(device_cpu)
            )
            n_layers = self.decoder.num_layers
            initial_hidden = (hidden_state[:n_layers], cell_state[:n_layers])

            # Each entry: (token sequence, cumulative score, decoder state).
            beam = [([SOS_token], 0.0, initial_hidden)]
            for _ in range(MAX_LENGTH):
                if all(sequence[-1] == EOS_token for sequence, _, _ in beam):
                    break

                candidates = []
                for sequence, score, hidden in beam:
                    last_token = sequence[-1]
                    if last_token == EOS_token:
                        candidates.append((sequence, score, hidden))
                        continue

                    # Feed this hypothesis' own last token and own state.
                    decoder_input = torch.LongTensor([[last_token]])
                    decoder_output, next_hidden = self.decoder(
                        decoder_input, encoder_output, hidden
                    )

                    # The decoder returns probabilities; log(p) / T followed
                    # by log_softmax is equivalent to applying the
                    # temperature to the underlying logits.
                    probabilities = torch.clamp(decoder_output.squeeze(0), min=1e-10)
                    log_probs = F.log_softmax(
                        torch.log(probabilities) / self.temperature, dim=0
                    )

                    # topk raises if k exceeds the number of classes.
                    k = min(self.beam_width, log_probs.size(0))
                    top_scores, top_indices = torch.topk(log_probs, k)

                    for log_prob, index in zip(top_scores.tolist(), top_indices.tolist()):
                        extended = sequence + [index]
                        penalty_score = self.repetition(extended)
                        candidates.append(
                            (extended, score + log_prob - penalty_score, next_hidden)
                        )

                beam = sorted(candidates, key=lambda c: c[1], reverse=True)[:self.beam_width]

        best_sequence = beam[0][0]
        return [self.libra.index2word[index] for index in best_sequence
                if index < self.libra.num_words]


def evaluateInput(decoder, encoder, searcher, libra):
    """Interactive chat loop: read a line, decode a reply, print it.

    Typing 'q' or 'quit' ends the loop. A KeyError raised while producing a
    reply is reported and the loop continues with the next prompt.
    """
    quit_commands = ('q', 'quit')
    special_words = ('EOS', 'SOS')
    while True:
        try:
            user_text = input('User > ')
            if user_text in quit_commands:
                break
            reply = searcher(normalizeString(user_text))
            visible_words = [word for word in reply if word not in special_words]
            print('Cleopatra:', ' '.join(visible_words))
        except KeyError:
            print("Error: Encountered unknown word.")

In [None]:
# ---------------------------------------------------------------------------
# Driver script: configure the run, optionally restore a checkpoint, build the
# models and either train or start the interactive chat.
# ---------------------------------------------------------------------------
libra = Library()

# "train" runs trainIters(); "test" starts the interactive chat loop.
task = "train"
model_name = 'Cleopatra_model#v1.3.1'
checkpoint=None
# Any value other than "no" makes the script load the checkpoint below.
start_model = "yes"
loadFilename = None if start_model == "no" else "/content/drive/MyDrive/data/save/Cleopatra_model#v1.3.1/Words-5000/2000_checkpoint.tar"

clip = 5.0          # gradient-norm clipping threshold
n_iteration = 2000  # batches sampled per pass
print_every = 1     # also read as a global by train()
save_every = 2000   # checkpoint interval, in iterations
times = 10          # number of passes over the sampled batches

if loadFilename:
    print("Set to: 'trained model'")
    checkpoint = torch.load(loadFilename, map_location=device)
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    # Restore the vocabulary exactly as it was saved with the checkpoint.
    libra.__dict__ = checkpoint['voc_dict']
    embedding_sd = checkpoint['embedding']
    print("Loss: ",checkpoint["loss"])
    print("Time: ",checkpoint["time"])
else:
    print("Set to: 'new model'")

if task == "train":
    # Data source selector passed to loadPrepareData:
    #   1 = parquet (pandas)
    #   2 = csv
    #   3 = txt
    #   4 = custom text file
    #   5 = yml
    #   any other value = load and merge all five sources
    pairs = loadPrepareData(libra, 1)


encoder_n_layers = 2
decoder_n_layers = 4
embedding_size = 512

dropout = 0
batch_size = 1
learning_rate = 0.0000001
# NOTE(review): the embedding and the output layer are sized by this constant,
# not by libra.num_words. If the vocabulary holds more than 5000 words, larger
# indices would fall outside the embedding table - confirm.
prediction_size = 5000 #libra.num_words


# The same embedding module is shared by the encoder and the decoder.
embedding = nn.Embedding(prediction_size, embedding_size)
decoder = Decoder(embedding, embedding_size, dropout, decoder_n_layers, prediction_size)
encoder = Encoder(embedding, embedding_size, dropout, encoder_n_layers)


if loadFilename:
    embedding.load_state_dict(embedding_sd)

if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

if task == "train":
    encoder.train()
    decoder.train()

    embedding = embedding.to(device)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
else:
    # Inference runs on the CPU.
    encoder.eval()
    decoder.eval()

    embedding = embedding.to(device_cpu)
    encoder = encoder.to(device_cpu)
    decoder = decoder.to(device_cpu)

# The optimizers are created after the models were moved to their device.
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-8)
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-8)

if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

if task == "train":
    trainIters(model_name, libra, save_dir, n_iteration, batch_size, checkpoint, clip,
               print_every, save_every, loadFilename, prediction_size, decoder, encoder, dropout,
               decoder_optimizer, encoder_optimizer, embedding, embedding_size, pairs, times)
# TODO: check whether the 5000 predicted classes always map to the same words or change between runs.
if task == "test":
    beam_width = 30
    #searcher = BeamSearch(beam_width, encoder, decoder, libra)
    searcher = GreedySearchDecoder(encoder, decoder, libra)
    evaluateInput(decoder, encoder, searcher, libra)
# Recorded from an earlier run:
#Minimum label: 0
#Maximum label: 24137

In [None]:
#Gradient_descent: It is used to tweak / change the weights and biases of the model so that its output gets closer to the appropriate behaviour.

#AI: These models work by knowing / recognising the answer to a question.
#For example: if I want an AI to guess the value of x in the operation (5 - x = 1), the AI recognises that x is 4.
#That is not because it knows how to solve the equation. It is because it recognised the pattern "5 - 4 = 1" through training.
#Since it does not know the rule explicitly, if I were to give it (5 - x = 2) it would not be able to guess what x is,
#because it does not know how to "think_logically" about things that it was not trained on.

In [None]:
"""
Artificial_Neurons = Artificial neurons are the functions that make up a neural network.
                     There are 2 types of neuron functions: linear functions and non-linear functions.

                     The number of inputs a neuron takes can be changed. We start off
                     with a fixed value, e.g. 512, but this can be modified during the process.
                     That means a neuron takes n inputs, where n is variable.
                     Mathematically this is represented as f(x1,..., x(n-element)) #Note: We write n-element there because it is the index of the last input,
                                                                                    i.e. the number of inputs. Here f is an activation function which introduces
                                                                                    non-linearity, which allows the model to learn and represent complex patterns.

So the coded neuron functions would look like this:
(non-linear neurons) #Note: They are non-linear because of max(0,z), meaning that if the
                            resulting number z is positive the output will be equal to z,
                            but if it's negative the output will be 0. It is non-linear
                            because it does not follow a straight line. A purely linear neuron
                            will ALWAYS output its weighted sum z unchanged.

def f(x, y):
    return max(0,2*x + 3*y - 3) #Note: for 2 inputs

#Note: This is called "Unary non linear function". Because it only takes 1 input.
def f(x):
    return max(0, 2*x - 3)

Other functions:
Quadratic function f(x) = a*x^2 +b*x +c
Exponential function f(x) = e^x
Logarithmic function f(x) = log(x)
Sigmoid function f(z) = 1 / (1 + e^(-z))

                     A neuron can be split into 2 sub-functions such as:
                     f(x1,..., x(n-element)) = h(g(x1,..., x(n-element)))

                     Think of it this way: g is your linear function and h is your
                     non-linear function. Together they make up 1 neuron. The neuron
                     architecture depends on how you want to build it. It is not
                     always 1 function = 1 neuron.

                     A linear function is represented as:
                     g(x1,..., x(n-element)) = w1*x1 + ... + w(n-element)*x(n-element) + b
                     w1, ..., w(n-element) and b are all parameters of the function.
                     Thus, different linear functions have different parameters too.

                     Softmax is a special case, since it is not applied to one neuron
                     at a time like the activation functions above. It takes all the outputs
                     together, and its outputs are the probabilities of a categorical distribution.

                     A neural network needs to be composed of linear and non-linear functions.
                     For example, if we have a neural network of only linear functions, then all
                     we built is a composition of linear functions, which is itself just one large linear function.
                     Thus, a neural network needs linear and non-linear functions to predict the
                     final output as accurately as possible.

                     A complete neuron is basically a linear function followed by a non-linear activation function,
                     making it more powerful when it comes to producing accurate values.
"""

In [None]:
"""

Chain Rule in Calculus: The chain rule is a fundamental principle in calculus used for differentiating composite functions. If you have a function composed of two or more functions, say y = f(g(x)), the derivative of y with respect to x is given by the product of the derivative of f with respect to g(x) and the derivative of g(x) with respect to x. Mathematically, this is expressed as dy/dx = (df/dg) * (dg/dx).
Backpropagation and the Chain Rule: Backpropagation uses the chain rule to compute the gradient of the loss function with respect to the weights and biases. It starts from the output layer and works backwards through each layer, hence the name “backpropagation”. At each layer, it computes the local gradient and then multiplies it with the gradient flowing into that layer from the next layer (closer to the output). This is essentially applying the chain rule.
Where is the Gradient Computed?: The gradient is computed at each neuron for its weights and bias. For each weight, the gradient tells us how much a small change in that weight would affect the overall loss. These gradients are stored and used to update the weights and biases in the direction that reduces the loss.
So, in a literal sense, the gradient is computed and stored in each neuron for each of its weights and biases. If you’re using a deep learning framework like TensorFlow or PyTorch, this is handled automatically when you call the backward() function. The gradients are stored in a variable associated with each weight and bias, and can be accessed after the backward pass.

"""

In [None]:
"""
#Intelligence, Knowledge, Brain, Mind, Cognition, Calculation, Logic
"""