<a href="https://colab.research.google.com/github/MessiNN/chatbot-transformer-Early_Stage-/blob/master/Transformer_Architecture_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR


import random
import re
import os
import unicodedata
import itertools
import pandas as pd


# Use the first CUDA GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_cpu = torch.device("cpu")

# Root folder (on the mounted Drive) that is passed to trainIters for checkpoints.
save_dir = os.path.join("/content/drive/MyDrive", "data", "save")

# Reserved vocabulary ids; Library seeds index2word with exactly these three entries.
PAD_token = 0
SOS_token = 1
EOS_token = 2

# filterPair keeps sentences with fewer than MAX_LENGTH words; zeroPadding pads to MAX_LENGTH.
MAX_LENGTH = 20
# Minimum word count handed to trimRareWords when the dataset is prepared.
MIN_COUNT = 4

Mounted at /content/drive


In [3]:
class Library:
    """Vocabulary mapping words to integer ids and tracking word frequencies.

    Ids 0-2 are reserved for the PAD, SOS and EOS markers, so the first real
    word added receives id 3.

    Attributes:
        name: Fixed label ("Dataset").
        trimmed: True once ``trim`` has run; further ``trim`` calls are no-ops.
        word2index: word -> id.
        word2count: word -> number of times ``addWord`` saw it.
        index2word: id -> word (including the three reserved markers).
        num_words: Next free id, i.e. vocabulary size including reserved ids.
    """

    def __init__(self):
        self.name = "Dataset"
        self.trimmed = False
        self._reset()

    def _reset(self):
        """Forget every word and restore the three reserved marker entries."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Assign the next free id to an unseen word, or bump its count."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        self.word2index[word] = self.num_words
        self.word2count[word] = 1
        self.index2word[self.num_words] = word
        self.num_words += 1

    def trim(self, min_count):
        """Rebuild the vocabulary keeping only words seen at least ``min_count`` times.

        Runs at most once per instance. After trimming, ids are reassigned
        from 3 upwards and every surviving word's count restarts at 1 because
        the survivors are re-added through ``addWord``.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        total = len(self.word2index)
        # An empty vocabulary would otherwise divide by zero in the report below.
        ratio = len(keep_words) / total if total else 0.0
        print('keep_words {} / {} = {:.4f}'.format(len(keep_words), total, ratio))

        self._reset()
        for word in keep_words:
            self.addWord(word)

def unicodeToAscii(s):
    """Strip accents: NFD-decompose ``s`` and drop every combining mark ('Mn')."""
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

def normalizeString(s):
    """Lower-case, de-accent and tokenise-by-spacing a raw sentence.

    Steps, in order: lower-case and strip, remove accents, put a space before
    each of ``. ! ?``, replace every run of characters other than letters and
    those three marks with one space, collapse whitespace, strip again.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def readVocs(datafile):
    """Load a parquet file and return normalised ``[question, response]`` pairs.

    The file must provide ``question`` and ``response`` columns; rows are
    paired positionally and each side goes through ``normalizeString``.
    """
    frame = pd.read_parquet(datafile)
    raw_rows = zip(frame['question'].tolist(), frame['response'].tolist())
    return [[normalizeString(question), normalizeString(response)]
            for question, response in raw_rows]

def filterPair(p):
    """Return True when both sentences of pair ``p`` have fewer than MAX_LENGTH words."""
    return all(len(p[side].split(' ')) < MAX_LENGTH for side in (0, 1))

def filterPairs(pairs):
    """Return the pairs accepted by ``filterPair``, in their original order."""
    return list(filter(filterPair, pairs))

def loadPrepareData(datafile, save_dir):
    """Read, filter and index the conversation pairs stored in ``datafile``.

    Returns ``(libra, pairs)``: a Library populated with every word of the
    surviving pairs, and the length-filtered pairs themselves.
    ``save_dir`` is accepted but not used by this function.
    """
    pairs = filterPairs(readVocs(datafile))
    libra = Library()
    for pair in pairs:
        for sentence in (pair[0], pair[1]):
            libra.addSentence(sentence)
    return libra, pairs


def trimRareWords(libra, pairs, MIN_COUNT):
    """Trim the vocabulary, then drop every pair that uses a removed word.

    Args:
        libra: Vocabulary object exposing ``trim(min_count)`` and ``word2index``.
        pairs: Sequence of ``[input_sentence, output_sentence]`` string pairs.
        MIN_COUNT: Minimum occurrence count forwarded to ``libra.trim``.

    Returns:
        A new list with the pairs whose two sentences consist solely of words
        still present in ``libra.word2index``. ``pairs`` itself is not modified.
    """
    libra.trim(MIN_COUNT)
    vocabulary = libra.word2index

    def _all_known(sentence):
        # A sentence survives only if every space-separated token is in the vocabulary.
        return all(word in vocabulary for word in sentence.split(' '))

    keep_pairs = [pair for pair in pairs if _all_known(pair[0]) and _all_known(pair[1])]

    total = len(pairs)
    # Guard the ratio: an empty ``pairs`` list used to raise ZeroDivisionError here.
    ratio = len(keep_pairs) / total if total else 0.0
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(total, len(keep_pairs), ratio))
    return keep_pairs

def indexesFromSentence(libra, sentence):
    """Convert a sentence to vocabulary ids followed by EOS_token.

    Words missing from ``libra.word2index`` are skipped rather than reported.
    """
    ids = []
    for word in sentence.split(' '):
        if word in libra.word2index:
            ids.append(libra.word2index[word])
    ids.append(EOS_token)
    return ids


def zeroPadding(l, fillvalue=PAD_token):
    """Right-pad every sequence in ``l`` with ``fillvalue`` up to MAX_LENGTH items.

    Sequences already at or beyond MAX_LENGTH are copied unchanged (a
    non-positive repeat count adds nothing); they are not truncated.
    """
    return [list(sequence) + [fillvalue] * (MAX_LENGTH - len(sequence)) for sequence in l]

def inputVar(l, voc):
    """Encode the sentences in ``l`` with ``voc`` and return one padded LongTensor.

    Each row holds a sentence's ids (EOS-terminated by ``indexesFromSentence``)
    padded by ``zeroPadding``.
    """
    encoded = []
    for sentence in l:
        encoded.append(indexesFromSentence(voc, sentence))
    return torch.LongTensor(zeroPadding(encoded))

# Returns only the padded target sequence tensor (no mask, no max length).
def outputVar(l, voc):
    """Encode the target sentences in ``l`` with ``voc`` into one padded LongTensor.

    Performs the same steps as ``inputVar``: ids via ``indexesFromSentence``,
    padding via ``zeroPadding``.
    """
    encoded = []
    for sentence in l:
        encoded.append(indexesFromSentence(voc, sentence))
    return torch.LongTensor(zeroPadding(encoded))

def batch2TrainData(voc, pair_batch):
    """Turn a list of sentence pairs into ``(input_tensor, target_tensor)``.

    ``pair_batch`` is sorted in place, longest input sentence first, before
    the two sides are encoded with ``inputVar`` / ``outputVar``.
    """
    pair_batch.sort(key=lambda pair: len(pair[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in pair_batch]
    output_batch = [pair[1] for pair in pair_batch]
    return inputVar(input_batch, voc), outputVar(output_batch, voc)

In [4]:
#Own code implementation (By me)
def create_padding_mask(length):
    """Return a boolean tensor that is True wherever ``length`` equals 0.

    Two singleton axes are inserted at positions 1 and 2, so a 2-D input of
    shape (a, b) yields shape (a, 1, 1, b). Despite its name, the argument is
    compared element-wise against 0 (the PAD id), not treated as a length.
    """
    is_zero = torch.eq(length, 0)
    return torch.unsqueeze(torch.unsqueeze(is_zero, 1), 2)

def create_look_ahead_mask(size):
    """Return a (size, size) boolean matrix that is True on and below the diagonal.

    Entry [i, j] is True when j <= i and False for j > i (the "future"
    positions).
    """
    return torch.tril(torch.ones(size, size)).bool()

def positions(sequence, size):
    """Build a sinusoidal positional-encoding table of shape (sequence, size).

    Row p, column i uses the angle ``p / 10000 ** (2 * (i // 2) / size)``;
    even columns hold its sine and odd columns its cosine.
    """
    num_positions = sequence
    row_index = torch.arange(num_positions).float().unsqueeze(1)
    col_index = torch.arange(size).float().unsqueeze(0)
    angles = row_index / 10000 ** (2 * (col_index // 2) / size)

    # Select sine for even columns and cosine for odd ones in a single pass.
    even_column = (torch.arange(size) % 2) == 0
    return torch.where(even_column, torch.sin(angles), torch.cos(angles))

def accuracy(y_true, y_pred):
    """Fraction of positions where the arg-max of ``y_pred`` equals ``y_true``.

    The arg-max is taken over the last axis of ``y_pred``. Every element of
    ``y_true`` counts towards the denominator, padding positions included.
    Returns a 0-dim float tensor.
    """
    predicted_ids = y_pred.argmax(dim=-1)
    hits = predicted_ids.eq(y_true).float().sum()
    return hits / y_true.numel()

class CustomSchedule(LambdaLR):
    """Warm-up then inverse-square-root decay learning-rate multiplier.

    The multiplier at step ``s`` is
    ``d_model ** -0.5 * min(s ** -0.5, s * warmup_steps ** -1.5)``.
    LambdaLR multiplies each parameter group's base learning rate by it.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        d_model: Model width used for the ``d_model ** -0.5`` factor.
        warmup_steps: Step count at which the linear warm-up meets the decay.
    """

    def __init__(self, optimizer, d_model, warmup_steps=1000):
        # Kept as a tensor attribute so existing readers of ``d_model`` see the same type.
        self.d_model = torch.tensor(d_model, dtype=torch.float32)
        self.warmup_steps = warmup_steps
        super(CustomSchedule, self).__init__(optimizer, self.lr_lambda)

    def lr_lambda(self, step):
        """Return the multiplier for ``step`` as a plain Python float.

        A float (rather than a tensor) keeps the optimizer's ``lr`` entry a
        float after LambdaLR multiplies it into ``param_groups``.
        """
        if step <= 0:
            # min(inf, 0) in the formula: the very first step uses a zero rate.
            return 0.0
        decay = step ** -0.5
        warmup = step * (self.warmup_steps ** -1.5)
        return float(self.d_model) ** -0.5 * min(decay, warmup)

class Normalize(nn.Module):
    """Standardise a tensor with its global statistics, then scale and shift.

    The mean and standard deviation are taken over every element of the input
    (no axis is given), and ``scale`` / ``shift`` are fixed floats rather than
    learned parameters.
    """

    def __init__(self, scale: float, shift: float, epsilon: float = 1e-8):
        super().__init__()
        self.scale, self.shift, self.epsilon = scale, shift, epsilon

    def forward(self, x):
        """Return ``(x - mean(x)) / (std(x) + epsilon) * scale + shift``."""
        centered = x - torch.mean(x)
        spread = torch.std(x) + self.epsilon  # epsilon avoids dividing by zero
        return centered / spread * self.scale + self.shift


#---- Self Made Multi Head Attention Implementation ----
class Multi_Head_Attention(nn.Module):
    """Self-attention block with bias-free query/key/value/output projections.

    ``forward`` projects its input to queries, keys and values, reshapes them
    with ``split_heads``, applies softmax(q @ k^T / sqrt(embedding_size)) @ v
    and maps the result back to ``embedding_size`` features.

    Constructor arguments ``batch_size`` is unused and ``learning_rate`` is
    only stored on the instance.
    """

    def __init__(self, embedding_size, head_num, batch_size, learning_rate):
        super(Multi_Head_Attention, self).__init__()

        self.embedding_size = embedding_size
        self.num_heads = head_num
        self.learning_rate = learning_rate
        self.head_size = embedding_size // self.num_heads


        # Each projection widens the input from embedding_size to embedding_size * num_heads.
        self.w_q = nn.Linear(embedding_size, embedding_size*self.num_heads, bias=False)
        self.w_k = nn.Linear(embedding_size, embedding_size*self.num_heads, bias=False)
        self.w_v = nn.Linear(embedding_size, embedding_size*self.num_heads, bias=False)

        self.w_o = nn.Linear(embedding_size*self.num_heads, embedding_size, bias=False) # output projection back to embedding_size

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch, -1, num_heads, head_size) and swap axes 1 and 2.

        NOTE(review): the projections above are embedding_size * num_heads
        wide while head_size is embedding_size // num_heads, so by shape
        arithmetic the inferred ``-1`` axis is length * num_heads rather than
        length (when embedding_size is divisible by num_heads). Confirm this
        is intended before changing it; the saved checkpoints depend on the
        projection shapes.
        """
        x = x.reshape(batch_size, -1, self.num_heads, self.head_size)
        return x.permute(0, 2, 1, 3)


    def scaled_dot_product_attention(self, q, k, v, x, blank, mask=None):
        """Return softmax(q @ k^T / sqrt(embedding_size)) @ v.

        ``x`` and ``blank`` are accepted but not used in the body.
        """
        # Scores are divided by sqrt(embedding_size), not sqrt(head_size).
        distribution = torch.matmul(q, k.transpose(2,3))/ np.sqrt(self.embedding_size)

        if mask is not None:
            # NOTE(review): masked_fill is the out-of-place variant and its result
            # is discarded here, so ``distribution`` is left unchanged.
            distribution.masked_fill(mask, float('-inf'))

        distribution = F.softmax(distribution, dim=-1)
        weight = torch.matmul(distribution, v)
        return weight

    def forward(self, input_d, blank=None, mask=None ):
        """Apply self-attention to ``input_d`` and return a tensor of the same shape.

        ``blank`` is only forwarded to ``scaled_dot_product_attention``.
        """
        # input_d shape: batch / length / embedding_size (three axes are unpacked below)
        batch, length, size = input_d.shape

        q = self.w_q(input_d)
        k = self.w_k(input_d)
        v = self.w_v(input_d)

        q = self.split_heads(q, batch)
        k = self.split_heads(k, batch)
        v = self.split_heads(v, batch)

        # NOTE(review): five positional arguments are passed to a method declared as
        # (q, k, v, x, blank, mask=None), so ``blank`` binds to ``x``, ``mask`` binds
        # to ``blank`` and the method's own ``mask`` stays None - the masking branch
        # is never reached from here.
        product = self.scaled_dot_product_attention(q, k, v, blank, mask)

        product = product.permute(0,2,1,3)

        # Merge the head axes back into a single feature axis of width embedding_size * num_heads.
        concat = product.reshape(batch, -1, self.embedding_size*self.num_heads)

        out = self.w_o(concat)

        return out

#----Self Made Layer Encoder Implementation----
class Encoder_Layer(nn.Module):
    """One encoder block: self-attention and a two-layer feed-forward network.

    Each sub-block is followed by dropout, a residual addition and the shared
    ``Normalize(0.4, 0.4)`` module. Both feed-forward layers keep the width at
    ``embedding_size`` and have no bias.
    """

    def __init__(self, embedding_size, head_num, batch_size, dropout, learning_rate):
        super().__init__()

        self.embedding_size = embedding_size

        self.fc1 = nn.Linear(embedding_size, embedding_size, bias=False)
        self.fc2 = nn.Linear(embedding_size, embedding_size, bias=False)

        self.attention = Multi_Head_Attention(embedding_size, head_num, batch_size, learning_rate)

        self.norm = Normalize(0.4, 0.4)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_d, mask):
        """Transform ``input_d`` (batch / length / embedding_size); shape is preserved."""
        # Attention sub-block: attend, drop out, add the residual, normalise.
        attended = self.dropout(self.attention(input_d, None, mask))
        normed_attention = self.norm(input_d + attended)

        # Feed-forward sub-block: linear -> ReLU -> linear, then dropout.
        hidden = F.relu(self.fc1(normed_attention))
        projected = self.dropout(self.fc2(hidden))

        # Second residual connection and normalisation.
        return self.norm(normed_attention + projected)


#----Self Made Encoder Implementation----
class Encoder(nn.Module):
    """Stack of ``n_layers`` Encoder_Layer blocks on top of a token embedding.

    ``embedding`` is supplied by the caller rather than created here.
    """

    def __init__(self, embedding, embedding_size, head_num, batch_size, dropout, learning_rate, n_layers):
        super().__init__()

        self.num_layers = n_layers
        layers = [
            Encoder_Layer(embedding_size, head_num, batch_size, dropout, learning_rate)
            for _ in range(n_layers)
        ]
        self.encoder_layers = nn.ModuleList(layers)

        self.embedding = embedding
        self.embedding_size = embedding_size

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Encode token ids ``x`` (batch / length) to batch / length / embedding_size."""
        _, length = x.shape

        # Look up token vectors and move them to the notebook-level ``device``.
        token_vectors = self.embedding(x).to(device)

        # The (length, embedding_size) position table broadcasts over the batch axis.
        position_table = positions(length, self.embedding_size).to(device)
        hidden = self.dropout(position_table + token_vectors).to(device)

        # Feed the running representation through every layer in order.
        for layer in self.encoder_layers:
            hidden = layer(hidden, mask)

        return hidden

#----Self Made Layer Decoder Implementation----
class Decoder_Layer(nn.Module):
    """One decoder block: two attention passes and a two-layer feed-forward network.

    A single Multi_Head_Attention instance and a single ``Normalize(0.4, 0.4)``
    instance are reused for every step of ``forward``.
    """

    def __init__(self, embedding_size, head_num, batch_size, dropout, learning_rate):
        super().__init__()

        self.embedding_size = embedding_size

        self.fc1 = nn.Linear(embedding_size, embedding_size, bias=False)
        self.fc2 = nn.Linear(embedding_size, embedding_size, bias=False)

        self.attention = Multi_Head_Attention(embedding_size, head_num, batch_size, learning_rate)

        self.norm = Normalize(0.4, 0.4)
        self.dropout = nn.Dropout(dropout)

    def forward(self, enc_o, blank, ahead, mask):
        """Transform ``enc_o``; the result has the same shape as ``enc_o``.

        NOTE(review): both attention calls take ``enc_o`` as their input and
        share the same weights; ``blank`` is only handed to the attention
        module as its second argument. Confirm whether a separate
        cross-attention over ``blank`` was intended.
        """
        # First attention pass (padding mask), residual, normalise. No dropout here.
        first_pass = self.attention(enc_o, None, mask)
        normed_first = self.norm(first_pass + enc_o)

        # Second attention pass (look-ahead mask), dropout, residual, normalise.
        second_pass = self.dropout(self.attention(enc_o, blank, ahead))
        normed_second = self.norm(normed_first + second_pass)

        # Feed-forward: linear -> ReLU -> linear, dropout, residual, normalise.
        hidden = F.relu(self.fc1(normed_second))
        projected = self.dropout(self.fc2(hidden))
        return self.norm(projected + normed_second)

#----Self Made Decoder Implementation----
class Decoder(nn.Module):
    """Stack of Decoder_Layer blocks followed by a vocabulary projection.

    Args:
        embedding: Token embedding module supplied by the caller.
        embedding_size: Feature width of the model.
        head_num, batch_size, dropout, learning_rate: Forwarded to each Decoder_Layer
            (``dropout`` also configures this module's own dropout).
        n_layers: Number of Decoder_Layer blocks.
        vocab_size: Width of the output projection. Defaults to
            ``embedding.num_embeddings`` so the class no longer depends on a
            notebook-level ``vocab_size`` variable existing at construction time.
    """

    def __init__(self, embedding, embedding_size, head_num, batch_size, dropout, learning_rate, n_layers,
                 vocab_size=None):
        super(Decoder, self).__init__()
        self.embedding = embedding
        self.embedding_size = embedding_size
        self.num_layers = n_layers

        if vocab_size is None:
            # One output score per row of the embedding table.
            vocab_size = embedding.num_embeddings
        self.fc = nn.Linear(embedding_size, vocab_size, bias=False)

        self.decoder_layers = nn.ModuleList([Decoder_Layer(embedding_size, head_num, batch_size, dropout, learning_rate) for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)

    def forward(self, decoder_input, enc_output, ahead=None, mask=None):
        """Return per-position vocabulary probabilities (softmax over the last axis).

        Args:
            decoder_input: Token ids; embedded and passed to every layer as ``blank``.
            enc_output: Encoder output, batch / length / embedding_size.
            ahead: Look-ahead mask forwarded to each layer.
            mask: Padding mask forwarded to each layer.

        NOTE(review): the positional table is added to ``enc_output`` and that
        sum is what flows through the layers; the embedded ``decoder_input``
        is only passed along as the layers' ``blank`` argument.
        """
        batch, length, embedding_size = enc_output.shape

        embedded = self.embedding(decoder_input).to(device)

        # (length, embedding_size) table, broadcast over the batch axis.
        pos = positions(length, self.embedding_size).to(device)
        input_pos = pos + enc_output

        input_d = self.dropout(input_pos)

        for decoder_layer in self.decoder_layers:
            input_d = decoder_layer(input_d, embedded, ahead, mask)

        # Project to vocabulary scores, then normalise them into probabilities.
        output_fc = self.fc(input_d).to(device)
        decoder_output_probs = F.softmax(output_fc, dim=-1).to(device)

        return decoder_output_probs

In [6]:
def train(input_variable, target_variable, vocab_size, decoder, encoder, clip,
          encoder_optimizer, decoder_optimizer, embedding_size,head_num,max_length=MAX_LENGTH):
    """Run one optimisation step on a single batch.

    Args:
        input_variable: Input token ids, batch / length.
        target_variable: Target token ids, same shape as the decoder output's
            first two axes.
        vocab_size: Size of the decoder's output distribution.
        decoder, encoder: The two model halves.
        clip: Maximum gradient norm applied to each model separately.
        encoder_optimizer, decoder_optimizer: Optimizers stepped after clipping.
        embedding_size, head_num, max_length: Accepted for interface
            compatibility; not needed by the computation below.

    Returns:
        ``(loss, accuracy)`` as plain Python floats for this batch.

    Notes:
        * The decoder returns probabilities, so the loss is the negative
          log-likelihood of their logarithm; PAD targets are excluded via
          ``ignore_index`` instead of scaling an already-averaged loss.
        * A single forward pass is used. Repeating the identical pass once per
          position only re-sampled dropout, and only the last repetition was
          ever back-propagated.
        * ``encoder_scheduler`` / ``decoder_scheduler`` are read from the
          notebook's global scope, exactly as before.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    batch, length = input_variable.shape

    # Build the masks from the actual token ids and sequence length.
    # torch.LongTensor(length) allocates ``length`` uninitialised values, so the
    # previous padding masks were derived from arbitrary memory.
    enc_padding_mask = create_padding_mask(input_variable).to(device)
    dec_padding_mask = create_padding_mask(input_variable).to(device)
    look_ahead_mask = create_look_ahead_mask(length).to(device)

    # Row of SOS ids, one per batch element (shape 1 / batch, as before).
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch)]]).to(device)

    enc_output = encoder(input_variable, enc_padding_mask)
    dec_output = decoder(decoder_input, enc_output, look_ahead_mask, dec_padding_mask)

    # Clamp before the log so a zero probability cannot produce -inf.
    log_probs = torch.log(dec_output.clamp_min(1e-9))
    loss = F.nll_loss(log_probs.reshape(-1, vocab_size), target_variable.reshape(-1),
                      ignore_index=PAD_token)
    accuracy_v = accuracy(target_variable, dec_output)

    loss.backward()

    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    encoder_optimizer.step()
    decoder_optimizer.step()

    encoder_scheduler.step()
    decoder_scheduler.step()

    # Plain floats: callers log and checkpoint these without holding the autograd graph.
    return loss.item(), accuracy_v.item()

In [5]:
def trainIters(model_name, libra, pairs, save_dir, n_iteration, batch_size, checkpoint, clip,
               print_every, save_every, loadFilename, vocab_size, decoder, encoder, head_num, dropout,
               decoder_optimizer, encoder_optimizer, decoder_scheduler, encoder_scheduler, embedding, embedding_size):
    """Train for ``n_iteration`` randomly sampled batches, logging and checkpointing.

    All batches are sampled up front with ``random.choice`` (with
    replacement). Progress is printed every ``print_every`` iterations, and a
    checkpoint is written every ``save_every`` iterations to
    ``save_dir/model_name/<embedding_size>-<head_num>_<vocab_size>/<iteration>_checkpoint.tar``.

    When ``loadFilename`` is truthy, ``checkpoint['time']`` seeds the
    cumulative iteration counter stored under ``'time'``. The loop itself
    always starts again at iteration 1.
    """
    print("Creating the training batches...")
    training_pairs = [batch2TrainData(libra, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    start_iteration = 1
    tries = 0

    if loadFilename:
        tries = checkpoint['time']

    print("Initializing Training...")
    print()
    for iteration in range(start_iteration, n_iteration + 1):
        # Each entry is an (input, target) pair of batch / length tensors.
        input_variable, target_variable = training_pairs[iteration - 1]

        input_variable = input_variable.to(device)
        target_variable = target_variable.to(device)

        # ``batch_accuracy`` avoids shadowing the module-level accuracy() function.
        loss, batch_accuracy = train(input_variable, target_variable, vocab_size, decoder, encoder, clip,
          encoder_optimizer, decoder_optimizer, embedding_size,head_num,)

        if iteration % print_every == 0:
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}; Average accuracy: {:.4f}".format(iteration, iteration / n_iteration * 100, loss, batch_accuracy))

        if iteration % save_every == 0:
            tries += save_every
            directory = os.path.join(save_dir, model_name, '{}-{}_{}'.format(embedding_size, head_num, vocab_size))
            # exist_ok avoids the check-then-create race of exists() + makedirs().
            os.makedirs(directory, exist_ok=True)
            torch.save({
                'iteration': iteration,
                'time': tries,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                # Store a plain number rather than a tensor that still requires grad.
                'loss': float(loss),
                'voc_dict': libra.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [14]:
def evaluate(input_sentence, decoder, encoder, libra, embedding_size):
    """Greedily decode a reply for one already-normalised sentence.

    Args:
        input_sentence: Whitespace-separated words; every word must be in
            ``libra.word2index``.
        decoder, encoder: Trained model halves (expected to be in eval mode).
        libra: Vocabulary used for the word -> id lookup.
        embedding_size: Accepted for interface compatibility; unused.

    Returns:
        1-D LongTensor of token ids that starts with SOS_token and holds the
        generated ids up to, but not including, EOS. If EOS is predicted
        immediately the tensor contains only SOS_token.

    Raises:
        KeyError: If a word is not in the vocabulary.

    NOTE(review): the input is wrapped as SOS + words + EOS here, whereas
    training inputs built by ``indexesFromSentence`` carry only the trailing
    EOS and are padded to MAX_LENGTH - confirm which form the model expects.
    """
    tokenized_sentence = [SOS_token] + [libra.word2index[word] for word in input_sentence.split()] + [EOS_token]
    # Add the batch axis and place the ids on the same device as the model.
    sentence_tensor = torch.tensor(tokenized_sentence, dtype=torch.long, device=device).unsqueeze(0)

    # The generated sequence starts as a single SOS token for this one sentence.
    decoder_input = torch.tensor([[SOS_token]], dtype=torch.long, device=device)

    with torch.no_grad():
        enc_output = encoder(sentence_tensor)

        for _ in range(MAX_LENGTH):
            dec_output = decoder(decoder_input, enc_output)

            # Most likely token at the last position, kept 2-D for concatenation.
            predicted_id = torch.argmax(dec_output[:, -1:, :], dim=-1)

            if predicted_id.item() == EOS_token:
                break

            # Grow the running sequence so each step conditions on all previous picks.
            decoder_input = torch.cat([decoder_input, predicted_id], dim=-1)

    return decoder_input.squeeze(0)


def predict(input_sentence, decoder, encoder, libra, embedding_size):
    """Decode ``input_sentence`` with ``evaluate`` and map the ids back to words.

    Ids that are not below ``libra.num_words`` are dropped from the result.
    """
    token_ids = evaluate(input_sentence, decoder, encoder, libra, embedding_size)
    words = []
    for token in token_ids:
        idx = token.item()
        if idx < libra.num_words:
            words.append(libra.index2word[idx])
    return words


def evaluateInput(decoder, encoder, libra, embedding_size):
    """Interactive chat loop: read a line, print the model's reply, repeat.

    Typing ``q`` or ``quit`` ends the loop. A KeyError raised while handling a
    line (an out-of-vocabulary word) is reported and the loop continues.
    """
    input_sentence = ''
    while True:
        try:
            input_sentence = input('User > ')
            if input_sentence in ('q', 'quit'):
                break
            input_sentence = normalizeString(input_sentence)
            reply_words = predict(input_sentence, decoder, encoder, libra, embedding_size)
            print('Cleopatra:', ' '.join(reply_words))
        except KeyError:
            print("Error: Encountered unknown word.")

In [20]:
# Parquet file on the mounted Drive; readVocs reads its 'question' and 'response' columns.
parquet_path = "/content/drive/MyDrive/movie-corpus/movie-corpus/0000.parquet"
# Build the vocabulary and the length-filtered sentence pairs.
libra, pairs = loadPrepareData(parquet_path, save_dir)
# Drop words seen fewer than MIN_COUNT times and every pair that contains one of them.
pairs = trimRareWords(libra, pairs, MIN_COUNT)

2235 12010 2235 12010
keep_words 2235 / 12010 = 0.1861
Trimmed from 3165 pairs to 358, 0.1131 of total


In [None]:
# ---- Run configuration ----
model_name = 'Cleopatra_model'
checkpoint=None
# "no" starts from scratch; any other value resumes from the checkpoint path below.
start_model = "yes"
loadFilename = None if start_model == "no" else "/content/drive/MyDrive/data/save/Cleopatra_model/512-4_2238/20000_checkpoint.tar"

if loadFilename:
    print("Set to: 'trained model'")
    # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(loadFilename, map_location=device)
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Restore the vocabulary exactly as it was when the checkpoint was written.
    libra.__dict__ = checkpoint['voc_dict']
    print("Loss: ",checkpoint["loss"])
    print("Time: ",checkpoint["time"])
else:
    print("Set to: 'new model'")


# ---- Model hyper-parameters ----
# The layer counts used to be passed to the opposite constructors (the Encoder
# received decoder_n_layers = 4 and the Decoder encoder_n_layers = 2). Names and
# values are swapped together here, so the architecture that is actually built -
# and therefore compatibility with existing checkpoints - is unchanged:
# the encoder has 4 layers and the decoder has 2.
encoder_n_layers = 4
decoder_n_layers = 2
embedding_size = 512
head_num = 4

if embedding_size % head_num != 0:
    raise ValueError("embedding_size / head_num must result in an integer")

dropout = 0.2
batch_size = 15
learning_rate = 0.001
vocab_size = libra.num_words

# "train" runs trainIters; "test" starts the interactive chat loop.
task = "train"


# ---- Build the model (one embedding table shared by both halves) ----
embedding = nn.Embedding(vocab_size, embedding_size)
decoder = Decoder(embedding, embedding_size, head_num, batch_size, dropout, learning_rate, decoder_n_layers)
encoder = Encoder(embedding, embedding_size, head_num, batch_size, dropout, learning_rate, encoder_n_layers)


if loadFilename:
    embedding.load_state_dict(embedding_sd)
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

embedding = embedding.to(device)
encoder = encoder.to(device)
decoder = decoder.to(device)

# Dropout is active only in train mode.
if task == "train":
    encoder.train()
    decoder.train()
else:
    encoder.eval()
    decoder.eval()

# ---- Optimizers and learning-rate schedules ----
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

# train() reads these two names from the global scope.
decoder_scheduler = CustomSchedule(decoder_optimizer, embedding_size)
encoder_scheduler = CustomSchedule(encoder_optimizer, embedding_size)

if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# ---- Training-loop settings ----
clip = 20.0
n_iteration = 20000
print_every = 100
save_every = 1000

if task == "train":
    trainIters(model_name, libra, pairs, save_dir, n_iteration, batch_size, checkpoint, clip,
               print_every, save_every, loadFilename, vocab_size, decoder, encoder, head_num, dropout,
               decoder_optimizer, encoder_optimizer, decoder_scheduler, encoder_scheduler, embedding, embedding_size)

if task == "test":
    evaluateInput(decoder, encoder, libra, embedding_size)

Set to: 'trained model'
Loss:  tensor(3.9212, requires_grad=True)
Time:  20000
Creating the training batches...
Initializing Training...

