# Char-level model with simple LSTM 

### Constants

In [11]:
# Runtime selector: 0 -> local PC, 1 -> Google Colab
PC_OR_COLAB = 1

# Candidate project roots; the one matching the runtime becomes `base`
colab_base = '/content/drive/MyDrive/ashraful/paper-1/'
pc_base = './'

base = colab_base if PC_OR_COLAB == 1 else pc_base


In [13]:
# Training corpus; `dataset_paths` is a list so further files can be appended
new_dataset_path = f'{base}dataset/data.txt'
dataset_paths = [new_dataset_path]

# Pickled tokenizers (despite the `_dir` suffix these are file paths)
input_tokenizer_dir = f'{base}dataset/input-tokenizer_char.pickle'
target_tokenizer_dir = f'{base}dataset/target-tokenizer_char.pickle'

# Checkpoint prefix used when saving and loading the model weights
model_weights_path = f'{base}saved-weights/char-level-4/weights'


In [None]:
# Colab-only setup: mount Google Drive (the Colab `base` path lives under
# /content/drive) and install tensorflow-addons, which is imported later on.
if PC_OR_COLAB == 1:
    from google.colab import drive
    drive.mount('/content/drive')
    # IPython shell escape: only valid when this runs as a notebook cell
    !pip install tensorflow-addons


In [15]:
import tensorflow as tf
import tensorflow_addons as tfa

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import pickle
import numpy as np
import urllib3
import shutil
import zipfile
import itertools


### Dataset

In [16]:
class Dataset:
    """Turn the word-pair corpus into batched tf.data pipelines.

    The class reads comma-separated word pairs from the module-level
    ``dataset_paths``, maps every character to an integer id with the pickled
    tokenizers stored at ``input_tokenizer_dir`` / ``target_tokenizer_dir``,
    and exposes shuffled training and validation datasets through
    :meth:`call`.
    """

    def __init__(self):
        self.inp_lang_tokenizer = None
        self.targ_lang_tokenizer = None
        self.train_dataset = None
        self.val_dataset = None

    def create_dataset(self):
        """Read every corpus file and split each line into character lists.

        Each line is split on ``,``; every field is wrapped in the ``<`` /
        ``>`` start and end markers and exploded into a list of characters.

        Returns:
            A ``zip`` iterator over the columns of the corpus. With
            two-field lines it yields the tuple of input words followed by
            the tuple of target words.
        """
        lines = list()

        for path in dataset_paths:
            # Context manager so the file handle is closed right after reading
            with io.open(path, encoding='UTF-8') as data_file:
                lines.extend(data_file.read().strip().split('\n'))

        lines.sort()
        print(len(lines))

        word_pairs = [
            [list('<' + w.replace('ঃ\n', '').replace('\n', '') + '>')
             for w in line.split(',')]
            for line in lines
        ]

        # Show the first pair as a quick sanity check of the parsing
        print(word_pairs[0][0])
        print(word_pairs[0][1])

        return zip(*word_pairs)

    def tokenize(self, lang, lang_tokenizer=None, maxlen=20):
        """Convert character lists into a padded integer matrix.

        Args:
            lang: Sequence of character lists to encode.
            lang_tokenizer: Fitted Keras tokenizer to reuse. When ``None`` a
                new tokenizer (no character filtering) is fitted on ``lang``.
            maxlen: Fixed output length; sequences are post-padded and
                post-truncated to this size.

        Returns:
            ``(tensor, lang_tokenizer)`` - the padded id matrix and the
            tokenizer that produced it.
        """
        if lang_tokenizer is None:
            lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
            lang_tokenizer.fit_on_texts(lang)

        tensor = lang_tokenizer.texts_to_sequences(lang)
        tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post',
                                                               maxlen=maxlen, truncating='post')

        return tensor, lang_tokenizer

    def load_dataset(self):
        """Load the tokenizers, read the corpus and encode both sides.

        Returns:
            ``(input_tensor, target_tensor, inp_lang_tokenizer,
            targ_lang_tokenizer)``.
        """
        self.retrieve_tokenizer()
        inp_lang, targ_lang = self.create_dataset()

        input_tensor, inp_lang_tokenizer = self.tokenize(
            inp_lang, self.inp_lang_tokenizer)
        target_tensor, targ_lang_tokenizer = self.tokenize(
            targ_lang, self.targ_lang_tokenizer)

        return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

    def retrieve_tokenizer(self):
        """Unpickle the input and target tokenizers from disk.

        On failure a message is printed and ``exit(1)`` is called, exactly as
        before. ``except Exception`` replaces the former bare ``except`` so
        that KeyboardInterrupt and SystemExit are no longer swallowed.

        NOTE: ``pickle.load`` can execute arbitrary code; only point the
        tokenizer paths at files you trust.

        Returns:
            ``True`` once both load attempts have been made.
        """
        try:
            with open(input_tokenizer_dir, mode='rb') as data_file:
                self.inp_lang_tokenizer = pickle.load(data_file)

        except Exception:
            print("Not found input tokenizer")
            exit(1)

        try:
            with open(target_tokenizer_dir, mode='rb') as data_file:
                self.targ_lang_tokenizer = pickle.load(data_file)

        except Exception:
            print("Not found target tokenizer")
            exit(1)

        return True

    def call(self, BATCH_SIZE):
        """Build the batched training and validation datasets.

        The encoded corpus is split 80/20 with a fixed random seed. The
        training split is shuffled with a buffer covering the whole split.
        Both splits drop the final incomplete batch.

        Args:
            BATCH_SIZE: Number of examples per batch.

        Returns:
            ``(inp_lang_tokenizer, targ_lang_tokenizer, train_dataset,
            val_dataset)``.
        """
        input_tensor, target_tensor, self.inp_lang_tokenizer, self.targ_lang_tokenizer = \
            self.load_dataset()

        print("Input tensor", input_tensor.shape)
        print("Output tensor", target_tensor.shape)

        input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = \
            train_test_split(input_tensor, target_tensor,
                             test_size=0.2, random_state=4651)

        # Debug output; indexing row 500 assumes each split has > 500 rows
        print(input_tensor_train.shape, target_tensor_train.shape)
        print(input_tensor_train[500])
        print(input_tensor_val[500])

        BUFFER_SIZE = len(input_tensor_train)
        self.train_dataset = tf.data.Dataset.from_tensor_slices(
            (input_tensor_train, target_tensor_train))
        self.train_dataset = self.train_dataset.shuffle(
            BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

        self.val_dataset = tf.data.Dataset.from_tensor_slices(
            (input_tensor_val, target_tensor_val))
        self.val_dataset = self.val_dataset.batch(
            BATCH_SIZE, drop_remainder=True)

        return self.inp_lang_tokenizer, self.targ_lang_tokenizer, self.train_dataset, self.val_dataset


In [17]:
# Mini-batch size shared by the data pipeline and the model code below
BATCH_SIZE = 64

dataset_creator = Dataset()
(inp_lang,
 targ_lang,
 train_dataset,
 val_dataset) = dataset_creator.call(BATCH_SIZE)

# Number of train batches, number of validation batches, and the number of
# distinct symbols known to the input and target tokenizers
print(
    len(train_dataset),
    len(val_dataset),
    len(inp_lang.word_index),
    len(targ_lang.word_index),
)


2402977
['<', 'a', '>']
['<', 'অ', '>']
Input tensor (2402977, 20)
Output tensor (2402977, 20)
(1922381, 20) (1922381, 20)
[ 1 20  5  8  7 14  7  2  0  0  0  0  0  0  0  0  0  0  0  0]
[ 1  3 27  3  9  7  3  2  0  0  0  0  0  0  0  0  0  0  0  0]
30037 7509 28 63


### Model Parameters

In [18]:
# Vocabulary sizes; +1 leaves room for id 0, which appears as padding in the
# encoded tensors
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
# train_dataset is already batched, so its length IS the number of batches per
# epoch; dividing by BATCH_SIZE again under-counted the steps
steps_per_epoch = len(train_dataset)

# Layer sizes
embedding_dims = 32
rnn_units = 256
dense_units = 256

# Input / output sequence lengths (match the maxlen used when padding)
Tx = 20
Ty = 20


### Define Model

In [29]:

class MyModel(tf.keras.Model):
    """Character-level encoder/decoder with Luong attention.

    Encoder: embedding -> LSTM -> batch normalisation -> LSTM (the second LSTM
    returns both the full output sequence and its final states).
    Decoder: an LSTMCell wrapped in a TFA AttentionWrapper, followed by a
    dense projection onto the output vocabulary. Training goes through a
    BasicDecoder with a TrainingSampler (`call`); inference goes through a
    BeamSearchDecoder (`evaluate`).

    Besides its constructor arguments the class reads these module-level
    names: dense_units, BATCH_SIZE, Tx, Ty, rnn_units, model_weights_path and
    targ_lang.
    """

    def __init__(self, input_vocab_size, output_vocab_size, embedding_dims, rnn_units):
        """Create all layers.

        Args:
            input_vocab_size: Size of the encoder embedding table.
            output_vocab_size: Size of the decoder embedding table and of the
                final dense projection.
            embedding_dims: Width of both embedding layers.
            rnn_units: Hidden size of the encoder LSTMs and the decoder cell.
        """
        super().__init__()
        # Encoder
        self.input_vocab_size = input_vocab_size
        self.encoder_embedding = tf.keras.layers.Embedding(input_dim=input_vocab_size,
                                                           output_dim=embedding_dims)
        self.encoder_rnnlayer1 = tf.keras.layers.LSTM(
            rnn_units, return_sequences=True)
        self.encoder_rnnlayer2 = tf.keras.layers.LSTM(rnn_units,
                                                      return_sequences=True,
                                                      return_state=True)
        self.encoder_norm = tf.keras.layers.BatchNormalization()

        # Decoder
        # NOTE: `evaluate` reads the checkpoint key
        # 'decoder_embedding/embeddings/...', so this attribute name must not
        # be changed without updating that key.
        self.output_vocab_size = output_vocab_size
        self.decoder_embedding = tf.keras.layers.Embedding(input_dim=output_vocab_size,
                                                           output_dim=embedding_dims)
        self.dense_layer = tf.keras.layers.Dense(output_vocab_size)
        self.decoder_rnncell = tf.keras.layers.LSTMCell(rnn_units)
        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()
        # Attention is created without memory; the memory is supplied later
        # through setup_memory() in call() / evaluate().
        self.attention_mechanism = self.build_attention_mechanism(
            dense_units, None, BATCH_SIZE*[Tx])
        self.rnn_cell = self.build_rnn_cell(BATCH_SIZE)
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell,
                                                sampler=self.sampler,
                                                output_layer=self.dense_layer)

        # NOTE(review): bare attribute access whose value is discarded - it
        # looks like a leftover with no effect; confirm before removing.
        self.attention_mechanism.memory_initialized
        # Filled on the first evaluate() call
        self.decoder_embedding_matrix = None

    def initialize_initial_state(self):
        """Store a pair of zero tensors of shape (BATCH_SIZE, rnn_units) in
        self.initial_state. Uses the module-level BATCH_SIZE and rnn_units."""
        self.initial_state = [
            tf.zeros((BATCH_SIZE, rnn_units)), tf.zeros((BATCH_SIZE, rnn_units))]

    def build_attention_mechanism(self, units, memory, memory_sequence_length):
        """Return a LuongAttention built from the given arguments."""
        return tfa.seq2seq.LuongAttention(units,
                                          memory=memory,
                                          memory_sequence_length=memory_sequence_length)
        # return tfa.seq2seq.BahdanauAttention(units, memory = memory, memory_sequence_length=memory_sequence_length)

    # wrap decoder rnn cell
    def build_rnn_cell(self, batch_size):
        """Wrap the decoder LSTMCell with the attention mechanism.

        `batch_size` is accepted but not used by this method.
        """
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnncell, self.attention_mechanism,
                                                attention_layer_size=dense_units)
        return rnn_cell

    def build_decoder_initial_state(self, batch_size, encoder_state, Dtype):
        """Return the wrapper's initial state with its cell state replaced by
        `encoder_state`."""
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_size,
                                                                dtype=Dtype)
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=encoder_state)
        return decoder_initial_state

    def call(self, inputs, training=False):
        """Teacher-forced forward pass.

        Args:
            inputs: Pair `(encoder_input, decoder_input)` of token-id
                batches. The batch size is taken from the module-level
                BATCH_SIZE, so batches are presumably expected to have
                exactly that many rows - TODO confirm.
            training: Forwarded to the batch-normalisation layer.

        Returns:
            The first element of the BasicDecoder result (the caller in this
            file reads its `rnn_output` field).
        """
        encoder_input, decoder_input = inputs

        x = self.encoder_embedding(encoder_input)
        x = self.encoder_rnnlayer1(x)
        x = self.encoder_norm(x, training=training)
        # a: full output sequence; a_tx / c_tx: final states of the 2nd LSTM
        a, a_tx, c_tx = self.encoder_rnnlayer2(x)

        decoder_emb_inp = self.decoder_embedding(decoder_input)
        # The encoder outputs become the memory the decoder attends over
        self.attention_mechanism.setup_memory(a)
        decoder_initial_state = self.build_decoder_initial_state(BATCH_SIZE,
                                                                 encoder_state=[
                                                                     a_tx, c_tx],
                                                                 Dtype=tf.float32)

        # Every sequence is decoded for Ty-1 steps
        outputs, _, _ = self.decoder(decoder_emb_inp,
                                     initial_state=decoder_initial_state,
                                     sequence_length=BATCH_SIZE*[Ty-1])

        return outputs

    def evaluate(self, inputs, beam_width=3):
        """Decode `inputs` with beam search.

        Args:
            inputs: Batch of encoder token ids; its first dimension is used
                as the inference batch size.
            beam_width: Number of beams kept per input.

        Returns:
            `(final_outputs, beam_scores)` as NumPy arrays: the predicted ids
            and the beam scores, each transposed with perm (0, 2, 1) -
            presumably giving (batch, beam, time); TODO confirm.
        """
        # The decoder embedding table is read from the checkpoint file on the
        # first call and cached afterwards.
        # NOTE(review): the cached matrix is never refreshed, so it may not
        # match the in-memory weights after further training - confirm.
        if self.decoder_embedding_matrix is None:
            self.decoder_embedding_matrix = tf.train.load_variable(
                model_weights_path, 'decoder_embedding/embeddings/.ATTRIBUTES/VARIABLE_VALUE')
            print(self.decoder_embedding_matrix.shape)

        inference_batch_size = inputs.shape[0]

        x = self.encoder_embedding(inputs)
        x = self.encoder_rnnlayer1(x)
        x = self.encoder_norm(x, training=False)
        enc_out, enc_h, enc_c = self.encoder_rnnlayer2(x)

        # NOTE(review): dec_h is assigned but not used below
        dec_h = enc_h
        # dec_c = enc_c

        # Decoding starts from '<' and stops at '>' (ids from targ_lang)
        start_tokens = tf.fill([inference_batch_size],
                               targ_lang.word_index['<'])
        end_token = targ_lang.word_index['>']

        # Repeat encoder outputs and states once per beam
        enc_out = tfa.seq2seq.tile_batch(enc_out, multiplier=beam_width)
        self.attention_mechanism.setup_memory(enc_out)

        hidden_state = tfa.seq2seq.tile_batch(
            [enc_h, enc_c], multiplier=beam_width)
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=beam_width * inference_batch_size,
                                                                dtype=tf.float32)
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=hidden_state)

        decoder_instance = tfa.seq2seq.BeamSearchDecoder(self.rnn_cell,
                                                         beam_width=beam_width,
                                                         output_layer=self.dense_layer)
        # Cap decoding at twice the input length Tx
        decoder_instance.maximum_iterations = tf.round(tf.reduce_max(Tx) * 2)

        outputs, final_state, sequence_lengths = decoder_instance(self.decoder_embedding_matrix,
                                                                  start_tokens=start_tokens,
                                                                  end_token=end_token,
                                                                  initial_state=decoder_initial_state)

        final_outputs = tf.transpose(outputs.predicted_ids, perm=(0, 2, 1))
        beam_scores = tf.transpose(
            outputs.beam_search_decoder_output.scores, perm=(0, 2, 1))

        return final_outputs.numpy(), beam_scores.numpy()


# Build the network and restore the weights stored under model_weights_path
model = MyModel(
    input_vocab_size=vocab_inp_size,
    output_vocab_size=vocab_tar_size,
    embedding_dims=embedding_dims,
    rnn_units=rnn_units,
)
model.load_weights(filepath=model_weights_path)


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f1ce805df50>

### Optimizer and Custom Loss Function

In [20]:
# Adam with the Keras default settings; train_step applies gradients with it
optimizer = tf.keras.optimizers.Adam()


In [21]:
def get_bangla(array):
    """Decode a sequence of target token ids into a string.

    Id 0 maps to the empty string; every other id is looked up in
    ``targ_lang.index_word``. The result stops just before the first ``'>'``
    symbol, or spans the whole sequence when there is none.
    """
    symbols = ['' if token_id == 0 else targ_lang.index_word[token_id]
               for token_id in array]
    try:
        cut = symbols.index('>')
    except ValueError:
        # No end marker present: keep everything
        cut = len(symbols)
    return "".join(symbols[:cut])


In [22]:
# Per-token cross entropy on raw logits; the reduction happens in loss_function
sparsecategoricalcrossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none',
    from_logits=True,
)


def loss_function(y_pred, y):
    """Masked cross-entropy loss.

    Positions where the target id is 0 contribute zero loss. The mean is
    taken over all positions, including the masked ones.

    Args:
        y_pred: Logits produced by the decoder.
        y: Integer target ids.

    Returns:
        Scalar loss tensor.
    """
    per_token = sparsecategoricalcrossentropy(y_true=y, y_pred=y_pred)
    # 1 where the target is a real token, 0 where it is id 0
    keep = tf.cast(tf.math.not_equal(y, 0), dtype=per_token.dtype)
    return tf.reduce_mean(keep * per_token)


def acc_function(pred, real):
    """Return the fraction of sequences in the batch that are decoded exactly.

    The arg-max token at every step is converted to text with ``get_bangla``,
    as are the targets. A sequence counts as correct only when the two
    strings are identical.

    Args:
        pred: Decoder logits; the last axis indexes the vocabulary.
        real: Integer target ids with one row per sequence.

    Returns:
        Exact-match accuracy of the batch, between 0 and 1.
    """
    # Take the time dimension from the targets instead of hard-coding 19, so
    # the function keeps working when the sequence length changes
    pred = tf.reshape(pred, [pred.shape[0], real.shape[1], pred.shape[2]])
    pred = tf.argmax(pred, axis=2)
    pred = tf.cast(pred, dtype=real.dtype)
    pred_words = [get_bangla(row) for row in pred.numpy()]
    real_words = [get_bangla(row) for row in real.numpy()]
    accuracies = tf.equal(real_words, pred_words).numpy()

    return accuracies.sum() / accuracies.shape[0]


### One step of training on a batch using Teacher Forcing technique

In [23]:

def train_step(input_batch, output_batch):
    """Run one optimisation step on a batch using teacher forcing.

    Args:
        input_batch: Encoder token ids.
        output_batch: Target token ids including the start and end markers.

    Returns:
        ``(loss, acc)`` for this batch.
    """
    # The decoder is fed the target without its last position and is scored
    # against the target shifted one step to the left
    decoder_input = output_batch[:, :-1]
    decoder_output = output_batch[:, 1:]

    with tf.GradientTape() as tape:
        outputs = model([input_batch, decoder_input], True)
        logits = outputs.rnn_output

        loss = loss_function(logits, decoder_output)
        acc = acc_function(logits, decoder_output)

    # Differentiate the loss w.r.t. every trainable weight and apply the update
    trainable = model.trainable_variables
    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))
    return loss, acc


### Training

In [None]:
# Epoch offset (lets the printed epoch numbers continue from a previous run)
start = 0
EPOCHS = 20

dataset = train_dataset
steps_per_epoch = len(dataset)
print(steps_per_epoch)
# Best epoch accuracy so far; weights are saved only when it improves
max_acc = .00

for epoch in range(start, EPOCHS+start):
    # Separate timer variable: `start` above is the epoch offset and must not
    # be overwritten with a wall-clock timestamp
    epoch_start = time.time()

    total_loss = 0
    total_acc = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss, batch_acc = train_step(inp, targ)
        total_loss += batch_loss
        total_acc += batch_acc

        if batch % 1000 == 0:
            print(
                f'Epoch {epoch + 1} Upto Batch {batch+1} Loss {total_loss / (batch+1):.4f} Accuracy {total_acc / (batch+1):.4f}')

    acc = total_acc / steps_per_epoch
    improved = acc > max_acc
    if improved:
        max_acc = acc
        model.save_weights(filepath=model_weights_path)

    # Report every epoch, including the one that triggers early stopping
    print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f} Accuracy {total_acc / steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time() - epoch_start:.2f} sec\n')

    # Early stopping: quit as soon as an epoch fails to beat the best accuracy
    if not improved:
        break


### Evaluation

In [25]:
def get_bangla(array):
    """Convert target token ids to text, stopping before the first '>'.

    Every id is translated first (0 becomes the empty string, anything else
    goes through ``targ_lang.index_word``); the pieces up to, but not
    including, the first ``'>'`` are then concatenated.
    """
    pieces = [targ_lang.index_word[idx] if idx != 0 else '' for idx in array]
    return "".join(itertools.takewhile(lambda piece: piece != '>', pieces))


In [26]:
# Evaluate char-level train
def calculate_acc(dataset):
    beam_width = 10
    correct_count = np.array([0]*4)
    total_count = 0
    steps_per_epoch = len(dataset)
    print(steps_per_epoch)
    # exit(0)
    start = time.time()
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        # outputs, scores = beam_evaluate(inp, beam_width=beam_width)
        outputs, scores = model.evaluate(inp, beam_width=beam_width)
        # print(targ.shape)
        targ = list(map(get_bangla, targ.numpy()))
        targ = list(map(lambda x: x.replace('<', ''), targ))
        # print(targ)
        outputs = [list(map(get_bangla, output)) for output in outputs]
        # print(outputs)

        for i in range(len(targ)):
            if targ[i] == outputs[i][0]:
                correct_count[0] += 1
            if targ[i] in outputs[i][0:3]:
                correct_count[1] += 1
            if targ[i] in outputs[i][0:5]:
                correct_count[2] += 1
            if targ[i] in outputs[i]:
                correct_count[3] += 1
            total_count += 1

    print(f'Total size {total_count}')
    print(f'Acc@1 : {((correct_count[0]/total_count))*100:.2f} %')
    print(f'Acc@3 : {((correct_count[1]/total_count))*100:.2f} %')
    print(f'Acc@5 : {((correct_count[2]/total_count))*100:.2f} %')
    print(f'Acc@10: {((correct_count[3]/total_count))*100:.2f} %')
    print(f'Time taken: {(time.time() - start):.2f} s\n')


In [None]:
# Report accuracy on the training split first, then on the validation split
for _split in (train_dataset, val_dataset):
    calculate_acc(_split)
