In [None]:
# Mount Google Drive (Colab only) so that the '/content/drive/...' paths
# configured later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install tensorflow-addons

import tensorflow as tf
import tensorflow_addons as tfa

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import pickle
import numpy as np
import urllib3
import shutil
import zipfile
import itertools

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.1 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.15.0


### File Paths

In [None]:
# Root folders for the two supported environments. Both end with '/', so every
# path below is built by appending a relative path WITHOUT a leading slash.
colab_base = '/content/drive/MyDrive/ashraful/'
pc_base = './'
base = colab_base

# Raw datasets.
google_dataset_path = base + 'dataset/google-dataset.txt'
modified_google_dataset_path = base + 'dataset/modified-google-dataset.txt'
phoneme_dataset = base + 'dataset/phoneme-dataset.txt'
my_dataset_path = base + 'dataset/my-dataset.txt'
new_dataset_path = base + 'dataset/new/dataset-new.txt'
top20k_path = base + 'dataset/new/top20k-3.txt'

top_50k_word_file = base + 'dataset/new/top50k-sentiment.txt'
top_20k_word_file = base + 'dataset/new/top20k-sentiment.txt'

# Pickled tokenizers that Dataset.retrieve_tokenizer() tries to reuse.
input_tokenizer_retrieve = base + 'dataset/new/input-tokenizer.pickle'
target_tokenizer_retrieve = base + 'dataset/new/target-tokenizer.pickle'
target_word_tokenizer_retrieve = base + 'dataset/new/target-tokenizer-word.pickle'

# Pickled word-frequency tables (previously built with a leading '/', which
# produced '//' in the final path because `base` already ends with '/').
word_frequency_dict_file = base + 'dataset/new/word_frequency_dictionary.pickle'
word_frequency_dict_log_file = base + 'dataset/new/word_frequency_dictionary-log.pickle'
word_frequency_dict_sqrt_file = base + 'dataset/new/word_frequency_dictionary-sqrt.pickle'

# Files read by Dataset.create_dataset().
# dataset_paths = [new_dataset_path, top_50k_word_file, top20k_path, top20k_path]
dataset_paths = [new_dataset_path]

splitted_data_path = base + 'dataset/splited-my-data-lstm'

# Checkpoint / progress locations for the character-level LSTM model.
checkpoint_dir = base + 'models/LSTM/char-level-model-4'
model_weights_path = base + 'models/LSTM/char-level-model-4/weights'
# w-weights-2 => 47.50
progress_file_path = base + 'models/LSTM/progress.txt'


input_tokenizer_dir = base + 'models/LSTM/char-level-model-4/input-tokenizer.pickle'
target_tokenizer_dir = base + 'models/LSTM/char-level-model-4/target-tokenizer.pickle'
example_batch_dir = base + 'models/LSTM/example_batch.pickle'

In [None]:
# Load the '-sqrt' word-frequency table; get_bangla_freq() looks words up in it
# to weight the training loss.
# NOTE: pickle can execute arbitrary code while loading -- only load files you
# produced yourself.
word_frequency_dict = {}
try:
    with open(word_frequency_dict_sqrt_file, mode='rb') as corpus:
        word_frequency_dict = pickle.load(corpus)
except (OSError, pickle.UnpicklingError, EOFError):
    # Keep the empty table so later cells still run; get_bangla_freq() then
    # falls back to a weight of 1.0 for every word.
    print("Can not open file")

# Spot-check one frequent word (None when the table could not be loaded).
word_frequency_dict.get("থেকে")

1483.176439783953

In [None]:
class Dataset:
    """Builds padded, tokenized train/validation tf.data pipelines.

    Reads the notebook-level globals `dataset_paths`, `splitted_data_path`,
    `input_tokenizer_retrieve` and `target_tokenizer_retrieve`.
    """

    def __init__(self):
        self.inp_lang_tokenizer = None
        self.targ_lang_tokenizer = None
        self.train_dataset = None
        self.val_dataset = None

    def create_dataset(self):
        """Read every file in `dataset_paths` and split lines into character lists.

        Each line is split on ','; every field is wrapped in '<' ... '>' and
        exploded into a list of characters.

        Returns:
            A zip object yielding one tuple per comma-separated column
            (load_dataset() unpacks exactly two: input and target).
        """
        lines = list()

        for path in dataset_paths:
            # Context manager so the file handle is closed deterministically.
            with io.open(path, encoding='UTF-8') as dataset_file:
                lines.extend(dataset_file.read().strip().split('\n'))

        lines.sort()
        print(len(lines))

        # After split('\n') no field can contain a newline, so no further
        # newline stripping is needed here.
        word_pairs = [[list('<' + w + '>') for w in l.split(',')] for l in lines]

        print(word_pairs[0][0])
        print(word_pairs[0][1])

        return zip(*word_pairs)

    # Step 3 and Step 4
    def tokenize(self, lang, lang_tokenizer=None, maxlen=20):
        """Convert character lists to post-padded/truncated integer sequences.

        Args:
            lang: iterable of character lists.
            lang_tokenizer: tokenizer to reuse; when None a new Keras
                Tokenizer (no filters) is fitted on `lang`.
            maxlen: fixed sequence length after padding/truncation.

        Returns:
            (tensor, lang_tokenizer) where tensor is the padded sequence array.
        """
        if lang_tokenizer is None:
            lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
            lang_tokenizer.fit_on_texts(lang)

        tensor = lang_tokenizer.texts_to_sequences(lang)
        tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post',
                                                               maxlen=maxlen, truncating='post')

        return tensor, lang_tokenizer

    def load_dataset(self):
        """Load the text pairs and tokenize both sides.

        Tries to reuse pickled tokenizers first; if that fails the tokenizers
        stay None and tokenize() fits fresh ones.

        Returns:
            (input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)
        """
        self.retrieve_tokenizer()
        inp_lang, targ_lang = self.create_dataset()

        input_tensor, inp_lang_tokenizer = self.tokenize(inp_lang, self.inp_lang_tokenizer)
        target_tensor, targ_lang_tokenizer = self.tokenize(targ_lang, self.targ_lang_tokenizer)

        return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

    def save_data(self):
        """Pickle tokenizers and datasets to `splitted_data_path`.

        NOTE(review): wrapping tokenizer/dataset objects in tf.constant looks
        unlikely to succeed -- confirm before relying on this method. It is not
        called anywhere in the visible notebook.
        """
        data = tf.constant([self.inp_lang_tokenizer, self.targ_lang_tokenizer, \
            self.train_dataset, self.val_dataset])

        with open(splitted_data_path, mode='wb') as data_file:
            pickle.dump(data, data_file, protocol=pickle.HIGHEST_PROTOCOL)

    def retrieve_data(self):
        """Counterpart of save_data(); currently always returns False.

        The `1/0` deliberately raises so the cached-data path is skipped and
        the method reports "Not found".
        """
        try:
            1/0
            with open(splitted_data_path, mode='rb') as data_file:
                data = pickle.load(data_file)
                [self.inp_lang_tokenizer, self.targ_lang_tokenizer, \
                    self.train_dataset, self.val_dataset] = data.numpy()
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts.
            print("Not found")
            return False

        return True

    def retrieve_tokenizer(self):
        """Load pickled input/target tokenizers if available.

        Returns:
            True when both tokenizers were loaded, False as soon as one fails
            (a tokenizer loaded before the failure stays assigned).
        """
        try:
            with open(input_tokenizer_retrieve, mode='rb') as data_file:
                self.inp_lang_tokenizer = pickle.load(data_file)

        except Exception:
            print("Not found jhkhk")
            return False

        try:
            with open(target_tokenizer_retrieve, mode='rb') as data_file:
                self.targ_lang_tokenizer = pickle.load(data_file)

        except Exception:
            print("Not found jgxghkjgkgjkjkjhk")
            return False

        return True

    def call(self, BATCH_SIZE):
        """Build the batched train/validation datasets.

        Args:
            BATCH_SIZE: batch size; incomplete final batches are dropped.

        Returns:
            (inp_lang_tokenizer, targ_lang_tokenizer, train_dataset, val_dataset)
        """
        input_tensor, target_tensor, self.inp_lang_tokenizer, self.targ_lang_tokenizer = \
            self.load_dataset()

        print("Input tensor", input_tensor.shape)
        print("Output tensor", target_tensor.shape)

        # Fixed random_state keeps the 80/20 split reproducible across runs.
        input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = \
            train_test_split(input_tensor, target_tensor, test_size=0.2, random_state=4651)

        print(input_tensor_train.shape, target_tensor_train.shape)
        print(input_tensor_train[500])
        print(input_tensor_val[500])

        # Shuffle buffer covers the whole training set.
        BUFFER_SIZE = len(input_tensor_train)
        self.train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
        self.train_dataset = self.train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

        self.val_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
        self.val_dataset = self.val_dataset.batch(BATCH_SIZE, drop_remainder=True)

        return self.inp_lang_tokenizer, self.targ_lang_tokenizer, self.train_dataset, self.val_dataset

### Create Dataset

In [None]:
# Global batch size; MyModel and the training loop read this name.
BATCH_SIZE = 1024

# Build tokenizers plus the batched train/validation pipelines.
dataset_creator = Dataset()
(inp_lang, targ_lang,
 train_dataset, val_dataset) = dataset_creator.call(BATCH_SIZE)

# Number of train batches, validation batches, and the two vocabulary sizes.
print(
    len(train_dataset),
    len(val_dataset),
    len(inp_lang.word_index),
    len(targ_lang.word_index),
)

2402977
['<', 'a', '>']
['<', 'অ', '>']
Input tensor (2402977, 20)
Output tensor (2402977, 20)
(1922381, 20) (1922381, 20)
[ 1 20  5  8  7 14  7  2  0  0  0  0  0  0  0  0  0  0  0  0]
[ 1  3 27  3  9  7  3  2  0  0  0  0  0  0  0  0  0  0  0  0]
1877 469 28 63


In [None]:
# Persist both tokenizers so later runs can reuse the same vocabularies.
for tokenizer_path, tokenizer in ((input_tokenizer_retrieve, inp_lang),
                                  (target_tokenizer_retrieve, targ_lang)):
    with open(tokenizer_path, mode='wb') as data_file:
        pickle.dump(tokenizer, data_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Pull a single batch to inspect the tensor shapes fed to the model.
batch_iterator = iter(train_dataset)
example_input_batch, example_target_batch = next(batch_iterator)
print(example_input_batch.shape, example_target_batch.shape)
# print(example_input_batch[0])

(1024, 20) (1024, 20)


### Model Parameters

In [None]:
# +1 because tokenizer indices start at 1 and 0 is the padding id.
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
max_length_input = example_input_batch.shape[1]
max_length_output = example_target_batch.shape[1]

# train_dataset is already batched, so its length IS the number of steps per
# epoch (dividing by BATCH_SIZE again would collapse it to 1).
steps_per_epoch = len(train_dataset)

print("max_length_input, max_length_target, vocab_size_input, vocab_size_target")
print(max_length_input, max_length_output, vocab_inp_size, vocab_tar_size)

print(inp_lang.word_index)
print(targ_lang.word_index)

# Model hyperparameters.
embedding_dims = 32
rnn_units = 256
dense_units = 256
Dtype = tf.float32   #used to initialize DecoderCell Zero state

# Sequence lengths assumed by MyModel; they match the maxlen=20 padding used
# by Dataset.tokenize().
Tx = 20
Ty = 20

max_length_input, max_length_target, vocab_size_input, vocab_size_target
20 20 29 64
{'<': 1, '>': 2, 'a': 3, 'o': 4, 'e': 5, 'r': 6, 'i': 7, 'h': 8, 'n': 9, 't': 10, 's': 11, 'k': 12, 'u': 13, 'b': 14, 'l': 15, 'd': 16, 'm': 17, 'p': 18, 'c': 19, 'g': 20, 'j': 21, 'y': 22, 'w': 23, 'f': 24, 'v': 25, 'q': 26, 'z': 27, 'x': 28}
{'<': 1, '>': 2, 'া': 3, 'র': 4, 'ে': 5, 'ি': 6, '্': 7, 'ন': 8, 'ক': 9, 'স': 10, 'ব': 11, 'ল': 12, 'ম': 13, 'ত': 14, 'ু': 15, 'প': 16, 'ট': 17, 'দ': 18, 'ো': 19, 'জ': 20, 'গ': 21, 'ই': 22, 'হ': 23, 'শ': 24, 'ী': 25, 'য': 26, 'ড': 27, 'ভ': 28, 'য়': 29, 'ফ': 30, 'চ': 31, 'ও': 32, 'আ': 33, 'অ': 34, 'এ': 35, 'খ': 36, 'ষ': 37, 'ণ': 38, 'ং': 39, 'ধ': 40, 'থ': 41, 'উ': 42, 'ছ': 43, 'ূ': 44, 'ঁ': 45, 'ৃ': 46, 'ড়': 47, 'ঠ': 48, 'ঘ': 49, 'ঞ': 50, 'ঙ': 51, 'ৌ': 52, 'ৎ': 53, 'ৈ': 54, 'ঝ': 55, 'ঃ': 56, 'ঢ': 57, 'ঈ': 58, 'ঋ': 59, 'ঊ': 60, 'ঐ': 61, 'ঔ': 62, 'ঢ়': 63}


### Creating Encoder-Decoder Model based on tfa.seq2seq module

### Define Model

The encoder network consists of an embedding layer followed by two stacked LSTM layers, with batch normalization between them.

The decoder network encompasses both decoder and attention mechanism.

The example uses LuongAttention.

In [None]:

class MyModel(tf.keras.Model):
    """Character-level encoder-decoder with Luong attention (tfa.seq2seq).

    Besides its constructor arguments the class reads the notebook globals
    `BATCH_SIZE`, `dense_units`, `rnn_units`, `Tx`, `Ty`, `targ_lang` and
    `model_weights_path`.
    """

    def __init__(self, input_vocab_size, output_vocab_size, embedding_dims, rnn_units):
        """Create encoder layers, the attention-wrapped decoder cell and the training decoder.

        Args:
            input_vocab_size: size of the source vocabulary (including padding id).
            output_vocab_size: size of the target vocabulary (including padding id).
            embedding_dims: embedding width used for both encoder and decoder.
            rnn_units: number of units in each LSTM layer / cell.
        """
        super().__init__()
        # Encoder
        self.input_vocab_size = input_vocab_size
        self.encoder_embedding = tf.keras.layers.Embedding(input_dim=input_vocab_size,
                                                           output_dim=embedding_dims)
        self.encoder_rnnlayer1 = tf.keras.layers.LSTM(rnn_units, return_sequences=True)
        self.encoder_rnnlayer2 = tf.keras.layers.LSTM(rnn_units,
                                                      return_sequences=True,
                                                      return_state=True)
        self.encoder_norm = tf.keras.layers.BatchNormalization()

        # Decoder
        self.output_vocab_size = output_vocab_size
        self.decoder_embedding = tf.keras.layers.Embedding(input_dim=output_vocab_size,
                                                           output_dim=embedding_dims)
        self.dense_layer = tf.keras.layers.Dense(output_vocab_size)
        self.decoder_rnncell = tf.keras.layers.LSTMCell(rnn_units)
        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()
        # Memory is attached later via setup_memory() in call()/evaluate().
        self.attention_mechanism = self.build_attention_mechanism(dense_units,None,BATCH_SIZE*[Tx])
        self.rnn_cell =  self.build_rnn_cell(BATCH_SIZE)
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell,
                                                sampler= self.sampler,
                                                output_layer=self.dense_layer)

        # Lazily filled by evaluate() with the decoder embedding weights.
        self.decoder_embedding_matrix = None


    def initialize_initial_state(self):
        """Store an all-zero [h, c] pair sized by the global BATCH_SIZE / rnn_units."""
        self.initial_state = [
            tf.zeros((BATCH_SIZE, rnn_units)), tf.zeros((BATCH_SIZE, rnn_units))]

    def build_attention_mechanism(self, units,memory, memory_sequence_length):
        """Return a LuongAttention mechanism over `memory`."""
        return tfa.seq2seq.LuongAttention(units,
                                          memory = memory,
                                          memory_sequence_length=memory_sequence_length)
        # return tfa.seq2seq.BahdanauAttention(units, memory = memory, memory_sequence_length=memory_sequence_length)

    # wrap decoder rnn cell
    def build_rnn_cell(self, batch_size ):
        """Wrap the decoder LSTM cell with attention (`batch_size` is unused)."""
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnncell, self.attention_mechanism,
                                                attention_layer_size=dense_units)
        return rnn_cell

    def build_decoder_initial_state(self, batch_size, encoder_state,Dtype):
        """Return the attention-wrapper zero state with its cell state set to `encoder_state`."""
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size = batch_size,
                                                                dtype = Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state

    def call(self, inputs, training=False):
        """Teacher-forced forward pass.

        Args:
            inputs: pair (encoder_input, decoder_input) of token-id batches;
                the batch dimension must equal the global BATCH_SIZE.
            training: forwarded to the batch-normalization layer.

        Returns:
            The BasicDecoder output structure (train_step reads `.rnn_output`).
        """
        encoder_input, decoder_input = inputs

        x = self.encoder_embedding(encoder_input)
        x = self.encoder_rnnlayer1(x)
        x = self.encoder_norm(x, training=training)
        a, a_tx, c_tx = self.encoder_rnnlayer2(x)

        decoder_emb_inp = self.decoder_embedding(decoder_input)
        self.attention_mechanism.setup_memory(a)
        decoder_initial_state = self.build_decoder_initial_state(BATCH_SIZE,
                                                                encoder_state=[a_tx, c_tx],
                                                                Dtype=tf.float32)

        outputs, _, _ = self.decoder(decoder_emb_inp,
                                     initial_state=decoder_initial_state,
                                     sequence_length=BATCH_SIZE*[Ty-1])

        return outputs

    def evaluate(self, inputs, beam_width=3):
        """Beam-search decode a batch of encoded inputs.

        Args:
            inputs: batch of padded source token ids.
            beam_width: number of beams kept per input.

        Returns:
            (predicted_ids, scores) as NumPy arrays, each transposed with
            perm (0, 2, 1) -- presumably (batch, beam, time); confirm against
            the tfa BeamSearchDecoder output layout.
        """
        if self.decoder_embedding_matrix is None:
            # NOTE(review): the matrix comes from the checkpoint on disk and is
            # cached, so it can differ from the in-memory decoder embedding
            # after further training without saving -- confirm this is intended.
            self.decoder_embedding_matrix = tf.train.load_variable(
            model_weights_path, 'decoder_embedding/embeddings/.ATTRIBUTES/VARIABLE_VALUE')
            print(self.decoder_embedding_matrix.shape)

        inference_batch_size = inputs.shape[0]

        x = self.encoder_embedding(inputs)
        # x = tf.one_hot(inputs, depth=self.input_vocab_size)
        x = self.encoder_rnnlayer1(x)
        x = self.encoder_norm(x, training=False)
        enc_out, enc_h, enc_c = self.encoder_rnnlayer2(x)

        start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['<'])
        end_token = targ_lang.word_index['>']

        # Repeat encoder outputs/states once per beam before attaching memory.
        enc_out = tfa.seq2seq.tile_batch(enc_out, multiplier=beam_width)
        self.attention_mechanism.setup_memory(enc_out)

        # set decoder_inital_state which is an AttentionWrapperState considering beam_width
        hidden_state = tfa.seq2seq.tile_batch([enc_h, enc_c], multiplier=beam_width)
        decoder_initial_state = self.build_decoder_initial_state(
            beam_width * inference_batch_size, hidden_state, tf.float32)

        # Instantiate BeamSearchDecoder
        decoder_instance = tfa.seq2seq.BeamSearchDecoder(self.rnn_cell,
                                                         beam_width=beam_width,
                                                         output_layer=self.dense_layer)
        # Cap decoding at twice the input length.
        decoder_instance.maximum_iterations = tf.round(tf.reduce_max(Tx) * 2)
        # decoder_embedding_matrix = decoderNetwork.decoder_embedding.variables[0]

        # The BeamSearchDecoder object's call() function takes care of everything.
        outputs, final_state, sequence_lengths = decoder_instance(self.decoder_embedding_matrix,
                                                                  start_tokens=start_tokens,
                                                                  end_token=end_token,
                                                                  initial_state=decoder_initial_state)

        final_outputs = tf.transpose(outputs.predicted_ids, perm=(0, 2, 1))
        beam_scores = tf.transpose(outputs.beam_search_decoder_output.scores, perm=(0, 2, 1))

        return final_outputs.numpy(), beam_scores.numpy()

# Build the model from the hyperparameters above and restore saved weights.
model = MyModel(
    input_vocab_size=vocab_inp_size,
    output_vocab_size=vocab_tar_size,
    embedding_dims=embedding_dims,
    rnn_units=rnn_units,
)
model.load_weights(filepath=model_weights_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f2a4cac05d0>

### Optimizer and Custom Loss Function

In [None]:
# Adam with default hyperparameters; train_step() applies gradients through it.
optimizer = tf.keras.optimizers.Adam()

In the loss function below, `mask` is a zero-one matrix with the same shape as the decoder targets. It is 0 at padding positions (beyond the end of each target sequence) and 1 elsewhere, so padding does not contribute to the loss.

In [None]:
def get_bangla(array):
    """Decode a sequence of target token ids into a string.

    Id 0 (padding) maps to the empty string; decoding stops at the first
    '>' end marker (one is appended so a marker always exists).
    """
    symbols = [targ_lang.index_word[token_id] if token_id != 0 else ''
               for token_id in array]
    symbols.append('>')
    end_position = symbols.index('>')
    return "".join(symbols[:end_position])

def get_bangla_freq(array):
    """Return the word's frequency weight repeated once per token position.

    The ids are decoded with get_bangla(); words missing from
    `word_frequency_dict` get a weight of 1.0.
    """
    bangla = get_bangla(array)
    if bangla in word_frequency_dict:
        weight = word_frequency_dict[bangla]
    else:
        weight = 1.0
    return [weight for _ in range(len(array))]

In [None]:
# Per-token cross-entropy on raw logits; reduction is done manually below.
sparsecategoricalcrossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(y_pred, y):
    """Frequency-weighted, padding-masked mean cross-entropy.

    Args:
        y_pred: decoder logits.
        y: target token ids (0 marks padding); must be an eager tensor because
            `.numpy()` is called on it.

    Returns:
        Scalar mean over all positions of mask * loss * word-frequency weight.
    """
    freq_rows = [get_bangla_freq(row) for row in y.numpy()]
    bangla_freq = tf.convert_to_tensor(freq_rows, dtype=y_pred.dtype)
    token_loss = sparsecategoricalcrossentropy(y_true=y, y_pred=y_pred)
    # 1.0 for real tokens, 0.0 for padding.
    non_padding = tf.cast(tf.math.not_equal(y, 0), dtype=token_loss.dtype)
    return tf.reduce_mean(non_padding * token_loss * bangla_freq)


def acc_function(pred, real):
    """Word-level exact-match accuracy for one batch.

    Args:
        pred: rank-3 decoder logits whose last axis is the vocabulary; any
            number of time steps is accepted (previously hard-coded to 19).
        real: target token ids; must be an eager tensor.

    Returns:
        Fraction of rows whose decoded prediction equals the decoded target.
    """
    # Greedy choice per time step.
    pred = tf.argmax(pred, axis=2)
    pred = tf.cast(pred, dtype=real.dtype)
    # Compare whole decoded words, not individual characters.
    pred = [get_bangla(row) for row in pred.numpy()]
    real = [get_bangla(row) for row in real.numpy()]
    accuracies = tf.equal(real, pred).numpy()

    return accuracies.sum() / accuracies.shape[0]

### One step of training on a batch using Teacher Forcing technique

In [None]:

def train_step(input_batch, output_batch):
    """Run one teacher-forced optimization step on a batch.

    Args:
        input_batch: padded source token ids.
        output_batch: padded target token ids including start/end markers.

    Returns:
        (loss, acc) for this batch.
    """
    # Teacher forcing: the decoder sees the target without its last position
    # and is scored against the target shifted left by one.
    decoder_input = output_batch[:, :-1]
    decoder_output = output_batch[:, 1:]

    with tf.GradientTape() as tape:
        outputs = model([input_batch, decoder_input], True)
        logits = outputs.rnn_output
        loss = loss_function(logits, decoder_output)

    # Accuracy is only a metric, so it is computed outside the tape.
    acc = acc_function(logits, decoder_output)

    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss, acc

### Training

In [None]:
# Index of the first epoch to run (training resumes from loaded weights).
start = 19
EPOCHS = 20

dataset = train_dataset
steps_per_epoch = len(dataset)
print(steps_per_epoch)
# Best epoch accuracy seen in THIS run; the first epoch therefore always saves.
max_acc = .00

for epoch in range(start, EPOCHS+start):
    # Separate name so the epoch-index variable `start` is not clobbered.
    epoch_start = time.time()

    total_loss = 0
    total_acc = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss, batch_acc = train_step(inp, targ)
        total_loss += batch_loss
        total_acc += batch_acc

        if batch % 1000 == 0:
            print(f'Epoch {epoch + 1} Upto Batch {batch+1} Loss {total_loss / (batch+1):.4f} Accuracy {total_acc / (batch+1):.4f}')

    # Report the epoch before deciding whether to stop, so the final
    # (non-improving) epoch is still logged.
    print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f} Accuracy {total_acc / steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time() - epoch_start:.2f} sec\n')

    # Early stopping: keep weights only while accuracy improves.
    acc = total_acc / steps_per_epoch
    if acc > max_acc:
        max_acc = acc
        # checkpoint.save(file_prefix=checkpoint_prefix)
        model.save_weights(filepath=model_weights_path)
    else:
        break

### Evaluation

In [None]:
# Evaluate char-level train
def calculate_acc(dataset, beam_width=10):
    """Print top-1/3/5/beam_width word accuracy of beam search over `dataset`.

    Args:
        dataset: batched tf.data dataset of (input ids, target ids).
        beam_width: number of beams decoded per input (default 10, the value
            that used to be hard-coded).
    """
    correct_count = np.array([0]*4)
    total_count = 0
    steps_per_epoch = len(dataset)
    print(steps_per_epoch)
    start = time.time()
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        outputs, scores = model.evaluate(inp, beam_width=beam_width)
        # Targets still carry the '<' start marker; predictions do not.
        targ = [get_bangla(row).replace('<', '') for row in targ.numpy()]
        outputs = [[get_bangla(candidate) for candidate in beams] for beams in outputs]

        for i in range(len(targ)):
            if targ[i] == outputs[i][0]:
                correct_count[0]+=1
            if targ[i] in outputs[i][0:3]:
                correct_count[1]+=1
            if targ[i] in outputs[i][0:5]:
                correct_count[2]+=1
            if targ[i] in outputs[i]:
                correct_count[3]+=1
            total_count+=1

    print(f'Total size {total_count}')
    print(f'Acc@1 : {((correct_count[0]/total_count))*100:.2f} %')
    print(f'Acc@3 : {((correct_count[1]/total_count))*100:.2f} %')
    print(f'Acc@5 : {((correct_count[2]/total_count))*100:.2f} %')
    # Last metric counts a hit anywhere among the decoded beams.
    print(f'Acc@{beam_width}: {((correct_count[3]/total_count))*100:.2f} %')
    print(f'Time taken: {(time.time() - start):.2f} s\n')

In [None]:
# Report beam-search accuracy on the training split, then the validation split.
for split_dataset in (train_dataset, val_dataset):
    calculate_acc(split_dataset)

1877
(64, 32)
Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x7f2a61f01510>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/usr/local/lib/python3.7/dist-packages/tensorflow_addons/seq2seq/decoder.py", line 514, in body
    next_sequence_lengths,  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py", line 870, in map_structure
    expand_composites=expand_composites)  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/nest.py", line 869, in <listcomp>
    structure[0], [func(*x) for x in entries],  File "/usr/local/lib/python3.7/dist-packages/tensorflow_addons/seq2seq/decoder.py", line 506, in <lambda>
    lambda ta, out: ta.write(time, out), outputs_ta, emit  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 249, in wrapped
    error_in_function

In [None]:
# 20k (data size 390720)
# Acc@1 : 68.86 %
# Acc@3 : 88.11 %
# Acc@5 : 93.14 %
# Acc@10: 97.05 %

# 50k (data size 759232)
# Acc@1 : 71.54 %
# Acc@3 : 89.46 %
# Acc@5 : 93.96 %
# Acc@10: 97.40 %

# All data (data size 2402944)
# Acc@1 : 51.63%
# Acc@3 : 67.31%
# Acc@5 : 73.17%
# Acc@10: 79.61%

# Acc@1 : 50.63 %
# Acc@3 : 67.20 %
# Acc@5 : 73.31 %
# Acc@10: 79.91 %

# Total test set size 1922304
# Acc@1 : 50.17 %
# Acc@3 : 66.50 %
# Acc@5 : 72.58 %
# Acc@10: 79.36 %
# Time taken: 8379.94 s

# Beam-search evaluation over the training split with periodic progress output.
beam_width = 10
correct_count = np.array([0]*4)
total_count = 0
dataset = train_dataset
steps_per_epoch = len(dataset)
print(steps_per_epoch)
start = time.time()
for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    outputs, scores = model.evaluate(inp, beam_width=beam_width)
    # Targets still carry the '<' start marker; strip it before comparing.
    targ = [get_bangla(row).replace('<', '') for row in targ.numpy()]
    outputs = [[get_bangla(candidate) for candidate in beams] for beams in outputs]

    for expected, candidates in zip(targ, outputs):
        # Hits for top-1, top-3, top-5 and "anywhere among the beams".
        hits = (expected == candidates[0],
                expected in candidates[0:3],
                expected in candidates[0:5],
                expected in candidates)
        correct_count += np.array(hits)
        total_count += 1

    if batch % 1000 == 0:
        print(f'Upto batch: {batch}')
        for label, hit_count in zip(('Acc@1 ', 'Acc@3 ', 'Acc@5 ', 'Acc@10'), correct_count):
            print(f'correct: {hit_count}, {label}: {((hit_count/total_count))*100:.2f}%')
        print()

print(f'Total test set size {total_count}')
for label, hit_count in zip(('Acc@1 ', 'Acc@3 ', 'Acc@5 ', 'Acc@10'), correct_count):
    print(f'{label}: {((hit_count/total_count))*100:.2f} %')
print(f'Time taken: {(time.time() - start):.2f} s\n')

1877
Upto batch: 0
correct: 534, Acc@1 : 52.15%
correct: 682, Acc@3 : 66.60%
correct: 751, Acc@5 : 73.34%
correct: 807, Acc@10: 78.81%

Upto batch: 1000
correct: 528726, Acc@1 : 51.58%
correct: 691624, Acc@3 : 67.47%
correct: 752442, Acc@5 : 73.41%
correct: 819575, Acc@10: 79.96%



In [None]:
# 20k (data size 390720)
# Acc@1 : 68.86 %
# Acc@3 : 88.11 %
# Acc@5 : 93.14 %
# Acc@10: 97.05 %

# 50k (data size 759232)
# Acc@1 : 71.54 %
# Acc@3 : 89.46 %
# Acc@5 : 93.96 %
# Acc@10: 97.40 %

# All data (data size 2402944)
# Acc@1 : 51.63%
# Acc@3 : 67.31%
# Acc@5 : 73.17%
# Acc@10: 79.61%

# Acc@1 : 50.63 %
# Acc@3 : 67.20 %
# Acc@5 : 73.31 %
# Acc@10: 79.91 %

# Total test set size 480512
# Acc@1 : 52.15 %
# Acc@3 : 67.86 %
# Acc@5 : 73.65 %
# Acc@10: 80.09 %
# Time taken: 2091.66 s

# Beam-search evaluation over the validation split with periodic progress output.
beam_width = 10
correct_count = np.array([0]*4)
total_count = 0
dataset = val_dataset
steps_per_epoch = len(dataset)
print(steps_per_epoch)
start = time.time()
for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    outputs, scores = model.evaluate(inp, beam_width=beam_width)
    # Targets still carry the '<' start marker; strip it before comparing.
    targ = [get_bangla(row).replace('<', '') for row in targ.numpy()]
    outputs = [[get_bangla(candidate) for candidate in beams] for beams in outputs]

    for expected, candidates in zip(targ, outputs):
        # Hits for top-1, top-3, top-5 and "anywhere among the beams".
        hits = (expected == candidates[0],
                expected in candidates[0:3],
                expected in candidates[0:5],
                expected in candidates)
        correct_count += np.array(hits)
        total_count += 1

    if batch % 1000 == 0:
        print(f'Upto batch: {batch}')
        for label, hit_count in zip(('Acc@1 ', 'Acc@3 ', 'Acc@5 ', 'Acc@10'), correct_count):
            print(f'correct: {hit_count}, {label}: {((hit_count/total_count))*100:.2f}%')
        print()

print(f'Total test set size {total_count}')
for label, hit_count in zip(('Acc@1 ', 'Acc@3 ', 'Acc@5 ', 'Acc@10'), correct_count):
    print(f'{label}: {((hit_count/total_count))*100:.2f} %')
print(f'Time taken: {(time.time() - start):.2f} s\n')

469
Upto batch: 0
correct: 531, Acc@1 : 51.86%
correct: 712, Acc@3 : 69.53%
correct: 767, Acc@5 : 74.90%
correct: 837, Acc@10: 81.74%

Total test set size 480256
Acc@1 : 50.60 %
Acc@3 : 66.66 %
Acc@5 : 72.79 %
Acc@10: 79.60 %
Time taken: 685.48 s



In [None]:
def preprocess_word(word):
    """Encode one romanized word as a padded batch-of-one id tensor.

    The word is stripped of surrounding whitespace, wrapped in '<' ... '>',
    tokenized per character with `inp_lang`, and post-padded/truncated to 20.
    The padded array is printed before being returned as a tensor.
    """
    characters = list('<' + word.strip() + '>')
    token_ids = inp_lang.texts_to_sequences([characters])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        token_ids, maxlen=20, padding='post', truncating='post')
    print(inputs)
    return tf.convert_to_tensor(inputs)

In [None]:
def predict(english_word):
    """Print the 10 beam-search candidates for `english_word` and the elapsed time."""
    start = time.time()
    encoded_word = preprocess_word(english_word)
    outputs, score = model.evaluate(encoded_word, 10)
    # Decode every beam of every batch row back to text.
    outputs = [[get_bangla(candidate) for candidate in beams] for beams in outputs]
    print(outputs[0])

    elapsed_ms = (time.time() - start)*1000
    print(f'Time taken: {elapsed_ms:.2f} ms\n')

In [None]:
# Smoke test: print the beam-search candidates for the romanized input "ami".
predict("ami")

[[ 1  3 17  7  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
(64, 32)
tf.Tensor([1], shape=(1,), dtype=int32)
2
['আমি', 'অমি', 'এমই', 'আমী', 'আমই', 'এমি', 'অমী', 'এমী', 'মী', 'মি']
Time taken: 1243.29 ms

