# Word-level model

### Constants

In [1]:
# Runtime selector: 0 = local PC, 1 = Google Colab
PC_OR_COLAB = 0

# Project root for each environment; every other path is built on `base`
colab_base = '/content/drive/MyDrive/ashraful/paper-1/'
pc_base = './'

# Any value other than 1 falls back to the local PC root
base = colab_base if PC_OR_COLAB == 1 else pc_base


In [None]:
# Training data: every file listed in `dataset_paths` is read by the loader cell
new_dataset_path = f'{base}dataset/top50k.txt'
dataset_paths = [new_dataset_path]

# Pickled tokenizers for the input (character) and target (word) sides
input_tokenizer_dir = f'{base}dataset/input-tokenizer_word.pickle'
target_tokenizer_dir = f'{base}dataset/target-tokenizer_word.pickle'

# HDF5 file the model is loaded from and saved back to
saved_model_dir = f'{base}saved-weights/word-level-model/word-level.h5'


In [2]:
# Mount Google Drive at /content/drive when running on Colab; with any other
# PC_OR_COLAB value this cell does nothing.
if PC_OR_COLAB == 1:
    # Imported inside the guard — presumably google.colab is only installed on Colab
    from google.colab import drive
    drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import pickle


In [None]:
# Read every dataset file into one flat list of raw text lines.
lines = []

for path in dataset_paths:
    # The context manager closes the file handle as soon as it has been read
    with io.open(path, encoding='UTF-8') as dataset_file:
        lines.extend(dataset_file.read().strip().split('\n'))

print(len(lines))

# Each line is split on ','; field 0 is the input word and field 1 the target
# word (any further fields are ignored). A line with no comma raises IndexError.
# The lines come from split('\n') and therefore never contain '\n', so no
# newline clean-up is needed here.

# Input side: a list of characters, wrapped in '<' and '>' markers
inp_lang = [list('<' + line.split(',')[0] + '>') for line in lines]
# Target side: the whole word as a single string
targ_lang = [line.split(',')[1] for line in lines]

print(inp_lang[0])
print(targ_lang[0])


676822
['<', 'a', 'b', '>']
আব


In [None]:
# Retrieving the previously saved input and target tokenizers


def _load_tokenizer(pickle_path, label):
    """Unpickle and return the tokenizer stored at ``pickle_path``.

    Args:
        pickle_path: Path of the pickle file to read.
        label: Human-readable name ("Input" / "Target") used in the error message.

    Returns:
        The object stored in the pickle file.

    Raises:
        FileNotFoundError: If ``pickle_path`` does not exist. Any other failure
            (corrupt pickle, permission error, ...) propagates unchanged instead
            of being reported as "not found".
    """
    try:
        with open(pickle_path, mode='rb') as data_file:
            # NOTE(review): pickle.load can execute arbitrary code from the
            # file; only load tokenizer pickles from a trusted source.
            return pickle.load(data_file)
    except FileNotFoundError as err:
        # Raising (rather than calling exit) stops a "Run All" at this cell
        raise FileNotFoundError(
            f"{label} tokenizer not found at {pickle_path!r}") from err


inp_lang_tokenizer = _load_tokenizer(input_tokenizer_dir, "Input")
targ_lang_tokenizer = _load_tokenizer(target_tokenizer_dir, "Target")

# Vocabulary sizes of the two tokenizers
print(len(inp_lang_tokenizer.word_index))
print(len(targ_lang_tokenizer.word_index))


28
50000


In [None]:
# Encode each character list as ids, then pad / truncate at the end so every
# row is exactly 12 ids wide.
input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    inp_lang_tokenizer.texts_to_sequences(inp_lang),
    maxlen=12,
    padding='post',
    truncating='post',
)

# Encode the target words and flatten the result into a 1-D array of ids.
# NOTE(review): this assumes every target maps to exactly one id — confirm.
target_tensor = tf.reshape(
    targ_lang_tokenizer.texts_to_sequences(targ_lang), [-1]).numpy()

# Vocabulary sizes, one per line (input first, then target)
print(len(inp_lang_tokenizer.word_index),
      len(targ_lang_tokenizer.word_index),
      sep='\n')

# One more than each vocabulary size — presumably to leave room for id 0,
# which the padded input rows use as filler.
total_chars = 1 + len(inp_lang_tokenizer.word_index)
total_words = 1 + len(targ_lang_tokenizer.word_index)


28
50000


In [None]:
# Hold out 20% of the pairs for validation; the fixed random_state keeps the
# split identical between runs.
(input_tensor_train,
 input_tensor_val,
 target_tensor_train,
 target_tensor_val) = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=4651)

# Spot-check one training example: padded input ids, then its target id
print(input_tensor_train[500], target_tensor_train[500], sep='\n')

# Wrap each split as a tf.data.Dataset of (input, target) pairs
train_dataset = tf.data.Dataset.from_tensor_slices(
    (input_tensor_train, target_tensor_train))
val_dataset = tf.data.Dataset.from_tensor_slices(
    (input_tensor_val, target_tensor_val))


[ 1  4 11  8  7 19 12  8  3  2  0  0]
32715


In [None]:
# Number of examples in each split, followed by their total
train_size = len(train_dataset)
val_size = len(val_dataset)
print(train_size, val_size, train_size + val_size, sep='\n')


541457
135365
676822


In [None]:
def _top_k_accuracy(k):
    """Return a SparseTopKCategoricalAccuracy metric named 'Accuracy@<k>'."""
    return tf.keras.metrics.SparseTopKCategoricalAccuracy(
        k=k, name=f'Accuracy@{k}', dtype=None)


# Top-k accuracy at k = 1, 3, 5 and 10
accuracy_1 = _top_k_accuracy(1)
accuracy_3 = _top_k_accuracy(3)
accuracy_5 = _top_k_accuracy(5)
accuracy_10 = _top_k_accuracy(10)

# Passed to model.compile in this order
metrics = [accuracy_1, accuracy_3, accuracy_5, accuracy_10]


In [None]:
# Model: character ids -> embedding -> bidirectional LSTM -> dense stack ->
# softmax over the target vocabulary
model = tf.keras.models.Sequential([
    # 12 matches the padded width of the input rows
    tf.keras.layers.Embedding(total_chars, 32, input_length=12),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    # NOTE(review): these three Dense layers are created without an
    # activation argument — confirm that stacking them this way is intended.
    tf.keras.layers.Dense(128),
    tf.keras.layers.Dense(64),
    tf.keras.layers.Dense(32),
    tf.keras.layers.Dense(total_words, activation='softmax'),
])

adam = tf.keras.optimizers.Adam()

# Sparse loss: targets are integer ids rather than one-hot vectors
model.compile(optimizer=adam,
              loss='sparse_categorical_crossentropy',
              metrics=metrics)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 12, 32)            928       
                                                                 
 bidirectional (Bidirectiona  (None, 256)              164864    
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               32896     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 50001)             1650033   
                                                        

In [None]:
# Loading prev model: resume from the saved file when it exists; otherwise
# keep the freshly built model so a first run (no checkpoint yet) can still
# train from scratch instead of failing here.
if os.path.exists(saved_model_dir):
    model = tf.keras.models.load_model(saved_model_dir)
else:
    print(f"No saved model at {saved_model_dir!r}; using the newly built model")
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 12, 32)            928       
                                                                 
 bidirectional_1 (Bidirectio  (None, 256)              164864    
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               32896     
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dense_7 (Dense)             (None, 50001)             1650033   
                                                      

In [None]:
# Train on the training split only; `history` keeps the object returned by fit.
# NOTE(review): no validation data is passed, so the held-out split is not
# monitored during the 200 epochs — confirm this is intended.
history = model.fit(
    x=input_tensor_train,
    y=target_tensor_train,
    batch_size=128,
    epochs=200,
)


In [None]:
# Saving the model to the same path the "Loading prev model" cell reads from,
# so a later run can resume from these weights.
model.save(saved_model_dir)


In [None]:
# Evaluate on the training split. The displayed list is presumably the loss
# followed by Accuracy@1/3/5/10 in compile order — confirm for a loaded model.
model.evaluate(input_tensor_train, target_tensor_train)




[0.33437591791152954,
 0.8702131509780884,
 0.9887064099311829,
 0.9979149103164673,
 0.9997340440750122]

In [None]:
# Evaluate on the held-out validation split. The displayed list is presumably
# the loss followed by Accuracy@1/3/5/10 in compile order — confirm.
model.evaluate(input_tensor_val, target_tensor_val)




[0.37721994519233704,
 0.8697447776794434,
 0.9832895994186401,
 0.991364061832428,
 0.9930853843688965]