In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import unicodedata
import re
import einops

In [2]:
def load_data(data_path):
    with open(data_path, 'r') as f:
        content = f.read().splitlines()
        pair = [c.split('\t') for c in content]
        context = np.array([c for c, t, _ in pair]).reshape(3061, 1)
        target = np.array([t for c, t, _ in pair]).reshape(3061, 1)
        del pair
        del content
    
    return context, target

In [3]:
X, y = load_data(r'dataset/hin.txt')
batch_size = 64

val = int(len(X) * 0.2)
idx = np.random.randint(0, len(X), (val))
train_idx = np.setdiff1d(np.arange(len(X)), idx)

X_val = X[idx]
y_val = y[idx]
X_train = X[train_idx]
y_train = y[train_idx]

In [4]:
print(X)

[['Wow!']
 ['Duck!']
 ['Duck!']
 ...
 ['Democracy is the worst form of government, except all the others that have been tried.']
 ['If my boy had not been killed in the traffic accident, he would be a college student now.']
 ["When I was a kid, touching bugs didn't bother me a bit. Now I can hardly stand looking at pictures of them."]]


In [5]:
train_data_rw = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
valid_data_rw = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size)

In [6]:
for con, ta in train_data_rw.take(1):
    print(con[: 5])
    print(ta[: 5])

tf.Tensor(
[[b'Wow!']
 [b'Duck!']
 [b'Duck!']
 [b'Jump.']
 [b'Hello!']], shape=(5, 1), dtype=string)
tf.Tensor(
[[b'\xe0\xa4\xb5\xe0\xa4\xbe\xe0\xa4\xb9!']
 [b'\xe0\xa4\x9d\xe0\xa5\x81\xe0\xa4\x95\xe0\xa5\x8b!']
 [b'\xe0\xa4\xac\xe0\xa4\xa4\xe0\xa4\x96\xe0\xa4\xbc!']
 [b'\xe0\xa4\x89\xe0\xa4\x9b\xe0\xa4\xb2\xe0\xa5\x8b.']
 [b'\xe0\xa4\xa8\xe0\xa4\xae\xe0\xa4\xb8\xe0\xa5\x8d\xe0\xa4\x95\xe0\xa4\xbe\xe0\xa4\xb0\xe0\xa5\xa4']], shape=(5, 1), dtype=string)


In [7]:
def process_text_hin(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, r'[^०-९ऀ-ॿ\s.?!,¿]', ':') 
    text = tf.strings.regex_replace(text, r'([.?!,¿])', r' \1 ')
    text = tf.strings.strip(text)
    return tf.strings.join(['[START]', text, '[END]'], separator=' ')

def process_text_eng(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, r'[^a-z\s.?!,¿]', '')
    text = tf.strings.regex_replace(text, r'([.?!,¿])', r' \1 ')
    text = tf.strings.strip(text)
    return tf.strings.join(['[START]', text, '[END]'], separator=' ')

In [8]:
text_hin = "यह एक उदाहरण है।"
text_eng = "Democracy is the worst form of government, except all the others that have been tried"

processed_hin = process_text_hin(text_hin)
processed_eng = process_text_eng(text_eng)
print(processed_hin.numpy().decode('utf-8'))
print(processed_eng.numpy().decode('utf-8'))

[START] यह एक उदाहरण है। [END]
[START] democracy is the worst form of government ,  except all the others that have been tried [END]


In [9]:
vocab_size = 5000

vectorization_layer_eng_c = tf.keras.layers.TextVectorization(
    max_tokens = vocab_size,
    standardize = process_text_eng,
    ragged = True
)
vectorization_layer_hindi_t = tf.keras.layers.TextVectorization(
    max_tokens = vocab_size,
    standardize = process_text_hin,
    ragged = True
)

vectorization_layer_eng_c.adapt(train_data_rw.map(lambda c, t: c))
vectorization_layer_hindi_t.adapt(train_data_rw.map(lambda c, t: t))

In [10]:
tg = ta[62]
token = vectorization_layer_hindi_t(np.expand_dims(tg, axis = 0))
context_vocab = np.array(vectorization_layer_hindi_t.get_vocabulary())[token.numpy()]
print(tg)
print(token)
print(context_vocab)

tf.Tensor([b'\xe0\xa4\xb6\xe0\xa4\xbe\xe0\xa4\xac\xe0\xa4\xbe\xe0\xa4\xb6!'], shape=(1,), dtype=string)
<tf.RaggedTensor [[2, 985, 70, 3]]>
[['[START]' 'शाबाश' '!' '[END]']]


In [11]:
def process_data(context, target):
    context = vectorization_layer_eng_c(context).to_tensor()
    target = vectorization_layer_hindi_t(target)
    targ_in = target[:,:-1].to_tensor()
    targ_out = target[:,1:].to_tensor()
    return (context, targ_in), targ_out

In [12]:
train_ds = train_data_rw.map(process_data, tf.data.AUTOTUNE)
valid_ds = valid_data_rw.map(process_data, tf.data.AUTOTUNE)

In [13]:
class shapeChecker():
    def __init__(self):
        self.shape = {}

    def __call__(self, tensor, names, broadcast = False):
        if not tf.executing_eagerly():
            return
        parsed = einops.parse_shape(tensor, names)

        for name, new_dim in parsed.items():
            old_dim = self.shape.get(name, None)

            if(broadcast and new_dim == 1):
                continue
            if old_dim is None:
                self.shape[name] = new_dim
                continue
            if new_dim != old_dim:
                raise ValueError(f"Shape mismatch for dimension: '{name}'\n"
                         f"    found: {new_dim}\n"
                         f"    expected: {old_dim}\n")
        

In [14]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, text_vector_layer, neurons):
        super(Encoder, self).__init__()
        self.text_vector_layer = text_vector_layer
        self.neurons = neurons
        self.vocab_size = text_vector_layer.vocabulary_size()

        self.embeddings = tf.keras.layers.Embedding(self.vocab_size, neurons, mask_zero = True)
        self.rnn = tf.keras.layers.Bidirectional(
            merge_mode = 'sum',
            layer = tf.keras.layers.GRU(units = neurons, return_sequences = True, kernel_initializer = 'glorot_uniform')
        )

    def call(self, x):
        shape_chec = shapeChecker()
        shape_chec(x, 'batch s')
        
        x = self.embeddings(x)
        shape_chec(x, 'batch s units')
        
        x = self.rnn(x)
        shape_chec(x, 'batch s unit')

        return x
    
    def convert_inp(self, text):
        text = tf.convert_to_tensor(text)
        if len(text.shape) == 0:
            text = tf.convert_to_tensor(text)[tf.newaxis]
        context = self.text_vector_layer(text).to_tensor()
        context = self(context)
        return context

In [15]:
class Attention(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(key_dim=units, num_heads=1, **kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

    def call(self, x, context):
        shape_chec = shapeChecker()

        shape_chec(x, 'batch t units')
        shape_chec(context, 'batch s units')

        at_out, at_score = self.mha(
            query = x,
            value = context,
            return_attention_scores = True
        )

        shape_chec(x, 'batch t units')
        shape_chec(at_score, 'batch heads t s')

        at_score = tf.reduce_mean(at_score, axis = 1)
        shape_chec(at_score, 'batch t s')

        self.last_attention_weights = at_score

        x = self.add([x, at_out])
        x = self.layernorm(x)

        return x

In [16]:
class Decoder(tf.keras.layers.Layer):
    @classmethod
    def add_method(cls, fun):
        setattr(cls, fun.__name__, fun)
        return fun
    
    def __init__(self, text_vector, units):
        super(Decoder, self).__init__()
        self.text_vector = text_vector
        self.vocab_size = text_vector.vocabulary_size()
        self.word_to_id = tf.keras.layers.StringLookup(
        vocabulary=text_vector.get_vocabulary(),
        mask_token='', oov_token='[UNK]')
        self.id_to_word = tf.keras.layers.StringLookup(
        vocabulary=text_vector.get_vocabulary(),
        mask_token='', oov_token='[UNK]',
        invert=True)

        self.start_token = self.word_to_id('[START]')
        self.end_token = self.word_to_id('[END]')
        
        self.units = units

        self.embedding = tf.keras.layers.Embedding(self.vocab_size, units, mask_zero = True)
        self.rnn = tf.keras.layers.GRU(
            units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )

        self.attention = Attention(units)
        self.output_layer = tf.keras.layers.Dense(self.vocab_size)
        

In [17]:
@Decoder.add_method
def call(self, context, x, state = None, return_state = False):
    shape_chec = shapeChecker()
    shape_chec(x, 'batch t')
    shape_chec(context, 'batch s units')

    x = self.embedding(x)
    shape_chec(x, 'batch t units')

    x, state = self.rnn(x, initial_state = state)
    shape_chec(x, 'batch t units')

    x = self.attention(x, context)
    self.last_attention_weights = self.attention.last_attention_weights
    shape_chec(x, 'batch t units')
    shape_chec(self.last_attention_weights, 'batch t s')

    logits = self.output_layer(x)
    shape_chec(logits, 'batch t target_vocab_size')

    if return_state:
        return logits, state
    else:
        return logits

In [18]:
class Translator(tf.keras.Model):
    @classmethod
    def add_method(cls, func):
        setattr(cls, func.__name__, func)
        return func

    def __init__(self, units, vector_context, vector_target):
        super().__init__()
        encoder = Encoder(vector_context, units)
        decoder = Decoder(vector_target, units)

        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs):
        context, x = inputs
        context = self.encoder(context)
        logits = self.decoder(context, x)
        
        try:
            del logits._keras_mask
        except AttributeError:
            pass

        return logits

In [19]:
for (c_eg, t_in_eg), t_out_eg in train_ds.take(1):
    print(c_eg[0, :])
    print(t_in_eg[0, :])
    print(t_out_eg[0, :])

tf.Tensor([   2 1161   65    3    0    0], shape=(6,), dtype=int64)
tf.Tensor([   2 1540   70    0    0    0    0    0    0    0], shape=(10,), dtype=int64)
tf.Tensor([1540   70    3    0    0    0    0    0    0    0], shape=(10,), dtype=int64)


In [20]:
model = Translator(256, vectorization_layer_eng_c, vectorization_layer_hindi_t)

logit = model((c_eg, t_in_eg))
print(f'Context tokens, shape: (batch, s, units) {c_eg.shape}')
print(f'Target tokens, shape: (batch, t) {t_in_eg.shape}')
print(f'logits, shape: (batch, t, target_vocabulary_size) {logit.shape}')



Context tokens, shape: (batch, s, units) (64, 6)
Target tokens, shape: (batch, t) (64, 10)
logits, shape: (batch, t, target_vocabulary_size) (64, 10, 2843)




In [21]:
def masked_loss(y_true, y_pred):
    loss_func = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_func(y_true, y_pred)

    mask = tf.cast(y_true != 0, loss.dtype)
    loss *= mask

    return tf.reduce_sum(loss)/tf.reduce_sum(mask)

def masked_acc(y_true, y_pred):
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)

    match = tf.cast(y_true == y_pred, tf.float32)
    mask = tf.cast(y_true != 0, tf.float32)

    return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [22]:
model.compile(optimizer = 'adam', loss = masked_loss, metrics = [masked_acc, masked_loss])

In [33]:
his = model.fit(train_ds.repeat(), epochs = 100, steps_per_epoch = 100, validation_data = valid_ds, validation_steps = 20,
                callbacks = [tf.keras.callbacks.EarlyStopping(patience=3)])

Epoch 1/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 134ms/step - loss: 6.0887 - masked_acc: 0.1601 - masked_loss: 6.0931 - val_loss: 5.0445 - val_masked_acc: 0.2388 - val_masked_loss: 5.0502
Epoch 2/100
[1m  1/100[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m11s[0m 114ms/step - loss: 4.3285 - masked_acc: 0.2824 - masked_loss: 4.3285



[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 123ms/step - loss: 3.9989 - masked_acc: 0.3242 - masked_loss: 4.0071 - val_loss: 4.4890 - val_masked_acc: 0.3155 - val_masked_loss: 4.4946
Epoch 3/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 120ms/step - loss: 2.4144 - masked_acc: 0.5225 - masked_loss: 2.4149 - val_loss: 4.4030 - val_masked_acc: 0.3384 - val_masked_loss: 4.4143
Epoch 4/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 123ms/step - loss: 1.5671 - masked_acc: 0.6670 - masked_loss: 1.5647 - val_loss: 4.4936 - val_masked_acc: 0.3581 - val_masked_loss: 4.5082
Epoch 5/100
[1m 56/100[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m5s[0m 118ms/step - loss: 0.7997 - masked_acc: 0.8335 - masked_loss: 0.7987

KeyboardInterrupt: 