In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.keras.layers import(SimpleRNN,Embedding,Input,LSTM,Input,
                                    Dropout,Dense,GRU,LayerNormalization,
                                    Bidirectional,Reshape)
from tensorflow.data.experimental import AUTOTUNE
import numpy as np
import re
import string
import nltk
import datetime
import numpy as np
from matplotlib import pyplot as plt
import pandas

<H1>DATA PREPARATION</H1>

In [None]:
# File read line by line by tf.data.TextLineDataset in the next cell; each line
# is later split on tabs. NOTE(review): '...' looks like a placeholder -- point
# it at the real corpus file before running.
path='...'

In [None]:
# --- Dataset configuration -------------------------------------------------
# Number of lines read from the corpus file.
NUM_EXAMPLES = 250

# Fraction of the examples that goes to the TRAINING split; whatever is left
# becomes the validation split. (The name is kept because other cells use it.)
# It was previously 1, which made VALIDATION_BRIDGE == NUM_EXAMPLES and left
# the validation dataset empty even though it is passed to model.fit().
# NOTE(review): 0.8 is a conventional split, not a tuned value -- adjust freely.
VALIDATION_RATIO = 0.8

# Index of the first validation example after shuffling.
VALIDATION_BRIDGE = int(VALIDATION_RATIO * NUM_EXAMPLES)

BATCH_SIZE = 1024

# One string element per line of the file, truncated to NUM_EXAMPLES lines.
text_dataset = tf.data.TextLineDataset(path).take(NUM_EXAMPLES)

In [None]:
# Show a single raw element so the file format can be checked by eye.
for raw_line in text_dataset.take(1):
    print(raw_line)

In [None]:
def selector(input_text):
    """Split one tab-separated line into the three text streams used for training.

    Args:
        input_text: string tensor holding one line of the corpus; fields are
            separated by tab characters.

    Returns:
        A 3-tuple of 1-D string tensors (slices are used so the axis is kept):
          * the first field (encoder input),
          * the second field prefixed with the ``starttoken`` marker
            (decoder input),
          * the second field unchanged (decoder target).
    """
    # Split once and reuse the result instead of re-splitting for every field.
    fields = tf.strings.split(input_text, '\t')
    source_text = fields[0:1]
    target_text = fields[1:2]
    # The trailing space is required: without it the marker is glued to the
    # first word ("starttokenhello") and never becomes a token of its own.
    return source_text, 'starttoken ' + target_text, target_text

In [None]:
# Replace each raw line with the tuple returned by selector().
# Note: this rebinds text_dataset, so re-running the cell without re-running
# the cell that builds the raw dataset would apply selector() to tuples.
text_dataset=text_dataset.map(selector)

In [None]:
# Show one element after selector() has been mapped over the dataset.
for streams in text_dataset.take(1):
    print(streams)

In [None]:
def preprocess_sentences(input_data):
    '''
    Standardize raw sentences before tokenization.

    Steps, applied in order:
      1. lowercase the text,
      2. remove anything that looks like an HTML/XML tag (``<...>``),
      3. replace every ASCII punctuation character with a space,
      4. collapse runs of spaces into a single space.

    Input: string tensor of raw sentences
    Output: string tensor of standardized sentences (same shape)
    '''
    lowered = tf.strings.lower(input_data)
    without_tags = tf.strings.regex_replace(lowered, "<[^>]+>", "")
    # re.escape keeps characters such as ']' and '\' literal inside the class.
    without_punctuation = tf.strings.regex_replace(
        without_tags, "[%s]" % re.escape(string.punctuation), " ")
    # " +" also handles runs of three or more spaces left by the step above.
    single_spaced = tf.strings.regex_replace(without_punctuation, " +", " ")

    return single_spaced

In [None]:
# Length of every id sequence produced by the vectorization layers.
SEQUENCE_LENGTH = 10

# Text -> integer ids for the first stream returned by selector().
vectorize_input_layer = TextVectorization(
    output_sequence_length=SEQUENCE_LENGTH,
    standardize=preprocess_sentences,
)

In [None]:
# Text -> integer ids for the second stream returned by selector()
# (the target sentence carrying the 'starttoken' prefix).
vectorize_pre_output_layer = TextVectorization(
    output_sequence_length=SEQUENCE_LENGTH,
    standardize=preprocess_sentences,
)

In [None]:
# Text -> integer ids for the third stream returned by selector()
# (the plain target sentence used as the label).
vectorize_output_layer = TextVectorization(
    output_sequence_length=SEQUENCE_LENGTH,
    standardize=preprocess_sentences,
)

In [None]:
# Build the encoder-side vocabulary from the first stream only.
training_data = text_dataset.map(lambda source, dec_input, target: source)
vectorize_input_layer.adapt(training_data)

In [None]:
# Build the decoder-input vocabulary from the second stream only.
training_data = text_dataset.map(lambda source, dec_input, target: dec_input)
vectorize_pre_output_layer.adapt(training_data)

In [None]:
# Build the label vocabulary from the third stream only.
training_data = text_dataset.map(lambda source, dec_input, target: target)
vectorize_output_layer.adapt(training_data)

In [None]:
# Number of entries each adapted layer reports from get_vocabulary(); these
# sizes are handed to the model when it is built.
VOCAB_INPUT_SIZE, VOCAB_PRE_OUTPUT_SIZE, VOCAB_OUTPUT_SIZE = (
    len(layer.get_vocabulary())
    for layer in (
        vectorize_input_layer,
        vectorize_pre_output_layer,
        vectorize_output_layer,
    )
)

In [None]:
def vectorizer(x, y, z):
    """Convert the three text streams of one example into integer id tensors.

    Args:
        x: text for the encoder (vectorized with ``vectorize_input_layer``).
        y: text for the decoder input (``vectorize_pre_output_layer``).
        z: text for the label (``vectorize_output_layer``).

    Returns:
        ``(features, label)`` where ``features`` is a dict with keys ``'in1'``
        and ``'in2'`` and ``label`` is the id tensor for ``z``. Axis 0 of every
        layer output is squeezed away.
    """
    encoder_ids = tf.squeeze(vectorize_input_layer(x), 0)
    decoder_ids = tf.squeeze(vectorize_pre_output_layer(y), 0)
    label_ids = tf.squeeze(vectorize_output_layer(z), 0)
    features = {'in1': encoder_ids, 'in2': decoder_ids}
    return features, label_ids


dataset = text_dataset.map(vectorizer)

In [None]:
# Shuffle once, then cut the stream at VALIDATION_BRIDGE.
# reshuffle_each_iteration=False keeps the shuffled order fixed; with the
# default (True) every new iteration is shuffled differently, so the take()
# and skip() halves below could contain overlapping examples.
dataset = dataset.shuffle(NUM_EXAMPLES, reshuffle_each_iteration=False)
train_dataset = dataset.take(VALIDATION_BRIDGE)
validation_dataset = dataset.skip(VALIDATION_BRIDGE)


In [None]:
# Batch each split, cache the batched data, and prefetch while training runs.
train_dataset = (
    train_dataset
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)
validation_dataset = (
    validation_dataset
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)


In [None]:
# Display the label-vocabulary entry stored at index 1372.
# NOTE(review): the index is hard-coded; if the adapted vocabulary has fewer
# than 1373 entries this raises IndexError -- confirm against VOCAB_OUTPUT_SIZE.
vectorize_output_layer.get_vocabulary()[1372]

<H1>MODELING</H1>

In [None]:
class SelfAttention(tf.keras.layers.Layer):
    """Scaled dot-product attention with padding and optional look-ahead masks."""

    def __init__(self, model_size):
        super().__init__()
        # Only used to scale the scores by 1 / sqrt(model_size).
        self.model_size = model_size

    def call(self, query, key, value, sequence, look_ahead_masking=False):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query, key, value: rank-3 tensors (the einsum below contracts
                their last axis and shares their first axis).
            sequence: token ids; positions equal to 0 are masked out via
                ``padding_mask``.
            look_ahead_masking: when true, entries above the main diagonal of
                the score matrix are masked as well.

        Returns:
            The attention-weighted combination of ``value``.
        """
        scale = tf.math.sqrt(tf.cast(self.model_size, tf.float32))
        # (i, j, k) x (i, b, k) -> (i, j, b): every query row against every key row.
        score = tf.einsum('ijk,ibk->ijb', query, key) / scale

        mask = padding_mask(sequence)
        if look_ahead_masking:
            # band_part(x, -1, 0) keeps the lower triangle, so 1 - band_part
            # is 1 exactly on the strictly-upper (future) entries.
            future = 1 - tf.linalg.band_part(tf.ones_like(score), -1, 0)
            mask = mask + future

        # A large negative score makes masked positions vanish after softmax.
        score = score + mask * -1e10
        weights = tf.nn.softmax(score, axis=-1)
        return tf.matmul(weights, value)

In [None]:
def padding_mask(a):
    """Return a float mask that is 1.0 where ``a`` equals 0 and 0.0 elsewhere.

    A new axis is inserted before the last one so the mask can broadcast
    against attention scores.
    """
    is_padding = tf.math.equal(a, 0)
    as_float = tf.cast(is_padding, tf.float32)
    return tf.expand_dims(as_float, axis=-2)

In [None]:
def _position_row(pos, model_size):
    """Sinusoidal encoding for one position as a 1-D float array."""
    row = np.zeros((model_size))
    for i in range(model_size):
        # Index pairs (2k, 2k+1) share the exponent 2k / model_size.
        even_partner = i if i % 2 == 0 else i - 1
        angle = pos / (10000 ** (even_partner / model_size))
        row[i] = np.sin(angle) if i % 2 == 0 else np.cos(angle)
    return row


def positional_embedding(model_size):
    """Build the sinusoidal position table for SEQUENCE_LENGTH positions.

    Even feature indices hold a sine and odd indices a cosine of the same
    angle as the even index just before them.

    Returns:
        Tensor with one row per position and ``model_size`` columns.
    """
    rows = [
        tf.expand_dims(_position_row(pos, model_size), axis=0)
        for pos in range(SEQUENCE_LENGTH)
    ]
    return tf.concat(rows, axis=0)

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built from ``h`` independently projected heads.

    Each head has its own query/key/value Dense projection of width
    ``model_size // h``; head outputs are concatenated on the last axis and
    projected back to ``model_size``.
    """

    def __init__(self, model_size, h):
        super(MultiHeadAttention, self).__init__()
        self.query_size = model_size // h
        self.key_size = model_size // h
        self.value_size = model_size // h
        self.h = h
        self.dense_q = [Dense(self.query_size) for _ in range(h)]
        self.dense_k = [Dense(self.key_size) for _ in range(h)]
        self.dense_v = [Dense(self.value_size) for _ in range(h)]
        self.dense_o = Dense(model_size)
        # Scores are scaled by sqrt(key_size). This used to read
        # `SelfAttention(self,key_size)`, which raised NameError on construction.
        self.self_attention = SelfAttention(self.key_size)

    def call(self, query, key, value, sequence, look_ahead_masking=False):
        """Run every head and merge the results.

        Args:
            query, key, value: inputs handed to the per-head projections.
            sequence: token ids used by the attention padding mask.
            look_ahead_masking: forwarded to ``SelfAttention``.

        Returns:
            Tensor whose last axis has size ``model_size``.
        """
        heads = []

        for i in range(self.h):
            head = self.self_attention(
                self.dense_q[i](query),
                self.dense_k[i](key),
                self.dense_v[i](value),
                sequence,
                look_ahead_masking,
            )
            heads.append(head)
        heads = tf.concat(heads, axis=2)
        return self.dense_o(heads)

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network, each
    followed by a residual connection and layer normalization."""

    def __init__(self, vocab_size, model_size, h):
        """
        Args:
            vocab_size: unused here (embedding is done by ``Encoder``); kept
                so existing callers do not need to change.
            model_size: feature width of the block's input and output.
            h: number of attention heads.
        """
        super(EncoderLayer, self).__init__()

        self.model_size = model_size
        self.h = h
        self.multi_attention = MultiHeadAttention(model_size, h)
        # call() normalizes the attention output with this layer; it was
        # previously never created, so the first call raised AttributeError.
        self.attention_norm = LayerNormalization()
        self.dropout = Dropout(0.2)

        self.dense_1 = Dense(model_size * 4, activation='relu')
        self.dense_2 = Dense(model_size)
        self.feed_forward_norm = LayerNormalization()

    def call(self, enc_in, sequence):
        """
        Args:
            enc_in: features entering the block.
            sequence: token ids used for the attention padding mask.

        Returns:
            Features leaving the block (last axis ``model_size``).
        """
        enc_out = self.multi_attention(enc_in, enc_in, enc_in, sequence, look_ahead_masking=False)
        enc_out = enc_in + enc_out
        enc_out = self.attention_norm(enc_out)

        feed_forward_in = enc_out
        feed_forward_out = self.dropout(self.dense_2(self.dense_1(feed_forward_in)))
        feed_forward_out += feed_forward_in
        feed_forward_out = self.feed_forward_norm(feed_forward_out)
        return feed_forward_out

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Token embedding + positional encoding followed by a stack of
    ``EncoderLayer`` blocks."""

    def __init__(self, vocab_size, model_size, h, num_layers):
        """
        Args:
            vocab_size: size of the input vocabulary (embedding rows).
            model_size: embedding / feature width.
            h: number of attention heads per block.
            num_layers: number of stacked ``EncoderLayer`` blocks.
        """
        super(Encoder, self).__init__()

        self.model_size = model_size
        self.num_layers = num_layers
        self.h = h
        self.embedding = Embedding(vocab_size, model_size)
        self.encoder_layer = [EncoderLayer(vocab_size, model_size, h) for _ in range(num_layers)]

    def call(self, sequence):
        """Encode a batch of token-id sequences.

        Args:
            sequence: integer token ids; also used for the padding mask.

        Returns:
            The output of the last block, or the position-encoded embedding
            itself when ``num_layers`` is 0.
        """
        enc_out = self.embedding(sequence)
        enc_out += tf.cast(positional_embedding(self.model_size), dtype=tf.float32)

        # Carrying one variable through the loop keeps the result defined for
        # an empty stack (the old code returned an unbound `out` in that case).
        for layer in self.encoder_layer:
            enc_out = layer(enc_out, sequence)
        return enc_out

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, attention over the encoder
    output, and a feed-forward network; each sub-layer is followed by a
    residual connection and layer normalization."""

    def __init__(self, model_size, num_layers, h):
        """
        Args:
            model_size: feature width of the block's input and output.
            num_layers: stored on the instance; not otherwise used here.
            h: number of attention heads.
        """
        super(DecoderLayer, self).__init__()

        self.model_size = model_size
        self.num_layers = num_layers
        self.h = h

        # Attribute names now match the ones read in call() (they were
        # misspelled "attetnion_*", which raised AttributeError).
        self.multi_attention_bot = MultiHeadAttention(model_size, h)
        self.attention_bot_norm = LayerNormalization()

        self.multi_attention_mid = MultiHeadAttention(model_size, h)
        self.attention_mid_norm = LayerNormalization()

        self.dense_1 = Dense(model_size * 4, activation='relu')
        self.dense_2 = Dense(model_size)
        self.dropout = Dropout(0.2)

        self.feed_forward_norm = LayerNormalization()

    def call(self, bot_dec_in, encoder_output, sequence, encoder_sequence=None):
        """
        The signature matches how ``Decoder`` invokes the block:
        ``layer(dec_in, encoder_output, sequence)``.

        Args:
            bot_dec_in: decoder features entering the block.
            encoder_output: encoder features used as keys and values of the
                middle attention.
            sequence: decoder token ids; padding mask of the self-attention.
            encoder_sequence: optional encoder token ids; when given, encoder
                padding positions are masked in the middle attention. When
                omitted, no encoder position is masked there.

        Returns:
            Features leaving the block (last axis ``model_size``).
        """
        # Masked self-attention: a position only sees itself and earlier ones.
        bot_dec_out = self.multi_attention_bot(
            bot_dec_in, bot_dec_in, bot_dec_in, sequence, look_ahead_masking=True)
        bot_dec_out += bot_dec_in
        bot_dec_out = self.attention_bot_norm(bot_dec_out)

        mid_dec_in = bot_dec_out

        # Keys of the middle attention are encoder positions, so decoder token
        # ids must not drive its padding mask. An all-ones "sequence" makes
        # padding_mask() return zeros, i.e. nothing is masked.
        if encoder_sequence is None:
            key_tokens = tf.ones_like(sequence)
        else:
            key_tokens = encoder_sequence

        # Queries come from the decoder, keys/values from the encoder output.
        mid_dec_out = self.multi_attention_mid(
            mid_dec_in, encoder_output, encoder_output, key_tokens, look_ahead_masking=False)
        mid_dec_out += mid_dec_in
        mid_dec_out = self.attention_mid_norm(mid_dec_out)

        feed_forward_in = mid_dec_out

        feed_forward_out = self.dropout(self.dense_2(self.dense_1(feed_forward_in)))
        feed_forward_out += feed_forward_in
        feed_forward_out = self.feed_forward_norm(feed_forward_out)
        return feed_forward_out

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Token embedding + positional encoding, a stack of ``DecoderLayer``
    blocks, and a final Dense projection onto the output vocabulary."""

    def __init__(self, vocab_size, model_size, h, num_layers, pre_vocab_size=None):
        """
        Args:
            vocab_size: size of the output vocabulary (width of the final
                Dense layer).
            model_size: embedding / feature width.
            h: number of attention heads per block.
            num_layers: number of stacked ``DecoderLayer`` blocks.
            pre_vocab_size: size of the decoder-INPUT vocabulary (embedding
                rows). Defaults to ``vocab_size`` when omitted; pass it
                explicitly when the two vocabularies differ.
        """
        super(Decoder, self).__init__()

        # `pre_vocab_size` used to be read without being a parameter, and
        # Transformer passes it by keyword, so construction always failed.
        if pre_vocab_size is None:
            pre_vocab_size = vocab_size

        self.model_size = model_size
        self.num_layers = num_layers
        self.h = h
        self.embedding = Embedding(pre_vocab_size, model_size)
        self.decoder_layer = [DecoderLayer(model_size, num_layers, h) for _ in range(num_layers)]
        # No activation: the layer emits unnormalized scores per vocabulary entry.
        self.dense = Dense(vocab_size,)

    def call(self, sequence, encoder_output):
        """Decode a batch of decoder-input token ids.

        Args:
            sequence: integer decoder-input token ids.
            encoder_output: features produced by the encoder.

        Returns:
            Scores over the output vocabulary for every position.
        """
        dec_out = self.embedding(sequence)
        dec_out += tf.cast(positional_embedding(self.model_size), dtype=tf.float32)

        # One running variable keeps the result defined for an empty stack.
        for layer in self.decoder_layer:
            dec_out = layer(dec_out, encoder_output, sequence)
        return self.dense(dec_out)

In [None]:
class Transformer(tf.keras.Model):
    """Encoder-decoder model: the encoder reads the input ids and the decoder
    combines its output with the decoder-input ids."""

    def __init__(self, VOCAB_INPUT_SIZE, VOCAB_PRE_OUTPUT_SIZE, VOCAB_OUTPUT_SIZE, MODEL_SIZE, NUM_HEADS, NUM_LAYERS):
        """
        Args:
            VOCAB_INPUT_SIZE: vocabulary size given to the encoder.
            VOCAB_PRE_OUTPUT_SIZE: decoder-input vocabulary size.
            VOCAB_OUTPUT_SIZE: output vocabulary size.
            MODEL_SIZE: feature width shared by encoder and decoder.
            NUM_HEADS: attention heads per block.
            NUM_LAYERS: blocks in each stack.
        """
        super().__init__()

        # Settings that are identical for both halves of the model.
        shared = dict(model_size=MODEL_SIZE, h=NUM_HEADS, num_layers=NUM_LAYERS)

        self.encoder = Encoder(vocab_size=VOCAB_INPUT_SIZE, **shared)
        self.decoder = Decoder(
            pre_vocab_size=VOCAB_PRE_OUTPUT_SIZE,
            vocab_size=VOCAB_OUTPUT_SIZE,
            **shared,
        )

    def call(self, inputs, pre_outputs):
        """Return the decoder output for ``pre_outputs`` given ``inputs``."""
        encoded = self.encoder(inputs)
        return self.decoder(pre_outputs, encoded)

In [None]:
# Model hyper-parameters. These names were used below but never defined
# anywhere in the notebook, so a fresh "Restart & Run All" stopped here with
# NameError. NOTE(review): the values are small placeholders -- tune as needed.
MODEL_SIZE = 128
NUM_HEADS = 4
NUM_LAYERS = 2
# MultiHeadAttention gives each head model_size // h features.
assert MODEL_SIZE % NUM_HEADS == 0, "MODEL_SIZE must be divisible by NUM_HEADS"

# The input names match the feature-dict keys produced by vectorizer()
# ('in1' / 'in2') so Keras can route dataset elements to the right input.
inputs = Input(shape=(SEQUENCE_LENGTH,), dtype='int64', name='in1')
pre_outputs = Input(shape=(SEQUENCE_LENGTH,), dtype='int64', name='in2')

transformer = Transformer(VOCAB_INPUT_SIZE, VOCAB_PRE_OUTPUT_SIZE, VOCAB_OUTPUT_SIZE, MODEL_SIZE, NUM_HEADS, NUM_LAYERS)
decoder_output = transformer(inputs, pre_outputs)
model = tf.keras.Model([inputs, pre_outputs], decoder_output, name='transformer')
model.summary()

<H1>TRAINING</H1>

In [None]:
class BLEU(tf.keras.metrics.Metric):
    """Unigram-overlap score between predicted and reference token ids.

    For every sentence, predicted tokens are scanned up to the first 0; a
    token scores a hit when it also appears in the reference before the
    reference's first 0, and each reference token can be matched only once.
    The result is ``hits / number of non-zero predicted tokens``.

    The update loops over tensors in Python, so the model has to be compiled
    with ``run_eagerly=True`` when this metric is used.
    """

    def __init__(self, name='bleu_score', **kwargs):
        # Forward the name; it used to be dropped, so Keras reported the
        # metric under an auto-generated name.
        super(BLEU, self).__init__(name=name, **kwargs)
        self.add = 0      # matched predicted tokens
        self.total = 0    # non-zero predicted tokens

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate hits and totals for one batch.

        Args:
            y_true: reference ids, or one-hot references with the same rank
                as ``y_pred``.
            y_pred: per-position scores over the vocabulary.
            sample_weight: accepted for API compatibility; not used.
        """
        # Only reduce the references when they are one-hot encoded.
        if len(y_true.shape) == len(y_pred.shape):
            y_true = tf.argmax(y_true, -1)
        y_pred = tf.argmax(y_pred, -1)
        y_true = tf.cast(y_true, y_pred.dtype)

        for i, j in zip(y_pred, y_true):
            tf.autograph.experimental.set_loop_options()
            self.total += tf.math.count_nonzero(i)
            for word in i:
                if word == 0:
                    break
                for q in range(len(j)):
                    if j[q] == 0:
                        break
                    if word == j[q]:
                        self.add += 1
                        # Drop the matched reference token so it cannot be
                        # credited a second time.
                        j = tf.boolean_mask(j, [y != q for y in range(len(j))])
                        break

    def result(self):
        # divide_no_nan yields 0 instead of failing before the first update.
        return tf.math.divide_no_nan(
            tf.cast(self.add, tf.float32), tf.cast(self.total, tf.float32))

    def reset_state(self):
        # The counters are plain attributes, so the base-class reset (which
        # only clears tf.Variables) would let them leak across epochs.
        self.add = 0
        self.total = 0

    # Older Keras releases call the plural spelling.
    reset_states = reset_state

In [None]:
LR = 1e-3
EPOCH = 100
model.compile(
    # The labels produced by vectorizer() are integer ids (not one-hot) and
    # the decoder's final Dense layer has no activation, so the loss has to be
    # the sparse variant operating on logits.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
    # To track the BLEU metric, enable BOTH lines below (it needs eager mode):
    #metrics=[BLEU()],
    #run_eagerly=True,
)

In [None]:
# NOTE(review): both paths look like placeholders -- set them before training.
checkpoint_filepath = '...'
log_dir = '...'

# Write weights only, and only when the monitored 'loss' reaches a new minimum.
callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
# Train for EPOCH epochs, evaluating on the validation split after each one;
# the returned History object is kept for later inspection.
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=EPOCH,
    verbose=1,
    callbacks=[callback],
)

<H1>TESTING</H1>

In [None]:
def translate(input_sentence):
    """Greedily decode a translation of ``input_sentence``.

    The decoder input starts with the ``starttoken`` id; at step ``i`` the
    model is run on the ids produced so far and the highest-scoring output at
    position ``i`` is taken. Decoding stops at SEQUENCE_LENGTH tokens or as
    soon as id 0 is predicted.

    Args:
        input_sentence: Python string to translate.

    Returns:
        List of output-vocabulary tokens, in order.
    """
    print('Input:-->', input_sentence)
    # A batch holding one sentence -> ids with a leading batch axis of 1.
    encoder_ids = vectorize_input_layer(tf.constant([input_sentence]))

    pre_vocab = vectorize_pre_output_layer.get_vocabulary()
    out_vocab = vectorize_output_layer.get_vocabulary()
    pre_index = {token: idx for idx, token in enumerate(pre_vocab)}
    # Look the start marker up instead of assuming its id; 2 is the value the
    # original code hard-coded and is kept as the fallback.
    start_id = pre_index.get('starttoken', 2)

    decoder_ids = [start_id]
    final_output = []

    for i in range(SEQUENCE_LENGTH):
        # Right-pad with 0 up to SEQUENCE_LENGTH. (This used the undefined
        # name `I`, which raised NameError on the first step.)
        padded = tf.pad(
            tf.constant(decoder_ids, dtype=tf.int64),
            [[0, SEQUENCE_LENGTH - len(decoder_ids)]],
        )
        scores = model.predict([encoder_ids, tf.expand_dims(padded, 0)])
        next_id = int(tf.argmax(scores, -1)[0][i])
        if next_id == 0:
            break
        final_output.append(next_id)
        # Output ids and decoder-input ids come from two different
        # vocabularies, so translate through the token text rather than
        # assuming a fixed +1 offset. Unknown tokens fall back to id 1, the
        # out-of-vocabulary slot of TextVectorization.
        decoder_ids.append(pre_index.get(out_vocab[next_id], 1))

    return [out_vocab[token_id] for token_id in final_output]

In [None]:
# Smoke test: the cell displays the token list returned for a short sentence.
translate('we won')