In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.keras.layers import SimpleRNN,Embedding,Input,LSTM,Dense,GRU,Bidirectional,Reshape
from tensorflow.data.experimental import AUTOTUNE
import numpy as np
import re
import string
import nltk
import datetime
import numpy as np
from matplotlib import pyplot as plt
import pandas

<H1>DATA PREPARATION</H1>

In [None]:
# Location of the tab-separated text file that is read line by line further down.
path = '...'

In [None]:
# Corpus size, split point and batch size.
NUM_EXAMPLES = 250
# NOTE(review): despite its name this ratio is used as the *training* fraction
# (train = take(VALIDATION_BRIDGE), validation = skip(VALIDATION_BRIDGE)); with a
# value of 1 the validation split ends up empty -- confirm this is intended.
VALIDATION_RATIO = 1
VALIDATION_BRIDGE = int(NUM_EXAMPLES * VALIDATION_RATIO)
BATCH_SIZE = 1024

# Keep only the first NUM_EXAMPLES lines of the file.
text_dataset = tf.data.TextLineDataset(path).take(NUM_EXAMPLES)

In [None]:
# Sanity check: print one raw line straight from the file.
for raw_line in text_dataset.take(1):
    print(raw_line)

In [None]:
def selector(input_text):
    '''
    Task: Turn one tab-separated line into an (encoder input, decoder input, target) triple
    Input: string tensor holding one line of the form "<source>\t<target>..."
    output: three string slices: the first field, the second field prefixed
            with 'starttoken ', and the second field unchanged
    '''
    # Split once and reuse the pieces instead of re-splitting for every output.
    fields = tf.strings.split(input_text, '\t')
    source = fields[0:1]
    target = fields[1:2]
    # The trailing space keeps 'starttoken' a separate whitespace-delimited token;
    # without it the marker fuses with the first target word and never becomes
    # its own vocabulary entry.
    return source, 'starttoken ' + target, target

In [None]:
# Replace every raw line by the triple produced by selector().
# Note: this rebinds text_dataset, so the cell must run exactly once per kernel session.
text_dataset = text_dataset.map(selector)

In [None]:
# Sanity check: print one (source, decoder input, target) triple.
for triple in text_dataset.take(1):
    print(triple)

In [None]:
def preprocess_sentences(input_data):
    '''
    Task: Preprocess sentences or standardize the sentences
    Input: string tensor of raw sentences
    output: string tensor of standardized sentences: lower-cased, markup tags
            removed, punctuation replaced by spaces, runs of spaces collapsed
    '''
    lowered = tf.strings.lower(input_data)
    # Each step consumes the result of the previous one so that every
    # replacement is reflected in the returned tensor.
    without_tags = tf.strings.regex_replace(lowered, "<[^>]+>", "")
    without_punctuation = tf.strings.regex_replace(
        without_tags, "[%s]" % re.escape(string.punctuation), " ")
    # " +" collapses runs of any length, not just exact pairs of spaces.
    collapsed = tf.strings.regex_replace(without_punctuation, " +", " ")

    return collapsed

In [None]:
# Fixed length (in tokens) requested from every vectorization layer.
SEQUENCE_LENGTH = 10

# Vectorizer for the first element of each triple (the encoder input).
vectorize_input_layer = TextVectorization(
    output_sequence_length=SEQUENCE_LENGTH,
    standardize=preprocess_sentences,
)

In [None]:
# Vectorizer for the second element of each triple (the 'starttoken'-prefixed decoder input).
vectorize_pre_output_layer = TextVectorization(
    output_sequence_length=SEQUENCE_LENGTH,
    standardize=preprocess_sentences,
)

In [None]:
# Vectorizer for the third element of each triple (the target sentence).
vectorize_output_layer = TextVectorization(
    output_sequence_length=SEQUENCE_LENGTH,
    standardize=preprocess_sentences,
)

In [None]:
# Fit the encoder-input vocabulary on the first element of every triple.
training_data = text_dataset.map(lambda source, decoder_in, target: source)
vectorize_input_layer.adapt(training_data)

In [None]:
# Fit the decoder-input vocabulary on the second element of every triple.
training_data = text_dataset.map(lambda source, decoder_in, target: decoder_in)
vectorize_pre_output_layer.adapt(training_data)

In [None]:
# Fit the target vocabulary on the third element of every triple.
training_data = text_dataset.map(lambda source, decoder_in, target: target)
vectorize_output_layer.adapt(training_data)

In [None]:
# Number of entries in each adapted vocabulary; consumed by the model definition.
VOCAB_INPUT_SIZE = len(vectorize_input_layer.get_vocabulary())
VOCAB_PRE_OUTPUT_SIZE = len(vectorize_pre_output_layer.get_vocabulary())
VOCAB_OUTPUT_SIZE = len(vectorize_output_layer.get_vocabulary())

In [None]:
def vectorizer(x,y,z):
    '''
    Task: Convert one (source, decoder input, target) string triple into model features and labels
    Input: x, y, z - the three string slices produced by selector()
    output: ({'in1': encoder token ids, 'in2': decoder-input token ids},
             one-hot encoded target token ids)
    '''
    # Each layer returns a leading axis of size 1 for the single sentence; drop it.
    encoder_ids = tf.squeeze(vectorize_input_layer(x), 0)
    decoder_ids = tf.squeeze(vectorize_pre_output_layer(y), 0)
    target_ids = tf.squeeze(vectorize_output_layer(z), 0)
    # The loss (CategoricalCrossentropy) and the BLEU metric (argmax over the
    # last axis of y_true) both expect one-hot targets, so encode them here.
    target_one_hot = tf.one_hot(target_ids, VOCAB_OUTPUT_SIZE)
    return {'in1': encoder_ids, 'in2': decoder_ids}, target_one_hot
dataset = text_dataset.map(vectorizer)

In [None]:
# Sanity check: print one vectorized (features, target) example.
for example in dataset.take(1):
    print(example)

In [None]:
# Show the entry at index 2 of the decoder-input vocabulary.
# NOTE(review): presumably indices 0 and 1 are the reserved padding / out-of-vocabulary
# entries, so this is the most frequent real token (expected to be the start token) -- confirm.
vectorize_pre_output_layer.get_vocabulary()[2]

In [None]:
# Shuffle once with a fixed order: by default shuffle() draws a new order on every
# iteration, so take() and skip() would see different permutations and the
# train and validation sets could overlap.
dataset = dataset.shuffle(NUM_EXAMPLES, reshuffle_each_iteration=False)
# The first VALIDATION_BRIDGE examples train the model, the remainder validates it.
train_dataset = dataset.take(VALIDATION_BRIDGE)
validation_dataset = dataset.skip(VALIDATION_BRIDGE)

In [None]:
# Batch, cache the batched elements, and prefetch with an auto-tuned buffer.
train_dataset = (
    train_dataset
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)
validation_dataset = (
    validation_dataset
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)


<H1>MODELING</H1>

In [None]:
# Model hyper-parameters.
EMBEDDING_DIM = 8
LSTM_ENCODER_HIDDEN_SIZE = 300
LSTM_DECODER_HIDDEN_SIZE = 1000
SENTENCE_LENGTH = 10  # not used in this cell; SEQUENCE_LENGTH drives the input shapes

# Input names match the 'in1' / 'in2' keys of the feature dict yielded by the dataset.
inputs = Input(shape=(SEQUENCE_LENGTH,), name='in1')
pre_out = Input(shape=(SEQUENCE_LENGTH,), name='in2')

# Encoder: embed the source ids and keep only the final LSTM states.
x = Embedding(VOCAB_INPUT_SIZE, EMBEDDING_DIM)(inputs)
encoder = LSTM(
    LSTM_ENCODER_HIDDEN_SIZE,
    return_sequences=False,
    return_state=True)
_, h, c = encoder(x)

# Decoder input ids come from vectorize_pre_output_layer, so the embedding
# table must cover that vocabulary.
x = Embedding(VOCAB_PRE_OUTPUT_SIZE, EMBEDDING_DIM)(pre_out)
decoder = LSTM(
    LSTM_DECODER_HIDDEN_SIZE,
    return_state=True,
    return_sequences=True)

# Project the encoder states to the decoder's hidden size before using them
# as the decoder's initial state.
h = Dense(LSTM_DECODER_HIDDEN_SIZE)(h)
c = Dense(LSTM_DECODER_HIDDEN_SIZE)(c)

x, h, c = decoder(x, initial_state=[h, c])
# Per-position distribution over the target vocabulary.
x = Dense(VOCAB_OUTPUT_SIZE, activation='softmax')(x)
model = tf.keras.Model([inputs, pre_out], x)
model.summary()

<H1>TRAINING</H1>

In [None]:
class BLEU(tf.keras.metrics.Metric):
    '''
    Task: Track clipped unigram precision of predicted token ids against reference ids
    Input: y_true / y_pred whose last axis is arg-maxed into token ids
           (assumes shape (batch, sequence, vocabulary) -- TODO confirm); id 0 ends a sentence
    output: matched predicted tokens / non-zero predicted tokens, accumulated
            over all batches since the last reset

    update_state reads tensor values with .numpy(), so the model has to be
    compiled with run_eagerly=True.
    '''
    def __init__(self, name='bleu_score', **kwargs):
        # Forward the name so the metric is reported under it.
        super(BLEU, self).__init__(name=name, **kwargs)
        # Keeping the counters as metric weights lets Keras zero them whenever
        # it resets the metric.
        self.add = self.add_weight(name='matched', initializer='zeros')
        self.total = self.add_weight(name='total', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        '''
        Task: Accumulate match / prediction counts for one batch
        Input: y_true, y_pred as described on the class; sample_weight is ignored
        output: None (updates self.add and self.total)
        '''
        references = tf.argmax(y_true, -1).numpy().tolist()
        hypotheses = tf.argmax(y_pred, -1).numpy().tolist()

        matched = 0
        predicted = 0
        for hypothesis, reference in zip(hypotheses, references):
            # Every non-zero predicted id counts towards the denominator.
            predicted += sum(1 for token in hypothesis if token != 0)

            # Multiset of reference tokens that appear before the first 0.
            remaining = {}
            for token in reference:
                if token == 0:
                    break
                remaining[token] = remaining.get(token, 0) + 1

            # Each predicted token (up to the first 0) may consume at most one
            # still-unmatched occurrence of the same token in the reference.
            for token in hypothesis:
                if token == 0:
                    break
                if remaining.get(token, 0) > 0:
                    remaining[token] -= 1
                    matched += 1

        self.add.assign_add(float(matched))
        self.total.assign_add(float(predicted))

    def result(self):
        '''
        Task: Report the accumulated precision
        output: self.add / self.total, or 0 when nothing has been predicted yet
        '''
        return tf.math.divide_no_nan(self.add, self.total)

In [None]:
# Optimisation settings.
LR = 1e-3
EPOCH = 100
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
    metrics=[BLEU()],
    # BLEU.update_state iterates over concrete tensor values, so run eagerly.
    run_eagerly=True,
)

In [None]:
# Output locations.
checkpoint_filepath = '...'
log_dir = '...'  # defined here but not referenced by the callback below

# Save weights only, and only when the monitored training loss reaches a new minimum.
callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
# Train for EPOCH epochs, checkpointing through `callback`.
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=EPOCH,
    callbacks=[callback],
    verbose=1,
)

<H1>TESTING</H1>

In [None]:
# One-sentence datasets: the sentence to feed the encoder and the start marker for the decoder.
test_data = tf.data.Dataset.from_tensor_slices([['i will try']])
init_test_data = tf.data.Dataset.from_tensor_slices([['starttoken']])

In [None]:
# Convert both test strings to token ids with the layers adapted on the training text.
input_test_data = test_data.map(vectorize_input_layer)
pre_output_test_data = init_test_data.map(vectorize_pre_output_layer)

In [None]:
# Pull the single element out of each test dataset, printing it on the way.
for encoder_ids in input_test_data.take(1):
    print(encoder_ids)
    in_1 = encoder_ids
for decoder_ids in pre_output_test_data.take(1):
    print(decoder_ids)
    in_2 = decoder_ids

In [None]:
def get_output(in_1,in_2):
    '''
    Task: Run the model on one pair of inputs and pick the best token per position
    Input: in_1, in_2 - the two inputs passed to model.predict, in that order
    output: index of the largest value along the last axis of the prediction
    '''
    probabilities = model.predict([in_1, in_2])
    best_ids = tf.argmax(probabilities, -1)
    return best_ids

In [None]:
# Predict token ids for the test sentence.
# NOTE(review): the decoder input holds only the start marker and a single forward
# pass is made, so presumably only the first predicted position is meaningful -- confirm.
output = get_output(in_1, in_2)
print(output)

In [None]:
# Fetch the vocabulary once instead of calling get_vocabulary() on every iteration.
output_vocabulary = vectorize_output_layer.get_vocabulary()
for position in range(SEQUENCE_LENGTH):
    # int() turns the scalar tensor into a plain list index.
    print(output_vocabulary[int(output[0][position])])