In [None]:
!pip install tensorflow==2.6.0

In [None]:
import pathlib
import random 
import string 
import re 
import numpy as np 
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras import layers 

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
from transformer import *

In [None]:
# Download the English–Spanish sentence-pair archive and extract it next to the
# downloaded zip file.
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    # HTTPS instead of plain HTTP so the archive cannot be altered in transit.
    origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
# Re-point `text_file` from the zip to the pairs file inside the extracted
# "spa-eng" folder.
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [None]:
# Read the corpus: one "english<TAB>spanish" pair per line.
# The encoding is explicit because the Spanish side contains non-ASCII
# characters and the platform default encoding is not guaranteed to be UTF-8.
with open(text_file, encoding="utf-8") as f:
  # Skip empty lines (e.g. the one produced by a trailing newline) instead of
  # blindly dropping the last element, which would lose a real pair when the
  # file does not end with a newline.
  lines = [line for line in f.read().split("\n") if line]

text_pairs = []
for line in lines:
  eng, spa = line.split('\t')
  # Wrap every target sentence in sentinel tokens marking where it begins and ends.
  spa = "[start] " + spa + " [end]"
  text_pairs.append((eng, spa))

In [None]:
# Peek at four randomly drawn (english, spanish) pairs.
for _ in range(4):
  sample_pair = random.choice(text_pairs)
  print(sample_pair)

In [None]:
# Shuffle once, then split 70% train / 15% validation / 15% test.
# Seed immediately before shuffling so a fresh kernel always produces the same
# split: the vocabulary is adapted on the training split, so saved weights are
# only usable with the same split. The notebook's other `random.seed` call runs
# in a later cell, too late to affect this shuffle.
random.seed(10)
random.shuffle(text_pairs)

num_val_samples = int(0.15 * len(text_pairs))
# Validation and test get `num_val_samples` each; training takes the rest.
num_train_samples = len(text_pairs) - 2 * num_val_samples

train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(len(train_pairs))
print(len(val_pairs))
print(len(test_pairs))

In [None]:
# Display one training example: an (english, "[start] spanish [end]") tuple.
train_pairs[1]

In [None]:
# Display the ASCII punctuation characters; the next cell derives `strip_chars` from them.
string.punctuation

In [None]:
# Vectorizing the text data
#
# Characters deleted from Spanish text during standardization: ASCII
# punctuation plus the inverted question mark. "¿" must be part of this set
# because the dataset pipeline vectorizes the raw sentences; if it is only
# removed by hand before `adapt`, words such as "¿qué" never match the
# vocabulary and are mapped to the out-of-vocabulary token.
strip_chars = string.punctuation + "¿"
# Keep the square brackets so the "[start]" / "[end]" markers survive.
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000       # maximum vocabulary size per language
sequence_length = 20     # tokens per (padded / truncated) sentence
batch_size = 64

def custom_standardization(input_string):
  """Lower-case `input_string` and delete every character found in `strip_chars`.

  Built from TensorFlow string ops; returns the result of
  `tf.strings.regex_replace`.
  """
  # Character class matching any single character to strip, regex-escaped.
  strip_pattern = "[%s]" % re.escape(strip_chars)
  lowered = tf.strings.lower(input_string)
  return tf.strings.regex_replace(lowered, strip_pattern, "")

# English side: integer token ids, padded/truncated to `sequence_length`,
# using the layer's default standardization.
eng_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Spanish side: one extra position, because `format_dataset` slices the
# sequence into decoder inputs ([:-1]) and targets ([1:]).
spa_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)


In [None]:
# Seed Python's global `random` generator and print one draw from it.
# NOTE(review): this cell runs after the cell that calls
# `random.shuffle(text_pairs)`, so this call does not control that shuffle.
random.seed(10)
print(random.random())

In [None]:
# Split the training pairs into parallel lists; inverted question marks are
# removed from the Spanish sentences before the vocabulary is built.
train_eng_texts = [eng_text for eng_text, _ in train_pairs]
train_spa_texts = [spa_text.replace("¿", '') for _, spa_text in train_pairs]

# Build each vocabulary from the training split only.
eng_vectorization.adapt(train_eng_texts)
spa_vectorization.adapt(train_spa_texts)

In [None]:
def format_dataset(eng, spa):
    """Vectorize a batch of sentence pairs into model inputs and targets.

    Returns a `(inputs, targets)` tuple: `inputs` is a dict keyed by
    "encoder_inputs" / "decoder_inputs", and `targets` is the Spanish
    sequence shifted one position to the left of the decoder inputs.
    """
    eng_ids = eng_vectorization(eng)
    spa_ids = spa_vectorization(spa)
    model_inputs = {
        "encoder_inputs": eng_ids,
        # Everything except the last position feeds the decoder ...
        "decoder_inputs": spa_ids[:, :-1],
    }
    # ... and everything except the first position is what it must predict.
    targets = spa_ids[:, 1:]
    return (model_inputs, targets)


def make_dataset(pairs, batch_size=64):
    """Build a batched, vectorized `tf.data.Dataset` from sentence pairs.

    Args:
        pairs: non-empty sequence of `(english, spanish)` string tuples.
        batch_size: number of pairs per batch.

    Returns:
        A dataset yielding `(inputs, targets)` as produced by `format_dataset`.
    """
    eng_texts, spa_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(spa_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    # Cache the vectorized batches first and shuffle afterwards. Caching after
    # the shuffle would store the first epoch's order and replay exactly that
    # order on every later epoch.
    return dataset.cache().shuffle(2048).prefetch(16)


In [None]:
# Build the training and validation pipelines with the default batch size.
train_ds, val_ds = map(make_dataset, (train_pairs, val_pairs))

In [None]:
# Show only the first few pairs; displaying the whole training list floods the
# cell output and bloats the saved notebook.
train_pairs[:5]

In [None]:
# Assemble the sequence-to-sequence model: an encoder sub-model, a decoder
# sub-model, and an end-to-end `transformer` model that chains them.
# NOTE(review): PositionalEmbedding, TransformerEncoder and TransformerDecoder
# are not defined in this notebook; presumably they come from
# `from transformer import *`. Their signatures/defaults are not visible here.
embed_dim = 256

# Encoder: variable-length int64 token ids -> positional embedding -> encoder block.
# The input name matches the "encoder_inputs" key produced by `format_dataset`.
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

# Decoder: takes the target token ids plus the encoded source sequence
# (last dimension `embed_dim`) and outputs a softmax over `vocab_size` tokens
# for each position.
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

# End-to-end model: feed the encoder's output into the decoder sub-model.
# `decoder_outputs` is rebound here to the output of that chained call.
decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [None]:
# Print the layer-by-layer summary of the end-to-end model.
transformer.summary()

In [None]:
# Optimizer: RMSprop (selected by its string alias). It scales each update by a
# running average of recent squared gradients, which keeps the step size
# balanced and helps against exploding or vanishing updates.
# Loss: sparse categorical cross-entropy, because the targets produced by
# `format_dataset` are integer token ids and the model ends in a softmax layer.
transformer.compile(
    optimizer ='rmsprop',loss = 'sparse_categorical_crossentropy',metrics = ['accuracy']
)

In [None]:
# Train for a single epoch, validating on `val_ds` at the end of the epoch.
# NOTE(review): the original notes here read "Perplexity and Entropy" and
# "BLEU Scores" — presumably reminders of evaluation metrics still to be added;
# only accuracy is tracked at the moment.
transformer.fit(train_ds,epochs = 1,validation_data = val_ds)

In [None]:
# Model serialization: write only the model's weights to an .h5 file in the
# current working directory.
# NOTE(review): the file name says "de_eng" although the data is
# English-Spanish. The later load cell reads this file name from
# drive/MyDrive/Weights_new/, so the file presumably has to be copied there
# by hand — confirm.
transformer.save_weights("Translate_de_eng_weights.h5")

In [None]:
# Second model object that will receive the saved weights.
# NOTE(review): it is built from the very same input/output tensors as
# `transformer`, so it should share its layers (and weights) with that model
# rather than being an independent copy — loading weights into one presumably
# changes the other as well. Confirm this is intended.
loaded_model = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [None]:
# Colab-only: mount Google Drive at /content/drive so the saved weights and
# pickled training texts can be read from it. The relative "drive/..." paths
# used below assume the working directory is /content — TODO confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# List the weights folder on the mounted drive.
# NOTE(review): this lists "Weights", but the cells below load from
# "Weights_new" — confirm which folder is the current one.
!ls drive/MyDrive/Weights

In [None]:
# Restore the trained weights from the mounted drive into `loaded_model`.
loaded_model.load_weights('drive/MyDrive/Weights_new/Translate_de_eng_weights.h5')

In [None]:
import pickle

# Restore the training sentences that were saved alongside the weights, so the
# vectorizers can be re-adapted on them in the next cell.
# SECURITY NOTE: unpickling can execute arbitrary code — only load files that
# you created yourself.
# `with` blocks make sure both file handles are closed after reading.
with open('drive/MyDrive/Weights_new/train_eng_texts.sav', 'rb') as f:
    train_eng_texts = pickle.load(f)
with open('drive/MyDrive/Weights_new/train_spa_texts.sav', 'rb') as f:
    train_spa_texts = pickle.load(f)

In [None]:
# Rebuild both vocabularies from the restored training sentences, again with
# inverted question marks removed from the Spanish side first.
train_spa_texts = [sentence.replace("¿", '') for sentence in train_spa_texts]

eng_vectorization.adapt(train_eng_texts)
spa_vectorization.adapt(train_spa_texts)

In [None]:
# Map each token id (position in the Spanish vocabulary) back to its token string.
spa_vocab = spa_vectorization.get_vocabulary()
spa_index_lookup = dict(enumerate(spa_vocab))

# Upper bound on the number of tokens generated per translation.
max_decoded_sentence_length = 20

In [None]:
def decode_sequence(input_sentence, transformer=loaded_model):
    """Translate one English sentence by greedy, token-by-token decoding.

    Args:
        input_sentence: the English source sentence (a plain string).
        transformer: model called with `[source_tokens, target_tokens]`;
            defaults to `loaded_model` as bound when this function is defined.

    Returns:
        The decoded string, beginning with "[start]". Decoding stops once
        "[end]" has been produced or after `max_decoded_sentence_length` steps.
    """
    source_tokens = eng_vectorization([input_sentence])
    output_tokens = ["[start]"]
    for step in range(max_decoded_sentence_length):
        # Re-vectorize everything generated so far; the last position is cut
        # off, as is done for the decoder inputs in `format_dataset`.
        partial_sentence = " ".join(output_tokens)
        target_tokens = spa_vectorization([partial_sentence])[:, :-1]
        predictions = transformer([source_tokens, target_tokens])

        # Greedy choice: the highest-scoring token at the current position.
        next_token_id = np.argmax(predictions[0, step, :])
        next_token = spa_index_lookup[next_token_id]
        output_tokens.append(next_token)

        if next_token == "[end]":
            break
    return " ".join(output_tokens)

In [None]:
# Smoke test: translate one English sentence with the restored model and print
# the decoded string (including the "[start]" marker).
test_eng_text = 'My hotel told me to call you.'
translated = decode_sequence(test_eng_text)
print(translated)

In [None]:
# Pasted cell output, kept for reference only. As bare text it is not valid
# Python and would stop "Run All" with a SyntaxError.
# NOTE(review): this looks like the vectorized form of the test sentence
# (7 token ids followed by zero padding up to length 20) — confirm.
# [[ 19 518 125  18   4 226   5   0   0   0   0   0   0   0   0   0   0   0
#     0   0]]

In [None]:
# Display the model object's repr (leftover inspection cell).
loaded_model