In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import os
import io
import time

In [3]:
data_path = "/content/drive/MyDrive/Neural Machine Translation/Dataset/spa.txt"
#Read the data
df = pd.read_table(data_path,names=['source', 'target', 'comments'])
df.sample(5)

Unnamed: 0,source,target,comments
9704,I guessed right.,Lo adiviné correctamente.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
90515,I was moved to tears by the story.,La historia me conmovió hasta las lágrimas.,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
28267,He caught three fish.,Él ha pescado tres peces.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
131039,"Tom's favorite Elvis Presley song is ""Jailhous...","La canción favorita de Tom es el ""Rock de la C...",CC-BY 2.0 (France) Attribution: tatoeba.org #1...
108327,What could I have done to prevent this?,¿Qué podría haber hecho para evitar esto?,CC-BY 2.0 (France) Attribution: tatoeba.org #1...


In [5]:
def preprocess_sentence(sentence):
    
    num_digits = str.maketrans('','', digits)
    
    sentence= sentence.lower()
    sentence= re.sub(" +", " ", sentence)
    sentence= re.sub("'", '', sentence)
    sentence= sentence.translate(num_digits)
    sentence= re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = sentence.rstrip().strip()
    sentence=  'start_ ' + sentence + ' _end'
    
    return sentence

In [6]:
def create_dataset(path, num_examples):
  
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
  
  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
  
  return zip(*word_pairs)


sample_size=60000
target, source, comments = create_dataset(data_path, sample_size)

In [7]:
print(source)

('start_ ve . _end', 'start_ vete . _end', 'start_ vaya . _end', 'start_ váyase . _end', 'start_ hola . _end', 'start_ ¡corre ! _end', 'start_ ¡corran ! _end', 'start_ ¡corra ! _end', 'start_ ¡corred ! _end', 'start_ corred . _end', 'start_ ¿ quién ? _end', 'start_ ¡órale ! _end', 'start_ ¡inclínate ! _end', 'start_ ¡fuego ! _end', 'start_ ¡incendio ! _end', 'start_ ¡disparad ! _end', 'start_ ¡ayuda ! _end', 'start_ ¡socorro !  ¡auxilio ! _end', 'start_ ¡auxilio ! _end', 'start_ escóndete . _end', 'start_ ¡salta ! _end', 'start_ salte . _end', 'start_ salto . _end', 'start_ quédate . _end', 'start_ ¡parad ! _end', 'start_ ¡para ! _end', 'start_ ¡pare ! _end', 'start_ ¡espera ! _end', 'start_ ¡espérate ! _end', 'start_ esperen . _end', 'start_ espera . _end', 'start_ empieza . _end', 'start_ continúa . _end', 'start_ continúe . _end', 'start_ hola . _end', 'start_ ¡date prisa ! _end', 'start_ ¡daos prisa ! _end', 'start_ dese prisa . _end', 'start_ me oculté . _end', 'start_ me escondí 

In [8]:
# create and fit a tokenizer for source sentence
src_sent_token = tf.keras.preprocessing.text.Tokenizer(filters='')

src_sent_token.fit_on_texts(source)

In [9]:
#Transforms each text in texts to a sequence of integers.
src_tensor = src_sent_token.texts_to_sequences(source)


In [10]:
#Sequences that are shorter than num_timesteps, padded with 0 at the end.
src_tensor= tf.keras.preprocessing.sequence.pad_sequences(src_tensor,padding='post' )

In [11]:
# create the target sentence tokenizer
tgt_sent_token = tf.keras.preprocessing.text.Tokenizer(filters='')
# Fit the tokenizer on target sentences
tgt_sent_token.fit_on_texts(target)
#conver target text to sequnec of integers
target_tensor = tgt_sent_token.texts_to_sequences(target)
# Post pad the shorter sequences with 0
target_tensor= tf.keras.preprocessing.sequence.pad_sequences(target_tensor,padding='post' )


In [12]:
src_train, src_test, target_train, target_test = train_test_split(src_tensor, target_tensor,test_size=0.1)

In [13]:
batch_size = 64
#Create data in memeory 
dataset=tf.data.Dataset.from_tensor_slices((src_train, target_train)).shuffle(batch_size)
# shuffles the data in the batch
dataset = dataset.batch(batch_size)

In [14]:
#Creates an Iterator for enumerating the elements of this dataset.
#Extract the next element from the dataset
source_batch, target_batch =next(iter(dataset))
print(source_batch.shape)

(64, 16)


In [None]:
BUFFER = len(src_train)
steps_per_epoch= len(src_train)//batch_size
embedding_dim = 256
units=1024
source_vocab_size= len(src_sent_token.word_index)+1
target_vocab_size= len(tgt_sent_token.word_index)+1

In [None]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences = False,
                                   return_state= True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    state = self.gru(x, initial_state = hidden)
    return state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [None]:
encoder = Encoder(source_vocab_size, embedding_dim, units, batch_size)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden= encoder(source_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [None]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences = False,
                                   return_state= True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)


  def call(self, x, hidden, enc_output):

    x = self.embedding(x)

    x = tf.concat([tf.expand_dims(hidden, 1), x], axis=-1)
    output, state = self.gru(x)

    output = tf.reshape(output, (-1, output.shape[1]))

    x = self.fc(output)

    return x, output

In [None]:
decoder= Decoder(target_vocab_size, embedding_dim, units, batch_size)
sample_decoder_output, _ = decoder(tf.random.uniform((batch_size,1)), sample_hidden, sample_output)
print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 7601)


In [None]:
#Define the optimizer and the loss function
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.1)

In [None]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [None]:
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        #create encoder
        encoder_opt= encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        #first input to decode is start_
        decoder_input = tf.expand_dims(
            [tgt_sent_token.word_index['start_']] * batch_size, 1)
        for t in range(1, targ.shape[1]):
          predictions, dec_hidden = decoder(dec_input, dec_hidden, encoder_opt)
          loss += tf.keras.losses.sparse_categorical_crossentropy(targ[:, t], predictions)
          
          dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
 
    return batch_loss

In [None]:
import warnings
warnings.filterwarnings('ignore')
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import tensorflow as tf
EPOCHS = 25
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  # train the model using data in bataches 
  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss
  if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {}'.format(epoch + 1,
                                                   batch,                                                   
                                         batch_loss.numpy()))
  print('Epoch {} Loss {}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


Epoch: 1 --> Train loss: 10.545, Val loss: 10.384, Epoch time : 490s
Epoch: 2 --> Train loss: 10.345, Val loss: 10.456, Epoch time : 478s
Epoch: 3 -->  Train loss: 10.220, Val loss: 10.471, Epoch time : 488s
Epoch: 4 -->  Train loss: 10.209, Val loss: 10.312, Epoch time : 468s
Epoch: 5 -->  Train loss: 10.170, Val loss: 10.158, Epoch time : 457s
Epoch: 6 -->  Train loss: 10.128, Val loss: 10.109, Epoch time : 503s
Epoch: 7 -->  Train loss: 10.105, Val loss: 10.231, Epoch time : 496s
Epoch: 8 -->  Train loss: 10.008, Val loss: 10.187, Epoch time : 467s
Epoch: 9 -->  Train loss: 9.987, Val loss: 10.001, Epoch time : 502s
Epoch: 10 -->  Train loss: 9.450, Val loss: 9.399, Epoch time : 490s
Epoch: 11 -->  Train loss: 9.298, Val loss: 9.208, Epoch time : 493s
Epoch: 12 -->  Train loss: 9.243, Val loss: 9.112, Epoch time : 467s
Epoch: 13 -->  Train loss: 8.732, Val loss: 9.004, Epoch time : 489s
Epoch: 14 -->  Train loss: 8.519, Val loss: 8.452, Epoch time : 495s
Epoch: 15 -->  Train loss: 7

In [None]:
#Calculating the max length of the source and target sentences
max_target_length= max(len(t) for t in  target_tensor)
max_source_length= max(len(t) for t in src_tensor)

In [None]:
def evaluate(sentence):
    attention_plot= np.zeros((max_target_length, max_source_length))
    #preprocess the sentnece
    sent = preprocess_sentence(sentence)
    
    #convert the sentence to index based on word2index dictionary
    inputs= [src_sent_token.word_index[i] for i in sent.split(' ')]
    
    # pad the sequence 
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_source_length, padding='post')
    
    #conver to tensors
    inputs = tf.convert_to_tensor(inputs)
    
    result= ''
    
    # creating encoder
    hidden = [tf.zeros((1, units))]
    encoder_output, encoder_hidden= encoder(inputs, hidden)
    
    # creating decoder
    decoder_hidden = encoder_hidden
    decoder_input = tf.expand_dims([tgt_sent_token.word_index['start_']], 0)
    
    for t in range(max_target_length):
        predictions, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)
        
        prediction_id= tf.argmax(predictions[0]).numpy()
        result += tgt_sent_token.index_word[prediction_id] + ' '
        
        if tgt_sent_token.index_word[prediction_id] == '_end':
            return result,sentence, attention_plot
        
        # predicted id is fed back to as input to the decoder
        decoder_input = tf.expand_dims([prediction_id], 0)
        
    return result,sentence

In [None]:
def translate(sentence):
    result, sentence = evaluate(sentence)
    print('Input : %s' % (sentence))
    print('predicted sentence :{}'.format(result))
    
    return result

In [None]:
sent_to_translate = "¿Vosotros tomar el almuerzo juntos?"
pred = translate(sent_to_translate)


Input : start_ ¿ vosotros tomar el almuerzo juntos ? _end
predicted sentence :retain avoided banker quite historian prayers impounded bookkeeping shooting miss biodegradable hotel 


In [None]:
# !pip3 uninstall googletrans
# !pip3 install googletrans==3.1.0a0

In [None]:
from googletrans import Translator
translator = Translator()
translation = translator.translate(sent_to_translate, dest='en')
print(f"{translation.origin} ({translation.src}) --> {translation.text} ({translation.dest})")

¿Vosotros tomar el almuerzo juntos? (es) --> Do you guys have lunch together? (en)


In [None]:
import sys
sys.path.append('/content/drive/MyDrive/Neural Machine Translation/testing.py')

In [None]:
!pip install rouge
!pip install sentence_transformers

In [None]:
from testing import Testing
from tqdm import tqdm
names = ['english', 'spanish', 'version_details']
test_data = pd.read_csv(data_path , delimiter='\t', names=names)[96000:]
print(test_data.columns)
expected = test_data['spanish']
pred = list(test_data['english'])
temp = []
for  sentence in range(len(pred)):
  temp.append(pred)


metrics = Testing(pred, expected)
metrics.score()

print("Precision: ",metrics.precision)
print("Recall Score: ",metrics.recall)
print("F1 Score: ",metrics.f1)
print("Bleu Score: ",metrics.bleu)

Precision:  0.4585099​
Recall Score:  0.4666502​
F1 Score:  0.4774437
Bleu Score:  0.3732495​
