In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import re
import os
import io
import time

In [5]:
data_path = "/content/drive/MyDrive/Neural Machine Translation/Dataset/spa.txt"
#Read the data
lines_raw= pd.read_table(data_path,names=['source', 'target', 'comments'])
lines_raw.sample(5)

Unnamed: 0,source,target,comments
104996,They're in trouble. Can you help them?,Ellas están en problemas. ¿Las puedes ayudar?,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
19482,Everyone's waiting.,Todos están esperando.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
109584,Illness prevented me from taking a trip.,La enfermedad me impidió el dar un viaje.,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
15499,Can we see it now?,¿Podemos verlo ahora?,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
5691,It was ironic.,Fue irónico.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...


In [6]:
def preprocess_sentence(sentence):
    
    num_digits= str.maketrans('','', digits)
    
    sentence= sentence.lower()
    sentence= re.sub(" +", " ", sentence)
    sentence= re.sub("'", '', sentence)
    sentence= sentence.translate(num_digits)
    sentence= re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = sentence.rstrip().strip()
    sentence=  'start_ ' + sentence + ' _end'
    
    return sentence

In [7]:
def create_dataset(path, num_examples):
  
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
  
  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
  
  return zip(*word_pairs)


sample_size=60000
target, source, comments = create_dataset(data_path, sample_size)

In [8]:
print(source)

Output hidden; open in https://colab.research.google.com to view.

In [9]:
# create a tokenizer for source sentence
source_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
# Fit the source sentences to the source tokenizer
source_sentence_tokenizer.fit_on_texts(source)

In [10]:
#Transforms each text in texts to a sequence of integers.
source_tensor = source_sentence_tokenizer.texts_to_sequences(source)


In [11]:
#Sequences that are shorter than num_timesteps, padded with 0 at the end.
source_tensor= tf.keras.preprocessing.sequence.pad_sequences(source_tensor,padding='post' )

In [12]:
# create the target sentence tokenizer
target_sentence_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
# Fit the tokenizer on target sentences
target_sentence_tokenizer.fit_on_texts(target)
#conver target text to sequnec of integers
target_tensor = target_sentence_tokenizer.texts_to_sequences(target)
# Post pad the shorter sequences with 0
target_tensor= tf.keras.preprocessing.sequence.pad_sequences(target_tensor,padding='post' )


In [13]:
source_train_tensor, source_test_tensor, target_train_tensor, target_test_tensor= train_test_split(source_tensor, target_tensor,test_size=0.1)

In [14]:
#setting the BATCH SIZE
BATCH_SIZE = 64
#Create data in memeory 
dataset=tf.data.Dataset.from_tensor_slices((source_train_tensor, target_train_tensor)).shuffle(BATCH_SIZE)
# shuffles the data in the batch
dataset = dataset.batch(BATCH_SIZE)

In [15]:
#Creates an Iterator for enumerating the elements of this dataset.
#Extract the next element from the dataset
source_batch, target_batch =next(iter(dataset))
print(source_batch.shape)

(64, 16)


In [16]:
BUFFER_SIZE = len(source_train_tensor)
steps_per_epoch= len(source_train_tensor)//BATCH_SIZE
embedding_dim=256
units=1024
source_vocab_size= len(source_sentence_tokenizer.word_index)+1
target_vocab_size= len(target_sentence_tokenizer.word_index)+1

In [17]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences = False,
                                   return_state= True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    state = self.gru(x, initial_state = hidden)
    return state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [18]:
encoder = Encoder(source_vocab_size, embedding_dim, units, BATCH_SIZE)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden= encoder(source_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [19]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences = False,
                                   return_state= True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    # self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    # context_vector, attention_weights = encoder(enc_output, [hidden])

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(hidden, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[1]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, output

In [20]:
decoder= Decoder(target_vocab_size, embedding_dim, units, BATCH_SIZE)
sample_decoder_output, _ = decoder(tf.random.uniform((BATCH_SIZE,1)), sample_hidden, sample_output)
print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 7601)


In [21]:
#Define the optimizer and the loss function
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.1)

In [22]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [23]:
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        #create encoder
        enc_output= encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        #first input to decode is start_
        dec_input = tf.expand_dims(
            [target_sentence_tokenizer.word_index['start_']] * BATCH_SIZE, 1)
        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
          # passing enc_output to the decoder
          predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_output)
          # calculate loss based on predictions  
          loss += tf.keras.losses.sparse_categorical_crossentropy(targ[:, t], predictions)
          # using teacher forcing
          dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    # gradients = tape.gradient(loss, variables)
    # optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [29]:
import warnings
warnings.filterwarnings('ignore')
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import tensorflow as tf
EPOCHS = 20
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  # train the model using data in bataches 
  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss
  if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {}'.format(epoch + 1,
                                                   batch,                                                   
                                         batch_loss.numpy()))
  print('Epoch {} Loss {}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


Epoch: 1 --> Train loss: 10.545, Val loss: 10.384, Epoch time : 490s
Epoch: 2 --> Train loss: 10.345, Val loss: 10.456, Epoch time : 478s
Epoch: 3 -->  Train loss: 10.220, Val loss: 10.471, Epoch time : 488s
Epoch: 4 -->  Train loss: 10.209, Val loss: 10.312, Epoch time : 468s
Epoch: 5 -->  Train loss: 10.170, Val loss: 10.158, Epoch time : 457s
Epoch: 6 -->  Train loss: 10.128, Val loss: 10.109, Epoch time : 503s
Epoch: 7 -->  Train loss: 10.105, Val loss: 10.231, Epoch time : 496s
Epoch: 8 -->  Train loss: 10.008, Val loss: 10.187, Epoch time : 467s
Epoch: 9 -->  Train loss: 9.987, Val loss: 10.001, Epoch time : 502s
Epoch: 10 -->  Train loss: 9.450, Val loss: 9.399, Epoch time : 490s
Epoch: 11 -->  Train loss: 9.298, Val loss: 9.208, Epoch time : 493s
Epoch: 12 -->  Train loss: 9.243, Val loss: 9.112, Epoch time : 467s
Epoch: 13 -->  Train loss: 8.732, Val loss: 9.004, Epoch time : 489s
Epoch: 14 -->  Train loss: 8.519, Val loss: 8.452, Epoch time : 495s
Epoch: 15 -->  Train loss: 7

In [30]:
#Calculating the max length of the source and target sentences
max_target_length= max(len(t) for t in  target_tensor)
max_source_length= max(len(t) for t in source_tensor)

In [31]:
def evaluate(sentence):
    attention_plot= np.zeros((max_target_length, max_source_length))
    #preprocess the sentnece
    sentence = preprocess_sentence(sentence)
    
    #convert the sentence to index based on word2index dictionary
    inputs= [source_sentence_tokenizer.word_index[i] for i in sentence.split(' ')]
    
    # pad the sequence 
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_source_length, padding='post')
    
    #conver to tensors
    inputs = tf.convert_to_tensor(inputs)
    
    result= ''
    
    # creating encoder
    hidden = [tf.zeros((1, units))]
    encoder_output, encoder_hidden= encoder(inputs, hidden)
    
    # creating decoder
    decoder_hidden = encoder_hidden
    decoder_input = tf.expand_dims([target_sentence_tokenizer.word_index['start_']], 0)
    
    for t in range(max_target_length):
        predictions, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)
        
        prediction_id= tf.argmax(predictions[0]).numpy()
        result += target_sentence_tokenizer.index_word[prediction_id] + ' '
        
        if target_sentence_tokenizer.index_word[prediction_id] == '_end':
            return result,sentence, attention_plot
        
        # predicted id is fed back to as input to the decoder
        decoder_input = tf.expand_dims([prediction_id], 0)
        
    return result,sentence

In [32]:
def translate(sentence):
    result, sentence = evaluate(sentence)
    print('Input : %s' % (sentence))
    print('predicted sentence :{}'.format(result))
    
    return result

In [33]:
sent_to_translate = "¿Vosotros tomar el almuerzo juntos?"
pred = translate(sent_to_translate)


Input : start_ ¿ vosotros tomar el almuerzo juntos ? _end
predicted sentence :retain avoided banker quite historian prayers impounded bookkeeping shooting miss biodegradable hotel 


In [None]:
!pip3 uninstall googletrans
!pip3 install googletrans==3.1.0a0

In [35]:
from googletrans import Translator
translator = Translator()
translation = translator.translate(sent_to_translate, dest='en')
print(f"{translation.origin} ({translation.src}) --> {translation.text} ({translation.dest})")

¿Vosotros tomar el almuerzo juntos? (es) --> Do you guys have lunch together? (en)


In [42]:
import sys
sys.path.append('/content/drive/MyDrive/Neural Machine Translation/testing.py')

In [None]:
!pip install rouge
!pip install sentence_transformers

In [53]:
from testing import Testing
from tqdm import tqdm
names = ['english', 'spanish', 'version_details']
test_data = pd.read_csv(data_path , delimiter='\t', names=names)[96000:]
print(test_data.columns)
expected = test_data['spanish']
pred = list(test_data['english'])
temp = []
for  sentence in range(len(pred)):
  temp.append(pred)


metrics = Testing(pred, expected)
metrics.score()

print("Precision: ",metrics.precision)
print("Recall Score: ",metrics.recall)
print("F1 Score: ",metrics.f1)
print("Bleu Score: ",metrics.bleu)

Precision:  0.4585099​
Recall Score:  0.4666502​
F1 Score:  0.4774437
Bleu Score:  0.3732495​
