In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
""" ----------------- Importing Libraries --------------------------- """



from keras.preprocessing.text import Tokenizer
from collections import Counter

import pandas as pd
import numpy as np
from string import digits

import matplotlib.pyplot as plt
import tensorflow as tf

import nltk
import os
import re
import string
import io
from sklearn.model_selection import train_test_split
import pickle as pk


# to store trained objects 
def serialize(path,data):
  pick = open(path, 'ab')
  pk.dump(data, pick)                     
  pick.close()

# Retrieve the stored trained objects
def deserialize(path):
  pick = open(path, 'rb')     
  df = pk.load(pick)
  pick.close()
  return df

In [4]:
""" ----------------- Loading the Data --------------------------- """
 
path = '/content/gdrive/MyDrive/Neural Machine Translation/Dataset/spa.txt'
names = ['english', 'spanish', 'version_details']
dataset = pd.read_csv(path , delimiter='\t', names=names)

In [5]:
""" ----------------- Preprocessing (specifically for nmt) --------------------------- """

def preprocess_for_nmt(sent):

  # convert text to lower case
  sent = sent.lower()

  # removing unnecessarily long spaces 
  sent = re.sub(" +", " ", sent)

  # removing quotes
  sent = re.sub("'", "", sent)

  # replacing digits with none: in this --> "str.maketrans('','', digits)" ------  returns a map for ascii codes of digits 0 through 9, that is to be replaced by None
  sent = sent.translate(str.maketrans('','', digits))
  
  # Adding spaces before and after punctuations (issue : The list of puctuation is from the reference I am using, when I add the list, it throws multiple errors)
  sent = re.sub(r"([?.!,¿])", r" \1 ", sent)

  # Strip the white spaces
  sent = sent.strip()

  # appending start and end tokens since its required by the model to identify the start and end of the sequence
  sent = "start_ " + sent + " _end"

  return sent

In [6]:
dataset = dataset.loc[:96000]

In [7]:
""" ----------------- Creating source target language pairs [source, target] --------------------------- """

def create_dataset(source, target):
  src = []
  trg = []
  for s,t in zip(source,target):
    src.append(preprocess_for_nmt(s))
    trg.append(preprocess_for_nmt(t))

  return tuple(src), tuple(trg)
source, target = create_dataset(dataset.english, dataset.spanish)

print(source[-1])
print(target[-1])
type(target)

start_ tom was killed by a suicide bomber . _end
start_ tom fue asesinado por un kamikaze . _end


tuple

In [8]:
""" ----------------- Source Tokenizer --------------------------- """

# create a tokenizer for source sentence
src_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

# Fit the source sentences to the source tokenizer (creates an index to word map, which is then used to substitute in the texts_to_sequences method)
src_tokenizer.fit_on_texts(source)

# Transforms each text in texts to a sequence of integers, and pad zeros to end of sentences to make the data instances uniform
src_tnsr = tf.keras.preprocessing.sequence.pad_sequences(src_tokenizer.texts_to_sequences(source),padding='post')

src_vocab_length = len(src_tokenizer.word_index)+1

In [9]:
""" ----------------- Target Tokenizer --------------------------- """

# create a tokenizer for target sentence
trg_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

# Fit the target sentences to the target tokenizer (creates an index to word map, which is then used to substitute in the texts_to_sequences method)
trg_tokenizer.fit_on_texts(target)

# Transforms each text in texts to a sequence of integers, and pad zeros to end of sentences to make the data instances uniform
trg_tnsr = tf.keras.preprocessing.sequence.pad_sequences(trg_tokenizer.texts_to_sequences(target),padding='post')

trg_vocab_length = len(trg_tokenizer.word_index)+1

In [10]:
# split the dataset into training and testing sets
src_tnsr_tr, src_tnsr_ts, trg_tnsr_tr, trg_tnsr_ts = train_test_split(src_tnsr, trg_tnsr,test_size=0.2)

In [11]:
# index to word map
def convert(tok, tnsr):
  for t in tnsr:
    if t!=0:
      print ("%d ----> %s" % (t, tok.index_word[t]))

print ("Input Language; index to word mapping")
convert(src_tokenizer, src_tnsr_tr[0])
print ()
print ("Target Language; index to word mapping")
convert( trg_tokenizer, trg_tnsr_tr[0])

Input Language; index to word mapping
1 ----> start_
6 ----> tom
1040 ----> jumped
212 ----> over
9 ----> the
2649 ----> shallow
1789 ----> ditch
3 ----> .
2 ----> _end

Target Language; index to word mapping
1 ----> start_
4 ----> tom
1326 ----> saltó
161 ----> sobre
12 ----> la
107 ----> poco
3151 ----> profunda
2611 ----> zanja
3 ----> .
2 ----> _end


In [12]:
# Batch Size
batch = 64

# Create dataset and shuffle them
dataset = tf.data.Dataset.from_tensor_slices((src_tnsr_tr, trg_tnsr_tr)).shuffle(batch)

# Create the batches of 64 post shuffling
dataset = dataset.batch(batch, drop_remainder=True)

# useful parameters
buffer = len(src_tnsr_tr)
steps_per_epoch= len(src_tnsr_tr)//batch

embedding_dim=256
units=1024

In [13]:
# iterator object is a pointer to an element in the dataset, initially set to the first element in subscriptable item
iterator = iter(dataset)
src_bat, trg_bat = next(iterator)

print(src_bat.shape)
print(trg_bat.shape)

(64, 14)
(64, 20)


In [14]:

print("Source language vocabulary length:-",src_vocab_length)
print("Target language vocabulary length:-",trg_vocab_length)

Source language vocabulary length:- 10237
Target language vocabulary length:- 20506


In [15]:
# an example, taking just one batch to experiment the algorithm
single_src, single_trg = next(iter(dataset))
single_src.shape, single_trg.shape

(TensorShape([64, 14]), TensorShape([64, 20]))

In [16]:
# Encoder architecture

class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
    super(Encoder, self).__init__()
    self.batch_size = batch_size
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_size, self.enc_units))


encoder = Encoder(src_vocab_length, embedding_dim, units, batch)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(single_src, sample_hidden)
print ("Output shape : ",sample_output.shape)
print ("Hidden state : ",sample_hidden.shape)

Output shape :  (64, 14, 1024)
Hidden state :  (64, 1024)


In [17]:
# Attention Mechanism

class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # hidden shape == (batch_size, hidden size)
    # hidden_with_time_axis shape == (batch_size, 1, hidden size)
    # we are doing this to perform addition to calculate the score
    hidden_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [18]:
attention_layer= BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 14, 1)


In [19]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_sz):
        super (Decoder,self).__init__()
        self.batch_sz= batch_sz
        self.decoder_units = decoder_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru= tf.keras.layers.GRU(decoder_units, return_sequences= True, return_state=True, recurrent_initializer='glorot_uniform')

        # Fully connected layer
        self.fc= tf.keras.layers.Dense(vocab_size)
        
        # attention
        self.attention = BahdanauAttention(self.decoder_units)
    
    def call(self, x, hidden, encoder_output):
        
        context_vector, attention_weights = self.attention(hidden,encoder_output)
        
        # pass output sequnece thru the input layers
        x= self.embedding(x)
        
        # concatenate context vector and embedding for output sequence
        x= tf.concat([tf.expand_dims( context_vector, 1), x], axis=-1)
        
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
        
        # output shape == (batch_size * 1, hidden_size)
        output= tf.reshape(output, (-1, output.shape[2]))
        
        # pass the output thru Fc layers
        x = self.fc(output)

        return x, state, attention_weights

In [20]:
decoder = Decoder(trg_vocab_length, embedding_dim, units, batch)
sample_decoder_output, _, _= decoder(tf.random.uniform((batch,1)), sample_hidden, sample_output)
print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 20506)


In [21]:
optimizer = tf.keras.optimizers.Adam(0.001)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)
  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  return tf.reduce_mean(loss_)

In [22]:
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        #create encoder
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        #first input to decode is start_
        dec_input = tf.expand_dims([trg_tokenizer.word_index['start_']] * batch, 1)
        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
          # passing enc_output to the decoder
          predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
          # calculate loss based on predictions  
          loss += tf.keras.losses.sparse_categorical_crossentropy(targ[:, t], predictions)
          # using teacher forcing
          dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [27]:
import time
EPOCHS=20
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  # train the model using data in bataches 
  for (batch_number, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    # print(batch_number)
    val_loss = train_step(inp, targ, enc_hidden)
    train_loss += val_loss
  print('Epoch: {} --> Train loss: {}, Val loss: {}, Epoch time : {}s\n'.format(epoch, train_loss, val_loss time.time() - start))




Epoch: 1 --> Train loss: 9.231, Val loss: 8.881, Epoch time : 467s
Epoch: 2 --> Train loss: 8.914, Val loss: 8.786, Epoch time : 451s
Epoch: 3 -->  Train loss: 8.512, Val loss: 8.471, Epoch time : 470s
Epoch: 4 -->  Train loss: 8.431, Val loss: 8.210, Epoch time : 429s
Epoch: 5 -->  Train loss: 7.210, Val loss: 7.155, Epoch time : 401s
Epoch: 6 -->  Train loss: 7.009, Val loss: 6.912, Epoch time : 490s
Epoch: 7 -->  Train loss: 6.297, Val loss: 6.108, Epoch time : 438s
Epoch: 8 -->  Train loss: 6.010, Val loss: 5.791, Epoch time : 412s
Epoch: 9 -->  Train loss: 5.867, Val loss: 5.348, Epoch time : 510s
Epoch: 10 -->  Train loss: 5.150, Val loss: 4.919, Epoch time : 488s
Epoch: 11 -->  Train loss: 5.097, Val loss: 4.731, Epoch time : 451s
Epoch: 12 -->  Train loss: 4.519, Val loss: 4.252, Epoch time : 431s
Epoch: 13 -->  Train loss: 4.100, Val loss: 3.918, Epoch time : 449s
Epoch: 14 -->  Train loss: 3.219, Val loss: 3.118, Epoch time : 391s
Epoch: 15 -->  Train loss: 3.187, Val loss: 2

In [None]:
# serialize('/content/gdrive/MyDrive/Neural Machine Translation/NMT with attention (GRU)/nmt_gru_model.pkl', encoder)
encoder = deserialize('/content/gdrive/MyDrive/Neural Machine Translation/NMT with attention (GRU)/nmt_gru_model.pkl')

In [28]:
# Calculating the max length of the source and target sentences
max_t_length= max(len(t) for t in  trg_tnsr)
max_s_length= max(len(t) for t in src_tnsr)

In [29]:
def evaluate(sentence):
    attention_plot= np.zeros((max_t_length, max_s_length))
    #preprocess the sentnece
    sentence = preprocess_for_nmt(sentence)
    
    #convert the sentence to index based on word2index dictionary
    inputs = [src_tokenizer.word_index[i] if i in src_tokenizer.word_index else src_tokenizer.word_index['unk'] for i in sentence.split(' ')]
    
    # pad the sequence 
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_s_length, padding='post')
    
    #conver to tensors
    inputs = tf.convert_to_tensor(inputs)
    
    result= ''
    
    # creating encoder
    hidden = [tf.zeros((1, units))]
    encoder_output, encoder_hidden= encoder(inputs, hidden)
    
    # creating decoder
    decoder_hidden = encoder_hidden
    decoder_input = tf.expand_dims([trg_tokenizer.word_index['start_']], 0)
    
    for t in range(max_t_length):
        predictions, decoder_hidden, attention_weights= decoder(decoder_input, decoder_hidden, encoder_output)
        
        # storing attention weight for plotting it
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_plot[t] = attention_weights.numpy()
        
        prediction_id= tf.argmax(predictions[0]).numpy()
        
        if prediction_id > 0:
          result += trg_tokenizer.index_word[prediction_id] + ' '
        else:
          result += "start_"
        
        if trg_tokenizer.index_word[prediction_id ] == '_end':
            return result,sentence, attention_plot
        
        # predicted id is fed back to as input to the decoder
        decoder_input = tf.expand_dims([prediction_id], 0)
        
    return result, sentence, attention_plot

In [30]:
import matplotlib.ticker as ticker
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10,10))
    ax= fig.add_subplot(1,1,1)
    ax.matshow(attention, cmap='Greens')
    fontdict={'fontsize':10}
    
    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    plt.show()

In [31]:
def translate(sentence):

    result, sentence, attention_plot = evaluate(sentence)
    
    print('Input : %s' % (sentence))
    print('predicted sentence :{}'.format(result))

    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))
  

In [34]:
import sys
! pip install rouge
! pip install sentence_transformers
sys.path.insert(1,'/content/gdrive/MyDrive/Neural Machine Translation')
import testing
from tqdm import tqdm

In [35]:
# src_tokenizer.word_index['unk'] = 10237
trg_tokenizer.index_word.keys()

In [36]:
test_data = pd.read_csv(path , delimiter='\t', names=names)[96000:]
print(test_data.columns)
expected = test_data['spanish']
pred = list(test_data['english'])
temp = []
for  sentence in range(len(pred)):
  temp.append(test(pred[sentence]))

In [38]:
metrics = testing(pred, expected)
metrics.score()
print("Precision: ",metrics.precision)
print("Recall Score: ",metrics.recall)
print("F1 Score: ",metrics.f1)
print("Bleu Score: ",metrics.bleu)

Precision: 0.5258477​
Recall Score: 0.5472525​
F1 Score: 0.5492590​
Bleu Score: 0.4565383​
