In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Import dependencies
import pandas as pd
from string import punctuation
import re
import numpy as np
import tensorflow as tf
import os
import time
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

In [2]:
# Set up paths
DATA_FILEPATH = "/content/drive/MyDrive/Language_Translation/data.txt"

In [3]:
# Download attention_mechanism.py
!wget https://raw.githubusercontent.com/GargPriyanshu1112/Neural-Machine-Translation/main/attention_mechanism.py

from attention_mechanism import *

--2022-09-05 13:37:35--  https://raw.githubusercontent.com/GargPriyanshu1112/Neural-Machine-Translation/main/attention_mechanism.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2644 (2.6K) [text/plain]
Saving to: ‘attention_mechanism.py’


2022-09-05 13:37:35 (57.2 MB/s) - ‘attention_mechanism.py’ saved [2644/2644]



# Load in the Data

In [4]:
def load_doc(filepath, mode):
    file = open(filepath, mode)
    content = file.read()
    file.close()

    return content

In [5]:
def get_pairs(filepath):
    inputs, targets = [], []

    # Get file contents
    content = load_doc(filepath, 'r')
    
    # Get sentence pairs
    for line in content.split('\n'):
        if len(line) < 1:
            continue

        inp, tar, _ = line.split('\t')
        inputs.append(inp)
        targets.append(tar)


    data = pd.DataFrame({'inputs': inputs,
                         'targets': targets})        
    
    return data

In [6]:
# Load in the data
data = get_pairs(DATA_FILEPATH).sample(70000)
data.head()

Unnamed: 0,inputs,targets
5833,Love is crazy.,El amor es una locura.
135111,"If you ever want to get better, you have to ta...","Si quieres mejorarte algún día, tienes que tom..."
85682,I am very grateful for your help.,Estoy muy agradecido por tu ayuda.
121460,I asked Tom if I could talk to him in private.,Le pregunté a Tom si podía hablarle en privado.
107187,"Just this once, I'll make an exception.",Haré una excepción solo por esta vez.


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70000 entries, 5833 to 41854
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   inputs   70000 non-null  object
 1   targets  70000 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB


In [16]:
# Hyperparameters
max_len = 30
enc_units = 1024
dec_units = 1024
embedding_dim = 256

# Preprocess the Data

In [9]:
def preprocess_text(text):
    text = text.lower()
    text= re.sub("'", '', text)
    text = re.sub(r"\d", '', text)
    text = re.sub(r"([?.!,¿])", r" \1 ", text)    
    text = re.sub("\s+", ' ',  text).strip()
    text=  'startseq ' + text + ' endseq'

    return text

In [10]:
# Preprocess inputs (English sentences)
data["inputs"] = data["inputs"].apply(lambda x: preprocess_text(x))

# Preprocess targets (Spanish sentences)
data["targets"] = data["targets"].apply(lambda x: preprocess_text(x))

# Train-Test Split

In [11]:
train_inp, test_inp, train_tar, test_tar = train_test_split(data.inputs, data.targets,
                                                            test_size=0.2,
                                                            random_state=42)

print(f"Number of training samples : {len(train_inp)}")
print(f"Number of test samples     : {len(test_inp)}")

Number of training samples : 56000
Number of test samples     : 14000


# Tokenize and Pad the Sentences



In [12]:
class LanguageIndex():
    def __init__(self, sentences_list):
        self.tokenizer = None
        self.word2idx  = None

        self.get_tokenizer(sentences_list)
        self.get_word_to_idx_mapping()


    def get_tokenizer(self, sentences_list):
        self.tokenizer = Tokenizer(filters='', oov_token="<UNK>")
        self.tokenizer.fit_on_texts(sentences_list)

    def get_word_to_idx_mapping(self):
        self.word2idx = self.tokenizer.word_index


    def get_max_length(self, sentences):
        return max([len(line.split())  for line in sentences])

In [13]:
input_lang  = LanguageIndex([line  for line in data.inputs])
target_lang = LanguageIndex([line  for line in data.targets])

In [14]:
print(f"Length of the longest input (English) sentence : {input_lang.get_max_length(data.inputs)}")
print(f"Length of the longest target (Spanish) sentence : {target_lang.get_max_length(data.targets)}")

Length of the longest input (English) sentence : 53
Length of the longest target (Spanish) sentence : 59


In [15]:
# Tokenize input sentences
input_seqs_int = input_lang.tokenizer.texts_to_sequences(train_inp)
# Pad input sentences
input_seqs_int = pad_sequences(input_seqs_int, maxlen=max_len,  padding="post")  


# Tokenize target sentences
target_seqs_int = target_lang.tokenizer.texts_to_sequences(train_tar) 
# Pad target sentences
target_seqs_int = pad_sequences(target_seqs_int, maxlen=max_len, padding="post")

In [17]:
input_vocab_size = len(input_lang.word2idx) + 1
target_vocab_size = len(target_lang.word2idx) + 1

print(f"Vocab size for inputs (English sentences)  : {input_vocab_size}")
print(f"Vocab size for targets (Spanish sentences) : {target_vocab_size}")

Vocab size for inputs (English sentences)  : 11301
Vocab size for targets (Spanish sentences) : 21076


# Create Training Dataset

In [18]:
BATCH_SIZE = 64

In [19]:
def get_training_dataset(X_train, y_train, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

    return dataset

In [20]:
# Get train dataset
train_dataset = get_training_dataset(input_seqs_int, target_seqs_int, batch_size=BATCH_SIZE)

# Define Optimizer and Loss Function

In [21]:
# Optimizer
optimizer = tf.keras.optimizers.Adam()


# Loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction="none")

def loss_func(actual_words, predicted_words_probability):
	loss = loss_object(actual_words, predicted_words_probability)
	mask = tf.where(actual_words > 0, 1.0, 0.0)
	return tf.reduce_sum(mask * loss)

# Get Model Encoder, Decoder and Attention Layer

In [22]:
# Get the encoder
encoder = Encoder(input_vocab_size, embedding_dim, enc_units, BATCH_SIZE)

# Get the attention layer
attention_layer = BahdanauAttention(10)

# Get the decoder
decoder = Decoder(target_vocab_size, embedding_dim, dec_units, BATCH_SIZE)

# Establish Checkpoints

In [23]:
checkpoint_dir = "training_checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# Train the Model

In [24]:
EPOCHS = 30

In [25]:
@tf.function
def train_step(input, target, enc_hidden_state):
    loss = 0

    with tf.GradientTape() as tape:
        enc_hidden_states, enc_output_state = encoder(input, enc_hidden_state)
        # The last consolidated encoder hidden state is fed as input to the first
        # time step of the decoder.
        dec_hidden_state = enc_output_state

        # Set decoder input to be: 'startseq'  
        # which is the ground truth for 0th time step
        dec_input = tf.expand_dims([target_lang.word2idx['startseq']] * BATCH_SIZE, axis=1)

        # Loop through each timestep
        for t in range(1, target.shape[1]): 
            # dec_input : ground truth from the previous timestep
            # dec_hidden_state : hidden state of the decoder from previous timestep
            predictions, dec_hidden_state, _ = decoder(dec_input, enc_hidden_states, dec_hidden_state)

            # Calcuate loss for current time step and add it to get loss for all time steps
            loss += loss_func(target[:, t], predictions) 

            # Set the ground truth as the next input of the decoder
            dec_input = tf.expand_dims(target[:, t], axis=1)


    # Calculate batch loss
    batch_loss = loss / int(target.shape[1])

    # Get the trainable variables
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    
    # Derive gradients
    gradients = tape.gradient(loss, trainable_variables)
    
    # Apply gradients
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return batch_loss


In [26]:
for epoch in range(EPOCHS):
    start_time = time.time()

    # Initialize encoder hidden state
    enc_hidden_state = encoder.initialize_hidden_state()

    loss_values = [] # Will store loss value for each batch in an epoch

    for batch, (input, target) in enumerate(train_dataset):
        batch_loss = train_step(input, target, enc_hidden_state)
        loss_values.append(batch_loss)


    # Save checkpoints every 10 epochs
    if (epoch == EPOCHS-1):
        checkpoint.save(file_prefix=checkpoint_prefix)

    print(f"Epoch {epoch+1},  Loss {np.mean(loss_values):.4f}")
    print(f"Time taken for epoch {epoch+1} : {(time.time() - start_time):.1f} sec\n")

Epoch 1,  Loss 85.8733
Time taken for epoch 1 : 326.3 sec

Epoch 2,  Loss 59.1122
Time taken for epoch 2 : 284.9 sec

Epoch 3,  Loss 41.9043
Time taken for epoch 3 : 285.2 sec

Epoch 4,  Loss 30.7302
Time taken for epoch 4 : 285.0 sec

Epoch 5,  Loss 23.2487
Time taken for epoch 5 : 285.4 sec

Epoch 6,  Loss 17.9024
Time taken for epoch 6 : 285.0 sec

Epoch 7,  Loss 14.4215
Time taken for epoch 7 : 285.0 sec

Epoch 8,  Loss 11.8645
Time taken for epoch 8 : 284.9 sec

Epoch 9,  Loss 9.6212
Time taken for epoch 9 : 285.2 sec

Epoch 10,  Loss 8.3704
Time taken for epoch 10 : 285.2 sec

Epoch 11,  Loss 6.9312
Time taken for epoch 11 : 284.7 sec

Epoch 12,  Loss 5.4692
Time taken for epoch 12 : 284.6 sec

Epoch 13,  Loss 4.8627
Time taken for epoch 13 : 284.7 sec

Epoch 14,  Loss 3.9157
Time taken for epoch 14 : 284.8 sec

Epoch 15,  Loss 3.5573
Time taken for epoch 15 : 284.8 sec

Epoch 16,  Loss 2.9082
Time taken for epoch 16 : 284.7 sec

Epoch 17,  Loss 2.6923
Time taken for epoch 17 : 2

# Inference

In [27]:
def translate_text(input_text):

    # Preprocess the text
    text = preprocess_text(input_text)
    # Tokenize and pad the text
    text = [input_lang.word2idx[word]  for word in text.split()]
    text = pad_sequences([text], maxlen=max_len, padding="post")
    text = tf.convert_to_tensor(text)

    enc_hidden_state = tf.zeros((1, enc_units))
    enc_hidden_states, enc_output_state = encoder(text, enc_hidden_state)
    
    dec_hidden_state = enc_output_state
    dec_input = tf.expand_dims([target_lang.word2idx['startseq']], axis=1)

    output_text = ''
    for t in range(1, max_len):
        predictions, dec_hidden_state, attention_weights = decoder(dec_input,
                                                                   enc_hidden_states,
                                                                   dec_hidden_state)

        predicted_id = tf.argmax(predictions[0]).numpy()
        output_text += target_lang.tokenizer.index_word[predicted_id] + ' '

        if target_lang.tokenizer.index_word[predicted_id] == 'endseq':
            return output_text
        else:
            # Feed the predicted ID back into the model
            dec_input = tf.expand_dims([predicted_id], 0)        

    return output_text

In [88]:
for i in range(5):
    rand_idx = random.choice(test_inp.index)
    print(f"English Text        : {' '.join(test_inp[rand_idx].split()[1:-1])}\n")
    print(f"Machine Translation : {' '.join(translate_text(test_inp[rand_idx]).split()[:-1])}")
    print(f"Correct Translation : {' '.join(test_tar[rand_idx].split()[1:-1])}")
    print("\n-------------------------------------------------------------------\n\n")

English Text        : i dont know what i want .

Machine Translation : no sé qué me quiero .
Correct Translation : no sé lo que quiero .

-------------------------------------------------------------------


English Text        : im your boyfriend , arent i ?

Machine Translation : ¿ soy tu novio , ¿ no ?
Correct Translation : soy tu novio , ¿ o no ?

-------------------------------------------------------------------


English Text        : tell me the truth .

Machine Translation : decídmelo a la verdad .
Correct Translation : decidme la verdad .

-------------------------------------------------------------------


English Text        : what makes you think that isnt true ?

Machine Translation : ¿ qué te hace pensar que eso no es verdad ?
Correct Translation : ¿ qué te hace pensar que eso no es cierto ?

-------------------------------------------------------------------


English Text        : i can only tell you what i know .

Machine Translation : puedo saber decirte lo que sé .