In [None]:
# Necessary
import numpy as np
import pandas as pd
import string,re
import math
import random
from collections import Counter
from tqdm.auto import tqdm

# Tokenization
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import DataLoader, Dataset


# Evaluation
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from nltk.translate.meteor_score import meteor_score

# Building the model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Attention

In [None]:
# Load the three splits. Only the first 100,000 training rows are used, so let
# the CSV reader stop there instead of parsing the whole file and slicing it.
train = pd.read_csv('Data/train.csv', nrows=100000)
valid = pd.read_csv('Data/valid.csv')
test = pd.read_csv('Data/test.csv')

In [None]:
# Preview the first ten rows of the training split
train.head(10)

In [None]:
# Preview the first ten rows of the validation split
valid.head(10)

In [None]:
# Preview the first ten rows of the test split
test.head(10)

In [None]:
# Column dtypes, non-null counts and memory usage of the training split
train.info()

In [None]:
# Column dtypes, non-null counts and memory usage of the test split
test.info()

In [None]:
# Column dtypes, non-null counts and memory usage of the validation split
valid.info()

## `Data Cleaning`

### `Missing values`

In [None]:
# Report the number of missing values per column for every split
for heading, frame in (
    ("Training Data: ", train),
    ("\nValidation Data: ", valid),
    ("\nTest Data: ", test),
):
    print(heading)
    print(frame.isna().sum())


### `Duplicated values`

In [None]:
# Report how many fully duplicated rows each split contains
for heading, frame in (
    ("Training Data: ", train),
    ("\nValidation Data: ", valid),
    ("\nTest Data: ", test),
):
    print(heading, frame.duplicated().sum())

In [None]:
def SentenceCleaning(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with normalised 'en' and 'vi' text columns.

    For both columns, in this order:
      1. remove every character listed in ``string.punctuation`` (ASCII only),
      2. lower-case the text to reduce the vocabulary size,
      3. strip leading and trailing whitespace,
      4. collapse every run of whitespace into a single space.

    The input frame is left untouched, so re-running the calling cell (or
    inspecting the raw frame afterwards) is safe.

    Args:
        dataframe: frame holding string columns 'en' and 'vi'.

    Returns:
        A new DataFrame with the same index and columns, 'en'/'vi' cleaned.
    """
    # Build the deletion table once instead of once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)

    cleaned = dataframe.copy()
    for column in ('en', 'vi'):
        cleaned[column] = (
            cleaned[column]
            .str.translate(punctuation_table)      # remove punctuation
            .str.lower()                           # reduce vocab size
            .str.strip()                           # clear spaces at both ends
            .str.replace(r'\s+', ' ', regex=True)  # multiple spaces -> single space
        )

    return cleaned

In [None]:
# Run the same text normalisation over the training, validation and test splits
train, valid, test = [SentenceCleaning(frame) for frame in (train, valid, test)]

In [None]:
def sentence_filter(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows whose 'en' and 'vi' texts both use the allowed characters.

    A text is accepted when it consists solely of ASCII letters, characters in
    the accented range À-ỹ, digits and whitespace. An empty string is accepted
    as well, because the pattern allows zero characters.

    Args:
        dataframe: frame with string columns 'en' and 'vi'.

    Returns:
        The selected rows of ``dataframe``; the original index is kept.
    """
    allowed_chars = re.compile(r'^[A-Za-zÀ-ỹà-ỹ0-9\s]*$')

    def is_valid_language_sentence(sentence) -> bool:
        return allowed_chars.match(sentence) is not None

    en_ok = dataframe['en'].apply(is_valid_language_sentence)
    vi_ok = dataframe['vi'].apply(is_valid_language_sentence)
    return dataframe[en_ok & vi_ok]

In [None]:
def _print_split_sizes():
    """Print the current row count of the train, validation and test frames."""
    print(f'Train: {train.shape[0]}')
    print(f'valid: {valid.shape[0]}')
    print(f'test: {test.shape[0]}')

print("Before filtering: ")
_print_split_sizes()

# Drop sentence pairs that contain characters outside the accepted alphabet
train, valid, test = [sentence_filter(frame) for frame in (train, valid, test)]

print("\nAfter filtering: ")
_print_split_sizes()

In [None]:
# Tokenize sentences
# An explicit out-of-vocabulary token maps words that were not seen while
# fitting to a placeholder id; without it texts_to_sequences silently drops
# them, which shortens validation/test sentences.
vi_tokenizer = Tokenizer(oov_token='<unk>')
en_tokenizer = Tokenizer(oov_token='<unk>')

# The vocabularies are built from the training split only.
vi_tokenizer.fit_on_texts(train['vi'])
en_tokenizer.fit_on_texts(train['en'])

vi_sequences = vi_tokenizer.texts_to_sequences(train['vi'])
en_sequences = en_tokenizer.texts_to_sequences(train['en'])

# NOTE(review): no start/end-of-sentence markers are added to the English text,
# although train_model feeds tgt[:, :-1] and predicts tgt[:, 1:]; as written the
# first target word is never predicted. Confirm whether that is intended.

In [None]:
# Pad sequences
def _encode_and_pad(tokenizer, texts, maxlen):
    """Turn texts into id sequences and pad them at the end up to ``maxlen``."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen, padding='post')

# The padded width is taken from the longest training sentence of each language
max_vi_len = max(map(len, vi_sequences))
max_en_len = max(map(len, en_sequences))

vi_sequences = pad_sequences(vi_sequences, maxlen=max_vi_len, padding='post')
en_sequences = pad_sequences(en_sequences, maxlen=max_en_len, padding='post')

# Validation and test reuse the training tokenizers and the training widths
vi_valid_sequences = _encode_and_pad(vi_tokenizer, valid['vi'], max_vi_len)
en_valid_sequences = _encode_and_pad(en_tokenizer, valid['en'], max_en_len)

vi_test_sequences = _encode_and_pad(vi_tokenizer, test['vi'], max_vi_len)
en_test_sequences = _encode_and_pad(en_tokenizer, test['en'], max_en_len)

In [None]:
class TranslationDataset(Dataset):
    """Index-aligned pairs of Vietnamese (source) and English (target) id sequences.

    Args:
        vi_sequences: indexable collection of source token-id sequences.
        en_sequences: indexable collection of target token-id sequences; must
            contain exactly as many items as ``vi_sequences``.

    Raises:
        ValueError: if the two collections differ in length. Without this
            check a mismatch would only surface as an IndexError part-way
            through an epoch, or silently ignore trailing targets.
    """

    def __init__(self, vi_sequences, en_sequences):
        if len(vi_sequences) != len(en_sequences):
            raise ValueError(
                f'source and target must have the same number of sequences, '
                f'got {len(vi_sequences)} and {len(en_sequences)}'
            )
        self.vi_sequences = vi_sequences
        self.en_sequences = en_sequences

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.vi_sequences)

    def __getitem__(self, idx):
        """Return the ``idx``-th (source, target) pair as int64 tensors."""
        source = torch.tensor(self.vi_sequences[idx], dtype=torch.long)
        target = torch.tensor(self.en_sequences[idx], dtype=torch.long)
        return source, target

In [None]:
# Use a subset of data
train_dataset = TranslationDataset(vi_sequences[:10000], en_sequences[:10000])  # Use a subset of 10,000 samples
valid_dataset = TranslationDataset(vi_valid_sequences[:2000], en_valid_sequences[:2000])
test_dataset = TranslationDataset(vi_test_sequences[:2000], en_test_sequences[:2000])

# Page-locked host memory only speeds up host-to-GPU copies; requesting it
# without CUDA makes the DataLoader emit a warning, so enable it only when a
# GPU is available.
use_pinned_memory = torch.cuda.is_available()

# Only the training loader shuffles; validation and test keep the file order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, pin_memory=use_pinned_memory)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False, pin_memory=use_pinned_memory)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, pin_memory=use_pinned_memory)

In [None]:
# Display the DataLoader object to confirm it was created
train_loader

# `Train models`

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define constants
embedding_dim = 256  # width of the token embeddings used by the encoder and both decoders
hidden_size = 512  # width of the LSTM hidden and cell states
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Encoder
class Encoder(nn.Module):
    """Embeds source token ids and encodes them with a single batch-first LSTM.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        embedding_dim: size of each embedding vector.
        hidden_size: size of the LSTM hidden and cell states.
    """

    def __init__(self, input_dim, embedding_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)

    def forward(self, src):
        """Return ``(outputs, hidden, cell)``: per-position outputs and the final LSTM state."""
        outputs, final_state = self.lstm(self.embedding(src))
        hidden, cell = final_state
        return outputs, hidden, cell

# Decoder without attention
class DecoderNoAttention(nn.Module):
    """LSTM decoder that relies only on the state handed over by the encoder.

    Args:
        output_dim: target vocabulary size (embedding rows and output logits).
        embedding_dim: size of each target embedding vector.
        hidden_size: size of the LSTM hidden and cell states.
    """

    def __init__(self, output_dim, embedding_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_dim)

    def forward(self, tgt, hidden, cell):
        """Decode the whole ``tgt`` sequence in one LSTM call.

        Returns:
            ``(predictions, hidden, cell)`` where predictions holds one
            vocabulary-sized score vector per target position.
        """
        initial_state = (hidden, cell)
        outputs, (hidden, cell) = self.lstm(self.embedding(tgt), initial_state)
        return self.fc(outputs), hidden, cell

# Decoder with Bahdanau Attention
class BahdanauAttention(nn.Module):
    """Additive attention: scores one query state against every encoder output.

    Args:
        hidden_size: size of both the query state and the encoder outputs.
    """

    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs):
        """Return ``(context, attn_weights)`` for the last layer of ``hidden``.

        ``attn_weights`` is a softmax over the source positions of
        ``encoder_outputs``; ``context`` is the correspondingly weighted sum of
        the encoder outputs.
        """
        hidden = hidden[-1].unsqueeze(1)  # Take the last layer of hidden states
        # Pair the query with every source position before scoring.
        scores = torch.tanh(self.attn(torch.cat((hidden.expand_as(encoder_outputs), encoder_outputs), dim=2)))
        scores = torch.sum(self.v * scores, dim=2)
        attn_weights = torch.softmax(scores, dim=1)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context, attn_weights

class DecoderWithAttention(nn.Module):
    """LSTM decoder whose inputs and outputs are augmented with an attention context.

    Args:
        output_dim: target vocabulary size (embedding rows and output logits).
        embedding_dim: size of each target embedding vector.
        hidden_size: size of the LSTM state and of the encoder outputs.

    NOTE(review): the context vector is computed once, from the hidden state
    passed in, and the same vector is reused at every target position. A
    step-wise Bahdanau decoder would recompute it from the decoder state at
    each step; confirm whether the single shared context is intended.
    """

    def __init__(self, output_dim, embedding_dim, hidden_size):
        super(DecoderWithAttention, self).__init__()
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim + hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, output_dim)
        self.attention = BahdanauAttention(hidden_size)

    def forward(self, tgt, hidden, cell, encoder_outputs):
        """Decode ``tgt`` in one LSTM call using a single attention context.

        Returns:
            ``(predictions, hidden, cell, attn_weights)``; ``attn_weights`` has
            one weight per source position for each batch element.
        """
        embedded = self.embedding(tgt)
        context, attn_weights = self.attention(hidden, encoder_outputs)

        # Repeat the context along the target-length axis only. The previous
        # ``expand_as(embedded)`` also tried to resize the feature axis and
        # therefore raised whenever embedding_dim != hidden_size.
        context_per_step = context.unsqueeze(1).expand(-1, embedded.size(1), -1)

        lstm_input = torch.cat((embedded, context_per_step), dim=2)
        outputs, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        predictions = self.fc(torch.cat((outputs, context_per_step), dim=2))
        return predictions, hidden, cell, attn_weights

# Training loop
def train_model(encoder, decoder, dataloader, criterion, encoder_optimizer, decoder_optimizer, num_epochs=20):
    """Jointly train an encoder/decoder pair with teacher forcing.

    Each batch is encoded, the decoder receives the target without its last
    position and is scored against the target without its first position.
    The mean batch loss is printed after every epoch.

    Args:
        encoder: module returning ``(outputs, hidden, cell)`` for a source batch.
        decoder: a ``DecoderNoAttention`` instance, or a decoder that also
            takes the encoder outputs and returns four values.
        dataloader: iterable of ``(src, tgt)`` batches.
        criterion: loss applied to ``(batch, vocab, length)`` scores and targets.
        encoder_optimizer: optimizer stepping the encoder parameters.
        decoder_optimizer: optimizer stepping the decoder parameters.
        num_epochs: number of passes over ``dataloader``.
    """
    encoder.train()
    decoder.train()

    # The decoder type does not change during training, so decide once.
    plain_decoder = isinstance(decoder, DecoderNoAttention)
    optimizers = (encoder_optimizer, decoder_optimizer)

    for epoch in range(num_epochs):
        running_loss = 0
        for src, tgt in dataloader:
            src = src.to(device)
            tgt = tgt.to(device)

            # Clear gradients left over from the previous batch
            for optimizer in optimizers:
                optimizer.zero_grad()

            # Forward pass
            encoder_outputs, hidden, cell = encoder(src)
            decoder_input, decoder_target = tgt[:, :-1], tgt[:, 1:]

            if plain_decoder:
                predictions, _, _ = decoder(decoder_input, hidden, cell)
            else:
                predictions, _, _, _ = decoder(decoder_input, hidden, cell, encoder_outputs)

            # The criterion expects the class axis in second position
            loss = criterion(predictions.permute(0, 2, 1), decoder_target)

            # Backward pass, then update encoder before decoder
            loss.backward()
            for optimizer in optimizers:
                optimizer.step()

            running_loss += loss.item()
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(dataloader):.4f}')

# Create models
# One extra slot on top of the tokenizer vocabulary so that id 0 has an embedding row too
input_dim = 1 + len(vi_tokenizer.word_index)
output_dim = 1 + len(en_tokenizer.word_index)

# NOTE(review): this single encoder (and its optimizer) is passed to both
# train_model calls below, so both decoders share and keep updating it.
encoder = Encoder(input_dim, embedding_dim, hidden_size).to(device)
decoder_no_attention = DecoderNoAttention(output_dim, embedding_dim, hidden_size).to(device)
decoder_with_attention = DecoderWithAttention(output_dim, embedding_dim, hidden_size).to(device)

# Loss and Optimizer
def _adam_for(module):
    """Create an Adam optimizer over ``module``'s parameters."""
    return optim.Adam(module.parameters(), lr=0.001)

# Target positions holding id 0 do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=0)
encoder_optimizer = _adam_for(encoder)
decoder_optimizer_no_attention = _adam_for(decoder_no_attention)
decoder_optimizer_with_attention = _adam_for(decoder_with_attention)

In [None]:
# Train the encoder together with the decoder that does not use attention
train_model(encoder, decoder_no_attention, train_loader, criterion, encoder_optimizer, decoder_optimizer_no_attention, num_epochs=20)

In [None]:
# Train models with attention
# NOTE(review): `encoder` and `encoder_optimizer` are the same objects that were
# passed to the no-attention training call, so this run keeps updating that
# encoder. The later evaluation of decoder_no_attention therefore uses the
# encoder as left by this run — confirm this is intended.
train_model(encoder, decoder_with_attention, train_loader, criterion, encoder_optimizer, decoder_optimizer_with_attention, num_epochs=20)

In [None]:
def evaluate_model(encoder, decoder, dataloader, criterion):
    """Compute loss, BLEU, ROUGE (F1) and METEOR for an encoder/decoder pair.

    The decoder is run with teacher forcing (it receives the gold target
    shifted by one position), so the text metrics describe next-token
    predictions rather than free-running translations.

    Metrics are computed on token ids rendered as strings. Positions whose
    reference id is 0 (padding) are removed from both reference and
    prediction; otherwise the padding tail would dominate every score.
    Sentences whose reference is empty after that step are skipped for the
    text metrics.

    Args:
        encoder: module returning ``(outputs, hidden, cell)`` for a source batch.
        decoder: a ``DecoderNoAttention`` instance, or a decoder that also
            takes the encoder outputs and returns four values.
        dataloader: iterable of ``(src, tgt)`` batches.
        criterion: loss applied to ``(batch, vocab, length)`` scores and targets.

    Returns:
        ``(avg_loss, avg_bleu, avg_rouge, avg_meteor)`` where ``avg_rouge`` is
        a dict with keys 'rouge-1', 'rouge-2' and 'rouge-l'.
    """
    pad_id = 0  # padding id; the notebook's criterion ignores the same value

    encoder.eval()
    decoder.eval()
    epoch_loss = 0
    bleu_scores = []
    rouge = Rouge()
    rouge_scores = {'rouge-1': [], 'rouge-2': [], 'rouge-l': []}
    meteor_scores = []
    smoothing_function = SmoothingFunction().method4

    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)

            encoder_outputs, hidden, cell = encoder(src)
            decoder_input = tgt[:, :-1]
            decoder_target = tgt[:, 1:]

            if isinstance(decoder, DecoderNoAttention):
                predictions, _, _ = decoder(decoder_input, hidden, cell)
            else:
                predictions, _, _, _ = decoder(decoder_input, hidden, cell, encoder_outputs)

            # The criterion expects the class axis in second position
            predictions = predictions.permute(0, 2, 1)
            loss = criterion(predictions, decoder_target)
            epoch_loss += loss.item()

            # Most likely token per position, computed once for the whole batch
            predicted_ids = predictions.argmax(dim=1)
            not_padding = decoder_target != pad_id

            # Calculate BLEU, ROUGE, and METEOR scores
            for i in range(predicted_ids.size(0)):
                keep = not_padding[i]
                reference = [str(token) for token in decoder_target[i][keep].tolist()]
                candidate = [str(token) for token in predicted_ids[i][keep].tolist()]
                if not reference:
                    # Nothing to score; Rouge also rejects empty inputs.
                    continue

                bleu_scores.append(sentence_bleu([reference], candidate, smoothing_function=smoothing_function))
                rouge_score = rouge.get_scores(' '.join(candidate), ' '.join(reference))[0]
                rouge_scores['rouge-1'].append(rouge_score['rouge-1']['f'])
                rouge_scores['rouge-2'].append(rouge_score['rouge-2']['f'])
                rouge_scores['rouge-l'].append(rouge_score['rouge-l']['f'])
                # Current NLTK releases require pre-tokenized references and hypothesis.
                meteor_scores.append(meteor_score([reference], candidate))

    avg_loss = epoch_loss / len(dataloader)
    avg_bleu = np.mean(bleu_scores)
    avg_rouge = {k: np.mean(v) for k, v in rouge_scores.items()}
    avg_meteor = np.mean(meteor_scores)

    return avg_loss, avg_bleu, avg_rouge, avg_meteor

# Evaluate models without attention
metrics_no_attention = evaluate_model(encoder, decoder_no_attention, test_loader, criterion)
loss_no_attention, bleu_no_attention, rouge_no_attention, meteor_no_attention = metrics_no_attention

# Evaluate models with attention
metrics_with_attention = evaluate_model(encoder, decoder_with_attention, test_loader, criterion)
loss_with_attention, bleu_with_attention, rouge_with_attention, meteor_with_attention = metrics_with_attention

# Collect the report first, then emit it in one go (one metric per line)
report_lines = [
    f'Loss without attention: {loss_no_attention}',
    f'BLEU score without attention: {bleu_no_attention}',
    f'ROUGE-1 score without attention: {rouge_no_attention["rouge-1"]}',
    f'ROUGE-2 score without attention: {rouge_no_attention["rouge-2"]}',
    f'ROUGE-L score without attention: {rouge_no_attention["rouge-l"]}',
    f'METEOR score without attention: {meteor_no_attention}',
    f'Loss with attention: {loss_with_attention}',
    f'BLEU score with attention: {bleu_with_attention}',
    f'ROUGE-1 score with attention: {rouge_with_attention["rouge-1"]}',
    f'ROUGE-2 score with attention: {rouge_with_attention["rouge-2"]}',
    f'ROUGE-L score with attention: {rouge_with_attention["rouge-l"]}',
    f'METEOR score with attention: {meteor_with_attention}',
]
print('\n'.join(report_lines))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_attention(attention_weights, input_sentence, output_sentence):
    """Show attention weights as a heatmap.

    Args:
        attention_weights: 2-D array-like of weights to draw.
        input_sentence: tick labels for the rows (y axis).
        output_sentence: tick labels for the columns (x axis).
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(
        attention_weights,
        xticklabels=output_sentence,
        yticklabels=input_sentence,
        cmap='viridis',
        ax=ax,
    )
    ax.set_xlabel('Output Sentence')
    ax.set_ylabel('Input Sentence')
    plt.show()

# Example sentence
example_src, example_tgt = next(iter(test_loader))
example_src, example_tgt = example_src[0].unsqueeze(0).to(device), example_tgt[0].unsqueeze(0).to(device)

# Get attention weights for Bahdanau attention model
encoder.eval()
decoder_with_attention.eval()
with torch.no_grad():
    encoder_outputs, hidden, cell = encoder(example_src)
    decoder_input = example_tgt[:, :-1]
    predictions, _, _, attention_weights = decoder_with_attention(decoder_input, hidden, cell, encoder_outputs)

# Plot attention
input_sentence = [vi_tokenizer.index_word[idx.item()] for idx in example_src[0] if idx.item() != 0]
output_sentence = [en_tokenizer.index_word[idx.item()] for idx in example_tgt[0] if idx.item() != 0]

# DecoderWithAttention returns a single softmax over the (padded) source
# positions per example, i.e. attention_weights[0] is 1-D. The heatmap needs a
# rows=input x columns=output matrix whose size matches the tick labels, so:
#   * keep only the leading positions that hold real words (padding is appended
#     at the end of each sequence, and the labels above skip it);
#   * repeat that column once per output word. All columns are identical
#     because the decoder uses the same context at every target position.
# The kept weights need not sum to 1: padded positions also receive attention.
source_weights = attention_weights[0, :len(input_sentence)].cpu().numpy()
attention_matrix = np.tile(source_weights[:, np.newaxis], (1, len(output_sentence)))
plot_attention(attention_matrix, input_sentence, output_sentence)

# Save models