<a href="https://colab.research.google.com/github/Jaseelkt007/ML/blob/master/Seq2Seq_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sequence to Sequence Machine Translation using LSTMs
### Based on the paper "Sequence to Sequence Learning with Neural Networks" (Sutskever et al., 2014)

In [12]:
# Mount Google Drive at /content/drive; every dataset path used later lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
# Decompress the Multi30k task1 files.
# Done in Python with existence guards so the cell can be re-run safely:
# `gunzip` deletes each archive after extracting it, so running the shell
# commands a second time fails with "No such file or directory".
import gzip
import shutil
from pathlib import Path

raw_dir = Path('/content/drive/MyDrive/multi30k_data/multi30k-dataset/data/task1/raw')

for file_name in ('train.de', 'train.en',
                  'val.de', 'val.en',
                  'test_2016_flickr.de', 'test_2016_flickr.en'):
    extracted = raw_dir / file_name
    archive = raw_dir / (file_name + '.gz')

    if extracted.exists():
        continue  # already decompressed by an earlier run
    if not archive.exists():
        print(f'Missing both {extracted} and {archive}')
        continue

    with gzip.open(archive, 'rb') as packed, open(extracted, 'wb') as unpacked:
        shutil.copyfileobj(packed, unpacked)
    archive.unlink()  # mirror gunzip, which removes the archive once extracted

In [20]:
# Locations of the decompressed Multi30k task1 files on the mounted Drive.
# All six files sit in the same folder, so each path is the folder plus a file name.
_RAW_DIR = '/content/drive/MyDrive/multi30k_data/multi30k-dataset/data/task1/raw'

train_de_path, train_en_path = _RAW_DIR + '/train.de', _RAW_DIR + '/train.en'

val_de_path, val_en_path = _RAW_DIR + '/val.de', _RAW_DIR + '/val.en'

test_de_path = _RAW_DIR + '/test_2016_flickr.de'
test_en_path = _RAW_DIR + '/test_2016_flickr.en'

# Function to load data from a file
def load_data(file_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    The trailing newline of every line is removed. ``readlines()`` keeps the
    terminator, so it would otherwise be handed to the tokenizer as part of
    each sentence. Blank lines are kept (as empty strings) so that line ``i``
    of two parallel files still refers to the same sentence pair.

    Args:
        file_path: path of the text file to read.

    Returns:
        list[str]: one entry per line of the file, without the line terminator.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Text mode already normalises '\r\n' to '\n', so only '\n' needs stripping.
        return [line.rstrip('\n') for line in f]

# Load the training, validation, and test datasets
# Read each split as a (German, English) pair of line lists
train_ger, train_eng = load_data(train_de_path), load_data(train_en_path)

val_ger, val_eng = load_data(val_de_path), load_data(val_en_path)

test_ger, test_eng = load_data(test_de_path), load_data(test_en_path)

In [None]:
# Install the Python packages and download the German and English spaCy pipelines
# that are loaded with spacy.load() further down.
# NOTE(review): versions are unpinned, and torchtext is not imported in any of the
# visible cells; confirm it is still needed.
!pip install torch torchtext spacy
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

In [None]:
import spacy
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter

# One spaCy pipeline per language; only their tokenizers are used below
spacy_de, spacy_en = spacy.load('de_core_news_sm'), spacy.load('en_core_web_sm')

def tokenize_ger(text):
    """Split ``text`` with the German spaCy tokenizer and return the lowercased token strings."""
    lowered = []
    for token in spacy_de.tokenizer(text):
        lowered.append(token.text.lower())
    return lowered

def tokenize_eng(text):
    """Split ``text`` with the English spaCy tokenizer and return the lowercased token strings."""
    lowered = []
    for token in spacy_en.tokenizer(text):
        lowered.append(token.text.lower())
    return lowered

# Reserved vocabulary symbols: the start/end markers wrapped around every sentence,
# a padding symbol, and the fallback used for words missing from the vocabulary
INIT_TOKEN, EOS_TOKEN, PAD_TOKEN, UNK_TOKEN = '<sos>', '<eos>', '<pad>', '<unk>'

# Build vocabulary from tokenized sentences
def build_vocab(sentences, tokenizer, min_freq=2, max_size=10000):
    """Build a token -> index mapping from a corpus.

    Indices 0-3 are reserved for INIT_TOKEN, EOS_TOKEN, PAD_TOKEN and UNK_TOKEN.
    Corpus tokens are ranked by descending frequency (ties broken
    alphabetically) and numbered from 4 upwards.

    Args:
        sentences: iterable of raw sentences.
        tokenizer: callable turning one sentence into an iterable of tokens.
        min_freq: a token is kept only if it occurs at least this many times.
        max_size: maximum number of corpus tokens kept (special tokens excluded).

    Returns:
        dict[str, int]: vocabulary whose values are exactly ``0 .. len(vocab) - 1``.
    """
    counter = Counter()
    for sentence in sentences:
        counter.update(tokenizer(sentence))

    # Reserve the special symbols first so their indices can never be taken or shifted.
    vocab = {INIT_TOKEN: 0, EOS_TOKEN: 1, PAD_TOKEN: 2, UNK_TOKEN: 3}

    ranked = sorted(counter.items(), key=lambda item: (-item[1], item[0]))

    # Skip corpus tokens that spell a reserved symbol. Numbering such a token and
    # then overwriting it would leave a hole in the index range, so the largest
    # index could reach len(vocab) and overflow an embedding of that size.
    kept = [word for word, count in ranked
            if count >= min_freq and word not in vocab][:max_size]

    for word in kept:
        vocab[word] = len(vocab)
    return vocab

# One vocabulary per language, each built from its training sentences
german_vocab = build_vocab(sentences=train_ger, tokenizer=tokenize_ger)
english_vocab = build_vocab(sentences=train_eng, tokenizer=tokenize_eng)


class TranslationDataset(Dataset):
    """Sentence-pair dataset that yields (source, target) index tensors.

    Item ``idx`` is built from ``src_sentences[idx]`` and ``tgt_sentences[idx]``:
    each sentence is tokenized, wrapped in INIT_TOKEN / EOS_TOKEN, and mapped to
    vocabulary indices, with unknown tokens replaced by the UNK_TOKEN index.
    """

    def __init__(self, src_sentences, tgt_sentences, src_vocab, tgt_vocab, src_tokenizer, tgt_tokenizer):
        self.src_sentences, self.tgt_sentences = src_sentences, tgt_sentences
        self.src_vocab, self.tgt_vocab = src_vocab, tgt_vocab
        self.src_tokenizer, self.tgt_tokenizer = src_tokenizer, tgt_tokenizer

    def __len__(self):
        # The number of source sentences defines the dataset size.
        return len(self.src_sentences)

    @staticmethod
    def _encode(sentence, tokenizer, vocab):
        """Tokenize one sentence, add the start/end markers and look up indices."""
        wrapped = [INIT_TOKEN] + tokenizer(sentence) + [EOS_TOKEN]
        unk_index = vocab[UNK_TOKEN]
        return torch.tensor([vocab.get(token, unk_index) for token in wrapped])

    def __getitem__(self, idx):
        source = self._encode(self.src_sentences[idx], self.src_tokenizer, self.src_vocab)
        target = self._encode(self.tgt_sentences[idx], self.tgt_tokenizer, self.tgt_vocab)
        return source, target

# Wrap the training sentences as a dataset of (German, English) index-tensor pairs
train_dataset = TranslationDataset(
    src_sentences=train_ger,
    tgt_sentences=train_eng,
    src_vocab=german_vocab,
    tgt_vocab=english_vocab,
    src_tokenizer=tokenize_ger,
    tgt_tokenizer=tokenize_eng,
)

# Shuffled loader. The identity collate_fn hands each batch back as the raw list of
# (src, tgt) tensor tuples, so nothing is padded or stacked here.
# NOTE(review): Seq2Seq.forward reads source.shape[1] as the batch size, so a padding
# collate_fn will presumably be needed before training; confirm.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=lambda batch: batch,
)

class Encoder(nn.Module):
    """LSTM encoder: embeds source token indices and returns the final LSTM state.

    Args:
        input_size: number of rows of the embedding table (source vocabulary size).
        hidden_size: size of the LSTM hidden and cell states.
        embedding_size: width of each token embedding.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, hidden_size, embedding_size, num_layers, p) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(p)
        # The LSTM consumes embedding vectors, so its input width must be
        # embedding_size. Using the vocabulary size here makes forward() fail
        # whenever the two sizes differ.
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, x):
        """Encode a batch of index sequences.

        Args:
            x: token indices of shape (seq_length, N).

        Returns:
            (hidden, cell): final LSTM states, each (num_layers, N, hidden_size).
        """
        # embedding shape : (seq_length, N, embedding_size)
        embedding = self.dropout(self.embedding(x))

        # Only the final hidden and cell states are needed; per-step outputs are dropped.
        _, (hidden, cell) = self.rnn(embedding)
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder: predicts vocabulary scores for the next token.

    Args:
        input_size: number of rows of the embedding table (target vocabulary size).
        embedding_size: width of each token embedding.
        hidden_size: size of the LSTM hidden and cell states.
        output_size: number of scores produced per step by the final linear layer.
        num_layers: number of stacked LSTM layers.
        p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p) -> None:
        # Initialise this class's own nn.Module base. The previous
        # super(Encoder, self) call raised TypeError because Decoder does not
        # derive from Encoder.
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        # NOTE(review): unlike Encoder, no dropout is passed to the LSTM itself; confirm this is intended.
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        Args:
            x: previous token indices, shape (N,).
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            (predictions, hidden, cell): predictions has shape (N, output_size);
            hidden and cell are the updated LSTM states.
        """
        # The LSTM expects a leading sequence dimension: (N,) -> (1, N)
        x = x.unsqueeze(0)

        # embedding shape : (1, N, embedding_size)
        embedding = self.dropout(self.embedding(x))

        # output shape : (1, N, hidden_size)
        output, (hidden, cell) = self.rnn(embedding, (hidden, cell))

        # predictions shape : (1, N, output_size) -> (N, output_size)
        predictions = self.fc(output).squeeze(0)

        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module mapping ``source`` to an initial ``(hidden, cell)`` state.
        decoder: module called as ``decoder(x, hidden, cell)`` and returning
            ``(scores, hidden, cell)``. It must expose its output layer as
            ``decoder.fc`` (an ``nn.Linear``), which sets the score width.
    """

    def __init__(self, encoder, decoder) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Compute decoder scores for every target position.

        Args:
            source: source indices, shape (source_len, N).
            target: target indices, shape (target_len, N); row 0 is the first decoder input.
            teacher_force_ratio: probability of feeding the ground-truth token,
                rather than the model's own prediction, as the next decoder input.

        Returns:
            Tensor of shape (target_len, N, target_vocab_size). Row 0 is left as
            zeros because decoding starts from target[0].
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Read the width from the decoder's own output layer instead of a global
        # vocabulary, so the buffer always matches what the decoder emits.
        target_vocab_size = self.decoder.fc.out_features

        # Allocate on the same device as the inputs so this also works on GPU.
        outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)

        hidden, cell = self.encoder(source)

        x = target[0]  # first decoder input is the start-of-sentence token

        for t in range(1, target_len):
            # output --> (batch_size, target_vocab_size)
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output

            best_guess = output.argmax(1)

            # random.random() is uniform in [0, 1): teacher-force with the given probability
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        # Return only after every position has been decoded. Returning inside
        # the loop would stop after the first step.
        return outputs

# Training ... to be continued
