In [31]:
import torch
import torch.optim as optim
from torchtext.datasets import Multi30k
import numpy as np
import pandas as pd
import spacy
import random
import re
import string
import torchtext
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [33]:
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
epochs = 50

In [34]:
embedding_size = 300
hidden_size = 1024
num_layers = 2
dropout = 0.5
dropout = 0.5

In [35]:
learning_rate = 3e-4
batch_size =32

In [36]:
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [37]:
spacy_ger = spacy.load('de_core_news_sm')
spacy_eng = spacy.load('en_core_web_sm')

In [38]:
class Vocabulary:
    def __init__(self, freq_threshold):
        self.itos = {0:"<PAD>", 1:"<SOS>", 2:"<EOS>", 3:"<UNK>"}
        self.stoi = {"<PAD>":0, "<SOS>":1, "<EOS>":2, "<UNK>":3}
        self.freq_threshold  = freq_threshold
        
    def __len__(self):
        return len(self.itos)
    
    @staticmethod
    def preprocessing_text(text):
        text = text.lower().strip()
        text = re.sub(f'[{string.punctuation}\n]', '', text)
        return text
    
    @staticmethod
    def tokenizer_ger(text):
        text = Vocabulary.preprocessing_text(text)
        return [tok.text for tok in spacy_ger.tokenizer(text)]
    
    @staticmethod
    def tokenizer_eng(text):
        text = Vocabulary.preprocessing_text(text)
        return [tok.text for tok in spacy_eng.tokenizer(text)]
    
    def build_vocabulary(self, sentence_list):
        frequencies = {}
        index = 4
        
        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                    
                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = index
                    self.itos[index] = word
                    index += 1
            
            for word in self.tokenizer_ger(sentence):
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                    
                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = index
                    self.itos[index] = word
                    index += 1
                    
    def numericalize(self, text, lang="eng"):
        if lang == "eng":
            tokenized_text = self.tokenizer_eng(text)
        else:
            tokenized_text = self.tokenizer_ger(text)
            
        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]

In [39]:
class MyCollate:
    def __init__(self, pad_index):
        self.pad_index = pad_index
        
    def __call__(self, batch):
        eng = [item[0] for item in batch]
        eng = pad_sequence(eng, batch_first=False, padding_value = self.pad_index)
        ger = [item[1] for item in batch]
        ger = pad_sequence(ger, batch_first=False, padding_value = self.pad_index)
        return eng, ger

In [40]:
class Multi30K_Custom(Dataset):
    def __init__(self, csv_file, transform = None, freq_threshold = 5):
        self.csv_file = csv_file
        self.data = pd.read_csv(csv_file)
        self.eng = self.data["english"]
        self.ger = self.data["german"]
        self.vocab_eng = Vocabulary(freq_threshold)
        self.vocab_eng.build_vocabulary(self.eng.tolist())
        self.vocab_ger = Vocabulary(freq_threshold)
        self.vocab_ger.build_vocabulary(self.ger.tolist())
        
    def __len__(self):
        return len(self.data)
    
    def eng_vocab_len(self):
        return len(self.vocab_eng)
    
    def ger_vocab_len(self):
        return len(self.vocab_ger)
    
    def __getitem__(self, index):
        eng = self.eng[index]
        ger = self.ger[index]
        eng_cap = [self.vocab_eng.stoi["<SOS>"]]
        eng_cap += self.vocab_eng.numericalize(eng)
        eng_cap.append(self.vocab_eng.stoi["<EOS>"])
        ger_cap = [self.vocab_ger.stoi["<SOS>"]]
        ger_cap += self.vocab_ger.numericalize(eng)
        ger_cap.append(self.vocab_ger.stoi["<EOS>"])
        return torch.tensor(eng_cap), torch.tensor(ger_cap)

In [41]:
train_data = Multi30K_Custom("/kaggle/input/machine-translation-dataset-de-en/translation_train.csv")
test_data = Multi30K_Custom("/kaggle/input/machine-translation-dataset-de-en/translation_test.csv")
eng_pad_index = train_data.vocab_eng.stoi["<PAD>"]
ger_pad_index = train_data.vocab_ger.stoi["<PAD>"]
train_loader = DataLoader(train_data, batch_size, shuffle=True, collate_fn=MyCollate(eng_pad_index))
test_loader = DataLoader(test_data, batch_size, shuffle=True, collate_fn=MyCollate(ger_pad_index))

In [42]:
eng_vocab = train_data.vocab_eng
ger_vocab = train_data.vocab_ger
eng_vocab_len = train_data.eng_vocab_len()
ger_vocab_len = train_data.ger_vocab_len()
input_encoder = ger_vocab_len
input_decoder = eng_vocab_len
output_size = eng_vocab_len

In [43]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(p)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        
    def forward(self, x):   # (seq_length, batch_size)
        x = self.embedding(x)  # (seq_length, batch_size, embedding_size)
        x = self.dropout(x)
        output, (hidden, cell) = self.lstm(x)
        return output, hidden, cell

In [44]:
class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size, num_layers, output_size, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(p)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hidden, cell):   # (batch_size) but want (1, batch_size)
        x = x.view(1, x.size(0))
        x = self.embedding(x)  # (1, batch_size, embedding_size)
        x = self.dropout(x)
        outputs, (hidden, cell) = self.lstm(x)  # output --> (1, batch_size, hidden_size)
        predictions = self.fc(outputs)  # (1, batch_size, vocab_eng)
        predictions = predictions.squeeze(0)
        return predictions, hidden, cell

In [45]:
class Translator(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = Encoder(input_encoder, hidden_size, embedding_size, num_layers, dropout)
        self.decoder = Decoder(input_decoder, hidden_size, embedding_size, num_layers, output_size, dropout)
        
    def forward(self, source, target, teacher_force_ratio = 0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = eng_vocab_len
        
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        _, hidden, cell = self.encoder(source)
        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess
            return outputs

In [46]:
model = Translator().to(device)
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=eng_pad_index)

In [47]:
t_loss = []
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch_idx, (eng, ger) in enumerate(train_loader):
        optimizer.zero_grad()
        eng = eng.to(device)
        ger = ger.to(device)
        output = model(ger, eng)
        output = output[1:].reshape(-1, output.shape[2])
        eng = eng[1:].reshape(-1)
        loss = criterion(output, eng)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        train_loss += loss.item()
        optimizer.step()
    train_loss /= len(train_loader)
    t_loss.append(train_loss)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss}')

Epoch 1, Train Loss: 7.947397762517288
Epoch 2, Train Loss: 7.934791579661848
Epoch 3, Train Loss: 7.934518496545962
Epoch 4, Train Loss: 7.934014799313614
Epoch 5, Train Loss: 7.933949411112524
Epoch 6, Train Loss: 7.934088987710063
Epoch 7, Train Loss: 7.933795534131808
Epoch 8, Train Loss: 7.933719405265732
Epoch 9, Train Loss: 7.933795636649169
Epoch 10, Train Loss: 7.933539807467897
Epoch 11, Train Loss: 7.933519842343399
Epoch 12, Train Loss: 7.933554942710397
Epoch 13, Train Loss: 7.933818928067713
Epoch 14, Train Loss: 7.933527850789298
Epoch 15, Train Loss: 7.933509988585013
Epoch 16, Train Loss: 7.933554111005453
Epoch 17, Train Loss: 7.933248681296587
Epoch 18, Train Loss: 7.9335782504265735
Epoch 19, Train Loss: 7.933573734931147
Epoch 20, Train Loss: 7.933711280107761
Epoch 21, Train Loss: 7.933812526778586
Epoch 22, Train Loss: 7.933521012092767
Epoch 23, Train Loss: 7.933637853494158
Epoch 24, Train Loss: 7.93331409841283
Epoch 25, Train Loss: 7.933529851192151
Epoch 26,

In [48]:
def translate_sentence(model, sentence, src_vocab, trg_vocab, device, max_length=50):
    # Set the model to evaluation mode
    model.eval()
    
    # Tokenize and numericalize the input sentence
    tokens = [token.text.lower() for token in spacy_ger.tokenizer(sentence)]
    numericalized_tokens = [src_vocab.stoi["<SOS>"]] + [src_vocab.stoi.get(token, src_vocab.stoi["<UNK>"]) for token in tokens] + [src_vocab.stoi["<EOS>"]]
    sentence_tensor = torch.tensor(numericalized_tokens).unsqueeze(1).to(device)
    
    # Feed the input to the encoder
    with torch.no_grad():
        hidden = model.encoder(sentence_tensor)
    
    # Initialize the decoder input with <SOS>
    x = torch.tensor([trg_vocab.stoi["<SOS>"]], dtype=torch.long).to(device)
    
    translated_sentence = []
    cell = tuple(torch.zeros_like(h) for h in hidden)  # Initialize cell state
    
    for _ in range(max_length):
        with torch.no_grad():
            output, hidden, cell = model.decoder(x, hidden, cell)  # Pass both hidden and cell states to decoder
            pred_token = output.argmax(1).item()
            translated_sentence.append(pred_token)
            x = torch.tensor([[pred_token]], dtype=torch.long).to(device)  # Corrected shape for the decoder input
            if pred_token == trg_vocab.stoi["<EOS>"]:
                break
    
    translated_tokens = [trg_vocab.itos[idx] for idx in translated_sentence]
    
    return translated_tokens


In [49]:
sentence = "ein kleines mädchen spielt im garten"
translated_sentence = translate_sentence(model, sentence, ger_vocab, eng_vocab, device)
translated_sentence = ' '.join(translated_sentence)
print(f"Translated Sentence: {translated_sentence}")

Translated Sentence: a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine a tambourine
