In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchdata.datapipes as dp
import torchtext.transforms as T
import spacy
import random
from torchtext.vocab import build_vocab_from_iterator

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
!wget 'https://raw.githubusercontent.com/Apress/applied-natural-language-processing-w-python/master/data_etc/deu.txt'

--2024-04-15 19:14:02--  https://raw.githubusercontent.com/Apress/applied-natural-language-processing-w-python/master/data_etc/deu.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12316220 (12M) [text/plain]
Saving to: 'deu.txt'


2024-04-15 19:14:02 (138 MB/s) - 'deu.txt' saved [12316220/12316220]



In [4]:
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [5]:
eng = spacy.load("en_core_web_sm") # Load the English model to tokenize English text
de = spacy.load("de_core_news_sm") # Load the German model to tokenize German text

In [6]:
import random

def split_dataset(input_file, train_file, test_file, train_percent):
    with open(input_file, 'r') as file:
        lines = file.readlines()  

    total_lines = len(lines)
    train_lines = int(total_lines * train_percent)
    test_lines = total_lines - train_lines


    random.shuffle(lines)

    with open(train_file, 'w') as train:
        for line in lines[:train_lines]:
            train.write(line)

    with open(test_file, 'w') as test:
        for line in lines[train_lines:]:
            test.write(line)


input_file = '/kaggle/working/deu.txt'


train_file = 'shiny_train.txt'
test_file = 'shiny_test.txt'

train_percent = 0.8

split_dataset(input_file, train_file, test_file, train_percent)

In [7]:
FILE_PATH = '/kaggle/working/shiny_train.txt'
data_pipe = dp.iter.IterableWrapper([FILE_PATH])
data_pipe = dp.iter.FileOpener(data_pipe, mode='rb')
data_pipe = data_pipe.parse_csv(skip_lines=1, delimiter='\t', as_tuple=True)

In [8]:
for sample in data_pipe:
    print(sample)
    break

('The city was founded in 573.', 'Die Stadt wurde 573 gegründet.')


In [9]:
def engTokenize(text):
    """
    Tokenize an English text and return a list of tokens
    """
    return [token.text for token in eng.tokenizer(text)]

def deTokenize(text):
    """
    Tokenize a German text and return a list of tokens
    """
    return [token.text for token in de.tokenizer(text)]

In [10]:
print(engTokenize("Have a good day!!!"))
print(deTokenize("Haben Sie einen guten Tag!!!"))

['Have', 'a', 'good', 'day', '!', '!', '!']
['Haben', 'Sie', 'einen', 'guten', 'Tag', '!', '!', '!']


In [11]:
def getTokens(data_iter, place):
    """
    Function to yield tokens from an iterator. Since, our iterator contains
    tuple of sentences (source and target), `place` parameters defines for which
    index to return the tokens for. `place=0` for source and `place=1` for target
    """
    for english, german in data_iter:
        if place == 0:
            yield engTokenize(english)
        else:
            yield deTokenize(german)

In [12]:
source_vocab = build_vocab_from_iterator(
    getTokens(data_pipe,0),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
source_vocab.set_default_index(source_vocab['<unk>'])

In [13]:
target_vocab = build_vocab_from_iterator(
    getTokens(data_pipe,1),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
target_vocab.set_default_index(target_vocab['<unk>'])

In [14]:
print(source_vocab.get_itos()[:9])

['<pad>', '<sos>', '<eos>', '<unk>', '.', 'I', 'Tom', 'to', 'you']


In [15]:
def getTransform(vocab):
    """
    Create transforms based on given vocabulary. The returned transform is applied to sequence
    of tokens.
    """
    text_tranform = T.Sequential(
        ## converts the sentences to indices based on given vocabulary
        T.VocabTransform(vocab=vocab),
        ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is
        # 1 as seen in previous section
        T.AddToken(1, begin=True),
        ## Add <eos> at beginning of each sentence. 2 because the index for <eos> in vocabulary is
        # 2 as seen in previous section
        T.AddToken(2, begin=False)
    )
    return text_tranform

In [16]:
temp_list = list(data_pipe)
some_sentence = temp_list[798][0]
print("Some sentence= ", end="")
print(some_sentence)
transformed_sentence = getTransform(source_vocab)(engTokenize(some_sentence))
print("Transformed sentence=", end="")
print(transformed_sentence)
index_to_string = source_vocab.get_itos()
for index in transformed_sentence:
    print(index_to_string[index], end=" ")

Some sentence= That's why I like you.
Transformed sentence=[1, 73, 14, 211, 5, 44, 8, 4, 2]
<sos> That 's why I like you . <eos> 

In [17]:
def applyTransform(sequence_pair):
    """
    Apply transforms to sequence of tokens in a sequence pair
    """

    return (
        getTransform(source_vocab)(engTokenize(sequence_pair[0])),
        getTransform(target_vocab)(deTokenize(sequence_pair[1]))
    )
data_pipe = data_pipe.map(applyTransform) ## Apply the function to each element in the iterator
temp_list = list(data_pipe)
print(temp_list[0])

([1, 27, 449, 20, 3860, 16, 3, 4, 2], [1, 55, 302, 88, 3, 3530, 4, 2])


In [18]:
def sortBucket(bucket):
    """
    Function to sort a given bucket. Here, we want to sort based on the length of
    source and target sequence.
    """
    return sorted(bucket, key=lambda x: (len(x[0]), len(x[1])))

In [19]:
bucket = [
    (['how', 'are', 'you'], ['wie', 'geht', 'es', 'dir']),
    (['hello', 'world'], ['hallo', 'welt']),
    (['this', 'is', 'a', 'test'], ['das', 'ist', 'ein', 'test'])
]

sorted_data = sortBucket(bucket)

for item in sorted_data:
    print(item)

(['hello', 'world'], ['hallo', 'welt'])
(['how', 'are', 'you'], ['wie', 'geht', 'es', 'dir'])
(['this', 'is', 'a', 'test'], ['das', 'ist', 'ein', 'test'])


In [20]:
data_pipe = data_pipe.bucketbatch(
    batch_size = 64, batch_num=5,  bucket_num=1,
    use_in_batch_shuffle=False, sort_key=sortBucket
)

In [21]:
print(list(data_pipe)[0])

[([1, 27, 293, 110, 16, 40, 64, 4, 2], [1, 67, 310, 191, 1176, 41, 4, 2]), ([1, 80, 43, 12, 22, 255, 120, 4, 2], [1, 12, 437, 9, 297, 115, 4, 2]), ([1, 5, 45, 42, 881, 387, 199, 4, 2], [1, 7, 1109, 498, 189, 205, 4, 2]), ([1, 39, 86, 194, 6, 11, 468, 4, 2], [1, 38, 280, 6, 37, 920, 274, 4, 2]), ([1, 6, 110, 66, 467, 38, 28, 4, 2], [1, 6, 88, 69, 677, 30, 39, 4, 2]), ([1, 50, 13, 863, 17, 70, 417, 4, 2], [1, 12, 10, 841, 30, 389, 430, 4, 2]), ([1, 173, 43, 23, 977, 7, 406, 10, 2], [1, 212, 20, 17, 895, 11, 784, 8, 2]), ([1, 73, 135, 13, 11, 170, 135, 4, 2], [1, 221, 134, 10, 21, 548, 134, 4, 2]), ([1, 50, 640, 7, 925, 9, 212, 4, 2], [1, 12, 2256, 32, 214, 11, 2343, 4, 2]), ([1, 26, 86, 380, 6, 11, 2092, 4, 2], [1, 1387, 12, 6, 177, 37, 554, 4, 2]), ([1, 5, 2740, 2152, 16, 9, 731, 4, 2], [1, 7, 24, 2503, 68, 685, 4532, 4, 2]), ([1, 5, 22, 11, 66, 176, 2419, 4, 2], [1, 7, 24, 37, 69, 504, 3935, 4, 2]), ([1, 173, 43, 8, 290, 9, 776, 10, 2], [1, 212, 75, 13, 14, 1907, 225, 8, 2]), ([1, 173,

In [22]:
def separateSourceTarget(sequence_pairs):
    """
    input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`
    output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`
    """
    sources,targets = zip(*sequence_pairs)
    return sources,targets

## Apply the function to each element in the iterator
data_pipe = data_pipe.map(separateSourceTarget)
print(list(data_pipe)[0])

(([1, 5, 22, 12, 1157, 187, 37, 307, 308, 272, 4, 2], [1, 59, 590, 1476, 13, 1969, 105, 9, 801, 83, 4, 2], [1, 36, 8, 223, 30, 247, 244, 16, 9, 604, 10, 2], [1, 36, 8, 58, 6, 13, 16, 163, 38, 8, 10, 2], [1, 5, 47, 11, 341, 1565, 195, 13, 11, 971, 4, 2], [1, 3263, 145, 33, 12, 162, 6884, 105, 372, 145, 4, 2], [1, 27, 343, 20, 7429, 7, 1550, 78, 9, 461, 4, 2], [1, 49, 14, 30, 397, 363, 7, 893, 16, 403, 10, 2], [1, 25, 1203, 21, 7, 210, 9, 155, 48, 353, 4, 2], [1, 27, 118, 2531, 13, 801, 19, 672, 34, 590, 4, 2], [1, 39, 264, 18, 9, 288, 85, 12, 35, 252, 4, 2], [1, 5, 314, 7, 129, 202, 18, 493, 67, 421, 4, 2], [1, 6, 14, 734, 33, 239, 2700, 7, 28, 14, 4, 2], [1, 41, 274, 44, 8, 53, 579, 55, 126, 302, 4, 2], [1, 5, 86, 69, 1541, 8, 18, 29, 347, 369, 4, 2], [1, 5, 47, 8, 33, 1237, 317, 577, 9, 2519, 4, 2], [1, 6, 102, 8, 347, 35, 514, 16, 29, 135, 4, 2], [1, 5, 91, 160, 121, 51, 1146, 172, 9, 1210, 4, 2], [1, 6, 243, 7, 244, 1828, 175, 32, 110, 277, 4, 2], [1, 5, 69, 1274, 614, 8, 311, 629, 

In [23]:
def applyPadding(pair_of_sequences):
    """
    Convert sequences to tensors and apply padding
    """
    return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1])))
## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies
# padding. Here, `0` is passed to the constructor to specify the index of the `<pad>` token in the
# vocabulary.
data_pipe = data_pipe.map(applyPadding)

In [24]:
print(list(data_pipe)[0])

(tensor([[   1, 1460,   12,   79,  101,   57,  115,   10,    2,    0],
        [   1,   41,   14,   24,   30,  388,   82,    4,    2,    0],
        [   1,    5,   20,  917,   38,    9, 1562,    4,    2,    0],
        [   1,    6,   34,   28,   33,  162, 1673,    4,    2,    0],
        [   1,   39,  162,  323,  181,   40,   54,    4,    2,    0],
        [   1,   80,   67,   11, 2543,   64,  381,    4,    2,    0],
        [   1,    5,   45,   42, 2373, 1196,  272,    4,    2,    0],
        [   1,   26,   15,   12, 1658,   21,  309,    4,    2,    0],
        [   1, 8446,  285, 6184,   17,    9, 5965,    4,    2,    0],
        [   1,   80, 6796,    9, 4879,   38,  346,    4,    2,    0],
        [   1,    5,   45,  305,   32,   13, 1038,    4,    2,    0],
        [   1,  504,   33,   24,    8,   19,    6,    4,    2,    0],
        [   1,   26, 1024,    7,   22,  542,  152,    4,    2,    0],
        [   1,  166,    8,  180,   89,    7, 8313,   10,    2,    0],
        [   1, 3263

In [25]:
source_index_to_string = source_vocab.get_itos()
target_index_to_string = target_vocab.get_itos()

def showSomeTransformedSentences(data_pipe):
    """
    Function to show how the sentences look like after applying all transforms.
    Here we try to print actual words instead of corresponding index
    """
    for sources,targets in data_pipe:
        if sources[0][-1] != 0:
            continue # Just to visualize padding of shorter sentences
        for i in range(4):
            source = ""
            for token in sources[i]:
                source += " " + source_index_to_string[token]
            target = ""
            for token in targets[i]:
                target += " " + target_index_to_string[token]
            print(f"Source: {source}")
            print(f"Traget: {target}")
        break

showSomeTransformedSentences(data_pipe)

Source:  <sos> Keep them . <eos> <pad> <pad> <pad>
Traget:  <sos> Behaltet sie . <eos> <pad> <pad> <pad> <pad> <pad> <pad>
Source:  <sos> We 're screaming . <eos> <pad> <pad>
Traget:  <sos> Wir schreien . <eos> <pad> <pad> <pad> <pad> <pad> <pad>
Source:  <sos> Expect the unexpected . <eos> <pad> <pad>
Traget:  <sos> Erwarte das Unerwartete ! <eos> <pad> <pad> <pad> <pad> <pad>
Source:  <sos> It works great . <eos> <pad> <pad>
Traget:  <sos> Es funktioniert bestens . <eos> <pad> <pad> <pad> <pad> <pad>


In [26]:
for (data, target) in data_pipe:
    print(data)
    print(data.shape)
    print(target)
    print(target.shape)
    break

tensor([[   1,    6,  535, 3166,   78,    9, 1350,    4,    2,    0],
        [   1,    6,   52,   12,  537,   11,  543,    4,    2,    0],
        [   1,   27, 2625, 3882,   40,    9, 1392,    4,    2,    0],
        [   1,   25,   43,   23,   24,    9,  120,    4,    2,    0],
        [   1,  106,   28,   34, 1326,  125, 1138,   10,    2,    0],
        [   1,    5,  179,    6,    7,  427,  294,    4,    2,    0],
        [   1,   50,  249,   97,  126,    7,  192,    4,    2,    0],
        [   1,    5,  432,  116,    6,  265,   72,    4,    2,    0],
        [   1,   25,   14,  112,  176,   24,   21,    4,    2,    0],
        [   1,    5,   15,   12,  352,    3,  309,    4,    2,    0],
        [   1,   27,  288,   13, 1419,   78,  836,    4,    2,    0],
        [   1,  151,   81,    8,  117,   23,  256,   10,    2,    0],
        [   1,    5,   47,  116,  467,    8,   33,    4,    2,    0],
        [   1,    6,  102, 1250,   67,   89, 1336,    4,    2,    0],
        [   1,    5,

In [27]:
import torch.nn as nn
import random
import torch.optim as optim
from tqdm import tqdm
import torch

In [28]:
class Transformers(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device
    ):
        super(Transformers, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)
        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx
    
    def make_src_mask(self, src):
        # src shape = (src_len, N)
        src_mask = src.transpose(0, 1) == self.src_pad_idx
        # (N, src_len)
        return src_mask
    
    def forward(self, src, trg):
        src_seq_lenght, N = src.shape
        trg_seq_lenght, N = trg.shape
        
        src_positions = (
            torch.arange(0, src_seq_lenght).unsqueeze(1).expand(src_seq_lenght, N)
            .to(self.device)
        )
        
        trg_positions = (
            torch.arange(0, trg_seq_lenght).unsqueeze(1).expand(trg_seq_lenght, N)
            .to(self.device)
        )
        
        embed_src = self.dropout(self.src_word_embedding(src) + self.src_position_embedding(src_positions))
        embed_trg = self.dropout(self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions))
        src_mask_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_lenght).to(self.device)
        
        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask = src_mask_padding_mask,
            tgt_mask = trg_mask
        )
        out = self.fc_out(out)
        return out

In [29]:
num_epochs = 10
learning_rate = 3e-4
batch_size = 64

src_vocab_size = len(source_vocab)
trg_vocab_size = len(target_vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 57
forward_expansion = 2048
src_pad_idx = target_vocab.lookup_indices(['<pad>'])[0]

In [30]:
FILE_PATH = '/kaggle/working/shiny_test.txt'
test_data_pipe = dp.iter.IterableWrapper([FILE_PATH])
test_data_pipe = dp.iter.FileOpener(test_data_pipe, mode='rb')
test_data_pipe = test_data_pipe.parse_csv(skip_lines=1, delimiter='\t', as_tuple=True)

In [31]:
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys

def translate_sentence(model, sentence, german_voc, english_voc, spacy_en, device, max_length=50):

    # Create tokens using spacy and everything in lower case (which is what our vocab is)
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_en(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, '<sos>')
    tokens.append('<eos>')
    
    # Go through each german token and convert to an index
    text_to_indices = english_voc.lookup_indices(tokens)

    # Convert to Tensor
    sentence_tensor = torch.tensor(text_to_indices, device=device).unsqueeze(1)

    outputs = german_voc.forward(['<sos>'])
    end_of_sentece = german_voc.forward(['<eos>'])[0]
    
    for i in range(max_length):
        trg_tensor = torch.tensor(outputs, device=device).unsqueeze(1)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == end_of_sentece:
            break

    translated_sentence = german_voc.lookup_tokens(outputs)
    # remove start token
    return translated_sentence[1:]

In [32]:
model = Transformers(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device
).to(device)



In [33]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
pad_idx = target_vocab.lookup_indices(['<pad>'])[0]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [34]:
sentence = 'You must know yourself.'
spacy_en = spacy.load("en_core_web_sm")

for epoch in range(num_epochs):
    
    model.eval()
    translated_sentence = translate_sentence(model, sentence, target_vocab, source_vocab, spacy_en, device)
    print(' '.join(translated_sentence[:-1]))
    model.train()
    print(f'Epoch {epoch+1} / {num_epochs}')
    for batch_idx, (data, target) in enumerate(data_pipe):
            data = data.permute([1,0])
            data = data.to(device)
           
            target = target.permute([1,0])
            target = target.to(device)
    
            output = model(data, target[:-1])
        
            output = output.reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)
            
            optimizer.zero_grad()
            loss = criterion(output, target)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()

Flitterwochen erfreut Flitterwochen abgeschnitten Fußstapfen Flitterwochen sauren Verausgabe Verausgabe Protesten vergiss abgeschnitten Verausgabe Verausgabe Protesten Schlips Verausgabe Verausgabe Verausgabe unausstehlich beklagt Bauern enttäuschen Computerspiel Verausgabe Verausgabe Verausgabe lud liebe gelesen geküsst Computerspiel Verausgabe Flitterwochen abgeschnitten Fußstapfen anliegende beklagt enttäuschen irrte Protesten Bauern kannte Fußballmannschaft zusammen Verausgabe seien Beförderung attraktiver
Epoch 1 / 10
Du musst dich wissen .
Epoch 2 / 10
Sie müssen sich selbst wissen .
Epoch 3 / 10
Du musst dich selbst wissen .
Epoch 4 / 10
Du musst dich selbst wissen .
Epoch 5 / 10
Sie müssen sich selbst wissen .
Epoch 6 / 10
Sie müssen sich selbst kennen .
Epoch 7 / 10
Sie müssen sich selbst kennen .
Epoch 8 / 10
Sie müssen sich kennen .
Epoch 9 / 10
Sie müssen sich kennen .
Epoch 10 / 10


In [35]:
from torchmetrics.functional.text import bleu_score
from torch.utils.data import DataLoader


def bleu(data_pipe, model, german_vocab, english_vocab, device):
    spacy_en = spacy.load("en_core_web_sm")
    targets = []
    outputs = []
    
    for idx, (source, target) in enumerate(data_pipe):
        if idx % 10000 == 0:
            print(idx)
        
        prediction = translate_sentence(model, source, german_vocab, english_vocab, spacy_en, device)
        prediction = prediction[:-1]
        detokenized  = ' '.join(deTokenize(target))
        targets.append([detokenized])
        outputs.append(" ".join(prediction))
#         print(f'Source: {source} Target: {t} Pred: {detokenized}', end='\n\n')
    return bleu_score(outputs, targets)

In [36]:
score = bleu(test_data_pipe, model, target_vocab, source_vocab, device)

0
10000
20000
30000


In [37]:
print(score)

tensor(0.2155)
