In [1]:
from datasets import load_dataset

ds = load_dataset("PaulineSanchez/Translation_words_and_sentences_english_french")

README.md:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

(…)-00000-of-00001-3d14582ea46e1b17.parquet:   0%|          | 0.00/6.43M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/175466 [00:00<?, ? examples/s]

In [2]:
ds

DatasetDict({
    train: Dataset({
        features: ['English words/sentences', 'French words/sentences'],
        num_rows: 175466
    })
})

In [8]:
with open('en-fr-175k.txt', 'w') as file:
    for each in ds['train']:
        en = each['English words/sentences']
        fr = each['French words/sentences']
        text = f"{fr}||{en}\n"
        file.write(text)


In [1]:
en_sent = []
fr_sent = []
with open('en-fr-175k.txt', 'r') as f:
    for line in f.readlines():
        fr, en = line.split('||')
        en_sent.append(en.strip())
        fr_sent.append(fr.strip())

print(len(fr_sent))
len(en_sent)

175466


175466

In [2]:
pairs = [(fr, en) for fr, en in zip(fr_sent, en_sent)]
pairs

[('Salut!', 'Hi.'),
 ('Cours\u202f!', 'Run!'),
 ('Courez\u202f!', 'Run!'),
 ('Qui ?', 'Who?'),
 ('Ça alors\u202f!', 'Wow!'),
 ('Au feu !', 'Fire!'),
 ("À l'aide\u202f!", 'Help!'),
 ('Saute.', 'Jump.'),
 ('Ça suffit\u202f!', 'Stop!'),
 ('Stop\u202f!', 'Stop!'),
 ('Arrête-toi !', 'Stop!'),
 ('Attends !', 'Wait!'),
 ('Attendez !', 'Wait!'),
 ('Poursuis.', 'Go on.'),
 ('Continuez.', 'Go on.'),
 ('Poursuivez.', 'Go on.'),
 ('Bonjour !', 'Hello!'),
 ('Salut !', 'Hello!'),
 ('Je comprends.', 'I see.'),
 ("J'essaye.", 'I try.'),
 ("J'ai gagné !", 'I won!'),
 ("Je l'ai emporté !", 'I won!'),
 ('J’ai gagné.', 'I won.'),
 ('Oh non !', 'Oh no!'),
 ('Attaque !', 'Attack!'),
 ('Attaquez !', 'Attack!'),
 ('Santé !', 'Cheers!'),
 ('À votre santé !', 'Cheers!'),
 ('Merci !', 'Cheers!'),
 ('Tchin-tchin !', 'Cheers!'),
 ('Lève-toi.', 'Get up.'),
 ('Va, maintenant.', 'Go now.'),
 ('Allez-y maintenant.', 'Go now.'),
 ('Vas-y maintenant.', 'Go now.'),
 ("J'ai pigé !", 'Got it!'),
 ('Compris !', 'Got it!'),


In [3]:
en_vocab = set(en_sent)
len(en_vocab)

123363

In [4]:
fr_vocab = set(fr_sent)
len(fr_vocab)

165448

In [5]:
def build_vocab(corpus):
    special_tokens = {
        "PAD_TOKEN": "<PAD>",
        "EOS_TOKEN": "<EOS>",
        "SOS_TOKEN": "<SOS>",
        "UNK_TOKEN": "<UNK>"
    }


    vocab = set()
    for sentence in corpus:
        words = sentence.lower().split()
        vocab.update(words)
    vocab = [value for key, value in special_tokens.items()] + sorted(list(vocab))
    print(f"{len(vocab)=}")

    word2id = {word: id for id, word in enumerate(vocab)}
    id2word = {id: word for id, word in enumerate(vocab)}
    return vocab, word2id, id2word

en_vocab, en_word2id, en_id2word =build_vocab(en_vocab)
fr_vocab, fr_word2id, fr_id2word = build_vocab(fr_vocab)


len(vocab)=26668
len(vocab)=42460


In [13]:
def encode_sent_to_token(sent, word2id, max_len=40, do_padding=False):
        encoding = [word2id['<SOS>']]
        for word in sent.lower().split():
            if word in word2id:
                encoding.append(word2id[word])
            else:
                encoding.append(word2id["<UNK>"])
        encoding.append(word2id["<EOS>"])

        if len(encoding)>=max_len:
            encoding = encoding[:max_len-1] + [word2id["<EOS>"]]
        else:
            if do_padding:
                encoding.extend([word2id["<PAD>"]]*(max_len-len(encoding)))
            # else:
            #     encoding.append(word2id["<PAD>"])
        return encoding

# def decode()

print(encode_sent_to_token("who are you how do  ", en_word2id))
print(encode_sent_to_token("who are you how do you know me", en_word2id))
print(encode_sent_to_token("who are you how do you know me you", en_word2id))

[2, 25991, 1554, 26555, 11629, 7115, 1]
[2, 25991, 1554, 26555, 11629, 7115, 26555, 13141, 14422, 1]
[2, 25991, 1554, 26555, 11629, 7115, 26555, 13141, 14422, 26555, 1]


In [14]:
from torch.utils.data import DataLoader, Dataset
import torch

class CustomDataset(Dataset):
    def __init__(self, pairs, en_word2id, fr_word2id, max_length):
        self.pairs = pairs
        self.en_word2id = en_word2id
        self.fr_word2id = fr_word2id
        self.max_length = max_length
    
    def __len__(self):
        return len(self.pairs)

    # def encode(self, sent, word2id, max_len=40):
    #     encoding = [word2id['<SOS>']]
    #     for word in sent.lower().split():
    #         if word in word2id:
    #             encoding.append(word2id[word])
    #         else:
    #             encoding.append(word2id["<UNK>"])
    #     encoding.append(word2id["<EOS>"])

    #     if len(encoding)>=max_len:
    #         encoding = encoding[:max_len-1] + [word2id["<EOS>"]]
    #     else:    
    #         encoding.extend([word2id["<PAD>"]]*(max_len-len(encoding)))     
    #     return encoding
    
    def __getitem__(self, index):
        fr, en = self.pairs[index]
        fr_tensor = torch.tensor(encode_sent_to_token(fr, self.fr_word2id, self.max_length, do_padding=True), dtype=torch.int32) #todo check if we can set unsigned here?
        en_tensor = torch.tensor(encode_sent_to_token(en, self.en_word2id, self.max_length, do_padding=True), dtype=torch.int32)
        return fr_tensor, en_tensor

batch_size = 8
dataset = CustomDataset(pairs, en_word2id, fr_word2id, max_length=32)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)


In [91]:
# from torch.nn.utils.rnn import pad_sequence

# batch = [
#     ([1,2,3,4], [101, 102, 10]),
#     ([1,2,3,4,5,6,7,8], [4332, 3, 103, 69696]),
#     ([1,2], [123, 111])
# ]
# batch = [(torch.tensor(pair[0]), torch.tensor(pair[1])) for pair in batch]

# en_batch, fr_batch = zip(*batch)
# print(en_batch)

# pad_sequence(en_batch, batch_first=True, padding_value=9999999)

(tensor([1, 2, 3, 4]), tensor([1, 2, 3, 4, 5, 6, 7, 8]), tensor([1, 2]))


tensor([[      1,       2,       3,       4, 9999999, 9999999, 9999999, 9999999],
        [      1,       2,       3,       4,       5,       6,       7,       8],
        [      1,       2, 9999999, 9999999, 9999999, 9999999, 9999999, 9999999]])

In [59]:
# from torch.nn.utils.rnn import pad_sequence

# """
# batch > 4

# pairs: [
#     ["who are you", "kon ho?"],
#     ["who are you", "kon ho?"],
#     ["who are you", "kon ho?"],
#     .
#     .
    
# ]

# 1; [1,2,3,4]
# 2; [8,3,5,6, 7]
# 3; [9,0]

# """

# def collate_fn(batch):
#     en, fr = zip(*batch)
#     print(en)
#     en_padded = pad_sequence(en, batch_first=True, padding_value="<P>")

# batch = [[[1,2,3],2], [[4,5,6], 345], [[8,9], 3234124]]
# collate_fn(batch) 

([1, 2, 3], [4, 5, 6], [8, 9])


TypeError: expected Tensor as element 0 in argument 0, but got list

In [15]:
for i, j in dataloader:
    print(i, j)
    break

tensor([[    2, 19118, 42315, 17973,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [    2, 19449, 34975,  5235, 30976, 27931,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [    2, 17818, 34685,  3670, 14347, 10081, 36156, 33020,     1,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [    2, 35770, 39586, 25865, 40371, 41066, 27591, 40205, 39586, 25865,
         21796, 11126, 27593,     1,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,  

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(emb_size, hidden_size, num_layers, batch_first=True)

    def forward(self, X):
        X = torch.flip(X, [1])
        embedded = self.embedding(X)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(emb_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, X, hidden, cell):
        output = self.embedding(X)
        output, (hidden, cell) = self.lstm(output, (hidden, cell))
        output = self.fc(output).reshape(output.size(0), -1)
        return output, hidden, cell


In [17]:
en_voca_size = len(en_vocab)
fr_voca_size = len(fr_vocab)

emb_size = 256
hidden_size = 512
num_layers = 1

encoder = Encoder(en_voca_size, emb_size, hidden_size, num_layers).to('cuda')
decoder = Decoder(fr_voca_size, emb_size, hidden_size, num_layers).to('cuda')

In [18]:
def translate(encode, decoder, sent, en_word2id, fr_id2word, max_length=20):
    encoder.eval()
    decoder.eval()

    with torch.inference_mode():
        input_tensor = torch.tensor(encode_sent_to_token(sent, en_word2id, max_len=max_length, do_padding=False), dtype=torch.int32)
        input_tensor = input_tensor.view(1, -1).to('cuda')

        _, encoder_hidden, encoder_cell = encode(input_tensor)

        decoder_input =torch.tensor([[en_word2id["<SOS>"]]], dtype=torch.int32)
        decoder_hidden, decoder_cell = encoder_hidden, encoder_cell

        decoded_words = []
        last_word = decoder_input.to('cuda')
        for di in range(max_length):
            logits, decoder_hidden, decoder_cell = decoder(last_word, decoder_hidden, decoder_cell)
            next_token = logits.argmax(dim=1)
            last_word = torch.tensor([[next_token]]).to('cuda')
            if next_token.item() == fr_word2id["<EOS>"]:
                break
            else:
                decoded_words.append(fr_id2word.get(next_token.item()))
    
    return " ".join(decoded_words)

translate(encoder, decoder, "who are you?", en_word2id, fr_id2word)


"capturée. fastidieux. cola utilise siens. arrières-pensées. entourée première bondé. affairé description paumée. couronne ecrivez interroger audition. naturel. n'attendais accro penseriez-vous"

In [20]:
import torch.optim as optim
import torch.nn as nn
import random

loss_fn = nn.CrossEntropyLoss(ignore_index=en_word2id["<PAD>"])

encoder_optimizer = optim.AdamW(encoder.parameters())
decoder_optimizer = optim.AdamW(decoder.parameters())

num_epochs = 1

encoder.train()
decoder.train()

for epoch in range(num_epochs):
    for i, (input_tensor, target_tensor) in enumerate(dataloader):
        input_tensor, target_tensor = input_tensor.to('cuda'), target_tensor.to('cuda')

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        target_length = target_tensor.size(1)

        _, encoder_hidden, encoder_cell = encoder(input_tensor)

        decoder_input = torch.full((batch_size, 1), en_word2id["<SOS>"], dtype=torch.int32).to('cuda')
        decoder_hidden, decoder_cell = encoder_hidden, encoder_cell

        random_word_index = random.randit(0, target_length-1)

        loss = 0

        for di in range(target_length):
            logits, decoder_hidden, decoder_cell = decoder(decoder_input, decoder_hidden, decoder_cell)
            loss += loss_fn(logits, target_tensor[:, di])
            decoder_input = target_tensor[:, di].reshape(batch_size, 1)
        
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        print(f"{epoch=} | Batch:{i} | Loss:{loss.item() / target_length:.4f}")
        

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
