In [1]:
import torch
import numpy as np
from torch import nn
import pickle
from torch import Tensor
from torch.utils.data import DataLoader, Dataset
import pandas as pd

from transformer import Transformer

In [2]:
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
# Keep a random 20% of the rows. The fixed seed makes the sample (and therefore
# the vocabulary and token ids derived from it later in the notebook) identical
# on every Restart & Run All.
df = pd.read_csv("D:\\translation_french_english\\archive\\en-fr.csv").sample(frac=0.2, random_state=42)

In [3]:
# Show the row count, discard every row that has a missing value, then show
# the row count again so the number of dropped rows is visible.
print(len(df.index))
df = df.dropna(axis="index")
print(len(df.index))

4504075
4504061


In [4]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,en,fr
6274489,5.5 A contributor subject to the CSC special p...,5.5 Un cotisant assujetti au régime spécial SC...
4037334,The Chief and Council and the health authority...,Le Chef et le Conseil et l’autorité de santé s...
12641160,"The port city is trade-oriented, has a solid i...","La ville portuaire est axée sur le commerce, e..."
2698970,The repayment schedule and a list of any condi...,"Enfin, le calendrier de remboursement et la li..."
14042083,"The King, like Metternich and the Austrian Kai...","Le roi, à l’instar de Metternich et des kaiser..."


In [5]:
# English side of the corpus as a plain Python list.
en_sentences = df.loc[:, "en"].to_list()

In [6]:
# French side of the corpus as a plain Python list (row-aligned with the English list).
fr_sentences = df.loc[:, "fr"].to_list()

In [7]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [None]:
# with open('english-german-both.pkl', 'rb') as f:
#     data = pickle.load(f)

In [None]:
# Normal sentences - array of strings
# eng_sentences = data[:, 0]
# ger_sentences = data[:, 1]

In [None]:
# print(len(eng_sentences))
# print(len(ger_sentences))

In [8]:
# Sanity check: both lists should report the same number of sentences.
print(len(en_sentences), len(fr_sentences), sep="\n")

4504061
4504061


In [9]:
def get_unique_words(sentences):
    """Return the set of distinct tokens found in ``sentences``.

    Each sentence is split on single space characters, so consecutive spaces
    produce an empty-string token and punctuation stays attached to words.
    """
    vocabulary = set()
    for text in sentences:
        vocabulary.update(text.split(' '))
    return vocabulary

def get_number_of_unique_words(sentences):
    """Return how many distinct space-separated tokens occur in ``sentences``."""
    unique_words = get_unique_words(sentences)
    return len(unique_words)

def sentence_to_indexes(sentence, mapper):
    """Convert ``sentence`` into a list of ids, one per space-separated token.

    ``mapper`` is indexed with each token; a token that is missing from it
    raises ``KeyError``.
    """
    return [mapper[token] for token in sentence.split(' ')]

def indexes_to_sentence(indexes, mapper):
    """Look up every id of ``indexes`` in ``mapper`` and join the words with single spaces."""
    return ' '.join(mapper[index] for index in indexes)

def pad_sequence(sequence, max_len, pad_idx=0, sos_idx=1, eos_idx=2):
    """Frame ``sequence`` with start/end markers and pad it to exactly ``max_len`` ids.

    Args:
        sequence: Token ids of one sentence (not modified).
        max_len: Length of the returned list.
        pad_idx: Id appended to fill the list up to ``max_len``.
        sos_idx: Id placed in front of the tokens.
        eos_idx: Id placed after the tokens.

    Returns:
        A new list ``[sos_idx, *tokens, eos_idx, pad_idx, ...]`` of length
        ``max_len``. Sequences that are too long lose trailing tokens, never
        the end marker (as long as ``max_len`` is at least 2).
    """
    if max_len >= 2:
        # Reserve two slots for the markers *before* framing; truncating the
        # framed list instead would cut off the end marker of long sentences.
        sequence = sequence[:max_len - 2]

    framed = [sos_idx] + list(sequence) + [eos_idx]
    # A negative repeat count yields an empty list, so no padding is added
    # when the framed sequence already fills max_len.
    framed.extend([pad_idx] * (max_len - len(framed)))

    # Only matters for max_len < 2, where even the two markers do not fit.
    return framed[:max_len]

In [10]:
# Reserved ids shared by both vocabularies: padding, start-of-sequence and
# end-of-sequence take 0, 1 and 2; real words start at 3.
PAD_TOKEN_IDX, SOS_TOKEN_IDX, EOS_TOKEN_IDX = range(3)

In [11]:
# Sorted word lists give every word a stable id for a given corpus sample.
# NOTE: the `ger_*` names hold the French side of the corpus (built from `fr_sentences`).
eng_words = sorted(get_unique_words(en_sentences))
ger_words = sorted(get_unique_words(fr_sentences))

# English lookup tables: words occupy ids 3.., ids 0-2 are the special tokens.
index_to_word_eng = dict(enumerate(eng_words, start=3))
index_to_word_eng.update({0: "<PAD>", 1: "<SOS>", 2: "<EOS>"})
word_to_index_eng = dict(zip(eng_words, range(3, 3 + len(eng_words))))
word_to_index_eng.update({"<PAD>": 0, "<SOS>": 1, "<EOS>": 2})

# Target-language lookup tables, laid out the same way.
index_to_word_ger = dict(enumerate(ger_words, start=3))
index_to_word_ger.update({0: "<PAD>", 1: "<SOS>", 2: "<EOS>"})
word_to_index_ger = dict(zip(ger_words, range(3, 3 + len(ger_words))))
word_to_index_ger.update({"<PAD>": 0, "<SOS>": 1, "<EOS>": 2})

In [12]:
# Encode every sentence as a list of vocabulary ids (no markers or padding yet).
eng_sentences_indexes = [sentence_to_indexes(sentence, mapper=word_to_index_eng) for sentence in en_sentences]
ger_sentences_indexes = [sentence_to_indexes(sentence, mapper=word_to_index_ger) for sentence in fr_sentences]

In [None]:
# Token count of the longest English sentence.
max(map(len, eng_sentences_indexes))

In [None]:
# Token count of the longest target-language sentence.
max(map(len, ger_sentences_indexes))

In [None]:
# Add the start/end markers and bring every sequence to exactly 20 ids.
# NOTE: this rebinds the same names, so running the cell twice frames the
# already-framed sequences again — re-run the encoding cell first.
eng_sentences_indexes = [pad_sequence(indexes, max_len=20) for indexes in eng_sentences_indexes]
ger_sentences_indexes = [pad_sequence(indexes, max_len=20) for indexes in ger_sentences_indexes]

In [None]:
# The vocabulary size must be larger than the largest token id, not merely
# equal to the number of distinct ids that survive truncation: the surviving
# ids are not contiguous, so counting them can give a size smaller than ids
# that actually occur in the data and index out of range in the model.
# The index tables hold one entry per id from 0 up to the largest id, so their
# length is exactly "largest id + 1".
src_vocab_size = len(index_to_word_eng)
trg_vocab_size = len(index_to_word_ger)

In [None]:
class TranslationDataset(Dataset):
    """Dataset of aligned (source, target) sequences, addressed by position."""

    def __init__(self, source, target):
        """Store both collections; they must contain the same number of items."""
        assert len(source) == len(target)
        self.source, self.target = source, target

    def __len__(self):
        """Number of sequence pairs."""
        return len(self.target)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair stored at ``idx``."""
        pair = (self.source[idx], self.target[idx])
        return pair

In [None]:
def collate_fn(batch):
    """Turn a list of ``(source, target)`` id sequences into two stacked tensors.

    Every sequence becomes its own tensor and the tensors of each side are
    stacked along a new leading dimension, so all sequences on one side must
    have the same length.
    """
    sources, targets = zip(*batch)

    src_batch = torch.stack([torch.tensor(ids) for ids in sources])
    trg_batch = torch.stack([torch.tensor(ids) for ids in targets])

    return src_batch, trg_batch


In [None]:
train_dataset = TranslationDataset(source=eng_sentences_indexes, target=ger_sentences_indexes)
# Reshuffle every epoch; without it the model sees the batches in exactly the
# same order in each of the training epochs.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [None]:
# Build the translation model; both vocabularies use the same padding id, and
# max_len matches the length every training sequence was padded to.
model = Transformer(
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    src_pad_idx=PAD_TOKEN_IDX,
    trg_pad_idx=PAD_TOKEN_IDX,
    d_model=512,
    heads=8,
    num_layers=6,
    max_len=20,
    dropout=0.1,
    device=device,
)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_IDX)

In [None]:
# Train for 10 epochs and report the mean batch loss of each one.
for epoch in range(10):
    model.train()
    total_batches = len(train_dataloader)
    total_loss = 0

    for src, trg in train_dataloader:
        src = src.to(device)
        trg = trg.to(device)

        # Clear the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass: the decoder is fed the target without its last token ...
        output = model(src, trg[:, :-1])

        # ... and is scored against the target without its first token.
        # Both are flattened so the loss sees one row of scores per target position.
        output = output.reshape(-1, output.shape[2])
        trg = trg[:, 1:].reshape(-1)

        # Compute the loss and update the weights
        loss = criterion(output, trg)
        loss.backward()
        # nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        total_loss += loss.item()

    # Mean loss over the epoch
    avg_loss = total_loss / total_batches
    print(f"Epoch: {epoch+1}, Average Loss: {avg_loss:.4f}")

In [None]:
# `eng_sentences` only exists in the commented-out cells above; the list
# actually built from the DataFrame is `en_sentences`.
en_sentences[4444]

In [None]:
model.eval()

# `en_sentences` is the list built from the DataFrame (`eng_sentences` is
# never defined in the live cells).
test_sentence = en_sentences[4444]
tokenized_text = sentence_to_indexes(test_sentence, mapper=word_to_index_eng)
# Frame and pad exactly like the training data (length 20, the model's max_len).
tokenized_text = pad_sequence(tokenized_text, max_len=20)

# Leading dimension of size 1: a batch holding the single source sentence.
sentence_tensor = torch.tensor(tokenized_text).unsqueeze(0).to(device)
print(sentence_tensor.shape)

# Greedy decoding: start from the start-of-sequence id and repeatedly append
# the highest-scoring next token, for at most 10 steps.
outputs = [SOS_TOKEN_IDX]
for i in range(10):
    # Batch-first like in training: one row containing the tokens produced so
    # far (unsqueeze(1) would instead create one single-token row per token).
    trg_tensor = torch.LongTensor(outputs).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(sentence_tensor, trg_tensor)

    # Scores are (batch, position, vocabulary) as in the training loop, so the
    # prediction for the next token sits at the last position of the only row.
    best_guess = output.argmax(2)[:, -1].item()
    outputs.append(best_guess)

    # Stop once the model emits the end-of-sequence id.
    if best_guess == EOS_TOKEN_IDX:
        break

print(indexes_to_sentence(outputs, mapper=index_to_word_ger))
print(outputs)