<a href="https://colab.research.google.com/github/KingsFrown/ml-course/blob/main/alien_translation_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm as tqdmn
from tqdm import tqdm
import random
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk

In [3]:
def load_jsonl(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    data = [pd.json_normalize(eval(line)) for line in tqdmn(lines, desc="Processing JSON Lines")]
    return pd.concat(data, ignore_index=True)

In [4]:
train_df = load_jsonl('train')

Processing JSON Lines:   0%|          | 0/300000 [00:00<?, ?it/s]

In [5]:
val_df = load_jsonl('val')
test_df = load_jsonl('test_no_reference')

Processing JSON Lines:   0%|          | 0/500 [00:00<?, ?it/s]

Processing JSON Lines:   0%|          | 0/1000 [00:00<?, ?it/s]

In [7]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [35]:
# Токенизаторы
def tokenize(text):
    return word_tokenize(text.lower())

# Создание словарей
def build_vocab(tokenized_texts, max_vocab_size=10000, specials=["<unk>", "<pad>", "<bos>", "<eos>"]):
    counter = Counter([token for sentence in tokenized_texts for token in sentence])
    most_common = counter.most_common(max_vocab_size - len(specials))
    vocab = {token: i for i, (token, _) in enumerate(most_common, start=len(specials))}
    vocab.update({token: i for i, token in enumerate(specials)})
    return vocab

train_src_tokens = [tokenize(text) for text in train_df['src']]
train_tgt_tokens = [tokenize(text) for text in train_df['dst']]

vocab_src = build_vocab(train_src_tokens, max_vocab_size=10000)
vocab_tgt = build_vocab(train_tgt_tokens, max_vocab_size=10000)

trg_vocab = {v:k for k,v in vocab_tgt.items()}
src_vocab = {v:k for k,v in vocab_src.items()}

# Преобразование текста в индексы
def text_pipeline(x, vocab):
    return [vocab.get(token, vocab["<unk>"]) for token in tokenize(x)]

# DataLoader
class TranslationDataset(Dataset):
    def __init__(self, df, vocab_src, vocab_tgt):
        self.df = df
        self.vocab_src = vocab_src
        self.vocab_tgt = vocab_tgt

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src_text = self.df.iloc[idx]['src']
        if 'dst' in self.df.columns:
            tgt_text = self.df.iloc[idx]['dst']
            tgt_tensor = torch.tensor(text_pipeline(tgt_text, self.vocab_tgt), dtype=torch.long)
        else:
            tgt_tensor = None
        src_tensor = torch.tensor(text_pipeline(src_text, self.vocab_src), dtype=torch.long)
        return src_tensor, tgt_tensor

train_dataset = TranslationDataset(train_df, vocab_src, vocab_tgt)
val_dataset = TranslationDataset(val_df, vocab_src, vocab_tgt)
test_dataset = TranslationDataset(test_df, vocab_src, vocab_tgt)

In [36]:
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=vocab_src["<pad>"])
    if tgt_batch[0] is not None:
        tgt_batch = torch.nn.utils.rnn.pad_sequence(tgt_batch, padding_value=vocab_tgt["<pad>"])
    return src_batch, tgt_batch

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn, num_workers=2)

In [28]:
# class Encoder(nn.Module):
#     def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
#         super().__init__()
#         self.embedding = nn.Embedding(input_dim, emb_dim)
#         self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, src):
#         embedded = self.dropout(self.embedding(src))
#         outputs, (hidden, cell) = self.rnn(embedded)
#         return hidden, cell

# class Decoder(nn.Module):
#     def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
#         super().__init__()
#         self.output_dim = output_dim
#         self.embedding = nn.Embedding(output_dim, emb_dim)
#         self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
#         self.fc_out = nn.Linear(hid_dim, output_dim)
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, input, hidden, cell):
#         input = input.unsqueeze(0)
#         embedded = self.dropout(self.embedding(input))
#         output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
#         prediction = self.fc_out(output.squeeze(0))
#         return prediction, hidden, cell

# class Seq2Seq(nn.Module):
#     def __init__(self, encoder, decoder, device):
#         super().__init__()
#         self.encoder = encoder
#         self.decoder = decoder
#         self.device = device

#     def forward(self, src, trg, teacher_forcing_ratio=0.5):
#         batch_size = trg.shape[1]
#         trg_len = trg.shape[0]
#         trg_vocab_size = self.decoder.output_dim
#         outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
#         hidden, cell = self.encoder(src)
#         input = trg[0,:]
#         for t in range(1, trg_len):
#             output, hidden, cell = self.decoder(input, hidden, cell)
#             outputs[t] = output
#             teacher_force = random.random() < teacher_forcing_ratio
#             top1 = output.argmax(1)
#             input = (trg[t] if teacher_force else top1)
#         return outputs

In [2]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout)  # Использование GRU вместо LSTM
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout)  # Использование GRU вместо LSTM
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        hidden = self.encoder(src)
        input = trg[0,:]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = (trg[t] if teacher_force else top1)
        return outputs

In [4]:
INPUT_DIM = 32000
OUTPUT_DIM = 32000
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.3
DEC_DROPOUT = 0.3

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)
sum([p.numel() for p in model.parameters()])

38317312

In [38]:
INPUT_DIM = len(vocab_src)
OUTPUT_DIM = len(vocab_tgt)
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 256
N_LAYERS = 2
ENC_DROPOUT = 0.3
DEC_DROPOUT = 0.3

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)

optimizer = optim.AdamW(model.parameters())
criterion = nn.CrossEntropyLoss()

def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, (src, trg) in enumerate(tqdmn(iterator, desc="Training", leave=False)):
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, (src, trg) in enumerate(tqdmn(iterator, desc="Evaluating", leave=False)):
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, 0)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

N_EPOCHS = 3
CLIP = 1
epoch=0

# for epoch in range(N_EPOCHS):
#     train_loss = train(model, train_loader, optimizer, criterion, CLIP)
#     valid_loss = evaluate(model, val_loader, criterion)
#     bleu_score = calculate_bleu(val_loader, model, vocab_tgt, device)
#     print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Val. Loss: {valid_loss:.3f}, BLEU: {bleu_score:.3f}')

In [41]:
train_loss = train(model, train_loader, optimizer, criterion, CLIP)
valid_loss = evaluate(model, val_loader, criterion)

In [40]:
print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Val. Loss: {valid_loss:.3f}')
epoch+=1

Epoch: 01, Train Loss: 1.833, Val. Loss: 3.233


In [79]:
def translate_test_set(model, test_loader, src_vocab, tgt_vocab, device):
    model.eval()
    translations = []
    trg_vocab_inv = {v: k for k, v in tgt_vocab.items()}  # Инвертируем словарь tgt_vocab для получения слов из индексов

    with torch.no_grad():
        for src_batch, _ in tqdmn(test_loader, desc="Translating Test Set"):
            src_batch = src_batch.to(device)
            hidden = model.encoder(src_batch)  # Кодируем весь батч

            for i in range(src_batch.size(1)):  # Переводим каждое предложение по отдельности
                src_sentence = src_batch[:, i].unsqueeze(1)  # Извлекаем i-ое предложение и добавляем размерность для батча
                sentence_hidden = hidden[:, i, :].unsqueeze(1).contiguous()  # Делаем тензор непрерывным

                input_token = torch.tensor([tgt_vocab["<bos>"]], device=device)
                translation = []

                for _ in range(50):  # Ограничиваем длину перевода 50 токенами
                    output, sentence_hidden = model.decoder(input_token, sentence_hidden)
                    top1 = output.argmax(1).item()

                    if top1 == tgt_vocab["<eos>"]:
                        break

                    translation.append(trg_vocab_inv[top1])
                    input_token = torch.tensor([top1], device=device)

                translations.append(" ".join(translation))

    return translations

# Использование функции
translations = translate_test_set(model, test_loader, vocab_src, vocab_tgt, device)

Translating Test Set:   0%|          | 0/32 [00:00<?, ?it/s]