<a href="https://colab.research.google.com/github/KingsFrown/ml-course/blob/main/alien_translation_seq2seq_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import ast
import json

import pandas as pd
import torch
import torch.nn as nn
from nltk.translate.bleu_score import sentence_bleu
from torch.amp import autocast, GradScaler
from torch.nn.utils import clip_grad_norm_
from torch.optim import AdamW, Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from transformers import AutoTokenizer  # For WordPiece tokenization

In [67]:
def load_jsonl(file_path):
    """Read a line-delimited record file into a flat ``pandas.DataFrame``.

    Each non-blank line is parsed as JSON. Lines that are not valid JSON but
    are valid Python literals (e.g. single-quoted dicts) are parsed with
    ``ast.literal_eval`` so files the previous ``eval``-based loader accepted
    keep loading, without executing arbitrary code from the data file.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        A DataFrame with one row per record and nested keys flattened by
        ``pandas.json_normalize``. An empty file yields an empty DataFrame.

    Raises:
        ValueError, SyntaxError: If a line is neither valid JSON nor a valid
            Python literal.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    records = []
    for line in tqdm(lines, desc="Processing JSON Lines"):
        line = line.strip()
        if not line:
            # Tolerate blank lines, e.g. a trailing newline at end of file.
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            record = ast.literal_eval(line)
        # A line holding a list of records contributes one row per element.
        if isinstance(record, list):
            records.extend(record)
        else:
            records.append(record)

    # Normalising all records in one call avoids building one DataFrame per
    # line and concatenating hundreds of thousands of them.
    return pd.json_normalize(records)

In [68]:
# Load the train / validation / test splits, in that order.
train_df, val_df, test_df = map(load_jsonl, ('train', 'val', 'test_no_reference'))

Processing JSON Lines:   0%|          | 0/300000 [00:00<?, ?it/s]

Processing JSON Lines:   0%|          | 0/500 [00:00<?, ?it/s]

Processing JSON Lines:   0%|          | 0/1000 [00:00<?, ?it/s]

In [69]:
# train_df = train_df.sample(frac=0.1, random_state=42)

In [70]:
# 1. Data preparation with WordPiece tokenization
def get_tokenizer(max_vocab_size=None):
    """Load the ``bert-base-uncased`` tokenizer, optionally with a capped vocabulary.

    Args:
        max_vocab_size: When truthy, the tokenizer is loaded a second time
            and handed the first ``max_vocab_size`` entries of the base
            vocabulary through the ``vocab`` keyword. ``None`` (or ``0``)
            returns the stock tokenizer.

    Returns:
        The tokenizer object returned by ``AutoTokenizer.from_pretrained``.
    """
    base = AutoTokenizer.from_pretrained("bert-base-uncased")
    if not max_vocab_size:
        return base

    # Keep the leading entries in whatever order ``get_vocab()`` yields them.
    # NOTE(review): it is not verified here that this order follows token ids,
    # nor that ``from_pretrained`` honours a ``vocab`` keyword for this
    # checkpoint - confirm that the reduction actually takes effect.
    vocab_items = list(base.get_vocab().items())
    reduced_vocab = dict(vocab_items[:max_vocab_size])

    # Reload the tokenizer, carrying over the base length and padding settings.
    return AutoTokenizer.from_pretrained(
        "bert-base-uncased",
        vocab=reduced_vocab,
        model_max_length=base.model_max_length,
        truncation=True,
        padding_side=base.padding_side,
    )

# Example usage: one shared tokenizer, requested with a vocabulary cap of 32000 entries
tokenizer = get_tokenizer(max_vocab_size=32000)

class TranslationDataset(Dataset):
    """Dataset producing fixed-length token-id tensors for translation pairs.

    Each item is ``(src_tokens, dst_tokens)``. ``dst_tokens`` is ``None`` when
    the DataFrame has no ``'dst'`` column (e.g. a test split without
    references).

    Args:
        df: DataFrame with a ``'src'`` column and optionally a ``'dst'`` column.
        src_tokenizer: Callable tokenizer applied to the source text.
        dst_tokenizer: Callable tokenizer applied to the target text.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, df, src_tokenizer, dst_tokenizer, max_len=50):
        self.df = df
        self.src_tokenizer = src_tokenizer
        self.dst_tokenizer = dst_tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def _encode(self, tokenize, text):
        """Tokenize ``text`` to ``max_len`` ids and drop the leading batch axis."""
        encoded = tokenize(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )
        return encoded.input_ids.squeeze(0)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        src_tokens = self._encode(self.src_tokenizer, row['src'])

        # Splits without references carry no 'dst' column.
        has_reference = 'dst' in self.df.columns
        dst_tokens = self._encode(self.dst_tokenizer, row['dst']) if has_reference else None

        return src_tokens, dst_tokens

# Build the datasets; source and target share the same tokenizer
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(frame, tokenizer, tokenizer)
    for frame in (train_df, val_df, test_df)
)

In [71]:
# Collate function: pads the samples of a batch to a common length
def collate_fn(batch):
    """Merge ``(src, dst)`` samples into padded batch tensors.

    Sequences are padded with the global tokenizer's pad token id up to the
    longest sequence in the batch. No bucketing by length is done here.

    Args:
        batch: Iterable of ``(src_tokens, dst_tokens)`` pairs; ``dst_tokens``
            may be ``None`` for samples without a reference.

    Returns:
        ``(src_batch, dst_batch)`` with batch-first tensors; ``dst_batch`` is
        ``None`` when the first sample has no target.
    """
    pad_id = tokenizer.pad_token_id
    sources, targets = zip(*batch)

    def pad(sequences):
        # Pad copies so the per-sample tensors are left untouched.
        copies = [item.clone().detach() for item in sequences]
        return torch.nn.utils.rnn.pad_sequence(copies, batch_first=True, padding_value=pad_id)

    padded_sources = pad(sources)
    if targets[0] is None:
        return padded_sources, None
    return padded_sources, pad(targets)

# DataLoaders: all three share the collate function, pinned memory and two workers
_loader_options = dict(collate_fn=collate_fn, pin_memory=True, num_workers=2)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, batch_size=16, **_loader_options)
test_loader = DataLoader(test_dataset, batch_size=1, **_loader_options)

In [72]:
# 2. Model definition
# Encoder: unidirectional GRU
class Encoder(nn.Module):
    """Embeds source token ids and runs them through a single batch-first GRU.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension (GRU input size).
        hidden_size: GRU hidden state size.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.gru = nn.GRU(input_size=embed_size, hidden_size=hidden_size, batch_first=True)

    def forward(self, x):
        """Return the GRU's ``(outputs, hidden)`` pair for the token ids ``x``."""
        return self.gru(self.embedding(x))

# Decoder: unidirectional GRU
class Decoder(nn.Module):
    """Single-step GRU decoder: one token id per sequence in, vocabulary logits out.

    Args:
        vocab_size: Embedding table size and width of the output logits.
        embed_size: Embedding dimension (GRU input size).
        hidden_size: GRU hidden state size.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.gru = nn.GRU(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden):
        """Decode one step.

        Args:
            x: Token ids, one per sequence in the batch.
            hidden: GRU hidden state carried over from the previous step.

        Returns:
            ``(logits, hidden)``: per-sequence vocabulary logits and the
            updated hidden state.
        """
        # Insert a length-1 time axis so the batch-first GRU sees one step.
        step_input = self.embedding(x.unsqueeze(1))
        step_output, next_hidden = self.gru(step_input, hidden)
        # Remove the time axis again before projecting to the vocabulary.
        logits = self.fc(step_output.squeeze(1))
        return logits, next_hidden

# Seq2Seq wrapper (no bidirectional encoder)
class Seq2Seq(nn.Module):
    """Encoder-decoder model that decodes step by step with optional teacher forcing.

    Args:
        encoder: Callable returning ``(outputs, hidden)`` for a source batch.
        decoder: Callable ``(token_ids, hidden) -> (logits, hidden)`` exposing
            ``embedding.num_embeddings`` as the target vocabulary size.
        device: Device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Produce logits for target positions ``1 .. trg_len - 1``.

        Args:
            src: Source token ids, batch first.
            trg: Target token ids, batch first; column 0 seeds the decoder.
            teacher_forcing_ratio: Per-step probability of feeding the
                reference token instead of the model's own greedy prediction.

        Returns:
            Tensor of shape ``(batch, trg_len, vocab)``; position 0 is never
            written and stays all zeros.
        """
        n_samples = src.size(0)
        n_steps = trg.size(1)
        vocab = self.decoder.embedding.num_embeddings

        outputs = torch.zeros(n_samples, n_steps, vocab).to(self.device)

        # Only the final encoder hidden state is used to initialise the decoder.
        _, hidden = self.encoder(src)

        decoder_input = trg[:, 0]
        for step in range(1, n_steps):
            step_logits, hidden = self.decoder(decoder_input, hidden)
            outputs[:, step, :] = step_logits
            greedy = step_logits.argmax(1)
            # One random draw per step decides the next input for the whole batch.
            if torch.rand(1).item() < teacher_forcing_ratio:
                decoder_input = trg[:, step]
            else:
                decoder_input = greedy

        return outputs

In [73]:
# 3. Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reduced model hyperparameters (embedding was 256, hidden was 512)
embed_size, hidden_size = 128, 256
# Reduced maximum sequence length
# NOTE(review): this value is not passed to the datasets in the code visible
# here; they rely on their own default - confirm the two stay in sync.
max_len = 50

# Encoder and decoder share the tokenizer's vocabulary size.
vocab_size = len(tokenizer)
encoder = Encoder(vocab_size, embed_size, hidden_size).to(device)
decoder = Decoder(vocab_size, embed_size, hidden_size).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = AdamW(model.parameters(), lr=0.001)
# Padding positions do not contribute to the loss.
pad_id = tokenizer.pad_token_id
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
scaler = GradScaler('cuda')
# Halve the learning rate after 2 epochs without improvement of the monitored value.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

In [74]:
# 4. BLEU computation
def calculate_bleu(model, val_loader, tokenizer, device):
    """Return the mean sentence-level BLEU of greedy decodes over ``val_loader``.

    The model is put in eval mode and run without teacher forcing. Each
    prediction is cut at the first end marker (``tokenizer.sep_token_id``,
    when the tokenizer defines one) so tokens generated after the end of the
    sentence do not count against the hypothesis.

    Args:
        model: Callable ``model(src, trg, teacher_forcing_ratio=...)``
            returning logits of shape ``(batch, trg_len, vocab)``.
        val_loader: Iterable of ``(src, trg)`` tensor batches.
        tokenizer: Object providing ``decode(ids, skip_special_tokens=True)``.
        device: Device the batches are moved to.

    Returns:
        Average ``sentence_bleu`` over all sentences, or ``0.0`` when the
        loader yields no sentences.
    """
    model.eval()
    bleu_scores = []
    eos_id = getattr(tokenizer, 'sep_token_id', None)

    with torch.no_grad():
        for src, trg in val_loader:
            src, trg = src.to(device), trg.to(device)
            outputs = model(src, trg, teacher_forcing_ratio=0)  # no teacher forcing
            # Position 0 of the logits is an all-zero placeholder that is
            # never predicted, so it is excluded before taking the argmax.
            predicted_ids = outputs[:, 1:].argmax(-1).cpu().tolist()
            reference_ids = trg.cpu().tolist()

            for hyp_ids, ref_ids in zip(predicted_ids, reference_ids):
                # Keep only what was generated before the end marker.
                if eos_id is not None and eos_id in hyp_ids:
                    hyp_ids = hyp_ids[:hyp_ids.index(eos_id)]
                pred_tokens = tokenizer.decode(hyp_ids, skip_special_tokens=True)
                trg_tokens = tokenizer.decode(ref_ids, skip_special_tokens=True)
                bleu_scores.append(sentence_bleu([trg_tokens.split()], pred_tokens.split()))

    if not bleu_scores:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return sum(bleu_scores) / len(bleu_scores)

In [None]:
# 5. Training
n_epochs = 3
for epoch in range(n_epochs):
    model.train()
    epoch_loss = 0
    # Teacher forcing decays by 0.05 per epoch, starting at 0.5 with a floor of 0.1.
    teacher_forcing_ratio = max(0.5 - epoch * 0.05, 0.1)

    for src, trg in tqdm(train_loader):
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()

        with autocast('cuda'):
            output = model(src, trg, teacher_forcing_ratio)
            # Position 0 is the start token and is never predicted; skip it.
            logits = output[:, 1:].reshape(-1, output.shape[-1])
            targets = trg[:, 1:].reshape(-1)
            loss = criterion(logits, targets)

        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale at this point.
        # Unscale them first so that max_norm applies to the true gradient
        # norm; scaler.step() will not unscale a second time.
        scaler.unscale_(optimizer)
        clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()

    val_loss = 0
    model.eval()
    with torch.no_grad():
        for src, trg in val_loader:
            src, trg = src.to(device), trg.to(device)
            with autocast('cuda'):
                output = model(src, trg, 0)  # no teacher forcing during validation
                logits = output[:, 1:].reshape(-1, output.shape[-1])
                targets = trg[:, 1:].reshape(-1)
                loss = criterion(logits, targets)
                val_loss += loss.item()

    # The scheduler monitors the summed validation loss; the printed value is the batch mean.
    scheduler.step(val_loss)
    bleu_score = calculate_bleu(model, val_loader, tokenizer, device)
    print(f"Epoch {epoch + 1}, Train Loss: {epoch_loss / len(train_loader):.4f}, "
          f"Val Loss: {val_loss / len(val_loader):.4f}, BLEU: {bleu_score:.4f}")

  0%|          | 0/9375 [00:00<?, ?it/s]

In [63]:
def predict_translation(model, test_loader, tokenizer, device, max_len=50):
    """Greedily decode a translation for every source sequence in ``test_loader``.

    Decoding starts from ``tokenizer.cls_token_id`` and proceeds one token at
    a time. A batch stops as soon as every sequence in it has produced
    ``tokenizer.sep_token_id`` at some step, or after ``max_len`` steps. Each
    prediction is cut at its own first end marker, so text generated while
    waiting for other sequences in the batch is discarded.

    Args:
        model: Object exposing ``eval()``, ``encoder(src) -> (outputs, hidden)``
            and ``decoder(token_ids, hidden) -> (logits, hidden)``.
        test_loader: Iterable of ``(src, _)`` batches; the second item is ignored.
        tokenizer: Object providing ``cls_token_id``, ``sep_token_id`` and
            ``decode(ids, skip_special_tokens=True)``.
        device: Device the source batches and decoder inputs are placed on.
        max_len: Maximum number of tokens generated per sequence.

    Returns:
        List of decoded strings, one per source sequence, in loader order.
    """
    model.eval()
    predictions = []
    sos_id = tokenizer.cls_token_id
    eos_id = tokenizer.sep_token_id

    with torch.no_grad():
        for src, _ in test_loader:  # the test split has no reference translation
            src = src.to(device)
            batch_size = src.size(0)

            # Only the final encoder hidden state seeds the decoder.
            _, hidden = model.encoder(src)

            input_token = torch.full((batch_size,), sos_id, dtype=torch.long, device=device)
            finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
            generated = []

            for _step in range(max_len):
                logits, hidden = model.decoder(input_token, hidden)
                input_token = logits.argmax(1)  # greedy choice
                generated.append(input_token)

                # Remember which sequences have already ended, so that the
                # batch stops even when they end on different steps.
                finished |= input_token == eos_id
                if finished.all():
                    break

            if generated:
                sequences = torch.stack(generated, dim=1).cpu().tolist()
            else:
                # max_len <= 0: nothing was generated for this batch.
                sequences = [[] for _ in range(batch_size)]

            for token_ids in sequences:
                # Drop the end marker and everything generated after it.
                if eos_id in token_ids:
                    token_ids = token_ids[:token_ids.index(eos_id)]
                predictions.append(tokenizer.decode(token_ids, skip_special_tokens=True))

    return predictions

# Example usage: generate translations for the whole test split
test_predictions = predict_translation(model, test_loader, tokenizer, device)

In [65]:
# Display the first generated translation as a quick sanity check
test_predictions[0]

"i ' ' t, i ' ' t, but i ' ' t..."