<a href="https://colab.research.google.com/github/KingsFrown/ml-course/blob/main/alien_translation_seq2seq_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import ast
import json

import pandas as pd
import torch
import torch.nn as nn
from nltk.translate.bleu_score import sentence_bleu
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from torch.amp import autocast, GradScaler
from torch.nn.utils import clip_grad_norm_
from torch.optim import AdamW, Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from transformers import PreTrainedTokenizerFast

In [2]:
def load_jsonl(file_path):
    """Load a JSON Lines file into a flat pandas DataFrame.

    Each non-blank line is parsed as one record. Lines are parsed with
    ``json.loads``; a line that is not valid JSON falls back to
    ``ast.literal_eval`` so files written as Python dict literals keep
    loading. Neither parser executes code, unlike the ``eval`` call this
    replaces.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        A DataFrame with one row per record, nested keys flattened by
        ``pd.json_normalize``. An empty file yields an empty DataFrame.

    Raises:
        ValueError, SyntaxError: If a line is neither valid JSON nor a
            Python literal.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    records = []
    for line in tqdm(lines, desc="Processing JSON Lines"):
        line = line.strip()
        if not line:
            continue  # tolerate blank lines, e.g. a trailing newline
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            records.append(ast.literal_eval(line))

    # Normalize all records in one call instead of building one tiny frame per
    # line and concatenating several hundred thousand of them.
    return pd.json_normalize(records)

In [3]:
# Read the three data splits, in order: train, validation, and the test split
# stored under the name 'test_no_reference'.
train_df_full, val_df, test_df = (
    load_jsonl(split_path)
    for split_path in ('train', 'val', 'test_no_reference')
)

Processing JSON Lines:   0%|          | 0/300000 [00:00<?, ?it/s]

Processing JSON Lines:   0%|          | 0/500 [00:00<?, ?it/s]

Processing JSON Lines:   0%|          | 0/1000 [00:00<?, ?it/s]

In [51]:
# Train on the full training set. For quicker experiments, swap in the
# reproducible 10% subsample on the next line instead:
# train_df = train_df_full.sample(frac=0.1, random_state=42)
train_df = train_df_full

In [5]:
# Dump the source sentences, one per line, as the tokenizer-training corpus.
# A plain newline join is used instead of Series.to_string(): that method is a
# display formatter (it pads rows for alignment and escapes control characters),
# which bloats the file and can inject stray characters into the corpus.
# UTF-8 is requested explicitly so the file does not depend on the platform's
# default encoding.
with open('train_src.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(train_df_full.src.astype(str)))

In [6]:
# Dump the target sentences, one per line, as the tokenizer-training corpus.
# A plain newline join is used instead of Series.to_string(): that method is a
# display formatter (it pads rows for alignment and escapes control characters),
# which bloats the file and can inject stray characters into the corpus.
# UTF-8 is requested explicitly so the file does not depend on the platform's
# default encoding.
with open('train_dst.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(train_df_full.dst.astype(str)))

In [52]:
# Vocabulary size requested from both WordPiece trainers.
vocab_size = 32_000

In [53]:
# Source-side WordPiece tokenizer, trained from scratch on train_src.txt.
special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"]
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
trainer = trainers.WordPieceTrainer(
    vocab_size=vocab_size,
    special_tokens=special_tokens,
)
tokenizer.train(["train_src.txt"], trainer=trainer)

# Post-processing wraps every encoded sequence as [SOS] ... [EOS].
sos_token_id, eos_token_id = (
    tokenizer.token_to_id(token) for token in ("[SOS]", "[EOS]")
)
tokenizer.post_processor = processors.TemplateProcessing(
    single="[SOS]:0 $A:0 [EOS]:0",
    pair="[SOS]:0 $A:0 [EOS]:0 $B:1 [EOS]:1",
    special_tokens=[("[SOS]", sos_token_id), ("[EOS]", eos_token_id)],
)
tokenizer.decoder = decoders.WordPiece(prefix="##")

# Wrap the trained tokenizer in the transformers fast-tokenizer interface;
# [SOS]/[EOS] are registered under the cls/sep special-token slots.
src_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[SOS]",
    sep_token="[EOS]",
)

In [54]:
# Target-side WordPiece tokenizer, trained from scratch on train_dst.txt.
special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"]
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
trainer = trainers.WordPieceTrainer(
    vocab_size=vocab_size,
    special_tokens=special_tokens,
)
tokenizer.train(["train_dst.txt"], trainer=trainer)

# Post-processing wraps every encoded sequence as [SOS] ... [EOS].
sos_token_id, eos_token_id = (
    tokenizer.token_to_id(token) for token in ("[SOS]", "[EOS]")
)
tokenizer.post_processor = processors.TemplateProcessing(
    single="[SOS]:0 $A:0 [EOS]:0",
    pair="[SOS]:0 $A:0 [EOS]:0 $B:1 [EOS]:1",
    special_tokens=[("[SOS]", sos_token_id), ("[EOS]", eos_token_id)],
)
tokenizer.decoder = decoders.WordPiece(prefix="##")

# Wrap the trained tokenizer in the transformers fast-tokenizer interface;
# [SOS]/[EOS] are registered under the cls/sep special-token slots.
dst_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[SOS]",
    sep_token="[EOS]",
)

In [59]:
class TranslationDataset(Dataset):
    """Pairs of tokenized source/target sentences backed by a DataFrame.

    Args:
        df: DataFrame with a 'src' column and, optionally, a 'dst' column.
        src_tokenizer: Callable applied to each source string; its result
            must expose ``input_ids`` (called with ``return_tensors='pt'``).
        dst_tokenizer: Same, for target strings.

    Each item is ``(src_tokens, dst_tokens)``, where both are ``input_ids``
    with the leading dimension squeezed away. ``dst_tokens`` is ``None`` when
    the frame has no 'dst' column.
    """

    def __init__(self, df, src_tokenizer, dst_tokenizer):
        self.df = df
        self.src_tokenizer = src_tokenizer
        self.dst_tokenizer = dst_tokenizer
        # Extract the text columns once. Looking rows up with df.iloc[idx]
        # materializes a whole row Series on every access, which dominates
        # the cost when the dataset is scanned item by item.
        self._src_texts = df['src'].tolist()
        self._dst_texts = df['dst'].tolist() if 'dst' in df.columns else None

    def __len__(self):
        return len(self._src_texts)

    @staticmethod
    def _encode(tokenizer, text):
        """Tokenize one string and drop the leading dimension of input_ids."""
        return tokenizer(text, return_tensors='pt').input_ids.squeeze(0)

    def __getitem__(self, idx):
        src_tokens = self._encode(self.src_tokenizer, self._src_texts[idx])

        if self._dst_texts is None:
            # No 'dst' column: no reference translation to return.
            dst_tokens = None
        else:
            dst_tokens = self._encode(self.dst_tokenizer, self._dst_texts[idx])

        return src_tokens, dst_tokens

# Build one dataset per split; all three share the same pair of tokenizers.
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(split_df, src_tokenizer, dst_tokenizer)
    for split_df in (train_df, val_df, test_df)
)

In [60]:
# Padding id used by collate_fn for BOTH source and target batches, and as the
# loss' ignore_index. It is read from the source tokenizer, so verify the
# target tokenizer assigns [PAD] the same id; otherwise target padding would
# be encoded as a real token and counted in the loss.
pad_token_id = src_tokenizer.pad_token_id
assert pad_token_id == dst_tokenizer.pad_token_id, (
    "src and dst tokenizers must share the same [PAD] id"
)

In [61]:
def sort_by_length(dataset):
    """Materialize a dataset as a list ordered by source length.

    Args:
        dataset: Indexable object with ``len()`` whose items are sequences
            (e.g. ``(src, dst)`` tuples); the first element of each item
            must itself support ``len()``.

    Returns:
        A new list of the dataset's items sorted by ``len(item[0])``,
        ascending. Items of equal length keep their original relative order.
    """
    # Fetch every item exactly once. Sorting indices with a key that indexes
    # the dataset and then indexing again to build the result would produce
    # (for a tokenizing dataset: re-tokenize) each item twice.
    items = [dataset[i] for i in range(len(dataset))]
    items.sort(key=lambda item: len(item[0]))  # list.sort is stable
    return items

In [62]:
# Replace the train/val datasets with length-sorted lists of their items
# (train first, then val).
train_dataset, val_dataset = map(sort_by_length, (train_dataset, val_dataset))

In [63]:
def collate_fn(batch):
    """Pad a list of ``(src, dst)`` items into batch tensors.

    Args:
        batch: List of ``(src_tokens, dst_tokens)`` pairs of 1-D tensors.
            ``dst_tokens`` may be ``None``; only the first item is
            inspected to decide whether targets are present.

    Returns:
        ``(src_batch, dst_batch)``, each of shape (batch, longest_sequence),
        right-padded with the module-level ``pad_token_id``. ``dst_batch``
        is ``None`` when the batch carries no targets.
    """
    src_seqs, dst_seqs = zip(*batch)

    # pad_sequence writes into a freshly allocated tensor, so the inputs do
    # not need to be cloned first.
    src_batch = torch.nn.utils.rnn.pad_sequence(
        list(src_seqs), batch_first=True, padding_value=pad_token_id
    )
    if dst_seqs[0] is None:
        return src_batch, None

    dst_batch = torch.nn.utils.rnn.pad_sequence(
        list(dst_seqs), batch_first=True, padding_value=pad_token_id
    )
    return src_batch, dst_batch

# DataLoaders.
# train_dataset is sorted by source length, so consecutive items form batches
# of similar-length sequences (little padding). Iterating it in order would
# feed the model the exact same short-to-long sequence of batches every epoch,
# so the training loader keeps those length-homogeneous batches but visits
# them in a fresh random order on each pass.
class _ShuffledBatches:
    """Batch sampler: consecutive index chunks, yielded in random order."""

    def __init__(self, num_items, batch_size):
        self.batches = [
            list(range(start, min(start + batch_size, num_items)))
            for start in range(0, num_items, batch_size)
        ]

    def __iter__(self):
        # New permutation of the batch order every time the loader is iterated.
        for batch_idx in torch.randperm(len(self.batches)).tolist():
            yield self.batches[batch_idx]

    def __len__(self):
        return len(self.batches)


train_loader = DataLoader(
    train_dataset,
    batch_sampler=_ShuffledBatches(len(train_dataset), batch_size=32),
    collate_fn=collate_fn,
)
# Evaluation loaders keep a fixed order; the test loader yields one sentence at a time.
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn)

In [64]:
# 2. Model definition
# Encoder: unidirectional GRU with dropout and a configurable number of layers
class Encoder(nn.Module):
    """Embed source token ids and run them through a batch-first GRU.

    ``dropout`` is only forwarded to the GRU when ``num_layers > 1``;
    with a single layer the GRU is built with dropout 0.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, dropout=0.0):
        super().__init__()
        gru_dropout = dropout if num_layers > 1 else 0
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(
            embed_size,
            hidden_size,
            num_layers=num_layers,
            dropout=gru_dropout,
            batch_first=True,
        )

    def forward(self, x):
        """Return the GRU's ``(outputs, hidden)`` pair for token ids ``x``."""
        token_vectors = self.embedding(x)
        return self.gru(token_vectors)

# Decoder: unidirectional GRU with dropout and a configurable number of layers
class Decoder(nn.Module):
    """One-step GRU decoder: previous token ids in, next-token logits out.

    ``dropout`` is only forwarded to the GRU when ``num_layers > 1``;
    with a single layer the GRU is built with dropout 0.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, dropout=0.0):
        super().__init__()
        gru_dropout = dropout if num_layers > 1 else 0
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(
            embed_size,
            hidden_size,
            num_layers=num_layers,
            dropout=gru_dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        """Advance the decoder by one step and return ``(logits, hidden)``."""
        # Insert a length-1 sequence axis at dim 1 so the batch-first GRU
        # receives a single time step per batch element.
        step_input = self.embedding(x.unsqueeze(dim=1))
        step_output, hidden = self.gru(step_input, hidden)
        # Remove that sequence axis again before the vocabulary projection.
        return self.fc(step_output.squeeze(dim=1)), hidden

# Seq2Seq model
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: Module whose forward returns ``(outputs, hidden)``.
        decoder: Module exposing ``embedding.num_embeddings`` whose forward
            maps ``(token_ids, hidden)`` to ``(logits, hidden)``.
        device: Stored as ``self.device`` for callers; the forward pass
            allocates on the device of its input instead.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.size(1) - 1`` steps and return the per-step logits.

        Args:
            src: Source token ids, batch first.
            trg: Target token ids, batch first. Column 0 is fed to the
                decoder as the first input; later columns are used as
                teacher-forced inputs and also fix the number of steps.
            teacher_forcing_ratio: Probability, drawn independently at each
                step, of feeding the ground-truth token instead of the
                model's own argmax prediction.

        Returns:
            Float tensor of shape (batch, trg_len, target_vocab). Position 0
            is never predicted and stays all zeros.
        """
        batch_size = src.size(0)
        trg_len = trg.size(1)
        trg_vocab_size = self.decoder.embedding.num_embeddings

        # Allocate the output buffer directly on the input's device rather
        # than creating it on CPU and copying it over on every forward pass.
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size, device=src.device)

        # Only the encoder's final hidden state is used to seed the decoder.
        _, hidden = self.encoder(src)

        # First decoder input: the first target column (the start token).
        decoder_input = trg[:, 0]

        for t in range(1, trg_len):
            step_logits, hidden = self.decoder(decoder_input, hidden)
            outputs[:, t, :] = step_logits
            # One draw per step decides for the whole batch whether the next
            # input is the ground truth or the greedy prediction.
            use_teacher = torch.rand(1).item() < teacher_forcing_ratio
            decoder_input = trg[:, t] if use_teacher else step_logits.argmax(1)

        return outputs

In [65]:
# 3. Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# vocab_size is reused as defined earlier (the value passed to both WordPiece
# trainers), so the embedding tables cover every id the tokenizers can emit.
embed_size = 256
hidden_size = 512
num_layers = 2
dropout = 0.3

encoder = Encoder(vocab_size, embed_size, hidden_size, num_layers=num_layers, dropout=dropout)
decoder = Decoder(vocab_size, embed_size, hidden_size, num_layers=num_layers, dropout=dropout)
model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = AdamW(model.parameters(), lr=0.005)
# Padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
# Loss scaling only applies to CUDA mixed precision; on the CPU fallback the
# scaler is created disabled so its calls become pass-throughs.
scaler = GradScaler('cuda', enabled=device.type == 'cuda')
# Halve the learning rate once the monitored value has not improved for more
# than `patience` consecutive scheduler steps.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

In [66]:
# 4. BLEU computation
def calculate_bleu(model, val_loader, tokenizer, device):
    """Average sentence-level BLEU of greedy decodes over a loader.

    Args:
        model: Called as ``model(src, trg, teacher_forcing_ratio=0)``; must
            return logits of shape (batch, trg_len, vocab).
        val_loader: Iterable of ``(src, trg)`` tensor batches.
        tokenizer: Target-side tokenizer providing
            ``decode(ids, skip_special_tokens=True)``. If it exposes
            ``sep_token_id`` (or ``eos_token_id``), that id marks the end of
            a prediction.
        device: Device the batches are moved to.

    Returns:
        Mean ``sentence_bleu`` over all sentences, or 0.0 if the loader
        yields nothing. The model's train/eval mode is restored on exit.
    """
    was_training = model.training
    model.eval()

    # This notebook registers [EOS] as the tokenizer's sep token; fall back to
    # eos_token_id for tokenizers configured the conventional way.
    eos_id = getattr(tokenizer, "sep_token_id", None)
    if eos_id is None:
        eos_id = getattr(tokenizer, "eos_token_id", None)

    bleu_scores = []
    try:
        with torch.no_grad():
            for src, trg in val_loader:
                src, trg = src.to(device), trg.to(device)
                logits = model(src, trg, teacher_forcing_ratio=0)  # no teacher forcing
                # Position 0 holds no prediction (the Seq2Seq model in this
                # notebook starts writing outputs at t=1), so skip it.
                predictions = logits[:, 1:].argmax(-1).cpu().tolist()
                references = trg.cpu().tolist()

                for pred_ids, ref_ids in zip(predictions, references):
                    # The model is decoded for the full target length; whatever
                    # it emits after its first EOS is not part of the
                    # hypothesis and must not be scored.
                    if eos_id is not None and eos_id in pred_ids:
                        pred_ids = pred_ids[:pred_ids.index(eos_id)]
                    pred_text = tokenizer.decode(pred_ids, skip_special_tokens=True)
                    ref_text = tokenizer.decode(ref_ids, skip_special_tokens=True)
                    bleu_scores.append(sentence_bleu([ref_text.split()], pred_text.split()))
    finally:
        model.train(was_training)

    if not bleu_scores:
        return 0.0
    return sum(bleu_scores) / len(bleu_scores)

In [68]:
# Epoch counter. The training cell reads it (teacher-forcing schedule, log
# line) and increments it at the end of every run.
epoch = 0

In [None]:
# 5. Training — each execution of this cell runs one epoch
# for epoch in range(n_epochs):
model.train()
epoch_loss = 0
# Teacher forcing decays by 0.05 per epoch from 0.5 down to a floor of 0.1.
teacher_forcing_ratio = max(0.5 - epoch * 0.05, 0.1)

for src, trg in tqdm(train_loader):
    src, trg = src.to(device), trg.to(device)
    optimizer.zero_grad()

    with autocast('cuda'):
        output = model(src, trg, teacher_forcing_ratio)
        # Drop position 0 (the start token, never predicted) and flatten to
        # (batch * steps, vocab) vs (batch * steps,) for the loss.
        output = output[:, 1:].reshape(-1, output.shape[-1])
        trg = trg[:, 1:].reshape(-1)
        loss = criterion(output, trg)

    scaler.scale(loss).backward()
    # The gradients are still multiplied by the loss scale at this point.
    # Unscale them first so max_norm=1.0 is applied to the true gradient norm;
    # scaler.step() knows they were already unscaled and will not do it again.
    scaler.unscale_(optimizer)
    clip_grad_norm_(model.parameters(), max_norm=1.0)
    scaler.step(optimizer)
    scaler.update()

    epoch_loss += loss.item()

val_loss = 0
model.eval()
with torch.no_grad():
    for src, trg in val_loader:
        src, trg = src.to(device), trg.to(device)
        with autocast('cuda'):
            output = model(src, trg, 0)  # no teacher forcing during validation
            output = output[:, 1:].reshape(-1, output.shape[-1])
            trg = trg[:, 1:].reshape(-1)
            loss = criterion(output, trg)
            val_loss += loss.item()

# The scheduler monitors the summed validation loss.
scheduler.step(val_loss)
bleu_score = calculate_bleu(model, val_loader, dst_tokenizer, device)
print(f"Epoch {epoch + 1}, Train Loss: {epoch_loss / len(train_loader):.4f}, "
      f"Val Loss: {val_loss / len(val_loader):.4f}, BLEU: {bleu_score:.4f}")

epoch+=1

  0%|          | 0/9375 [00:00<?, ?it/s]

In [None]:
# Persist only the model's state_dict (not the model object) to my_model.pth.
torch.save(model.state_dict(), 'my_model.pth')

In [None]:
# Reload the saved weights into the existing model.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on GPU also loads on a CPU-only runtime; weights_only
# restricts unpickling to tensors and plain containers.
loaded_state_dict = torch.load('my_model.pth', map_location=device, weights_only=True)
model.load_state_dict(loaded_state_dict)