# Imports

In [1]:
import os
import random
from dataclasses import dataclass

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments,  DataCollatorForSeq2Seq
from datasets import load_dataset
import sentencepiece

import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

mps


Завантаження та підготовка даних:

In [3]:
# Load Dataset
ds = load_dataset("Helsinki-NLP/europarl", "en-sk")

# Split Data
train_valid_split = ds['train'].train_test_split(test_size=0.2, seed=42)

train_data = train_valid_split['train']
valid_data = train_valid_split['test']

# Add these lines to reduce dataset size
train_data = train_data.select(range(5000))
valid_data = valid_data.select(range(1000))

In [4]:

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-sk")

# Define tokenization function
def tokenize_function(examples):
    source_texts = [item['en'] for item in examples['translation']]
    target_texts = [item['sk'] for item in examples['translation']]
    return tokenizer(
        source_texts, 
        text_target=target_texts, 
        truncation=True, 
        max_length=128
    )
    
train_tokenized = train_data.map(
    tokenize_function, 
    batched=True, 
    remove_columns=["translation"]
)

valid_tokenized = valid_data.map(
    tokenize_function, 
    batched=True, 
    remove_columns=["translation"]
)

Map: 100%|██████████| 5000/5000 [00:01<00:00, 4920.29 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 5353.74 examples/s]


In [5]:
# Create Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="Helsinki-NLP/opus-mt-en-sk")

# Create Data Loaders
train_dataloader = DataLoader(train_tokenized, shuffle=True, batch_size=16, collate_fn=data_collator)
valid_dataloader = DataLoader(valid_tokenized, batch_size=16, collate_fn=data_collator)
print(len(train_dataloader))

313


4. Побудова моделі:

In [6]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src: [batch_size, src_len]
        embedded = self.dropout(self.embedding(src))  # [batch_size, src_len, emb_dim]
        outputs, hidden = self.rnn(embedded)  # outputs: [batch_size, src_len, hid_dim]
        # hidden: [n_layers, batch_size, hid_dim]
        return outputs, hidden

In [7]:
class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden: [batch_size, hid_dim]
        # encoder_outputs: [batch_size, src_len, hid_dim]

        src_len = encoder_outputs.shape[1]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)  # [batch_size, src_len, hid_dim]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))  # [batch_size, src_len, hid_dim]
        attention = self.v(energy).squeeze(2)  # [batch_size, src_len]
        return torch.softmax(attention, dim=1)

In [8]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(hid_dim + emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim * 2 + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input: [batch_size]
        # hidden: [n_layers, batch_size, hid_dim]
        # encoder_outputs: [batch_size, src_len, hid_dim]

        input = input.unsqueeze(1)  # [batch_size, 1]
        embedded = self.dropout(self.embedding(input))  # [batch_size, 1, emb_dim]

        # Use the top layer hidden state for attention
        attn_weights = self.attention(hidden[-1, :, :], encoder_outputs)  # [batch_size, src_len]
        attn_weights = attn_weights.unsqueeze(1)  # [batch_size, 1, src_len]
        context = torch.bmm(attn_weights, encoder_outputs)  # [batch_size, 1, hid_dim]

        rnn_input = torch.cat((embedded, context), dim=2)  # [batch_size, 1, emb_dim + hid_dim]
        # Use hidden as is: shape [n_layers, batch_size, hid_dim]
        output, hidden = self.rnn(rnn_input, hidden)  # output: [batch_size, 1, hid_dim], hidden: [n_layers, batch_size, hid_dim]

        prediction = self.fc_out(torch.cat((output.squeeze(1), context.squeeze(1), embedded.squeeze(1)), dim=1))
        # prediction: [batch_size, output_dim]

        return prediction, hidden

In [9]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        encoder_outputs, hidden = self.encoder(src)

        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Define the start token ID safely
        start_token_id = tokenizer.bos_token_id
        if start_token_id is None:
            start_token_id = tokenizer.pad_token_id

        # Initialize the first decoder input token
        input = torch.full((batch_size,), start_token_id, dtype=torch.long, device=self.device)
    
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size, device=self.device)
    
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[:, t, :] = output
    
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
    
            # Get the target token at time-step t
            next_token = trg[:, t]
    
            # Replace any -100 indices (padding for labels) with the model's prediction (top1)
            next_token = torch.where(next_token == -100, top1, next_token)
    
            input = next_token if teacher_force else top1

        return outputs

In [10]:
INPUT_DIM = len(tokenizer.get_vocab())  # Vocabulary size for the source language
OUTPUT_DIM = len(tokenizer.get_vocab())  # Vocabulary size for the target language
EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5

# Initialize Components
attention = Attention(HID_DIM)
encoder = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT, attention)

# Combine into Seq2Seq Model
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

5. Визначення функції втрат та оптимізатора:

In [11]:
import torch.optim as optim

# Define Loss Function
criterion = nn.CrossEntropyLoss(ignore_index=-100)

# Define Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

6. Навчання моделі:

In [12]:
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    print("Train one epoch")
    model.train()  # Set the model to training mode
    epoch_loss = 0

    for i, batch in enumerate(dataloader):
        if i % 100 == 0:
            print(f"Processed {i} batches...")

        src = batch['input_ids'].to(device)       # Source tokens
        trg = batch['labels'].to(device)          # Target tokens

        # Reset gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(src, trg)  # output: [batch_size, trg_len, output_dim]

        # Reshape for loss calculation
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)  # Exclude <sos> token
        trg = trg[:, 1:].reshape(-1)  # Exclude <sos> token

        # Calculate loss
        loss = criterion(output, trg)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        # Accumulate loss
        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

In [13]:
def train_model(model, train_dataloader, valid_dataloader, optimizer, criterion, device, n_epochs):
    train_losses = []
    valid_losses = []

    for epoch in range(n_epochs):
        print(f'Epoch {epoch+1}/{n_epochs}')

        # Training phase
        train_loss = train_one_epoch(model, train_dataloader, optimizer, criterion, device)
        train_losses.append(train_loss)

        # Validation phase (no gradients)
        valid_loss = evaluate_model(model, valid_dataloader, criterion, device)
        valid_losses.append(valid_loss)

        print(f'Training Loss: {train_loss:.4f} | Validation Loss: {valid_loss:.4f}')
    
    return train_losses, valid_losses

In [14]:
def evaluate_model(model, dataloader, criterion, device):
    model.eval()  # Set the model to evaluation mode
    epoch_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch['input_ids'].to(device)
            trg = batch['labels'].to(device)

            output = model(src, trg, teacher_forcing_ratio=0)  # Disable teacher forcing for validation

            # Reshape for loss calculation
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)  # Exclude <sos> token
            trg = trg[:, 1:].reshape(-1)

            # Calculate loss
            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

In [15]:
# Hyperparameters
N_EPOCHS = 3

# Train the Model
train_losses, valid_losses = train_model(
    model=model,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    optimizer=optimizer,
    criterion=criterion,
    device=DEVICE,
    n_epochs=N_EPOCHS
)

# Save Losses for Analysis
loss_data = {'train_loss': train_losses, 'valid_loss': valid_losses}
torch.save(loss_data, 'loss_data.pth')

Epoch 1/3
Train one epoch
Processed 0 batches...
Processed 100 batches...
Processed 200 batches...
Processed 300 batches...
Training Loss: nan | Validation Loss: nan
Epoch 2/3
Train one epoch
Processed 0 batches...
Processed 100 batches...
Processed 200 batches...
Processed 300 batches...


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

def plot_losses(train_losses, valid_losses):
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(valid_losses, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.show()

plot_losses(train_losses, valid_losses)