<a href="https://colab.research.google.com/github/KingsFrown/ml-course/blob/main/alien_translation_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torchtext transformers datasets tqdm

Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m42.5 MB/s[0m eta [36

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader
import torch
from tqdm.notebook import tqdm as tqdmn
from tqdm import tqdm

In [3]:
def load_jsonl(file_path):
    """Load a JSON Lines file into a single flat DataFrame.

    Every non-blank line is parsed as one record; nested fields are flattened
    into dotted column names by ``pd.json_normalize``.

    Args:
        file_path: Path of the file to read, one record per line.

    Returns:
        pandas.DataFrame with one row per record and a 0..n-1 index
        (an empty DataFrame when the file contains no records).

    Raises:
        ValueError or SyntaxError: if a line is neither valid JSON nor a
            Python literal.
    """
    # Function-scope imports keep this cell self-contained.
    import ast
    import json

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    records = []
    for line in tqdmn(lines, desc="Processing JSON Lines"):
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            # Fallback for lines written as Python literals (single quotes,
            # True/None). Unlike eval(), literal_eval never executes code.
            records.append(ast.literal_eval(line))

    # Normalising the whole list once avoids creating and concatenating
    # one single-row DataFrame per line.
    return pd.json_normalize(records)

In [4]:
# Read the file named 'train' into a DataFrame (one row per line of the file).
train_df = load_jsonl('train')

Processing JSON Lines:   0%|          | 0/300000 [00:00<?, ?it/s]

In [5]:
# Read the 'val' and 'test_no_reference' files with the same loader.
val_df = load_jsonl('val')
test_df = load_jsonl('test_no_reference')

Processing JSON Lines:   0%|          | 0/500 [00:00<?, ?it/s]

Processing JSON Lines:   0%|          | 0/1000 [00:00<?, ?it/s]

In [7]:
# Convert each DataFrame into a `datasets.Dataset`
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenizer loaded from the pretrained 't5-small' checkpoint
tokenizer = T5Tokenizer.from_pretrained('t5-small')

def preprocess_function(examples):
    """Tokenize a batch of examples for the model.

    The ``'src'`` column is always encoded. When a ``'dst'`` column is present
    it is encoded the same way and its token ids are attached to the result
    under ``"labels"``. Both sides are truncated and padded to 128 tokens.

    Args:
        examples: Mapping of column name to a batch of values; must contain
            ``'src'`` and may contain ``'dst'``.

    Returns:
        The tokenizer output for ``'src'``, plus ``"labels"`` if ``'dst'`` exists.
    """
    tokenizer_kwargs = dict(max_length=128, truncation=True, padding="max_length")
    has_targets = 'dst' in examples

    encoded = tokenizer(examples['src'], **tokenizer_kwargs)
    if has_targets:
        target_encoding = tokenizer(examples['dst'], **tokenizer_kwargs)
        encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Tokenize each split; batched=True makes `map` call preprocess_function on
# groups of examples rather than one example at a time.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Disabled draft: DataLoaders without a collate_fn. The active loaders are
# built in a later cell together with `collate_fn`.
# train_loader = DataLoader(tokenized_train, batch_size=2, shuffle=True)
# val_loader = DataLoader(tokenized_val, batch_size=2, shuffle=False)
# test_loader = DataLoader(tokenized_test, batch_size=2, shuffle=False)

Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [20]:
# DataLoader
def collate_fn(batch):
    """Stack the list-valued fields of a batch into tensors.

    The first item decides which keys are kept: only keys whose value is a
    list. All other fields (for example plain strings) are dropped.

    Args:
        batch: Non-empty list of dict-like examples sharing the same keys.

    Returns:
        dict mapping each kept key to a tensor built from that key's values
        across the batch.
    """
    first = batch[0]
    tensor_keys = [name for name in first if isinstance(first[name], list)]

    collated = {}
    for name in tensor_keys:
        collated[name] = torch.tensor([sample[name] for sample in batch])
    return collated

# Batches of 2 examples per split; only the training loader shuffles.
# collate_fn turns the list-valued columns of each batch into tensors.
train_loader = DataLoader(tokenized_train, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(tokenized_val, batch_size=2, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(tokenized_test, batch_size=2, shuffle=False, collate_fn=collate_fn)

In [24]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained 't5-small' checkpoint and move it to the chosen device.
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [25]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# AdamW over all model parameters with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)
# NOTE: `criterion` is passed to train()/evaluate() below, but both take the
# loss from the model output (`outputs.loss`) and never call it.
criterion = CrossEntropyLoss()

def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``loss`` attribute.
        dataloader: Sized iterable of batches; each batch is a dict of tensors
            with keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Unused; kept so existing callers keep working. The loss is
            the one computed by the model itself.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()
    # Labels arrive padded with the pad token. Hugging Face seq2seq models
    # only skip label positions equal to -100, so unmasked padding would be
    # scored as if it were real target text.
    pad_token_id = getattr(getattr(model, "config", None), "pad_token_id", None)
    epoch_loss = 0
    for batch in tqdmn(dataloader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        if pad_token_id is not None:
            # masked_fill returns a new tensor; the batch itself is not modified.
            labels = labels.masked_fill(labels == pad_token_id, -100)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Compute the mean batch loss over a dataloader without updating weights.

    Args:
        model: Model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``loss`` attribute.
        dataloader: Sized iterable of batches; each batch is a dict of tensors
            with keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        criterion: Unused; kept so existing callers keep working. The loss is
            the one computed by the model itself.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    # Labels arrive padded with the pad token. Hugging Face seq2seq models
    # only skip label positions equal to -100, so unmasked padding would
    # dominate the reported loss.
    pad_token_id = getattr(getattr(model, "config", None), "pad_token_id", None)
    epoch_loss = 0
    with torch.no_grad():
        for batch in tqdmn(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            if pad_token_id is not None:
                # masked_fill returns a new tensor; the batch itself is not modified.
                labels = labels.masked_fill(labels == pad_token_id, -100)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

# Number of full passes over the training loader.
N_EPOCHS = 3

# Each epoch: one training pass, then a loss evaluation on the validation loader.
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_loss = evaluate(model, val_loader, criterion, device)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Val. Loss: {val_loss:.3f}')

Training:   0%|          | 0/150000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu(model, dataloader, tokenizer, device, max_length=128):
    """Average smoothed sentence-level BLEU of generated text against references.

    For each batch the model generates output ids, predictions and labels are
    decoded to text, split on whitespace, and scored with ``sentence_bleu``
    (smoothing ``method4``).

    Args:
        model: Model exposing ``eval()`` and ``generate(input_ids=...,
            attention_mask=..., max_length=...)``.
        dataloader: Iterable of batches; each batch is a dict of tensors with
            keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        tokenizer: Object providing ``batch_decode`` and ``pad_token_id``.
        device: Device the batch tensors are moved to.
        max_length: Upper bound on generated length in tokens. The default of
            128 matches the cap used when the data was tokenized.

    Returns:
        Mean of the per-sentence BLEU scores, or 0.0 when no sentence was scored.
    """
    model.eval()
    bleu_scores = []
    smoothing_function = SmoothingFunction().method4
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    with torch.no_grad():
        for batch in tqdmn(dataloader, desc="Calculating BLEU"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # An explicit limit is needed: without one, generate() uses the
            # generation config's default length, which is much shorter than
            # the references and cuts hypotheses short.
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
            )
            if pad_token_id is not None:
                # -100 marks ignored label positions and is not a decodable
                # token id; map it back to the pad token before decoding.
                labels = labels.masked_fill(labels == -100, pad_token_id)
            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            for pred, true in zip(decoded_outputs, decoded_labels):
                reference = [true.split()]
                candidate = pred.split()
                bleu_score = sentence_bleu(reference, candidate, smoothing_function=smoothing_function)
                bleu_scores.append(bleu_score)
    if not bleu_scores:
        # Empty dataloader: avoid dividing by zero.
        return 0.0
    return sum(bleu_scores) / len(bleu_scores)

# Score the model's generations on the validation loader and report the mean BLEU.
bleu_score = calculate_bleu(model, val_loader, tokenizer, device)
print(f'BLEU Score on Validation Set: {bleu_score:.3f}')