# Первоначальная настройка ноутбука

## Импорт зависимостей

In [1]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from torch.utils.data import DataLoader, Dataset, random_split

import random
import numpy as np

from typing import Tuple
import spacy

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
!python -m spacy download ru_core_news_md

## Видеокарта + версия cuda

In [2]:
torch.__version__

'2.2.2+cu121'

In [None]:
!nvidia-smi

Sat Oct 19 20:39:33 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.51.01              Driver Version: 565.90         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti     On  |   00000000:01:00.0  On |                  N/A |
|  0%   44C    P8             21W /  200W |    1149MiB /   8192MiB |     28%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Установка seed-ов

In [3]:
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Подготовка переменных

In [4]:
NUM_EPOCHS = 10
EMBED_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5
BATCH_SIZE = 32

RU_DATASET_PATH = "1mcorpus/corpus.en_ru.1m.ru"
EN_DATASET_PATH = "1mcorpus/corpus.en_ru.1m.en"

SPACY_EN = spacy.load('en_core_web_sm')
SPACY_RU = spacy.load('ru_core_news_md')

In [5]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Обучение на {}".format(DEVICE))

Обучение на cuda


# Предобрабокта данных

## Загрузка данных

In [6]:
def load_data(path_to_en_data: str, path_to_ru_data: str) -> Tuple[list[str], list[str]]:
    '''Чтение данных'''
    with open(file=path_to_en_data, mode="r", encoding="utf-8") as en_file:
        en_dataset = en_file.readlines()
    with open(file=path_to_ru_data, mode="r", encoding="utf-8") as ru_file:
        ru_dataset = ru_file.readlines()

    return en_dataset[:20000], ru_dataset[:20000]

In [7]:
en_dataset, ru_dataset = load_data(path_to_en_data=EN_DATASET_PATH, path_to_ru_data=RU_DATASET_PATH)

## Подготовка текстовых данных для обучения

In [8]:
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

In [9]:
def tokenizer_en(sentences):
    tokenize_sentence = [sentence.text for sentence in SPACY_EN.tokenizer(sentences)]
    return tokenize_sentence

def tokenizer_ru(sentences):
    tokenize_sentence = [sentence.text for sentence in SPACY_RU.tokenizer(sentences)]
    return tokenize_sentence

def tokenize(data, tokenizer):
    return [[tokenizer(sentence)] for sentence in data]

In [10]:
en_tokenized = tokenize(en_dataset, tokenizer_en)
ru_tokenized = tokenize(ru_dataset, tokenizer_ru)

In [11]:
def data_generator(tokenized_en, tokenized_ru):
    for en_sentence, ru_sentence in zip(tokenized_en, tokenized_ru):
        yield en_sentence, ru_sentence

def build_vocab(tokenized_text):
    def yield_tokens(data_iter):
        for sentence in data_iter:
            yield sentence[0]
    return build_vocab_from_iterator(yield_tokens(tokenized_text), specials=["<unk>", "<pad>", "<sos>", "<eos>"])

def text_pipeline(text, vocab):
    return [vocab["<sos>"]] + [vocab[token] for token in text] + [vocab["<eos>"]]

In [12]:
vocab_en = build_vocab(en_tokenized)
vocab_ru = build_vocab(ru_tokenized)

In [13]:
def collate_fn(batch):
    en_batch, ru_batch = [], []
    for en_item, ru_item in batch:
        en_batch.append(torch.tensor(text_pipeline(en_item[0], vocab_en), dtype=torch.long))
        ru_batch.append(torch.tensor(text_pipeline(ru_item[0], vocab_ru), dtype=torch.long))

    en_batch = pad_sequence(en_batch, padding_value=vocab_en["<pad>"])
    ru_batch = pad_sequence(ru_batch, padding_value=vocab_ru["<pad>"])

    return en_batch, ru_batch

In [14]:
class TranslationDataset(Dataset):
    def __init__(self, en_sentences, ru_sentences):
        self.en_sentences = en_sentences
        self.ru_sentences = ru_sentences

    def __len__(self):
        return len(self.en_sentences)

    def __getitem__(self, idx):
        return self.en_sentences[idx], self.ru_sentences[idx]

In [15]:
dataset = TranslationDataset(en_tokenized, ru_tokenized)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_dataloader = DataLoader(dataset,
                              batch_size=BATCH_SIZE,
                              collate_fn=collate_fn)
valid_dataloader = DataLoader(test_dataset,
                              batch_size=1,
                              collate_fn=collate_fn)

In [16]:
sample = next(iter(train_dataloader))
print(sample[0], sample[1], sep='\n')

tensor([[    2,     2,     2,  ...,     2,     2,     2],
        [   76,    99,   209,  ...,    48,  3624,   165],
        [   80, 32027,    30,  ...,     4,    16,    13],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]])
tensor([[    2,     2,     2,  ...,     2,     2,     2],
        [ 3005,  5026,   198,  ...,    21, 41405,   754],
        [  388,   660,  1558,  ...,   112,  2418,   879],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]])


# Создание модели

In [17]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        return hidden

In [18]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        # input: [batch_size]
        input = input.unsqueeze(0)  # [1, batch_size]
        embedded = self.dropout(self.embedding(input))  # [1, batch_size, emb_dim]
        output, hidden = self.rnn(embedded, hidden)
        # output: [1, batch_size, hidden_dim]
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden

In [19]:
class Seq2Seq(pl.LightningModule):
    def __init__(self, encoder, decoder, pad_idx):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]
        trg_vocab_size = self.decoder.fc_out.out_features

        # Тензор для хранения предсказанных токенов
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        hidden = self.encoder(src)

        input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[t] = output

            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1

        return outputs

    def training_step(self, batch, batch_idx):
        src, trg = batch
        output = self(src, trg)

        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = nn.CrossEntropyLoss(ignore_index=self.pad_idx)(output, trg)
        self.log('train_loss', loss, batch_size=BATCH_SIZE)
        return loss

    def validation_step(self, batch, batch_idx):
        src, trg = batch
        output = self(src, trg, 0) 
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = nn.CrossEntropyLoss(ignore_index=self.pad_idx)(output, trg)
        self.log('val_loss', loss, batch_size=BATCH_SIZE)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

In [20]:
encoder = Encoder(input_dim=len(vocab_en),
                  emb_dim=EMBED_DIM,
                  hidden_dim=HIDDEN_DIM,
                  n_layers=N_LAYERS,
                  dropout=DROPOUT)

decoder = Decoder(output_dim=len(vocab_ru),
                  emb_dim=EMBED_DIM,
                  hidden_dim=HIDDEN_DIM,
                  n_layers=N_LAYERS,
                  dropout=DROPOUT
                 )

TRG_PAD_IDX = 1

model = Seq2Seq(encoder, decoder, TRG_PAD_IDX)

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(35691, 256)
    (rnn): GRU(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(72116, 256)
    (rnn): GRU(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=72116, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

# Обучение модели

In [None]:
trainer = Trainer(max_epochs=NUM_EPOCHS)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model, train_dataloader, valid_dataloader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | encoder | Encoder | 11.9 M | train
1 | decoder | Decoder | 58.2 M | train
--------------------------------------------
70.1 M    Trainable params
0         Non-trainable params
70.1 M    Total params
280.446   Total estimated model params size (MB)
9         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


In [None]:
torch.save(model.state_dict(), 'seq2seq_model_10_epochs.pth')

In [None]:
trainer.validate(model, valid_dataloader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': 6.580455303192139}]

## Загрузка модели

In [21]:
model.load_state_dict(torch.load("seq2seq_model_10_epochs.pth"))

<All keys matched successfully>

In [None]:
def translate_sentence(sentence, vocab_en, vocab_ru, model, max_len=50):
    model.eval()
    
    # Токенизация и числовое представление исходного предложения
    tokens = tokenizer_en(sentence) 
    numericalized = [vocab_en["<sos>"]] + [vocab_en[token] for token in tokens] + [vocab_en["<eos>"]]
    src_tensor = torch.tensor(numericalized, dtype=torch.long).unsqueeze(1).to(model.device)  # (len, 1)

    hidden = model.encoder(src_tensor)

    # Инициализация входного токена для декодера (токен <sos>)
    trg_indexes = [vocab_ru["<sos>"]]

    for _ in range(max_len):
        trg_tensor = torch.tensor([trg_indexes[-1]], dtype=torch.long).to(model.device)

        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden)

        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)

        # Если предсказан токен <eos>, то прерываем цикл
        if pred_token == vocab_ru["<eos>"]:
            break

    trg_tokens = [vocab_ru.get_itos()[i] for i in trg_indexes]

    # Возвращаем предсказанный перевод без токенов <sos> и <eos>
    return trg_tokens[1:-1]

sentence = "how are you"
translated_sentence = translate_sentence(sentence, vocab_en, vocab_ru, model)
print("Перевод:", " ".join(translated_sentence))


Перевод: Почему вы хотите ? ? 

