In [1]:
import os
os.environ['WANDB_MODE'] = 'offline'
import wandb
import pandas as pd
import torch
import numpy as np

from configs import config
from transformer import EncoderDecoder
from preprocessing import preprocessing
from metrics import LabelSmoothingLoss
from optimizer import NoamOpt
from transformer import fit

# Pick the compute device once. FloatTensor / LongTensor are imported from the
# matching namespace (torch.cuda or torch) depending on CUDA availability.
if torch.cuda.is_available():
    from torch.cuda import FloatTensor, LongTensor
    DEVICE = torch.device('cuda')
else:
    from torch import FloatTensor, LongTensor
    DEVICE = torch.device('cpu')

# Seed both NumPy and PyTorch. Seeding NumPy alone leaves PyTorch's random
# number generator unseeded, so torch-side randomness would differ between runs.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [2]:
# SECURITY: never paste the W&B API key into the notebook. The key that used to
# be hard-coded on this line is exposed to anyone who can read the notebook and
# should be revoked. The key is now taken from the WANDB_API_KEY environment
# variable; when it is unset, None is passed and wandb.login() falls back to its
# own default credential lookup.
wandb.login(key=os.environ.get('WANDB_API_KEY'))

wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\Vladlen\_netrc
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin


In [3]:
# Authenticate, then start the W&B run and record this experiment's hyperparameters.
wandb.login()
wandb.init(
    project="transformer-summarizer",
    config=dict(
        epochs=config['epochs'],
        batch_size=config['b_size_train'],
        learning_rate=config['learning_rate'],
        model="EncoderDecoder",
        d_model=config['d_model'],
        n_heads=config['n_heads'],
    ),
)

# !wandb sync "D:/05_Attention/05_Attention/seminar/wandb/offline-run-20250526_162725-2622dbq9"
# !wandb sync "D:/05_Attention/05_Attention/seminar/wandb/offline-run-*"



In [4]:
# Location of the news dataset. Set the NEWS_CSV_PATH environment variable to
# run the notebook on another machine; the default keeps the original path.
NEWS_CSV_PATH = os.environ.get(
    'NEWS_CSV_PATH',
    'D:/05_Attention/05_Attention/seminar/news.csv',
)

data = pd.read_csv(NEWS_CSV_PATH, delimiter=',')
# preprocessing() returns the train/test iterators, the text field and the embedding layer.
train_iter, test_iter, word_field, embedding_layer = preprocessing(data, tokenize='moses')
vocab_size = len(word_field.vocab)

  0%|          | 0/74278 [00:00<?, ?it/s]

Загрузка векторов:   0%|          | 0/61747 [00:00<?, ?it/s]

In [5]:
# Sanity check: show what kind of callable the field tokenizes with and how it
# splits a sample sentence (punctuation included).
tokenizer = word_field.tokenize
print("Tokenize type:", type(tokenizer))
print("Sample tokenization:", tokenizer("Пример текста дляю. , проверки"))

Tokenize type: <class 'method'>
Sample tokenization: ['Пример', 'текста', 'дляю', '.', ',', 'проверки']


In [6]:
# Special tokens whose vocabulary indices are inspected below.
BOS_TOKEN, EOS_TOKEN, PAD_TOKEN = '<s>', '</s>', '<pad>'

# Step 1: tokenize the sample sentence the way the Field is configured to.
tokens = word_field.preprocess("Пример текста для абоба 55 проверки")

# Step 2: turn the tokens into vocabulary indices. The token list is wrapped in
# another list so that it is processed as a batch of size 1.
indexed = word_field.process([tokens])
print(indexed)

# Look up the indices of the special tokens in the vocabulary.
token_to_index = word_field.vocab.stoi
bos_idx = token_to_index[BOS_TOKEN]
eos_idx = token_to_index[EOS_TOKEN]
pad_idx = token_to_index[PAD_TOKEN]

print("BOS index:", bos_idx)
print("EOS index:", eos_idx)
print("PAD index:", pad_idx)

# Map the indices of the single batch row back to tokens to verify the round trip.
indexed_list = indexed[0].tolist()
words = list(map(word_field.vocab.itos.__getitem__, indexed_list))
print(words)

tensor([[    2, 15006, 15061,    33,     0,  2016,  1037,     3]])
BOS index: 2
EOS index: 3
PAD index: 1
['<s>', 'пример', 'текста', 'для', '<unk>', '55', 'проверки', '</s>']


In [7]:
# Build the encoder-decoder model from the field and the embedding layer, then move it to DEVICE.
model = EncoderDecoder(word_field, embedding=embedding_layer).to(DEVICE)

self.d_model = 300


300
300
300
300
300
300
300
300
300
300
300
300
300
300
300
300
300
300
300
300


In [8]:

# Loss function: LabelSmoothingLoss is used in place of the earlier
# nn.CrossEntropyLoss(ignore_index=pad_idx).to(DEVICE).
pad_idx = word_field.vocab.stoi[word_field.pad_token]
criterion = LabelSmoothingLoss(vocab_size, padding_idx=pad_idx)
criterion = criterion.to(DEVICE)

In [9]:
optimizer = NoamOpt(model)
# Train for config['epochs'] epochs, the same value that is recorded as "epochs"
# in the W&B run config, so that the logged hyperparameter matches the actual
# training length instead of a separate hard-coded number.
fit(
    model,
    criterion,
    optimizer,
    train_iter,
    epochs_count=config['epochs'],
    val_iter=test_iter,
)

  0%|          | 0/3946 [00:00<?, ?it/s]

<class 'torchtext.data.batch.Batch'>
размеры source_mask, target_mask: (torch.Size([16, 1, 144]), torch.Size([16, 13, 13]))
Размер source_inputs torch.Size([16, 144])
Размер target_inputs torch.Size([16, 13])
torch.Size([16, 144]) torch.Size([16, 13]) torch.Size([16, 1, 144]) torch.Size([16, 13, 13])
размер после применения эмбеддинга: torch.Size([16, 144, 300])
EncoderBlock inputs size: torch.Size([16, 144, 300])
ResidualBlock input: torch.Size([16, 144, 300])
размеры query, key, value: torch.Size([16, 144, 300]), torch.Size([16, 144, 300]), torch.Size([16, 144, 300])
forward после transpose torch.Size([16, 144, 300])
torch.Size([16, 144, 300])
Sublayer output: torch.Size([16, 144, 300])
ResidualBlock input: torch.Size([16, 144, 300])
Sublayer output: torch.Size([16, 144, 300])
EncoderBlock inputs size: torch.Size([16, 144, 300])
ResidualBlock input: torch.Size([16, 144, 300])
размеры query, key, value: torch.Size([16, 144, 300]), torch.Size([16, 144, 300]), torch.Size([16, 144, 300])

KeyboardInterrupt: 