# 라이브러리

In [None]:
pip install datasets
python -m pip install --upgrade pip
pip install torch
pip install transformers

pip install protobuf#???? 설치해야하나?
pip install sentencepiece
pip install nltk

# 데이터 처리

In [1]:
print('hi')

hi


In [2]:
import torch
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader, DistributedSampler


#토크나이저
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

tokenizer.src_lang = 'en_XX'
tokenizer.tgt_lang='de_DE'
#데이터셋 로드
ds = load_dataset("bentrevett/multi30k")

# 데이터셋 클래스 정의
class TranslationDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_len=128):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):#소스,타겟문장 추출
        src_text = self.dataset[idx]['en']
        tgt_text = self.dataset[idx]['de']

        src_tokens = self.tokenizer(src_text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors="pt", add_special_tokens=True)
        tgt_tokens = self.tokenizer(tgt_text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors="pt", add_special_tokens=True)

        return {
            'src_input_ids': src_tokens['input_ids'].squeeze(),
            'tgt_input_ids': tgt_tokens['input_ids'].squeeze(),
            'src_attention_mask': src_tokens['attention_mask'].squeeze(),
            'tgt_attention_mask': tgt_tokens['attention_mask'].squeeze()
        }


#데이터셋-> 전처리된 데이터셋으로 변환
train_dataset = TranslationDataset(ds['train'],tokenizer)
val_dataset = TranslationDataset(ds['validation'],tokenizer)
test_dataset = TranslationDataset(ds['test'],tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)


#실제 토큰화된 결과 확인
for batch in train_dataset:
  print(batch)
  break
#<BOS> 250004 / <EOS> 2 / <PAD> 1



{'src_input_ids': tensor([250004,  32964,  27150,      4,  22392,  11280,      7,    621,  50782,
         43573,   5941,    373, 114942,      5,      2,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1

# Transformer

In [3]:

from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 단어 순서 개념(notion)을 토큰 임베딩에 도입하기 위한 위치 인코딩(positional encoding)을 위한 헬퍼 모듈(Module)
class ParallelSeq2SeqTransformer(nn.Module):
    def __init__(self, num_layers: int, emb_size: int, nhead: int, src_vocab_size: int, tgt_vocab_size: int, dim_feedforward: int = 512, dropout: float = 0.1):
        super(ParallelSeq2SeqTransformer, self).__init__()
        self.num_layers = num_layers
        self.enc_layers = nn.ModuleList([TransformerEncoderLayer(emb_size, nhead, dim_feedforward, dropout) for _ in range(num_layers)])
        self.dec_layers = nn.ModuleList([TransformerDecoderLayer(emb_size, nhead, dim_feedforward, dropout) for _ in range(num_layers)])
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout)

    def forward(self, src, trg, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))

        # Parallel Encoder Processing
        memory = src_emb
        for layer in self.enc_layers:
            memory = layer(memory, src_mask, src_padding_mask)

        # Parallel Decoder Processing
        output = tgt_emb
        for layer in self.dec_layers:
            output = layer(output, memory, tgt_mask, None, tgt_padding_mask, memory_key_padding_mask)

        return self.generator(output)

class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
    
# 입력 인덱스의 텐서를 해당하는 토큰 임베딩의 텐서로 변환하기 위한 헬퍼 모듈(Module)
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)


class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)#출력 생성기
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)#source 토큰 임베딩
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)#target 토큰 임베딩
        self.positional_encoding = PositionalEncoding(#위치 인코딩
            emb_size, dropout=dropout)

    #인코더-디코더 각 과정/훈련데이터 처리
    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        #source 문장을 처리한 후, target문장을 디코더로 처리?
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)#mask 적용
        return self.generator(outs)#최종적으로 각 타겟 단어에 대한 확률분포 반환/임베딩 차원->선형변환 적용->확률분포

    #추론(예측) 과정
    #인코딩
    def encode(self, src: Tensor, src_mask: Tensor):#source 문장 인코딩
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)
    #디코딩
    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):#target 문장 디코딩
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [4]:
#마스킹 생성
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)#상삼각행렬 생성(순차적인 마스킹 적용)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))#-무한대로 마스킹 적용
    return mask

#마스크 설정
def create_mask(src, tgt):
   #[0]: 배치크기, [1]:시퀀스 길이
    tgt_seq_len = tgt.shape[0]-1#128-1


    #어텐션 masking(디코더)
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len).type(torch.float32).to(DEVICE)

    #패딩 토큰에 대한 masking(인코더/디코더)
    src_padding_mask = (src == tokenizer.pad_token_id).to(torch.bool).transpose(0,1)#[src_seq_len, batch_size]
    tgt_padding_mask = (tgt[:-1,:] == tokenizer.pad_token_id).to(torch.bool).transpose(0,1)

    #차원 확인용
    #print('tgt_seq_len',tgt_seq_len, '  tgt_mask',tgt_mask.shape  )
    #print('\n',src_padding_mask.shape, '\n',tgt_padding_mask.shape)

    return None, tgt_mask, src_padding_mask, tgt_padding_mask #src_mask는 반환하지 않음

In [9]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = tokenizer.vocab_size
TGT_VOCAB_SIZE = tokenizer.vocab_size
EMB_SIZE = 512#512
NHEAD = 4#8
FFN_HID_DIM = 512#2048
BATCH_SIZE = 8
NUM_ENCODER_LAYERS = 6 #6
NUM_DECODER_LAYERS = 6 #6

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

model = transformer.to(DEVICE)



In [6]:
#BLEU score 계산
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smoothie = SmoothingFunction().method4

# BLEU-4 점수 계산 함수
def calculate_bleu(output_ids, target_ids, tokenizer):
    total_bleu = 0
    total_sentences = 0

    for i in range(output_ids.size(0)):  # 배치의 각 문장에 대해 BLEU 계산
        pred_sentence = tokenizer.decode(output_ids[i], skip_special_tokens=True)  # 예측된 문장
        tgt_sentence = tokenizer.decode(target_ids[i], skip_special_tokens=True)  # 실제 문장

        # BLEU-4 계산
        reference = [tgt_sentence.split()]  # 참조 문장
        candidate = pred_sentence.split()  # 예측 문장
        bleu_score = sentence_bleu(reference, candidate, smoothing_function=smoothie)
        total_bleu += bleu_score
        total_sentences += 1

    avg_bleu = total_bleu / total_sentences if total_sentences > 0 else 0
    return avg_bleu

In [7]:
import torch.nn as nn

#손실 함수에 EOS 토큰 가중치 적용
def create_weighted_loss(vocab_size, eos_token_id, pad_token_id, device):
    weights = torch.ones(vocab_size).to(device)  #GPU로 옮김
    weights[eos_token_id] = 0.1 #<EOS> 토큰에 대한 가중치를 낮춤
    weights[pad_token_id] = 0.0#<PAD> 토큰은 무시

    return nn.CrossEntropyLoss(weight=weights, ignore_index=pad_token_id)


In [10]:
import torch.cuda.amp as amp
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import time
smoothie = SmoothingFunction().method4

import torch.nn.functional as F

# 손실 함수 및 옵티마이저
#loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
eos_token_id = tokenizer.eos_token_id
pad_token_id = tokenizer.pad_token_id
loss_fn = create_weighted_loss(vocab_size=tokenizer.vocab_size, eos_token_id=eos_token_id, pad_token_id=pad_token_id, device=DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# 학습 함수
def train_epoch(model, optimizer, dataloader, loss_fn, device, pbar):
    model.train()  # 모델을 학습 모드로 전환
    total_loss = 0
    total_bleu=0
    scaler = amp.GradScaler()  # Mixed Precision에 필요한 GradScaler 초기화


    for batch in dataloader:
        #데이터 로드
        src = batch['src_input_ids'].to(device).transpose(0,1)
        tgt = batch['tgt_input_ids'].to(device).transpose(0,1)

         #마스크 생성
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt)

        #옵티마이저 초기화
        optimizer.zero_grad()

        with amp.autocast():
            #모델의 출력 계산
            output = model(src, tgt[:-1, :], None, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

            #손실 계산 (output을 [batch_size, sequence_len, vocab_size]로 변환)
            tgt_out = tgt[1:, :]  #<BOS> 토큰 제외
            loss = loss_fn(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))

        # 역전파 및 파라미터 업데이트
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

        #BLEU-4 스코어 계산
        output_ids = output.argmax(dim=-1).transpose(0,1)#예측 토큰 -> [batch_size, sequence_length]
        avg_bleu = calculate_bleu(output_ids, tgt[1:, :].transpose(0, 1), tokenizer)  # BLEU 계산
        total_bleu += avg_bleu


        #tqdm 업데이트
        pbar.update(1)

        #평균계산
        avg_loss = total_loss/len(dataloader)
        avg_bleu = total_bleu/len(dataloader)

    return avg_loss, avg_bleu



# 평가 함수
def evaluate(model, dataloader, loss_fn, device, pbar):
    model.eval()  # 평가 모드
    total_loss = 0
    total_bleu=0

    with torch.no_grad():  # 평가 시에는 역전파가 필요 없으므로 no_grad 사용
        for batch in dataloader:
            src = batch['src_input_ids'].to(device).transpose(0,1)
            tgt = batch['tgt_input_ids'].to(device).transpose(0,1)

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt)

            output = model(src, tgt[:-1, :], None, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
            tgt_out = tgt[1:, :]
            loss = loss_fn(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))

            total_loss += loss.item()

            #BLEU-4 스코어 계산
            output_ids = output.argmax(dim=-1).transpose(0,1)#예측 토큰 -> [batch_size, sequence_length]
            avg_bleu = calculate_bleu(output_ids, tgt[1:, :].transpose(0, 1), tokenizer)  # BLEU 계산
            total_bleu += avg_bleu


            #tqdm 업데이트
            pbar.update(1)

            #평균계산
            avg_loss = total_loss/len(dataloader)
            avg_bleu = total_bleu/len(dataloader)


    return avg_loss, avg_bleu


#test
def test_model(model, dataloader, loss_fn, device, tokenizer):
    model.eval()
    total_loss = 0  # 손실 누적
    total_bleu = 0  # BLEU-4 스코어 누적
    total_sentences = 0  # 문장 수 누적

    with torch.no_grad():
        for batch in dataloader:
            src = batch['src_input_ids'].to(device).transpose(0, 1)
            tgt = batch['tgt_input_ids'].to(device).transpose(0, 1)

            # 마스크 생성
            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt)

            with amp.autocast():
                # 모델의 출력 계산
                output = model(src, tgt[:-1, :], None, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

                # 손실 계산
                tgt_out = tgt[1:, :]
                loss = loss_fn(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))
                total_loss += loss.item()

            # BLEU-4 계산
            output_ids = output.argmax(dim=-1).transpose(0, 1)  # 예측 토큰
            avg_bleu = calculate_bleu(output_ids, tgt[1:, :].transpose(0, 1), tokenizer)  # BLEU 계산
            total_bleu += avg_bleu
            total_sentences += 1

    avg_loss = total_loss / len(dataloader)  # 평균 손실
    avg_bleu_score = total_bleu / total_sentences if total_sentences > 0 else 0  # 평균 BLEU-4 스코어

    return avg_loss, avg_bleu_score


###################

train_dataset = TranslationDataset(ds['train'],tokenizer)
val_dataset = TranslationDataset(ds['validation'],tokenizer)
test_dataset = TranslationDataset(ds['test'],tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

"""
small_train_dataset = TranslationDataset(ds['train'].select(range(200)),tokenizer)
small_val_dataset = TranslationDataset(ds['validation'].select(range(200)),tokenizer)
small_test_dataset =  TranslationDataset(ds['test'].select(range(200)),tokenizer)
train_dataloader = DataLoader(small_train_dataset, batch_size=8)
val_dataloader = DataLoader(small_val_dataset, batch_size=8)
test_dataloader = DataLoader(small_test_dataset, batch_size=8) """


# 에폭 반복문
EPOCHS = 5  # 원하는 에폭 수 설정
start_time = time.time()

for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}/{EPOCHS}")

    # 훈련 - tqdm 적용
    with tqdm(total=len(train_dataloader), desc=f"Training Epoch {epoch}/{EPOCHS}", unit="batch") as pbar:
        train_loss, train_bleu = train_epoch(model, optimizer, train_dataloader, loss_fn, DEVICE, pbar)
    print(f"  Train loss: {train_loss}, Train Bleu: {train_bleu}")

    # 검증 - tqdm 적용
    with tqdm(total=len(val_dataloader), desc=f"Validation Epoch {epoch}/{EPOCHS}", unit="batch") as pbar:
        val_loss, val_bleu = evaluate(model, val_dataloader, loss_fn, DEVICE, pbar)
    print(f"  Validation loss: {val_loss}, Validation Bleu: {val_bleu}\n")

# 학습이 끝난 후 테스트 데이터로 평가
print("\n\nEvaluating on Test dataset : ")
test_loss, test_bleu = test_model(model, test_dataloader, loss_fn, DEVICE, tokenizer)
print(f"Test loss: {test_loss}, Test BLEU-4: {test_bleu}")

# 학습 종료 시간
end_time = time.time()

# 총 소요 시간 계산
total_time = end_time - start_time
print(f'Total Training Time: {total_time:.2f} seconds')

Epoch 1/10


Training Epoch 1/10:   4%|▍         | 161/3625 [00:24<08:42,  6.63batch/s]


KeyboardInterrupt: 

# P-Transformer

In [9]:

from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 단어 순서 개념(notion)을 토큰 임베딩에 도입하기 위한 위치 인코딩(positional encoding)을 위한 헬퍼 모듈(Module)
class ParallelTransformer(nn.Module):
    def __init__(self, num_layers: int, emb_size: int, nhead: int, src_vocab_size: int, tgt_vocab_size: int, dim_feedforward: int = 1024, dropout: float = 0.1):
        super(ParallelTransformer, self).__init__()
        self.num_layers = num_layers
        #인코더->디코더 변환 레이어
        self.enc_to_dec_proj = nn.Linear(emb_size, emb_size)
        
        self.enc_layers = nn.ModuleList([TransformerEncoderLayer(emb_size, nhead, dim_feedforward, dropout) for _ in range(num_layers)])
        self.dec_layers = nn.ModuleList([TransformerDecoderLayer(emb_size, nhead, dim_feedforward, dropout) for _ in range(num_layers)])
        
        self.generator = nn.Linear(emb_size, tgt_vocab_size)#출력레이어
        
        #임베딩 레이어
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout)

    def forward(self, src, trg, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))

        memory = src_emb
        output = tgt_emb

        #인코더->디코더 순으로 레이어들을 순차적으로 통과
        for i in range(self.num_layers):
            #인코더
            memory = self.enc_layers[i](memory, src_mask, src_padding_mask)
            
            #디코더
            output = self.dec_layers[i](output, memory, tgt_mask, None, tgt_padding_mask, memory_key_padding_mask)
        
        return self.generator(output)

class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
    
# 입력 인덱스의 텐서를 해당하는 토큰 임베딩의 텐서로 변환하기 위한 헬퍼 모듈(Module)
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)


class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)#출력 생성기
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)#source 토큰 임베딩
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)#target 토큰 임베딩
        self.positional_encoding = PositionalEncoding(#위치 인코딩
            emb_size, dropout=dropout)

    #인코더-디코더 각 과정/훈련데이터 처리
    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        #source 문장을 처리한 후, target문장을 디코더로 처리?
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)#mask 적용
        return self.generator(outs)#최종적으로 각 타겟 단어에 대한 확률분포 반환/임베딩 차원->선형변환 적용->확률분포

    #추론(예측) 과정
    #인코딩
    def encode(self, src: Tensor, src_mask: Tensor):#source 문장 인코딩
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)
    #디코딩
    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):#target 문장 디코딩
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [10]:
#마스킹 생성
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)#상삼각행렬 생성(순차적인 마스킹 적용)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))#-무한대로 마스킹 적용
    return mask

#마스크 설정
def create_mask(src, tgt):
   #[0]: 배치크기, [1]:시퀀스 길이
    tgt_seq_len = tgt.shape[0]-1#128-1


    #어텐션 masking(디코더)
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len).type(torch.float32).to(DEVICE)

    #패딩 토큰에 대한 masking(인코더/디코더)
    src_padding_mask = (src == tokenizer.pad_token_id).to(torch.bool).transpose(0,1)#[src_seq_len, batch_size]
    tgt_padding_mask = (tgt[:-1,:] == tokenizer.pad_token_id).to(torch.bool).transpose(0,1)

    #차원 확인용
    #print('tgt_seq_len',tgt_seq_len, '  tgt_mask',tgt_mask.shape  )
    #print('\n',src_padding_mask.shape, '\n',tgt_padding_mask.shape)

    return None, tgt_mask, src_padding_mask, tgt_padding_mask #src_mask는 반환하지 않음

In [5]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = tokenizer.vocab_size
TGT_VOCAB_SIZE = tokenizer.vocab_size
EMB_SIZE = 512#512
NHEAD = 4#8
FFN_HID_DIM = 512#2048
BATCH_SIZE = 8
NUM_ENCODER_LAYERS = 6 #6
NUM_DECODER_LAYERS = 6 #6

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

model = transformer.to(DEVICE)



In [6]:
#BLEU score 계산
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smoothie = SmoothingFunction().method4

# BLEU-4 점수 계산 함수
def calculate_bleu(output_ids, target_ids, tokenizer):
    total_bleu = 0
    total_sentences = 0

    for i in range(output_ids.size(0)):  # 배치의 각 문장에 대해 BLEU 계산
        pred_sentence = tokenizer.decode(output_ids[i], skip_special_tokens=True)  # 예측된 문장
        tgt_sentence = tokenizer.decode(target_ids[i], skip_special_tokens=True)  # 실제 문장

        # BLEU-4 계산
        reference = [tgt_sentence.split()]  # 참조 문장
        candidate = pred_sentence.split()  # 예측 문장
        bleu_score = sentence_bleu(reference, candidate, smoothing_function=smoothie)
        total_bleu += bleu_score
        total_sentences += 1

    avg_bleu = total_bleu / total_sentences if total_sentences > 0 else 0
    return avg_bleu

In [7]:
import torch.nn as nn

#손실 함수에 EOS 토큰 가중치 적용
def create_weighted_loss(vocab_size, eos_token_id, pad_token_id, device):
    weights = torch.ones(vocab_size).to(device)  #GPU로 옮김
    weights[eos_token_id] = 0.1 #<EOS> 토큰에 대한 가중치를 낮춤
    weights[pad_token_id] = 0.0#<PAD> 토큰은 무시

    return nn.CrossEntropyLoss(weight=weights, ignore_index=pad_token_id)


In [8]:
import torch.cuda.amp as amp
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import time
smoothie = SmoothingFunction().method4

import torch.nn.functional as F

# 손실 함수 및 옵티마이저
#loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
eos_token_id = tokenizer.eos_token_id
pad_token_id = tokenizer.pad_token_id
loss_fn = create_weighted_loss(vocab_size=tokenizer.vocab_size, eos_token_id=eos_token_id, pad_token_id=pad_token_id, device=DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# 학습 함수
def train_epoch(model, optimizer, dataloader, loss_fn, device, pbar):
    model.train()  # 모델을 학습 모드로 전환
    total_loss = 0
    total_bleu=0
    scaler = amp.GradScaler()  # Mixed Precision에 필요한 GradScaler 초기화


    for batch in dataloader:
        #데이터 로드
        src = batch['src_input_ids'].to(device).transpose(0,1)
        tgt = batch['tgt_input_ids'].to(device).transpose(0,1)

         #마스크 생성
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt)

        #옵티마이저 초기화
        optimizer.zero_grad()

        with amp.autocast():
            #모델의 출력 계산
            output = model(src, tgt[:-1, :], None, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

            #손실 계산 (output을 [batch_size, sequence_len, vocab_size]로 변환)
            tgt_out = tgt[1:, :]  #<BOS> 토큰 제외
            loss = loss_fn(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))

        # 역전파 및 파라미터 업데이트
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

        #BLEU-4 스코어 계산
        output_ids = output.argmax(dim=-1).transpose(0,1)#예측 토큰 -> [batch_size, sequence_length]
        avg_bleu = calculate_bleu(output_ids, tgt[1:, :].transpose(0, 1), tokenizer)  # BLEU 계산
        total_bleu += avg_bleu


        #tqdm 업데이트
        pbar.update(1)

        #평균계산
        avg_loss = total_loss/len(dataloader)
        avg_bleu = total_bleu/len(dataloader)

    return avg_loss, avg_bleu



# 평가 함수
def evaluate(model, dataloader, loss_fn, device, pbar):
    model.eval()  # 평가 모드
    total_loss = 0
    total_bleu=0

    with torch.no_grad():  # 평가 시에는 역전파가 필요 없으므로 no_grad 사용
        for batch in dataloader:
            src = batch['src_input_ids'].to(device).transpose(0,1)
            tgt = batch['tgt_input_ids'].to(device).transpose(0,1)

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt)

            output = model(src, tgt[:-1, :], None, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
            tgt_out = tgt[1:, :]
            loss = loss_fn(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))

            total_loss += loss.item()

            #BLEU-4 스코어 계산
            output_ids = output.argmax(dim=-1).transpose(0,1)#예측 토큰 -> [batch_size, sequence_length]
            avg_bleu = calculate_bleu(output_ids, tgt[1:, :].transpose(0, 1), tokenizer)  # BLEU 계산
            total_bleu += avg_bleu


            #tqdm 업데이트
            pbar.update(1)

            #평균계산
            avg_loss = total_loss/len(dataloader)
            avg_bleu = total_bleu/len(dataloader)


    return avg_loss, avg_bleu


#test
def test_model(model, dataloader, loss_fn, device, tokenizer):
    model.eval()
    total_loss = 0  # 손실 누적
    total_bleu = 0  # BLEU-4 스코어 누적
    total_sentences = 0  # 문장 수 누적

    with torch.no_grad():
        for batch in dataloader:
            src = batch['src_input_ids'].to(device).transpose(0, 1)
            tgt = batch['tgt_input_ids'].to(device).transpose(0, 1)

            # 마스크 생성
            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt)

            with amp.autocast():
                # 모델의 출력 계산
                output = model(src, tgt[:-1, :], None, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

                # 손실 계산
                tgt_out = tgt[1:, :]
                loss = loss_fn(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))
                total_loss += loss.item()

            # BLEU-4 계산
            output_ids = output.argmax(dim=-1).transpose(0, 1)  # 예측 토큰
            avg_bleu = calculate_bleu(output_ids, tgt[1:, :].transpose(0, 1), tokenizer)  # BLEU 계산
            total_bleu += avg_bleu
            total_sentences += 1

    avg_loss = total_loss / len(dataloader)  # 평균 손실
    avg_bleu_score = total_bleu / total_sentences if total_sentences > 0 else 0  # 평균 BLEU-4 스코어

    return avg_loss, avg_bleu_score


###################

train_dataset = TranslationDataset(ds['train'],tokenizer)
val_dataset = TranslationDataset(ds['validation'],tokenizer)
test_dataset = TranslationDataset(ds['test'],tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

"""
small_train_dataset = TranslationDataset(ds['train'].select(range(200)),tokenizer)
small_val_dataset = TranslationDataset(ds['validation'].select(range(200)),tokenizer)
small_test_dataset =  TranslationDataset(ds['test'].select(range(200)),tokenizer)
train_dataloader = DataLoader(small_train_dataset, batch_size=8)
val_dataloader = DataLoader(small_val_dataset, batch_size=8)
test_dataloader = DataLoader(small_test_dataset, batch_size=8) """


# 에폭 반복문
EPOCHS = 5  # 원하는 에폭 수 설정
start_time = time.time()

for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}/{EPOCHS}")

    # 훈련 - tqdm 적용
    with tqdm(total=len(train_dataloader), desc=f"Training Epoch {epoch}/{EPOCHS}", unit="batch") as pbar:
        train_loss, train_bleu = train_epoch(model, optimizer, train_dataloader, loss_fn, DEVICE, pbar)
    print(f"  Train loss: {train_loss}, Train Bleu: {train_bleu}")

    # 검증 - tqdm 적용
    with tqdm(total=len(val_dataloader), desc=f"Validation Epoch {epoch}/{EPOCHS}", unit="batch") as pbar:
        val_loss, val_bleu = evaluate(model, val_dataloader, loss_fn, DEVICE, pbar)
    print(f"  Validation loss: {val_loss}, Validation Bleu: {val_bleu}\n")

# 학습이 끝난 후 테스트 데이터로 평가
print("\n\nEvaluating on Test dataset : ")
test_loss, test_bleu = test_model(model, test_dataloader, loss_fn, DEVICE, tokenizer)
print(f"Test loss: {test_loss}, Test BLEU-4: {test_bleu}")

# 학습 종료 시간
end_time = time.time()

# 총 소요 시간 계산
total_time = end_time - start_time
print(f'Total Training Time: {total_time:.2f} seconds')

Epoch 1/5




Training Epoch 1/5: 100%|██████████| 3625/3625 [09:04<00:00,  6.66batch/s]


  Train loss: 4.963761483948806, Train Bleu: 0.049792581369893715


Validation Epoch 1/5: 100%|██████████| 127/127 [00:10<00:00, 12.53batch/s]


  Validation loss: 3.904220746258112, Validation Bleu: 0.07591318384091325

Epoch 2/5


Training Epoch 2/5: 100%|██████████| 3625/3625 [09:02<00:00,  6.68batch/s]


  Train loss: 3.554837892071954, Train Bleu: 0.0763299039632607


Validation Epoch 2/5: 100%|██████████| 127/127 [00:10<00:00, 12.68batch/s]


  Validation loss: 3.242792005614033, Validation Bleu: 0.10317179598069949

Epoch 3/5


Training Epoch 3/5: 100%|██████████| 3625/3625 [09:01<00:00,  6.69batch/s]


  Train loss: 3.0355732363010275, Train Bleu: 0.09102686387542122


Validation Epoch 3/5: 100%|██████████| 127/127 [00:10<00:00, 12.70batch/s]


  Validation loss: 2.915244609352172, Validation Bleu: 0.1127169018317616

Epoch 4/5


Training Epoch 4/5: 100%|██████████| 3625/3625 [09:02<00:00,  6.69batch/s]


  Train loss: 2.7084292000737684, Train Bleu: 0.09993656659592934


Validation Epoch 4/5: 100%|██████████| 127/127 [00:10<00:00, 12.61batch/s]


  Validation loss: 2.700872539535282, Validation Bleu: 0.12045635649431972

Epoch 5/5


Training Epoch 5/5: 100%|██████████| 3625/3625 [09:02<00:00,  6.69batch/s]


  Train loss: 2.46573330221505, Train Bleu: 0.10726020946304046


Validation Epoch 5/5: 100%|██████████| 127/127 [00:10<00:00, 12.62batch/s]


  Validation loss: 2.58388367224866, Validation Bleu: 0.12465180463960317



Evaluating on Test dataset : 
Test loss: 2.5243550882339476, Test BLEU-4: 0.13146173478009524
Total Training Time: 2771.30 seconds


# P-Transformer with Teacher Forcing

In [None]:

from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 단어 순서 개념(notion)을 토큰 임베딩에 도입하기 위한 위치 인코딩(positional encoding)을 위한 헬퍼 모듈(Module)
class ParallelTransformer(nn.Module):
    def __init__(self, num_layers: int, emb_size: int, nhead: int, src_vocab_size: int, tgt_vocab_size: int, dim_feedforward: int = 1024, dropout: float = 0.1):
        super(ParallelTransformer, self).__init__()
        self.num_layers = num_layers
        #인코더->디코더 변환 레이어
        self.enc_to_dec_proj = nn.Linear(emb_size, emb_size)
        
        self.enc_layers = nn.ModuleList([TransformerEncoderLayer(emb_size, nhead, dim_feedforward, dropout) for _ in range(num_layers)])
        self.dec_layers = nn.ModuleList([TransformerDecoderLayer(emb_size, nhead, dim_feedforward, dropout) for _ in range(num_layers)])
        
        self.generator = nn.Linear(emb_size, tgt_vocab_size)#출력레이어
        
        #임베딩 레이어
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout)

    def forward(self, src, trg, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask, teacher_forcing_ratio=0.5):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))

        memory = src_emb
        output = tgt_emb


        for i in range(self.num_layers):
            memory = self.enc_layers[i](memory, src_mask, src_padding_mask)

            use_teacher_forcing = random.random() < teacher_forcing_ratio

            #teacher forcing
            output = self.dec_layers[i](tgt_emb, memory, tgt_mask, None, tgt_padding_mask, memory_key_padding_mask)

        return self.generator(output)




class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
    
# 입력 인덱스의 텐서를 해당하는 토큰 임베딩의 텐서로 변환하기 위한 헬퍼 모듈(Module)
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)


class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)#출력 생성기
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)#source 토큰 임베딩
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)#target 토큰 임베딩
        self.positional_encoding = PositionalEncoding(#위치 인코딩
            emb_size, dropout=dropout)

    #인코더-디코더 각 과정/훈련데이터 처리
    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        #source 문장을 처리한 후, target문장을 디코더로 처리?
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)#mask 적용
        return self.generator(outs)#최종적으로 각 타겟 단어에 대한 확률분포 반환/임베딩 차원->선형변환 적용->확률분포

    #추론(예측) 과정
    #인코딩
    def encode(self, src: Tensor, src_mask: Tensor):#source 문장 인코딩
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)
    #디코딩
    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):#target 문장 디코딩
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [None]:
#마스킹 생성
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)#상삼각행렬 생성(순차적인 마스킹 적용)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))#-무한대로 마스킹 적용
    return mask

#마스크 설정
def create_mask(src, tgt):
   #[0]: 배치크기, [1]:시퀀스 길이
    tgt_seq_len = tgt.shape[0]-1#128-1


    #어텐션 masking(디코더)
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len).type(torch.float32).to(DEVICE)

    #패딩 토큰에 대한 masking(인코더/디코더)
    src_padding_mask = (src == tokenizer.pad_token_id).to(torch.bool).transpose(0,1)#[src_seq_len, batch_size]
    tgt_padding_mask = (tgt[:-1,:] == tokenizer.pad_token_id).to(torch.bool).transpose(0,1)

    #차원 확인용
    #print('tgt_seq_len',tgt_seq_len, '  tgt_mask',tgt_mask.shape  )
    #print('\n',src_padding_mask.shape, '\n',tgt_padding_mask.shape)

    return None, tgt_mask, src_padding_mask, tgt_padding_mask #src_mask는 반환하지 않음

In [None]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = tokenizer.vocab_size
TGT_VOCAB_SIZE = tokenizer.vocab_size
EMB_SIZE = 512#512
NHEAD = 4#8
FFN_HID_DIM = 512#2048
BATCH_SIZE = 8
NUM_ENCODER_LAYERS = 6 #6
NUM_DECODER_LAYERS = 6 #6

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

model = transformer.to(DEVICE)

In [None]:
#BLEU score 계산
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smoothie = SmoothingFunction().method4

# BLEU-4 점수 계산 함수
def calculate_bleu(output_ids, target_ids, tokenizer):
    total_bleu = 0
    total_sentences = 0

    for i in range(output_ids.size(0)):  # 배치의 각 문장에 대해 BLEU 계산
        pred_sentence = tokenizer.decode(output_ids[i], skip_special_tokens=True)  # 예측된 문장
        tgt_sentence = tokenizer.decode(target_ids[i], skip_special_tokens=True)  # 실제 문장

        # BLEU-4 계산
        reference = [tgt_sentence.split()]  # 참조 문장
        candidate = pred_sentence.split()  # 예측 문장
        bleu_score = sentence_bleu(reference, candidate, smoothing_function=smoothie)
        total_bleu += bleu_score
        total_sentences += 1

    avg_bleu = total_bleu / total_sentences if total_sentences > 0 else 0
    return avg_bleu

In [None]:
import torch.nn as nn

#손실 함수에 EOS 토큰 가중치 적용
def create_weighted_loss(vocab_size, eos_token_id, pad_token_id, device):
    weights = torch.ones(vocab_size).to(device)  #GPU로 옮김
    weights[eos_token_id] = 0.1 #<EOS> 토큰에 대한 가중치를 낮춤
    weights[pad_token_id] = 0.0#<PAD> 토큰은 무시

    return nn.CrossEntropyLoss(weight=weights, ignore_index=pad_token_id)


In [None]:
import torch.cuda.amp as amp
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import time
smoothie = SmoothingFunction().method4

import torch.nn.functional as F

# 손실 함수 및 옵티마이저
#loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
eos_token_id = tokenizer.eos_token_id
pad_token_id = tokenizer.pad_token_id
loss_fn = create_weighted_loss(vocab_size=tokenizer.vocab_size, eos_token_id=eos_token_id, pad_token_id=pad_token_id, device=DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# 학습 함수
def train_epoch(model, optimizer, dataloader, loss_fn, device, pbar):
    model.train()  # 모델을 학습 모드로 전환
    total_loss = 0
    total_bleu=0
    scaler = amp.GradScaler()  # Mixed Precision에 필요한 GradScaler 초기화


    for batch in dataloader:
        #데이터 로드
        src = batch['src_input_ids'].to(device).transpose(0,1)
        tgt = batch['tgt_input_ids'].to(device).transpose(0,1)

         #마스크 생성
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt)

        #옵티마이저 초기화
        optimizer.zero_grad()

        with amp.autocast():
            #모델의 출력 계산
            output = model(src, tgt[:-1, :], None, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

            #손실 계산 (output을 [batch_size, sequence_len, vocab_size]로 변환)
            tgt_out = tgt[1:, :]  #<BOS> 토큰 제외
            loss = loss_fn(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))

        # 역전파 및 파라미터 업데이트
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

        #BLEU-4 스코어 계산
        output_ids = output.argmax(dim=-1).transpose(0,1)#예측 토큰 -> [batch_size, sequence_length]
        avg_bleu = calculate_bleu(output_ids, tgt[1:, :].transpose(0, 1), tokenizer)  # BLEU 계산
        total_bleu += avg_bleu


        #tqdm 업데이트
        pbar.update(1)

        #평균계산
        avg_loss = total_loss/len(dataloader)
        avg_bleu = total_bleu/len(dataloader)

    return avg_loss, avg_bleu



# 평가 함수
def evaluate(model, dataloader, loss_fn, device, pbar):
    model.eval()  # 평가 모드
    total_loss = 0
    total_bleu=0

    with torch.no_grad():  # 평가 시에는 역전파가 필요 없으므로 no_grad 사용
        for batch in dataloader:
            src = batch['src_input_ids'].to(device).transpose(0,1)
            tgt = batch['tgt_input_ids'].to(device).transpose(0,1)

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt)

            output = model(src, tgt[:-1, :], None, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
            tgt_out = tgt[1:, :]
            loss = loss_fn(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))

            total_loss += loss.item()

            #BLEU-4 스코어 계산
            output_ids = output.argmax(dim=-1).transpose(0,1)#예측 토큰 -> [batch_size, sequence_length]
            avg_bleu = calculate_bleu(output_ids, tgt[1:, :].transpose(0, 1), tokenizer)  # BLEU 계산
            total_bleu += avg_bleu


            #tqdm 업데이트
            pbar.update(1)

            #평균계산
            avg_loss = total_loss/len(dataloader)
            avg_bleu = total_bleu/len(dataloader)


    return avg_loss, avg_bleu


#test
def test_model(model, dataloader, loss_fn, device, tokenizer):
    model.eval()
    total_loss = 0  # 손실 누적
    total_bleu = 0  # BLEU-4 스코어 누적
    total_sentences = 0  # 문장 수 누적

    with torch.no_grad():
        for batch in dataloader:
            src = batch['src_input_ids'].to(device).transpose(0, 1)
            tgt = batch['tgt_input_ids'].to(device).transpose(0, 1)

            # 마스크 생성
            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt)

            with amp.autocast():
                # 모델의 출력 계산
                output = model(src, tgt[:-1, :], None, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

                # 손실 계산
                tgt_out = tgt[1:, :]
                loss = loss_fn(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))
                total_loss += loss.item()

            # BLEU-4 계산
            output_ids = output.argmax(dim=-1).transpose(0, 1)  # 예측 토큰
            avg_bleu = calculate_bleu(output_ids, tgt[1:, :].transpose(0, 1), tokenizer)  # BLEU 계산
            total_bleu += avg_bleu
            total_sentences += 1

    avg_loss = total_loss / len(dataloader)  # 평균 손실
    avg_bleu_score = total_bleu / total_sentences if total_sentences > 0 else 0  # 평균 BLEU-4 스코어

    return avg_loss, avg_bleu_score


###################

train_dataset = TranslationDataset(ds['train'],tokenizer)
val_dataset = TranslationDataset(ds['validation'],tokenizer)
test_dataset = TranslationDataset(ds['test'],tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

"""
small_train_dataset = TranslationDataset(ds['train'].select(range(200)),tokenizer)
small_val_dataset = TranslationDataset(ds['validation'].select(range(200)),tokenizer)
small_test_dataset =  TranslationDataset(ds['test'].select(range(200)),tokenizer)
train_dataloader = DataLoader(small_train_dataset, batch_size=8)
val_dataloader = DataLoader(small_val_dataset, batch_size=8)
test_dataloader = DataLoader(small_test_dataset, batch_size=8) """


# 에폭 반복문
EPOCHS = 5  # 원하는 에폭 수 설정
start_time = time.time()

for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}/{EPOCHS}")

    # 훈련 - tqdm 적용
    with tqdm(total=len(train_dataloader), desc=f"Training Epoch {epoch}/{EPOCHS}", unit="batch") as pbar:
        train_loss, train_bleu = train_epoch(model, optimizer, train_dataloader, loss_fn, DEVICE, pbar)
    print(f"  Train loss: {train_loss}, Train Bleu: {train_bleu}")

    # 검증 - tqdm 적용
    with tqdm(total=len(val_dataloader), desc=f"Validation Epoch {epoch}/{EPOCHS}", unit="batch") as pbar:
        val_loss, val_bleu = evaluate(model, val_dataloader, loss_fn, DEVICE, pbar)
    print(f"  Validation loss: {val_loss}, Validation Bleu: {val_bleu}\n")

# 학습이 끝난 후 테스트 데이터로 평가
print("\n\nEvaluating on Test dataset : ")
test_loss, test_bleu = test_model(model, test_dataloader, loss_fn, DEVICE, tokenizer)
print(f"Test loss: {test_loss}, Test BLEU-4: {test_bleu}")

# 학습 종료 시간
end_time = time.time()

# 총 소요 시간 계산
total_time = end_time - start_time
print(f'Total Training Time: {total_time:.2f} seconds')

# 평가


In [17]:
#############################
#번역 작업 수행
def translate(model, sentence, tokenizer, device, max_length=128):
    model.eval()  # 모델을 평가 모드로 전환

    # 입력 문장 토큰화 (소스 언어로 설정된 상태에서)
    inputs = tokenizer(sentence, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length").to(device)

    # 소스 문장을 인코딩
    with torch.no_grad():
        src_mask = torch.zeros((inputs['input_ids'].shape[1], inputs['input_ids'].shape[1]), device=device).type(torch.bool)
        memory = model.encode(inputs['input_ids'].transpose(0, 1), src_mask)

    # 타겟 문장 시작을 <sos> 토큰으로 설정
    tgt_tokens = torch.ones(1, 1).fill_(tokenizer.bos_token_id).type(torch.long).to(device)

    for _ in range(max_length):
        tgt_mask = generate_square_subsequent_mask(tgt_tokens.size(0)).to(device)

        # 디코딩
        with torch.no_grad():
            output = model.decode(tgt_tokens, memory, tgt_mask)
            output = model.generator(output)

        #확인용~##############################################################
        # Softmax 적용하여 확률로 변환##############################
        probabilities = F.softmax(output[-1, :, :], dim=-1)
        # 상위 top_k 후보 확인
        top_k=5
        top_k_probabilities, top_k_indices = torch.topk(probabilities, top_k, dim=-1)

        # 출력 후보들을 확인
        print("\nTop-k predictions:")
        for i in range(top_k):
            token_id = top_k_indices[0, i].item()
            token_prob = top_k_probabilities[0, i].item()
            token = tokenizer.decode([token_id])
            print(f"Token: {token}, Probability: {token_prob:.4f}")
        ##########################################################

        # 가장 가능성 높은 토큰을 선택
        next_token = output.argmax(dim=-1)[-1].item()

        # <eos> 토큰이 나오면 종료
        if next_token == tokenizer.eos_token_id:
            break

        # 예측된 토큰을 타겟 토큰 시퀀스에 추가
        tgt_tokens = torch.cat([tgt_tokens, torch.tensor([[next_token]], device=device)], dim=0)

    # 번역된 토큰을 텍스트로 변환
    translated_sentence = tokenizer.decode(tgt_tokens.flatten(), skip_special_tokens=True)

    return translated_sentence


In [19]:
sentence='Have a good day.'
translated_sentence = translate(model, sentence, tokenizer, DEVICE)
print(f'\n sentence : {sentence}')
print(f'translated_sentence : {translated_sentence}')


Top-k predictions:
Token: S, Probability: 0.0557
Token: für, Probability: 0.0445
Token: ., Probability: 0.0413
Token: ,, Probability: 0.0279
Token: er, Probability: 0.0235

Top-k predictions:
Token: S, Probability: 0.0190
Token: rassen, Probability: 0.0175
Token: ", Probability: 0.0175
Token: ,, Probability: 0.0166
Token: trä, Probability: 0.0115

Top-k predictions:
Token: ,, Probability: 0.0186
Token: rassen, Probability: 0.0183
Token: S, Probability: 0.0167
Token: Person, Probability: 0.0150
Token: ", Probability: 0.0130

Top-k predictions:
Token: S, Probability: 0.1197
Token: als, Probability: 0.1047
Token: um, Probability: 0.0856
Token: wie, Probability: 0.0712
Token: der, Probability: 0.0394

Top-k predictions:
Token: tur, Probability: 0.0222
Token: ,, Probability: 0.0220
Token: S, Probability: 0.0185
Token: Person, Probability: 0.0182
Token: ., Probability: 0.0168

Top-k predictions:
Token: S, Probability: 0.1334
Token: Person, Probability: 0.0522
Token: zh, Probability: 0.0256


Token: ,, Probability: 0.0220
Token: ., Probability: 0.0218
Token: um, Probability: 0.0183
Token: S, Probability: 0.0181
Token: ", Probability: 0.0178

Top-k predictions:
Token: um, Probability: 0.1078
Token: S, Probability: 0.0959
Token: als, Probability: 0.0672
Token: , Probability: 0.0662
Token: der, Probability: 0.0307

Top-k predictions:
Token: ein, Probability: 0.0456
Token: etwas, Probability: 0.0358
Token: „, Probability: 0.0261
Token: S, Probability: 0.0239
Token: für, Probability: 0.0222

Top-k predictions:
Token: S, Probability: 0.0893
Token: „, Probability: 0.0465
Token: , Probability: 0.0369
Token: ., Probability: 0.0293
Token: Mü, Probability: 0.0243

Top-k predictions:
Token: ., Probability: 0.0215
Token: ,, Probability: 0.0209
Token: ", Probability: 0.0189
Token: S, Probability: 0.0180
Token: um, Probability: 0.0179

Top-k predictions:
Token: “, Probability: 0.1345
Token: </s>, Probability: 0.0623
Token: ,, Probability: 0.0268
Token: S, Probability: 0.0239
Token: steht,

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer

class ParallelTransformer(nn.Module):
    def __init__(self, embed_size, num_heads, forward_expansion, num_layers, src_vocab_size, tgt_vocab_size, max_length, device):
        super(ParallelTransformer, self).__init__()
        self.device = device
        
        # 인코더와 디코더 임베딩
        self.src_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embed_size)
        
        # 포지셔널 인코딩
        self.positional_encoding = PositionalEncoding(embed_size, max_length)

        # 병렬 인코더-디코더 레이어 (모든 레이어가 병렬로 실행됨)
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model=embed_size, nhead=num_heads, dim_feedforward=forward_expansion * embed_size)
            for _ in range(num_layers)
        ])
        self.decoder_layers = nn.ModuleList([
            TransformerDecoderLayer(d_model=embed_size, nhead=num_heads, dim_feedforward=forward_expansion * embed_size)
            for _ in range(num_layers)
        ])

        # 출력 레이어
        self.fc_out = nn.Linear(embed_size, tgt_vocab_size)

    def forward(self, src, tgt, src_mask, tgt_mask):
        # 임베딩과 포지셔널 인코딩 적용
        src = self.src_embedding(src) + self.positional_encoding(src)
        tgt = self.tgt_embedding(tgt) + self.positional_encoding(tgt)

        # 병렬 인코더-디코더 실행
        encoder_outputs = []
        decoder_outputs = []
        for encoder_layer, decoder_layer in zip(self.encoder_layers, self.decoder_layers):
            enc_output = encoder_layer(src, src_mask)  # 각 인코더 레이어 병렬 실행
            encoder_outputs.append(enc_output)

            dec_output = decoder_layer(tgt, enc_output, tgt_mask, src_mask)  # 각 디코더 레이어 병렬 실행
            decoder_outputs.append(dec_output)

        # 각 레이어의 출력값을 평균하여 최종 출력으로 사용
        encoder_final_output = torch.stack(encoder_outputs, dim=0).mean(dim=0)
        decoder_final_output = torch.stack(decoder_outputs, dim=0).mean(dim=0)

        # 출력층 적용
        output = self.fc_out(decoder_final_output)
        return output

class PositionalEncoding(nn.Module):
    def __init__(self, embed_size, max_length):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_length, embed_size)
        self.encoding.requires_grad = False

        position = torch.arange(0, max_length).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * (-torch.log(torch.tensor(10000.0)) / embed_size))

        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)

    def forward(self, x):
        batch_size, seq_len = x.size()
        return self.encoding[:seq_len, :].unsqueeze(0).repeat(batch_size, 1, 1).to(x.device)

# 모델 하이퍼파라미터 설정
embed_size = 512
num_heads = 4
forward_expansion = 4
num_layers = 4
src_vocab_size = tokenizer.vocab_size
tgt_vocab_size = tokenizer.vocab_size
max_length = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 모델 초기화
model = ParallelTransformer(embed_size, num_heads, forward_expansion, num_layers, src_vocab_size, tgt_vocab_size, max_length, device).to(device)
