# Module Import & Data Preparation

### Important Installations

In [None]:
!pip install torchtext==0.12.0
!pip install konlpy
!pip install openpyxl

Collecting torchtext==0.12.0
  Downloading torchtext-0.12.0-cp310-cp310-manylinux1_x86_64.whl.metadata (8.0 kB)
Collecting torch==1.11.0 (from torchtext==0.12.0)
  Downloading torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Downloading torchtext-0.12.0-cp310-cp310-manylinux1_x86_64.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl (750.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.6/750.6 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 2.4.1+cu121
    Uninstalling torch-2.4.1+cu121:
      Successfully uninstalled torch-2.4.1+cu121
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the foll

In [None]:
import torch

# Report the PyTorch / CUDA / cuDNN build that is active after the pinned
# torchtext install above replaced the preinstalled torch.
print("Torch version:{}".format(torch.__version__))
print("cuda version: {}".format(torch.version.cuda))
print("cudnn version:{}".format(torch.backends.cudnn.version()))

Torch version:1.11.0+cu102
cuda version: 10.2
cudnn version:7605


### Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Work from the project folder on the mounted Drive so that the relative
# data paths used below resolve.
os.chdir('/content/drive/MyDrive/24Fall_NLP')
# Alternative project location (disabled):
# os.chdir('/content/drive/MyDrive/YAI/24Fall_NLP')

### Import Mandatory Modules

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext.transforms as transforms
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from konlpy.tag import Okt
from torchtext.datasets import Multi30k

import spacy
import numpy as np

import random
import math
import time

### Data Preprocessing

In [None]:
from glob import glob

# glob() returns paths in arbitrary, filesystem-dependent order.  Sort them so
# the workbooks are always concatenated in the same order; otherwise the row
# order of the corpus (and every split derived from it) differs between runs.
data = sorted(glob('./한영번역_sample/*.xlsx'))

In [None]:
# Directory holding the source .xlsx files; the generated train/val CSVs are written here too.
data_path = './한영번역_sample/'

In [None]:
data

['./한영번역_sample/2_대화체_190920.xlsx',
 './한영번역_sample/4_문어체_한국문화_190920.xlsx',
 './한영번역_sample/3_문어체_뉴스_190920.xlsx',
 './한영번역_sample/6_문어체_지자체웹사이트_190920.xlsx',
 './한영번역_sample/5_문어체_조례_190920.xlsx']

In [None]:
file_list = [os.path.basename(file) for file in data]

# Column headers differ between workbooks; every known variant is mapped onto
# the canonical names 'kr' (Korean source) and 'en' (English target).
EN_COLUMN_ALIASES = ('영어검수', '검수', 'Review', 'REVIEW')
KR_COLUMN_ALIASES = ('원문', '한국어')

# Collect one frame per workbook and concatenate once at the end, instead of
# growing a DataFrame inside the loop (quadratic copying).
frames = []
for datum in data:
    temp = pd.read_excel(datum)

    rename_map = {col: 'en' for col in temp.columns if col in EN_COLUMN_ALIASES}
    rename_map.update({col: 'kr' for col in temp.columns if col in KR_COLUMN_ALIASES})
    temp = temp.rename(columns=rename_map)

    # Fail with a message that names the offending file rather than a bare KeyError.
    missing = [col for col in ('kr', 'en') if col not in temp.columns]
    if missing:
        raise KeyError(
            f"{datum}: could not find column(s) {missing}; available columns: {list(temp.columns)}"
        )

    frames.append(temp[['kr', 'en']])

# Per-file row indices are kept as-is, matching the previous behaviour.
df = pd.concat(frames) if frames else pd.DataFrame(columns=['kr', 'en'])

In [None]:
# Sanity check: number of collected sentence pairs, then a preview of the first rows.
print(df.shape)
df.head()

(56263, 2)


Unnamed: 0,kr,en
0,이번 시험 혹시 범위가 어떻게 되는지 아세요? 제가 지난주 수업을 못가서요.,Do you know which part our test is going to co...
1,네. 이번 시험은 100페이지부터 250페이지까지입니다.,Yes. Our test will be from page 100 to page 250.
2,그렇군요. 감사합니다. 추가로 안내 받으신 사항 있으실까요?,"I see, thanks! Did you have any additional inf..."
3,네. 시험 범위 외에 교수님께서 지난번에 주신 발표자료도 참고하라고 하셨습니다.,Yes. The professor also told us to look at the...
4,"역대 시험 난이도 분석중인데, 이번 시험 난이도는 어떻게 될것같습니까?",I'm currently analyzing the level of difficult...


### Dataset Preparation

In [None]:
from sklearn.model_selection import KFold

SPLIT_SEED = 42        # seeds both the corpus shuffle and the fold assignment
N_SENTENCES = 30000    # number of sentence pairs kept for the experiment

kf = KFold(n_splits=5, shuffle=True, random_state=SPLIT_SEED)

# Shuffle every sentence pair and reset the index.  The shuffle is seeded so the
# 30,000-sentence subset (and therefore train.csv / val.csv) is reproducible.
df_shuffled = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
df = df_shuffled[:N_SENTENCES]

# Only a single 80/20 split is used.  The earlier loop walked through all five
# folds and silently kept the last one; select that same fold explicitly.
train_idx, val_idx = list(kf.split(df['kr']))[-1]
train = df.iloc[train_idx]
val = df.iloc[val_idx]

print('train_size: ', len(train))
print('val_size: ', len(val))

train.to_csv(os.path.join(data_path, 'train.csv'), index=False)
val.to_csv(os.path.join(data_path, 'val.csv'), index=False)

train_size:  24000
val_size:  6000


In [None]:
# Token stream used for vocabulary construction
def yield_tokens(data_iter, tokenizer):
    """Lazily yield ``tokenizer(text)`` for each ``text`` in ``data_iter``.

    Args:
        data_iter: iterable of raw sentences.
        tokenizer: callable applied to one sentence at a time.

    Yields:
        Whatever ``tokenizer`` returns for each sentence, in input order.
    """
    yield from (tokenizer(sentence) for sentence in data_iter)

# Korean and English tokenizers: KoNLPy's Okt for Korean,
# spaCy's 'en_core_web_sm' pipeline for English.
tokenizer_kr = Okt()
tokenizer_en = spacy.load('en_core_web_sm')

def tokenize_kor(text):
    """Return the morphemes of a Korean sentence, as produced by ``tokenizer_kr.morphs``."""
    morphemes = tokenizer_kr.morphs(text)
    return morphemes

def tokenize_en(text):
    """Split an English sentence into a list of token strings.

    Only spaCy's tokenizer component is invoked.  Calling the pipeline object
    itself would also run every other loaded component on each sentence, and
    none of that output is used here because only ``token.text`` is read.
    """
    return [token.text for token in tokenizer_en.tokenizer(text)]

### Sample Sentence Check

In [None]:
# Inspect one sentence pair from the training split.
# The split is loaded from disk here because `train_df` is otherwise first
# assigned in a later cell, so this cell raised NameError on a fresh
# "Restart & Run All".
train_df = pd.read_csv(f'{data_path}train.csv')

sample_kr_sentence = train_df['kr'].iloc[0]  # first sample sentence (Korean)
sample_en_sentence = train_df['en'].iloc[0]  # first sample sentence (English)

# Print the raw sentences next to their tokenized form
print("Korean Sentence:", sample_kr_sentence)
print("Tokenized Korean:", list(yield_tokens([sample_kr_sentence], tokenize_kor)))

print("English Sentence:", sample_en_sentence)
print("Tokenized English:", list(yield_tokens([sample_en_sentence], tokenize_en)))

Korean Sentence: 조금 더 일찍 올 걸 그랬나. ATM 앞에 사람이 너무 많아. 
Tokenized Korean: [['조금', '더', '일찍', '올', '걸', '그랬나', '.', 'ATM', '앞', '에', '사람', '이', '너무', '많아', '.']]
English Sentence: We should have come earlier. There are so many people in front of the ATM.
Tokenized English: [['We', 'should', 'have', 'come', 'earlier', '.', 'There', 'are', 'so', 'many', 'people', 'in', 'front', 'of', 'the', 'ATM', '.']]


In [None]:
train_df = pd.read_csv(f'{data_path}train.csv')
valid_df = pd.read_csv(f'{data_path}val.csv')

# Special tokens shared by the source and target vocabularies.
SPECIAL_TOKENS = ('<unk>', '<pad>', '<sos>', '<eos>')


def _build_vocab(sentences, tokenize):
    """Build a vocabulary (tokens seen at least twice) whose default index is '<unk>'."""
    vocab = build_vocab_from_iterator(
        yield_tokens(sentences, tokenize),
        min_freq=2,
        specials=list(SPECIAL_TOKENS),
    )
    vocab.set_default_index(vocab['<unk>'])
    return vocab


def _build_transform(vocab):
    """Map tokens to indices, then prepend '<sos>' and append '<eos>'."""
    return transforms.Sequential(
        transforms.VocabTransform(vocab),
        transforms.AddToken(token=vocab['<sos>'], begin=True),
        transforms.AddToken(token=vocab['<eos>'], begin=False),
    )


# Vocabularies are built from the training split only.
SRC_vocab = _build_vocab(train_df['kr'], tokenize_kor)
TRG_vocab = _build_vocab(train_df['en'], tokenize_en)

print(f"len(SRC_vocab): {len(SRC_vocab)}")
print(f"len(TRG_vocab): {len(TRG_vocab)}")

# Token-list -> index-list transforms
SRC_transform = _build_transform(SRC_vocab)
TRG_transform = _build_transform(TRG_vocab)

In [None]:
# Dataset that reads a parallel-corpus CSV and turns each row into index tensors
class TranslationDataset(Dataset):
    """Korean/English sentence pairs read from a CSV with 'kr' and 'en' columns.

    Args:
        path: CSV path (or file-like object) accepted by ``pd.read_csv``.
        src_transform: optional callable applied to ``[tokens]`` of the Korean
            sentence; expected to return a list holding one list of indices.
        trg_transform: same, for the English sentence.
        cache: when True (default) each encoded pair is memoised after its
            first access so tokenization is not repeated every epoch.  The
            cached objects are returned as-is, so callers must not modify
            them in place.

    ``dataset[idx]`` returns ``(src, trg)``.  Each element is a tensor of
    token indices when its transform is given, otherwise the plain token list
    (previously the no-transform case crashed while building a tensor from a
    single character).
    """

    def __init__(self, path, src_transform=None, trg_transform=None, cache=True):
        self.data = pd.read_csv(path)
        self.src_transform = src_transform
        self.trg_transform = trg_transform
        self._cache = {} if cache else None

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _encode(tokens, transform):
        """Apply ``transform`` to one token list and tensorize; pass tokens through if it is None."""
        # `is None` instead of truthiness: an nn.Sequential defines __len__, so
        # an empty one would be falsy.
        if transform is None:
            return tokens
        # The transform works on batches, hence the wrap in a list and the [0].
        return torch.tensor(transform([tokens])[0])

    def __getitem__(self, idx):
        use_cache = self._cache is not None and isinstance(idx, int)
        if use_cache and idx in self._cache:
            return self._cache[idx]

        row = self.data.iloc[idx]
        src = self._encode(tokenize_kor(row['kr']), self.src_transform)
        trg = self._encode(tokenize_en(row['en']), self.trg_transform)

        item = (src, trg)
        if use_cache:
            self._cache[idx] = item
        return item

len(SRC_vocab): 18393
len(TRG_vocab): 15056


In [None]:
def collate_fn(batch):
    """Pad a list of ``(src, trg)`` index tensors into two batch-first tensors.

    Source sequences are padded with ``SRC_vocab['<pad>']`` and target
    sequences with ``TRG_vocab['<pad>']``, each up to the longest sequence on
    its own side of the batch.
    """
    src_seqs = [src for src, _ in batch]
    trg_seqs = [trg for _, trg in batch]

    pad = torch.nn.utils.rnn.pad_sequence
    padded_src = pad(src_seqs, batch_first=True, padding_value=SRC_vocab['<pad>'])
    padded_trg = pad(trg_seqs, batch_first=True, padding_value=TRG_vocab['<pad>'])

    return padded_src, padded_trg

In [None]:
# DataLoader settings
BATCH_SIZE = 128

# Locations of the split files written by the dataset-preparation cell.
# These names were used below without ever being assigned (NameError).
train_csv = os.path.join(data_path, 'train.csv')
valid_csv = os.path.join(data_path, 'val.csv')

# Datasets
train_data = TranslationDataset(train_csv, src_transform=SRC_transform, trg_transform=TRG_transform)
valid_data = TranslationDataset(valid_csv, src_transform=SRC_transform, trg_transform=TRG_transform)

# DataLoaders (only the training data is shuffled)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Check the shape of one batch
for src, trg in train_loader:
    print(f"Source batch shape: {src.shape}")
    print(f"Target batch shape: {trg.shape}")
    break

### Define Dataset, DataLoader

In [None]:
# NOTE(review): this cell repeats the collate_fn / DataLoader cells directly
# above it; one of the two copies can probably be removed.

# Locations of the split files written by the dataset-preparation cell.
# These names were used below without ever being assigned (NameError).
train_csv = os.path.join(data_path, 'train.csv')
valid_csv = os.path.join(data_path, 'val.csv')

# Datasets
train_data = TranslationDataset(train_csv, src_transform=SRC_transform, trg_transform=TRG_transform)
valid_data = TranslationDataset(valid_csv, src_transform=SRC_transform, trg_transform=TRG_transform)

# DataLoader settings
BATCH_SIZE = 128

def collate_fn(batch):
    """Pad a list of (src, trg) index tensors into two batch-first tensors."""
    src_batch, trg_batch = [], []
    for src_sample, trg_sample in batch:
        src_batch.append(src_sample)
        trg_batch.append(trg_sample)

    # Pad each side to the longest sequence in the batch with its own <pad> index
    src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=SRC_vocab['<pad>'], batch_first=True)
    trg_batch = torch.nn.utils.rnn.pad_sequence(trg_batch, padding_value=TRG_vocab['<pad>'], batch_first=True)

    return src_batch, trg_batch

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Example: fetch one batch from the DataLoader
for src, trg in train_loader:
    print(f"Source batch shape: {src.shape}")
    print(f"Target batch shape: {trg.shape}")
    break

Source batch shape: torch.Size([128, 74])
Target batch shape: torch.Size([128, 84])


# Hyper Parameters

In [None]:
import argparse

# An empty argparse namespace is used as a plain attribute container for all
# hyper-parameters (parse_args("") parses no command-line arguments).
parser = argparse.ArgumentParser()
args = parser.parse_args("")

# ====== Random Seed Initialization ====== #
seed = 42
random.seed(seed)        # Python's own RNG was previously left unseeded
np.random.seed(seed)
torch.manual_seed(seed)

# ====== Padding Index Setting ====== #
# indices of <pad> token
SRC_PAD_IDX = SRC_vocab['<pad>']
TRG_PAD_IDX = TRG_vocab['<pad>']

# ====== Model ====== #
args.input_dim = len(SRC_vocab)
args.output_dim = len(TRG_vocab)

args.hidden_dim = 256 # 512 in the original paper

args.max_len = 200 # >= sentence max length

args.enc_layers = 3 # 6 in the original paper
args.dec_layers = 3

args.enc_heads = 8
args.dec_heads = 8

args.enc_pf_dim = 1024 # 2048 in the original paper
args.dec_pf_dim = 1024

args.enc_dropout = 0.1
args.dec_dropout = 0.1

# ====== Optimization ====== #
args.lr = 0.0005
args.warmup_steps = 4000

args.optim = "AdamW"
# Weight decay read by the SGD / RMSprop / Adam optimizer branches.  It was
# never defined, so selecting any of those optimizers raised AttributeError.
args.l2 = 0.0

# ====== Train, Validate, Test ====== #
args.epoch = 10
args.clip = 1 # Not in the paper!
args.batch_size = 128

# Model Architecture

### Positional Embedding
* $PE_{(pos, 2i)} = \sin(pos/10000^{2i/d_{model}})$
* $PE_{(pos, 2i+1)} = \cos(pos/10000^{2i/d_{model}})$

In [None]:
def positional_encoding(max_len, hidden_dim, device):
    """Build the fixed sinusoidal position table.

    Even feature columns hold ``sin(pos / 10000^(2i/hidden_dim))`` and odd
    columns hold the matching ``cos``.

    Args:
        max_len: number of positions to encode.
        hidden_dim: embedding size; may be even or odd (odd sizes previously
            failed with a shape mismatch on the cosine assignment).
        device: device on which the table is created.

    Returns:
        Float tensor of shape ``[1, max_len, hidden_dim]``; the leading axis
        broadcasts over the batch.
    """
    # Create everything directly on the target device instead of CPU + copy.
    pos = torch.arange(max_len, dtype=torch.float, device=device).unsqueeze(1)  # [max_len, 1]
    dim = torch.arange(0, hidden_dim, 2, dtype=torch.float, device=device)      # [0, 2, 4, ...]

    # A different wavelength for every pair of feature columns
    angle_rates = 1 / torch.pow(10000, dim / hidden_dim)
    angles = pos * angle_rates                                                   # [max_len, ceil(hidden_dim / 2)]

    pos_encoding = torch.zeros((max_len, hidden_dim), device=device)
    pos_encoding[:, 0::2] = torch.sin(angles)                      # even columns
    # With an odd hidden_dim there is one fewer odd column than even columns.
    pos_encoding[:, 1::2] = torch.cos(angles[:, :hidden_dim // 2])  # odd columns

    return pos_encoding.unsqueeze(0)

### Multi Head Attention
* hidden_dim: 하나의 단어에 대한 임베딩 차원
* n_heads: 헤드 개수
* dropout_ratio: 드롭아웃 비율

In [None]:
class MultiHeadAttentionLayer(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        hidden_dim: embedding size of each token; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
        dropout_ratio: dropout probability applied to the attention weights.
        device: device on which the scaling constant is first created.
    """

    def __init__(self, hidden_dim, n_heads, dropout_ratio, device):
        super().__init__()

        assert hidden_dim % n_heads == 0  # hidden_dim == n_heads * head_dim

        self.hidden_dim = hidden_dim
        self.n_heads = n_heads
        self.head_dim = hidden_dim // n_heads

        self.fc_q = nn.Linear(hidden_dim, hidden_dim)  # projection applied to the queries
        self.fc_k = nn.Linear(hidden_dim, hidden_dim)  # projection applied to the keys
        self.fc_v = nn.Linear(hidden_dim, hidden_dim)  # projection applied to the values

        self.fc_o = nn.Linear(hidden_dim, hidden_dim)  # output projection

        self.dropout = nn.Dropout(dropout_ratio)

        # sqrt(head_dim) is registered as a buffer so it follows the module
        # through .to()/.cuda()/.cpu(); a plain tensor attribute stays on the
        # constructor's device.  persistent=False keeps it out of state_dict,
        # so existing checkpoints still load.
        self.register_buffer(
            'scale',
            torch.sqrt(torch.tensor([float(self.head_dim)], device=device)),
            persistent=False,
        )

    def forward(self, query, key, value, mask = None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query: [batch_size, query_len, hidden_dim]
            key: [batch_size, key_len, hidden_dim]
            value: [batch_size, value_len, hidden_dim]
            mask: optional tensor broadcastable to
                [batch_size, n_heads, query_len, key_len]; positions where it
                equals 0 are excluded from attention.

        Returns:
            ``(x, attention)`` with x of shape [batch_size, query_len, hidden_dim]
            and attention of shape [batch_size, n_heads, query_len, key_len].
        """
        batch_size = query.shape[0]

        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)

        # Split hidden_dim into n_heads x head_dim and move the head axis forward:
        # [batch_size, len, hidden_dim] -> [batch_size, n_heads, len, head_dim]
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        # Scaled dot-product scores: [batch_size, n_heads, query_len, key_len]
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

        if mask is not None:
            # Masked positions get a very negative score so softmax sends them to ~0
            energy = energy.masked_fill(mask==0, -1e10)

        # Attention weights over the keys
        attention = torch.softmax(energy, dim=-1)

        # Weighted sum of the values: [batch_size, n_heads, query_len, head_dim]
        x = torch.matmul(self.dropout(attention), V)

        # Merge the heads back: [batch_size, query_len, n_heads, head_dim]
        x = x.permute(0, 2, 1, 3).contiguous()

        # -> [batch_size, query_len, hidden_dim]
        x = x.view(batch_size, -1, self.hidden_dim)

        x = self.fc_o(x)

        return x, attention

### Feed-forward
* same input-output dimension size
* pf_dim: Feed-forward layer 내부의 은닉 차원 (hidden_dim의 4배; 여기서는 256 → 1024)

In [None]:
class PositionwiseFeedforwardLayer(nn.Module):
    """Two-layer feed-forward block applied to every position independently.

    Expands ``hidden_dim`` to ``pf_dim`` with a ReLU, applies dropout, and
    projects back to ``hidden_dim``, so input and output shapes match.
    """

    def __init__(self, hidden_dim, pf_dim, dropout_ratio):
        super().__init__()

        self.fc_1 = nn.Linear(hidden_dim, pf_dim)   # expansion
        self.fc_2 = nn.Linear(pf_dim, hidden_dim)   # projection back

        self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, x):
        # x: [batch_size, seq_len, hidden_dim]
        expanded = torch.relu(self.fc_1(x))
        expanded = self.dropout(expanded)   # [batch_size, seq_len, pf_dim]
        return self.fc_2(expanded)          # [batch_size, seq_len, hidden_dim]

### Encoder Layer
* 하나의 인코더 레이어
* input-output 차원 동일
* '\<pad>' 토큰 mask = 0

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and layer normalisation."""

    def __init__(self, hidden_dim, n_heads, pf_dim, dropout_ratio, device):
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hidden_dim)
        self.ff_layer_norm = nn.LayerNorm(hidden_dim)
        self.self_attention = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout_ratio, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hidden_dim, pf_dim, dropout_ratio)
        self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, src, src_mask):
        """Encode ``src``.

        Args:
            src: [batch_size, src_len, hidden_dim]
            src_mask: passed through to the attention layer; positions where
                it equals 0 are not attended to.

        Returns:
            Tensor of shape [batch_size, src_len, hidden_dim].
        """
        # Self-attention: the same tensor serves as query, key and value.
        attended, _ = self.self_attention(src, src, src, src_mask)
        src = self.self_attn_layer_norm(src + self.dropout(attended))

        # Position-wise feed-forward sub-layer.
        transformed = self.positionwise_feedforward(src)
        return self.ff_layer_norm(src + self.dropout(transformed))

### Encoder (Encoder Layer x enc_layers)
* max_length: 문장 내 최대 단어 개수

In [None]:
class Encoder(nn.Module):
    """Token embedding + sinusoidal positions followed by ``n_layers`` encoder layers.

    Args:
        input_dim: source vocabulary size.
        hidden_dim: embedding size.
        n_layers: number of stacked ``EncoderLayer`` blocks.
        n_heads: attention heads per layer.
        pf_dim: inner size of the feed-forward sub-layer.
        dropout_ratio: dropout probability.
        device: device on which the position table and scale are first created.
        max_length: longest source sequence (in tokens) that can be encoded.
    """

    def __init__(self, input_dim, hidden_dim, n_layers, n_heads, pf_dim, dropout_ratio, device, max_length):
        super().__init__()

        self.device = device

        self.tok_embedding = nn.Embedding(input_dim, hidden_dim)
        # The position table and the embedding scale are buffers so they move
        # with the module on .to()/.cuda()/.cpu().  persistent=False keeps them
        # out of state_dict, so checkpoints saved earlier still load.
        self.register_buffer(
            'pos_embedding',
            positional_encoding(max_length, hidden_dim, device),
            persistent=False,
        )

        self.layers = nn.ModuleList([EncoderLayer(hidden_dim, n_heads, pf_dim, dropout_ratio, device) for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout_ratio)

        self.register_buffer(
            'scale',
            torch.sqrt(torch.tensor([float(hidden_dim)], device=device)),
            persistent=False,
        )

    def forward(self, src, src_mask):
        """Encode a batch of source index sequences.

        Args:
            src: [batch_size, src_len] token indices.
            src_mask: mask forwarded to every layer's self-attention.

        Returns:
            Tensor of shape [batch_size, src_len, hidden_dim].

        Raises:
            ValueError: if ``src_len`` exceeds the ``max_length`` given at construction.
        """
        src_len = src.shape[1]
        max_length = self.pos_embedding.shape[1]
        if src_len > max_length:
            # Otherwise the addition below fails with an opaque broadcasting error.
            raise ValueError(f"source length {src_len} exceeds max_length {max_length}")

        pos = self.pos_embedding[:, :src_len, :]

        # Scaled token embeddings plus position encodings
        src = self.dropout((self.tok_embedding(src) * self.scale) + pos)

        for layer in self.layers:
            src = layer(src, src_mask)

        return src

### Decoder Layer
* input-output 차원 동일
* masked self-attention
* cross attention

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, attention over the encoder
    output, then feed-forward; each sub-layer is followed by dropout, a
    residual connection and layer normalisation."""

    def __init__(self, hidden_dim, n_heads, pf_dim, dropout_ratio, device):
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hidden_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hidden_dim)
        self.ff_layer_norm = nn.LayerNorm(hidden_dim)
        self.self_attention = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout_ratio, device)
        self.encoder_attention = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout_ratio, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hidden_dim, pf_dim, dropout_ratio)
        self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Decode one layer.

        Args:
            trg: [batch_size, trg_len, hidden_dim]
            enc_src: [batch_size, src_len, hidden_dim] encoder output.
            trg_mask: mask for the self-attention over ``trg``.
            src_mask: mask for the attention over ``enc_src``.

        Returns:
            ``(trg, attention)``; attention is the encoder-attention weights,
            [batch_size, n_heads, trg_len, src_len].
        """
        # Masked self-attention over the target sequence
        self_attended, _ = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(self_attended))

        # Decoder states act as queries over the encoder output
        cross_attended, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(cross_attended))

        # Position-wise feed-forward sub-layer
        transformed = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(transformed))

        return trg, attention

### Decoder (Decoder Layer x dec_layers)
* Seq2Seq과 마찬가지로 inference 시에는 디코더를 반복적으로 호출해야 함
    * training 시기에서는 한 번에 출력 문장을 구해 학습할 수 있음

In [None]:
class Decoder(nn.Module):
    """Token embedding + sinusoidal positions, ``n_layers`` decoder layers and
    a final projection onto the target vocabulary.

    Args:
        output_dim: target vocabulary size.
        hidden_dim: embedding size.
        n_layers: number of stacked ``DecoderLayer`` blocks.
        n_heads: attention heads per layer.
        pf_dim: inner size of the feed-forward sub-layer.
        dropout_ratio: dropout probability.
        device: device on which the position table and scale are first created.
        max_length: longest target sequence (in tokens) that can be decoded.
    """

    def __init__(self, output_dim, hidden_dim, n_layers, n_heads, pf_dim, dropout_ratio, device, max_length):
        super().__init__()

        self.device = device

        self.tok_embedding = nn.Embedding(output_dim, hidden_dim)
        # The position table and the embedding scale are buffers so they move
        # with the module on .to()/.cuda()/.cpu().  persistent=False keeps them
        # out of state_dict, so checkpoints saved earlier still load.
        self.register_buffer(
            'pos_embedding',
            positional_encoding(max_length, hidden_dim, device),
            persistent=False,
        )

        self.layers = nn.ModuleList([DecoderLayer(hidden_dim, n_heads, pf_dim, dropout_ratio, device) for _ in range(n_layers)])

        self.fc_out = nn.Linear(hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout_ratio)

        self.register_buffer(
            'scale',
            torch.sqrt(torch.tensor([float(hidden_dim)], device=device)),
            persistent=False,
        )

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Decode a batch of target index sequences.

        Args:
            trg: [batch_size, trg_len] token indices.
            enc_src: [batch_size, src_len, hidden_dim] encoder output.
            trg_mask: mask for the decoder self-attention.
            src_mask: mask for the attention over ``enc_src``.

        Returns:
            ``(output, attention)``: output is [batch_size, trg_len, output_dim];
            attention comes from the last decoder layer,
            [batch_size, n_heads, trg_len, src_len].

        Raises:
            ValueError: if ``trg_len`` exceeds the ``max_length`` given at construction.
        """
        trg_len = trg.shape[1]
        max_length = self.pos_embedding.shape[1]
        if trg_len > max_length:
            # Otherwise the addition below fails with an opaque broadcasting error.
            raise ValueError(f"target length {trg_len} exceeds max_length {max_length}")

        # pos: [1, trg_len, hidden_dim], broadcast over the batch
        pos = self.pos_embedding[:, :trg_len, :]

        # trg: [batch_size, trg_len, hidden_dim]
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + pos)

        for layer in self.layers:
            # Both the target mask and the source mask are used in every layer
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)

        output = self.fc_out(trg)

        return output, attention

### Transformer

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder wrapper that builds the attention masks.

    Args:
        encoder: module called as ``encoder(src, src_mask)``.
        decoder: module called as ``decoder(trg, enc_src, trg_mask, src_mask)``.
        src_pad_idx: index of the padding token in the source vocabulary.
        trg_pad_idx: index of the padding token in the target vocabulary.
        device: stored on the instance; masks are created on the device of
            the tensors passed in, not on this one.
    """

    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Return a bool mask that is False at source padding positions.

        src: [batch_size, src_len] -> mask: [batch_size, 1, 1, src_len]
        """
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        """Return a bool mask combining target padding and causality.

        trg: [batch_size, trg_len] -> mask: [batch_size, 1, trg_len, trg_len]
        """
        # Padding mask, one row per sentence, e.g. with two trailing <pad> tokens:
        #   1 1 1 0 0
        # trg_pad_mask: [batch_size, 1, 1, trg_len]
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

        trg_len = trg.shape[1]

        # Causal (lower-triangular) mask so a position only sees earlier positions:
        #   1 0 0 0 0
        #   1 1 0 0 0
        #   1 1 1 0 0
        #   1 1 1 1 0
        #   1 1 1 1 1
        # Built on the input's device: using the device stored at construction
        # fails once the model or batch lives elsewhere.
        # trg_sub_mask: [trg_len, trg_len]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = trg.device)).bool()

        # Combined mask for the example above:
        #   1 0 0 0 0
        #   1 1 0 0 0
        #   1 1 1 0 0
        #   1 1 1 0 0
        #   1 1 1 0 0
        trg_mask = trg_pad_mask & trg_sub_mask

        return trg_mask

    def forward(self, src, trg):
        """Run the encoder and decoder.

        Args:
            src: [batch_size, src_len] source token indices.
            trg: [batch_size, trg_len] target token indices.

        Returns:
            ``(output, attention)`` exactly as returned by the decoder.
        """
        src_mask = self.make_src_mask(src)   # [batch_size, 1, 1, src_len]
        trg_mask = self.make_trg_mask(trg)   # [batch_size, 1, trg_len, trg_len]

        enc_src = self.encoder(src, src_mask)

        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)

        return output, attention

# Train

## Train Preparation

### GPU Setting

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Model Initialization

In [None]:
# Instantiate the encoder and decoder from the hyper-parameters in `args`
enc = Encoder(args.input_dim, args.hidden_dim, args.enc_layers, args.enc_heads, args.enc_pf_dim, args.enc_dropout, device, args.max_len)
dec = Decoder(args.output_dim, args.hidden_dim, args.dec_layers, args.dec_heads, args.dec_pf_dim, args.dec_dropout, device, args.max_len)

# Wrap them in the Transformer and move all parameters to `device`
model = Transformer(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

# Report the model configuration
print(f"Transformer model created with {args.enc_layers} encoder layers and {args.dec_layers} decoder layers.")

Transformer model created with 3 encoder layers and 3 decoder layers.


### Weight Initialization

In [None]:
def initialize_weights(m):
    """Xavier-uniform initialise ``m.weight`` when it has more than one dimension.

    Intended for ``model.apply``.  Modules without a ``weight`` attribute,
    with ``weight`` set to None (which previously raised AttributeError), or
    with a 1-D weight are left untouched.
    """
    weight = getattr(m, 'weight', None)
    if isinstance(weight, torch.Tensor) and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)

# Apply the initialiser to the model and all of its sub-modules.
model.apply(initialize_weights)

Transformer(
  (encoder): Encoder(
    (tok_embedding): Embedding(18393, 256)
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=256, out_features=1024, bias=True)
          (fc_2): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inpla

### Define Scheduler

In [None]:

# Learning-rate scheduler
class NoamScheduler(optim.lr_scheduler._LRScheduler):
    """Warm-up / inverse-square-root learning-rate schedule.

    Each base learning rate is multiplied by
    ``hidden_dim^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)``,
    where ``step`` is the scheduler's internal step counter (at least 1).
    """

    def __init__(self, optimizer, hidden_dim, warmup_steps, last_epoch=-1):
        # Both attributes must exist before the base constructor runs,
        # because it performs an initial step that calls get_lr().
        self.hidden_dim = hidden_dim
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        step = self._step_count if self._step_count >= 1 else 1  # never evaluate at step 0
        decay_term = step ** -0.5
        warmup_term = step * (self.warmup_steps ** -1.5)
        factor = (self.hidden_dim ** -0.5) * min(decay_term, warmup_term)
        return [lr * factor for lr in self.base_lrs]

# Choose the optimizer for the model.
# args.l2 (weight decay) is read defensively: if it has never been set, the
# SGD / RMSprop / Adam branches would otherwise raise AttributeError.
weight_decay = getattr(args, 'l2', 0.0)

if args.optim == 'SGD':
    optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=weight_decay)
elif args.optim == 'RMSprop':
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=weight_decay)
elif args.optim == 'Adam':
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=weight_decay)
elif args.optim == 'AdamW':
    optimizer = optim.AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
else:
    raise ValueError(f"Invalid optimizer choice: {args.optim}")

# Learning-rate scheduler.
# NOTE(review): NoamScheduler multiplies the optimizer's base lr (args.lr) by
# hidden_dim^-0.5 * min(...), so with a small args.lr the effective rate
# becomes very small - confirm this is the intended base value before relying
# on this scheduler.
scheduler = NoamScheduler(
    optimizer=optimizer,               # optimizer created above
    hidden_dim=args.hidden_dim,        # model width
    warmup_steps=args.warmup_steps     # number of warm-up steps
)

## Define Train, Evaluation Function

### Train

In [None]:
# Model training
def _build_optimizer(model, args):
    """Create the optimizer selected by ``args.optim`` for ``model``'s parameters."""
    # args.l2 is optional; a missing value used to raise AttributeError.
    weight_decay = getattr(args, 'l2', 0.0)
    if args.optim == 'SGD':
        return optim.SGD(model.parameters(), lr=args.lr, weight_decay=weight_decay)
    if args.optim == 'RMSprop':
        return optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=weight_decay)
    if args.optim == 'Adam':
        return optim.Adam(model.parameters(), lr=args.lr, weight_decay=weight_decay)
    if args.optim == 'AdamW':
        return torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
    raise ValueError('In-valid optimizer choice')


def train(model, train_loader, args, optimizer=None, scheduler=None):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: callable as ``model(src, trg_in)`` returning ``(logits, attention)``.
        train_loader: iterable of ``(src, trg)`` index batches that supports ``len()``.
        args: namespace providing ``optim``, ``lr``, ``clip`` and optionally ``l2``.
        optimizer: optimizer to use.  When omitted, one is built from ``args``
            on the first call and then reused on later calls for the same
            model.  Building a new optimizer on every call reset the Adam
            moment estimates at the start of each epoch.
        scheduler: optional learning-rate scheduler, stepped once per batch.

    Returns:
        Mean batch loss over the pass (padding positions are ignored).

    Raises:
        ValueError: if ``args.optim`` names an unsupported optimizer.
    """
    # Padding positions in the target do not contribute to the loss
    criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

    if optimizer is None:
        # Reuse the optimizer across epochs; rebuild only when the settings change.
        cache_key = (args.optim, args.lr, getattr(args, 'l2', 0.0))
        cached = getattr(model, '_train_optimizer', None)
        if cached is None or cached[0] != cache_key:
            cached = (cache_key, _build_optimizer(model, args))
            model._train_optimizer = cached
        optimizer = cached[1]

    # Send batches to wherever the model's parameters live
    run_device = next(model.parameters()).device

    model.train()
    epoch_loss = 0

    for src, trg in train_loader:
        src = src.to(run_device)
        trg = trg.to(run_device)

        optimizer.zero_grad()

        # Decoder input: everything except the last token, so it starts at <sos>
        output, _ = model(src, trg[:,:-1])

        # output: [batch_size, trg_len - 1, output_dim]
        output_dim = output.shape[-1]

        output = output.contiguous().view(-1, output_dim)
        # Expected output: everything except the first token (<sos>)
        trg = trg[:,1:].contiguous().view(-1)

        # output: [batch_size * (trg_len - 1), output_dim]
        # trg: [batch_size * (trg_len - 1)]
        loss = criterion(output, trg)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)

        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        epoch_loss += loss.item()

    return epoch_loss / len(train_loader)

### Evaluate

In [None]:
def evaluate(model, valid_loader):
    """Compute the mean batch loss of ``model`` over ``valid_loader``.

    The model is switched to eval mode and the pass runs under
    ``torch.no_grad()``, so no autograd graph is built during validation.

    Args:
        model: callable as ``model(src, trg_in)`` returning ``(logits, attention)``.
        valid_loader: iterable of ``(src, trg)`` index batches that supports ``len()``.

    Returns:
        Mean batch loss (padding positions are ignored).
    """
    # Padding positions in the target do not contribute to the loss
    criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

    # Send batches to wherever the model's parameters live
    run_device = next(model.parameters()).device

    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for src, trg in valid_loader:
            src = src.to(run_device)
            trg = trg.to(run_device)

            # Decoder input: everything except the last token, so it starts at <sos>
            output, _ = model(src, trg[:,:-1])

            # output: [batch_size, trg_len - 1, output_dim]
            output_dim = output.shape[-1]

            output = output.contiguous().view(-1, output_dim)
            # Expected output: everything except the first token (<sos>)
            trg = trg[:,1:].contiguous().view(-1)

            # output: [batch_size * (trg_len - 1), output_dim]
            # trg: [batch_size * (trg_len - 1)]
            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(valid_loader)

In [None]:
import math
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds.

    Returns:
        ``(minutes, seconds)`` as ints, both truncated toward zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

# Train & Validate

In [None]:
# Lowest validation loss seen so far; a checkpoint is written whenever it improves.
best_valid_loss = float('inf')

for epoch in range(args.epoch):
    start_time = time.time()  # wall-clock start of this epoch

    train_loss = train(model, train_loader, args)
    valid_loss = evaluate(model, valid_loader)

    end_time = time.time()  # wall-clock end of this epoch
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer_korean_to_english.pt')

    # Per-epoch report: duration, then loss and perplexity (exp of the loss) per split
    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):.3f}')
    print(f'\tValidation Loss: {valid_loss:.3f} | Validation PPL: {math.exp(valid_loss):.3f}')

Epoch: 01 | Time: 8m 25s
	Train Loss: 6.156 | Train PPL: 471.694
	Validation Loss: 4.909 | Validation PPL: 135.534
Epoch: 02 | Time: 8m 27s
	Train Loss: 4.733 | Train PPL: 113.688
	Validation Loss: 4.378 | Validation PPL: 79.675
Epoch: 03 | Time: 8m 26s
	Train Loss: 4.304 | Train PPL: 73.999
	Validation Loss: 4.111 | Validation PPL: 60.996
Epoch: 04 | Time: 8m 24s
	Train Loss: 4.026 | Train PPL: 56.016
	Validation Loss: 3.937 | Validation PPL: 51.271
Epoch: 05 | Time: 8m 25s
	Train Loss: 3.805 | Train PPL: 44.905
	Validation Loss: 3.827 | Validation PPL: 45.945
Epoch: 06 | Time: 8m 29s
	Train Loss: 3.612 | Train PPL: 37.052
	Validation Loss: 3.735 | Validation PPL: 41.876
Epoch: 07 | Time: 8m 26s
	Train Loss: 3.448 | Train PPL: 31.442
	Validation Loss: 3.652 | Validation PPL: 38.535
Epoch: 08 | Time: 8m 30s
	Train Loss: 3.289 | Train PPL: 26.806
	Validation Loss: 3.607 | Validation PPL: 36.864
Epoch: 09 | Time: 8m 22s
	Train Loss: 3.141 | Train PPL: 23.132
	Validation Loss: 3.567 | Val