# Assignment 2: Korean to English Translation

- Sequence to Sequence 모델의 대표적인 한국어-영어 번역을 [Encoder-decoder](https://github.com/bentrevett/pytorch-seq2seq/blob/main/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb), [Attention]( https://github.com/bentrevett/pytorch-seq2seq/blob/main/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb), 그리고 [Transformers](https://github.com/bentrevett/pytorch-seq2seq/blob/main/legacy/6%20-%20Attention%20is%20All%20You%20Need.ipynb) 기반으로 구현
- Pytorch Seq to Seq 모델을 참고로 하여 한국어와 영어의 형태소분석되고 의존관계로 되어 있는 파일을 프로세싱하여 두 언어의 parallel 데이터 쌍으로 만들고 이를 학습하여 모델별로 Perplexity가 어떻게 달라지는지 살펴 보고, 가장 성능이 좋은 모델을 근간으로 해서 Inference로 한국어 문장을 입력하면 대응되는 영어 번역이 출력될 수 있도록 구현
- Transformer 기반은 이전 토치텍스트 버전으로 되어 있으니 이를 새로운 토치 텍스트 버전으로 바꾸어야 함
- 반드시 다음 세 모델에 대해서 PPL와 BLEU score가  다 체크되어야 함.  Encoder-Decoder, Transformers.
- 세 모델 중에 학습이 제대로 이루어지지 않는 경우, PPL이나 BLEU가 문제가 있는 경우 이를 Fix하려고 시도해 보라.
- **새로운 버전의 TorchText를 사용하여 코랩에서 실행가능하도록**
- 그룹을 허용. 그룹으로 할 경우 2명을 넘지 않아야 하며, 제출 파일에 참여자 이름과 역할을 반드시 명시할 것.
- Inference시에 unk인 단어를 로마자화해서 번역에 나타날 수 있도록 시도해 볼 것(참고할 수 있는 사이트 중 하나 https://github.com/osori/korean-romanizer)

- File이름 : Assignment2_학번_(그룹)이름.ipynb
- Due: 11월 5일 밤 11시 59분




## Data
- 첨부된 ko-en-en.parse.syn은 330,974 한국어 문장에 대응되는 영어문장이 품사와 구문분석이 되어 있는 파일이고 ko-en-ko.parse.syn은 이에 대응되는 한국어 문장이 형태소와 구문분석이 되어 있는 파일이다.

(ROOT (S (NP (NNP Flight) (NNP 007)) (VP (MD will) (VP (VB stay) (PP (IN on) (NP (NP (DT the) (NN ground)) (PP (IN for) (NP (CD one) (NN hour))))))) (. .)))


<id 1>
<sent 1>
1       2       NP      777/SN
2       6       NP_SBJ  항공편/NNG|은/JX
3       4       NP      1/SN|시간/NNG
4       6       NP_AJT  동안/NNG
5       6       NP_AJT  지상/NNG|에/JKB
6       7       VP      머물/VV|게/EC
7       0       VP      되/VV|ㅂ니다/EF|./SF
</sent>
</id>

- 이 두 파일을 프로세싱하여 한-영 병행 데이터로 만들고 이를 학습 및 테스트 데이터로 사용한다.
- Hint: 구조화된 데이터를 프로세싱하기 위해서는 nltk의 모듈을 사용할 수 있다.

- 한국어 형태소 분석된 단위를 어절별로 결합할 수 있고, 분석된 채로 그대로 사용할 수도 있다.
- 두 언어의 어순을 비슷하게 데이터를 만들어 학습할 수도 있고, 번역의 성능을 높이기 위해 다양한 형태로 재구조화 할 수 있다.

## 구현,실험 전체적인 설명 및 분석 

## Your Code

In [None]:
###################### data preprocessing

## file processing - 1. 한국어

import re
from tqdm import tqdm
import pandas as pd


def parse_ko(raw_data):
    # # 각 문장을 담을 리스트
    processed_sentences = []

    # 1. 문장별 분리: <sent #>  분리
    sents = re.findall(r"<sent 1>(.*?)</sent>", raw_data, re.DOTALL)
    # 영어 데이터와 개수 매칭 이슈로 영어 데이터의 규칙을 따라 복수 문장의 경우 분절하지 않고 한 문장으로 이어붙임.

    for i, sent in tqdm(enumerate(sents), total = len(sents)):
        words = []

        # 2. 어절별 분리
        for line in sent.strip().splitlines():
            parts = line.split('\t')
            if len(parts) < 4:
                continue  # 4개 열로 구성된 파트만 보도록

            # 3. 각 어절 내 형태소 surface form만 추출
            morphemes = parts[-1].split('|')  # e.g. 10/SN|장/NNG -> 10/SN, 장/NNG
            word = [x.split('/')[0] for x in morphemes]
            words.append(' '.join(word))

        # 4. 완성된 문장을 문자열로 변환하여 추가
        sent_joined = ' '.join(words)
        processed_sentences.append(sent_joined)
        #print(f'문장 {i}: {sent_joined}')

    return processed_sentences


f1 = open('/content/ko-en.ko.parse','r')
ko_raw = f1.read()

ko_parsed = parse_ko(ko_raw)
ko_parsed = [x for x in ko_parsed if x.strip() != '']
print(f'\n\n한국어 데이터 문장 개수: {len(ko_parsed)}')
print(f'한국어 문장 예시: {ko_parsed[-1]}')


## file processing - 2. 영어
from nltk import Tree

f2 = open('/content/ko-en.en.parse.syn','r')
en_raw = f2.readlines() # 영어 데이터: 1문장마다 1 줄 기재 -> 줄 단위로 받도록
print(f'영어 raw 데이터 예시:\n{en_raw[0]}')

def parse_en(raw_line):
  words = Tree.fromstring(raw_line).leaves()
  return ' '.join(words)


en_parsed = [parse_en(line) for line in tqdm(en_raw, total = len(en_raw))]
print(f'\n\n영어 데이터 문장 개수: {len(en_parsed)}')
print(f'영어 문장 예시: {en_parsed[-1]}')

ko_en_sents = pd.DataFrame({'ko': ko_parsed, 'en': en_parsed})
ko_en_sents.to_csv('ko_en_parallel.csv', index = False, encoding = 'utf-8')


In [1]:
#################################################make dataloader(여기까지는 공통)########################################

import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate
import torchtext


seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

dataset = datasets.load_dataset("csv", data_files = "data_ass2/ko_en_parallel.csv")
train_valTest = dataset['train'].train_test_split(test_size=0.2)
val_test = train_valTest["test"].train_test_split(test_size=0.5)
dataset = {
    "train": train_valTest["train"],
    "validation": val_test["train"],
    "test": val_test["test"],
}

train_data, valid_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)


en_nlp = spacy.load("en_core_web_sm")
ko_nlp = spacy.load("ko_core_news_sm")


def tokenize_example(example, en_nlp, ko_nlp, max_length, lower, sos_token, eos_token):
    en_tokens = [token.text for token in en_nlp.tokenizer(example["en"])][:max_length]
    ko_tokens = [token.text for token in ko_nlp.tokenizer(example["ko"])][:max_length]
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        ko_tokens = [token.lower() for token in ko_tokens]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    ko_tokens = [sos_token] + ko_tokens + [eos_token]
    return {"en_tokens": en_tokens, "ko_tokens": ko_tokens}
    
max_length = 128
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {
    "en_nlp": en_nlp,
    "ko_nlp": ko_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}

train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)


from torchtext.vocab import build_vocab_from_iterator

min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

# Define your special tokens

# Initialize tokenizers for English and German (or your specific languages)

# Function to yield tokens from your data

# Assuming train_data["en_tokens"] and train_data["ko_tokens"] are lists of sentences
en_vocab = build_vocab_from_iterator(
    train_data["en_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
    special_first=True  # Place special tokens at the beginning of the vocab
)

ko_vocab = build_vocab_from_iterator(
    train_data["ko_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
    special_first=True  # Place special tokens at the beginning of the vocab
)

# Optional: Set default index for unknown tokens
en_vocab.set_default_index(en_vocab[unk_token])
ko_vocab.set_default_index(ko_vocab[unk_token])

fn_kwargs = {
    "en_vocab": en_vocab,
    "ko_vocab": ko_vocab,
}

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

en_vocab.set_default_index(unk_index)
ko_vocab.set_default_index(unk_index)

def numericalize_example(example, en_vocab, ko_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    ko_ids = ko_vocab.lookup_indices(example["ko_tokens"])
    return {"en_ids": en_ids, "ko_ids": ko_ids}

train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs, remove_columns=['en_tokens', 'ko_tokens'])
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs, remove_columns=['en_tokens', 'ko_tokens'])
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs, remove_columns=['en_tokens', 'ko_tokens'])
data_type = "torch"
format_columns = ["en_ids", "ko_ids"]

train_data = train_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_data = valid_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

test_data = test_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_ko_ids = [example["ko_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_ko_ids = nn.utils.rnn.pad_sequence(batch_ko_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids,
            "ko_ids": batch_ko_ids,
        }
        return batch

    return collate_fn
    
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

batch_size = 512
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)


Map:   0%|          | 0/33098 [00:00<?, ? examples/s]



Map:   0%|          | 0/33098 [00:00<?, ? examples/s]

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [2]:
############################################################ seq2seq

class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src length, batch size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src length, batch size, embedding dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs = [src length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # outputs are always from the top hidden layer
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # input = [batch size]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # n directions in the decoder will both always be 1, therefore:
        # hidden = [n layers, batch size, hidden dim]
        # context = [n layers, batch size, hidden dim]
        input = input.unsqueeze(0)
        # input = [1, batch size]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch size, embedding dim]
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # output = [seq length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # seq length and n directions will always be 1 in this decoder, therefore:
        # output = [1, batch size, hidden dim]
        # hidden = [n layers, batch size, hidden dim]
        # cell = [n layers, batch size, hidden dim]
        prediction = self.fc_out(output.squeeze(0))
        # prediction = [batch size, output dim]
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]
        # input = [batch size]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden and previous cell states
            # receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            # output = [batch size, output dim]
            # hidden = [n layers, batch size, hidden dim]
            # cell = [n layers, batch size, hidden dim]
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
            # input = [batch size]
        return outputs


# train seq2seq

input_dim = len(ko_vocab)
output_dim = len(en_vocab)
# encoder_embedding_dim = 256
# decoder_embedding_dim = 256
# hidden_dim = 512
encoder_embedding_dim = 128
decoder_embedding_dim = 128
hidden_dim = 128
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device).to(device)

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)


model.apply(init_weights)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["ko_ids"].to(device)
        trg = batch["en_ids"].to(device)
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["ko_ids"].to(device)
            trg = batch["en_ids"].to(device)
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            output = model(src, trg, 0)  # turn off teacher forcing
            # output = [trg length, batch size, trg vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(trg length - 1) * batch size, trg vocab size]
            trg = trg[1:].view(-1)
            # trg = [(trg length - 1) * batch size]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)
n_epochs = 1
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

# for epoch in tqdm.tqdm(range(n_epochs)):
#     train_loss = train_fn(
#         model,
#         train_data_loader,
#         optimizer,
#         criterion,
#         clip,
#         teacher_forcing_ratio,
#         device,
#     )
#     valid_loss = evaluate_fn(
#         model,
#         valid_data_loader,
#         criterion,
#         device,
#     )
#     if valid_loss < best_valid_loss:
#         best_valid_loss = valid_loss
#         torch.save(model.state_dict(), "tut1-model.pt")
#     print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
#     print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")





cuda
The model has 6,057,827 trainable parameters


In [3]:
# seq2seq: bleu and ppl

model.load_state_dict(torch.load("tut1-model.pt"))
test_loss = evaluate_fn(model, test_data_loader, criterion, device)
print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

def translate_sentence(
    sentence,
    model,
    en_nlp,
    ko_nlp,
    en_vocab,
    ko_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            tokens = [token.text for token in ko_nlp.tokenizer(sentence)]
        else:
            tokens = [token for token in sentence]
        if lower:
            tokens = [token.lower() for token in tokens]
        tokens = [sos_token] + tokens + [eos_token]
        ids = ko_vocab.lookup_indices(tokens)
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        hidden, cell = model.encoder(tensor)
        inputs = en_vocab.lookup_indices([sos_token])
        for _ in range(max_output_length):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, cell = model.decoder(inputs_tensor, hidden, cell)
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == en_vocab[eos_token]:
                break
        tokens = en_vocab.lookup_tokens(inputs)
    return tokens


translations = [
    translate_sentence(
        example["ko"],
        model,
        en_nlp,
        ko_nlp,
        en_vocab,
        ko_vocab,
        lower,
        sos_token,
        eos_token,
        device,
    )
    for example in tqdm.tqdm(test_data)
]
bleu = evaluate.load("bleu")
predictions = [" ".join(translation[1:-1]) for translation in translations]
references = [[example["en"]] for example in test_data]


def get_tokenizer_fn(nlp, lower):
    def tokenizer_fn(s):
        tokens = [token.text for token in nlp.tokenizer(s)]
        if lower:
            tokens = [token.lower() for token in tokens]
        return tokens

    return tokenizer_fn
tokenizer_fn = get_tokenizer_fn(en_nlp, lower)
tokenizer_fn(predictions[0]), tokenizer_fn(references[0][0])
results = bleu.compute(
    predictions=predictions, references=references, tokenizer=tokenizer_fn
)

print(results)

| Test Loss: 5.026 | Test PPL: 152.307 |


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 33098/33098 [01:18<00:00, 422.47it/s]


{'bleu': 0.0, 'precisions': [0.34532237769658747, 0.002499291698708868, 0.0, 0.0], 'brevity_penalty': 0.3408361518621992, 'length_ratio': 0.48161358035958746, 'translation_length': 131926, 'reference_length': 273925}


In [None]:
#####################################################attention



# 2. 모델 구축 및 훈련
# **2-2. Attention**
# Attention Encoder

class AttentionEncoder(nn.Module):
    def __init__(
        self, input_dim, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, dropout
    ):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, encoder_hidden_dim, bidirectional=True)
        self.fc = nn.Linear(encoder_hidden_dim * 2, decoder_hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)

        # hidden [-2, :, : ] is the last of the forwards RNN
        # hidden [-1, :, : ] is the last of the backwards RNN
        hidden = torch.tanh(
            self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        )

        # outputs = [src length, batch size, encoder hidden dim * 2]
        # hidden = [batch size, decoder hidden dim]

        return outputs, hidden
class Attention(nn.Module):
    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        super().__init__()
        self.attn_fc = nn.Linear(
            (encoder_hidden_dim * 2) + decoder_hidden_dim, decoder_hidden_dim
        )
        self.v_fc = nn.Linear(decoder_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.shape[1]
        src_length = encoder_outputs.shape[0]
        # repeat decoder hidden state src_length times
        hidden = hidden.unsqueeze(1).repeat(1, src_length, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn_fc(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v_fc(energy).squeeze(2)

        return torch.softmax(attention, dim=1)
# Attention Decoder

class AttentionDecoder(nn.Module):
    def __init__(
        self,
        output_dim,
        embedding_dim,
        encoder_hidden_dim,
        decoder_hidden_dim,
        dropout,
        attention,
    ):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.GRU((encoder_hidden_dim * 2) + embedding_dim, decoder_hidden_dim)
        self.fc_out = nn.Linear(
            (encoder_hidden_dim * 2) + decoder_hidden_dim + embedding_dim, output_dim
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        # seq len, n layers and n directions will always be 1 in this decoder, therefore:
        # this also means that output == hidden
        assert (output == hidden).all()
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))

        # prediction = [batch size, output dim]
        return prediction, hidden.squeeze(0), a.squeeze(1)
### Seq2seq + Attention 모델 class
# Seq2seq + Attention 연결한 모델
class AttentionSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio):
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        batch_size = src.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # encoder_outputs is all hidden states of the input sequence, back and forwards
        # hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]

        for t in range(1, trg_length):
            # insert input token embedding, previous hidden state and all encoder hidden states
            # receive output tensor (predictions) and new hidden state
            output, hidden, _ = self.decoder(input, hidden, encoder_outputs)

            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
        return outputs
# Seq2seq + Attention 모델 구축

input_dim = len(ko_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 128
decoder_embedding_dim = 128
encoder_hidden_dim = 128
decoder_hidden_dim = 128
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

attention = Attention(encoder_hidden_dim, decoder_hidden_dim)

encoder = AttentionEncoder(
    input_dim,
    encoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    encoder_dropout,
)

decoder = AttentionDecoder(
    output_dim,
    decoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    decoder_dropout,
    attention,
)

model = AttentionSeq2Seq(encoder, decoder, device).to(device)
# 모델 가중치 초기화

def init_weights(m):
    for name, param in m.named_parameters():
        if "weight" in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)


model.apply(init_weights)
# 훈련되는 parameter 수 출력(모든 parameter가 훈련되지 않을 수 있음)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")
### Seq2seq + Attention 훈련

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(tqdm(data_loader)):
        src = batch["ko_ids"].to(device)
        trg = batch["en_ids"].to(device)

        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(tqdm.tqdm(data_loader)):
            src = batch["ko_ids"].to(device)
            trg = batch["en_ids"].to(device)

            output = model(src, trg, 0)  # turn off teacher forcing
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)
n_epochs = 1
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")
# for epoch in tqdm(range(n_epochs)):
#     train_loss = train_fn(
#         model,
#         train_data_loader,
#         optimizer,
#         criterion,
#         clip,
#         teacher_forcing_ratio,
#         device,
#     )
#     valid_loss = evaluate_fn(
#         model,
#         valid_data_loader,
#         criterion,
#         device,
#     )
#     if valid_loss < best_valid_loss:
#         best_valid_loss = valid_loss
#         torch.save(model.state_dict(), "model_attention.pt")
#     print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
#     print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

##########################################attention: evaluate




In [5]:
##########################################attention: evaluate


model.load_state_dict(torch.load("model_attention.pt"))

test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")




def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    
    model.eval()
        
    if isinstance(sentence, str):
        tokens = [token.lower() for token in sentence]
    else:
        tokens = [token.lower() for token in sentence]

    # tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    tokens = [sos_token] + tokens + [eos_token]
    # src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_indexes = ko_vocab.lookup_indices(tokens)

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device).transpose(0,1)
    
    # src_mask = model.make_src_mask(src_tensor)
    
    with torch.no_grad():
        # enc_src = model.encoder(src_tensor, src_mask)
        encoder_outputs, hidden = model.encoder(src_tensor)

    # trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
    trg_indexes = en_vocab.lookup_indices([sos_token])

    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes[-1]).to(device)
        # trg_mask = model.make_trg_mask(trg_tensor)
        with torch.no_grad():
            # output, attention, = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
            output, attention, _ = model.decoder(trg_tensor,hidden, encoder_outputs, )
        
        pred_token = output.argmax(-1).item()
        
        trg_indexes.append(pred_token)

        # if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
        if pred_token == eos_token:

            break
    
    # trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    trg_tokens = en_vocab.lookup_tokens(trg_indexes)
    
    return trg_tokens[1:], attention


from torchtext.data.metrics import bleu_score

def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):
    
    trgs = []
    pred_trgs = []
    
    for e, datum in tqdm.tqdm(enumerate(data)):
        
        src = datum['ko']
        trg = datum['en']
        
        pred_trg, _ = translate_sentence(src, ko_vocab, en_vocab, model, device, max_len)
        
        #cut off <eos> token
        pred_trg = pred_trg[:-1]
        
        pred_trgs.append(pred_trg)
        trgs.append([trg])

        if e > 100:
            break

        
    return bleu_score(pred_trgs, trgs)

bleu_score = calculate_bleu(test_data, ko_vocab, en_vocab, model, device)

print(f'BLEU score = {bleu_score*100:.2f}')


100%|███████████| 65/65 [00:33<00:00,  1.94it/s]


| Test Loss: 4.963 | Test PPL: 142.971 |


0it [00:00, ?it/s]


RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 2 but got size 1 for tensor number 1 in the list.

In [8]:


class Encoder(nn.Module):
    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device,
                 max_length = max_length):
        super().__init__()

        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(input_dim, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len]
        #src_mask = [batch size, 1, 1, src len]
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        #pos = [batch size, src len]

        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, hid dim]
            
        return src

class EncoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len, hid dim]
        #src_mask = [batch size, 1, 1, src len] 
                
        #self attention
        _src, _ = self.self_attention(src, src, src, src_mask)
        
        #dropout, residual connection and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        #positionwise feedforward
        _src = self.positionwise_feedforward(src)
        
        #dropout, residual and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        return src

class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
        batch_size = query.shape[0]

        # Q = self.fc_q(query)

        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        
        #x = [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.hid_dim)
        
        #x = [batch size, query len, hid dim]
        
        x = self.fc_o(x)
        
        #x = [batch size, query len, hid dim]
        
        return x, attention

class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()
        
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
                            
        #pos = [batch size, trg len]
            
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
                
        #trg = [batch size, trg len, hid dim]
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        
        #output = [batch size, trg len, output dim]
            
        return output, attention


class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
        
        #self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
            
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        
        #dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
                    
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention

class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        
        #src = [batch size, src len]
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        #src_mask = [batch size, 1, 1, src len]

        return src_mask
    
    def make_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2).to(device)
        
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask

    def forward(self, src, trg):
        
        #src = [batch size, src len]
        #trg = [batch size, trg len]
                
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention

# INPUT_DIM = len(SRC.vocab)
# OUTPUT_DIM = len(TRG.vocab)
INPUT_DIM = len(ko_vocab)
OUTPUT_DIM = len(en_vocab)
HID_DIM = 512
ENC_LAYERS = 4
DEC_LAYERS = 4
ENC_HEADS = 4
DEC_HEADS = 4
ENC_PF_DIM = 1024
DEC_PF_DIM = 1024
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1


enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device).to(device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device).to(device)

# SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
# TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
SRC_PAD_IDX = pad_index # ko_vocab[pad_index]
TRG_PAD_IDX = pad_index # en_vocab[pad_index]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

model.apply(initialize_weights)

LEARNING_RATE = 0.0005


optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)


def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        src = batch['ko_ids'].transpose(0,1).to(device)
        trg = batch['en_ids'].transpose(0,1).to(device)

        optimizer.zero_grad()
        
        output, _ = model(src, trg[:,:-1])
                
        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]
            
        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
                
        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]
            
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch['ko_ids'].transpose(0,1).to(device)
            trg = batch['en_ids'].transpose(0,1).to(device)

            output, _ = model(src, trg[:,:-1])
            
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = 1
CLIP = 1

best_valid_loss = float('inf')

# for epoch in tqdm.tqdm(range(N_EPOCHS)):
    
#     start_time = time.time()
    
#     # train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
#     # valid_loss = evaluate(model, valid_iterator, criterion)
#     train_loss = train(model, train_data_loader, optimizer, criterion, CLIP)
#     valid_loss = evaluate(model, valid_data_loader, criterion)
    
#     end_time = time.time()
    
#     epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
#     if valid_loss < best_valid_loss:
#         best_valid_loss = valid_loss
#         torch.save(model.state_dict(), 'tut6-model.pt')
    
#     print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
#     print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
#     print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')



The model has 51,356,899 trainable parameters


In [9]:
import math

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)
model.load_state_dict(torch.load('tut6-model.pt'))

test_loss = evaluate(model, test_data_loader, criterion)


print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')


def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    
    model.eval()
        
    if isinstance(sentence, str):
        tokens = [token.lower() for token in sentence]
    else:
        tokens = [token.lower() for token in sentence]

    # tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    tokens = [sos_token] + tokens + [eos_token]
    # src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_indexes = ko_vocab.lookup_indices(tokens)

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    
    src_mask = model.make_src_mask(src_tensor)
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    # trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
    trg_indexes = en_vocab.lookup_indices([sos_token])

    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        trg_mask = model.make_trg_mask(trg_tensor)
        
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)

        # if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
        if pred_token == eos_token:

            break
    
    # trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    trg_tokens = en_vocab.lookup_tokens(trg_indexes)
    
    return trg_tokens[1:], attention




from torchtext.data.metrics import bleu_score

def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):
    
    trgs = []
    pred_trgs = []
    
    for e, datum in tqdm.tqdm(enumerate(data)):
        
        src = datum['ko']
        trg = datum['en']
        
        pred_trg, _ = translate_sentence(src, ko_vocab, en_vocab, model, device, max_len)
        
        #cut off <eos> token
        pred_trg = pred_trg[:-1]
        
        pred_trgs.append(pred_trg)
        trgs.append([trg])

        if e > 100:
            break

        
    return bleu_score(pred_trgs, trgs)

bleu_score = calculate_bleu(test_data, ko_vocab, en_vocab, model, device)

print(f'BLEU score = {bleu_score*100:.2f}')




| Test Loss: 1.767 | Test PPL:   5.856 |


101it [00:18,  5.36it/s]


BLEU score = 0.00


## Inference

In [6]:


sen_list = [
'모든 액체 , 젤 , 에어로졸 등 은 1 커트 짜리 여닫이 투명 봉지 하나 에 넣 어야 하 ㅂ니다 .',
'미안 하 지만 , 뒷쪽 아이 들 의 떠들 는 소리 가 커 어서 , 광화문 으로 가 아고 싶 은데 표 를 바꾸 어 주 시 겠 어요 ?',
'은행 이 너무 멀 어서 안 되 겠 네요 . 현찰 이 필요 하면 돈 을 훔치 시 어요',
'아무래도 분실 하 ㄴ 것 같 으니 분실 신고서 를 작성 하 아야 하 겠 습니다 . 사무실 로 같이 가 시 ㄹ 까요 ?',
'부산 에서 코로나 확진자 가 급증 하 아서 병상 이 부족하 아 지자  확진자 20명 을 대구 로 이송하 ㄴ다 .',
'변기 가 막히 었 습니다 .',
'그 바지 좀 보이 어 주 시 ㅂ시오 . 이거 얼마 에 사 ㄹ 수 있 는 것 이 ㅂ니까 ?',
'비 가 오 아서 백화점 으로 가지 말 고 두타 로 가 았 으면 좋 겠 습니다 .',
'속 이 안 좋 을 때 는 죽 이나 미음 으로 아침 을 대신 하 ㅂ니다',
'문 대통령 은 집단 이익 에서 벗어 나 아 라고 말 하 었 다 .',
'이것 좀 먹어 보 ㄹ 몇 일 간 의 시간 을 주 시 어요 .',
'이날 개미군단 은 외인 의 물량 을 모두 받 아 내 었 다 .',
'통합 우승 의 목표 를 달성하 ㄴ NC 다이노스 나성범 이 메이저리그 진출 이라는 또 다른 꿈 을 향하 어 나아가 ㄴ다 .',
'이번 구조 조정 이 제품 을 효과 적 으로 개발 하 고 판매 하 기 위하 ㄴ 회사 의 능력 강화 조처 이 ㅁ 을 이해 하 아 주 시 리라 생각 하 ㅂ니다 .',
'요즘 이 프로그램 녹화 하 며 많은 걸 느끼 ㄴ다 ']


In [10]:
translate_sentence(sen_list, ko_vocab, en_vocab, model, device)[0]

['<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>']