In [1]:
import torch
import torchtext

print(torch.__version__) # 1.8.0
print(torchtext.__version__) # 0.9.0

1.8.0
0.9.0


## 참고 자료
- [튜토리얼](https://pytorch.org/tutorials/beginner/torchtext_translation_tutorial.html)

- [torchtext 0.9.0 docs](https://pytorch.org/text/0.9.0/vocab.html#vocab)

- [나동빈 코드](https://github.com/ndb796/Deep-Learning-Paper-Review-and-Practice/blob/master/code_practices/Sequence_to_Sequence_with_LSTM_Tutorial.ipynb)

In [2]:
!pip uninstall torchtext

Found existing installation: torchtext 0.13.0
Uninstalling torchtext-0.13.0:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/torchtext-0.13.0.dist-info/*
    /usr/local/lib/python3.7/dist-packages/torchtext/*
Proceed (y/n)? y
  Successfully uninstalled torchtext-0.13.0


In [3]:
# 재설치 후 런타임 다시 시작 필요!
!pip install torchtext==0.9

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.9
  Downloading torchtext-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 7.0 MB/s 
[?25hCollecting torch==1.8.0
  Downloading torch-1.8.0-cp37-cp37m-manylinux1_x86_64.whl (735.5 MB)
[K     |████████████████████████████████| 735.5 MB 13 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.12.0+cu113
    Uninstalling torch-1.12.0+cu113:
      Successfully uninstalled torch-1.12.0+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.0+cu113 requires torch==1.12.0, but you have torch 1.8.0 which is incompatible.
torchaudio 0.12.0+cu113 requires torch==1.12.0, but you have torch 1.8.0 which is incomp

In [2]:
# 구글드라이브 마운트

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


<img width="631" alt="image" src="https://user-images.githubusercontent.com/33839093/181196619-8e79f4c9-6e97-44c5-8114-14cf311b73fd.png">


In [60]:
import torch
import torch.nn as nn
import torch.optim as optim
# from torchtext.legacy.datasets import Multi30k
# from torchtext.legacy.data import Field, BucketIterator, Iterator

import spacy
import numpy as np

import random
import math
import time

### 1. 데이터셋 다운로드

In [61]:
# 난수 생성
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [5]:
# 문장을 토큰화하는 모듈 설치
# spacy 라이브러리: 문장의 토큰화, 태깅 등 전처리 기능을 위한 라이브러리

!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 10.8 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fr-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.4.0/fr_core_news_sm-3.4.0-py3-none-any.whl (16.3 MB)
[K     |████████████████████████████████| 16.3 MB 8.4 MB/s 
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [101]:
import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
import io

# url에서 raw text 다운로드
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.fr.gz', 'train.en.gz')
val_urls = ('val.fr.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.fr.gz', 'test_2016_flickr.en.gz')

# 각 데이터셋의 경로가 들어있음
train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

# spacy tokenizer 로드
fr_tokenizer = get_tokenizer('spacy', language='fr')
en_tokenizer = get_tokenizer('spacy', language='en')




In [102]:
print('train_filepaths:', train_filepaths)

cnt = 0
with io.open(train_filepaths[1], encoding="utf8") as f:
    for string_ in f:
        if cnt == 5:
            break
        print(string_)
        
        cnt += 1

train_filepaths: ['/content/.data/train.fr', '/content/.data/train.en']
Two young, White males are outside near many bushes.

Several men in hard hats are operating a giant pulley system.

A little girl climbing into a wooden playhouse.

A man in a blue shirt is standing on a ladder cleaning a window.

Two men are at the stove preparing food.



## 2. 데이터 전처리

In [103]:
# 각 단어에 인덱스를 부여, 2번 이상 등장한 단어를 vocab에 포함
def build_vocab(filepath, tokenizer):
  counter = Counter()
  with io.open(filepath, encoding="utf8") as f:
    for string_ in f:
      counter.update(tokenizer(string_))
  return Vocab(counter, min_freq=2, specials=['<unk>', '<pad>', '<sos>', '<eos>'])

fr_vocab = build_vocab(train_filepaths[0], fr_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

In [104]:
# vocab 생성 결과가 조금 이상함

print(en_vocab.stoi["<unk>"]) # 0
print(en_vocab.stoi["<pad>"]) # 1
print(en_vocab.stoi["<sos>"]) # 0
print(en_vocab.stoi["sos>"]) # 0
print(en_vocab.stoi["<eos>"]) # 3
print()
print(en_vocab.itos[0]) # <unk>
print(en_vocab.itos[1]) # <pad>
print(en_vocab.itos[2]) # <sos>
print(en_vocab.itos[3]) # <eos>

0
1
0
0
3

<unk>
<pad>
<sos>
<eos>


In [138]:
# 각 토큰에 해당하는 인덱스를 찾아 텐서로 반환
def data_process(filepaths):
  raw_fr_iter = iter(io.open(filepaths[0], encoding="utf8"))
  raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
  data = []

  for (raw_fr, raw_en) in zip(raw_fr_iter, raw_en_iter):
    fr_tensor_ = torch.tensor([fr_vocab[token] for token in fr_tokenizer(raw_fr)],
                            dtype=torch.long)
    # fr_tensors_ = [fr_vocab[token] for token in fr_tokenizer(raw_fr)]
    # fr_tensor_rev = torch.tensor(fr_tensors_[::-1], dtype=torch.long)

    en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
                            dtype=torch.long)

    data.append((fr_tensor_, en_tensor_))
    # data.append((fr_tensor_rev, en_tensor_))
  return data

train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(val_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [139]:
# 첫 번째 train data 출력해보기 [프랑스어, 영어]

def idx2sen(data, lan):
    res = []
    if lan == "fr":
        for idx in data[0]:
            res.append(fr_vocab.itos[idx])

    elif lan == "en":
        for idx in data[1]:
            res.append(en_vocab.itos[idx])
    
    return " ".join(res)


fr_sen = idx2sen(train_data[0], lan="fr")
en_sen = idx2sen(train_data[0], lan="en")
print(train_data[0][0])
print("프랑스어: ", fr_sen)

print(train_data[0][1])
print("영어: ", en_sen)

tensor([  26,   85,   34,  225,   31,   91,   75,    9, 1202,    5,    4])
프랑스어:  Deux jeunes hommes blancs sont dehors près de buissons . 

tensor([  20,   26,   16, 1170,  809,   18,   58,   85,  337, 1340,    6,    5])
영어:  Two young , White males are outside near many bushes . 



In [140]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', device)

BATCH_SIZE = 128
PAD_IDX = fr_vocab['<pad>']
SOS_IDX = fr_vocab['<sos>']
EOS_IDX = fr_vocab['<eos>']

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# 배치사이즈에 맞게 패딩
def generate_batch(data_batch):
  fr_batch, en_batch = [], []
  for (fr_item, en_item) in data_batch:
    # <sos> + 문장 + <eos> 형태로 만들기
    fr_batch.append(torch.cat([torch.tensor([SOS_IDX]), fr_item, torch.tensor([EOS_IDX])], dim=0))
    en_batch.append(torch.cat([torch.tensor([SOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))
  fr_batch = pad_sequence(fr_batch, padding_value=PAD_IDX)  # 패딩
  en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
  return fr_batch, en_batch

# 데이터 로더
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=True, collate_fn=generate_batch)


device: cuda


In [141]:
PAD_IDX = fr_vocab['<pad>']
SOS_IDX = fr_vocab['<sos>']

print(torch.tensor([SOS_IDX]).shape)

torch.Size([1])


In [142]:
import torch

batch_size, N, K = 3, 10, 256

# x = torch.rand(batch_size, N, K) # [M, N, K]
# y = torch.rand(batch_size, N, K) # [M, N, K]
x = torch.tensor([[1,2,3], [4,5,6]])
y = torch.tensor([[7,8,9], [10,11,12]])

print(x.shape)
print(y.shape)
print()

output0 = torch.cat([x,y], dim=0)
output1 = torch.cat([x,y], dim=1) #[M, N+N, K]
# output2 = torch.cat([x,y], dim=2) 
print(output0.shape)
print(output1.shape)
# print(output2.shape)

torch.Size([2, 3])
torch.Size([2, 3])

torch.Size([4, 3])
torch.Size([2, 6])


In [143]:
from torch.nn.utils.rnn import pad_sequence

a = torch.ones(2, 3)
b = torch.ones(4, 3)
res = pad_sequence([a, b], padding_value=0)

# (?, 3)인 텐서 2개에 패딩을 수행하면 (4, 2, 3)
# (배치에서 가장 긴 시퀀스 길이, 시퀀스 개수, 3)
# 시퀀스가 가장 긴(패딩이 가장 적은) 순서대로 반환

print(res.size())
print(res)

torch.Size([4, 2, 3])
tensor([[[1., 1., 1.],
         [1., 1., 1.]],

        [[1., 1., 1.],
         [1., 1., 1.]],

        [[0., 0., 0.],
         [1., 1., 1.]],

        [[0., 0., 0.],
         [1., 1., 1.]]])


## 3. 모델 정의

In [145]:
import random
from typing import Tuple

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor


# 인코더(Encoder) 아키텍처 정의
class Encoder(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, n_layers, dropout_ratio):
        super().__init__()

        # 임베딩(embedding)은 원-핫 인코딩(one-hot encoding)을 특정 차원의 임베딩으로 매핑하는 레이어
        # input_dim은 전체 단어의 개수
        # embed_dim은 각 단어를 몇차원의 벡터로 만들 것인지를 의미
        self.embedding = nn.Embedding(input_dim, embed_dim)

        # LSTM layer는 hidden state와 cell state를 반환
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.rnn = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=dropout_ratio) # 여기서 dropout을 한다고?
        
        # dropout
        self.dropout = nn.Dropout(dropout_ratio)

    # 인코더는 소스 문장을 입력으로 받아 문맥 벡터(context vector)를 반환        
    def forward(self, src):
        # src: [단어 개수, 배치 크기]: 각 단어의 인덱스(index) 정보
        embedded = self.dropout(self.embedding(src))
        # embedded: [단어 개수, 배치 크기, 임베딩 차원]

        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs: [단어 개수, 배치 크기, 히든 차원]: 현재 단어의 출력 정보
        # hidden: [레이어 개수, 배치 크기, 히든 차원]: 현재까지의 모든 단어의 정보
        # cell: [레이어 개수, 배치 크기, 히든 차원]: 현재까지의 모든 단어의 정보

        # context vector 반환
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim, n_layers, dropout_ratio):
        super().__init__()

        # embedding layer
        self.embedding = nn.Embedding(output_dim, embed_dim)

        # LSTM layer
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.rnn = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=dropout_ratio)
        
        # FC 레이어 (인코더와 다른 부분)
        # hidden layer -> 번역 결과를 출력하기 위해 fc layer가 필요
        self.output_dim = output_dim
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        
        # 드롭아웃(dropout)
        self.dropout = nn.Dropout(dropout_ratio)

    # 디코더는 현재까지 출력된 문장에 대한 정보를 입력으로 받아 타겟 문장을 반환     
    # 단어를 하나 입력 받아 예측한 단어 하나씩 출력
    def forward(self, input, hidden, cell):
        # input: [배치 크기]: 단어의 개수는 항상 1개이도록 구현
        input = input.unsqueeze(0)
        # input: [단어 개수 = 1, 배치 크기]
        
        embedded = self.dropout(self.embedding(input)) # 현재 input에 임베딩을 거친 결과
        # embedded: [단어 개수, 배치 크기, 임베딩 차원]

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell)) # 현재 단어 & 현재까지 정보 모두
        # output: [단어 개수 = 1, 배치 크기, 히든 차원]: 현재 단어의 출력 정보

        # 단어 개수는 어차피 1개이므로 차원 제거
        prediction = self.fc_out(output.squeeze(0))
        # prediction = [배치 크기, 출력 차원]
        
        # (현재 출력 단어, 현재까지의 모든 단어의 정보, 현재까지의 모든 단어의 정보)
        return prediction, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    # 학습할 때는 완전한 형태의 소스 문장, 타겟 문장, teacher_forcing_ratio를 넣기
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src: [단어 개수, 배치 크기]
        # trg: [단어 개수, 배치 크기]
        # 먼저 인코더를 거쳐 문맥 벡터(context vector)를 추출
        hidden, cell = self.encoder(src)

        # 디코더(decoder)의 최종 결과를 담을 텐서 객체 만들기
        trg_len = trg.shape[0] # 단어 개수
        batch_size = trg.shape[1] # 배치 크기
        trg_vocab_size = self.decoder.output_dim # 출력 차원
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        # 첫 번째 입력은 항상 <sos> 토큰
        input = trg[0, :]

        # 타겟 단어의 개수만큼 반복하여 디코더에 포워딩(forwarding)
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)

            outputs[t] = output # FC를 거쳐서 나온 현재의 출력 단어 정보
            top1 = output.argmax(1) # 가장 확률이 높은 단어의 인덱스 추출

            # teacher_forcing_ratio: 학습할 때 정답(ground-truth)을 사용하는 비율
            teacher_force = random.random() < teacher_forcing_ratio
            input = trg[t] if teacher_force else top1 # 현재의 출력 결과를 다음 입력으로 넣기
        
        return outputs

In [146]:
## training
INPUT_DIM = len(fr_vocab)
OUTPUT_DIM = len(en_vocab)
# ENC_EMB_DIM = 256
# DEC_EMB_DIM = 256
# HID_DIM = 512
# N_LAYERS = 2
# DROPOUT = 0.5

ENC_EMB_DIM = 1000
DEC_EMB_DIM = 1000
HID_DIM = 1024
N_LAYERS = 4
DROPOUT = 0.5

# encoder, decoder 객체 선언
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)

# Seq2Seq 객체 선언
model = Seq2Seq(enc, dec, device).to(device)

# 논문 내용대로 (-0.08, 0.08) 정규분포를 따르도록 파라미터 초기화
def init_weights(m: nn.Module):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)


print("INPUT: ", INPUT_DIM)
print("OUTPUT: ", OUTPUT_DIM)
model.apply(init_weights)
print(model)


INPUT:  6556
OUTPUT:  6192
Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(6556, 1000)
    (rnn): LSTM(1000, 1024, num_layers=4, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(6192, 1000)
    (rnn): LSTM(1000, 1024, num_layers=4, dropout=0.5)
    (fc_out): Linear(in_features=1024, out_features=6192, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


In [147]:
# Adam optimizer로 학습 최적화
optimizer = optim.Adam(model.parameters())

# 파라미터 수
def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

# loss 계산 시 뒷 부분 패딩은 무시
PAD_IDX = en_vocab.stoi['<pad>']

# cross-entropy loss
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

The model has 86,072,592 trainable parameters


In [158]:
a = torch.tensor([[1, 2, 3], [4, 5, 6]])
a.size()

b = a.shape[-1]
b

3

In [148]:
def train(model, iterator, optimizer, criterion, clip):
    model.train() # 학습 모드
    epoch_loss = 0

    for i, (src, trg) in enumerate(iterator):
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg) # [출력 단어 개수, 배치 사이즈, 출력 차원]
        output_dim = output.shape[-1] 

        output = output[1:].view(-1, output_dim) # [(출력 단어의 개수-1) * 배치 사이즈, output_dim]

        trg = trg[1:].view(-1) # [(출력 단어의 개수-1) * 배치 사이즈]

        loss = criterion(output, trg) # loss 계산
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), clip) # gradient exploding 방지
        optimizer.step() # 파라미터 업데이트

        epoch_loss += loss.item() # 전체 loss 계산

    return epoch_loss / len(iterator)


def evaluate(model, iterator, criterion):
    model.eval() # 평가 모드
    epoch_loss = 0

    with torch.no_grad():
        for i, (src, trg) in enumerate(iterator):
            src, trg = src.to(device), trg.to(device)
            # 평가할 때는 teacher forcing을 사용하지 않음
            output = model(src, trg, 0)
            output_dim = output.shape[-1]

            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs




N_EPOCHS = 10
CLIP = 1
best_valid_loss = float('inf')
SAVE_PATH = '/content/drive/MyDrive/Colab Notebooks/papers-with-code/seq2seq/case3.pth'

for epoch in range(N_EPOCHS):
    start_time = time.time() # 시작 시간 기록

    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iter, criterion)

    end_time = time.time() # 종료 시간 기록
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), SAVE_PATH)

    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):.3f}')
    print(f'\tValidation Loss: {valid_loss:.3f} | Validation PPL: {math.exp(valid_loss):.3f}')



Epoch: 01 | Time: 1m 37s
	Train Loss: 4.812 | Train PPL: 123.009
	Validation Loss: 4.603 | Validation PPL: 99.763
Epoch: 02 | Time: 1m 37s
	Train Loss: 4.140 | Train PPL: 62.783
	Validation Loss: 4.297 | Validation PPL: 73.462
Epoch: 03 | Time: 1m 37s
	Train Loss: 3.669 | Train PPL: 39.193
	Validation Loss: 4.131 | Validation PPL: 62.260
Epoch: 04 | Time: 1m 37s
	Train Loss: 3.396 | Train PPL: 29.855
	Validation Loss: 3.926 | Validation PPL: 50.712
Epoch: 05 | Time: 1m 37s
	Train Loss: 3.164 | Train PPL: 23.671
	Validation Loss: 3.861 | Validation PPL: 47.519
Epoch: 06 | Time: 1m 37s
	Train Loss: 2.970 | Train PPL: 19.494
	Validation Loss: 3.738 | Validation PPL: 42.005
Epoch: 07 | Time: 1m 37s
	Train Loss: 2.798 | Train PPL: 16.412
	Validation Loss: 3.687 | Validation PPL: 39.922
Epoch: 08 | Time: 1m 37s
	Train Loss: 2.647 | Train PPL: 14.108
	Validation Loss: 3.676 | Validation PPL: 39.498
Epoch: 09 | Time: 1m 37s
	Train Loss: 2.492 | Train PPL: 12.089
	Validation Loss: 3.699 | Valid

In [80]:
# 학습된 모델 저장
from google.colab import files

# files.download('seq2seq_atten_ep10.pt')
files.download(SAVE_PATH)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [149]:
# # 모델 테스트

model.load_state_dict(torch.load(SAVE_PATH))

test_loss = evaluate(model, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):.3f}')

Test Loss: 3.557 | Test PPL: 35.041


In [83]:
# 번역(translation) 함수

# def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
def translate_sentence(sentence, src_vocab, trg_vocab, model, device, max_len=50):
    model.eval() # 평가 모드

    if isinstance(sentence, str):
        # nlp = spacy.load('de')
        nlp = spacy.load('fr')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # 처음에 <sos> 토큰, 마지막에 <eos> 토큰 붙이기
    # tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    tokens = ['<sos>'] + tokens + ['<eos>']
    print(f"전체 소스 토큰: {tokens}")

    # src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_indexes = [src_vocab.stoi[token] for token in tokens]
    print(f"소스 문장 인덱스: {src_indexes}")

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    # 인코더(endocer)에 소스 문장을 넣어 문맥 벡터(context vector) 계산
    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor)

    # 처음에는 <sos> 토큰 하나만 가지고 있도록 하기
    # trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
    trg_indexes = [trg_vocab.stoi['<sos>']]

    for i in range(max_len):
        # 이전에 출력한 단어가 현재 단어로 입력될 수 있도록
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)

        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token) # 출력 문장에 더하기

        # <eos>를 만나는 순간 끝
        # if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
        if pred_token == trg_vocab.stoi['<eos>']:
            break

    # 각 출력 단어 인덱스를 실제 단어로 변환
    # trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    trg_tokens = [trg_vocab.itos[i] for i in trg_indexes]

    # 첫 번째 <sos>는 제외하고 출력 문장 반환
    return trg_tokens[1:]

In [150]:
src = fr_tokenizer("Bonne nuit.") # 영어로 Good night
# src = src[::-1]
print(f'소스 문장: {src}')
print("모델 출력 결과:", " ".join(translate_sentence(src, fr_vocab, en_vocab, model, device)))

소스 문장: ['Bonne', 'nuit', '.']
전체 소스 토큰: ['<sos>', 'bonne', 'nuit', '.', '<eos>']
소스 문장 인덱스: [0, 2090, 276, 5, 3]
모델 출력 결과: On the day . 
 <eos>


In [152]:
# train data에 있던 문장으로 테스트

src = fr_tokenizer("Deux jeunes hommes blancs sont dehors près de buissons.")
print(f'소스 문장: {src}')
print("모델 출력 결과:", " ".join(translate_sentence(src, fr_vocab, en_vocab, model, device)))

# Two young , White males are outside near many bushes . 

소스 문장: ['Deux', 'jeunes', 'hommes', 'blancs', 'sont', 'dehors', 'près', 'de', 'buissons', '.']
전체 소스 토큰: ['<sos>', 'deux', 'jeunes', 'hommes', 'blancs', 'sont', 'dehors', 'près', 'de', 'buissons', '.', '<eos>']
소스 문장 인덱스: [0, 68, 85, 34, 225, 31, 91, 75, 9, 1202, 5, 3]
모델 출력 결과: Four young - haired children are sitting at the . 
 <eos>
