### 확인
- 입력 배치는 time step에 대해서 배치셋을 생성
- 출력의 경우 2-d array

### 의문
- 임베딩 레이어에 padding 없이 입력 들어가도 상관이 없나?
    - 레퍼런스 코드에서는 패딩해줌. 그런데 임베딩에 패딩 인덱스를 따로 넣어주지 않음..
    
### 미구현
- 빔서치
- 병렬 처리
- large Datasets

In [1]:
import sys
sys.path.append("/home/jack/torchstudy/1week/2_refactoring/src")
sys.path.append("/home/jack/torchstudy/2week/2_rafactoring/src")
import torch
from vocab import Vocabs
from seq2seq import Encoder, Decoder, seq2seq
from utils import *

## Inputs (Toy)

In [3]:
src = ["나 너 좋아", "그러나 나 너 싫어", "나 너 누군지 몰라", "그러나 너 찾을거야", "너 죽일거야", "그러나","그러나 나",]
dst = ["i love you", "but i hate you", "i don't know who you are", "but i will find you", "i will kill you", "but", "but i"]

In [4]:
src_vocab = Vocabs("mecab")
dst_vocab = Vocabs()
src_vocab.build_vocabs(src)
dst_vocab.build_vocabs(dst)
dst_vocab.build_index_dict()

In [5]:
src_index_data = [torch.IntTensor(i) for i in src_vocab.stoi(src, option="seq2seq", reverse=True)]
dst_index_data = [torch.IntTensor(i) for i in dst_vocab.stoi(dst, option="seq2seq")]

In [6]:
def pad_and_inverse_batch(data, pad_val):
    a = torch.nn.utils.rnn.pad_sequence(data, padding_value=pad_val)
    datas = torch.stack([a[i,:].to(torch.int64) for i in range(a.size()[1])])
    return datas

In [7]:
src_sample = pad_and_inverse_batch(src_index_data, pad_val=1)
dst_sample = pad_and_inverse_batch(dst_index_data, pad_val=1)

In [8]:
src_sample

tensor([[ 2,  2,  2,  2,  2,  2,  2],
        [ 7, 10, 12, 16, 16,  8,  4],
        [ 6,  9, 11, 15, 15,  3,  8],
        [ 5,  5,  5, 14, 17,  1,  3],
        [ 4,  4,  4, 13,  5,  1,  1],
        [ 3,  8,  3,  5,  3,  1,  1],
        [ 1,  3,  1,  8,  1,  1,  1]])

In [9]:
dst_sample

tensor([[ 2,  2,  2,  2,  2,  2,  2],
        [ 4,  7,  4,  7,  4,  7,  7],
        [ 5,  4,  9,  4, 13,  3,  4],
        [ 6,  8, 10, 13, 15,  1,  3],
        [ 3,  6, 11, 14,  6,  1,  1],
        [ 1,  3,  6,  6,  3,  1,  1],
        [ 1,  1, 12,  3,  1,  1,  1]])

## Toy Models

In [10]:
src_vocab_size = len(src_vocab)
mock_encoder = Encoder(emb_dim=1000, hid_dim=1000, lstm_layers=4, num_embeddings=src_vocab_size, pad_idx=1)

dst_vocab_size = len(dst_vocab)
mock_decoder = Decoder(emb_dim = dst_vocab_size, 
                       hid_dim = 1000, 
                       output_dim = dst_vocab_size, 
                       num_embeddings = dst_vocab_size,
                       lstm_layers = 4, 
                       pad_idx = 0)

mock_seq2seq = seq2seq(mock_encoder, mock_decoder)

In [11]:
def init_weights(m):
    for name, param in m.named_parameters():
        torch.nn.init.uniform_(param.data, -0.08, 0.08)
        
mock_seq2seq.apply(init_weights)

seq2seq(
  (encoder): Encoder(
    (emb): Embedding(18, 1000, padding_idx=1)
    (lstm): LSTM(1000, 1000, num_layers=4)
  )
  (decoder): Decoder(
    (emb): Embedding(16, 16, padding_idx=1)
    (lstm): LSTM(16, 1000, num_layers=4)
    (fc_out): Linear(in_features=1000, out_features=16, bias=True)
  )
)

In [12]:
import torch.optim as optim
optimizer = optim.Adam(mock_seq2seq.parameters())

In [13]:
TRG_PAD_IDX = 0 
criterion = torch.nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [20]:
epoch_loss = 0
clip = 2
for epoch in range(10):

    src = src_sample
    trg = dst_sample

    optimizer.zero_grad()

    output = mock_seq2seq(src, trg)

    #trg = [trg len, batch size]
    #output = [trg len, batch size, output dim]

    output_dim = output.shape[-1]

    output = output[1:].view(-1, output_dim)
    trg = trg[1:].view(-1)

    #trg = [(trg len - 1) * batch size]
    #output = [(trg len - 1) * batch size, output dim]

    loss = criterion(output, trg)

    loss.backward()

    torch.nn.utils.clip_grad_norm_(mock_seq2seq.parameters(), clip)

    optimizer.step()

    epoch_loss += loss.item()

In [21]:
hid, cell = mock_encoder(src)

In [22]:
hid.size()

torch.Size([4, 7, 1000])

In [23]:
test_sample = torch.Tensor([src_vocab.stoi("그러나 너가 싫어", reverse=True)]).to(torch.int64)
print(test_sample)
test_sample = torch.reshape(test_sample, (-1,1))

tensor([[ 8,  5,  0,  9, 10]])


In [24]:
pred, pred_probs = mock_seq2seq.predict(test_sample)

In [25]:
pred = dst_vocab.itos(pred)
pred

'but i hate you'

In [26]:
perplexity(pred_probs)

1.0540944036286821

## Mulit30k

In [120]:
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.utils import download_from_url, extract_archive
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import io

def update_vocabs(filepath, vocab):
    with io.open(filepath, encoding="utf8") as f:
        for string_ in f:
              vocab.build_vocabs([string_.replace("\n","").lower()])
                

# 일단 메모리에 올려서 작업
def data_process(src_path, dst_path, src_vocab, dst_vocab, src_reverse=True):
    raw_de_iter = iter(io.open(src_path, encoding="utf8"))
    raw_en_iter = iter(io.open(dst_path, encoding="utf8"))
    data = []
    if src_reverse == True:
        order = -1
    else:
        order = 1
    for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
        de_tensor_ = torch.tensor([de_vocab.vocab_dict[token] for token in src_vocab.tokenizer(raw_de.lower())[::order]]).to(torch.int64)
        en_tensor_ = torch.tensor([en_vocab.vocab_dict[token] for token in dst_vocab.tokenizer(raw_en.lower())]).to(torch.int64)
        data.append((de_tensor_, en_tensor_))
    return data

def generate_batch(data_batch, pad_idx=0, sos_idx=2, eos_idx=3):
    de_batch, en_batch = [], []
    for (de_item, en_item) in data_batch:
        de_batch.append(torch.cat([torch.tensor([sos_idx]), de_item, torch.tensor([eos_idx])], dim=0))
        en_batch.append(torch.cat([torch.tensor([sos_idx]), en_item, torch.tensor([eos_idx])], dim=0))
    de_batch = pad_sequence(de_batch, padding_value=pad_idx)
    en_batch = pad_sequence(en_batch, padding_value=pad_idx)
    return de_batch, en_batch

In [121]:
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

de_vocab = Vocabs(tokenizer=de_tokenizer)
en_vocab = Vocabs(tokenizer=en_tokenizer)

# 여기에만 있는단어 ['kappe.zwei', 'mr', 'n.y', 'nr', 'q&a', 'u.s']
update_vocabs(train_filepaths[0], de_vocab)
update_vocabs(train_filepaths[1], en_vocab)

train_data = data_process(train_filepaths[0], train_filepaths[1], de_vocab, en_vocab, src_reverse=True)
val_data = data_process(val_filepaths[0], val_filepaths[1], de_vocab, en_vocab, src_reverse=True)
test_data = data_process(test_filepaths[0], test_filepaths[1], de_vocab, en_vocab, src_reverse=True)

In [122]:
BATCH_SIZE = 128
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)

## build model

In [124]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

src_vocab_size = len(de_vocab)
mock_encoder = Encoder(emb_dim=1000, hid_dim=1000, lstm_layers=4, num_embeddings=src_vocab_size, pad_idx=0)

dst_vocab_size = len(en_vocab)
mock_decoder = Decoder(emb_dim = 1000, hid_dim = 1000, output_dim = dst_vocab_size, num_embeddings = dst_vocab_size,
                       lstm_layers = 4, 
                       pad_idx = 0)

mock_seq2seq = seq2seq(mock_encoder, mock_decoder)

In [125]:
def init_weights(m):
    for name, param in m.named_parameters():
        torch.nn.init.uniform_(param.data, -0.08, 0.08)
        
mock_seq2seq.apply(init_weights)

seq2seq(
  (encoder): Encoder(
    (emb): Embedding(19378, 1000, padding_idx=0)
    (lstm): LSTM(1000, 1000, num_layers=4)
  )
  (decoder): Decoder(
    (emb): Embedding(10080, 1000, padding_idx=0)
    (lstm): LSTM(1000, 1000, num_layers=4)
    (fc_out): Linear(in_features=1000, out_features=10080, bias=True)
  )
)

In [126]:
import torch.optim as optim
optimizer = optim.Adam(mock_seq2seq.parameters())

PAD_IDX = 0
# CrossEntropyLoss = logSoftmax + NLLloss
criterion = torch.nn.CrossEntropyLoss(ignore_index = PAD_IDX)

# scheduler = optim.lr_scheduler.LambdaLR(optimizer=optimizer,
#                                         lr_lambda=lambda epoch: 0.7 if epoch <= 5 else 0.7 * ((0.5) ** (epoch-5)),
#                                         last_epoch=-1,
#                                         verbose=False)

In [127]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, (src, trg) in enumerate(iterator):

        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [128]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, (src, trg) in enumerate(iterator):
            output = model(src, trg) #turn off teacher forcing
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [129]:
import time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

import math
N_EPOCHS = 1
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(mock_seq2seq, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(mock_seq2seq, valid_iter, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(mock_seq2seq.state_dict(), 'seq2seq-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


KeyboardInterrupt: 

## 미완(빔서치)

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from queue import PriorityQueue


class BeamSearchNode(object):
    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate:
        :param previousNode:
        :param wordId:
        :param logProb:
        :param length:
        '''
        self.h = hiddenstate
        self.prevNode = previousNode
        self.wordid = wordId
        self.logp = logProb
        self.leng = length

    def eval(self, alpha=1.0):
        reward = 0
        # Add here a function for shaping a reward

        return self.logp / float(self.leng - 1 + 1e-6) + alpha * reward

def beam_decode(target_tensor, decoder_hiddens, encoder_outputs=None):
    '''
    :param target_tensor: target indexes tensor of shape [B, T] where B is the batch size and T is the maximum length of the output sentence
    :param decoder_hidden: input tensor of shape [1, B, H] for start of the decoding
    :param encoder_outputs: if you are using attention mechanism you can pass encoder outputs, [T, B, H] where T is the maximum length of input sentence
    :return: decoded_batch
    '''
    SOS_token = 1
    beam_width = 10
    topk = 3  # how many sentence do you want to generate
    decoded_batch = []

    # decoding goes sentence by sentence
    for idx in range(target_tensor.size(0)):
        if isinstance(decoder_hiddens, tuple): 
            decoder_hidden = (decoder_hiddens[0][:,idx, :].unsqueeze(0), decoder_hiddens[1][:,idx, :].unsqueeze(0))
        else:
            decoder_hidden = decoder_hiddens[:, idx, :].unsqueeze(0)
#         encoder_output = encoder_outputs[:,idx, :].unsqueeze(1)

        # Start with the start of the sentence token
        decoder_input = torch.LongTensor([SOS_token], device=device)

        # Number of sentence to generate
        endnodes = []
        number_required = min((topk + 1), topk - len(endnodes))

        # starting node -  hidden vector, previous node, word id, logp, length
        node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
        nodes = PriorityQueue()

        # start the queue
        nodes.put((-node.eval(), node))
        qsize = 1

        # start beam search
        while True:
            # give up when decoding takes too long
            if qsize > 2000: break

            # fetch the best node
            score, n = nodes.get()
            decoder_input = n.wordid
            decoder_hidden = n.h

            if n.wordid.item() == EOS_token and n.prevNode != None:
                endnodes.append((score, n))
                # if we reached maximum # of sentences required
                if len(endnodes) >= number_required:
                    break
                else:
                    continue

            # decode for one step using decoder
            decoder_hidden = (decoder_hidden[0].view(4, -1, 1000), decoder_hidden[1].view(4, -1, 1000))
            decoder_output, decoder_hidden, cell = decoder(decoder_input, decoder_hidden[0],decoder_hidden[1])

            # PUT HERE REAL BEAM SEARCH OF TOP
            log_prob, indexes = torch.topk(decoder_output, beam_width)
            nextnodes = []

            for new_k in range(beam_width):
                decoded_t = indexes[0][new_k].view(1, -1)
                log_p = log_prob[0][new_k].item()

                node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
                score = -node.eval()
                nextnodes.append((score, node))

            # put them into queue
            for i in range(len(nextnodes)):
                score, nn = nextnodes[i]
                nodes.put((score, nn))
                # increase qsize
            qsize += len(nextnodes) - 1

        # choose nbest paths, back trace them
        if len(endnodes) == 0:
            endnodes = [nodes.get() for _ in range(topk)]

        utterances = []
        for score, n in sorted(endnodes, key=operator.itemgetter(0)):
            utterance = []
            utterance.append(n.wordid)
            # back trace
            while n.prevNode != None:
                n = n.prevNode
                utterance.append(n.wordid)

            utterance = utterance[::-1]
            utterances.append(utterance)

        decoded_batch.append(utterances)

    return decoded_batch

In [68]:
device = "cpu"
decoder = mock_seq2seq.decoder
EOS_token = 3
beam_decode(target_tensor=trg.view(-1, len(trg)), decoder_hiddens = hid, decoder_cell = cell)

torch.Size([1, 4, 1000])


RuntimeError: Expected hidden[0] size (4, 1, 1000), got [1, 4, 1000]