In [10]:
import os
import io
import math
import time
import random
import gc
import pickle
import re

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from konlpy.tag import Okt 

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.data import Field, BucketIterator, TabularDataset, Iterator

In [11]:
SEED = 2020010553
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [1]:
# data = pd.read_pickle('input.pickle')

In [2]:
# df = pd.DataFrame()
# df['full'] = list(data.keys())
# df['core'] = list(data.values())
# df.head()

In [3]:
# del data
# gc.collect()

In [4]:
# df.info()

In [5]:
# df.head()

In [6]:
# df['core'] = df.core.apply(lambda x: ' '.join(x[1:-1]))
# df.head()

In [7]:
# df['full'] = df.full.apply(lambda x: ' '.join(re.compile("[가-힣a-zA-Z0-9]+").findall(x)))
# df['core'] = df.core.apply(lambda x: ' '.join(re.compile("[가-힣a-zA-Z0-9]+").findall(x)))

In [8]:
# df.info()

In [12]:
#df.to_csv('LYRICS_TORCH.csv', encoding='UTF-8', index=False)
df = pd.read_csv('LYRICS_TORCH.csv', encoding='UTF-8')

gc.collect()

0

In [228]:
# #to make skim data

# df = pd.read_csv('LYRICS_TORCH.csv', encoding='UTF-8')
# df.dropna(inplace = True)
# df = df.iloc[:10000]
# df.to_csv('LYRICS_TORCH_10000.csv', index=False, encoding='UTF-8')
# df = pd.read_csv('LYRICS_TORCH_10000.csv', encoding='UTF-8')

In [13]:
df.dropna(inplace = True)
gc.collect()

0

In [14]:
df.head()

Unnamed: 0,full,core
0,정말 바보같은 나의 실수 차려 정신빨리 니가뭔데 란 착각에 또 빠졌어,빠졌어 바보 정신 실수 빨리 같은 착각 차려 정말
1,니가뭔데 란 착각에 또 빠졌어 멀리 떠나버린 지금 그리워,착각 멀리 그리워 지금 버린 빠졌어 떠나
2,멀리 떠나버린 지금 그리워 니 말이 전부 다 맞아,버린 그리워 맞아 떠나 지금 전부 멀리
3,니 말이 전부 다 맞아 내게 돌아와서 불러줘 She say my boo,불러줘 say 내게 She boo 돌아와서 맞아 전부
4,내게 돌아와서 불러줘 She say my boo I feel like,돌아와서 boo She 불러줘 say feel like 내게


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 841081 entries, 0 to 841126
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   full    841081 non-null  object
 1   core    841081 non-null  object
dtypes: object(2)
memory usage: 19.3+ MB


In [16]:
# tokenizer
tokenizer = Okt()

LYRICS_CORE = Field(
    tokenize = str.split,
    init_token = '<sos>',
    eos_token = '<eos>',
    lower = True,
    include_lengths = True,
    batch_first = False)

LYRICS_FULL = Field(
    tokenize = tokenizer.morphs,
    init_token = '<sos>',
    eos_token = '<eos>',
    lower = True,
    include_lengths = True,
    batch_first = False)

In [17]:
# %%time

# df['core'] = df.core.apply(str.split)
# df['full'] = df.full.apply(tokenizer.morphs)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 841081 entries, 0 to 841126
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   full    841081 non-null  object
 1   core    841081 non-null  object
dtypes: object(2)
memory usage: 19.3+ MB


In [19]:
# import pickle

# with open('LYRICS_TORCH_split.pickle', 'wb') as f:
#     pickle.dump(df, f)

In [20]:
# df = pd.DataFrame(pd.read_pickle('LYRICS_TORCH_split.pickle'))

In [22]:
df.iloc[:100000].to_csv('LYRICS_TORCH_100k.csv', encoding='UTF-8', index=False)

In [23]:
data_set = TabularDataset(path='./LYRICS_TORCH_100k.csv',
                         format='csv',
                         fields=[('full',LYRICS_FULL),('core',LYRICS_CORE)])

In [25]:
data_set.examples[30].core

['늘어', '털어', '때매', '부어', '안해', '낄라', '본체']

In [26]:
# Build Vocab for CORE
LYRICS_CORE.build_vocab(data_set.core, min_freq=2)

In [27]:
# Build Vocab for CORE
LYRICS_FULL.build_vocab(data_set.full, min_freq=2)

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device to use:', device)

Device to use: cuda


## Create Data Iterators

In [29]:
BATCH_SIZE = 128

In [30]:
data_iterator = BucketIterator(
        dataset = data_set,
        batch_size = BATCH_SIZE,
        train = True,
        sort_within_batch = True,
        sort_key = lambda x: len(x.core),
        device = device
)

print(f'Number of minibatches per epoch: {len(data_iterator)}')

Number of minibatches per epoch: 782


In [31]:
valid_iterator = BucketIterator(
        dataset = data_set,
        batch_size = BATCH_SIZE,
        train = True,
        sort_within_batch = True,
        sort_key = lambda x: len(x.core),
        device = device
)

print(f'Number of minibatches per epoch: {len(valid_iterator)}')

Number of minibatches per epoch: 782


In [32]:
data_batch = next(iter(data_iterator))
core, core_len = data_batch.core # X 변수
full, full_len = data_batch.full # Y 변수
print('a batch of CORE examples has shape:', core.size())  # (source_seq_len, batch_size)
print('a batch of FULL examples has shape:', full.size())  # (target_seq_len, batch_size)

a batch of CORE examples has shape: torch.Size([6, 128])
a batch of FULL examples has shape: torch.Size([19, 128])


In [33]:
# Checking last sample in mini-batch (GERMAN, source lang)
core, core_len = data_batch.core
core_indices = core[:,6]
core_tokens = [LYRICS_CORE.vocab.itos[i] for i in core_indices]
for t, i in zip(core_tokens, core_indices):
    print(f"{t} ({i})")
del core_indices, core_tokens

<sos> (2)
내게도 (3010)
이런 (94)
외로운 (1697)
작고 (4602)
<eos> (3)


In [34]:
# Checking last sample in mini-batch (EN, target lang)
full, full_len = data_batch.full
full_indices = full[:, 6]
full_tokens = [LYRICS_FULL.vocab.itos[i] for i in full_indices]
for t, i in zip(full_tokens, full_indices):
    print(f"{t} ({i})")
del full_indices, full_tokens

<sos> (2)
작고 (5319)
외로운 (2195)
이런 (212)
내게도 (3626)
<eos> (3)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)


# Building Seq2Seq Model

## Hyperparameters

In [73]:
INPUT_DIM = len(LYRICS_CORE.vocab) ## Bag of Words
OUTPUT_DIM = len(LYRICS_FULL.vocab) ## Bag of Workds
ENC_EMB_DIM = DEC_EMB_DIM = 256 ##임베딩 차원
ENC_HID_DIM = DEC_HID_DIM = 512 ## hidden state 차원
USE_BIDIRECTIONAL = False

print('source vocabulary size:', INPUT_DIM)
print('source word embedding size:', ENC_EMB_DIM)
print(f'encoder RNN hidden size: {ENC_HID_DIM}({ENC_HID_DIM *2} if bidirectional)')
print('-'*50)
print('target vocabulary size:', OUTPUT_DIM)
print('target word embedding size:', ENC_EMB_DIM)
print('decoder RNN hidden size:', ENC_HID_DIM)

source vocabulary size: 45715
source word embedding size: 256
encoder RNN hidden size: 512(1024 if bidirectional)
--------------------------------------------------
target vocabulary size: 47063
target word embedding size: 256
decoder RNN hidden size: 512


## ENCODER

In [55]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, bidirectional=False):
        super(Encoder, self).__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.bidirectional = bidirectional
        
        self.embedding = nn.Embedding(num_embeddings = self.input_dim,
                                      embedding_dim = self.emb_dim
                                     )
        
        self.rnn = nn.GRU(
            input_size = self.emb_dim,
            hidden_size = self.enc_hid_dim,
            bidirectional = self.bidirectional,
            batch_first = False
        )
        
        self.rnn_output_dim = self.enc_hid_dim
        if self.bidirectional:
            self.rnn_output_dim *= 2
        
        self.fc = nn.Linear(self.rnn_output_dim, self.dec_hid_dim)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, src, src_len):
        assert len(src.size()) == 2, 'Input requires dimension (input_seq_len, batch_size)'

        #Shape: (b, s, h)
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)

        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len)
        packed_outputs, hidden = self.rnn(packed_embedded)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)

        if self.bidirectional:
            # (2, b, enc_h) -> (b, 2*enc_h)
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # (1, b, enc_h) -> (b, enc_h)
            hidden = hidden.squeeze(0)

        # (b, num_directions * enc_h) -> (b, dec_h)
        hidden = self.fc(hidden)
        hidden = torch.tanh(hidden)

        # (S, B, enc_h * num_directions), (B, dec_h)
        return outputs, hidden          

## ATTENTION

In [56]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim, encoder_is_bidirectional = False):
        super(Attention, self).__init__()
        
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.encoder_is_bidirectional = encoder_is_bidirectional
        
        self.attention_input_dim = enc_hid_dim + dec_hid_dim
        if self.encoder_is_bidirectional:
            # 2*h_enc + h_enc
            self.attention_input_dim += enc_hid_dim 
        
        self.linear = nn.Linear(self.attention_input_dim, dec_hid_dim)
        self.v = nn.Parameter(torch.rand(dec_hid_dim))
        
    def forward(self, hidden, encoder_outputs, mask):

        # Shape Check
        assert hidden.dim() == 2
        assert encoder_outputs.dim() == 3

        seq_len, batch_size, _ = encoder_outputs.size()

        # (b, dec_h) -> (b, s, dec_h)
        hidden = hidden.unsqueeze(1).expand(-1, seq_len, -1)

        # (s, b, enc_h*num_directions) -> (b, s, enc_h*num_directions)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        # concat; shape results in (b, s, enc_h + dec_h)
        # if encoder is bidirectional, (b, s, 2*h_enc + h_dec)
        concat = torch.cat((hidden, encoder_outputs), dim=2)

        #energy; shape is (b, s, dec_h)
        energy = torch.tanh(self.linear(concat))

        # tile v; (dec_h, ) -> (b, dec_h) -> (b, dec_h, 1)
        v = self.v.unsqueeze(0).expand(batch_size, -1).unsqueeze(2)

        # attn; (b, s, dec_h) @ (b, dec_h, 1) -> (b, s, 1) -> (b, s)
        attn_scores = torch.bmm(energy, v).squeeze(-1)

        # mask padding indices
        attn_scores = attn_scores.masked_fill(mask==0, -1e10)

        #Final shape checkL (b, s)
        assert attn_scores.dim() == 2

        # Attention Weight
        return F.softmax(attn_scores, dim=1)

## DECODER

In [57]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, attention_module, encoder_is_bidirectional=False):
        super(Decoder, self).__init__()
        
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.encoder_is_bidirectional = encoder_is_bidirectional
        
        if isinstance(attention_module, nn.Module):
            self.attention_module = attention_module
        else:
            raise ValueError
        
        self.rnn_input_size = enc_hid_dim + emb_dim # enc_h + dec_emb_dim
        if self.encoder_is_bidirectional:
            self.rnn_input_size += enc_hid_dim
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU(
            input_size=self.rnn_input_size,
            hidden_size=dec_hid_dim,
            bidirectional=False,
            batch_first=False
        )
        
        self.out_input_size = emb_dim + dec_hid_dim + enc_hid_dim
        if self.encoder_is_bidirectional:
            self.out_input_size += enc_hid_dim
        self.out = nn.Linear(self.out_input_size, output_dim)
        
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, inp, hidden, encoder_outputs, mask, temperature=1.0):
        assert inp.dim() == 1
        assert hidden.dim() == 2
        assert encoder_outputs.dim() == 3
        
        # (b, ) -> (1, b)
        inp = inp.unsqueeze(0)
        
        # (1, b) -> (1, b, emb)
        embedded = self.embedding(inp)
        embedded = self.dropout(embedded)
        
        # Attention Weights
        # attention probabilities: (b, s)
        attn_probs = self.attention_module(hidden, encoder_outputs, mask)
        
        # (b, 1, s)
        attn_probs = attn_probs.unsqueeze(1)
        
        # (s, b, ~) -> (b, s, ~)
        encoder_outputs = encoder_outputs.permute(1,0,2)
        
        # (b, 1, s) @ (b, s, ~) -> (b, 1, enc_h*num_directions)
        weighted = torch.bmm(attn_probs, encoder_outputs)
        
        # (1, b, ~)
        weighted = weighted.permute(1,0,2)
        
        # (b, 1, emb + enc_h*num_directions)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        
        # output; (b, 1, dec_h)
        # new_hidden; (1, b, dec_h)
        output, new_hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        assert (output == new_hidden).all()
        
        embedded = embedded.squeeze(0) # (1, b, emb) -> (b, emb)
        output = output.squeeze(0) # (1, b, dec_h) -> (b, dec_h)
        weighted = weighted.squeeze(0) # (1, b, enc_h*num_d) -> (b, enc_h,*num_d)
        
        # output; (b, emb + enc_h + dec_h) -> (b, output_dim)
        # if encoder is bidirectional, (b, emb + 2*enc_h + dec_h) -> (b, output_dim)
        output = self.out(torch.cat((output, weighted, embedded), dim=1))
        output = output/temperature
        
        return output, new_hidden.squeeze(0), attn_probs.squeeze(1)

## Seq2Seq MODEL

In [58]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, pad_idx, sos_idx, eos_idx, device):
        super(Seq2Seq, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.device = device
        
    def create_mask(self, src):
        mask = (src != self.pad_idx).permute(1,0) # (b, s)
        return mask
    
    def forward(self, src, src_len, trg=None, teacher_forcing_ratio=0.5):
        
        batch_size = src.size(1)
        max_seq_len = trg.size(0) if trg is not None else 100
        
        trg_vocab_size = self.decoder.output_dim
        
        if trg is None:
            assert teacher_forcing_ratio == 0., 'Must be zero during inference.'
            inference = True
            trg = torch.zeros(max_seq_len, batch_size).long().fill_(self.sos_idx).to(self.device)
        else:
            inference = False
            
        # An empty tensor to stor decoder outputs (time index first for faster indexing)
        outputs_shape = (max_seq_len, batch_size, trg_vocab_size)
        outputs = torch.zeros(outputs_shape).to(self.device)
        
        # empty tensor to store attention probs
        attns_shape = (max_seq_len, batch_size, src.size(0))
        attns = torch.zeros(attns_shape).to(self.device)
        
        encoder_outputs, hidden = self.encoder(src, src_len)
        mask = self.create_mask(src)
        
        # first input to the decoder is '<sos>'
        # trg; shape (batch_size, seq_len)
        initial_dec_input = output = trg[0, :] # get first timestep token
        
        for t in range(1, max_seq_len):
            
            output, hidden, attn = self.decoder(output, hidden, encoder_outputs, mask)
            outputs[t] = output
            attns[t] = attn
            
            _, idx = output.max(dim=1)
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            
            new_dec_input = output = trg[t] if teacher_force else idx
            
            if inference and output.item() == self.eos_idx:
                return outputs[:t], attns[:t]
            
        return outputs, attns

## Build MODEL

In [59]:
# Define encoder
enc = Encoder(
    input_dim = INPUT_DIM,
    emb_dim = ENC_EMB_DIM,
    enc_hid_dim = ENC_HID_DIM,
    dec_hid_dim = DEC_HID_DIM,
    bidirectional = USE_BIDIRECTIONAL
)

print(enc)

Encoder(
  (embedding): Embedding(45715, 128)
  (rnn): GRU(128, 256)
  (fc): Linear(in_features=256, out_features=256, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [60]:
# Define attention layer
attn = Attention(
    enc_hid_dim = ENC_HID_DIM,
    dec_hid_dim = DEC_HID_DIM,
    encoder_is_bidirectional = USE_BIDIRECTIONAL
)

print(attn)

Attention(
  (linear): Linear(in_features=512, out_features=256, bias=True)
)


In [61]:
# Define decoder
dec = Decoder(
    output_dim = OUTPUT_DIM,
    emb_dim = DEC_EMB_DIM,
    enc_hid_dim = ENC_HID_DIM,
    dec_hid_dim = DEC_HID_DIM,
    attention_module = attn,
    encoder_is_bidirectional = USE_BIDIRECTIONAL
)

print(dec)

Decoder(
  (attention_module): Attention(
    (linear): Linear(in_features=512, out_features=256, bias=True)
  )
  (embedding): Embedding(47063, 128)
  (rnn): GRU(384, 256)
  (out): Linear(in_features=640, out_features=47063, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [62]:
PAD_IDX = LYRICS_CORE.vocab.stoi['<pad>']
SOS_IDX = LYRICS_CORE.vocab.stoi['<sos>']
EOS_IDX = LYRICS_CORE.vocab.stoi['<eos>']

print('PAD INDEX:', PAD_IDX)
print('SOS INDEX:', SOS_IDX)
print('EOS INDEX:', EOS_IDX)

PAD INDEX: 1
SOS INDEX: 2
EOS INDEX: 3


In [63]:
model = Seq2Seq(enc, dec, PAD_IDX, SOS_IDX, EOS_IDX, device).to(device)
print(model)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(45715, 128)
    (rnn): GRU(128, 256)
    (fc): Linear(in_features=256, out_features=256, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (decoder): Decoder(
    (attention_module): Attention(
      (linear): Linear(in_features=512, out_features=256, bias=True)
    )
    (embedding): Embedding(47063, 128)
    (rnn): GRU(384, 256)
    (out): Linear(in_features=640, out_features=47063, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)


## Count trainable parameters

In [64]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters.')

The model has 43,029,847 trainable parameters.


## Initialize trainable parameters

In [65]:
def init_parameters(model):
    for name, param in model.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0., std=0.01)
        else:
            nn.init.constant_(param.data, 0.)
            
model.apply(init_parameters)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(45715, 128)
    (rnn): GRU(128, 256)
    (fc): Linear(in_features=256, out_features=256, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (decoder): Decoder(
    (attention_module): Attention(
      (linear): Linear(in_features=512, out_features=256, bias=True)
    )
    (embedding): Embedding(47063, 128)
    (rnn): GRU(384, 256)
    (out): Linear(in_features=640, out_features=47063, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)

# Train

## Optimizer

- USE `optim.Adam` or `optim.RMSprop`

In [66]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Loss function

In [67]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
print(f"<pad> index in target vocab (en): '{PAD_IDX}' will be ignored when loss is calculated.")

<pad> index in target vocab (en): '1' will be ignored when loss is calculated.


## Train function

In [68]:
def train(seq2seq_model, iterator, optimizer, criterion, grad_clip=1.0):
    seq2seq_model.train()
    
    epoch_loss = .0
    
    for i, batch in enumerate(iterator):
        print('.', end='')
        
        core, core_len = batch.core
        full, _ = batch.full
        
        optimizer.zero_grad()
        
        decoder_outputs, _ = seq2seq_model(core, core_len, full, teacher_forcing_ratio=0.5)
        full_seq_len, batch_size, full_vocab_size = decoder_outputs.size() # (s, b, full_vocab)
        
        # (s-1, b, full_vocab)
        decoder_outputs = decoder_outputs[1:]
        
        # (s-1 * b, full_vocab)
        decoder_outputs = decoder_outputs.view(-1, full_vocab_size)
        
        # (s, b) -> (s-1 * b, )
        full = full[1:].view(-1)
        
        loss = criterion(decoder_outputs, full)
        loss.backward()
        
        # Gradient clipping: remedy for exploding gradients
        torch.nn.utils.clip_grad_norm_(seq2seq_model.parameters(), grad_clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

## Evaluate function

In [69]:
def evaluate(seq2seq_model, iterator, criterion):
    seq2seq_model.eval()
    
    epoch_loss = 0.
    
    with torch.no_grad():
        
        for i, batch in enumerate(iterator):
            
            print('.', end='')
            
            core, core_len = batch.core
            full, _ = batch.full
            
            decoder_outputs, _ = seq2seq_model(core, core_len, full, teacher_forcing_ratio=0.)
            full_seq_len, batch_size, full_vocab_size = decoder_outputs.size() # (s, b, full_vocab)
            
            # (s-1, b, full_vocab)
            decoder_outputs = decoder_outputs[1:]
            
            # (s-1 * b, full_vocab)
            decoder_outputs = decoder_outputs.view(-1, full_vocab_size)
            
            # (s, b) -> (s-1 * b)
            full = full[1:].view(-1)
            
            loss = criterion(decoder_outputs, full)
            
            epoch_loss += loss.item()
            
    return epoch_loss / len(iterator)

## Epoch time measure function

In [70]:
def epoch_time(start_time, end_time):
    """Returns elapsed time in mins & secs"""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Train for multiple epochs

In [71]:
NUM_EPOCHS = 10

In [72]:
best_valid_loss = float('inf')

for epoch in range(NUM_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, data_iterator, optimizer, criterion)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './best_model_lyrics_rebuild.pt')
        
    print('\n')
    print(f"Epoch: {epoch + 1:>02d} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"Train Loss: {train_loss:>.4f} | Train Perplexity: {math.exp(train_loss):7.3f}")
    print(f"Valid Loss: {valid_loss:>.4f} | Valid Perplexity: {math.exp(valid_loss):7.3f}")

.

RuntimeError: CUDA out of memory. Tried to allocate 530.00 MiB (GPU 0; 6.00 GiB total capacity; 3.53 GiB already allocated; 367.14 MiB free; 4.23 GiB reserved in total by PyTorch)

## Save last model (overfitted)

In [None]:
torch.save(model.state_dict(), './last_model_lyrcis_rebuild_overfit.pt')

## 5. TEST

## Evaluate on test data

In [None]:
model.load_state_dict(torch.load('best_model_lyrics_rebuild.pt'))
test_loss = evaluate(model, valid_iterator, criterion)
print(f"| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f}")

## Function to convert indices to Original Text Strings (Rebuild Lyrics)

In [215]:
def rebuild_lyrics(seq2seq_model, sentence):
    
    seq2seq_model.eval()
    
    # Tokenize sentence    
    okt = Okt()
    tokenized = okt.morphs(sentence)
    
    # lower tokens
    tokenized = [t.lower() for t in tokenized]
    
    # Add <sos> & <eos> tokens to the front and back of the sentence
    tokenized = ['<sos>'] + tokenized + ['<eos>']
    
    # tokens -> indices
    numericalized = [LYRICS_FULL.vocab.stoi[s] for s in tokenized]
    
    sent_length = torch.tensor([len(numericalized)]).long().to(device)
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(device)
    
    translation_logits, attention = seq2seq_model(tensor, sent_length, None, 0)
    translation_tensor = torch.argmax(translation_logits.squeeze(1), dim=1)
    translation = [LYRICS_FULL.vocab.itos[s] for s in translation_tensor]
    translation, attention = translation[1:], attention[1:]
    
    return translation, attention

In [218]:
def display_attention(candidate, translation, attention):
    
    okt = Okt()
    
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    
    attention = attention.squeeze(1).cpu().detach().numpy()
    
    cax = ax.matshow(attention, cmap='bone')
    
    ax.tick_params(labelsize = 15)
    ax.set_xticklabels([''] + ['<sos>'] + [t.lower() for t in okt.morphs(candidate)] +['<eos>'], rotation=45)
    ax.set_yticklabels([''] + translation)
    
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    
    plt.show()
    plt.close()

In [227]:
example_idx = 200
core = '여러가지 색깔의 flame'
full = ' '.join(data_set.examples[example_idx].full)

print(f'core = {core}')
print(f'full = {full}')

rebuild, attention = rebuild_lyrics(model, core)
print(f'predicted trg = {rebuild}')

core = 여러가지 색깔의 flame
full = 돈 다발 이 시 다 바리
predicted trg = ['third', 'season', '멋있데', '세', '찾아온', '춤춰', '들이지', '했을', '깃발', '하는거는', '울리네', '별거', '바라보면', 'bank', 'clock', '실망할', '시끄러운', '비트', 'none', '믿어', '다발', '줄래요', '만난다면', '트로피', '피하', 'keezy', '고맙지', '들은', '짦', '짼', '어린아이', '동심', '가삿말', '이민', '읽다가', '봉지', '영원한건', '놀이기구', '돌담', '소독약', '마이크', '에서라도', '민족', 'll', 'have', '되거나', '고기', '뿐이죠', 'bay', '검게', '기계로', '구경', '며칠', '였다는', '하긴', '억지로', '있었으면', '차단', '방해', '샷', '변하는대로', 'trumpet', '올', '쳐하면서', '글', '빽', 'goddamn', '여심', '명언', '갈건데', '시', '꼬', 'seems', '에서라도', '텔레비전', '엿봐', '쓰라려도', '염탐', '심판', '나왔으면', '신비로운', '비싼', '담을래', '나왔으면', '부터', '며', '나빠', '유통', '기쁨', '담', '파져', 'useless', '열었지', '들뜬', '전혀', '또', '부은', '브라보', '난사']
