In [1]:
import gensim
import numpy as np
from gensim.models.word2vec import Word2Vec
import sys
import multiprocessing
from tqdm import tqdm
import os

Slow version of gensim.models.doc2vec is being used


In [2]:
import json
import time
from konlpy.tag import Twitter
twitter=Twitter()

In [3]:
import torch
from torch.autograd import Variable
import torch.nn.utils.rnn as rnn_utils
import torch.nn as nn

In [4]:
ts = time.strftime('%Y-%b-%d-%H-%M-%S', time.gmtime())

save_model_path = os.path.join('won', ts)
os.makedirs('.\\'+save_model_path)

In [5]:
model = gensim.models.Word2Vec.load("./Word2vec_pad.model")

word2index=model.wv.index2word

word2index = {}
for i,j in enumerate(model.wv.index2word):
    word2index[j]=i
index2word=model.wv.index2word

model_embedding = []
for index in model.wv.index2word:
    model_embedding.append(model.wv[index])
model_embedding = np.array(model_embedding)

with open('input.txt', 'r') as f:
    input_dict=json.load(f)


song_names=list(input_dict.keys())
inputs = {}
targets = {}
for song_name in song_names:
    for i, line in enumerate(input_dict[song_name]):
        for j, s in enumerate(line):
            input_dict[song_name][i][j]=word2index[s]
    inputs[song_name] = input_dict[song_name][:len(input_dict[song_name])//2]
    targets[song_name] = input_dict[song_name][len(input_dict[song_name])//2 : ]

In [6]:
keys=inputs.keys()

In [7]:
inputs_len = {}
for key in keys:
    temp=[]
    for sentence in inputs[key]:
        temp.append(len(sentence)-sentence.count(0))
    inputs_len[key] = temp

In [8]:
inputs_len

{'`친구`라 말할 수 있는 건': [8,
  6,
  8,
  8,
  7,
  10,
  6,
  5,
  9,
  8,
  10,
  8,
  7,
  6,
  6,
  6,
  7,
  8,
  7,
  10,
  6,
  6,
  7,
  8,
  7,
  10,
  7,
  8,
  7,
  9,
  7,
  8,
  7,
  10],
 '사랑한 후에 (2006 Ver.)': [6,
  9,
  8,
  6,
  7,
  8,
  7,
  6,
  6,
  6,
  10,
  6,
  6,
  8,
  4,
  7,
  5,
  7,
  5,
  9,
  5,
  5,
  8,
  6,
  8,
  4,
  7,
  5,
  7,
  5,
  9,
  5,
  5,
  8],
 '고여': [5,
  4,
  7,
  8,
  6,
  6,
  7,
  10,
  8,
  6,
  6,
  5,
  5,
  9,
  5,
  7,
  7,
  11,
  8,
  8,
  6,
  6,
  5,
  5,
  9,
  5,
  4,
  4,
  9,
  5,
  7,
  7],
 '사랑한 후에': [6,
  4,
  5,
  10,
  6,
  6,
  5,
  8,
  9,
  5,
  7,
  8,
  7,
  7,
  10,
  8,
  5,
  6,
  6,
  6,
  11,
  5,
  7,
  8,
  7,
  7,
  10,
  8,
  5,
  8,
  8,
  8,
  5,
  5,
  5,
  7,
  6,
  5],
 '슬픔이 올때': [4,
  7,
  6,
  4,
  6,
  7,
  6,
  9,
  5,
  4,
  4,
  4,
  10,
  8,
  7,
  6,
  3,
  7,
  5,
  7,
  3,
  3,
  5,
  5,
  4,
  6,
  7,
  3,
  5,
  5,
  3,
  7,
  7,
  7,
  3,
  3,
  5,
  8,
  6,
  7,
  3,
  9,
  3,
  7,
  5,
 

In [9]:
max_sequence_length = 60
embedding_size = 300
hidden_size = 256
word_dropout = 0.5
latent_size = 16
num_layers = 1
bidirectional = False
batch_size_fit = 32
rnn_type = 'gru'
learning_rate = 0.0002
k = 0.002
x0 = 2500
vocab_size = len(index2word)
sos_idx = word2index['_STA_']
eos_idx = word2index['_EOS_']
pad_idx = word2index['_PAD_']

In [10]:
class RVAE(nn.Module):
    def __init__(self,vocab_size, embedding_size, max_sequence_length, hidden_size, word_dropout, latent_size,
                sos_idx, eos_idx, pad_idx, rnn_type='rnn' , num_layers=1, bidirectional=True):
        
        super().__init__()
        
        self.max_sequence_length = max_sequence_length
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.pad_idx = pad_idx
        
        self.latent_size = latent_size
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(vocab_size,embedding_size)
        #self.outputs2vocab = nn.Linear(hidden_size * (2 if bidirectional else 1), vocab_size)
        self.encoder = Encoder(vocab_size = vocab_size,embedding_size = embedding_size, hidden_size = hidden_size, num_layers = num_layers, bidirectional = bidirectional,latent_size = latent_size,rnn_type = rnn_type).cuda()
        self.decoder = Decoder(vocab_size = vocab_size,embedding_size = embedding_size, hidden_size = hidden_size, num_layers = num_layers, bidirectional = bidirectional,latent_size = latent_size,rnn_type = rnn_type, word_dropout=word_dropout).cuda()
    
    def forward(self,x,length):        
        batch_size = x.size(0)
        sorted_lengths, sorted_idx = torch.sort(length, descending=True)
        input_sequence = x[sorted_idx.cuda()]
        input_embedding = self.embedding(input_sequence)
        
        packed_input = rnn_utils.pack_padded_sequence(input_embedding, sorted_lengths.tolist(), batch_first=True)

        mu,logvar,reparam = self.encoder(packed_input)
        logp,outputs  = self.decoder(input_embedding, reparam, sorted_lengths, sorted_idx)
        
        
        return logp, mu, logvar, reparam, outputs

In [11]:
class Encoder(nn.Module):
    def __init__(self,vocab_size,embedding_size, hidden_size, latent_size, bidirectional=True, num_layers = 1,rnn_type='rnn'):
        super(Encoder,self).__init__()
        
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.latent_size = latent_size
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        if self.rnn_type == 'rnn':
            rnn = nn.RNN
        elif self.rnn_type == 'gru':
            rnn = nn.GRU
        elif self.rnn_type =='lstm':
            rnn = nn.LSTM
        else:
            raise ValueError()
            
        
        self.encoder = rnn(self.embedding_size, self.hidden_size, num_layers = self.num_layers, bidirectional = self.bidirectional, batch_first = True)
        
        self.hidden_factor = (2 if self.bidirectional else 1) * self.num_layers
        
        self.hidden2mean = nn.Linear(self.hidden_size* self.hidden_factor, self.latent_size)
        self.hidden2logv = nn.Linear(self.hidden_size* self.hidden_factor, self.latent_size)
                        
    
    def reparametrize(self, mu, logvar):
        std = logvar.mul(0.5).exp_()
        
        eps = torch.FloatTensor(std.size()).normal_()
        #print(eps)
        eps = Variable(eps).cuda()
        
        return eps.mul(std).add_(mu)
    
    def forward(self,x):
        
        _,hidden = self.encoder(x)
        
        if self.bidirectional or self.num_layers > 1:
            # flatten hidden state
            hidden = hidden.view(batch_size, self.hidden_size*self.hidden_factor)
        else:
            hidden = hidden.squeeze()

        mu = self.hidden2mean(hidden)
        
        logvar = self.hidden2logv(hidden)
        std = torch.exp(0.5 * logvar)
        #reparam = self.reparametrize(mu,logvar)
        z = Variable(torch.randn([batch_size, self.latent_size])).cuda()
        z = z * std + mu
        
        
        return mu,logvar,z#,reparam

In [12]:
class Decoder(nn.Module):
    def __init__(self,vocab_size,embedding_size, hidden_size, latent_size, bidirectional=True, num_layers = 1,rnn_type='rnn',word_dropout = 0.5):
        super(Decoder,self).__init__()
        
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.latent_size = latent_size
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        if self.rnn_type == 'rnn':
            rnn = nn.RNN
        elif self.rnn_type == 'gru':
            rnn = nn.GRU
        elif self.rnn_type =='lstm':
            rnn = nn.LSTM
        else:
            raise ValueError()
            
        self.hidden_factor = (2 if self.bidirectional else 1) * self.num_layers            
        self.latent2hidden = nn.Linear(latent_size, hidden_size * self.hidden_factor)
        
        self.embedding = nn.Embedding(vocab_size,embedding_size)
        self.word_dropout = nn.Dropout(p=word_dropout)
        
        self.decoder = rnn(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, batch_first=True)
        self.outputs2vocab = nn.Linear(hidden_size * (2 if bidirectional else 1), vocab_size)
        
    def forward(self,x,z,sorted_lengths,sorted_idx):

        
        hidden = self.latent2hidden(z)
        if self.bidirectional or self.num_layers > 1:
            # unflatten hidden state
            hidden = hidden.view(self.hidden_factor, batch_size, self.hidden_size)
        else:
            hidden = hidden.unsqueeze(0)
        input_embedding = self.word_dropout(x)
        packed_input = rnn_utils.pack_padded_sequence(input_embedding, sorted_lengths.tolist(), batch_first=True)
        outputs,_ = self.decoder(packed_input, hidden)
        
        padded_outputs = rnn_utils.pad_packed_sequence(outputs, batch_first=True)[0]
        padded_outputs = padded_outputs.contiguous()
        _,reversed_idx = torch.sort(sorted_idx.cuda())
        padded_outputs = padded_outputs[reversed_idx]
        b,s,_ = padded_outputs.size()
        
        logp = nn.functional.log_softmax(self.outputs2vocab(padded_outputs.view(-1, padded_outputs.size(2))), dim=-1)
        logp = logp.view(b, s, self.embedding.num_embeddings)
        
        return logp,padded_outputs.view(-1, padded_outputs.size(2))

In [13]:
rvae=RVAE(vocab_size, embedding_size, max_sequence_length, hidden_size, word_dropout, latent_size,sos_idx, eos_idx, pad_idx , num_layers=num_layers ,rnn_type='gru',bidirectional= False).cuda()

In [14]:
def kl_anneal_function(anneal_function, step, k, x0):
    if anneal_function == 'logistic':
        return float(1/(1+np.exp(-k*(step-x0))))
    elif anneal_function == 'linear':
        return min(1, step/x0)



In [15]:
NLL = torch.nn.NLLLoss(size_average=False, ignore_index = pad_idx)

In [16]:
def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0):

    # cut-off unnecessary padding from target, and flatten
    target = target[:, :torch.max(length)].contiguous().view(-1)
    logp = logp.view(-1, logp.size(2))

    # Negative Log Likelihood
    NLL_loss = NLL(logp, target)

    # KL Divergence
    KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
    KL_weight = kl_anneal_function(anneal_function, step, k, x0)

    return NLL_loss, KL_loss, KL_weight

In [17]:
optimizer = torch.optim.Adam(rvae.parameters(), lr=learning_rate)

In [18]:
rvae

RVAE(
  (embedding): Embedding(8375, 300)
  (encoder): Encoder(
    (encoder): GRU(300, 256, batch_first=True)
    (hidden2mean): Linear(in_features=256, out_features=16, bias=True)
    (hidden2logv): Linear(in_features=256, out_features=16, bias=True)
  )
  (decoder): Decoder(
    (latent2hidden): Linear(in_features=16, out_features=256, bias=True)
    (embedding): Embedding(8375, 300)
    (word_dropout): Dropout(p=0.5)
    (decoder): GRU(300, 256, batch_first=True)
    (outputs2vocab): Linear(in_features=256, out_features=8375, bias=True)
  )
)

In [19]:
keys=inputs.keys()
x = []
x_len = []
y = []
for key in keys:
    if len(inputs[key]) != len(inputs_len[key])!= len(target[key]):
        print(len(inputs_len[key])-len(inputs[key]))
    y = y + targets[key]
    x = x + inputs[key]
    
    x_len = x_len + inputs_len[key]

In [20]:
def batch(batch_size,input_var,target_var,length_var):
    start = 0
    end = batch_size
    #if len(input_var)%32 != 0:
    while end < len(input_var):
        batch_input = input_var[start:end]
        batch_target = target_var[start:end]
        batch_length = length_var[start:end]
        temp = end
        end  = end + batch_size
        start = temp
        yield batch_input, batch_target, batch_length
        
    if end >= len(input_var):
        batch_input  = input_var[start:]
        batch_target = target_var[start:]
        batch_length = length_var[start:]
        yield batch_input, batch_target, batch_length

In [21]:
epochs = 10

In [22]:
step = 0
avg_losses = []
losses = []
NLL_losses = []
KL_losses = []
KL_weights = []
iteration = 0
for epoch in range(epochs):
    
    for batch_x, batch_y, batch_len in batch(batch_size_fit, x, y, x_len):
        iteration = iteration + 1
        
        x_ = Variable(torch.cuda.LongTensor(batch_x))
        y_ = Variable(torch.cuda.LongTensor(batch_y))
        batch_size = x_.size(0)
        length = torch.cuda.LongTensor(batch_len)

        logp, mean, logv, z, outputs=rvae(x_,length)

        NLL_loss, KL_loss, KL_weight = loss_fn(logp, y_, length, mean, logv, 'logistic', step, k, x0)

        loss = (NLL_loss + KL_loss*KL_weight)/batch_size#(NLL_loss/batch_size)


        losses.append(float(loss.cpu().data))
        NLL_losses.append(NLL_loss.data[0]/batch_size)
        KL_losses.append(KL_loss.data[0]/batch_size)
        KL_weights.append(KL_weight)



        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        step += 1

        if iteration % 100 == 0 or iteration == (len(x)-1)//batch_size:
            print("Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
        %( iteration, (len(x)-1)//batch_size, loss.data[0], NLL_loss.data[0]/batch_size, KL_loss.data[0]/batch_size, KL_weight))
            np.savez(L=losses,file='loss.npz')
            np.savez(L=NLL_losses,file='NLL_losses.npz')
            np.savez(L=KL_losses,file='KL_losses.npz') 
            np.savez(L=KL_weights,file='KL_weights.npz')


    checkpoint_path = os.path.join(save_model_path, "E%i.pytorch"%(epoch))
    torch.save(rvae, checkpoint_path)
    print("Model saved at %s"%checkpoint_path)
    print("Epoch %02d/%i, Mean ELBO %9.4f"%( epoch, epochs, np.mean(np.array(losses))))
    avg_losses.append(np.mean(np.array(losses)))
    np.savez(L=avg_losses,file='avg_losses.npz')

Batch 0100/903, Loss   43.4926, NLL-Loss   43.2600, KL-Loss   28.5497, KL-Weight  0.008
Batch 0200/903, Loss   45.9288, NLL-Loss   45.8571, KL-Loss    7.2257, KL-Weight  0.010
Batch 0300/903, Loss   39.3435, NLL-Loss   39.2592, KL-Loss    6.9613, KL-Weight  0.012
Batch 0400/903, Loss   45.4945, NLL-Loss   45.1188, KL-Loss   25.4788, KL-Weight  0.015
Batch 0500/903, Loss   43.1458, NLL-Loss   42.8832, KL-Loss   14.6266, KL-Weight  0.018
Batch 0600/903, Loss   34.0220, NLL-Loss   33.5698, KL-Loss   20.7038, KL-Weight  0.022
Batch 0700/903, Loss   40.1691, NLL-Loss   39.6384, KL-Loss   19.9917, KL-Weight  0.027
Batch 0800/903, Loss   33.4194, NLL-Loss   32.5948, KL-Loss   25.5826, KL-Weight  0.032
Batch 0900/903, Loss   42.1313, NLL-Loss   41.3182, KL-Loss   20.8005, KL-Weight  0.039
Batch 0903/903, Loss   34.1754, NLL-Loss   33.5105, KL-Loss   16.9128, KL-Weight  0.039


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Model saved at won\2018-Jun-06-12-23-50\E0.pytorch
Epoch 00/10, Mean ELBO   40.4437
Batch 1000/903, Loss   31.5260, NLL-Loss   30.6706, KL-Loss   18.0717, KL-Weight  0.047
Batch 1100/903, Loss   36.3860, NLL-Loss   35.1866, KL-Loss   20.9627, KL-Weight  0.057
Batch 1200/903, Loss   45.2172, NLL-Loss   44.1080, KL-Loss   16.0729, KL-Weight  0.069
Batch 1300/903, Loss   37.6125, NLL-Loss   36.3427, KL-Loss   15.2951, KL-Weight  0.083
Batch 1400/903, Loss   33.8620, NLL-Loss   32.1901, KL-Loss   16.7909, KL-Weight  0.100
Batch 1500/903, Loss   38.0700, NLL-Loss   36.1706, KL-Loss   15.9617, KL-Weight  0.119
Batch 1600/903, Loss   23.1332, NLL-Loss   21.2077, KL-Loss   13.5974, KL-Weight  0.142
Batch 1700/903, Loss   34.1355, NLL-Loss   32.1201, KL-Loss   12.0181, KL-Weight  0.168
Batch 1800/903, Loss   27.3776, NLL-Loss   24.5483, KL-Loss   14.3257, KL-Weight  0.197
Model saved at won\2018-Jun-06-12-23-50\E1.pytorch
Epoch 01/10, Mean ELBO   37.5901
Batch 1900/903, Loss   40.8898, NLL-Loss

In [28]:
try:
    rvae = torch.load(checkpoint_path)
    print("\n--------model restored--------\n")
except:
    print("\n--------model not restored--------\n")
    pass
rvae.cuda()
rvae.eval()


--------model restored--------



RVAE(
  (embedding): Embedding(8375, 300)
  (encoder): Encoder(
    (encoder): GRU(300, 256, batch_first=True)
    (hidden2mean): Linear(in_features=256, out_features=16, bias=True)
    (hidden2logv): Linear(in_features=256, out_features=16, bias=True)
  )
  (decoder): Decoder(
    (latent2hidden): Linear(in_features=16, out_features=256, bias=True)
    (embedding): Embedding(8375, 300)
    (word_dropout): Dropout(p=0.5)
    (decoder): GRU(300, 256, batch_first=True)
    (outputs2vocab): Linear(in_features=256, out_features=8375, bias=True)
  )
)

In [29]:
def inference(batch, z):
    
    hidden = rvae.decoder.latent2hidden(z)
    hidden = hidden.view(rvae.decoder.hidden_factor, batch_size, rvae.hidden_size)
    
    t = 0
    generations=torch.cuda.FloatTensor(batch_size, max_sequence_length).fill_(pad_idx).long()
    
    while(t<max_sequence_length ):
        if t == 0:
            input_sequence = Variable(torch.Tensor(batch_size).fill_(sos_idx).long()).cuda()
        input_sequence = input_sequence.unsqueeze(1)
        input_embedding = rvae.embedding(input_sequence)
        output, hidden = rvae.decoder.decoder(input_embedding, hidden)
        logits = rvae.decoder.outputs2vocab(output)
        samples = torch.topk(logits,1,dim=-1)[1].squeeze()
        input_sequence = samples.squeeze()
        generations[:,t] = input_sequence.data
        if ((torch.sum(input_sequence)/input_sequence.size(0)) == eos_idx).cpu().data.numpy():
            break
        t+=1
    return generations

def print_inference(generations):


    w2i, i2w = word2index, index2word

    samples = generations.cpu().numpy()

    sent_str = [str()]*len(samples)

    for i, sent in enumerate(samples):
        for word_id in sent:
            if word_id == w2i['_PAD_']: 
                break
            sent_str[i] += i2w[word_id] + " "
        sent_str[i] = sent_str[i].strip()
    return sent_str

In [30]:
batch_size = 5

z = Variable(torch.randn([batch_size, latent_size])).cuda()

In [31]:
generations=inference(batch_size, z)

In [32]:
sent_str = print_inference(generations)

In [33]:
for i in sent_str:
    print(i)

너 의 곁 에 _EOS_ _EOS_ _EOS_
내 가 더 이상 나 _EOS_ _EOS_
난 꾸밈 없이 _EOS_ _EOS_ _EOS_ _EOS_
너 를 사랑해 _EOS_ _EOS_ _EOS_ _EOS_
내 가 알 아 줘 요 _EOS_


In [45]:
steps = 8

z1 = torch.randn([latent_size]).numpy()
z2 = torch.randn([latent_size]).numpy()

interpolation = np.zeros((z1.shape[0], steps + 2))

for dim, (s,e) in enumerate(zip(z1,z2)):
    interpolation[dim] = np.linspace(s,e,steps+2)
    
z = Variable(torch.from_numpy(interpolation.T).float()).cuda()

In [46]:
batch_size = z.size(0)

generations=inference(batch_size, z)

sent_str = print_inference(generations)

for i in sent_str:
    print(i)

나 의 마음 을 _EOS_ _EOS_
나 의 마음 을 _EOS_ _EOS_
나 의 마음 을 _EOS_ _EOS_
나 의 마음 을 _EOS_ _EOS_
나 의 마음 을 _EOS_ _EOS_
나 는 어떡해 _EOS_ _EOS_ _EOS_
내 가 더 잘할 게 _EOS_
내 가 너무 그리워 _EOS_ _EOS_
내 가 너무 그리워 _EOS_ _EOS_
내 가 너무 그리워 _EOS_ _EOS_
