In [1]:
import os
import json
import time
import torch
import argparse
import numpy as np
from multiprocessing import cpu_count
from torch.utils.data import DataLoader
from collections import OrderedDict, defaultdict
import torch.nn as nn
from torch.autograd import Variable

from ptb import PTB
from utils import to_var, idx2word, expierment_name
from model import SentenceVAE

In [2]:
ts = time.strftime('%Y-%b-%d-%H-%M-%S', time.gmtime())

In [3]:
datasets = OrderedDict()

In [4]:
splits = ['train', 'valid'] + (['test'] if True else [])

In [5]:
splits

['train', 'valid', 'test']

In [6]:
for split in splits:
    datasets[split] = PTB(
        data_dir='data',
        split=split,
        create_data=True,
        max_sequence_length=60,
        min_occ=1
    )

Creating new TRAIN ptb data.
Vocablurary of 9877 keys created.
Creating new VALID ptb data.
Creating new TEST ptb data.


In [7]:
vocab_size = datasets['train'].vocab_size

In [8]:
sos_idx=datasets['train'].sos_idx

In [9]:
eos_idx=datasets['train'].eos_idx

In [10]:
pad_idx=datasets['train'].pad_idx

In [41]:
max_sequence_length = 60
embedding_size = 300
hidden_size = 256
word_dropout = 0.5
latent_size = 16
num_layers = 1
bidirectional = True
batch_size = 32
rnn_type = 'rnn'
learning_rate = 0.001
k = 0.0025
x0 = 2500

In [21]:
class RVAE(nn.Module):
    def __init__(self,vocab_size, embedding_size, max_sequence_length, hidden_size, word_dropout, latent_size,
                sos_idx, eos_idx, pad_idx, rnn_type='rnn' , num_layers=1, bidirectional=True):
        
        super().__init__()
        
        self.max_sequence_length = max_sequence_length
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.pad_idx = pad_idx
        
        self.latent_size = latent_size
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        
        self.outputs2vocab = nn.Linear(hidden_size * (2 if bidirectional else 1), vocab_size)
        self.encoder = Encoder(vocab_size = vocab_size,embedding_size = embedding_size, hidden_size = hidden_size, num_layers = num_layers, bidirectional = bidirectional,latent_size = latent_size,rnn_type = rnn_type).cuda()
        self.decoder = Decoder(vocab_size = vocab_size,embedding_size = embedding_size, hidden_size = hidden_size, num_layers = num_layers, bidirectional = bidirectional,latent_size = latent_size,rnn_type = rnn_type).cuda()
    
    def forward(self,x,length):
        
        mu,logvar,reparam = self.encoder(Variable(x),length)
        logp  = self.decoder(Variable(x),reparam)
        
        
        return logp, mu, logvar, reparam

In [29]:
class Encoder(nn.Module):
    def __init__(self,vocab_size,embedding_size, hidden_size, latent_size, bidirectional=True, num_layers = 1,rnn_type='rnn'):
        super(Encoder,self).__init__()
        
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.latent_size = latent_size
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        if self.rnn_type == 'rnn':
            rnn = nn.RNN
        elif self.rnn_type == 'gru':
            rnn = nn.GRU
        elif self.rnn_type =='lstm':
            rnn = nn.LSTM
        else:
            raise ValueError()
            
        self.embedding = nn.Embedding(vocab_size,embedding_size)
        self.encoder = rnn(self.embedding_size, self.hidden_size, num_layers = self.num_layers, bidirectional = self.bidirectional, batch_first = True)
        
        self.hidden_factor = (2 if self.bidirectional else 1) * self.num_layers
        
        self.hidden2mean = nn.Linear(self.hidden_size* self.hidden_factor, self.latent_size)
        self.hidden2logv = nn.Linear(self.hidden_size* self.hidden_factor, self.latent_size)
                        
    
    def reparametrize(self, mu, logvar):
        std = logvar.mul(0.5).exp_()
        
        eps = torch.FloatTensor(std.size()).normal_()
        eps = Variable(eps).cuda()
        
        return eps.mul(std).add_(mu)
    
    def forward(self,x,length):
        batch_size = x.size(0)
        sorted_lengths, sorted_idx = torch.sort(length, descending=True)
        input_sequence = x[sorted_idx.cuda()]
        #print(input_sequence)
        _,hidden = self.encoder(self.embedding(input_sequence))
        if self.bidirectional or self.num_layers > 1:
            # flatten hidden state
            hidden = hidden.view(batch_size, self.hidden_size*self.hidden_factor)
        else:
            hidden = hidden.squeeze()

        mu = self.hidden2mean(hidden)
        logvar = self.hidden2logv(hidden)

        reparam = self.reparametrize(mu,logvar)
        
        return mu,logvar,reparam
        


In [30]:
class Decoder(nn.Module):
    def __init__(self,vocab_size,embedding_size, hidden_size, latent_size, bidirectional=True, num_layers = 1,rnn_type='rnn',word_dropout = 0.5):
        super(Decoder,self).__init__()
        
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.latent_size = latent_size
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        if self.rnn_type == 'rnn':
            rnn = nn.RNN
        elif self.rnn_type == 'gru':
            rnn = nn.GRU
        elif self.rnn_type =='lstm':
            rnn = nn.LSTM
        else:
            raise ValueError()
            
        self.hidden_factor = (2 if self.bidirectional else 1) * self.num_layers            
        self.latent2hidden = nn.Linear(latent_size, hidden_size * self.hidden_factor)
        
        self.embedding = nn.Embedding(vocab_size,embedding_size)
        self.word_dropout = nn.Dropout(p=word_dropout)
        
        self.decoder = rnn(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, batch_first=True)
        self.outputs2vocab = nn.Linear(hidden_size * (2 if bidirectional else 1), vocab_size)
        
    def forward(self,x,z):
        
        batch_size = x.size(0)
        sorted_lengths, sorted_idx = torch.sort(length, descending=True)
        input_sequence = x[sorted_idx.cuda()]
        
        hidden = self.latent2hidden(z)
        if self.bidirectional or self.num_layers > 1:
            # unflatten hidden state
            hidden = hidden.view(self.hidden_factor, batch_size, self.hidden_size)
        else:
            hidden = hidden.unsqueeze(0)
            
        outputs,_ = self.decoder(self.embedding(input_sequence),hidden)
        
        logp =nn.functional.log_softmax(self.outputs2vocab(outputs))
        
        return logp
        


In [31]:
data_loader = DataLoader(
                dataset=datasets[split],
                batch_size=batch_size,
                shuffle=split=='train',
                num_workers=cpu_count(),
                pin_memory=torch.cuda.is_available()
            )
if split == 'train':
    model.train()
else:
    model.eval()

In [32]:
rvae=RVAE(vocab_size, embedding_size, max_sequence_length, hidden_size, word_dropout, latent_size,sos_idx, eos_idx, pad_idx)

In [33]:
def kl_anneal_function(anneal_function, step, k, x0):
    if anneal_function == 'logistic':
        return float(1/(1+np.exp(-k*(step-x0))))
    elif anneal_function == 'linear':
        return min(1, step/x0)

NLL = torch.nn.NLLLoss(size_average=False, ignore_index=datasets['train'].pad_idx)

In [34]:
def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0):

    # cut-off unnecessary padding from target, and flatten
    target = target[:, :60].contiguous().view(-1)
    logp = logp.view(-1, logp.size(2))

    # Negative Log Likelihood
    NLL_loss = NLL(logp, target)

    # KL Divergence
    KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
    KL_weight = kl_anneal_function(anneal_function, step, k, x0)

    return NLL_loss, KL_loss, KL_weight

In [37]:
optimizer = torch.optim.Adam(rvae.parameters(), lr=learning_rate)

In [40]:
epochs = 10

In [44]:
step = 0
for epoch in range(epochs):
    for split in splits:
        data_loader = DataLoader(
                    dataset=datasets[split],
                    batch_size=batch_size,
                    shuffle=split=='train',
                    num_workers=cpu_count(),
                    pin_memory=torch.cuda.is_available()
                )
        
        losses = []
        target = []
        z = []
        
        if split == 'train':
            model.train()
        else:
            model.eval()
            
        for iteration, batch in enumerate(data_loader):
            batch_size = batch['input'].size(0)
            
            optimizer.zero_grad()
            
            x = batch['input'].type(torch.cuda.LongTensor)
            length = batch['length']
            
            logp, mean, logv, z=rvae(x,length)
            
            NLL_loss, KL_loss, KL_weight = loss_fn(logp, Variable(batch['target']).type(torch.cuda.LongTensor),batch['length'], mean, logv, 'logistic', step, k, x0)
            
            loss = (NLL_loss + KL_weight * KL_loss)/batch_size
            
            losses.append(float(loss.cpu().data))
            
            if split == 'train':
                loss.backward()
                optimizer.step()
                step += 1

            if iteration % 50 == 0 or iteration+1 == len(data_loader):
                print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
            %(split.upper(), iteration, len(data_loader)-1, loss.data[0], NLL_loss.data[0]/batch_size, KL_loss.data[0]/batch_size, KL_weight))
            if split == 'valid':
                if 'target_sents' not in tracker:
                    tracker['target_sents'] = list()
                tracker['target_sents'] += idx2word(batch['target'].data, i2w=datasets['train'].get_i2w(), pad_idx=datasets['train'].pad_idx)
                tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

        print("%s Epoch %02d/%i, Mean ELBO %9.4f"%(split.upper(), epoch, args.epochs, np.mean(np.array(losses))))



TEST Batch 0000/117, Loss   30.4043, NLL-Loss   30.3890, KL-Loss    7.9315, KL-Weight  0.002
TEST Batch 0050/117, Loss   28.7174, NLL-Loss   28.5958, KL-Loss   55.7315, KL-Weight  0.002
TEST Batch 0100/117, Loss   44.3120, NLL-Loss   44.2233, KL-Loss   35.8837, KL-Weight  0.002
TEST Batch 0117/117, Loss   13.6635, NLL-Loss   13.6298, KL-Loss   13.0440, KL-Weight  0.003
TEST Batch 0000/117, Loss   29.2536, NLL-Loss   29.1815, KL-Loss   27.8824, KL-Weight  0.003
TEST Batch 0050/117, Loss   26.6323, NLL-Loss   26.4969, KL-Loss   46.2161, KL-Weight  0.003
TEST Batch 0100/117, Loss   41.3903, NLL-Loss   41.3377, KL-Loss   15.8582, KL-Weight  0.003
TEST Batch 0117/117, Loss   12.6803, NLL-Loss   12.6361, KL-Loss   12.7733, KL-Weight  0.003
TEST Batch 0000/117, Loss   27.0172, NLL-Loss   26.9219, KL-Loss   27.4529, KL-Weight  0.003
TEST Batch 0050/117, Loss   25.6663, NLL-Loss   25.4896, KL-Loss   44.9508, KL-Weight  0.004
TEST Batch 0100/117, Loss   39.2800, NLL-Loss   39.1805, KL-Loss   22.

In [38]:
for iteration, batch in enumerate(data_loader):
    optimizer.zero_grad()
    x = batch['input'].type(torch.cuda.LongTensor)
    length = batch['length']
    logp, mean, logv, z=rvae(x,length)
    NLL_loss, KL_loss, KL_weight = loss_fn(logp, Variable(batch['target']).type(torch.cuda.LongTensor),batch['length'], mean, logv, 'logistic', 0, 0.0025, 2500)
    loss = (NLL_loss + KL_weight * KL_loss)/batch_size
    loss.backward()
    optimizer.step()
    step += 1
    print(loss)



Variable containing:
 73.8618
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 75.6461
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 78.5806
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 93.2190
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 69.6532
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 67.3333
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 97.5509
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 84.6253
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 85.0360
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 77.2883
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 75.0642
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 92.8637
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 85.3980
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 86.3229
[torch.c


Variable containing:
 80.7253
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 91.5615
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Variable containing:
 46.1668
[torch.cuda.FloatTensor of size 1 (GPU 0)]



In [None]:
from utils import to_var, idx2word, interpolate

with open('data'+'/ptb.vocab.json', 'r') as file:
    vocab = json.load(file)

w2i, i2w = vocab['w2i'], vocab['i2w']

samples = torch.topk(nn.functional.softmax(decoder.outputs2vocab(outputs)),1,dim=-1)[1].squeeze()

samples = samples.cpu().data.numpy()

sent_str = [str()]*len(samples)

for i, sent in enumerate(samples):
    for word_id in sent:
        if word_id == w2i['<pad>']: 
            break
        sent_str[i] += i2w[str(word_id)] + " "
    sent_str[i] = sent_str[i].strip()

samples