In [1]:
from transformer_nb import *

In [2]:
import torch.nn as nn
import torch
from torch.autograd import Variable
import json
import numpy as np
from tqdm import tqdm_notebook as tqdm
# from tqdm import tqdm
import math

In [3]:
folder = '../pointer-generator/preprocessing-cnn-all/'
# folder = '../pointer-generator/preprocessing-300d-all/'
data_name = folder+'train_seq.json'
validation_name = folder+'valid_seq.json'
testdata_name = folder+'testdata_seq.json'
vocab_name = folder+'vocab.json'

In [4]:
num_epochs = 10
save_rate = 1 #how many epochs per modelsave
continue_from = "trained/Model3" # if none, put None
continue_from = None
epsilon = 1e-10
validation_size = 5000

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
vocab = json.load(open(vocab_name, 'r'))
VOC_SIZE = len(vocab)
INPUT_MAX = 400
OUTPUT_MAX = 100

In [7]:
from torch.utils import data

class Dataset(data.Dataset):    
    def __init__(self, data_name, vocab, cutoff=None):
        print("loading json")
        data = json.load(open(data_name, 'r'))
        print("load json done.")
        sum_list = data['summary']
        data_list = data['document']
        
        if cutoff is not None:
            sum_list = sum_list[:cutoff]
            data_list = data_list[:cutoff]
        # idata -> list
        self.size = len(sum_list)
        self.dataset = []
        self.sum_len = 0
        
        for i in tqdm(range(len(sum_list))):
            if(len(data_list[i]) <= INPUT_MAX):
                data = [vocab['<pad>']]*(INPUT_MAX-len(data_list[i])) + data_list[i]
            else:
                data = data_list[i][:INPUT_MAX]
                
            if(len(sum_list[i]) <= OUTPUT_MAX):
                sum_in = sum_list[i] + [vocab['<pad>']]*(OUTPUT_MAX-len(sum_list[i]))
            else:
                sum_in = sum_list[i][:OUTPUT_MAX]
                
            self.dataset.append([data, sum_in])
     
    def __len__(self):
        return self.size
    def __getitem__(self, index):
        return torch.tensor(self.dataset[index][0]), torch.tensor(self.dataset[index][1])

In [8]:
batch_size = 10
training_set = Dataset(data_name, vocab)
validation_set = Dataset(validation_name, vocab, cutoff=validation_size)
params = {'batch_size':batch_size,
         'shuffle': True,
         'num_workers': 4}
training_generator = data.DataLoader(training_set, **params)
validation_generator = data.DataLoader(validation_set, **params)

def data_gen_train():
    for src,tgt in training_generator:
        src = Variable(src, requires_grad=False).to(device)
        tgt = Variable(tgt, requires_grad=False).to(device)
        yield Batch(src, tgt, vocab['<pad>'])
def data_gen_val():
    for src,tgt in validation_generator:
        src = Variable(src, requires_grad=False).to(device)
        tgt = Variable(tgt, requires_grad=False).to(device)
        yield Batch(src, tgt, vocab['<pad>'])

loading json
load json done.


HBox(children=(IntProgress(value=0, max=284367), HTML(value='')))


loading json
load json done.


HBox(children=(IntProgress(value=0, max=2860), HTML(value='')))




In [9]:
total_train = int(math.ceil(training_set.size / batch_size))
total_valid = int(math.ceil(validation_set.size / batch_size))
print(total_train, total_valid)

28437 286


In [10]:
criterion = LabelSmoothing(size=VOC_SIZE, padding_idx=vocab['<pad>'], smoothing=0.1)
# model = make_model(VOC_SIZE, VOC_SIZE, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1)
model = make_model(VOC_SIZE, VOC_SIZE, N=4, d_model=256, d_ff=1024, h=8, dropout=0.1)


if continue_from == None:
    model_opt = NoamOpt(model.src_embed[0].d_model, 1, 4000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
else:
    saved_model = torch.load(continue_from)
    model.load_state_dict(saved_model['model'])
    model_opt = saved_model['optim']

criterion.cuda()
model.cuda()



EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0): Linear(in_features=256, out_features=256, bias=True)
            (1): Linear(in_features=256, out_features=256, bias=True)
            (2): Linear(in_features=256, out_features=256, bias=True)
            (3): Linear(in_features=256, out_features=256, bias=True)
          )
          (dropout): Dropout(p=0.1)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=256, out_features=1024, bias=True)
          (w_2): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.1)
        )
        (sublayer): ModuleList(
          (0): SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1)
          )
          (1): SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1)
      

In [None]:
# model.cuda()
# criterion.cuda()
for epoch in range(num_epochs):
    print("Epoch", epoch)
        
    model.train()
    run_epoch(data_gen_train(), model, 
              SimpleLossCompute(model.generator, criterion, model_opt), total=total_train)
    model.eval()
    run_epoch(data_gen_val(), model, 
                    SimpleLossCompute(model.generator, criterion, None), total=total_valid)
    try:
        torch.save({'model':model.state_dict(),
                   'optim': model_opt, }, 'trained/Model'+str(epoch))
    except:
        continue

In [11]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for i in range(max_len-1):
        out = model.decode(memory, src_mask, 
                           Variable(ys), 
                           Variable(subsequent_mask(ys.size(1))
                                    .type_as(src.data)))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim = 1)
        next_word = next_word.data[0]
        ys = torch.cat([ys, 
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    return ys

def readable(sent):
    try:
        end = sent.index('<eos>')
    except ValueError:
        end = len(sent)
    sent = " ".join(sent[:end])
    sent = sent.replace("<bos>", '')
    sent = sent.replace("<eos>", '')
    sent = sent.replace("<unk>", '-UNK-')
    sent = sent.replace("<pad>", '')
    return sent

vocab_inv = {i:w for w, i in vocab.items()}

In [16]:
saved_model = torch.load('trained/Model9')
model.load_state_dict(saved_model['model'])

model.eval()

IndexError: index 8 is out of bounds for dimension 0 with size 1

In [23]:
for j,batch in enumerate(data_gen_val()):
    srcs = batch.src
    src_masks = batch.src_mask
    
    trgs = batch.trg
    trg_masks = batch.trg_mask
        
    bs = srcs.shape[0]
    
    
    for i in range(bs):    
        if i + j*bs == 99:           
        
            src = srcs[i]
            trg = trgs[i]
            src_mask = src_masks[i]
            trg_mask = trg_masks[i]

            out = greedy_decode(model, src.view(1, -1), None, max_len=100, start_symbol=vocab['<bos>'])
            out = out.view(-1)

            src_words = [vocab_inv[idx.item()] for idx in src]
            trg_words = [vocab_inv[idx.item()] for idx in trg]
            out_words = [vocab_inv[idx.item()] for idx in out]

            print('[input]\n', readable(src_words))
            print('[target]\n', readable(trg_words))
            print('[output]\n', readable(out_words))

[input]
  baghdad , iraq -lrb- cnn -rrb- -- brad blauser lives in war-torn baghdad , where he does n't earn a paycheck and is thousands of miles from his family . but he has no intention of leaving anytime soon . since 2005 , brad blauser 's wheelchairs for iraqi kids program has distributed nearly 650 free wheelchairs . for the past four years , the dallas , texas , native has been providing hope to hundreds of disabled iraqi children and their families through the distribution of pediatric wheelchairs . `` disabled children -- they 're really the forgotten ones in this war , `` said blauser , 43 . `` they are often not seen in society . `` blauser arrived in iraq as a civilian contractor in 2004 , but quit that job last year to devote himself full time to his program , without compensation . vote now for the cnn hero of the year . `` there 's no paycheck . it 's not really safe here . but this is a once-in-a-lifetime opportunity , `` he said . an estimated one in seven iraqi children