In [1]:
import json
import os
from multiprocessing import Pool

import torch
from rouge import Rouge
from tqdm import tqdm_notebook as tqdm

In [2]:
from preprocessors import Preprocessor
from dataset import make_data_generator
from transformer_nb2 import *

In [3]:
# Which evaluation set to run on: "giga" or anything else for the second dataset.
task_name = "giga"

# Root of the pointer-generator data checkout. This is machine-specific: edit
# this single constant instead of every path below.
DATA_ROOT = "/home/george/Projects/speechlab/pointer-generator"

# Source documents and reference summaries for the selected task.
if task_name == "giga":
    doc_name = os.path.join(DATA_ROOT, "data/Giga/input.txt")
    summ_name = os.path.join(DATA_ROOT, "data/Giga/task1_ref0.txt")
else:
    doc_name = os.path.join(DATA_ROOT, "data2/val.txt.src")
    summ_name = os.path.join(DATA_ROOT, "data2/val.txt.tgt.tagged")

# Number of evaluation examples; used later to derive the number of batches.
val_size = 1951 if task_name == "giga" else 13368

continue_from = "trained-giga-50k/Model11"  # checkpoint file given to torch.load
eval_dir = "evaluation-{}/".format(task_name)
data_seq_name = eval_dir + 'tmp.json'
vocab_name = 'data-{}/vocab.json'.format(task_name)

# Pure-Python replacement for `!mkdir -p`: works outside IPython and is a
# no-op when the directory already exists.
os.makedirs(eval_dir, exist_ok=True)

num_threads = 4
batch_size = 64 if task_name == "giga" else 16

# Read the vocabulary through a context manager so the file handle is closed.
with open(vocab_name, 'r') as vocab_file:
    vocab = json.load(vocab_file)
VOC_SIZE = len(vocab)

# Maximum source / summary lengths handed to the data generator and decoder.
INPUT_MAX = 50 if task_name == "giga" else 400
OUTPUT_MAX = 20 if task_name == "giga" else 100

# Special tokens; they are looked up in `vocab` by these exact strings.
UNK = "[UNK]"
BOS = "[CLS]"
EOS = "[SEP]"
PAD = "[PAD]"

# Dataset-specific raw tokens and their replacements, passed to Preprocessor.
if task_name == 'giga':
    token_mappings = {'UNK':UNK, '-lrb-':'(', '-rrb-':')'}
else:
    token_mappings = {'<unk>':UNK, '<t>':'', '</t>':'', '-lrb-':'(', '-rrb-':')'}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(device)

# When True, the preprocessing cell skips process()/export() and only attaches
# the vocabulary to the preprocessor.
faster = False


FileNotFoundError: [Errno 2] No such file or directory: 'data-giga/vocab.json'

In [None]:

# Build the preprocessor over the document / reference-summary files.
prepro = Preprocessor(doc_name, summ_name, 0, 50000, token_mappings, num_threads)
summaries = prepro.summaries

if not faster:
    # Full path: process with the vocabulary, export the result to
    # data_seq_name, and keep the documents for inspection.
    prepro.process(vocab)
    prepro.export(None, data_seq_name, None)
    articles = prepro.documents
else:
    # Fast path: only attach the vocabulary and its inverse mapping.
    # Note that `articles` is not defined on this path.
    prepro.vocab = vocab
    prepro.vocab_inv = {value: key for key, value in vocab.items()}

In [None]:
# Print one summary/document pair as a sanity check. This only runs on the
# full preprocessing path, because `articles` exists only there.
if not faster:
    index = 73
    for label, texts in (("[summary]", summaries), ("[documen]", articles)):
        print(label, texts[index])

In [None]:

# Build the evaluation dataset and its batch generator from the exported
# sequence file. Order is kept (shuffle=False) so outputs line up with the
# reference summaries.
ev_set, ev_generator = make_data_generator(
    data_seq_name,
    INPUT_MAX,
    OUTPUT_MAX,
    vocab[PAD],
    batch_size,
    cutoff=None,
    shuffle=False,
    num_workers=4,
)

def data_gen_val():
    """Yield evaluation batches placed on `device`.

    Iterates over `ev_generator`, moves each (src, tgt) pair to `device` and
    wraps it in a `Batch` constructed with the padding id `vocab[PAD]`.
    """
    pad_id = vocab[PAD]
    for src, tgt in ev_generator:
        # Tensors no longer need the deprecated Variable wrapper; moving them
        # to the device is all that is required.
        yield Batch(src.to(device), tgt.to(device), pad_id)

In [None]:

# Load the checkpoint straight onto the evaluation device. map_location
# handles both the CPU-only case and checkpoints saved on a GPU index that
# does not exist on this machine.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
saved_model = torch.load(continue_from, map_location=device)

model = make_model(VOC_SIZE, VOC_SIZE, N=4, d_model=256, d_ff=1024, h=8, dropout=0.1, emb_share=True)
model.load_state_dict(saved_model['model'])
model.eval()  # evaluation mode (e.g. disables dropout)
model.to(device)

In [None]:
# Drop the checkpoint dict now that its weights are copied into `model`.
# pop() with a default keeps the cell re-runnable: a plain `del` raises
# NameError the second time the cell is executed.
globals().pop("saved_model", None)
torch.cuda.empty_cache()

In [None]:
def greedy_decode_batch(model, src, src_mask, max_len, start_symbol):
    """Greedily decode a whole batch of source sequences.

    Args:
        model: object exposing `encode`, `decode` and `generator`, called as
            shown in the body.
        src: source batch tensor; its first dimension is the batch size.
        src_mask: mask forwarded unchanged to `model.encode` / `model.decode`.
        max_len: length of each returned sequence, start symbol included.
        start_symbol: id written to the first position of every sequence.

    Returns:
        Tensor with one row per source sequence, using the dtype and device
        of `src`. Each row starts with `start_symbol` followed by the
        `max_len - 1` greedily chosen ids (no early stop on an end token).
    """
    batch_size = src.shape[0]

    # Inference only: no_grad avoids building autograd graphs for every
    # forward pass, which saves memory and time.
    with torch.no_grad():
        memory = model.encode(src, src_mask)

        # Create the start tokens directly in the source dtype/device rather
        # than as floats that are cast afterwards.
        ys = torch.full((batch_size, 1), start_symbol,
                        dtype=src.dtype, device=src.device)

        for _ in range(max_len - 1):
            tgt_mask = subsequent_mask(ys.size(1)).type_as(src)
            out = model.decode(memory, src_mask, ys, tgt_mask)

            # Only the last position is needed to predict the next token.
            scores = model.generator(out[:, -1, :])
            next_words = torch.argmax(scores, dim=1, keepdim=True)

            ys = torch.cat((ys, next_words), dim=1)
    return ys

In [None]:
def readable(sent):
    """Turn a token sequence into a plain space-separated string.

    Everything from the first EOS token onwards is discarded, and any
    remaining BOS / EOS / PAD tokens are removed before joining.
    """
    cutoff = sent.index(EOS) if EOS in sent else len(sent)
    specials = (BOS, EOS, PAD)
    return " ".join(tok for tok in sent[:cutoff] if tok not in specials)

In [None]:
# Number of batches needed to cover val_size examples (ceiling division,
# done with integers so no `math` import is required).
total = (val_size + batch_size - 1) // batch_size

# Decoded summaries, one string per evaluation example, in dataset order.
hypothesis = []

for i, batch in tqdm(enumerate(data_gen_val()), total=total):
    outs = greedy_decode_batch(model, batch.src, batch.src_mask,
                               max_len=OUTPUT_MAX, start_symbol=vocab[BOS])

    for out_tensor in outs:
        tokens = prepro.ids_to_tokens(out_tensor.cpu().numpy())
        hypothesis.append(readable(tokens))

    # enumerate() starts at 0, so the last wanted batch has index total - 1.
    if i + 1 >= total:
        break

In [None]:
# Average ROUGE over all (hypothesis, reference) pairs and print the
# F-scores for ROUGE-1, ROUGE-2 and ROUGE-L on one line.
rouge = Rouge()
scores = rouge.get_scores(hypothesis, summaries, avg=True)
print(*(scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')))

In [None]:
# 
# 
# 
# 0.3320365650729159 0.14976638992424993 0.29414807564498124
# 
# 
# 
# 0.3401100694524329 0.1571963019013948 0.30353686782891715
#
#
# 0.34232405091848905 0.16004304876386657 0.30398585868610345

In [None]:
# Show one decoded summary followed by its reference summary.
index = 888
print(hypothesis[index], summaries[index], sep="\n")