In [1]:
import json
import gensim
import os
import json
import tqdm
import time
import numpy as np
import nltk
import random

%matplotlib  inline
import matplotlib.pyplot as plt

from rouge import Rouge 
rouge = Rouge()

In [2]:
% time
with open('./data_pointer_example.txt', 'r', encoding='UTF-8') as f:
    stories = json.load(f)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs


In [3]:
model_embedding = np.load('Word2vec_pointer.npz')['E']

In [4]:
ts = time.strftime('%Y-%b-%d-%H-%M-%S', time.gmtime())

save_model_path = os.path.join('won', ts)
os.makedirs('./'+save_model_path)

In [5]:
class Vocab(object):

  def __init__(self, vocab_file, max_size):
    self._word_to_id = {}
    self._id_to_word = {}
    self._count = 0 # keeps track of total number of words in the Vocab

    # [UNK], [PAD], [START] and [STOP] get the ids 0,1,2,3.
    for w in [UNKNOWN_TOKEN, PAD_TOKEN, START_DECODING, STOP_DECODING]:
      self._word_to_id[w] = self._count
      self._id_to_word[self._count] = w
      self._count += 1

    # Read the vocab file and add words up to max_size
    with open(vocab_file, 'r', encoding='utf-8') as vocab_f:
      for line in vocab_f:
        pieces = line.split()
        if len(pieces) != 2:
          print ('Warning: incorrectly formatted line in vocabulary file: %s\n' % line)
          continue
        w = pieces[0]
        if w in [SENTENCE_START, SENTENCE_END, UNKNOWN_TOKEN, PAD_TOKEN, START_DECODING, STOP_DECODING]:
          raise Exception('<s>, </s>, [UNK], [PAD], [START] and [STOP] shouldn\'t be in the vocab file, but %s is' % w)
        if w in self._word_to_id:
          raise Exception('Duplicated word in vocabulary file: %s' % w)
        self._word_to_id[w] = self._count
        self._id_to_word[self._count] = w
        self._count += 1
        if max_size != 0 and self._count >= max_size:
          print ("max_size of vocab was specified as %i; we now have %i words. Stopping reading." % (max_size, self._count))
          break

    print ("Finished constructing vocabulary of %i total words. Last word added: %s" % (self._count, self._id_to_word[self._count-1]))

  def word2id(self, word):
    if word not in self._word_to_id:
      return self._word_to_id[UNKNOWN_TOKEN]
    return self._word_to_id[word]

  def id2word(self, word_id):
    if word_id not in self._id_to_word:
      raise ValueError('Id not found in vocab: %d' % word_id)
    return self._id_to_word[word_id]

  def size(self):
    return self._count

  def write_metadata(self, fpath):
    print ("Writing word embedding metadata file to %s..." % (fpath))
    with open(fpath, "w") as f:
      fieldnames = ['word']
      writer = csv.DictWriter(f, delimiter="\t", fieldnames=fieldnames)
      for i in xrange(self.size()):
        writer.writerow({"word": self._id_to_word[i]})

In [6]:
# <s> and </s> are used in the data files to segment the abstracts into sentences. They don't receive vocab ids.
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

PAD_TOKEN = '[PAD]' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNKNOWN_TOKEN = '[UNK]' # This has a vocab id, which is used to represent out-of-vocabulary words
START_DECODING = '[START]' # This has a vocab id, which is used at the start of every decoder input sequence
STOP_DECODING = '[STOP]' # This has a vocab id, which is used at the end of untruncated target sequences

# Note: none of <s>, </s>, [PAD], [UNK], [START], [STOP] should appear in the vocab file.

In [7]:
vocab = Vocab('./vocab', 50000)









max_size of vocab was specified as 50000; we now have 50000 words. Stopping reading.
Finished constructing vocabulary of 50000 total words. Last word added: long-delayed


In [8]:
class Example(object):

  def __init__(self, article, abstract_sentences, vocab):
    # Get ids of special tokens
    start_decoding = vocab.word2id(START_DECODING)
    stop_decoding = vocab.word2id(STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > 200:
      article_words = article_words[:200]
    self.enc_len = len(article_words) # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences) # string
    abstract_words = abstract.split() # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words] # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, 20, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if True:
      # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves
      self.enc_input_extend_vocab, self.article_oovs = article2ids(article_words, vocab)

      # Get a verison of the reference summary where in-article OOVs are represented by their temporary article OOV id
      abs_ids_extend_vocab = abstract2ids(abstract_words, vocab, self.article_oovs)

      # Overwrite decoder target sequence so it uses the temp article OOV ids
      _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, 20, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences


  def get_dec_inp_targ_seqs(self, sequence, max_len, start_id, stop_id):
    inp = [start_id] + sequence[:]
    target = sequence[:]
    if len(inp) > max_len: # truncate
      inp = inp[:max_len]
      target = target[:max_len] # no end_token
    else: # no truncation
      target.append(stop_id) # end token
    assert len(inp) == len(target)
    return inp, target


  def pad_decoder_inp_targ(self, max_len, pad_id):
    while len(self.dec_input) < max_len:
      self.dec_input.append(pad_id)
    while len(self.target) < max_len:
      self.target.append(pad_id)


  def pad_encoder_input(self, max_len, pad_id):
    while len(self.enc_input) < max_len:
      self.enc_input.append(pad_id)
    if True:
      while len(self.enc_input_extend_vocab) < max_len:
        self.enc_input_extend_vocab.append(pad_id)
        
def article2ids(article_words, vocab):
  ids = []
  oovs = []
  unk_id = vocab.word2id(UNKNOWN_TOKEN)
  for w in article_words:
    i = vocab.word2id(w)
    if i == unk_id: # If w is OOV
      if w not in oovs: # Add to list of OOVs
        oovs.append(w)
      oov_num = oovs.index(w) # This is 0 for the first article OOV, 1 for the second article OOV...
      ids.append(vocab.size() + oov_num) # This is e.g. 50000 for the first article OOV, 50001 for the second...
    else:
      ids.append(i)
  return ids, oovs


def abstract2ids(abstract_words, vocab, article_oovs):
  ids = []
  unk_id = vocab.word2id(UNKNOWN_TOKEN)
  for w in abstract_words:
    i = vocab.word2id(w)
    if i == unk_id: # If w is an OOV word
        #print(abstract_words)
        ids.append(i) 
#       if w in article_oovs: # If w is an in-article OOV
#         vocab_idx = vocab.size() + article_oovs.index(w) # Map to its temporary article OOV number
#         ids.append(vocab_idx)
#       else: # If w is an out-of-article OOV
#         ids.append(unk_id) # Map to the UNK token id
    else:
        ids.append(i)
  return ids

In [9]:
start_decoding = vocab.word2id(START_DECODING)
stop_decoding = vocab.word2id(STOP_DECODING)

input_index=[]
dec_inputs_index=[]
target_index=[]
inputs_len=[]
dec_inputs_len=[]

index_data = {}

zero_len = 0

dec_max_len = 20
enc_max_len = 200

pad_id = vocab.word2id(PAD_TOKEN)
files_group = list(stories.keys())

for file_group in tqdm.tqdm(files_group):
    for file in tqdm.tqdm(list(stories[file_group].keys())):       
        for data in list(stories[file_group][file].keys()):
            if stories[file_group][file][data]['abstract_sentences'] == []:
                print('*********************************************')
                abstract = stories[file_group][file][data]['abstract']
                abstract_sentences = []
                article = stories[file_group][file][data]['article']
                example=Example(article=article,abstract_sentences=abstract_sentences,vocab=vocab)
                example.pad_decoder_inp_targ(dec_max_len,pad_id)
                example.pad_encoder_input(enc_max_len,pad_id)
            else:
                abstract = stories[file_group][file][data]['abstract']
                abstract_sentences = [stories[file_group][file][data]['abstract_sentences'][0]]
                article = stories[file_group][file][data]['article']
                example=Example(article=article,abstract_sentences=abstract_sentences,vocab=vocab)
                example.pad_decoder_inp_targ(dec_max_len,pad_id)
                example.pad_encoder_input(enc_max_len,pad_id)
            if example.enc_len <= 0 or example.dec_len <= 0:
                print(file_group, file, data)
                zero_len +=1
                pass
            else:
                input_index.append(example.enc_input)
                dec_inputs_index.append(example.dec_input)
                target_index.append(example.target)
                inputs_len.append(example.enc_len)
                dec_inputs_len.append(example.dec_len)
    print('********************************************************************************')
    print(len(input_index))
    print('********************************************************************************')
    index_data[file_group] = (input_index,inputs_len,dec_inputs_index,dec_inputs_len,target_index)
    input_index=[]
    dec_inputs_index=[]
    target_index=[]
    inputs_len=[]
    dec_inputs_len=[]

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/288 [00:00<?, ?it/s][A
  0%|          | 1/288 [00:00<01:14,  3.85it/s][A

./chunked/train_*.bin ./chunked\train_000.bin 598
./chunked/train_*.bin ./chunked\train_000.bin 611
./chunked/train_*.bin ./chunked\train_000.bin 676
./chunked/train_*.bin ./chunked\train_000.bin 687
./chunked/train_*.bin ./chunked\train_000.bin 986
./chunked/train_*.bin ./chunked\train_001.bin 276
./chunked/train_*.bin ./chunked\train_001.bin 339



  1%|          | 2/288 [00:00<01:15,  3.79it/s][A

./chunked/train_*.bin ./chunked\train_001.bin 561
./chunked/train_*.bin ./chunked\train_001.bin 640
./chunked/train_*.bin ./chunked\train_001.bin 825
./chunked/train_*.bin ./chunked\train_001.bin 903
./chunked/train_*.bin ./chunked\train_002.bin 44
./chunked/train_*.bin ./chunked\train_002.bin 198



  1%|          | 3/288 [00:00<01:14,  3.84it/s][A

./chunked/train_*.bin ./chunked\train_002.bin 727
./chunked/train_*.bin ./chunked\train_002.bin 941
./chunked/train_*.bin ./chunked\train_002.bin 956
./chunked/train_*.bin ./chunked\train_003.bin 56
./chunked/train_*.bin ./chunked\train_003.bin 152



  1%|▏         | 4/288 [00:01<01:14,  3.81it/s][A

./chunked/train_*.bin ./chunked\train_003.bin 978
./chunked/train_*.bin ./chunked\train_004.bin 192
./chunked/train_*.bin ./chunked\train_004.bin 345
./chunked/train_*.bin ./chunked\train_004.bin 358
./chunked/train_*.bin ./chunked\train_004.bin 409



  2%|▏         | 5/288 [00:01<01:15,  3.76it/s][A

./chunked/train_*.bin ./chunked\train_004.bin 866
./chunked/train_*.bin ./chunked\train_004.bin 930
./chunked/train_*.bin ./chunked\train_005.bin 110
./chunked/train_*.bin ./chunked\train_005.bin 228
./chunked/train_*.bin ./chunked\train_005.bin 402



  2%|▏         | 6/288 [00:01<01:15,  3.73it/s][A
  2%|▏         | 7/288 [00:01<01:15,  3.74it/s][A

./chunked/train_*.bin ./chunked\train_006.bin 429
./chunked/train_*.bin ./chunked\train_006.bin 556
./chunked/train_*.bin ./chunked\train_006.bin 722
./chunked/train_*.bin ./chunked\train_006.bin 962



  3%|▎         | 8/288 [00:02<01:14,  3.74it/s][A

./chunked/train_*.bin ./chunked\train_007.bin 235



  3%|▎         | 9/288 [00:02<01:23,  3.35it/s][A

./chunked/train_*.bin ./chunked\train_008.bin 753
./chunked/train_*.bin ./chunked\train_008.bin 770
./chunked/train_*.bin ./chunked\train_009.bin 347



  3%|▎         | 10/288 [00:02<01:21,  3.39it/s][A
  4%|▍         | 11/288 [00:03<01:20,  3.42it/s][A
  4%|▍         | 12/288 [00:03<01:19,  3.46it/s][A
  5%|▍         | 13/288 [00:03<01:18,  3.50it/s][A

./chunked/train_*.bin ./chunked\train_013.bin 161
./chunked/train_*.bin ./chunked\train_013.bin 562
./chunked/train_*.bin ./chunked\train_013.bin 563
./chunked/train_*.bin ./chunked\train_013.bin 564
./chunked/train_*.bin ./chunked\train_013.bin 565
./chunked/train_*.bin ./chunked\train_013.bin 566
./chunked/train_*.bin ./chunked\train_013.bin 567
./chunked/train_*.bin ./chunked\train_013.bin 568
./chunked/train_*.bin ./chunked\train_013.bin 569
./chunked/train_*.bin ./chunked\train_013.bin 570
./chunked/train_*.bin ./chunked\train_013.bin 571
./chunked/train_*.bin ./chunked\train_013.bin 572
./chunked/train_*.bin ./chunked\train_013.bin 573
./chunked/train_*.bin ./chunked\train_013.bin 574



  5%|▍         | 14/288 [00:03<01:18,  3.51it/s][A

./chunked/train_*.bin ./chunked\train_013.bin 936
./chunked/train_*.bin ./chunked\train_013.bin 937
./chunked/train_*.bin ./chunked\train_013.bin 938
./chunked/train_*.bin ./chunked\train_013.bin 998
./chunked/train_*.bin ./chunked\train_014.bin 332



  5%|▌         | 15/288 [00:04<01:17,  3.53it/s][A
  6%|▌         | 16/288 [00:04<01:16,  3.55it/s][A

./chunked/train_*.bin ./chunked\train_015.bin 313
./chunked/train_*.bin ./chunked\train_015.bin 314
./chunked/train_*.bin ./chunked\train_015.bin 419
./chunked/train_*.bin ./chunked\train_015.bin 420
./chunked/train_*.bin ./chunked\train_015.bin 428
./chunked/train_*.bin ./chunked\train_015.bin 501
./chunked/train_*.bin ./chunked\train_015.bin 526
./chunked/train_*.bin ./chunked\train_015.bin 898
./chunked/train_*.bin ./chunked\train_015.bin 903
./chunked/train_*.bin ./chunked\train_016.bin 108
./chunked/train_*.bin ./chunked\train_016.bin 111
./chunked/train_*.bin ./chunked\train_016.bin 113
./chunked/train_*.bin ./chunked\train_016.bin 114
./chunked/train_*.bin ./chunked\train_016.bin 115



  6%|▌         | 17/288 [00:04<01:16,  3.56it/s][A
  6%|▋         | 18/288 [00:05<01:15,  3.58it/s][A
  7%|▋         | 19/288 [00:05<01:14,  3.60it/s][A
  7%|▋         | 20/288 [00:05<01:14,  3.61it/s][A

./chunked/train_*.bin ./chunked\train_019.bin 413
./chunked/train_*.bin ./chunked\train_020.bin 28
./chunked/train_*.bin ./chunked\train_020.bin 29
./chunked/train_*.bin ./chunked\train_020.bin 104



  7%|▋         | 21/288 [00:05<01:13,  3.61it/s][A
  8%|▊         | 22/288 [00:06<01:13,  3.62it/s][A

./chunked/train_*.bin ./chunked\train_021.bin 622
./chunked/train_*.bin ./chunked\train_021.bin 958
./chunked/train_*.bin ./chunked\train_022.bin 200



  8%|▊         | 23/288 [00:06<01:13,  3.62it/s][A
  8%|▊         | 24/288 [00:06<01:12,  3.63it/s][A

./chunked/train_*.bin ./chunked\train_023.bin 376
./chunked/train_*.bin ./chunked\train_024.bin 14



  9%|▊         | 25/288 [00:06<01:12,  3.63it/s][A

./chunked/train_*.bin ./chunked\train_024.bin 314
./chunked/train_*.bin ./chunked\train_024.bin 350
./chunked/train_*.bin ./chunked\train_024.bin 363
./chunked/train_*.bin ./chunked\train_024.bin 395



  9%|▉         | 26/288 [00:07<01:12,  3.63it/s][A
  9%|▉         | 27/288 [00:07<01:11,  3.64it/s][A

./chunked/train_*.bin ./chunked\train_026.bin 300
./chunked/train_*.bin ./chunked\train_027.bin 149



 10%|▉         | 28/288 [00:07<01:11,  3.64it/s][A
 10%|█         | 29/288 [00:07<01:11,  3.64it/s][A
 10%|█         | 30/288 [00:08<01:10,  3.64it/s][A

./chunked/train_*.bin ./chunked\train_029.bin 611



 11%|█         | 31/288 [00:08<01:10,  3.64it/s][A

./chunked/train_*.bin ./chunked\train_030.bin 677
./chunked/train_*.bin ./chunked\train_030.bin 773
./chunked/train_*.bin ./chunked\train_030.bin 838
./chunked/train_*.bin ./chunked\train_031.bin 400



 11%|█         | 32/288 [00:08<01:10,  3.65it/s][A
 11%|█▏        | 33/288 [00:09<01:09,  3.66it/s][A

./chunked/train_*.bin ./chunked\train_032.bin 753
./chunked/train_*.bin ./chunked\train_033.bin 415



 12%|█▏        | 34/288 [00:09<01:09,  3.65it/s][A
 12%|█▏        | 35/288 [00:09<01:09,  3.65it/s][A

./chunked/train_*.bin ./chunked\train_034.bin 726



 12%|█▎        | 36/288 [00:09<01:08,  3.65it/s][A
 13%|█▎        | 37/288 [00:10<01:08,  3.65it/s][A
 13%|█▎        | 38/288 [00:10<01:08,  3.65it/s][A
 14%|█▎        | 39/288 [00:10<01:08,  3.65it/s][A
 14%|█▍        | 40/288 [00:10<01:07,  3.65it/s][A
 14%|█▍        | 41/288 [00:11<01:07,  3.65it/s][A

./chunked/train_*.bin ./chunked\train_040.bin 501



 15%|█▍        | 42/288 [00:11<01:07,  3.65it/s][A

./chunked/train_*.bin ./chunked\train_041.bin 440



 15%|█▍        | 43/288 [00:11<01:07,  3.65it/s][A
 15%|█▌        | 44/288 [00:12<01:06,  3.65it/s][A

./chunked/train_*.bin ./chunked\train_043.bin 685



 16%|█▌        | 45/288 [00:12<01:06,  3.65it/s][A

./chunked/train_*.bin ./chunked\train_045.bin 40
./chunked/train_*.bin ./chunked\train_045.bin 369



 16%|█▌        | 46/288 [00:12<01:06,  3.65it/s][A
 16%|█▋        | 47/288 [00:12<01:05,  3.65it/s][A
 17%|█▋        | 48/288 [00:13<01:05,  3.66it/s][A
 17%|█▋        | 49/288 [00:13<01:05,  3.65it/s][A
 17%|█▋        | 50/288 [00:13<01:05,  3.65it/s][A

./chunked/train_*.bin ./chunked\train_049.bin 937



 18%|█▊        | 51/288 [00:13<01:05,  3.65it/s][A
 18%|█▊        | 52/288 [00:14<01:04,  3.65it/s][A
 18%|█▊        | 53/288 [00:14<01:04,  3.64it/s][A
 19%|█▉        | 54/288 [00:14<01:04,  3.64it/s][A

./chunked/train_*.bin ./chunked\train_053.bin 760



 19%|█▉        | 55/288 [00:15<01:03,  3.64it/s][A

./chunked/train_*.bin ./chunked\train_054.bin 642



 19%|█▉        | 56/288 [00:15<01:03,  3.64it/s][A

./chunked/train_*.bin ./chunked\train_055.bin 890
./chunked/train_*.bin ./chunked\train_056.bin 591



 20%|█▉        | 57/288 [00:15<01:03,  3.64it/s][A

./chunked/train_*.bin ./chunked\train_056.bin 676



 20%|██        | 58/288 [00:15<01:03,  3.64it/s][A
 20%|██        | 59/288 [00:16<01:02,  3.64it/s][A

./chunked/train_*.bin ./chunked\train_058.bin 341
./chunked/train_*.bin ./chunked\train_058.bin 447



 21%|██        | 60/288 [00:16<01:02,  3.64it/s][A
 21%|██        | 61/288 [00:16<01:02,  3.65it/s][A
 22%|██▏       | 62/288 [00:17<01:01,  3.65it/s][A
 22%|██▏       | 63/288 [00:17<01:01,  3.65it/s][A
 22%|██▏       | 64/288 [00:17<01:01,  3.64it/s][A
 23%|██▎       | 65/288 [00:17<01:01,  3.63it/s][A
 23%|██▎       | 66/288 [00:18<01:01,  3.64it/s][A

./chunked/train_*.bin ./chunked\train_065.bin 483



 23%|██▎       | 67/288 [00:18<01:00,  3.64it/s][A

./chunked/train_*.bin ./chunked\train_067.bin 114
./chunked/train_*.bin ./chunked\train_067.bin 189



 24%|██▎       | 68/288 [00:18<01:00,  3.64it/s][A
 24%|██▍       | 69/288 [00:18<01:00,  3.64it/s][A
 24%|██▍       | 70/288 [00:19<00:59,  3.64it/s][A

./chunked/train_*.bin ./chunked\train_069.bin 903



 25%|██▍       | 71/288 [00:19<00:59,  3.64it/s][A
 25%|██▌       | 72/288 [00:19<00:59,  3.64it/s][A
 25%|██▌       | 73/288 [00:20<01:01,  3.50it/s][A

./chunked/train_*.bin ./chunked\train_072.bin 920
./chunked/train_*.bin ./chunked\train_072.bin 942



 26%|██▌       | 74/288 [00:21<01:01,  3.50it/s][A
 26%|██▌       | 75/288 [00:21<01:00,  3.51it/s][A
 26%|██▋       | 76/288 [00:21<01:00,  3.50it/s][A
 27%|██▋       | 77/288 [00:21<01:00,  3.50it/s][A
 27%|██▋       | 78/288 [00:22<00:59,  3.51it/s][A
 27%|██▋       | 79/288 [00:22<00:59,  3.51it/s][A
 28%|██▊       | 80/288 [00:22<00:59,  3.51it/s][A
 28%|██▊       | 81/288 [00:23<00:58,  3.51it/s][A
 28%|██▊       | 82/288 [00:23<00:58,  3.51it/s][A

./chunked/train_*.bin ./chunked\train_081.bin 686



 29%|██▉       | 83/288 [00:23<00:58,  3.51it/s][A
 29%|██▉       | 84/288 [00:23<00:58,  3.51it/s][A

./chunked/train_*.bin ./chunked\train_083.bin 574
./chunked/train_*.bin ./chunked\train_083.bin 924



 30%|██▉       | 85/288 [00:24<00:57,  3.52it/s][A
 30%|██▉       | 86/288 [00:24<00:57,  3.52it/s][A
 30%|███       | 87/288 [00:24<00:57,  3.52it/s][A
 31%|███       | 88/288 [00:24<00:56,  3.52it/s][A
 31%|███       | 89/288 [00:25<00:56,  3.52it/s][A
 31%|███▏      | 90/288 [00:25<00:56,  3.53it/s][A
 32%|███▏      | 91/288 [00:25<00:55,  3.53it/s][A
 32%|███▏      | 92/288 [00:26<00:55,  3.53it/s][A
 32%|███▏      | 93/288 [00:26<00:55,  3.53it/s][A
 33%|███▎      | 94/288 [00:26<00:54,  3.53it/s][A
 33%|███▎      | 95/288 [00:26<00:54,  3.53it/s][A
 33%|███▎      | 96/288 [00:27<00:54,  3.53it/s][A
 34%|███▎      | 97/288 [00:27<00:54,  3.54it/s][A
 34%|███▍      | 98/288 [00:27<00:53,  3.53it/s][A
 34%|███▍      | 99/288 [00:28<00:53,  3.54it/s][A
 35%|███▍      | 100/288 [00:28<00:53,  3.54it/s][A
 35%|███▌      | 101/288 [00:28<00:52,  3.54it/s][A
 35%|███▌      | 102/288 [00:28<00:52,  3.54it/s][A
 36%|███▌      | 103/288 [00:29<00:52,  3.54it/s][A
 36%|██

*********************************************
./chunked/train_*.bin ./chunked\train_191.bin 183



 67%|██████▋   | 192/288 [00:54<00:27,  3.55it/s][A
 67%|██████▋   | 193/288 [00:54<00:26,  3.55it/s][A
 67%|██████▋   | 194/288 [00:54<00:26,  3.55it/s][A
 68%|██████▊   | 195/288 [00:54<00:26,  3.55it/s][A
 68%|██████▊   | 196/288 [00:55<00:25,  3.55it/s][A
 68%|██████▊   | 197/288 [00:55<00:25,  3.56it/s][A
 69%|██████▉   | 198/288 [00:55<00:25,  3.56it/s][A
 69%|██████▉   | 199/288 [00:55<00:25,  3.56it/s][A
 69%|██████▉   | 200/288 [00:56<00:24,  3.56it/s][A
 70%|██████▉   | 201/288 [00:56<00:24,  3.56it/s][A
 70%|███████   | 202/288 [00:56<00:24,  3.56it/s][A
 70%|███████   | 203/288 [00:56<00:23,  3.56it/s][A
 71%|███████   | 204/288 [00:57<00:23,  3.56it/s][A
 71%|███████   | 205/288 [00:57<00:23,  3.56it/s][A
 72%|███████▏  | 206/288 [00:57<00:23,  3.57it/s][A
 72%|███████▏  | 207/288 [00:58<00:22,  3.57it/s][A
 72%|███████▏  | 208/288 [00:58<00:22,  3.57it/s][A
 73%|███████▎  | 209/288 [00:58<00:22,  3.57it/s][A
 73%|███████▎  | 210/288 [00:58<00:21,  3.57i

********************************************************************************
287112
********************************************************************************



  7%|▋         | 1/14 [00:00<00:03,  3.65it/s][A
 14%|█▍        | 2/14 [00:00<00:03,  3.67it/s][A
 21%|██▏       | 3/14 [00:00<00:02,  3.77it/s][A
 29%|██▊       | 4/14 [00:01<00:02,  3.79it/s][A
 36%|███▌      | 5/14 [00:01<00:02,  3.80it/s][A
 43%|████▎     | 6/14 [00:01<00:02,  3.78it/s][A
 50%|█████     | 7/14 [00:01<00:01,  3.72it/s][A
 57%|█████▋    | 8/14 [00:02<00:01,  3.65it/s][A
 64%|██████▍   | 9/14 [00:02<00:01,  3.65it/s][A
 71%|███████▏  | 10/14 [00:02<00:01,  3.64it/s][A
 79%|███████▊  | 11/14 [00:03<00:00,  3.63it/s][A
 86%|████████▌ | 12/14 [00:03<00:00,  3.63it/s][A
 93%|█████████▎| 13/14 [00:03<00:00,  3.63it/s][A
 67%|██████▋   | 2/3 [01:25<00:42, 42.81s/it]s][A
  0%|          | 0/12 [00:00<?, ?it/s][A

********************************************************************************
13368
********************************************************************************



  8%|▊         | 1/12 [00:00<00:02,  3.73it/s][A
 17%|█▋        | 2/12 [00:00<00:02,  3.73it/s][A
 25%|██▌       | 3/12 [00:00<00:02,  3.79it/s][A
 33%|███▎      | 4/12 [00:01<00:02,  3.82it/s][A
 42%|████▏     | 5/12 [00:01<00:01,  3.76it/s][A
 50%|█████     | 6/12 [00:01<00:01,  3.74it/s][A
 58%|█████▊    | 7/12 [00:01<00:01,  3.73it/s][A
 67%|██████▋   | 8/12 [00:02<00:01,  3.69it/s][A
 75%|███████▌  | 9/12 [00:02<00:00,  3.69it/s][A
 83%|████████▎ | 10/12 [00:02<00:00,  3.68it/s][A
 92%|█████████▏| 11/12 [00:03<00:00,  3.66it/s][A
100%|██████████| 12/12 [00:03<00:00,  3.80it/s][A
100%|██████████| 3/3 [01:28<00:00, 29.59s/it]

********************************************************************************
11490
********************************************************************************





In [10]:
for file_group in files_group:
    print(file_group)
    (input_index,inputs_len,dec_inputs_index,dec_inputs_len,target_index) = index_data[file_group]
    print(len(input_index))
    start = int(len(input_index)*0.7)
    end = -1
    index_data[file_group] = (input_index[start:end],inputs_len[start:end],dec_inputs_index[start:end],dec_inputs_len[start:end],target_index[start:end])
    (input_index,inputs_len,dec_inputs_index,dec_inputs_len,target_index) = index_data[file_group]
    print(len(input_index))

./chunked/train_*.bin
287112
28711
./chunked/val_*.bin
13368
1336
./chunked/test_*.bin
11490
1148


In [None]:
import torch
from torch.autograd import Variable
import torch.nn.utils.rnn as rnn_utils
import torch.nn as nn
import torch.nn.functional as F

In [None]:
embedding_size = 300
hidden_size = 150
word_dropout = 0.5
num_layers = 2
bidirectional = True
batch_size_fit = 1024-256
rnn_type = 'gru'
learning_rate = 0.001

vocab_size = len(vocab._word_to_id)

# <s> and </s> are used in the data files to segment the abstracts into sentences. They don't receive vocab ids.
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

PAD_TOKEN = '[PAD]' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNKNOWN_TOKEN = '[UNK]' # This has a vocab id, which is used to represent out-of-vocabulary words
START_DECODING = '[START]' # This has a vocab id, which is used at the start of every decoder input sequence
STOP_DECODING = '[STOP]' # This has a vocab id, which is used at the end of untruncated target sequences

# Note: none of <s>, </s>, [PAD], [UNK], [START], [STOP] should appear in the vocab file.

sos_idx = vocab.word2id(START_DECODING)
eos_idx = vocab.word2id(STOP_DECODING)
pad_idx = vocab.word2id(PAD_TOKEN)

In [None]:
class Embedding_layer(nn.Module):
    def __init__(self, vocab_size, embedding_size, numpy_embedding = None):
        super().__init__()
#         if numpy_embedding == None:
#             self.embedding = nn.Embedding(vocab_size,embedding_size)
#         else:
        self.model_embedding = torch.from_numpy(numpy_embedding).float()
        self.embedding = nn.Embedding(vocab_size,embedding_size)
        self.embedding.weight = nn.Parameter(self.model_embedding)
            
    def forward(self,inputs):
        return self.embedding(inputs)

In [None]:
class Seq2Seq_encoder(nn.Module):
    def __init__(self,vocab_size, embedding_size, hidden_size, word_dropout,
                 sos_idx, eos_idx, pad_idx, embedding,rnn_type='rnn' , num_layers=1, bidirectional=True):
        
        super().__init__()
        #self.model_embedding = torch.from_numpy(numpy_embedding)
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.pad_idx = pad_idx
        self.vocab_size = vocab_size
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        self.embedding = embedding
        #self.embedding.weight = nn.Parameter(self.model_embedding)
        
        self.encoder = encoder(vocab_size = self.vocab_size, embedding_size = embedding_size, hidden_size = hidden_size, num_layers = num_layers, bidirectional = bidirectional,rnn_type = rnn_type).cuda()
        #self.decoder = Decoder(vocab_size = vocab_size,embedding_size = embedding_size, hidden_size = hidden_size, num_layers = num_layers, bidirectional = False,rnn_type = rnn_type, word_dropout=word_dropout).cuda()
        
    def forward(self,x,length,y_): 
        #print(x.size())
        batch_size = x.size(0)
        sorted_lengths, sorted_idx = torch.sort(length, descending=True)
        input_sequence = x[sorted_idx.cuda()]
        input_embedding = self.embedding(input_sequence).float()

        packed_input = rnn_utils.pack_padded_sequence(input_embedding, sorted_lengths.tolist(), batch_first=True)

        self.encoder_outputs, self.encoder_hidden_state, self.encoder_hidden_state_attn = self.encoder(packed_input,batch_size)

        padded_outputs = rnn_utils.pad_packed_sequence(self.encoder_outputs, batch_first=True)[0]
        padded_outputs = padded_outputs.contiguous()
        _,reversed_idx = torch.sort(sorted_idx.cuda())
        self.encoder_outputs = padded_outputs[reversed_idx]        
        
        return self.encoder_outputs, self.encoder_hidden_state, self.encoder_hidden_state_attn

In [None]:
class encoder(nn.Module):
    def __init__(self,vocab_size,embedding_size, hidden_size, bidirectional=True, num_layers = 1,rnn_type='rnn'):
        super(encoder,self).__init__()
        
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        if self.rnn_type == 'rnn':
            rnn = nn.RNN
        elif self.rnn_type == 'gru':
            rnn = nn.GRU
        elif self.rnn_type =='lstm':
            rnn = nn.LSTM
        else:
            raise ValueError()
            
        
        self.encoder = rnn(self.embedding_size, self.hidden_size, num_layers = self.num_layers, bidirectional = self.bidirectional, batch_first = True)
        self.encoder.weight_hh_l0 = nn.init.xavier_uniform(self.encoder.weight_hh_l0)
        self.encoder.weight_ih_l0 = nn.init.xavier_uniform(self.encoder.weight_ih_l0)

        self.hidden_factor = (2 if self.bidirectional else 1) * self.num_layers

    
    def forward(self,x,batch_size):
        
        outputs, self.hidden = self.encoder(x)
        
        if self.bidirectional or self.num_layers > 1:
            # flatten hidden state
            self.hidden_ = self.hidden.view(batch_size, self.hidden_size*self.hidden_factor)
        else:
            self.hidden_ = self.hidden.squeeze()

        
        return outputs, self.hidden_, self.hidden[-1]

In [None]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size, max_length=20):
        super(Attn, self).__init__()
        
        self.method = method
        self.hidden_size = hidden_size
        
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.other = nn.Parameter(torch.FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs):
        seq_len = encoder_outputs.size(1)
        batch_size = encoder_outputs.size(0)
        
         # Create variable to store attention energies
        attn_energies = Variable(torch.zeros(batch_size,seq_len)).cuda()# B x 1 x S
        

        # Calculate energies for each encoder output
        for i in range(seq_len):
            attn_energies[:,i] = self.score(hidden, encoder_outputs[:,i]).squeeze()

        # Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
        return F.softmax(attn_energies).unsqueeze(1)
    
    def score(self, hidden, encoder_output):
        
        if self.method == 'dot':
            energy = hidden.dot(encoder_output)
            return energy
        
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = hidden.bmm(energy.unsqueeze(2))
            return energy
        
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            energy = self.other.dot(energy)
            return energy

In [None]:
class Seq2Seq_decoder(nn.Module):
    def __init__(self,vocab_size,embedding_size , hidden_size, encoder_bi, embedding, bidirectional=True, num_layers = 1,rnn_type='rnn',word_dropout = 0.5):
        super(Seq2Seq_decoder,self).__init__()
        
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.attn = Attn('general', hidden_size)
        self.encoder_bi = encoder_bi
        if self.rnn_type == 'rnn':
            rnn = nn.RNN
        elif self.rnn_type == 'gru':
            rnn = nn.GRU
        elif self.rnn_type =='lstm':
            rnn = nn.LSTM
        else:
            raise ValueError()
            
        self.hidden_factor = (2 if self.encoder_bi else 1)           
        
        self.embedding = embedding
        self.word_dropout = nn.Dropout(p=word_dropout)
        
        self.decoder = rnn(embedding_size+hidden_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, batch_first=True)
        self.decoder.weight_hh_l0 = nn.init.xavier_uniform(self.decoder.weight_hh_l0)
        self.decoder.weight_ih_l0 = nn.init.xavier_uniform(self.decoder.weight_ih_l0)
        
        self.hidden2hidden = nn.Linear(hidden_size * 2*(2 if bidirectional else 1), hidden_size)
        self.hidden2outputs = nn.Linear(hidden_size, hidden_size)
        self.outputs2vocab = nn.Linear(hidden_size, vocab_size)

        
    def forward(self,batch_size,context,hidden,encoder_outputs):
        self.batch_size = batch_size.size(0)
        self.context = context

        if  self.num_layers > 1:
            # unflatten hidden state
            self.hidden = hidden.view(self.hidden_factor, self.batch_size, self.hidden_size)
        else:
            #hidden = hidden[-1]
            self.hidden = hidden.unsqueeze(0)
        #print(hidden.size())    
        self.input_embedding = self.embedding(batch_size).view(encoder_outputs.size(0),1, -1)

        self.inputs = torch.cat((self.input_embedding, self.context), 2)
        self.dec_outputs,self.hidden = self.decoder(self.inputs, self.hidden)
        
        
        self.attn_weights = self.attn(self.dec_outputs, encoder_outputs)
        self.context = self.attn_weights.bmm(encoder_outputs)
        
        self.attn2output = torch.cat((self.dec_outputs.squeeze(),context.squeeze()),1)
        
        #self.test = self.dec_outputs + self.context
        self.outputs = nn.functional.log_softmax(self.outputs2vocab(self.hidden2outputs(self.hidden2hidden(self.attn2output)+self.input_embedding.squeeze())))
        #self.outputs = nn.functional.log_softmax(self.outputs2vocab(self.outputs.squeeze()))
        
        return self.outputs, self.context, self.hidden, self.attn_weights

In [None]:
embedding = Embedding_layer(vocab_size, embedding_size,numpy_embedding = model_embedding)

In [None]:
Encoder=Seq2Seq_encoder(vocab_size, embedding_size, hidden_size, word_dropout,sos_idx, eos_idx, pad_idx , num_layers=num_layers ,rnn_type='gru',bidirectional= bidirectional,embedding =embedding ).cuda()

In [None]:
Decoder = Seq2Seq_decoder(vocab_size = vocab_size,embedding_size = embedding_size, hidden_size = hidden_size*2, num_layers = num_layers, encoder_bi= bidirectional,bidirectional = False,rnn_type = rnn_type, word_dropout=word_dropout,embedding =embedding).cuda()

In [None]:
Encoder,Decoder

In [None]:
NLL = torch.nn.NLLLoss(ignore_index = pad_idx)

optimizer_encoder = torch.optim.Adam(Encoder.parameters(), lr=learning_rate)
optimizer_decoder = torch.optim.Adam(Decoder.parameters(), lr=learning_rate)

In [None]:
import random

def batch(batch_size,input_var,dec_input_var,target_var,length_var, dec_length_var):

    shuffle_list = list(zip(input_var,dec_input_var,target_var,length_var,dec_length_var))
    random.shuffle(shuffle_list)
    
    start = 0
    end = batch_size
    #if len(input_var)%32 != 0:
    while end < len(input_var):
        batch_input = []
        batch_input_dec = []
        batch_target = []
        batch_length = []
        batch_length_dec = []
        
        batch_shuffle = shuffle_list[start:end]
        
        for i,j,k,n,m in batch_shuffle:
            batch_input.append(i)
            batch_input_dec.append(j)
            batch_target.append(k)
            batch_length.append(n)
            batch_length_dec.append(m)
            
        temp = end
        end  = end + batch_size
        start = temp
        yield batch_input, batch_input_dec, batch_target, batch_length, batch_length_dec
        
    if end >= len(input_var):
        batch_input = []
        batch_input_dec = []
        batch_target = []
        batch_length = []
        batch_length_dec = []
        batch_shuffle = shuffle_list[start:]
        
        for i,j,k,n,m in batch_shuffle:
            batch_input.append(i)
            batch_input_dec.append(j)
            batch_target.append(k)
            batch_length.append(n)
            batch_length_dec.append(m)
        yield batch_input, batch_input_dec, batch_target, batch_length, batch_length_dec

In [None]:
epochs = 50
max_target_len = 20
clip = 5

In [None]:
paths = ['./chunked/train_*.bin', './chunked/val_*.bin']

In [None]:
step = 0
var_losses = []
train_losses = []
avg_losses = []

iteration = 0
for epoch in tqdm.tqdm(range(epochs+1)):
    
    for path in paths :
        print(path)
        input_index, inputs_len, dec_inputs_index, dec_inputs_len, target_index = index_data[path]
        
        for batch_x, batch_y_x, batch_y, batch_len, batch_len_y in batch(batch_size_fit, input_index, \
                                                                         dec_inputs_index,target_index, \
                                                                         inputs_len, dec_inputs_len):
            optimizer_encoder.zero_grad()
            optimizer_decoder.zero_grad()

            iteration = iteration + 1

            if path == paths[0]:
                Encoder.train()
                Decoder.train()
            else:
                Encoder.eval()
                Decoder.eval()

            x_ = Variable(torch.cuda.LongTensor(batch_x))
            y_ = Variable(torch.cuda.LongTensor(batch_y)).transpose(1,0)
            batch_size = x_.size(0)

            length = torch.cuda.LongTensor(batch_len)

            decoder_input=Variable(torch.cuda.LongTensor(batch_size)).fill_(sos_idx)

            encoder_outputs, encoder_hidden_state,encoder_hidden_state_attn = Encoder(x_,length,y_)
            decoder_context = Variable(torch.zeros(batch_size,1, Decoder.hidden_size)).cuda()
            decoder_hidden = encoder_hidden_state

            loss = 0

            for i in range(max_target_len):
                decoder_output, decoder_context, decoder_hidden, decoder_attention = Decoder(decoder_input,decoder_context,decoder_hidden,encoder_outputs)

                topv, topi = decoder_output.data.topk(1)

                loss += NLL(decoder_output,y_[i])
                decoder_hidden = decoder_hidden.squeeze(0)
                decoder_input = y_[i]

            
            if path == paths[0]:

                loss.backward()
                #torch.nn.utils.clip_grad_norm(Encoder.parameters(), clip)
                #torch.nn.utils.clip_grad_norm(Decoder.parameters(), clip)
                optimizer_encoder.step()
                optimizer_decoder.step()

                loss = loss.data[0]/max_target_len

                train_losses.append(loss)

                step += 1

                if iteration % 10 == 0 or iteration == (len(input_index)-1)//batch_size:
                    print("Batch %04d/%i, Loss %9.4f"%( iteration, (len(input_index)-1)//batch_size_fit, loss))
                    np.savez(L=train_losses,file='./train_loss.npz')
                    checkpoint_path_encoder = os.path.join(save_model_path, "enc_E%i.pytorch"%(epoch))
                    checkpoint_path_decoder = os.path.join(save_model_path, "dec_E%i.pytorch"%(epoch))
                    torch.save(Encoder, checkpoint_path_encoder)
                    torch.save(Decoder, checkpoint_path_decoder)

            else:

                loss = loss.data[0]/max_target_len

                var_losses.append(loss)

                step += 1

                if iteration % 10 == 0 or iteration == (len(input_index)-1)//batch_size:
                    print("Valid Batch %04d/%i, Loss %9.4f"%( iteration, (len(input_index)-1)//batch_size_fit, loss))
                    np.savez(L=var_losses,file='./var_loss.npz')

            del loss
            del encoder_outputs
            del encoder_hidden_state 
        iteration = 0
    print("Model saved at %s"%save_model_path)
    print("Epoch %02d/%i, Mean ELBO %9.4f"%( epoch, epochs, np.mean(np.array(var_losses))))
    avg_losses.append(np.mean(np.array(var_losses)))
    np.savez(L=avg_losses,file='./avg_losses.npz')
        

In [None]:
save_model_path,epoch

In [None]:
save_model_path = os.path.join('won', '2018-Jul-30-00-58-21')

In [None]:
checkpoint_path_encoder = os.path.join(save_model_path, 'enc_E8.pytorch')
checkpoint_path_decoder = os.path.join(save_model_path, 'dec_E8.pytorch')

In [None]:
Encoder = torch.load(checkpoint_path_encoder)
Decoder = torch.load(checkpoint_path_decoder)

In [None]:
Encoder.eval()
Decoder.eval()

In [None]:
Encoder,Decoder

In [None]:
(input_index,inputs_len,dec_inputs_index,dec_inputs_len,target_index) = index_data[files_group[2]]

In [None]:
torch.cuda.empty_cache()
#del x_
#del Y_X
del Encoder
del Decoder

In [None]:
start = 0
end = 10

In [None]:
inputs_len = []
for sentence in input_index[start:end]:
    inputs_len.append(len(sentence) - sentence.count(0))
dec_inputs_len = []
for sentence in dec_inputs_index[start:end]:
    dec_inputs_len.append(len(sentence) - sentence.count(vocab.word2id(PAD_TOKEN)))

In [None]:
x_ = Variable(torch.cuda.LongTensor(input_index[start:end]))
Y_X = Variable(torch.cuda.LongTensor(dec_inputs_index[start:end]))

batch_size = x_.size(0)
length = torch.cuda.LongTensor(inputs_len)
length_y = torch.cuda.LongTensor(dec_inputs_len)

sorted_lengths, sorted_idx = torch.sort(length, descending=True)
input_sequence = x_[sorted_idx.cuda()]
input_embedding = Encoder.embedding(input_sequence)
_,reversed_idx = torch.sort(sorted_idx.cuda())
packed_input = rnn_utils.pack_padded_sequence(input_embedding, sorted_lengths.tolist(), batch_first=True)

encoder_outputs, encoder_hidden_state,encoder_hidden_state_attn = Encoder.encoder(packed_input,batch_size)

padded_outputs = rnn_utils.pad_packed_sequence(encoder_outputs, batch_first=True)[0]
padded_outputs = padded_outputs.contiguous()
_,reversed_idx = torch.sort(sorted_idx.cuda())
encoder_outputs = padded_outputs[reversed_idx]

decoder_input=Variable(torch.cuda.LongTensor(encoder_outputs.size(0))).fill_(sos_idx)

Y = Variable(torch.cuda.LongTensor(target_index[start:end])).transpose(1,0)

if len(encoder_hidden_state.size()) == 1:
    decoder_hidden = encoder_hidden_state.unsqueeze(0)
else: 
    decoder_hidden = encoder_hidden_state
decoder_context = Variable(torch.zeros(batch_size,1, Decoder.hidden_size)).cuda()

In [None]:
decoded_words = []
decoder_attentions = torch.zeros(encoder_outputs.size(0),dec_max_len, enc_max_len)
for i in range(dec_max_len):

    decoder_output, decoder_context, decoder_hidden, decoder_attention = Decoder(decoder_input,decoder_context,decoder_hidden.float(),encoder_outputs.float())

    decoder_attentions[:decoder_attention.size(0),i,:decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data
    topv, topi = decoder_output.data.topk(1)

    temp = []
    if len(topi.size()) == 1:
        temp.append(vocab.id2word(topi.cpu().numpy()[0]))
    else:
        for top in topi.cpu().numpy():
            temp.append(vocab.id2word(top[0]))
    decoded_words.append(temp)
    decoder_input = Variable(topi).cuda()
    decoder_hidden = decoder_hidden.squeeze()
    if len(decoder_hidden.size()) == 1:
        decoder_hidden = decoder_hidden.unsqueeze(0)
    else: 
        decoder_hidden = decoder_hidden

In [None]:
inferences = []
for decoded_sent in np.array(decoded_words).transpose():
    for i,word in enumerate(decoded_sent):
        if word == STOP_DECODING:
            decoded_sent = decoded_sent[:i]

    inferences.append(list(decoded_sent))


targets_result = []
for inputs in target_index[start:end]:
    result = []
    for word in inputs:
        if word == eos_idx:
            break
        else:
            result.append(vocab.id2word(word))
    targets_result.append(result)    

inputs_result = []
for inputs in input_index[start:end]:
    result = []
    for word in inputs:
        result.append(vocab.id2word(word))
    inputs_result.append(result)

In [None]:
for source,target,inference, decoder_attention in zip(inputs_result, targets_result, inferences, decoder_attentions):
    print('*********************************************************************')
    print('source : '+' '.join(source))
    
    print('*********************************************************************')
    print('target : '+' '.join(target))
    
    print('*********************************************************************')
    print('inference : '+' '.join(inference))
    
    print('*********************************************************************')
    scores = rouge.get_scores(' '.join(target), ' '.join(inference))
    for score in scores[0]:
        print(score +' : '+str(scores[0][score]))

    plt.matshow(decoder_attention.numpy())

In [None]:
r1_recall = []
r2_recall = []
rl_recall = []

r1_f1 = []
r2_f1 = []
rl_f1 = []

error = 0

for source,target,inference, decoder_attention in tqdm.tqdm(zip(inputs_result, targets_result, inferences, decoder_attentions)):
    try:
        scores = rouge.get_scores(' '.join(target), ' '.join(inference))
    except ValueError:
        error +=1
        print('******************************************')
        print('target'+' '.join(target))
        print('inference'+' '.join(inference))
        pass
    r1_recall.append(scores[0]['rouge-1']['r'])
    r2_recall.append(scores[0]['rouge-2']['r'])
    rl_recall.append(scores[0]['rouge-l']['r'])
    
    r1_f1.append(scores[0]['rouge-1']['f'])
    r2_f1.append(scores[0]['rouge-2']['f'])
    rl_f1.append(scores[0]['rouge-l']['f'])
print('error sentence : ' + str(error))

In [None]:
print('ROUGE-1 recall : ' + str(sum(r1_recall)*100 / len(r1_recall)))
print('ROUGE-1 F1 : ' + str(sum(r1_f1)*100 / len(r1_f1)))
print('********************************************************************')
print('ROUGE-2 recall : ' + str(sum(r2_recall)*100 / len(r2_recall)))
print('ROUGE-2 F1 : ' + str(sum(r2_f1)*100 / len(r2_f1)))
print('********************************************************************')
print('ROUGE-l recall : ' + str(sum(rl_recall)*100 / len(rl_recall)))
print('ROUGE-l F1 : ' + str(sum(rl_f1)*100 / len(rl_f1)))

## Input length: 200, Output length: 20
## Encoder: bi_layer1, Decoder: layer1, Ouput: 3layer, hidden: 200
********************************************************************
ROUGE-1 recall : 18.32848421558105  
ROUGE-1 F1 : 14.55207536225354
********************************************************************
ROUGE-2 recall : 2.12192578890871  
ROUGE-2 F1 : 1.929616342515873
********************************************************************
ROUGE-l recall : 16.305944007556953  
ROUGE-l F1 : 11.950990121991415


## '2018-Jul-19-03-00-40' Epoch: 6
## Input length: 200, Output length: 20
## Encoder: bi_layer2, Decoder: layer2, Ouput: 3layer, hidden: 200
********************************************************************
ROUGE-1 recall : 21.747208037530445  
ROUGE-1 F1 : 14.735247486910408
********************************************************************
ROUGE-2 recall : 2.826018309889277  
ROUGE-2 F1 : 2.0510893369941567
********************************************************************
ROUGE-l recall : 20.20920690275513  
ROUGE-l F1 : 11.742692338164279
