In [1]:
import sys 
sys.path.append("..") 
from utils import config
from utils.seq2seq.batcher import *
from utils.seq2seq.train_util import *
import argparse


import os
# Expose only the first GPU to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Run CUDA kernels synchronously so a device-side error is reported at the
# call that caused it (debugging aid; it slows execution).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


def _str2bool(value):
    """Convert a command-line token to a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because any non-empty string is truthy, so a flag could never be turned off
    from the command line.

    Args:
        value: a ``bool`` (returned unchanged) or a string such as
            ``True``/``False``, ``yes``/``no``, ``1``/``0`` (case-insensitive).

    Returns:
        bool: the parsed value.

    Raises:
        argparse.ArgumentTypeError: if the string is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept as False because ``bool('')`` was False before.
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected True/False, got %r' % (value,))


parser = argparse.ArgumentParser()

# Model-structure switches.
parser.add_argument('--key_attention', type=_str2bool, default=False, help='True/False')
parser.add_argument('--intra_encoder', type=_str2bool, default=True, help='True/False')
parser.add_argument('--intra_decoder', type=_str2bool, default=True, help='True/False')
parser.add_argument('--copy', type=_str2bool, default=True, help='True/False')  # for transformer

parser.add_argument('--model_type', type=str, default='seq2seq', choices=['seq2seq', 'transformer'])
parser.add_argument('--train_rl', type=_str2bool, default=False, help='True/False')
parser.add_argument('--keywords', type=str, default='Noun_adj_keys',
                    help='POS_keys / DEP_keys / Noun_adj_keys / TextRank_keys')

# Optimisation / initialisation hyper-parameters.
parser.add_argument('--lr', type=float, default=0.0001)
parser.add_argument('--rand_unif_init_mag', type=float, default=0.02)
parser.add_argument('--trunc_norm_init_std', type=float, default=0.001)
parser.add_argument('--mle_weight', type=float, default=1.0)
# The option name (including its spelling) is read elsewhere as
# ``config.gound_truth_prob``, so it must stay as is.
parser.add_argument('--gound_truth_prob', type=float, default=0.5)

# Sequence-length, vocabulary and decoding limits.
parser.add_argument('--max_enc_steps', type=int, default=500)
parser.add_argument('--max_dec_steps', type=int, default=20)
parser.add_argument('--min_dec_steps', type=int, default=6)
parser.add_argument('--max_epochs', type=int, default=15)
parser.add_argument('--vocab_size', type=int, default=50000)
parser.add_argument('--beam_size', type=int, default=16)
parser.add_argument('--batch_size', type=int, default=1)

# Layer sizes.
parser.add_argument('--hidden_dim', type=int, default=512)
parser.add_argument('--emb_dim', type=int, default=300)
parser.add_argument('--gradient_accum', type=int, default=1)

# Word-embedding source.
parser.add_argument('--word_emb_type', type=str, default='word2Vec', help='word2Vec/glove/FastText')
# If pre_train_emb is False, the emb type is NoPretrain.
parser.add_argument('--pre_train_emb', type=_str2bool, default=True, help='True/False')

# An empty argument list is parsed on purpose: inside a notebook sys.argv
# belongs to the kernel, so only the defaults declared above are used.
opt = parser.parse_args(args=[])
# NOTE: this rebinds the name `config`, which was imported from `utils` above,
# to the object returned by re_config (presumably provided by one of the star
# imports -- confirm). Later cells read attributes such as config.pre_train_emb
# from this object.
config = re_config(opt)


In [2]:
# Restore a trained model from a checkpoint file and put it on the GPU in
# evaluation mode. Relies on `config` built in the first cell.
from create_model.pg_multi_head import Model 
# Checkpoint path, resolved relative to the notebook's working directory.
load_model_path = 'Pointer_MultiHead.tar'  


import torch as T

# Let cuDNN benchmark and pick convolution/RNN algorithms automatically.
T.backends.cudnn.benchmark = True 
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints from a trusted source.
# No map_location is given, so tensors are restored to the device they were
# saved from.
checkpoint = T.load(load_model_path)
# The checkpoint is indexed with the keys 'vocab' (here) and 'model' (below).
vocab = checkpoint['vocab']
print(load_model_path)


# Build the network with the same embedding settings as the parsed config and
# the vocabulary stored in the checkpoint.
model = Model(pre_train_emb=config.pre_train_emb, 
              word_emb_type = config.word_emb_type, 
              vocab = vocab)

# Move to GPU first, then copy the saved weights into the module.
model = model.cuda()
model.load_state_dict(checkpoint['model'])
# Switch to evaluation mode (disables dropout). Being the last expression of
# the cell, the returned module is also what the notebook displays.
model.eval()

I0703 18:24:13.582961 140064335443776 file_utils.py:35] PyTorch version 1.4.0 available.


Pointer_MultiHead.tar


  "num_layers={}".format(dropout, num_layers))
I0703 18:24:18.355422 140064335443776 utils_any2vec.py:341] loading projection weights from /home/eagleuser/Users/leyan//Train-Data/Mix6_mainCat_Ekphrasis/Embedding/word2Vec/word2Vec.300d.txt


/home/eagleuser/Users/leyan//Train-Data/Mix6_mainCat_Ekphrasis/Embedding/word2Vec/word2Vec.300d.txt


I0703 18:24:27.620254 140064335443776 utils_any2vec.py:405] loaded (48560, 300) matrix from /home/eagleuser/Users/leyan//Train-Data/Mix6_mainCat_Ekphrasis/Embedding/word2Vec/word2Vec.300d.txt


Model(
  (encoder): Encoder(
    (lstm): LSTM(300, 512, batch_first=True, dropout=0.2, bidirectional=True)
    (reduce_h): Linear(in_features=1024, out_features=512, bias=True)
    (reduce_c): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (enc_attention): encoder_attention(
      (key_linear): Linear(in_features=1324, out_features=1024, bias=True)
      (linears): ModuleList(
        (0): Linear(in_features=1024, out_features=1024, bias=True)
        (1): Linear(in_features=1024, out_features=1024, bias=True)
        (2): Linear(in_features=1024, out_features=1024, bias=True)
        (3): Linear(in_features=1024, out_features=1024, bias=True)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (dec_attention): MultiHead_attention(
      (key_linear): Linear(in_features=812, out_features=512, bias=True)
      (linears): ModuleList(
        (0): Linear(in_features=512, out_features=512, bias=True)
        (1): Linear(in_features=512, out

In [3]:
import pandas as pd
import os
from tqdm import tqdm
from utils.seq2seq.batcher import Example, Batch
from translate.seq2seq_beam import *

# Inference settings: generate() below builds a batch from a single Example.
config.batch_size = 1
# NOTE(review): presumably the probability of feeding the ground-truth token
# to the decoder; 0.0 would mean it is never fed -- confirm in the decoder.
config.gound_truth_prob = 0.0

def generate(data):
    """Generate a summary string for a single example using beam search.

    Uses the notebook-level globals ``config``, ``vocab``, ``model`` and ``T``
    (torch), plus helpers brought in by the star imports
    (``get_input_from_batch``, ``get_output_from_batch``, ``beam_search``,
    ``START``, ``END``, ``UNKNOWN_TOKEN``).

    Args:
        data: mapping describing one example; it is passed unchanged to
            ``Example(config, vocab, data)``, which defines the required keys.

    Returns:
        str: the predicted tokens joined by single spaces.
    """
    # ready data
    ex = Example(config, vocab, data)
    b = Batch([ex])
    # Bare attribute access kept from the original; it has no effect unless
    # `enc_pad_mask` is a property with side effects (NOTE(review): confirm
    # and drop).
    b.enc_pad_mask

    # Pure inference: no backward pass is ever run, so disable autograd to
    # avoid building a graph for the encoder and every beam-search step.
    with T.no_grad():
        (enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab,
         extra_zeros, coverage, ct_e, enc_key_batch, enc_key_mask,
         enc_key_lens) = get_input_from_batch(b, config, batch_first=True)
        # The decoder-side tensors are not needed for generation; the call is
        # kept in case it has side effects on the batch (NOTE(review): confirm
        # and remove if it is pure).
        get_output_from_batch(b, config, batch_first=True)
        max_enc_len = max(T.max(enc_lens, dim=0)).tolist()[0]

        # encode
        enc_batch = model.embeds(enc_batch)  # Get embeddings for encoder input
        enc_key_batch = model.embeds(enc_key_batch)  # Get key embeddings for encoder input

        # model generate
        enc_out, enc_hidden = model.encoder(enc_batch, enc_lens, max_enc_len)

        # Feed encoder data to predict; only element 0 of the beam-search
        # result is used (presumably the single example's best hypothesis).
        pred_ids = beam_search(enc_hidden, enc_out, enc_padding_mask, ct_e, extra_zeros,
                               enc_batch_extend_vocab, enc_key_batch, enc_key_mask, model,
                               START, END, UNKNOWN_TOKEN)[0]

    # Map ids back to tokens and build the output sentence.
    return " ".join(vocab.id2word(d) for d in pred_ids)

In [4]:
# Load the crawled reviews of one product folder, keep the usable rows and
# generate a predicted summary for each of them.
folder = 'B07P6Y7954'
fn = 'NEW_CRAWL/%s/total.xlsx'%(folder)
df = pd.read_excel(fn)
# Created before the loop so every row dict passed to generate() carries the
# key, exactly as before.
df['pred_summary'] = ''

# One combined row mask instead of four successive re-filterings. The checks
# are row-wise, so the selected rows are the same.
keep_rows = (
    (df['summary_conflict'] == False)  # noqa: E712 -- element-wise comparison
    & (df['summary_polarity'] > 0)
    & (df['summary_subjectivity'] > 0)
    & (df['Noun_adj_keys'] != '[]')
)
# Boolean indexing plus reset_index returns a fresh frame with a 0..n-1 index,
# so the writes below never target a slice of the frame read from disk.
df = df[keep_rows].reset_index(drop=True)

with tqdm(total=len(df)) as pbar:
    for i, row in df.iterrows():
        # Use the row iterrows already yields; the old `df.iloc[i]` treated
        # the index label as a position.
        data = row.to_dict()
        data['review_ID'] = str(i)
        # The model is fed the lemmatised text and summary columns.
        data['review'] = data['lemm_reviewtext']
        data['summary'] = data['lemm_summary']

        df.loc[i, 'pred_summary'] = generate(data)
        pbar.set_description("%s row %s" % (folder, i))
        pbar.update(1)
        
        
        
        

B07P6Y7954 row 63: 100%|██████████| 64/64 [01:11<00:00,  1.10s/it]


In [37]:
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

def evaluate(data):
    """Score one predicted summary against its reference.

    Args:
        data: mapping with the reference text under ``'summary'`` and the
            generated text under ``'pred_summary'`` (both strings).

    Returns:
        dict: scores rounded to two decimals, inserted in this order:
        ``rouge_{1,2,l}_p``, ``rouge_{1,2,l}_r``, ``rouge_{1,2,l}_f``,
        ``Bleu_1`` .. ``Bleu_4``, ``Meteor``.
    """
    reference = data['summary']
    hypothesis = data['pred_summary']

    # Rouge: with avg=False a list is returned; take its first entry.
    rouge_result = Rouge().get_scores(hypothesis, reference, avg = False)[0]

    # BLEU: whitespace tokenisation, one weight tuple per n-gram order.
    reference_tokens = reference.split(" ")
    hypothesis_tokens = hypothesis.split(" ")
    bleu_weights = (
        (1, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (0.33, 0.33, 0.33, 0),
        (0.25, 0.25, 0.25, 0.25),
    )
    bleu_values = [
        round(sentence_bleu([reference_tokens], hypothesis_tokens, weights=w), 2)
        for w in bleu_weights
    ]

    # METEOR
    meteor_value = round(single_meteor_score(reference, hypothesis), 2)

    eval_scores = {}
    # Precision, recall, then F-score; within each, ROUGE-1, ROUGE-2, ROUGE-L.
    for stat in ('p', 'r', 'f'):
        for variant in ('1', '2', 'l'):
            key = 'rouge_%s_%s' % (variant, stat)
            eval_scores[key] = round(rouge_result['rouge-' + variant][stat], 2)

    for order, value in enumerate(bleu_values, start=1):
        eval_scores['Bleu_%d' % order] = value

    eval_scores['Meteor'] = meteor_value
    return eval_scores
    

In [38]:
# Score every prediction and store each metric in its own column of `df`
# (columns are created on first assignment).
with tqdm(total=len(df)) as pbar:
    for i, row in df.iterrows():
        # Use the row iterrows already yields; the old `df.iloc[i]` treated
        # the index label as a position. evaluate() only reads the 'summary'
        # and 'pred_summary' entries.
        eval_scores = evaluate(row.to_dict())
        for key, score in eval_scores.items():
            df.loc[i, key] = score
        pbar.set_description("%s row %s" % (folder, i))
        pbar.update(1)
            

B07P6Y7954 row 63: 100%|██████████| 64/64 [00:00<00:00, 101.68it/s]


In [40]:
# Save predictions and scores next to the notebook as <folder>.xlsx.
# The `encoding` keyword is no longer passed: it was never used by the
# xlsxwriter engine, was deprecated in pandas 1.5 and is rejected by 2.0+.
df.to_excel('%s.xlsx'%folder, engine='xlsxwriter')
print('pred num',len(df))

pred num 64


In [53]:
# Write the mean of every metric column to <folder>_res.txt, grouped by metric
# family. (The former bare `df.mean()` was removed: its result was discarded,
# and on pandas 2.x it raises for frames that contain text columns.)
report_sections = (
    ('Rouge-1', ('rouge_1_p', 'rouge_1_r', 'rouge_1_f')),
    ('Rouge-2', ('rouge_2_p', 'rouge_2_r', 'rouge_2_f')),
    ('Rouge-l', ('rouge_l_p', 'rouge_l_r', 'rouge_l_f')),
    ('BLEU', ('Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4')),
    ('Meteor', ('Meteor',)),
)

with open("%s_res.txt"%(folder), 'w',encoding='utf-8') as f:
    for title, columns in report_sections:
        f.write('##-- ' + title + ' --##\n')
        for column in columns:
            # Same "<name>: <mean>" line format as before.
            f.write(column + ': ' + str(df[column].mean()) + '\n')

# for folder in os.listdir('NEW_CRAWL'):
#     fn = 'NEW_CRAWL/%s/total.xlsx'%(folder)
#     df = pd.read_excel(fn)
    
#     print(fn)
#     break