In [1]:
import argparse
import torch
import time
import json
import numpy as np
import math
import random
import xml.etree.ElementTree as ET
from subprocess import check_output
from utils_test import test_as, generate_repositioning_test_data, load_data_test, squad_test_data, save_test, predict_boundary_test_result
from generate_test_pred import generate_test_pred
from generate_num_pred_label import generate_num_test_data, json_to_csv, predict_test_label, return_predicted_test_label

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
# Fix every random number generator used by the notebook to one shared seed
# so repeated runs draw the same numbers. The call order is NumPy, Python's
# `random`, PyTorch (CPU), then PyTorch (CUDA).
SEED = 1337
for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)

In [3]:
class Model(torch.nn.Module):
    """Convolutional sequence tagger over a frozen word-embedding table.

    The network embeds token ids, applies two parallel convolutions (kernel
    sizes 5 and 3, 128 channels each) whose outputs are concatenated into 256
    channels, then three further kernel-size-5 convolutions, and finally a
    per-token linear layer producing ``num_classes`` logits. When ``crf`` is
    true the logits are decoded / scored by an AllenNLP
    ``ConditionalRandomField``; otherwise a plain softmax is used.
    """

    def __init__(self, gen_emb, domain_emb, num_classes=3, dropout=0.5, crf=True):
        """Build the layers.

        Args:
            gen_emb: 2-D numpy array ``(vocab_size, emb_dim)`` holding the
                general-purpose embedding weights. It is wrapped with
                ``torch.from_numpy`` and frozen (``requires_grad=False``).
            domain_emb: accepted for interface compatibility only; this
                implementation never reads it.
            num_classes: number of tag classes emitted per token.
            dropout: dropout probability shared by every dropout call.
            crf: if true, attach a CRF layer (requires ``allennlp``).
        """
        super(Model, self).__init__()
        # General-domain embedding table: allocate the layer, then overwrite
        # its weight with the supplied matrix and freeze it.
        self.gen_embedding = torch.nn.Embedding(gen_emb.shape[0], gen_emb.shape[1])
        self.gen_embedding.weight = torch.nn.Parameter(torch.from_numpy(gen_emb), requires_grad=False)

        # First layer: two parallel convolutions over the embedding channels.
        # The third positional argument is the KERNEL SIZE (5 and 3); the
        # padding keeps the sequence length unchanged. Stride is the default 1.
        self.conv1 = torch.nn.Conv1d(gen_emb.shape[1], 128, 5, padding=2)
        self.conv2 = torch.nn.Conv1d(gen_emb.shape[1], 128, 3, padding=1)
        self.dropout = torch.nn.Dropout(dropout)

        # Remaining three layers are identical: 256 -> 256 channels,
        # kernel size 5, padding 2.
        self.conv3 = torch.nn.Conv1d(256, 256, 5, padding=2)
        self.conv4 = torch.nn.Conv1d(256, 256, 5, padding=2)
        self.conv5 = torch.nn.Conv1d(256, 256, 5, padding=2)

        # Decoder 1: per-token fully connected classifier.
        self.linear_ae = torch.nn.Linear(256, num_classes)
        self.crf_flag = crf
        # Decoder 2 (optional): CRF on top of the logits. The import is kept
        # local so that crf=False works without allennlp installed.
        if self.crf_flag:
            from allennlp.modules import ConditionalRandomField
            self.crf = ConditionalRandomField(num_classes)

    def forward(self, x, x_len, x_mask, x_tag=None, testing=True):
        """Run the tagger.

        Args:
            x: integer tensor ``(batch, seq_len)`` of token ids.
            x_len: per-sentence lengths; only read on the non-CRF training
                path, where it is handed to ``pack_padded_sequence`` (which by
                default expects lengths sorted in decreasing order).
            x_mask: mask passed straight to the CRF layer; unused without CRF.
            x_tag: gold tags. With CRF it is passed to the CRF as is; without
                CRF it must expose ``.data`` aligned with the packed logits
                (e.g. a ``PackedSequence`` built with the same lengths).
            testing: True -> return predictions, False -> return a loss.

        Returns:
            * testing, CRF: the result of ``crf.viterbi_tags(logits, mask)``.
            * testing, no CRF: log-probabilities ``(batch, seq_len, num_classes)``.
            * training, CRF: the negated value returned by the CRF layer.
            * training, no CRF: scalar NLL loss over the packed tokens.
        """
        # Look up embeddings, then move channels to dim 1 as Conv1d expects.
        x_emb = self.gen_embedding(x)
        x_emb = self.dropout(x_emb).transpose(1, 2)
        # Concatenate the two first-layer feature maps along the channel dim.
        x_conv = torch.nn.functional.relu(
            torch.cat((self.conv1(x_emb), self.conv2(x_emb)), dim=1))

        x_conv = self.dropout(x_conv)
        x_conv = torch.nn.functional.relu(self.conv3(x_conv))
        x_conv = self.dropout(x_conv)
        x_conv = torch.nn.functional.relu(self.conv4(x_conv))
        x_conv = self.dropout(x_conv)
        x_conv = torch.nn.functional.relu(self.conv5(x_conv))
        # Back to (batch, seq_len, channels) for the per-token classifier.
        x_conv = x_conv.transpose(1, 2)
        x_logit = self.linear_ae(x_conv)

        if testing:
            if self.crf_flag:
                score = self.crf.viterbi_tags(x_logit, x_mask)
            else:
                # Normalise over the class axis. The previous code transposed
                # the class axis to dim 0 and relied on log_softmax's
                # deprecated implicit-dim choice; dim=2 is the same result.
                score = torch.nn.functional.log_softmax(x_logit, dim=2)
        else:
            if self.crf_flag:
                score = -self.crf(x_logit, x_tag, x_mask)
            else:
                packed = torch.nn.utils.rnn.pack_padded_sequence(x_logit, x_len, batch_first=True)
                # packed.data is (total_tokens, num_classes): classes on dim 1.
                score = torch.nn.functional.nll_loss(
                    torch.nn.functional.log_softmax(packed.data, dim=1), x_tag.data)
        return score

In [4]:
def _append_opinion(parent, text, start, end):
    """Append an <Opinion> element describing text[start:end] to ``parent``."""
    opin = ET.Element("Opinion")
    opin.attrib['target'] = text[start:end]
    opin.attrib['from'] = str(start)
    opin.attrib['to'] = str(end)
    parent.append(opin)


def label_rest_xml(fn, output_fn, corpus, label):
    """Write predicted opinion targets into a restaurant-style XML file.

    For every ``<sentence>`` in ``fn`` the raw ``<text>`` is walked character
    by character in lock-step with the sentence's tokens, so that token-level
    labels can be turned into character offsets. Each predicted span becomes
    an ``<Opinion target=... from=... to=...>`` child of a new ``<Opinions>``
    element appended to the sentence. The modified tree is written to
    ``output_fn``.

    Label handling, as implemented below:
      * 1 at the start of a token opens a new span (closing any open one),
      * 2 at the start of a token opens a span only if none is open,
      * 0 (or 1) at the start of a token closes the open span.

    Args:
        fn: path of the template XML to read.
        output_fn: path the annotated XML is written to.
        corpus: per-sentence token lists, indexed in document order.
        label: per-sentence label sequences, indexed like ``corpus``.
    """
    dom = ET.parse(fn)
    root = dom.getroot()
    for zx, sent in enumerate(root.iter("sentence")):
        tokens = corpus[zx]
        lb = label[zx]
        # An empty <text/> yields None; treat it as an empty sentence.
        text = sent.find('text').text or ""
        opins = ET.Element("Opinions")
        # token_idx: current token; pt: characters of it consumed so far;
        # tag_on: whether a span is currently open (starting at `start`).
        token_idx, pt, tag_on = 0, 0, False
        start = -1
        for ix, c in enumerate(text):
            # Current token fully consumed -> move on to the next one.
            if token_idx < len(tokens) and pt >= len(tokens[token_idx]):
                pt = 0
                token_idx += 1
            in_tokens = token_idx < len(tokens)

            if in_tokens and lb[token_idx] == 1 and pt == 0 and c != ' ':
                if tag_on:
                    # Two adjacent spans: close the previous one first.
                    _append_opinion(opins, text, start, ix)
                start = ix
                tag_on = True
            elif in_tokens and lb[token_idx] == 2 and pt == 0 and c != ' ' and not tag_on:
                start = ix
                tag_on = True
            elif in_tokens and (lb[token_idx] == 0 or lb[token_idx] == 1) and tag_on and pt == 0:
                _append_opinion(opins, text, start, ix)
                tag_on = False
            elif not in_tokens and tag_on:
                # Tokens exhausted while a span is open: close it here.
                _append_opinion(opins, text, start, ix)
                tag_on = False

            if c == ' ' or not in_tokens:
                # Spaces never consume token characters. Characters left over
                # after the last token are skipped as well; indexing `tokens`
                # here used to raise IndexError.
                pass
            elif tokens[token_idx][pt:pt+2] == '``' or tokens[token_idx][pt:pt+2] == "''":
                # Two-character quote token matched against one text character
                # (presumably tokenizer-normalised quotes -- TODO confirm).
                pt += 2
            else:
                pt += 1
        if tag_on:
            # Span still open at the end of the sentence: close at text end.
            _append_opinion(opins, text, start, len(text))
        sent.append(opins)
    dom.write(output_fn)

In [5]:
def _append_aspect_term(parent, text, start, end):
    """Append an <aspectTerm> element describing text[start:end] to ``parent``."""
    term = ET.Element("aspectTerm")
    term.attrib['term'] = text[start:end]
    term.attrib['from'] = str(start)
    term.attrib['to'] = str(end)
    parent.append(term)


def label_laptop_xml(fn, output_fn, corpus, label):
    """Write predicted aspect terms into a laptop-style XML file.

    For every ``<sentence>`` in ``fn`` the raw ``<text>`` is walked character
    by character in lock-step with the sentence's tokens, so that token-level
    labels can be turned into character offsets. Each predicted span becomes
    an ``<aspectTerm term=... from=... to=...>`` child of a new
    ``<aspectTerms>`` element appended to the sentence. The modified tree is
    written to ``output_fn``.

    Label handling, as implemented below:
      * 1 at the start of a token opens a new span (closing any open one),
      * 2 at the start of a token opens a span only if none is open,
      * 0 (or 1) at the start of a token closes the open span.

    Args:
        fn: path of the template XML to read.
        output_fn: path the annotated XML is written to.
        corpus: per-sentence token lists, indexed in document order.
        label: per-sentence label sequences, indexed like ``corpus``.
    """
    dom = ET.parse(fn)
    root = dom.getroot()
    for zx, sent in enumerate(root.iter("sentence")):
        tokens = corpus[zx]
        lb = label[zx]
        # An empty <text/> yields None; treat it as an empty sentence.
        text = sent.find('text').text or ""
        opins = ET.Element("aspectTerms")
        # token_idx: current token; pt: characters of it consumed so far;
        # tag_on: whether a span is currently open (starting at `start`).
        token_idx, pt, tag_on = 0, 0, False
        start = -1
        for ix, c in enumerate(text):
            # Current token fully consumed -> move on to the next one.
            if token_idx < len(tokens) and pt >= len(tokens[token_idx]):
                pt = 0
                token_idx += 1
            in_tokens = token_idx < len(tokens)

            if in_tokens and lb[token_idx] == 1 and pt == 0 and c != ' ':
                if tag_on:
                    # Two adjacent spans: close the previous one first.
                    _append_aspect_term(opins, text, start, ix)
                start = ix
                tag_on = True
            elif in_tokens and lb[token_idx] == 2 and pt == 0 and c != ' ' and not tag_on:
                start = ix
                tag_on = True
            elif in_tokens and (lb[token_idx] == 0 or lb[token_idx] == 1) and tag_on and pt == 0:
                _append_aspect_term(opins, text, start, ix)
                tag_on = False
            elif not in_tokens and tag_on:
                # Tokens exhausted while a span is open: close it here.
                _append_aspect_term(opins, text, start, ix)
                tag_on = False

            if c == ' ' or ord(c) == 160 or not in_tokens:
                # Spaces and non-breaking spaces (code point 160) never consume
                # token characters. Characters left over after the last token
                # are skipped as well; indexing `tokens` here used to raise
                # IndexError.
                pass
            elif tokens[token_idx][pt:pt+2] == '``' or tokens[token_idx][pt:pt+2] == "''":
                # Two-character quote token matched against one text character
                # (presumably tokenizer-normalised quotes -- TODO confirm).
                pt += 2
            else:
                pt += 1
        if tag_on:
            # Span still open at the end of the sentence: close at text end.
            _append_aspect_term(opins, text, start, len(text))
        sent.append(opins)
    dom.write(output_fn)

In [6]:
# Load the preprocessed laptop arrays; the train/test split inside is fixed.
ae_data = np.load("data/prep_data/laptop.npz")
test_x = ae_data['test_X']

# Walk through the batching steps on the first 128 test rows:
# count non-zero ids per row, sort rows by that count (longest first),
# and keep the permutation that undoes the sort.
_head_rows = test_x[0:128]
_nonzero = _head_rows != 0
batch_test_X_len = np.sum(_nonzero, axis=1)
batch_idx = np.argsort(batch_test_X_len)[::-1]   # longest-first row order
r_idx = np.argsort(batch_idx)                    # inverse permutation
batch_test_X_len = batch_test_X_len[batch_idx]
batch_test_X_mask = _nonzero[batch_idx].astype(np.uint8)
batch_test_X = _head_rows[batch_idx]

# Build the reverse vocabulary (index -> word) from the word -> index JSON.
word_idx_fn = 'data/prep_data/word_idx.json'
with open(word_idx_fn) as f:
    word_idx = json.load(f)
idx_word = {val: key for key, val in word_idx.items()}

In [7]:
def _build_token_rows(test_X, test_y, pred_y):
    """Render each sentence as a list of "<word> <gold tag> <predicted tag>" strings.

    Word ids equal to 0 are dropped and the rest are mapped through the
    module-level ``idx_word`` dictionary; the resulting words are then zipped
    with the leading entries of the gold and predicted tag rows.
    """
    results = []
    for j_th in range(len(test_X)):
        words = [idx_word[w] for w in test_X[j_th] if w != 0]
        results.append([
            " ".join([word, str(gold), str(pred)])
            for word, gold, pred in zip(words, test_y[j_th], pred_y[j_th])
        ])
    return results


def test(model, test_X, test_y, raw_X, domain, command, template, run_epoch, boundary_process, crf, generate_data, batch_size, num_process):
    """Predict tags for the test set, post-process them and score the result.

    Args:
        model: trained tagger; called as ``model(ids, lengths, mask, testing=True)``.
        test_X: 2-D numpy array of token ids, 0 meaning padding.
        test_y: gold tag rows aligned with ``test_X`` (only used for dumps).
        raw_X: per-sentence token lists handed to the XML labelling function.
        domain: ``'restaurant'`` or ``'laptop'``; selects the XML writer and
            how the evaluator output is parsed.
        command: evaluator command line as one string; it is split on
            whitespace and run with ``check_output``.
        template: path of the XML template the predictions are written into.
        run_epoch: run index forwarded to all project helper functions.
        boundary_process: if true, replace the predictions with
            ``generate_test_pred(run_epoch)``.
        crf: whether ``model`` returns CRF Viterbi output (list of
            ``(tags, score)`` pairs) rather than a log-probability tensor.
        generate_data: if true, dump predictions and run the boundary helpers.
        batch_size: number of sentences per forward pass.
        num_process: if true, run the aspect-count post-processing helpers.

    Returns:
        The score parsed from the evaluator's output, as a float.

    Raises:
        ValueError: if ``domain`` is not one of the two supported values.
    """
    # pred_y holds the predicted tag per token; padding positions stay 0.
    pred_y = np.zeros((test_X.shape[0], test_X.shape[1]), np.int16)
    model.eval()
    # Put the inputs on whatever device the model lives on; fall back to CUDA
    # (the previously hard-coded target) for a parameter-less model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for offset in range(0, test_X.shape[0], batch_size):
            batch = test_X[offset:offset+batch_size]
            nonpad = batch != 0
            # Number of real tokens in each sentence of the batch.
            batch_test_X_len = np.sum(nonpad, axis=1)
            # Sort sentences longest-first; batch_idx maps sorted -> original.
            batch_idx = batch_test_X_len.argsort()[::-1]
            batch_test_X_len = batch_test_X_len[batch_idx]
            batch_test_X_mask = torch.from_numpy(nonpad[batch_idx].astype(np.uint8)).long().to(device)
            batch_test_X = torch.from_numpy(batch[batch_idx]).long().to(device)

            batch_pred_y = model(batch_test_X, batch_test_X_len, batch_test_X_mask, testing=True)

            # r_idx undoes the length sort so rows line up with test_X again.
            r_idx = batch_idx.argsort()
            if crf:
                for ix, idx in enumerate(r_idx):
                    tags = batch_pred_y[idx][0]
                    pred_y[offset+ix, :len(tags)] = tags
            else:
                batch_pred_y = batch_pred_y.data.cpu().numpy().argmax(axis=2)[r_idx]
                pred_y[offset:offset+batch_size, :batch_pred_y.shape[1]] = batch_pred_y
    assert len(pred_y) == len(test_X)

    if generate_data:
        # Dump the raw predictions and run the boundary-repositioning helpers.
        test_as(_build_token_rows(test_X, test_y, pred_y), run_epoch)
        generate_repositioning_test_data(run_epoch)
        predict_boundary_test_result(run_epoch)

    if boundary_process:
        pred_y = generate_test_pred(run_epoch)

    # Aspect-count post-processing must come after the boundary step.
    if num_process:
        if boundary_process:
            # Re-dump using the boundary-corrected predictions first.
            test_as(_build_token_rows(test_X, test_y, pred_y), run_epoch)
        generate_num_test_data(run_epoch)
        json_to_csv(run_epoch)
        predict_test_label(run_epoch)
        pred_y = return_predicted_test_label(run_epoch, pred_y)

    command = command.split()
    if domain == 'restaurant':
        # command[6] is the argument following "-prd": the prediction file.
        label_rest_xml(template, command[6], raw_X, pred_y)
        acc = check_output(command).split()
        print(acc)
        # Token 9 with its first 10 bytes stripped is parsed as the score
        # (layout of the evaluator output -- TODO confirm against A.jar).
        return float(acc[9][10:])
    elif domain == 'laptop':
        # command[4] is the prediction file passed to the evaluator.
        label_laptop_xml(template, command[4], raw_X, pred_y)
        acc = check_output(command).split()
        print(acc)
        # Token 15 is the value right after b'F:' in the aspect-term section.
        return float(acc[15])
    raise ValueError("unsupported domain: %r (expected 'restaurant' or 'laptop')" % (domain,))

In [8]:
def evaluate(runs, data_dir, model_dir, domain, boundary_process, crf, generate_data, batch_size, add_num_loss, num_process, command, template):
    """Score ``runs`` saved models on the test set and log the mean score.

    The numeric arrays come from ``<data_dir><domain>.npz`` and the raw test
    tokens from ``<data_dir><domain>_raw_test.json``. Model ``r`` is loaded
    from ``<model_dir><domain><r>``; when ``add_num_loss`` is true the model
    directory is replaced by ``'model_num/'``. Each model is passed to
    ``test`` and the average of the returned scores is appended, with a
    timestamp, to a log file chosen by ``boundary_process`` and then printed.
    """
    # Numeric data (train and test splits, all as ids).
    ae_data = np.load(data_dir + domain + ".npz")
    # Raw (string) test sentences.
    with open(data_dir + domain + "_raw_test.json") as raw_file:
        raw_X = json.load(raw_file)

    if add_num_loss:
        model_dir = 'model_num/'

    results = []
    for run_id in range(runs):
        # Load the trained model saved for this run and score it.
        model = torch.load(model_dir + domain + str(run_id))
        score = test(model, ae_data['test_X'], ae_data['test_y'], raw_X, domain, command, template, run_id, boundary_process, crf, generate_data, batch_size, num_process)
        results.append(score)

    # Same record format for both logs; only the destination differs.
    log_path = 'evaluate_log/log_boundary_process.txt' if boundary_process else 'evaluate_log/log.txt'
    with open(log_path, 'a') as log:
        log.write(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + '\n')
        log.write(str(sum(results)/len(results)) + '\n' + '\n')
    print(sum(results)/len(results))

In [9]:
def str2bool(value):
    """Parse a command-line boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value -- including "False" and "0" -- becomes True. This
    converter maps the usual spellings explicitly and rejects anything else.
    The empty string maps to False, as it did under ``type=bool``.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--runs', type=int, default=5)
    parser.add_argument('--data_dir', type=str, default="data/prep_data/")
    parser.add_argument('--model_dir', type=str, default="model/")
    parser.add_argument('--domain', type=str, default="laptop")
    parser.add_argument('--crf', type=str2bool, default=True)
    parser.add_argument('--generate_data', type=str2bool, default=False)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--add_num_loss', type=str2bool, default=False)
    parser.add_argument('--num_process', type=str2bool, default=True)
    parser.add_argument('--boundary_process', type=str2bool, default=True)
    # parse_known_args tolerates unrecognised arguments (e.g. whatever a
    # notebook kernel leaves in sys.argv) instead of aborting.
    args = parser.parse_known_args()[0]

    # Evaluator command line and XML template for the chosen domain.
    if args.domain=='restaurant':
        command="java -cp script/A.jar absa16.Do Eval -prd data/official_data/pred.xml -gld data/official_data/EN_REST_SB1_TEST.xml.gold -evs 2 -phs A -sbt SB1"
        template="data/official_data/EN_REST_SB1_TEST.xml.A"
    elif args.domain=='laptop':
        command="java -cp script/eval.jar Main.Aspects data/official_data/pred.xml data/official_data/Laptops_Test_Gold.xml"
        template="data/official_data/Laptops_Test_Data_PhaseA.xml"
    else:
        # Previously fell through and failed later with a NameError.
        parser.error("--domain must be 'restaurant' or 'laptop', got %r" % (args.domain,))

    evaluate(args.runs, args.data_dir, args.model_dir, args.domain, args.boundary_process, args.crf, args.generate_data, args.batch_size, args.add_num_loss, args.num_process, command, template)

[b'Aspects', b'--------------------------------------', b'#System', b'Aspect', b'Terms=579', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.8531952', b'(494/579)', b'Rec:', b'0.75535166', b'(494/654)', b'F:', b'0.80129766', b'Categories', b'--------------------------------------', b'#System', b'Aspect', b'Categories=0', b'#Gold', b'Aspect', b'Categories=0', b'Pre:', b'NaN', b'(0/0)', b'Rec:', b'NaN', b'(0/0)', b'F:', b'NaN']
[b'Aspects', b'--------------------------------------', b'#System', b'Aspect', b'Terms=772', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.6735751', b'(520/772)', b'Rec:', b'0.795107', b'(520/654)', b'F:', b'0.7293128', b'Categories', b'--------------------------------------', b'#System', b'Aspect', b'Categories=0', b'#Gold', b'Aspect', b'Categories=0', b'Pre:', b'NaN', b'(0/0)', b'Rec:', b'NaN', b'(0/0)', b'F:', b'NaN']
[b'Aspects', b'--------------------------------------', b'#System', b'Aspect', b'Terms=730', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.67

In [10]:
# Original GE-CNN
[b'Aspects',b'#System', b'Aspect', b'Terms=615', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.8113821', b'(499/615)', b'Rec:', b'0.762997', b'(499/654)', b'F:', b'0.78644603',]
[b'Aspects',b'#System', b'Aspect', b'Terms=670', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.78507465', b'(526/670)', b'Rec:', b'0.80428135', b'(526/654)', b'F:', b'0.794562',]
[b'Aspects',b'#System', b'Aspect', b'Terms=607', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.82372326', b'(500/607)', b'Rec:', b'0.764526', b'(500/654)', b'F:', b'0.7930215',]
[b'Aspects',b'#System', b'Aspect', b'Terms=630', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.8095238', b'(510/630)', b'Rec:', b'0.7798165', b'(510/654)', b'F:', b'0.7943926',]
[b'Aspects',b'#System', b'Aspect', b'Terms=668', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.78892213', b'(527/668)', b'Rec:', b'0.8058104', b'(527/654)', b'F:', b'0.7972768',]
0.793139786

0.793139786

In [11]:
# With boundary_process enabled
[b'Aspects',b'#System', b'Aspect', b'Terms=603', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.85240465', b'(514/603)', b'Rec:', b'0.7859327', b'(514/654)', b'F:', b'0.81782025']
[b'Aspects',b'#System', b'Aspect', b'Terms=645', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.8310078', b'(536/645)', b'Rec:', b'0.81957185', b'(536/654)', b'F:', b'0.82525015']
[b'Aspects',b'#System', b'Aspect', b'Terms=590', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.8661017', b'(511/590)', b'Rec:', b'0.78134555', b'(511/654)', b'F:', b'0.8215434']
[b'Aspects',b'#System', b'Aspect', b'Terms=620', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.85', b'(527/620)', b'Rec:', b'0.8058104', b'(527/654)', b'F:', b'0.82731557']
[b'Aspects',b'#System', b'Aspect', b'Terms=656', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.8246951', b'(541/656)', b'Rec:', b'0.8272171', b'(541/654)', b'F:', b'0.8259542']
0.823576714

0.823576714

In [12]:
# With num_process enabled
[b'Aspects',b'#System', b'Aspect', b'Terms=563', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.86856127', b'(489/563)', b'Rec:', b'0.7477064', b'(489/654)', b'F:', b'0.80361545']
[b'Aspects',b'#System', b'Aspect', b'Terms=596', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.8624161', b'(514/596)', b'Rec:', b'0.7859327', b'(514/654)', b'F:', b'0.8224']
[b'Aspects',b'#System', b'Aspect', b'Terms=557', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.87971276', b'(490/557)', b'Rec:', b'0.74923545', b'(490/654)', b'F:', b'0.8092485']
[b'Aspects',b'#System', b'Aspect', b'Terms=574', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.87108016', b'(500/574)', b'Rec:', b'0.764526', b'(500/654)', b'F:', b'0.81433225']
[b'Aspects',b'#System', b'Aspect', b'Terms=600', b'#Gold', b'Aspect', b'Terms=654', b'Pre:', b'0.85833335', b'(515/600)', b'Rec:', b'0.78746176', b'(515/654)', b'F:', b'0.8213716']
0.81419356

0.81419356

In [None]:
# With both enabled together
# TODO: still to be optimized

In [13]:
# Call the project helper predict_test_label directly for run index 0.
# test() invokes this same helper with its run_epoch argument during the
# num_process step; what it reads and writes is defined in
# generate_num_pred_label and is not visible in this notebook.
predict_test_label(0)