In [1]:
import re
import json
import requests
from stanfordnlp.server import CoreNLPClient
from stanfordcorenlp import StanfordCoreNLP

In [2]:
class CoreServerAnn(object):
    def __init__(self, text, comefrom = None):
        self.text = text
        self.source_sens = []
        self.derived_sens = []
        self.quests = []
        self.comefrom = comefrom

        
class CoreServerQuest(object):
    def __init__(self, text, comefrom):
        self.text = text
        self.comefrom = comefrom
        

class CoreServerSen(object):
    def __init__(self, sen_dict, comefrom = None):
        self.sen_dict = sen_dict
        self.tokens = self.sen_dict['tokens']
        self.words = self._get_words(self.tokens)
        self.text = self._get_text(self.tokens)
        self.comefrom = comefrom
        self.entitymentions = self._get_entitymentions()
        
    def __len__(self):
        return len(self.sen_dict['tokens'])
    
    def _get_text(self, tokens):
        return "".join(token["word"] for token in tokens)
    
    def _get_words(self, tokens):
        return [token["word"] for token in tokens]
    
    def _get_entitymentions(self):
        return self.sen_dict.get('entitymentions', [])

        
class CoreServerAnalyzer(object):
    def __init__(self, url = 'http://localhost:9000',
                 properties = None,
                 lang = "zh",
                 timeout = 60
                ):
        self.url = url
        self.timeout = timeout
        self.lang = lang
        self.timeout = timeout
        if properties is None:
            properties = {"annotators": "tokenize,ssplit,pos,ner",
                          "ssplit.boundaryTokenRegex": "[.。]|[!?！？]",
                          "pipelineLanguage": self.lang,
                          "'ner.applyFineGrained": False
                         }
        self.properties = properties
    
    def __call__(self, ann, comefrom = None):
        if comefrom is None:
            comefrom = ann
        resp = requests.post(self.url, data=ann.text.encode('utf8'), params = {"properties": str(self.properties)},
                             timeout = self.timeout
                            )
        ann.source_sens = [CoreServerSen(sen_dict, comefrom) for sen_dict in resp.json()['sentences']]
        return ann


class CoreServerTranformer(object):
    def __init__(self, max_len = None, by_punc = True, rule = None,
                 url = "http://localhost:9000/tregex", lang = 'zh',
                 timeout = 60,
                 post_min_len = None,
                 post_by_ner = False,
                 entity_labels = None
                ):
        self.max_len = max_len
        self.by_punc = by_punc
        self.puncs = ['?!？！']
        self.rule = rule
        self.lang = lang
        self.properties = {"pipelineLanguage": self.lang}
        self.url = url
        self.timeout = timeout
        self.analyzer = CoreServerAnalyzer()
        self.post_min_len = post_min_len
        self.post_by_ner = post_by_ner
        self.entity_labels = entity_labels
        
    def __call__(self, ann):
        source_sens = ann.source_sens
        kept_sens = self._filter(source_sens)
        simplified_sens = self._simplify(kept_sens)
        derived_sens = self._post_filter(simplified_sens)
        ann.derived_sens = derived_sens
        return ann
        
    def _filter(self, source_sens):
        kept_sens = source_sens
        if self.max_len is not None:
            kept_sens = self._filter_by_len(kept_sens)
        if self.by_punc:
            kept_sens = self._filter_by_punc(kept_sens)
        return kept_sens
        
    def _filter_by_len(self, sens):
        kept_sens = [sen for sen in sens if len(sen) <= self.max_len]
        return kept_sens
    
    def _filter_by_punc(self, sens):
        kept_sens = [sen for sen in sens if sen.words[-1].strip() not in self.puncs]
        return kept_sens
    
    def _simplify(self, kept_sens):
        simplified_sens = []
        for sen in kept_sens:
            res = requests.post(self.url, data=sen.text.encode("utf8"), params = {"pattern": self.rule, "properties": str(self.properties)},
                                timeout = self.timeout
                               )
            res_dict = res.json()
            sub_sens = res_dict['sentences'][0]
            for sub_k, sub_v in sub_sens.items():
                    sub_treestr = sub_v["match"]
                    sub_text = self._treestr2text(sub_treestr)
                    tmp_ann = CoreServerAnn(sub_text)
                    tmp_ann = self.analyzer(tmp_ann, sen)
                    simplified_sens += tmp_ann.source_sens
        return simplified_sens
    
    def _treestr2text(self, treestr):
        pattern = r'[^\(\s\)]+\)'
        words_ = re.findall(pattern, treestr)
        words = [w_[:-1] for w_ in words_]
        text = "".join(words)
        return text
    
    def _post_filter(self, simplified_sens):
        kept_sens = simplified_sens
        if self.post_min_len is not None:
            kept_sens = self._post_filter_by_len(kept_sens)
        if self.post_by_ner:
            kept_sens = self._post_filter_by_ner(kept_sens)
        return kept_sens
    
    def _post_filter_by_len(self, sens):
        kept_sens = [sen for sen in sens if len(sen) >= self.post_min_len]
        return kept_sens
    
    def _post_filter_by_ner(self, sens):
        def check_entity(sen):
            entitymentions = sen.entitymentions
            if entitymentions == []:
                return False
            
            if any(em['ner'] in self.entity_labels for em in entitymentions) == False:
                return False
            
            return True
        kept_sens = [sen for sen in sens if check_entity(sen)]
        return kept_sens
    

class CoreServerTransducer(object):
    def __init__(self, entity2quest):
        self.entity2quest = entity2quest

    
    def __call__(self, ann):
        derived_sens = ann.derived_sens
        quests = self._gen_quests(derived_sens)
        ann.quests = quests
        return ann
   
    def _gen_quests(self, sens):
        quests = []
        for sen in sens:
            qs = self._sen2quests(sen)
            quests += qs
        return quests
    
    def _sen2quests(self, sen):
        entitymentions = sen.entitymentions
        sen_text = sen.text
        quests = []
        for em in entitymentions:
            ner = em['ner']
            cb, ce = em['characterOffsetBegin'], em['characterOffsetEnd']
            if ner in self.entity2quest:
                quest = sen_text[:cb] + self.entity2quest[ner] + sen_text[ce:]
                quests.append(CoreServerQuest(quest, sen))
        return quests

In [3]:
# text = "公民申请普通护照，应当由本人向其户籍所在地县级以上地方人民政府公安机关出入境管理机构提出，并提交以上真实有效的材料，现役军人按照管理权限履行报批手续后，由本人向所属部队驻地县级以上地方人民政府公安机关出入境管理机构提出。"
# text =  '何洛洛在群访中回应称自己一定是会去高考的，虽然今年错过了，\
# 但明年一定会全力以赴。何洛洛坦言每个人在追梦的道路上都有自己的选择和机会，\
# 他会对自己的选择全力以赴坚持到底。至于错过今年高考是否遗憾，\
# 何洛洛给出了否定的答案，“每个人都有自己的选择，既然选择了《创造营2019》，那我明年继续备战高考。”'

# text = "作为山东人具体一年到头要吃几顿饺子真的没有具体数字，想吃了就调上馅儿包上咱就吃说说我们这儿必须吃饺子的日子：小年吃、大年三十12点钟声一敲必须吃饺子、正月初二送年的时候吃、过冬至的时候吃（俗语是为了防止冻耳朵），有人要远行一定要吃送行的饺子（俗语上车的饺子下车的面）还有嫁女儿一定要吃饺子，俗语叫做“滚蛋饺”（哈哈）"
text = "马云是阿里巴巴集团的创始人之一，他于十几年前在浙江杭州创办了阿里巴巴集团。在2019年，阿里巴巴集团与上海签订了战略合作协议。"
analyzer = CoreServerAnalyzer()
ann = CoreServerAnn(text)
ann = analyzer(ann)

In [4]:
ann.source_sens

[<__main__.CoreServerSen at 0x2b5ccec1be0>,
 <__main__.CoreServerSen at 0x2b5ccec1c18>]

In [5]:
rule = "IP<(NP=np $..(VP=vp ?$.. PU)) >(ROOT|IP|CP)"
# rule = "ROOT < (IP<(NP=np $.. VP=vp))"
entity2quest = {"LOCATION": "{ 哪里 | 什么地方 }",
                "PERSON": "{ 谁 }",
                "GPE": "{ 哪里 | 什么地方 }",
                "ORGANIZATION": "{ 什么 组织|机构 }"
               }
entity_labels = list(entity2quest.keys())
print(entity_labels)
tranformer = CoreServerTranformer(rule = rule, entity_labels = entity_labels,
                                  post_by_ner= True
                                 )
ann = tranformer(ann)

['LOCATION', 'PERSON', 'GPE', 'ORGANIZATION']


In [6]:
ann.derived_sens

[<__main__.CoreServerSen at 0x2b5ccf355c0>,
 <__main__.CoreServerSen at 0x2b5ccf35668>,
 <__main__.CoreServerSen at 0x2b5cced89e8>]

In [7]:
for tmp_sen in ann.derived_sens:
    print(tmp_sen.text)

马云是阿里巴巴集团的创始人之一
他于十几年前在浙江杭州创办了阿里巴巴集团
在2019年，阿里巴巴集团与上海签订了战略合作协议。


In [8]:
for tmp_sen in ann.derived_sens:
    print(tmp_sen.entitymentions)

[{'docTokenBegin': 0, 'docTokenEnd': 1, 'tokenBegin': 0, 'tokenEnd': 1, 'text': '马云', 'characterOffsetBegin': 0, 'characterOffsetEnd': 2, 'ner': 'PERSON'}, {'docTokenBegin': 2, 'docTokenEnd': 4, 'tokenBegin': 2, 'tokenEnd': 4, 'text': '阿里巴巴集团', 'characterOffsetBegin': 3, 'characterOffsetEnd': 9, 'ner': 'ORGANIZATION'}, {'docTokenBegin': 6, 'docTokenEnd': 7, 'tokenBegin': 6, 'tokenEnd': 7, 'text': '之一', 'characterOffsetBegin': 13, 'characterOffsetEnd': 15, 'ner': 'MISC'}]
[{'docTokenBegin': 2, 'docTokenEnd': 3, 'tokenBegin': 2, 'tokenEnd': 3, 'text': '十几', 'characterOffsetBegin': 2, 'characterOffsetEnd': 4, 'ner': 'NUMBER'}, {'docTokenBegin': 3, 'docTokenEnd': 5, 'tokenBegin': 3, 'tokenEnd': 5, 'text': '年前', 'characterOffsetBegin': 4, 'characterOffsetEnd': 6, 'ner': 'MISC'}, {'docTokenBegin': 6, 'docTokenEnd': 7, 'tokenBegin': 6, 'tokenEnd': 7, 'text': '浙江', 'characterOffsetBegin': 7, 'characterOffsetEnd': 9, 'ner': 'STATE_OR_PROVINCE'}, {'docTokenBegin': 7, 'docTokenEnd': 8, 'tokenBegi

In [9]:
transducer = CoreServerTransducer(entity2quest)
ann = transducer(ann)

In [10]:
ann.quests

[<__main__.CoreServerQuest at 0x2b5ccf09940>,
 <__main__.CoreServerQuest at 0x2b5ccf097f0>,
 <__main__.CoreServerQuest at 0x2b5ccf09668>,
 <__main__.CoreServerQuest at 0x2b5ccec12b0>]

In [11]:
for q in ann.quests:
    print("问：", q.text)
    print("答1：", q.comefrom.text)
    print("答2：", q.comefrom.comefrom.text)
    print("答3：", q.comefrom.comefrom.comefrom.text)
    print('\n')

问： { 谁 }是阿里巴巴集团的创始人之一
答1： 马云是阿里巴巴集团的创始人之一
答2： 马云是阿里巴巴集团的创始人之一，他于十几年前在浙江杭州创办了阿里巴巴集团。
答3： 马云是阿里巴巴集团的创始人之一，他于十几年前在浙江杭州创办了阿里巴巴集团。在2019年，阿里巴巴集团与上海签订了战略合作协议。


问： 马云是{ 什么 组织|机构 }的创始人之一
答1： 马云是阿里巴巴集团的创始人之一
答2： 马云是阿里巴巴集团的创始人之一，他于十几年前在浙江杭州创办了阿里巴巴集团。
答3： 马云是阿里巴巴集团的创始人之一，他于十几年前在浙江杭州创办了阿里巴巴集团。在2019年，阿里巴巴集团与上海签订了战略合作协议。


问： 他于十几年前在浙江杭州创办了{ 什么 组织|机构 }
答1： 他于十几年前在浙江杭州创办了阿里巴巴集团
答2： 马云是阿里巴巴集团的创始人之一，他于十几年前在浙江杭州创办了阿里巴巴集团。
答3： 马云是阿里巴巴集团的创始人之一，他于十几年前在浙江杭州创办了阿里巴巴集团。在2019年，阿里巴巴集团与上海签订了战略合作协议。


问： 在2019年，{ 什么 组织|机构 }与上海签订了战略合作协议。
答1： 在2019年，阿里巴巴集团与上海签订了战略合作协议。
答2： 在2019年，阿里巴巴集团与上海签订了战略合作协议。
答3： 马云是阿里巴巴集团的创始人之一，他于十几年前在浙江杭州创办了阿里巴巴集团。在2019年，阿里巴巴集团与上海签订了战略合作协议。


