In [17]:
import jieba
import pandas as pd

In [18]:
data = pd.read_csv("tweets.csv", encoding='utf-8')
data[data.key_word=="深圳天气"].shape
#data.key_word.unique()
db = data[data.key_word=="路况"]
contents = db.content

In [None]:
for str in contents:
    seg_list = jieba.cut(str, use_paddle=True)
    print("Paddle Mode: " + '/ '.join(seg_list))

In [1]:
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

class LtpParser(object):
    """ Initializing HIT-LTP project parsers. """
    def __init__(self):
        """ Initialize ltp models by loading. """
        self.segmentor = Segmentor()
        self.postagger = Postagger()
        self.parser = Parser()
        self.recognizer = NamedEntityRecognizer()
        self.labeller = SementicRoleLabeller()
        self.load()
        
    def load(self):
        """ Load pre-train model """
        LTP_DIR="F:/Projects/NACTrans2020/ltp_models_v3.4.0"
        self.segmentor.load(os.path.join(LTP_DIR,"cws.model"))
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser.load(os.path.join(LTP_DIR,"parser.model"))
        self.recognizer.load(os.path.join(LTP_DIR,"ner.model"))
        self.labeller.load(os.path.join(LTP_DIR,"pisrl_win.model"))
        print("INFO: Models loaded!")

    def release(self):
        """ Release ram space """
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()
        print("INFO: Models released")

    def format_labelrole(self, words, postags):
        """ Semantic role labelling using given modules."""
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = dict()
        for role in roles:
            roles_dict[role.index] = {arg.name:[arg.name, arg.range.start, arg.range.end] for arg in role.arguments}
        return roles_dict

    def build_parse_child_dict(self, words, postags,arcs):
        """ Parsing by creating a corresponding child dict for each word within, to record dependency parsing result. """
        child_dict_list, format_parse_list = [], []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index] in range(len(arcs)):
                    if arcs[arc_index].head == index+1:     # for arc_index starts from 1
                        if arcs[arc_index].relation in child_dict: # search for current relations
                            child_dict[arcs[arc_index].relation].append(arc_index)
                        else:
                            child_dict[arcs[arc_index].relation] =[]
                            child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
            
        # extract parent nodes' ids.
        rely_id = [ arc.head for arc in arcs ]
        relation = [ arc.relation for arc in arcs ]
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]
        for i in range(len(words)):
            # output results.
            a = [i, relation[i], words[i], postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1]]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    def parser_main(self, sentence):
        """ Main function """
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags,arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
    
#parse = LtpParser()
#parse.release()

In [33]:
#sentence = contents[0].replace("\u200b",' ').strip()
sentence = "王子懿得到习的钦点，火速上任。"
words, postags, child_dict_list, roles_dict, format_parse_list = parse.parser_main(sentence)
print(
    "原句：{0} \n 分词：{1} \n 词性：{2} \n 依存子类字典: {3} \n 语义标记：{4} \n 格式化三元关系组：{5}".format(
        sentence, words, postags, child_dict_list, roles_dict, format_parse_list
    )
)

原句：王子懿得到习的钦点，火速上任。 
 分词：['王子懿', '得到', '习', '的', '钦点', '，', '火速', '上任', '。'] 
 词性：['nh', 'v', 'nh', 'u', 'n', 'wp', 'd', 'v', 'wp'] 
 依存子类字典: [{}, {}, {}, {}, {}, {}, {}, {}, {}] 
 语义标记：{1: {'A0': ['A0', 0, 0], 'A1': ['A1', 2, 4]}, 7: {'A0': ['A0', 0, 0], 'ADV': ['ADV', 6, 6]}} 
 格式化三元关系组：[[0, 'SBV', '王子懿', 'nh', '得到', 1, 'v'], [1, 'HED', '得到', 'v', 'Root', -1, 'wp'], [2, 'ATT', '习', 'nh', '钦点', 4, 'n'], [3, 'RAD', '的', 'u', '习', 2, 'nh'], [4, 'VOB', '钦点', 'n', '得到', 1, 'v'], [5, 'WP', '，', 'wp', '得到', 1, 'v'], [6, 'ADV', '火速', 'd', '上任', 7, 'v'], [7, 'COO', '上任', 'v', '得到', 1, 'v'], [8, 'WP', '。', 'wp', '得到', 1, 'v']]


In [14]:
import re

class tripletableextractor(object):
    """ Triple table extractor for long text contents. """
    def __init__(self):
        self.parser = LtpParser()

    def split_contents(self, content):
        """ Split long text contents by punctuations. """
        return [sentence for sentence in re.split(r'[#？?！!。；;：:\n\r]', content) if sentence]

    def trunk_extractor(self, words, postags, roles_dict, role_index):
        """ Triple extraction by searching for the trunk of sentences, using semantic role labelling. """
        v = words[role_index]
        role_info = roles_dict[role_index]
        if 'A0' in role_info.keys() and 'A1' in role_info.keys():
            # Subject
            s = ' '.join([words[word_index] for word_index in range(role_info['A0'][1], role_info['A0'][2]+1) if postags[word_index][0] not in ['w','u','x'] and words[word_index]])
            # Object
            o = ' '.join([words[word_index] for word_index in range(role_info['A1'][1], role_info['A1'][2]+1) if postags[word_index][0] not in ['w','u','x'] and words[word_index]])
            if s and o:
                return [s, v, o]

        #  Reserved codes for double table extraction. (No-use currently)
        # elif 'A0' in role_info:
        #     s = ''.join([words[word_index] for word_index in range(role_info['A0'][1], role_info['A0'][2] + 1) if
        #                  postags[word_index][0] not in ['w', 'u', 'x']])
        #     if s:
        #         return '2', [s, v]
        # elif 'A1' in role_info:
        #     o = ''.join([words[word_index] for word_index in range(role_info['A1'][1], role_info['A1'][2]+1) if
        #                  postags[word_index][0] not in ['w', 'u', 'x']])
        #     return '3', [v, o]

        return []

    def triple_extractor(self, words, postags, child_dict_list, arcs, roles_dict):
        """ The main function for triple table extraction. """
        svos = []
        for idx in range(len(postags)):
            tmp = 1
            # First, use semantic role labelling for extraction.
            if idx in roles_dict:
                triple = self.trunk_extractor(words, postags, roles_dict, idx)
                if triple:
                    svos.append(triple) 
                    tmp =0 
            if tmp == 1:
                # If returned empty triple, use dependency parsing for extraction.
                if postags[idx]:
                    # Extract triple centring around verbs.
                    child_dict = child_dict_list[idx]
                    
                    # Direct relations
                    if 'SBV' in child_dict and 'VOB' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                        r = words[idx]
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                        svos.append([e1,r,e2])
                    
                    # Variant - attribute
                    relation = arcs[idx][0]
                    head = arcs[idx][2]
                    if relation == 'ATT':
                        if 'VOB' in child_dict:
                            e1 = self.complete_e(words, postags, child_dict_list, head - 1)
                            r = words[idx]
                            e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                            tmp_string = r + e2
                            if tmp_string == e1[:len(tmp_string)]:
                                e1 = e1[len(tmp_string)]
                            if tmp_string not in e1:
                                svos.append([e1,r,e2])
                    
                    # Variant - complement & preposition-object
                    if 'SBV' in child_dict and 'CMP' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                        cmp_idx = child_dict['CMP'][0]
                        r = words[idx] + words[cmp_idx]
                        if 'POB' in child_dict_list[cmp_idx]:
                            e2 = self.complete_e(words, postags, child_dict_list, child_dict_list[cmp_idx]['POB'][0])
                            svos.append([e1,r,e2])
        return svos

    def complete_e(self, words, postags, child_dict_list, word_idx):
        """ Extend subjects or objects found. """
        child_dict = child_dict_list[word_idx]
        prefix = ''
        postfix = ''
        
        # Extension mode 1 - Attribute
        if 'ATT' in child_dict: 
            for idx in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])

        # Extension mode 2 - Further subject-verb or verb-object
        if postags[word_idx] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
        
        return prefix + words[word_idx] + postfix

    def triple_main(self, contents):
        """ Main function for triple table extraction. """
        sentences = self.split_contents(contents)
        svos = []
        for sentence in sentences:
            words , postags, child_dict_list, roles_dict, arcs = self.parser.parser_main(sentence)
            svo = self.triple_extractor(words, postags, child_dict_list, arcs, roles_dict)
            svos += svo

        return svos
    
    def load(self):
        self.parser.load()

    def release(self):
        self.parser.release()

In [15]:
extractor = tripletableextractor()

INFO: Models loaded!


In [None]:
"""text = 近日，一条男子高铁吃泡面被女乘客怒怼的视频引发热议。女子情绪激动，言辞激烈，大声斥责该乘客，称高铁上有规定不能吃泡面，质问其“有公德心吗”“没素质”。视频曝光后，该女子回应称，因自己的孩子对泡面过敏，曾跟这名男子沟通过，但对方执意不听，她才发泄不满，并称男子拍视频上传已侵犯了她的隐私权和名誉权，将采取法律手段。12306客服人员表示，高铁、动车上一般不卖泡面，但没有规定高铁、动车上不能吃泡面。
                高铁属于密封性较强的空间，每名乘客都有维护高铁内秩序，不破坏该空间内空气质量的义务。这也是乘客作为公民应当具备的基本品质。但是，在高铁没有明确禁止食用泡面等食物的背景下，以影响自己或孩子为由阻挠他人食用某种食品并厉声斥责，恐怕也超出了权利边界。当人们在公共场所活动时，不宜过分干涉他人权利，这样才能构建和谐美好的公共秩序。
                一般来说，个人的权利便是他人的义务，任何人不得随意侵犯他人权利，这是每个公民得以正常工作、生活的基本条件。如果权利可以被肆意侵犯而得不到救济，社会将无法运转，人们也没有幸福可言。如西谚所说，“你的权利止于我的鼻尖”，“你可以唱歌，但不能在午夜破坏我的美梦”。无论何种权利，其能够得以行使的前提是不影响他人正常生活，不违反公共利益和公序良俗。超越了这个边界，权利便不再为权利，也就不再受到保护。
                在“男子高铁吃泡面被怒怼”事件中，初一看，吃泡面男子可能侵犯公共场所秩序，被怒怼乃咎由自取，其实不尽然。虽然高铁属于封闭空间，但与禁止食用刺激性食品的地铁不同，高铁运营方虽然不建议食用泡面等刺激性食品，但并未作出禁止性规定。由此可见，即使食用泡面、榴莲、麻辣烫等食物可能产生刺激性味道，让他人不适，但是否食用该食品，依然取决于个人喜好，他人无权随意干涉乃至横加斥责。这也是此事件披露后，很多网友并未一边倒地批评食用泡面的男子，反而认为女乘客不该高声喧哗。
                现代社会，公民的义务一般分为法律义务和道德义务。如果某个行为被确定为法律义务，行为人必须遵守，一旦违反，无论是受害人抑或旁观群众，均有权制止、投诉、举报。违法者既会受到应有惩戒，也会受到道德谴责，积极制止者则属于应受鼓励的见义勇为。如果有人违反道德义务，则应受到道德和舆论谴责，并有可能被追究法律责任。如在公共场所随地吐痰、乱扔垃圾、脱掉鞋子、随意插队等。此时，如果行为人对他人的劝阻置之不理甚至行凶报复，无疑要受到严厉惩戒。
                当然，随着社会的发展，某些道德义务可能上升为法律义务。如之前，很多人对公共场所吸烟不以为然，烟民可以旁若无人地吞云吐雾。现在，要是还有人不识时务地在公共场所吸烟，必然将成为众矢之的。
                再回到“高铁吃泡面”事件，要是随着人们观念的更新，在高铁上不得吃泡面等可能产生刺激性气味的食物逐渐成为共识，或者上升到道德义务或法律义务。斥责、制止他人吃泡面将理直气壮，否则很难摆脱“矫情”，“将自我权利凌驾于他人权利之上”的嫌疑。
                在相关部门并未禁止在高铁上吃泡面的背景下，吃不吃泡面系个人权利或者个人私德，是不违反公共利益的个人正常生活的一部分。如果认为他人吃泡面让自己不适，最好是请求他人配合并加以感谢，而非站在道德制高点强制干预。只有每个人行使权利时不逾越边界，与他人沟通时好好说话，不过分自我地将幸福和舒适凌驾于他人之上，人与人之间才更趋于平等，公共生活才更趋向美好有序。"""
text = contents[0]
svos = extractor.triple_main(text)
print("svos: {}".format(svos))
extractor.release()

In [21]:
from operator import methodcaller
from tqdm import tqdm
class test(object):
    def __init__(self):
        self.a = 1
        self.b =2 
        self.c = 3
        self.d = 4
    
    def tqdm_bar(self):
        for  (var,index) in tqdm(vars(self).items()):
            print('\n',index,var)

    def type_tester(self):
        load = methodcaller('load')
        for var in vars(self):
            print(type(methodcaller(var)))

t = test()
t.type_tester()

<class 'operator.methodcaller'>
<class 'operator.methodcaller'>
<class 'operator.methodcaller'>
<class 'operator.methodcaller'>


In [22]:
help(methodcaller)

Help on class methodcaller in module operator:

class methodcaller(builtins.object)
 |  methodcaller(name, ...) --> methodcaller object
 |  
 |  Return a callable object that calls the given method on its operand.
 |  After f = methodcaller('name'), the call f(r) returns r.name().
 |  After g = methodcaller('name', 'date', foo=1), the call g(r) returns
 |  r.name('date', foo=1).
 |  
 |  Methods defined here:
 |  
 |  __call__(self, /, *args, **kwargs)
 |      Call self as a function.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.
 |  
 |  __reduce__(...)
 |      Return state information for pickling
 |  
 |  __repr__(self, /)
 |      Return repr(self).



In [9]:
import requests
from lxml import etree


url = "https://lishi.tianqi.com/shenzhen/201612.html"
 
headers = {
    'User-Agent': "PostmanRuntime/7.16.3",
    'Accept': "*/*",
    'Cache-Control': "no-cache",
    'Host': "lishi.tianqi.com",
    'Accept-Encoding': "gzip, deflate",
    'Connection': "keep-alive",
    'cache-control': "no-cache"
    }

response = requests.request("GET", url, headers=headers)
tree_node = etree.HTML(response.text)
records = tree_node.xpath('//*[@class="lishitable_content clearfix"]/li')
for record in records:
    try:
        print('_date: ', record.xpath('string(.//a/text())'))
        print('max_temp: ', record.xpath('string(.//div[2]/text())'))
        print('min_temp: ', record.xpath('string(.//div[3]/text())'))
        print('weather: ', record.xpath('string(.//div[4]/text())'))
        print('wind_dir: ', record.xpath('string(.//div[5]/text())'))
        print('\n')
    except Exception as e:
        print("Error: ", e)

_date:  2016-12-01
max_temp:  22
min_temp:  16
weather:  多云
wind_dir:  北风 2级


_date:  2016-12-02
max_temp:  23
min_temp:  17
weather:  多云
wind_dir:  北风 1级


_date:  2016-12-03
max_temp:  23
min_temp:  18
weather:  多云
wind_dir:  西北风 1级


_date:  2016-12-04
max_temp:  24
min_temp:  19
weather:  多云
wind_dir:  东北风 1级


_date:  2016-12-05
max_temp:  26
min_temp:  18
weather:  多云
wind_dir:  北风 1级


_date:  2016-12-06
max_temp:  22
min_temp:  15
weather:  多云
wind_dir:  北风 1级


_date:  2016-12-07
max_temp:  22
min_temp:  16
weather:  多云
wind_dir:  北风 1级


_date:  2016-12-08
max_temp:  25
min_temp:  16
weather:  多云
wind_dir:  北风 1级


_date:  2016-12-09
max_temp:  23
min_temp:  16
weather:  多云
wind_dir:  北风 1级


_date:  2016-12-10
max_temp:  24
min_temp:  17
weather:  多云
wind_dir:  南风 1级


_date:  2016-12-11
max_temp:  25
min_temp:  18
weather:  多云
wind_dir:  东风 2级


_date:  2016-12-12
max_temp:  25
min_temp:  19
weather:  多云
wind_dir:  东风 2级


_date:  2016-12-13
max_temp:  27
min_temp:  16
wea