### 获取数据pos_tag及ner
每个token的范围是原句子中位置的左闭右开区间

In [1]:
import spacy
nlp = spacy.load("zh_core_web_sm")

In [2]:
#nlp = spacy.load("zh_core_web_sm")
#sentences = ["物质决定意识，意识反作用于物质。","清华大学位于海淀区五道口。"]

# pos_tag后的句子
def pos_tag_sentences(sentences):
    pos_sentences = []
    for string in sentences:
        doc = nlp(string)
        pos_string = []
        pos_sentences.append(pos_string)
        i = 0
        for token in doc:
            token_length = len(token.text)
            token_start = i
            token_end = i+token_length
            i = token_end

            pos_string.append((token.text, token.tag_+"_pos", (token_start, token_end)))
    return pos_sentences

In [3]:
# ner后的句子
def ner_tag_sentences(sentences):
    ner_sentences = []
    for string in sentences:
        doc = nlp(string)
        ner_string = []
        ner_sentences.append(ner_string)
        for ent in doc.ents:
            ner_string.append((ent.text, ent.label_, (ent.start_char, ent.end_char)))
            # print(ent.text, ent.start_char, ent.end_char, ent.label_)
    return ner_sentences

### 利用pos_tag和ner找到实体区间
区间范围为原句子中字的位置的左闭右开区间

In [4]:
# 实体抽取，需要记录词语和起止位置
def find_argument(pos_sentences, ner_sentences):
    argument_list = []
    if len(pos_sentences) != 0:
        for pos_token in pos_sentences:
            if pos_token[1] == "NN_pos" or pos_token[1] == "NR_pos" or pos_token[1] == "NT_pos":
                argument_list.append(pos_token[2])
    
    if len(ner_sentences) != 0:
        for ner_token in ner_sentences:
            argument_list.append(ner_token[2])
    print(argument_list)
    
    # 如果两个argument区域相交，合并为一个
    unified_argument = []
    for i in range(len(argument_list)):
        if argument_list[i][0] == (-1,-1):
            continue
        for j in range(i+1,len(argument_list)):
            if argument_list[j] == (-1,-1):
                continue
            if (argument_list[i][1] >= argument_list[j][0] and argument_list[i][0] < argument_list[j][1])\
                or (argument_list[j][1] >= argument_list[i][0] and argument_list[j][0] < argument_list[i][1]):
                argument_list[j] = (min(argument_list[i][0], argument_list[j][0]), max(argument_list[i][1], argument_list[j][1]))
                argument_list[i] = (-1,-1)
                break
        if argument_list[i][0] != -1 and argument_list[i][1] != -1:
            unified_argument.append(argument_list[i])
    
    return unified_argument

In [8]:
sentence = ["清华大学位于海淀区五道口。"]
pos_sentence = pos_tag_sentences(sentence)
ner_sentence = ner_tag_sentences(sentence)
find_argument(pos_sentence, ner_sentence)

[('五道口', 'DATE', (9, 12))]


[('五道口', 'DATE', (9, 12))]

### 利用hueristic信息进行关系词抽取
syntactic信息已知，lexical信息需要构建词典获得

In [9]:
# 关系词抽取
import re

In [10]:
# 关系词unify
def unify_relation(relations):
    unified_relations = []
    for i in range(len(relations)):
        if relations[i][0] == -1 and relations[i][1] == -1:
            continue
        for j in range(i+1, len(relations)):
            if relations[j][0] == -1 and relations[j][1] == -1:
                continue
            if (relations[i][1] >= relations[j][0] and relations[i][0] < relations[j][1]) \
                or (relations[j][1] >= relations[i][0] and relations[j][0] < relations[i][1]):
                relations[j] = (min(relations[i][0], relations[j][0]), max(relations[i][1], relations[j][1]))
                relations[i] = (-1,-1)
                break
        if relations[i][0] != -1 and relations[i][1] != -1:
            unified_relations.append(relations[i])
    return unified_relations

In [11]:
# 可能的关系模式
VERB = "(RB_pos)?" + "(MD_pos|VB_pos|VBD_pos|VBP_pos|VBZ_pos|VBG_pos|VBN_pos)" + "(RP_pos)?(RB_pos)?"
WORD = "(\$_pos|PRP\$_pos|CD_pos|DT_pos|JJ_pos|JJS_pos|JJR_pos|NN_pos" + "|NNS_pos|NNP_pos|NNPS_pos|POS_pos|PRP_pos|RB_pos|RBR_pos|RBS_pos" + "|VBN_pos|VBG_pos)"
PREP = "(RB_pos)?(IN_pos|TO_pos|RP_pos)(RB_pos)?"

LONG_RELATION_PATTERN = "(%s(%s*(%s)+)?)+" % (VERB, WORD, PREP)
SHORT_RELATION_PATTERN = "(%s(%s)?)+" % (VERB, PREP)

In [12]:
# 中文 可能的关系模式
VERB = "(VA_posDEV_pos)?(VC_pos|VE_pos|VV_pos)"
WORD = "(JJ_pos|VA_pos|DEC_pos|DEG_pos" + "|NN_pos|NR_pos|NT_pos" + "|M_pos|LC_pos|DEV_pos|DT_pos)"
PREP = "(VA_posDEV_pos)?(P_pos)(VA_posDEV_pos)?"

LONG_RELATION_PATTERN = "((%s(%s*)+)?%s)+" % (PREP, WORD, VERB)
SHORT_RELATION_PATTERN = "((%s(%s)?)+)|(((%s)?%s)+)" % (VERB, PREP, PREP, VERB)

In [13]:
pattern_long = re.compile(LONG_RELATION_PATTERN)
pattern_short = re.compile(SHORT_RELATION_PATTERN)

# 从经过pos_tag的句子中，提取所有符合sytactic constraint和lexical constraint的关系词，返回关系词在句中的起止位置
def find_relation(processed_sentence, use_lexical, k):
    pos_rep = ""
    position = []
    for i,token in enumerate(processed_sentence):
        pos_rep += token[1]
        for j in range(len(token[1])):
            position.append(i)
    print(pos_rep)
    print(position)
    
    #pos_rep = "RB_posVB_posCD_posIN_posRB_posRB_posRB_posVB_posCD_posIN_pos"
    extract_relation = [] # 存储relation在原句子中的位置（第几个词开始，第几个词结束，按照pos_tag分词后的结构计数）,左闭右开区间
    # vwp结构检测
    m_long = pattern_long.finditer(pos_rep)
    for relation in m_long:
        relation_start_token = position[relation.start()]
        relation_end_token = position[relation.end()-1]
        relation_start = processed_sentence[relation_start_token][2][0]
        relation_end = processed_sentence[relation_end_token][2][1]
        relation_position = (relation_start, relation_end)
        extract_relation.append(relation_position)

    # vp结构检测
    m_short = pattern_short.finditer(pos_rep)
    for relation in m_short:
        relation_start_token = position[relation.start()]
        relation_end_token = position[relation.end()-1]
        relation_start = processed_sentence[relation_start_token][2][0]
        relation_end = processed_sentence[relation_end_token][2][1]
        relation_position = (relation_start, relation_end)
        extract_relation.append(relation_position)
    
    print(extract_relation)
    # 判断是否符合lexical constraint
    if use_lexical == True:
        for index, relation in enumerate(extract_relation):
            relation_text = ""
            for i in range(relation[0], relation[1]):
                relation_text += pocessed_sentence[i][0]
            if dict_relation_freq[relation_text] < k:
                extract_relation[index] = (-1, -1)

    # 关系词集合去重
    extract_relation = unify_relation(extract_relation)
    print(extract_relation)
    
    return extract_relation

In [14]:
find_relation(processed_sentences[0], False, 0)

NameError: name 'processed_sentences' is not defined

### 根据获得的relation列表和argument列表进行关系组合
为每个关系词找到左边和右边最近的argument，构成relation

In [15]:
# 为每个relation找到与之对应的argument，返回arg1-rel-arg2结构的列表，表中实体与关系均由文本中的位置表示
def find_relation_argument(relations, arguments):
    arg1_rel_arg2_pos = []
    for relation in relations:
        # 找到与relation相距最近的左侧和右侧实体
        left_near = (-1,-1)
        right_near = (-1,-1)
        for argument in arguments:
            if argument[0] > left_near[0] and argument[0] < relation[0] and argument[1] <= relation[1]:
                left_near = argument
            if (right_near == (-1,-1) and argument[1] > relation[1] and argument[0] >= relation[0]) \
                or (argument[1] < right_near[1] and argument[1] > relation[1] and argument[0] >= relation[0]):
                right_near = argument
        print(relation, left_near, right_near)
        
        # 如果没有符合要求的实体，则该关系无法找到合适的实体，开始对新的关系进行三元组匹配
        if left_near == (-1,-1) or right_near == (-1,-1):
            continue
        
        # 如果实体与关系有交叉部分，则将实体缩短成不交叉的部分
        if left_near[1] > relation[0]:
            left_near = (left_near[0], relation[0])
        if right_near[0] < relation[1]:
            right_near = (relation[1], right_near[1])
            
        arg1_rel_arg2_pos.append((left_near, relation, right_near))
    return arg1_rel_arg2_pos

In [16]:
relations = [(3,5), (8,9)]
arguments = [(1,2), (10,11), (2,4), (6,7)]
find_relation_argument(relations, arguments)

(3, 5) (2, 4) (6, 7)
(8, 9) (6, 7) (10, 11)


[((2, 3), (3, 5), (6, 7)), ((6, 7), (8, 9), (10, 11))]

### 将relaiton与argument还原为文本，形成文本组成的三元组抽取结果

In [17]:
def find_relation_argument_text(sentence, arg1_rel_arg2_pos):
    arg1 = ""
    for i in range(arg1_rel_arg2_pos[0][0], arg1_rel_arg2_pos[0][1]):
        arg1 += sentence[i]
    
    rel = ""
    for j in range(arg1_rel_arg2_pos[1][0], arg1_rel_arg2_pos[1][1]):
        rel += sentence[j]
    
    arg2 = ""
    for k in range(arg1_rel_arg2_pos[2][0], arg1_rel_arg2_pos[2][1]):
        arg2 += sentence[k]
    
    extract_triple = {'sentence': sentence, 'arg1':arg1, 'relation': rel, 'arg2': arg2}
    return extract_triple

In [18]:
sentence = "奥巴马是美国总统。"
arg1_rel_arg2_pos = [(0,3),(3,4),(4,8)]
find_relation_argument_text(sentence, arg1_rel_arg2_pos)

{'sentence': '奥巴马是美国总统。', 'arg1': '奥巴马', 'relation': '是', 'arg2': '美国总统'}

### 用logistic回归对每个三元组的正确概率进行估计

In [199]:
from sklearn.linear_model import LogisticRegression

In [None]:
def train_log_regression:
    

In [None]:
def log_prob(extract_triple):
    return 1

### 整体流程

In [21]:
sentences = ["德国总统高克。","高克访问中国。","习近平在上海视察。","习近平对埃及进行国事访问。","奥巴马毕业于哈佛大学。",
             "习近平主席和李克强总理接见普京。","习近平访问了美国和英国。","高克访问中国，并在同济大学发表演讲。"]
pos_tag = pos_tag_sentences(sentences)
print(pos_tag)
ner_tag = ner_tag_sentences(sentences)
print(ner_tag)

extracted_triple = []
for i, sentence in enumerate(sentences):
    arguments = find_argument(pos_tag[i], ner_tag[i])
    relations = find_relation(pos_tag[i], False, 1)
    arg1_rel_arg2_pos = find_relation_argument(relations, arguments)
    
    for triple in arg1_rel_arg2_pos:
        text_triple = find_relation_argument_text(sentence, triple)
        extracted_triple.append(text_triple)

print(extracted_triple)

[[('德国', 'NR_pos', (0, 2)), ('总统', 'NN_pos', (2, 4)), ('高克', 'NR_pos', (4, 6)), ('。', 'PU_pos', (6, 7))], [('高克', 'AD_pos', (0, 2)), ('访问', 'VV_pos', (2, 4)), ('中国', 'NR_pos', (4, 6)), ('。', 'PU_pos', (6, 7))], [('习近平', 'NR_pos', (0, 3)), ('在', 'P_pos', (3, 4)), ('上海', 'NR_pos', (4, 6)), ('视察', 'VV_pos', (6, 8)), ('。', 'PU_pos', (8, 9))], [('习近', 'VV_pos', (0, 2)), ('平对', 'NR_pos', (2, 4)), ('埃及', 'NR_pos', (4, 6)), ('进行', 'VV_pos', (6, 8)), ('国事', 'NN_pos', (8, 10)), ('访问', 'NN_pos', (10, 12)), ('。', 'PU_pos', (12, 13))], [('奥巴马', 'NR_pos', (0, 3)), ('毕业于', 'VV_pos', (3, 6)), ('哈佛', 'NR_pos', (6, 8)), ('大学', 'NN_pos', (8, 10)), ('。', 'PU_pos', (10, 11))], [('习近平', 'NR_pos', (0, 3)), ('主席', 'NN_pos', (3, 5)), ('和', 'CC_pos', (5, 6)), ('李克强', 'NR_pos', (6, 9)), ('总理', 'NN_pos', (9, 11)), ('接见', 'VV_pos', (11, 13)), ('普京', 'NR_pos', (13, 15)), ('。', 'PU_pos', (15, 16))], [('习近平', 'NR_pos', (0, 3)), ('访问', 'VV_pos', (3, 5)), ('了', 'AS_pos', (5, 6)), ('美国', 'NR_pos', (6, 8)), ('和', 'CC_pos

In [20]:
sentences = ["小明成为了第一个吃螃蟹的人。","清华大学位于海淀区五道口。"]
pos_tag = pos_tag_sentences(sentences)
pos_tag

[[('小明', 'NR_pos', (0, 2)),
  ('成为', 'VV_pos', (2, 4)),
  ('了', 'AS_pos', (4, 5)),
  ('第一', 'OD_pos', (5, 7)),
  ('个', 'M_pos', (7, 8)),
  ('吃', 'VV_pos', (8, 9)),
  ('螃蟹', 'NN_pos', (9, 11)),
  ('的', 'DEC_pos', (11, 12)),
  ('人', 'NN_pos', (12, 13)),
  ('。', 'PU_pos', (13, 14))],
 [('清华', 'NR_pos', (0, 2)),
  ('大学', 'NN_pos', (2, 4)),
  ('位于', 'VV_pos', (4, 6)),
  ('海淀区', 'NR_pos', (6, 9)),
  ('五道口', 'NT_pos', (9, 12)),
  ('。', 'PU_pos', (12, 13))]]