In [1]:
import jieba
import pyltp
from snownlp import SnowNLP
import os


In [2]:
# Load the LTP model files.
from pyltp import NamedEntityRecognizer, Parser, Postagger, Segmentor

# Directory holding the LTP models. It can be overridden with the
# LTP_DATA_DIR environment variable so the notebook is not tied to one
# machine; the original location is kept as the default.
LTP_DATA_DIR = os.environ.get(
    'LTP_DATA_DIR',
    '/Users/sunhongchao/Documents/craft/Awesome/Zero-Preprocessing/resources/ltp_data_v3.4.0',
)

cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word segmentation model, file name `cws.model`
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model, file name `pos.model`
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # named entity recognition model, file name `ner.model`
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model, file name `parser.model`

# Fail early with a readable message instead of handing a bad path to pyltp.
for _model_path in (cws_model_path, pos_model_path, ner_model_path, par_model_path):
    if not os.path.isfile(_model_path):
        raise FileNotFoundError(
            'LTP model file not found: %s (set the LTP_DATA_DIR environment variable)' % _model_path
        )

# Word segmentation
segmentor = Segmentor()  # create the instance
segmentor.load(cws_model_path)  # load the model

# POS tagging
postagger = Postagger()  # create the instance
postagger.load(pos_model_path)  # load the model

# Named entity recognition
recognizer = NamedEntityRecognizer()  # create the instance
recognizer.load(ner_model_path)  # load the model

# Dependency parsing
parser = Parser()   # create the instance
parser.load(par_model_path)   # load the model

In [3]:
def deal_ner(words_list, postags_list, netags_list, ner_type):
    """Collect the named entities of one type from a tagged sentence.

    Args:
        words_list: Tokens of the sentence, in order.
        postags_list: POS tag of each token. Not used by the extraction; kept
            so existing callers keep working.
        netags_list: NER tag of each token, e.g. 'O', 'S-Nh', 'B-Ni', 'I-Ni',
            'E-Ni'. Must be at least as long as ``words_list``.
        ner_type: Entity type to extract: 'name', 'org', 'loc' or 'date'.

    Returns:
        The matching entities in sentence order. The tokens of a multi-token
        entity (B- followed by I-/E- tags) are concatenated without a
        separator.

    Raises:
        ValueError: If ``ner_type`` is not one of the supported values.
    """
    # 'Ns' is the place tag: the sample run in this notebook tags '北京' as
    # 'S-Ns' and '姚明' as 'S-Nh'.
    # NOTE(review): that same run tags '08年' as 'O', so the recognizer
    # presumably never emits a date tag and 'date' will return an empty
    # list - confirm against the LTP tag set before relying on it.
    tag_by_type = {
        'name': 'Nh',
        'org': 'Ni',
        'loc': 'Ns',
        'date': 'Nt',
    }
    if ner_type not in tag_by_type:
        raise ValueError(
            'unsupported ner_type %r; expected one of %s'
            % (ner_type, sorted(tag_by_type))
        )
    _tag = tag_by_type[ner_type]

    single_tag = 'S-' + _tag
    begin_tag = 'B-' + _tag
    inside_tag = 'I-' + _tag
    end_tag = 'E-' + _tag

    entities = []
    total = len(words_list)
    for i in range(total):
        label = netags_list[i]
        if label == single_tag:
            entities.append(words_list[i])
        elif label == begin_tag:
            # Scan the unfiltered sequence so that only directly adjacent
            # tokens are joined into one entity.
            parts = [words_list[i]]
            j = i + 1
            while j < total and netags_list[j] == inside_tag:
                parts.append(words_list[j])
                j += 1
            # An E- tag closes the entity; nothing after it belongs to it.
            if j < total and netags_list[j] == end_tag:
                parts.append(words_list[j])
            entities.append(''.join(parts))

    return entities


In [4]:
def query_deal(input_str:str):
    """Run the loaded LTP models over one sentence and print each stage.

    The sentence goes through the module-level ``segmentor``, ``postagger``,
    ``recognizer`` and ``parser`` objects in that order. The result of every
    stage is printed as it becomes available.

    Args:
        input_str: The sentence to analyse.

    Returns:
        A tuple ``(words_list, postags_list, netags_list)`` holding the
        segmentation, POS tagging and NER results as Python lists.
    """
    # Word segmentation; keep the raw result for the later stages.
    tokens = segmentor.segment(input_str)
    token_strs = list(tokens)
    print('word list', token_strs)

    # POS tagging on the segmented tokens.
    pos_tags = postagger.postag(tokens)
    pos_strs = list(pos_tags)
    print('pos list', pos_strs)

    # Named entity recognition needs both the tokens and their POS tags.
    ner_tags = recognizer.recognize(tokens, pos_tags)
    ner_strs = list(ner_tags)
    print('ner list', ner_strs)

    # Dependency parsing; print one "head: relation" entry per arc.
    # Example inputs left by the author:
    #   words = ['元芳', '你', '怎么', '看'], postags = ['nh', 'r', 'r', 'v']
    arcs = parser.parse(tokens, pos_tags)
    arc_descriptions = ['%d: %s' % (arc.head, arc.relation) for arc in arcs]
    print('\t'.join(arc_descriptions))

    # TODO (ideas noted by the author, not implemented):
    #   - keyword extraction, e.g. SnowNLP(input_str).keywords(5)
    #   - compressing long sentences (len(input_str) > 20)
    #   - measuring the information content of the query
    #   - question-to-question similarity, to find similar questions

    return token_strs, pos_strs, ner_strs


In [5]:
# Run the pipeline on a sample sentence; `results` is the
# (words, POS tags, NER tags) tuple returned by query_deal.
results = query_deal('姚明在08年北京奥运会上担任旗手')

word list ['姚明', '在', '08年', '北京', '奥运会', '上', '担任', '旗手']
pos list ['nh', 'p', 'nt', 'ns', 'j', 'nd', 'v', 'n']
ner list ['S-Nh', 'O', 'O', 'S-Ns', 'O', 'O', 'O', 'O']
7: SBV	7: ADV	5: ATT	5: ATT	6: ATT	2: POB	0: HED	7: VOB


In [6]:
def ltp_release():
    """Release the four module-level LTP models.

    Calls ``release()`` on ``segmentor``, ``postagger``, ``recognizer`` and
    ``parser``, in that order.
    """
    for model in (segmentor, postagger, recognizer, parser):
        model.release()  # release the model

In [None]:
# Keywords
# NOTE(review): SnowNLP is already imported in the first cell, so this
# repeated import is redundant.
from snownlp import SnowNLP

s = SnowNLP('自然语言处理是计算机科学领域与人工智能领域中的一个重要方向')
s_key = s.keywords(5)  # presumably the top 5 keywords of the sentence - confirm against the SnowNLP docs
print(s_key)