In [1]:
# 《自然语言处理入门》2.9 准确率评测
import os
import re
import time

import hanlp
import jieba
import thulac
from snownlp import SnowNLP
from pyhanlp import *

from test_utility import ensure_data

In [2]:
def to_region(segmentation: str) -> list:
    """
    Convert a whitespace-delimited segmentation into character regions.

    Each word becomes a half-open ``(start, end)`` interval whose offsets
    count characters of the sentence with the separators removed, so two
    segmentations of the same sentence can be compared as sets of intervals.

    :param segmentation: segmented sentence, words separated by whitespace,
        e.g. "商品 和 服务"
    :return: list of ``(start, end)`` tuples in sentence order, e.g.
        [(0, 2), (2, 3), (3, 5)]; an empty list for an empty or
        whitespace-only line
    """
    text = segmentation.strip()
    if not text:
        # Splitting '' would yield [''] and therefore a bogus (0, 0) region
        # that counts as a "word" in every blank line.
        return []

    region = []
    start = 0
    for word in re.split(r"\s+", text):
        end = start + len(word)
        region.append((start, end))
        start = end
    return region

In [3]:
def prf(gold: str, pred: str) -> tuple:
    """
    Compute word-level precision, recall and F1 of a segmentation.

    The two files are read in parallel, line by line. Each line is converted
    to a set of word regions with ``to_region``; the scores are computed from
    the region counts accumulated over all lines.

    :param gold: path of the gold-standard file, one segmented sentence per
        line, e.g. "商品 和 服务"
    :param pred: path of the segmenter output file, one segmented sentence
        per line, e.g. "商品 和服 务"
    :return: ``(P, R, F1)`` as percentages; a score whose denominator is
        zero is reported as 0.0 instead of raising ZeroDivisionError
    """
    gold_total, pred_total, correct_total = 0, 0, 0
    with open(gold, encoding='utf-8') as gold_file, open(pred, encoding='utf-8') as pred_file:
        # zip stops at the shorter file, so surplus trailing lines in either
        # file are ignored.
        for gold_line, pred_line in zip(gold_file, pred_file):
            gold_regions = set(to_region(gold_line))
            pred_regions = set(to_region(pred_line))
            gold_total += len(gold_regions)
            pred_total += len(pred_regions)
            # A predicted word is correct only if both of its boundaries
            # match a gold word exactly.
            correct_total += len(gold_regions & pred_regions)

    precision = correct_total / pred_total * 100 if pred_total else 0.0
    recall = correct_total / gold_total * 100 if gold_total else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1

In [4]:
# Directory of the SIGHAN 2005 bakeoff corpus (icwb2-data); the returned value
# is used as the base directory for the corpus files below.
# NOTE(review): ensure_data presumably downloads and unpacks the archive when
# it is not present locally -- confirm in test_utility.
sighan05 = ensure_data(
    'icwb2-data',
    'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip',
)

## 使用 msr 语料库（另外还有 'cityu'、'as'、'pku' 可选）

In [5]:
# Paths of the MSR files inside the corpus directory.
gold_dir = os.path.join(sighan05, 'gold')
testing_dir = os.path.join(sighan05, 'testing')

msr_dict = os.path.join(gold_dir, 'msr_training_words.utf8')
msr_test = os.path.join(testing_dir, 'msr_test.utf8')
msr_output = os.path.join(testing_dir, 'msr_output.txt')  # segmenter output is written here
msr_gold = os.path.join(gold_dir, 'msr_test_gold.utf8')   # gold-standard segmentation

# Show the resolved locations, one per line.
for path in (msr_dict, msr_test, msr_output, msr_gold):
    print(path)

C:\Users\David\anaconda3\lib\site-packages\pyhanlp\static\data\test\icwb2-data\gold\msr_training_words.utf8
C:\Users\David\anaconda3\lib\site-packages\pyhanlp\static\data\test\icwb2-data\testing\msr_test.utf8
C:\Users\David\anaconda3\lib\site-packages\pyhanlp\static\data\test\icwb2-data\testing\msr_output.txt
C:\Users\David\anaconda3\lib\site-packages\pyhanlp\static\data\test\icwb2-data\gold\msr_test_gold.utf8


In [6]:
class Segment:
    """
    Adapters that give several Chinese word segmenters one call shape.

    Every method takes a raw sentence and returns the segmented sentence as a
    single string. ``hanlp_segment`` and ``thulac_segment`` additionally take
    the already-loaded tokenizer as their first argument; the other methods
    take only the sentence.
    """

    # pkuseg model shared by all instances; built on the first
    # pkuseg_segment call so the model is not reloaded for every sentence.
    _pkuseg_model = None

    def jieba_segment(self, sentence):
        """Segment with ``jieba.cut(..., cut_all=False)``; words joined by a space."""
        return ' '.join(jieba.cut(sentence, cut_all=False))

    def hanlp_segment(self, han_tokenizer, sentence):
        """
        Segment with a HanLP tokenizer.

        :param han_tokenizer: callable mapping a sentence to an iterable of words
        :param sentence: raw sentence
        :return: words joined by a space
        """
        return ' '.join(han_tokenizer(sentence))

    def thulac_segment(self, thu, sentence):
        """
        Segment with a loaded THULAC model.

        :param thu: object exposing ``cut(sentence, text=True)``
        :param sentence: raw sentence
        :return: the text returned by ``thu.cut``
        """
        # With text=True THULAC's cut() is documented to return a string, so
        # the result is returned as-is; joining that string's characters
        # (with '' or anything else) is not needed.
        return thu.cut(sentence, text=True)

    def pkuseg_segment(self, sentence):
        """Segment with pkuseg (``postag=False``); words joined by a space."""
        # Imported here because the notebook's import cell does not import
        # pkuseg; this also keeps the package optional for other segmenters.
        import pkuseg

        if Segment._pkuseg_model is None:
            Segment._pkuseg_model = pkuseg.pkuseg(postag=False)
        return ' '.join(Segment._pkuseg_model.cut(sentence))

    def pynlpir_segment(self, sentence):
        """Segment with PyNLPIR (``pos_tagging=False``); words joined by a space."""
        # Imported here because the notebook's import cell does not import
        # pynlpir.
        import pynlpir

        pynlpir.open()
        try:
            words = pynlpir.segment(sentence, pos_tagging=False)
        finally:
            # Always release NLPIR, even when segmentation raises.
            pynlpir.close()
        return ' '.join(words)

    def snownlp_segment(self, sentence):
        """Segment with SnowNLP (``SnowNLP(sentence).words``); words joined by a space."""
        return ' '.join(SnowNLP(sentence).words)

## 选择要使用的中文分词包：'hanlp'、'jieba'、'snownlp'、'pynlpir'、'pkuseg'、'thulac'

In [7]:
# Segmenter to evaluate: one of
# 'hanlp', 'jieba', 'snownlp', 'pynlpir', 'pkuseg', 'thulac'.
# The resulting string is the name of the matching Segment method.
segment_func = ''.join(['thulac', '_segment'])
print(segment_func)

thulac_segment


In [8]:
# Load the THULAC model only when THULAC is the selected segmenter; `thu`
# stays undefined for every other choice.
uses_thulac = 'thulac' in segment_func
if uses_thulac:
    thu = thulac.thulac(seg_only=True, deli=' ')

Model loaded succeed


In [9]:
# Segment the corpus with the selected tool and time the run.
# The input sentences are the gold lines with all whitespace removed, so the
# output file lines up with the gold file line by line.

# Look up the adapter method by name once instead of once per line.
segmenter = getattr(Segment(), segment_func)
# Only thulac_segment receives the model loaded into `thu`; the sentence-only
# adapters (jieba, snownlp, pkuseg, pynlpir) get no extra argument.
# hanlp_segment also expects a tokenizer first: none is created in the cells
# shown here, so load one and add it to tokenizer_args before selecting it.
tokenizer_args = (thu,) if 'thulac' in segment_func else ()
whitespace = re.compile("\\s+")

start = time.perf_counter()
with open(msr_gold, encoding='utf-8') as test, open(msr_output, 'w', encoding='utf-8') as output:
    for line in test:
        raw_sentence = whitespace.sub("", line)
        output.write(segmenter(*tokenizer_args, raw_sentence))
        output.write("\n")
end = time.perf_counter()


In [10]:
# Wall-clock time of the segmentation loop, in seconds (perf_counter delta).
elapsed = end - start
print('分词运行时间：', elapsed)

# Score the written output against the gold file; prf returns (P, R, F1).
scores = prf(msr_gold, msr_output)
print("P:%.2f \nR:%.2f \nF1:%.2f " % scores)

分词运行时间： 4.750853299999999
P:83.38 
R:87.86 
F1:85.56 
