In [1]:
import zipfile

from pyhanlp import *
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH

In [2]:
def test_data_path():
    """
    获取测试数据路径，位于$root/data/test，根目录由配置文件指定。
    :return:
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    if not os.path.isdir(data_path):
        os.mkdir(data_path)
    return data_path


def ensure_data(data_name, data_url):
    """
    Make sure a test resource is available locally and return its path.

    Nothing is downloaded when ``<test data dir>/<data_name>`` already exists.
    Otherwise ``data_url`` is fetched: a URL ending in ``.zip`` is saved as
    ``<data_name>.zip``, unpacked into the test-data directory and the archive
    is removed afterwards; any other URL is saved directly as ``<data_name>``.

    :param data_name: name of the resource inside the test-data directory
    :param data_url: URL the resource is downloaded from
    :return: ``<test data dir>/<data_name>``
    """
    data_root = test_data_path()
    target = os.path.join(data_root, data_name)
    if os.path.exists(target):
        return target

    if not data_url.endswith('.zip'):
        # Plain resource: store it straight under its final name.
        download(data_url, target)
        return target

    # Archive: download next to the target, unpack into the data root,
    # then delete the zip so only the extracted content remains.
    archive_path = target + '.zip'
    download(data_url, archive_path)
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(data_root)
    remove_file(archive_path)
    return target

In [3]:
# Java classes for the word-vector and document-vector models.
WordVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.WordVectorModel')
DocVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.DocVectorModel')

# Fetch the hanlp-wiki-vec-zh resource if it is not present yet and point
# at the vector text file inside it.
model_dir = ensure_data('hanlp-wiki-vec-zh', 'http://hanlp.linrunsoft.com/release/model/hanlp-wiki-vec-zh.zip')
model_path = os.path.join(model_dir, 'hanlp-wiki-vec-zh.txt')
word2vec = WordVectorModel(model_path)
doc2vec = DocVectorModel(word2vec)

# Register every sample sentence under its list index so that a result key
# can be mapped back to the sentence below.
docs = [
    "山东苹果丰收",
    "农民在江苏种水稻",
    "奥运会女排夺冠",
    "世界锦标赛胜出",
    "中国足球失败",
]
for idx, doc in enumerate(docs):
    doc2vec.addDocument(idx, doc)

# Entries the word model reports as nearest to the query word.
print(word2vec.nearest('语言'))

# Registered documents reported as nearest to the query sentence, with score.
for res in doc2vec.nearest('我要看比赛'):
    matched_doc = docs[res.getKey().intValue()]
    score = res.getValue().floatValue()
    print('%s = %.2f' % (matched_doc, score))

[语法=0.68476176, 方言=0.6454435, 词汇=0.6102719, 文字=0.60930157, 语系=0.6036772, 口语=0.59481555, 汉语=0.58868086, 语言学=0.57250684, 母语=0.56637216, 语族=0.56171715]
奥运会女排夺冠 = 0.55
世界锦标赛胜出 = 0.53
中国足球失败 = 0.13
山东苹果丰收 = -0.05
农民在江苏种水稻 = -0.07


# 语义距离

In [4]:
# Semantic distance
CoreSynonymDictionary = JClass("com.hankcs.hanlp.dictionary.CoreSynonymDictionary")

word_array = [
    "香蕉", "苹果", "白菜", "水果", "蔬菜",
    "自行车", "公交车", "飞机",
    "买", "卖", "购入",
    "新年", "春节",
    "丢失", "补办", "办理", "送给", "寻找",
    "孩子", "教室", "教师", "会计",
]

# Header row (followed by a blank line), then one row for every ordered
# pair of words, including each word paired with itself.
print("%-5s\t%-5s\t%-10s\t%-5s\n" % ("词A", "词B", "语义距离", "语义相似度"))
row_format = "%-5s\t%-5s\t%-15d\t%-5.10f"
for a in word_array:
    for b in word_array:
        pair_distance = CoreSynonymDictionary.distance(a, b)
        pair_similarity = CoreSynonymDictionary.similarity(a, b)
        print(row_format % (a, b, pair_distance, pair_similarity))

词A   	词B   	语义距离      	语义相似度

香蕉   	香蕉   	0              	1.0000000000
香蕉   	苹果   	19980          	0.9999997311
香蕉   	白菜   	2628369        	0.9999646244
香蕉   	水果   	32967          	0.9999995563
香蕉   	蔬菜   	2630367        	0.9999645975
香蕉   	自行车  	1854515628     	0.9750398066
香蕉   	公交车  	1854535619     	0.9750395376
香蕉   	飞机   	1857307833     	0.9750022259
香蕉   	买    	39729797433    	0.4652709248
香蕉   	卖    	39729897333    	0.4652695802
香蕉   	购入   	39729797433    	0.4652709248
香蕉   	新年   	4981789224     	0.9329493801
香蕉   	春节   	4981789224     	0.9329493801
香蕉   	丢失   	46784535633    	0.3703201856
香蕉   	补办   	39205230527    	0.4723311464
香蕉   	办理   	39205222533    	0.4723312540
香蕉   	送给   	40831595534    	0.4504416652
香蕉   	寻找   	41124601233    	0.4464980592
香蕉   	孩子   	6734891367     	0.9093541255
香蕉   	教室   	1548030420     	0.9791648353
香蕉   	教师   	7516908567     	0.8988288432
香蕉   	会计   	7547972472     	0.8984107496
苹果   	香蕉   	19980          	0.9999997311
苹果   	苹果   	0              

# 同义改写

In [5]:
# Synonym rewriting
CoreSynonymDictionary = JClass("com.hankcs.hanlp.dictionary.CoreSynonymDictionary")
text = "这个方法可以利用同义词词典将一段文本改写成意思相似的另一段文本，而且差不多符合语法"
# Print the text as rewritten by the synonym dictionary.
rewritten_text = CoreSynonymDictionary.rewrite(text)
print(rewritten_text)

是法可采取同义词词典将一如既往段子文本改写成意思相似之其余一样截文本，而且大多符合语法
