In [1]:
import os
import jieba
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import reduce
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

In [2]:
# Path of the text-format word-vector file handed to read_vectors().
WORD_VEC_PATH = '/efs/downloads/word_vector/sgns.baidubaike.bigram-char'
# Passed to read_vectors() as `topn`: the maximum number of vectors to load
# (also used there as the progress-bar total).
LINE_NUM = 635975

# Pickle file handed to read_paddle_bow_vectors(), which reads its
# 'embedding' and 'w2i' entries.
PADDLE_BOW_WORD_VEC_PKL = '../data/paddle_models/sim_net/word_vec/paddle_bow_wv.pkl'

In [3]:
def read_vectors(path, topn):
    """Load up to ``topn`` word vectors from a text embedding file.

    The first line of the file is a header whose second whitespace-separated
    field is the vector dimension.  Every following line has the form
    ``word v1 v2 ...`` with fields separated by single spaces.  The file is
    decoded as UTF-8 and undecodable bytes are dropped (``errors='ignore'``).

    Blank lines are skipped: they carry no word and no vector, and must not
    be stored as an empty-string word with a zero-length vector, nor count
    toward ``topn``.

    Args:
        path: Path of the embedding text file.
        topn: Maximum number of vectors to read; ``0`` reads the whole file.

    Returns:
        A tuple ``(vectors, iw, wi, dim)`` where ``vectors`` maps each word to
        its numpy vector, ``iw`` lists the words in file order, ``wi`` maps
        each word to its position in ``iw`` and ``dim`` is the dimension
        announced by the header (``0`` when the file is empty).

    Raises:
        IndexError / ValueError: if the header or a vector line is malformed.
    """
    dim = 0
    vectors = {}
    iw = []
    wi = {}
    with open(path, encoding='utf-8', errors='ignore') as f:
        # topn == 0 means "no limit", so the progress total is unknown.
        for line_no, line in enumerate(tqdm(f, total=topn or None)):
            if line_no == 0:
                # Header line: "<count> <dim>".
                dim = int(line.rstrip().split()[1])
                continue
            stripped = line.rstrip()
            if not stripped:
                # Blank line: nothing to parse.
                continue
            tokens = stripped.split(' ')
            word = tokens[0]
            vectors[word] = np.asarray([float(x) for x in tokens[1:]])
            # A repeated word keeps the index of its last occurrence.
            wi[word] = len(iw)
            iw.append(word)
            if topn != 0 and len(iw) >= topn:
                break
    return vectors, iw, wi, dim


def read_paddle_bow_vectors(pkl):
    """Load an embedding table and its vocabulary from a pickle file.

    The pickled object must be subscriptable with the keys ``'embedding'``
    and ``'w2i'``.

    NOTE: ``pickle.load`` can execute arbitrary code, so only point this at
    files from a trusted source.

    Args:
        pkl: Path of the pickle file.

    Returns:
        A tuple ``(vectors, wi, dim)``: the ``'embedding'`` entry, the
        ``'w2i'`` entry, and the length of the first embedding row.
    """
    with open(pkl, 'rb') as handle:
        payload = pickle.load(handle)
    embedding, word_to_index = payload['embedding'], payload['w2i']
    # The dimension is taken from the first row of the embedding table.
    embedding_dim = len(embedding[0])
    return embedding, word_to_index, embedding_dim

In [4]:
# Load the word vectors into module-level globals.
# NOTE: `vecotrs` is a misspelling of "vectors", but wv_cos_sim() and
# wv_contractive_sim() look this exact name up as a global, so it can only be
# renamed together with them.
vecotrs, iw, wi, dim = read_vectors(WORD_VEC_PATH, LINE_NUM)

100%|██████████| 635975/635975 [00:59<00:00, 10658.30it/s]


In [5]:
# Load the pickled embedding table, vocabulary and dimension into the globals
# read by paddle_wv_cos_sim() and paddle_wv_contractive_sim().
paddle_vectors, paddle_wi, paddle_dim = read_paddle_bow_vectors(PADDLE_BOW_WORD_VEC_PKL)

In [6]:
def wv_cos_sim(text1, text2):
    """Score two texts by the cosine similarity of their summed word vectors.

    Both texts are tokenised with ``jieba.lcut``.  Tokens missing from the
    global vocabulary ``wi`` are dropped; the vectors of the remaining tokens
    (looked up in the global ``vecotrs``) are added onto a zero vector of
    length ``dim``.  The cosine similarity of the two sums is mapped through
    ``(cos + 1) / 2`` and printed.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A tuple ``(score, cos_sim, cos_dis, words1, words2, ids1, ids2)``:
        the rescaled score, the raw cosine similarity, the cosine distance,
        the token lists of both texts, and the vocabulary ids of the
        in-vocabulary tokens of both texts.
    """
    def sum_known(tokens):
        # Keep in-vocabulary tokens only, then accumulate left to right.
        known = [tok for tok in tokens if tok in wi]
        total = np.zeros(dim)
        for tok in known:
            total = total + vecotrs[tok]
        return [wi[tok] for tok in known], total.reshape(1, -1)

    words1, words2 = jieba.lcut(text1), jieba.lcut(text2)
    ids1, vec1 = sum_known(words1)
    ids2, vec2 = sum_known(words2)

    cos_sim = cosine_similarity(vec1, vec2)[0, 0]
    cos_dis = cosine_distances(vec1, vec2)[0, 0]
    score = (cos_sim + 1.0) / 2.0
    print("wv_cos_score:", score)
    return score, cos_sim, cos_dis, words1, words2, ids1, ids2

def wv_contractive_sim(text1, text2):
    """Score two texts by a normalised distance between summed word vectors.

    Both texts are tokenised with ``jieba.lcut``.  Tokens missing from the
    global vocabulary ``wi`` are dropped; the vectors of the remaining tokens
    (looked up in the global ``vecotrs``) are added onto a zero vector of
    length ``dim``.  The printed and returned score is
    ``1 - ||v1 - v2|| / (||v1|| + ||v2|| + 1e-5)``.

    When neither text has an in-vocabulary token both sums are zero vectors
    and the score is 1.0.

    NOTE(review): "contractive" is presumably meant as "contrastive" - confirm.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A tuple ``(score, words1, words2, ids1, ids2)``: the score, the token
        lists of both texts, and the vocabulary ids of the in-vocabulary
        tokens of both texts.
    """
    def sum_known(tokens):
        # Keep in-vocabulary tokens only, then accumulate left to right.
        known = [tok for tok in tokens if tok in wi]
        total = np.zeros(dim)
        for tok in known:
            total = total + vecotrs[tok]
        return [wi[tok] for tok in known], total.reshape(1, -1)

    words1, words2 = jieba.lcut(text1), jieba.lcut(text2)
    ids1, out1 = sum_known(words1)
    ids2, out2 = sum_known(words2)

    # The small constant keeps the denominator non-zero for all-zero sums.
    eps = 1e-5
    gap = np.linalg.norm(out1 - out2, axis=1)
    scale = np.linalg.norm(out1, axis=1) + np.linalg.norm(out2, axis=1) + eps
    score = (1 - gap / scale)[0]

    print("wv_contractive_score:", score)
    return score, words1, words2, ids1, ids2


def paddle_wv_cos_sim(text1, text2):
    """Cosine-similarity score of two texts using the paddle embedding table.

    Both texts are tokenised with ``jieba.lcut``.  Tokens found in the global
    ``paddle_wi`` are mapped to ids, and the rows ``paddle_vectors[id]`` are
    added onto a zero vector of length ``paddle_dim``.  The cosine similarity
    of the two sums is mapped through ``(cos + 1) / 2`` and printed.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A tuple ``(score, cos_sim, cos_dis, words1, words2, ids1, ids2)``:
        the rescaled score, the raw cosine similarity, the cosine distance,
        the token lists of both texts, and the ids of the in-vocabulary
        tokens of both texts.
    """
    def sum_rows(ids):
        # Accumulate the selected embedding rows left to right.
        total = np.zeros(paddle_dim)
        for idx in ids:
            total = total + paddle_vectors[idx]
        return total.reshape(1, -1)

    words1, words2 = jieba.lcut(text1), jieba.lcut(text2)
    ids1 = [paddle_wi[tok] for tok in words1 if tok in paddle_wi]
    ids2 = [paddle_wi[tok] for tok in words2 if tok in paddle_wi]
    vec1, vec2 = sum_rows(ids1), sum_rows(ids2)

    cos_sim = cosine_similarity(vec1, vec2)[0, 0]
    cos_dis = cosine_distances(vec1, vec2)[0, 0]
    score = (cos_sim + 1.0) / 2.0
    print("paddle_wv_cos_score:", score)
    return score, cos_sim, cos_dis, words1, words2, ids1, ids2

def paddle_wv_contractive_sim(text1, text2):
    """Normalised-distance score of two texts using the paddle embedding table.

    Both texts are tokenised with ``jieba.lcut``.  Tokens found in the global
    ``paddle_wi`` are mapped to ids, and the rows ``paddle_vectors[id]`` are
    added onto a zero vector of length ``paddle_dim``.  The printed and
    returned score is ``1 - ||v1 - v2|| / (||v1|| + ||v2|| + 1e-5)``.

    When neither text has an in-vocabulary token both sums are zero vectors
    and the score is 1.0.

    NOTE(review): "contractive" is presumably meant as "contrastive" - confirm.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A tuple ``(score, words1, words2, ids1, ids2)``: the score, the token
        lists of both texts, and the ids of the in-vocabulary tokens of both
        texts.
    """
    def sum_rows(ids):
        # Accumulate the selected embedding rows left to right.
        total = np.zeros(paddle_dim)
        for idx in ids:
            total = total + paddle_vectors[idx]
        return total.reshape(1, -1)

    words1, words2 = jieba.lcut(text1), jieba.lcut(text2)
    ids1 = [paddle_wi[tok] for tok in words1 if tok in paddle_wi]
    ids2 = [paddle_wi[tok] for tok in words2 if tok in paddle_wi]
    out1, out2 = sum_rows(ids1), sum_rows(ids2)

    # The small constant keeps the denominator non-zero for all-zero sums.
    eps = 1e-5
    gap = np.linalg.norm(out1 - out2, axis=1)
    scale = np.linalg.norm(out1, axis=1) + np.linalg.norm(out2, axis=1) + eps
    score = (1 - gap / scale)[0]

    print("paddle_wv_contractive_score:", score)
    return score, words1, words2, ids1, ids2

In [7]:
text1 = '高兴'
text2 = '开心'
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.716 seconds.
Prefix dict has been built succesfully.


wv_cos_score: 0.8281019216369938
wv_contractive_score: 0.5852930547954851


paddle_wv_cos_score: 0.6905868668471822
paddle_wv_contractive_score: 0.4424834449251699


# 一般用例

In [8]:
text1 = "我很高兴"
text2 = "我很开心"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9570162769504424
wv_contractive_score: 0.7926704162681197
paddle_wv_cos_score: 0.8808241778335343
paddle_wv_contractive_score: 0.6536591991951604


In [9]:
text1 = "我很开心"
text2 = "我很高兴"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9570162769504424
wv_contractive_score: 0.7926704162681197
paddle_wv_cos_score: 0.8808241778335343
paddle_wv_contractive_score: 0.6536591991951604


In [10]:
text1 = "我很高兴"
text2 = "我特别特别开心"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8448190526511059
wv_contractive_score: 0.5864756499541417
paddle_wv_cos_score: 0.8643486089250102
paddle_wv_contractive_score: 0.5880322239663949


In [11]:
text1 = "我很高兴"
text2 = "我其实觉得自己很开心"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9325086328895044
wv_contractive_score: 0.6104962167177967
paddle_wv_cos_score: 0.8420618349095348
paddle_wv_contractive_score: 0.3993552045187527


In [12]:
text1 = "我特别特别开心"
text2 = "我其实觉得自己很开心"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8589374021662693
wv_contractive_score: 0.5924577994563505
paddle_wv_cos_score: 0.8926304995019502
paddle_wv_contractive_score: 0.5517033243257636


In [13]:
text1 = "我很高兴"
text2 = "我不开心"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9015086602997187
wv_contractive_score: 0.6846698339712085
paddle_wv_cos_score: 0.8109343745959197
paddle_wv_contractive_score: 0.5448094889725493


In [14]:
text1 = "我很高兴"
text2 = "我不高兴"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9501109405497903
wv_contractive_score: 0.7742153197319808
paddle_wv_cos_score: 0.9183226619661914
paddle_wv_contractive_score: 0.6971149910566445


In [15]:
text1 = "我很高兴"
text2 = "我很高兴"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 1.0
wv_contractive_score: 1.0
paddle_wv_cos_score: 0.9999999999999999
paddle_wv_contractive_score: 1.0


In [16]:
text1 = "我很伤心"
text2 = "我很难过"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.967386685677459
wv_contractive_score: 0.8184265749242942
paddle_wv_cos_score: 0.9031836876042222
paddle_wv_contractive_score: 0.6882917583529546


In [17]:
text1 = "真好"
text2 = "不错"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.7265842940035037
wv_contractive_score: 0.4696840389546333
paddle_wv_cos_score: 0.772969949751045
paddle_wv_contractive_score: 0.5123438581363968


In [18]:
text1 = "高兴"
text2 = "开心"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8281019216369938
wv_contractive_score: 0.5852930547954851
paddle_wv_cos_score: 0.6905868668471822
paddle_wv_contractive_score: 0.4424834449251699


In [19]:
text1 = "大家好才是真的好"
text2 = "大家好才是真的好"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9999999999999999
wv_contractive_score: 1.0
paddle_wv_cos_score: 1.0
paddle_wv_contractive_score: 1.0


In [20]:
text1 = "为什么能开出腾讯信用却没有微粒贷朋友的没用腾讯信用却有30000的额度呢"
text2 = "我钱包里没有你们这个应用"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8746715977377394
wv_contractive_score: 0.45459313729130923
paddle_wv_cos_score: 0.9145588025993442
paddle_wv_contractive_score: 0.47683956744520983


In [21]:
text1 = "我也不知道"
text2 = "好吧"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.752343381049728
wv_contractive_score: 0.46237874882420593
paddle_wv_cos_score: 0.6458158955009841
paddle_wv_contractive_score: 0.39704336869787493


In [22]:
text1 = "深度学习"
text2 = "机器学习"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8077021822927694
wv_contractive_score: 0.5613108826953594
paddle_wv_cos_score: 0.8072427005214211
paddle_wv_contractive_score: 0.5609382808216271


In [23]:
text1 = "机器学习"
text2 = "深度学习"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8077021822927694
wv_contractive_score: 0.5613108826953594
paddle_wv_cos_score: 0.8072427005214211
paddle_wv_contractive_score: 0.5609382808216271


In [24]:
text1 = "人民团体是什么"
text2 = "人民团体是指"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9205896814384116
wv_contractive_score: 0.7181795327040212
paddle_wv_cos_score: 0.9451578094564517
paddle_wv_contractive_score: 0.7294826528760326


# 百度-车牌，不太一致

In [25]:
text1 = "车头如何放置车牌"
text2 = "前牌照怎么装"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8472181322669425
wv_contractive_score: 0.6088366956951178
paddle_wv_cos_score: 0.9054306614122677
paddle_wv_contractive_score: 0.688717326916612


In [26]:
text1 = "车头如何放置车牌"
text2 = "如何办理北京车牌"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8525999363399437
wv_contractive_score: 0.6160159198514521
paddle_wv_cos_score: 0.9022443510648442
paddle_wv_contractive_score: 0.6853736754468566


In [27]:
text1 = "车头如何放置车牌"
text2 = "后牌照怎么装"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8482941668533275
wv_contractive_score: 0.6100026239017078
paddle_wv_cos_score: 0.9024062911295447
paddle_wv_contractive_score: 0.6726822485266621


# 百度-信号

In [28]:
text1 = "信号忽强忽弱"
text2 = "信号忽高忽低"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9456504264480937
wv_contractive_score: 0.7515324679204718
paddle_wv_cos_score: 1.0
paddle_wv_contractive_score: 1.0


In [29]:
text1 = "信号忽强忽弱"
text2 = "信号忽左忽右"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9473848671198349
wv_contractive_score: 0.7653444985677005
paddle_wv_cos_score: 1.0
paddle_wv_contractive_score: 1.0


In [30]:
text1 = "信号忽强忽弱"
text2 = "信号忽然中断"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.891666217443357
wv_contractive_score: 0.6039585816425094
paddle_wv_cos_score: 0.8336431249447759
paddle_wv_contractive_score: 0.3138190207472159


# 百度-机器学习

In [31]:
text1 = "如何学好深度学习"
text2 = "深入学习习近平讲话材料"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8229907739842071
wv_contractive_score: 0.5743127562808652
paddle_wv_cos_score: 0.7769436355093958
paddle_wv_contractive_score: 0.5229830811350675


In [32]:
text1 = "如何学好深度学习"
text2 = "机器学习教程"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8474467180839824
wv_contractive_score: 0.5991109928331003
paddle_wv_cos_score: 0.8638531935037835
paddle_wv_contractive_score: 0.5751737897625182


In [33]:
text1 = "如何学好深度学习"
text2 = "人工智能教程"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.7548484959727588
wv_contractive_score: 0.4785972751606392
paddle_wv_cos_score: 0.844388527200332
paddle_wv_contractive_score: 0.5473747426721861


# 百度-香蕉的翻译，偏小但排序一致

In [34]:
text1 = "香蕉的翻译"
text2 = "香蕉用英文怎么说"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8269175714674947
wv_contractive_score: 0.5433842670117862
paddle_wv_cos_score: 0.9121909132112216
paddle_wv_contractive_score: 0.5963063373215077


In [35]:
text1 = "香蕉的翻译"
text2 = "香蕉怎么吃"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8145810622602462
wv_contractive_score: 0.5659560763843421
paddle_wv_cos_score: 0.86359811523314
paddle_wv_contractive_score: 0.616648138592799


In [36]:
text1 = "香蕉的翻译"
text2 = "桔子用英文怎么说"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.785925220470842
wv_contractive_score: 0.5031901241093717
paddle_wv_cos_score: 0.8170505774207248
paddle_wv_contractive_score: 0.5142364377380706


# 百度-腹泻，排序有差别

In [37]:
text1 = "小儿腹泻偏方"
text2 = "宝宝拉肚子偏方"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9021876863102303
wv_contractive_score: 0.6811934693561489
paddle_wv_cos_score: 0.9498422633173413
paddle_wv_contractive_score: 0.7653954485854974


In [38]:
text1 = "小儿腹泻偏方"
text2 = "小儿感冒偏方"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9610365368862377
wv_contractive_score: 0.8023364923833134
paddle_wv_cos_score: 0.9411412869373583
paddle_wv_contractive_score: 0.7541986436709069


In [39]:
text1 = "小儿腹泻偏方"
text2 = "腹泻偏方"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9691007194867957
wv_contractive_score: 0.7645992940602572
paddle_wv_cos_score: 0.925622652797353
paddle_wv_contractive_score: 0.7004439253018993


# 百度-LOL，数值偏小，但排序一致

In [40]:
text1 = "英雄联盟好玩吗，怎么升级"
text2 = "lol攻略"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.7544918722999556
wv_contractive_score: 0.3463905024992596
paddle_wv_cos_score: 0.7834562258439148
paddle_wv_contractive_score: 0.40074318366156425


In [41]:
text1 = "英雄联盟好玩吗，怎么升级"
text2 = "英雄联盟服务器升级"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8789936376232452
wv_contractive_score: 0.6199826139033153
paddle_wv_cos_score: 0.8229713240725569
paddle_wv_contractive_score: 0.43666092968399073


In [42]:
text1 = "英雄联盟好玩吗，怎么升级"
text2 = "怎么打好lol"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8182181028852128
wv_contractive_score: 0.4673993151756787
paddle_wv_cos_score: 0.9232834722351844
paddle_wv_contractive_score: 0.5321528133134676


# 百度-红米

In [43]:
text1 = "红米更新出错"
text2 = "红米升级系统出错"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8869212793184333
wv_contractive_score: 0.6442268981822192
paddle_wv_cos_score: 0.9773239201825801
paddle_wv_contractive_score: 0.8492904696591512


In [44]:
text1 = "红米更新出错"
text2 = "红米账户出错"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8171595794987088
wv_contractive_score: 0.5721524632614874
paddle_wv_cos_score: 0.9602569713652505
paddle_wv_contractive_score: 0.799916821496678


In [45]:
text1 = "红米更新出错"
text2 = "如何买到小米手机"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.716509271565022
wv_contractive_score: 0.43367116138509965
paddle_wv_cos_score: 0.8650319952894423
paddle_wv_contractive_score: 0.6271451365206246


# 百度-李彦宏

In [46]:
text1 = "李彦宏是百度公司创始人"
text2 = "百度是李彦宏创办的"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9291079155551757
wv_contractive_score: 0.7315582662717826
paddle_wv_cos_score: 0.9379429434515629
paddle_wv_contractive_score: 0.7426660309177112


In [47]:
text1 = "李彦宏是百度公司创始人"
text2 = "马化腾创办了腾讯公司"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8883458700743044
wv_contractive_score: 0.665848450740703
paddle_wv_cos_score: 0.7815430392282243
paddle_wv_contractive_score: 0.5249245963579848


In [48]:
text1 = "李彦宏是百度公司创始人"
text2 = "姚明是NBA的著名球星"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.7931885427567575
wv_contractive_score: 0.538356102645319
paddle_wv_cos_score: 0.8131982506917126
paddle_wv_contractive_score: 0.5630062827957616


# 百度-中国历史

In [49]:
text1 = "中国有五千年的历史"
text2 = "中国是个历史悠久的国家"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.898408357642802
wv_contractive_score: 0.6310698984037133
paddle_wv_cos_score: 0.8976356059987008
paddle_wv_contractive_score: 0.6748279059922007


In [50]:
text1 = "中国有五千年的历史"
text2 = "中国有很多少数民族"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8610327244620704
wv_contractive_score: 0.623092928340065
paddle_wv_cos_score: 0.8607510944731321
paddle_wv_contractive_score: 0.6231225893634298


In [51]:
text1 = "中国有五千年的历史"
text2 = "中国有13亿人口"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.8244656029589372
wv_contractive_score: 0.5680423119356994
paddle_wv_cos_score: 0.7762823733834808
paddle_wv_contractive_score: 0.5213492407177933


# 百度-北京奥运会，偏小，但数值一致

In [52]:
text1 = "北京成功申办了2008年奥运会"
text2 = "2008年奥运会在北京举行"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9530094119475041
wv_contractive_score: 0.7719337459520468
paddle_wv_cos_score: 0.8978380784382165
paddle_wv_contractive_score: 0.6595879107257957


In [53]:
text1 = "北京成功申办了2008年奥运会"
text2 = "伦敦奥运会在2012年举行"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.9237447994267305
wv_contractive_score: 0.718513434592359
paddle_wv_cos_score: 0.7929355569880036
paddle_wv_contractive_score: 0.5412258281259482


In [54]:
text1 = "北京成功申办了2008年奥运会"
text2 = "东京奥运会即将举办"
_ = wv_cos_sim(text1, text2)
# print('\n')
_ = wv_contractive_sim(text1, text2)
# print('\n')
_ = paddle_wv_cos_sim(text1, text2)
# print('\n')
_ = paddle_wv_contractive_sim(text1, text2)

wv_cos_score: 0.882547947410617
wv_contractive_score: 0.6028934758641549
paddle_wv_cos_score: 0.7931222344721893
paddle_wv_contractive_score: 0.5193583901508702
