In [1]:
import sys
# Add the parent directory to the module search path; presumably this is
# what lets the `textpair` imports in the next cell resolve when the notebook
# is run from its own folder — confirm against the repository layout.
sys.path.append('../')

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell rather than only the
# last one; the comparison cells further down rely on this to show both the
# cosine score and the contractive score.
InteractiveShell.ast_node_interactivity = 'all'

import numpy as np

from textpair.single.paddle_bow import PaddleBowSim
from textpair.single.ann import Ann
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Inputs for the PaddleBowSim constructor called in the next cell.
# Judging only by the file names: a term-to-id vocabulary and the directory of
# a pretrained pairwise SimNet bag-of-words model — TODO confirm.
VOCAB_FILE = '../data/paddle_models/sim_net/data/term2id.dict'
MODEL_PATH = '../data/paddle_models/sim_net/model_files/simnet_bow_pairwise_pretrained_model/'

In [4]:
# Build the scorer once. Every later cell calls it as sim(left_ann, right_ann)
# and reads the "score" / "left_vec" entries of the result.
sim = PaddleBowSim(MODEL_PATH, VOCAB_FILE)

In [5]:
# First test pair: "What is a people's organization" vs.
# "A people's organization refers to". Each text is wrapped in an Ann, the
# input type that `sim` is called with below.
text1 = "人民团体是什么"
text2 = "人民团体是指"
ann1 = Ann(text1)
ann2 = Ann(text2)

In [6]:
# Score (text1, text2) and keep both the model's own score and the vector it
# returns under "left_vec" (presumably the embedding of the first/left
# argument — the check in cell 13 is consistent with that).
res1 = sim(ann1, ann2)
score1 = res1["score"]
vec1 = res1["left_vec"]

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.694 seconds.
Prefix dict has been built succesfully.


In [7]:
score1

0.9515146315097809

In [8]:
# Rough throughput check: 3000 rounds of building both Ann objects and scoring
# them, with tqdm reporting the iteration rate.
# Note: this rebinds the notebook-level ann1/ann2 (to the same texts) and
# leaves `res` and `i` behind; neither is read by a later cell.
for i in tqdm(range(3000)):
    ann1 = Ann(text1)
    ann2 = Ann(text2)
    res = sim(ann1, ann2)

100%|██████████| 3000/3000 [00:04<00:00, 675.17it/s]


In [9]:
vec1

array([[-1.70560837e-01, -3.74650657e-02,  5.14803767e-01,
        -8.30973387e-02, -6.99905083e-02,  2.11101487e-01,
         2.78322905e-01,  1.80265188e-01,  3.15677524e-01,
        -2.06045508e-02,  6.17800474e-01,  3.22973356e-04,
         4.44537103e-02,  3.98693144e-01,  1.59228556e-02,
         4.28976625e-01,  3.96663427e-01, -7.29420111e-02,
        -9.72113460e-02, -1.44232795e-01,  5.54061159e-02,
         4.19091910e-01, -2.40897864e-01,  2.23761261e-01,
         2.55655229e-01,  4.06692475e-02,  5.28913081e-01,
         8.28170657e-01,  2.57944703e-01,  5.23537576e-01,
         3.37901026e-01,  2.48735040e-01, -5.66174865e-01,
         7.18245506e-02, -6.48569405e-01,  7.92640269e-01,
         5.29523551e-01, -4.94030625e-01,  3.55076492e-02,
        -2.81550527e-01, -2.68068910e-01,  9.48423445e-02,
         4.47783560e-01, -4.00970876e-02,  4.84471083e-01,
        -2.65492916e-01,  3.08129162e-01,  2.60512233e-02,
         1.25101686e-01, -1.20797649e-01, -5.25810309e-0

In [10]:
# Same pair with the arguments swapped, to compare the score and the left
# vector against cell 6.
# Caution: from here on ann1 wraps text2 and ann2 wraps text1, which is why
# the ann1.text / ann1.ptext / ann1.ares cells below show the second text.
ann1 = Ann(text2)
ann2 = Ann(text1)
res2 = sim(ann1, ann2)
score2 = res2["score"]
vec2 = res2["left_vec"]

In [11]:
score2

0.9515146315097809

In [12]:
vec2

array([[-0.07193196, -0.02850997,  0.41007337, -0.23986799, -0.1817587 ,
         0.38449994,  0.5324818 ,  0.16439936,  0.08232102, -0.10547133,
         0.80433553, -0.07167185,  0.12474293,  0.1335844 ,  0.15730451,
         0.48062217,  0.31346598, -0.12053117,  0.06695841, -0.17203717,
         0.31366545,  0.45438132, -0.05947053,  0.3432764 ,  0.32160527,
        -0.03298597,  0.23279572,  0.961488  ,  0.18464224,  0.4629622 ,
         0.4032657 ,  0.14319557, -0.77499163,  0.42104667, -0.50555253,
         0.671253  ,  0.43920183, -0.2757621 ,  0.15027243, -0.18368083,
        -0.19620834, -0.03260723,  0.45094404,  0.08652598,  0.62859684,
        -0.31752995,  0.21158959, -0.09861019,  0.28990868, -0.10580368,
        -0.18310775, -0.14328662, -0.60351646, -0.13392398,  0.09366333,
        -0.4327727 ,  0.11153603,  0.31390622,  0.01611447, -0.60121864,
        -0.24603459, -0.2323186 , -0.5872388 ,  0.18065348, -0.30484897,
         0.00930198, -0.28493944,  0.3666643 , -0.1

In [13]:
# Recompute the score by hand: cosine similarity of the two left vectors,
# shifted and halved from [-1, 1] to [0, 1]. The next cell compares this with
# the scores returned by `sim` itself.
score_vec1_vec2 = (cosine_similarity(vec1, vec2)[0, 0] + 1) / 2

In [14]:
score1, score2, score_vec1_vec2

(0.9515146315097809, 0.9515146315097809, 0.9515146613121033)

In [15]:
ann1.text

'人民团体是指'

In [16]:
ann1.ptext

'人民团体是指'

In [17]:
ann1.ares

['人民团体', '是', '指']

In [18]:
def contractive_sim(text1, text2):
    """Score two texts by the normalised distance between their vectors.

    Each text is wrapped in an ``Ann`` and handed to the notebook-level
    ``sim`` scorer as the left-hand input; the ``"left_vec"`` entry of the
    result is used as that text's vector. The score is::

        1 - ||a - b|| / (||a|| + ||b|| + 1e-5)

    and equals 1 when the two vectors coincide.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        A ``(label, score)`` tuple: the literal ``'contractive score:'``
        followed by the score computed for the first row of the vectors.
    """
    first_ann, second_ann, filler_ann = Ann(text1), Ann(text2), Ann('你好')

    # The right-hand input is a fixed placeholder; only the left-hand vector
    # of each call is read.
    first_vec = sim(first_ann, filler_ann)['left_vec']
    second_vec = sim(second_ann, filler_ann)['left_vec']

    def row_norms(matrix):
        # Euclidean norm of every row.
        return np.linalg.norm(matrix, axis=1)

    gap = row_norms(first_vec - second_vec)
    # The small constant keeps the denominator non-zero for all-zero vectors.
    scale = row_norms(first_vec) + row_norms(second_vec) + 1e-5
    return 'contractive score:', (1 - gap / scale)[0]


def cos_sim(text1, text2):
    """Score two texts by the rescaled cosine similarity of their vectors.

    Each text is wrapped in an ``Ann`` and handed to the notebook-level
    ``sim`` scorer as the left-hand input; the ``"left_vec"`` entry of the
    result is used as that text's vector.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        A ``(label, score)`` tuple: the literal ``'cos score:'`` followed by
        ``(cosine + 1) / 2``, i.e. the cosine similarity mapped from
        [-1, 1] onto [0, 1].
    """
    ann1 = Ann(text1)
    ann2 = Ann(text2)
    # Fixed placeholder for the right-hand input; only the left-hand vector
    # of each call is read.
    _ann = Ann('你好')
    out1 = sim(ann1, _ann)['left_vec']
    out2 = sim(ann2, _ann)['left_vec']
    # cosine_similarity returns a matrix of pairwise values; [0, 0] is the
    # similarity between the first row of each input.
    score = (cosine_similarity(out1, out2)[0, 0] + 1) / 2.0
    return 'cos score:', score

# 一般用例

In [19]:
# Near-synonyms: "I am very happy" (高兴) vs. "I am very happy" (开心).
text1 = "我很高兴"
text2 = "我很开心"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9455671310424805)

('contractive score:', 0.7657492)

In [20]:
# Same pair as the previous cell with the two texts swapped.
text1 = "我很开心"
text2 = "我很高兴"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9455671310424805)

('contractive score:', 0.7657492)

In [21]:
# "I am very happy" vs. "I am really, really happy" (intensifier repeated).
text1 = "我很高兴"
text2 = "我特别特别开心"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9033189415931702)

('contractive score:', 0.6808901)

In [22]:
# "I am very happy" vs. the wordier "I actually feel that I am quite happy".
text1 = "我很高兴"
text2 = "我其实觉得自己很开心"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8452472686767578)

('contractive score:', 0.60181135)

In [23]:
# The two longer paraphrases from the previous two cells, against each other.
text1 = "我特别特别开心"
text2 = "我其实觉得自己很开心"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8501988351345062)

('contractive score:', 0.61287975)

In [24]:
# Negation with a different adjective: "I am very happy" vs. "I am not happy".
text1 = "我很高兴"
text2 = "我不开心"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8993157148361206)

('contractive score:', 0.68220186)

In [25]:
# Negation of the same adjective: "I am very happy" vs. "I am not happy".
# In the recorded run this opposite-meaning pair scores higher on both
# metrics than the synonym pair in cell 19.
text1 = "我很高兴"
text2 = "我不高兴"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9513366520404816)

('contractive score:', 0.7758317)

In [26]:
# Identical short texts ("I am very happy" twice).
text1 = "我很高兴"
text2 = "我很高兴"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 1.0)

('contractive score:', 1.0)

In [27]:
# Near-synonyms with negative sentiment: "I am very sad" vs. "I feel awful".
text1 = "我很伤心"
text2 = "我很难过"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9376441240310669)

('contractive score:', 0.7447754)

In [28]:
# Very short approvals: "really good" vs. "not bad".
text1 = "真好"
text2 = "不错"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9662007093429565)

('contractive score:', 0.8124738)

In [29]:
# Single-word synonyms: both mean "happy".
text1 = "高兴"
text2 = "开心"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9407589435577393)

('contractive score:', 0.7565402)

In [30]:
# Identical longer texts ("It is only truly good when everyone is well").
text1 = "大家好才是真的好"
text2 = "大家好才是真的好"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 1.0)

('contractive score:', 1.0)

In [31]:
# Long vs. short, loosely related customer-service queries:
# "Why can I open Tencent Credit but have no Weilidai, while a friend who
# does not use Tencent Credit has a 30000 limit?" vs.
# "My wallet does not have this app of yours".
text1 = "为什么能开出腾讯信用却没有微粒贷朋友的没用腾讯信用却有30000的额度呢"
text2 = "我钱包里没有你们这个应用"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.6067420840263367)

('contractive score:', 0.3539492)

In [32]:
# Short conversational replies: "I don't know either" vs. "OK then".
text1 = "我也不知道"
text2 = "好吧"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.7849870920181274)

('contractive score:', 0.53630126)

In [33]:
# Related technical terms: "deep learning" vs. "machine learning".
text1 = "深度学习"
text2 = "机器学习"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9327083826065063)

('contractive score:', 0.73363996)

In [34]:
# Same pair as the previous cell with the two texts swapped.
text1 = "机器学习"
text2 = "深度学习"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9327083826065063)

('contractive score:', 0.73363996)

In [35]:
# The pair from cell 5 again ("What is a people's organization" vs.
# "A people's organization refers to"), now through the two helper functions.
text1 = "人民团体是什么"
text2 = "人民团体是指"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9515146613121033)

('contractive score:', 0.778327)

# 百度-车牌，不太一致

In [36]:
# "How to mount the license plate on the front of the car" vs.
# "How to install the front license plate" (paraphrase).
text1 = "车头如何放置车牌"
text2 = "前牌照怎么装"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8534723520278931)

('contractive score:', 0.6164715)

In [37]:
# Same query vs. "How to apply for a Beijing license plate"
# (shares the word for license plate, different intent).
text1 = "车头如何放置车牌"
text2 = "如何办理北京车牌"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8042252361774445)

('contractive score:', 0.5575353)

In [38]:
# Same query vs. "How to install the rear license plate" (front vs. rear).
text1 = "车头如何放置车牌"
text2 = "后牌照怎么装"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8330043852329254)

('contractive score:', 0.59109885)

# 百度-信号

In [39]:
# "Signal is sometimes strong, sometimes weak" vs.
# "Signal goes up and down" (paraphrase).
text1 = "信号忽强忽弱"
text2 = "信号忽高忽低"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9649812877178192)

('contractive score:', 0.81281734)

In [40]:
# Same query vs. "Signal swings left and right"
# (same sentence pattern, different meaning).
text1 = "信号忽强忽弱"
text2 = "信号忽左忽右"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9399107694625854)

('contractive score:', 0.7546192)

In [41]:
# Same query vs. "Signal suddenly cuts out".
text1 = "信号忽强忽弱"
text2 = "信号忽然中断"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8135907053947449)

('contractive score:', 0.5313622)

# 百度-机器学习

In [42]:
# "How to learn deep learning well" vs. "Study Xi Jinping's speech materials
# in depth" — overlapping characters (深/学习), unrelated topic.
text1 = "如何学好深度学习"
text2 = "深入学习习近平讲话材料"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.7102710008621216)

('contractive score:', 0.44851905)

In [43]:
# Same query vs. "Machine learning tutorial".
text1 = "如何学好深度学习"
text2 = "机器学习教程"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8645129799842834)

('contractive score:', 0.63090575)

In [44]:
# Same query vs. "Artificial intelligence tutorial".
text1 = "如何学好深度学习"
text2 = "人工智能教程"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8162215948104858)

('contractive score:', 0.54566514)

# 百度-香蕉的翻译，偏小但排序一致

In [45]:
# "Translation of banana" vs. "How do you say banana in English" (paraphrase).
text1 = "香蕉的翻译"
text2 = "香蕉用英文怎么说"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9050406217575073)

('contractive score:', 0.6917991)

In [46]:
# Same query vs. "How to eat a banana" (same noun, different intent).
text1 = "香蕉的翻译"
text2 = "香蕉怎么吃"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8909212350845337)

('contractive score:', 0.66930497)

In [47]:
# Same query vs. "How do you say tangerine in English"
# (same intent, different noun).
text1 = "香蕉的翻译"
text2 = "桔子用英文怎么说"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8208398520946503)

('contractive score:', 0.5764553)

# 百度-腹泻，排序有差别

In [48]:
# "Folk remedy for infant diarrhea" (formal wording) vs.
# "Folk remedy for a baby's upset stomach" (colloquial wording).
text1 = "小儿腹泻偏方"
text2 = "宝宝拉肚子偏方"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9634384512901306)

('contractive score:', 0.8083662)

In [49]:
# Same query vs. "Folk remedy for an infant's cold" (different ailment).
text1 = "小儿腹泻偏方"
text2 = "小儿感冒偏方"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9265573024749756)

('contractive score:', 0.7286538)

In [50]:
# Same query vs. "Folk remedy for diarrhea" (the "infant" qualifier dropped).
text1 = "小儿腹泻偏方"
text2 = "腹泻偏方"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9557473063468933)

('contractive score:', 0.78730416)

# 百度-LOL，数值偏小，但排序一致

In [51]:
# "Is League of Legends fun, and how do I level up" vs. "lol strategy guide"
# (full game name vs. abbreviation).
text1 = "英雄联盟好玩吗，怎么升级"
text2 = "lol攻略"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8630011081695557)

('contractive score:', 0.6145499)

In [52]:
# Same query vs. "League of Legends server upgrade"
# (shared words, different sense of "upgrade").
text1 = "英雄联盟好玩吗，怎么升级"
text2 = "英雄联盟服务器升级"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9044623076915741)

('contractive score:', 0.6789144)

In [53]:
# Same query vs. "How to play lol well".
text1 = "英雄联盟好玩吗，怎么升级"
text2 = "怎么打好lol"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9122909903526306)

('contractive score:', 0.6822505)

# 百度-红米

In [54]:
# "Redmi update error" vs. "Redmi system upgrade error" (paraphrase).
text1 = "红米更新出错"
text2 = "红米升级系统出错"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9650112390518188)

('contractive score:', 0.812607)

In [55]:
# Same query vs. "Redmi account error" (same device, different problem).
text1 = "红米更新出错"
text2 = "红米账户出错"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9234814047813416)

('contractive score:', 0.7222552)

In [56]:
# Same query vs. "How to buy a Xiaomi phone" (related brand, unrelated intent).
text1 = "红米更新出错"
text2 = "如何买到小米手机"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.7189841568470001)

('contractive score:', 0.4698354)

# 百度-李彦宏

In [57]:
# "Li Yanhong is the founder of Baidu" vs. "Baidu was founded by Li Yanhong"
# (same fact, reordered).
text1 = "李彦宏是百度公司创始人"
text2 = "百度是李彦宏创办的"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.9416636824607849)

('contractive score:', 0.7431462)

In [58]:
# Same sentence vs. "Ma Huateng founded Tencent"
# (parallel fact about a different person and company).
text1 = "李彦宏是百度公司创始人"
text2 = "马化腾创办了腾讯公司"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8182513117790222)

('contractive score:', 0.56456757)

In [59]:
# Same sentence vs. "Yao Ming is a famous NBA star" (unrelated fact).
text1 = "李彦宏是百度公司创始人"
text2 = "姚明是NBA的著名球星"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.7644396424293518)

('contractive score:', 0.50262636)

# 百度-中国历史

In [60]:
# "China has five thousand years of history" vs.
# "China is a country with a long history" (paraphrase).
text1 = "中国有五千年的历史"
text2 = "中国是个历史悠久的国家"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8344405293464661)

('contractive score:', 0.5546527)

In [61]:
# Same sentence vs. "China has many ethnic minorities".
# In the recorded run the contractive score here is marginally higher than
# for the paraphrase in the previous cell, while the cosine score is lower.
text1 = "中国有五千年的历史"
text2 = "中国有很多少数民族"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8080736994743347)

('contractive score:', 0.5552473)

In [62]:
# Same sentence vs. "China has a population of 1.3 billion".
text1 = "中国有五千年的历史"
text2 = "中国有13亿人口"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.7523093521595001)

('contractive score:', 0.47597486)

# 百度-北京奥运会，数值偏小，但排序一致

In [63]:
# "Beijing successfully bid for the 2008 Olympics" vs.
# "The 2008 Olympics were held in Beijing".
text1 = "北京成功申办了2008年奥运会"
text2 = "2008年奥运会在北京举行"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.8722507357597351)

('contractive score:', 0.64042765)

In [64]:
# Same sentence vs. "The London Olympics were held in 2012".
text1 = "北京成功申办了2008年奥运会"
text2 = "伦敦奥运会在2012年举行"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.7630066275596619)

('contractive score:', 0.5131706)

In [65]:
# Same sentence vs. "The Tokyo Olympics are about to be held".
text1 = "北京成功申办了2008年奥运会"
text2 = "东京奥运会即将举办"
cos_sim(text1, text2)
contractive_sim(text1, text2)

('cos score:', 0.7212799340486526)

('contractive score:', 0.46924585)