In [1]:
import sys
# Put the parent directory on the import path — presumably so the project-local
# packages imported in the next cell (pytorch_nlp_models, utils) resolve; TODO confirm.
sys.path.append('../')

In [2]:
import os
import pickle
import json
import torch
from copy import deepcopy
from pytorch_nlp_models.text_pair.siamese_cnn import SiameseCNN
from utils.preprocess import text2ids
# from utils.datasets import LCQMCDataset
# from utils.model_utils import model_train, model_eval

# from torch.utils.data import DataLoader
# from dumb_containers import evaluate_performance

In [3]:
# Root of all data artefacts, relative to the notebook's working directory.
DATA_PATH = '../data/'
# LCQMC_PATH = os.path.join(DATA_PATH, 'LCQMC')
WORD_VECTORS_PATH = os.path.join(DATA_PATH, 'word_vectors')
BAIDUBAIKE_PKL = os.path.join(WORD_VECTORS_PATH, 'baidubaike.pkl')

# Fixed number of token ids every sentence is truncated/padded to in `sim`.
MAX_SEQ_LEN = 40

MODEL_PATH = os.path.join(DATA_PATH, 'model_files/siamese_cnn')
# exist_ok=True replaces the separate exists() check, so there is no window
# between "check" and "create" in which another process can create the directory.
os.makedirs(MODEL_PATH, exist_ok=True)

MODEL_FILE = os.path.join(MODEL_PATH, 'model.pkl')
MODEL_CONFIG_JSON = os.path.join(MODEL_PATH, 'config.json')

# JSON is UTF-8 by specification; do not depend on the platform's default encoding.
# Raises FileNotFoundError when no config has been saved next to the model.
with open(MODEL_CONFIG_JSON, 'r', encoding='utf-8') as f:
    MODEL_CONFIG = json.load(f)

In [4]:
# SECURITY: pickle.load can execute arbitrary code embedded in the file —
# only point BAIDUBAIKE_PKL at a word-vector dump from a trusted source.
with open(BAIDUBAIKE_PKL, 'rb') as f:
    wvs = pickle.load(f)

wi = wvs['wi']    # lookup handed to text2ids in `sim` (presumably token -> id; confirm)
iw = wvs['iw']    # its length defines the vocabulary size (presumably id -> token; confirm)
dim = wvs['dim']  # used below as both the embedding and the hidden dimension
emb = wvs['emb']  # presumably the pretrained embedding matrix; confirm


# The configuration cell loads config.json into MODEL_CONFIG, and the assignment
# below replaces it. Re-read the saved file under its own name so the override is
# visible instead of silent (re-reading keeps this cell idempotent on re-run).
with open(MODEL_CONFIG_JSON, 'r') as f:
    SAVED_MODEL_CONFIG = json.load(f)

# Configuration actually used to build the model, derived from the word vectors.
MODEL_CONFIG = {'vocab_size': len(iw),
                'emb_dim': dim,
                'hidden_dim': dim,
                'dropout': 0.2,
               }

# Report every key whose saved value disagrees with the value used here.
_saved = SAVED_MODEL_CONFIG if isinstance(SAVED_MODEL_CONFIG, dict) else {}
CONFIG_MISMATCH = {key: {'saved': _saved.get(key), 'used': value}
                   for key, value in MODEL_CONFIG.items()
                   if _saved.get(key) != value}
if CONFIG_MISMATCH:
    print('WARNING: MODEL_CONFIG differs from %s: %s' % (MODEL_CONFIG_JSON, CONFIG_MISMATCH))

In [5]:
# Build the network from the active configuration and restore the trained weights.
model = SiameseCNN(**MODEL_CONFIG)
# `sim` creates its input tensors on the CPU, so force the checkpoint onto the CPU
# as well; without map_location a checkpoint saved from a GPU fails to load on a
# CPU-only machine.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
checkpoint = torch.load(MODEL_FILE, map_location='cpu')
# Kept as the last expression so the cell still displays the key-compatibility report.
model.load_state_dict(checkpoint['model_state_dict'])

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [6]:
# Inspect the embedding layer's parameters now that the checkpoint has been loaded.
model.emb.state_dict()

OrderedDict([('weight',
              tensor([[-0.0591,  0.0365, -0.0309,  ...,  0.1121, -0.0794, -0.0350],
                      [-0.0547,  0.0733, -0.0126,  ...,  0.0449,  0.0316, -0.0057],
                      [-0.1045, -0.4096,  0.0025,  ...,  0.2424,  0.5210,  0.0380],
                      ...,
                      [ 0.1317, -0.0819,  0.0877,  ..., -0.0862, -0.0418, -0.1139],
                      [ 0.0918,  0.1966, -0.0043,  ..., -0.1252,  0.0385,  0.0049],
                      [ 0.0351,  0.1157, -0.0244,  ..., -0.0970,  0.0307, -0.0839]]))])

In [7]:
def _pad_or_truncate(ids, max_len, pad_id=0):
    """Return ``(fixed_ids, true_len)`` for one token-id sequence.

    ``fixed_ids`` is a new list of exactly ``max_len`` ids: the input cut to
    ``max_len`` and right-padded with ``pad_id``. ``true_len`` is the number of
    real (non-padding) ids kept. The input sequence is never modified.
    """
    true_len = min(len(ids), max_len)
    fixed_ids = list(ids[:max_len]) + [pad_id] * (max_len - true_len)
    return fixed_ids, true_len


def sim(text1, text2, verbose=True):
    """Score a pair of sentences with the loaded model.

    Both texts are converted to ids with ``text2ids(..., charmode=True)``,
    truncated or zero-padded to ``MAX_SEQ_LEN``, and fed to ``model`` as a
    batch of size one together with their unpadded lengths.

    Args:
        text1: first sentence.
        text2: second sentence.
        verbose: when True (the default, matching the previous behaviour) the
            full softmax row is printed before returning.

    Returns:
        A tuple ``(prob, vec1, vec2)`` where ``prob`` is the softmax probability
        at class index 1 as a Python float (the callers treat it as the
        "similar" probability), and ``vec1`` / ``vec2`` are the two extra
        values returned by the model (presumably the sentence encodings —
        confirm against SiameseCNN.forward).
    """
    ids1, len1 = _pad_or_truncate(text2ids(text1, wi, charmode=True), MAX_SEQ_LEN)
    ids2, len2 = _pad_or_truncate(text2ids(text2, wi, charmode=True), MAX_SEQ_LEN)

    # Wrap each sequence in an outer list to form a batch of size one.
    ids1_tensor = torch.tensor([ids1], dtype=torch.long)
    ids2_tensor = torch.tensor([ids2], dtype=torch.long)
    len1_tensor = torch.tensor([len1], dtype=torch.long)
    len2_tensor = torch.tensor([len2], dtype=torch.long)

    # Inference only: switch to eval mode and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        logits, vec1, vec2 = model(ids1_tensor, ids2_tensor, len1_tensor, len2_tensor)
        probs = torch.softmax(logits, dim=1)
    if verbose:
        print(probs)
    return probs[0, 1].item(), vec1, vec2

# 一般测试

In [8]:
# Same question about the best League of Legends hero, with the words reordered.
text1 = "英雄联盟什么英雄最好"
text2 = "英雄联盟最好英雄是什么"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[8.3553e-05, 9.9992e-01]])


0.9999164342880249

In [9]:
# Near-identical questions about Suning Appliance stock; only a measure word/particle differs.
text1 = "苏宁电器这只股票怎么样"
text2 = "苏宁电器的股票怎么样"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[4.7788e-04, 9.9952e-01]])


0.9995220899581909

In [10]:
# Synonym pair: two ways of saying "I am very happy".
text1 = "我很高兴"
text2 = "我很开心"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.7645, 0.2355]])


0.23548971116542816

In [11]:
# "I really like eating ice cream" vs. "I really love eating ice cream".
text1 = "我很喜欢吃冰淇淋"
text2 = "我很爱吃冰淇淋"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.0096, 0.9904]])


0.9903648495674133

In [12]:
# The two "I am very happy" synonyms again, with the argument order swapped.
text1 = "我很开心"
text2 = "我很高兴"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.7645, 0.2355]])


0.23548971116542816

In [13]:
# "I am very happy" vs. "I am really, really happy" (synonym plus a longer intensifier).
text1 = "我很高兴"
text2 = "我特别特别开心"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.5352, 0.4648]])


0.4647536873817444

In [14]:
# "I am very happy" vs. "Actually, I feel that I am quite happy" (much longer paraphrase).
text1 = "我很高兴"
text2 = "我其实觉得自己很开心"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.7732, 0.2268]])


0.22675396502017975

In [15]:
# The two longer "happy" paraphrases from the previous cells compared with each other.
text1 = "我特别特别开心"
text2 = "我其实觉得自己很开心"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.0276, 0.9724]])


0.9723561406135559

In [16]:
# Negation using a synonym: "I am very happy" vs. "I am not happy".
text1 = "我很高兴"
text2 = "我不开心"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.9851, 0.0149]])


0.014875711873173714

In [17]:
# Negation differing by a single character: "I am very happy" vs. "I am not happy".
text1 = "我很高兴"
text2 = "我不高兴"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.0137, 0.9863]])


0.9862529635429382

In [18]:
# Identical short sentences.
text1 = "我很高兴"
text2 = "我很高兴"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[1.0840e-04, 9.9989e-01]])


0.9998916387557983

In [19]:
# Synonym pair: two ways of saying "I am very sad".
text1 = "我很伤心"
text2 = "我很难过"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.4059, 0.5941]])


0.5941320657730103

In [20]:
# Two-character inputs with no shared characters: "really good" vs. "not bad".
text1 = "真好"
text2 = "不错"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.8520, 0.1480]])


0.14801786839962006

In [21]:
# Bare two-character synonyms for "happy", without any surrounding sentence.
text1 = "高兴"
text2 = "开心"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.9086, 0.0914]])


0.0913928672671318

In [22]:
# Identical longer sentences.
text1 = "大家好才是真的好"
text2 = "大家好才是真的好"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[2.8391e-06, 1.0000e+00]])


0.9999971389770508

In [23]:
# Unrelated pair: a long question about Tencent Credit / Weilidai limits vs. "my wallet does not have this app of yours".
text1 = "为什么能开出腾讯信用却没有微粒贷朋友的没用腾讯信用却有30000的额度呢"
text2 = "我钱包里没有你们这个应用"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[9.9987e-01, 1.2513e-04]])


0.00012513212277553976

In [24]:
# Unrelated short replies: "I don't know either" vs. "okay then".
text1 = "我也不知道"
text2 = "好吧"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.9642, 0.0358]])


0.035836685448884964

In [25]:
# Related technical terms: "deep learning" vs. "machine learning".
text1 = "深度学习"
text2 = "机器学习"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.2789, 0.7211]])


0.7211363315582275

In [26]:
# "machine learning" vs. "deep learning": the previous pair with the argument order swapped.
text1 = "机器学习"
text2 = "深度学习"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.2789, 0.7211]])


0.7211363315582275

In [27]:
# "What is a people's organization" vs. "a people's organization refers to" (shared prefix, different ending).
text1 = "人民团体是什么"
text2 = "人民团体是指"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[1.8772e-04, 9.9981e-01]])


0.9998123049736023

# 百度-车牌，不太一致

In [28]:
# Query "how to mount the license plate at the front of the car" vs. "how to install the front plate".
text1 = "车头如何放置车牌"
text2 = "前牌照怎么装"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.8987, 0.1013]])


0.1012992262840271

In [29]:
# Same license-plate query vs. "how to apply for a Beijing license plate".
text1 = "车头如何放置车牌"
text2 = "如何办理北京车牌"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.9081, 0.0919]])


0.0918712392449379

In [30]:
# Same license-plate query vs. "how to install the rear plate".
text1 = "车头如何放置车牌"
text2 = "后牌照怎么装"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.9236, 0.0764]])


0.07642624527215958

# 百度-信号

In [31]:
# Query "the signal fluctuates between strong and weak" vs. "the signal fluctuates between high and low".
text1 = "信号忽强忽弱"
text2 = "信号忽高忽低"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.0037, 0.9963]])


0.9963286519050598

In [32]:
# Same signal query vs. "the signal drifts left and right" (same sentence frame, different meaning).
text1 = "信号忽强忽弱"
text2 = "信号忽左忽右"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.0296, 0.9704]])


0.9703766107559204

In [33]:
# Same signal query vs. "the signal suddenly cuts off".
text1 = "信号忽强忽弱"
text2 = "信号忽然中断"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.0377, 0.9623]])


0.9622653722763062

# 百度-机器学习

In [34]:
# Query "how to learn deep learning well" vs. "study Xi Jinping's speech materials in depth" (character overlap, unrelated topic).
text1 = "如何学好深度学习"
text2 = "深入学习习近平讲话材料"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.9925, 0.0075]])


0.007539540529251099

In [35]:
# Same deep-learning query vs. "machine learning tutorial".
text1 = "如何学好深度学习"
text2 = "机器学习教程"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.5992, 0.4008]])


0.4007570147514343

In [36]:
# Same deep-learning query vs. "artificial intelligence tutorial".
text1 = "如何学好深度学习"
text2 = "人工智能教程"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.9865, 0.0135]])


0.013544592075049877

# 百度-香蕉的翻译，偏小但排序一致

In [37]:
# Query "translation of banana" vs. "how do you say banana in English".
text1 = "香蕉的翻译"
text2 = "香蕉用英文怎么说"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.7377, 0.2623]])


0.26231491565704346

In [38]:
# Same banana query vs. "how to eat a banana".
text1 = "香蕉的翻译"
text2 = "香蕉怎么吃"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.5814, 0.4186]])


0.4185939431190491

In [39]:
# Same banana query vs. "how do you say tangerine in English".
text1 = "香蕉的翻译"
text2 = "桔子用英文怎么说"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.9905, 0.0095]])


0.009474558755755424

# 百度-腹泻，排序有差别

In [40]:
# Query "folk remedy for infant diarrhea" vs. a colloquial paraphrase ("folk remedy for a baby's upset stomach").
text1 = "小儿腹泻偏方"
text2 = "宝宝拉肚子偏方"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.5363, 0.4637]])


0.4636746644973755

In [41]:
# Same diarrhea-remedy query vs. "folk remedy for infant cold" (different illness).
text1 = "小儿腹泻偏方"
text2 = "小儿感冒偏方"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.0473, 0.9527]])


0.9527326226234436

In [42]:
# Same diarrhea-remedy query vs. "folk remedy for diarrhea" (the query without "infant").
text1 = "小儿腹泻偏方"
text2 = "腹泻偏方"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.0014, 0.9986]])


0.9986273050308228

# 百度-LOL，数值偏小，但排序一致

In [43]:
# Query "is League of Legends fun, how do I level up" vs. "League of Legends guide".
text1 = "英雄联盟好玩吗，怎么升级"
text2 = "英雄联盟攻略"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.4916, 0.5084]])


0.5083925724029541

In [44]:
# Same League of Legends query vs. "League of Legends server upgrade".
text1 = "英雄联盟好玩吗，怎么升级"
text2 = "英雄联盟服务器升级"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.0435, 0.9565]])


0.9565330147743225

In [45]:
# Same League of Legends query vs. "how to play League of Legends well".
text1 = "英雄联盟好玩吗，怎么升级"
text2 = "怎么打好英雄联盟"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.2259, 0.7741]])


0.7741127014160156

# 百度-红米

In [46]:
# Query "Redmi update error" vs. "Redmi system upgrade error".
text1 = "红米更新出错"
text2 = "红米升级系统出错"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.1326, 0.8674]])


0.867403507232666

In [47]:
# Same Redmi query vs. "Redmi account error".
text1 = "红米更新出错"
text2 = "红米账户出错"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.0661, 0.9339]])


0.9339197278022766

In [48]:
# Same Redmi query vs. "how to buy a Xiaomi phone".
text1 = "红米更新出错"
text2 = "如何买到小米手机"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.9983, 0.0017]])


0.0016654371283948421

# 百度-李彦宏

In [49]:
# Statement "Robin Li is the founder of Baidu" vs. "Baidu was founded by Robin Li".
text1 = "李彦宏是百度公司创始人"
text2 = "百度是李彦宏创办的"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.0543, 0.9457]])


0.9457142949104309

In [50]:
# Same Baidu-founder statement vs. "Pony Ma founded Tencent" (same relation, different entities).
text1 = "李彦宏是百度公司创始人"
text2 = "马化腾创办了腾讯公司"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.8830, 0.1170]])


0.11702600866556168

In [51]:
# Same Baidu-founder statement vs. "Yao Ming is a famous NBA star".
text1 = "李彦宏是百度公司创始人"
text2 = "姚明是NBA的著名球星"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.9976, 0.0024]])


0.0024302422534674406

# 百度-中国历史

In [52]:
# Statement "China has five thousand years of history" vs. "China is a country with a long history".
text1 = "中国有五千年的历史"
text2 = "中国是个历史悠久的国家"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.9110, 0.0890]])


0.0889529436826706

In [53]:
# Same China-history statement vs. "China has many ethnic minorities".
text1 = "中国有五千年的历史"
text2 = "中国有很多少数民族"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.5871, 0.4129]])


0.41286370158195496

In [54]:
# Same China-history statement vs. "China has a population of 1.3 billion".
text1 = "中国有五千年的历史"
text2 = "中国有13亿人口"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.9613, 0.0387]])


0.03867652267217636

# 百度-北京奥运会，偏小，但数值一致

In [55]:
# Statement "Beijing successfully bid for the 2008 Olympics" vs. "the 2008 Olympics were held in Beijing".
text1 = "北京成功申办了2008年奥运会"
text2 = "2008年奥运会在北京举行"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.1808, 0.8192]])


0.8191573619842529

In [56]:
# Same Beijing-Olympics statement vs. "the London Olympics were held in 2012".
text1 = "北京成功申办了2008年奥运会"
text2 = "伦敦奥运会在2012年举行"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.8822, 0.1178]])


0.11780460923910141

In [57]:
# Same Beijing-Olympics statement vs. "the Tokyo Olympics will be held soon".
text1 = "北京成功申办了2008年奥运会"
text2 = "东京奥运会即将举办"
pos_prob, vec1, vec2 = sim(text1, text2)
pos_prob

tensor([[0.5057, 0.4943]])


0.49431949853897095