In [1]:
import os
import jieba
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import reduce
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

In [2]:
# Text-format word-vector file parsed by read_vectors() (absolute path).
WORD_VEC_PATH = '/efs/downloads/word_vector/sgns.baidubaike.bigram-char'
# Passed as `topn` to read_vectors(): caps how many vectors are read and is
# also used as the progress-bar total.
LINE_NUM = 635975

# Pickle file loaded by read_paddle_bow_vectors(); it is expected to contain
# the 'embedding' and 'w2i' entries that function reads.
PADDLE_BOW_WORD_VEC_PKL = '../data/paddle_models/sim_net/word_vec/paddle_bow_wv.pkl'

In [3]:
def read_vectors(path, topn):
    """Read up to ``topn`` word vectors from a text embedding file.

    The first line of the file is treated as a header whose second
    whitespace-separated field is the vector dimension. Every following
    non-blank line is parsed as ``word v1 v2 ...`` with single-space
    separators.

    Args:
        path: Path of the text file. It is decoded as UTF-8 and bytes that
            cannot be decoded are silently dropped.
        topn: Maximum number of vectors to read; ``0`` reads the whole file.

    Returns:
        A tuple ``(vectors, iw, wi, dim)``:
        ``vectors`` maps each word to a 1-D float array,
        ``iw`` lists the words in file order,
        ``wi`` maps each word to its position in ``iw`` (the last position
        wins if a word occurs more than once),
        ``dim`` is the dimension read from the header (0 for an empty file).

    Raises:
        ValueError: If the header field or a vector component is not numeric.
        IndexError: If the header line has fewer than two fields.
    """
    lines_num, dim = 0, 0
    vectors = {}
    iw = []
    with open(path, encoding='utf-8', errors='ignore') as f:
        first_line = True
        # topn == 0 means "no limit", so give tqdm an unknown total (None)
        # rather than a misleading total of 0.
        for line in tqdm(f, total=topn or None):
            if first_line:
                first_line = False
                dim = int(line.rstrip().split()[1])
                continue
            stripped = line.rstrip()
            if not stripped:
                # A blank line would otherwise register the empty string as
                # a word with a zero-length vector.
                continue
            lines_num += 1
            tokens = stripped.split(' ')
            vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
            iw.append(tokens[0])
            if topn != 0 and lines_num >= topn:
                break
    wi = {w: i for i, w in enumerate(iw)}
    return vectors, iw, wi, dim


def read_paddle_bow_vectors(pkl):
    """Load a pickled embedding table together with its word-to-index map.

    The pickle must contain a mapping with an ``'embedding'`` entry (an
    indexable collection of vectors) and a ``'w2i'`` entry.

    Args:
        pkl: Path of the pickle file.

    Returns:
        A tuple ``(embedding, w2i, dim)`` where ``dim`` is the length of the
        first row of ``embedding``.
    """
    # pickle can run arbitrary code while loading; only open trusted files.
    with open(pkl, 'rb') as handle:
        payload = pickle.load(handle)
    embedding, word_to_index = payload['embedding'], payload['w2i']
    width = len(embedding[0])
    return embedding, word_to_index, width

In [4]:
# Load the text-format word vectors into notebook globals.
# NOTE(review): `vecotrs` is a misspelling of `vectors`, but sim() reads this
# exact global name, so it must be renamed in both places at once or not at all.
vecotrs, iw, wi, dim = read_vectors(WORD_VEC_PATH, LINE_NUM)

100%|██████████| 635975/635975 [00:56<00:00, 11268.75it/s]


In [5]:
# Load the pickled embedding table, word-to-index map and dimension that
# paddle_sim() reads as globals.
paddle_vectors, paddle_wi, paddle_dim = read_paddle_bow_vectors(PADDLE_BOW_WORD_VEC_PKL)

In [6]:
def sim(text1, text2):
    """Score two texts by the cosine similarity of their summed word vectors.

    Both texts are segmented with ``jieba.lcut``. Tokens missing from the
    global ``wi`` vocabulary are skipped; the remaining tokens' vectors from
    the global ``vecotrs`` table are added onto a zero vector of length
    ``dim``. All intermediate values are printed as well as returned.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A tuple ``(score, cos_sim, cos_dis, words1, words2, ids1, ids2)``
        where ``score`` is ``(cos_sim + 1) / 2``, ``words*`` are the token
        lists and ``ids*`` are the vocabulary indices of the known tokens.
    """
    def _sum_known(tokens):
        # Add left to right, starting from zeros, so that a text with no
        # known token still yields a (1, dim) row.
        total = np.zeros(dim)
        for token in tokens:
            if token in wi:
                total = total + vecotrs[token]
        return total.reshape(1, -1)

    words1, words2 = jieba.lcut(text1), jieba.lcut(text2)
    ids1 = [wi[token] for token in words1 if token in wi]
    ids2 = [wi[token] for token in words2 if token in wi]

    row1 = _sum_known(words1)
    row2 = _sum_known(words2)

    cos_sim = cosine_similarity(row1, row2)[0, 0]
    cos_dis = cosine_distances(row1, row2)[0, 0]
    # Shift the cosine from [-1, 1] onto [0, 1].
    score = (cos_sim + 1.0) / 2.0

    report = (
        ("score:", score),
        ('cos_sim:', cos_sim),
        ('cos_dis:', cos_dis),
        ('words1:', words1),
        ('words2:', words2),
        ('ids1', ids1),
        ('ids2', ids2),
    )
    for label, value in report:
        print(label, value)
    return score, cos_sim, cos_dis, words1, words2, ids1, ids2


def paddle_sim(text1, text2):
    """Score two texts with the pickled embedding table.

    Both texts are segmented with ``jieba.lcut``. Tokens found in the global
    ``paddle_wi`` map are converted to indices, and the matching rows of the
    global ``paddle_vectors`` are added onto a zero vector of length
    ``paddle_dim``. All intermediate values are printed as well as returned.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A tuple ``(score, cos_sim, cos_dis, words1, words2, ids1, ids2)``
        where ``score`` is ``(cos_sim + 1) / 2``.
    """
    def _sum_rows(indices):
        # Add left to right, starting from zeros, so that an empty index
        # list still yields a (1, paddle_dim) row.
        total = np.zeros(paddle_dim)
        for index in indices:
            total = total + paddle_vectors[index]
        return total.reshape(1, -1)

    words1 = jieba.lcut(text1)
    words2 = jieba.lcut(text2)
    ids1 = [paddle_wi[token] for token in words1 if token in paddle_wi]
    ids2 = [paddle_wi[token] for token in words2 if token in paddle_wi]

    row1, row2 = _sum_rows(ids1), _sum_rows(ids2)

    cos_sim = cosine_similarity(row1, row2)[0, 0]
    cos_dis = cosine_distances(row1, row2)[0, 0]
    # Shift the cosine from [-1, 1] onto [0, 1].
    score = (cos_sim + 1.0) / 2.0

    report = (
        ("paddle_score:", score),
        ('paddle_cos_sim:', cos_sim),
        ('paddle_cos_dis:', cos_dis),
        ('paddle_words1:', words1),
        ('paddle_words2:', words2),
        ('paddle_ids1', ids1),
        ('paddle_ids2', ids2),
    )
    for label, value in report:
        print(label, value)
    return score, cos_sim, cos_dis, words1, words2, ids1, ids2

In [7]:
# Single-word pair, compared with both embedding sets. The return values are
# discarded because sim() and paddle_sim() print everything they compute.
text1 = '高兴'
text2 = '开心'
_ = sim(text1, text2)
print('\n')
_ = paddle_sim(text1, text2)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.693 seconds.
Prefix dict has been built succesfully.


score: 0.8281019216369938
cos_sim: 0.6562038432739876
cos_dis: 0.3437961567260124
words1: ['高兴']
words2: ['开心']
ids1 [4764]
ids2 [6473]


paddle_score: 0.6905868668471822
paddle_cos_sim: 0.38117373369436447
paddle_cos_dis: 0.6188262663056355
paddle_words1: ['高兴']
paddle_words2: ['开心']
paddle_ids1 [410020]
paddle_ids2 [457777]


In [8]:
# A single word against a phrase that begins with the negation character '不'.
text1 = '伤心'
text2 = '不开心'
_ = sim(text1, text2)
print('\n')
_ = paddle_sim(text1, text2)

score: 0.7473167564084009
cos_sim: 0.4946335128168017
cos_dis: 0.5053664871831983
words1: ['伤心']
words2: ['不', '开心']
ids1 [6735]
ids2 [51, 6473]


paddle_score: 0.6418545701125207
paddle_cos_sim: 0.2837091402250414
paddle_cos_dis: 0.7162908597749587
paddle_words1: ['伤心']
paddle_words2: ['不', '开心']
paddle_ids1 [124147]
paddle_ids2 [51832, 457777]


In [9]:
# Two short sentences that differ only in their final word.
text1 = '我很开心'
text2 = '我很高兴'
_ = sim(text1, text2)
print('\n')
_ = paddle_sim(text1, text2)

score: 0.9570162769504424
cos_sim: 0.9140325539008848
cos_dis: 0.08596744609911522
words1: ['我', '很', '开心']
words2: ['我', '很', '高兴']
ids1 [73, 363, 6473]
ids2 [73, 363, 4764]


paddle_score: 0.8808241778335343
paddle_cos_sim: 0.7616483556670686
paddle_cos_dis: 0.23835164433293143
paddle_words1: ['我', '很', '开心']
paddle_words2: ['我', '很', '高兴']
paddle_ids1 [445680, 483704, 457777]
paddle_ids2 [445680, 483704, 410020]


In [10]:
# text1 repeats the word '特别', so its vector is added twice to the sum.
text1 = '我特别特别开心'
text2 = '我很高兴'
_ = sim(text1, text2)
print('\n')
_ = paddle_sim(text1, text2)

score: 0.8448190526511059
cos_sim: 0.6896381053022118
cos_dis: 0.3103618946977882
words1: ['我', '特别', '特别', '开心']
words2: ['我', '很', '高兴']
ids1 [73, 405, 405, 6473]
ids2 [73, 363, 4764]


paddle_score: 0.8643486089250102
paddle_cos_sim: 0.7286972178500204
paddle_cos_dis: 0.2713027821499796
paddle_words1: ['我', '特别', '特别', '开心']
paddle_words2: ['我', '很', '高兴']
paddle_ids1 [445680, 484013, 484013, 457777]
paddle_ids2 [445680, 483704, 410020]


In [11]:
# A longer sentence with extra filler words against the short reference sentence.
text1 = '我其实觉得自己很开心'
text2 = '我很高兴'
_ = sim(text1, text2)
print('\n')
_ = paddle_sim(text1, text2)

score: 0.9325086328895044
cos_sim: 0.8650172657790087
cos_dis: 0.13498273422099127
words1: ['我', '其实', '觉得', '自己', '很', '开心']
words2: ['我', '很', '高兴']
ids1 [73, 968, 1148, 57, 363, 6473]
ids2 [73, 363, 4764]


paddle_score: 0.8420618349095348
paddle_cos_sim: 0.6841236698190695
paddle_cos_dis: 0.31587633018093053
paddle_words1: ['我', '其实', '觉得', '自己', '很', '开心']
paddle_words2: ['我', '很', '高兴']
paddle_ids1 [445680, 498032, 242293, 487829, 483704, 457777]
paddle_ids2 [445680, 483704, 410020]


In [12]:
# NOTE(review): same inputs as the preceding cell (In [11]) and the recorded
# output is identical; this cell looks like an accidental duplicate.
text1 = '我其实觉得自己很开心'
text2 = '我很高兴'
_ = sim(text1, text2)
print('\n')
_ = paddle_sim(text1, text2)

score: 0.9325086328895044
cos_sim: 0.8650172657790087
cos_dis: 0.13498273422099127
words1: ['我', '其实', '觉得', '自己', '很', '开心']
words2: ['我', '很', '高兴']
ids1 [73, 968, 1148, 57, 363, 6473]
ids2 [73, 363, 4764]


paddle_score: 0.8420618349095348
paddle_cos_sim: 0.6841236698190695
paddle_cos_dis: 0.31587633018093053
paddle_words1: ['我', '其实', '觉得', '自己', '很', '开心']
paddle_words2: ['我', '很', '高兴']
paddle_ids1 [445680, 498032, 242293, 487829, 483704, 457777]
paddle_ids2 [445680, 483704, 410020]


In [13]:
# text1 contains the negation character '不'; text2 is the un-negated
# reference sentence.
text1 = '我不开心'
text2 = '我很高兴'
_ = sim(text1, text2)
print('\n')
_ = paddle_sim(text1, text2)

score: 0.9015086602997187
cos_sim: 0.8030173205994372
cos_dis: 0.1969826794005628
words1: ['我', '不', '开心']
words2: ['我', '很', '高兴']
ids1 [73, 51, 6473]
ids2 [73, 363, 4764]


paddle_score: 0.8109343745959197
paddle_cos_sim: 0.6218687491918392
paddle_cos_dis: 0.3781312508081608
paddle_words1: ['我', '不', '开心']
paddle_words2: ['我', '很', '高兴']
paddle_ids1 [445680, 51832, 457777]
paddle_ids2 [445680, 483704, 410020]


In [14]:
# A long technical sentence, including punctuation, against a short phrase.
# NOTE(review): presumably meant as an unrelated-pair baseline; confirm intent.
text1 = '函数将一个数据集合（链表，元组等）中的所有数据进行下列操作'
text2 = '微粒贷'
_ = sim(text1, text2)
print('\n')
_ = paddle_sim(text1, text2)

score: 0.7024310208935683
cos_sim: 0.4048620417871367
cos_dis: 0.5951379582128633
words1: ['函数', '将', '一个', '数据', '集合', '（', '链表', '，', '元组', '等', '）', '中', '的', '所有', '数据', '进行', '下列', '操作']
words2: ['微粒', '贷']
ids1 [2130, 60, 25, 609, 5846, 15, 83145, 0, 90464, 23, 17, 27, 1, 294, 609, 59, 2856, 780]
ids2 [19340, 20983]


paddle_score: 0.5594106757860289
paddle_cos_sim: 0.11882135157205781
paddle_cos_dis: 0.8811786484279422
paddle_words1: ['函数', '将', '一个', '数据', '集合', '（', '链表', '，', '元组', '等', '）', '中', '的', '所有', '数据', '进行', '下列', '操作']
paddle_words2: ['微粒', '贷']
paddle_ids1 [244637, 310705, 419503, 21870, 240579, 397594, 171938, 338628, 82131, 2465, 145998, 493310, 423467, 137401, 21870, 132441, 194865, 317771]
paddle_ids2 [58029, 499964]
