In [116]:
import time

from gensim.models import Word2Vec, KeyedVectors
import pandas as pd
import numpy as np
import jieba

# w2v compress
def word2vec_compress(w2v_path,
                      vocab_path,
                      outp,):
    """Write a reduced word2vec text file that keeps only vocabulary words.

    The input is read in word2vec text format: a header line
    ``"<count> <dim>"`` followed by one ``"<word> <v1> <v2> ..."`` line per
    word. Every vector line whose first space-separated token appears in the
    vocabulary file is copied unchanged to ``outp``, preceded by a header
    carrying the new word count and the original dimension.

    Args:
        w2v_path: path of the source word2vec text file.
        vocab_path: path of a text file with one word per line; blank lines
            are ignored and surrounding whitespace is stripped.
        outp: path of the reduced word2vec text file to write.

    Returns:
        The ``KeyedVectors`` loaded back from ``outp``.

    Raises:
        ValueError: if the header line does not consist of exactly two
            whitespace-separated fields.
    """
    print('start word2vec_compress() function.')
    tic = time.time()

    def load_vocab():
        # A set gives O(1) membership tests in the filtering passes below.
        with open(vocab_path, 'r', encoding='utf-8') as fr:
            return {entry for entry in (raw.strip() for raw in fr) if entry}

    def kept_lines(fr):
        # Yield the vector lines whose word (first token) is in the vocabulary.
        for line in fr:
            if line.split(' ')[0] in vocab:
                yield line

    vocab = load_vocab()

    # First pass: count the lines that will be kept. The header is skipped
    # explicitly so that a vocabulary entry equal to the header's first token
    # (e.g. a number) cannot inflate the count.
    with open(w2v_path, 'r', encoding='utf-8') as fr:
        fr.readline()
        n = sum(1 for _ in kept_lines(fr))

    # Second pass: write the new header followed by the kept lines. The files
    # are read and written as UTF-8 to match the encoding used when the result
    # is loaded below.
    with open(w2v_path, 'r', encoding='utf-8') as fr, \
            open(outp, 'w', encoding='utf-8') as fw:
        first_line = fr.readline()
        print(first_line)
        header = first_line.split()
        if len(header) != 2:
            raise ValueError(
                'unexpected word2vec header: {!r}'.format(first_line))
        emb_size = header[1]
        first_line = '{} {}\n'.format(n, emb_size)
        print(first_line)
        fw.write(first_line)

        for line in kept_lines(fr):
            fw.write(line)

    # Load the result back; this also checks that the file is well formed.
    word2vec = KeyedVectors.load_word2vec_format(outp,
                                                 binary=False,
                                                 encoding='utf-8',
                                                 unicode_errors='ignore')

    toc = time.time()
    print('word2vec_compress() function time use: %.3f' % (toc - tic))
    print('finish word2vec_compress() function.')

    return word2vec

In [143]:
class word_embed():
    """Sentence similarity based on averaged word2vec word vectors.

    A sentence is segmented with ``jieba.cut``; the vectors of the tokens
    found in the loaded embeddings are averaged, and two sentences are
    compared by the cosine similarity of their averaged vectors.
    """

    def __init__(self, embed_path='datas/100w-small.txt'):
        """Load word vectors in word2vec format from ``embed_path``."""
        self.embed_path = embed_path
        self.w2v = KeyedVectors.load_word2vec_format(embed_path)

    def sentence_embed(self, sentence):
        """Return the mean vector of the in-vocabulary tokens of ``sentence``.

        Tokens missing from the embeddings are reported on stdout and skipped.
        When no token is found, an empty array (``size == 0``) is returned.
        """
        vectors = []
        for token in jieba.cut(sentence):
            if token in self.w2v:
                vectors.append(self.w2v[token])
            else:
                print(token, 'not in vocabs.')
        s_embed = np.array(vectors)
        # get avg embed
        if s_embed.size > 0:
            s_embed = s_embed.sum(axis=0) / len(s_embed)

        return s_embed

    def cal_cosine_similarity(self, embed1, embed2):
        """Return the cosine similarity of two 1-D vectors.

        Returns ``0.0`` when either vector is empty or has zero norm, so the
        result is always a finite number instead of NaN from a 0/0 division.
        """
        embed1 = np.asarray(embed1)
        embed2 = np.asarray(embed2)
        if embed1.size == 0 or embed2.size == 0:
            return 0.0

        denom = np.linalg.norm(embed1) * np.linalg.norm(embed2)
        if denom == 0:
            # A zero vector has no direction; treat it as "no similarity".
            return 0.0

        return np.matmul(embed1, embed2) / denom

    def sentence_similarity(self, sentence1, sentence2):
        """Return the cosine similarity of the two sentences' mean vectors."""
        s_embed1 = self.sentence_embed(sentence1)
        s_embed2 = self.sentence_embed(sentence2)

        return self.cal_cosine_similarity(s_embed1, s_embed2)

In [144]:
# Build the embedder with its default vectors file ('datas/100w-small.txt').
w = word_embed()

In [146]:
# Probe two pairs by hand to see which tokens are out of vocabulary.
# Only the value of the last expression is echoed by the notebook; the first
# call just prints its out-of-vocabulary tokens.
w.sentence_similarity('找不着', '没有选项')
w.sentence_similarity('我想贷', '你好，我现在有个网友说帮我开微粒贷，我想问可以开吗？')

找不着 not in vocabs.
我想贷 not in vocabs.
， not in vocabs.
， not in vocabs.


0

In [137]:
# Directory holding the QC spreadsheets (kept as a string with a trailing
# slash because file names are appended to it directly).
dir_path = '/home/jasoncheung/project/work/alg-coachingbot/datas/'

# Read both spreadsheets in one go: normal first, then bank.
df_normal, df_bank = (
    pd.read_excel(dir_path + file_name)
    for file_name in ('normal_QC.xlsx', 'bank_QC.xlsx')
)

In [138]:
# Score every (text1, text2) pair with the averaged-word2vec cosine similarity.
def _pair_scores(frame):
    """Return w.sentence_similarity(text1, text2) for each row, in row order."""
    return [
        w.sentence_similarity(left, right)
        for left, right in zip(frame.text1.tolist(), frame.text2.tolist())
    ]


normal_score_w2v = _pair_scores(df_normal)
df_normal['w2v_score'] = normal_score_w2v

bank_score_w2v = _pair_scores(df_bank)
df_bank['w2v_score'] = bank_score_w2v

# Mean squared error between the labelled score and the w2v score.
from sklearn.metrics import mean_squared_error

res_bank_w2v = mean_squared_error(df_bank['score'].tolist(),
                                  df_bank['w2v_score'].tolist())
res_normal_w2v = mean_squared_error(df_normal['score'].tolist(),
                                    df_normal['w2v_score'].tolist())

print('bank w2v avg embed MSE: ', res_bank_w2v)
print('normal w2v avg embed MSE: ', res_normal_w2v)

bank w2v avg embed MSE:  0.33497529899181905
normal w2v avg embed MSE:  0.43082670450919386


In [141]:
# Show df_bank; pandas truncates the display to the first and last rows.
df_bank

Unnamed: 0,text1,text2,score,ed_score,simhash_score,w2v_score,diff_w2v,diff_simhash
0,用微信都6年，微信没有微粒贷功能,4。号码来微粒贷,0,0.187500,0.863636,0.890093,0.890093,0.863636
1,微信消费算吗,还有多少钱没还,0,0.000000,0.621212,0.784591,0.784591,0.621212
2,交易密码忘记了找回密码绑定的手机卡也掉了,怎么最近安全老是要改密码呢好麻烦,0,0.050000,0.636364,0.834854,0.834854,0.636364
3,你好我昨天晚上申请的没有打电话给我今天之内一定会打吗？,什么时候可以到账,0,0.000000,0.424242,0.879229,0.879229,0.424242
4,"“微粒贷开通""",你好，我的微粒贷怎么没有开通呢,0,0.333333,0.878788,0.877519,0.877519,0.878788
...,...,...,...,...,...,...,...,...
109995,您好，我还款了怎么还没扣款,今天一直没有扣款,1,0.153846,0.712121,0.897655,0.102345,0.287879
109996,有联系方式吗？,电话号码是多少,1,0.000000,0.530303,0.813413,0.186587,0.469697
109997,昨天打的电话我没接到,我开通却总是接不到你们的电话,1,0.000000,0.575758,0.916955,0.083045,0.424242
109998,0.05%是日利率,如何借款，利息如何计算,0,0.000000,0.590909,0.727211,0.727211,0.590909


In [140]:
# Absolute error of each similarity measure against the labelled score.
labelled_score = np.array(df_bank['score'].tolist())
for measure_col, diff_col in (('w2v_score', 'diff_w2v'),
                              ('simhash_score', 'diff_simhash')):
    measured = np.array(df_bank[measure_col].tolist())
    df_bank[diff_col] = np.abs(labelled_score - measured)

# Worst word2vec disagreements first.
df_bank.sort_values(by='diff_w2v', ascending=False)


Unnamed: 0,text1,text2,score,ed_score,simhash_score,w2v_score,diff_w2v,diff_simhash
61974,找不着,没有选项,1,0.000000,0.469697,0.0,1.0,0.530303
72487,我想贷,你好，我现在有个网友说帮我开微粒贷，我想问可以开吗？,1,0.076923,0.560606,0.0,1.0,0.439394
17562,几久放,何时会下来,1,0.000000,0.484848,0.0,1.0,0.515152
11585,優惠券,优惠卡,1,0.333333,0.454545,0.0,1.0,0.545455
16096,下期账单,帳单,1,0.250000,0.484848,0.0,1.0,0.515152
...,...,...,...,...,...,...,...,...
39336,錢還沒到帳,钱打到哪,0,0.200000,0.651515,0.0,0.0,0.651515
80101,微信能否使用微粒贷,微理贷,0,0.222222,0.469697,0.0,0.0,0.469697
43268,开通微粒贷撒,开通微粒贷下,1,0.833333,0.969697,1.0,0.0,0.030303
24404,想把微众钱转出怎么整,微立贷,0,0.100000,0.515152,0.0,0.0,0.515152


In [126]:
# Print the similarity of two single-character pairs, one result per line.
for first, second in (('好', '坏'), ('男', '女')):
    print(w.sentence_similarity(first, second))

0.6505824
0.9036293


In [121]:
# Write both frames back to the spreadsheets they were read from (no index column).
for frame, file_name in ((df_normal, 'normal_QC.xlsx'),
                         (df_bank, 'bank_QC.xlsx')):
    frame.to_excel(dir_path + file_name, index=False)