In [202]:
import pandas as pd
import numpy as np

import jieba
import jieba.analyse
import numpy as np


# simhash,推荐使用
class SimhashStr():
    def __init__(self, tfidf_dict=None):
        self.tfidf_dict = tfidf_dict

    # 得到输入字符串的hash值
    def get_hash(self, text, topK=None):
        # 结巴分词
        seg = jieba.cut(text)
        # print(len(set(list(jieba.cut(text)))))
        # 取前20个关键词
        # 这个可以替换成我们的tfidf词表
        if self.tfidf_dict is None:
            keyword = jieba.analyse.extract_tags('|'.join(seg), topK=topK, withWeight=True, allowPOS=())
            # 若jieba解析不出任何词汇，则默认使用所有词汇，并赋权1
            if not keyword:
                keyword = [(i, 1) for i in list(jieba.cut(text))]
        else:
            # 若传入了自定义tfidf词典，则用自己的
            keyword = dict()
            for i in list(seg):
                if i in d:
                    if i in keyword:
                        keyword[i] += d[i]
                    else:
                        keyword[i] = d[i]
            keyword = sorted(keyword.items(),key = lambda x:x[1],reverse = True)[:topK]
            
        keyList = []
        # 获取每个词的权重
        for feature, weight in keyword:
            # 每个关键词的权重*总单词数
            weight = int(weight * 20)
            # 获取每个关键词的特征
            feature = self.string_hash(feature)
            temp = []
            # 获取每个关键词的权重
            for i in feature:
                if i == '1':
                    temp.append(weight)
                else:
                    temp.append(-weight)
                keyList.append(temp)
        # 将每个关键词的权重变成一维矩阵
        list1 = np.sum(np.array(keyList), axis=0)
        # 获取simhash值
        simhash = ''
        for i in list1:
            # 对特征标准化表示
            if i > 0:
                simhash = simhash + '1'
            else:
                simhash = simhash + '0'
        return simhash

    def string_hash(self, feature):
        if feature == "":
            return 0
        else:
            # 将字符转为二进制，并向左移动7位
            x = ord(feature[0]) << 7
            m = 1000003
            mask = 2 ** 128 - 1
            # 拼接每个关键词中字符的特征
            for c in feature:
                x = ((x * m) ^ ord(c)) & mask
            x ^= len(feature)
            if x == -1:
                x = -2
            # 获取关键词的64位表示
            x = bin(x).replace('0b', '').zfill(64)[-64:]
            return str(x)
        
    def get_distance(self, sim1, sim2):
        # 转为二进制结构
        t1 = '0b' + sim1
        t2 = '0b' + sim2
        
        max_hashbit = max(len(t1), len(t2))
        
        n = int(t1, 2) ^ int(t2, 2)
        # 相当于对每一位进行异或操作
        i = 0
        while n:
            n &= (n - 1)
            i += 1
        return i, max_hashbit
    
    def similar(self, text1, text2, topK=20):
        simhash1 = self.get_hash(text1, topK=topK)
        simhash2 = self.get_hash(text2, topK=topK)

        # 汉明距离
        distince, max_hashbit = self.get_distance(simhash1, simhash2)
        similar = 1 - distince / max_hashbit
        return similar
    

# 编辑距离相似度
def editdistance(str1, str2, type_='score'):
    # cal Levenshtein Distance(Edit Distance)
    edit = [[i+j for j in range(len(str2)+1)] for i in range(len(str1)+1)]
    for i in range(1, len(str1)+1):
        for j in range(1, len(str2)+1):
            if str1[i-1] == str2[j-1]:
                d = 0
            else:
                d = 1
                
            edit[i][j] = min(edit[i-1][j]+1, edit[i][j-1]+1, edit[i-1][j-1]+d)
    res = edit[len(str1)][len(str2)]
    if type_ == 'score':
        res = 1 - res/max(len(str1), len(str2))
    return res

In [207]:
text1 = '中国国务院扶贫办最近表示，中国脱贫攻坚已经取得决定性成就，到今年年底，所有贫困人口会全部退出。作为其践行社会责任的重要一环，汇丰中国一直致力推动大众就业、支持全面脱贫'
text2 = '美国国安局扶贫办最近表示，美国脱贫攻坚已经取得决定性成就，到今年年底，所有贫困人口会全部退出。作为其践行社会责任的重要一环，花旗美国一直致力推动大众就业、支持全面脱贫'
print('len: ', len(text1), len(text2))
res = SimhashStr().similar(text1, text2)
print('Simhash similar: ', res)
res = editdistance(text1, text2)
print('editdistance similar: ', res)

text1 = '今天心情好'
text2 = '今天心情坏'
res = SimhashStr().similar(text1, text2)
print('Simhash similar: ', res)
res = editdistance(text1, text2)
print('editdistance similar: ', res)

len:  83 83
Simhash similar:  0.7727272727272727
editdistance similar:  0.9156626506024097
Simhash similar:  0.4696969696969697
editdistance similar:  0.8


In [158]:
dir_path = '/home/jasoncheung/project/work/alg-coachingbot/datas/'
df_normal = pd.read_excel(dir_path+'normal_QC.xlsx')
df_bank = pd.read_excel(dir_path+'bank_QC.xlsx')

In [203]:
# cal simhash similar & editdistance similar
normal_score_ed = []
for t1, t2 in zip(df_normal.text1.tolist(), df_normal.text2.tolist()):
    tmp_score = editdistance(t1, t2)
    normal_score_ed.append(tmp_score)
df_normal['ed_score'] = normal_score_ed

normal_score_sh = []
for t1, t2 in zip(df_normal.text1.tolist(), df_normal.text2.tolist()):
    tmp_score = SimhashStr().similar(t1, t2)
    normal_score_sh.append(tmp_score)
df_normal['simhash_score'] = normal_score_sh


bank_score_ed = []
for t1, t2 in zip(df_bank.text1.tolist(), df_bank.text2.tolist()):
    tmp_score = editdistance(t1, t2)
    bank_score_ed.append(tmp_score)
df_bank['ed_score'] = bank_score_ed

bank_score_sh = []
for t1, t2 in zip(df_bank.text1.tolist(), df_bank.text2.tolist()):
    tmp_score = SimhashStr().similar(t1, t2)
    bank_score_sh.append(tmp_score)
df_bank['simhash_score'] = bank_score_sh

# calculate MSE
from sklearn.metrics import mean_squared_error

res_bank_ed = mean_squared_error(df_bank.score.tolist(), df_bank.ed_score.tolist())
res_bank_sh = mean_squared_error(df_bank.score.tolist(), df_bank.simhash_score.tolist())

res_normal_ed = mean_squared_error(df_normal.score.tolist(), df_normal.ed_score.tolist())
res_normal_sh = mean_squared_error(df_normal.score.tolist(), df_normal.simhash_score.tolist())
print('bank EditDistance MSE: ', res_bank_ed)
print('bank SimHash MSE: ', res_bank_sh)

print('normal EditDistance MSE: ', res_normal_ed)
print('normal SimHash MSE: ', res_normal_sh)

bank EditDistance MSE:  0.3121040805733899
bank SimHash MSE:  0.25456280156941313
normal EditDistance MSE:  0.04814843628120935
normal SimHash MSE:  0.24849385616796132


In [None]:
df_normal.to_excel(dir_path+'normal_QC.xlsx', index=False)
df_bank.to_excel(dir_path+'bank_QC.xlsx', index=False)