In [5]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
import torch

# SentenceTransformers BERT MiniLM (recommended)
class SentenceTransformers():
    """Sentence-pair similarity scorer backed by a pretrained transformer checkpoint.

    The checkpoint and its tokenizer are loaded from ``dir_path``. Sentences are
    embedded by mask-weighted mean pooling over the model's token embeddings, and
    a pair is scored with the cosine similarity of the two pooled vectors.
    """

    def __init__(self, dir_path='embed/paraphrase-multilingual-MiniLM-L12-v2/'):
        """Load the model and tokenizer stored under ``dir_path``.

        Args:
            dir_path: Location passed to ``AutoModel.from_pretrained`` and
                ``AutoTokenizer.from_pretrained``.
        """
        self.dir_path = dir_path
        self.model = AutoModel.from_pretrained(dir_path)
        self.tokenizer = AutoTokenizer.from_pretrained(dir_path)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings of each sequence, ignoring masked positions.

        Args:
            model_output: Model output whose first element holds the token embeddings.
            attention_mask: Tensor marking real tokens with 1 and padding with 0;
                it is broadcast to the shape of the token embeddings.

        Returns:
            Tensor of pooled embeddings, reduced over dimension 1 of the token embeddings.
        """
        token_embeddings = model_output[0]  # first element contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        masked_sum = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so a fully masked sequence does not divide by zero.
        token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return masked_sum / token_counts

    def get_embed(self, sentence1, sentence2):
        """Embed two sentences in a single forward pass.

        Args:
            sentence1: First sentence.
            sentence2: Second sentence.

        Returns:
            Tuple ``(embedding1, embedding2)`` with the pooled embedding of each sentence.
        """
        sentences = [sentence1, sentence2]
        # Tokenize both sentences as one padded batch.
        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

        # Compute token embeddings without tracking gradients.
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Perform pooling. In this case, mean pooling weighted by the attention mask.
        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])

        return sentence_embeddings[0], sentence_embeddings[1]

    def cal_cosine_similarity(self, embed1, embed2):
        """Cosine similarity between two embedding vectors.

        Args:
            embed1: First vector (anything ``np.array`` accepts).
            embed2: Second vector (anything ``np.array`` accepts).

        Returns:
            ``embed1 . embed2 / (|embed1| * |embed2|)``, or ``0`` when either
            input is empty or has zero norm (the cosine is undefined there).
        """
        embed1 = np.array(embed1)
        embed2 = np.array(embed2)
        if embed1.size == 0 or embed2.size == 0:
            return 0

        norm_product = np.linalg.norm(embed1) * np.linalg.norm(embed2)
        # A zero vector would otherwise yield nan (0/0) and a RuntimeWarning.
        if norm_product == 0:
            return 0

        return np.matmul(embed1, embed2) / norm_product

    def sentence_similarity(self, text1, text2):
        """Embed ``text1`` and ``text2`` and return their cosine similarity."""
        embed1, embed2 = self.get_embed(text1, text2)
        return self.cal_cosine_similarity(embed1, embed2)


# Quick check of the scorer on two sentences that differ by a single character.
text1, text2 = '我是个好人', '我是个坏人'

# `st` is reused by later cells to score the spreadsheet rows.
st = SentenceTransformers()
res = st.sentence_similarity(text1, text2)

print('cosine similarity score:', res)

cosine similarity score: 0.5259402


In [7]:
import pandas as pd
import numpy as np

# Directory holding the QC workbooks; later cells reuse `dir_path` when saving.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
dir_path = '/home/jasoncheung/project/work/alg-coachingbot/datas/'

# Load the two evaluation sets (normal first, then bank).
df_normal, df_bank = (
    pd.read_excel(dir_path + workbook)
    for workbook in ('normal_QC.xlsx', 'bank_QC.xlsx')
)

In [105]:
# Score every (text1, text2) row with the sentence-embedding model `st`,
# then compare the scores against the reference `score` column via MSE.
def _st_scores(frame):
    """Return the `st.sentence_similarity` score for each (text1, text2) row of `frame`."""
    return [
        st.sentence_similarity(t1, t2)
        for t1, t2 in zip(frame.text1.tolist(), frame.text2.tolist())
    ]


normal_score_st = _st_scores(df_normal)
df_normal['st_score'] = normal_score_st

bank_score_st = _st_scores(df_bank)
df_bank['st_score'] = bank_score_st

# calculate MSE
from sklearn.metrics import mean_squared_error

res_bank_st = mean_squared_error(df_bank.score.tolist(), df_bank.st_score.tolist())

res_normal_st = mean_squared_error(df_normal.score.tolist(), df_normal.st_score.tolist())

# Labels name the sentence-transformer metric (they previously said "w2v").
print('bank st cos avg embed MSE: ', res_bank_st)

print('normal st cos avg embed MSE: ', res_normal_st)

NameError: name 'normal_score_w2v' is not defined

In [118]:
# Write both frames back to the workbooks under `dir_path`, without the index column.
# Note: this overwrites the files the frames were read from.
for _frame, _workbook in ((df_normal, 'normal_QC.xlsx'), (df_bank, 'bank_QC.xlsx')):
    _frame.to_excel(dir_path + _workbook, index=False)

In [117]:
# MSE of each similarity method's score column against the reference `score` column.
from sklearn.metrics import mean_squared_error


def _mse_vs_reference(frame, column):
    """Mean squared error between `frame['score']` and `frame[column]`."""
    return mean_squared_error(frame['score'].tolist(), frame[column].tolist())


# Bank data set.
res_bank_ed = _mse_vs_reference(df_bank, 'ed_score')
res_bank_sh = _mse_vs_reference(df_bank, 'simhash_score')
res_bank_st = _mse_vs_reference(df_bank, 'st_score')
res_bank_w2v = _mse_vs_reference(df_bank, 'w2v_score')

# Normal data set.
res_normal_ed = _mse_vs_reference(df_normal, 'ed_score')
res_normal_sh = _mse_vs_reference(df_normal, 'simhash_score')
res_normal_w2v = _mse_vs_reference(df_normal, 'w2v_score')
res_normal_st = _mse_vs_reference(df_normal, 'st_score')

# Report: bank block first, blank line, then normal block.
print('bank EditDistance MSE: ', res_bank_ed)
print('bank SimHash MSE: ', res_bank_sh)
print('bank w2v cos avg MSE: ', res_bank_w2v)
print('bank st cos avg embed MSE: ', res_bank_st, '\n')

print('normal EditDistance MSE: ', res_normal_ed)
print('normal SimHash MSE: ', res_normal_sh)
print('normal w2v cos avg embed MSE: ', res_normal_w2v)
print('normal st cos avg embed MSE: ', res_normal_st)

bank EditDistance MSE:  0.3121040805733899
bank SimHash MSE:  0.25456280156941313
bank w2v cos avg MSE:  0.33497529899181905
bank st cos avg embed MSE:  0.21352857993357788 

normal EditDistance MSE:  0.048148436281209364
normal SimHash MSE:  0.24849385616796132
normal w2v cos avg embed MSE:  0.43082670450919386
normal st cos avg embed MSE:  0.02366014663656099


In [9]:
from sklearn.metrics import average_precision_score
def one2two(labels):
    """Expand each value ``v`` in ``labels`` into the pair ``[1 - v, v]``.

    Args:
        labels: Iterable of numeric values.

    Returns:
        List of two-element lists, one per input value, in input order.
    """
    return [[1 - value, value] for value in labels]

def compute_mAP(labels,outputs):
    """Mean of per-sample average-precision scores.

    Each label and each output is expanded to a two-element vector
    ``[1 - v, v]`` via ``one2two``; ``average_precision_score`` is then
    evaluated once per sample on that pair of vectors, and the per-sample
    results are averaged.

    Args:
        labels: Sequence of reference values.
        outputs: Sequence of predicted scores, aligned with ``labels``.

    Returns:
        The mean of the per-sample average-precision values.
    """
    # NOTE(review): presumably labels are binary (0/1) — confirm against the data.
    targets = np.array(one2two(labels))
    scores = np.array(one2two(outputs))

    per_sample_ap = [
        average_precision_score(targets[row], scores[row])
        for row in range(len(targets))
    ]
    return np.mean(per_sample_ap)

# Reference scores for the bank (b_) and normal (n_) data sets.
b_true, n_true = df_bank.score.tolist(), df_normal.score.tolist()

# Predicted scores per method: edit distance, SimHash, w2v cosine, sentence-embedding cosine.
b_ed_pred, n_ed_pred = df_bank.ed_score.tolist(), df_normal.ed_score.tolist()
b_sh_pred, n_sh_pred = df_bank.simhash_score.tolist(), df_normal.simhash_score.tolist()
b_w2v_pred, n_w2v_pred = df_bank.w2v_score.tolist(), df_normal.w2v_score.tolist()
b_st_pred, n_st_pred = df_bank.st_score.tolist(), df_normal.st_score.tolist()

# mAP of every method on the bank data set.
res_b_ed, res_b_sh, res_b_w2v, res_b_st = (
    compute_mAP(b_true, predicted)
    for predicted in (b_ed_pred, b_sh_pred, b_w2v_pred, b_st_pred)
)

for _template, _value in (
    ('MAP-EditDistance bank: %.4f', res_b_ed),
    ('MAP-SimHash bank: %.4f', res_b_sh),
    ('MAP-w2v Cos bank: %.4f', res_b_w2v),
    ('MAP-MiniLM SentenceBert Cos bank: %.4f', res_b_st),
):
    print(_template % _value)

MAP-EditDistance bank: 0.7675
MAP-SimHash bank: 0.7572
MAP-w2v Cos bank: 0.7548
MAP-MiniLM SentenceBert Cos bank: 0.8265
