In [1]:
import sys
sys.path.append('../')

In [2]:
import os
import pandas as pd

from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

from textpair.single.paddle_bow import PaddleBowTextU, PaddleBowSim
from textpair.single.ann import Ann

In [3]:
# Location and delimiter of the ATEC_CCKS train/dev/test splits.
SEP = '\t'
ATEC_CCKS_PATH = '../../bert_fine_tune/fine_tune/data/train_dev_test/ATEC_CCKS/processed/'
TRAIN_CSV, DEV_CSV, TEST_CSV = (
    os.path.join(ATEC_CCKS_PATH, split_file)
    for split_file in ('train.csv', 'dev.csv', 'test.csv')
)

# Vocabulary file and model directory passed to PaddleBowSim further down.
VOCAB_FILE = '../data/paddle_models/sim_net/data/term2id.dict'
MODEL_PATH = '../data/paddle_models/sim_net/model_files/simnet_bow_pairwise_pretrained_model/'

In [4]:
# Read the three splits with identical options. na_filter=False turns off
# missing-value detection, so empty fields stay as strings rather than NaN.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path, sep=SEP, na_filter=False)
    for csv_path in (TRAIN_CSV, DEV_CSV, TEST_CSV)
)

In [5]:
# Stack train, dev and test row-wise into one frame with a fresh 0..n-1 index,
# then preview the first rows.
all_df = pd.concat((train_df, dev_df, test_df), ignore_index=True)
all_df.head(5)

Unnamed: 0,text_1,text_2,label
0,蚂蚁借呗还可以分期还款吗,借呗可以分期还款吗每个月还一部分的那种,1
1,延期1天还款,27号是还款日28号还这样是逾期吗,0
2,我花呗提现额度怎么开不了,为什么花呗不你提出来,0
3,未满足银行要求,怎样满足微重银行审批,0
4,商铺可以开通花呗分期吗,我再办一张银行卡可以开通花呗吗,0


In [6]:
# Sanity check: (rows, columns) of the combined train + dev + test frame.
all_df.shape

(202477, 3)

In [7]:
# Build the PaddleBowSim instance once from the configured model directory and
# vocabulary file; compute_cos below calls it as a module-level global.
sim = PaddleBowSim(MODEL_PATH, VOCAB_FILE)

def compute_cos(text1, text2, placeholder='你好'):
    """Return the cosine similarity and cosine distance between two texts.

    Each text is wrapped in an ``Ann`` and passed to the module-level ``sim``
    callable as the left input, paired with ``placeholder`` as the right
    input. Only the ``'left_vec'`` entry of each result is compared.

    Args:
        text1: First text to compare.
        text2: Second text to compare.
        placeholder: Text used as the right-hand input of both ``sim`` calls.
            Defaults to ``'你好'``, the value that used to be hard-coded, so
            existing two-argument calls behave as before.

    Returns:
        tuple: ``(cos_sim, cos_dis)``, each taken from element ``[0, 0]`` of
        ``cosine_similarity`` / ``cosine_distances`` on the two left vectors.
    """
    ann1 = Ann(text1)
    ann2 = Ann(text2)
    # The right-hand result is never read here; the placeholder only fills the
    # second argument of sim.
    # NOTE(review): presumably 'left_vec' does not depend on the right input -
    # confirm against PaddleBowSim.
    ann_placeholder = Ann(placeholder)
    res1 = sim(ann1, ann_placeholder)
    res2 = sim(ann2, ann_placeholder)
    # assumes each 'left_vec' is a 2-D array with a single row - TODO confirm
    vec1 = res1['left_vec']
    vec2 = res2['left_vec']
    cos_sim = cosine_similarity(vec1, vec2)[0, 0]
    cos_dis = cosine_distances(vec1, vec2)[0, 0]
    return cos_sim, cos_dis

In [8]:
# Score every sentence pair in all_df; each entry is [cos_sim, cos_dis].
cos_res = []
for pair in tqdm(all_df.itertuples(), total=all_df.shape[0]):
    cos_res.append(list(compute_cos(pair.text_1, pair.text_2)))

  0%|                                                                                           | 0/202477 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
2019-07-10 10:53:43,427-DEBUG: Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\aaronnli\AppData\Local\Temp\jieba.cache
2019-07-10 10:53:43,435-DEBUG: Loading model from cache C:\Users\aaronnli\AppData\Local\Temp\jieba.cache
Loading model cost 0.695 seconds.
2019-07-10 10:53:44,124-DEBUG: Loading model cost 0.695 seconds.
Prefix dict has been built succesfully.
2019-07-10 10:53:44,126-DEBUG: Prefix dict has been built succesfully.
100%|█████████████████████████████████████████████████████████████████████████████| 202477/202477 [09:23<00:00, 359.03it/s]


In [9]:
# One row per sentence pair, in the same order as all_df.
cos_res_df = pd.DataFrame(
    data=cos_res,
    columns=['cos_sim', 'cos_dis'],
)

In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) of both score columns.
cos_res_df.describe()

Unnamed: 0,cos_sim,cos_dis
count,202477.0,202477.0
mean,0.696581,0.303419
std,0.153228,0.153228
min,-0.355114,0.0
25%,0.612292,0.192848
50%,0.723668,0.276332
75%,0.807152,0.387708
max,1.0,1.355114


In [12]:
SAVE_PATH = 'test_data/atec_ccks_cos.csv'
# to_csv does not create missing directories; make sure the target folder
# exists so the long-running scores above are not lost to a write error.
save_dir = os.path.dirname(SAVE_PATH)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)
# Written with the same tab separator as the input splits, without the index.
cos_res_df.to_csv(SAVE_PATH, sep=SEP, index=False)