In [1]:
from sklearn.decomposition import TruncatedSVD
import numpy as np
from wordfreq import word_frequency
import jieba

import time
import json
from collections import OrderedDict

from pathlib import Path

In [2]:
def init_dict(dictfile='tc_min.dict'):
    # jieba 加载自定义词典
    jieba.load_userdict(dictfile)
    jieba.enable_paddle()
    
    # 加载词频数据并返回
    domain = int(2 ** 31 - 1)
    freq_dict = {}
    with open(dictfile, 'r',encoding='utf8') as f:
        for line in f:
            segs = line.split(' ')
            token = segs[0]
            freq = int(segs[1])
        freq_dict[token] = float(freq / domain)
    
    return freq_dict 

In [3]:
freq_dict = init_dict()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Xiaoi\AppData\Local\Temp\jieba.cache
Loading model cost 0.822 seconds.
Prefix dict has been built succesfully.


FileNotFoundError: [Errno 2] No such file or directory: 'tc_min.dict'

In [None]:
def get_word_frequency(word,freq_dict=freq_dict):
    
    if word in freq_dict:
        return freq_dict[word]
    else:
        return word_frequency(word, 'zh')

In [None]:
from annoy import AnnoyIndex

def init_index(annoy_indexfile = 'tc_index_build10.ann.index',word2indexfile='tc_word_index.json'):
    # 我们用保存好的索引文件重新创建一个Annoy索引, 单独进行加载
    annoy_index = AnnoyIndex(200)
    annoy_index.load(annoy_indexfile)
    
    with open(word2indexfile, 'r') as fp:
        word_index = json.load(fp)

    #准备一个反向id==>word映射词表
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    
    return annoy_index,word_index,reverse_word_index

In [None]:
annoy_index,word_index,reverse_word_index = init_index()

In [None]:
def AVG_embedding(line, embed_index=annoy_index,word2index=word_index,dim=200,pc=0):
    #start = time.time()
    
    word_list = [token for token in list(jieba.cut(line,use_paddle=True))
                 if token in word2index.keys()]
    
    #stop = time.time()

    #print("time for cut words = %.2f s" % (float(stop - start)))


    #start = time.time()
    sent_length = len(word_list)
    vs = np.zeros(dim)
    if not sent_length:
        return vs
    for token in word_list:
        vs += embed_index.get_item_vector(word2index[token])
    
    #stop = time.time()
    #print("time for calc avg vector = %.2f s" % (float(stop - start)))
    
    return vs / sent_length

In [None]:
AVG_embedding("喜欢打篮球的男生喜欢什么样的女生").shape

In [None]:
from numpy import array

def FREQ_embedding(line, embed_index=annoy_index,word2index=word_index,dim=200, a=1e-3,pc=0):
    #start = time.time()
    word_list = [token for token in list(jieba.cut(line,use_paddle=True))
                 if token in word2index.keys()]
    #stop = time.time()

    #print("time for cut words = %.2f s" % (float(stop - start)))
    
    sent_length = len(word_list)
    vs = np.zeros(dim)
    
    #start = time.time()
    
    if not sent_length:
        return vs
    for token in word_list:
        token_freq = get_word_frequency(token)         
        a_value = a / (a + token_freq)        
        vs += a_value * array(embed_index.get_item_vector(word2index[token]))
    
    #stop = time.time()
    #print("time for calc weighted vector = %.2f s" % (float(stop - start)))
    
    return vs / sent_length

In [None]:
FREQ_embedding("无线上网卡和无线路由器怎么用").shape


In [None]:
def SVD_Lines(lines):
    start = time.time()
    
    matrix = np.array([FREQ_embedding(line)
                       for line in lines])
    
    stop = time.time()

    print("time for calc matrix = %.2f s" % (float(stop - start)))
    
    start = time.time()
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(matrix)
    pc = svd.components_
    
    stop = time.time()

    print("time for calc pc = %.2f s" % (float(stop - start)))
    return pc

In [None]:
lines = ["新概念英语第二册练习册41课答案","无线上网卡和无线路由器怎么用","福州哪家装修公司好"]
pc = SVD_Lines(lines)
print(pc.shape)

In [None]:
def SIF_embedding(line,pc):
    embed = FREQ_embedding(line)
    return embed - embed @ pc.T @ pc

In [None]:
SIF_embedding("石家庄天气如何？",pc)

In [None]:
def load_sentence_data(file_path):
    sentences1 = []
    sentences2 = []
    labels = []
    with open(file_path, 'r',encoding='utf8') as f:
        for line in f:
            s1, s2, label = line.split('\t')
            if not label:
                continue
            labels.append(int(label))
            sentences1.append(s1)
            sentences2.append(s2)
    return sentences1, sentences2, labels

In [None]:
LCQMC = [
    'LCQMC/processed/train.tsv',
    'LCQMC/processed/dev.tsv',
    'LCQMC/processed/test.tsv',
    ]

CCKS = [
    'CCKS/processed/train.tsv',
    'CCKS/processed/dev.tsv',
    'CCKS/processed/test.tsv',
    ]

ATEC = [
    'ATEC/processed/train.tsv',
    'ATEC/processed/dev.tsv',
    'ATEC/processed/test.tsv',
    ]

CORPUS = [
    (LCQMC,'LCQMC'),
    (CCKS, 'CCKS'),
    (ATEC, 'ATEC')    
    ]

In [None]:
def texts_to_embeddings(texts, embedding_method, pc):
    embedding_list = []
    for text in texts:
        embedding = embedding_method(text,pc)
        embedding_list.append(embedding)
    return np.array(embedding_list)
    

In [None]:
def build_dataset(dataset,embedding_method,pc):
    s1,s2,labels = load_sentence_data(dataset)        
    left_X = texts_to_embeddings(s1, embedding_method,pc)
    right_X = texts_to_embeddings(s2, embedding_method,pc)
    Y = np.array(labels)
    
    return [left_X, right_X], Y
    

In [None]:
def build_SVD(train_file,dev_file):
    sentences = []

    s1,s2,labels = load_sentence_data(train_file)

    sentences.extend(s1)
    sentences.extend(s2)

    s1,s2,labels = load_sentence_data(dev_file)

    sentences.extend(s1)
    sentences.extend(s2)

    pc = SVD_Lines(sentences)
    
    return pc

In [None]:
from keras import Input, Model
from keras import backend as K
from keras.layers import Lambda, Dense
from keras.layers import concatenate, multiply, Dense


In [None]:
def build_model(dim_size=200,dense_unit=100):
    u_input = Input(shape=(dim_size,), dtype='float32', name="left_x")
    v_input = Input(shape=(dim_size,), dtype='float32', name='right_x')
    
    u_sub_v = Lambda(lambda x: K.abs(x[0] - x[1]))([u_input, v_input])
    u_mul_v = multiply([u_input, v_input])
    
    u_concat_v = concatenate([u_input, v_input, u_sub_v, u_mul_v])
    
    dense = Dense(dense_unit, activation='relu')(u_concat_v)
    similarity = Dense(1, activation='sigmoid')(dense)
    
    model = Model([u_input, v_input], similarity)
    #print(model.summary())
    return model

In [None]:
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report

In [None]:
def train_model(model, train_x, train_y, dev_x, dev_y,checkpointpath,lr=1e-3,batch_size=128,epochs=32):
    adam = Adam(lr=lr)
    model.compile(loss='binary_crossentropy',
                       optimizer=adam,
                       metrics=['accuracy'])    
    checkpoint_callback = ModelCheckpoint(checkpointpath,
                                          monitor='val_acc',
                                          verbose=0,
                                          save_best_only=True,
                                          save_weights_only=True,
                                          mode='auto',
                                          period=1)
    model.fit(train_x, train_y,
               batch_size=batch_size,
               epochs=epochs,
               validation_data=(dev_x, dev_y),
               shuffle=True,
               verbose=0,
               callbacks=[checkpoint_callback])


In [None]:
def model_predict(model, test_x, test_y=None,predict_batchsize=128):
    
    predict_y = model.predict(test_x, batch_size=predict_batchsize)[:, 0]
    if test_y.any():
        predict_y[predict_y >= 0.5] = 1
        predict_y[predict_y < 0.5] = 0
        print(classification_report(test_y, predict_y))


In [None]:
from pathlib import Path

def evaluate(dataset,embedding_method,checkpoint):    
    dataset_dir = Path('../datasets/sentence-similarity-zoo-master/data')
    train_file = dataset_dir / dataset[0]
    dev_file = dataset_dir / dataset[1]
    test_file = dataset_dir / dataset[2]
    
    pc = build_SVD(train_file,dev_file)
    
    train_x, train_y = build_dataset(train_file,SIF_embedding,pc)
    dev_x, dev_y = build_dataset(dev_file,SIF_embedding,pc)
    test_x, test_y = build_dataset(test_file,SIF_embedding,pc)
    
    model = build_model()
    
    train_model(model, train_x, train_y,dev_x, dev_y,checkpointpath=checkpoint )
    
    model.load_weights(checkpoint)
    
    model_predict(model,test_x,test_y)

In [None]:

for embedding_method, embedding_name in [
    (AVG_embedding,'average weighted'),
    (FREQ_embedding,'freq weighted'),
    (SIF_embedding,'freq weighted + SVD')
    ]:
    for data in CORPUS:
        print('------Embedding Method:{0}, DataSet:{1}---------'.format(embedding_name,data[1]))
        checkpoint = embedding_name + "_" + data[1] + "_best.h5"
        evaluate(data[0], embedding_method,checkpoint)
        
    
    

------Embedding Method:average weighted, DataSet:LCQMC---------
time for calc matrix = 200.30 s
time for calc pc = 8.87 s


              precision    recall  f1-score   support

           0       0.80      0.70      0.74      6250
           1       0.73      0.83      0.78      6250

   micro avg       0.76      0.76      0.76     12500
   macro avg       0.77      0.76      0.76     12500
weighted avg       0.77      0.76      0.76     12500

------Embedding Method:average weighted, DataSet:CCKS---------
time for calc matrix = 95.08 s
time for calc pc = 3.21 s
              precision    recall  f1-score   support

           0       0.84      0.78      0.81      5001
           1       0.79      0.85      0.82      4999

   micro avg       0.82      0.82      0.82     10000
   macro avg       0.82      0.82      0.81     10000
weighted avg       0.82      0.82      0.81     10000

------Embedding Method:average weighted, DataSet:ATEC---------
time for calc matrix = 110.12 s
time for calc pc = 3.74 s
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     13675
           1       0.69      0.60      0.64      2886

   micro avg       0.88      0.88      0.88     16561
   macro avg       0.80      0.77      0.79     16561
weighted avg       0.88      0.88      0.88     16561

------Embedding Method:freq weighted, DataSet:LCQMC---------
time for calc matrix = 205.51 s
time for calc pc = 8.45 s
              precision    recall  f1-score   support

           0       0.67      0.79      0.73      6250
           1       0.75      0.61      0.67      6250

   micro avg       0.70      0.70      0.70     12500
   macro avg       0.71      0.70      0.70     12500
weighted avg       0.71      0.70      0.70     12500

------Embedding Method:freq weighted, DataSet:CCKS---------
time for calc matrix = 81.72 s
time for calc pc = 2.17 s
              precision    recall  f1-score   support

           0       0.80      0.83      0.81      5001
           1       0.82      0.79      0.81      4999

   micro avg       0.81      0.81      0.81     10000
   macro avg       0.81      0.81      0.81     10000
weighted avg       0.81      0.81      0.81     10000

------Embedding Method:freq weighted, DataSet:ATEC---------
time for calc matrix = 171.28 s
time for calc pc = 4.69 s
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     13675
           1       0.72      0.57      0.64      2886

   micro avg       0.89      0.89      0.89     16561
   macro avg       0.82      0.76      0.79     16561
weighted avg       0.88      0.89      0.88     16561

------Embedding Method:freq weighted + SVD, DataSet:LCQMC---------
time for calc matrix = 195.74 s
time for calc pc = 5.52 s
              precision    recall  f1-score   support

           0       0.82      0.63      0.71      6250
           1       0.70      0.86      0.77      6250

   micro avg       0.75      0.75      0.75     12500
   macro avg       0.76      0.75      0.74     12500
weighted avg       0.76      0.75      0.74     12500

------Embedding Method:freq weighted + SVD, DataSet:CCKS---------
time for calc matrix = 92.64 s
time for calc pc = 4.27 s
              precision    recall  f1-score   support

           0       0.81      0.82      0.82      5001
           1       0.82      0.81      0.81      4999

   micro avg       0.82      0.82      0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000

------Embedding Method:freq weighted + SVD, DataSet:ATEC---------
time for calc matrix = 104.21 s
time for calc pc = 6.26 s
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     13675
           1       0.71      0.56      0.63      2886

   micro avg       0.88      0.88      0.88     16561
   macro avg       0.81      0.76      0.78     16561
weighted avg       0.88      0.88      0.88     16561

------Embedding Method:average weighted, DataSet:LCQMC---------
time for calc matrix = 29267.18 s
time for calc pc = 5.09 s


              precision    recall  f1-score   support

           0       0.84      0.67      0.75      6250
           1       0.73      0.88      0.79      6250

   micro avg       0.77      0.77      0.77     12500
   macro avg       0.79      0.77      0.77     12500
weighted avg       0.79      0.77      0.77     12500

------Embedding Method:average weighted, DataSet:CCKS---------
time for calc matrix = 1448.29 s
time for calc pc = 2.63 s
              precision    recall  f1-score   support

           0       0.81      0.84      0.82      5001
           1       0.83      0.80      0.82      4999

   micro avg       0.82      0.82      0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000

------Embedding Method:average weighted, DataSet:ATEC---------
time for calc matrix = 2378.26 s
time for calc pc = 3.51 s
              precision    recall  f1-score   support

           0       0.90      0.94      0.92     13675
           1       0.65      0.50      0.57      2886

   micro avg       0.87      0.87      0.87     16561
   macro avg       0.77      0.72      0.74     16561
weighted avg       0.86      0.87      0.86     16561

------Embedding Method:freq weighted, DataSet:LCQMC---------
time for calc matrix = 3045.52 s
time for calc pc = 5.61 s
              precision    recall  f1-score   support

           0       0.86      0.65      0.74      6250
           1       0.72      0.89      0.80      6250

   micro avg       0.77      0.77      0.77     12500
   macro avg       0.79      0.77      0.77     12500
weighted avg       0.79      0.77      0.77     12500

------Embedding Method:freq weighted, DataSet:CCKS---------
time for calc matrix = 1255.87 s
time for calc pc = 2.08 s
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      5001
           1       0.82      0.84      0.83      4999

   micro avg       0.82      0.82      0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000

------Embedding Method:freq weighted, DataSet:ATEC---------
time for calc matrix = 2198.74 s
time for calc pc = 3.38 s
              precision    recall  f1-score   support

           0       0.90      0.94      0.92     13675
           1       0.65      0.49      0.56      2886

   micro avg       0.86      0.86      0.86     16561
   macro avg       0.77      0.72      0.74     16561
weighted avg       0.85      0.86      0.86     16561

------Embedding Method:freq weighted + SVD, DataSet:LCQMC---------
time for calc matrix = 3068.15 s
time for calc pc = 5.74 s
              precision    recall  f1-score   support

           0       0.83      0.68      0.75      6250
           1       0.73      0.86      0.79      6250

   micro avg       0.77      0.77      0.77     12500
   macro avg       0.78      0.77      0.77     12500
weighted avg       0.78      0.77      0.77     12500

------Embedding Method:freq weighted + SVD, DataSet:CCKS---------
time for calc matrix = 1233.87 s
time for calc pc = 2.07 s
              precision    recall  f1-score   support

           0       0.81      0.84      0.82      5001
           1       0.83      0.81      0.82      4999

   micro avg       0.82      0.82      0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000

------Embedding Method:freq weighted + SVD, DataSet:ATEC---------
time for calc matrix = 2236.74 s
time for calc pc = 6.83 s
              precision    recall  f1-score   support

           0       0.89      0.96      0.92     13675
           1       0.68      0.45      0.54      2886

   micro avg       0.87      0.87      0.87     16561
   macro avg       0.78      0.70      0.73     16561
weighted avg       0.85      0.87      0.86     16561