In [1]:
import jiagu 
import jieba 
import pkuseg 
import genius 
import jieba.posseg as jieb_seg
from sklearn.metrics.pairwise import euclidean_distances as distance

import pandas as pd 
import sys 

import json
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.nlp.v20190408 import nlp_client, models

In [2]:
import warnings
# Silence every warning (all categories, all modules) for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the NLP libraries — confirm that is intended.
warnings.filterwarnings('ignore')

In [3]:
class word_obj:
    """A segmented token plus the outcome of a part-of-speech vote.

    Attributes:
        word: the token text.
        is_n: True when at least two of the supplied tags equal ``'n'``.
        is_cn: True when no supplied tag equals ``'eng'`` and the token
            contains no digit character.
    """

    def __init__(self, word, *seg_results):
        self.word = word

        # majority vote: two or more taggers must label the token 'n'
        noun_votes = sum(1 for tag in seg_results if tag == 'n')
        self.is_n = noun_votes >= 2

        # the token is rejected as soon as one tagger says 'eng' or it holds a digit
        tagged_english = any(tag == 'eng' for tag in seg_results)
        self.is_cn = not tagged_english and not any(ch.isdigit() for ch in word)

    def __repr__(self):
        # the object prints as the bare token
        return self.word

    __str__ = __repr__

    
class vector_obj:
    """Bundle an item's id and title with the vector computed for that title.

    Attributes:
        id: identifier passed in as ``id_``.
        title: the item title.
        vector: the vector associated with the title.
    """

    def __init__(self, id_, title, vector):
        self.id, self.title, self.vector = id_, title, vector

    def __repr__(self):
        # "<id> <title>"; the vector itself is left out of the text form
        return f'{self.id} {self.title}'

    __str__ = __repr__


def get_tencent_client():
    """Build an ``NlpClient`` from the credentials stored in ``api_keys.txt``.

    The first line of the file supplies the API id and the second line the
    API secret. On each line, everything after the first ``:`` is used, with
    surrounding whitespace stripped; a line without ``:`` yields an empty
    string.

    Returns:
        An ``nlp_client.NlpClient`` for region ``ap-guangzhou`` that talks to
        the ``nlp.tencentcloudapi.com`` endpoint.
    """
    with open('api_keys.txt', 'r') as key_file:
        id_line = key_file.readline()
        secret_line = key_file.readline()

    # keep only the value part of each "label:value" line
    api_id = id_line.partition(':')[2].strip()
    api_secret = secret_line.partition(':')[2].strip()
    credentials = credential.Credential(api_id, api_secret)

    http_profile = HttpProfile()
    http_profile.endpoint = "nlp.tencentcloudapi.com"

    client_profile = ClientProfile()
    client_profile.httpProfile = http_profile

    return nlp_client.NlpClient(credentials, "ap-guangzhou", client_profile)

In [4]:
# Tencent NLP client and a single request object reused for every embedding lookup
client = get_tencent_client()
word_request = models.WordEmbeddingRequest()

# rows to index; the indexing loop reads the first two columns as id and title
items = pd.read_csv('items.csv')
total_rows = len(items.index)

# accumulators shared by the indexing loop and by later queries
title_vectors = list()     # one vector_obj per row of ``items``
all_word_vector = dict()   # word -> embedding already fetched
does_not_exist = set()     # entries recorded after a failed embedding lookup

# pkuseg segmenter with part-of-speech tagging switched on
pku_seg = pkuseg.pkuseg(postag=True)

In [5]:
def generate_avg_vector(query, does_not_exist, all_word_vector):
    """Average the word embeddings of the Chinese nouns found in ``query``.

    The query is segmented with ``jieba.lcut_for_search``. Every token is then
    tagged by jieba, jiagu and pkuseg, and only tokens for which ``word_obj``
    reports both ``is_n`` and ``is_cn`` are kept. Each kept token is resolved
    to an embedding (from ``all_word_vector`` when cached, otherwise through
    the Tencent ``WordEmbedding`` API) and the embeddings are averaged
    component-wise.

    Args:
        query: text to embed.
        does_not_exist: set of words whose lookup raised a
            ``TencentCloudSDKException``; such words are skipped without a
            request. Updated in place.
        all_word_vector: dict mapping word -> embedding. Updated in place with
            every embedding fetched during this call.

    Returns:
        list of floats rounded to 5 decimals, or ``[]`` when no token yields
        an embedding.
    """
    # ``client`` is rebound below when it has to be rebuilt after an unexpected failure.
    global client

    segment_list = [token for token in jieba.lcut_for_search(query) if token != ' ']

    # get part of speech from three taggers; word_obj votes for noun and chinese
    temp_result = []
    for word in segment_list:
        jieba_pos = jieb_seg.lcut(word)
        jiagu_pos = jiagu.pos([word])
        pku_pos = pku_seg.cut(word)
        temp_result.append(word_obj(word, jieba_pos[0].flag, jiagu_pos[0], pku_pos[0][1]))
    result_cleaned = [item for item in temp_result if item.is_n and item.is_cn]

    # fetch vector for each word, then take avg
    vector_list = []
    for word_ins in result_cleaned:
        current_word = word_ins.word
        if current_word in does_not_exist:
            continue
        if current_word in all_word_vector:
            vector_list.append(all_word_vector[current_word])
            continue

        word_vector = None
        # Two attempts at most: the second one runs only after the client was rebuilt.
        for _attempt in range(2):
            try:
                params = {'Text': current_word}
                word_request.from_json_string(json.dumps(params))
                current_response = client.WordEmbedding(word_request)
                word_vector = json.loads(current_response.to_json_string())['Vector']
                break
            except TencentCloudSDKException as error:
                print(f'\rreached exception with inner {error}')
                # Record the *word* (the key tested at the top of the loop) so the
                # failing lookup is not repeated for later titles or queries.
                does_not_exist.add(current_word)
                break
            except Exception as error:
                # Anything else is treated as a broken client: rebuild it and retry once.
                print(f'\rrebuilding client after unexpected error: {error!r}')
                client = get_tencent_client()

        if word_vector is None:
            # No embedding was obtained; never cache or average a missing/stale vector.
            continue
        all_word_vector[current_word] = word_vector
        vector_list.append(word_vector)

    # component-wise mean; zip(*[]) is empty, so no vectors -> []
    query_vector = [round(sum(component) / len(component), 5) for component in zip(*vector_list)]
    return query_vector

In [None]:
# Build the title index: one averaged embedding per row of ``items``.
# Pad the progress counter to the number of digits in ``total_rows``; using
# ``total_rows`` itself as the field width pads every line to that many columns.
index_width = len(str(total_rows))
for current_index, series in items.iterrows():
    # the first two columns of each row are taken as id and title, the rest is ignored
    id_, title, *_ = series
    print(f'\r{current_index:>{index_width}}/{total_rows}, segmenting {title}', end='  '*50, flush=True)
    vector = generate_avg_vector(title, does_not_exist, all_word_vector)
    title_vectors.append(vector_obj(id_, title, vector))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.373 seconds.
Prefix dict has been built successfully.


reached exception with inner [TencentCloudSDKException] code:FailedOperation.WordNotFound message:查找不到【层书椅】的词向量 requestId:951e282c-2e8e-4365-a7eb-debf762ce364                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [None]:
def search_title(query):
    """Return the indexed titles closest to ``query`` in embedding space.

    The query is embedded with ``generate_avg_vector`` (sharing the
    notebook-level ``does_not_exist`` and ``all_word_vector`` caches) and
    compared by Euclidean distance against every entry of ``title_vectors``.

    Args:
        query: search text.

    Returns:
        Up to 20 ``(title, distance)`` tuples sorted by ascending distance.
        Returns ``[]`` when the query produces no vector or when no indexed
        title has a vector of matching length.
    """
    # generate_avg_vector requires both caches; pass the notebook-level ones so
    # repeated queries reuse embeddings that were already fetched.
    vector = generate_avg_vector(query, does_not_exist, all_word_vector)
    if not vector:
        return []

    # Titles without any embeddable word carry an empty vector and cannot be compared.
    candidates = [entry for entry in title_vectors if len(entry.vector) == len(vector)]
    if not candidates:
        return []

    # One call yields a (1, n_titles) matrix: row 0 holds query-to-title distances.
    distances = distance([vector], [entry.vector for entry in candidates])[0]
    title_distance = [(entry.title, dist) for entry, dist in zip(candidates, distances)]
    title_distance.sort(key=lambda item: item[-1])

    return title_distance[:20]

# Default to string matching 
# take the average of all vectors, segment the query

## query time for each part, segment, POS, API, etc 