In [None]:
import jiagu 
import jieba 
import pkuseg 
import genius 
import jieba.posseg as jieb_seg
import pandas as pd 
from sklearn.metrics.pairwise import euclidean_distances as distance

import json
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.nlp.v20190408 import nlp_client, models

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
class word_obj:
    """A token plus two flags derived from the part-of-speech tags given for it.

    Attributes (all set once by the constructor):
        word:  the token exactly as passed in.
        is_n:  True when at least one tag in ``seg_results`` equals ``'n'``.
        is_cn: True when no tag equals ``'eng'`` and every character of
               ``word`` is alphabetic and not a digit.
    """

    def __init__(self, word, *seg_results):
        self.word = word

        # Stop at the first 'n' tag; the remaining tags are irrelevant.
        noun_tag_seen = False
        for tag in seg_results:
            if tag == 'n':
                noun_tag_seen = True
                break
        self.is_n = noun_tag_seen

        # The characters are only inspected when no tagger reported 'eng'.
        if any(tag == 'eng' for tag in seg_results):
            self.is_cn = False
        else:
            self.is_cn = not any(
                char.isdigit() or not char.isalpha() for char in word
            )

    def __repr__(self):
        """Show the object as the bare token text."""
        return self.word

    __str__ = __repr__


class vector_obj:
    """Plain record that keeps an identifier, a title and a vector together.

    Attributes:
        id:     the value passed as ``id_``.
        title:  the value passed as ``title``.
        vector: the value passed as ``vector``.
    """

    def __init__(self, id_, title, vector):
        self.id, self.title, self.vector = id_, title, vector

    def __repr__(self):
        """Render as ``"<id> <title>"``; the vector is left out."""
        return f'{self.id} {self.title}'

    __str__ = __repr__


def flatten_list(input_list):
    """Concatenate the sub-sequences of ``input_list`` into a single sequence.

    Args:
        input_list: a sequence of sequences that support ``+`` with each
            other (lists of lists in this notebook).

    Returns:
        ``[]`` for an empty ``input_list``; the sole element itself (not a
        copy) when there is exactly one; otherwise the elements joined with
        ``+`` in order.

    The join is iterative, so the number of sub-sequences is not limited by
    the interpreter's recursion depth.
    """
    if len(input_list) == 0:
        return []

    flattened = input_list[0]
    for chunk in input_list[1:]:
        # `+` (not extend) keeps the result type of the elements themselves.
        flattened = flattened + chunk
    return flattened


def get_tencent_client(key_path='api_keys.txt'):
    """Build a Tencent Cloud NLP client from credentials stored in a text file.

    The first two lines of the file are read; on each, everything after the
    first ``:`` (with surrounding whitespace and the line break removed) is
    used as the value. The first line supplies the id passed to
    ``credential.Credential`` and the second line supplies the secret.

    Args:
        key_path: path of the credentials file. Defaults to ``api_keys.txt``
            in the working directory.

    Returns:
        An ``nlp_client.NlpClient`` for region ``ap-guangzhou`` using the
        endpoint ``nlp.tencentcloudapi.com``.

    Raises:
        OSError: the credentials file cannot be opened.
        ValueError: either line has no value after the ``:``.
    """
    with open(key_path, 'r') as filein:
        # partition() yields (head, separator, tail); the value is the tail.
        api_id = filein.readline().partition(':')[2].strip()
        api_secret = filein.readline().partition(':')[2].strip()

    if not api_id or not api_secret:
        raise ValueError(
            f'{key_path} must hold two "<label>:<value>" lines (id, then secret)'
        )

    cred = credential.Credential(api_id, api_secret)

    httpProfile = HttpProfile()
    httpProfile.endpoint = "nlp.tencentcloudapi.com"

    clientProfile = ClientProfile()
    clientProfile.httpProfile = httpProfile

    return nlp_client.NlpClient(cred, "ap-guangzhou", clientProfile)

In [None]:
# Listing table; later cells read its 'Used_title' column and iterate its rows.
items = pd.read_csv('items.csv')

# Spot-check sample of titles. The seed makes the draw repeatable across
# kernel restarts, and the size is capped so a table with fewer rows than
# SAMPLE_SIZE does not make DataFrame.sample raise ValueError.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42
item_names = (
    items
    .sample(n=min(SAMPLE_SIZE, len(items)), random_state=SAMPLE_SEED)['Used_title']
    .tolist()
)

In [None]:
# Hand-written listing titles for ad-hoc experiments: the first mixes Chinese
# with Latin letters, digits and '+'; the second mixes Chinese with English.
test_case1 = "三星note10+256G手机"
test_case2 = "只用过不到三天的switch lite"

In [None]:
# One shared API client plus one reusable request object per endpoint; the
# request objects are refilled with from_json_string before every call.
client = get_tencent_client()
word_request, sentence_request = (
    models.WordEmbeddingRequest(),
    models.SentenceEmbeddingRequest(),
)

In [None]:
# Sentence embedding of every listing title, accumulated in `result`.
#
# `result` is kept when it already exists in the kernel, so re-running this
# cell after a failed request continues the job instead of restarting it.
try:
    result
except NameError:
    result = []

total_rows = items.shape[0]
# Pad the progress counter to the digit count of the total, not to the total
# itself (which would emit thousands of spaces per update).
progress_width = len(str(total_rows))

# Rows already present in `result` are skipped on a re-run. This replaces a
# hard-coded row number, which silently dropped every earlier row whenever
# the kernel (and therefore `result`) was fresh.
already_embedded = {(entry.id, entry.title) for entry in result}

for current_index, series in items.iterrows():
    # Positional unpack: the first two columns are taken as id and title.
    id_, title, *_ = series
    print(f'\r{current_index:>{progress_width}}/{total_rows}', end='', flush=True)

    # A missing title cannot be embedded; json.dumps would emit a bare NaN.
    if pd.isna(title) or (id_, title) in already_embedded:
        continue

    sentence_request.from_json_string(json.dumps({'Text': title}))
    response = client.SentenceEmbedding(sentence_request)
    vector = json.loads(response.to_json_string())['Vector']
    result.append(vector_obj(id_, title, vector))
    already_embedded.add((id_, title))

In [None]:
import time
from itertools import count

# Word-level pass: split every title into tokens, keep the tokens flagged as
# noun (is_n) and non-English alphabetic (is_cn) by word_obj, and fetch one
# word embedding per distinct kept token.
#
# NOTE(review): the word vectors are appended to the same `result` list that
# the sentence-embedding cell fills and that search_title scans - confirm the
# two kinds of vectors are meant to share one list.

MAX_ATTEMPTS = 3            # API tries per word before it is given up on
RETRY_DELAY_SECONDS = 1.0   # pause between tries

id_counter = count(1)

all_words = {}      # word text -> id (one id and one API call per distinct word)
failed_words = {}   # word text -> message of the API error that ended its retries
title_to_word = []  # one [title, [word ids]] entry per processed row
record = []         # per row: every tagged token, before the noun filter
pku_seg = pkuseg.pkuseg(postag=True)

total_rows = items.shape[0]
progress_width = len(str(total_rows))


def _embed_word_with_retry(text):
    """Return the WordEmbedding vector for ``text``.

    A request that raises TencentCloudSDKException is retried up to
    MAX_ATTEMPTS times in total; the last such error is re-raised.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            word_request.from_json_string(json.dumps({'Text': text}))
            current_response = client.WordEmbedding(word_request)
        except TencentCloudSDKException:
            if attempt == MAX_ATTEMPTS:
                raise
            time.sleep(RETRY_DELAY_SECONDS)
        else:
            # Parse the response of THIS request, not one left over in the
            # kernel from another cell.
            return json.loads(current_response.to_json_string())['Vector']


for current_index, series in items.iterrows():
    # Positional unpack: the first two columns are taken as id and title.
    id_, title, *_ = series
    print(f'\r{current_index:>{progress_width}}/{total_rows}', end='', flush=True)

    if pd.isna(title):
        continue  # nothing to segment

    # Tag each token with three taggers; word_obj folds the tags into flags.
    temp_result = []
    for token in jieba.lcut_for_search(title):
        if not token.strip():
            continue  # whitespace carries no part of speech
        jieba_pos = jieb_seg.lcut(token)
        jiagu_pos = jiagu.pos([token])
        pku_pos = pku_seg.cut(token)
        temp_result.append(
            word_obj(token, jieba_pos[0].flag, jiagu_pos[0], pku_pos[0][1])
        )
    record.append(temp_result)

    word_ids = []
    title_to_word.append([title, word_ids])

    for tagged in temp_result:
        if not (tagged.is_n and tagged.is_cn):
            continue
        text = tagged.word  # plain str: word_obj itself is not JSON-serialisable
        if text in failed_words:
            continue  # already gave up on this word; do not hit the API again
        if text not in all_words:
            try:
                vector = _embed_word_with_retry(text)
            except TencentCloudSDKException as error:
                # Best effort: remember the failure and carry on with the
                # remaining words instead of looping on the same row forever.
                failed_words[text] = str(error)
                continue
            word_id = next(id_counter)
            all_words[text] = word_id
            result.append(vector_obj(word_id, text, vector))
        word_ids.append(all_words[text])

print(f'\n{len(all_words)} words embedded, {len(failed_words)} failed')


In [None]:
def search_title(query, top_k=20):
    """Return the stored vectors closest to the sentence embedding of ``query``.

    Reads the module-level ``client``, ``sentence_request`` and ``result``;
    none of them is rebound here.

    Args:
        query: text sent to the SentenceEmbedding endpoint.
        top_k: maximum number of matches to return (default 20).

    Returns:
        A list of ``[distance, vector_obj]`` pairs sorted by ascending
        Euclidean distance to the query vector, at most ``top_k`` long.
        Empty when ``result`` is empty.
    """
    sentence_request.from_json_string(json.dumps({'Text': query}))
    response = client.SentenceEmbedding(sentence_request)
    query_vector = json.loads(response.to_json_string())['Vector']

    if not result:
        return []

    # One (1 x N) distance computation instead of N separate calls that each
    # build a 2 x 2 matrix only to read a single off-diagonal entry.
    distances = distance([query_vector], [entry.vector for entry in result])[0]

    distance_map = [[gap, entry] for gap, entry in zip(distances, result)]
    distance_map.sort(key=lambda item: item[0])
    return distance_map[:top_k]

In [None]:
def _request_word_vector(payload):
    """Send ``payload`` to WordEmbedding; return (response, copy of its vector)."""
    word_request.from_json_string(json.dumps(payload))
    reply = client.WordEmbedding(word_request)
    return reply, json.loads(reply.to_json_string())['Vector'].copy()


# Embed two words and compare them.
params = {'Text': '搅拌器'}
response1, vector1 = _request_word_vector(params)

params = {'Text': '显示器'}
response2, vector2 = _request_word_vector(params)

# Cell output: the pairwise distance matrix of the two word vectors.
distance([vector1, vector2])

In [None]:
# Cell output: the nearest stored vectors for a one-word query.
search_title(query='屏幕')

In [None]:
# Listings whose title contains the substring. na=False makes rows with a
# missing title count as non-matching instead of putting NaN in the mask.
title_mask = items['Used_title'].str.contains('有线电', na=False)
items[title_mask]