In [1]:
import jiagu 
import jieba 
import pkuseg 
import genius 
import jieba.posseg as jieb_seg
from sklearn.metrics.pairwise import euclidean_distances as distance

import pandas as pd 
import sys 

import json
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.nlp.v20190408 import nlp_client, models

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
class word_obj:
    """A segmented word together with the outcome of a part-of-speech vote.

    Attributes
    ----------
    word : str
        The word itself.
    is_n : bool
        True when at least two of the supplied tags are exactly ``'n'``.
    is_cn : bool
        True when none of the supplied tags is ``'eng'`` and the word
        contains no digit character.
    """

    def __init__(self, word, *seg_results):
        self.word = word

        # Majority vote: count how many taggers labelled the word a plain noun.
        noun_votes = 0
        for tag in seg_results:
            if tag == 'n':
                noun_votes += 1
        self.is_n = noun_votes >= 2

        # The digit scan only runs when no tagger reported an English token.
        tagged_english = any(not (tag != 'eng') for tag in seg_results)
        self.is_cn = (not tagged_english) and not any(char.isdigit() for char in word)

    def __str__(self):
        return self.word

    # repr() and str() both show the bare word.
    __repr__ = __str__

    
class vector_obj:
    """Record tying an item id and its title to the vector computed for it."""

    def __init__(self, id_, title, vector):
        self.id, self.title, self.vector = id_, title, vector

    def __str__(self):
        # Only id and title are shown; the vector is left out of the text form.
        return '{} {}'.format(self.id, self.title)

    # repr() and str() share the same "<id> <title>" text.
    __repr__ = __str__


def get_tencent_client(key_path='api_keys.txt'):
    """Build a Tencent Cloud NLP client from credentials stored in a text file.

    The first line of the file supplies the API id and the second line the API
    secret. On each line only the text after the first ':' is used, with
    surrounding whitespace removed.

    Parameters
    ----------
    key_path : str, optional
        Path of the credentials file. Defaults to ``'api_keys.txt'``.

    Returns
    -------
    nlp_client.NlpClient
        Client for the "nlp.tencentcloudapi.com" endpoint, created with the
        "ap-guangzhou" region argument.

    Raises
    ------
    OSError
        If the credentials file cannot be opened.
    ValueError
        If the id or the secret is empty, e.g. because a line has no ':'
        separator or the file has fewer than two lines.
    """
    with open(key_path, 'r') as filein:
        api_id = filein.readline().partition(':')[-1].strip()
        api_secret = filein.readline().partition(':')[-1].strip()

    # partition() yields '' after a missing separator; fail here with a clear
    # message instead of sending empty credentials to the API later on.
    if not api_id or not api_secret:
        raise ValueError(
            f'{key_path!r} must contain "<label>: <value>" on its first two lines '
            '(API id, then API secret)'
        )

    cred = credential.Credential(api_id, api_secret)
    httpProfile = HttpProfile()
    httpProfile.endpoint = "nlp.tencentcloudapi.com"

    clientProfile = ClientProfile()
    clientProfile.httpProfile = httpProfile
    return nlp_client.NlpClient(cred, "ap-guangzhou", clientProfile)

In [4]:
# Shared API client and a single request object that is re-filled for every word.
client = get_tencent_client()
word_request = models.WordEmbeddingRequest()

# Part-of-speech tagging segmenter (one of the three taggers used for voting).
pku_seg = pkuseg.pkuseg(postag=True)

# Item table; the indexing loop reads the first two columns of each row
# as id and title.
items = pd.read_csv('items.csv')
total_rows = len(items)

# Accumulators filled while the titles are vectorised:
title_vectors = list()       # one vector_obj per processed row
all_word_vector = dict()     # word -> embedding already fetched
does_not_exist = set()       # words whose embedding request failed

In [5]:
def generate_avg_vector(query, does_not_exist=None, all_word_vector=None):
    """Return the mean embedding vector of the Chinese nouns found in ``query``.

    The query is segmented with jieba's search-mode cutter. Every segment is
    tagged by jieba, jiagu and pkuseg and wrapped in a ``word_obj``; only
    segments flagged both ``is_n`` and ``is_cn`` are kept. For each kept word
    the embedding is taken from ``all_word_vector`` or requested from the
    Tencent WordEmbedding API, and the collected vectors are averaged
    component by component (each component rounded to 5 decimals).

    Parameters
    ----------
    query : str
        Text to vectorise.
    does_not_exist : set, optional
        Words whose embedding request already failed; they are skipped and
        newly failing words are added to it. A fresh set is used when omitted.
    all_word_vector : dict, optional
        Cache mapping word -> embedding; new embeddings are stored in it.
        A fresh dict is used when omitted.

    Returns
    -------
    list of float
        The averaged vector, or an empty list when the query cannot be
        segmented or no embedding is available for any of its words.
    """
    global client

    # The two caches are independent: supplying one must not leave the other None.
    if does_not_exist is None:
        does_not_exist = set()
    if all_word_vector is None:
        all_word_vector = {}

    try:
        segment_list = [token for token in jieba.lcut_for_search(query) if token != ' ']
    except Exception:
        # Segmentation failed (presumably a non-string query); an empty vector
        # is the "nothing to compare" value that search_title already skips.
        return []

    # get part of speech, vote for noun and chinese
    temp_result = []
    for word in segment_list:
        jieba_pos = jieb_seg.lcut(word)
        jiagu_pos = jiagu.pos([word])
        pku_pos = pku_seg.cut(word)
        try:
            temp_result.append(word_obj(word, jieba_pos[0].flag, jiagu_pos[0], pku_pos[0][1]))
        except Exception:
            # A tagger returned nothing usable for this segment; leave it out.
            continue
    result_cleaned = [item for item in temp_result if item.is_n and item.is_cn]

    # fetch vector for each word
    vector_list = []
    for word_ins in result_cleaned:
        current_word = word_ins.word
        if current_word in does_not_exist:
            continue
        if current_word in all_word_vector:
            word_vector = all_word_vector[current_word]
        else:
            word_vector = None
            # Two attempts: a non-SDK failure rebuilds the client and retries once.
            for _attempt in range(2):
                try:
                    word_request.from_json_string(json.dumps({'Text': current_word}))
                    current_response = client.WordEmbedding(word_request)
                    word_vector = json.loads(current_response.to_json_string())['Vector']
                    break
                except TencentCloudSDKException as error:
                    print(f'\rreached exception with inner {error}')
                    # Remember the word itself so it is not requested again.
                    does_not_exist.add(current_word)
                    break
                except Exception:
                    client = get_tencent_client()
            if word_vector is None:
                # No vector was obtained; never reuse the previous word's vector.
                continue
            all_word_vector[current_word] = word_vector
        vector_list.append(word_vector)

    # Component-wise mean over all collected vectors (empty when none collected).
    query_vector = [round(sum(column) / len(column), 5) for column in zip(*vector_list)]
    return query_vector

In [9]:
# Resume marker: index of the last row that was started. It survives between
# runs of this cell so an interrupted run continues where it stopped; on a
# fresh kernel it does not exist yet and indexing starts from the first row.
try:
    previous_index
except NameError:
    previous_index = -1

# Pad the progress counter to the number of digits of the total, not to
# `total_rows` characters.
counter_width = len(str(total_rows))

for current_index, (id_, title, *_) in items.iterrows():
    if current_index <= previous_index:
        # Row already handled in an earlier (interrupted) run of this cell.
        continue
    previous_index = current_index
    print(f'\r{current_index:>{counter_width}}/{total_rows}, segmenting {title}', end='  '*50, flush=True)
    vector = generate_avg_vector(title, does_not_exist, all_word_vector)
    title_vectors.append(vector_obj(id_, title, vector))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [17]:
def search_title(query, top_k=20):
    """Return the indexed titles whose vectors are closest to the query's vector.

    The query is vectorised with ``generate_avg_vector`` and compared, by
    Euclidean distance, with every entry of the global ``title_vectors`` that
    has a non-empty vector.

    Parameters
    ----------
    query : str
        Search text.
    top_k : int, optional
        Maximum number of results to return. Defaults to 20.

    Returns
    -------
    list of (title, distance) tuples
        Sorted by ascending distance. Empty when the query yields no vector
        or when no indexed title has a vector.
    """
    query_vector = generate_avg_vector(query)
    if not query_vector:
        # Nothing usable in the query; an empty vector cannot be compared.
        return []

    candidates = [vector_object for vector_object in title_vectors if vector_object.vector]
    if not candidates:
        return []

    # One batched call: row 0 holds the distance from the query to each candidate.
    distances = distance([query_vector], [candidate.vector for candidate in candidates])[0]
    title_distance = sorted(
        zip((candidate.title for candidate in candidates), distances),
        key=lambda item: item[-1],
    )
    return title_distance[:top_k]

In [26]:
# Nearest indexed titles for this query, as (title, distance) pairs, closest first.
search_title('音响')

[('b&o 音响', 2.1073424255447017e-08),
 ('小音响', 2.1073424255447017e-08),
 ('bose companion series音响', 2.1073424255447017e-08),
 ('Bose C2S2 音响', 2.1073424255447017e-08),
 ('marshall蓝牙音响', 2.1073424255447017e-08),
 ('蓝牙音响', 2.1073424255447017e-08),
 ('marshall 音响', 2.1073424255447017e-08),
 ('全新蓝牙音响', 2.1073424255447017e-08),
 ('Beosound balance音响', 2.1073424255447017e-08),
 ('音响 Crate DSP on Board BFX15 Guitar Amp', 2.1073424255447017e-08),
 ('Bose音响！', 2.1073424255447017e-08),
 ('一对音响', 2.1073424255447017e-08),
 ('JBL extreme 音响', 2.1073424255447017e-08),
 ('ue boom2 蓝牙音响', 2.1073424255447017e-08),
 ('Jawbone Mini Jambox 蓝牙小音响', 2.1073424255447017e-08),
 ('罗技的mini音响', 2.1073424255447017e-08),
 ('小音响', 2.1073424255447017e-08),
 ('Beoplay 蓝牙音响', 2.1073424255447017e-08),
 ('harman kardon音响 可蓝牙', 2.1073424255447017e-08),
 ('Bose音响', 2.1073424255447017e-08)]

In [27]:
# Nearest indexed titles for this query, as (title, distance) pairs, closest first.
search_title('音箱')

[('bose音箱', 2.1073424255447017e-08),
 ('日本TDK蓝牙音箱再降价', 2.1073424255447017e-08),
 ('全新篮芽音箱', 2.1073424255447017e-08),
 ('UE Megaboom 蓝牙音箱', 2.1073424255447017e-08),
 ('三防蓝牙音箱', 2.1073424255447017e-08),
 ('miniso蓝牙音箱', 2.1073424255447017e-08),
 ('sony蓝牙音箱', 2.1073424255447017e-08),
 ('蓝牙音箱', 2.1073424255447017e-08),
 ('Marshall 音箱', 2.1073424255447017e-08),
 ('Focal alpha 50 5寸监听音箱', 2.1073424255447017e-08),
 ('全新jbl pulse 3蓝牙音箱', 2.1073424255447017e-08),
 ('可变色伸缩蓝牙小音箱', 2.1073424255447017e-08),
 ('sony 防水音箱 音质', 0.3307952924392974),
 ('Jackson电吉他＋crate音箱', 0.43056739263441746),
 ('ibanez电吉他+音箱', 0.43056739263441746),
 ('bose电脑音箱', 0.4726130044761787),
 ('谷歌智能蓝牙音箱google home mini', 0.5163068607911392),
 ('电视音响带低音炮', 0.5395692768310669),
 ('b&o 音响', 0.6087238178353138),
 ('小音响', 0.6087238178353138)]

In [18]:
# Nearest indexed titles for this query, as (title, distance) pairs, closest first.
search_title('无线耳机')

[('sony索尼xb950n1降噪无线耳机', 0.0),
 ('耳机', 0.0),
 ('AirPods 2 无线充耳机', 0.0),
 ('AirPods 2 无线充耳机', 0.0),
 ('Bose 耳机', 0.0),
 ('蓝牙耳机', 0.0),
 ('Samsung 蓝牙耳机再降价', 0.0),
 ('蓝牙耳机', 0.0),
 ('JBL reflect flow无线蓝牙耳机', 0.0),
 ('耳机', 0.0),
 ('iphone 耳机', 0.0),
 ('Beats Studio 耳机', 0.0),
 ('挥泪降价！全新Microsoft Surface Headphones耳机，多买 免税 降价', 0.0),
 ('全新未开封耳机', 0.0),
 ('Sony WH-ch700n 降噪耳机', 0.0),
 ('sony 耳机 无线降噪', 0.0),
 ('Logitech 耳机', 0.0),
 ('无线耳机', 0.0),
 ('AKG Y50蓝牙耳机', 0.0),
 ('beats耳机', 0.0)]

In [19]:
# Nearest indexed titles for this query, as (title, distance) pairs, closest first.
search_title('屏幕')

[('屏幕架', 0.0),
 ('非常实用的屏幕', 0.0),
 ('带鱼屏 21:9屏幕 ultrawide 屏幕', 0.3793799205282214),
 ('带鱼屏 21:9屏幕 ultrawide 屏幕', 0.3793799205282214),
 ('27英寸电脑屏幕', 0.3999659523009424),
 ('Dell U2518D 显示器 2K 分辨率 25寸 屏幕可旋转', 0.43551706372999904),
 ('三星显示器 1080p 32寸 曲面屏幕', 0.4631583222829963),
 ('买屏幕送主机', 0.4954157102878345),
 ('电脑屏幕VGA线', 0.5272210395080987),
 ('MSI电竞显示器 144hz 1080p 曲面屏幕', 0.5291348689133987),
 ('iPhone XS / X 屏幕保护膜', 0.5426172835433828),
 ('Dell 27寸显示屏', 0.5547113230320793),
 ('Dell P2317H显示屏', 0.5547113230320793),
 ('Acer 显示屏', 0.5547113230320793),
 ('Acer 21.5 inch 显示屏', 0.5547113230320793),
 ('HP 25寸超薄显示屏', 0.5547113230320793),
 ('LG特宽显示屏', 0.5547113230320793),
 ('只用了八个月的显示屏便宜卖', 0.5547113230320793),
 ('显示屏', 0.5547113230320793),
 ('显示屏', 0.5547113230320793)]

In [23]:
# Nearest indexed titles for this query, as (title, distance) pairs, closest first.
search_title('上学的文具')

[('文具', 0.0),
 ('布朗熊文具盒', 0.3751543369068249),
 ('文具盒', 0.3751543369068249),
 ('出 一堆笔芯 铅笔 文具', 0.5097738736930327),
 ('一堆文具加一个玻璃杯 可以用来喝水也可以用来放文具', 0.5137868710856673),
 ('三层柜/文具架各5$', 0.5284234337158034),
 ('3刀带走！能开文具店的笔量', 0.5394484107864255),
 ('文具套装', 0.5695852304089355),
 ('可爱文具袋化妆袋', 0.5803766742383776),
 ('各种文具1刀一个，买五送一', 0.6180662012276679),
 ('笔袋 全新', 0.6496433691649597),
 ('铅笔盒', 0.6858049511340669),
 ('书包，几乎全新送笔芯', 0.696365203969871),
 ('晨光套尺4件，全新，孔庙祈福款，赠同款笔芯+橡皮', 0.698206655725939),
 ('桌面文件文具收纳架', 0.7179207140764223),
 ('文具全家桶', 0.7195946732710019),
 ('今日取给钱就卖全新各种文具 文件夹本子笔橡皮等等 数量很多可实地挑选', 0.7318973094635614),
 ('买铅笔送橡皮笔帽', 0.7408637428704417),
 ('化妆品收纳盒', 0.7451504290410091),
 ('化妆品收纳盒', 0.7451504290410091)]