In [2]:
# coding=utf-8

import hashlib
import json
import os
import pickle
import random
import re
import shutil
import sys
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm

pd.set_option("display.max_columns", 10)
pd.set_option("display.max_rows", 10)
pd.set_option("mode.chained_assignment", None)

In [3]:
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from elasticsearch import helpers

In [4]:
import jieba
from gensim.models import Word2Vec

In [5]:
path_database = os.popen("locate -b DATABASE").read().split("\n")[0]
sys.path.append(path_database)
from read_data import *

In [6]:
def get_hash_code(code):
    code = str(code).strip().upper()
    md5 = hashlib.md5()
    md5.update(code.encode('utf-8'))
    hash_code = md5.hexdigest().upper()
    return hash_code

### 基于购买数据

In [6]:
# 读取数据
data_seriesapi = pd.read_pickle(path_seriesapi)
data_seriesapi = pd.DataFrame(data_seriesapi)

cols_rename = {
    'seriesCode': 'itemCd',
    'seriesName': 'name',
    'productImageList': 'imgUrl',
    'brandName': 'maker',
    'minStandardUnitPrice': 'priceFrom',
    'maxStandardUnitPrice': 'priceTo',
    'minStandardDaysToShip': 'deliveryFrom',
    'maxStandardDaysToShip': 'deliveryTo',
    'categoryCode': 'categoryCd',
}
data_seriesapi = data_seriesapi.reindex(columns=cols_rename.keys())

data_seriesapi['linkUrl'] = data_seriesapi.seriesCode.apply(lambda d: f'www.misumi.com.cn/vona2/detail/{d}')
data_seriesapi['productImageList'] = data_seriesapi.productImageList.apply(lambda d: d[0]['url'] if len(d) else '')


def get_price(d):
    if d <= 0:
        return ''
    if d > 0:
        return f'￥{d}起'

data_seriesapi['minStandardUnitPrice'] = data_seriesapi.minStandardUnitPrice.fillna(-99)
data_seriesapi['maxStandardUnitPrice'] = data_seriesapi.maxStandardUnitPrice.fillna(-99)
data_seriesapi['price'] = data_seriesapi.minStandardUnitPrice.apply(get_price)


def get_delivery(d):
    if d < -1:
        return ''
    if d <=0:
        return '当天'
    if d > 0:
        d = int(d)
        return f'{d}天起'
    
    
data_seriesapi['minStandardDaysToShip'] = data_seriesapi.minStandardDaysToShip.fillna(-99)
data_seriesapi['maxStandardDaysToShip'] = data_seriesapi.maxStandardDaysToShip.fillna(-99)
data_seriesapi['delivery'] = data_seriesapi.minStandardDaysToShip.apply(get_delivery)

data_seriesapi['isActive'] = 1
data_seriesapi['ttl'] = str(int(datetime.now().timestamp()))

data_seriesapi = data_seriesapi.rename(columns=cols_rename)
for col in data_seriesapi.columns:
    data_seriesapi[col] = data_seriesapi[col].apply(str)
    
dict_seriesapi = dict(zip(data_seriesapi.itemCd, data_seriesapi.to_dict('records')))

In [7]:
data_category = pd.read_pickle(path_category)
data_seriesol = pd.read_pickle(path_seriesol)
data_seriesall = pd.read_pickle(path_seriesall)

data_so = [pd.read_pickle(i) for i in paths_so[-1:]]
data_so = pd.concat(data_so, ignore_index=True)

In [8]:
%%time
# 创建序列
# 基于MRO品长尾，基于is_mro对订单进行拆分
data_series_bu = (
    data_seriesall
    .reindex(columns=['series_code', 'category_code'])
    .dropna()
    .merge(data_category, how='left', on=['category_code'])
    .reindex(columns=['series_code', 'category_bu'])
)
data_so = data_so.merge(data_series_bu, how='left', on=['series_code'])
data_so['is_mro'] = data_so.category_bu.apply(lambda d: int(d=='VM'))

data_so_list = (
    data_so
    .dropna(subset=['so_voucher_no', 'is_mro', 'series_code'])
    .groupby(['so_voucher_no', 'is_mro'])['series_code']
    .agg(list)
    .reset_index()
)


# 训练模型
sentences = data_so_list.series_code
model = Word2Vec(vector_size=100, window=5, min_count=5, sg=1, hs=1, workers=4)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 2min 4s, sys: 1.04 s, total: 2min 5s
Wall time: 50.9 s


(23540588, 25686695)

In [9]:
data_emb = pd.DataFrame({
    'series_code': model.wv.index_to_key, 
    'embedding': list(model.wv.get_normed_vectors()), 
})
data_emb = (
    data_seriesol
    .reindex(columns=['series_code', 'category_code'])
    .merge(data_category[['category_code', 'path_code']])
    .merge(data_emb, how='left', on=['series_code'])   
)
data_emb['path_code'] = data_emb.path_code.apply(lambda d: d.split('>'))


# 类别向量填充
data_path_emb = (
    data_emb
    .drop(['path_code'], axis=1)
    .join(data_emb.path_code.explode())
    .dropna()
    .groupby(['path_code'])
    .agg({'embedding': np.mean})
    .reset_index()
)
dict_path_emb = dict(zip(data_path_emb.path_code, data_path_emb.embedding))


data_emb = data_emb.to_dict('records')
for i in data_emb:
    if type(i['embedding']) == np.ndarray:
        continue
          
    for p in i['path_code'][::-1]:
        p_emb = dict_path_emb.get(p)
        
        if type(p_emb) == np.ndarray:
            i['embedding'] = p_emb
            break

data_emb = pd.DataFrame(data_emb)
data_emb = data_emb.drop(['category_code', 'path_code'], axis=1)

In [47]:
data_emb = pd.DataFrame(data_emb)

In [48]:
topn = 10
series_code = data_emb.series_code
embedding = data_emb.embedding.to_list()
series_api = data_seriesapi.itemCd.to_list()


data_emb = data_emb.to_dict('records')
for i in tqdm(data_emb):
    scores = model.wv.cosine_similarities(i['embedding'], embedding)
    series = series_code[np.argsort(scores)[::-1]].to_list()
    series = [j for j in series[:1000] if j not in series_api]
    
    recomd = [j for j in series if j!=i['series_code']][:topn]
    i['recomd'] = recomd
    
data_emb = pd.DataFrame(data_emb)

  0%|          | 0/49833 [00:00<?, ?it/s]

In [None]:
# 测试

idx = random.choice(range(len(data_emb)))
i_series = data_emb.iloc[idx]
a = data_seriesol[data_seriesol.series_code==i_series['series_code']].series_name.iloc[0]
b = data_seriesol[data_seriesol.series_code.isin(i_series['recomd'][:5])].series_name.tolist()

print(a)
print(b)

In [17]:
%%time
data_api = []
for i in tqdm(data_emb.to_dict('records')):
    i_key = i['series_code']
    i_val = list(map(lambda d: dict_seriesapi.get(d).copy(), i['recomd']))
    
    for j_idx, j_val in enumerate(i_val):
        j_pos = str(j_idx + 1)
        j_ext = f'?rid=detail_purchase_{i_key}_{j_pos}'
        j_val['position'] = j_pos
        j_val['linkUrl'] = j_val['linkUrl'] + j_ext
        
    data_api.append((i_key, {'data': str(i_val)}))

  0%|          | 0/49833 [00:00<?, ?it/s]

AttributeError: 'NoneType' object has no attribute 'copy'

In [580]:
url_recomd = 'https://vpc-ec-recomd-stg-sk5q5e6dcred74nwkgbzn6xe7e.cn-north-1.es.amazonaws.com.cn'
url_recomd = 'https://vpc-ec-recomd-prd-7tegdwbsfek66bfe26mdgoudoy.cn-north-1.es.amazonaws.com.cn'

es_recomd = Elasticsearch([url_recomd])
ic_recomd = IndicesClient(es_recomd)
es_recomd.ping()

True

In [581]:
%%time
size = 1000
for i in tqdm(range(0, len(data_api), size)):
    i_data = data_api[i:i+size]

    i_data_update = [{
        '_op_type': 'index',
        '_index': 'detail_purchase',
        '_type': '_doc',
        '_id': get_hash_code(i[0] + ',,,'),
        '_source': i[1]} for i in i_data]
    h = helpers.bulk(es_recomd, i_data_update)

    time.sleep(0.5)
    es_recomd.indices.refresh()

  0%|          | 0/50 [00:00<?, ?it/s]

CPU times: user 3.51 s, sys: 565 ms, total: 4.08 s
Wall time: 3min 36s


In [7]:
%%time
url = 'https://apihosts.misumi.com.cn/aidata/ec_recomd_series?page=detail&area=purchase&parm1=110610027719&parm2=&parm3=&parm4='
requests.get(url).json()

CPU times: user 48.3 ms, sys: 85 µs, total: 48.4 ms
Wall time: 263 ms


{'data': [{'categoryCd': 'T1708010000',
   'delivery': '当天',
   'deliveryFrom': '0.0',
   'deliveryTo': '0.0',
   'imgUrl': '//www.misumi.com.cn/linked/material/fs/MSM1/PHOTO/110610382769_004.jpg',
   'isActive': '1',
   'itemCd': '110610382769',
   'linkUrl': 'www.misumi.com.cn/vona2/detail/110610382769?rid=detail_purchase_110610027719_1',
   'maker': '米思米(MISUMI)',
   'name': '轻便款10针500g纱线手套（优级棉纱）',
   'position': '1',
   'price': '￥17.08起',
   'priceFrom': '17.08',
   'priceTo': '17.08',
   'ttl': '1663060364'},
  {'categoryCd': 'T1708010000',
   'delivery': '当天',
   'deliveryFrom': '0.0',
   'deliveryTo': '0.0',
   'imgUrl': '//content.misumi.com.cn/image/upload/v1/p/cn/product/series/110610185399/110610185399_20190712103854.jpg',
   'isActive': '1',
   'itemCd': '110610185399',
   'linkUrl': 'www.misumi.com.cn/vona2/detail/110610185399?rid=detail_purchase_110610027719_2',
   'maker': '米思米(MISUMI)',
   'name': '600g点塑纱线手套 涤棉新料 10针本白 12副/袋',
   'position': '2',
   'price': '￥23.83起'