In [None]:
%run utils.ipynb

import pandas as pd
import numpy as np
from tqdm import tqdm
from annoy import AnnoyIndex
import os
import warnings
from collections import defaultdict
import math
import pickle
import multitasking
import signal
from gensim.models import Word2Vec

# Configure the `multitasking` package that backs the @multitasking.task
# worker defined later in this notebook.
# NOTE(review): 10 is presumably the cap on concurrently running tasks — confirm
# against the multitasking documentation.
multitasking.set_max_threads(10)
# Select the 'process' engine for tasks.
multitasking.set_engine('process')
# Install multitasking.killall as the SIGINT handler, so an interrupt
# (Ctrl-C / kernel interrupt) invokes it instead of the default handler.
signal.signal(signal.SIGINT, multitasking.killall)

# Display settings: never truncate columns or rows when a frame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Use the fully qualified option name. The bare 'precision' pattern is
# ambiguous on pandas versions that also define 'styler.format.precision'
# and raises OptionError there.
pd.set_option('display.precision', 10)

# Suppress every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

In [None]:
# Random seed; item2vec() passes it to Word2Vec(seed=...).
seed = 2020

In [None]:
# Load the pickled input frames.
# Later cells iterate df_qtime rows as (user_id, query_time, item_id, phase)
# and filter/group df_click by its 'phase', 'user_id' and 'item_id' columns.
# NOTE(review): unpickling can execute arbitrary code — only load files
# produced by a trusted step of this pipeline.
df_qtime = pd.read_pickle('../user_data/data/qtime.pkl')
df_click = pd.read_pickle('../user_data/data/click.pkl')

In [None]:
# Distinct phase ids found in df_qtime, in ascending order.
# The trailing bare expression displays them as the cell output.
phases = sorted(df_qtime['phase'].unique())
phases

In [None]:
# Train word2vec on the historical click records.
def item2vec(df_, f1, f2):
    """Train a skip-gram Word2Vec model on per-group value sequences.

    Rows of ``df_`` are grouped by ``f1``. Within each group the ``f2`` values,
    in row order, form one "sentence" whose tokens are ``str(value)``.

    Args:
        df_: DataFrame containing the columns ``f1`` and ``f2``. Not modified.
        f1: Name of the grouping column (called with 'user_id' in this notebook).
        f2: Name of the token column (called with 'item_id' in this notebook).

    Returns:
        The trained ``Word2Vec`` model: 256-dimensional vectors, window 5,
        min_count 1, skip-gram (sg=1) with 5 negative samples, 300 training
        epochs, 6 workers, seeded with the module-level ``seed``.
    """
    import inspect

    # Plain `agg(list)` instead of the dict-renaming form
    # `agg({'name': list})`, which pandas >= 1.0 rejects with
    # SpecificationError ("nested renamer is not supported").
    sentences = [
        [str(token) for token in tokens]
        for tokens in df_.groupby(f1)[f2].agg(list)
    ]

    # gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`.
    # Pick whichever spelling the installed Word2Vec constructor accepts so the
    # function works on both major versions.
    accepted = inspect.signature(Word2Vec).parameters
    dim_key = 'vector_size' if 'vector_size' in accepted else 'size'
    epoch_key = 'epochs' if 'epochs' in accepted else 'iter'

    model = Word2Vec(sentences, window=5, min_count=1,
                     sg=1, hs=0, seed=seed, negative=5, workers=6,
                     **{dim_key: 256, epoch_key: 300})
    return model

In [None]:
def recall(df_qtime, model, user_item):
    """Build the recall candidate table for every query row.

    For each row of ``df_qtime`` the last two items of the user's history are
    used as the positive query for ``model.wv.most_similar`` (topn=100). The
    returned neighbours become candidate rows for that user.

    Args:
        df_qtime: DataFrame whose columns, in order, are
            (user_id, query_time, item_id, phase); rows are read via ``.values``.
        model: Object exposing ``wv.most_similar(positive=..., topn=...)`` and
            returning ``(token, score)`` pairs whose tokens parse as ``int``.
        user_item: Mapping user_id -> list of interacted item ids.

    Returns:
        DataFrame with columns
        ['user_id', 'phase', 'query_time', 'item_id', 'sim_score', 'label'].
        ``label`` is NaN for rows whose query ``item_id`` is -1, otherwise 1
        where the candidate equals the query ``item_id`` and 0 elsewhere.
        Per-user frames are concatenated without resetting the index. When
        ``df_qtime`` has no rows, an empty frame with these columns is returned.

    Raises:
        KeyError: if ``user_item`` is a plain dict without an entry for a
            queried user.
    """
    columns = ['user_id', 'phase', 'query_time',
               'item_id', 'sim_score', 'label']
    data_list = []

    for user_id, query_time, item_id, phase in tqdm(df_qtime.values):
        interacted_items = user_item[user_id]
        # Query with the user's two most recent items.
        sim_items = model.wv.most_similar(
            positive=[str(x) for x in interacted_items[-2:]], topn=100)

        df_temp = pd.DataFrame({
            'item_id': [int(token) for token, _ in sim_items],
            'sim_score': [score for _, score in sim_items],
        })
        df_temp['user_id'] = user_id
        df_temp['query_time'] = query_time
        df_temp['phase'] = phase

        if item_id == -1:
            # NOTE(review): -1 presumably marks a query without a known
            # answer — confirm against the data preparation step.
            df_temp['label'] = np.nan
        else:
            df_temp['label'] = 0
            df_temp.loc[df_temp['item_id'] == item_id, 'label'] = 1

        # `.values` on a mixed-dtype frame may hand ids over as floats, so the
        # id columns are cast back to integers after ordering the candidates.
        df_temp = (
            df_temp.sort_values(['sim_score'], ascending=False)[columns]
            .astype({'user_id': 'int', 'item_id': 'int'})
        )

        data_list.append(df_temp)

    # pd.concat raises ValueError on an empty list, so return an empty,
    # correctly shaped frame instead.
    if not data_list:
        return pd.DataFrame(columns=columns)

    df_data = pd.concat(data_list, sort=False)
    return df_data

In [None]:
@multitasking.task
def work(phase, force=False):
    """Train (or reuse) the item2vec model of one phase and write its recall table.

    Runs as a ``multitasking`` task. Two artifacts are produced under
    ``../user_data/model/recall_v6``: ``w2v_{phase}.m`` (the Word2Vec model)
    and ``recall_{phase}.pkl`` (the candidate frame returned by ``recall``).

    Args:
        phase: Phase id used to filter the module-level ``df_click`` and
            ``df_qtime`` frames.
        force: When True, retrain the model and rebuild the recall table even
            if both artifacts already exist.
    """
    out_dir = '../user_data/model/recall_v6'
    model_path = '{}/w2v_{}.m'.format(out_dir, phase)
    recall_path = '{}/recall_{}.pkl'.format(out_dir, phase)
    os.makedirs(out_dir, exist_ok=True)

    # Skip only when BOTH artifacts are present. Checking the model file alone
    # would leave a phase without its recall table after an interrupted run,
    # and the merge step would then fail to read it.
    if not force and os.path.exists(model_path) and os.path.exists(recall_path):
        return

    # Clicks of the current phase.
    df_click_phase = df_click[df_click['phase'] == phase]

    if not force and os.path.exists(model_path):
        # The model survived a previous run; reuse it instead of retraining.
        model = Word2Vec.load(model_path)
    else:
        model = item2vec(
            df_click_phase, 'user_id', 'item_id')

        # Save the model.
        model.save(model_path)

    user_item_ = df_click_phase.groupby(
        'user_id')['item_id'].agg(list).reset_index()
    user_item_dict = dict(
        zip(user_item_['user_id'], user_item_['item_id']))

    # Queries of the current phase, then recall.
    df_qtime_phase = df_qtime[df_qtime['phase'] == phase]
    df_data = recall(df_qtime_phase, model, user_item_dict)
    df_data.to_pickle(recall_path)

    print('phase {} finish'.format(phase))

In [None]:
item_sim_phase = {}
# Running sum of the per-phase score vectors (four components).
val_score = np.array([0.0, 0.0, 0.0, 0.0])
force = True

# Launch one task per phase, then block until all of them have finished.
for phase in phases:
    work(phase, force)

multitasking.wait_for_tasks()
print('合并任务')

# Collect the per-phase frames and concatenate once at the end.
# DataFrame.append in a loop copies the accumulated frame on every iteration
# and was removed in pandas 2.0.
recall_frames = []
for phase in phases:
    df_data = pd.read_pickle('../user_data/model/recall_v6/recall_{}.pkl'.format(phase))
    recall_frames.append(df_data)

    # NOTE(review): evaluate_scores is not defined in this notebook; it is
    # presumably provided by `%run utils.ipynb` — confirm.
    score = evaluate_scores(df_data, phase)
    val_score += score

    print('phase', phase, score)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_recall = pd.concat(recall_frames, sort=False) if recall_frames else pd.DataFrame()

In [None]:
# Display the score vector accumulated (summed) over all phases.
val_score

In [None]:
# Order the merged candidates by user, phase and query time, persist them,
# and show the first rows as the cell output.
df_recall = df_recall.sort_values(by=['user_id', 'phase', 'query_time'])
df_recall.to_pickle('../user_data/data/recall_v6.pkl')
df_recall.head()