In [1]:
%run utils.ipynb

import pandas as pd
import numpy as np
from tqdm import tqdm
from annoy import AnnoyIndex
import os
import warnings
from collections import defaultdict
import math
import pickle
import multitasking
import signal

# Configure the `multitasking` pool used by the @multitasking.task worker
# defined further down: allow up to 10 concurrent tasks and select the
# library's 'process' engine.
multitasking.set_max_threads(10)
multitasking.set_engine('process')
# Route Ctrl-C (SIGINT) to multitasking.killall.
# NOTE(review): presumably this terminates all running tasks when the cell is
# interrupted -- confirm against the multitasking documentation.
signal.signal(signal.SIGINT, multitasking.killall)

# Display settings: do not truncate columns or rows when a frame is printed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Use the fully qualified option key. The bare 'precision' alias is deprecated
# and, on newer pandas, no longer resolves to a single option.
pd.set_option('display.precision', 10)

# Silence every warning for the rest of the notebook. This also hides
# deprecation notices from pandas, so re-enable warnings when upgrading.
warnings.filterwarnings('ignore')

In [2]:
# Load the two input tables from pickles produced elsewhere.
# - df_qtime: its rows are unpacked later as (user_id, query_time, item_id,
#   phase) by recall().
# - df_click: click log; the code below reads its 'phase', 'user_id',
#   'item_id' and 'time' columns.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
df_qtime = pd.read_pickle('../user_data/data/qtime.pkl')
df_click = pd.read_pickle('../user_data/data/click.pkl')

In [3]:
# Distinct phase ids present in the query table, in ascending order.
phases = sorted(df_qtime['phase'].unique())
phases

[0]

In [4]:
def cal_item_sim(df, user_col, item_col):
    """Build an item-to-item co-occurrence similarity table from a click log.

    For every item ``i`` and every user ``u`` who clicked it, each item ``j``
    in ``u``'s click list (including ``i`` itself) receives

        1 / (log(1 + n_users(i)) * log(1 + n_items(u)))

    where ``n_users(i)`` / ``n_items(u)`` are the lengths of the grouped
    click lists. Those lists are not de-duplicated, so repeated clicks count
    repeatedly.

    The previous implementation also looked up click positions and timestamps
    inside the innermost loop but never used them; that dead work (two
    ``list.index`` scans per pair plus a ``'time'`` groupby) has been removed.
    The returned values are unchanged, and ``df`` no longer needs a ``'time'``
    column.

    Args:
        df: click log containing at least ``user_col`` and ``item_col``.
        user_col: name of the user-id column.
        item_col: name of the item-id column.

    Returns:
        A ``(sim_item, user_item_dict)`` tuple where ``sim_item`` maps
        ``item -> {related_item: score}`` and ``user_item_dict`` maps
        ``user -> list of clicked items`` in the row order of ``df``.
    """
    user_item_ = df.groupby(user_col)[item_col].agg(list).reset_index()
    user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col]))

    item_user_ = df.groupby(item_col)[user_col].agg(list).reset_index()
    item_user_dict = dict(zip(item_user_[item_col], item_user_[user_col]))

    sim_item = {}
    for item, users in tqdm(item_user_dict.items()):
        item_sims = sim_item.setdefault(item, {})
        # Popularity penalty of the anchor item; identical for all its users.
        item_norm = math.log(len(users) + 1)

        for u in users:
            items = user_item_dict[u]
            # Activity penalty of this user; identical for all related items.
            weight = 1 / (item_norm * math.log(len(items) + 1))

            for relate_item in items:
                item_sims[relate_item] = item_sims.get(relate_item, 0) + weight

    return sim_item, user_item_dict

In [5]:
def recall(df_qtime, item_sim_list, user_item, sim_topk=500, recall_num=100,
           loc_decay=0.7):
    """Generate item-CF recall candidates for every query row.

    For each query the user's click history is walked from the last click to
    the first. The ``sim_topk`` most similar neighbours of each clicked item
    contribute ``similarity * loc_decay ** position`` to a candidate's score,
    items already in the history are skipped, and the ``recall_num`` best
    candidates are kept.

    Args:
        df_qtime: frame whose rows unpack positionally as
            ``(user_id, query_time, item_id, phase)``. An ``item_id`` of -1
            produces NaN labels (presumably "no ground truth" -- confirm with
            the data preparation step).
        item_sim_list: ``{item: {related_item: score}}`` similarity table.
        user_item: ``{user_id: [clicked items]}``; every queried user must be
            present, otherwise ``KeyError`` is raised.
        sim_topk: number of neighbours considered per clicked item.
        recall_num: number of candidates kept per query.
        loc_decay: per-position decay applied to older clicks.

    Returns:
        DataFrame with columns ``user_id, phase, query_time, item_id,
        sim_score, label``; one row per (query, candidate). An empty
        ``df_qtime`` yields an empty frame with those columns instead of the
        ``ValueError`` that ``pd.concat([])`` would raise.
    """
    columns = [
        'user_id', 'phase', 'query_time', 'item_id', 'sim_score', 'label'
    ]
    data_list = []
    # item -> its top `sim_topk` neighbours, sorted once and reused for every
    # user who clicked the item.
    neighbour_cache = {}

    for user_id, query_time, item_id, phase in tqdm(df_qtime.values):
        rank = {}
        interacted_items = user_item[user_id][::-1]
        # Set for O(1) "already clicked" checks in the inner loop.
        interacted_set = set(interacted_items)

        for loc, i in enumerate(interacted_items):
            neighbours = neighbour_cache.get(i)
            if neighbours is None:
                neighbours = sorted(item_sim_list[i].items(),
                                    key=lambda d: d[1],
                                    reverse=True)[0:sim_topk]
                neighbour_cache[i] = neighbours

            decay = loc_decay**loc
            for j, wij in neighbours:
                if j not in interacted_set:
                    rank[j] = rank.get(j, 0) + wij * decay

        sim_items = sorted(rank.items(), key=lambda d: d[1],
                           reverse=True)[:recall_num]
        item_ids = [item[0] for item in sim_items]
        item_sim_scores = [item[1] for item in sim_items]

        df_temp = pd.DataFrame()
        df_temp['item_id'] = item_ids
        df_temp['sim_score'] = item_sim_scores
        df_temp['user_id'] = user_id
        df_temp['query_time'] = query_time
        df_temp['phase'] = phase

        if item_id == -1:
            df_temp['label'] = np.nan
        else:
            df_temp['label'] = 0
            df_temp.loc[df_temp['item_id'] == item_id, 'label'] = 1

        # Kept from the original even though sim_items is already ordered, so
        # the row order of tied scores stays exactly as before.
        df_temp = df_temp.sort_values(['sim_score'], ascending=False)
        # Ids arrive as floats when df_qtime.values is a float array; cast
        # them back to integers.
        df_temp = df_temp[columns].astype({'user_id': 'int', 'item_id': 'int'})

        data_list.append(df_temp)

    if not data_list:
        return pd.DataFrame(columns=columns)

    df_data = pd.concat(data_list, sort=False)
    return df_data

In [6]:
@multitasking.task
def work(phase, force=False):
    """Compute and cache the similarity table and recall frame for one phase.

    Reads the module-level ``df_click`` and ``df_qtime`` frames, filters them
    to ``phase``, and writes two pickles under
    ``../user_data/model/recall_v2``: ``sim_<phase>.pkl`` (the item
    similarity dict) and ``recall_<phase>.pkl`` (the recall candidates).

    Args:
        phase: phase id to process.
        force: recompute even when both cached files already exist.
    """
    out_dir = '../user_data/model/recall_v2'
    sim_path = '{}/sim_{}.pkl'.format(out_dir, phase)
    recall_path = '{}/recall_{}.pkl'.format(out_dir, phase)
    os.makedirs(out_dir, exist_ok=True)

    # Skip the work only when both artefacts are already on disk.
    if force or not (os.path.exists(sim_path)
                     and os.path.exists(recall_path)):
        # Clicks belonging to the current phase.
        df_click_phase = df_click[df_click['phase'] == phase]
        sim_item, user_item_dict = cal_item_sim(df_click_phase, 'user_id',
                                                'item_id')

        # Context manager guarantees the handle is closed even if dumping
        # fails part-way through.
        with open(sim_path, 'wb') as f:
            pickle.dump(sim_item, f)

        # Queries belonging to the current phase, then recall.
        df_qtime_phase = df_qtime[df_qtime['phase'] == phase]
        df_data = recall(df_qtime_phase, sim_item, user_item_dict)
        df_data.to_pickle(recall_path)

        print('phase {} finish'.format(phase))

In [7]:
# Per-phase similarity dicts, keyed by phase id.
item_sim_phase = {}
# Per-phase recall frames; concatenated once after the loop.
recall_frames = []
# Running element-wise sum of the four values returned by evaluate_scores.
val_score = np.array([0.0, 0.0, 0.0, 0.0])
force = False

# Launch one background task per phase and wait for all of them.
for phase in phases:
    work(phase, force)

multitasking.wait_for_tasks()
print('合并任务')  # the message reads "merge tasks"

for phase in phases:
    # These pickles are written by work() above.
    with open('../user_data/model/recall_v2/sim_{}.pkl'.format(phase),
              'rb') as f:
        item_sim = pickle.load(f)

    df_data = pd.read_pickle(
        '../user_data/model/recall_v2/recall_{}.pkl'.format(phase))

    item_sim_phase[phase] = item_sim
    recall_frames.append(df_data)

    # NOTE(review): evaluate_scores is not defined in this notebook;
    # presumably it comes from utils.ipynb via %run -- confirm.
    score = evaluate_scores(df_data, phase)
    val_score += score

    print('phase', phase, score)

# One concat instead of DataFrame.append inside the loop: append copies the
# accumulated frame on every call and no longer exists in pandas >= 2.0.
df_recall = (pd.concat(recall_frames, sort=False)
             if recall_frames else pd.DataFrame())

合并任务


100%|██████████| 18505/18505 [00:18<00:00, 987.60it/s]

phase 0 (0.049057716332143474, 0.10996318726992044, 0.0229912553375824, 0.0649183147033534)





In [8]:
# Save the per-phase similarity dictionary for later notebooks to use.
# The context manager closes the file even if pickling fails.
with open('../user_data/model/bn_sim.pkl', 'wb') as f:
    pickle.dump(item_sim_phase, f)

In [9]:
# Display the element-wise sum of the per-phase scores accumulated in the
# merge loop (four values per phase).
val_score

array([0.04905772, 0.10996319, 0.02299126, 0.06491831])

In [10]:
# Order the candidates by user, phase and query time, persist them, and
# preview the first rows.
df_recall = df_recall.sort_values(['user_id', 'phase', 'query_time'])
df_recall.to_pickle('../user_data/data/recall_v2.pkl')
df_recall.head()

Unnamed: 0,user_id,phase,query_time,item_id,sim_score,label
0,1,0.0,0.9839419315,87837,0.3453144185,0.0
2,1,0.0,0.9839419315,19228,0.3453144185,0.0
3,1,0.0,0.9839419315,109854,0.3453144185,0.0
4,1,0.0,0.9839419315,55738,0.3453144185,0.0
1,1,0.0,0.9839419315,91290,0.3453144185,0.0
