In [1]:
%run utils.ipynb

import pandas as pd
import numpy as np
from tqdm import tqdm
from annoy import AnnoyIndex
import os
import warnings
from collections import defaultdict
import math
import pickle
import multitasking
import signal

# Configure the `multitasking` runner used by the @multitasking.task worker
# defined later in this notebook.
# NOTE(review): presumably this caps the number of concurrently running tasks
# at 10 -- confirm against the multitasking documentation.
multitasking.set_max_threads(10)
# NOTE(review): presumably selects process-based (rather than thread-based)
# execution for tasks -- confirm against the multitasking documentation.
multitasking.set_engine('process')
# On Ctrl-C (SIGINT), call multitasking.killall as the signal handler.
signal.signal(signal.SIGINT, multitasking.killall)

# Do not truncate columns or rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Use the fully-qualified option key: the abbreviated 'precision' is resolved
# by pattern matching and becomes ambiguous (display.precision vs.
# styler.format.precision) on newer pandas, which raises OptionError.
pd.set_option('display.precision', 10)

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
# Load the two pickled DataFrames this notebook works on.
# df_qtime: one row per query; later cells read its 'phase' column and iterate
# its rows as (user_id, query_time, item_id, phase).
df_qtime = pd.read_pickle('../user_data/data/qtime.pkl')
# df_click: click log; later cells use its 'phase', 'user_id', 'item_id' and
# 'time' columns.
df_click = pd.read_pickle('../user_data/data/click.pkl')

In [3]:
# Distinct phase ids present in the query table, in ascending order.
phases = sorted(df_qtime['phase'].unique())
phases

[0]

In [4]:
def euc(v, w):
    """Euclidean distance between two vectors, with a sentinel for zero pairs.

    When every component of both ``v`` and ``w`` is zero the function returns
    the constant ``10`` instead of the true distance (which would be 0).
    NOTE(review): presumably this penalises pairs whose feature vectors are
    both missing (zero-filled elsewhere in this notebook) -- confirm intent.

    Otherwise returns ``sqrt(sum((v - w) ** 2))``.
    """
    v_is_zero = np.max(v) == 0 and np.min(v) == 0
    if v_is_zero and np.max(w) == 0 and np.min(w) == 0:
        return 10

    delta = v - w
    return np.sqrt(np.sum(np.square(delta)))

In [5]:
def cal_item_sim(df, user_col, item_col):
    """Build an item-to-item similarity table from per-user click sequences.

    For every user, each pair of distinct items that lie at most 5 positions
    apart in that user's click list and whose timestamps differ by at most
    0.000003 contributes a weight to ``sim[item][related_item]``.  The weight
    combines:

    * a text-feature and an image-feature factor derived from ``euc`` on the
      item vectors read from ``underexpose_item_feat.csv``,
    * a direction factor (0.7 when the related item comes earlier in the
      list, 1.0 when it comes later),
    * a position decay ``0.8 ** (distance - 1)``,
    * a time decay ``1 - time_gap * 10000``,
    * a division by ``log(1 + number of items of the user)``.

    After accumulation each weight is divided by
    ``sqrt(item_cnt[i] * item_cnt[j])``.

    Parameters
    ----------
    df : pandas.DataFrame
        Click log.  Must contain ``user_col``, ``item_col`` and a ``'time'``
        column.  Note that the feature merge and the vector lookup use the
        literal column name ``'item_id'``, so ``item_col`` is expected to be
        ``'item_id'``.
    user_col, item_col : str
        Column names of the user id and item id.

    Returns
    -------
    tuple
        ``(sim_item, user_item_dict, item_cnt, user_time_dict)`` where
        ``sim_item`` maps item -> {related item -> normalised weight},
        ``user_item_dict`` maps user -> list of clicked items,
        ``item_cnt`` maps item -> number of occurrences in the click lists,
        ``user_time_dict`` maps user -> list of click times (same order as
        the item list).
    """
    # Per-user item lists and the matching per-user time lists.  Both come
    # from a groupby over the same frame, so position k in one list lines up
    # with position k in the other.
    user_item_ = df.groupby(user_col)[item_col].agg(
        lambda x: list(x)).reset_index()
    user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col]))

    user_time_ = df.groupby(user_col)['time'].agg(
        lambda x: list(x)).reset_index()  # bring in the time factor
    user_time_dict = dict(zip(user_time_[user_col], user_time_['time']))

    # Item feature file: item_id followed by 128 text and 128 image components.
    txt_vec_cols = ['txt_vec_{}'.format(i) for i in range(128)]
    img_vec_cols = ['img_vec_{}'.format(i) for i in range(128)]
    vec_cols = txt_vec_cols + img_vec_cols
    vec_df = pd.read_csv(
        '../data/underexpose_train/underexpose_item_feat.csv',
        names=['item_id'] + vec_cols)
    # The first component of each vector has one leading character and the
    # last component one trailing character that must be dropped before the
    # float conversion (presumably the '[' / ']' around each vector).
    for col in ('txt_vec_0', 'img_vec_0'):
        vec_df[col] = vec_df[col].apply(lambda x: x.strip()[1:])
    for col in ('txt_vec_127', 'img_vec_127'):
        vec_df[col] = vec_df[col].apply(lambda x: x.strip()[:-1])
    vec_df[vec_cols] = vec_df[vec_cols].astype('float')

    # Attach features to the clicks; items without a feature row get zeros.
    df = df.merge(vec_df, on='item_id', how='left')
    df[vec_cols] = df[vec_cols].fillna(0)
    txt_vec_dict = dict(zip(df['item_id'], df[txt_vec_cols].values))
    img_vec_dict = dict(zip(df['item_id'], df[img_vec_cols].values))

    sim_item = {}
    item_cnt = defaultdict(int)
    for user, items in tqdm(user_item_dict.items()):
        times = user_time_dict[user]
        n_items = len(items)
        # Same divisor for every pair of this user, so compute it once.
        len_penalty = math.log(1 + n_items)

        for loc1, item in enumerate(items):
            item_cnt[item] += 1
            related = sim_item.setdefault(item, {})
            t1 = times[loc1]

            # Only positions within +/-5 can contribute, so visit just that
            # window (in ascending order) instead of the whole click list.
            for loc2 in range(max(0, loc1 - 5), min(n_items, loc1 + 6)):
                relate_item = items[loc2]
                t2 = times[loc2]

                if item == relate_item or abs(t2 - t1) > 0.000003:
                    continue

                related.setdefault(relate_item, 0)

                txt_euc_factor = 1 / \
                    np.sqrt(
                        1 + euc(txt_vec_dict[item], txt_vec_dict[relate_item]))
                img_euc_factor = 1 / \
                    np.sqrt(
                        1 + euc(img_vec_dict[item], img_vec_dict[relate_item]) / 10)

                if loc1 - loc2 > 0:
                    # Backward direction: related item was clicked earlier.
                    related[
                        relate_item] += 1 * txt_euc_factor * img_euc_factor * 0.7 * (
                            0.8**(loc1 - loc2 -
                                  1)) * (1 - (t1 - t2) * 10000) / len_penalty
                else:
                    # Forward direction: related item was clicked later.
                    related[
                        relate_item] += 1 * txt_euc_factor * img_euc_factor * 1.0 * (
                            0.8**(loc2 - loc1 -
                                  1)) * (1 - (t2 - t1) * 10000) / len_penalty

    # Normalise every weight by the occurrence counts of both items.  This
    # overwrites values of existing keys only, which is safe while iterating.
    for i, related_items in tqdm(sim_item.items()):
        norm_i = item_cnt[i]
        for j, cij in related_items.items():
            related_items[j] = cij / math.sqrt(norm_i * item_cnt[j])

    return sim_item, user_item_dict, item_cnt, user_time_dict

In [6]:
def recall(df_qtime, item_sim_list, user_item, item_cnt, user_time_dict):
    """Produce up to 100 candidate items per query from item similarities.

    For each row ``(user_id, query_time, item_id, phase)`` of ``df_qtime`` the
    user's clicked items are walked from most recent to oldest.  Each clicked
    item contributes its 500 highest-weighted neighbours from
    ``item_sim_list``; a neighbour ``j`` that the user has not already clicked
    receives ``wij * 0.7**loc * item_cnt[j] * time_factor`` where ``loc`` is
    the recency rank of the clicked item and
    ``time_factor = 1 - 1000 * (query_time - click_time)``.

    Parameters
    ----------
    df_qtime : pandas.DataFrame
        Query table whose columns are, in order, user_id, query_time,
        item_id, phase.  ``item_id == -1`` marks a row without a known
        answer; its candidates get a NaN label.
    item_sim_list : dict
        item -> {related item -> weight}.
    user_item : dict
        user -> list of clicked items.
    item_cnt : dict
        item -> occurrence count.
    user_time_dict : dict
        user -> list of click times, aligned with ``user_item``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id, phase, query_time, item_id, sim_score, label``;
        one row per candidate, ordered by descending score within a query.
        Empty (with these columns) when ``df_qtime`` has no rows.
    """
    output_cols = [
        'user_id', 'phase', 'query_time', 'item_id', 'sim_score', 'label'
    ]

    data_list = []
    for user_id, query_time, item_id, phase in tqdm(df_qtime.values):
        rank = {}
        # Most recent click first.
        interacted_items = user_item[user_id][::-1]
        # Set copy for O(1) "already clicked" checks in the inner loop.
        interacted_set = set(interacted_items)
        n_clicks = len(interacted_items)

        for loc, i in enumerate(interacted_items):
            # The time list is in original order, so index from the end.
            time_factor = 1 - 1000 * \
                (query_time - user_time_dict[user_id][n_clicks - loc - 1])
            for j, wij in sorted(item_sim_list[i].items(),
                                 key=lambda d: d[1],
                                 reverse=True)[0:500]:
                if j not in interacted_set:
                    rank.setdefault(j, 0)
                    rank[j] += wij * (0.7**loc) * item_cnt[j] * time_factor

        sim_items = sorted(rank.items(), key=lambda d: d[1],
                           reverse=True)[:100]

        df_temp = pd.DataFrame()
        df_temp['item_id'] = [cand for cand, _ in sim_items]
        df_temp['sim_score'] = [score for _, score in sim_items]
        df_temp['user_id'] = user_id
        df_temp['query_time'] = query_time
        df_temp['phase'] = phase

        if item_id == -1:
            # No ground truth for this query.
            df_temp['label'] = np.nan
        else:
            df_temp['label'] = 0
            df_temp.loc[df_temp['item_id'] == item_id, 'label'] = 1

        df_temp = df_temp.sort_values(['sim_score'], ascending=False)
        # Select and cast in one step so no column is assigned on a slice.
        df_temp = df_temp[output_cols].astype({
            'user_id': 'int',
            'item_id': 'int'
        })

        data_list.append(df_temp)

    if not data_list:
        # pd.concat raises on an empty list; return an empty, typed-by-name
        # frame instead so callers can still persist/merge the result.
        return pd.DataFrame(columns=output_cols)

    df_data = pd.concat(data_list, sort=False)

    return df_data

In [7]:
@multitasking.task
def work(phase, force=False):
    """Compute and persist the similarity table and recall frame of one phase.

    Reads the module-level ``df_click`` and ``df_qtime`` frames, restricts
    them to ``phase``, and writes two files under
    ``../user_data/model/recall_v1``: ``sim_<phase>.pkl`` (pickled similarity
    dict) and ``recall_<phase>.pkl`` (pickled recall DataFrame).

    Parameters
    ----------
    phase
        Phase id to process; also used in the output file names.
    force : bool, default False
        When false, the work is skipped if both output files already exist.
        When true, both are always recomputed.
    """
    os.makedirs('../user_data/model/recall_v1', exist_ok=True)
    sim_path = '../user_data/model/recall_v1/sim_{}.pkl'.format(phase)
    recall_path = '../user_data/model/recall_v1/recall_{}.pkl'.format(phase)

    if force or (not os.path.exists(sim_path)
                 or not os.path.exists(recall_path)):
        # Clicks belonging to the current phase.
        df_click_phase = df_click[df_click['phase'] == phase]
        item_sim, user_item, item_cnt, user_time_dict = cal_item_sim(
            df_click_phase, 'user_id', 'item_id')

        # Context manager guarantees the handle is closed even if dump fails.
        with open(sim_path, 'wb') as f:
            pickle.dump(item_sim, f)

        # Queries belonging to the current phase, then recall for them.
        df_qtime_phase = df_qtime[df_qtime['phase'] == phase]
        df_data = recall(df_qtime_phase, item_sim, user_item, item_cnt,
                         user_time_dict)
        df_data.to_pickle(recall_path)

        print('phase {} finish'.format(phase))

In [None]:
# Results gathered across phases.
item_sim_phase = {}          # phase -> similarity dict loaded from disk
recall_frames = []           # per-phase recall frames, concatenated below
val_score = np.array([0.0, 0.0, 0.0, 0.0])  # running sum of evaluate_scores
force = False                # set True to recompute even if outputs exist

# Launch one background task per phase, then block until all have finished.
for phase in phases:
    work(phase, force)

multitasking.wait_for_tasks()
print('合并任务')

# Merge step: load what each task wrote to disk and score it.
for phase in phases:
    with open('../user_data/model/recall_v1/sim_{}.pkl'.format(phase),
              'rb') as f:
        item_sim = pickle.load(f)

    df_data = pd.read_pickle(
        '../user_data/model/recall_v1/recall_{}.pkl'.format(phase))

    item_sim_phase[phase] = item_sim
    recall_frames.append(df_data)

    score = evaluate_scores(df_data, phase)
    val_score += score

    print('phase', phase, score)

# Single concat instead of DataFrame.append in the loop (append is quadratic
# and no longer exists in pandas 2.x).
df_recall = pd.concat(recall_frames,
                      sort=False) if recall_frames else pd.DataFrame()

合并任务


 91%|█████████ | 16843/18504 [00:17<00:01, 967.08it/s]

In [None]:
# Save the per-phase similarity dictionaries for later use.
# The context manager closes the file even if pickling raises.
with open('../user_data/model/if_sim.pkl', 'wb') as f:
    pickle.dump(item_sim_phase, f)

In [None]:
# Display the 4-element score array accumulated over all phases from
# evaluate_scores in the merge loop.
val_score

In [None]:
# Order the merged recall rows by user, phase and query time, persist them,
# and preview the first rows.
df_recall = df_recall.sort_values(['user_id', 'phase', 'query_time'])
df_recall.to_pickle('../user_data/data/recall_v1.pkl')
df_recall.head()