In [1]:
%run utils.ipynb

import pandas as pd
import numpy as np
from tqdm import tqdm
from annoy import AnnoyIndex
import os
import warnings
from collections import defaultdict
import math
import pickle
import gc
import multitasking
import signal

# Configure the multitasking library: a cap of 10 and the 'process' engine
# for functions decorated with @multitasking.task.
multitasking.set_max_threads(10)
multitasking.set_engine('process')
# Route Ctrl-C (SIGINT) to multitasking.killall.
signal.signal(signal.SIGINT, multitasking.killall)

# Show every column / row when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Use the fully-qualified option name: the bare 'precision' pattern is
# resolved by regex matching and becomes ambiguous on pandas versions that
# also define 'styler.format.precision'.
pd.set_option('display.precision', 10)

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

In [2]:
# Load the two pickled tables this notebook works from.
# df_qtime is later iterated as (user_id, query_time, item_id, phase) rows;
# df_click is filtered by 'phase' and grouped by 'user_id' / 'item_id'.
df_qtime = pd.read_pickle('../user_data/data/qtime.pkl')
df_click = pd.read_pickle('../user_data/data/click.pkl')

In [3]:
# Distinct phase ids found in the query-time table, in ascending order.
phases = sorted(df_qtime['phase'].unique())
phases

[0]

In [4]:
def euc(v, w):
    """Return the Euclidean distance between two vectors.

    Special case: when both ``v`` and ``w`` consist solely of zeros the
    constant ``10`` is returned instead of ``0``.

    Parameters
    ----------
    v, w : numpy.ndarray
        Vectors of equal shape.

    Returns
    -------
    int or numpy.floating
        ``10`` for two all-zero inputs, otherwise ``sqrt(sum((v - w) ** 2))``.
    """
    # A vector whose max and min are both 0 contains only zeros.
    both_all_zero = all(
        np.max(vec) == 0 and np.min(vec) == 0 for vec in (v, w))
    if both_all_zero:
        return 10

    delta = v - w
    return np.sqrt(np.sum(np.square(delta)))

In [5]:
def item2item(user_item_set_dict, item_user_set_dict, rank_dict, time_dict,
              txt_vec_dict, img_vec_dict, sim_topN):
    """Build an item-to-item similarity table from co-click data.

    For every user, each ordered pair of distinct items ``(item1, item2)`` in
    that user's item set contributes a score to ``sim[item1][item2]``. The
    score is the product of four factors, divided by
    ``sqrt(len(item_user_set_dict[item1]) * len(item_user_set_dict[item2]))``:

    * heat:  ``1 / (log2(1 + n_user_items) + sqrt(|rank1 - rank2|))``
    * time:  ``1 / sqrt(1 + 20000 * |time1 - time2|)``
    * text:  ``1 / sqrt(1 + euc(txt1, txt2))``
    * image: ``1 / sqrt(1 + euc(img1, img2) / 10)``

    Parameters
    ----------
    user_item_set_dict : dict
        user -> collection of items for that user.
    item_user_set_dict : dict
        item -> collection of users for that item (only its size is used).
    rank_dict : dict
        user -> {item: rank}.
    time_dict : dict
        user -> {item: time}.
    txt_vec_dict, img_vec_dict : dict
        item -> feature vector, passed to ``euc``.
    sim_topN : int
        Maximum number of neighbours kept per item.

    Returns
    -------
    dict
        item -> {neighbour: similarity}. Items without any neighbour are
        removed; items with more than ``sim_topN`` neighbours keep only the
        ``sim_topN`` highest-scoring ones, in descending score order.
    """
    item2item_sim_dict = {}
    for user in tqdm(user_item_set_dict.keys()):
        user_items = user_item_set_dict[user]
        user_ranks = rank_dict[user]
        user_times = time_dict[user]
        # Depends only on the user, so compute it once per user.
        log_activity = np.log2(1 + len(user_items))

        for item1 in user_items:
            neighbours = item2item_sim_dict.setdefault(item1, {})
            for item2 in user_items:
                if item1 == item2:
                    continue
                heat_factor = 1 / (
                    log_activity +
                    np.sqrt(abs(user_ranks[item1] - user_ranks[item2])))
                time_factor = 1 / np.sqrt(
                    1 + 20000 * abs(user_times[item1] - user_times[item2]))
                txt_euc_factor = 1 / np.sqrt(
                    1 + euc(txt_vec_dict[item1], txt_vec_dict[item2]))
                img_euc_factor = 1 / np.sqrt(
                    1 + euc(img_vec_dict[item1], img_vec_dict[item2]) / 10)
                score = (heat_factor * time_factor * txt_euc_factor *
                         img_euc_factor)
                neighbours[item2] = neighbours.get(item2, 0) + score / np.sqrt(
                    len(item_user_set_dict[item1]) *
                    len(item_user_set_dict[item2]))

    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating its live key view raises RuntimeError.
    for item in tqdm(list(item2item_sim_dict)):
        neighbours = item2item_sim_dict[item]
        if not neighbours:
            # Nothing left to truncate once the entry is removed.
            del item2item_sim_dict[item]
            continue
        if len(neighbours) > sim_topN:
            item2item_sim_dict[item] = dict(
                sorted(neighbours.items(), key=lambda x: x[1],
                       reverse=True)[:sim_topN])

    return item2item_sim_dict

In [6]:
def recall(df_qtime, item2item_sim_dict, user_item_set_dict, rank_dict,
           time_dict, txt_vec_dict, recall_topN):
    """Recall candidate items for every query row using item similarities.

    Each row of ``df_qtime`` is unpacked positionally as
    ``(user_id, query_time, item_id, phase)``. For every item the user has in
    ``user_item_set_dict``, each neighbour ``i`` from ``item2item_sim_dict``
    that the user does not already have accumulates::

        sim[item][i] * rank_factor * time_factor * txt_euc_factor

    with ``rank_factor = 1 / sqrt(rank)``,
    ``time_factor = 1 - 1000 * (query_time - item_time)`` and
    ``txt_euc_factor = 1 / (1 + euc(txt[i], txt[item]) / 100)``.
    The ``recall_topN`` best-scoring candidates are kept per row.

    Parameters
    ----------
    df_qtime : pandas.DataFrame
        Query table with exactly four columns in the order given above.
    item2item_sim_dict : dict
        item -> {neighbour: similarity}. Items absent from this dict simply
        contribute no candidates.
    user_item_set_dict : dict
        user -> collection of items for that user.
    rank_dict, time_dict : dict
        user -> {item: rank} and user -> {item: time}.
    txt_vec_dict : dict
        item -> text feature vector, passed to ``euc``.
    recall_topN : int
        Maximum number of candidates returned per query row.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id, phase, query_time, item_id, sim_score, label``.
        ``label`` is NaN when the row's ``item_id`` is -1 (presumably a query
        with no known answer -- confirm against the data preparation step);
        otherwise it is 1 for the candidate equal to ``item_id`` and 0 for the
        rest. An empty frame with these columns is returned when ``df_qtime``
        has no rows.
    """
    output_cols = [
        'user_id', 'phase', 'query_time', 'item_id', 'sim_score', 'label'
    ]
    data_list = []

    for user_id, query_time, item_id, phase in tqdm(df_qtime.values):
        clicked_items = user_item_set_dict[user_id]
        user_ranks = rank_dict[user_id]
        user_times = time_dict[user_id]

        rank = {}
        for item in clicked_items:
            # An item may have no similarity entry; skip it rather than
            # raising KeyError.
            neighbours = item2item_sim_dict.get(item)
            if not neighbours:
                continue

            # These depend only on the clicked item, not on the neighbour.
            rank_factor = 1 / np.sqrt(user_ranks[item])
            # Use this row's own query_time so that repeated user_ids in
            # df_qtime are each scored against their own query.
            time_factor = 1 - 1000 * (query_time - user_times[item])
            item_txt_vec = txt_vec_dict[item]

            for i, sim in neighbours.items():
                if i in clicked_items:
                    continue
                txt_euc_factor = 1 / \
                    (1 + euc(txt_vec_dict[i], item_txt_vec) / 100)
                rank[i] = rank.get(i, 0) + \
                    sim * rank_factor * time_factor * txt_euc_factor

        sim_items = sorted(rank.items(), key=lambda d: d[1],
                           reverse=True)[:recall_topN]

        df_temp = pd.DataFrame()
        df_temp['item_id'] = [candidate for candidate, _ in sim_items]
        df_temp['sim_score'] = [score for _, score in sim_items]
        df_temp['user_id'] = user_id
        df_temp['query_time'] = query_time
        df_temp['phase'] = phase

        if item_id == -1:
            df_temp['label'] = np.nan
        else:
            df_temp['label'] = 0
            df_temp.loc[df_temp['item_id'] == item_id, 'label'] = 1

        df_temp = df_temp.sort_values(['sim_score'],
                                      ascending=False)[output_cols]
        # Rows from DataFrame.values may carry ids as floats; store them as
        # integers.
        df_temp = df_temp.astype({'user_id': 'int', 'item_id': 'int'})

        data_list.append(df_temp)

    if not data_list:
        # pd.concat raises on an empty list.
        return pd.DataFrame(columns=output_cols)

    df_data = pd.concat(data_list, sort=False)
    return df_data

In [7]:
# Column names for the two 128-dimensional per-item feature vectors.
txt_vec_cols = ['txt_vec_{}'.format(idx) for idx in range(128)]
img_vec_cols = ['img_vec_{}'.format(idx) for idx in range(128)]

# The CSV is read with explicit column names: item id, then the text vector,
# then the image vector.
vec_df = pd.read_csv(
    '../data/underexpose_train/underexpose_item_feat.csv',
    names=['item_id'] + txt_vec_cols + img_vec_cols)

# The first / last field of each vector carries one extra leading / trailing
# character (presumably '[' and ']' -- confirm against the raw file), so trim
# whitespace and drop that character.
vec_df['txt_vec_0'] = vec_df['txt_vec_0'].map(lambda x: x.strip()[1:])
vec_df['txt_vec_127'] = vec_df['txt_vec_127'].map(lambda x: x.strip()[:-1])
vec_df['img_vec_0'] = vec_df['img_vec_0'].map(lambda x: x.strip()[1:])
vec_df['img_vec_127'] = vec_df['img_vec_127'].map(lambda x: x.strip()[:-1])

# Convert every feature column to float once the stray characters are gone.
vec_df[txt_vec_cols + img_vec_cols] = (
    vec_df[txt_vec_cols + img_vec_cols].astype('float'))

In [8]:
@multitasking.task
def work(phase, force=False):
    """Compute and persist item similarities and recall results for a phase.

    Writes two files under ``../user_data/model/recall_v5``:
    ``sim_<phase>.pkl`` (pickled item-to-item similarity dict) and
    ``recall_<phase>.pkl`` (pickled recall DataFrame). When both files already
    exist and ``force`` is false, nothing is recomputed.

    Reads the notebook-level names ``df_click``, ``df_qtime``, ``vec_df``,
    ``txt_vec_cols``, ``img_vec_cols``, ``sim_topN`` and ``recall_topN``, so
    those must be defined before this task is started.

    Parameters
    ----------
    phase
        Value matched against the ``phase`` column of ``df_click`` and
        ``df_qtime``; also used in the output file names.
    force : bool, default False
        Recompute even if both output files exist.
    """
    os.makedirs('../user_data/model/recall_v5', exist_ok=True)

    sim_path = '../user_data/model/recall_v5/sim_{}.pkl'.format(phase)
    recall_path = '../user_data/model/recall_v5/recall_{}.pkl'.format(phase)

    if not force and os.path.exists(sim_path) and os.path.exists(recall_path):
        return

    # Clicks of the current phase, joined with the item feature vectors.
    df_click_phase = df_click[df_click['phase'] == phase]
    df_click_phase = df_click_phase.merge(vec_df, on='item_id', how='left')

    # Items without a row in vec_df get all-zero feature vectors.
    vec_cols = txt_vec_cols + img_vec_cols
    df_click_phase[vec_cols] = df_click_phase[vec_cols].fillna(0)

    txt_vec_dict = dict(
        zip(df_click_phase['item_id'],
            df_click_phase[txt_vec_cols].values))
    img_vec_dict = dict(
        zip(df_click_phase['item_id'],
            df_click_phase[img_vec_cols].values))

    # Per-user rank by time, descending: the largest time gets rank 1.
    df_click_phase['rank'] = df_click_phase.groupby(
        'user_id')['time'].rank(method='first', ascending=False)

    # Select columns with a list: tuple-style groupby[...] selection is
    # deprecated and removed in later pandas releases.
    rank_dict = df_click_phase.groupby('user_id')[['item_id', 'rank']].apply(
        lambda x: dict(zip(x['item_id'], x['rank']))).to_dict()
    time_dict = df_click_phase.groupby('user_id')[['item_id', 'time']].apply(
        lambda x: dict(zip(x['item_id'], x['time']))).to_dict()

    # apply(set) avoids the dict-renaming form of SeriesGroupBy.agg, which
    # newer pandas rejects as a nested renamer.
    item_user_set_dict = df_click_phase.groupby(
        'item_id')['user_id'].apply(set).to_dict()
    user_item_set_dict = df_click_phase.groupby(
        'user_id')['item_id'].apply(set).to_dict()

    gc.collect()

    item2item_sim_dict = item2item(user_item_set_dict, item_user_set_dict,
                                   rank_dict, time_dict, txt_vec_dict,
                                   img_vec_dict, sim_topN)

    # The context manager closes the file even if pickling fails.
    with open(sim_path, 'wb') as f:
        pickle.dump(item2item_sim_dict, f)

    # Queries of the current phase, then recall candidates for them.
    df_qtime_phase = df_qtime[df_qtime['phase'] == phase]
    df_data = recall(df_qtime_phase, item2item_sim_dict,
                     user_item_set_dict, rank_dict, time_dict,
                     txt_vec_dict, recall_topN)
    df_data.to_pickle(recall_path)

In [9]:
# Parameters read by work(): neighbours kept per item and candidates
# returned per query.
sim_topN = 500
recall_topN = 100
item_sim_phase = {}
recall_frames = []
# Running sum of the 4-element score returned by evaluate_scores per phase.
val_score = np.array([0.0, 0.0, 0.0, 0.0])
force = False

# Start one task per phase, then block until all of them have finished.
for phase in phases:
    work(phase, force)

multitasking.wait_for_tasks()
print('合并任务')

# Read back what each task wrote to disk and merge the per-phase results.
for phase in phases:
    # NOTE: pickle.load executes arbitrary code from the file; these files
    # are expected to be the ones written by work().
    with open('../user_data/model/recall_v5/sim_{}.pkl'.format(phase),
              'rb') as f:
        item_sim = pickle.load(f)

    df_data = pd.read_pickle(
        '../user_data/model/recall_v5/recall_{}.pkl'.format(phase))

    item_sim_phase[phase] = item_sim
    recall_frames.append(df_data)

    # evaluate_scores is not defined in this notebook; presumably it comes
    # from `%run utils.ipynb`.
    score = evaluate_scores(df_data, phase)
    val_score += score

    print('phase', phase, score)

# Concatenate once instead of growing a DataFrame with append() per phase
# (quadratic, and DataFrame.append no longer exists in pandas 2.x).
df_recall = (pd.concat(recall_frames, sort=False)
             if recall_frames else pd.DataFrame())

100%|██████████| 256/256 [00:00<00:00, 671.53it/s]
100%|██████████| 18505/18505 [08:58<00:00, 34.35it/s]
100%|██████████| 40768/40768 [00:00<00:00, 106961.58it/s]
100%|██████████| 18505/18505 [35:09<00:00,  8.77it/s] 


合并任务


100%|██████████| 18505/18505 [00:18<00:00, 982.11it/s]

phase 0 (0.05837995479642042, 0.13341645885286782, 0.052475814576360795, 0.10920034393809114)





In [10]:
# Save the per-phase similarity dictionaries for later use.
# The context manager closes the file even if pickling fails.
with open('../user_data/model/tiancai_sim.pkl', 'wb') as f:
    pickle.dump(item_sim_phase, f)

In [None]:
# Display the scores summed over all phases (accumulated from
# evaluate_scores in the merge loop).
val_score

In [12]:
# Order the merged recall rows by user, phase and query time, write them to
# disk, and preview the first rows.
df_recall = df_recall.sort_values(['user_id', 'phase', 'query_time'])
df_recall.to_pickle('../user_data/data/recall_v5.pkl')
df_recall.head()

Unnamed: 0,user_id,phase,query_time,item_id,sim_score,label
0,1,0.0,0.9839419315,103421,0.0050941158,0.0
1,1,0.0,0.9839419315,91290,0.0045766652,0.0
2,1,0.0,0.9839419315,35217,0.0045467833,0.0
3,1,0.0,0.9839419315,95676,0.0040608463,0.0
4,1,0.0,0.9839419315,109853,0.0040455135,0.0
