In [1]:
# 这一步目的是将多路召回的item合并到一起，删除重复召回的item
# 删除没有召回到真实item的训练集user，减少无用负样本

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from annoy import AnnoyIndex
import os
import warnings
from collections import defaultdict
import math

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('precision', 10)

warnings.filterwarnings('ignore')

In [3]:
recall_v1 = pd.read_pickle('../user_data/data/recall_v1.pkl')
recall_v2 = pd.read_pickle('../user_data/data/recall_v2.pkl')
recall_v3 = pd.read_pickle('../user_data/data/recall_v3.pkl')
recall_v4 = pd.read_pickle('../user_data/data/recall_v4.pkl')
recall_v5 = pd.read_pickle('../user_data/data/recall_v5.pkl')
recall_v6 = pd.read_pickle('../user_data/data/recall_v6.pkl')

recall_list = [recall_v1, recall_v2, recall_v3, recall_v4, recall_v5, recall_v6]

In [4]:
recall = pd.concat(recall_list, sort=False)
recall['phase'] = recall['phase'].astype('int')
del recall['sim_score']
recall.sort_values(['user_id', 'phase'], inplace=True)
print(recall.shape)
recall.drop_duplicates(subset=['user_id', 'phase', 'item_id'], inplace=True)
print(recall.shape)

(78255223, 5)
(58904530, 5)


In [5]:
phases = sorted(list(recall['phase'].unique()))
phases

[0, 1, 2, 3, 4, 5, 6]

In [6]:
recall_hit_total = 0
for phase in phases:
    recall_phase = recall[recall['phase'] == phase]

    recall_hit = recall_phase[recall_phase['label'] == 1]['user_id'].nunique() / \
        recall_phase[recall_phase['label'].notnull()]['user_id'].nunique()
    recall_hit_total += recall_hit
    print('phase', phase, 'recall hit', recall_hit)
recall_hit_total

phase 0 recall hit 0.24213276332977082
phase 1 recall hit 0.24625280302136196
phase 2 recall hit 0.25371079722288725
phase 3 recall hit 0.2544616820249621
phase 4 recall hit 0.26321703134240093
phase 5 recall hit 0.26215956061378176
phase 6 recall hit 0.2630417227456258


1.7849763603007909

In [7]:
# phase 0 recall hit 0.22099513121957012
# phase 1 recall hit 0.2261890711672371
# phase 2 recall hit 0.2341393344505626
# phase 3 recall hit 0.23632334072086783
# phase 4 recall hit 0.24452986398580723
# phase 5 recall hit 0.24353094388766208
# phase 6 recall hit 0.24382234185733512
# 1.649530027289042

# phase 0 recall hit 0.24213276332977082
# phase 1 recall hit 0.24625280302136196
# phase 2 recall hit 0.25371079722288725
# phase 3 recall hit 0.2544616820249621
# phase 4 recall hit 0.26321703134240093
# phase 5 recall hit 0.26215956061378176
# phase 6 recall hit 0.2630417227456258
# 1.7849763603007909

In [8]:
for phase in phases:
    recall_phase = recall[recall['phase'] == phase]

    test_user_num = recall_phase[recall_phase['label'].isnull(
    )]['user_id'].nunique()
    train_user_num = recall_phase[recall_phase['label'].notnull(
    )]['user_id'].nunique()

    print('phase', phase, 'train_user_num',
          train_user_num, 'test_user_num', test_user_num)

phase 0 train_user_num 16842 test_user_num 1663
phase 1 train_user_num 16946 test_user_num 1726
phase 2 train_user_num 16708 test_user_num 1690
phase 3 train_user_num 17146 test_user_num 1675
phase 4 train_user_num 16910 test_user_num 1708
phase 5 train_user_num 17661 test_user_num 1798
phase 6 train_user_num 18575 test_user_num 1821


In [9]:
recall.head()

Unnamed: 0,user_id,phase,query_time,item_id,label
0,1,0,0.9839419315,92349,0.0
1,1,0,0.9839419315,87837,0.0
2,1,0,0.9839419315,38168,0.0
3,1,0,0.9839419315,91290,0.0
4,1,0,0.9839419315,13663,0.0


In [10]:
# 删除无正样本的训练集用户
gg = recall.groupby(['user_id', 'phase'])
useful_recall = []

for (user_id, phase), g in tqdm(gg):
    if g['label'].isnull().sum() > 0:
        useful_recall.append(g)
    else:
        label_sum = g['label'].sum()
        if label_sum > 1:
            print('error', user_id)
        elif label_sum == 1:
            useful_recall.append(g)

df_useful_recall = pd.concat(useful_recall, sort=False)

100%|██████████| 132869/132869 [01:08<00:00, 1936.47it/s]


In [11]:
for phase in phases:
    recall_phase = df_useful_recall[df_useful_recall['phase'] == phase]

    test_user_num = recall_phase[recall_phase['label'].isnull(
    )]['user_id'].nunique()
    train_user_num = recall_phase[recall_phase['label'].notnull(
    )]['user_id'].nunique()

    print('phase', phase, 'train_user_num',
          train_user_num, 'test_user_num', test_user_num)

phase 0 train_user_num 4078 test_user_num 1663
phase 1 train_user_num 4173 test_user_num 1726
phase 2 train_user_num 4239 test_user_num 1690
phase 3 train_user_num 4363 test_user_num 1675
phase 4 train_user_num 4451 test_user_num 1708
phase 5 train_user_num 4630 test_user_num 1798
phase 6 train_user_num 4886 test_user_num 1821


In [12]:
df_useful_recall.to_pickle('../user_data/data/recall.pkl')

In [13]:
df_useful_recall.head()

Unnamed: 0,user_id,phase,query_time,item_id,label
0,1,1,0.9839420823,101060,
1,1,1,0.9839420823,92349,
2,1,1,0.9839420823,46297,
3,1,1,0.9839420823,94147,
4,1,1,0.9839420823,102129,


In [14]:
df_useful_recall.shape

(18853803, 5)

In [15]:
for phase in phases:
    recall_phase = df_useful_recall[df_useful_recall['phase'] == phase]

    df = recall_phase['user_id'].value_counts().reset_index()
    df.columns = ['user_id', 'cnt']
    print('phase', phase, df['cnt'].mean())

phase 0 433.96585960634036
phase 1 434.3914222749619
phase 2 435.2145387080452
phase 3 440.2562106657834
phase 4 442.9108621529469
phase 5 443.63363410080893
phase 6 444.5670195318324


In [16]:
# phase 0 373.2750232126277
# phase 1 373.47094801223244
# phase 2 373.6526240628347
# phase 3 376.6348873755893
# phase 4 378.364025329454
# phase 5 379.4556484669618
# phase 6 379.9409448818898

# phase 0 433.96585960634036
# phase 1 434.3914222749619
# phase 2 435.2145387080452
# phase 3 440.2562106657834
# phase 4 442.9108621529469
# phase 5 443.63363410080893
# phase 6 444.5670195318324

In [17]:
df_useful_recall.shape

(18853803, 5)

In [18]:
df_useful_recall[df_useful_recall['label'].notnull()].shape

(13492625, 5)

In [19]:
df_useful_recall[df_useful_recall['label'].notnull()]['label'].value_counts()

0.0    13461805
1.0       30820
Name: label, dtype: int64