In [1]:
import pandas as pd
from tqdm import tqdm
from annoy import AnnoyIndex
import os
from joblib import Parallel, delayed

In [2]:
current_phase = 1

In [3]:
train_qtime_list = []
train_history_list = []

test_qtime_list = []
test_history_list = []

for i in tqdm(range(current_phase+1)):
    # 处理train
    df_click_train = pd.read_csv(
        'raw_data/underexpose_train/underexpose_train_click-{}.csv'.format(i), header=None)
    df_click_train.columns = ['user_id', 'item_id', 'time']
    df_click_train['phase'] = i
    df_click_train.sort_values(['user_id', 'time'], inplace=True)
    df_click_train['index'] = df_click_train.index

    df_qtime_train = df_click_train.groupby(['user_id']).last().reset_index()
    df_qtime_train.rename(columns={'time': 'query_time'}, inplace=True)
    df_history_train = df_click_train[~(
        df_click_train['index'].isin(df_qtime_train['index'].values.tolist()))]

    del df_history_train['index']
    del df_qtime_train['index']
    train_history_list.append(df_history_train)
    train_qtime_list.append(df_qtime_train)

    df_click_test = pd.read_csv(
        'raw_data/underexpose_test/underexpose_test_click-{}/underexpose_test_click-{}.csv'.format(i, i), header=None)
    df_click_test.columns = ['user_id', 'item_id', 'time']
    df_click_test['phase'] = i
    df_click_test.sort_values(['user_id', 'time'], inplace=True)

    df_qtime_test = pd.read_csv(
        'raw_data/underexpose_test/underexpose_test_click-{}/underexpose_test_qtime-{}.csv'.format(i, i), header=None)
    df_qtime_test.columns = ['user_id', 'query_time']
    df_qtime_test['phase'] = i
    df_qtime_test['item_id'] = -1

    test_history_list.append(df_click_test)
    test_qtime_list.append(df_qtime_test)

100%|██████████| 2/2 [00:00<00:00,  4.41it/s]


In [4]:
df_train_history = pd.concat(train_history_list, sort=False)
df_train_qtime = pd.concat(train_qtime_list, sort=False)
df_train_qtime = df_train_qtime[['user_id', 'item_id', 'phase', 'query_time']]

df_test_history = pd.concat(test_history_list, sort=False)
df_test_qtime = pd.concat(test_qtime_list, sort=False)
df_test_qtime = df_train_qtime[['user_id', 'item_id', 'phase', 'query_time']]

In [5]:
# 计算商品之间相似性
df_item = pd.read_csv(
    'raw_data/underexpose_train/underexpose_item_feat.csv', header=None)
df_item.columns = ['item_id'] + ['txt_vec' +
                                 str(i) for i in range(128)] + ['img_vec'+str(i) for i in range(128)]
df_item['txt_vec0'] = df_item['txt_vec0'].apply(lambda x: float(x[1:]))
df_item['txt_vec127'] = df_item['txt_vec127'].apply(
    lambda x: float(x[:-1]))
df_item['img_vec0'] = df_item['img_vec0'].apply(lambda x: float(x[1:]))
df_item['img_vec127'] = df_item['img_vec127'].apply(
    lambda x: float(x[:-1]))
print(df_item.shape)
df_item.drop_duplicates(['item_id'], inplace=True)
print(df_item.shape)
df_item.head()

(108916, 257)
(108916, 257)


Unnamed: 0,item_id,txt_vec0,txt_vec1,txt_vec2,txt_vec3,txt_vec4,txt_vec5,txt_vec6,txt_vec7,txt_vec8,...,img_vec118,img_vec119,img_vec120,img_vec121,img_vec122,img_vec123,img_vec124,img_vec125,img_vec126,img_vec127
0,42844,4.514945,-2.38372,0.500414,0.407068,-1.995229,0.109078,-0.691775,2.22746,-6.437974,...,-3.374727,-1.506969,-1.82018,-3.024644,0.445263,0.013933,-1.300239,2.759948,2.056171,0.508703
1,67898,-2.002905,-0.929881,0.790017,-1.380895,-0.510463,-1.810096,1.363962,0.497401,-4.038903,...,-0.53833,-2.620164,1.277195,0.601015,-0.345312,0.993457,1.351633,2.162675,2.768597,-0.937197
2,66446,4.221673,-1.497139,1.13357,-2.745607,-4.197045,-0.542392,-1.396256,1.838419,-6.066454,...,-4.582711,-1.05691,-2.568084,-2.038061,2.508719,-0.764789,-0.657116,3.252782,2.687366,0.844332
3,63651,2.65797,-0.941863,1.121529,-5.109496,-0.279041,-0.351968,-1.086983,2.703607,-6.494977,...,-0.487683,-1.889119,0.943015,-2.834418,1.633184,2.001801,-2.333152,2.645595,2.280233,-0.694448
4,46824,3.192195,-1.936676,1.199909,-2.562152,-2.573456,0.575841,-2.358653,1.620844,-4.302936,...,-0.621475,-2.09141,0.5016,-3.083864,-1.060091,2.0536,-2.025008,2.399251,2.562317,0.694134


In [6]:
def get_index(name):
    index_to_item_dict = {}
    item_to_index_dict = {}

    if os.path.exists('models/{}.ann'.format(name)):
        u = AnnoyIndex(128, 'angular')
        u.load('models/{}.ann'.format(name))

        f = open('models/index_to_item_dict_{}.dict'.format(name), 'r')
        a = f.read()
        index_to_item_dict = eval(a)
        f.close()

        f = open('models/item_to_index_dict_{}.dict'.format(name), 'r')
        a = f.read()
        item_to_index_dict = eval(a)
        f.close()

        return u, index_to_item_dict, item_to_index_dict

    else:
        t = AnnoyIndex(128, 'angular')
        t.set_seed(2020)

        for i, row in tqdm(df_item.iterrows()):
            if name == 'txt':
                emb = row[-128-128: -128].values
            else:
                emb = row[-128:].values

            item = row[0]
            index_to_item_dict[i] = item
            item_to_index_dict[item] = i

            t.add_item(i, emb)

        t.build(100)

        f = open('models/index_to_item_dict_{}.dict'.format(name), 'w')
        f.write(str(index_to_item_dict))
        f.close()

        f = open('models/item_to_index_dict_{}.dict'.format(name), 'w')
        f.write(str(item_to_index_dict))
        f.close()

        t.save('models/{}.ann'.format(name))

        return t, index_to_item_dict, item_to_index_dict

In [9]:
def recall(history_items, user_id, item_id, phase, query_time, t, index_to_item_dict, item_to_index_dict):
    df_temp = pd.DataFrame()
    item_ids = []
    item_sim_scores = []

    for history_item in history_items:
        if history_item in item_to_index_dict:
            ids, scores = t.get_nns_by_item(
                item_to_index_dict[history_item], 10, include_distances=True)

            item_ids += [index_to_item_dict[id] for id in ids]
            item_sim_scores += scores

    df_temp['item_id'] = item_ids
    df_temp['sim_score'] = item_sim_scores

    df_temp.sort_values(['sim_score'], inplace=True, ascending=False)
    df_temp.drop_duplicates(['item_id'], inplace=True)
    df_temp = df_temp.head(300)

    df_temp['user_id'] = user_id
    df_temp['query_time'] = query_time
    df_temp['phase'] = phase
    df_temp['label'] = 0
    df_temp.loc[df_temp['item_id'] == item_id, 'label'] = 1

    del df_temp['sim_score']

    return df_temp

In [10]:
train_data_txt_list = []
index, index_to_item_dict, item_to_index_dict = get_index('txt')

for user_id, item_id, phase, query_time in tqdm(df_train_qtime.values):
    a = recall(df_train_history[df_train_history['user_id'] == user_id]
               ['item_id'].values.tolist(), user_id, item_id, phase, query_time, index, index_to_item_dict, item_to_index_dict)
    train_data_txt_list.append(a)

train_data_txt = pd.concat(train_data_txt_list)

100%|██████████| 33788/33788 [06:57<00:00, 80.91it/s]


In [11]:
train_data_txt['label'].value_counts()

0    4282371
1       8025
Name: label, dtype: int64

In [17]:
train_data_img_list = []
index, index_to_item_dict, item_to_index_dict = get_index('img')

for user_id, item_id, phase, query_time in tqdm(df_train_qtime.values):
    a = recall(df_train_history[df_train_history['user_id'] == user_id]
               ['item_id'].values.tolist(), user_id, item_id, phase, query_time, index, index_to_item_dict, item_to_index_dict)
    train_data_img_list.append(a)

train_data_img = pd.concat(train_data_img_list)

108916it [00:21, 5053.62it/s]
100%|██████████| 33788/33788 [07:02<00:00, 79.95it/s]


In [20]:
train_data = pd.concat([train_data_txt, train_data_img], sort=False)

In [25]:
print(train_data.shape)
train_data_tmp = train_data.drop_duplicates(['user_id', 'item_id', 'phase'])
print(train_data_tmp['label'].value_counts())
print(train_data_tmp.shape)

(8679293, 5)
0    8055458
1       8429
Name: label, dtype: int64
(8063887, 5)


In [27]:
train_data_tmp.head()

Unnamed: 0,item_id,user_id,query_time,phase,label
19,13294.0,1.0,0.983942,0.0,0
39,48983.0,1.0,0.983942,0.0,0
18,1212.0,1.0,0.983942,0.0,0
38,77528.0,1.0,0.983942,0.0,0
17,5228.0,1.0,0.983942,0.0,0


In [28]:
len(train_data_tmp.groupby(['user_id', 'phase']))

33741

In [29]:
8429 / 33741

0.24981476541892653

In [30]:
train_data_tmp['user_id'].nunique()

20892