In [1]:
import pandas as pd
import warnings
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
import gc
from tqdm import tqdm
import numpy as np
from collections import defaultdict
import math
from pandarallel import pandarallel
import pickle
from gensim.models import Word2Vec
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pandarallel.initialize()
warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
seed = 2020

In [3]:
df_feature = pd.read_pickle('../user_data/data/recall.pkl')
print(df_feature.shape)

(2496846, 5)


In [4]:
phases = sorted(list(df_feature['phase'].unique()))
phases

[0]

In [5]:
df_item = pd.read_csv(
    '../data/underexpose_train/underexpose_item_feat.csv', header=None)
df_item.columns = ['item_id'] + ['txt_vec' +
                                 str(i) for i in range(128)] + ['img_vec'+str(i) for i in range(128)]
df_item['txt_vec0'] = df_item['txt_vec0'].apply(lambda x: float(x[1:]))
df_item['txt_vec127'] = df_item['txt_vec127'].apply(
    lambda x: float(x[:-1]))
df_item['img_vec0'] = df_item['img_vec0'].apply(lambda x: float(x[1:]))
df_item['img_vec127'] = df_item['img_vec127'].apply(
    lambda x: float(x[:-1]))
df_item.drop_duplicates(['item_id'], inplace=True)

In [6]:
df_user = pd.read_csv(
    '../data/underexpose_train/underexpose_user_feat.csv', header=None)
df_user.columns = ['user_id', 'user_age_level',
                   'user_gender', 'user_city_level']

gender_map = {'F': 0, 'M': 1}
df_user['user_gender'] = df_user['user_gender'].map(gender_map)

df_user.drop_duplicates(['user_id'], inplace=True)

In [7]:
df_click = pd.read_pickle('../user_data/data/click.pkl')
df_click = df_click.merge(df_user, how='left')

In [8]:
df_feature.head()

Unnamed: 0,user_id,phase,query_time,item_id,label
0,7,0,0.98394,591,0.0
1,7,0,0.98394,20201,0.0
2,7,0,0.98394,7709,0.0
3,7,0,0.98394,4340,0.0
4,7,0,0.98394,9338,0.0


In [9]:
def group_func(df, group_func_dic, group_key):
    if isinstance(group_func_dic, str):
        group_func_dic = [group_func_dic]

    features = df.groupby(group_key).agg(group_func_dic)
    features.columns = ['_'.join(group_key) + '_' + e[0] + "_" + e[1]
                        for e in features.columns.tolist()]
    features.reset_index(inplace=True)
    return features

# 物品属性

In [10]:
from sklearn.decomposition import PCA
dim = 10
pca = PCA(n_components=dim, random_state=seed)
df_txt_pca = pd.DataFrame(pca.fit_transform(
    df_item[['txt_vec' + str(i) for i in range(128)]]))
df_txt_pca.columns = ['txt_vec{}_pca{}'.format(dim, i) for i in range(dim)]
df_txt_pca['item_id'] = df_item[['item_id']]
df_feature = df_feature.merge(df_txt_pca, how='left')

pca = PCA(n_components=dim, random_state=seed)
df_img_pca = pd.DataFrame(pca.fit_transform(
    df_item[['img_vec' + str(i) for i in range(128)]]))
df_img_pca.columns = ['img_vec{}_pca{}'.format(dim, i) for i in range(dim)]
df_img_pca['item_id'] = df_item[['item_id']]
df_feature = df_feature.merge(df_img_pca, how='left')
print(df_feature.shape)

(2496846, 25)


In [11]:
df_feature.head()

Unnamed: 0,user_id,phase,query_time,item_id,label,txt_vec10_pca0,txt_vec10_pca1,txt_vec10_pca2,txt_vec10_pca3,txt_vec10_pca4,txt_vec10_pca5,txt_vec10_pca6,txt_vec10_pca7,txt_vec10_pca8,txt_vec10_pca9,img_vec10_pca0,img_vec10_pca1,img_vec10_pca2,img_vec10_pca3,img_vec10_pca4,img_vec10_pca5,img_vec10_pca6,img_vec10_pca7,img_vec10_pca8,img_vec10_pca9
0,7,0,0.98394,591,0.0,,,,,,,,,,,,,,,,,,,,
1,7,0,0.98394,20201,0.0,7.315396,2.445671,-2.456841,-0.265093,1.541942,-1.857202,-0.661244,1.367767,3.947654,1.537049,-3.902055,7.722759,1.712466,1.263972,-4.513811,1.919901,5.909709,-2.219446,1.426612,-5.519471
2,7,0,0.98394,7709,0.0,,,,,,,,,,,,,,,,,,,,
3,7,0,0.98394,4340,0.0,,,,,,,,,,,,,,,,,,,,
4,7,0,0.98394,9338,0.0,-0.939243,4.632142,-4.438131,-0.889346,-1.173805,-4.121347,2.271114,5.32742,-1.557398,4.299243,-3.981642,9.816935,1.646795,-2.676538,-5.349474,2.044283,7.449821,-6.21031,-1.30953,-1.486131


In [12]:
df_tmp = df_click.groupby(['phase', 'item_id']).size().reset_index()
df_tmp.columns = ['phase', 'item_id', 'phase_item_clickd_count']
df_feature = df_feature.merge(df_tmp, how='left')
print(df_feature.shape)
del df_tmp
gc.collect()

(2496846, 26)


0

In [13]:
# item 点击间隔
df_temp = df_click[['phase', 'item_id', 'time']].copy()
df_temp.sort_values(['time'], inplace=True)
df_temp['phase_item_click_time_diff'] = df_temp.groupby(['item_id', 'phase'])[
    'time'].diff()
df_temp = df_temp.groupby(['item_id'])['phase_item_click_time_diff'].agg(
    phase_item_click_time_diff_mean='mean').reset_index()
df_feature = df_feature.merge(df_temp, how='left')
print(df_feature.shape)
del df_temp
gc.collect()

(2496846, 27)


0

In [14]:
# item 消费年龄统计
group_func_dict = {
    'user_age_level': ['mean', 'min', 'max', 'std'],
}
df_temp = group_func(df_click, group_func_dict, group_key=['item_id', 'phase'])
df_feature = df_feature.merge(df_temp, how='left')
del df_temp
gc.collect()
print(df_feature.shape)

(2496846, 31)


In [15]:
# item 性别统计
from scipy import stats
df_temp = df_click.groupby(['item_id', 'phase'])[
    'user_gender'].mean().reset_index()
df_temp.columns = ['item_id', 'phase', 'phase_item_click_gender_mean']
df_feature = df_feature.merge(df_temp, how='left')
del df_temp
gc.collect()
print(df_feature.shape)

(2496846, 32)


# 用户属性

In [16]:
# user 属性信息
df_feature = df_feature.merge(df_user, how='left')
print(df_feature.shape)

(2496846, 35)


In [17]:
# count 特征
df_tmp = df_click.groupby(['user_id', 'phase']).size().reset_index()
df_tmp.columns = ['user_id', 'phase', 'phase_user_click_count']
df_feature = df_feature.merge(df_tmp, how='left')
print(df_feature.shape)
del df_tmp
gc.collect()

df_tmp = df_click.groupby(['phase', 'user_age_level']).size().reset_index()
df_tmp.columns = ['phase', 'user_age_level',
                  'phase_user_age_level_click_count']
df_feature = df_feature.merge(df_tmp, how='left')
print(df_feature.shape)
del df_tmp
gc.collect()

(2496846, 36)
(2496846, 37)


0

In [18]:
group_func_dict = {
    'time': ['min', 'max', 'std'],
}
df_temp = group_func(df_click, group_func_dict, group_key=['user_id', 'phase'])
df_feature = df_feature.merge(df_temp, how='left')
print(df_feature.shape)
del df_temp
gc.collect()

df_feature['user_id_phase_time_max_min_diff'] = df_feature['user_id_phase_time_max'] - \
    df_feature['user_id_phase_time_min']

df_feature['user_id_phase_query_lastbuy_time_diff'] = df_feature['query_time'] - \
    df_feature['user_id_phase_time_max']
print(df_feature.shape)

del df_feature['user_id_phase_time_max'], df_feature['user_id_phase_time_min']

(2496846, 40)
(2496846, 42)


In [19]:
df_feature.head()

Unnamed: 0,user_id,phase,query_time,item_id,label,txt_vec10_pca0,txt_vec10_pca1,txt_vec10_pca2,txt_vec10_pca3,txt_vec10_pca4,txt_vec10_pca5,txt_vec10_pca6,txt_vec10_pca7,txt_vec10_pca8,txt_vec10_pca9,img_vec10_pca0,img_vec10_pca1,img_vec10_pca2,img_vec10_pca3,img_vec10_pca4,img_vec10_pca5,img_vec10_pca6,img_vec10_pca7,img_vec10_pca8,img_vec10_pca9,phase_item_clickd_count,phase_item_click_time_diff_mean,item_id_phase_user_age_level_mean,item_id_phase_user_age_level_min,item_id_phase_user_age_level_max,item_id_phase_user_age_level_std,phase_item_click_gender_mean,user_age_level,user_gender,user_city_level,phase_user_click_count,phase_user_age_level_click_count,user_id_phase_time_std,user_id_phase_time_max_min_diff,user_id_phase_query_lastbuy_time_diff
0,7,0,0.98394,591,0.0,,,,,,,,,,,,,,,,,,,,,98.0,2.321776e-06,4.25,1.0,8.0,1.900764,0.125,,,,4,,3.9e-05,6.8e-05,9.4e-05
1,7,0,0.98394,20201,0.0,7.315396,2.445671,-2.456841,-0.265093,1.541942,-1.857202,-0.661244,1.367767,3.947654,1.537049,-3.902055,7.722759,1.712466,1.263972,-4.513811,1.919901,5.909709,-2.219446,1.426612,-5.519471,82.0,5.267002e-07,4.608696,2.0,7.0,1.269901,0.086957,,,,4,,3.9e-05,6.8e-05,9.4e-05
2,7,0,0.98394,7709,0.0,,,,,,,,,,,,,,,,,,,,,89.0,6.000701e-06,4.608696,2.0,7.0,1.64425,0.086957,,,,4,,3.9e-05,6.8e-05,9.4e-05
3,7,0,0.98394,4340,0.0,,,,,,,,,,,,,,,,,,,,,83.0,1.250319e-06,4.230769,2.0,8.0,2.178214,0.115385,,,,4,,3.9e-05,6.8e-05,9.4e-05
4,7,0,0.98394,9338,0.0,-0.939243,4.632142,-4.438131,-0.889346,-1.173805,-4.121347,2.271114,5.32742,-1.557398,4.299243,-3.981642,9.816935,1.646795,-2.676538,-5.349474,2.044283,7.449821,-6.21031,-1.30953,-1.486131,95.0,3.006781e-06,4.421053,2.0,7.0,1.387075,0.0,,,,4,,3.9e-05,6.8e-05,9.4e-05


# U-I 交互属性

In [20]:
phase_user_item_dict = {}
for phase in phases:
    df_click_temp = df_click[df_click['phase'] == phase]

    user_item_ = df_click_temp.groupby(
        'user_id')['item_id'].agg(list).reset_index()
    user_item_dict = dict(zip(user_item_['user_id'], user_item_['item_id']))

    phase_user_item_dict[phase] = user_item_dict

## itemcf

In [21]:
# 用户历史点击物品与待预测物品相似度（itemcf）
f = open('../user_data/model/if_sim.pkl', 'rb')
item_sim_if = pickle.load(f)
f.close()

In [22]:
def func_if_sum(x):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]
    interacted_items = interacted_items[::-1]

    sim_sum = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim_sum += item_sim_if[phase][i][item_id] * (0.7**loc)
        except Exception as e:
            pass
    return sim_sum


def func_if_max(x):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]
    interacted_items = interacted_items[::-1]

    sim_max = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim = item_sim_if[phase][i][item_id]
            if sim > sim_max:
                sim_max = sim
        except Exception as e:
            pass
    return sim_max


def func_if_last(x):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    last_item = phase_user_item_dict[phase][user_id][-1]

    sim = 0
    try:
        sim = item_sim_if[phase][last_item][item_id]
    except Exception as e:
        pass
    return sim


def func_if_rolling_sum(x, window):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]
    interacted_items = interacted_items[-window:]

    sim_sum = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim_sum += item_sim_if[phase][i][item_id]
        except Exception as e:
            pass
    return sim_sum


def func_if_rolling_mean(x, window):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]
    interacted_items = interacted_items[-window:]

    sim_sum = 0
    count = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim_sum += item_sim_if[phase][i][item_id]
            count += 1
        except Exception as e:
            pass
    if count != 0:
        return sim_sum / count
    else:
        return 0

In [23]:
df_feature['user_click_item_if_sim_sum'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(func_if_sum, axis=1)
df_feature['user_click_item_if_sim_max'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(func_if_max, axis=1)
df_feature['user_last_click_item_if_sim'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(func_if_last, axis=1)

df_feature['user_click_item_if_sim_rolling2_sum'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(lambda x: func_if_rolling_sum(x, 2), axis=1)
print(df_feature.shape)

(2496846, 44)


In [24]:
del item_sim_if
gc.collect()

0

## 天才召回相似度

In [25]:
# 天才召回相似度
f = open('../user_data/model/tiancai_sim.pkl', 'rb')
item_sim_tc = pickle.load(f)
f.close()

In [26]:
def func_tc_sum(x):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]

    sim_sum = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim_sum += item_sim_tc[phase][i][item_id]
        except Exception as e:
            pass
    return sim_sum


def func_tc_max(x):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]
    interacted_items = interacted_items[::-1]

    sim_max = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim = item_sim_tc[phase][i][item_id]
            if sim > sim_max:
                sim_max = sim
        except Exception as e:
            pass
    return sim_max


def func_tc_rolling_sum(x, window):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]
    interacted_items = interacted_items[-window:]

    sim_sum = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim_sum += item_sim_tc[phase][i][item_id]
        except Exception as e:
            pass
    return sim_sum

In [27]:
df_feature['user_click_item_tc_sim_sum'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(func_tc_sum, axis=1)
df_feature['user_click_item_tc_sim_max'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(func_tc_max, axis=1)
df_feature['user_click_item_tc_sim_rolling2_sum'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(lambda x: func_tc_rolling_sum(x, 2), axis=1)
print(df_feature.shape)

(2496846, 47)


In [28]:
del item_sim_tc
gc.collect()

0

## binetwork

In [29]:
# 用户历史点击物品与待预测物品相似度(binetwork)
f = open('../user_data/model/bn_sim.pkl', 'rb')
item_sim_bn = pickle.load(f)
f.close()

In [30]:
def func_bn_sum(x):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]
    interacted_items = interacted_items[::-1]

    sim_sum = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim_sum += item_sim_bn[phase][i][item_id] * (0.7**loc)
        except Exception as e:
            pass
    return sim_sum

In [31]:
df_feature['user_click_item_bn_sim_sum'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(func_bn_sum, axis=1)
print(df_feature.shape)

(2496846, 48)


In [32]:
del item_sim_bn
gc.collect()

0

## Word2Vec

In [33]:
def consine_distance(vector1, vector2):
    if type(vector1) != np.ndarray or type(vector2) != np.ndarray:
        return -1
    distance = np.dot(vector1, vector2) / \
        (np.linalg.norm(vector1)*(np.linalg.norm(vector2)))
    return distance

In [34]:
emb_size = 32
tmp = df_click.groupby(['user_id', 'phase'], as_index=False)['item_id'].agg(
    {'list': list})
sentences = tmp['list'].values.tolist()
words = []
for i in range(len(sentences)):
    words += [x for x in sentences[i]]
    sentences[i] = [str(x) for x in sentences[i]]

if os.path.exists('../user_data/model/word2vec.model'):
    model = Word2Vec.load('model/word2vec.model')
else:
    model = Word2Vec(sentences, size=emb_size, window=10,
                     min_count=1, sg=1, hs=1, seed=seed)
    model.save('../user_data/model/word2vec.model')


emb_matrix = []
words = list(set(words))
items = []
for word in tqdm(words):
    if str(word) in model:
        items.append(word)
        emb_matrix.append(model[str(word)])

item_w2w_vec_dict = dict(zip(items, emb_matrix))

100%|██████████| 117705/117705 [00:01<00:00, 65483.83it/s]


In [35]:
df_item_w2v = pd.DataFrame(emb_matrix)
df_item_w2v.columns = ['item_w2v_{}'.format(i) for i in range(emb_size)]
df_item_w2v['item_id'] = items
df_feature = df_feature.merge(df_item_w2v, how='left')
print(df_feature.shape)

(2496846, 80)


In [36]:
def func_w2w_sum(x, num):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]
    interacted_items = interacted_items[::-1][:num]

    sim_sum = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim_sum += consine_distance(
                item_w2w_vec_dict[item_id], item_w2w_vec_dict[i])
        except Exception as e:
            pass
    return sim_sum


def func_w2w_last_sim(x):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    last_item = phase_user_item_dict[phase][user_id][-1]

    sim = 0
    try:
        sim = consine_distance(
            item_w2w_vec_dict[item_id], item_w2w_vec_dict[last_item])
    except Exception as e:
        pass
    return sim

In [37]:
df_feature['user_last_click_item_w2w_sim'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(func_w2w_last_sim, axis=1)
df_feature['user_click_item_w2w_sim_sum_2'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(lambda x: func_w2w_sum(x, 2), axis=1)
print(df_feature.shape)

(2496846, 82)


In [38]:
del item_w2w_vec_dict
gc.collect()

0

## txt_vec

In [39]:
# 用户与待预测物品文本相似度
df_user_txt_vec = pd.read_pickle('../user_data/model/user_txt_vec.pkl')
df_item_txt_vec = pd.read_pickle('../user_data/model/item_txt_vec.pkl')
item_txt_vec_dict = dict(
    zip(df_item_txt_vec['item_id'], df_item_txt_vec['item_txt_vec']))


phase_user_txt_vec_dict = {}
for phase in phases:
    df_user_txt_vec_phase = df_user_txt_vec[df_user_txt_vec['phase'] == phase]

    user_txt_vec_dict = dict(
        zip(df_user_txt_vec_phase['user_id'], df_user_txt_vec_phase['user_txt_vec']))
    phase_user_txt_vec_dict[phase] = user_txt_vec_dict

In [40]:
def func_txt_sim(x):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    sim = 0
    try:
        sim = consine_distance(
            phase_user_txt_vec_dict[phase][user_id], item_txt_vec_dict[item_id])
    except Exception as e:
        pass
    return sim

In [41]:
df_feature['user_item_txt_sim'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(func_txt_sim, axis=1)
print(df_feature.shape)

(2496846, 83)


In [42]:
# 用户历史购买物品与待预测物品文本相似度
def func_txt_sum(x):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]
    interacted_items = interacted_items[::-1]

    sim_sum = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim_sum += consine_distance(
                item_txt_vec_dict[i], item_txt_vec_dict[item_id]) * (0.7 ** loc)
        except Exception as e:
            pass
    return sim_sum


def func_txt_max(x):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]
    interacted_items = interacted_items[::-1]

    sim_max = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim = consine_distance(
                item_txt_vec_dict[i], item_txt_vec_dict[item_id])
            if sim > sim_max:
                sim_max = sim
        except Exception as e:
            pass
    return sim_max


def func_txt_last(x):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    last_item = phase_user_item_dict[phase][user_id][-1]

    sim = 0
    try:
        sim = consine_distance(
            item_txt_vec_dict[last_item], item_txt_vec_dict[item_id])
    except Exception as e:
        pass
    return sim


def func_txt_rolling_sum(x, window):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]
    interacted_items = interacted_items[-window:]

    sim_sum = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim_sum += consine_distance(
                item_txt_vec_dict[i], item_txt_vec_dict[item_id])
        except Exception as e:
            pass
    return sim_sum


def func_txt_rolling_mean(x, window):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    interacted_items = phase_user_item_dict[phase][user_id]
    interacted_items = interacted_items[-window:]

    sim_sum = 0
    count = 0
    for loc, i in enumerate(interacted_items):
        try:
            sim_sum += consine_distance(
                item_txt_vec_dict[i], item_txt_vec_dict[item_id])
            count += 1
        except Exception as e:
            pass
    if count != 0:
        return sim_sum / count
    else:
        return 0

In [43]:
df_feature['user_click_item_txt_sim_sum'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(func_txt_sum, axis=1)
print(df_feature.shape)
df_feature['user_click_item_txt_sim_max'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(func_txt_max, axis=1)
print(df_feature.shape)
df_feature['user_last_click_item_txt_sim'] = df_feature[[
    'user_id', 'phase',  'item_id']].parallel_apply(func_txt_last, axis=1)
print(df_feature.shape)

df_feature['user_click_item_txt_sim_rolling2_sum'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(lambda x: func_txt_rolling_sum(x, 2), axis=1)
print(df_feature.shape)
df_feature['user_click_item_txt_sim_rolling2_mean'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(lambda x: func_txt_rolling_mean(x, 2), axis=1)
print(df_feature.shape)
df_feature['user_click_item_txt_sim_rolling3_mean'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(lambda x: func_txt_rolling_mean(x, 3), axis=1)
print(df_feature.shape)

(2496846, 84)
(2496846, 85)
(2496846, 86)
(2496846, 87)
(2496846, 88)
(2496846, 89)


In [44]:
del df_user_txt_vec, df_item_txt_vec, phase_user_txt_vec_dict
gc.collect()

0

## img vec

In [45]:
# 用户与待预测物品图片相似度
df_user_img_vec = pd.read_pickle('../user_data/model/user_img_vec.pkl')
df_item_img_vec = pd.read_pickle('../user_data/model/item_img_vec.pkl')
item_img_vec_dict = dict(
    zip(df_item_img_vec['item_id'], df_item_img_vec['item_img_vec']))

phase_user_img_vec_dict = {}
for phase in phases:
    df_user_img_vec_phase = df_user_img_vec[df_user_img_vec['phase'] == phase]

    user_img_vec_dict = dict(
        zip(df_user_img_vec['user_id'], df_user_img_vec['user_img_vec']))
    phase_user_img_vec_dict[phase] = user_txt_vec_dict

In [46]:
def func_img_sim(x):
    user_id = x['user_id']
    item_id = x['item_id']
    phase = x['phase']

    sim = 0
    try:
        sim = consine_distance(
            phase_user_img_vec_dict[phase][user_id], item_img_vec_dict[item_id])
    except Exception as e:
        pass
    return sim

In [47]:
df_feature['user_item_img_sim'] = df_feature[[
    'user_id', 'phase', 'item_id']].parallel_apply(func_img_sim, axis=1)
print(df_feature.shape)

(2496846, 90)


In [48]:
del df_user_img_vec, df_item_img_vec, item_img_vec_dict, phase_user_img_vec_dict
gc.collect()

0

In [49]:
df_feature.head()

Unnamed: 0,user_id,phase,query_time,item_id,label,txt_vec10_pca0,txt_vec10_pca1,txt_vec10_pca2,txt_vec10_pca3,txt_vec10_pca4,txt_vec10_pca5,txt_vec10_pca6,txt_vec10_pca7,txt_vec10_pca8,txt_vec10_pca9,img_vec10_pca0,img_vec10_pca1,img_vec10_pca2,img_vec10_pca3,img_vec10_pca4,img_vec10_pca5,img_vec10_pca6,img_vec10_pca7,img_vec10_pca8,img_vec10_pca9,phase_item_clickd_count,phase_item_click_time_diff_mean,item_id_phase_user_age_level_mean,item_id_phase_user_age_level_min,item_id_phase_user_age_level_max,item_id_phase_user_age_level_std,phase_item_click_gender_mean,user_age_level,user_gender,user_city_level,phase_user_click_count,phase_user_age_level_click_count,user_id_phase_time_std,user_id_phase_time_max_min_diff,user_id_phase_query_lastbuy_time_diff,user_click_item_if_sim_sum,user_click_item_if_sim_max,user_last_click_item_if_sim,user_click_item_if_sim_rolling2_sum,user_click_item_tc_sim_sum,user_click_item_tc_sim_max,user_click_item_tc_sim_rolling2_sum,user_click_item_bn_sim_sum,item_w2v_0,item_w2v_1,item_w2v_2,item_w2v_3,item_w2v_4,item_w2v_5,item_w2v_6,item_w2v_7,item_w2v_8,item_w2v_9,item_w2v_10,item_w2v_11,item_w2v_12,item_w2v_13,item_w2v_14,item_w2v_15,item_w2v_16,item_w2v_17,item_w2v_18,item_w2v_19,item_w2v_20,item_w2v_21,item_w2v_22,item_w2v_23,item_w2v_24,item_w2v_25,item_w2v_26,item_w2v_27,item_w2v_28,item_w2v_29,item_w2v_30,item_w2v_31,user_last_click_item_w2w_sim,user_click_item_w2w_sim_sum_2,user_item_txt_sim,user_click_item_txt_sim_sum,user_click_item_txt_sim_max,user_last_click_item_txt_sim,user_click_item_txt_sim_rolling2_sum,user_click_item_txt_sim_rolling2_mean,user_click_item_txt_sim_rolling3_mean,user_item_img_sim
0,7,0,0.98394,591,0.0,,,,,,,,,,,,,,,,,,,,,98.0,2.321776e-06,4.25,1.0,8.0,1.900764,0.125,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.012211,0.012211,0.012211,0.012211,0.013266,0.012349,0.013266,3.066729,0.557681,0.383235,-0.276825,-0.020304,0.281853,0.266906,-0.254671,-1.088647,0.542492,0.127196,0.030291,0.116781,0.413767,0.433424,-0.105323,0.31853,-0.177472,0.324909,0.070692,0.14177,1.375137,-0.703162,0.449329,-0.600703,0.117703,0.73036,0.049353,0.34708,-0.495872,-0.437462,0.262964,0.012519,0.962058,1.739598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,0,0.98394,20201,0.0,7.315396,2.445671,-2.456841,-0.265093,1.541942,-1.857202,-0.661244,1.367767,3.947654,1.537049,-3.902055,7.722759,1.712466,1.263972,-4.513811,1.919901,5.909709,-2.219446,1.426612,-5.519471,82.0,5.267002e-07,4.608696,2.0,7.0,1.269901,0.086957,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.008038,0.011483,0.0,0.011483,0.009501,0.009501,0.009501,2.021681,0.777837,0.311143,-0.586426,0.41908,0.186758,-0.551356,0.460286,-1.098006,1.075584,0.95767,-0.011792,0.471877,0.365782,0.365816,-0.513682,-0.847186,-0.648519,0.738351,0.54222,-0.211778,1.394127,-0.319415,1.464682,-1.367131,-0.37086,0.998552,0.380614,0.17422,-0.910878,-0.997351,0.79861,0.014986,0.763618,1.750311,0.470762,0.319291,0.465308,0.0,0.0,0.0,0.465308,-0.074763
2,7,0,0.98394,7709,0.0,,,,,,,,,,,,,,,,,,,,,89.0,6.000701e-06,4.608696,2.0,7.0,1.64425,0.086957,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.00732,0.009956,0.000351,0.010306,0.011179,0.010091,0.010896,1.72028,0.357292,0.605224,-0.930103,0.57354,0.099205,-0.130631,-0.20767,-1.377808,0.694993,0.570842,0.018772,0.268982,0.47455,0.216821,-0.281803,-0.456248,-0.503473,0.545643,-0.074638,0.224989,1.435362,-0.242414,1.091328,-0.795241,-0.231915,0.993607,0.156555,-0.44523,-0.432221,-0.684011,0.436246,-0.291141,0.807694,1.706162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,0,0.98394,4340,0.0,,,,,,,,,,,,,,,,,,,,,83.0,1.250319e-06,4.230769,2.0,8.0,2.178214,0.115385,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.00784,0.00784,0.00784,0.00784,0.009253,0.007942,0.008756,1.928506,0.232698,0.606206,-0.68387,0.323362,-0.064403,0.85125,-0.243279,-1.153997,0.116295,-0.517095,-0.181246,0.134761,0.441163,0.509473,0.102148,0.543459,-0.009501,0.095397,-0.371087,0.037042,1.189251,-0.51431,0.188141,-0.596496,-0.103534,0.555813,-0.242944,0.166725,-0.252259,0.183289,-0.355331,-0.834423,0.744072,1.162606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,0,0.98394,9338,0.0,-0.939243,4.632142,-4.438131,-0.889346,-1.173805,-4.121347,2.271114,5.32742,-1.557398,4.299243,-3.981642,9.816935,1.646795,-2.676538,-5.349474,2.044283,7.449821,-6.21031,-1.30953,-1.486131,95.0,3.006781e-06,4.421053,2.0,7.0,1.387075,0.0,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.005604,0.007599,0.000285,0.007883,0.006948,0.006948,0.006948,2.137496,1.00008,0.300363,-0.738722,0.38863,0.354816,-0.486257,0.544119,-1.198438,1.093679,0.745305,-0.10579,0.648465,0.333116,0.422558,-0.356302,-0.759719,-0.6948,0.838911,0.485842,-0.002966,1.365363,-0.40298,1.41241,-1.280845,-0.202187,0.9426,0.461122,0.060639,-0.975104,-1.021553,0.794295,-0.228268,0.788804,1.778702,0.570234,0.412694,0.546543,0.0,0.0,0.0,0.459653,-0.096689


In [50]:
# Function to reduce the memory usage
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in tqdm([f for f in df.columns if f not in ['query_time']]):
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
                        np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
                        np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(
                        np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(
                        np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(
                        np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [51]:
df_feature = reduce_mem_usage(df_feature)

100%|██████████| 89/89 [00:36<00:00,  2.44it/s]

Mem. usage decreased to 469.09 Mb (72.9% reduction)





In [52]:
df_feature.head()

Unnamed: 0,user_id,phase,query_time,item_id,label,txt_vec10_pca0,txt_vec10_pca1,txt_vec10_pca2,txt_vec10_pca3,txt_vec10_pca4,txt_vec10_pca5,txt_vec10_pca6,txt_vec10_pca7,txt_vec10_pca8,txt_vec10_pca9,img_vec10_pca0,img_vec10_pca1,img_vec10_pca2,img_vec10_pca3,img_vec10_pca4,img_vec10_pca5,img_vec10_pca6,img_vec10_pca7,img_vec10_pca8,img_vec10_pca9,phase_item_clickd_count,phase_item_click_time_diff_mean,item_id_phase_user_age_level_mean,item_id_phase_user_age_level_min,item_id_phase_user_age_level_max,item_id_phase_user_age_level_std,phase_item_click_gender_mean,user_age_level,user_gender,user_city_level,phase_user_click_count,phase_user_age_level_click_count,user_id_phase_time_std,user_id_phase_time_max_min_diff,user_id_phase_query_lastbuy_time_diff,user_click_item_if_sim_sum,user_click_item_if_sim_max,user_last_click_item_if_sim,user_click_item_if_sim_rolling2_sum,user_click_item_tc_sim_sum,user_click_item_tc_sim_max,user_click_item_tc_sim_rolling2_sum,user_click_item_bn_sim_sum,item_w2v_0,item_w2v_1,item_w2v_2,item_w2v_3,item_w2v_4,item_w2v_5,item_w2v_6,item_w2v_7,item_w2v_8,item_w2v_9,item_w2v_10,item_w2v_11,item_w2v_12,item_w2v_13,item_w2v_14,item_w2v_15,item_w2v_16,item_w2v_17,item_w2v_18,item_w2v_19,item_w2v_20,item_w2v_21,item_w2v_22,item_w2v_23,item_w2v_24,item_w2v_25,item_w2v_26,item_w2v_27,item_w2v_28,item_w2v_29,item_w2v_30,item_w2v_31,user_last_click_item_w2w_sim,user_click_item_w2w_sim_sum_2,user_item_txt_sim,user_click_item_txt_sim_sum,user_click_item_txt_sim_max,user_last_click_item_txt_sim,user_click_item_txt_sim_rolling2_sum,user_click_item_txt_sim_rolling2_mean,user_click_item_txt_sim_rolling3_mean,user_item_img_sim
0,7,0,0.98394,591,0.0,,,,,,,,,,,,,,,,,,,,,98.0,2.324581e-06,4.25,1.0,8.0,1.900391,0.125,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.012207,0.012207,0.012207,0.012207,0.013268,0.012352,0.013268,3.066406,0.557617,0.383301,-0.276855,-0.020309,0.281738,0.266846,-0.254639,-1.088867,0.54248,0.127197,0.030289,0.11676,0.413818,0.43335,-0.105347,0.318604,-0.17749,0.324951,0.070679,0.141724,1.375,-0.703125,0.449219,-0.600586,0.117676,0.730469,0.049347,0.347168,-0.49585,-0.4375,0.262939,0.01252,0.961914,1.739258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,0,0.98394,20201,0.0,7.316406,2.445312,-2.457031,-0.265137,1.541992,-1.857422,-0.661133,1.368164,3.947266,1.537109,-3.902344,7.722656,1.712891,1.263672,-4.515625,1.919922,5.910156,-2.21875,1.426758,-5.519531,82.0,5.364418e-07,4.609375,2.0,7.0,1.269531,0.086975,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.008041,0.011482,0.0,0.011482,0.009499,0.009499,0.009499,2.021484,0.777832,0.311035,-0.586426,0.419189,0.186768,-0.55127,0.460205,-1.097656,1.075195,0.95752,-0.011795,0.471924,0.365723,0.365723,-0.513672,-0.847168,-0.648438,0.738281,0.541992,-0.211792,1.394531,-0.319336,1.464844,-1.367188,-0.37085,0.998535,0.380615,0.174194,-0.910645,-0.997559,0.798828,0.014984,0.763672,1.75,0.470703,0.319336,0.465332,0.0,0.0,0.0,0.465332,-0.074768
2,7,0,0.98394,7709,0.0,,,,,,,,,,,,,,,,,,,,,89.0,6.020069e-06,4.609375,2.0,7.0,1.644531,0.086975,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.00732,0.009956,0.000351,0.010307,0.011177,0.010094,0.010895,1.720703,0.357178,0.60498,-0.930176,0.57373,0.099182,-0.130615,-0.207642,-1.37793,0.694824,0.570801,0.018768,0.269043,0.474609,0.216797,-0.281738,-0.456299,-0.503418,0.54541,-0.074646,0.224976,1.435547,-0.242432,1.091797,-0.79541,-0.231934,0.993652,0.156616,-0.445312,-0.432129,-0.684082,0.436279,-0.29126,0.807617,1.706055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,0,0.98394,4340,0.0,,,,,,,,,,,,,,,,,,,,,83.0,1.251698e-06,4.230469,2.0,8.0,2.177734,0.115356,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.007843,0.007843,0.007843,0.007843,0.009254,0.007942,0.008759,1.928711,0.232666,0.606445,-0.684082,0.323242,-0.064392,0.851074,-0.243286,-1.154297,0.116272,-0.51709,-0.181274,0.134766,0.441162,0.509277,0.102173,0.543457,-0.009499,0.095398,-0.371094,0.037048,1.189453,-0.51416,0.18811,-0.59668,-0.103516,0.555664,-0.24292,0.166748,-0.252197,0.18335,-0.355225,-0.834473,0.744141,1.163086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,0,0.98394,9338,0.0,-0.939453,4.632812,-4.4375,-0.88916,-1.173828,-4.121094,2.271484,5.328125,-1.557617,4.300781,-3.982422,9.820312,1.646484,-2.675781,-5.347656,2.044922,7.449219,-6.210938,-1.30957,-1.486328,95.0,2.980232e-06,4.421875,2.0,7.0,1.386719,0.0,,,,4,,3.9e-05,6.8e-05,9.4e-05,0.005604,0.007599,0.000285,0.007881,0.006947,0.006947,0.006947,2.136719,1.0,0.300293,-0.73877,0.388672,0.354736,-0.486328,0.543945,-1.198242,1.09375,0.745117,-0.105774,0.648438,0.333008,0.422607,-0.356201,-0.759766,-0.694824,0.838867,0.48584,-0.002966,1.365234,-0.403076,1.412109,-1.28125,-0.202148,0.942383,0.461182,0.060638,-0.975098,-1.021484,0.794434,-0.228271,0.788574,1.77832,0.570312,0.412598,0.546387,0.0,0.0,0.0,0.459717,-0.09668


In [53]:
df_feature.to_pickle('../user_data/data/rank_feature.pkl')

In [54]:
df_feature.shape

(2496846, 90)