## 召回和精排统一采用这一版的特征进行训练
只制作用户的最近1、3、5、7次点击, 以及点击时间差特征, 其余均采用原始类别特征

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

root = './data/'
train = pd.read_csv(root+"train.csv").sort_values(by=['user_id', 'click_timestamp']).reset_index(drop=True)
test = pd.read_csv(root+"test.csv").sort_values(by=['user_id', 'click_timestamp']).reset_index(drop=True)
article = pd.read_csv(root+"article.csv")

def timelogger(message=None):
    print(f"[{datetime.now()}]") if message==None else print(f"[{datetime.now()}]" + str(message))

timelogger("hello world!")

[2022-02-09 19:51:05.070917]hello world!


In [2]:
# 先合并数据, 
data = train.append(test).reset_index(drop=True)
timelogger(f"all data: {len(data)}")
data.head()

[2022-02-09 19:51:05.245890]all data: 1112623


Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,2050,1508211672520,2,0,5,0,24,1
1,0,14287,1508211702520,2,0,5,0,24,1
2,3,4573,1508211359672,2,2,0,0,24,1
3,3,2821,1508211389672,2,2,0,0,24,1
4,4,3848,1508211625466,2,0,3,0,15,0


In [3]:
# 提取最后一次点击作为标签
train_data = data.groupby("user_id").tail(1)[['user_id', "click_article_id"]]
#hist_data = data.groupby("user_id").apply(lambda x:x[:-1])[['user_id', "click_article_id"]]
timelogger(f"pos nums: {len(train_data)}")
#timelogger(f"hist nums: {len(hist_data)}")
train_data.head()       

[2022-02-09 19:51:05.570962]pos nums: 200000


Unnamed: 0,user_id,click_article_id
1,0,14287
3,3,2821
5,4,3339
7,5,20952
9,6,753


In [4]:
# 制作user feature
# 获取历史点击序列 -> list
user_click_hist = {}
for i, row in tqdm(data.iterrows()):
    user, item, time = row['user_id'], row['click_article_id'], row['click_timestamp']
    user_click_hist.setdefault(user, [])
    user_click_hist[user].append((item, time))

# 为每个用户生成最后N次点击序列, 不足部分用PAD补齐
PAD_IDX = len(article)
def get_last_N_click(user, N):
    #res = user_click_hist[user][-N-1:-1]
    res = [x[0] for x in user_click_hist[user]][-N-1:-1]
    while(len(res)<N):
        res.append(PAD_IDX)
    return res

# 统计特征
count_feature = [
    'user_last_click_1t',
    'user_last_click_3t',
    'user_last_click_5t',
    'user_last_click_7t',
]

last_click_dict = {"user_id":[]}
for i in count_feature: last_click_dict[i] = []
for user in tqdm(list(train_data['user_id'])):
    last_click_dict['user_id'].append(user)
    for i in range(4):
        N = 2*i + 1
        last_click_dict[count_feature[i]].append(get_last_N_click(user, N))

user_last_click_fea = pd.DataFrame(last_click_dict)

timelogger(f"finish get count feature || {len(user_last_click_fea)}")
user_last_click_fea.tail()


1112623it [03:19, 5576.94it/s]
100%|██████████| 200000/200000 [00:06<00:00, 32734.96it/s]


[2022-02-09 19:54:32.713221]finish get count feature || 200000


Unnamed: 0,user_id,user_last_click_1t,user_last_click_3t,user_last_click_5t,user_last_click_7t
199995,199978,[26094],"[25003, 25315, 26094]","[29721, 26534, 25003, 25315, 26094]","[29721, 26534, 25003, 25315, 26094, 31116, 31116]"
199996,199982,[18192],"[11816, 689, 18192]","[20806, 19527, 11816, 689, 18192]","[18914, 24804, 20806, 19527, 11816, 689, 18192]"
199997,199988,[14180],"[14180, 31116, 31116]","[14180, 31116, 31116, 31116, 31116]","[14180, 31116, 31116, 31116, 31116, 31116, 31116]"
199998,199990,[8226],"[17806, 8322, 8226]","[23843, 17806, 8322, 8226, 31116]","[23843, 17806, 8322, 8226, 31116, 31116, 31116]"
199999,199999,[14520],"[6705, 14209, 14520]","[14752, 3743, 6705, 14209, 14520]","[12036, 15544, 14752, 3743, 6705, 14209, 14520]"


In [5]:
# 获取类别特征, 取众数
user_feature = [
    'user_id',
    'click_environment',
    'click_deviceGroup',
    'click_os',
    'click_country',
    'click_region',
    'click_referrer_type',
]

timelogger("start build user category feature")
user_fea = data[user_feature].groupby("user_id").agg(lambda x:int(np.mean(x.mode()))).reset_index()

# 获取最后一次点击的时间, 并拼入
user_fea['last_click_time'] = [user_click_hist[user][-2][1] for user in user_click_hist]

timelogger(f"user feature: {len(user_fea)}")

user_fea.head()

[2022-02-09 20:04:18.751605]user feature: 200000


Unnamed: 0,user_id,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type,last_click_time
0,0,2,0,5,0,24,1,1508211672520
1,1,2,0,5,0,24,5,1508211359672
2,2,2,2,7,0,24,1,1508211625466
3,3,2,2,0,0,24,1,1508211243884
4,4,2,0,3,0,15,0,1508212140776


In [6]:
# 拼接user特征
user_fea = user_fea.merge(user_last_click_fea, how='left', on='user_id')

for fea in user_feature:
    print(f"\"{fea}\": {max(list(user_fea[fea]))+1}")

# 保存用户特征, 后面排序用
train_user = np.unique(np.array(train['user_id']))
test_user = np.unique(np.array(test['user_id']))

user_fea[user_fea["user_id"].isin(train_user)].to_csv('./feature/train_user_feature.csv', index=False)
user_fea[user_fea["user_id"].isin(test_user)].to_csv('./feature/test_user_feature.csv', index=False)
timelogger("successfully save user feature to ./feature/user_feature.csv")

"user_id": 200000
"click_environment": 3
"click_deviceGroup": 5
"click_os": 8
"click_country": 11
"click_region": 28
"click_referrer_type": 7
[2022-02-09 20:04:25.114963]successfully save test user feature to ./feature/user_feature.csv


In [7]:
# 制作item的特征
item_feature = [
    'click_article_id',
    'category_id',
]

item_fea = {"click_article_id":[], "category_id":[], "created_at_ts":[]}
for i,row in tqdm(article.iterrows()):
    for fea in item_fea:
        item_fea[fea].append(row[fea])

item_fea = pd.DataFrame(item_fea)
timelogger(f"item_fea: {len(item_fea)}")

for fea in item_feature:
    print(f"\"{fea}\": {max(list(item_fea[fea]))+1}")

# 保存文章特征
item_fea.to_csv('./feature/item_feature.csv', index=False)
timelogger("successfully save item feature to ./feature/item_feature.csv")

item_fea.head()

31116it [00:05, 5690.28it/s]


[2022-02-09 20:04:30.888323]item_fea: 31116
"click_article_id": 31116
"category_id": 290
[2022-02-09 20:04:31.048340]successfully save test user feature to ./feature/item_feature.csv


Unnamed: 0,click_article_id,category_id,created_at_ts
0,0,0,1408468313000
1,1,0,1368528105000
2,2,0,1371821736000
3,3,0,1373808015000
4,4,0,1378460105000
