## 拼接特征生成样本 & 负采样(4:1)

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

data_path = './data/'
train = pd.read_csv(data_path+"train.csv").reset_index(drop=True)
test = pd.read_csv(data_path+"test.csv").reset_index(drop=True)

feature_path = './feature/'
train_user_feature = pd.read_csv(feature_path+"train_user_feature.csv")
test_user_feature = pd.read_csv(feature_path+"test_user_feature.csv")
item_feature = pd.read_csv(feature_path+"item_feature.csv")

def timelogger(message=None):
    print(f"[{datetime.now()}]") if message==None else print(f"[{datetime.now()}]" + str(message))

timelogger("hello world!")

[2022-02-10 20:26:48.022307]hello world!


In [3]:
# 先合并数据, 
data = train.append(test).sort_values(by=["user_id", "click_timestamp"]).reset_index(drop=True)
user_feature = train_user_feature.append(test_user_feature).reset_index(drop=True)

# 提取最后一次点击
train_data = data.groupby("user_id").tail(1)[['user_id', "click_article_id"]]

# 打标签
train_data['label'] = np.ones(len(train_data), dtype=int)

timelogger(f"pos nums: {len(train_data)}")
print(train_data.head())       

[2022-02-10 20:27:00.372099]pos nums: 200000
   user_id  click_article_id  label
1        0             14287      1
3        1              5881      1
5        2             15525      1
7        3              2821      1
9        4              3339      1


In [4]:
# 随机负采样, 从用户最后一次点击之前创建的文章中, 每个用户随机选4篇不在历史点击中的文章作为负样本
# 获取历史点击序列
user_click_hist = {}
for i, row in tqdm(data.iterrows()):
    user, item = row['user_id'], row['click_article_id']
    user_click_hist.setdefault(user, set())
    user_click_hist[user].add(item)

# 获取文章创建时间的字典
article_click_dict = {}
for i,row in tqdm(item_feature.iterrows()):
    article_click_dict[row['click_article_id']] = row['created_at_ts']

# 负采样
NEG_SAMPLE = 4
neg_data = {"user_id":[], "click_article_id":[]}
for i,row in tqdm(user_feature.iterrows()):
    user, clk_time = row['user_id'], row['last_click_time']
    candidate = list(article_click_dict.keys())
    np.random.shuffle(candidate)
    count = 0
    for art in candidate:
        if art not in user_click_hist[user] and article_click_dict[art]<clk_time:
            neg_data['user_id'].append(user)
            neg_data['click_article_id'].append(art)
            count += 1
        if count==NEG_SAMPLE:break

neg_data = pd.DataFrame(neg_data)
neg_data['label'] = np.zeros(len(neg_data), dtype=int)

timelogger(f"finish neg sample: {len(neg_data)}")

print(neg_data.head())


1112623it [03:03, 6054.41it/s]
31116it [00:04, 6624.62it/s]
200000it [14:13, 234.39it/s]


[2022-02-10 20:44:26.595564]finish neg sample: 800000
   user_id  click_article_id  label
0        0             21875      0
1        0             10749      0
2        0              9482      0
3        0             16964      0
4        3             25267      0


In [5]:
# 合并正负样本
train_data = train_data.append(neg_data).sort_values(by=['user_id'])

# 拼接特征
train_data = train_data.merge(user_feature, on='user_id', how='left').merge(item_feature, on='click_article_id', how='left').reset_index(drop=True)

timelogger(f"finish merge pos and neg sample : {len(train_data)}")
train_data.head(10)

[2022-02-10 21:09:34.676749]finish merge pos and neg sample : 1000000


Unnamed: 0,user_id,click_article_id,label,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type,last_click_time,user_last_click_1t,user_last_click_3t,user_last_click_5t,user_last_click_7t,category_id,created_at_ts
0,0,14287,1,2,0,5,0,24,1,1508211672520,[2050],"[2050, 31116, 31116]","[2050, 31116, 31116, 31116, 31116]","[2050, 31116, 31116, 31116, 31116, 31116, 31116]",175,1508236945000
1,0,16964,0,2,0,5,0,24,1,1508211672520,[2050],"[2050, 31116, 31116]","[2050, 31116, 31116, 31116, 31116]","[2050, 31116, 31116, 31116, 31116, 31116, 31116]",189,1507626059000
2,0,9482,0,2,0,5,0,24,1,1508211672520,[2050],"[2050, 31116, 31116]","[2050, 31116, 31116, 31116, 31116]","[2050, 31116, 31116, 31116, 31116, 31116, 31116]",127,1505837375000
3,0,10749,0,2,0,5,0,24,1,1508211672520,[2050],"[2050, 31116, 31116]","[2050, 31116, 31116, 31116, 31116]","[2050, 31116, 31116, 31116, 31116, 31116, 31116]",141,1507753934000
4,0,21875,0,2,0,5,0,24,1,1508211672520,[2050],"[2050, 31116, 31116]","[2050, 31116, 31116, 31116, 31116]","[2050, 31116, 31116, 31116, 31116, 31116, 31116]",236,1504952882000
5,1,24929,0,2,0,5,0,24,5,1508211359672,[25682],"[25682, 31116, 31116]","[25682, 31116, 31116, 31116, 31116]","[25682, 31116, 31116, 31116, 31116, 31116, 31116]",256,1506723144000
6,1,13081,0,2,0,5,0,24,5,1508211359672,[25682],"[25682, 31116, 31116]","[25682, 31116, 31116, 31116, 31116]","[25682, 31116, 31116, 31116, 31116, 31116, 31116]",164,1507456697000
7,1,11846,0,2,0,5,0,24,5,1508211359672,[25682],"[25682, 31116, 31116]","[25682, 31116, 31116, 31116, 31116]","[25682, 31116, 31116, 31116, 31116, 31116, 31116]",152,1506796408000
8,1,20656,0,2,0,5,0,24,5,1508211359672,[25682],"[25682, 31116, 31116]","[25682, 31116, 31116, 31116, 31116]","[25682, 31116, 31116, 31116, 31116, 31116, 31116]",230,1455272540000
9,1,5881,1,2,0,5,0,24,5,1508211359672,[25682],"[25682, 31116, 31116]","[25682, 31116, 31116, 31116, 31116]","[25682, 31116, 31116, 31116, 31116, 31116, 31116]",78,1508142585000


In [6]:
# 构造点击时差特征
def get_time_diff(last_clk_time, created_ts):
    time_diff = last_clk_time-created_ts
    if time_diff < 3*60*60*1000: return 0
    elif 3*60*60*1000 < time_diff < 12*60*60*1000: return 1
    elif 12*60*60*1000 < time_diff < 24*60*60*1000 : return 2
    else: return 3

time_diff = []
for i,row in train_data.iterrows():
    time_diff.append(get_time_diff(row['last_click_time'], row['created_at_ts']))

train_data['time_diff'] = time_diff

train_data = train_data.drop(['last_click_time', 'created_at_ts'], axis=1)

In [8]:
# 这里对train_data可视化一下就可以发现, 标签为0的样本对基本上满足关系time_diff=3, 而正样本基本上time_diff=0 or 1
train_data[['user_id', 'click_article_id', 'time_diff', 'label']]

Unnamed: 0,user_id,click_article_id,time_diff,label
0,0,14287,0,1
1,0,16964,3,0
2,0,9482,3,0
3,0,10749,3,0
4,0,21875,3,0
...,...,...,...,...
999995,199999,21216,3,0
999996,199999,20828,3,0
999997,199999,322,3,0
999998,199999,19718,1,1


In [9]:
# 拆分训练 测试集, 并打乱
train_user = np.array(train_user_feature['user_id'])
test_user = np.array(test_user_feature['user_id'])

save_path = './data/'
train_data[train_data['user_id'].isin(train_user)].sample(frac=1).to_csv(save_path+'train_data.csv', index=False)
train_data[train_data['user_id'].isin(test_user)].sample(frac=1).to_csv(save_path+'test_data.csv', index=False)
timelogger(f"successfully save train data to ./data/train_data(test_data).csv")

[2022-02-10 21:44:35.791023]successfully save train data to ./data/train_data(test_data).csv
