In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

root = './data/'
train = pd.read_csv(root+"train_click_log.csv")
#test = pd.read_csv(root+"testA_click_log.csv")
article = pd.read_csv(root+"articles.csv")

def timelogger(message=None):
    print(f"[{datetime.now()}]") if message==None else print(f"[{datetime.now()}]" + str(message))

timelogger("hello world!")

[2022-02-07 14:17:54.325244]hello world!


In [2]:
# 合并数据
#data = train.append(test).sort_values(by=['user_id', 'click_article_id', 'click_timestamp']).reset_index(drop=True)
data = train.sort_values(by=['user_id', 'click_timestamp']).reset_index(drop=True)

timelogger(f'all samples nums: {len(data)}')
data.head()

[2022-02-07 14:17:57.420931]all samples nums: 1112623


Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,30760,1508211672520,4,1,17,1,25,2
1,0,157507,1508211702520,4,1,17,1,25,2
2,1,289197,1508211316889,4,1,17,1,25,6
3,1,63746,1508211346889,4,1,17,1,25,6
4,2,36162,1508211438695,4,3,20,1,25,2


In [3]:
# # 时间分桶
# def time_norm(T, t):
#     if t < T - 24*60*60*1000: return 0
#     elif t < T - 12*60*60*1000: return 1
#     elif t < T - 3*60*60*1000: return 2
#     else: return 3

In [4]:
# 获取用户的历史点击 user -> item -> click_time
user_click_hist = {}
for i, row in tqdm(data.iterrows()):
    user, item, time = row['user_id'], row['click_article_id'], row['click_timestamp']
    user_click_hist.setdefault(user, {})
    user_click_hist[user][item] = time


# 选出点击序列大于1的用户并保留
valid_users = []
for user in tqdm(user_click_hist):
    if len(user_click_hist[user])>1:
        valid_users.append(user)

timelogger(f'valid users: {len(valid_users)}')

data = data[data['user_id'].isin(valid_users)].reset_index(drop=True)
timelogger(f'all samples nums: {len(data)}')

data.head()

1112623it [03:09, 5871.85it/s]
100%|██████████| 200000/200000 [00:00<00:00, 582966.09it/s]


[2022-02-07 14:21:07.447187]valid users: 200000
[2022-02-07 14:21:07.741252]all samples nums: 1112623


Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,30760,1508211672520,4,1,17,1,25,2
1,0,157507,1508211702520,4,1,17,1,25,2
2,1,289197,1508211316889,4,1,17,1,25,6
3,1,63746,1508211346889,4,1,17,1,25,6
4,2,36162,1508211438695,4,3,20,1,25,2


In [5]:
# 选出有交互的文章
valid_articles = np.unique(np.array(data['click_article_id']))
article = article[article['article_id'].isin(valid_articles)].reset_index(drop=True).rename(columns={'article_id':"click_article_id"})

timelogger(f'valid articles: {len(article)}')
article.head()

[2022-02-07 14:21:08.058343]valid articles: 31116


Unnamed: 0,click_article_id,category_id,created_at_ts,words_count
0,3,1,1408468313000,230
1,69,1,1368528105000,290
2,84,1,1371821736000,158
3,94,1,1373808015000,204
4,125,1,1378460105000,163


In [6]:
# 各种id特征重新编码
from sklearn.preprocessing import LabelEncoder

features = [
    'user_id',
    'click_article_id',
    'click_environment',
    'click_deviceGroup',
    'click_os',
    'click_country',
    'click_region',
    'click_referrer_type',
    'category_id',
]

timelogger("Begin Label Encoder and Transform.")
feature_map = {}
for fea in features:
    feature_map[fea] = LabelEncoder()
    if fea in data.columns.values:
        feature_map[fea].fit(np.array(data[fea]))
        data[fea] = feature_map[fea].transform(np.array(data[fea]))
    if fea in article.columns.values:
        if fea == "category_id":
            feature_map[fea].fit(np.array(article[fea]))
        article[fea] = feature_map[fea].transform(np.array(article[fea]))


print(feature_map)
data.head()

[2022-02-07 14:21:12.599379]Begin Label Encoder and Transform.
{'user_id': LabelEncoder(), 'click_article_id': LabelEncoder(), 'click_environment': LabelEncoder(), 'click_deviceGroup': LabelEncoder(), 'click_os': LabelEncoder(), 'click_country': LabelEncoder(), 'click_region': LabelEncoder(), 'click_referrer_type': LabelEncoder(), 'category_id': LabelEncoder()}


Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,2050,1508211672520,2,0,5,0,24,1
1,0,14287,1508211702520,2,0,5,0,24,1
2,1,25682,1508211316889,2,0,5,0,24,5
3,1,5881,1508211346889,2,0,5,0,24,5
4,2,2821,1508211438695,2,2,7,0,24,1


In [7]:
# 拆分训练集, 随机选取15w作为测试集
all_users = np.array(list(range(len(valid_users))))
np.random.shuffle(all_users)
train_users = all_users[:int(0.75*len(all_users))]
test_users = all_users[int(0.75*len(all_users)):]

train = data[data['user_id'].isin(train_users)]
test = data[data['user_id'].isin(test_users)]

timelogger(f"train examples: {len(train)}|| test examples: {len(test)}")
train.head()

[2022-02-07 14:21:15.026907]train examples: 833718|| test examples: 278905


Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,2050,1508211672520,2,0,5,0,24,1
1,0,14287,1508211702520,2,0,5,0,24,1
6,3,4573,1508211359672,2,2,0,0,24,1
7,3,2821,1508211389672,2,2,0,0,24,1
8,4,3848,1508211625466,2,0,3,0,15,0


In [8]:
save_path = '../data/'
train.to_csv(save_path + 'train.csv', index=False)
test.to_csv(save_path + 'test.csv', index=False)
article.to_csv(save_path + 'article.csv', index=False)