In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import collections
from tqdm import tqdm

In [7]:
category_features = (['user_id', 'weekday', 'hourmin', 'user_active_degree', 'is_video_author',
                      'follow_user_num_range', 'fans_user_num_range', 'friend_user_num_range', 'register_days_range']
                     + [f'onehot_feat{i}' for i in range(18)]
                     + ['video_id', 'author_id', 'upload_type', 'tag'])
continuous_features = ['duration_ms', 'server_width', 'server_height', 'follow_user_num', 'fans_user_num',
                       'friend_user_num']
labels = ['effective_view', 'is_like', 'long_view', 'is_follow', 'is_comment', 'is_forward', 'is_not_hate']
raw_df = pd.read_csv('./data/log_standard_4_08_to_4_21_pure.csv')
print(raw_df.shape)
user_df = pd.read_csv('./data/user_features_pure.csv')
video_df = pd.read_csv('./data/video_features_basic_pure.csv')
raw_df = raw_df.merge(user_df, on=['user_id'])
print(raw_df.shape)
raw_df = raw_df.merge(video_df, on=['video_id'])
print(raw_df.shape)
# 处理时间
raw_df['hour'] = raw_df['hourmin'] // 100
# 加入星期几特征
raw_df['date'] = pd.to_datetime(raw_df['date'], format='%Y%m%d')
raw_df['weekday'] = raw_df['date'].dt.day_name()
# 处理视频时长
raw_df['duration_ms'] = raw_df['duration_ms'] // 100

# 处理tag
raw_df['tag'] = raw_df['tag'].apply(lambda x: int(str(x).split(',')[0]) if str(x).split(',')[0].isdigit() else -1)

# 处理hate标签
raw_df['is_not_hate'] = 1 - raw_df['is_hate']

raw_df['effective_view'] = raw_df['is_click']

(1141112, 19)
(1141112, 49)
(1141112, 60)


In [8]:
history_length_max_per_user = 20
history_length_min_per_user = 5
user_history_id_record = collections.defaultdict(list)
user_history_tag_record = collections.defaultdict(list)
emp_xtr_record = dict((label, collections.defaultdict(list)) for label in labels)
user_item_record = collections.defaultdict(list)
# 使用NumPy数组进行操作
history_id_columns = [f'history_id_{i}' for i in range(1, history_length_max_per_user + 1)]
history_tag_columns = [f'history_tag_{i}' for i in range(1, history_length_max_per_user + 1)]
gen_columns = history_tag_columns + history_id_columns + ['emp_' + label for label in labels] + ['flag']
history_data = np.zeros((raw_df.shape[0], 2*history_length_max_per_user+len(labels)+1), dtype=np.int64)

In [9]:
raw_df = raw_df.sort_values('time_ms', ascending=True).reset_index(drop=True)
for i in tqdm(range(raw_df.shape[0])):
    user_id = raw_df.loc[i, 'user_id']
    item_id = raw_df.loc[i, 'video_id']
    tag_id = raw_df.loc[i, 'tag']
    curr_len = len(user_history_id_record[user_id])
    # 填入用户历史行为
    if curr_len >= history_length_max_per_user:
        history_id = user_history_id_record[user_id]
        history_tag = user_history_tag_record[user_id]
    else:
        history_id = [-1] * (history_length_max_per_user - curr_len) + user_history_id_record[user_id]
        history_tag = [-1] * (history_length_max_per_user - curr_len) + user_history_tag_record[user_id]
    # 填入emp_xtr
    xtr_list = []
    n = len(user_item_record[user_id])
    flag = False
    if n == history_length_max_per_user:
        post_item_id = user_item_record[user_id].pop(0)
        flag = True
    for label in labels:
        if n == 0:
            xtr_list.append(0)
        else:
            # print(emp_xtr_record[label][user_id], n)
            xtr_list.append(len(emp_xtr_record[label][user_id]) / n)
        # 只计算20个item内的emp_xtr
        if flag and emp_xtr_record[label][user_id] and (post_item_id == emp_xtr_record[label][user_id][0]):
            emp_xtr_record[label][user_id].pop(0)
        if raw_df.loc[i, label]:  
            emp_xtr_record[label][user_id].append(item_id)
    # 确定这条样本是否保留，如果小于历史记录最小长度则去掉
    if curr_len >= history_length_min_per_user:
        history = np.concatenate([history_tag, history_id, xtr_list, [True]])
    else:
        history = np.concatenate([history_tag, history_id, xtr_list, [False]])
    # 使用NumPy数组进行赋值
    # print(history)
    history_data[i] = history
    if raw_df.loc[i, 'effective_view']:
        user_history_id_record[user_id].append(item_id)
        user_history_tag_record[user_id].append(tag_id)
        curr_len += 1
        if curr_len >= history_length_max_per_user:
            user_history_id_record[user_id].pop(0)
            user_history_tag_record[user_id].pop(0)
        
    user_item_record[user_id].append(item_id)
raw_df[gen_columns] = history_data

100%|██████████| 1141112/1141112 [07:57<00:00, 2387.47it/s]


In [12]:
print(raw_df.columns)
full_df = raw_df[raw_df['flag']==1].reset_index(drop=True).copy()
full_df.head()

Index(['user_id', 'video_id', 'date', 'hourmin', 'time_ms', 'is_click',
       'is_like', 'is_follow', 'is_comment', 'is_forward',
       ...
       'history_id_19', 'history_id_20', 'emp_effective_view', 'emp_is_like',
       'emp_long_view', 'emp_is_follow', 'emp_is_comment', 'emp_is_forward',
       'emp_is_not_hate', 'flag'],
      dtype='object', length=112)


Unnamed: 0,user_id,video_id,date,hourmin,time_ms,is_click,is_like,is_follow,is_comment,is_forward,...,history_id_19,history_id_20,emp_effective_view,emp_is_like,emp_long_view,emp_is_follow,emp_is_comment,emp_is_forward,emp_is_not_hate,flag
0,206,6580,2022-04-09,1500,1649489630616,0,0,0,0,0,...,2309,2635,0,0,0,0,0,0,1,1
1,15207,6975,2022-04-09,1600,1649490958921,0,0,0,0,0,...,4763,4763,0,0,0,0,0,0,1,1
2,23901,53,2022-04-09,1600,1649491325278,1,0,0,0,0,...,6975,3310,0,0,0,0,0,0,1,1
3,1012,3310,2022-04-09,1600,1649491826829,0,0,0,0,0,...,655,6532,0,0,0,0,0,0,1,1
4,21600,7184,2022-04-09,1600,1649492300922,0,1,0,0,0,...,3876,3706,1,0,0,0,0,0,1,1


In [22]:
full_df = full_df[gen_columns]
full_df[history_tag_columns]

Unnamed: 0,history_tag_1,history_tag_2,history_tag_3,history_tag_4,history_tag_5,history_tag_6,history_tag_7,history_tag_8,history_tag_9,history_tag_10,history_tag_11,history_tag_12,history_tag_13,history_tag_14,history_tag_15,history_tag_16,history_tag_17,history_tag_18,history_tag_19,history_tag_20
0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,3,3,9,20,9
1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,39,39,1,1,1
2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,20,9,17,12,17
3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,12,20,20,39,6
4,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,39,39,7,7,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862517,-1,9,36,39,9,14,39,4,9,20,17,17,9,39,3,12,8,11,29,12
862518,-1,17,6,17,9,42,39,7,39,28,6,9,39,39,15,17,39,1,9,9
862519,-1,28,62,39,8,39,39,11,39,-1,4,6,5,17,6,12,39,2,12,9
862520,-1,5,39,39,39,39,39,39,39,39,39,12,39,39,29,6,11,39,17,10
