In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy.stats import stats
from collections import Counter

In [2]:
# Path prefixes for the data splits. A table file name (e.g. 'register.csv')
# is appended directly to a prefix to form the CSV path that gets read.
# Splits 1 and 2: get_data_feature() extracts features from the "train" prefix
# and derives labels from the user ids found under the matching "test" prefix.
one_dataSet_train_path = "data/1_data_train_"
one_dataSet_test_path = 'data/1_data_test_'
two_dataSet_train_path = 'data/2_data_train_'
two_dataSet_test_path = 'data/2_data_test_'
# Split 3 has no companion "test" prefix; get_data_feature() uses it to build
# the unlabeled feature set written to test_plus.csv.
three_dataSet_train_path = 'data/3_data_train_'
# NOTE(review): these two are not referenced by any code visible in this
# notebook (the parameters named train_path/test_path in get_train_label
# shadow them) - presumably consumed by a later modelling step; confirm.
train_path = 'data/train_and_test/train.csv'
test_path = 'data/train_and_test/test.csv'

In [3]:
# File names of the four log tables. Each is concatenated onto a split prefix
# (e.g. one_dataSet_train_path + register) to locate the CSV for that split.
register = 'register.csv'
create = 'create.csv'
launch = 'launch.csv'
activity = 'activity.csv'

In [4]:
# Build the labelled training ids.
# Collect every registered id and check whether it shows up in the label window.
def get_train_label(train_path, test_path):
    """Label every user registered in the training window.

    Parameters
    ----------
    train_path : str
        Path prefix of the training window; ``register`` is appended to it.
    test_path : str
        Path prefix of the label window; ``create``, ``launch`` and
        ``activity`` are appended to it.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` (sorted unique ids from the training register
        table) and ``label`` (1 if the id occurs in any of the three
        label-window tables, else 0).
    """
    train_reg = pd.read_csv(train_path + register, usecols=['user_id'])
    train_data_id = np.unique(train_reg['user_id'])

    test_cre = pd.read_csv(test_path + create, usecols=['user_id'])
    test_lau = pd.read_csv(test_path + launch, usecols=['user_id'])
    test_act = pd.read_csv(test_path + activity, usecols=['user_id'])
    test_data_id = np.unique(pd.concat([test_act, test_cre, test_lau])['user_id'])

    train_data = pd.DataFrame()
    train_data['user_id'] = train_data_id
    # Vectorised membership test; the previous per-id `in` check scanned the
    # whole label-window id array once for every training id.
    train_data['label'] = np.isin(train_data_id, test_data_id).astype(np.int64)
    return train_data

In [5]:
def get_test(test_path):
    """Return the sorted unique user ids registered in the given split.

    Parameters
    ----------
    test_path : str
        Path prefix of the split; ``register`` is appended to it.

    Returns
    -------
    pandas.DataFrame
        Single column ``user_id``.
    """
    # Only the register table is needed; the create/launch/activity tables
    # were previously loaded here but never used.
    test_reg = pd.read_csv(test_path + register, usecols=['user_id'])
    test_data = pd.DataFrame()
    test_data['user_id'] = np.unique(test_reg)
    return test_data

In [106]:
def get_create_feature(row):
    """Summarise one user's rows of the create table as a flat feature vector.

    Parameters
    ----------
    row : pandas.DataFrame
        All create-table rows of a single user. Must contain the columns
        ``user_id``, ``day``, ``register_day`` and ``max_day``; the first
        value of ``user_id``, ``register_day`` and ``max_day`` is used.

    Returns
    -------
    pandas.Series
        ``user_id`` followed by the ``create_*`` features, in a fixed order.
        The ``create_day_diff_*`` statistics describe the gaps between
        consecutive rows' ``day`` values and are NaN when the user has a
        single row (there is no gap to measure).
    """
    days = row['day']
    max_day = row['max_day'].iloc[0]
    register_day = row['register_day'].iloc[0]

    # Days between registration and max_day (inclusive), capped at 15.
    diff = max_day - register_day + 1
    if diff >= 15:
        diff = 15

    count = int(row['user_id'].count())
    feature = {
        'user_id': row['user_id'].iloc[0],
        'create_count_recent_days': count,
    }
    # "last N days" means day >= max_day - (N - 1).
    for window in (3, 5, 7, 9):
        feature['create_count_in_last_%d_day' % window] = int((days >= max_day - (window - 1)).sum())

    feature['create_day_mean'] = float(count / (diff + 1))
    feature['create_day_max'] = int(days.max())
    feature['create_day_min'] = int(days.min())
    feature['create_day_max_sub_min'] = feature['create_day_max'] - feature['create_day_min']
    feature['create_day_std'] = float(days.std())
    feature['create_day_var'] = float(days.var())
    feature['create_day_skew'] = float(days.skew())
    feature['create_day_kur'] = float(days.kurt())
    feature['last_day_cut_max_day'] = max_day - feature['create_day_max']
    feature['first_day_cut_register_day'] = feature['create_day_min'] - register_day
    feature['max_create_times_one_day'] = int(row.groupby('day')['user_id'].count().max())

    # Gaps between consecutive rows, in file order. NumPy reductions are kept
    # (population std/var) so values match the previous implementation.
    gaps = np.diff(days.values)
    if gaps.size:
        feature['create_day_diff_mean'] = float(gaps.mean())
        feature['create_day_diff_max'] = int(gaps.max())
        feature['create_day_diff_min'] = int(gaps.min())
        feature['create_day_diff_std'] = float(gaps.std())
        feature['create_day_diff_var'] = float(gaps.var())
    else:
        # A single row has no gaps; max()/min() of an empty array would raise.
        for stat in ('mean', 'max', 'min', 'std', 'var'):
            feature['create_day_diff_' + stat] = np.nan
    return pd.Series(feature)

def get_launch_feature(row):
    """Summarise one user's rows of the launch table as a flat feature vector.

    Parameters
    ----------
    row : pandas.DataFrame
        All launch-table rows of a single user. Must contain the columns
        ``user_id``, ``day``, ``register_day`` and ``max_day``; the first
        value of ``user_id``, ``register_day`` and ``max_day`` is used.

    Returns
    -------
    pandas.Series
        ``user_id`` followed by the ``launch_*`` features and
        ``reg_now_diff``, in a fixed order. The ``launch_day_diff_*``
        statistics describe the gaps between consecutive rows' ``day`` values
        and are NaN when the user has a single row.
    """
    days = row['day']
    max_day = row['max_day'].iloc[0]
    register_day = row['register_day'].iloc[0]

    count = int(row['user_id'].count())
    feature = {
        'user_id': row['user_id'].iloc[0],
        'launch_count_recent_days': count,
    }

    # Days between registration and max_day (inclusive). The raw value is
    # exported as a feature; the copy used as a denominator is capped at 15.
    diff = max_day - register_day + 1
    feature['reg_now_diff'] = diff
    if diff >= 15:
        diff = 15

    # "last N days" means day >= max_day - (N - 1).
    for window in (3, 5, 7, 9):
        feature['launch_count_in_last_%d_day' % window] = int((days >= max_day - (window - 1)).sum())

    feature['launch_day_mean'] = float(count / (diff + 1))
    feature['launch_day_max'] = int(days.max())
    feature['launch_day_min'] = int(days.min())
    feature['launch_day_max_sub_min'] = feature['launch_day_max'] - feature['launch_day_min']
    feature['launch_day_std'] = float(days.std())
    feature['launch_day_var'] = float(days.var())
    feature['launch_day_kur'] = float(days.kurt())
    feature['launch_day_skew'] = float(days.skew())
    feature['last_launch_day_cut_max_day'] = max_day - feature['launch_day_max']
    feature['first_launch_day_cut_register_day'] = feature['launch_day_min'] - register_day
    # NOTE(review): subtracts a day index from a per-day rate; kept unchanged
    # so the exported feature keeps its previous values.
    feature['launch_mean_cut_max_day'] = feature['launch_day_mean'] - max_day
    feature['max_launch_times_one_day'] = int(row.groupby('day')['user_id'].count().max())

    # Gaps between consecutive rows, in file order. NumPy reductions are kept
    # (population std/var) so values match the previous implementation.
    gaps = np.diff(days.values)
    if gaps.size:
        feature['launch_day_diff_mean'] = float(gaps.mean())
        feature['launch_day_diff_max'] = int(gaps.max())
        feature['launch_day_diff_min'] = int(gaps.min())
        feature['launch_day_diff_std'] = float(gaps.std())
        feature['launch_day_diff_var'] = float(gaps.var())
    else:
        # A single row has no gaps; max()/min() of an empty array would raise.
        for stat in ('mean', 'max', 'min', 'std', 'var'):
            feature['launch_day_diff_' + stat] = np.nan
    return pd.Series(feature)

def _activity_category_counts(feature, values, label, categories, recent, total):
    """Add per-category counts (overall, last 3/5/7 days) and smoothed shares to ``feature``."""
    hits = {code: values == code for code in categories}
    for code in categories:
        feature['%d_%s_count' % (code, label)] = int(hits[code].sum())
    for window in (3, 5, 7):
        for code in categories:
            key = '%d_%s_count_in_%d_days' % (code, label, window)
            feature[key] = int((hits[code] & recent[window]).sum())
    for code in categories:
        # +1 smoothing on numerator and denominator.
        share = (feature['%d_%s_count' % (code, label)] + 1) / (total + 1)
        feature['%d_%s_count_div_sum' % (code, label)] = float(share)


def get_activity_feature(row):
    """Summarise one user's rows of the activity table as a flat feature vector.

    Parameters
    ----------
    row : pandas.DataFrame
        All activity-table rows of a single user. Must contain the columns
        ``user_id``, ``day``, ``register_day``, ``max_day``, ``page``,
        ``action_type``, ``vid_hotness``, ``auth_hotness``,
        ``video_norm_hot`` and ``author_norm_hot``.

    Returns
    -------
    pandas.Series
        ``user_id`` followed by the activity features, in a fixed order.
        The ``activity_day_diff_*`` statistics describe the gaps between
        consecutive rows' ``day`` values and are NaN when the user has a
        single row.

    Notes
    -----
    Differences from the previous implementation:

    * ``activity_day_kur`` is the kurtosis (it used to repeat the skewness).
    * ``activity_day_diff_ske`` / ``activity_day_diff_kur`` are computed with
      pandas; calling ``.skew()`` on the NumPy gap array always raised.
    * ``0_action_count_div_sum`` uses the same +1 smoothing as every other
      ``*_count_div_sum`` feature.
    """
    days = row['day']
    max_day = row['max_day'].iloc[0]
    register_day = row['register_day'].iloc[0]

    # Days between registration and max_day (inclusive), capped at 15.
    diff = max_day - register_day + 1
    if diff >= 15:
        diff = 15

    # Boolean masks: "last N days" means day >= max_day - (N - 1).
    recent = {window: days >= max_day - (window - 1) for window in (3, 5, 7, 9)}

    feature = {'user_id': row['user_id'].iloc[0]}
    for window in (3, 5, 7, 9):
        feature['act_count_in_last_%d_day' % window] = int(recent[window].sum())

    total = int(row['user_id'].count())
    feature['activity_count'] = total
    feature['activity_day_mean'] = float(total / (diff + 1))
    feature['activity_day_max'] = int(days.max())
    feature['activity_day_min'] = int(days.min())
    feature['activity_day_std'] = float(days.std())
    feature['activity_day_var'] = float(days.var())
    feature['activity_day_ske'] = float(days.skew())
    feature['activity_day_kur'] = float(days.kurt())
    feature['activity_day_max_cut_min'] = feature['activity_day_max'] - feature['activity_day_min']
    feature['activity_day_cut_max_day'] = max_day - feature['activity_day_max']
    feature['activity_day_cut_register_day'] = feature['activity_day_min'] - register_day
    # NOTE(review): despite the name this subtracts max_day, not register_day;
    # kept unchanged because the intended definition is unclear.
    feature['act_sub_register'] = feature['activity_day_min'] - max_day

    # Page codes 0-4 and action-type codes 0-5.
    _activity_category_counts(feature, row['page'], 'page', range(5), recent, total)
    _activity_category_counts(feature, row['action_type'], 'action', range(6), recent, total)

    feature['max_act_times_per_day'] = int(row.groupby('day')['user_id'].count().max())

    # Hotness statistics, raw and normalised, emitted in the established
    # column order. 'kur' is the label used for kurtosis.
    hot_stats = (('max', 'vid'), ('max', 'auth'), ('mean', 'vid'), ('mean', 'auth'),
                 ('std', 'vid'), ('kur', 'vid'), ('min', 'vid'), ('min', 'auth'),
                 ('sum', 'vid'), ('sum', 'auth'), ('std', 'auth'), ('kur', 'auth'))
    hot_sources = (
        ('', {'vid': 'vid_hotness', 'auth': 'auth_hotness'}),
        ('norm_', {'vid': 'video_norm_hot', 'auth': 'author_norm_hot'}),
    )
    for prefix, columns in hot_sources:
        for stat, who in hot_stats:
            reducer = 'kurt' if stat == 'kur' else stat
            value = getattr(row[columns[who]], reducer)()
            feature['%s%s_hot_%s' % (prefix, stat, who)] = float(value)

    # Gaps between consecutive rows, in file order. mean/max/min/std/var keep
    # the NumPy reductions (population std/var); skew/kurt need pandas.
    gaps = np.diff(days.values)
    if gaps.size:
        gap_series = pd.Series(gaps)
        feature['activity_day_diff_mean'] = float(gaps.mean())
        feature['activity_day_diff_max'] = int(gaps.max())
        feature['activity_day_diff_min'] = int(gaps.min())
        feature['activity_day_diff_std'] = float(gaps.std())
        feature['activity_day_diff_var'] = float(gaps.var())
        feature['activity_day_diff_ske'] = float(gap_series.skew())
        feature['activity_day_diff_kur'] = float(gap_series.kurt())
    else:
        # A single row has no gaps; max()/min() of an empty array would raise.
        for stat in ('mean', 'max', 'min', 'std', 'var', 'ske', 'kur'):
            feature['activity_day_diff_' + stat] = np.nan
    return pd.Series(feature)

In [110]:
def _features_per_user(events, extractor):
    """Run ``extractor`` on each user's rows and stack the results, one row per user."""
    # Iterating the groups (instead of groupby.apply) keeps 'user_id' as an
    # ordinary column only, so the later merge on 'user_id' is unambiguous.
    per_user = [extractor(group) for _, group in events.groupby('user_id', sort=True)]
    return pd.DataFrame(per_user)


def deal_feature(path, user_id):
    """Build the feature table for one data split.

    Reads the register, create, launch and activity CSVs found under ``path``
    and left-joins the per-user features of each table onto ``user_id``.

    Parameters
    ----------
    path : str
        Path prefix of the split; the table file names are appended to it.
    user_id : sequence or pandas.Series
        User ids to build features for; becomes the ``user_id`` column.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``user_id``. Users without rows in a table get
        NaN for that table's features.

    Notes
    -----
    ``max_day`` is the largest ``register_day`` in the split and is used as
    the reference day for all "last N days" features.
    """
    reg = pd.read_csv(path + register)
    cre = pd.read_csv(path + create)
    lau = pd.read_csv(path + launch)
    act = pd.read_csv(path + activity)

    feature = pd.DataFrame()
    feature['user_id'] = user_id

    reg_day = reg[['user_id', 'register_day']]
    max_day = np.max(reg['register_day'])

    cre = pd.merge(cre, reg_day, on='user_id', how='left')
    cre['max_day'] = max_day
    cre_feature = _features_per_user(cre, get_create_feature)
    feature = pd.merge(feature, cre_feature, on='user_id', how='left')
    print('create表特征提取完毕')

    # The whole register table (plus a constant max_day column) is joined in.
    # A per-user register extractor used to be called here, but it is not
    # defined anywhere in this notebook and its result was never used.
    reg['max_day'] = max_day
    print(max_day)
    feature = pd.merge(feature, reg, on='user_id', how='left')
    feature['register_day_cut_max_day'] = (max_day - feature.register_day)
    print('register表特征提取完毕')

    lau = pd.merge(lau, reg_day, on='user_id', how='left')
    lau['max_day'] = max_day
    lau_feature = _features_per_user(lau, get_launch_feature)
    feature = pd.merge(feature, lau_feature, on='user_id', how='left')
    print('launch表特征提取完毕')

    act = pd.merge(act, reg_day, on='user_id', how='left')
    act['max_day'] = max_day
    act_feature = _features_per_user(act, get_activity_feature)
    feature = pd.merge(feature, act_feature, on='user_id', how='left')
    print('activity表特征提取完毕')

    # The author watch-count features (watched_times_recent_days and its
    # 3/5/7-day variants: rows of `act` counted per author_id and joined with
    # author_id treated as user_id) are currently disabled. Their window
    # differences are therefore derived only when those columns exist, rather
    # than failing with a KeyError.
    watched_columns = ['watched_times_recent_%d_days' % window for window in (3, 5, 7)]
    if set(watched_columns).issubset(feature.columns):
        feature['watched_times_recent_5_3_days'] = feature['watched_times_recent_5_days'] - feature['watched_times_recent_3_days']
        feature['watched_times_recent_7_5_days'] = feature['watched_times_recent_7_days'] - feature['watched_times_recent_5_days']

    # Event counts inside the day bands between consecutive windows.
    feature['create_count_in_last_3_5_day'] = feature['create_count_in_last_5_day'] - feature['create_count_in_last_3_day']
    feature['create_count_in_last_5_7_day'] = feature['create_count_in_last_7_day'] - feature['create_count_in_last_5_day']
    feature['create_count_in_last_7_9_day'] = feature['create_count_in_last_9_day'] - feature['create_count_in_last_7_day']
    feature['launch_count_in_3_5_day'] = feature['launch_count_in_last_5_day'] - feature['launch_count_in_last_3_day']
    feature['launch_count_in_5_7_day'] = feature['launch_count_in_last_7_day'] - feature['launch_count_in_last_5_day']
    feature['launch_count_in_7_9_day'] = feature['launch_count_in_last_9_day'] - feature['launch_count_in_last_7_day']
    feature['act_count_in_last_3_5_day'] = feature['act_count_in_last_5_day'] - feature['act_count_in_last_3_day']
    feature['act_count_in_last_5_7_day'] = feature['act_count_in_last_7_day'] - feature['act_count_in_last_5_day']
    feature['act_count_in_last_7_9_day'] = feature['act_count_in_last_9_day'] - feature['act_count_in_last_7_day']

    return feature

In [111]:
def get_data_feature():
    """Extract and save the feature tables for both training splits and the test split.

    Writes ``data_one_plus.csv`` and ``data_two_plus.csv`` (one per training
    split), their concatenation ``data/train_and_test/train_plus.csv``, and
    ``data/train_and_test/test_plus.csv`` built from the third split.
    The label column from ``get_train_label`` is not attached to the saved
    training features; only its ``user_id`` column is used.
    """
    training_splits = (
        (one_dataSet_train_path, one_dataSet_test_path,
         'data_one_plus.csv', '第一组训练数据特征值提取完毕并保存'),
        (two_dataSet_train_path, two_dataSet_test_path,
         'data_two_plus.csv', '第二组训练数据特征值提取完毕'),
    )

    split_features = []
    for feature_prefix, label_prefix, out_csv, done_message in training_splits:
        labelled_ids = get_train_label(feature_prefix, label_prefix)
        extracted = deal_feature(feature_prefix, labelled_ids['user_id'])
        extracted.to_csv(out_csv, index=False)
        print(done_message)
        split_features.append(extracted)

    # Stack both training splits into the combined training file.
    pd.concat(split_features).to_csv('data/train_and_test/train_plus.csv', index = False)
    print('训练数据存储完毕')

    # The third split is the prediction set: ids come from its register table.
    prediction_ids = get_test(three_dataSet_train_path)
    prediction_features = deal_feature(three_dataSet_train_path, prediction_ids['user_id'])
    prediction_features.to_csv('data/train_and_test/test_plus.csv', index=False)
    print('测试数据存储完毕')
In [112]:
# Run the full extraction; writes the per-split, combined training and test
# feature CSVs and prints one progress line per table and per split.
get_data_feature()

create表特征提取完毕
16
register表特征提取完毕
launch表特征提取完毕
activity表特征提取完毕
第一组训练数据特征值提取完毕并保存
create表特征提取完毕
23
register表特征提取完毕
launch表特征提取完毕
activity表特征提取完毕
第二组训练数据特征值提取完毕
训练数据存储完毕
create表特征提取完毕
30
register表特征提取完毕
launch表特征提取完毕
activity表特征提取完毕
测试数据存储完毕
