In [93]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy.stats import stats
from collections import Counter

In [94]:
one_dataSet_train_path = "data/one_data_train_"
one_dataSet_test_path = 'data/one_data_test_'
two_dataSet_train_path = 'data/two_data_train_'
two_dataSet_test_path = 'data/two_data_test_'
three_dataSet_train_path = 'data/three_data_train_'

train_path = 'data/train_and_test/train.csv'
test_path = 'data/train_and_test/test.csv'

In [95]:
register = 'register.csv'
create = 'create.csv'
launch = 'launch.csv'
activity = 'activity.csv'

In [96]:
#构建训练集与测试集与特征
#获取所有id,查看对应id是否在测试集中出现过
def get_train_label(train_path, test_path):
    train_reg = pd.read_csv(train_path + register, usecols=['user_id'])
    train_cre = pd.read_csv(train_path + create, usecols=['user_id'])
    train_lau = pd.read_csv(train_path + launch, usecols=['user_id'])
    train_act = pd.read_csv(train_path + activity, usecols=['user_id'])
    train_data_id = np.unique(pd.concat([train_act, train_cre, train_lau, train_reg]))
    
    test_reg = pd.read_csv(test_path + register, usecols=['user_id'])
    test_cre = pd.read_csv(test_path + create, usecols=['user_id'])
    test_lau = pd.read_csv(test_path + launch, usecols=['user_id'])
    test_act = pd.read_csv(test_path + activity, usecols=['user_id'])
    test_data_id = np.unique(pd.concat([test_act, test_cre, test_lau, test_reg]))
    
    train_label = []
    for i in train_data_id:
        if i in test_data_id:
            train_label.append(1)
        else:
            train_label.append(0)
    train_data = pd.DataFrame()
    train_data['user_id'] = train_data_id
    train_data['label'] = train_label
    return train_data

In [97]:
def get_test(test_path):
    test_reg = pd.read_csv(test_path + register, usecols=['user_id'])
    test_cre = pd.read_csv(test_path + create, usecols=['user_id'])
    test_lau = pd.read_csv(test_path + launch, usecols=['user_id'])
    test_act = pd.read_csv(test_path + activity, usecols=['user_id'])
    test_data_id = np.unique(pd.concat([test_act, test_cre, test_lau, test_reg]))
    test_data = pd.DataFrame()
    test_data['user_id'] = test_data_id
    return test_data

In [98]:
def get_create_feature(row):
    feature = pd.Series()
    feature['user_id'] = list(row['user_id'])[0]
    feature['create_count'] = int (pd.DataFrame(row['user_id']).count())
    feature['create_day_mean'] = float (pd.DataFrame(row['day']).mean())
    feature['create_day_max'] = int(pd.DataFrame(row['day']).max())
    feature['create_day_min'] = int (pd.DataFrame(row['day']).min())
    feature['create_day_std'] = float (pd.DataFrame(row['day']).std())
    feature['create_day_var'] = float (pd.DataFrame(row['day']).var())
    feature['last_day_cut_max_day'] = feature['create_day_max'] - list(row['max_day'])[0]
    feature['create_sub_register'] = feature['create_day_min'] - list(row['max_day'])[0]
    feature['max_create_times_per_day'] = int (row.groupby('day').count()['user_id'].max())
    if feature['create_count'] != 1:
        tmp = pd.DataFrame()
        for i in range(1,int (row['user_id'].count())):
            tmp.append(row['day'][i] - row['day'][i-1])  
        tmp = tmp[0]
        feature['create_day_diff_mean'] = float (tmp.)
        feature['create_day_diff_max'] = int (tmp[0].max())
        feature['create_day_diff_min'] = int (tmp[0].min())
        feature['create_day_diff_std'] = float (tmp.std())
        feature['create_day_diff_var'] = float (tmp.var())
    else:
        feature['create_day_diff_mean'] = None
        feature['create_day_diff_max'] =None
        feature['create_day_diff_min'] = None
        feature['create_day_diff_std'] = None
        feature['create_day_diff_var'] = None
    return feature   

def get_register_feature(row):
    feature = pd.Series()
    feature['user_id'] = list(row['register_type'])[0]
    feature['device_type'] = list(row['device_type'])[0]
    feature['register_type'] = list(row['register_type'])[0]
    feature['register_day_cut_max_day'] = list(row['max_day'])[0] - list(row['register_day'])[0]
    return feature    

def get_launch_feature(row):
    feature = pd.Series()
    feature['user_id'] = list(row['user_id'])[0]
    feature['launch_count'] = int (pd.DataFrame(row['user_id']).count())
    feature['launch_day_mean'] = int (pd.DataFrame(row['day']).mean())
    feature['launch_day_max'] = int (pd.DataFrame(row['day']).max())
    feature['launch_day_min'] = int (pd.DataFrame(row['day']).min())
    feature['launch_day_std'] = float (pd.DataFrame(row['day']).std())
    feature['launch_day_var'] = float (pd.DataFrame(row['day']).var())
    feature['launch_day_cut_max_day'] = feature['launch_day_max'] - list(row['max_day'])[0]
    feature['create_sub_register'] = feature['launch_day_min'] - list(row['max_day'])[0]
    feature['max_launch_times_per_day'] = int (row.groupby('day').count()['user_id'].max())
    if feature['create_count'] != 1:
        tmp = pd.DataFrame()
        for i in range(1,int (row['user_id'].count())):
            tmp.append(row['day'][i] - row['day'][i-1])  
        tmp = tmp[0]
        feature['launch_day_diff_mean'] = float (tmp.mean())
        feature['launch_day_diff_max']  = int (tmp.max())
        feature['launch_day_diff_min']  = int (tmp.min())
        feature['launch_day_diff_std'] = float (tmp.std())
        feature['launch_day_diff_var'] = float (tmp.var())
    else:
        feature['launch_day_diff_mean'] = None
        feature['launch_day_diff_max']  = int (tmp.max())
        feature['launch_day_diff_min']  = int (tmp.min())
        feature['launch_day_diff_std'] = float (tmp.std())
        feature['launch_day_diff_var'] = float (tmp.var())
    return feature   

def get_activity_feature(row):
    feature = pd.Series()
    feature['user_id'] = list(row['user_id'])[0]
    feature['activity_count'] = int (row['user_id'].count())
    feature['activity_day_mean'] = int (row['day'].mean())
    feature['activity_day_max'] = int (row['day'].max())
    feature['activity_day_min'] = int (row['day'].min())
    feature['activity_day_std'] = float (row['day'].std())
    feature['activity_day_var'] = float (row['day'].var())
    feature['activity_day_ske'] = float (row['day'].skew())
    feature['activity_day_kur'] = float (row['day'].skew())
    feature['activity_day_cut_max_day'] = feature['activity_day_max'] - list(row['max_day'])[0]
    feature['activity_day_cut_max_day'] = feature['activity_day_min'] - list(row['max_day'])[0]
    feature['create_sub_register'] = feature['launch_day_min'] - list(row['max_day'])[0]
    feature['0_page_count'] = int (row.page[row.page==0].count())
    feature['1_page_count'] = int (row.page[row.page==1].count())
    feature['2_page_count'] = int (row.page[row.page==2].count())
    feature['3_page_count'] = int (row.page[row.page==3].count())
    feature['4_page_count'] = int (row.page[row.page==4].count())
    feature['0_page_count_div_sum'] = feature['0_page_count']/feature['activity_count']
    feature['1_page_count_div_sum'] = feature['1_page_count']/feature['activity_count']
    feature['2_page_count_div_sum'] = feature['2_page_count']/feature['activity_count'] 
    feature['3_page_count_div_sum'] = feature['3_page_count']/feature['activity_count']
    feature['4_page_count_div_sum'] = feature['4_page_count']/feature['activity_count']
    feature['0_action_count'] = int (row.action_type[row.action_type==0].count())
    feature['1_action_count'] = int (row.action_type[row.action_type==1].count())
    feature['2_action_count'] = int (row.action_type[row.action_type==2].count())
    feature['3_action_count'] = int (row.action_type[row.action_type==3].count())
    feature['4_action_count'] = int (row.action_type[row.action_type==4].count())
    feature['5_action_count'] = int (row.action_type[row.action_type==5].count())
    feature['0_action_count_div_sum'] = feature['0_action_count']/feature['activity_count']
    feature['1_action_count_div_sum'] = feature['1_action_count']/feature['activity_count']
    feature['2_action_count_div_sum'] = feature['2_action_count']/feature['activity_count']
    feature['3_action_count_div_sum'] = feature['3_action_count']/feature['activity_count']
    feature['4_action_count_div_sum'] = feature['4_action_count']/feature['activity_count']
    feature['5_action_count_div_sum'] = feature['5_action_count']/feature['activity_count']
    feature['max_act_times_per_day'] = int (row.groupby('day').count()['user_id'].max())
    a = []
    for i in range(1,int (row['user_id'].count())):
        a.append(int(row['day'][i]) - int (row['day'][i-1]))
    tmp = pd.DataFrame(a)

    feature['activity_day_diff_mean'] = float (tmp.mean())
    feature['activity_day_diff_max']  = int (tmp.max())
    feature['activity_day_diff_min']  = int (tmp.min())
    feature['activity_day_diff_std'] = float (tmp.std())
    feature['activity_day_diff_var'] = float (tmp.var())
    feature['activity_day_diff_ske'] = float (tmp.skew())
    feature['activity_day_diff_kur'] = float (tmp.kurt())
    if feature['user_id'] in row.author_id:
        feature['watch_self'] = int (1)
    else:
        feature['watch_self'] = int (0)
    
    return feature   

In [99]:
def deal_feature(path, user_id):
    reg = pd.read_csv(path + register)
    cre = pd.read_csv(path + create)
    lau = pd.read_csv(path + launch)
    act = pd.read_csv(path + activity)
    feature = pd.DataFrame()
    feature['user_id'] = user_id
    
    cre['max_day'] = np.max(reg['register_day'])
    cre_feature = cre.groupby('user_id', sort = True).apply(get_create_feature)
    feature = pd.merge(feature, pd.DataFrame(cre_feature), on='user_id', how='left')
    print('create表特征提取完毕')
    
    reg['max_day'] = np.max(reg['register_day'])
    reg_feature = reg.groupby('user_id', sort = True).apply(get_register_feature)
    feature = pd.merge(feature, pd.DataFrame(reg_feature), on='user_id', how='left')
    print('register表特征提取完毕')
    
    lau['max_day'] = np.max(reg['register_day'])
    lau_feature = lau.groupby('user_id', sort = True).apply(get_launch_feature)
    feature = pd.merge(feature, pd.DataFrame(lau_feature), on='user_id', how='left')
    print('launch表特征提取完毕')
    
    act['max_day'] = np.max(reg['register_day'])
    act_feature = act.groupby('user_id', sort = True).apply(get_activity_feature)
    feature = pd.merge(feature, pd.DataFrame(act_feature), on='user_id', how='left')
    print('activity表特征提取完毕')
    return feature

In [100]:
def get_data_feature():
    one_train_data = get_train_label(one_dataSet_train_path, one_dataSet_test_path)
    one_feature = deal_feature(one_dataSet_train_path, one_train_data['user_id'])
    one_feature['label'] = one_train_data['label']
    one_feature.to_csv('data_one.csv', index=False)
    print('第一组训练数据特征值提取完毕并保存')
    
#     two_train_data = get_train_label(two_dataSet_train_path, two_dataSet_test_path)
#     two_feature = deal_feature(two_dataSet_train_path, two_train_data['user_id'])
#     two_feature['label'] = two_train_data['label']
#     two_feature.to_csv('data_one.csv', index=False)
#     print('第二组训练数据特征值提取完毕')
    
#     two_feature = pd.read_csv('data_two.csv')
#     train_feature = pd.concat([one_feature, two_feature])
#     train_feature.to_csv(train_path, index = False)
#     print('训练数据存储完毕')
    
#     test_data = get_test(three_dataSet_train_path)
#     test_feature = deal_feature(three_dataSet_train_path, test_data['user_id'])
#     test_feature.to_csv(test_path, index=False)
#     print('测试数据存储完毕')

In [101]:
get_data_feature()

1.0
-----------------------------
Empty DataFrame
Columns: []
Index: []
1.0
-----------------------------
Empty DataFrame
Columns: []
Index: []
4.0
-----------------------------
1.0
-----------------------------
Empty DataFrame
Columns: []
Index: []
4.0
-----------------------------


KeyError: 1