In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy.stats import stats
from collections import Counter

In [36]:
# Filename prefixes of the raw dataset splits. A table filename (for example
# 'register.csv') is appended directly to a prefix to form the full CSV path.
one_dataSet_train_path = 'data/one_data_train_'
one_dataSet_test_path = 'data/one_data_test_'
two_dataSet_train_path = 'data/two_data_train_'
two_dataSet_test_path = 'data/two_data_test_'
three_dataSet_train_path = 'data/three_data_train_'

# Output locations of the final feature tables.
train_path = 'data/train_and_test/train.csv'
test_path = 'data/train_and_test/test.csv'

In [4]:
# Filenames of the four raw tables; each one is appended to a dataset
# path prefix when the corresponding CSV is loaded.
register, create, launch, activity = (
    'register.csv',
    'create.csv',
    'launch.csv',
    'activity.csv',
)

In [19]:
# Build the training set, the test set and the features.
# Collect every user id and check whether that id also appears in the test set.
def get_train_label(train_path, test_path):
    """Label every user of the training tables by presence in the test tables.

    Parameters
    ----------
    train_path, test_path : str
        Filename prefixes. The module-level table filenames (``register``,
        ``create``, ``launch``, ``activity``) are appended to each prefix and
        only the ``user_id`` column of every CSV is read.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``user_id`` (sorted unique ids found in any training
        table) and ``label`` (1 if the id occurs in any test table, else 0).
    """
    def _unique_user_ids(prefix):
        # Stack the user_id column of all four tables; np.unique flattens the
        # stacked frame into a sorted 1-D array of distinct ids.
        id_frames = [
            pd.read_csv(prefix + table_name, usecols=['user_id'])
            for table_name in (activity, create, launch, register)
        ]
        return np.unique(pd.concat(id_frames))

    train_data_id = _unique_user_ids(train_path)
    test_data_id = _unique_user_ids(test_path)

    train_data = pd.DataFrame()
    train_data['user_id'] = train_data_id
    # Vectorised membership test; replaces a Python loop that scanned the
    # whole test-id array once per training id.
    train_data['label'] = np.isin(train_data_id, test_data_id).astype('int64')
    return train_data

In [44]:
def get_test(test_path):
    """Return a one-column frame (``user_id``) of the distinct user ids.

    The ``user_id`` column is read from the register, create, launch and
    activity CSVs found under the ``test_path`` prefix; the result holds the
    sorted unique ids across all four tables.
    """
    reg_ids, cre_ids, lau_ids, act_ids = [
        pd.read_csv(test_path + table_name, usecols=['user_id'])
        for table_name in (register, create, launch, activity)
    ]
    stacked_ids = pd.concat([act_ids, cre_ids, lau_ids, reg_ids])
    return pd.DataFrame({'user_id': np.unique(stacked_ids)})

In [82]:
def get_create_feature(row):
    """Summarise the create-table rows of a single user.

    Parameters
    ----------
    row : pandas.DataFrame
        Rows belonging to one user (one group of ``groupby('user_id')``).
        Must contain the columns ``user_id``, ``day`` and ``max_day``.

    Returns
    -------
    pandas.Series
        ``user_id``, the row count, and mean/max/min/std/var of the ``day``
        column, plus ``last_day_cut_max_day`` = first ``max_day`` value minus
        the largest ``day``.

    Notes
    -----
    The statistics are taken over the raw ``day`` values, although the
    feature names say ``day_diff``. The column aggregates are used directly
    instead of ``int(DataFrame.mean())`` and similar: converting a
    one-element Series with ``int()``/``float()`` is deprecated in recent
    pandas.
    """
    day = row['day']
    day_max = int(day.max())
    return pd.Series({
        'user_id': row['user_id'].iloc[0],
        'create_count': int(row['user_id'].count()),
        # int() truncates the mean toward zero, as the original code did.
        'create_day_diff_mean': int(day.mean()),
        'create_day_diff_max': day_max,
        'create_day_diff_min': int(day.min()),
        # std/var are NaN for a user with a single row.
        'create_day_diff_std': float(day.std()),
        'create_day_diff_var': float(day.var()),
        'last_day_cut_max_day': row['max_day'].iloc[0] - day_max,
    })

def get_register_feature(row):
    """Extract the register-table features of a single user.

    Parameters
    ----------
    row : pandas.DataFrame
        Rows belonging to one user (one group of ``groupby('user_id')``).
        Must contain the columns ``user_id``, ``device_type``,
        ``register_type``, ``register_day`` and ``max_day``. Only the first
        row is used.

    Returns
    -------
    pandas.Series
        ``user_id``, ``device_type``, ``register_type`` and
        ``register_day_cut_max_day`` (``max_day`` minus ``register_day``).
    """
    return pd.Series({
        # Fixed: this was previously filled from ``register_type``, which
        # corrupted the key later used to merge the features by user.
        'user_id': row['user_id'].iloc[0],
        'device_type': row['device_type'].iloc[0],
        'register_type': row['register_type'].iloc[0],
        'register_day_cut_max_day': row['max_day'].iloc[0] - row['register_day'].iloc[0],
    })

def get_launch_feature(row):
    """Summarise the launch-table rows of a single user.

    Parameters
    ----------
    row : pandas.DataFrame
        Rows belonging to one user (one group of ``groupby('user_id')``).
        Must contain the columns ``user_id``, ``day`` and ``max_day``.

    Returns
    -------
    pandas.Series
        ``user_id``, the row count, and mean/max/min/std/var of the ``day``
        column, plus ``launch_day_cut_max_day`` = first ``max_day`` value
        minus the largest ``day``.

    Notes
    -----
    The statistics are taken over the raw ``day`` values, although the
    feature names say ``day_diff``. Column aggregates are used directly
    because converting a one-element Series with ``int()``/``float()`` is
    deprecated in recent pandas.
    """
    day = row['day']
    day_max = int(day.max())
    return pd.Series({
        'user_id': row['user_id'].iloc[0],
        'launch_count': int(row['user_id'].count()),
        # int() truncates the mean toward zero, as the original code did.
        'launch_day_diff_mean': int(day.mean()),
        'launch_day_diff_max': day_max,
        'launch_day_diff_min': int(day.min()),
        # std/var are NaN for a user with a single row.
        'launch_day_diff_std': float(day.std()),
        'launch_day_diff_var': float(day.var()),
        'launch_day_cut_max_day': row['max_day'].iloc[0] - day_max,
    })

def get_activity_feature(row):
    """Summarise the activity-table rows of a single user.

    Parameters
    ----------
    row : pandas.DataFrame
        Rows belonging to one user (one group of ``groupby('user_id')``).
        Must contain the columns ``user_id``, ``day`` and ``max_day``.

    Returns
    -------
    pandas.Series
        ``user_id``, the row count, and mean/max/min/std/var/skewness/kurtosis
        of the ``day`` column, plus ``activity_day_cut_max_day`` = first
        ``max_day`` value minus the largest ``day``.

    Notes
    -----
    The statistics are taken over the raw ``day`` values, although the
    feature names say ``day_diff``. Column aggregates are used directly
    because converting a one-element Series with ``int()``/``float()`` is
    deprecated in recent pandas.
    """
    day = row['day']
    day_max = int(day.max())
    return pd.Series({
        'user_id': row['user_id'].iloc[0],
        'activity_count': int(row['user_id'].count()),
        # int() truncates the mean toward zero, as the original code did.
        'activity_day_diff_mean': int(day.mean()),
        'activity_day_diff_max': day_max,
        'activity_day_diff_min': int(day.min()),
        # Missing days are replaced by 0 for the std only (kept from the
        # original). NOTE(review): possibly the intent was to zero-fill the
        # NaN std of single-row users instead - confirm.
        'activity_day_diff_std': float(day.fillna(0).std()),
        'activity_day_diff_var': float(day.var()),
        'activity_day_diff_ske': float(day.skew()),
        # Fixed: the kurtosis feature used to be computed with skew().
        'activity_day_diff_kur': float(day.kurt()),
        'activity_day_cut_max_day': row['max_day'].iloc[0] - day_max,
    })

In [83]:
def deal_feature(path, user_id):
    """Build the per-user feature table for one dataset split.

    Parameters
    ----------
    path : str
        Filename prefix; the module-level table filenames (``register``,
        ``create``, ``launch``, ``activity``) are appended to it.
    user_id : pandas.Series or array-like
        User ids that form the rows of the result.

    Returns
    -------
    pandas.DataFrame
        One ``user_id`` column followed by the create, register, launch and
        activity features, each left-merged on ``user_id`` (users missing
        from a table get NaN for that table's features).
    """
    reg = pd.read_csv(path + register)
    cre = pd.read_csv(path + create)
    lau = pd.read_csv(path + launch)
    act = pd.read_csv(path + activity)

    # Reference day shared by all tables: the largest register_day in the
    # register table. Computed once instead of once per table.
    max_day = np.max(reg['register_day'])

    feature = pd.DataFrame()
    feature['user_id'] = user_id

    # Tables are processed in the original order; the label is only used for
    # the progress message.
    steps = (
        ('create', cre, get_create_feature),
        ('register', reg, get_register_feature),
        ('launch', lau, get_launch_feature),
        ('activity', act, get_activity_feature),
    )
    for table_label, table, extract in steps:
        table['max_day'] = max_day
        per_user = table.groupby('user_id', sort=True).apply(extract)
        # The apply result carries 'user_id' both as its index name and as a
        # column, which pd.merge(on='user_id') treats as ambiguous. Drop the
        # index so that the column is the merge key.
        per_user = pd.DataFrame(per_user).reset_index(drop=True)
        feature = pd.merge(feature, per_user, on='user_id', how='left')
        print(table_label + '表特征提取完毕')
    return feature

In [89]:
def get_data_feature():
    """Run the whole feature pipeline and write the resulting CSV files.

    Steps:
      1. Label the users of the first dataset, extract their features and
         save them to ``data1.csv``.
      2. Load the second dataset's features from ``data_one.csv``, append
         them to the first dataset's features and save the combined training
         table to ``train_path``.
      3. Extract the features of the third dataset and save them to
         ``test_path``.

    ``data_one.csv`` is not produced by this function: it has to exist
    already (the commented-out block below is the code that generates it).
    """
    one_train_data = get_train_label(one_dataSet_train_path, one_dataSet_test_path)
    one_feature = deal_feature(one_dataSet_train_path, one_train_data['user_id'])
    # Attach the label by user_id rather than by positional index alignment,
    # so a row can never receive another user's label even if an upstream
    # merge duplicated or reordered rows.
    one_feature = pd.merge(
        one_feature,
        one_train_data[['user_id', 'label']],
        on='user_id',
        how='left',
    )
    one_feature.to_csv('data1.csv', index=False)
    print('第一组训练数据特征值提取完毕并保存')

    # Disabled generation of the second dataset's features; its previously
    # saved result is loaded from 'data_one.csv' instead.
#     two_train_data = get_train_label(two_dataSet_train_path, two_dataSet_test_path)
#     two_feature = deal_feature(two_dataSet_train_path, two_train_data['user_id'])
#     two_feature['label'] = two_train_data['label']
#     two_feature.to_csv('data_one.csv', index=False)
#     print('第二组训练数据特征值提取完毕')
    two_feature = pd.read_csv('data_one.csv')
    train_feature = pd.concat([one_feature, two_feature])
    train_feature.to_csv(train_path, index=False)
    print('训练数据存储完毕')

    # The third dataset has no following window to label against, so only
    # its features are extracted.
    test_data = get_test(three_dataSet_train_path)
    test_feature = deal_feature(three_dataSet_train_path, test_data['user_id'])
    test_feature.to_csv(test_path, index=False)
    print('测试数据存储完毕')

In [90]:
# Run the full pipeline: writes data1.csv, the combined training CSV
# (train_path) and the test CSV (test_path).
get_data_feature()

create表特征提取完毕
register表特征提取完毕
launch表特征提取完毕
activity表特征提取完毕
第一组训练数据特征值提取完毕并保存
训练数据存储完毕
create表特征提取完毕
register表特征提取完毕
launch表特征提取完毕
activity表特征提取完毕
测试数据存储完毕
