In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
plt.rc('font', family='SimHei', size=13)
%matplotlib inline

import seaborn as sns
sns.set()

import plotly.graph_objs as go
import plotly.plotly as py
import plotly.offline as of
of.offline.init_notebook_mode(connected=True)

In [2]:
app_launch_log = pd.read_csv(
    '../data/B/app_launch_log.txt', delimiter='\t', header=None)
user_register_log = pd.read_csv(
    '../data/B/user_register_log.txt', delimiter='\t', header=None)
video_create_log = pd.read_csv(
    '../data/B/video_create_log.txt', delimiter='\t', header=None)
user_activity_log = pd.read_csv(
    '../data/B/user_activity_log.txt', delimiter='\t', header=None)

print(app_launch_log.shape, user_register_log.shape, video_create_log.shape,
      user_activity_log.shape)

(252496, 2) (51480, 4) (35849, 2) (21072403, 6)


In [5]:
user_register_log.columns=['user_id','register_day','register_type','device_type']
app_launch_log.columns=['user_id','day']
video_create_log.columns=['user_id','day']
user_activity_log.columns=['user_id','day','page','video_id','author_id','action_type']

## 数据划分

In [12]:
user=user_register_log[user_register_log.register_day.values<17].user_id.unique()

In [14]:
user_video=video_create_log[video_create_log.day<17]
user_reg=user_register_log[user_register_log.register_day<17]
user_lau=app_launch_log[app_launch_log.day<17]
user_act=user_activity_log[user_activity_log.day<17]

In [65]:
u1=video_create_log[video_create_log.day >= 17].user_id.unique()
u2=user_register_log[user_register_log.register_day >= 17].user_id.unique()
u3=app_launch_log[app_launch_log.day >= 17].user_id.unique()
u4=user_activity_log[user_activity_log.day >= 17].user_id.unique()
u=np.unique(np.concatenate([u1,u2,u3,u4]))

In [41]:
#特征1 操作次数
group = user_act.groupby(user_act.user_id)
act_count = group.size()

#特征2 video创建次数
group = user_video.groupby(user_video.user_id)
video_count = group.size()

#特征3 注册时间（注册日期）
reg_day = user_reg.iloc[:, :2]
reg_day = reg_day.set_index(reg_day.user_id).loc[:, 'register_day']

#特征4 启动次数
group = user_lau.groupby(user_lau.user_id)
lau_count = group.size()

#特征5 最后一次使用日期
group = user_act.groupby(user_act.user_id)
act_maxday = group.day.max()

#特征6 最后一次创建video
group = user_video.groupby(user_video.user_id)
video_maxday = group.day.max()

#特征7 最大操作次数
group = user_act.groupby(['user_id', 'day'])
act_max = group.size()
v = [act_max[i].max() for i in act_max.index.levels[0]]
act_max = pd.Series(v, index=act_max.index.levels[0])

In [149]:
#特征8 连续使用天数
from itertools import groupby
def count_series_day(days):
    lst=days.tolist()
    
    fun = lambda x: x[1] - x[0]
    return np.max([len([v for i, v in g]) for k, g in groupby(enumerate(lst), fun)])

group=user_act.groupby(user_act.user_id)

In [150]:
group.day.apply(lambda df: count_series_day(df.values))

user_id
8          2
73         2
129        1
228        1
351        1
398        1
409        1
509        1
556        2
627        1
948        1
1115       2
1197       3
1341       2
1374       2
1504       4
1607       1
1644       1
1657       1
1704       1
1822       1
1893       1
1914       2
1927       3
1988       3
1995       1
2116       1
2328       1
2332       1
2387       2
          ..
1381525    3
1381614    2
1381665    1
1381697    1
1381715    1
1381725    1
1381752    1
1382136    1
1382173    3
1382328    1
1382333    1
1382361    1
1382423    1
1382514    3
1382548    1
1382639    3
1382683    2
1382849    1
1382992    2
1383128    1
1383142    1
1383263    1
1383289    1
1383467    1
1383475    2
1383627    1
1383642    1
1383684    1
1383705    1
1383717    1
Name: day, Length: 20535, dtype: int64

In [72]:
data = pd.concat(
    [act_count, video_count, reg_day, lau_count, act_maxday, video_maxday],
    axis=1)
data.columns = [
    'act_count', 'video_count', 'reg_day', 'lau_count', 'act_maxday',
    'video_maxday'
]
data = data.fillna(0)
label = [1 if i in u else 0 for i in data.index]
data['target'] = label

In [73]:
data.head()

Unnamed: 0_level_0,act_count,video_count,reg_day,lau_count,act_maxday,video_maxday,target
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8,96.0,0.0,9,4,13.0,0.0,1
73,36.0,0.0,2,3,5.0,0.0,0
129,1.0,0.0,12,1,12.0,0.0,1
228,22.0,0.0,13,1,13.0,0.0,1
351,176.0,0.0,3,1,3.0,0.0,0
