In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from IPython.display import display

In [2]:
# 你可以导入其中的一个数据，或者自己concat完后再导入
data = pd.read_csv('../second_data/JData_Action_201604.csv')

In [22]:
user = pd.read_csv('../second_data/JData_User.csv')
product = pd.read_csv('../second_data/JData_Product.csv')
comment = pd.read_csv('../second_data/JData_Comment.csv')

# do some conversion

In [3]:
data['date'] = data.time.apply(lambda x: x.split(' ')[0])
data['hour'] = data.time.apply(lambda x: x.split(' ')[1])

In [4]:
del data['time']
del data['model_id']

In [5]:
# 标准化列名
data.columns = ['user_id','item_id','behavior_type','cate','brand','date','hour']

![Paste_Image.png](http://upload-images.jianshu.io/upload_images/1132123-86518acb2e1f623a.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

# feature

# uif

- 1:browse
- 2:cart
- 3:del_cart
- 4:buy
- 5:focus
- 6:click

In [8]:
# 全局变量
behavior_str = ['browse', 'cart', 'del_cart', 'buy','focus','click']

In [9]:
def behavior_diy_hour(df, date, hour, behavior = 1, windows_hour = 1, label = 'hour_%s_%s_%s'):
    '''
    按照给定的hour区间对行为计数
    windows_hour: 1,2,3,4,6,8,12
    rtype:DataFrame
    '''

    key = label %(str(hour), str(hour + windows_hour), behavior_str[behavior-1])
    ret = df[df.date.eq(date)& df.behavior_type.eq(behavior) & df.hour.ge(str(hour)) & df.hour.lt(str(hour + windows_hour))]\
    .groupby(['user_id','item_id']).agg({'hour':'count'})
    ret[key] = ret['hour']
    return ret[[key]]

In [10]:
def behavior_times_date(df, date, behavior, label = None):
    '''
    把当天发生的行为的数量提取出来
    rtype:DataFrame
    '''
    ret = df[df['date'].eq(date) & (df['behavior_type'].eq(behavior))].groupby(['user_id', 'item_id']).agg(
        {'behavior_type':'count'})
    key = "behavior_times_date_%s_%s_times" % (date, behavior_str[behavior - 1]) if label == None else label
    ret[key] = ret['behavior_type']
    return ret[[key]]

In [11]:
def behavior_first_time(df, date, behavior, label = None):
    '''
    提取指定行为最早发生时间(小时)
    rtype:DataFrame
    '''
    ret = df[df.date.eq(date) & df.behavior_type.eq(behavior)].groupby(['user_id','item_id']).agg({'hour':'first'})
    key = 'behavior_first_time_%s_%s_hour' % (date, behavior_str[behavior - 1]) if label is None else label
    ret[key] = ret['hour'].apply(lambda x: int(x.split(':')[0]))
    return ret[[key]]

In [12]:
def behavior_last_time(df, date, behavior, label = None):
    '''
    rtype:DataFrame
    '''
    ret = df[df.date.eq(date) & df.behavior_type.eq(behavior)].groupby(['user_id','item_id']).agg({'hour':'max'})
    key = 'behavior_last_time_%s_%s_hour' % (date, behavior_str[behavior - 1]) if label is None else label
    ret[key] = ret['hour'].apply(lambda x: int(x.split(':')[0]))
    return ret[[key]]

In [13]:
def behavior_first_last_time(df, date, behavior, label = None):
    '''
    某个操作的操作时间最大间隔(单位秒)
    rtype:DataFrame
    '''
    first = df[df.date.eq(date) & df.behavior_type.eq(behavior)].groupby(['user_id','item_id']).agg({'hour':'first'})
    last = df[df.date.eq(date) & df.behavior_type.eq(behavior)].groupby(['user_id','item_id']).agg({'hour':'max'})
    def cal(x):
        h, m, s = x.split(':')
        return int(h)*3600 + int(m)*60 + int(s)
    key = 'behavior_first_last_time_%s_%s_second' % (date, behavior_str[behavior - 1]) if label is None else label
    # print first['hour'].apply(lambda x: cal(x))
    first[key] =  last['hour'].apply(lambda x: cal(x)) - first['hour'].apply(lambda x: cal(x))
    return first[[key]]

In [14]:
def all_behavior_first_last_time(df, date, label = None):
    '''
    某个商品总的操作时间(单位秒),与行为无关
    rtype:DataFrame
    '''
    first = df[df.date.eq(date) ].groupby(['user_id','item_id']).agg({'hour':'first'})
    last = df[df.date.eq(date) ].groupby(['user_id','item_id']).agg({'hour':'max'})
    def cal(x):
        h, m, s = x.split(':')
        return int(h)*3600 + int(m)*60 + int(s)
    key = 'no_behavior_first_last_time_%s_second' % (date) if label is None else label
    # print first['hour'].apply(lambda x: cal(x))
    first[key] =  last['hour'].apply(lambda x: cal(x)) - first['hour'].apply(lambda x: cal(x))
    return first[[key]]

In [15]:
def behavior_day_times(df, s_date, l_date ,behavior, label = None):
    '''
    在给定天数里面，所发生行为的天数
    左开右闭
    rtype:DataFrame
    '''
    def cal(series):
        return len(series.unique())
    ret = df[df.behavior_type.eq(behavior)& df.date.ge(s_date) & df.date.lt(l_date)].groupby(['user_id','item_id'])\
    .agg({'date':cal})
    key = 'behavior_day_times_%s_%s_%s_days' % (s_date,l_date, behavior_str[behavior - 1]) if label is None else label
    ret[key] = ret['date']
    return ret[[key]]

In [16]:
def behavior_flag(df, s_date, l_date, behavior, label = None):
    '''
    在给定范围里，是否发生该行为
    左开右闭
    rtype:DataFrame
    '''
    def cal(series):
        if behavior in series.unique():
            return 1
        return 0
    ret = df[ df.date.ge(s_date) & df.date.lt(l_date)].groupby(['user_id','item_id'])\
    .agg({'behavior_type':cal})
    key = 'behavior_flag_%s_%s_%s_days' % (s_date,l_date, behavior_str[behavior - 1]) if label is None else label
    ret[key] = ret['behavior_type']
    return ret[[key]]

In [17]:
def check_v(ui,ii):
    # 用来测试函数
    display(data[data.user_id.eq(ui) & data.item_id.eq(ii)])

In [18]:
def rule_last_time_behavior(df, date, behavior,label = None):
    '''
    在指定date的前一天是否在最后一刻加入购物车/focus/ｏｒ没有．
    是:1
    不是:0
    date为label_day,我们要计算的是前一天.
    rtype:DataFrame
    '''
    def cal(df):
        temp = df.sort_values(['hour'])['behavior_type'].tail(1).values[0]
        if temp == behavior:
            return 1
        return 0
    date = (dt.datetime.strptime(date, '%Y-%m-%d') - dt.timedelta(1)).strftime('%Y-%m-%d')
    p = df[df.date.eq(date)].groupby(['user_id','item_id']).apply(cal)
    key = 'rule_last_time_behavior_%s_%s' % (date, behavior_str[behavior - 1]) if label is None else label 
    return pd.DataFrame(p.rename(key))

In [12]:
def get_action_feat(actions, start_date, end_date):
    '''
    各种行为在一段时间上的累积
    rtype:DataFrame
    '''
    days = (dt.datetime.strptime(end_date, '%Y-%m-%d') - dt.datetime.strptime(start_date,'%Y-%m-%d')).days
    actions = actions[(actions.date >= start_date) & (actions.date < end_date)]
    
    actions = actions[['user_id', 'item_id', 'behavior_type']]
    df = pd.get_dummies(actions['behavior_type'], prefix='accumulate_%d-action' % (days))
    print actions.head()
    actions = pd.concat([actions[['user_id', 'item_id']], df], axis=1)  # type: pd.DataFrame
    actions = actions.groupby(['user_id', 'item_id']).sum()
    return actions

In [19]:
def get_accumulate_action_by_time(actions, start_date, end_date):
    '''
    各种行为随着时间衰减
    rtype:DataFram
    '''
    days = (dt.datetime.strptime(end_date, '%Y-%m-%d') - dt.datetime.strptime(start_date,'%Y-%m-%d')).days
    actions = actions[(actions.date >= start_date) & (actions.date < end_date)]
    actions = actions[['user_id', 'item_id', 'behavior_type']]
    df2 = pd.get_dummies(actions['behavior_type'], prefix='accumulate_by_time_%d-action' % (days))
    actions['weight'] = np.exp(-days)
    for i in df2.columns:
        df2[i]*=actions['weight']
    actions = pd.concat([actions[['user_id','item_id']], df2], axis=1)
    actions = actions.groupby(['user_id', 'item_id']).sum()
    return actions

In [19]:
def ranking(df):
    '''
    对给定的DataFrame进行排序，返回名字: col_name_rank
    rtype:DataFrame
    '''
    ret = df.reset_index()
    name = df.columns[-1]
    ret[name + '_rank'] = ret.groupby(['user_id'])[name].rank(ascending=False,method='dense')
    ret = ret.set_index(['user_id','item_id'])
    return ret[[name + '_rank']]

# tool

In [20]:
def extract_more_label(df, pred_date, more_day = 5):
    '''
    把pred_date这一天的label提取出来
    左开右闭
    '''
    pred_end = (dt.datetime.strptime(pred_date, '%Y-%m-%d') + dt.timedelta(more_day)).strftime('%Y-%m-%d')
    print pred_end
    tlabel = df[df['date'].ge(pred_date) & df['date'].lt(pred_end)].groupby(['user_id', 'item_id']).apply(lambda x: np.sum(x == 4))
    # print df[df['date'].ge(pred_date) & df['date'].lt(pred_end)].shape
    tlabel['label'] = [1 if t >= 1 else 0 for t in tlabel['behavior_type']]
    return tlabel['label']

In [21]:
def extract_label(df, pred_date):
    '''
    把pred_date这一天的label提取出来
    '''
    tlabel = df[(df['date'] == pred_date)].groupby(['user_id', 'item_id']).apply(lambda x: np.sum(x == 4))
    tlabel['label'] = [1 if t >= 1 else 0 for t in tlabel['behavior_type']]
    return tlabel['label']

# do some testing
- 加上滑窗

-　behavior_diy_hour(df, date, hour, behavior = 1, windows_hour = 1, label = 'hour_%s_%s')

-　behavior_times_date(df, date, behavior, label = None)

-　behavior_first_time(df, date, behavior, label = None)

-　behavior_last_time(df, date, behavior, label = None)

-　behavior_first_last_time(df, date, behavior, label = None)

-　all_behavior_first_last_time(df, date, label = None)

-　behavior_day_times(df, s_date, l_date ,behavior, label = None)

-　behavior_flag(df, s_date, l_date, behavior, label = None)

-　rule_last_time_behavior(df, date, behavior,label = None)

In [22]:
def extract_user_item_windows(df, date, days = 5, detail_days = 2):
    '''
    滑动窗口，不取date这一天
    '''
    today = dt.datetime.strptime(date, '%Y-%m-%d') # today是考察日
    prev_day = [(today - dt.timedelta(d+1)).strftime('%Y-%m-%d')
               for d in xrange(days)]
    
    result = []
    # 构建小时的行为数（浏览/加购/删除/下单/关注/点击的次数）
    for day in xrange(detail_days):
        for hour in xrange(24):
            for behavior in range(6):
                result.append(behavior_diy_hour(df, prev_day[day], hour, behavior = behavior+1,label='uif_hour_last_%d_%%s-%%s_%%s' % (day+1)))
        

            
    # 构建天数的操作数,和时间层级的特征
    for day in xrange(days):
        for behavior in range(6):
            result.append(behavior_first_time(df, prev_day[day], behavior+1,
                                                  label="uif_first_%d_%s" % (day+1, behavior_str[behavior ])))
            result.append(behavior_last_time(df, prev_day[day], behavior+1,
                                                  label='uif_last_%d_%s' %(day+1, behavior_str[behavior ])))
            result.append(behavior_times_date(df, prev_day[day], behavior+1, 
                                              label="uif_behavior_times_%d_%s" % (day+1, behavior_str[behavior])))
            result.append(behavior_first_last_time(df, prev_day[day], behavior+1, 
                                                   label = 'uif_behavior_first_last_time_%d_%s' % (day+1, behavior_str[behavior])))
        result.append(all_behavior_first_last_time(df, prev_day[day], 
                                                  label = 'uif_all_behavior_first_last_time_%d'%(day+1)))
    # 左开右闭
    s_date = (today - dt.timedelta(days)).strftime('%Y-%m-%d')
    l_date = today.strftime('%Y-%m-%d')
    for behavior in range(6):
        result.append(behavior_day_times(df, s_date, l_date, behavior+1,
                                        label = 'uif_behavior_day_times_%s'%(behavior_str[behavior])))
        result.append(behavior_flag(df, s_date, l_date, behavior+1,
                                        label = 'uif_behavior_flag_%s'%(behavior_str[behavior])))
        result.append(rule_last_time_behavior(df, l_date, behavior+1,
                                             label = 'uif_rule_last_time_behavior_%s'%(behavior_str[behavior]) ))
        

    rdf = reduce(lambda x, y: x.join(y, how='outer'), result).fillna(0) # 全连接
    print 'rdf_shape: ',rdf.shape
    
    # ranking
    rank_result = []
    for col in rdf.columns:
        # 对每小时的行为排序
        if ('uif_hour_last' in col) and (('browse' in col) or('click' in col)):
            rank_result.append(ranking(rdf[[col]]))
        # 对每天的行为排序
        elif ('uif_behavior_times' in col) and (('browse' in col) or ('click' in col)):
            rank_result.append(ranking(rdf[[col]]))
    
    radf = reduce(lambda x,y: x.join(y, how = 'outer'), rank_result).fillna(0)
    print 'radf_shape: ',radf.shape
    
    
    return rdf.join(radf, how = 'left')

In [23]:
def extract_user_item_with_label(df, date, days=5, detail_days=1):
    """
    与前者相同，但是加入了标签（正样本为 1，负样本为 0）。不同的是，这里抽取的特征所用的日期指的是标签的日期，即给定日前的前一天。
    - 这里的train不取date这一天为train-set
    :param df: 用户消费记录
    :param date: 给定的日期（标签的日期）
    :param days: 概要信息天数
    :param detail_days: 细化到每小时行为的天数.
    :return: 联合用户的测试。
    :rtype: pd.DataFrame
    """
    today = dt.datetime.strptime(date, '%Y-%m-%d')
    training_data = extract_user_item_windows(df, date, days, detail_days) # days是说有多小个last day
    # 添加类标
    # tlabel = extract_label(df, date) 预测后一天
    # 预测后五天
    tlabel = extract_more_label(df, date)
    return training_data.join(tlabel, how='left').fillna(0) # 只有30％左右的正样本出现在考察日之前

In [None]:
# 可以写个for`循环来提取所有训练集

In [309]:

def training_validation(clf, traindata):
    '''
    clf:分类器
    traindata:numpy array
    '''
    print clf
    result = zip(traindata[:, -1].tolist(), clf.predict(traindata[:, :-1]).tolist())
    print 'Actual:\t', len(filter(lambda d: d[0]==1, result))
    print 'Model:\t', len(filter(lambda d: d[1]==1, result))
    print 'Hit:\t', len(filter(lambda d: d[0] == 1 and d[0] == d[1], result))

In [325]:
# 评测函数
def sample_validation(y_test, y_pred, train_without):
    '''
    y_test: nnumpy array
    y_pred: numpy array
    train_without:列中含有user_id和item_id的DataFrame
    '''
    p_A_tr = np.sum(train_without.loc[y_pred==1].user_id.isin(train_without.loc[y_test ==1].user_id))
    sub_all = len(y_pred)
    A_tr = np.sum(y_test == 1)
    p11 = p_A_tr / float(sub_all)
    r11 = p_A_tr / float(A_tr)
    F11 = 6 * r11 * p11 / (5 * r11 + p11)

    # F12
    p_iA_tr = np.sum((y_pred == y_test) & (y_pred == 1))
    p12 = p_iA_tr / float(sub_all)
    r12 = p_iA_tr / float(A_tr)
    F12 = 5 * r12 * p12 / (2 * r12 + 3*p12)

    F = 0.4 * F11 + 0.6 * F12


    result = zip(y_test, y_pred.tolist())
    print 'Actual:\t', len(filter(lambda d: d[0]==1, result))
    print 'Model:\t', len(filter(lambda d: d[1]==1, result))
    print 'Hit:\t', len(filter(lambda d: d[0] == 1 and d[0] == d[1], result))
    print 'F11: ', F11
    print 'F12: ', F12
    print 'F: ', F

In [339]:
sample_validation(a.reset_index().label,clf.predict(a.drop('label', axis=1).values),a.reset_index())

Actual:	412
Model:	1856
Hit:	67
F11:  0.000667808452547
F12:  0.000948288551468
F:  0.0008360965119


In [323]:
training_validation(clf, a.values)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 100},
            criterion='gini', max_depth=10, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Actual:	412
Model:	1856
Hit:	67


In [324]:
sorted(zip(a.drop('label',axis=1).columns,clf.feature_importances_), key = lambda x:x[1],reverse=True)

[('uif_all_behavior_first_last_time_1', 0.048070137375339349),
 ('uif_behavior_day_times_browse', 0.040853140463115924),
 ('uif_behavior_first_last_time_1_browse', 0.040572201803783417),
 ('uif_behavior_first_last_time_1_click', 0.036222041599309508),
 ('uif_behavior_flag_cart', 0.035992611511771216),
 ('uif_behavior_times_1_click', 0.035379696553136303),
 ('uif_behavior_day_times_cart', 0.030591694739793351),
 ('uif_behavior_times_1_browse', 0.02726275522112875),
 ('uif_behavior_times_2_click', 0.017917288053179093),
 ('uif_behavior_first_last_time_2_click', 0.017392540755376096),
 ('uif_behavior_day_times_click', 0.016507919646641261),
 ('uif_last_1_browse', 0.016202969985238513),
 ('uif_behavior_first_last_time_2_browse', 0.01470572384856389),
 ('uif_behavior_times_1_click_rank', 0.014578812855042993),
 ('uif_first_1_cart', 0.014357571921237882),
 ('uif_all_behavior_first_last_time_2', 0.014071945777508858),
 ('uif_behavior_times_1_cart', 0.014000642186008634),
 ('uif_all_behavior_f

# user-feature

# item-feature