# 这个脚本展示如何构建时间滑窗特征

In [None]:
# 假设已经有用户购买商品的 DataFrame `df`，包含列 ['customer_id','date','order_total_num']
from datetime import date, timedelta

import numpy as np
import pandas as pd

import reduce_mem_usage

In [None]:
# Sum each customer's order quantity per day, then pivot to one row per
# customer and one column per date; customer/day pairs without orders become 0.
df_goods = (
    df[['customer_id', 'date', 'order_total_num']]
    .groupby(['customer_id', 'date'])['order_total_num']
    .sum()
    .unstack(level=-1)
    .fillna(0)
)

In [None]:
# Select, from a wide per-day table, the columns of a window that starts a
# given number of days before a reference date.
def get_timespan(df, dt, minus, periods, freq='D'):
    """Return the columns of ``df`` for a date window located before ``dt``.

    Args:
        df: DataFrame whose columns are dates (one column per day).
        dt: Reference date; anything ``pd.Timestamp`` accepts (``date``,
            ``datetime``, ``Timestamp`` or a date string).
        minus: Number of days before ``dt`` at which the window starts.
        periods: Number of dates in the window.
        freq: Spacing between consecutive dates, as a pandas frequency string.

    Returns:
        ``df`` restricted to the columns
        ``pd.date_range(dt - minus days, periods=periods, freq=freq)``.
    """
    # pandas' own types are used for the date arithmetic so the function does
    # not depend on `timedelta` having been imported by the caller.
    start = pd.Timestamp(dt) - pd.Timedelta(days=minus)
    return df[pd.date_range(start, periods=periods, freq=freq)]
# NOTE: get_timespan itself does no memory optimisation; the memory-reduction
# step (trading run time for a lower risk of running out of memory) is the
# reduce_mem_usage call made in prepare_dataset.

In [None]:
def prepare_dataset(df_payment, t2020, is_train=True):
    """Build sliding-window features (and optionally labels) around ``t2020``.

    Args:
        df_payment: Wide table with ``customer_id`` as (or in) its index and
            one column per day; every feature window is cut from it with
            ``get_timespan``.
        t2020: Reference date. Each feature window starts some days before it;
            the label window is the 30 days starting at it.
        is_train: When true, also build and return the label vector.

    Returns:
        ``X`` when ``is_train`` is false, otherwise ``(X, y)``. ``X`` is a
        DataFrame with one row per row of ``df_payment``; ``y`` is an array
        that is 1 where the row maximum over the label window is positive.

    Note:
        Labels are read from the module-level ``df_goods`` table, not from an
        argument, so that name must be bound before calling with
        ``is_train=True``.
        NOTE(review): ``y`` is matched to ``X`` purely by row position, which
        presumably relies on ``df_goods`` and ``df_payment`` sharing the same
        customer order — confirm.
    """
    # Features are collected column by column and turned into a DataFrame at
    # the end.
    X = {}
    # Carry the customer ids along with the features.
    X['customer_id'] = df_payment.reset_index()['customer_id']

    # Sliding-window statistics over the i days that precede t2020.
    for i in [7, 14, 30, 49, 60, 91, 120]:
        # The same window feeds both the value statistics and the activity
        # indicators, so it is fetched once.
        window = get_timespan(df_payment, t2020, i, i)
        X['diff_%s_mean' % i] = window.diff(axis=1).mean(axis=1).values
        # Weighted sum in which the most recent day has weight 1 and each
        # earlier day is multiplied by a further factor of 0.9.
        X['mean_%s_decay' % i] = (window * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
        X['mean_%s' % i] = window.mean(axis=1).values
        X['median_%s' % i] = window.median(axis=1).values
        X['min_%s' % i] = window.min(axis=1).values
        X['max_%s' % i] = window.max(axis=1).values
        X['std_%s' % i] = window.std(axis=1).values
        X['sum_%s' % i] = window.sum(axis=1).values

        has_sales = window != 0
        X['has_sales_days_in_last_%s' % i] = has_sales.sum(axis=1).values
        # i minus the largest 0-based position holding a non-zero value.
        X['last_has_sales_day_in_last_%s' % i] = i - (has_sales * np.arange(i)).max(axis=1).values
        # Positions are weighted i..1, so the max picks the earliest non-zero day.
        X['first_has_sales_day_in_last_%s' % i] = (has_sales * np.arange(i, 0, -1)).max(axis=1).values

    # Coarser recent-history features: totals of three 30-day windows starting
    # 30, 60 and 90 days before t2020.
    for i in range(1, 4):
        X['day_%s_' % i] = get_timespan(df_payment, t2020, i*30, 30).sum(axis=1).values

    for i in range(7):
        X['mean_4_dow{}_2020'.format(i)] = get_timespan(df_payment, t2020, 56-i*2, 4, freq='14D').mean(axis=1).values
        # A former assignment to this same key indexed the module-level `df`
        # instead of `df_payment` and was overwritten on the next line; it
        # never reached the output and has been removed.
        X['mean_20_dow{}_2020'.format(i)] = get_timespan(df_payment, t2020, 140-i*2, 10, freq='14D').mean(axis=1).values

    X = pd.DataFrame(X)
    # The file does `import reduce_mem_usage`, which binds a module, and a
    # module is not callable. Use the same-named function inside it when there
    # is one; if the name already refers to a callable it is used unchanged.
    # NOTE(review): presumably the helper (from the same repository) shrinks
    # dtypes in place — its return value is ignored, as before. Confirm.
    shrink = getattr(reduce_mem_usage, 'reduce_mem_usage', reduce_mem_usage)
    shrink(X)

    if is_train:
        # With the pivoted layout, labelling is a plain NumPy mask.
        # This assumes the daily totals are never negative.
        y = df_goods[pd.date_range(t2020, periods=30)].max(axis=1).values
        y[y > 0] = 1
        return X, y
    return X

In [None]:
# Aggregate a per-order column into one total per customer per day.
def make_slides(train, df_part, begin, end, column):
    """Pivot ``column`` of ``train`` into a customer-by-day table of daily sums.

    Args:
        train: Order-level DataFrame; ``order_pay_time``, ``customer_id`` and
            ``column`` are read from it.
        df_part: DataFrame inner-merged with the selected orders to restrict
            them (the merge joins on the columns the two frames share).
        begin: Exclusive lower bound for ``order_pay_time``.
        end: Exclusive upper bound for ``order_pay_time``.
        column: Name of the column to sum per customer and day.

    Returns:
        DataFrame indexed by ``customer_id`` with one column per calendar date
        present in the selection; customer/day pairs without orders are 0.
    """
    # NOTE(review): a previous first statement filtered on
    # `order_status <= 6` and `order_pay_time > '2020-02-01'`, but its result
    # was immediately overwritten and never influenced the output, so it has
    # been dropped. Confirm whether the status filter was meant to apply.
    #
    # Both bounds are combined into one mask: chaining `train[a][b]` indexes
    # the already-filtered frame with a full-length mask, which pandas has to
    # re-align (and warns about).
    in_window = (train.order_pay_time > begin) & (train.order_pay_time < end)
    orders = pd.merge(train[in_window], df_part, how='inner')
    # Drop the time of day so that all orders of one day share a key.
    orders['date'] = pd.DatetimeIndex(orders['order_pay_time']).date
    daily = (
        orders.groupby(['customer_id', 'date'])[column]
        .sum()
        .unstack(level=-1)
        .fillna(0)
    )
    return daily

In [None]:
# Customers with at least one order paid in (2020-01-01, 2020-06-06].
# A single combined mask replaces the chained `train[a][b]` indexing, which
# makes pandas re-align a full-length mask against the filtered frame.
paid_in_window = (train.order_pay_time > '2020-01-01') & (train.order_pay_time <= '2020-06-06')
df_0606 = train.loc[paid_in_window, ['customer_id']]
# Drop duplicate rows so that each customer appears once.
df_0606 = df_0606.drop_duplicates(['customer_id'])
df_part1_partment = make_slides(train, df_0606, '2020-01-01', '2020-07-06', 'order_total_payment')
df_part1_goods = make_slides(train, df_0606, '2020-01-01', '2020-07-06', 'order_total_num')
# prepare_dataset takes (df_payment, t2020, is_train=True) and reads its label
# table from the module-level name `df_goods`. Passing the goods table as a
# second positional argument would put it in `t2020` and the date in
# `is_train`, so the table is bound to `df_goods` and the call matches the
# signature.
df_goods = df_part1_goods
X_part1, y_part1 = prepare_dataset(df_part1_partment, date(2020, 6, 6))

In [None]:
# Persist the feature table as a pickle and the label vector as a .npy file.
X_part1.to_pickle(path='xxx.pkl')
np.save(file="xxx.npy", arr=y_part1)

# 纯时间序列滑窗和其他特征交叉效果更好，如用户X商品，用户X会员，用户X省份等。