In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from tqdm import tqdm
from datetime import date, timedelta
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers import LSTM,Conv1D, Input, Dense, Add, Multiply
from keras import callbacks
from keras import optimizers,models
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
path = './input/'
df_train = pd.read_csv(path+'train.csv',
converters={'unit_sales':lambda u: np.log1p(float(u)) if float(u) > 0 else 0},parse_dates=["date"])
df_test  = pd.read_csv(path + "test.csv",parse_dates=["date"])
items = pd.read_csv(path+'items.csv')
stores = pd.read_csv(path+'stores.csv')
# 类型转换
df_train['onpromotion'] = df_train['onpromotion'].astype(bool)
df_test['onpromotion'] = df_test['onpromotion'].astype(bool)

In [3]:
df_2017 = df_train.loc[df_train.date>=pd.datetime(2015,12,1)] 
del df_train

df_2017 = df_2017.merge(items, on='item_nbr', how='left')
df_2017 = df_2017.merge(stores, on='store_nbr', how='left')

In [4]:
tmp = df_2017[df_2017['date']=='2016-12-26']
tmp['date'] = '2016-12-25'
df_2017 = pd.concat([df_2017, tmp], axis=0, ignore_index=True)

In [5]:
promo_2017_train = df_2017.set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

promo_2017_test = df_test.set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
promo_2017 = promo_2017.astype('int')

In [6]:
df_2017 = df_2017.set_index(["store_nbr", "item_nbr", "city", "class", "date"])[["unit_sales"]].unstack(level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1) 

In [7]:
def get_date_range(df, dt, forward_steps, periods, freq='D'):
    return df[pd.date_range(start=dt-timedelta(days=forward_steps), periods=periods, freq=freq)]

In [8]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        # 视点前 N日促销次数
        'promo_3_2017': get_date_range(promo_2017, t2017, 3, 3).sum(axis=1).values,
        'promo_7_2017': get_date_range(promo_2017, t2017, 7, 7).sum(axis=1).values,
        'promo_14_2017': get_date_range(promo_2017, t2017, 14, 14).sum(axis=1).values,
        # 预测集一年前的16日统计销量
        "last_year_mean": get_date_range(df_2017, t2017, 365, 16).mean(axis=1).values,
        "last_year_meidan": get_date_range(df_2017, t2017, 365, 16).median(axis=1).values,
        "last_year_max": get_date_range(df_2017, t2017, 365, 16).max(axis=1).values,
        "last_year_min": get_date_range(df_2017, t2017, 365, 16).min(axis=1).values,
        # 预测集一年前的16日0销次数
        "last_year_count0": (get_date_range(df_2017, t2017, 365, 16)==0).sum(axis=1).values,
        # 预测集一年前的16日促销次数
        "last_year_promo": get_date_range(promo_2017, t2017, 365, 16).sum(axis=1).values
    })
    
    for i in range(1,8):
        # 历史平移，前 N天的销量
        X["day_{}_hist".format(i)] = get_date_range(df_2017, t2017, i, 1).values.ravel()
        
    
    for i in [3,5,7,14,21,30,60,90,150,365]:
        for d in [0,7,14]:
            # 窗口统计，销量 diff/mean/meidan/max/min/std
            X['before_diff_{}_day_mean'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).diff(1,axis=1).mean(axis=1).values
            X['after_diff_{}_day_mean'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).diff(-1,axis=1).mean(axis=1).values
            X['mean_%s_decay_1' % i] = (get_date_range(df_2017, t2017-timedelta(days=d), i, i) * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
            X['mean_%s_decay_2' % i] = (get_date_range(df_2017, t2017-timedelta(days=d), i, i) * np.power(0.7, np.arange(i)[::-1])).sum(axis=1).values
            X['mean_%s_decay_3' % i] = (get_date_range(df_2017, t2017-timedelta(days=d), i, i) * np.power(0.5, np.arange(i)[::-1])).sum(axis=1).values
            X['mean_{}_day'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).mean(axis=1).values
            X['median_{}_day'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).median(axis=1).values
            X['max_{}_day'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).max(axis=1).values
            X['min_{}_day'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).min(axis=1).values
            X['std_{}_day'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).std(axis=1).values

        # 有/无促销的时间，销量统计
        X['mean_{}_day_haspromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==1].mean(axis=1).values
        X['median_{}_day_haspromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==1].median(axis=1).values
        X['max_{}_day_haspromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==1].max(axis=1).values
        X['min_{}_day_haspromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==1].min(axis=1).values
        X['std_{}_day_hasnopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==1].std(axis=1).values

        X['mean_{}_day_nopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==0].mean(axis=1).values
        X['median_{}_day_nopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==0].median(axis=1).values
        X['max_{}_day_nopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==0].max(axis=1).values
        X['min_{}_day_nopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==0].min(axis=1).values
        X['std_{}_day_nopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==0].std(axis=1).values

        # 无销量次数与促销次数
        X['count0_{}_2017'.format(i)] = (get_date_range(df_2017, t2017, i, i)==0).sum(axis=1).values
        X['promo_{}_2017'.format(i)] = get_date_range(promo_2017, t2017, i, i).sum(axis=1).values
                               
        
    for i in range(7):
        # 前 N 周平均每周 i 的销量
        for periods in [5,10,15,20]:
            steps = periods * 7
            X['before_diff_{}_dow{}_2017'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').diff(1,axis=1).mean(axis=1).values
            X['after_diff_{}_dow{}_2017'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').diff(-1,axis=1).mean(axis=1).values
            X['mean_{}_dow{}_2017'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').mean(axis=1).values
            X['median_{}_dow{}_2017'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').median(axis=1).values
            X['max_{}_dow{}_2017'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').max(axis=1).values
            X['min_{}_dow{}_2017'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').min(axis=1).values
            X['std_{}_dow{}_2017'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').std(axis=1).values
        
        
    for i in range(16):
        # 未来16天是否促销日
        X["promo_{}".format(i)] = promo_2017[str(t2017 + timedelta(days=i))].values.astype(np.uint8)
        X["promo_bef_{}".format(i)] = promo_2017[str(t2017 + timedelta(days=-i))].values.astype(np.uint8)
    
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [9]:
#【训练集】以7月5日后的16天为最后一个训练集窗口，依次向前递推14周得到14个训练窗口的训练数据
X_l, y_l = [], []
t2017 = date(2017, 7, 5)
n_range = 25
for i in tqdm(range(n_range)):
    
    X_tmp, y_tmp = prepare_dataset(t2017 - timedelta(days=7 * i))
    X_l.append(X_tmp)
    y_l.append(y_tmp)
    
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l

#【验证集】7.26
X_val, y_val = prepare_dataset(date(2017, 7, 26))
#【测试集】8.16
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

100%|██████████| 25/25 [59:44<00:00, 102.31s/it]


In [10]:
def build_model(shape_):
    
    def wave_block(x, filters, kernel_size, n):
        dilation_rates = [2**i for i in range(n)]
        x = Conv1D(filters = filters,
                   kernel_size = 1,
                   padding = 'same')(x)
        res_x = x
        for dilation_rate in dilation_rates:
            tanh_out = Conv1D(filters = filters,
                              kernel_size = kernel_size,
                              padding = 'same', 
                              activation = 'tanh', 
                              dilation_rate = dilation_rate)(x)
            sigm_out = Conv1D(filters = filters,
                              kernel_size = kernel_size,
                              padding = 'same',
                              activation = 'sigmoid', 
                              dilation_rate = dilation_rate)(x)
            x = Multiply()([tanh_out, sigm_out])
            x = Conv1D(filters = filters,
                       kernel_size = 1,
                       padding = 'same')(x)
            res_x = Add()([res_x, x])
        return res_x
    
    inp = Input(shape = (shape_))
    
    x = wave_block(inp, 32, 3, 8)
    x = wave_block(x, 64, 3, 4)
    x = wave_block(x, 128, 3, 1)
    
    out = Dense(1, name = 'out')(x)
    
    model = models.Model(inputs = inp, outputs = out)
    
    return model

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(pd.concat([X_train, X_val, X_test]))
X_train[:] = scaler.transform(X_train)
X_val[:] = scaler.transform(X_val)
X_test[:] = scaler.transform(X_test)

X_train = X_train.fillna(0)
X_val = X_val.fillna(0)
X_test = X_test.fillna(0)

X_train = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))
X_val = X_val.values.reshape((X_val.shape[0], 1, X_val.shape[1]))

In [12]:
val_pred = []
test_pred = []

item_perishable_dict = dict(zip(items['item_nbr'],items['perishable'].values))
train_weight = []
val_weight = []

items_ = df_2017.reset_index()['item_nbr'].tolist() * n_range
for item in items_:
    train_weight.append(item_perishable_dict[item] * 0.25 + 1)

for i in range(16):

    print("====== Step %d ======" % (i+1))
    
    # 编译部分
    shape_ = (None, X_train.shape[2])
    model = build_model(shape_)
    model.compile(loss='mse', optimizer=optimizers.Adam(lr=0.001), metrics=['mse'])
    # 回调函数
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=3, min_lr=0.0001, verbose=1)
    earlystopping = EarlyStopping(
        monitor='val_loss', min_delta=0.0001, patience=3, verbose=1, mode='min')
    callbacks = [reduce_lr, earlystopping]
    # 训练部分
    model.fit(X_train, y_train[:, i].reshape((y_train.shape[0], 1, 1)), batch_size = 8192*2, epochs = 3, verbose=1,
              sample_weight=np.array(train_weight), validation_data=(X_val, y_val[:, i].reshape((y_val.shape[0], 1, 1))), 
              callbacks=callbacks, shuffle=True)

    val_pred.append(model.predict(X_val))
    test_pred.append(model.predict(X_test))
        
# 0.279288 0.310069
# 0.278647 0.309754
# 0.27667  0.310175
# 0.276117 0.309686
# 0.274491 0.307200 sale_lgb_9








Train on 4308475 samples, validate on 172339 samples
Epoch 1/3





Epoch 2/3
Epoch 3/3
Train on 4308475 samples, validate on 172339 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 4308475 samples, validate on 172339 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 4308475 samples, validate on 172339 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 4308475 samples, validate on 172339 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 4308475 samples, validate on 172339 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 4308475 samples, validate on 172339 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 4308475 samples, validate on 172339 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 4308475 samples, validate on 172339 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 4308475 samples, validate on 172339 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 4308475 samples, validate on 172339 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 4308475 samples, validate on 172339 samples
Epoch 1/3
Epoch 2/3
Ep

In [24]:
val_pred[0].transpose()[0].transpose()

(172339, 16)

In [42]:
np.array([val_pred[i].transpose()[0].transpose() for i in range(16)]).transpose()[0].shape

(172339, 16)

In [44]:
print("验证集 mse:", mean_squared_error(
    y_val, np.array([val_pred[i].transpose()[0].transpose() for i in range(16)]).transpose()[0]))

y_test = np.array([test_pred[i].transpose()[0].transpose() for i in range(16)]).transpose()[0]
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame('unit_sales')
df_preds = df_preds.reset_index()
df_preds.columns = ['store_nbr', 'item_nbr', 'city', 'class', 'date', 'unit_sales']

submission = df_test[['id','date','store_nbr','item_nbr']].merge(df_preds, on=['date','store_nbr','item_nbr'], how='left').fillna(0)
submission['unit_sales'] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission[['id','unit_sales']].to_csv('sale_wavenet_1.csv', float_format='%.4f', index=None)

# 0.3479347767794617
# 0.3436085951363862
# 0.3420840578048175
# 0.3417190536788496
# 0.3388075841917157 sale_lgb_8.csv
# 

验证集 mse: 0.3527629813294579


In [None]:
submission.head()

In [None]:
sale_lgb1 = pd.read_csv('sale_lgb_98.csv')
sale_lgb2 = pd.read_csv('sale_lgb.csv')

In [None]:
sale_lgb1.info()

In [None]:
sale_lgb2.info()

In [6]:
df_train

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,2013-01-01,25,103665,7.0,
1,1,2013-01-01,25,105574,1.0,
2,2,2013-01-01,25,105575,2.0,
3,3,2013-01-01,25,108079,1.0,
4,4,2013-01-01,25,108701,1.0,
...,...,...,...,...,...,...
125497035,125497035,2017-08-15,54,2089339,4.0,False
125497036,125497036,2017-08-15,54,2106464,1.0,True
125497037,125497037,2017-08-15,54,2110456,192.0,False
125497038,125497038,2017-08-15,54,2113914,198.0,True
