In [1]:
from datetime import date,timedelta
import gc
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import time
import lightgbm as lgb


# 减内存占用
def reduce_mem_usage(df, verbose=True):
    """
    减少内存
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (
                start_mem - end_mem) / start_mem))
    return df

# 1.导入数据集

In [2]:
start = time.time()
print("Loading data......")

# train.csv
df_train = pd.read_csv('../data/train.csv',
                       usecols = [1,2,3,4,5],
                      dtype = {'onpromotion':bool},
                      converters = {'unit_sales':lambda u:np.log1p(float(u)) 
                                    if float(u) > 0 else 0},
                      parse_dates = ["date"],
                      # skip 2016
                      skiprows = range(1,66458909)) 

# test.csv
df_test = pd.read_csv('../data/test.csv',
                     usecols = [0,1,2,3,4],
                     dtype = {'onpromotion':bool},
                     parse_dates = ["date"],
                     ).set_index(['store_nbr','item_nbr','date'])

# items.csv
items = pd.read_csv('../data/items.csv').set_index("item_nbr")

# stores.csv
stores = pd.read_csv('../data/stores.csv').set_index("store_nbr")

end = time.time()
print("Finish!!!Using %dmin"%int((end-start)/60))

Loading data......
Finish!!!Using 2min


In [3]:
print("Shape of df_train is ",df_train.shape)
print("Shape of df_test is ",df_test.shape)
print("Shape of items is ",items.shape)
print("Shape of stores is ",stores.shape)

Shape of df_train is  (59038132, 5)
Shape of df_test is  (3370464, 2)
Shape of items is  (4100, 3)
Shape of stores is  (54, 4)


In [4]:
df_train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
0,2016-01-01,25,105574,2.564949,False
1,2016-01-01,25,105575,2.302585,False
2,2016-01-01,25,105857,1.386294,False
3,2016-01-01,25,108634,1.386294,False
4,2016-01-01,25,108701,1.098612,True


In [5]:
df_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,False
1,99197,2017-08-16,125497041,False
1,103501,2017-08-16,125497042,False
1,103520,2017-08-16,125497043,False
1,103665,2017-08-16,125497044,False


In [6]:
items.head()

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,GROCERY I,1093,0
99197,GROCERY I,1067,0
103501,CLEANING,3008,0
103520,GROCERY I,1028,0
103665,BREAD/BAKERY,2712,1


In [7]:
stores.head()

Unnamed: 0_level_0,city,state,type,cluster
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Quito,Pichincha,D,13
2,Quito,Pichincha,D,13
3,Quito,Pichincha,D,8
4,Quito,Pichincha,D,9
5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


# 2.数据预处理

### 相关类别型特征整数编码

In [8]:
# 相关特征整数编码
le = LabelEncoder()
items['family'] = le.fit_transform(items['family'].values)

stores['city'] = le.fit_transform(stores['city'].values)
stores['state'] = le.fit_transform(stores['state'].values)
stores['type'] = le.fit_transform(stores['type'].values)

#只使用2017的训练数据
df_2017 = df_train.loc[df_train.date >= pd.datetime(2017,1,1)]
del df_train

### 训练集、测试集不同id的促销情况dataframe

In [9]:
# 创建训练集促销情况dataframe,index为['store_nbr','item_nbr'],col_index为['onpromotion','date']
promo_2017_train = df_2017.set_index(
    ['store_nbr','item_nbr','date'])[['onpromotion']].unstack(
    level = -1).fillna(False)

# 将col_index选为'date'
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
# promo_2017_train.head()

# 创建测试集促销情况dataframe,index为['store_nbr','item_nbr'],col_index为['onpromotion','date']
promo_2017_test = df_test[['onpromotion']].unstack(level = -1).fillna(False)

# 将col_index选为'date'
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test.head()

Unnamed: 0_level_0,date,2017-08-16 00:00:00,2017-08-17 00:00:00,2017-08-18 00:00:00,2017-08-19 00:00:00,2017-08-20 00:00:00,2017-08-21 00:00:00,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103501,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


### 合并训练集、测试集不同id的促销情况dataframe

In [10]:
print('Prev shape of promo_2017_train is ',promo_2017_train.shape)
print('Prev shape of promo_2017_test is ',promo_2017_test.shape)

# 将促销dataframe的index统一为promo_2017_train的index
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

print('Now shape of promo_2017_train is ',promo_2017_train.shape)
print('Now shape of promo_2017_test is ',promo_2017_test.shape)


# 合并两个促销dataframe
promo_2017 = pd.concat([promo_2017_train,promo_2017_test],axis=1)
print('Shape of promo_2017 is ',promo_2017.shape)
del promo_2017_test,promo_2017_train

Prev shape of promo_2017_train is  (167515, 227)
Prev shape of promo_2017_test is  (210654, 16)
Now shape of promo_2017_train is  (167515, 227)
Now shape of promo_2017_test is  (167515, 16)
Shape of promo_2017 is  (167515, 243)


### 训练集不同id的销售额dataframe

In [11]:
# 将df_2017的index设为['store_nbr','item_nbr'],col_index设为['date'],值为['unit_sales']
df_2017 = df_2017.set_index(['store_nbr','item_nbr','date'])[['unit_sales']].unstack(level = -1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)
df_2017.head()

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,99197,0.0,0.0,1.386294,0.693147,0.693147,0.693147,1.098612,0.0,0.0,0.693147,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,0.0,0.693147,1.098612,0.0,1.098612,1.386294,0.693147,0.0,0.693147,0.693147,...,0.0,0.0,1.386294,0.0,1.386294,0.693147,0.693147,0.693147,0.0,0.0
1,103665,0.0,0.0,0.0,1.386294,1.098612,1.098612,0.693147,1.098612,0.0,2.079442,...,0.693147,1.098612,0.0,2.079442,2.302585,1.098612,0.0,0.0,0.693147,0.693147
1,105574,0.0,0.0,1.791759,2.564949,2.302585,1.94591,1.609438,1.098612,1.386294,2.302585,...,0.0,1.791759,2.079442,1.94591,2.397895,1.791759,1.791759,0.0,1.386294,1.609438


### 不同商品(item)的销售额、促销情况dataframe

In [12]:
# 统一items和df_2017的index
print("Prev shape of items is ",items.shape)
items = items.reindex(df_2017.index.get_level_values(1))
print("Now shape of items is ",items.shape)


# 各个item在所有商店的销售额的和
df_2017_item = df_2017.groupby('item_nbr')[df_2017.columns].sum()
# df_2017_item.head()


# 各个item在所有商店的促销情况和
promo_2017_item = promo_2017.groupby('item_nbr')[promo_2017.columns].sum()
promo_2017_item.head()

Prev shape of items is  (4100, 3)
Now shape of items is  (167515, 3)


date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
99197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0
103520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
103665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,1.0,0.0


### 特定商品在特定商店 (class+store_nbr)的销售额与促销情况

In [13]:
# 商品类型+商店类型的销售额dataframe
df_2017_store_class = df_2017.reset_index()

df_2017_store_class['class'] = items['class'].values
df_2017_store_class_index = df_2017_store_class[['class','store_nbr']]
df_2017_store_class = df_2017_store_class.groupby(['class','store_nbr'])[df_2017.columns].sum()
# df_2017_store_class.head()


# 商品类型+商店类型的促销情况dataframe
df_2017_promo_store_class = promo_2017.reset_index()

df_2017_promo_store_class['class'] = items['class'].values
df_2017_promo_store_class_index = df_2017_promo_store_class[['class','store_nbr']]
df_2017_promo_store_class = df_2017_promo_store_class.groupby(['class','store_nbr'])[promo_2017.columns].sum()
df_2017_promo_store_class.head()

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
class,store_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1002,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1002,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1002,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1002,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
1002,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


# 3.特征工程

In [14]:
# 定义get_timespan()函数，用于找出df的特定时间段数据
def get_timespan(df,dt,minus,periods,freq='D'):
    '''
    Input
    df : 要处理的dataframe
    dt : 当前时间戳
    minus : 要回溯的天数
    periods : 输出包含时间戳数
    freq : 频率
    
    Output : dt - minus开始，频率为freq的periods个时间戳对应的部分df
    '''
    return df[pd.date_range(dt - timedelta(days=minus),periods=periods,freq=freq)]

In [15]:
# 创建prepare_dataset函数，用于构建统计学特征
def prepare_dataset(df,promo_df,t2017,is_train=True,name_prefix=None):
    X = {
        # t2017过去14天的促销情况和
        "promo_14_2017":get_timespan(promo_df,t2017,14,14).sum(axis=1).values,
        # t2017过去60天的促销情况和
        "promo_60_2017":get_timespan(promo_df,t2017,60,60).sum(axis=1).values,
        # t2017过去140天的促销情况和
        "promo_140_2017":get_timespan(promo_df,t2017,140,140).sum(axis=1).values,
        
        
        # t2017之后3天的促销情况和
        "promo_3_2017_aft":get_timespan(promo_df,t2017+timedelta(days=16),15,3).sum(axis=1).values,
        # t2017之后7天的促销情况和
        "promo_7_2017_aft":get_timespan(promo_df,t2017+timedelta(days=16),15,7).sum(axis=1).values,
        # t2017之后14天的促销情况和
        "promo_14_2017_aft":get_timespan(promo_df,t2017+timedelta(days=16),15,14).sum(axis=1).values,
        
    }
    

    for i in [3,7,14,30,60,140]:
        # 取df在t2017前i天的所有列
        tmp1 = get_timespan(df,t2017,i,i)
        # 取promo在t2017前i天的所有列,并将布尔型转换为数值型
        tmp2 = (get_timespan(promo_df,t2017,i,i)>0)*1
        
        # t2017前i天销售的促销商品的销售额均值
        X['has_promo_mean_%s' % i] = (tmp1 * tmp2.replace(0,np.nan)).mean(axis=1).values
        # t2017前i天销售的促销商品的销售额衰变和，衰变因子为0.9
        X['has_promo_mean_%s_decay' % i] = (tmp1 * tmp2.replace(0, np.nan) * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
        # t2017前i天销售的非促销商品的销售额均值
        X['no_promo_mean_%s' % i] = (tmp1 * (1 - tmp2).replace(0, np.nan)).mean(axis=1).values
        # t2017前i天销售的非促销商品的销售额均值
        X['no_promo_mean_%s_decay' % i] = (tmp1 * (1 - tmp2).replace(0, np.nan) * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
    
    for i in [3,7,14,30,60,140]:
        # 取df在t2017前i天的所有列
        tmp = get_timespan(df,t2017,i,i)
        # t2017前i天商品的销售额日变化值均值
        X['diff_%s_mean' % i] = tmp.diff(axis=1).mean(axis=1).values
        # t2017前i天商品的销售额衰变和，衰变因子为0.9
        X['mean_%s_decay' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
        # t2017前i天商品的销售额均值
        X['mean_%s' % i] = tmp.mean(axis=1).values
        # t2017前i天商品的销售额中位数
        X['median_%s' % i] = tmp.median(axis=1).values
        # t2017前i天商品的销售额最小值
        X['min_%s' % i] = tmp.min(axis=1).values
        # t2017前i天商品的销售额最大值
        X['max_%s' % i] = tmp.max(axis=1).values
        # t2017前i天商品的销售额方差
        X['std_%s' % i] = tmp.std(axis=1).values
        
    for i in [3, 7, 14, 30, 60, 140]:
        # 取df在t2017一周后前i天的所有列
        tmp = get_timespan(df, t2017 + timedelta(days=-7), i, i)
        # t2017前i天商品的销售额日变化值均值
        X['diff_%s_mean_2' % i] = tmp.diff(axis=1).mean(axis=1).values
        # t2017一周后前i天商品的销售额衰变和，衰变因子为0.9
        X['mean_%s_decay_2' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
        # t2017一周后前i天商品的销售额均值
        X['mean_%s_2' % i] = tmp.mean(axis=1).values
        # t2017一周后前i天商品的销售额中位数
        X['median_%s_2' % i] = tmp.median(axis=1).values
        # t2017一周后前i天商品的销售额最小值
        X['min_%s_2' % i] = tmp.min(axis=1).values
        # t2017一周后前i天商品的销售额最大值
        X['max_%s_2' % i] = tmp.max(axis=1).values
        # t2017一周后前i天商品的销售额方差
        X['std_%s_2' % i] = tmp.std(axis=1).values
        
    for i in [7, 14, 30, 60, 140]: 
        # 取df在t2017前i天的所有列
        tmp = get_timespan(df,t2017,i,i)
        # t2017前i天有销售额的天数
        X['has_sales_days_in_last_%s' % i] = (tmp > 0).sum(axis=1).values
        # t2017前i天有销售额的最后一天距离t2017的天数
        X['last_has_sales_day_in_last_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        # t2017前i天有销售额的第一天距离t2017的天数
        X['first_has_sales_day_in_last_%s' % i] = ((tmp > 0) * np.arange(i,0,-1)).max(axis=1).values
        
        # 取promo_df在t2017前i天的所有列
        tmp = get_timespan(promo_df, t2017, i, i)
        # t2017前i天有促销的天数
        X['has_promo_days_in_last_%s' % i] = (tmp > 0).sum(axis=1).values
        # t2017前i天有促销的最后一天距离t2017的天数
        X['last_has_promo_day_in_last_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        # t2017前i天有促销的第一天距离t2017的天数
        X['first_has_promo_day_in_last_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values
        
    # 取promo_df在t2017后15天的所有列
    tmp = get_timespan(promo_df,t2017+timedelta(days=16),15,15)
    # t2017后15天有促销的天数
    X['has_promo_days_in_after_15_days'] = (tmp > 0).sum(axis=1).values
    # t2017后15天有促销的最后一天距离t2017的天数
    X['last_has_promo_day_in_after_15_days'] = i - ((tmp > 0) * np.arange(15)).max(axis=1).values
    # t2017后15天促销的第一天距离t2017的天数
    X['first_has_promo_day_in_after_15_days'] = ((tmp > 0) * np.arange(15, 0, -1)).max(axis=1).values
    
    # t2017过去15天的销售额
    for i in range(1,16):
        X['day_%s_2017' % i] = get_timespan(df,t2017,i,1).values.ravel()
    
    for i in range(7):
        # t2017+i前4周同一天的均值
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df,t2017,28-i,4,freq='7D').mean(axis=1).values
        # t2017+i前20周同一天的均值
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df,t2017,140-i,20,freq='7D').mean(axis=1).values
        
    X = pd.DataFrame(X)
    
    if is_train:
        y = df[pd.date_range(t2017,periods=16)].values
        return X,y
    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix,c) for c in X.columns]
    
    return X

In [16]:
#准备训练集
start = time.time()
print("Preparing dataset...")
t2017 = date(2017,6,14)
num_days = 6
X_l,y_l = [],[]

for i in range(num_days):
    # 对2017-6-14当前及前5周每周同一天执行计算统计学特征
    delta = timedelta(days = 7 * i)
    
    # 生成 t2017+delta 的各类特征（不同ID的销售额与促销情况） 
    X_tmp,y_tmp = prepare_dataset(df_2017,promo_2017,t2017 + delta)
    
    # 生成 t2017+delta 的各类特征 (不同item的销售额与促销情况)
    X_tmp2 = prepare_dataset(df_2017_item,promo_2017_item,t2017 + delta,is_train = False,name_prefix = 'item')
    X_tmp2.index = df_2017_item.index
    X_tmp2 = X_tmp2.reindex(df_2017.index.get_level_values(1)).reset_index(drop = True)
    
    # 生成 t2017+delta 的各类特征 （特定商品特定商店的销售额与促销情况）
    X_tmp3 = prepare_dataset(df_2017_store_class,df_2017_promo_store_class,t2017 + delta,is_train=False,name_prefix='store_class')
    X_tmp3.index = df_2017_store_class.index
    X_tmp3 = X_tmp3.reindex(df_2017_store_class_index).reset_index(drop=True)
    
    X_tmp = pd.concat([X_tmp,X_tmp2,X_tmp3,items.reset_index(),stores.reset_index()],axis=1)
    X_l.append(X_tmp)
    y_l.append(y_tmp)
    
    del X_tmp2
    gc.collect()

# 合并为训练集
X_train = pd.concat(X_l,axis=0)
y_train = np.concatenate(y_l,axis=0)
del X_l,y_l

end = time.time()
print("Finish!!!Using %dmin"%int((end-start)/60))

Preparing dataset...
Finish!!!Using 2min


In [17]:
# 生成 2017-7-26 的各类特征（不同ID的销售额与促销情况） 
X_val, y_val = prepare_dataset(df_2017, promo_2017, date(2017, 7, 26))

# 生成 2017-7-26 的各类特征 (不同item的销售额与促销情况)
X_val2 = prepare_dataset(df_2017_item, promo_2017_item, date(2017, 7, 26), is_train=False, name_prefix='item')
X_val2.index = df_2017_item.index
X_val2 = X_val2.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

# 生成 2017-7-26 的各类特征 （特定商品特定商店的销售额与促销情况）
X_val3 = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, date(2017, 7, 26), is_train=False, name_prefix='store_class')
X_val3.index = df_2017_store_class.index
X_val3 = X_val3.reindex(df_2017_store_class_index).reset_index(drop=True)

#合并为验证集
X_val = pd.concat([X_val, X_val2, X_val3, items.reset_index(), stores.reset_index()], axis=1)

In [18]:
# 生成 2017-8-16 的各类特征（不同ID的销售额与促销情况） 
X_test = prepare_dataset(df_2017, promo_2017, date(2017, 8, 16), is_train=False)

# 生成 2017-8-16 的各类特征 (不同item的销售额与促销情况)
X_test2 = prepare_dataset(df_2017_item, promo_2017_item, date(2017, 8, 16), is_train=False, name_prefix='item')
X_test2.index = df_2017_item.index
X_test2 = X_test2.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

# 生成 2017-8-16 的各类特征 （特定商品特定商店的销售额与促销情况）
X_test3 = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, date(2017, 8, 16), is_train=False, name_prefix='store_class')
X_test3.index = df_2017_store_class.index
X_test3 = X_test3.reindex(df_2017_store_class_index).reset_index(drop=True)

#合并为测试集
X_test = pd.concat([X_test, X_test2, X_test3, items.reset_index(), stores.reset_index()], axis=1)

In [19]:
# Memory Saving
del X_test2, X_val2, df_2017_item, promo_2017_item, df_2017_store_class, df_2017_promo_store_class, df_2017_store_class_index
gc.collect()

X_train = reduce_mem_usage(X_train)
X_val = reduce_mem_usage(X_val)
X_test = reduce_mem_usage(X_test)

y_train = reduce_mem_usage(pd.DataFrame(y_train)).values
y_val = reduce_mem_usage(pd.DataFrame(y_val)).values

Mem. usage decreased to 957.57 Mb (75.2% reduction)
Mem. usage decreased to 158.32 Mb (75.4% reduction)
Mem. usage decreased to 158.32 Mb (75.4% reduction)
Mem. usage decreased to 30.67 Mb (75.0% reduction)
Mem. usage decreased to  5.11 Mb (75.0% reduction)


In [21]:
print("Shape of X_train is",X_train.shape)
print("Shape of y_train is",y_train.shape)
print("Shape of X_val is",X_val.shape)
print("Shape of y_val is",y_val.shape)
print("Shape of X_test is",X_test.shape)

Shape of X_train is (1005090, 537)
Shape of y_train is (1005090, 16)
Shape of X_val is (167515, 537)
Shape of y_val is (167515, 16)
Shape of X_test is (167515, 537)


# 4.模型

In [22]:
params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}

In [26]:
MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []

for i in range(16):
    print('='*50)
    # 16个要预测的变量
    print('Step %d'%(i + 1))
    print('='*50)
    
    # lgb训练集
    dtrain = lgb.Dataset(
        X_train,label = y_train[:,i],
        categorical_feature = cate_vars,
        weight = pd.concat([items['perishable']] * num_days) * 0.25 + 1
    )
    # lgb验证集
    dval = lgb.Dataset(
        X_val,label = y_val[:,i],reference = dtrain,
        weight = items['perishable'] * 0.25 + 1,
        categorical_feature = cate_vars)
    
    # 训练模型
    bst = lgb.train(
        params,dtrain,num_boost_round = MAX_ROUNDS,
        valid_sets = [dtrain,dval],early_stopping_rounds = 125,verbose_eval = 500
    )
    
    # 打印特征重要性
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    
    # 验证集与测试集的预测
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

Step 1




Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.304004	valid_1's l2: 0.316178
[1000]	training's l2: 0.293242	valid_1's l2: 0.315342
[1500]	training's l2: 0.284628	valid_1's l2: 0.315096
Early stopping, best iteration is:
[1588]	training's l2: 0.283215	valid_1's l2: 0.315062
mean_30_decay: 3697059.09
mean_140_decay: 3136427.28
mean_7_decay: 2695631.41
mean_60_decay: 1960653.38
mean_14_decay: 1641428.10
mean_7: 510987.22
mean_20_dow0_2017: 258132.70
mean_4_dow0_2017: 196674.08
median_7: 135052.63
day_1_2017: 79790.69
last_has_sales_day_in_last_7: 38558.47
item_diff_30_mean: 36573.26
family: 32509.84
item_diff_14_mean: 28466.92
median_14: 26151.03
item_diff_60_mean: 25178.02
no_promo_mean_60: 22833.44
mean_3_decay: 22201.70
std_7: 21309.28
item_diff_7_mean: 21115.16
first_has_promo_day_in_after_15_days: 20479.21
promo_7_2017_aft: 20257.29
diff_3_mean: 19921.14
diff_7_mean: 19668.56
min_3: 17943.02
std_14: 17316.60
last_has_promo_day_in_after_15_days: 

Step 2
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.296819	valid_1's l2: 0.314427
[1000]	training's l2: 0.286272	valid_1's l2: 0.313501
[1500]	training's l2: 0.277915	valid_1's l2: 0.313215
Early stopping, best iteration is:
[1414]	training's l2: 0.279264	valid_1's l2: 0.313203
mean_140_decay: 4786740.54
mean_60_decay: 4472272.91
mean_7_decay: 758578.84
mean_14_decay: 550095.63
mean_7: 532970.77
mean_30_decay: 335465.09
first_has_promo_day_in_after_15_days: 269630.07
mean_20_dow1_2017: 159331.65
median_7: 152400.07
mean_4_dow1_2017: 83436.47
no_promo_mean_60: 61174.63
median_60: 47780.63
promo_7_2017_aft: 42154.82
item_promo_3_2017_aft: 39509.24
promo_3_2017_aft: 38690.80
median_14: 38210.06
item_diff_30_mean: 38079.96
no_promo_mean_7_decay: 37193.28
no_promo_mean_140_decay: 31403.87
no_promo_mean_60_decay: 25134.21
no_promo_mean_14_decay: 22684.62
no_promo_mean_30: 22048.46
no_promo_mean_3_decay: 21955.78
item_diff_7_mean: 19752.49
item_diff_14

Step 3
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.299684	valid_1's l2: 0.330554
[1000]	training's l2: 0.288065	valid_1's l2: 0.329153
[1500]	training's l2: 0.279406	valid_1's l2: 0.328748
[2000]	training's l2: 0.271837	valid_1's l2: 0.328489
Early stopping, best iteration is:
[1984]	training's l2: 0.272055	valid_1's l2: 0.328483
mean_140_decay: 6236728.12
mean_60_decay: 3921316.48
mean_7: 1423100.32
mean_4_dow2_2017: 826301.13
mean_20_dow2_2017: 712002.42
mean_30_decay: 412213.39
first_has_promo_day_in_after_15_days: 223482.39
promo_3_2017_aft: 157379.03
median_7: 126829.51
mean_14_decay: 117983.72
item_diff_30_mean: 55085.71
item_promo_3_2017_aft: 51535.43
no_promo_mean_140_decay: 49255.83
mean_7_decay: 36042.60
promo_7_2017_aft: 34789.61
day_5_2017: 32769.33
no_promo_mean_60_decay: 30786.87
median_14: 28711.76
has_promo_mean_140: 28597.75
item_diff_7_mean: 28517.54
no_promo_mean_60: 28130.14
item_promo_7_2017_aft: 23645.93
std_14: 23298.09
i

Step 4
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.319939	valid_1's l2: 0.345327
[1000]	training's l2: 0.306529	valid_1's l2: 0.343124
[1500]	training's l2: 0.297043	valid_1's l2: 0.342568
[2000]	training's l2: 0.288854	valid_1's l2: 0.342299
Early stopping, best iteration is:
[2174]	training's l2: 0.286227	valid_1's l2: 0.342197
mean_60_decay: 8171935.57
mean_140_decay: 5584066.93
mean_7_decay: 579113.49
mean_30_decay: 498201.36
mean_4_dow3_2017: 444468.08
mean_20_dow3_2017: 399387.92
mean_14_decay: 243111.22
promo_3_2017_aft: 208960.54
mean_7: 165592.62
no_promo_mean_60: 69899.96
first_has_promo_day_in_after_15_days: 60084.19
promo_7_2017_aft: 57092.67
day_3_2017: 45104.87
item_diff_30_mean: 43254.09
item_diff_7_mean: 39640.95
day_4_2017: 34002.54
item_diff_14_mean: 31876.69
item_diff_140_mean: 27467.87
no_promo_mean_140_decay: 24925.08
item_first_has_promo_day_in_after_15_days: 24862.03
item_promo_7_2017_aft: 24847.55
no_promo_mean_30: 24085

Step 5
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.32881	valid_1's l2: 0.351424
[1000]	training's l2: 0.314405	valid_1's l2: 0.34901
[1500]	training's l2: 0.304086	valid_1's l2: 0.348167
[2000]	training's l2: 0.295348	valid_1's l2: 0.347604
[2500]	training's l2: 0.287554	valid_1's l2: 0.347365
[3000]	training's l2: 0.280485	valid_1's l2: 0.347221
Early stopping, best iteration is:
[3137]	training's l2: 0.278641	valid_1's l2: 0.347192
mean_140_decay: 7314200.85
mean_60_decay: 6076907.97
mean_4_dow4_2017: 1635182.70
mean_20_dow4_2017: 781905.27
mean_30_decay: 520726.60
mean_7_decay: 459154.71
promo_7_2017_aft: 196575.59
mean_3: 126300.95
day_3_2017: 88190.56
item_diff_14_mean: 59307.37
first_has_promo_day_in_after_15_days: 47869.06
store_class_diff_3_mean: 46229.03
max_3: 44735.42
mean_4_dow3_2017: 43128.69
no_promo_mean_60: 37717.09
promo_14_2017_aft: 36359.73
item_diff_30_mean: 30660.96
item_first_has_promo_day_in_after_15_days: 29286.77
item_d

Step 6
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.331832	valid_1's l2: 0.356881
[1000]	training's l2: 0.318466	valid_1's l2: 0.354967
[1500]	training's l2: 0.308456	valid_1's l2: 0.354516
[2000]	training's l2: 0.299923	valid_1's l2: 0.354249
Early stopping, best iteration is:
[2176]	training's l2: 0.297131	valid_1's l2: 0.354166
mean_60_decay: 6289906.48
mean_140_decay: 5819552.47
mean_30_decay: 800257.64
promo_7_2017_aft: 196892.40
mean_20_dow5_2017: 192141.27
mean_4_dow5_2017: 147823.19
mean_7_decay: 126265.82
mean_3_decay: 90497.68
no_promo_mean_60: 86968.96
mean_3: 82986.97
mean_14_decay: 82289.81
max_3: 65295.41
median_60: 55740.19
no_promo_mean_30: 49824.03
first_has_promo_day_in_after_15_days: 33288.62
item_diff_14_mean: 33124.34
promo_14_2017_aft: 30714.84
promo_3_2017_aft: 27643.89
median_3: 26295.33
item_diff_60_mean: 26165.99
item_diff_7_mean: 22787.97
has_promo_days_in_after_15_days: 22773.56
last_has_promo_day_in_after_15_days: 20

Step 7
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.3234	valid_1's l2: 0.411814
[1000]	training's l2: 0.310419	valid_1's l2: 0.408504
[1500]	training's l2: 0.300773	valid_1's l2: 0.40742
[2000]	training's l2: 0.292463	valid_1's l2: 0.406848
[2500]	training's l2: 0.285062	valid_1's l2: 0.406428
Early stopping, best iteration is:
[2829]	training's l2: 0.28051	valid_1's l2: 0.406093
mean_60_decay: 5525467.96
mean_140_decay: 5416926.52
mean_30_decay: 770565.65
mean_20_dow6_2017: 415030.43
mean_4_dow6_2017: 304445.70
promo_7_2017_aft: 221583.17
mean_14_decay: 186225.46
first_has_promo_day_in_after_15_days: 83962.89
mean_7_decay: 77251.22
family: 68468.11
median_3: 49611.07
no_promo_mean_60: 48387.28
mean_3_decay: 39988.56
no_promo_mean_30: 39781.94
last_has_promo_day_in_after_15_days: 34047.37
promo_14_2017_aft: 33949.15
median_60: 32704.23
promo_3_2017_aft: 31448.38
item_diff_14_mean: 28871.14
item_diff_60_mean: 26365.14
no_promo_mean_140_decay: 249

Step 8
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.310471	valid_1's l2: 0.375994
[1000]	training's l2: 0.297846	valid_1's l2: 0.373159
[1500]	training's l2: 0.288493	valid_1's l2: 0.372486
[2000]	training's l2: 0.280497	valid_1's l2: 0.37231
Early stopping, best iteration is:
[2145]	training's l2: 0.278275	valid_1's l2: 0.372215
mean_140_decay: 7229866.40
mean_60_decay: 3857073.92
mean_7: 532013.01
mean_20_dow0_2017: 455911.73
mean_30_decay: 425765.27
mean_4_dow0_2017: 326596.58
promo_7_2017_aft: 265338.98
median_60: 155147.91
median_7: 148358.57
first_has_promo_day_in_after_15_days: 130066.33
last_has_promo_day_in_after_15_days: 88279.86
no_promo_mean_60: 83613.77
no_promo_mean_30: 82231.88
family: 70880.86
promo_14_2017_aft: 51994.42
has_promo_days_in_after_15_days: 35742.46
no_promo_mean_140_decay: 33498.20
item_diff_30_mean: 29641.14
has_promo_mean_140: 28576.79
item_promo_7_2017_aft: 27256.73
median_14: 25185.71
no_promo_mean_14: 22946.89


Step 9
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.32433	valid_1's l2: 0.373906
[1000]	training's l2: 0.31147	valid_1's l2: 0.371737
[1500]	training's l2: 0.301855	valid_1's l2: 0.371017
[2000]	training's l2: 0.293569	valid_1's l2: 0.37078
[2500]	training's l2: 0.286035	valid_1's l2: 0.370632
Early stopping, best iteration is:
[2748]	training's l2: 0.282525	valid_1's l2: 0.37059
mean_140_decay: 5505242.66
mean_60_decay: 3988150.15
median_60: 456681.36
mean_30_decay: 363110.64
mean_20_dow1_2017: 269681.87
no_promo_mean_30: 246062.51
has_promo_days_in_after_15_days: 165569.09
no_promo_mean_60: 139161.11
median_7: 123480.60
mean_4_dow1_2017: 90693.18
promo_14_2017_aft: 88070.43
median_30: 68351.91
first_has_promo_day_in_after_15_days: 62672.57
last_has_promo_day_in_after_15_days: 59219.96
promo_7_2017_aft: 43036.40
item_diff_30_mean: 40826.06
mean_7: 40278.32
mean_30: 35948.27
no_promo_mean_140: 30061.99
mean_20_dow2_2017: 29422.10
item_promo_7_20

Step 10
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.324002	valid_1's l2: 0.36226
[1000]	training's l2: 0.309966	valid_1's l2: 0.359682
[1500]	training's l2: 0.300066	valid_1's l2: 0.358932
Early stopping, best iteration is:
[1612]	training's l2: 0.298091	valid_1's l2: 0.358804
mean_140_decay: 6570427.17
mean_60_decay: 3178602.64
mean_20_dow2_2017: 1345314.34
mean_4_dow2_2017: 1221065.54
mean_30_decay: 365194.67
last_has_promo_day_in_after_15_days: 171242.78
has_promo_days_in_after_15_days: 154356.06
no_promo_mean_30: 135251.87
mean_7: 127017.79
median_60: 99400.70
promo_14_2017_aft: 94182.97
median_7: 79561.14
first_has_promo_day_in_after_15_days: 59083.36
no_promo_mean_60: 56597.96
item_diff_30_mean: 53927.93
day_5_2017: 45424.98
no_promo_mean_140_decay: 37303.12
no_promo_mean_14: 35547.20
item_promo_7_2017_aft: 30592.07
promo_7_2017_aft: 29872.71
no_promo_mean_60_decay: 29335.18
has_promo_mean_140: 27409.11
std_140: 25875.71
item_promo_14_201

Step 11
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.342218	valid_1's l2: 0.367816
[1000]	training's l2: 0.326332	valid_1's l2: 0.365188
[1500]	training's l2: 0.315732	valid_1's l2: 0.364354
[2000]	training's l2: 0.306804	valid_1's l2: 0.364016
[2500]	training's l2: 0.298952	valid_1's l2: 0.363786
Early stopping, best iteration is:
[2583]	training's l2: 0.297684	valid_1's l2: 0.363744
mean_140_decay: 6949472.45
mean_60_decay: 5761244.47
mean_4_dow3_2017: 712664.74
mean_20_dow3_2017: 681254.18
mean_30_decay: 540625.78
has_promo_days_in_after_15_days: 192587.01
no_promo_mean_30: 159839.45
mean_60: 157533.15
no_promo_mean_60: 141824.52
median_60: 116504.48
last_has_promo_day_in_after_15_days: 107788.67
promo_14_2017_aft: 93452.30
mean_7: 62573.85
item_diff_30_mean: 54306.29
day_4_2017: 54109.29
mean_4_dow4_2017: 43732.28
first_has_promo_day_in_after_15_days: 37078.52
item_diff_14_mean: 33919.79
item_promo_7_2017_aft: 33918.08
no_promo_mean_140_deca

Step 12
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.347685	valid_1's l2: 0.376818
[1000]	training's l2: 0.330582	valid_1's l2: 0.373518
[1500]	training's l2: 0.318983	valid_1's l2: 0.372265
[2000]	training's l2: 0.309597	valid_1's l2: 0.371597
[2500]	training's l2: 0.301276	valid_1's l2: 0.371194
[3000]	training's l2: 0.293688	valid_1's l2: 0.370949
Early stopping, best iteration is:
[3238]	training's l2: 0.290259	valid_1's l2: 0.370804
mean_140_decay: 7668910.37
mean_60_decay: 3828077.45
mean_4_dow4_2017: 3325415.63
mean_20_dow4_2017: 1065457.55
mean_30_decay: 349007.67
last_has_promo_day_in_after_15_days: 171682.15
has_promo_days_in_after_15_days: 168604.96
promo_14_2017_aft: 114629.31
no_promo_mean_60: 79801.23
mean_7_decay: 67820.07
mean_60: 58242.99
store_class_diff_3_mean_2: 57105.40
mean_3: 52130.08
day_3_2017: 48528.14
max_3: 42267.68
item_diff_14_mean: 40456.06
item_diff_7_mean: 37712.80
no_promo_mean_140_decay: 34845.73
item_promo_14_

Step 13
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.348739	valid_1's l2: 0.37254
[1000]	training's l2: 0.333333	valid_1's l2: 0.370133
[1500]	training's l2: 0.322537	valid_1's l2: 0.369501
[2000]	training's l2: 0.313316	valid_1's l2: 0.369225
Early stopping, best iteration is:
[2005]	training's l2: 0.313231	valid_1's l2: 0.369216
mean_140_decay: 6927151.39
mean_60_decay: 4666469.66
mean_30_decay: 485301.96
median_60: 301059.56
mean_20_dow5_2017: 274815.35
no_promo_mean_30: 227035.06
mean_4_dow5_2017: 205455.25
last_has_promo_day_in_after_15_days: 156104.90
no_promo_mean_60: 147109.77
has_promo_days_in_after_15_days: 129631.68
promo_14_2017_aft: 108328.52
mean_60: 94685.21
mean_3: 46021.69
mean_3_decay: 35338.07
item_promo_14_2017_aft: 33339.88
item_promo_7_2017_aft: 31961.14
item_diff_14_mean: 31233.36
item_diff_60_mean: 30188.48
no_promo_mean_140: 29466.56
family: 29194.30
item_diff_7_mean: 29191.48
has_promo_mean_140: 26817.36
promo_7_2017_af

Step 14
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.342239	valid_1's l2: 0.360271
[1000]	training's l2: 0.326452	valid_1's l2: 0.358058
[1500]	training's l2: 0.315752	valid_1's l2: 0.357281
[2000]	training's l2: 0.306614	valid_1's l2: 0.356907
Early stopping, best iteration is:
[2294]	training's l2: 0.301781	valid_1's l2: 0.356766
mean_140_decay: 6406468.18
mean_60_decay: 4210380.69
mean_20_dow6_2017: 656970.58
mean_30_decay: 453076.62
mean_4_dow6_2017: 319216.63
last_has_promo_day_in_after_15_days: 308048.28
median_60: 125933.92
no_promo_mean_60: 113777.92
promo_14_2017_aft: 107208.58
no_promo_mean_30: 87670.42
has_promo_days_in_after_15_days: 75546.38
family: 71941.19
median_30: 71387.32
mean_30: 65402.80
has_promo_mean_140: 41917.48
no_promo_mean_140: 41087.73
item_promo_7_2017_aft: 39935.02
mean_60: 39821.44
item_diff_140_mean: 39193.95
item_diff_7_mean: 38532.43
item_promo_14_2017_aft: 32313.97
item_diff_60_mean: 30159.86
item_diff_30_mean

Step 15
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.323997	valid_1's l2: 0.343587
[1000]	training's l2: 0.309379	valid_1's l2: 0.341083
[1500]	training's l2: 0.299237	valid_1's l2: 0.340364
[2000]	training's l2: 0.29078	valid_1's l2: 0.339999
[2500]	training's l2: 0.283229	valid_1's l2: 0.339724
Early stopping, best iteration is:
[2582]	training's l2: 0.282069	valid_1's l2: 0.339678
mean_140_decay: 6310623.10
mean_60_decay: 3449604.56
median_60: 710689.49
no_promo_mean_30: 669792.42
mean_20_dow0_2017: 639519.50
last_has_promo_day_in_after_15_days: 604748.98
mean_30: 549429.75
no_promo_mean_60: 209188.43
mean_4_dow0_2017: 162033.29
mean_30_decay: 87940.47
promo_14_2017_aft: 86078.83
has_promo_mean_140: 84244.70
mean_7: 68598.41
no_promo_mean_140_decay: 64782.83
has_promo_days_in_after_15_days: 56056.13
family: 54260.28
median_30: 50457.65
no_promo_mean_140: 47969.13
item_diff_30_mean: 38583.53
item_promo_14_2017_aft: 33693.33
std_140: 33265.08
f

Step 16
Training until validation scores don't improve for 125 rounds.
[500]	training's l2: 0.327033	valid_1's l2: 0.359478
[1000]	training's l2: 0.312927	valid_1's l2: 0.357419
[1500]	training's l2: 0.303013	valid_1's l2: 0.356679
[2000]	training's l2: 0.294612	valid_1's l2: 0.35648
[2500]	training's l2: 0.287053	valid_1's l2: 0.356304
Early stopping, best iteration is:
[2408]	training's l2: 0.288398	valid_1's l2: 0.356274
mean_140_decay: 4797817.38
mean_60_decay: 2474218.87
mean_30: 1244424.11
median_60: 902747.30
no_promo_mean_30: 668701.42
last_has_promo_day_in_after_15_days: 604068.13
mean_20_dow1_2017: 313293.64
no_promo_mean_60: 284455.18
mean_60: 157394.93
median_30: 131342.23
mean_30_decay: 57229.38
no_promo_mean_140: 55616.79
mean_4_dow1_2017: 45458.28
item_diff_30_mean: 45004.94
has_promo_mean_140: 44947.19
mean_20_dow2_2017: 42899.79
item_nbr: 36702.15
no_promo_mean_140_decay: 35881.04
has_promo_days_in_after_15_days: 34769.83
item_promo_14_2017_aft: 33289.27
no_promo_mean_

In [27]:
# 打印验证集预测误差
print("Validation mse:",mean_squared_error(y_val,np.array(val_pred).transpose()))

weight = items["perishable"] * 0.25 + 1
err = (y_val - np.array(val_pred).transpose())**2
err = err.sum(axis=1) * weight
err = np.sqrt(err.sum() / weight.sum() / 16)
print('nwrmsle = {}'.format(err))

Validation mse: 0.3542872025605699
nwrmsle = 0.5950045765697544


  return umr_sum(a, axis, dtype, out, keepdims, initial)


In [28]:
# 生成提交文件

#cv
y_val = np.array(val_pred).transpose()
df_preds = pd.DataFrame(
    y_val, index=df_2017.index,
    columns=pd.date_range("2017-07-26", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
df_preds["unit_sales"] = np.clip(np.expm1(df_preds["unit_sales"]), 0, 1000)
df_preds.reset_index().to_csv('submit_versions/lgb_cv.csv', index=False)

# submit
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('submit_versions/lgb_sub.csv', float_format='%.4f', index=None)

Making submission...
