In [1]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
df_train = pd.read_csv('database/train.csv',usecols=[1,2,3,4,5], parse_dates=['date'],
                       dtype={'onpromotion':bool},
                      converters = {'unit_sales': lambda u : np.log1p(float(u) if float(u) > 0 else 0)},
                      skiprows=range(1,66458909))

In [3]:
df_train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
0,2016-01-01,25,105574,2.564949,False
1,2016-01-01,25,105575,2.302585,False
2,2016-01-01,25,105857,1.386294,False
3,2016-01-01,25,108634,1.386294,False
4,2016-01-01,25,108701,1.098612,True


In [4]:
df_test = pd.read_csv('database/test.csv', dtype={'onpromotion':bool},
                     parse_dates = ['date'],usecols=[0,1,2,3,4])

In [5]:
df_test.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion
0,125497040,2017-08-16,1,96995,False
1,125497041,2017-08-16,1,99197,False
2,125497042,2017-08-16,1,103501,False
3,125497043,2017-08-16,1,103520,False
4,125497044,2017-08-16,1,103665,False


In [6]:
items = pd.read_csv('database/items.csv')

In [7]:
items.head()

Unnamed: 0,item_nbr,family,class,perishable
0,96995,GROCERY I,1093,0
1,99197,GROCERY I,1067,0
2,103501,CLEANING,3008,0
3,103520,GROCERY I,1028,0
4,103665,BREAD/BAKERY,2712,1


In [8]:
df_2017 = df_train.loc[df_train.date >= pd.datetime(2017,1,1)]
del df_train
df_2017.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
35229871,2017-01-01,25,99197,0.693147,False
35229872,2017-01-01,25,103665,2.079442,False
35229873,2017-01-01,25,105574,0.693147,False
35229874,2017-01-01,25,105857,1.609438,False
35229875,2017-01-01,25,106716,1.098612,False


In [9]:
df_store = pd.read_csv('database/stores.csv')
df_store['big_city'] = False
df_store['middle_city'] = False
df_store['little_city'] = False

citys = pd.read_csv('database/city_population.csv')
big_citys = citys[citys['2017 Population'] >= 1000000]['Name'].values
middle_citys = citys.loc[(citys['2017 Population'] >= 100000) & (citys['2017 Population'] < 1000000)]['Name'].values

df_store['big_city'] = df_store['city'].map(lambda u :True if u in big_citys else False).astype(bool)
df_store['middle_city'] = df_store['city'].map(lambda u: True if u in middle_citys else False).astype(bool)
df_store['little_city'] = df_store['city'].map(lambda u: True if u not in big_citys and u not in middle_citys else False).astype(bool)

In [10]:
df_store = df_store[['store_nbr', 'big_city','middle_city','little_city']]

In [11]:
df_store.head()

Unnamed: 0,store_nbr,big_city,middle_city,little_city
0,1,True,False,False
1,2,True,False,False
2,3,True,False,False
3,4,True,False,False
4,5,False,True,False


In [12]:
big_city_dict = {}
mid_city_dict = {}
lit_city_dict = {}
for i,j in zip(df_store['store_nbr'], df_store['big_city']):
    big_city_dict[i] = j
for i,j in zip(df_store['store_nbr'], df_store['middle_city']):
    mid_city_dict[i] = j
for i,j in zip(df_store['store_nbr'], df_store['little_city']):
    lit_city_dict[i] = j

In [13]:
city_2017_train = pd.merge(df_2017, df_store, how='left', on=['store_nbr'])
city_2017_test = pd.merge(df_test, df_store, how='left', on=['store_nbr'])
#city_2017_train

In [14]:
bc_2017_train = city_2017_train.set_index(['store_nbr', 'item_nbr', 'date'])[['big_city']].unstack(level=-1)
mc_2017_train = city_2017_train.set_index(['store_nbr', 'item_nbr', 'date'])[['middle_city']].unstack(level=-1)
lc_2017_train = city_2017_train.set_index(['store_nbr', 'item_nbr', 'date'])[['little_city']].unstack(level=-1)
bc_2017_train.columns = bc_2017_train.columns.get_level_values(1)
mc_2017_train.columns = mc_2017_train.columns.get_level_values(1)
lc_2017_train.columns = lc_2017_train.columns.get_level_values(1)

In [15]:
ind = list(set(bc_2017_train.index.get_level_values(0)))
for i in ind:
    bc_2017_train.loc[i] = big_city_dict[i]
    
ind = list(set(mc_2017_train.index.get_level_values(0)))
for i in ind:
    mc_2017_train.loc[i] = mid_city_dict[i]
    
ind = list(set(lc_2017_train.index.get_level_values(0)))
for i in ind:
    lc_2017_train.loc[i] = lit_city_dict[i]

In [16]:
bc_2017_test = city_2017_test.set_index(['store_nbr', 'item_nbr', 'date'])[['big_city']].unstack(level=-1)
mc_2017_test = city_2017_test.set_index(['store_nbr', 'item_nbr', 'date'])[['middle_city']].unstack(level=-1)
lc_2017_test = city_2017_test.set_index(['store_nbr', 'item_nbr', 'date'])[['little_city']].unstack(level=-1)

bc_2017_test.columns = bc_2017_test.columns.get_level_values(1)
mc_2017_test.columns = mc_2017_test.columns.get_level_values(1)
lc_2017_test.columns = lc_2017_test.columns.get_level_values(1)

In [17]:
ind = list(set(bc_2017_test.index.get_level_values(0)))
for i in ind:
    bc_2017_test.loc[i] = big_city_dict[i]
    
ind = list(set(mc_2017_test.index.get_level_values(0)))
for i in ind:
    mc_2017_test.loc[i] = mid_city_dict[i]
    
ind = list(set(lc_2017_test.index.get_level_values(0)))
for i in ind:
    lc_2017_test.loc[i] = lit_city_dict[i]

In [18]:
bc_2017_test = bc_2017_test.reindex(bc_2017_train.index).fillna(False) # 肯定会丢掉了很多train里没有的商品
bc_2017 = pd.concat([bc_2017_train, bc_2017_test], axis=1)

mc_2017_test = mc_2017_test.reindex(mc_2017_train.index).fillna(False)
mc_2017 = pd.concat([mc_2017_train, mc_2017_test], axis=1)

lc_2017_test = lc_2017_test.reindex(lc_2017_train.index).fillna(False)
lc_2017 = pd.concat([lc_2017_train, lc_2017_test], axis=1)

In [19]:
bc_2017

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,99197,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,103520,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,103665,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,105574,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,105575,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,105577,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,105693,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,105737,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,105857,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [20]:
del bc_2017_train, bc_2017_test
del mc_2017_train, mc_2017_test
del lc_2017_train, lc_2017_test

## 处理节假日信息

In [21]:
df_holiday = pd.read_csv('database/holidays_events.csv', 
                        parse_dates = ['date'],
                        dtype={'transferred':bool})

In [22]:
holiday_2017 = df_holiday.loc[df_holiday.date >= pd.datetime(2017,1,1)]
holiday_2017 = holiday_2017.loc[holiday_2017['transferred'] == False]

firstday = date(2017,1,1)
endday = date(2017,9,1)
periods = endday - firstday
all_days = pd.date_range(firstday, periods=periods.days, freq='D')

In [23]:
weekend_tmp = map(lambda day: True if(day.dayofweek >=5) else False, all_days)
weekend = []
for i in weekend_tmp:
    weekend.append(i)   
df_weekend = pd.DataFrame({'date':all_days, 'weekend_or_holiday': weekend })

In [24]:
tmp = holiday_2017['date'].values
for i in range(len(tmp)):
    df_weekend.loc[df_weekend['date'] == tmp[i], 'weekend_or_holiday'] = True
df_weekend_and_holiday = df_weekend

In [25]:
hw_2017_train = pd.merge(df_2017, df_weekend_and_holiday, how='left', on=['date'])
hw_2017_train = hw_2017_train.set_index(['store_nbr', 'item_nbr', 'date'])[['weekend_or_holiday']].unstack(level=-1)
hw_2017_train.columns = hw_2017_train.columns.get_level_values(1)

In [26]:
holiday_zip = zip(df_weekend['date'],df_weekend['weekend_or_holiday'])
tmp_dict = {}
for d,h in holiday_zip:
    tmp_dict[d] = h

In [27]:
columns = hw_2017_train.columns
for i in columns:
    hw_2017_train[i] = tmp_dict[i]

In [28]:
hw_2017_train

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,99197,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,103520,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,103665,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,105574,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,105575,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,105577,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,105693,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,105737,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,105857,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True


In [29]:
hw_2017_train.columns

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10',
               ...
               '2017-08-06', '2017-08-07', '2017-08-08', '2017-08-09',
               '2017-08-10', '2017-08-11', '2017-08-12', '2017-08-13',
               '2017-08-14', '2017-08-15'],
              dtype='datetime64[ns]', name='date', length=227, freq=None)

In [30]:
hw_2017_test = pd.merge(df_test, df_weekend_and_holiday, how='left', on=['date'])
hw_2017_test = hw_2017_test.set_index(['store_nbr', 'item_nbr', 'date'])[['weekend_or_holiday']].unstack(level=-1)
hw_2017_test.columns = hw_2017_test.columns.get_level_values(1)

In [31]:
columns = hw_2017_test.columns
for i in columns:
    hw_2017_test[i] = tmp_dict[i]
hw_2017_test = hw_2017_test.reindex(hw_2017_train.index)

In [32]:
hw_2017 = pd.concat([hw_2017_train, hw_2017_test], axis=1)
del hw_2017_train, hw_2017_test
hw_2017

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,99197,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,103520,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,103665,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105574,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105575,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105577,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105693,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105737,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105857,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False


## 处理促销信息

In [33]:
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

In [34]:
promo_2017_train = df_2017.set_index(['store_nbr','item_nbr','date'])[['onpromotion']].unstack(level=-1).fillna(False)
promo_2017_train

Unnamed: 0_level_0,Unnamed: 1_level_0,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
Unnamed: 0_level_1,date,2017-01-01,2017-01-02,2017-01-03,2017-01-04,2017-01-05,2017-01-06,2017-01-07,2017-01-08,2017-01-09,2017-01-10,...,2017-08-06,2017-08-07,2017-08-08,2017-08-09,2017-08-10,2017-08-11,2017-08-12,2017-08-13,2017-08-14,2017-08-15
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105575,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105577,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,False,False,False,False
1,105693,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105737,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105857,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [35]:
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_train.columns

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10',
               ...
               '2017-08-06', '2017-08-07', '2017-08-08', '2017-08-09',
               '2017-08-10', '2017-08-11', '2017-08-12', '2017-08-13',
               '2017-08-14', '2017-08-15'],
              dtype='datetime64[ns]', name='date', length=227, freq=None)

In [36]:
promo_2017_test = df_test[['onpromotion']].unstack(level=-1).fillna(False)
promo_2017_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
Unnamed: 0_level_1,date,2017-08-16,2017-08-17,2017-08-18,2017-08-19,2017-08-20,2017-08-21,2017-08-22,2017-08-23,2017-08-24,2017-08-25,2017-08-26,2017-08-27,2017-08-28,2017-08-29,2017-08-30,2017-08-31
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
1,96995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103501,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [37]:
promo_2017_test.columns

MultiIndex(levels=[['onpromotion'], [2017-08-16 00:00:00, 2017-08-17 00:00:00, 2017-08-18 00:00:00, 2017-08-19 00:00:00, 2017-08-20 00:00:00, 2017-08-21 00:00:00, 2017-08-22 00:00:00, 2017-08-23 00:00:00, 2017-08-24 00:00:00, 2017-08-25 00:00:00, 2017-08-26 00:00:00, 2017-08-27 00:00:00, 2017-08-28 00:00:00, 2017-08-29 00:00:00, 2017-08-30 00:00:00, 2017-08-31 00:00:00]],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]],
           names=[None, 'date'])

In [38]:
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test.columns

DatetimeIndex(['2017-08-16', '2017-08-17', '2017-08-18', '2017-08-19',
               '2017-08-20', '2017-08-21', '2017-08-22', '2017-08-23',
               '2017-08-24', '2017-08-25', '2017-08-26', '2017-08-27',
               '2017-08-28', '2017-08-29', '2017-08-30', '2017-08-31'],
              dtype='datetime64[ns]', name='date', freq=None)

In [39]:
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

In [40]:
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)

In [41]:
del promo_2017_test, promo_2017_train

In [42]:
promo_2017.head()

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [43]:
promo_2017.columns

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10',
               ...
               '2017-08-22', '2017-08-23', '2017-08-24', '2017-08-25',
               '2017-08-26', '2017-08-27', '2017-08-28', '2017-08-29',
               '2017-08-30', '2017-08-31'],
              dtype='datetime64[ns]', name='date', length=243, freq=None)

In [44]:
df_2017.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
35229871,2017-01-01,25,99197,0.693147,False
35229872,2017-01-01,25,103665,2.079442,False
35229873,2017-01-01,25,105574,0.693147,False
35229874,2017-01-01,25,105857,1.609438,False
35229875,2017-01-01,25,106716,1.098612,False


In [45]:
df_2017 = df_2017.set_index(['store_nbr', 'item_nbr', 'date'])[['unit_sales']].unstack(level=-1).fillna(0)
df_2017.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales
Unnamed: 0_level_1,date,2017-01-01,2017-01-02,2017-01-03,2017-01-04,2017-01-05,2017-01-06,2017-01-07,2017-01-08,2017-01-09,2017-01-10,...,2017-08-06,2017-08-07,2017-08-08,2017-08-09,2017-08-10,2017-08-11,2017-08-12,2017-08-13,2017-08-14,2017-08-15
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,99197,0.0,0.0,1.386294,0.693147,0.693147,0.693147,1.098612,0.0,0.0,0.693147,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,0.0,0.693147,1.098612,0.0,1.098612,1.386294,0.693147,0.0,0.693147,0.693147,...,0.0,0.0,1.386294,0.0,1.386294,0.693147,0.693147,0.693147,0.0,0.0
1,103665,0.0,0.0,0.0,1.386294,1.098612,1.098612,0.693147,1.098612,0.0,2.079442,...,0.693147,1.098612,0.0,2.079442,2.302585,1.098612,0.0,0.0,0.693147,0.693147
1,105574,0.0,0.0,1.791759,2.564949,2.302585,1.94591,1.609438,1.098612,1.386294,2.302585,...,0.0,1.791759,2.079442,1.94591,2.397895,1.791759,1.791759,0.0,1.386294,1.609438


In [46]:
df_2017.columns = df_2017.columns.get_level_values(1)

## 处理商品信息

In [47]:
items['family_nbr'] = items['family'].astype('category').cat.codes
items['class_nbr'] = items['class'].astype('category').cat.codes

In [48]:
items.head()

Unnamed: 0,item_nbr,family,class,perishable,family_nbr,class_nbr
0,96995,GROCERY I,1093,0,12,64
1,99197,GROCERY I,1067,0,12,44
2,103501,CLEANING,3008,0,7,217
3,103520,GROCERY I,1028,0,12,17
4,103665,BREAD/BAKERY,2712,1,5,187


In [49]:
items = items.set_index('item_nbr')

In [50]:
len(items)

4100

In [51]:
items = items.reindex(df_2017.index.get_level_values(1))

In [52]:
len(items)

167515

## 处理商店信息

In [53]:
df_store = pd.read_csv('database/stores.csv')
df_store['city_nbr'] = df_store['city'].astype('category').cat.codes
df_store['state_nbr'] = df_store['state'].astype('category').cat.codes
df_store['type_nbr'] = df_store['type'].astype('category').cat.codes
df_store = df_store.reindex(df_2017.index.get_level_values(0))
df_store.head()

Unnamed: 0_level_0,store_nbr,city,state,type,cluster,city_nbr,state_nbr,type_nbr
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2.0,Quito,Pichincha,D,13.0,18.0,12.0,3.0
1,2.0,Quito,Pichincha,D,13.0,18.0,12.0,3.0
1,2.0,Quito,Pichincha,D,13.0,18.0,12.0,3.0
1,2.0,Quito,Pichincha,D,13.0,18.0,12.0,3.0
1,2.0,Quito,Pichincha,D,13.0,18.0,12.0,3.0


## 提取数据

In [54]:
from datetime import timedelta, date
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [55]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        
        "city_nbr":df_store['city_nbr'].values,
        "state_nbr":df_store['state_nbr'].values,
        "type_nbr":df_store['type_nbr'].values,
        "cluster":df_store['cluster'].values,
        
        "perishable":items['perishable'].values,
        "item_family_nbr":items['family_nbr'].values,
        "item_class_nbr": items['class_nbr'].values,    
        
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        #"weekend_and_holiday":get_timespan(hw_2017, t2017, 0, 1).fillna(0).values.astype(np.uint8).ravel(),
        "restday_2_2017": get_timespan(hw_2017, t2017, 2, 2).sum(axis=1).values,
        "restday_7_2017": get_timespan(hw_2017, t2017, 7, 7).sum(axis=1).values,            
        "big_city":get_timespan(bc_2017, t2017, 0, 1).sum(axis=1).values.ravel(),
        "mid_city": get_timespan(mc_2017, t2017, 0, 1).sum(axis=1).values.ravel(),
        "lit_city": get_timespan(lc_2017, t2017, 0, 1).sum(axis=1).values.ravel(),
         
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_5_2017": get_timespan(df_2017, t2017, 5, 5).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        "mean_90_2017": get_timespan(df_2017, t2017, 90, 90).mean(axis=1).values,
        "mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        
        "promo_21_2017": get_timespan(promo_2017, t2017, 21, 21).sum(axis=1).values,
        "promo_30_2017": get_timespan(promo_2017, t2017, 30, 30).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_90_2017": get_timespan(promo_2017, t2017, 90, 90).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })
    for i in range(7):
        X['mean_2_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 14-i, 2, freq='7D').mean(axis=1).values
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
        X['mean_12_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 84-i, 12, freq='7D').mean(axis=1).values

    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [56]:
print("Preparing dataset...")
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(6): # 6
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

Preparing dataset...


In [57]:
X_train

Unnamed: 0,big_city,city_nbr,cluster,day_1_2017,item_class_nbr,item_family_nbr,lit_city,mean_140_2017,mean_14_2017,mean_30_2017,...,promo_6,promo_7,promo_8,promo_9,promo_10,promo_11,promo_12,promo_13,promo_14,promo_15
0,1.0,18.0,13.0,0.000000,64,12,0.0,0.070156,0.148532,0.138629,...,0,0,0,0,0,0,0,0,0,0
1,1.0,18.0,13.0,0.000000,44,12,0.0,0.134989,0.511931,0.381457,...,0,0,0,0,0,0,0,0,0,0
2,1.0,18.0,13.0,0.000000,17,12,0.0,0.712362,0.667989,0.868856,...,0,0,0,0,0,0,0,0,0,0
3,1.0,18.0,13.0,0.000000,187,5,0.0,1.015355,0.866918,0.956552,...,0,0,0,0,0,0,0,0,0,0
4,1.0,18.0,13.0,1.098612,31,12,0.0,1.805308,1.556041,1.774012,...,0,0,0,0,0,0,0,0,0,0
5,1.0,18.0,13.0,2.302585,31,12,0.0,2.272804,2.066668,2.151082,...,0,0,0,0,0,0,0,0,0,0
6,1.0,18.0,13.0,0.000000,31,12,0.0,0.547424,0.375535,0.608580,...,0,0,0,0,0,0,0,0,0,0
7,1.0,18.0,13.0,0.000000,22,12,0.0,0.117239,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
8,1.0,18.0,13.0,0.693147,30,12,0.0,0.835684,1.034304,0.905477,...,0,0,0,0,0,0,0,0,0,0
9,1.0,18.0,13.0,1.791759,63,12,0.0,1.305548,1.904952,1.634912,...,0,0,0,0,0,0,0,0,0,0


In [66]:
X_test

Unnamed: 0,big_city,city_nbr,cluster,day_1_2017,item_class_nbr,item_family_nbr,lit_city,mean_140_2017,mean_14_2017,mean_30_2017,...,promo_6,promo_7,promo_8,promo_9,promo_10,promo_11,promo_12,promo_13,promo_14,promo_15
0,1,18.0,13.0,0.000000,64,12,0,0.153952,0.334438,0.275522,...,0,0,0,0,0,0,0,0,0,0
1,1,18.0,13.0,0.000000,44,12,0,0.376532,0.206455,0.331321,...,0,0,0,0,0,0,0,0,0,0
2,1,18.0,13.0,0.000000,17,12,0,0.821010,0.573577,0.714515,...,0,0,0,0,0,0,0,0,0,0
3,1,18.0,13.0,0.693147,187,5,0,1.040541,1.031388,1.017638,...,0,0,0,0,0,0,0,0,0,0
4,1,18.0,13.0,1.609438,31,12,0,1.765433,1.629185,1.714960,...,0,0,0,0,0,0,0,0,0,0
5,1,18.0,13.0,2.197225,31,12,0,2.238009,2.382527,2.356156,...,0,0,1,1,1,1,1,1,1,1
6,1,18.0,13.0,1.098612,31,12,0,0.602872,0.474556,0.483466,...,0,1,1,1,1,1,1,0,1,1
7,1,18.0,13.0,0.693147,22,12,0,0.155165,0.276514,0.258080,...,0,0,0,0,0,0,0,0,1,0
8,1,18.0,13.0,0.000000,30,12,0,0.795222,0.681012,0.688510,...,0,0,0,0,0,0,0,0,0,0
9,1,18.0,13.0,2.708050,63,12,0,1.600535,1.619796,1.592271,...,0,0,0,0,0,0,0,0,0,0


In [58]:
print("Training and predicting models...")
params = {
    'num_leaves': 63, # 31
    'objective': 'regression',
    'min_data_in_leaf': 250,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2, # 2
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, 
        label=y_train[:, i],
        
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=150, verbose_eval=200
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 150 rounds.
[200]	training's l2: 0.327336	valid_1's l2: 0.315227
[400]	training's l2: 0.303028	valid_1's l2: 0.296054
[600]	training's l2: 0.298182	valid_1's l2: 0.293517
[800]	training's l2: 0.295533	valid_1's l2: 0.292409
[1000]	training's l2: 0.293697	valid_1's l2: 0.291844
[1200]	training's l2: 0.292201	valid_1's l2: 0.291431
[1400]	training's l2: 0.290923	valid_1's l2: 0.291195
[1600]	training's l2: 0.289739	valid_1's l2: 0.29102
[1800]	training's l2: 0.28863	valid_1's l2: 0.290894
[2000]	training's l2: 0.287605	valid_1's l2: 0.29079
[2200]	training's l2: 0.286599	valid_1's l2: 0.29073
[2400]	training's l2: 0.285618	valid_1's l2: 0.290642
[2600]	training's l2: 0.284672	valid_1's l2: 0.29059
[2800]	training's l2: 0.28376	valid_1's l2: 0.29054
[3000]	training's l2: 0.282854	valid_1's l2: 0.290514
[3200]	training's l2: 0.281966	valid_1's l2: 0.290452
[3400]	training's l2: 0.281105	valid_1's l2: 0.290413
[3600]	training's l2: 0.280284

Step 4
Training until validation scores don't improve for 150 rounds.
[200]	training's l2: 0.37746	valid_1's l2: 0.378838
[400]	training's l2: 0.350177	valid_1's l2: 0.356895
[600]	training's l2: 0.343767	valid_1's l2: 0.352883
[800]	training's l2: 0.340016	valid_1's l2: 0.351346
[1000]	training's l2: 0.337648	valid_1's l2: 0.35054
[1200]	training's l2: 0.335732	valid_1's l2: 0.350043
[1400]	training's l2: 0.334081	valid_1's l2: 0.349707
[1600]	training's l2: 0.332608	valid_1's l2: 0.349495
[1800]	training's l2: 0.331192	valid_1's l2: 0.349265
[2000]	training's l2: 0.329843	valid_1's l2: 0.34911
[2200]	training's l2: 0.328611	valid_1's l2: 0.349009
[2400]	training's l2: 0.327398	valid_1's l2: 0.348868
[2600]	training's l2: 0.326234	valid_1's l2: 0.348786
[2800]	training's l2: 0.325125	valid_1's l2: 0.348706
[3000]	training's l2: 0.324053	valid_1's l2: 0.348603
[3200]	training's l2: 0.32298	valid_1's l2: 0.34853
[3400]	training's l2: 0.321938	valid_1's l2: 0.348436
[3600]	training's l2:

Step 7
Training until validation scores don't improve for 150 rounds.
[200]	training's l2: 0.3691	valid_1's l2: 0.457568
[400]	training's l2: 0.344962	valid_1's l2: 0.432255
[600]	training's l2: 0.338773	valid_1's l2: 0.432303
Early stopping, best iteration is:
[505]	training's l2: 0.341286	valid_1's l2: 0.431657
mean_14_2017: 12178804.32
mean_30_2017: 9225433.42
mean_12_dow6_2017: 2893948.35
mean_7_2017: 2365940.81
promo_6: 1151523.54
mean_3_2017: 761608.78
mean_60_2017: 690292.00
mean_20_dow6_2017: 601553.30
mean_4_dow6_2017: 434450.69
mean_5_2017: 205582.50
mean_2_dow6_2017: 173822.95
promo_21_2017: 116356.10
promo_3: 110508.59
promo_7: 109803.88
item_family_nbr: 105498.61
promo_30_2017: 76907.29
day_1_2017: 66586.33
promo_5: 59269.41
item_class_nbr: 58218.96
mean_4_dow5_2017: 51296.34
mean_140_2017: 48722.72
restday_2_2017: 41355.74
mean_20_dow5_2017: 40410.89
restday_7_2017: 38467.01
mean_20_dow1_2017: 34835.91
promo_13: 34440.07
mean_2_dow5_2017: 31055.57
mean_90_2017: 29927.17
p

Step 11
Training until validation scores don't improve for 150 rounds.
[200]	training's l2: 0.4012	valid_1's l2: 0.399446
[400]	training's l2: 0.374153	valid_1's l2: 0.378347
[600]	training's l2: 0.366913	valid_1's l2: 0.375913
[800]	training's l2: 0.362377	valid_1's l2: 0.375486
[1000]	training's l2: 0.35935	valid_1's l2: 0.375354
[1200]	training's l2: 0.357034	valid_1's l2: 0.375261
Early stopping, best iteration is:
[1179]	training's l2: 0.357244	valid_1's l2: 0.375242
mean_30_2017: 16629463.36
mean_14_2017: 7744268.40
mean_60_2017: 2891899.44
mean_12_dow3_2017: 2373980.19
mean_7_2017: 2180388.08
mean_5_2017: 1805397.08
mean_4_dow3_2017: 1295380.17
promo_10: 946994.91
mean_20_dow3_2017: 876711.98
item_class_nbr: 134179.21
promo_21_2017: 121058.33
item_family_nbr: 92943.55
mean_4_dow4_2017: 85860.17
promo_30_2017: 83858.62
promo_14: 74938.41
mean_3_2017: 73502.57
promo_12: 72613.05
restday_7_2017: 68618.07
promo_9: 61509.53
mean_12_dow2_2017: 61051.61
mean_140_2017: 59603.56
promo_7:

Step 15
Training until validation scores don't improve for 150 rounds.
[200]	training's l2: 0.370782	valid_1's l2: 0.369897
[400]	training's l2: 0.344813	valid_1's l2: 0.349148
[600]	training's l2: 0.338613	valid_1's l2: 0.34693
[800]	training's l2: 0.335135	valid_1's l2: 0.346247
[1000]	training's l2: 0.332731	valid_1's l2: 0.345894
[1200]	training's l2: 0.330806	valid_1's l2: 0.345658
[1400]	training's l2: 0.329138	valid_1's l2: 0.345516
[1600]	training's l2: 0.32764	valid_1's l2: 0.345561
Early stopping, best iteration is:
[1545]	training's l2: 0.328071	valid_1's l2: 0.345486
mean_30_2017: 14854787.58
mean_14_2017: 6381498.11
mean_12_dow0_2017: 2747777.20
mean_7_2017: 2310600.66
promo_14: 1788228.06
mean_60_2017: 1415099.41
mean_20_dow0_2017: 1120744.31
mean_4_dow0_2017: 359440.87
item_class_nbr: 209106.23
mean_5_2017: 153736.31
item_family_nbr: 151540.90
promo_7: 143842.03
promo_13: 137213.33
promo_30_2017: 113769.48
promo_21_2017: 96304.50
promo_0: 96160.53
day_1_2017: 83489.86
pr

In [61]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

Making submission...


In [62]:
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lgb_v3-1.csv', float_format='%.4f', index=None)

In [69]:
len(test_pred[0])

167515

In [68]:
len(test_pred)

16

In [67]:
test_pred

[array([ 0.20218495,  0.33581692,  0.85266579, ...,  0.1671704 ,
         2.51688696,  0.53726671]),
 array([ 0.21570338,  0.30517955,  0.82888579, ...,  0.14881887,
         2.01291047,  0.38810882]),
 array([ 0.22423024,  0.33551373,  0.90349146, ...,  0.15923332,
         2.13028749,  0.42664252]),
 array([ 0.20921377,  0.2992341 ,  0.67507867, ...,  0.21968677,
         2.23148755,  0.55693083]),
 array([ 0.13756174,  0.16813315,  0.24833117, ...,  0.26475569,
         2.57776234,  0.68146849]),
 array([ 0.22399479,  0.30007167,  0.65521359, ...,  0.17694149,
         2.04588128,  0.43876589]),
 array([ 0.22204345,  0.28575259,  0.71561694, ...,  0.1826231 ,
         2.48041909,  0.43065157]),
 array([ 0.22638763,  0.29095567,  0.74315114, ...,  0.16888347,
         2.02333003,  0.35618663]),
 array([ 0.19871981,  0.28789683,  0.75601349, ...,  0.14383795,
         2.11073692,  0.33677368]),
 array([ 0.23490149,  0.31654716,  0.84780062, ...,  0.15688149,
         2.01229264,  0.35

In [63]:
y_test

array([[ 0.20218495,  0.21570338,  0.22423024, ...,  0.20887905,
         0.22833   ,  0.21068279],
       [ 0.33581692,  0.30517955,  0.33551373, ...,  0.32877253,
         0.35186089,  0.34428008],
       [ 0.85266579,  0.82888579,  0.90349146, ...,  0.72418873,
         0.79865668,  0.87588746],
       ..., 
       [ 0.1671704 ,  0.14881887,  0.15923332, ...,  0.22484744,
         0.18164825,  0.1728592 ],
       [ 2.51688696,  2.01291047,  2.13028749, ...,  2.45391267,
         1.94535927,  2.23485087],
       [ 0.53726671,  0.38810882,  0.42664252, ...,  0.47131078,
         0.39002785,  0.36411368]])

In [64]:
df_preds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales
store_nbr,item_nbr,date,Unnamed: 3_level_1
1,96995,2017-08-16,0.202185
1,96995,2017-08-17,0.215703
1,96995,2017-08-18,0.224230
1,96995,2017-08-19,0.209214
1,96995,2017-08-20,0.137562
1,96995,2017-08-21,0.223995
1,96995,2017-08-22,0.222043
1,96995,2017-08-23,0.226388
1,96995,2017-08-24,0.198720
1,96995,2017-08-25,0.234901


In [65]:
df_preds1 = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
)
df_preds1

Unnamed: 0_level_0,Unnamed: 1_level_0,2017-08-16 00:00:00,2017-08-17 00:00:00,2017-08-18 00:00:00,2017-08-19 00:00:00,2017-08-20 00:00:00,2017-08-21 00:00:00,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,96995,0.202185,0.215703,0.224230,0.209214,0.137562,0.223995,0.222043,0.226388,0.198720,0.234901,0.247881,0.164602,0.228710,0.208879,0.228330,0.210683
1,99197,0.335817,0.305180,0.335514,0.299234,0.168133,0.300072,0.285753,0.290956,0.287897,0.316547,0.349073,0.207046,0.322955,0.328773,0.351861,0.344280
1,103520,0.852666,0.828886,0.903491,0.675079,0.248331,0.655214,0.715617,0.743151,0.756013,0.847801,0.730912,0.339166,0.683533,0.724189,0.798657,0.875887
1,103665,1.184268,1.013745,1.217854,1.162776,0.624608,0.907893,0.900552,1.028950,0.895870,1.153608,1.158510,0.549176,0.918380,0.927292,1.085033,0.975237
1,105574,1.905691,1.826220,1.826158,1.603457,0.795591,1.728382,1.639346,1.775253,1.629213,1.786172,1.476190,0.788064,1.764099,1.693511,1.865777,1.708159
1,105575,2.510634,2.291694,2.355016,2.255759,1.580556,2.355976,2.291283,2.239910,2.766906,3.069166,2.869993,2.348495,3.227881,3.132258,3.240386,2.974922
1,105577,0.667436,0.526803,0.525271,0.487218,0.315330,0.574757,0.542497,1.184662,1.185143,1.285636,1.209990,1.056985,1.376830,0.052387,1.390639,1.264965
1,105693,0.321160,0.263855,0.297076,0.273316,0.168180,0.281226,0.259765,0.264278,0.242501,0.297007,0.262910,0.207804,0.340188,0.277129,0.996662,0.202302
1,105737,0.903273,0.743017,0.790098,0.585519,0.295649,0.789708,0.706682,0.783476,0.650692,0.772784,0.649526,0.345655,0.806429,0.754437,0.869535,0.703255
1,105857,1.780670,1.637936,1.685184,1.581972,0.983515,1.626225,1.611083,1.707307,1.506887,1.627137,1.537129,1.020482,1.633192,1.684066,1.673667,1.624890
