In [1]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
df_train = pd.read_csv('database/train.csv',usecols=[1,2,3,4,5], parse_dates=['date'],
                       dtype={'onpromotion':bool},
                      converters = {'unit_sales': lambda u : np.log1p(float(u) if float(u) > 0 else 0)},
                      skiprows=range(1,66458909))

In [3]:
df_train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
0,2016-01-01,25,105574,2.564949,False
1,2016-01-01,25,105575,2.302585,False
2,2016-01-01,25,105857,1.386294,False
3,2016-01-01,25,108634,1.386294,False
4,2016-01-01,25,108701,1.098612,True


In [4]:
df_test = pd.read_csv('database/test.csv', dtype={'onpromotion':bool},
                     parse_dates = ['date'],usecols=[0,1,2,3,4])

In [5]:
df_test.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion
0,125497040,2017-08-16,1,96995,False
1,125497041,2017-08-16,1,99197,False
2,125497042,2017-08-16,1,103501,False
3,125497043,2017-08-16,1,103520,False
4,125497044,2017-08-16,1,103665,False


In [6]:
items = pd.read_csv('database/items.csv')

In [7]:
items.head()

Unnamed: 0,item_nbr,family,class,perishable
0,96995,GROCERY I,1093,0
1,99197,GROCERY I,1067,0
2,103501,CLEANING,3008,0
3,103520,GROCERY I,1028,0
4,103665,BREAD/BAKERY,2712,1


In [8]:
df_2017 = df_train.loc[df_train.date >= pd.datetime(2017,1,1)]
del df_train
df_2017.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
35229871,2017-01-01,25,99197,0.693147,False
35229872,2017-01-01,25,103665,2.079442,False
35229873,2017-01-01,25,105574,0.693147,False
35229874,2017-01-01,25,105857,1.609438,False
35229875,2017-01-01,25,106716,1.098612,False


In [9]:
df_store = pd.read_csv('database/stores.csv')
df_store['big_city'] = False
df_store['middle_city'] = False
df_store['little_city'] = False

citys = pd.read_csv('database/city_population.csv')
big_citys = citys[citys['2017 Population'] >= 1000000]['Name'].values
middle_citys = citys.loc[(citys['2017 Population'] >= 100000) & (citys['2017 Population'] < 1000000)]['Name'].values

df_store['big_city'] = df_store['city'].map(lambda u :True if u in big_citys else False).astype(bool)
df_store['middle_city'] = df_store['city'].map(lambda u: True if u in middle_citys else False).astype(bool)
df_store['little_city'] = df_store['city'].map(lambda u: True if u not in big_citys and u not in middle_citys else False).astype(bool)

In [10]:
df_store = df_store[['store_nbr', 'big_city','middle_city','little_city']]

In [11]:
df_store.head()

Unnamed: 0,store_nbr,big_city,middle_city,little_city
0,1,True,False,False
1,2,True,False,False
2,3,True,False,False
3,4,True,False,False
4,5,False,True,False


In [12]:
big_city_dict = {}
mid_city_dict = {}
lit_city_dict = {}
for i,j in zip(df_store['store_nbr'], df_store['big_city']):
    big_city_dict[i] = j
for i,j in zip(df_store['store_nbr'], df_store['middle_city']):
    mid_city_dict[i] = j
for i,j in zip(df_store['store_nbr'], df_store['little_city']):
    lit_city_dict[i] = j

In [13]:
city_2017_train = pd.merge(df_2017, df_store, how='left', on=['store_nbr'])
city_2017_test = pd.merge(df_test, df_store, how='left', on=['store_nbr'])
#city_2017_train

In [14]:
bc_2017_train = city_2017_train.set_index(['store_nbr', 'item_nbr', 'date'])[['big_city']].unstack(level=-1)
mc_2017_train = city_2017_train.set_index(['store_nbr', 'item_nbr', 'date'])[['middle_city']].unstack(level=-1)
lc_2017_train = city_2017_train.set_index(['store_nbr', 'item_nbr', 'date'])[['little_city']].unstack(level=-1)
bc_2017_train.columns = bc_2017_train.columns.get_level_values(1)
mc_2017_train.columns = mc_2017_train.columns.get_level_values(1)
lc_2017_train.columns = lc_2017_train.columns.get_level_values(1)

In [15]:
ind = list(set(bc_2017_train.index.get_level_values(0)))
for i in ind:
    bc_2017_train.loc[i] = big_city_dict[i]
    
ind = list(set(mc_2017_train.index.get_level_values(0)))
for i in ind:
    mc_2017_train.loc[i] = mid_city_dict[i]
    
ind = list(set(lc_2017_train.index.get_level_values(0)))
for i in ind:
    lc_2017_train.loc[i] = lit_city_dict[i]

In [16]:
bc_2017_test = city_2017_test.set_index(['store_nbr', 'item_nbr', 'date'])[['big_city']].unstack(level=-1)
mc_2017_test = city_2017_test.set_index(['store_nbr', 'item_nbr', 'date'])[['middle_city']].unstack(level=-1)
lc_2017_test = city_2017_test.set_index(['store_nbr', 'item_nbr', 'date'])[['little_city']].unstack(level=-1)

bc_2017_test.columns = bc_2017_test.columns.get_level_values(1)
mc_2017_test.columns = mc_2017_test.columns.get_level_values(1)
lc_2017_test.columns = lc_2017_test.columns.get_level_values(1)

In [17]:
ind = list(set(bc_2017_test.index.get_level_values(0)))
for i in ind:
    bc_2017_test.loc[i] = big_city_dict[i]
    
ind = list(set(mc_2017_test.index.get_level_values(0)))
for i in ind:
    mc_2017_test.loc[i] = mid_city_dict[i]
    
ind = list(set(lc_2017_test.index.get_level_values(0)))
for i in ind:
    lc_2017_test.loc[i] = lit_city_dict[i]

In [18]:
bc_2017_test = bc_2017_test.reindex(bc_2017_train.index).fillna(False) # 肯定会丢掉了很多train里没有的商品
bc_2017 = pd.concat([bc_2017_train, bc_2017_test], axis=1)

mc_2017_test = mc_2017_test.reindex(mc_2017_train.index).fillna(False)
mc_2017 = pd.concat([mc_2017_train, mc_2017_test], axis=1)

lc_2017_test = lc_2017_test.reindex(lc_2017_train.index).fillna(False)
lc_2017 = pd.concat([lc_2017_train, lc_2017_test], axis=1)

In [19]:
bc_2017

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,99197,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,103520,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,103665,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,105574,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,105575,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,105577,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,105693,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,105737,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,105857,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [20]:
del bc_2017_train, bc_2017_test
del mc_2017_train, mc_2017_test
del lc_2017_train, lc_2017_test

## 处理节假日信息

In [21]:
df_holiday = pd.read_csv('database/holidays_events.csv', 
                        parse_dates = ['date'],
                        dtype={'transferred':bool})

In [22]:
holiday_2017 = df_holiday.loc[df_holiday.date >= pd.datetime(2017,1,1)]
holiday_2017 = holiday_2017.loc[holiday_2017['transferred'] == False]

firstday = date(2017,1,1)
endday = date(2017,9,1)
periods = endday - firstday
all_days = pd.date_range(firstday, periods=periods.days, freq='D')

In [23]:
weekend_tmp = map(lambda day: True if(day.dayofweek >=5) else False, all_days)
weekend = []
for i in weekend_tmp:
    weekend.append(i)   
df_weekend = pd.DataFrame({'date':all_days, 'weekend_or_holiday': weekend })

In [24]:
tmp = holiday_2017['date'].values
for i in range(len(tmp)):
    df_weekend.loc[df_weekend['date'] == tmp[i], 'weekend_or_holiday'] = True
df_weekend_and_holiday = df_weekend

In [25]:
hw_2017_train = pd.merge(df_2017, df_weekend_and_holiday, how='left', on=['date'])
hw_2017_train = hw_2017_train.set_index(['store_nbr', 'item_nbr', 'date'])[['weekend_or_holiday']].unstack(level=-1)
hw_2017_train.columns = hw_2017_train.columns.get_level_values(1)

In [26]:
holiday_zip = zip(df_weekend['date'],df_weekend['weekend_or_holiday'])
tmp_dict = {}
for d,h in holiday_zip:
    tmp_dict[d] = h

In [27]:
columns = hw_2017_train.columns
for i in columns:
    hw_2017_train[i] = tmp_dict[i]

In [28]:
hw_2017_train

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,99197,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,103520,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,103665,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,105574,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,105575,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,105577,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,105693,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,105737,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True
1,105857,True,True,False,False,False,False,True,True,False,False,...,True,False,False,False,False,True,True,True,False,True


In [29]:
hw_2017_train.columns

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10',
               ...
               '2017-08-06', '2017-08-07', '2017-08-08', '2017-08-09',
               '2017-08-10', '2017-08-11', '2017-08-12', '2017-08-13',
               '2017-08-14', '2017-08-15'],
              dtype='datetime64[ns]', name='date', length=227, freq=None)

In [30]:
hw_2017_test = pd.merge(df_test, df_weekend_and_holiday, how='left', on=['date'])
hw_2017_test = hw_2017_test.set_index(['store_nbr', 'item_nbr', 'date'])[['weekend_or_holiday']].unstack(level=-1)
hw_2017_test.columns = hw_2017_test.columns.get_level_values(1)

In [31]:
columns = hw_2017_test.columns
for i in columns:
    hw_2017_test[i] = tmp_dict[i]
hw_2017_test = hw_2017_test.reindex(hw_2017_train.index)

In [32]:
hw_2017 = pd.concat([hw_2017_train, hw_2017_test], axis=1)
del hw_2017_train, hw_2017_test
hw_2017

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,99197,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,103520,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,103665,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105574,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105575,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105577,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105693,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105737,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105857,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False


In [57]:
hw_2017.fillna(False)

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,99197,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,103520,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,103665,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105574,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105575,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105577,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105693,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105737,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False
1,105857,True,True,False,False,False,False,True,True,False,False,...,False,False,True,False,True,True,False,False,False,False


## 处理促销信息

In [33]:
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

In [34]:
promo_2017_train = df_2017.set_index(['store_nbr','item_nbr','date'])[['onpromotion']].unstack(level=-1).fillna(False)
promo_2017_train

Unnamed: 0_level_0,Unnamed: 1_level_0,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
Unnamed: 0_level_1,date,2017-01-01,2017-01-02,2017-01-03,2017-01-04,2017-01-05,2017-01-06,2017-01-07,2017-01-08,2017-01-09,2017-01-10,...,2017-08-06,2017-08-07,2017-08-08,2017-08-09,2017-08-10,2017-08-11,2017-08-12,2017-08-13,2017-08-14,2017-08-15
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105575,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105577,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,False,False,False,False
1,105693,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105737,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105857,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [35]:
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_train.columns

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10',
               ...
               '2017-08-06', '2017-08-07', '2017-08-08', '2017-08-09',
               '2017-08-10', '2017-08-11', '2017-08-12', '2017-08-13',
               '2017-08-14', '2017-08-15'],
              dtype='datetime64[ns]', name='date', length=227, freq=None)

In [36]:
promo_2017_test = df_test[['onpromotion']].unstack(level=-1).fillna(False)
promo_2017_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
Unnamed: 0_level_1,date,2017-08-16,2017-08-17,2017-08-18,2017-08-19,2017-08-20,2017-08-21,2017-08-22,2017-08-23,2017-08-24,2017-08-25,2017-08-26,2017-08-27,2017-08-28,2017-08-29,2017-08-30,2017-08-31
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
1,96995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103501,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [37]:
promo_2017_test.columns

MultiIndex(levels=[['onpromotion'], [2017-08-16 00:00:00, 2017-08-17 00:00:00, 2017-08-18 00:00:00, 2017-08-19 00:00:00, 2017-08-20 00:00:00, 2017-08-21 00:00:00, 2017-08-22 00:00:00, 2017-08-23 00:00:00, 2017-08-24 00:00:00, 2017-08-25 00:00:00, 2017-08-26 00:00:00, 2017-08-27 00:00:00, 2017-08-28 00:00:00, 2017-08-29 00:00:00, 2017-08-30 00:00:00, 2017-08-31 00:00:00]],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]],
           names=[None, 'date'])

In [38]:
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test.columns

DatetimeIndex(['2017-08-16', '2017-08-17', '2017-08-18', '2017-08-19',
               '2017-08-20', '2017-08-21', '2017-08-22', '2017-08-23',
               '2017-08-24', '2017-08-25', '2017-08-26', '2017-08-27',
               '2017-08-28', '2017-08-29', '2017-08-30', '2017-08-31'],
              dtype='datetime64[ns]', name='date', freq=None)

In [39]:
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

In [40]:
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)

In [41]:
del promo_2017_test, promo_2017_train

In [42]:
promo_2017.head()

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [43]:
promo_2017.columns

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10',
               ...
               '2017-08-22', '2017-08-23', '2017-08-24', '2017-08-25',
               '2017-08-26', '2017-08-27', '2017-08-28', '2017-08-29',
               '2017-08-30', '2017-08-31'],
              dtype='datetime64[ns]', name='date', length=243, freq=None)

In [44]:
df_2017.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
35229871,2017-01-01,25,99197,0.693147,False
35229872,2017-01-01,25,103665,2.079442,False
35229873,2017-01-01,25,105574,0.693147,False
35229874,2017-01-01,25,105857,1.609438,False
35229875,2017-01-01,25,106716,1.098612,False


In [45]:
df_2017 = df_2017.set_index(['store_nbr', 'item_nbr', 'date'])[['unit_sales']].unstack(level=-1).fillna(0)
df_2017.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales
Unnamed: 0_level_1,date,2017-01-01,2017-01-02,2017-01-03,2017-01-04,2017-01-05,2017-01-06,2017-01-07,2017-01-08,2017-01-09,2017-01-10,...,2017-08-06,2017-08-07,2017-08-08,2017-08-09,2017-08-10,2017-08-11,2017-08-12,2017-08-13,2017-08-14,2017-08-15
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,99197,0.0,0.0,1.386294,0.693147,0.693147,0.693147,1.098612,0.0,0.0,0.693147,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,0.0,0.693147,1.098612,0.0,1.098612,1.386294,0.693147,0.0,0.693147,0.693147,...,0.0,0.0,1.386294,0.0,1.386294,0.693147,0.693147,0.693147,0.0,0.0
1,103665,0.0,0.0,0.0,1.386294,1.098612,1.098612,0.693147,1.098612,0.0,2.079442,...,0.693147,1.098612,0.0,2.079442,2.302585,1.098612,0.0,0.0,0.693147,0.693147
1,105574,0.0,0.0,1.791759,2.564949,2.302585,1.94591,1.609438,1.098612,1.386294,2.302585,...,0.0,1.791759,2.079442,1.94591,2.397895,1.791759,1.791759,0.0,1.386294,1.609438


In [46]:
df_2017.columns = df_2017.columns.get_level_values(1)

## 处理商品信息

In [47]:
items['family_nbr'] = items['family'].astype('category').cat.codes
items['class_nbr'] = items['class'].astype('category').cat.codes

In [48]:
items.head()

Unnamed: 0,item_nbr,family,class,perishable,family_nbr,class_nbr
0,96995,GROCERY I,1093,0,12,64
1,99197,GROCERY I,1067,0,12,44
2,103501,CLEANING,3008,0,7,217
3,103520,GROCERY I,1028,0,12,17
4,103665,BREAD/BAKERY,2712,1,5,187


In [49]:
items = items.set_index('item_nbr')

In [50]:
len(items)

4100

In [51]:
items = items.reindex(df_2017.index.get_level_values(1))

In [52]:
len(items)

167515

## 处理商店信息

In [53]:
df_store = pd.read_csv('database/stores.csv')
df_store['city_nbr'] = df_store['city'].astype('category').cat.codes
df_store['state_nbr'] = df_store['state'].astype('category').cat.codes
df_store['type_nbr'] = df_store['type'].astype('category').cat.codes
df_store = df_store.reindex(df_2017.index.get_level_values(0))
df_store.head()

Unnamed: 0_level_0,store_nbr,city,state,type,cluster,city_nbr,state_nbr,type_nbr
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2.0,Quito,Pichincha,D,13.0,18.0,12.0,3.0
1,2.0,Quito,Pichincha,D,13.0,18.0,12.0,3.0
1,2.0,Quito,Pichincha,D,13.0,18.0,12.0,3.0
1,2.0,Quito,Pichincha,D,13.0,18.0,12.0,3.0
1,2.0,Quito,Pichincha,D,13.0,18.0,12.0,3.0


## 提取数据

In [54]:
from datetime import timedelta, date
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [59]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        
        "city_nbr":df_store['city_nbr'].values,
        "state_nbr":df_store['state_nbr'].values,
        "type_nbr":df_store['type_nbr'].values,
        "cluster":df_store['cluster'].values,
        
        "perishable":items['perishable'].values,
        "item_family_nbr":items['family_nbr'].values,
        "item_class_nbr": items['class_nbr'].values,    
        
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        #"weekend_and_holiday":get_timespan(hw_2017, t2017, 0, 1).fillna(0).values.astype(np.uint8).ravel(),
        "restday_2_2017": get_timespan(hw_2017, t2017, 2, 2).sum(axis=1).values,
        "restday_7_2017": get_timespan(hw_2017, t2017, 7, 7).sum(axis=1).values,            
        "big_city":get_timespan(bc_2017, t2017, 0, 1).sum(axis=1).values.ravel(),
        "mid_city": get_timespan(mc_2017, t2017, 0, 1).sum(axis=1).values.ravel(),
        "lit_city": get_timespan(lc_2017, t2017, 0, 1).sum(axis=1).values.ravel(),
         
        # 往前n天的销量均值
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_5_2017": get_timespan(df_2017, t2017, 5, 5).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        "mean_90_2017": get_timespan(df_2017, t2017, 90, 90).mean(axis=1).values,
        "mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        
        # 往前n天的促销次数
        "promo_1_2017": get_timespan(promo_2017, t2017, 1, 1).sum(axis=1).values,
        "promo_3_2017": get_timespan(promo_2017, t2017, 3, 3).sum(axis=1).values,
        "promo_7_2017": get_timespan(promo_2017, t2017, 7, 7).sum(axis=1).values,
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
        "promo_21_2017": get_timespan(promo_2017, t2017, 21, 21).sum(axis=1).values,
        "promo_30_2017": get_timespan(promo_2017, t2017, 30, 30).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_90_2017": get_timespan(promo_2017, t2017, 90, 90).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })
    for i in range(7):
        X['mean_2_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 14-i, 2, freq='7D').mean(axis=1).values
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
        X['mean_12_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 84-i, 12, freq='7D').mean(axis=1).values

    for i in range(16):
        # 往后16天的促销信息
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
        X["rest_{}".format(i)] = hw_2017[
            t2017 + timedelta(days=i)].values.astype(np.float)
    
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [60]:
print("Preparing dataset...")
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(6): # 6
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)              # 装载了6个日期的数据
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

Preparing dataset...


In [64]:
X_train.head()

Unnamed: 0,big_city,city_nbr,cluster,day_1_2017,item_class_nbr,item_family_nbr,lit_city,mean_140_2017,mean_14_2017,mean_30_2017,...,promo_11,rest_11,promo_12,rest_12,promo_13,rest_13,promo_14,rest_14,promo_15,rest_15
0,1.0,18.0,13.0,0.0,64,12,0.0,0.070156,0.148532,0.138629,...,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0
1,1.0,18.0,13.0,0.0,44,12,0.0,0.134989,0.511931,0.381457,...,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0
2,1.0,18.0,13.0,0.0,17,12,0.0,0.712362,0.667989,0.868856,...,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0
3,1.0,18.0,13.0,0.0,187,5,0.0,1.015355,0.866918,0.956552,...,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0
4,1.0,18.0,13.0,1.098612,31,12,0.0,1.805308,1.556041,1.774012,...,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0


In [65]:
X_test.head()

Unnamed: 0,big_city,city_nbr,cluster,day_1_2017,item_class_nbr,item_family_nbr,lit_city,mean_140_2017,mean_14_2017,mean_30_2017,...,promo_11,rest_11,promo_12,rest_12,promo_13,rest_13,promo_14,rest_14,promo_15,rest_15
0,1,18.0,13.0,0.0,64,12,0,0.153952,0.334438,0.275522,...,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0
1,1,18.0,13.0,0.0,44,12,0,0.376532,0.206455,0.331321,...,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0
2,1,18.0,13.0,0.0,17,12,0,0.82101,0.573577,0.714515,...,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0
3,1,18.0,13.0,0.693147,187,5,0,1.040541,1.031388,1.017638,...,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0
4,1,18.0,13.0,1.609438,31,12,0,1.765433,1.629185,1.71496,...,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0


In [66]:
print("Training and predicting models...")
params = {
    'num_leaves': 63, 
    'objective': 'regression',
    'min_data_in_leaf': 200, # 250
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2, 
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 20000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, 
        label=y_train[:, i],
        
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=300, verbose_eval=200 
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 300 rounds.
[200]	training's l2: 0.325383	valid_1's l2: 0.313347
[400]	training's l2: 0.300682	valid_1's l2: 0.294637
[600]	training's l2: 0.29558	valid_1's l2: 0.292226
[800]	training's l2: 0.292848	valid_1's l2: 0.291185
[1000]	training's l2: 0.290936	valid_1's l2: 0.290558
[1200]	training's l2: 0.289409	valid_1's l2: 0.29018
[1400]	training's l2: 0.288065	valid_1's l2: 0.289928
[1600]	training's l2: 0.286832	valid_1's l2: 0.289765
[1800]	training's l2: 0.285722	valid_1's l2: 0.289616
[2000]	training's l2: 0.28465	valid_1's l2: 0.289488
[2200]	training's l2: 0.283605	valid_1's l2: 0.289394
[2400]	training's l2: 0.282622	valid_1's l2: 0.289324
[2600]	training's l2: 0.281648	valid_1's l2: 0.289277
[2800]	training's l2: 0.280693	valid_1's l2: 0.289212
[3000]	training's l2: 0.279794	valid_1's l2: 0.289155
[3200]	training's l2: 0.278872	valid_1's l2: 0.289108
[3400]	training's l2: 0.27798	valid_1's l2: 0.289091
[3600]	training's l2: 0.277

[3800]	training's l2: 0.298094	valid_1's l2: 0.335522
[4000]	training's l2: 0.297149	valid_1's l2: 0.335456
[4200]	training's l2: 0.296231	valid_1's l2: 0.335398
[4400]	training's l2: 0.295326	valid_1's l2: 0.335358
[4600]	training's l2: 0.294458	valid_1's l2: 0.335286
[4800]	training's l2: 0.293595	valid_1's l2: 0.335289
[5000]	training's l2: 0.292752	valid_1's l2: 0.335236
[5200]	training's l2: 0.291925	valid_1's l2: 0.335204
[5400]	training's l2: 0.291084	valid_1's l2: 0.33515
[5600]	training's l2: 0.290241	valid_1's l2: 0.335118
[5800]	training's l2: 0.289416	valid_1's l2: 0.335097
[6000]	training's l2: 0.288588	valid_1's l2: 0.335092
[6200]	training's l2: 0.287783	valid_1's l2: 0.33508
Early stopping, best iteration is:
[6061]	training's l2: 0.288337	valid_1's l2: 0.335069
mean_14_2017: 18720274.81
mean_7_2017: 5961236.28
mean_12_dow2_2017: 2831694.44
mean_30_2017: 2134728.09
mean_20_dow2_2017: 1094128.75
mean_4_dow2_2017: 1004289.25
promo_2: 911635.36
mean_5_2017: 737104.64
item_

[5600]	training's l2: 0.312766	valid_1's l2: 0.34441
[5800]	training's l2: 0.311875	valid_1's l2: 0.34439
[6000]	training's l2: 0.31097	valid_1's l2: 0.344388
[6200]	training's l2: 0.310104	valid_1's l2: 0.344344
[6400]	training's l2: 0.309223	valid_1's l2: 0.34431
[6600]	training's l2: 0.308361	valid_1's l2: 0.34428
[6800]	training's l2: 0.307522	valid_1's l2: 0.344242
[7000]	training's l2: 0.30667	valid_1's l2: 0.344224
[7200]	training's l2: 0.305837	valid_1's l2: 0.344236
Early stopping, best iteration is:
[6954]	training's l2: 0.306861	valid_1's l2: 0.344214
mean_14_2017: 15641532.48
mean_4_dow4_2017: 8026584.41
mean_5_2017: 6216687.41
mean_30_2017: 4959690.27
mean_12_dow4_2017: 3665683.11
mean_20_dow4_2017: 1367675.20
promo_4: 695625.54
mean_3_2017: 575226.42
mean_7_2017: 494329.57
mean_2_dow4_2017: 203056.78
item_class_nbr: 186345.12
restday_7_2017: 133069.94
rest_9: 128344.18
restday_2_2017: 117282.16
mean_4_dow3_2017: 106717.36
item_family_nbr: 106314.62
mean_60_2017: 104872.52

[400]	training's l2: 0.331074	valid_1's l2: 0.393592
[600]	training's l2: 0.324976	valid_1's l2: 0.391666
[800]	training's l2: 0.321615	valid_1's l2: 0.392272
Early stopping, best iteration is:
[583]	training's l2: 0.325382	valid_1's l2: 0.391611
mean_14_2017: 12279653.99
mean_30_2017: 9934252.62
mean_7_2017: 4231710.52
mean_12_dow0_2017: 2074467.19
promo_7: 1616770.26
mean_20_dow0_2017: 822367.99
mean_60_2017: 534167.49
mean_4_dow0_2017: 436500.32
promo_0: 201197.60
mean_5_2017: 175412.51
item_family_nbr: 106086.48
day_1_2017: 103264.22
mean_3_2017: 101135.16
promo_14: 90348.95
item_class_nbr: 78036.21
promo_6: 61696.61
promo_21_2017: 59099.72
promo_30_2017: 51868.15
promo_3: 49526.75
mean_140_2017: 47331.10
promo_140_2017: 41273.12
mean_90_2017: 39365.68
promo_14_2017: 38940.00
promo_90_2017: 38206.30
promo_5: 36510.29
mean_2_dow0_2017: 31431.25
rest_5: 31197.71
promo_60_2017: 28914.03
mean_4_dow5_2017: 28717.19
promo_7_2017: 28677.87
mean_20_dow2_2017: 27442.75
restday_7_2017: 24364

[1600]	training's l2: 0.349943	valid_1's l2: 0.374123
Early stopping, best iteration is:
[1378]	training's l2: 0.352009	valid_1's l2: 0.374055
mean_30_2017: 16395755.87
mean_14_2017: 7956380.14
mean_60_2017: 2765951.80
mean_7_2017: 2405052.81
mean_12_dow3_2017: 2296932.62
mean_5_2017: 1817207.02
mean_4_dow3_2017: 1275423.44
promo_10: 947749.19
mean_20_dow3_2017: 922823.23
item_class_nbr: 152112.67
rest_2: 121712.27
item_family_nbr: 95347.98
mean_4_dow4_2017: 79089.23
promo_14: 74810.13
mean_3_2017: 74093.97
promo_12: 69558.35
promo_30_2017: 64343.37
promo_9: 63967.12
promo_7: 63642.26
mean_12_dow2_2017: 62808.85
promo_11: 56774.53
mean_140_2017: 54789.37
promo_13: 52576.90
promo_21_2017: 51129.63
promo_7_2017: 49491.55
promo_140_2017: 48306.17
promo_90_2017: 48118.47
promo_14_2017: 45453.11
promo_8: 43316.14
mean_2_dow3_2017: 36997.44
promo_60_2017: 36165.60
mean_4_dow2_2017: 33494.97
mean_20_dow2_2017: 30673.60
mean_90_2017: 30469.28
type_nbr: 29917.94
mean_20_dow5_2017: 25977.64
rest

Early stopping, best iteration is:
[1214]	training's l2: 0.341872	valid_1's l2: 0.359772
mean_30_2017: 15347099.85
mean_14_2017: 4709635.44
mean_12_dow6_2017: 2948312.34
mean_7_2017: 2067928.57
promo_13: 1380985.92
mean_60_2017: 1213450.10
mean_20_dow6_2017: 876129.63
mean_3_2017: 486964.91
mean_4_dow6_2017: 266990.99
mean_5_2017: 245380.18
item_class_nbr: 165306.55
item_family_nbr: 145226.33
promo_14: 142679.03
promo_12: 105651.19
day_1_2017: 104670.96
promo_10: 90188.62
mean_90_2017: 77333.16
mean_4_dow5_2017: 74418.48
mean_20_dow1_2017: 71689.89
mean_140_2017: 69853.51
promo_30_2017: 69283.01
rest_2: 56843.09
promo_21_2017: 52168.23
mean_20_dow5_2017: 49728.85
mean_2_dow6_2017: 46851.80
promo_1_2017: 44124.01
promo_140_2017: 40905.92
mean_20_dow0_2017: 40047.65
promo_14_2017: 36763.69
mean_12_dow5_2017: 35615.37
promo_6: 34612.98
promo_90_2017: 33782.10
mean_12_dow1_2017: 29435.46
promo_60_2017: 28485.82
promo_11: 27448.95
promo_7_2017: 25876.00
promo_9: 23037.27
promo_15: 20029.03


In [67]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

Making submission...


In [68]:
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('submission/2017-12-22-2200.csv', float_format='%.4f', index=None)

In [1]:
len(test_pred[0])

NameError: name 'test_pred' is not defined

In [68]:
len(test_pred)

16

In [67]:
test_pred

[array([ 0.20218495,  0.33581692,  0.85266579, ...,  0.1671704 ,
         2.51688696,  0.53726671]),
 array([ 0.21570338,  0.30517955,  0.82888579, ...,  0.14881887,
         2.01291047,  0.38810882]),
 array([ 0.22423024,  0.33551373,  0.90349146, ...,  0.15923332,
         2.13028749,  0.42664252]),
 array([ 0.20921377,  0.2992341 ,  0.67507867, ...,  0.21968677,
         2.23148755,  0.55693083]),
 array([ 0.13756174,  0.16813315,  0.24833117, ...,  0.26475569,
         2.57776234,  0.68146849]),
 array([ 0.22399479,  0.30007167,  0.65521359, ...,  0.17694149,
         2.04588128,  0.43876589]),
 array([ 0.22204345,  0.28575259,  0.71561694, ...,  0.1826231 ,
         2.48041909,  0.43065157]),
 array([ 0.22638763,  0.29095567,  0.74315114, ...,  0.16888347,
         2.02333003,  0.35618663]),
 array([ 0.19871981,  0.28789683,  0.75601349, ...,  0.14383795,
         2.11073692,  0.33677368]),
 array([ 0.23490149,  0.31654716,  0.84780062, ...,  0.15688149,
         2.01229264,  0.35

In [63]:
y_test

array([[ 0.20218495,  0.21570338,  0.22423024, ...,  0.20887905,
         0.22833   ,  0.21068279],
       [ 0.33581692,  0.30517955,  0.33551373, ...,  0.32877253,
         0.35186089,  0.34428008],
       [ 0.85266579,  0.82888579,  0.90349146, ...,  0.72418873,
         0.79865668,  0.87588746],
       ..., 
       [ 0.1671704 ,  0.14881887,  0.15923332, ...,  0.22484744,
         0.18164825,  0.1728592 ],
       [ 2.51688696,  2.01291047,  2.13028749, ...,  2.45391267,
         1.94535927,  2.23485087],
       [ 0.53726671,  0.38810882,  0.42664252, ...,  0.47131078,
         0.39002785,  0.36411368]])

In [69]:
df_preds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales
store_nbr,item_nbr,date,Unnamed: 3_level_1
1,96995,2017-08-16,0.215341
1,96995,2017-08-17,0.208789
1,96995,2017-08-18,0.229419
1,96995,2017-08-19,0.210360
1,96995,2017-08-20,0.138135
1,96995,2017-08-21,0.229623
1,96995,2017-08-22,0.223251
1,96995,2017-08-23,0.224950
1,96995,2017-08-24,0.197267
1,96995,2017-08-25,0.236336


In [65]:
df_preds1 = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
)
df_preds1

Unnamed: 0_level_0,Unnamed: 1_level_0,2017-08-16 00:00:00,2017-08-17 00:00:00,2017-08-18 00:00:00,2017-08-19 00:00:00,2017-08-20 00:00:00,2017-08-21 00:00:00,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,96995,0.202185,0.215703,0.224230,0.209214,0.137562,0.223995,0.222043,0.226388,0.198720,0.234901,0.247881,0.164602,0.228710,0.208879,0.228330,0.210683
1,99197,0.335817,0.305180,0.335514,0.299234,0.168133,0.300072,0.285753,0.290956,0.287897,0.316547,0.349073,0.207046,0.322955,0.328773,0.351861,0.344280
1,103520,0.852666,0.828886,0.903491,0.675079,0.248331,0.655214,0.715617,0.743151,0.756013,0.847801,0.730912,0.339166,0.683533,0.724189,0.798657,0.875887
1,103665,1.184268,1.013745,1.217854,1.162776,0.624608,0.907893,0.900552,1.028950,0.895870,1.153608,1.158510,0.549176,0.918380,0.927292,1.085033,0.975237
1,105574,1.905691,1.826220,1.826158,1.603457,0.795591,1.728382,1.639346,1.775253,1.629213,1.786172,1.476190,0.788064,1.764099,1.693511,1.865777,1.708159
1,105575,2.510634,2.291694,2.355016,2.255759,1.580556,2.355976,2.291283,2.239910,2.766906,3.069166,2.869993,2.348495,3.227881,3.132258,3.240386,2.974922
1,105577,0.667436,0.526803,0.525271,0.487218,0.315330,0.574757,0.542497,1.184662,1.185143,1.285636,1.209990,1.056985,1.376830,0.052387,1.390639,1.264965
1,105693,0.321160,0.263855,0.297076,0.273316,0.168180,0.281226,0.259765,0.264278,0.242501,0.297007,0.262910,0.207804,0.340188,0.277129,0.996662,0.202302
1,105737,0.903273,0.743017,0.790098,0.585519,0.295649,0.789708,0.706682,0.783476,0.650692,0.772784,0.649526,0.345655,0.806429,0.754437,0.869535,0.703255
1,105857,1.780670,1.637936,1.685184,1.581972,0.983515,1.626225,1.611083,1.707307,1.506887,1.627137,1.537129,1.020482,1.633192,1.684066,1.673667,1.624890
