In [1]:
"""LGBM Starter

This is a watered-down version of one of my earlier scripts.
Only very basic features are retained so hopefully it won't ruin the fun for you.
"""
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
def _log1p_nonneg(raw):
    """Parse a raw unit_sales field: log1p for positive values, 0 otherwise."""
    value = float(raw)
    return np.log1p(value) if value > 0 else 0


# Load the training data. Data rows 1..66458908 are skipped (the header row 0
# is kept) so that the frame starts at 2016-01-01.
df_train = pd.read_csv(
    'database/train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_nonneg},
    parse_dates=["date"],
    skiprows=range(1, 66458909),
)

In [3]:
# Preview the first rows of the loaded training data.
df_train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
0,2016-01-01,25,105574,2.564949,False
1,2016-01-01,25,105575,2.302585,False
2,2016-01-01,25,105857,1.386294,False
3,2016-01-01,25,108634,1.386294,False
4,2016-01-01,25,108701,1.098612,True


In [4]:
# Preview the last rows of the loaded training data.
df_train.tail()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
59038127,2017-08-15,54,2089339,1.609438,False
59038128,2017-08-15,54,2106464,0.693147,True
59038129,2017-08-15,54,2110456,5.26269,False
59038130,2017-08-15,54,2113914,5.293305,True
59038131,2017-08-15,54,2116416,1.098612,False


In [5]:
# Summarize the row count, column dtypes and memory footprint of df_train.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59038132 entries, 0 to 59038131
Data columns (total 5 columns):
date           datetime64[ns]
store_nbr      int64
item_nbr       int64
unit_sales     float64
onpromotion    bool
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2)
memory usage: 1.8 GB


In [6]:
# Test rows, indexed by (store_nbr, item_nbr, date) so that the date level can
# be unstacked into columns later.
TEST_KEY = ['store_nbr', 'item_nbr', 'date']
df_test = pd.read_csv(
    "database/test.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
).set_index(TEST_KEY)

# Item metadata, indexed by item_nbr.
items = pd.read_csv("database/items.csv").set_index("item_nbr")



In [7]:
# Preview the first rows of the indexed test data.
df_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,False
1,99197,2017-08-16,125497041,False
1,103501,2017-08-16,125497042,False
1,103520,2017-08-16,125497043,False
1,103665,2017-08-16,125497044,False


In [8]:
# Preview the last rows of the indexed test data.
df_test.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
54,2132163,2017-08-31,128867499,False
54,2132318,2017-08-31,128867500,False
54,2132945,2017-08-31,128867501,False
54,2132957,2017-08-31,128867502,False
54,2134244,2017-08-31,128867503,False


In [9]:
# Keep only the 77 days (11 weeks) from 2017-05-31 through 2017-08-15.
recent_days = pd.date_range("2017-05-31", periods=7 * 11)
df_2017 = df_train.loc[df_train.date.isin(recent_days)].copy()

In [10]:
# Show only the first rows instead of asking the notebook to render the whole
# multi-million-row frame.
df_2017.head(10)

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
50912462,2017-05-31,1,96995,0.693147,False
50912463,2017-05-31,1,99197,0.693147,False
50912464,2017-05-31,1,103520,1.386294,False
50912465,2017-05-31,1,103665,2.197225,False
50912466,2017-05-31,1,105574,1.386294,False
50912467,2017-05-31,1,105575,1.791759,False
50912468,2017-05-31,1,105577,1.609438,False
50912469,2017-05-31,1,105737,1.098612,False
50912470,2017-05-31,1,105857,2.197225,False
50912471,2017-05-31,1,106716,1.609438,False


In [11]:
# Release the full training frame (about 1.8 GB according to the .info()
# output above); it is not referenced again below.
del df_train

In [12]:
# Pivot the promotion flags to one row per (store_nbr, item_nbr) and one
# column per date. Passing fill_value=False marks days without a record as
# "not on promotion" during the pivot itself, so the frame stays boolean
# instead of being upcast to object by NaN and then patched with fillna().
promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1, fill_value=False)

In [13]:
# Preview the pivoted promotion flags (rows: store/item pairs, columns: dates).
promo_2017_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
Unnamed: 0_level_1,date,2017-05-31,2017-06-01,2017-06-02,2017-06-03,2017-06-04,2017-06-05,2017-06-06,2017-06-07,2017-06-08,2017-06-09,...,2017-08-06,2017-08-07,2017-08-08,2017-08-09,2017-08-10,2017-08-11,2017-08-12,2017-08-13,2017-08-14,2017-08-15
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
# Inspect the column index produced by unstack(): a two-level MultiIndex of
# (field name, date).
promo_2017_train.columns

MultiIndex(levels=[['onpromotion'], [2017-05-31 00:00:00, 2017-06-01 00:00:00, 2017-06-02 00:00:00, 2017-06-03 00:00:00, 2017-06-04 00:00:00, 2017-06-05 00:00:00, 2017-06-06 00:00:00, 2017-06-07 00:00:00, 2017-06-08 00:00:00, 2017-06-09 00:00:00, 2017-06-10 00:00:00, 2017-06-11 00:00:00, 2017-06-12 00:00:00, 2017-06-13 00:00:00, 2017-06-14 00:00:00, 2017-06-15 00:00:00, 2017-06-16 00:00:00, 2017-06-17 00:00:00, 2017-06-18 00:00:00, 2017-06-19 00:00:00, 2017-06-20 00:00:00, 2017-06-21 00:00:00, 2017-06-22 00:00:00, 2017-06-23 00:00:00, 2017-06-24 00:00:00, 2017-06-25 00:00:00, 2017-06-26 00:00:00, 2017-06-27 00:00:00, 2017-06-28 00:00:00, 2017-06-29 00:00:00, 2017-06-30 00:00:00, 2017-07-01 00:00:00, 2017-07-02 00:00:00, 2017-07-03 00:00:00, 2017-07-04 00:00:00, 2017-07-05 00:00:00, 2017-07-06 00:00:00, 2017-07-07 00:00:00, 2017-07-08 00:00:00, 2017-07-09 00:00:00, 2017-07-10 00:00:00, 2017-07-11 00:00:00, 2017-07-12 00:00:00, 2017-07-13 00:00:00, 2017-07-14 00:00:00, 2017-07-15 00:00:0

In [15]:
# Keep only the date level (level 1) so columns can be selected by date directly.
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

In [16]:
# Check the flattened column index.
promo_2017_train.columns

DatetimeIndex(['2017-05-31', '2017-06-01', '2017-06-02', '2017-06-03',
               '2017-06-04', '2017-06-05', '2017-06-06', '2017-06-07',
               '2017-06-08', '2017-06-09', '2017-06-10', '2017-06-11',
               '2017-06-12', '2017-06-13', '2017-06-14', '2017-06-15',
               '2017-06-16', '2017-06-17', '2017-06-18', '2017-06-19',
               '2017-06-20', '2017-06-21', '2017-06-22', '2017-06-23',
               '2017-06-24', '2017-06-25', '2017-06-26', '2017-06-27',
               '2017-06-28', '2017-06-29', '2017-06-30', '2017-07-01',
               '2017-07-02', '2017-07-03', '2017-07-04', '2017-07-05',
               '2017-07-06', '2017-07-07', '2017-07-08', '2017-07-09',
               '2017-07-10', '2017-07-11', '2017-07-12', '2017-07-13',
               '2017-07-14', '2017-07-15', '2017-07-16', '2017-07-17',
               '2017-07-18', '2017-07-19', '2017-07-20', '2017-07-21',
               '2017-07-22', '2017-07-23', '2017-07-24', '2017-07-25',
      

In [17]:
# Pivot the test-period promotion flags the same way: one row per
# (store_nbr, item_nbr), one column per date. fill_value=False fills gaps
# created by the pivot while keeping the boolean dtype (unstack() followed by
# fillna() would route the data through NaN and an object-dtype frame).
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1, fill_value=False)
promo_2017_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
Unnamed: 0_level_1,date,2017-08-16,2017-08-17,2017-08-18,2017-08-19,2017-08-20,2017-08-21,2017-08-22,2017-08-23,2017-08-24,2017-08-25,2017-08-26,2017-08-27,2017-08-28,2017-08-29,2017-08-30,2017-08-31
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
1,96995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103501,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [18]:
# Keep only the date level (level 1) of the column MultiIndex.
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)

In [19]:
# Preview the test promotion flags after flattening the columns.
promo_2017_test.head()

Unnamed: 0_level_0,date,2017-08-16 00:00:00,2017-08-17 00:00:00,2017-08-18 00:00:00,2017-08-19 00:00:00,2017-08-20 00:00:00,2017-08-21 00:00:00,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103501,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [20]:
# Inspect the (store_nbr, item_nbr) row index of the training promotion frame.
promo_2017_train.index

MultiIndex(levels=[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54], [96995, 99197, 103501, 103520, 103665, 105574, 105575, 105576, 105577, 105693, 105737, 105857, 106716, 108079, 108634, 108696, 108698, 108701, 108786, 108797, 108831, 108833, 108862, 108952, 111223, 111397, 112830, 114778, 114790, 114799, 114800, 115267, 115611, 115693, 115720, 115847, 115850, 115891, 115892, 115893, 115894, 116017, 116018, 116279, 116311, 119023, 119024, 119026, 119141, 119187, 119191, 119193, 119624, 121964, 122095, 122419, 122425, 123347, 123601, 123602, 123927, 125430, 127534, 127547, 129296, 129297, 129635, 129758, 129759, 153078, 153239, 153267, 153395, 153398, 155499, 155500, 155600, 155601, 155607, 155610, 155621, 155625, 157956, 158680, 158788, 158789, 158842, 158875, 158956, 159156, 159242, 161288, 162066, 164036, 164037, 164088, 16464

In [21]:
# Align the test promotion flags to the training rows: pairs missing from the
# test set get False, pairs that only exist in the test set are dropped.
# fill_value=False fills the new rows directly, so the columns stay boolean
# rather than being upcast to object by the NaN that reindex() would otherwise
# insert before fillna().
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index, fill_value=False)

In [22]:
# Preview the test promotion flags after aligning them to the training index.
promo_2017_test.head()

Unnamed: 0_level_0,date,2017-08-16 00:00:00,2017-08-17 00:00:00,2017-08-18 00:00:00,2017-08-19 00:00:00,2017-08-20 00:00:00,2017-08-21 00:00:00,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


## 合并两个表格，行数不变

In [23]:
# Place the train-period and test-period promotion columns side by side
# (axis=1), matched on the row index.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)

In [24]:
# Preview the first rows of the combined promotion calendar.
promo_2017.head()

Unnamed: 0_level_0,date,2017-05-31 00:00:00,2017-06-01 00:00:00,2017-06-02 00:00:00,2017-06-03 00:00:00,2017-06-04 00:00:00,2017-06-05 00:00:00,2017-06-06 00:00:00,2017-06-07 00:00:00,2017-06-08 00:00:00,2017-06-09 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [25]:
# Preview the last rows of the combined promotion calendar.
promo_2017.tail()

Unnamed: 0_level_0,date,2017-05-31 00:00:00,2017-06-01 00:00:00,2017-06-02 00:00:00,2017-06-03 00:00:00,2017-06-04 00:00:00,2017-06-05 00:00:00,2017-06-06 00:00:00,2017-06-07 00:00:00,2017-06-08 00:00:00,2017-06-09 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
54,2109909,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
54,2110456,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
54,2113343,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
54,2113914,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
54,2116416,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


现在的promo_2017记录了每个（门店，商品）组合从2017-05-31到2017-08-31之间每天是否促销的信息

In [26]:
# Drop the two partial promotion frames to free their memory.
del promo_2017_test, promo_2017_train

In [27]:
# Pivot the log1p-transformed sales to one row per (store_nbr, item_nbr) and
# one column per date; days without a record count as zero sales.
SALES_KEY = ["store_nbr", "item_nbr", "date"]
df_2017 = (
    df_2017
    .set_index(SALES_KEY)[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)

In [28]:
# Preview the pivoted sales frame.
df_2017.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales,unit_sales
Unnamed: 0_level_1,date,2017-05-31,2017-06-01,2017-06-02,2017-06-03,2017-06-04,2017-06-05,2017-06-06,2017-06-07,2017-06-08,2017-06-09,...,2017-08-06,2017-08-07,2017-08-08,2017-08-09,2017-08-10,2017-08-11,2017-08-12,2017-08-13,2017-08-14,2017-08-15
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,96995,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,99197,0.693147,1.386294,1.098612,1.94591,1.098612,1.098612,0.0,0.0,0.693147,0.693147,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,1.386294,1.098612,1.098612,0.693147,0.0,0.693147,1.609438,0.693147,0.693147,1.098612,...,0.0,0.0,1.386294,0.0,1.386294,0.693147,0.693147,0.693147,0.0,0.0
1,103665,2.197225,0.0,1.791759,1.791759,1.098612,1.386294,1.791759,1.386294,0.0,1.098612,...,0.693147,1.098612,0.0,2.079442,2.302585,1.098612,0.0,0.0,0.693147,0.693147
1,105574,1.386294,2.484907,1.791759,1.386294,1.386294,1.386294,2.079442,2.397895,1.94591,2.079442,...,0.0,1.791759,2.079442,1.94591,2.397895,1.791759,1.791759,0.0,1.386294,1.609438


In [29]:
# Flatten the column MultiIndex down to its date level.
sales_dates = df_2017.columns.get_level_values(1)
df_2017.columns = sales_dates

# Repeat/reorder the item metadata so it has one row per row of df_2017,
# in the same order (matched on item_nbr).
row_item_ids = df_2017.index.get_level_values(1)
items = items.reindex(row_item_ids)

In [30]:
# Inspect the item_nbr level of the row index (the key used to reindex `items`).
df_2017.index.get_level_values(1)

Int64Index([  96995,   99197,  103520,  103665,  105574,  105575,  105577,
             105693,  105737,  105857,
            ...
            2101795, 2103250, 2105347, 2106464, 2108569, 2109909, 2110456,
            2113343, 2113914, 2116416],
           dtype='int64', name='item_nbr', length=156790)

In [31]:
# Show only the first rows of the aligned item metadata rather than rendering
# the whole frame.
items.head(10)

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,GROCERY I,1093,0
99197,GROCERY I,1067,0
103520,GROCERY I,1028,0
103665,BREAD/BAKERY,2712,1
105574,GROCERY I,1045,0
105575,GROCERY I,1045,0
105577,GROCERY I,1045,0
105693,GROCERY I,1034,0
105737,GROCERY I,1044,0
105857,GROCERY I,1092,0


In [32]:
# Preview the sales frame after flattening the columns to dates.
df_2017.head()

Unnamed: 0_level_0,date,2017-05-31 00:00:00,2017-06-01 00:00:00,2017-06-02 00:00:00,2017-06-03 00:00:00,2017-06-04 00:00:00,2017-06-05 00:00:00,2017-06-06 00:00:00,2017-06-07 00:00:00,2017-06-08 00:00:00,2017-06-09 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,99197,0.693147,1.386294,1.098612,1.94591,1.098612,1.098612,0.0,0.0,0.693147,0.693147,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,1.386294,1.098612,1.098612,0.693147,0.0,0.693147,1.609438,0.693147,0.693147,1.098612,...,0.0,0.0,1.386294,0.0,1.386294,0.693147,0.693147,0.693147,0.0,0.0
1,103665,2.197225,0.0,1.791759,1.791759,1.098612,1.386294,1.791759,1.386294,0.0,1.098612,...,0.693147,1.098612,0.0,2.079442,2.302585,1.098612,0.0,0.0,0.693147,0.693147
1,105574,1.386294,2.484907,1.791759,1.386294,1.386294,1.386294,2.079442,2.397895,1.94591,2.079442,...,0.0,1.791759,2.079442,1.94591,2.397895,1.791759,1.791759,0.0,1.386294,1.609438


In [33]:
# Example anchor date and the seven days that immediately precede it.
t2017 = date(2017, 7, 26)
week_before = t2017 - timedelta(days=7)
tmp = pd.date_range(start=week_before, periods=7)
tmp

DatetimeIndex(['2017-07-19', '2017-07-20', '2017-07-21', '2017-07-22',
               '2017-07-23', '2017-07-24', '2017-07-25'],
              dtype='datetime64[ns]', freq='D')

In [34]:
# Sales for the seven days before the anchor date; show only the first rows
# instead of rendering the whole selection.
df_2017[tmp].head(10)

Unnamed: 0_level_0,date,2017-07-19 00:00:00,2017-07-20 00:00:00,2017-07-21 00:00:00,2017-07-22 00:00:00,2017-07-23 00:00:00,2017-07-24 00:00:00,2017-07-25 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,96995,0.000000,0.000000,1.386294,1.098612,0.000000,0.000000,0.000000
1,99197,1.098612,0.693147,1.791759,0.693147,0.000000,0.000000,0.000000
1,103520,0.693147,0.693147,0.000000,1.386294,0.693147,1.386294,1.098612
1,103665,1.386294,0.000000,0.000000,1.098612,0.693147,1.609438,1.386294
1,105574,1.609438,2.484907,1.945910,1.609438,1.609438,2.197225,1.791759
1,105575,2.564949,2.302585,2.079442,2.197225,1.945910,2.302585,2.484907
1,105577,1.098612,1.098612,1.098612,1.386294,0.693147,1.386294,0.000000
1,105693,0.693147,0.000000,0.000000,0.000000,0.693147,0.000000,0.693147
1,105737,0.000000,0.693147,0.000000,0.693147,0.000000,1.386294,1.098612
1,105857,1.791759,1.945910,1.386294,1.098612,1.098612,1.791759,0.693147


In [35]:
# Row-wise mean over the seven-day window (the basis of the mean_7 feature);
# show only the first rows instead of the whole series.
df_2017[tmp].mean(axis=1).head(10)

store_nbr  item_nbr
1          96995       0.354987
           99197       0.610952
           103520      0.850092
           103665      0.881969
           105574      1.892588
           105575      2.268229
           105577      0.965939
           105693      0.297063
           105737      0.553029
           105857      1.400871
           106716      1.466876
           108079      0.099021
           108634      0.000000
           108696      0.814826
           108698      0.792168
           108701      0.396084
           108786      1.060936
           108797      0.642830
           108862      0.427962
           108952      1.150737
           111223      1.673466
           111397      0.709973
           112830      1.176783
           114778      1.221905
           114790      1.007036
           114799      0.610952
           114800      1.227100
           115267      0.866918
           115611      2.198871
           115693      0.980990
                    

In [36]:
# Print the 16 consecutive days starting at t2017.
for day in pd.date_range(t2017, periods=16):
    print(day.date())

2017-07-26
2017-07-27
2017-07-28
2017-07-29
2017-07-30
2017-07-31
2017-08-01
2017-08-02
2017-08-03
2017-08-04
2017-08-05
2017-08-06
2017-08-07
2017-08-08
2017-08-09
2017-08-10


In [37]:
# Sales for the 16 days starting at t2017; show only the first rows instead
# of rendering the whole selection.
df_2017[pd.date_range(t2017, periods=16)].head(10)

Unnamed: 0_level_0,date,2017-07-26 00:00:00,2017-07-27 00:00:00,2017-07-28 00:00:00,2017-07-29 00:00:00,2017-07-30 00:00:00,2017-07-31 00:00:00,2017-08-01 00:00:00,2017-08-02 00:00:00,2017-08-03 00:00:00,2017-08-04 00:00:00,2017-08-05 00:00:00,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,96995,0.000000,0.000000,0.000000,0.000000,0.000000,1.098612,0.000000,0.693147,1.098612,0.000000,0.000000,1.098612,1.098612,0.000000,0.000000,0.693147
1,99197,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.693147,0.000000,1.098612,0.000000,1.098612,0.000000
1,103520,0.693147,1.098612,1.098612,1.386294,0.000000,0.693147,1.386294,0.693147,1.098612,1.386294,0.000000,0.000000,0.000000,1.386294,0.000000,1.386294
1,103665,1.609438,0.000000,1.098612,1.609438,1.098612,1.098612,1.609438,1.098612,1.098612,2.197225,1.386294,0.693147,1.098612,0.000000,2.079442,2.302585
1,105574,2.302585,1.386294,1.945910,1.386294,0.693147,1.609438,2.197225,2.197225,1.945910,1.791759,2.079442,0.000000,1.791759,2.079442,1.945910,2.397895
1,105575,1.945910,1.945910,2.397895,2.397895,1.386294,2.564949,2.708050,2.197225,2.995732,2.708050,1.791759,1.609438,2.833213,3.091042,2.484907,2.484907
1,105577,0.000000,0.000000,0.000000,1.098612,0.000000,0.000000,0.000000,0.000000,0.693147,0.000000,0.000000,0.693147,1.386294,0.000000,0.693147,0.693147
1,105693,1.098612,0.000000,0.000000,0.000000,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.098612,0.693147
1,105737,0.693147,0.693147,0.693147,0.693147,0.000000,0.693147,0.693147,0.000000,0.000000,1.098612,0.693147,0.000000,1.791759,2.079442,1.791759,0.693147
1,105857,2.397895,1.945910,1.386294,1.945910,0.000000,1.791759,1.945910,1.791759,1.609438,1.945910,2.302585,2.079442,0.000000,2.302585,1.609438,1.609438


In [38]:
def get_timespan(df, dt, minus, periods):
    """Select a run of consecutive daily columns from a date-columned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are daily timestamps.
    dt : datetime.date, datetime.datetime or pandas.Timestamp
        Anchor date.
    minus : int
        How many days before ``dt`` the window starts.
    periods : int
        Number of consecutive days in the window.

    Returns
    -------
    pandas.DataFrame
        The columns ``dt - minus`` .. ``dt - minus + periods - 1`` of ``df``.
    """
    start = pd.Timestamp(dt) - timedelta(days=minus)
    return df[pd.date_range(start, periods=periods)]


def prepare_dataset(t2017, is_train=True, sales=None, promo=None, horizon=16):
    """Build the feature matrix (and optionally the targets) for one anchor date.

    Parameters
    ----------
    t2017 : datetime.date, datetime.datetime or pandas.Timestamp
        First day of the forecast window.
    is_train : bool, default True
        When True, also return the sales of the forecast window as targets.
    sales : pandas.DataFrame, optional
        Sales frame with one column per day. Defaults to the notebook-level
        ``df_2017``.
    promo : pandas.DataFrame, optional
        Promotion-flag frame with one column per day. Defaults to the
        notebook-level ``promo_2017``.
    horizon : int, default 16
        Number of days in the forecast window.

    Returns
    -------
    X : pandas.DataFrame
        One row per row of ``sales``, with a fresh 0..n-1 index.
    y : numpy.ndarray
        Only when ``is_train`` is True: array of shape (n_rows, horizon).
    """
    # Fall back to the frames built earlier in the notebook.
    if sales is None:
        sales = df_2017
    if promo is None:
        promo = promo_2017

    # Column labels are pandas Timestamps; a plain datetime.date is not accepted
    # as a DatetimeIndex key by current pandas, so normalise the anchor once.
    anchor = pd.Timestamp(t2017)

    X = pd.DataFrame({
        # Mean sales over the 3 / 7 / 14 days before the anchor date.
        "mean_3_2017": get_timespan(sales, anchor, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(sales, anchor, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(sales, anchor, 14, 14).mean(axis=1).values,
        # Number of promotion days within the 14 days before the anchor date.
        "promo_14_2017": get_timespan(promo, anchor, 14, 14).sum(axis=1).values,
    })
    # Promotion flag (0/1) for each day of the forecast window.
    for i in range(horizon):
        X["promo_{}".format(i)] = promo[
            anchor + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        # Sales on each day of the forecast window.
        y = sales[pd.date_range(anchor, periods=horizon)].values
        return X, y
    return X

In [39]:
# Anchor date of the last training window: three weeks after 2017-06-21.
tt = date(2017, 6, 21) + timedelta(weeks=3)
tt

datetime.date(2017, 7, 12)

In [40]:
print("Preparing dataset...")
# Training set: four windows, one week apart, starting at 2017-06-21,
# stacked row-wise.
t2017 = date(2017, 6, 21)
train_windows = [
    prepare_dataset(t2017 + timedelta(days=7 * week)) for week in range(4)
]
X_train = pd.concat([features for features, _ in train_windows], axis=0)
y_train = np.concatenate([targets for _, targets in train_windows], axis=0)
del train_windows
# Validation set: the window anchored at 2017-07-26.
X_val, y_val = prepare_dataset(date(2017, 7, 26))
# Test features: the window anchored at 2017-08-16 (no targets requested).
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

Preparing dataset...


In [41]:
# Inspect the stacked training targets.
y_train

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.69314718,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.38629436],
       [ 0.69314718,  1.38629436,  1.38629436, ...,  0.        ,
         0.69314718,  1.94591015],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.69314718,  5.30330491,  2.19722458, ...,  2.48490665,
         1.94591015,  1.38629436],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [42]:
# Show only the first rows of the stacked training features rather than
# rendering the whole frame.
X_train.head(10)

Unnamed: 0,mean_14_2017,mean_3_2017,mean_7_2017,promo_14_2017,promo_0,promo_1,promo_2,promo_3,promo_4,promo_5,promo_6,promo_7,promo_8,promo_9,promo_10,promo_11,promo_12,promo_13,promo_14,promo_15
0,0.099021,0.000000,0.099021,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.835944,1.245890,0.987960,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.840554,0.462098,0.773092,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.141420,1.059351,1.243926,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.645124,1.416165,1.505723,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,2.250081,2.507286,2.380866,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0.688537,0.462098,0.526983,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0.297063,0.000000,0.297063,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,1.070138,0.231049,0.973349,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1.818947,2.132310,2.019905,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [43]:
# Preview the validation features.
X_val.head()

Unnamed: 0,mean_14_2017,mean_3_2017,mean_7_2017,promo_14_2017,promo_0,promo_1,promo_2,promo_3,promo_4,promo_5,promo_6,promo_7,promo_8,promo_9,promo_10,promo_11,promo_12,promo_13,promo_14,promo_15
0,0.177493,0.0,0.354987,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.709973,0.0,0.610952,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.840554,1.059351,0.850092,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.853895,1.229626,0.881969,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.82682,1.866141,1.892588,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [44]:
# Preview the test features.
X_test.head()

Unnamed: 0,mean_14_2017,mean_3_2017,mean_7_2017,promo_14_2017,promo_0,promo_1,promo_2,promo_3,promo_4,promo_5,promo_6,promo_7,promo_8,promo_9,promo_10,promo_11,promo_12,promo_13,promo_14,promo_15
0,0.334438,0.0,0.099021,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.206455,0.0,0.156945,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.573577,0.231049,0.495105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.031388,0.462098,0.98099,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.629185,0.998577,1.560437,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
print("Training and predicting models...")

# LightGBM settings shared by every per-day model.
params = dict(
    num_leaves=2**5 - 1,
    objective='regression_l2',
    max_depth=8,
    min_data_in_leaf=50,
    learning_rate=0.05,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=1,
    metric='l2',
    num_threads=4,
)
# Upper bound on boosting rounds per model.
MAX_ROUNDS = 1000
# Per-step predictions are collected here; no categorical features are used.
val_pred, test_pred, cate_vars = [], [], []

Training and predicting models...


In [46]:
# Inspect the training targets again (full slice of the array).
y_train[:]

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.69314718,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.38629436],
       [ 0.69314718,  1.38629436,  1.38629436, ...,  0.        ,
         0.69314718,  1.94591015],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.69314718,  5.30330491,  2.19722458, ...,  2.48490665,
         1.94591015,  1.38629436],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [47]:

# Start from empty prediction lists so that re-running this cell cannot append
# a second set of columns to results left over from an earlier run (which
# would break the shape of the validation-MSE comparison below).
val_pred, test_pred = [], []

# Row weights: 1 + 0.25 * perishable. They depend only on the item of each
# row, so build them once instead of on every loop iteration.
item_weight = items["perishable"].values * 0.25 + 1
# X_train stacks several copies of the same row order (one per training
# window), so repeat the per-row weights once per stacked window.
n_windows, leftover = divmod(len(X_train), len(item_weight))
assert leftover == 0, "X_train must be a whole number of stacked windows"
train_weight = np.tile(item_weight, n_windows)

# lgb.train() no longer accepts early_stopping_rounds / verbose_eval in
# LightGBM >= 4; the callback form is available in older releases as well.
# log_evaluation replaced print_evaluation, so use whichever exists.
log_every = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

# One model per forecast day (one column of the target matrix).
for i in range(y_train.shape[1]):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=item_weight,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval],
        callbacks=[lgb.early_stopping(50), log_every(50)]
    )
    # Feature importances by total gain, largest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    # Predict with the best iteration found by early stopping; fall back to
    # MAX_ROUNDS when no best iteration was recorded.
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

# val_pred is (n_steps, n_rows); transpose to match y_val's (n_rows, n_steps).
print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))


Step 1




Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.345149	valid_1's l2: 0.341455
[100]	training's l2: 0.333379	valid_1's l2: 0.330172
[150]	training's l2: 0.331405	valid_1's l2: 0.328823
[200]	training's l2: 0.330344	valid_1's l2: 0.328317
[250]	training's l2: 0.329476	valid_1's l2: 0.327889
[300]	training's l2: 0.328793	valid_1's l2: 0.327649
[350]	training's l2: 0.328187	valid_1's l2: 0.327459
[400]	training's l2: 0.327652	valid_1's l2: 0.327329
[450]	training's l2: 0.327151	valid_1's l2: 0.327218
[500]	training's l2: 0.326681	valid_1's l2: 0.327129
[550]	training's l2: 0.326264	valid_1's l2: 0.327102
[600]	training's l2: 0.325878	valid_1's l2: 0.327031
[650]	training's l2: 0.325453	valid_1's l2: 0.326991
[700]	training's l2: 0.325065	valid_1's l2: 0.326944
[750]	training's l2: 0.324698	valid_1's l2: 0.326981
Early stopping, best iteration is:
[712]	training's l2: 0.324969	valid_1's l2: 0.326938
mean_14_2017: 1984359.02
mean_7_2017: 1460047.19
mean_3_

Step 7
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.399251	valid_1's l2: 0.499954
[100]	training's l2: 0.387491	valid_1's l2: 0.484431
[150]	training's l2: 0.384487	valid_1's l2: 0.483274
[200]	training's l2: 0.382909	valid_1's l2: 0.482963
[250]	training's l2: 0.381732	valid_1's l2: 0.482566
[300]	training's l2: 0.380894	valid_1's l2: 0.482571
Early stopping, best iteration is:
[265]	training's l2: 0.381465	valid_1's l2: 0.482494
mean_14_2017: 2176120.92
mean_7_2017: 788873.90
mean_3_2017: 309822.71
promo_6: 154213.49
promo_14_2017: 50427.60
promo_3: 14230.01
promo_7: 10208.19
promo_13: 8896.97
promo_5: 7975.26
promo_0: 4785.81
promo_1: 4559.44
promo_4: 4185.97
promo_9: 3828.04
promo_2: 3637.22
promo_14: 3187.51
promo_8: 2329.80
promo_11: 1460.21
promo_15: 1382.96
promo_12: 1252.57
promo_10: 1153.89
Step 8
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.387986	valid_1's l2: 0.460971
[100]	training's l2: 0.37561

Step 15
Training until validation scores don't improve for 50 rounds.
[50]	training's l2: 0.411001	valid_1's l2: 0.410482
[100]	training's l2: 0.397157	valid_1's l2: 0.398623
[150]	training's l2: 0.394315	valid_1's l2: 0.397762
[200]	training's l2: 0.392755	valid_1's l2: 0.397444
[250]	training's l2: 0.391625	valid_1's l2: 0.397205
[300]	training's l2: 0.390647	valid_1's l2: 0.397032
[350]	training's l2: 0.389848	valid_1's l2: 0.396838
[400]	training's l2: 0.38916	valid_1's l2: 0.396693
[450]	training's l2: 0.388521	valid_1's l2: 0.396705
Early stopping, best iteration is:
[418]	training's l2: 0.388918	valid_1's l2: 0.396637
mean_14_2017: 2326294.15
mean_7_2017: 697294.28
mean_3_2017: 242807.08
promo_14: 232132.33
promo_14_2017: 49620.78
promo_7: 31713.90
promo_0: 29744.91
promo_15: 19946.17
promo_13: 11068.67
promo_9: 9069.83
promo_12: 8493.43
promo_10: 6377.01
promo_2: 4226.62
promo_6: 4122.19
promo_8: 3039.53
promo_11: 3002.61
promo_4: 2528.08
promo_1: 1724.67
promo_3: 1156.68
promo

In [48]:
# Inspect the raw test predictions: a list with one `bst.predict(X_test, ...)`
# array appended per iteration of the training loop.
test_pred

[array([ 0.24158492,  0.20325735,  0.49971383, ...,  0.22033994,
         2.4818273 ,  0.4154033 ]),
 array([ 0.24897964,  0.19115871,  0.45190069, ...,  0.19916939,
         2.2558101 ,  0.40396979]),
 array([ 0.27285152,  0.2179842 ,  0.52879003, ...,  0.22632535,
         2.42612214,  0.36684869]),
 array([ 0.4003835 ,  0.30827023,  0.70526274, ...,  0.29936944,
         2.69888525,  0.55833793]),
 array([ 0.39993962,  0.30768212,  0.70100918, ...,  0.31117762,
         2.94732528,  0.62947935]),
 array([ 0.31550028,  0.23218988,  0.5434266 , ...,  0.23483409,
         2.78462413,  0.45736259]),
 array([ 0.27353423,  0.21391034,  0.48453527, ...,  0.21046586,
         2.3963621 ,  0.46204475]),
 array([ 0.29847978,  0.21556857,  0.50754496, ...,  0.20253882,
         2.73110864,  0.42040179]),
 array([ 0.27332308,  0.20475818,  0.4751818 , ...,  0.19446531,
         2.37216687,  0.40286435]),
 array([ 0.28254225,  0.23347887,  0.53939256, ...,  0.20575302,
         2.39574805,  0.35

In [49]:
print("Making submission...")

# Turn the list of per-step prediction arrays into a matrix with one row per
# entry of `df_2017.index` and one column per forecast step.
y_test = np.array(test_pred).transpose()

# Wide -> long: label the columns with the 16 forecast dates starting at
# 2017-08-16, then stack them into a third index level.
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
# Name the levels so they match the (store_nbr, item_nbr, date) index of
# `df_test`. Assigning the renamed index avoids the in-place mutation.
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Left join keeps every test row; rows without a prediction get 0, which is
# still 0 after expm1.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p transform applied to `unit_sales` when the training data was
# loaded, then clamp the result to the range [0, 1000].
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
# `index=False` is the documented way to omit the index columns (the original
# passed `index=None`, which only works because None is falsy).
submission.to_csv('lgb_from_Ceshine.csv', float_format='%.4f', index=False)

Making submission...


In [50]:
# Inspect the transposed prediction matrix built from `test_pred`; values are
# still on the model's output scale (expm1 is applied only to the submission).
y_test

array([[ 0.24158492,  0.24897964,  0.27285152, ...,  0.29180975,
         0.30408553,  0.27975772],
       [ 0.20325735,  0.19115871,  0.2179842 , ...,  0.2247866 ,
         0.21264327,  0.20899921],
       [ 0.49971383,  0.45190069,  0.52879003, ...,  0.50822165,
         0.48575495,  0.48011781],
       ..., 
       [ 0.22033994,  0.19916939,  0.22632535, ...,  0.20631559,
         0.20248792,  0.1843162 ],
       [ 2.4818273 ,  2.2558101 ,  2.42612214, ...,  2.69716597,
         2.42605486,  2.29840843],
       [ 0.4154033 ,  0.40396979,  0.36684869, ...,  0.42157276,
         0.3874901 ,  0.38230881]])

In [51]:
# Peek at the first five long-format predictions (store_nbr, item_nbr, date).
df_preds.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales
store_nbr,item_nbr,date,Unnamed: 3_level_1
1,96995,2017-08-16,0.241585
1,96995,2017-08-17,0.24898
1,96995,2017-08-18,0.272852
1,96995,2017-08-19,0.400383
1,96995,2017-08-20,0.39994


In [52]:
# Peek at the first five test rows to compare their index with `df_preds`.
df_test.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,False
1,99197,2017-08-16,125497041,False
1,103501,2017-08-16,125497042,False
1,103520,2017-08-16,125497043,False
1,103665,2017-08-16,125497044,False


In [53]:
# Show only a small sample of the submission instead of the whole frame, to
# keep the saved notebook output short.
submission.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,unit_sales
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,0.273266
1,99197,2017-08-16,125497041,0.225388
1,103501,2017-08-16,125497042,0.000000
1,103520,2017-08-16,125497043,0.648250
1,103665,2017-08-16,125497044,1.352113
1,105574,2017-08-16,125497045,3.222305
1,105575,2017-08-16,125497046,7.469437
1,105576,2017-08-16,125497047,0.000000
1,105577,2017-08-16,125497048,0.294655
1,105693,2017-08-16,125497049,0.319934
