In [1]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
df_train = pd.read_csv('database/train.csv',usecols=[1,2,3,4,5], parse_dates=['date'],
                       dtype={'onpromotion':bool},
                      converters = {'unit_sales': lambda u : np.log1p(float(u) if float(u) > 0 else 0)},
                      skiprows=range(1,66458909))

In [3]:
#df_train.head()

In [4]:
df_test = pd.read_csv('database/test.csv', dtype={'onpromotion':bool},
                     parse_dates = ['date'],usecols=[0,1,2,3,4])

In [5]:
#df_test.head()

In [6]:
items = pd.read_csv('database/items.csv')

In [7]:
#items.head()

In [8]:
df_2017 = df_train.loc[df_train.date >= pd.datetime(2017,1,1)]
del df_train
#df_2017.head()

In [9]:
df_store = pd.read_csv('database/stores.csv')
df_store['big_city'] = False
df_store['middle_city'] = False
df_store['little_city'] = False

citys = pd.read_csv('database/city_population.csv')
big_citys = citys[citys['2017 Population'] >= 1000000]['Name'].values
middle_citys = citys.loc[(citys['2017 Population'] >= 100000) & (citys['2017 Population'] < 1000000)]['Name'].values

df_store['big_city'] = df_store['city'].map(lambda u :True if u in big_citys else False).astype(bool)
df_store['middle_city'] = df_store['city'].map(lambda u: True if u in middle_citys else False).astype(bool)
df_store['little_city'] = df_store['city'].map(lambda u: True if u not in big_citys and u not in middle_citys else False).astype(bool)

In [10]:
df_store = df_store[['store_nbr', 'big_city','middle_city','little_city']]

In [11]:
#df_store.head()

In [12]:
big_city_dict = {}
mid_city_dict = {}
lit_city_dict = {}
for i,j in zip(df_store['store_nbr'], df_store['big_city']):
    big_city_dict[i] = j
for i,j in zip(df_store['store_nbr'], df_store['middle_city']):
    mid_city_dict[i] = j
for i,j in zip(df_store['store_nbr'], df_store['little_city']):
    lit_city_dict[i] = j

In [13]:
city_2017_train = pd.merge(df_2017, df_store, how='left', on=['store_nbr'])
city_2017_test = pd.merge(df_test, df_store, how='left', on=['store_nbr'])
#city_2017_train

In [14]:
bc_2017_train = city_2017_train.set_index(['store_nbr', 'item_nbr', 'date'])[['big_city']].unstack(level=-1)
mc_2017_train = city_2017_train.set_index(['store_nbr', 'item_nbr', 'date'])[['middle_city']].unstack(level=-1)
lc_2017_train = city_2017_train.set_index(['store_nbr', 'item_nbr', 'date'])[['little_city']].unstack(level=-1)
bc_2017_train.columns = bc_2017_train.columns.get_level_values(1)
mc_2017_train.columns = mc_2017_train.columns.get_level_values(1)
lc_2017_train.columns = lc_2017_train.columns.get_level_values(1)

In [15]:
ind = list(set(bc_2017_train.index.get_level_values(0)))
for i in ind:
    bc_2017_train.loc[i] = big_city_dict[i]
    
ind = list(set(mc_2017_train.index.get_level_values(0)))
for i in ind:
    mc_2017_train.loc[i] = mid_city_dict[i]
    
ind = list(set(lc_2017_train.index.get_level_values(0)))
for i in ind:
    lc_2017_train.loc[i] = lit_city_dict[i]

In [16]:
bc_2017_test = city_2017_test.set_index(['store_nbr', 'item_nbr', 'date'])[['big_city']].unstack(level=-1)
mc_2017_test = city_2017_test.set_index(['store_nbr', 'item_nbr', 'date'])[['middle_city']].unstack(level=-1)
lc_2017_test = city_2017_test.set_index(['store_nbr', 'item_nbr', 'date'])[['little_city']].unstack(level=-1)

bc_2017_test.columns = bc_2017_test.columns.get_level_values(1)
mc_2017_test.columns = mc_2017_test.columns.get_level_values(1)
lc_2017_test.columns = lc_2017_test.columns.get_level_values(1)

In [17]:
ind = list(set(bc_2017_test.index.get_level_values(0)))
for i in ind:
    bc_2017_test.loc[i] = big_city_dict[i]
    
ind = list(set(mc_2017_test.index.get_level_values(0)))
for i in ind:
    mc_2017_test.loc[i] = mid_city_dict[i]
    
ind = list(set(lc_2017_test.index.get_level_values(0)))
for i in ind:
    lc_2017_test.loc[i] = lit_city_dict[i]

In [18]:
bc_2017_test = bc_2017_test.reindex(bc_2017_train.index).fillna(False) # 肯定会丢掉了很多train里没有的商品
bc_2017 = pd.concat([bc_2017_train, bc_2017_test], axis=1)

mc_2017_test = mc_2017_test.reindex(mc_2017_train.index).fillna(False)
mc_2017 = pd.concat([mc_2017_train, mc_2017_test], axis=1)

lc_2017_test = lc_2017_test.reindex(lc_2017_train.index).fillna(False)
lc_2017 = pd.concat([lc_2017_train, lc_2017_test], axis=1)

In [19]:
#bc_2017

In [20]:
del bc_2017_train, bc_2017_test
del mc_2017_train, mc_2017_test
del lc_2017_train, lc_2017_test

## 处理节假日信息

In [21]:
df_holiday = pd.read_csv('database/holidays_events.csv', 
                        parse_dates = ['date'],
                        dtype={'transferred':bool})

In [22]:
holiday_2017 = df_holiday.loc[df_holiday.date >= pd.datetime(2017,1,1)]
holiday_2017 = holiday_2017.loc[holiday_2017['transferred'] == False]

firstday = date(2017,1,1)
endday = date(2017,9,1)
periods = endday - firstday
all_days = pd.date_range(firstday, periods=periods.days, freq='D')

In [23]:
weekend_tmp = map(lambda day: True if(day.dayofweek >=5) else False, all_days)
weekend = []
for i in weekend_tmp:
    weekend.append(i)   
df_weekend = pd.DataFrame({'date':all_days, 'weekend_or_holiday': weekend })

In [24]:
tmp = holiday_2017['date'].values
for i in range(len(tmp)):
    df_weekend.loc[df_weekend['date'] == tmp[i], 'weekend_or_holiday'] = True
df_weekend_and_holiday = df_weekend

In [25]:
hw_2017_train = pd.merge(df_2017, df_weekend_and_holiday, how='left', on=['date'])
hw_2017_train = hw_2017_train.set_index(['store_nbr', 'item_nbr', 'date'])[['weekend_or_holiday']].unstack(level=-1)
hw_2017_train.columns = hw_2017_train.columns.get_level_values(1)

In [26]:
holiday_zip = zip(df_weekend['date'],df_weekend['weekend_or_holiday'])
tmp_dict = {}
for d,h in holiday_zip:
    tmp_dict[d] = h

In [27]:
columns = hw_2017_train.columns
for i in columns:
    hw_2017_train[i] = tmp_dict[i]

In [28]:
#hw_2017_train

In [29]:
hw_2017_train.columns

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10',
               ...
               '2017-08-06', '2017-08-07', '2017-08-08', '2017-08-09',
               '2017-08-10', '2017-08-11', '2017-08-12', '2017-08-13',
               '2017-08-14', '2017-08-15'],
              dtype='datetime64[ns]', name='date', length=227, freq=None)

In [30]:
hw_2017_test = pd.merge(df_test, df_weekend_and_holiday, how='left', on=['date'])
hw_2017_test = hw_2017_test.set_index(['store_nbr', 'item_nbr', 'date'])[['weekend_or_holiday']].unstack(level=-1)
hw_2017_test.columns = hw_2017_test.columns.get_level_values(1)

In [31]:
columns = hw_2017_test.columns
for i in columns:
    hw_2017_test[i] = tmp_dict[i]
hw_2017_test = hw_2017_test.reindex(hw_2017_train.index)

In [32]:
hw_2017 = pd.concat([hw_2017_train, hw_2017_test], axis=1)
del hw_2017_train, hw_2017_test
#hw_2017

## 处理促销信息

In [33]:
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

In [34]:
promo_2017_train = df_2017.set_index(['store_nbr','item_nbr','date'])[['onpromotion']].unstack(level=-1).fillna(False)
#promo_2017_train

In [35]:
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_train.columns

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10',
               ...
               '2017-08-06', '2017-08-07', '2017-08-08', '2017-08-09',
               '2017-08-10', '2017-08-11', '2017-08-12', '2017-08-13',
               '2017-08-14', '2017-08-15'],
              dtype='datetime64[ns]', name='date', length=227, freq=None)

In [36]:
promo_2017_test = df_test[['onpromotion']].unstack(level=-1).fillna(False)
#promo_2017_test.head()

In [37]:
promo_2017_test.columns

MultiIndex(levels=[['onpromotion'], [2017-08-16 00:00:00, 2017-08-17 00:00:00, 2017-08-18 00:00:00, 2017-08-19 00:00:00, 2017-08-20 00:00:00, 2017-08-21 00:00:00, 2017-08-22 00:00:00, 2017-08-23 00:00:00, 2017-08-24 00:00:00, 2017-08-25 00:00:00, 2017-08-26 00:00:00, 2017-08-27 00:00:00, 2017-08-28 00:00:00, 2017-08-29 00:00:00, 2017-08-30 00:00:00, 2017-08-31 00:00:00]],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]],
           names=[None, 'date'])

In [38]:
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test.columns

DatetimeIndex(['2017-08-16', '2017-08-17', '2017-08-18', '2017-08-19',
               '2017-08-20', '2017-08-21', '2017-08-22', '2017-08-23',
               '2017-08-24', '2017-08-25', '2017-08-26', '2017-08-27',
               '2017-08-28', '2017-08-29', '2017-08-30', '2017-08-31'],
              dtype='datetime64[ns]', name='date', freq=None)

In [39]:
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

In [40]:
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)

In [41]:
del promo_2017_test, promo_2017_train

In [42]:
#promo_2017.head()

In [43]:
promo_2017.columns

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10',
               ...
               '2017-08-22', '2017-08-23', '2017-08-24', '2017-08-25',
               '2017-08-26', '2017-08-27', '2017-08-28', '2017-08-29',
               '2017-08-30', '2017-08-31'],
              dtype='datetime64[ns]', name='date', length=243, freq=None)

In [44]:
#df_2017.head()

In [45]:
df_2017 = df_2017.set_index(['store_nbr', 'item_nbr', 'date'])[['unit_sales']].unstack(level=-1).fillna(0)
#df_2017.head()

In [46]:
df_2017.columns = df_2017.columns.get_level_values(1)

## 处理商品信息

In [47]:
items['family_nbr'] = items['family'].astype('category').cat.codes
items['class_nbr'] = items['class'].astype('category').cat.codes

In [48]:
#items.head()

In [49]:
items = items.set_index('item_nbr')

In [50]:
len(items)

4100

In [51]:
items = items.reindex(df_2017.index.get_level_values(1))

In [52]:
len(items)

167515

## 处理商店信息

In [53]:
df_store = pd.read_csv('database/stores.csv')
df_store['city_nbr'] = df_store['city'].astype('category').cat.codes
df_store['state_nbr'] = df_store['state'].astype('category').cat.codes
df_store['type_nbr'] = df_store['type'].astype('category').cat.codes
df_store = df_store.reindex(df_2017.index.get_level_values(0))
#df_store.head()

## 提取数据

In [54]:
from datetime import timedelta, date
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

In [55]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        
        "city_nbr":df_store['city_nbr'].values,
        "state_nbr":df_store['state_nbr'].values,
        "type_nbr":df_store['type_nbr'].values,
        "cluster":df_store['cluster'].values,
        
        "perishable":items['perishable'].values,
        "item_family_nbr":items['family_nbr'].values,
        "item_class_nbr": items['class_nbr'].values,    
        
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        "restday_2_2017": get_timespan(hw_2017, t2017, 2, 2).sum(axis=1).values,
        "restday_7_2017": get_timespan(hw_2017, t2017, 7, 7).sum(axis=1).values,            
        "big_city":get_timespan(bc_2017, t2017, 0, 1).sum(axis=1).values.ravel(),
        "mid_city": get_timespan(mc_2017, t2017, 0, 1).sum(axis=1).values.ravel(),
        "lit_city": get_timespan(lc_2017, t2017, 0, 1).sum(axis=1).values.ravel(),
         
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_5_2017": get_timespan(df_2017, t2017, 5, 5).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        "mean_90_2017": get_timespan(df_2017, t2017, 90, 90).mean(axis=1).values,
        "mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        
        #"restday_30_2017": get_timespan(hw_2017, t2017, 30, 30).sum(axis=1).values, 
        #"restday_60_2017": get_timespan(hw_2017, t2017, 60, 60).sum(axis=1).values, 
        #"restday_90_2017": get_timespan(hw_2017, t2017, 90, 90).sum(axis=1).values, 
        
        "promo_21_2017": get_timespan(promo_2017, t2017, 21, 21).sum(axis=1).values,
        "promo_30_2017": get_timespan(promo_2017, t2017, 30, 30).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_90_2017": get_timespan(promo_2017, t2017, 90, 90).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })
    for i in range(7):
        X['mean_2_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 14-i, 2, freq='7D').mean(axis=1).values
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
        X['mean_12_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 84-i, 12, freq='7D').mean(axis=1).values

    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
       # X["rest_{}".format(i)] = hw_2017[
       #     t2017 + timedelta(days=i)].values.astype(float)
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

### 模型1 mod_0.520.csv

In [56]:
print("Preparing dataset...")
t2017 = date(2017, 5, 30)#(2017, 5, 31)
X_l, y_l = [], []
for i in range(6): # 6
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

Preparing dataset...


In [57]:
print("Training and predicting models...")
params = {
    'num_leaves': 63, # 31
    'objective': 'regression',
    'min_data_in_leaf': 250,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2, # 2
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, 
        label=y_train[:, i],  
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 6) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=200, verbose_eval=500
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.310661	valid_1's l2: 0.305192
[1000]	training's l2: 0.30403	valid_1's l2: 0.304393
Early stopping, best iteration is:
[956]	training's l2: 0.304389	valid_1's l2: 0.304316
mean_7_2017: 15159196.60
mean_14_2017: 12001223.23
mean_5_2017: 1521239.85
promo_0: 687027.57
mean_3_2017: 685842.81
day_1_2017: 659763.69
mean_30_2017: 618377.47
mean_20_dow0_2017: 450625.68
mean_12_dow0_2017: 359662.27
promo_21_2017: 188355.07
mean_4_dow6_2017: 164906.20
mean_4_dow0_2017: 162121.79
mean_60_2017: 126353.07
mean_2_dow6_2017: 79031.84
mean_140_2017: 60774.90
item_family_nbr: 57491.39
promo_1: 54684.35
mean_2_dow0_2017: 51413.34
item_class_nbr: 46550.82
mean_90_2017: 36953.23
restday_2_2017: 32801.34
promo_30_2017: 32558.78
mean_12_dow6_2017: 29640.54
mean_20_dow6_2017: 26169.97
promo_7: 19444.06
mean_20_dow2_2017: 19334.81
promo_140_2017: 18411.66
cluster: 18025.11
mean_4_dow2_2017: 16486.77
promo_4: 16205.16
city_nbr

Step 6
Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.358017	valid_1's l2: 0.36476
Early stopping, best iteration is:
[569]	training's l2: 0.355527	valid_1's l2: 0.364588
mean_4_dow5_2017: 10900357.08
mean_30_2017: 10265984.40
mean_14_2017: 9226919.73
mean_12_dow5_2017: 3537693.69
mean_3_2017: 2636683.12
mean_5_2017: 1601899.42
mean_20_dow5_2017: 1267641.28
mean_7_2017: 918820.70
promo_5: 690733.74
mean_2_dow5_2017: 163730.47
restday_7_2017: 159475.45
restday_2_2017: 130116.59
promo_21_2017: 116006.31
promo_6: 74210.98
promo_4: 72397.43
promo_8: 67478.99
mean_60_2017: 64268.39
item_family_nbr: 61358.68
item_class_nbr: 56251.17
promo_30_2017: 47506.84
mean_4_dow4_2017: 42760.72
promo_3: 35279.41
promo_7: 32545.97
city_nbr: 30975.14
type_nbr: 30078.70
promo_1: 30072.89
day_1_2017: 29412.43
promo_140_2017: 23126.72
cluster: 22369.68
promo_60_2017: 21223.28
promo_90_2017: 19439.75
promo_2: 19101.58
perishable: 17105.85
mean_140_2017: 16821.66
mean_2_d

Step 11
Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.343851	valid_1's l2: 0.396108
Early stopping, best iteration is:
[406]	training's l2: 0.347426	valid_1's l2: 0.395318
mean_30_2017: 10010543.62
mean_14_2017: 7936244.33
mean_12_dow3_2017: 6059948.43
mean_7_2017: 2124676.32
mean_4_dow3_2017: 1713434.98
promo_10: 1240283.85
mean_20_dow3_2017: 1135114.93
mean_5_2017: 799108.62
mean_60_2017: 393187.13
promo_21_2017: 119855.79
promo_11: 102758.90
promo_3: 97248.07
item_class_nbr: 93944.04
mean_12_dow2_2017: 86240.32
promo_30_2017: 79772.08
mean_3_2017: 79152.95
item_family_nbr: 75103.37
promo_9: 59582.08
promo_8: 57094.48
promo_140_2017: 54465.91
promo_15: 49982.39
promo_90_2017: 44996.57
mean_4_dow2_2017: 42347.63
mean_2_dow3_2017: 39805.04
mean_20_dow2_2017: 35890.27
promo_13: 33715.98
mean_20_dow5_2017: 29264.07
mean_4_dow1_2017: 28492.98
day_1_2017: 28057.47
promo_12: 27223.10
mean_4_dow4_2017: 26635.65
promo_60_2017: 25686.67
perishable: 20179

Step 16
Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.342689	valid_1's l2: 0.378041
Early stopping, best iteration is:
[422]	training's l2: 0.345428	valid_1's l2: 0.377628
mean_30_2017: 14133384.49
mean_14_2017: 6562064.95
mean_12_dow1_2017: 3057295.17
mean_7_2017: 1945608.92
promo_15: 1849866.39
mean_60_2017: 1677009.56
mean_20_dow1_2017: 1064976.16
mean_4_dow1_2017: 347451.40
mean_5_2017: 271305.53
day_1_2017: 139545.35
promo_8: 123515.40
promo_14: 122657.62
item_family_nbr: 117573.09
item_class_nbr: 115939.95
promo_30_2017: 89632.70
promo_21_2017: 81197.07
promo_1: 78523.10
promo_13: 69162.23
mean_140_2017: 67700.76
mean_3_2017: 67160.09
promo_140_2017: 55460.76
promo_11: 48376.47
mean_20_dow3_2017: 42703.88
mean_2_dow1_2017: 39613.40
mean_4_dow6_2017: 38560.22
promo_90_2017: 35242.14
mean_90_2017: 34699.32
promo_60_2017: 27561.35
mean_20_dow2_2017: 27215.44
mean_12_dow3_2017: 25401.25
promo_10: 22653.76
perishable: 22338.83
mean_4_dow3_2017: 

In [58]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

Making submission...


In [59]:
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('submission/model1.csv', float_format='%.4f', index=None)

### 模型2 0.512

In [60]:
print("Preparing dataset...")
t2017 = date(2017, 5, 24)#(2017, 5, 31)
X_l, y_l = [], []
for i in range(8): # 6
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

Preparing dataset...


In [61]:
print("Training and predicting models...")
params = {
    'num_leaves': 63, # 31
    'objective': 'regression',
    'min_data_in_leaf': 250,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2, # 2
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, 
        label=y_train[:, i],  
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 8) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=200, verbose_eval=500
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.29836	valid_1's l2: 0.294096
[1000]	training's l2: 0.292461	valid_1's l2: 0.291192
[1500]	training's l2: 0.28953	valid_1's l2: 0.290312
[2000]	training's l2: 0.287202	valid_1's l2: 0.2899
[2500]	training's l2: 0.28519	valid_1's l2: 0.289587
[3000]	training's l2: 0.283312	valid_1's l2: 0.289388
[3500]	training's l2: 0.281581	valid_1's l2: 0.289276
[4000]	training's l2: 0.279896	valid_1's l2: 0.289179
[4500]	training's l2: 0.278269	valid_1's l2: 0.289078
[5000]	training's l2: 0.276712	valid_1's l2: 0.288997
mean_7_2017: 21442804.11
mean_14_2017: 16766496.50
mean_5_2017: 2090108.49
promo_0: 1340983.49
mean_30_2017: 1231295.24
day_1_2017: 712719.02
mean_20_dow0_2017: 645326.15
mean_4_dow0_2017: 496300.32
mean_3_2017: 406616.30
mean_12_dow0_2017: 398978.69
promo_21_2017: 274115.55
mean_60_2017: 245327.62
promo_7: 122004.04
item_family_nbr: 116129.73
item_class_nbr: 115206.10
mean_2_dow0_2017: 95787.95
mean

Step 5
Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.354273	valid_1's l2: 0.355304
[1000]	training's l2: 0.344929	valid_1's l2: 0.349794
[1500]	training's l2: 0.340451	valid_1's l2: 0.347945
[2000]	training's l2: 0.337155	valid_1's l2: 0.346997
[2500]	training's l2: 0.334331	valid_1's l2: 0.346388
[3000]	training's l2: 0.331803	valid_1's l2: 0.346007
[3500]	training's l2: 0.329506	valid_1's l2: 0.345642
[4000]	training's l2: 0.327374	valid_1's l2: 0.345414
[4500]	training's l2: 0.325331	valid_1's l2: 0.345233
[5000]	training's l2: 0.32336	valid_1's l2: 0.345072
mean_14_2017: 21967270.89
mean_4_dow4_2017: 11618622.05
mean_5_2017: 8464087.33
mean_12_dow4_2017: 3989310.59
mean_30_2017: 3985029.17
mean_20_dow4_2017: 1576926.15
mean_7_2017: 1198932.47
mean_3_2017: 978550.22
promo_4: 934711.61
mean_2_dow4_2017: 242754.09
promo_21_2017: 235626.91
item_class_nbr: 197028.90
restday_7_2017: 189849.30
restday_2_2017: 141236.59
mean_60_2017: 131349.90
mean_4

Step 9
Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.347441	valid_1's l2: 0.380789
Early stopping, best iteration is:
[675]	training's l2: 0.343643	valid_1's l2: 0.380077
mean_30_2017: 17408938.11
mean_14_2017: 9336512.05
mean_7_2017: 4116493.86
mean_60_2017: 1885128.29
mean_12_dow1_2017: 1315500.48
promo_8: 1234154.58
mean_20_dow1_2017: 680920.57
promo_21_2017: 185835.63
mean_5_2017: 185238.39
mean_90_2017: 106763.65
promo_7: 103810.95
item_class_nbr: 85928.14
mean_3_2017: 85920.62
item_family_nbr: 77629.21
promo_30_2017: 76306.35
promo_10: 74378.60
day_1_2017: 71511.17
mean_12_dow2_2017: 63088.04
mean_4_dow1_2017: 61756.46
mean_140_2017: 58641.89
mean_2_dow6_2017: 56684.62
mean_20_dow2_2017: 47945.91
promo_90_2017: 37797.59
promo_60_2017: 35436.30
promo_140_2017: 33840.66
promo_12: 32987.28
promo_9: 31338.70
restday_2_2017: 29321.80
mean_20_dow4_2017: 29187.61
promo_11: 27226.01
mean_4_dow5_2017: 26418.54
promo_13: 26076.99
promo_3: 25643.68
pr

Step 13
Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.372636	valid_1's l2: 0.376512
[1000]	training's l2: 0.364154	valid_1's l2: 0.374438
[1500]	training's l2: 0.360042	valid_1's l2: 0.374003
Early stopping, best iteration is:
[1765]	training's l2: 0.358259	valid_1's l2: 0.373932
mean_30_2017: 22929137.48
mean_14_2017: 7565265.55
mean_60_2017: 3614096.54
mean_7_2017: 2381601.44
promo_12: 1283842.53
mean_3_2017: 1140814.80
mean_12_dow5_2017: 1013327.48
mean_4_dow5_2017: 786770.88
mean_20_dow5_2017: 741609.48
mean_5_2017: 319834.01
item_class_nbr: 204529.39
promo_13: 199109.70
promo_21_2017: 171170.78
promo_14: 141685.66
item_family_nbr: 140738.94
promo_10: 138150.96
promo_30_2017: 100393.43
mean_140_2017: 89374.29
mean_20_dow0_2017: 87400.84
promo_140_2017: 59157.37
mean_90_2017: 56042.06
mean_4_dow6_2017: 55663.26
promo_11: 53699.95
day_1_2017: 50221.06
mean_20_dow6_2017: 48459.79
promo_60_2017: 46807.59
mean_12_dow6_2017: 46376.29
promo_7: 45055

Validation mse: 0.360791041812


In [62]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

Making submission...


In [63]:
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('submission/model1.csv', float_format='%.4f', index=None)

## 模型3 0.524

In [None]:
print("Preparing dataset...")
t2017 = date(2017, 5, 22)#(2017, 5, 31)
X_l, y_l = [], []
for i in range(10): # 6
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

Preparing dataset...


In [None]:
print("Training and predicting models...")
params = {
    'num_leaves': 63, # 31
    'objective': 'regression',
    'min_data_in_leaf': 250,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2, # 2
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, 
        label=y_train[:, i],  
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 10) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=200, verbose_eval=500
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.320805	valid_1's l2: 0.304355
Early stopping, best iteration is:
[716]	training's l2: 0.317445	valid_1's l2: 0.302361
mean_14_2017: 27413070.59
mean_7_2017: 22874288.33
day_1_2017: 1643050.13
mean_5_2017: 951494.27
promo_0: 683109.81
mean_30_2017: 651209.07
mean_20_dow0_2017: 541886.22
mean_12_dow0_2017: 474708.05
mean_3_2017: 418218.91
mean_60_2017: 329698.62
promo_21_2017: 235178.69
mean_4_dow0_2017: 200211.58
promo_1: 105784.06
mean_140_2017: 86426.27
mean_2_dow6_2017: 82977.25
mean_2_dow0_2017: 74655.32
item_class_nbr: 59585.59
mean_12_dow1_2017: 57015.90
mean_90_2017: 50725.14
item_family_nbr: 42196.79
promo_30_2017: 41058.93
promo_2: 38940.46
mean_20_dow1_2017: 37737.21
mean_4_dow1_2017: 36296.95
promo_5: 27788.55
type_nbr: 26955.65
city_nbr: 26882.20
mean_20_dow2_2017: 26519.73
mean_20_dow5_2017: 26220.06
promo_140_2017: 19473.34
mean_12_dow2_2017: 18551.63
mean_20_dow6_2017: 18222.21
cluster: 

Step 7
Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.360983	valid_1's l2: 0.393512
Early stopping, best iteration is:
[497]	training's l2: 0.361071	valid_1's l2: 0.393485
mean_4_dow6_2017: 28430982.13
mean_14_2017: 13482734.69
mean_30_2017: 8784978.41
mean_12_dow6_2017: 3989914.96
mean_3_2017: 3894013.67
mean_7_2017: 3429582.57
mean_20_dow6_2017: 1641316.33
promo_6: 1213791.39
mean_2_dow6_2017: 1069463.29
day_1_2017: 723113.57
mean_5_2017: 355500.74
mean_4_dow5_2017: 208476.74
restday_7_2017: 206292.96
promo_21_2017: 167912.00
promo_7: 127993.12
promo_9: 110869.79
promo_0: 101945.62
promo_5: 95345.64
mean_60_2017: 93468.87
item_family_nbr: 93368.09
promo_30_2017: 66799.82
item_class_nbr: 57363.65
promo_8: 56834.52
promo_2: 49347.72
city_nbr: 47080.31
mean_2_dow5_2017: 43400.32
promo_4: 40964.62
mean_4_dow0_2017: 39536.17
mean_140_2017: 38067.37
type_nbr: 29901.48
promo_140_2017: 29686.47
perishable: 28141.40
promo_3: 26310.44
cluster: 24908.34
pr

Step 12
Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.355146	valid_1's l2: 0.407438
Early stopping, best iteration is:
[405]	training's l2: 0.358344	valid_1's l2: 0.406826
mean_30_2017: 21990155.59
mean_14_2017: 10225092.32
mean_12_dow4_2017: 8736921.14
mean_5_2017: 4320295.55
mean_4_dow4_2017: 2206602.05
promo_11: 2023630.53
mean_20_dow4_2017: 1492273.77
mean_7_2017: 1178290.92
mean_60_2017: 411644.36
mean_3_2017: 254773.17
promo_12: 234462.46
promo_21_2017: 155832.35
item_class_nbr: 143873.96
promo_4: 143376.19
promo_30_2017: 129745.96
item_family_nbr: 125496.35
promo_9: 121881.00
promo_140_2017: 93171.67
mean_4_dow5_2017: 87813.01
mean_12_dow3_2017: 75983.19
promo_60_2017: 69842.71
mean_4_dow3_2017: 66303.39
promo_14: 58499.58
mean_20_dow3_2017: 55252.86
perishable: 52644.50
promo_0: 40111.86
mean_20_dow6_2017: 38528.48
mean_2_dow4_2017: 37947.26
promo_13: 36840.98
promo_15: 36181.88
promo_90_2017: 35972.91
day_1_2017: 35808.15
promo_10: 34744

Validation mse: 0.374468686299


In [None]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('submission/model3.csv', float_format='%.4f', index=None)

Making submission...


日期：2017-12-30
## 模型4 0.534
以上三个模型保持特征不变，不断地增加训练数据，下面的模型开始修改特征

In [57]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        
        "city_nbr":df_store['city_nbr'].values,
        "state_nbr":df_store['state_nbr'].values,
        "type_nbr":df_store['type_nbr'].values,
        "cluster":df_store['cluster'].values,
        
        "perishable":items['perishable'].values,
        "item_family_nbr":items['family_nbr'].values,
        "item_class_nbr": items['class_nbr'].values,    
        
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        "restday_2_2017": get_timespan(hw_2017, t2017, 2, 2).sum(axis=1).values,
        "restday_7_2017": get_timespan(hw_2017, t2017, 7, 7).sum(axis=1).values,            
#        "big_city":get_timespan(bc_2017, t2017, 0, 1).sum(axis=1).values.ravel(),
#        "mid_city": get_timespan(mc_2017, t2017, 0, 1).sum(axis=1).values.ravel(),
#        "lit_city": get_timespan(lc_2017, t2017, 0, 1).sum(axis=1).values.ravel(),
         
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_5_2017": get_timespan(df_2017, t2017, 5, 5).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "mean_21_2017": get_timespan(df_2017, t2017, 21, 21).mean(axis=1).values,
        "mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        "mean_90_2017": get_timespan(df_2017, t2017, 90, 90).mean(axis=1).values,
        "mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        
        "restday_30_2017": get_timespan(hw_2017, t2017, 30, 30).sum(axis=1).values, 
        "restday_60_2017": get_timespan(hw_2017, t2017, 60, 60).sum(axis=1).values, 
        "restday_90_2017": get_timespan(hw_2017, t2017, 90, 90).sum(axis=1).values, 
        
        "promo_21_2017": get_timespan(promo_2017, t2017, 21, 21).sum(axis=1).values,
        "promo_30_2017": get_timespan(promo_2017, t2017, 30, 30).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_90_2017": get_timespan(promo_2017, t2017, 90, 90).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })
    for i in range(7):
        X['mean_2_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 14-i, 2, freq='7D').mean(axis=1).values
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
        X['mean_12_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 84-i, 12, freq='7D').mean(axis=1).values

    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
        X["rest_{}".format(i)] = hw_2017[
            t2017 + timedelta(days=i)].values.astype(float)
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [58]:
print("Preparing dataset...")
t2017 = date(2017, 5, 21)#(2017, 5, 31)
X_l, y_l = [], []
for i in range(9): # 6
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

Preparing dataset...


In [60]:
print("Training and predicting models...")
params = {
    'num_leaves': 63, # 31
    'objective': 'regression',
    'min_data_in_leaf': 250,
    'learning_rate': 0.03,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2, # 2
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, 
        label=y_train[:, i],  
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 9) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=200, verbose_eval=500
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.306689	valid_1's l2: 0.319137
Early stopping, best iteration is:
[689]	training's l2: 0.30341	valid_1's l2: 0.318474
mean_7_2017: 9606888.98
mean_14_2017: 8719647.65
mean_12_dow0_2017: 761429.00
day_1_2017: 708494.03
mean_20_dow0_2017: 519560.61
mean_4_dow0_2017: 418029.48
mean_2_dow6_2017: 352734.50
promo_0: 240138.84
mean_3_2017: 140810.43
mean_21_2017: 131676.70
mean_2_dow0_2017: 82215.82
promo_21_2017: 58140.14
mean_5_2017: 47979.69
mean_30_2017: 45392.62
rest_15: 43841.36
item_class_nbr: 41910.42
promo_1: 32293.77
city_nbr: 27148.86
item_family_nbr: 27137.51
mean_4_dow6_2017: 25941.27
mean_20_dow5_2017: 25410.34
mean_60_2017: 24143.88
cluster: 22528.37
type_nbr: 21556.99
promo_2: 21239.44
mean_140_2017: 18043.38
promo_30_2017: 15052.57
restday_30_2017: 13053.70
mean_4_dow1_2017: 12905.67
state_nbr: 12678.83
mean_20_dow4_2017: 12575.15
restday_60_2017: 12306.96
promo_7: 11523.57
restday_2_2017: 11

Step 5
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[148]	training's l2: 0.334129	valid_1's l2: 0.431338
mean_14_2017: 6286692.73
mean_5_2017: 2364489.65
mean_30_2017: 1489884.55
mean_21_2017: 1248020.83
mean_7_2017: 808898.04
mean_3_2017: 612999.93
promo_4: 361059.07
mean_12_dow4_2017: 325586.77
mean_60_2017: 233724.55
mean_20_dow4_2017: 219343.74
promo_21_2017: 62968.89
restday_90_2017: 59884.84
day_1_2017: 38106.94
mean_4_dow4_2017: 34775.34
mean_90_2017: 27700.40
promo_6: 22585.14
promo_3: 21939.35
promo_0: 19174.72
mean_4_dow5_2017: 16447.83
mean_20_dow5_2017: 15293.55
mean_2_dow6_2017: 15165.24
promo_30_2017: 15060.34
mean_2_dow5_2017: 13558.46
item_family_nbr: 12543.00
mean_140_2017: 12505.28
promo_7: 11658.01
promo_1: 11437.84
promo_8: 10790.47
restday_60_2017: 10448.34
item_class_nbr: 10239.10
mean_2_dow4_2017: 9011.45
promo_60_2017: 8435.39
mean_12_dow5_2017: 8341.86
mean_20_dow0_2017: 7256.42
promo_90_2017: 6910.69
promo_1

Step 9
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[121]	training's l2: 0.367176	valid_1's l2: 0.375594
mean_14_2017: 5063685.97
mean_21_2017: 4344214.76
mean_7_2017: 2995007.52
mean_30_2017: 1560175.28
mean_60_2017: 729688.38
promo_8: 398094.91
mean_12_dow1_2017: 264207.83
mean_20_dow1_2017: 254445.09
mean_90_2017: 114916.17
mean_4_dow1_2017: 103789.04
day_1_2017: 56282.93
promo_21_2017: 55886.35
mean_3_2017: 52591.98
mean_5_2017: 51959.10
promo_6: 46278.67
promo_9: 43178.93
mean_140_2017: 34982.21
restday_2_2017: 33454.77
promo_10: 33423.88
item_family_nbr: 19671.99
promo_30_2017: 18206.96
item_class_nbr: 17794.43
mean_20_dow2_2017: 13536.52
promo_1: 11496.17
mean_12_dow2_2017: 10870.97
rest_8: 10075.29
mean_2_dow1_2017: 8813.55
promo_60_2017: 8574.25
mean_4_dow2_2017: 7787.43
promo_140_2017: 7022.12
promo_3: 5980.89
promo_5: 5791.39
promo_0: 5688.32
mean_20_dow3_2017: 5187.81
promo_7: 4983.69
perishable: 4046.33
restday_90_2017: 

Step 13
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[143]	training's l2: 0.35677	valid_1's l2: 0.384316
mean_30_2017: 5124863.66
mean_21_2017: 3826158.98
mean_12_dow5_2017: 2462121.36
mean_5_2017: 1211007.14
mean_14_2017: 910904.34
promo_12: 619658.38
mean_4_dow5_2017: 599843.93
mean_20_dow5_2017: 573136.93
mean_3_2017: 475981.12
mean_60_2017: 99038.72
promo_13: 67225.27
promo_30_2017: 56660.90
day_1_2017: 50111.47
item_class_nbr: 46382.89
promo_10: 42767.35
promo_5: 41589.63
promo_21_2017: 38416.77
item_family_nbr: 36544.35
mean_7_2017: 31188.69
promo_15: 27355.66
promo_140_2017: 25811.09
mean_12_dow4_2017: 22481.99
mean_4_dow6_2017: 19088.84
mean_20_dow4_2017: 16553.35
mean_2_dow5_2017: 15975.27
rest_5: 14767.54
promo_60_2017: 14693.13
perishable: 13906.24
mean_4_dow3_2017: 13138.98
promo_90_2017: 13040.76
promo_11: 12597.07
promo_14: 12257.56
mean_140_2017: 11092.60
mean_20_dow0_2017: 11053.08
promo_0: 10568.76
restday_30_2017: 1

Validation mse: 0.382866898334


In [61]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('submission/model4.csv', float_format='%.4f', index=None)

Making submission...


## 模型5 0.534

In [62]:
print("Training and predicting models...")
params = {
    'num_leaves': 63, # 31
    'objective': 'regression',
    'min_data_in_leaf': 250,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2, # 2
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, 
        label=y_train[:, i],  
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * 9) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=200, verbose_eval=500
    )
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1




Training until validation scores don't improve for 200 rounds.
[500]	training's l2: 0.301565	valid_1's l2: 0.319481
Early stopping, best iteration is:
[561]	training's l2: 0.300191	valid_1's l2: 0.319151
mean_7_2017: 5601070.96
mean_14_2017: 5573371.04
mean_12_dow0_2017: 471938.25
day_1_2017: 415602.23
mean_20_dow0_2017: 325584.59
mean_4_dow0_2017: 258811.36
mean_2_dow6_2017: 209858.97
promo_0: 146319.55
mean_3_2017: 85371.84
mean_2_dow0_2017: 47591.73
mean_21_2017: 45068.31
promo_21_2017: 35529.54
item_class_nbr: 28953.87
rest_15: 27504.56
mean_5_2017: 25715.12
promo_1: 18753.20
city_nbr: 17669.38
mean_20_dow5_2017: 16987.29
mean_30_2017: 16479.66
item_family_nbr: 16430.21
mean_60_2017: 15549.18
cluster: 14410.38
type_nbr: 12999.03
promo_2: 12945.29
mean_4_dow6_2017: 12089.69
mean_140_2017: 11298.71
promo_30_2017: 9973.42
mean_4_dow1_2017: 8291.30
restday_2_2017: 8280.07
restday_30_2017: 8244.06
mean_20_dow4_2017: 7979.47
state_nbr: 7592.09
rest_8: 7387.85
promo_7: 7068.45
restday_60_

Step 5
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[93]	training's l2: 0.333638	valid_1's l2: 0.4306
mean_14_2017: 3933173.46
mean_5_2017: 1490401.35
mean_30_2017: 965002.97
mean_21_2017: 589696.63
mean_3_2017: 387485.58
mean_7_2017: 378691.21
promo_4: 219912.63
mean_12_dow4_2017: 203591.58
mean_60_2017: 151129.96
mean_20_dow4_2017: 139615.75
promo_21_2017: 38289.74
restday_90_2017: 36383.76
mean_4_dow4_2017: 22681.63
day_1_2017: 20954.57
mean_90_2017: 19035.71
promo_6: 13825.49
promo_3: 13188.62
promo_0: 11201.08
mean_2_dow6_2017: 10271.54
promo_30_2017: 9959.39
mean_20_dow5_2017: 8875.32
item_class_nbr: 7695.65
promo_7: 7618.45
promo_1: 7320.45
mean_2_dow5_2017: 7202.62
mean_4_dow5_2017: 7200.24
item_family_nbr: 7105.68
mean_140_2017: 7075.19
restday_60_2017: 6435.30
promo_60_2017: 6184.92
mean_2_dow4_2017: 5669.54
promo_8: 5573.02
promo_5: 4973.18
mean_12_dow5_2017: 4693.49
mean_20_dow0_2017: 4161.85
promo_140_2017: 3995.43
promo

Step 9
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[77]	training's l2: 0.366003	valid_1's l2: 0.376002
mean_14_2017: 3236953.28
mean_21_2017: 2678691.47
mean_7_2017: 1596644.61
mean_30_2017: 975893.72
mean_60_2017: 378422.66
promo_8: 244774.81
mean_12_dow1_2017: 177313.00
mean_20_dow1_2017: 142369.01
mean_90_2017: 85059.54
mean_4_dow1_2017: 70325.57
day_1_2017: 38048.19
mean_5_2017: 37760.44
mean_3_2017: 33255.34
promo_21_2017: 32460.05
promo_6: 29070.03
promo_9: 27598.91
restday_2_2017: 20198.19
promo_10: 19837.61
mean_140_2017: 19113.83
promo_30_2017: 13769.80
item_family_nbr: 13300.25
item_class_nbr: 11103.92
mean_20_dow2_2017: 8862.02
promo_1: 8274.70
mean_12_dow2_2017: 7471.55
rest_8: 6234.92
promo_60_2017: 5207.77
mean_2_dow1_2017: 5003.20
promo_140_2017: 4032.53
promo_3: 3650.30
promo_5: 3522.90
perishable: 3382.46
mean_4_dow2_2017: 3353.75
mean_20_dow3_2017: 3288.88
restday_90_2017: 3099.77
promo_0: 2993.68
promo_7: 2712.39


Step 13
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[88]	training's l2: 0.356307	valid_1's l2: 0.38454
mean_30_2017: 3157479.74
mean_21_2017: 2367728.47
mean_12_dow5_2017: 1478097.88
mean_5_2017: 717185.35
mean_14_2017: 456654.17
promo_12: 375100.79
mean_20_dow5_2017: 345451.01
mean_4_dow5_2017: 344120.93
mean_3_2017: 310237.36
mean_60_2017: 80052.91
promo_13: 38382.73
promo_30_2017: 35128.88
day_1_2017: 28682.27
item_class_nbr: 28306.06
promo_10: 26799.62
mean_7_2017: 26175.72
promo_5: 24000.08
promo_21_2017: 23562.80
item_family_nbr: 22245.95
promo_15: 18662.05
promo_140_2017: 16215.48
mean_12_dow4_2017: 16201.17
mean_4_dow6_2017: 13398.85
mean_2_dow5_2017: 9831.22
promo_11: 9548.21
mean_20_dow4_2017: 9506.88
perishable: 9136.03
promo_60_2017: 8973.07
rest_5: 8668.61
promo_90_2017: 7803.12
promo_14: 7481.22
promo_0: 7226.79
mean_4_dow3_2017: 7208.67
mean_140_2017: 6626.08
mean_20_dow0_2017: 6617.08
restday_30_2017: 6284.31
promo_1

Validation mse: 0.383081087301


In [63]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('submission/model5.csv', float_format='%.4f', index=None)

Making submission...
