In [1]:
from datetime import date, timedelta
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm, tnrange

from sklearn.metrics import mean_squared_error
import lightgbm as lgb

import mlflow
import mlflow.sklearn

from config import (
    RAW_DATA_DIR,
    FEATURE_DIR,
    LAG_DICT,
    SLIDING_DICT
)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# solve lightgbm error on MAC
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [3]:
# load data
df_train = pd.read_csv(
    RAW_DATA_DIR+'train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)

df_test = pd.read_csv(
    RAW_DATA_DIR+'test.csv', usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

items = pd.read_csv(
    RAW_DATA_DIR+'items.csv',
).set_index("item_nbr")

stores = pd.read_csv(
    RAW_DATA_DIR+'stores.csv',
).set_index("store_nbr")

transactions_df = pd.read_csv(
    RAW_DATA_DIR+'transactions.csv'
)

transactions_df = pd.read_csv(
    RAW_DATA_DIR+'transactions.csv'
)

oil_df = pd.read_csv(
    RAW_DATA_DIR+'oil.csv',
    parse_dates=['date'])

### Test Period

2017-08-16 to 2017-08-31

In [4]:
test_start = date(2017, 8, 16)
test_end = date(2017,8, 31)

In [5]:
valid_start = test_start - timedelta(16)
while(1):
    if valid_start.weekday() == test_start.weekday():
        break
    valid_start = valid_start-timedelta(days=1)
valid_end = valid_start + timedelta(15)
print('valid starts from {} to {}'.format(valid_start, valid_end))

valid starts from 2017-07-26 to 2017-08-10


### Valid Period

Considering the more nearer peiods of sales data may have more in common, it would be better to find the nearest period as valid period.

Based on the analysis before, we assume the sales data is periodically with the frequency of 7 days, so we want to keep that feature same
in the train, valid and test period.

So finally, we choose valid period:

2017-07-26 to 2017-08-10


In [6]:
valid_start = date(2017, 7, 26)
valid_end = date(2017, 8, 10)

### Filter Period

#### Earthquake happended on April 16, 2016. It may affect for the next several weeks.

In [7]:
# filter the period which is affected by earthquake.
filter_date = date(2016,4,16) + timedelta(7*4)
lag_max = 140
train_start=  filter_date+timedelta(days=lag_max)

while(1):
    train_start = train_start + timedelta(1)
    if train_start.weekday() == valid_start.weekday():
        break
print('train datasets starts from {}'.format(train_start))

train datasets starts from 2016-10-05


In [8]:
train_start = date(2017,2,8)

### Wages in the public sector are paid every two weeks on the 15 th and on the last day of the month. Supermarket sales could be affected by this.


In [9]:
df_train = df_train[df_train['date']>=filter_date]

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  """Entry point for launching an IPython kernel.


#### Promo feature

In [10]:
promo_train = df_train.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]]

# missing onpromotions filling
promo_train = promo_train.unstack(level=-1).fillna(False)
promo_train.columns = promo_train.columns.get_level_values(1)

In [11]:
# missing test onpromotions filling
promo_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_test.columns = promo_test.columns.get_level_values(1)
# filter those items/stores in promo_test but not in promo_train
promo_test = promo_test.reindex(promo_train.index).fillna(False)

In [12]:
promo_features = pd.concat([promo_train, promo_test], axis=1)
del promo_test, promo_train

In [13]:
transactions_df = pd.read_csv(
    RAW_DATA_DIR+'transactions.csv'
)

## Label

In [14]:
# label
df_train = df_train.set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(level=-1).fillna(0)
# tmp = df_train.set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(level=-1).fillna(0)

## Transactions

In [16]:
transactions_df['date'] = pd.to_datetime(transactions_df['date'])
transactions_df = transactions_df.set_index(["store_nbr", "date"])[["transactions"]].unstack(level=-1).fillna(0)
transactions_df = transactions_df.reindex(df_train.index.get_level_values(0))
transactions_df.columns = transactions_df.columns.get_level_values(1)

## Oil

In [45]:
oil_df = pd.read_csv(
    RAW_DATA_DIR+'oil.csv',
    parse_dates=['date'])

In [46]:
oil_df = oil_df.fillna(oil_df.dcoilwtico.mean())

In [52]:
oil_df.head()

            date      
dcoilwtico  2013-01-01    67.714366
            2013-01-02    93.140000
            2013-01-03    92.970000
            2013-01-04    93.120000
            2013-01-07    93.200000
dtype: float64

In [48]:
oil_df = oil_df.set_index(["date"])

In [51]:
oil_df = oil_df.unstack(level=0)

In [43]:
oil_df = oil_df.set_index(["date"])[["dcoilwtico"]].unstack(level=0).fillna(0)

In [44]:
oil_df.head()

            date      
dcoilwtico  2013-01-01    67.714366
            2013-01-02    93.140000
            2013-01-03    92.970000
            2013-01-04    93.120000
            2013-01-07    93.200000
dtype: float64

In [35]:
oil_df.columns = oil_df.columns.get_level_values(1)

AttributeError: 'Series' object has no attribute 'columns'

In [None]:
# # oil_df
# oil_df = oil_df.reindex(df_train.index.get_level_values(0))
# oil_df.columns = oil_df.columns.get_level_values(1)

## Item

In [17]:
items = items.reindex(df_train.index.get_level_values(1))

#### Item Family

In [18]:
items['family'] = items['family'].astype('category')
item_family_features = items.family.cat.codes.values

#### Item's class

In [46]:
items['class'] = items['class'].astype('category')
item_class_features = items['class'].cat.codes.values

## Store

In [47]:
stores = stores.reindex(df_train.index.get_level_values(0))

#### Store's city

In [48]:
stores['city'] = stores['city'].astype('category')
store_city_features = stores['city'].cat.codes.values

#### Store's state

In [49]:
stores['state'] = stores['state'].astype('category')
store_state_features = stores['state'].cat.codes.values

#### Store's type

In [50]:
stores['type'] = stores['type'].astype('category')
store_type_features = stores['type'].cat.codes.values

#### Store's cluster

In [51]:
stores['cluster'] = stores['cluster'].astype('category')
store_cluster_features = stores['cluster'].cat.codes.values

In [52]:
df_train.columns = df_train.columns.get_level_values(1)

#### Filling missing date

In [53]:
date_list = df_train.columns
obj_list = pd.date_range(filter_date, test_start-timedelta(1))
diff_list = list(set(obj_list) - set(date_list)) 
for i in diff_list:
    print(i)
    df_train[i] = 0

2016-12-25 00:00:00


In [54]:
date_list = promo_features.columns
obj_list = pd.date_range(filter_date, test_end)
diff_list = list(set(obj_list) - set(date_list)) 
for i in diff_list:
    print(i)
    promo_features[i] = 0

2016-12-25 00:00:00


#### Lagging and sliding windows

In [55]:
LAG_DICT = {'unit_sales': [1,2,3,4,5,6,7,14,21,28,35,42,49,56,63],
            'onpromotion': [2, 3,4,5,6, 7, 14, 21],
            'transactions': [1, 2, 3, 4, 5, 6, 7, 14, 21]}

SLIDING_DICT = {'unit_sales': [3, 4, 5, 6, 7, 14, 21, 30, 60, 63]}



# initialise dirs
RAW_DATA_DIR = 'datasets/'

In [57]:
def get_timespan(df, 
                 start_time,
                 minus,
                 periods,
                 freq='D'):
    return df[pd.date_range(start_time - timedelta(days=minus), periods=periods, freq=freq)]

def gen_dataset(df,
                promo_features,
                item_family_features,
                item_class_features,
                store_city_features,
                store_state_features,
                store_type_features,
                store_cluster_features,
                transactions_df,
                start_time,
                is_train=True):
    # init
    X = pd.DataFrame()

    for i in LAG_DICT['unit_sales']:
        X['lag_{}_sales'.format(i)] = get_timespan(df, start_time, i, 1).values.ravel()
    
    for i in LAG_DICT['onpromotion']:
        X['sum_{}_promo'.format(i)] = get_timespan(promo_features, start_time, i, 1).sum(axis=1).ravel()

    for i in SLIDING_DICT['unit_sales']:
        X["mean_{}_sales".format(i)] = get_timespan(df, start_time, i, i).mean(axis=1).values
        X["std_{}_sales".format(i)] = get_timespan(df, start_time, i, i).std(axis=1).values

    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df, start_time, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df, start_time, 140-i, 20, freq='7D').mean(axis=1).values

    for i in LAG_DICT['transactions']:
        X['lag_{}_transactions'.format(i)] = get_timespan(transactions_df, start_time, i, 1).values.ravel()

    # for the next to-predict 16 days 
    for i in range(16):
        X["promo_{}".format(i)] = promo_features[start_time + timedelta(days=i)].values.astype(np.uint8)

    X['item_family_features'] = item_family_features

    X['item_class_features'] = item_class_features

    X['store_city_features'] = store_city_features

    X['store_state_features'] = store_state_features

    X['store_type_features'] = store_type_features

    X['store_cluster_features'] = store_cluster_features
        
    if is_train:
        y = df[pd.date_range(start_time, periods=16)].values
        return X, y
    return X


#### Generate train, valid and test sets

In [76]:
print("Preparing dataset...")

nbr_weeks = int((valid_start - train_start).days/7)

X_l, y_l = [], []

for i in tqdm(range(nbr_weeks), desc = 'No. of week'):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = gen_dataset(
        df_train,
        promo_features,
        item_family_features,
        item_class_features,
        store_city_features,
        store_state_features,
        store_type_features,
        store_cluster_features,
        transactions_df,
        train_start + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
#     break

No. of week:   0%|          | 0/24 [00:00<?, ?it/s]

Preparing dataset...


No. of week: 100%|██████████| 24/24 [00:21<00:00,  1.11it/s]


In [77]:
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l

In [78]:
X_val, y_val = gen_dataset(df_train,
                           promo_features,
                           item_family_features,
                           item_class_features,
                           store_city_features,
                           store_state_features,
                           store_type_features,
                           store_cluster_features,
                           transactions_df,
                           valid_start)
X_test = gen_dataset(df_train, 
                    promo_features,
                    item_family_features,
                    item_class_features,
                    store_city_features,
                    store_state_features,
                    store_type_features,
                    store_cluster_features,
                     transactions_df
                    test_start, is_train=False)

TypeError: gen_dataset() missing 1 required positional argument: 'start_time'

#### Train Model

In [30]:
print("Training and predicting models...")
params = {
    'num_leaves': 2**5 - 1,
    'objective': 'regression_l2',
    'max_depth': 8,
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 4
}

MAX_ROUNDS = 200
val_pred = []
test_pred = []
cate_vars = ['item_family_features',
            'item_class_features',
            'store_city_features',
            'store_state_features',
            'store_type_features',
            'store_cluster_features']

Training and predicting models...


In [31]:
for i in tqdm(range(16)):
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=pd.concat([items["perishable"]] * nbr_weeks) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items["perishable"] * 0.25 + 1,
        categorical_feature=cate_vars)

    bst = lgb.train(
        params,
        dtrain,
        num_boost_round=MAX_ROUNDS,
#         verbose_eval = False,
        valid_sets=[dtrain, dval], early_stopping_rounds=50)
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))
    




[1]	training's l2: 1.04938	valid_1's l2: 1.00423
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.981164	valid_1's l2: 0.938377
[3]	training's l2: 0.918131	valid_1's l2: 0.877528
[4]	training's l2: 0.862105	valid_1's l2: 0.823612
[5]	training's l2: 0.810182	valid_1's l2: 0.773513
[6]	training's l2: 0.763315	valid_1's l2: 0.728307
[7]	training's l2: 0.720817	valid_1's l2: 0.687407
[8]	training's l2: 0.68249	valid_1's l2: 0.650288
[9]	training's l2: 0.647824	valid_1's l2: 0.617082
[10]	training's l2: 0.616322	valid_1's l2: 0.586896
[11]	training's l2: 0.587809	valid_1's l2: 0.55955
[12]	training's l2: 0.562755	valid_1's l2: 0.535611
[13]	training's l2: 0.539936	valid_1's l2: 0.513771
[14]	training's l2: 0.518578	valid_1's l2: 0.49327
[15]	training's l2: 0.499246	valid_1's l2: 0.474779
[16]	training's l2: 0.481693	valid_1's l2: 0.458007
[17]	training's l2: 0.465802	valid_1's l2: 0.442843
[18]	training's l2: 0.451247	valid_1's l2: 0.428881
[19]	training's 

  6%|▋         | 1/16 [01:33<23:22, 93.47s/it]

[1]	training's l2: 0.94861	valid_1's l2: 0.92514
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.890793	valid_1's l2: 0.868809
[3]	training's l2: 0.838363	valid_1's l2: 0.817604
[4]	training's l2: 0.790858	valid_1's l2: 0.771317
[5]	training's l2: 0.747899	valid_1's l2: 0.729328
[6]	training's l2: 0.709689	valid_1's l2: 0.692348
[7]	training's l2: 0.67449	valid_1's l2: 0.657869
[8]	training's l2: 0.643348	valid_1's l2: 0.627508
[9]	training's l2: 0.614606	valid_1's l2: 0.599446
[10]	training's l2: 0.588337	valid_1's l2: 0.573912
[11]	training's l2: 0.565035	valid_1's l2: 0.551382
[12]	training's l2: 0.543475	valid_1's l2: 0.530368
[13]	training's l2: 0.523972	valid_1's l2: 0.511355
[14]	training's l2: 0.506729	valid_1's l2: 0.494654
[15]	training's l2: 0.490593	valid_1's l2: 0.479024
[16]	training's l2: 0.476019	valid_1's l2: 0.464926
[17]	training's l2: 0.462839	valid_1's l2: 0.4522
[18]	training's l2: 0.450837	valid_1's l2: 0.440511
[19]	training's 

 12%|█▎        | 2/16 [03:04<21:38, 92.76s/it]

[1]	training's l2: 1.05416	valid_1's l2: 1.0597
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.987876	valid_1's l2: 0.993571
[3]	training's l2: 0.928453	valid_1's l2: 0.934357
[4]	training's l2: 0.873394	valid_1's l2: 0.879396
[5]	training's l2: 0.823862	valid_1's l2: 0.82985
[6]	training's l2: 0.779723	valid_1's l2: 0.785783
[7]	training's l2: 0.738901	valid_1's l2: 0.744934
[8]	training's l2: 0.701865	valid_1's l2: 0.70795
[9]	training's l2: 0.668383	valid_1's l2: 0.674601
[10]	training's l2: 0.638558	valid_1's l2: 0.644748
[11]	training's l2: 0.61099	valid_1's l2: 0.617162
[12]	training's l2: 0.586153	valid_1's l2: 0.592313
[13]	training's l2: 0.563435	valid_1's l2: 0.569467
[14]	training's l2: 0.542843	valid_1's l2: 0.548825
[15]	training's l2: 0.524738	valid_1's l2: 0.530657
[16]	training's l2: 0.508291	valid_1's l2: 0.514152
[17]	training's l2: 0.492908	valid_1's l2: 0.49861
[18]	training's l2: 0.478786	valid_1's l2: 0.484409
[19]	training's l2

 19%|█▉        | 3/16 [04:34<19:55, 91.93s/it]

[1]	training's l2: 1.18942	valid_1's l2: 1.16019
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.11147	valid_1's l2: 1.08389
[3]	training's l2: 1.04188	valid_1's l2: 1.01577
[4]	training's l2: 0.978182	valid_1's l2: 0.953186
[5]	training's l2: 0.920572	valid_1's l2: 0.896693
[6]	training's l2: 0.868258	valid_1's l2: 0.845441
[7]	training's l2: 0.821523	valid_1's l2: 0.79984
[8]	training's l2: 0.779101	valid_1's l2: 0.758581
[9]	training's l2: 0.740348	valid_1's l2: 0.720697
[10]	training's l2: 0.705061	valid_1's l2: 0.686312
[11]	training's l2: 0.673215	valid_1's l2: 0.655142
[12]	training's l2: 0.644341	valid_1's l2: 0.626986
[13]	training's l2: 0.618248	valid_1's l2: 0.601596
[14]	training's l2: 0.594599	valid_1's l2: 0.578615
[15]	training's l2: 0.573163	valid_1's l2: 0.55776
[16]	training's l2: 0.553686	valid_1's l2: 0.538733
[17]	training's l2: 0.536161	valid_1's l2: 0.521788
[18]	training's l2: 0.52003	valid_1's l2: 0.506052
[19]	training's l2: 

 25%|██▌       | 4/16 [06:02<18:07, 90.59s/it]

[1]	training's l2: 1.23175	valid_1's l2: 1.22124
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.15105	valid_1's l2: 1.13902
[3]	training's l2: 1.07797	valid_1's l2: 1.06504
[4]	training's l2: 1.01255	valid_1's l2: 0.998784
[5]	training's l2: 0.952821	valid_1's l2: 0.93777
[6]	training's l2: 0.898628	valid_1's l2: 0.883111
[7]	training's l2: 0.850085	valid_1's l2: 0.834468
[8]	training's l2: 0.805648	valid_1's l2: 0.789558
[9]	training's l2: 0.765296	valid_1's l2: 0.748775
[10]	training's l2: 0.728793	valid_1's l2: 0.712303
[11]	training's l2: 0.695795	valid_1's l2: 0.679277
[12]	training's l2: 0.666066	valid_1's l2: 0.649413
[13]	training's l2: 0.638947	valid_1's l2: 0.622355
[14]	training's l2: 0.614221	valid_1's l2: 0.597391
[15]	training's l2: 0.592184	valid_1's l2: 0.575397
[16]	training's l2: 0.572355	valid_1's l2: 0.555769
[17]	training's l2: 0.554234	valid_1's l2: 0.537699
[18]	training's l2: 0.537537	valid_1's l2: 0.521475
[19]	training's l2:

 31%|███▏      | 5/16 [07:29<16:26, 89.70s/it]

[1]	training's l2: 1.06252	valid_1's l2: 1.09768
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.997663	valid_1's l2: 1.03097
[3]	training's l2: 0.938964	valid_1's l2: 0.970305
[4]	training's l2: 0.88597	valid_1's l2: 0.915206
[5]	training's l2: 0.837951	valid_1's l2: 0.865389
[6]	training's l2: 0.794399	valid_1's l2: 0.820085
[7]	training's l2: 0.755152	valid_1's l2: 0.779093
[8]	training's l2: 0.719509	valid_1's l2: 0.741806
[9]	training's l2: 0.68727	valid_1's l2: 0.708251
[10]	training's l2: 0.658343	valid_1's l2: 0.67787
[11]	training's l2: 0.632285	valid_1's l2: 0.650537
[12]	training's l2: 0.608568	valid_1's l2: 0.625692
[13]	training's l2: 0.586662	valid_1's l2: 0.602527
[14]	training's l2: 0.566771	valid_1's l2: 0.581476
[15]	training's l2: 0.548816	valid_1's l2: 0.562416
[16]	training's l2: 0.532873	valid_1's l2: 0.545402
[17]	training's l2: 0.518407	valid_1's l2: 0.529991
[18]	training's l2: 0.504865	valid_1's l2: 0.515485
[19]	training's l

 38%|███▊      | 6/16 [08:56<14:48, 88.83s/it]

[1]	training's l2: 1.0197	valid_1's l2: 1.18349
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.959133	valid_1's l2: 1.11792
[3]	training's l2: 0.902667	valid_1's l2: 1.05694
[4]	training's l2: 0.851696	valid_1's l2: 1.00128
[5]	training's l2: 0.807051	valid_1's l2: 0.951967
[6]	training's l2: 0.765047	valid_1's l2: 0.905686
[7]	training's l2: 0.728367	valid_1's l2: 0.864904
[8]	training's l2: 0.693972	valid_1's l2: 0.826829
[9]	training's l2: 0.662853	valid_1's l2: 0.792183
[10]	training's l2: 0.635644	valid_1's l2: 0.761404
[11]	training's l2: 0.610947	valid_1's l2: 0.733454
[12]	training's l2: 0.587578	valid_1's l2: 0.707083
[13]	training's l2: 0.567183	valid_1's l2: 0.683549
[14]	training's l2: 0.548608	valid_1's l2: 0.662045
[15]	training's l2: 0.531022	valid_1's l2: 0.641762
[16]	training's l2: 0.515875	valid_1's l2: 0.623876
[17]	training's l2: 0.502157	valid_1's l2: 0.607578
[18]	training's l2: 0.488812	valid_1's l2: 0.592116
[19]	training's l

 44%|████▍     | 7/16 [10:23<13:14, 88.23s/it]

[1]	training's l2: 1.05148	valid_1's l2: 1.16034
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.987952	valid_1's l2: 1.09364
[3]	training's l2: 0.92803	valid_1's l2: 1.03118
[4]	training's l2: 0.87386	valid_1's l2: 0.973964
[5]	training's l2: 0.824654	valid_1's l2: 0.922195
[6]	training's l2: 0.782184	valid_1's l2: 0.876596
[7]	training's l2: 0.741697	valid_1's l2: 0.833897
[8]	training's l2: 0.706986	valid_1's l2: 0.796163
[9]	training's l2: 0.673632	valid_1's l2: 0.760914
[10]	training's l2: 0.644783	valid_1's l2: 0.729728
[11]	training's l2: 0.618699	valid_1's l2: 0.701392
[12]	training's l2: 0.595258	valid_1's l2: 0.675829
[13]	training's l2: 0.572261	valid_1's l2: 0.65098
[14]	training's l2: 0.551426	valid_1's l2: 0.628393
[15]	training's l2: 0.533804	valid_1's l2: 0.608824
[16]	training's l2: 0.516495	valid_1's l2: 0.590056
[17]	training's l2: 0.500876	valid_1's l2: 0.573065
[18]	training's l2: 0.487861	valid_1's l2: 0.558158
[19]	training's l2

 50%|█████     | 8/16 [11:50<11:44, 88.07s/it]

[1]	training's l2: 0.951534	valid_1's l2: 0.997163
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.896285	valid_1's l2: 0.940745
[3]	training's l2: 0.846202	valid_1's l2: 0.889279
[4]	training's l2: 0.801998	valid_1's l2: 0.843598
[5]	training's l2: 0.760911	valid_1's l2: 0.801476
[6]	training's l2: 0.723614	valid_1's l2: 0.763265
[7]	training's l2: 0.690013	valid_1's l2: 0.728053
[8]	training's l2: 0.659635	valid_1's l2: 0.69639
[9]	training's l2: 0.632055	valid_1's l2: 0.66787
[10]	training's l2: 0.606888	valid_1's l2: 0.641943
[11]	training's l2: 0.584133	valid_1's l2: 0.618389
[12]	training's l2: 0.563644	valid_1's l2: 0.596853
[13]	training's l2: 0.545067	valid_1's l2: 0.577219
[14]	training's l2: 0.528212	valid_1's l2: 0.559308
[15]	training's l2: 0.512977	valid_1's l2: 0.543211
[16]	training's l2: 0.49906	valid_1's l2: 0.528577
[17]	training's l2: 0.486541	valid_1's l2: 0.515218
[18]	training's l2: 0.475072	valid_1's l2: 0.503087
[19]	training'

 56%|█████▋    | 9/16 [13:18<10:16, 88.02s/it]

[1]	training's l2: 1.06013	valid_1's l2: 1.07278
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.995915	valid_1's l2: 1.00856
[3]	training's l2: 0.937465	valid_1's l2: 0.949567
[4]	training's l2: 0.884733	valid_1's l2: 0.896285
[5]	training's l2: 0.836903	valid_1's l2: 0.847886
[6]	training's l2: 0.793404	valid_1's l2: 0.804368
[7]	training's l2: 0.754173	valid_1's l2: 0.764597
[8]	training's l2: 0.718641	valid_1's l2: 0.728737
[9]	training's l2: 0.68641	valid_1's l2: 0.695984
[10]	training's l2: 0.657787	valid_1's l2: 0.667238
[11]	training's l2: 0.631085	valid_1's l2: 0.640177
[12]	training's l2: 0.607789	valid_1's l2: 0.616661
[13]	training's l2: 0.585774	valid_1's l2: 0.594107
[14]	training's l2: 0.565807	valid_1's l2: 0.573696
[15]	training's l2: 0.547689	valid_1's l2: 0.555239
[16]	training's l2: 0.531215	valid_1's l2: 0.538561
[17]	training's l2: 0.516385	valid_1's l2: 0.523285
[18]	training's l2: 0.502783	valid_1's l2: 0.509382
[19]	training's

 62%|██████▎   | 10/16 [14:45<08:45, 87.63s/it]

[1]	training's l2: 1.19375	valid_1's l2: 1.14467
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.11904	valid_1's l2: 1.07137
[3]	training's l2: 1.05158	valid_1's l2: 1.00539
[4]	training's l2: 0.990555	valid_1's l2: 0.945449
[5]	training's l2: 0.935925	valid_1's l2: 0.89209
[6]	training's l2: 0.885695	valid_1's l2: 0.842973
[7]	training's l2: 0.840938	valid_1's l2: 0.799426
[8]	training's l2: 0.799551	valid_1's l2: 0.759303
[9]	training's l2: 0.762075	valid_1's l2: 0.723005
[10]	training's l2: 0.728369	valid_1's l2: 0.690239
[11]	training's l2: 0.697653	valid_1's l2: 0.66049
[12]	training's l2: 0.669745	valid_1's l2: 0.633496
[13]	training's l2: 0.6446	valid_1's l2: 0.609328
[14]	training's l2: 0.621719	valid_1's l2: 0.587379
[15]	training's l2: 0.601328	valid_1's l2: 0.567963
[16]	training's l2: 0.582601	valid_1's l2: 0.550021
[17]	training's l2: 0.565614	valid_1's l2: 0.533859
[18]	training's l2: 0.550068	valid_1's l2: 0.518984
[19]	training's l2: 0

 69%|██████▉   | 11/16 [16:12<07:16, 87.34s/it]

[1]	training's l2: 1.23424	valid_1's l2: 1.20859
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.15682	valid_1's l2: 1.12938
[3]	training's l2: 1.08659	valid_1's l2: 1.05813
[4]	training's l2: 1.02326	valid_1's l2: 0.993768
[5]	training's l2: 0.965985	valid_1's l2: 0.935376
[6]	training's l2: 0.913972	valid_1's l2: 0.883067
[7]	training's l2: 0.867528	valid_1's l2: 0.836233
[8]	training's l2: 0.824838	valid_1's l2: 0.793281
[9]	training's l2: 0.78667	valid_1's l2: 0.754966
[10]	training's l2: 0.752096	valid_1's l2: 0.720616
[11]	training's l2: 0.720198	valid_1's l2: 0.688811
[12]	training's l2: 0.691746	valid_1's l2: 0.66083
[13]	training's l2: 0.665486	valid_1's l2: 0.635227
[14]	training's l2: 0.641577	valid_1's l2: 0.611268
[15]	training's l2: 0.619986	valid_1's l2: 0.590103
[16]	training's l2: 0.600489	valid_1's l2: 0.571236
[17]	training's l2: 0.582674	valid_1's l2: 0.55352
[18]	training's l2: 0.566896	valid_1's l2: 0.538627
[19]	training's l2: 0

 75%|███████▌  | 12/16 [17:40<05:49, 87.47s/it]

[1]	training's l2: 1.06641	valid_1's l2: 1.04936
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.00343	valid_1's l2: 0.986732
[3]	training's l2: 0.947526	valid_1's l2: 0.930572
[4]	training's l2: 0.897006	valid_1's l2: 0.879612
[5]	training's l2: 0.851248	valid_1's l2: 0.833714
[6]	training's l2: 0.808764	valid_1's l2: 0.791104
[7]	training's l2: 0.771323	valid_1's l2: 0.753348
[8]	training's l2: 0.736437	valid_1's l2: 0.718445
[9]	training's l2: 0.70483	valid_1's l2: 0.686779
[10]	training's l2: 0.676773	valid_1's l2: 0.658794
[11]	training's l2: 0.650729	valid_1's l2: 0.632753
[12]	training's l2: 0.627323	valid_1's l2: 0.609165
[13]	training's l2: 0.60589	valid_1's l2: 0.587735
[14]	training's l2: 0.586481	valid_1's l2: 0.568302
[15]	training's l2: 0.568928	valid_1's l2: 0.55084
[16]	training's l2: 0.552943	valid_1's l2: 0.534899
[17]	training's l2: 0.538884	valid_1's l2: 0.520814
[18]	training's l2: 0.525705	valid_1's l2: 0.507707
[19]	training's l

 81%|████████▏ | 13/16 [19:08<04:23, 87.72s/it]

[1]	training's l2: 1.03104	valid_1's l2: 0.994067
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.971751	valid_1's l2: 0.93585
[3]	training's l2: 0.918264	valid_1's l2: 0.882947
[4]	training's l2: 0.868077	valid_1's l2: 0.833571
[5]	training's l2: 0.822606	valid_1's l2: 0.788867
[6]	training's l2: 0.781454	valid_1's l2: 0.748363
[7]	training's l2: 0.74554	valid_1's l2: 0.713176
[8]	training's l2: 0.711712	valid_1's l2: 0.680062
[9]	training's l2: 0.682238	valid_1's l2: 0.651323
[10]	training's l2: 0.654412	valid_1's l2: 0.624042
[11]	training's l2: 0.629105	valid_1's l2: 0.599616
[12]	training's l2: 0.606311	valid_1's l2: 0.577302
[13]	training's l2: 0.58556	valid_1's l2: 0.557063
[14]	training's l2: 0.566801	valid_1's l2: 0.538823
[15]	training's l2: 0.549794	valid_1's l2: 0.522193
[16]	training's l2: 0.5342	valid_1's l2: 0.507138
[17]	training's l2: 0.520189	valid_1's l2: 0.49369
[18]	training's l2: 0.507466	valid_1's l2: 0.481399
[19]	training's l2

 88%|████████▊ | 14/16 [20:35<02:55, 87.58s/it]

[1]	training's l2: 1.05903	valid_1's l2: 1.00853
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.994475	valid_1's l2: 0.946353
[3]	training's l2: 0.936067	valid_1's l2: 0.889915
[4]	training's l2: 0.885673	valid_1's l2: 0.84078
[5]	training's l2: 0.84001	valid_1's l2: 0.796495
[6]	training's l2: 0.796148	valid_1's l2: 0.754043
[7]	training's l2: 0.756604	valid_1's l2: 0.715939
[8]	training's l2: 0.720641	valid_1's l2: 0.681266
[9]	training's l2: 0.68995	valid_1's l2: 0.651676
[10]	training's l2: 0.660304	valid_1's l2: 0.623119
[11]	training's l2: 0.635027	valid_1's l2: 0.598858
[12]	training's l2: 0.612237	valid_1's l2: 0.576931
[13]	training's l2: 0.591579	valid_1's l2: 0.557057
[14]	training's l2: 0.571073	valid_1's l2: 0.537718
[15]	training's l2: 0.552456	valid_1's l2: 0.520148
[16]	training's l2: 0.535596	valid_1's l2: 0.504286
[17]	training's l2: 0.520351	valid_1's l2: 0.489746
[18]	training's l2: 0.507666	valid_1's l2: 0.477596
[19]	training's 

 94%|█████████▍| 15/16 [22:02<01:27, 87.48s/it]

[1]	training's l2: 0.957024	valid_1's l2: 0.938696
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.902736	valid_1's l2: 0.885382
[3]	training's l2: 0.853669	valid_1's l2: 0.836832
[4]	training's l2: 0.80938	valid_1's l2: 0.793062
[5]	training's l2: 0.769242	valid_1's l2: 0.753288
[6]	training's l2: 0.732861	valid_1's l2: 0.717433
[7]	training's l2: 0.700055	valid_1's l2: 0.684937
[8]	training's l2: 0.670381	valid_1's l2: 0.65549
[9]	training's l2: 0.644377	valid_1's l2: 0.629698
[10]	training's l2: 0.619915	valid_1's l2: 0.605598
[11]	training's l2: 0.597714	valid_1's l2: 0.583727
[12]	training's l2: 0.577712	valid_1's l2: 0.563918
[13]	training's l2: 0.559637	valid_1's l2: 0.546048
[14]	training's l2: 0.543108	valid_1's l2: 0.529748
[15]	training's l2: 0.52824	valid_1's l2: 0.515055
[16]	training's l2: 0.514728	valid_1's l2: 0.50179
[17]	training's l2: 0.502474	valid_1's l2: 0.489825
[18]	training's l2: 0.491286	valid_1's l2: 0.47894
[19]	training's 

100%|██████████| 16/16 [23:30<00:00, 87.47s/it]


#### Generate submission

In [33]:
mse = mean_squared_error(y_val, np.array(val_pred).transpose())
mse

0.3498481141075822

In [95]:
mse = mean_squared_error(y_val, np.array(val_pred).transpose())

mlflow.set_experiment('grocery forecasting')
with mlflow.start_run(run_name='lgbm'):
    mlflow.log_param('model', 'lgbm')
    mlflow.log_param('train starts', train_start)
    mlflow.log_params(params)
    mlflow.log_param('lagging', LAG_DICT.values())
    mlflow.log_param('slidingWindows', SLIDING_DICT.values())
    mlflow.log_param('item_info', 'Yes')
    mlflow.log_param('store_info', 'Yes')
    mlflow.log_param('private score', 0.52193)
    mlflow.log_param('private rank', '14%')
    mlflow.log_param('public score', 0.51609)

    mlflow.log_metric('mse', mse)
    
print("Validation mse:", mse)

Validation mse: 0.35123858092295934


In [32]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_train.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lgb.csv', float_format='%.4f', index=None)

Making submission...
