In [1]:
from datetime import date, timedelta
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm, tnrange

from sklearn.metrics import mean_squared_error
import lightgbm as lgb

import mlflow
import mlflow.sklearn

from config import (
    RAW_DATA_DIR,
    FEATURE_DIR,
    LAG_DICT,
    SLIDING_DICT
)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Work around the LightGBM error seen on macOS by setting
# KMP_DUPLICATE_LIB_OK in the process environment.
# NOTE(review): `lightgbm` is already imported in the first cell; confirm the
# variable still takes effect when set here, or move this above that import.
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [3]:
# Load the raw tables from RAW_DATA_DIR (imported from config in the first
# cell; note that the name is reassigned further down, after this cell runs).

# Training rows: positional columns 1-5 only (column 0 is not read).
# `unit_sales` is converted on read to log1p(value) for positive values and
# to 0 otherwise, so every downstream target is in log1p space.
# NOTE(review): the converter evaluates float(u) twice per row.
df_train = pd.read_csv(
    RAW_DATA_DIR+'train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    # Skip data rows 1..66458908 (row 0, the header, is kept) so that loading
    # starts at 2016-01-01; the row count is specific to this train.csv.
    skiprows=range(1, 66458909)  # 2016-01-01
)

# Test rows, indexed by (store_nbr, item_nbr, date) so they can be unstacked
# by date and joined against the predictions at submission time.
df_test = pd.read_csv(
    RAW_DATA_DIR+'test.csv', usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

# Item metadata keyed by item_nbr.
items = pd.read_csv(
    RAW_DATA_DIR+'items.csv',
).set_index("item_nbr")

# Store metadata keyed by store_nbr.
stores = pd.read_csv(
    RAW_DATA_DIR+'stores.csv',
).set_index("store_nbr")

# Transaction counts, left in long form here (the `date` column is parsed and
# the table pivoted in the Transactions section).
transactions_df = pd.read_csv(
    RAW_DATA_DIR+'transactions.csv'
)

### Test Period

2017-08-16 to 2017-08-31

In [4]:
# Test period: 2017-08-16 through 2017-08-31 inclusive (16 days).
test_start = date(year=2017, month=8, day=16)
test_end = date(year=2017, month=8, day=31)

In [5]:
# Start 16 days before the test window, then walk back one day at a time
# until the weekday matches the first test day, so that the validation and
# test windows share the same weekly alignment.
valid_start = test_start - timedelta(16)
while valid_start.weekday() != test_start.weekday():
    valid_start -= timedelta(days=1)

# The validation window is 16 days long, like the test window.
valid_end = valid_start + timedelta(15)
print('valid starts from {} to {}'.format(valid_start, valid_end))

valid starts from 2017-07-26 to 2017-08-10


### Valid Period

Because more recent sales data is likely to have more in common with the test period, it is better to use the nearest available period as the validation period.

Based on the earlier analysis, we assume the sales data is periodic with a cycle of 7 days, so we want to keep the same weekday alignment
in the train, validation and test periods.

So finally, we choose valid period:

2017-07-26 to 2017-08-10


In [6]:
# Validation period pinned to the window found above:
# 2017-07-26 through 2017-08-10 inclusive (16 days).
valid_start = date(year=2017, month=7, day=26)
valid_end = date(year=2017, month=8, day=10)

### Filter Period

#### An earthquake happened on April 16, 2016. It may have affected sales for the following several weeks.

In [7]:
# Exclude everything up to four weeks after the 2016-04-16 earthquake.
filter_date = date(2016, 4, 16) + timedelta(weeks=4)

# Longest look-back, in days, needed by the feature generator
# (the 20-week day-of-week means reach 140 days back).
lag_max = 140

# The earliest usable training start is strictly after filter_date + lag_max;
# advance from the following day to the first date sharing valid_start's weekday.
train_start = filter_date + timedelta(days=lag_max + 1)
while train_start.weekday() != valid_start.weekday():
    train_start += timedelta(days=1)
print('train datasets starts from {}'.format(train_start))

train datasets starts from 2016-10-05


## Training Period

In [8]:
# Manual override of the training start computed above (2016-10-05):
# 2017-02-08 is exactly 24 weeks before valid_start (2017-07-26).
train_start = date(year=2017, month=2, day=8)

### Wages in the public sector are paid every two weeks, on the 15th and on the last day of the month. Supermarket sales could be affected by this.


In [9]:
# Keep only the rows on or after filter_date (end of the earthquake window).
# The cutoff is wrapped in pd.Timestamp because comparing a datetime64 column
# with a bare datetime.date relies on an implicit coercion that pandas warns
# about and will turn into a TypeError.
df_train = df_train[df_train['date'] >= pd.Timestamp(filter_date)]

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  """Entry point for launching an IPython kernel.


#### Promo feature

In [10]:
# Pivot the training promotion flags to wide form: one row per
# (store_nbr, item_nbr) and one column per date. Combinations with no row on
# a given date are filled with False (not on promotion).
promo_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
# Flatten the (onpromotion, date) column MultiIndex down to the dates.
promo_train.columns = promo_train.columns.get_level_values(1)

In [11]:
# Pivot the test-period promotion flags to the same wide layout, filling
# missing (store, item, date) combinations with False.
promo_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
promo_test.columns = promo_test.columns.get_level_values(1)

# Align rows with promo_train: (store, item) pairs that only occur in the test
# set are dropped, pairs missing from the test set become all-False rows.
promo_test = promo_test.reindex(index=promo_train.index).fillna(False)

In [12]:
# Place the train and test promotion calendars side by side so that a single
# frame covers every date from the training range through the test window.
promo_features = pd.concat((promo_train, promo_test), axis="columns")

# The two halves are no longer needed on their own; release the memory.
del promo_train, promo_test

In [13]:
# Re-read the raw transactions table (the same file is already loaded in the
# data-loading cell). NOTE(review): presumably kept so that the Transactions
# cell, which rebinds `transactions_df` to its pivoted form, can be re-run
# from a fresh copy - confirm, or drop one of the two reads.
transactions_df = pd.read_csv(
    RAW_DATA_DIR+'transactions.csv'
)

## Label

In [23]:
# label
# Pivot the (log1p) unit sales to wide form: one row per (store_nbr, item_nbr)
# and one column per date, with 0 where a pair has no row for that date.
# This rebinds `df_train`, so the cell cannot be re-run without reloading.
df_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)

## Transactions

In [64]:
# Pivot transactions to one row per store and one column per date (0 where a
# store has no record), then repeat each store's row for every
# (store_nbr, item_nbr) row of df_train so the two frames line up row by row.
transactions_df = (
    transactions_df
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index(["store_nbr", "date"])[["transactions"]]
    .unstack(level=-1)
    .fillna(0)
    .reindex(df_train.index.get_level_values(0))
)
# Flatten the (transactions, date) column MultiIndex down to the dates.
transactions_df.columns = transactions_df.columns.get_level_values(1)

## Item

In [43]:
# Align item metadata with the training rows: one metadata row per
# (store_nbr, item_nbr) series, looked up by the item_nbr index level.
items = items.reindex(df_train.index.get_level_values(1))

#### Item Family

In [45]:
# Encode the item family as integer category codes, one per row of the
# reindexed `items` frame; fed to the model as a categorical feature.
items['family'] = items['family'].astype('category')
item_family_features = items.family.cat.codes.values

#### Item's class

In [46]:
# Encode the item class as integer category codes, one per row of the
# reindexed `items` frame; fed to the model as a categorical feature.
items['class'] = items['class'].astype('category')
item_class_features = items['class'].cat.codes.values

## Store

In [47]:
# Align store metadata with the training rows: one metadata row per
# (store_nbr, item_nbr) series, looked up by the store_nbr index level.
stores = stores.reindex(df_train.index.get_level_values(0))

#### Store's city

In [48]:
# Encode the store city as integer category codes, one per row of the
# reindexed `stores` frame; fed to the model as a categorical feature.
stores['city'] = stores['city'].astype('category')
store_city_features = stores['city'].cat.codes.values

#### Store's state

In [49]:
# Encode the store state as integer category codes, one per row of the
# reindexed `stores` frame; fed to the model as a categorical feature.
stores['state'] = stores['state'].astype('category')
store_state_features = stores['state'].cat.codes.values

#### Store's type

In [50]:
# Encode the store type as integer category codes, one per row of the
# reindexed `stores` frame; fed to the model as a categorical feature.
stores['type'] = stores['type'].astype('category')
store_type_features = stores['type'].cat.codes.values

#### Store's cluster

In [51]:
# Encode the store cluster as integer category codes, one per row of the
# reindexed `stores` frame; fed to the model as a categorical feature.
stores['cluster'] = stores['cluster'].astype('category')
store_cluster_features = stores['cluster'].cat.codes.values

In [52]:
# Flatten the (unit_sales, date) column MultiIndex left by the unstack so the
# sales frame can be indexed directly by date.
df_train.columns = df_train.columns.get_level_values(1)

#### Filling missing date

In [53]:
# Every calendar day from filter_date up to the day before the test window
# must exist as a column; days with no sales rows at all get an all-zero one.
expected_days = set(pd.date_range(filter_date, test_start-timedelta(1)))
for missing_day in expected_days - set(df_train.columns):
    print(missing_day)
    df_train[missing_day] = 0

2016-12-25 00:00:00


In [54]:
# Same gap filling for the promotion calendar, which must also cover the test
# window: any missing day from filter_date to test_end gets an all-zero column.
expected_days = set(pd.date_range(filter_date, test_end))
for missing_day in expected_days - set(promo_features.columns):
    print(missing_day)
    promo_features[missing_day] = 0

2016-12-25 00:00:00


#### Lagging and sliding windows

In [55]:
# Feature configuration. These definitions shadow the LAG_DICT, SLIDING_DICT
# and RAW_DATA_DIR names imported from `config` in the first cell.

# Look-back offsets in days for the single-day lag features of each series:
# the last week day by day, then weekly steps.
LAG_DICT = {
    'unit_sales': list(range(1, 8)) + list(range(14, 64, 7)),
    'onpromotion': list(range(2, 8)) + [14, 21],
    'transactions': list(range(1, 8)) + [14, 21],
}

# Window lengths in days for the rolling mean / std of unit sales.
SLIDING_DICT = {
    'unit_sales': list(range(3, 8)) + [14, 21, 30, 60, 63],
}

# initialise dirs
RAW_DATA_DIR = 'datasets/'

In [57]:
def get_timespan(df,
                 start_time,
                 minus,
                 periods,
                 freq='D'):
    """Select a run of date columns of ``df`` that starts before ``start_time``.

    Args:
        df: wide frame whose columns are dates.
        start_time: anchor date of the sample being built.
        minus: how many days before ``start_time`` the selection starts.
        periods: number of columns to select.
        freq: spacing between selected columns (``'D'`` for consecutive days,
            ``'7D'`` for the same weekday in consecutive weeks).

    Returns:
        The sub-frame of ``df`` restricted to the selected date columns.

    Raises:
        KeyError: if one of the requested dates is not a column of ``df``.
    """
    return df[pd.date_range(start_time - timedelta(days=minus), periods=periods, freq=freq)]


def gen_dataset(df,
                promo_features,
                item_family_features,
                item_class_features,
                store_city_features,
                store_state_features,
                store_type_features,
                store_cluster_features,
                transactions_df,
                start_time,
                is_train=True,
                lag_dict=None,
                sliding_dict=None):
    """Build the feature matrix (and labels) for one 16-day forecast window.

    Every input frame is wide (one column per date) and all inputs must have
    the same number of rows, in the same order; row ``k`` of the result
    describes row ``k`` of ``df``.

    Args:
        df: unit sales per series and date.
        promo_features: promotion flags per series and date; must cover the
            16 days starting at ``start_time``.
        item_family_features, item_class_features, store_city_features,
        store_state_features, store_type_features, store_cluster_features:
            one value per row, copied into the output unchanged.
        transactions_df: transaction counts per row and date.
        start_time: first day of the window to predict (``datetime.date``,
            ``datetime`` or ``pd.Timestamp``).
        is_train: when True, also return the 16 days of sales starting at
            ``start_time`` as labels.
        lag_dict: lag offsets keyed by ``'unit_sales'``, ``'onpromotion'`` and
            ``'transactions'``; defaults to the module-level ``LAG_DICT``.
        sliding_dict: rolling window sizes keyed by ``'unit_sales'``; defaults
            to the module-level ``SLIDING_DICT``.

    Returns:
        ``(X, y)`` when ``is_train`` is True, otherwise ``X``. ``X`` is a
        DataFrame with one row per input row; ``y`` is an array of shape
        ``(n_rows, 16)``.

    Raises:
        KeyError: if a required date column is missing from an input frame.
    """
    lag_dict = LAG_DICT if lag_dict is None else lag_dict
    sliding_dict = SLIDING_DICT if sliding_dict is None else sliding_dict

    # Normalise once so that every column lookup below uses a Timestamp key
    # instead of relying on pandas coercing a bare datetime.date.
    start_time = pd.Timestamp(start_time)

    # Columns are collected here and the frame is built in a single step.
    features = {}

    # Sales exactly `lag` days before the window.
    for lag in lag_dict['unit_sales']:
        features['lag_{}_sales'.format(lag)] = get_timespan(df, start_time, lag, 1).values.ravel()

    # Promotion flag exactly `lag` days before the window (a one-day "sum";
    # the column name is kept as-is).
    for lag in lag_dict['onpromotion']:
        features['sum_{}_promo'.format(lag)] = get_timespan(promo_features, start_time, lag, 1).sum(axis=1).values

    # Mean and standard deviation of sales over the last `window` days.
    for window in sliding_dict['unit_sales']:
        recent_sales = get_timespan(df, start_time, window, window)
        features["mean_{}_sales".format(window)] = recent_sales.mean(axis=1).values
        features["std_{}_sales".format(window)] = recent_sales.std(axis=1).values

    # Per-weekday means over the last 4 and 20 weeks: index `dow` is the
    # weekday offset from start_time, sampled every 7 days.
    for dow in range(7):
        features['mean_4_dow{}_2017'.format(dow)] = get_timespan(df, start_time, 28-dow, 4, freq='7D').mean(axis=1).values
        features['mean_20_dow{}_2017'.format(dow)] = get_timespan(df, start_time, 140-dow, 20, freq='7D').mean(axis=1).values

    # Transaction count exactly `lag` days before the window.
    for lag in lag_dict['transactions']:
        features['lag_{}_transactions'.format(lag)] = get_timespan(transactions_df, start_time, lag, 1).values.ravel()

    # Promotion flags for each of the 16 days to predict.
    for day in range(16):
        features["promo_{}".format(day)] = promo_features[start_time + timedelta(days=day)].values.astype(np.uint8)

    # Static per-row attributes.
    features['item_family_features'] = item_family_features
    features['item_class_features'] = item_class_features
    features['store_city_features'] = store_city_features
    features['store_state_features'] = store_state_features
    features['store_type_features'] = store_type_features
    features['store_cluster_features'] = store_cluster_features

    X = pd.DataFrame(features)

    if is_train:
        y = df[pd.date_range(start_time, periods=16)].values
        return X, y
    return X


#### Generate train, valid and test sets

In [76]:
print("Preparing dataset...")

# Number of whole weeks between the training start and the validation start;
# one training sample window is generated per week.
nbr_weeks = int((valid_start - train_start).days/7)

# NOTE(review): each window's labels cover 16 days from its start, so the
# windows starting fewer than 16 days before valid_start have labels that
# overlap the validation period. Validation scores are therefore optimistic;
# consider stopping the windows earlier when validation is meant to be held out.
X_l = []
y_l = []
for week in tqdm(range(nbr_weeks), desc = 'No. of week'):
    window_start = train_start + timedelta(days=7 * week)
    X_week, y_week = gen_dataset(
        df_train,
        promo_features,
        item_family_features,
        item_class_features,
        store_city_features,
        store_state_features,
        store_type_features,
        store_cluster_features,
        transactions_df,
        window_start,
    )
    X_l.append(X_week)
    y_l.append(y_week)

No. of week:   0%|          | 0/24 [00:00<?, ?it/s]

Preparing dataset...


No. of week: 100%|██████████| 24/24 [00:21<00:00,  1.11it/s]


In [77]:
# Stack the weekly windows row-wise into a single training set. The row index
# of X_train repeats for every window because it is not reset.
y_train = np.concatenate(y_l)
X_train = pd.concat(X_l)

# The per-week pieces are no longer needed; release the memory.
del X_l, y_l

In [79]:
# Inputs shared by the validation and test feature builds, in the positional
# order expected by gen_dataset (everything before `start_time`).
_shared_inputs = (
    df_train,
    promo_features,
    item_family_features,
    item_class_features,
    store_city_features,
    store_state_features,
    store_type_features,
    store_cluster_features,
    transactions_df,
)

# Validation window: features plus the 16 days of labels.
X_val, y_val = gen_dataset(*_shared_inputs, valid_start)

# Test window: features only, there are no labels to return.
X_test = gen_dataset(*_shared_inputs, test_start, is_train=False)

#### Train Model

In [80]:
print("Training and predicting models...")

# LightGBM settings shared by all 16 per-day models.
params = dict(
    num_leaves=(1 << 5) - 1,
    objective='regression_l2',
    max_depth=8,
    min_data_in_leaf=50,
    learning_rate=0.05,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=1,
    metric='l2',
    num_threads=4,
)

# Upper bound on boosting rounds per model.
MAX_ROUNDS = 200

# One prediction vector per forecast day is appended to each of these.
val_pred, test_pred = [], []

# Columns of the feature matrix that hold integer category codes.
cate_vars = [
    'item_family_features',
    'item_class_features',
    'store_city_features',
    'store_state_features',
    'store_type_features',
    'store_cluster_features',
]

Training and predicting models...


In [81]:
# Start from empty prediction buffers so that re-running this cell does not
# append a second set of 16 predictions to lists created in an earlier cell.
val_pred = []
test_pred = []

# Sample weights: 1 + 0.25 * perishable, repeated once per training window to
# match the row layout of X_train. They do not depend on the forecast day, so
# they are computed once instead of on every iteration.
train_weight = pd.concat([items["perishable"]] * nbr_weeks) * 0.25 + 1
val_weight = items["perishable"] * 0.25 + 1

# Train one model per forecast day; column i of y_train / y_val holds the
# sales i days after the start of each window.
for i in tqdm(range(16)):
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=val_weight,
        categorical_feature=cate_vars)

    # Early stopping is requested through the callback API: the
    # `early_stopping_rounds=` keyword of lgb.train is no longer accepted by
    # current LightGBM releases, while the callback works on old and new ones.
    bst = lgb.train(
        params,
        dtrain,
        num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval],
        callbacks=[lgb.early_stopping(stopping_rounds=50)])

    # Report features by total gain, most important first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))

    # Predict with the best iteration found by early stopping, falling back
    # to the full round budget when none was recorded.
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))
    




[1]	training's l2: 1.05102	valid_1's l2: 1.00592
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.981373	valid_1's l2: 0.938532
[3]	training's l2: 0.919894	valid_1's l2: 0.879283
[4]	training's l2: 0.864084	valid_1's l2: 0.825476
[5]	training's l2: 0.813703	valid_1's l2: 0.776978
[6]	training's l2: 0.766517	valid_1's l2: 0.731337
[7]	training's l2: 0.723585	valid_1's l2: 0.690035
[8]	training's l2: 0.684655	valid_1's l2: 0.652411
[9]	training's l2: 0.650701	valid_1's l2: 0.619872
[10]	training's l2: 0.618776	valid_1's l2: 0.589183
[11]	training's l2: 0.589819	valid_1's l2: 0.561363
[12]	training's l2: 0.563715	valid_1's l2: 0.536369
[13]	training's l2: 0.539947	valid_1's l2: 0.513612
[14]	training's l2: 0.519278	valid_1's l2: 0.493935
[15]	training's l2: 0.499749	valid_1's l2: 0.475246
[16]	training's l2: 0.482751	valid_1's l2: 0.459138
[17]	training's l2: 0.467343	valid_1's l2: 0.444508
[18]	training's l2: 0.453455	valid_1's l2: 0.431288
[19]	training

  6%|▋         | 1/16 [01:45<26:24, 105.60s/it]

[1]	training's l2: 0.94861	valid_1's l2: 0.92514
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.890743	valid_1's l2: 0.868577
[3]	training's l2: 0.838411	valid_1's l2: 0.817683
[4]	training's l2: 0.791094	valid_1's l2: 0.771371
[5]	training's l2: 0.748662	valid_1's l2: 0.730161
[6]	training's l2: 0.710467	valid_1's l2: 0.693095
[7]	training's l2: 0.675701	valid_1's l2: 0.659396
[8]	training's l2: 0.643618	valid_1's l2: 0.628102
[9]	training's l2: 0.614662	valid_1's l2: 0.599758
[10]	training's l2: 0.588515	valid_1's l2: 0.574308
[11]	training's l2: 0.564743	valid_1's l2: 0.551102
[12]	training's l2: 0.543097	valid_1's l2: 0.530072
[13]	training's l2: 0.523631	valid_1's l2: 0.511141
[14]	training's l2: 0.505973	valid_1's l2: 0.493935
[15]	training's l2: 0.489997	valid_1's l2: 0.47848
[16]	training's l2: 0.475799	valid_1's l2: 0.464806
[17]	training's l2: 0.462933	valid_1's l2: 0.452526
[18]	training's l2: 0.451015	valid_1's l2: 0.440911
[19]	training'

 12%|█▎        | 2/16 [03:26<24:17, 104.12s/it]

[1]	training's l2: 1.05416	valid_1's l2: 1.0597
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.987435	valid_1's l2: 0.993141
[3]	training's l2: 0.927044	valid_1's l2: 0.932941
[4]	training's l2: 0.872288	valid_1's l2: 0.878221
[5]	training's l2: 0.822736	valid_1's l2: 0.828741
[6]	training's l2: 0.777793	valid_1's l2: 0.783905
[7]	training's l2: 0.737258	valid_1's l2: 0.743382
[8]	training's l2: 0.700401	valid_1's l2: 0.706458
[9]	training's l2: 0.666975	valid_1's l2: 0.672948
[10]	training's l2: 0.636805	valid_1's l2: 0.642687
[11]	training's l2: 0.609837	valid_1's l2: 0.615817
[12]	training's l2: 0.584994	valid_1's l2: 0.590912
[13]	training's l2: 0.562452	valid_1's l2: 0.568352
[14]	training's l2: 0.542018	valid_1's l2: 0.547933
[15]	training's l2: 0.524055	valid_1's l2: 0.529924
[16]	training's l2: 0.507191	valid_1's l2: 0.513021
[17]	training's l2: 0.492251	valid_1's l2: 0.498021
[18]	training's l2: 0.478304	valid_1's l2: 0.48402
[19]	training's

 19%|█▉        | 3/16 [05:06<22:16, 102.83s/it]

[1]	training's l2: 1.18942	valid_1's l2: 1.16019
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.11211	valid_1's l2: 1.0844
[3]	training's l2: 1.0416	valid_1's l2: 1.01506
[4]	training's l2: 0.977831	valid_1's l2: 0.952717
[5]	training's l2: 0.920271	valid_1's l2: 0.896289
[6]	training's l2: 0.868485	valid_1's l2: 0.84573
[7]	training's l2: 0.821942	valid_1's l2: 0.800252
[8]	training's l2: 0.778974	valid_1's l2: 0.758306
[9]	training's l2: 0.740597	valid_1's l2: 0.72085
[10]	training's l2: 0.705828	valid_1's l2: 0.687131
[11]	training's l2: 0.673961	valid_1's l2: 0.655981
[12]	training's l2: 0.644884	valid_1's l2: 0.62768
[13]	training's l2: 0.618828	valid_1's l2: 0.602315
[14]	training's l2: 0.595038	valid_1's l2: 0.579157
[15]	training's l2: 0.57387	valid_1's l2: 0.558669
[16]	training's l2: 0.554287	valid_1's l2: 0.539623
[17]	training's l2: 0.536813	valid_1's l2: 0.522705
[18]	training's l2: 0.520641	valid_1's l2: 0.506976
[19]	training's l2: 0.5

 25%|██▌       | 4/16 [06:41<20:08, 100.68s/it]

[1]	training's l2: 1.23108	valid_1's l2: 1.22034
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.15102	valid_1's l2: 1.13901
[3]	training's l2: 1.07843	valid_1's l2: 1.06624
[4]	training's l2: 1.0124	valid_1's l2: 0.999177
[5]	training's l2: 0.952436	valid_1's l2: 0.938217
[6]	training's l2: 0.898157	valid_1's l2: 0.883183
[7]	training's l2: 0.849526	valid_1's l2: 0.83448
[8]	training's l2: 0.8053	valid_1's l2: 0.790291
[9]	training's l2: 0.765544	valid_1's l2: 0.750459
[10]	training's l2: 0.729349	valid_1's l2: 0.714391
[11]	training's l2: 0.696239	valid_1's l2: 0.68039
[12]	training's l2: 0.666009	valid_1's l2: 0.649513
[13]	training's l2: 0.638768	valid_1's l2: 0.622194
[14]	training's l2: 0.614089	valid_1's l2: 0.59752
[15]	training's l2: 0.591922	valid_1's l2: 0.575552
[16]	training's l2: 0.571809	valid_1's l2: 0.555264
[17]	training's l2: 0.553357	valid_1's l2: 0.536614
[18]	training's l2: 0.536585	valid_1's l2: 0.519816
[19]	training's l2: 0.52

 31%|███▏      | 5/16 [08:19<18:18, 99.90s/it] 

[1]	training's l2: 1.06252	valid_1's l2: 1.09768
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.99757	valid_1's l2: 1.03067
[3]	training's l2: 0.939783	valid_1's l2: 0.970921
[4]	training's l2: 0.886435	valid_1's l2: 0.915683
[5]	training's l2: 0.838477	valid_1's l2: 0.865634
[6]	training's l2: 0.79486	valid_1's l2: 0.820186
[7]	training's l2: 0.755842	valid_1's l2: 0.779704
[8]	training's l2: 0.720203	valid_1's l2: 0.74241
[9]	training's l2: 0.688381	valid_1's l2: 0.7091
[10]	training's l2: 0.659456	valid_1's l2: 0.679019
[11]	training's l2: 0.632693	valid_1's l2: 0.650931
[12]	training's l2: 0.608875	valid_1's l2: 0.62599
[13]	training's l2: 0.587539	valid_1's l2: 0.603495
[14]	training's l2: 0.568138	valid_1's l2: 0.583068
[15]	training's l2: 0.550016	valid_1's l2: 0.563854
[16]	training's l2: 0.533521	valid_1's l2: 0.546202
[17]	training's l2: 0.518684	valid_1's l2: 0.530363
[18]	training's l2: 0.505136	valid_1's l2: 0.515829
[19]	training's l2: 

 38%|███▊      | 6/16 [09:57<16:31, 99.15s/it]

[1]	training's l2: 1.02148	valid_1's l2: 1.185
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.960828	valid_1's l2: 1.11909
[3]	training's l2: 0.906141	valid_1's l2: 1.05975
[4]	training's l2: 0.854683	valid_1's l2: 1.00389
[5]	training's l2: 0.808444	valid_1's l2: 0.952798
[6]	training's l2: 0.766356	valid_1's l2: 0.906615
[7]	training's l2: 0.728239	valid_1's l2: 0.864389
[8]	training's l2: 0.693989	valid_1's l2: 0.826231
[9]	training's l2: 0.662794	valid_1's l2: 0.791438
[10]	training's l2: 0.634556	valid_1's l2: 0.759522
[11]	training's l2: 0.608893	valid_1's l2: 0.73077
[12]	training's l2: 0.585626	valid_1's l2: 0.704345
[13]	training's l2: 0.564674	valid_1's l2: 0.680316
[14]	training's l2: 0.545524	valid_1's l2: 0.658309
[15]	training's l2: 0.528314	valid_1's l2: 0.6385
[16]	training's l2: 0.513177	valid_1's l2: 0.620653
[17]	training's l2: 0.498939	valid_1's l2: 0.604094
[18]	training's l2: 0.486017	valid_1's l2: 0.588684
[19]	training's l2: 0

 44%|████▍     | 7/16 [11:35<14:49, 98.80s/it]

[1]	training's l2: 1.05418	valid_1's l2: 1.1628
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.987906	valid_1's l2: 1.09358
[3]	training's l2: 0.930533	valid_1's l2: 1.03317
[4]	training's l2: 0.875885	valid_1's l2: 0.975577
[5]	training's l2: 0.826661	valid_1's l2: 0.923169
[6]	training's l2: 0.781863	valid_1's l2: 0.875715
[7]	training's l2: 0.741367	valid_1's l2: 0.832901
[8]	training's l2: 0.704755	valid_1's l2: 0.7937
[9]	training's l2: 0.672974	valid_1's l2: 0.759619
[10]	training's l2: 0.642774	valid_1's l2: 0.727391
[11]	training's l2: 0.615351	valid_1's l2: 0.698014
[12]	training's l2: 0.590372	valid_1's l2: 0.67123
[13]	training's l2: 0.567834	valid_1's l2: 0.646936
[14]	training's l2: 0.547472	valid_1's l2: 0.624753
[15]	training's l2: 0.528918	valid_1's l2: 0.604498
[16]	training's l2: 0.51217	valid_1's l2: 0.58597
[17]	training's l2: 0.496944	valid_1's l2: 0.569042
[18]	training's l2: 0.483151	valid_1's l2: 0.553675
[19]	training's l2: 0

 50%|█████     | 8/16 [13:11<13:04, 98.08s/it]

[1]	training's l2: 0.951538	valid_1's l2: 0.99719
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.897497	valid_1's l2: 0.941246
[3]	training's l2: 0.84727	valid_1's l2: 0.889881
[4]	training's l2: 0.801941	valid_1's l2: 0.842925
[5]	training's l2: 0.760986	valid_1's l2: 0.800503
[6]	training's l2: 0.723665	valid_1's l2: 0.762145
[7]	training's l2: 0.689895	valid_1's l2: 0.727535
[8]	training's l2: 0.65956	valid_1's l2: 0.695971
[9]	training's l2: 0.631941	valid_1's l2: 0.667247
[10]	training's l2: 0.606831	valid_1's l2: 0.641266
[11]	training's l2: 0.584196	valid_1's l2: 0.617658
[12]	training's l2: 0.563933	valid_1's l2: 0.596614
[13]	training's l2: 0.545193	valid_1's l2: 0.577155
[14]	training's l2: 0.528215	valid_1's l2: 0.559223
[15]	training's l2: 0.512885	valid_1's l2: 0.543148
[16]	training's l2: 0.499443	valid_1's l2: 0.528913
[17]	training's l2: 0.486758	valid_1's l2: 0.515366
[18]	training's l2: 0.47529	valid_1's l2: 0.503155
[19]	training's

 56%|█████▋    | 9/16 [14:46<11:20, 97.22s/it]

[1]	training's l2: 1.05838	valid_1's l2: 1.07115
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.994056	valid_1's l2: 1.0064
[3]	training's l2: 0.935763	valid_1's l2: 0.947768
[4]	training's l2: 0.883056	valid_1's l2: 0.894598
[5]	training's l2: 0.835649	valid_1's l2: 0.846472
[6]	training's l2: 0.792288	valid_1's l2: 0.802904
[7]	training's l2: 0.753073	valid_1's l2: 0.763542
[8]	training's l2: 0.717827	valid_1's l2: 0.727614
[9]	training's l2: 0.686424	valid_1's l2: 0.695756
[10]	training's l2: 0.65714	valid_1's l2: 0.666161
[11]	training's l2: 0.630496	valid_1's l2: 0.639094
[12]	training's l2: 0.606346	valid_1's l2: 0.614532
[13]	training's l2: 0.584496	valid_1's l2: 0.59235
[14]	training's l2: 0.565229	valid_1's l2: 0.57276
[15]	training's l2: 0.547201	valid_1's l2: 0.554533
[16]	training's l2: 0.531422	valid_1's l2: 0.538419
[17]	training's l2: 0.516562	valid_1's l2: 0.523085
[18]	training's l2: 0.503403	valid_1's l2: 0.509726
[19]	training's l2

 62%|██████▎   | 10/16 [16:25<09:46, 97.75s/it]

[1]	training's l2: 1.19481	valid_1's l2: 1.14556
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.11989	valid_1's l2: 1.07225
[3]	training's l2: 1.05315	valid_1's l2: 1.00687
[4]	training's l2: 0.992752	valid_1's l2: 0.947856
[5]	training's l2: 0.93726	valid_1's l2: 0.893497
[6]	training's l2: 0.887814	valid_1's l2: 0.845271
[7]	training's l2: 0.84222	valid_1's l2: 0.800917
[8]	training's l2: 0.801017	valid_1's l2: 0.760819
[9]	training's l2: 0.764173	valid_1's l2: 0.725218
[10]	training's l2: 0.730859	valid_1's l2: 0.693006
[11]	training's l2: 0.699813	valid_1's l2: 0.662981
[12]	training's l2: 0.671617	valid_1's l2: 0.635702
[13]	training's l2: 0.646259	valid_1's l2: 0.611378
[14]	training's l2: 0.623826	valid_1's l2: 0.589962
[15]	training's l2: 0.60308	valid_1's l2: 0.570025
[16]	training's l2: 0.584739	valid_1's l2: 0.55245
[17]	training's l2: 0.567392	valid_1's l2: 0.535795
[18]	training's l2: 0.551682	valid_1's l2: 0.520791
[19]	training's l2: 0

 69%|██████▉   | 11/16 [18:15<08:26, 101.22s/it]

[1]	training's l2: 1.23424	valid_1's l2: 1.20859
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.15681	valid_1's l2: 1.12917
[3]	training's l2: 1.08717	valid_1's l2: 1.05928
[4]	training's l2: 1.02376	valid_1's l2: 0.995097
[5]	training's l2: 0.967	valid_1's l2: 0.937385
[6]	training's l2: 0.914723	valid_1's l2: 0.884906
[7]	training's l2: 0.867625	valid_1's l2: 0.837235
[8]	training's l2: 0.825207	valid_1's l2: 0.794979
[9]	training's l2: 0.78645	valid_1's l2: 0.755954
[10]	training's l2: 0.751749	valid_1's l2: 0.721576
[11]	training's l2: 0.720226	valid_1's l2: 0.689727
[12]	training's l2: 0.69136	valid_1's l2: 0.6608
[13]	training's l2: 0.665028	valid_1's l2: 0.634654
[14]	training's l2: 0.641226	valid_1's l2: 0.610829
[15]	training's l2: 0.620047	valid_1's l2: 0.59035
[16]	training's l2: 0.600459	valid_1's l2: 0.571171
[17]	training's l2: 0.58273	valid_1's l2: 0.553864
[18]	training's l2: 0.566554	valid_1's l2: 0.537998
[19]	training's l2: 0.55183

 75%|███████▌  | 12/16 [20:07<06:58, 104.64s/it]

[1]	training's l2: 1.06641	valid_1's l2: 1.04936
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.00338	valid_1's l2: 0.986248
[3]	training's l2: 0.947456	valid_1's l2: 0.930002
[4]	training's l2: 0.89578	valid_1's l2: 0.878239
[5]	training's l2: 0.849161	valid_1's l2: 0.831409
[6]	training's l2: 0.806901	valid_1's l2: 0.789069
[7]	training's l2: 0.768647	valid_1's l2: 0.750737
[8]	training's l2: 0.734122	valid_1's l2: 0.716055
[9]	training's l2: 0.702671	valid_1's l2: 0.684668
[10]	training's l2: 0.674161	valid_1's l2: 0.656195
[11]	training's l2: 0.64851	valid_1's l2: 0.630623
[12]	training's l2: 0.6251	valid_1's l2: 0.60716
[13]	training's l2: 0.604144	valid_1's l2: 0.586079
[14]	training's l2: 0.585306	valid_1's l2: 0.567144
[15]	training's l2: 0.567859	valid_1's l2: 0.549784
[16]	training's l2: 0.552163	valid_1's l2: 0.53408
[17]	training's l2: 0.537881	valid_1's l2: 0.519804
[18]	training's l2: 0.525129	valid_1's l2: 0.506968
[19]	training's l2: 

 81%|████████▏ | 13/16 [21:47<05:09, 103.31s/it]

[1]	training's l2: 1.03104	valid_1's l2: 0.994067
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.969991	valid_1's l2: 0.934006
[3]	training's l2: 0.914781	valid_1's l2: 0.879791
[4]	training's l2: 0.864926	valid_1's l2: 0.830806
[5]	training's l2: 0.819832	valid_1's l2: 0.786291
[6]	training's l2: 0.779017	valid_1's l2: 0.746213
[7]	training's l2: 0.742009	valid_1's l2: 0.71003
[8]	training's l2: 0.708654	valid_1's l2: 0.677298
[9]	training's l2: 0.678305	valid_1's l2: 0.647625
[10]	training's l2: 0.650789	valid_1's l2: 0.620707
[11]	training's l2: 0.625858	valid_1's l2: 0.596634
[12]	training's l2: 0.603796	valid_1's l2: 0.575183
[13]	training's l2: 0.583312	valid_1's l2: 0.555103
[14]	training's l2: 0.56468	valid_1's l2: 0.537056
[15]	training's l2: 0.547842	valid_1's l2: 0.520773
[16]	training's l2: 0.532569	valid_1's l2: 0.505957
[17]	training's l2: 0.518697	valid_1's l2: 0.492553
[18]	training's l2: 0.506534	valid_1's l2: 0.480697
[19]	training'

 88%|████████▊ | 14/16 [23:23<03:22, 101.06s/it]

[1]	training's l2: 1.05906	valid_1's l2: 1.0086
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.994618	valid_1's l2: 0.946203
[3]	training's l2: 0.936245	valid_1's l2: 0.889772
[4]	training's l2: 0.885967	valid_1's l2: 0.840585
[5]	training's l2: 0.838041	valid_1's l2: 0.79409
[6]	training's l2: 0.796889	valid_1's l2: 0.753991
[7]	training's l2: 0.757313	valid_1's l2: 0.716043
[8]	training's l2: 0.721568	valid_1's l2: 0.681508
[9]	training's l2: 0.689113	valid_1's l2: 0.650425
[10]	training's l2: 0.659591	valid_1's l2: 0.622307
[11]	training's l2: 0.632947	valid_1's l2: 0.596708
[12]	training's l2: 0.608767	valid_1's l2: 0.573589
[13]	training's l2: 0.586652	valid_1's l2: 0.552765
[14]	training's l2: 0.566668	valid_1's l2: 0.533752
[15]	training's l2: 0.549637	valid_1's l2: 0.517416
[16]	training's l2: 0.533038	valid_1's l2: 0.501738
[17]	training's l2: 0.518933	valid_1's l2: 0.48826
[18]	training's l2: 0.505185	valid_1's l2: 0.475219
[19]	training's 

 94%|█████████▍| 15/16 [25:01<01:40, 100.06s/it]

[1]	training's l2: 0.955334	valid_1's l2: 0.937427
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.901319	valid_1's l2: 0.883671
[3]	training's l2: 0.852431	valid_1's l2: 0.835249
[4]	training's l2: 0.808292	valid_1's l2: 0.791357
[5]	training's l2: 0.768349	valid_1's l2: 0.751722
[6]	training's l2: 0.733286	valid_1's l2: 0.716865
[7]	training's l2: 0.7004	valid_1's l2: 0.684522
[8]	training's l2: 0.67073	valid_1's l2: 0.655358
[9]	training's l2: 0.643739	valid_1's l2: 0.628642
[10]	training's l2: 0.619304	valid_1's l2: 0.604571
[11]	training's l2: 0.597249	valid_1's l2: 0.582756
[12]	training's l2: 0.57731	valid_1's l2: 0.563142
[13]	training's l2: 0.55916	valid_1's l2: 0.545354
[14]	training's l2: 0.542764	valid_1's l2: 0.52915
[15]	training's l2: 0.527899	valid_1's l2: 0.514536
[16]	training's l2: 0.51437	valid_1's l2: 0.501214
[17]	training's l2: 0.502513	valid_1's l2: 0.489462
[18]	training's l2: 0.491401	valid_1's l2: 0.478553
[19]	training's l2

100%|██████████| 16/16 [26:37<00:00, 98.86s/it] 


#### Generate submission

In [82]:
# Mean squared error over all series and all 16 validation days, in log1p
# space. val_pred holds one vector per day, so it is transposed to match
# y_val's (rows, days) layout.
mse = mean_squared_error(y_val, np.asarray(val_pred).T)
mse

0.34995690601359186

In [83]:
print("Making submission...")

# test_pred holds one vector per forecast day; transpose to (rows, days).
y_test = np.array(test_pred).transpose()

# Label the prediction columns with the test dates, derived from test_start
# rather than a second hard-coded copy of the date, then reshape to long form
# indexed by (store_nbr, item_nbr, date) to match df_test.
df_preds = pd.DataFrame(
    y_test, index=df_train.index,
    columns=pd.date_range(test_start, periods=y_test.shape[1])
).stack().to_frame("unit_sales")
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Test rows without a prediction (series absent from training) get 0.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)

# Undo the log1p transform applied at load time and clip to [0, 1000].
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)

# Write only the id and unit_sales columns; the index is explicitly omitted.
submission.to_csv('lgb.csv', float_format='%.4f', index=False)

Making submission...


In [85]:
# NOTE(review): ad-hoc calculation that nothing else in the notebook uses;
# presumably a leaderboard rank fraction - confirm and label it, or remove.
153/1674

0.0913978494623656

In [86]:
# Validation error in log1p space (same computation as in the cell that
# first displays the mse).
mse = mean_squared_error(y_val, np.array(val_pred).T)

# Record this run's configuration and scores in MLflow.
mlflow.set_experiment('grocery forecasting')
with mlflow.start_run(run_name='lgbm'):
    # Model identity and training range.
    for key, value in (
        ('model', 'lgbm'),
        ('train starts', train_start),
    ):
        mlflow.log_param(key, value)

    # LightGBM hyper-parameters.
    mlflow.log_params(params)

    # Feature configuration, enabled data sources and the leaderboard results
    # entered by hand for this configuration.
    for key, value in (
        ('lagging', LAG_DICT.values()),
        ('slidingWindows', SLIDING_DICT.values()),
        ('item_info', 'Yes'),
        ('store_info', 'Yes'),
        ('transactions', 'Yes'),
        ('private score', 0.52153),
        ('private rank', '12%'),
        ('public score', 0.51516),
    ):
        mlflow.log_param(key, value)

    mlflow.log_metric('mse', mse)

print("Validation mse:", mse)

  from collections import (


Validation mse: 0.34995690601359186


Making submission...
