In [1]:
from datetime import date, timedelta
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm, tnrange

from sklearn.metrics import mean_squared_error
import lightgbm as lgb

import mlflow
import mlflow.sklearn

from config import (
    RAW_DATA_DIR,
    FEATURE_DIR,
    LAG_DICT,
    SLIDING_DICT
)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Work around the LightGBM error seen on macOS by setting KMP_DUPLICATE_LIB_OK.
import os
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

In [3]:
# load data
def _log1p_positive(raw_value):
    """Parse a raw unit_sales cell; return log1p for positive values, else 0."""
    sales = float(raw_value)
    return np.log1p(sales) if sales > 0 else 0


df_train = pd.read_csv(
    RAW_DATA_DIR+'train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_positive},
    parse_dates=["date"],
    # Skip every data row before the offset noted by the author as 2016-01-01
    # (row 0, the header, is kept).
    skiprows=range(1, 66458909),
)

# The test frame is keyed by (store, item, date) so it can later be unstacked
# by date and joined against the predictions.
df_test = pd.read_csv(
    RAW_DATA_DIR+'test.csv',
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
).set_index(['store_nbr', 'item_nbr', 'date'])

# Item and store metadata, keyed by their identifiers.
items = pd.read_csv(RAW_DATA_DIR+'items.csv').set_index("item_nbr")
stores = pd.read_csv(RAW_DATA_DIR+'stores.csv').set_index("store_nbr")

### Test Period

2017-08-16 to 2017-08-31

In [4]:
# Test window (both ends inclusive): 2017-08-16 .. 2017-08-31.
test_start, test_end = date(2017, 8, 16), date(2017, 8, 31)

In [5]:
# Begin 16 days before the test window, then step back one day at a time
# until the weekday matches the weekday of test_start.
valid_start = test_start - timedelta(16)
while valid_start.weekday() != test_start.weekday():
    valid_start -= timedelta(days=1)

# The validation window spans 16 days, inclusive of both ends.
valid_end = valid_start + timedelta(15)
print('valid starts from {} to {}'.format(valid_start, valid_end))

valid starts from 2017-07-26 to 2017-08-10


### Valid Period

Since more recent periods of sales data are likely to have more in common with the test period, it is better to use the nearest period as the validation period.

Based on the earlier analysis, we assume the sales data is periodic with a period of 7 days, so we want to keep the same weekday alignment
in the train, validation and test periods.

So finally, we choose valid period:

2017-07-26 to 2017-08-10


In [6]:
# Validation window (both ends inclusive), pinned to the dates found above.
valid_start, valid_end = date(2017, 7, 26), date(2017, 8, 10)

### Filter Period

#### An earthquake happened on April 16, 2016. It may have affected sales for the following several weeks.

In [7]:
# Drop everything up to four weeks after the 2016-04-16 earthquake.
filter_date = date(2016, 4, 16) + timedelta(weeks=4)

# Longest look-back (in days) any feature needs before a window start.
lag_max = 140
train_start = filter_date + timedelta(days=lag_max)

# Always advance at least one day, then keep advancing until the weekday
# lines up with the weekday of valid_start.
train_start += timedelta(days=1)
while train_start.weekday() != valid_start.weekday():
    train_start += timedelta(days=1)
print('train datasets starts from {}'.format(train_start))

train datasets starts from 2016-10-05


In [8]:
# Manual choice of the first training window; replaces the value computed in the previous cell.
train_start = date(year=2017, month=2, day=8)

### Wages in the public sector are paid every two weeks, on the 15th and on the last day of the month. Supermarket sales could be affected by this.


In [9]:
# Keep only rows on or after filter_date. The bound is converted to a
# pd.Timestamp explicitly: comparing a datetime64 column with a plain
# datetime.date relies on an implicit coercion that pandas has deprecated.
df_train = df_train[df_train['date'] >= pd.Timestamp(filter_date)]

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  """Entry point for launching an IPython kernel.


#### Promo feature

In [10]:
# Pivot the training promotion flags into a wide (store, item) x date table;
# combinations with no record for a date are treated as "not on promotion".
promo_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
# Drop the outer "onpromotion" column level so columns are just the dates.
promo_train.columns = promo_train.columns.get_level_values(1)

In [11]:
# Same wide pivot for the test promotion flags, with gaps filled as False.
test_promo_wide = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
test_promo_wide.columns = test_promo_wide.columns.get_level_values(1)

# Align rows to the (store, item) pairs present in promo_train: pairs that
# exist only in the test data are dropped, and pairs missing from the test
# data are added with False.
promo_test = test_promo_wide.reindex(promo_train.index).fillna(False)
del test_promo_wide

In [12]:
# Place the train and test promotion columns side by side (dates as columns),
# then release the two intermediate tables.
promo_features = pd.concat((promo_train, promo_test), axis=1)
del promo_train, promo_test

## Label

In [13]:
# label: wide (store, item) x date table of unit_sales, with absent days as 0
df_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)

### Item

In [14]:
# Repeat item metadata so there is one row per row of df_train, in the same order.
row_item_ids = df_train.index.get_level_values(1)
items = items.reindex(row_item_ids)

#### Item Family

In [15]:
# Encode the item family as integer category codes.
family_cat = items['family'].astype('category')
items['family'] = family_cat
item_family_features = family_cat.cat.codes.values

#### Item's class

In [16]:
# Encode the item class as integer category codes.
class_cat = items['class'].astype('category')
items['class'] = class_cat
item_class_features = class_cat.cat.codes.values

### Store

In [17]:
# Repeat store metadata so there is one row per row of df_train, in the same order.
row_store_ids = df_train.index.get_level_values(0)
stores = stores.reindex(row_store_ids)

#### Store's city

In [18]:
# Encode the store city as integer category codes.
city_cat = stores['city'].astype('category')
stores['city'] = city_cat
store_city_features = city_cat.cat.codes.values

#### Store's state

In [19]:
# Encode the store state as integer category codes.
state_cat = stores['state'].astype('category')
stores['state'] = state_cat
store_state_features = state_cat.cat.codes.values

#### Store's type

In [20]:
# Encode the store type as integer category codes.
type_cat = stores['type'].astype('category')
stores['type'] = type_cat
store_type_features = type_cat.cat.codes.values

#### Store's cluster

In [21]:
# Encode the store cluster as integer category codes.
cluster_cat = stores['cluster'].astype('category')
stores['cluster'] = cluster_cat
store_cluster_features = cluster_cat.cat.codes.values

In [22]:
# Drop the outer "unit_sales" column level so the columns are just the dates.
sales_dates = df_train.columns.get_level_values(1)
df_train.columns = sales_dates

#### Filling missing date

In [23]:
# Every calendar day from filter_date up to the day before the test window
# should have a column; days with no column at all get an all-zero one.
sales_days_present = df_train.columns
sales_days_expected = pd.date_range(filter_date, test_start-timedelta(1))
for missing_day in list(set(sales_days_expected) - set(sales_days_present)):
    print(missing_day)
    df_train[missing_day] = 0

2016-12-25 00:00:00


In [24]:
# Same gap filling for the promotion table, which also covers the test window.
promo_days_present = promo_features.columns
promo_days_expected = pd.date_range(filter_date, test_end)
for missing_day in list(set(promo_days_expected) - set(promo_days_present)):
    print(missing_day)
    promo_features[missing_day] = 0

2016-12-25 00:00:00


#### Lagging and sliding windows

In [25]:
# NOTE: these assignments replace the LAG_DICT, SLIDING_DICT and RAW_DATA_DIR
# names imported from `config` at the top of the notebook.

# Day offsets (before a window start) used for single-day lag features.
LAG_DICT = dict(
    unit_sales=[1, 2, 3, 4, 5, 6, 7, 14, 21, 28, 35, 42, 49, 56, 63],
    onpromotion=[2, 3, 4, 5, 6, 7, 14, 21],
)

# Window lengths (in days) used for rolling mean/std features.
SLIDING_DICT = dict(unit_sales=[3, 4, 5, 6, 7, 14, 21, 30, 60, 63])

# initialise dirs
RAW_DATA_DIR = 'datasets/'

In [26]:
def get_timespan(df, start_time, minus, periods, freq='D'):
    """Select the date columns of ``df`` for a window placed before ``start_time``.

    The window begins ``minus`` days before ``start_time`` and contains
    ``periods`` dates spaced ``freq`` apart.

    Args:
        df: DataFrame whose column labels can be looked up with the dates
            produced by ``pd.date_range``.
        start_time: Reference date the window is measured back from.
        minus: Number of days to step back from ``start_time``.
        periods: Number of dates (columns) to select.
        freq: pandas frequency string giving the spacing between dates.

    Returns:
        The sub-frame of ``df`` restricted to the selected date columns.
    """
    window_start = start_time - timedelta(days=minus)
    wanted_columns = pd.date_range(window_start, periods=periods, freq=freq)
    return df[wanted_columns]

def gen_dataset(df, 
                promo_features,
                item_family_features,
                item_class_features,
                store_city_features,
                store_state_features,
                store_type_features,
                store_cluster_features,
                start_time,
                is_train=True):
    """Build the feature matrix (and optionally the labels) for one 16-day window.

    All features are computed from data strictly before ``start_time``, except
    the ``promo_*`` columns, which read the promotion flags of the 16 days
    being predicted.

    Args:
        df: Wide sales table with one row per series and one column per date.
        promo_features: Wide promotion table with the same row order as ``df``
            and one column per date (it must also cover the 16 target days).
        item_family_features, item_class_features, store_city_features,
        store_state_features, store_type_features, store_cluster_features:
            Per-row categorical codes, copied into the output unchanged.
        start_time: First day of the 16-day window to predict.
        is_train: When true, also return the labels taken from ``df``.

    Returns:
        ``X`` when ``is_train`` is false, otherwise ``(X, y)`` where ``y`` is the
        array of ``df`` values for the 16 days starting at ``start_time``.
    """
    # Collect the computed columns first and build the frame in one go,
    # instead of inserting into an initially empty DataFrame one column at a time.
    computed = {}

    # Sales on a single day, `lag` days before the window starts.
    for lag in LAG_DICT['unit_sales']:
        computed['lag_{}_sales'.format(lag)] = get_timespan(df, start_time, lag, 1).values.ravel()

    # Promotion flag on a single day, `lag` days before the window starts.
    # `.to_numpy()` replaces the deprecated `Series.ravel()`; the values are the same.
    for lag in LAG_DICT['onpromotion']:
        computed['sum_{}_promo'.format(lag)] = (
            get_timespan(promo_features, start_time, lag, 1).sum(axis=1).to_numpy()
        )

    # Mean and standard deviation over the `window` days before the window start.
    # The slice is taken once and reused for both statistics.
    for window in SLIDING_DICT['unit_sales']:
        recent_sales = get_timespan(df, start_time, window, window)
        computed["mean_{}_sales".format(window)] = recent_sales.mean(axis=1).values
        computed["std_{}_sales".format(window)] = recent_sales.std(axis=1).values

    # Same-weekday averages over the last 4 and the last 20 weeks.
    for dow in range(7):
        computed['mean_4_dow{}_2017'.format(dow)] = get_timespan(
            df, start_time, 28-dow, 4, freq='7D').mean(axis=1).values
        computed['mean_20_dow{}_2017'.format(dow)] = get_timespan(
            df, start_time, 140-dow, 20, freq='7D').mean(axis=1).values

    # for the next to-predict 16 days 
    for offset in range(16):
        computed["promo_{}".format(offset)] = promo_features[
            start_time + timedelta(days=offset)].values.astype(np.uint8)

    X = pd.DataFrame(computed)

    # Caller-supplied categorical columns are assigned exactly as before.
    X['item_family_features'] = item_family_features
    X['item_class_features'] = item_class_features
    X['store_city_features'] = store_city_features
    X['store_state_features'] = store_state_features
    X['store_type_features'] = store_type_features
    X['store_cluster_features'] = store_cluster_features

    if is_train:
        y = df[pd.date_range(start_time, periods=16)].values
        return X, y
    return X


#### Generate train, valid and test sets

In [27]:
print("Preparing dataset...")

# Number of whole weeks between the first training window and the validation window.
nbr_weeks = int((valid_start - train_start).days/7)

# Per-row categorical arrays passed unchanged to every gen_dataset call.
static_features = (
    item_family_features,
    item_class_features,
    store_city_features,
    store_state_features,
    store_type_features,
    store_cluster_features,
)

# One (X, y) pair per weekly shift of the window start.
X_l, y_l = [], []
for week in tqdm(range(nbr_weeks), desc = 'No. of week'):
    window_start = train_start + timedelta(days=7 * week)
    X_week, y_week = gen_dataset(
        df_train, promo_features, *static_features, window_start
    )
    X_l.append(X_week)
    y_l.append(y_week)

No. of week:   0%|          | 0/16 [00:00<?, ?it/s]

Preparing dataset...


No. of week: 100%|██████████| 16/16 [00:10<00:00,  1.50it/s]


In [28]:
# Stack the weekly windows row-wise into one training set, then free the lists.
X_train, y_train = pd.concat(X_l, axis=0), np.concatenate(y_l, axis=0)
del X_l, y_l

In [29]:

# Inputs shared by the validation and test calls.
shared_inputs = (
    df_train,
    promo_features,
    item_family_features,
    item_class_features,
    store_city_features,
    store_state_features,
    store_type_features,
    store_cluster_features,
)

# The validation window has labels; the test window has features only.
X_val, y_val = gen_dataset(*shared_inputs, valid_start)
X_test = gen_dataset(*shared_inputs, test_start, is_train=False)

#### Train Model

In [30]:
print("Training and predicting models...")

# LightGBM settings shared by all 16 per-day models.
params = dict(
    num_leaves=2**5 - 1,
    objective='regression_l2',
    max_depth=8,
    min_data_in_leaf=50,
    learning_rate=0.05,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=1,
    metric='l2',
    num_threads=4,
)

# Upper bound on boosting rounds per model.
MAX_ROUNDS = 200

# One prediction array per forecast day is appended to each list.
val_pred, test_pred = [], []

# Columns LightGBM should treat as categorical.
cate_vars = [
    'item_family_features',
    'item_class_features',
    'store_city_features',
    'store_state_features',
    'store_type_features',
    'store_cluster_features',
]

Training and predicting models...


In [31]:
# Start from empty prediction buffers so that re-running this cell does not
# append a second set of 16 predictions to the results of an earlier run.
val_pred = []
test_pred = []

# Sample weights are the same for every forecast day, so build them once:
# each row is weighted 1 + 0.25 * items["perishable"]. The training weights
# repeat the per-row weights once per weekly window, matching X_train.
val_weight = items["perishable"] * 0.25 + 1
train_weight = pd.concat([items["perishable"]] * nbr_weeks) * 0.25 + 1

# One model per forecast day: column `day_idx` of the label matrices is the target.
for day_idx in tqdm(range(16)):
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, day_idx],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, day_idx], reference=dtrain,
        weight=val_weight,
        categorical_feature=cate_vars)

    # NOTE(review): newer LightGBM releases expect early stopping to be passed
    # via `callbacks=[lgb.early_stopping(50)]` instead of this keyword —
    # confirm against the installed version before upgrading.
    bst = lgb.train(
        params,
        dtrain,
        num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50)

    # Report features ranked by total gain, highest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))

    # Predict with the best iteration; fall back to MAX_ROUNDS when
    # best_iteration is falsy.
    best_rounds = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=best_rounds))
    test_pred.append(bst.predict(X_test, num_iteration=best_rounds))
    




[1]	training's l2: 1.04527	valid_1's l2: 1.00397
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.977625	valid_1's l2: 0.938305
[3]	training's l2: 0.915068	valid_1's l2: 0.877657
[4]	training's l2: 0.859484	valid_1's l2: 0.82379
[5]	training's l2: 0.807876	valid_1's l2: 0.773734
[6]	training's l2: 0.761179	valid_1's l2: 0.728453
[7]	training's l2: 0.718861	valid_1's l2: 0.687584
[8]	training's l2: 0.680703	valid_1's l2: 0.650498
[9]	training's l2: 0.646204	valid_1's l2: 0.617178
[10]	training's l2: 0.614835	valid_1's l2: 0.586917
[11]	training's l2: 0.586478	valid_1's l2: 0.559611
[12]	training's l2: 0.561491	valid_1's l2: 0.535655
[13]	training's l2: 0.538848	valid_1's l2: 0.513894
[14]	training's l2: 0.517553	valid_1's l2: 0.493325
[15]	training's l2: 0.498306	valid_1's l2: 0.474906
[16]	training's l2: 0.480889	valid_1's l2: 0.458186
[17]	training's l2: 0.465057	valid_1's l2: 0.443102
[18]	training's l2: 0.450613	valid_1's l2: 0.429152
[19]	training'

  6%|▋         | 1/16 [01:15<18:55, 75.69s/it]

[1]	training's l2: 0.943767	valid_1's l2: 0.92528
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.886554	valid_1's l2: 0.869257
[3]	training's l2: 0.834857	valid_1's l2: 0.818485
[4]	training's l2: 0.787832	valid_1's l2: 0.77227
[5]	training's l2: 0.745386	valid_1's l2: 0.730549
[6]	training's l2: 0.707661	valid_1's l2: 0.693672
[7]	training's l2: 0.672799	valid_1's l2: 0.659269
[8]	training's l2: 0.641954	valid_1's l2: 0.628939
[9]	training's l2: 0.61359	valid_1's l2: 0.601086
[10]	training's l2: 0.587667	valid_1's l2: 0.575659
[11]	training's l2: 0.564615	valid_1's l2: 0.553139
[12]	training's l2: 0.543306	valid_1's l2: 0.532076
[13]	training's l2: 0.524002	valid_1's l2: 0.513121
[14]	training's l2: 0.506913	valid_1's l2: 0.496404
[15]	training's l2: 0.490965	valid_1's l2: 0.480794
[16]	training's l2: 0.476592	valid_1's l2: 0.466728
[17]	training's l2: 0.463545	valid_1's l2: 0.453943
[18]	training's l2: 0.45172	valid_1's l2: 0.442382
[19]	training's

 12%|█▎        | 2/16 [02:23<17:05, 73.29s/it]

[1]	training's l2: 1.05193	valid_1's l2: 1.06005
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.986159	valid_1's l2: 0.994185
[3]	training's l2: 0.927303	valid_1's l2: 0.935074
[4]	training's l2: 0.872793	valid_1's l2: 0.8803
[5]	training's l2: 0.823647	valid_1's l2: 0.830858
[6]	training's l2: 0.780024	valid_1's l2: 0.78703
[7]	training's l2: 0.739567	valid_1's l2: 0.746389
[8]	training's l2: 0.702908	valid_1's l2: 0.709601
[9]	training's l2: 0.669646	valid_1's l2: 0.67621
[10]	training's l2: 0.640211	valid_1's l2: 0.646507
[11]	training's l2: 0.612694	valid_1's l2: 0.618847
[12]	training's l2: 0.587884	valid_1's l2: 0.59401
[13]	training's l2: 0.565365	valid_1's l2: 0.571236
[14]	training's l2: 0.544873	valid_1's l2: 0.550452
[15]	training's l2: 0.526853	valid_1's l2: 0.532223
[16]	training's l2: 0.510602	valid_1's l2: 0.515827
[17]	training's l2: 0.495211	valid_1's l2: 0.500299
[18]	training's l2: 0.481183	valid_1's l2: 0.486166
[19]	training's l2

 19%|█▉        | 3/16 [03:24<15:03, 69.52s/it]

[1]	training's l2: 1.17381	valid_1's l2: 1.16028
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.09721	valid_1's l2: 1.08448
[3]	training's l2: 1.02873	valid_1's l2: 1.01673
[4]	training's l2: 0.965834	valid_1's l2: 0.954472
[5]	training's l2: 0.909126	valid_1's l2: 0.898394
[6]	training's l2: 0.857752	valid_1's l2: 0.847654
[7]	training's l2: 0.811711	valid_1's l2: 0.802138
[8]	training's l2: 0.770037	valid_1's l2: 0.761114
[9]	training's l2: 0.731893	valid_1's l2: 0.723452
[10]	training's l2: 0.697175	valid_1's l2: 0.689199
[11]	training's l2: 0.665888	valid_1's l2: 0.658405
[12]	training's l2: 0.637568	valid_1's l2: 0.630485
[13]	training's l2: 0.611825	valid_1's l2: 0.60498
[14]	training's l2: 0.588494	valid_1's l2: 0.581986
[15]	training's l2: 0.567411	valid_1's l2: 0.561115
[16]	training's l2: 0.548245	valid_1's l2: 0.542063
[17]	training's l2: 0.531084	valid_1's l2: 0.525276
[18]	training's l2: 0.515286	valid_1's l2: 0.509686
[19]	training's l2

 25%|██▌       | 4/16 [04:23<13:18, 66.55s/it]

[1]	training's l2: 1.23857	valid_1's l2: 1.22177
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.15721	valid_1's l2: 1.14002
[3]	training's l2: 1.08353	valid_1's l2: 1.06609
[4]	training's l2: 1.01731	valid_1's l2: 0.999711
[5]	training's l2: 0.95697	valid_1's l2: 0.939207
[6]	training's l2: 0.902388	valid_1's l2: 0.884434
[7]	training's l2: 0.853559	valid_1's l2: 0.835648
[8]	training's l2: 0.808785	valid_1's l2: 0.790881
[9]	training's l2: 0.768088	valid_1's l2: 0.749986
[10]	training's l2: 0.731227	valid_1's l2: 0.713227
[11]	training's l2: 0.697834	valid_1's l2: 0.679826
[12]	training's l2: 0.668086	valid_1's l2: 0.650361
[13]	training's l2: 0.640708	valid_1's l2: 0.623323
[14]	training's l2: 0.615754	valid_1's l2: 0.598469
[15]	training's l2: 0.593481	valid_1's l2: 0.576305
[16]	training's l2: 0.57348	valid_1's l2: 0.556717
[17]	training's l2: 0.555189	valid_1's l2: 0.538653
[18]	training's l2: 0.538338	valid_1's l2: 0.522286
[19]	training's l2: 

 31%|███▏      | 5/16 [05:23<11:48, 64.44s/it]

[1]	training's l2: 1.07025	valid_1's l2: 1.09676
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.00456	valid_1's l2: 1.02951
[3]	training's l2: 0.945237	valid_1's l2: 0.968399
[4]	training's l2: 0.891742	valid_1's l2: 0.9131
[5]	training's l2: 0.843253	valid_1's l2: 0.862931
[6]	training's l2: 0.799317	valid_1's l2: 0.817603
[7]	training's l2: 0.759429	valid_1's l2: 0.776347
[8]	training's l2: 0.723506	valid_1's l2: 0.73928
[9]	training's l2: 0.690956	valid_1's l2: 0.705473
[10]	training's l2: 0.661802	valid_1's l2: 0.675303
[11]	training's l2: 0.635516	valid_1's l2: 0.647999
[12]	training's l2: 0.611663	valid_1's l2: 0.62323
[13]	training's l2: 0.589542	valid_1's l2: 0.600123
[14]	training's l2: 0.569467	valid_1's l2: 0.57919
[15]	training's l2: 0.551322	valid_1's l2: 0.560196
[16]	training's l2: 0.53512	valid_1's l2: 0.54324
[17]	training's l2: 0.52056	valid_1's l2: 0.527981
[18]	training's l2: 0.506878	valid_1's l2: 0.513565
[19]	training's l2: 0.4

 38%|███▊      | 6/16 [06:23<10:30, 63.09s/it]

[1]	training's l2: 1.01784	valid_1's l2: 1.18317
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.957183	valid_1's l2: 1.11738
[3]	training's l2: 0.900769	valid_1's l2: 1.05651
[4]	training's l2: 0.849756	valid_1's l2: 1.00097
[5]	training's l2: 0.805054	valid_1's l2: 0.951482
[6]	training's l2: 0.763213	valid_1's l2: 0.90572
[7]	training's l2: 0.726316	valid_1's l2: 0.864695
[8]	training's l2: 0.691834	valid_1's l2: 0.826712
[9]	training's l2: 0.66056	valid_1's l2: 0.791987
[10]	training's l2: 0.633202	valid_1's l2: 0.761268
[11]	training's l2: 0.608481	valid_1's l2: 0.733267
[12]	training's l2: 0.585124	valid_1's l2: 0.70691
[13]	training's l2: 0.564703	valid_1's l2: 0.683444
[14]	training's l2: 0.546241	valid_1's l2: 0.662191
[15]	training's l2: 0.52865	valid_1's l2: 0.641941
[16]	training's l2: 0.513463	valid_1's l2: 0.624115
[17]	training's l2: 0.499784	valid_1's l2: 0.607906
[18]	training's l2: 0.486471	valid_1's l2: 0.592415
[19]	training's l2: 

 44%|████▍     | 7/16 [07:25<09:26, 62.89s/it]

[1]	training's l2: 1.04378	valid_1's l2: 1.16233
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.980812	valid_1's l2: 1.09579
[3]	training's l2: 0.92141	valid_1's l2: 1.03349
[4]	training's l2: 0.867666	valid_1's l2: 0.976639
[5]	training's l2: 0.818964	valid_1's l2: 0.925224
[6]	training's l2: 0.776933	valid_1's l2: 0.87967
[7]	training's l2: 0.736731	valid_1's l2: 0.837003
[8]	training's l2: 0.702363	valid_1's l2: 0.799586
[9]	training's l2: 0.669255	valid_1's l2: 0.764372
[10]	training's l2: 0.64078	valid_1's l2: 0.733313
[11]	training's l2: 0.615068	valid_1's l2: 0.705125
[12]	training's l2: 0.591915	valid_1's l2: 0.679664
[13]	training's l2: 0.569199	valid_1's l2: 0.655014
[14]	training's l2: 0.548351	valid_1's l2: 0.632466
[15]	training's l2: 0.53103	valid_1's l2: 0.613152
[16]	training's l2: 0.513788	valid_1's l2: 0.594527
[17]	training's l2: 0.498277	valid_1's l2: 0.577655
[18]	training's l2: 0.485415	valid_1's l2: 0.562823
[19]	training's l2:

 50%|█████     | 8/16 [08:26<08:18, 62.32s/it]

[1]	training's l2: 0.944491	valid_1's l2: 0.998089
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.889966	valid_1's l2: 0.94201
[3]	training's l2: 0.840568	valid_1's l2: 0.890789
[4]	training's l2: 0.796994	valid_1's l2: 0.845426
[5]	training's l2: 0.756408	valid_1's l2: 0.803319
[6]	training's l2: 0.719617	valid_1's l2: 0.76535
[7]	training's l2: 0.686478	valid_1's l2: 0.730685
[8]	training's l2: 0.656442	valid_1's l2: 0.699498
[9]	training's l2: 0.629206	valid_1's l2: 0.671118
[10]	training's l2: 0.604377	valid_1's l2: 0.645103
[11]	training's l2: 0.581902	valid_1's l2: 0.621541
[12]	training's l2: 0.561753	valid_1's l2: 0.600108
[13]	training's l2: 0.543386	valid_1's l2: 0.580641
[14]	training's l2: 0.526703	valid_1's l2: 0.562948
[15]	training's l2: 0.511548	valid_1's l2: 0.546906
[16]	training's l2: 0.497868	valid_1's l2: 0.532366
[17]	training's l2: 0.485492	valid_1's l2: 0.519221
[18]	training's l2: 0.474149	valid_1's l2: 0.507112
[19]	training

 56%|█████▋    | 9/16 [09:26<07:12, 61.75s/it]

[1]	training's l2: 1.05826	valid_1's l2: 1.07297
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.994598	valid_1's l2: 1.00898
[3]	training's l2: 0.93664	valid_1's l2: 0.950378
[4]	training's l2: 0.884376	valid_1's l2: 0.89724
[5]	training's l2: 0.836913	valid_1's l2: 0.849147
[6]	training's l2: 0.793761	valid_1's l2: 0.80556
[7]	training's l2: 0.754937	valid_1's l2: 0.76621
[8]	training's l2: 0.719654	valid_1's l2: 0.730453
[9]	training's l2: 0.687467	valid_1's l2: 0.698021
[10]	training's l2: 0.659015	valid_1's l2: 0.668964
[11]	training's l2: 0.632459	valid_1's l2: 0.64217
[12]	training's l2: 0.609275	valid_1's l2: 0.618576
[13]	training's l2: 0.587437	valid_1's l2: 0.596172
[14]	training's l2: 0.567487	valid_1's l2: 0.575844
[15]	training's l2: 0.54949	valid_1's l2: 0.557436
[16]	training's l2: 0.533057	valid_1's l2: 0.540653
[17]	training's l2: 0.518239	valid_1's l2: 0.525644
[18]	training's l2: 0.504661	valid_1's l2: 0.511842
[19]	training's l2: 

 62%|██████▎   | 10/16 [10:25<06:04, 60.71s/it]

[1]	training's l2: 1.17691	valid_1's l2: 1.14473
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.10341	valid_1's l2: 1.07205
[3]	training's l2: 1.03706	valid_1's l2: 1.0065
[4]	training's l2: 0.977066	valid_1's l2: 0.947053
[5]	training's l2: 0.923253	valid_1's l2: 0.894017
[6]	training's l2: 0.873547	valid_1's l2: 0.845275
[7]	training's l2: 0.829546	valid_1's l2: 0.801837
[8]	training's l2: 0.788944	valid_1's l2: 0.761933
[9]	training's l2: 0.752146	valid_1's l2: 0.72596
[10]	training's l2: 0.71864	valid_1's l2: 0.693226
[11]	training's l2: 0.688346	valid_1's l2: 0.663493
[12]	training's l2: 0.660949	valid_1's l2: 0.636759
[13]	training's l2: 0.636206	valid_1's l2: 0.612669
[14]	training's l2: 0.613669	valid_1's l2: 0.590814
[15]	training's l2: 0.593651	valid_1's l2: 0.571347
[16]	training's l2: 0.575185	valid_1's l2: 0.553525
[17]	training's l2: 0.558424	valid_1's l2: 0.537087
[18]	training's l2: 0.543072	valid_1's l2: 0.522379
[19]	training's l2: 

 69%|██████▉   | 11/16 [11:24<05:01, 60.32s/it]

[1]	training's l2: 1.24003	valid_1's l2: 1.20977
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.16189	valid_1's l2: 1.13079
[3]	training's l2: 1.09103	valid_1's l2: 1.05954
[4]	training's l2: 1.02711	valid_1's l2: 0.994986
[5]	training's l2: 0.969416	valid_1's l2: 0.937096
[6]	training's l2: 0.917003	valid_1's l2: 0.88445
[7]	training's l2: 0.870138	valid_1's l2: 0.837959
[8]	training's l2: 0.827047	valid_1's l2: 0.795167
[9]	training's l2: 0.788495	valid_1's l2: 0.756952
[10]	training's l2: 0.753573	valid_1's l2: 0.722103
[11]	training's l2: 0.72144	valid_1's l2: 0.690352
[12]	training's l2: 0.692712	valid_1's l2: 0.662321
[13]	training's l2: 0.666268	valid_1's l2: 0.636687
[14]	training's l2: 0.642048	valid_1's l2: 0.613084
[15]	training's l2: 0.620226	valid_1's l2: 0.59183
[16]	training's l2: 0.600437	valid_1's l2: 0.572698
[17]	training's l2: 0.582409	valid_1's l2: 0.555012
[18]	training's l2: 0.566437	valid_1's l2: 0.540203
[19]	training's l2: 0

 75%|███████▌  | 12/16 [12:24<04:00, 60.05s/it]

[1]	training's l2: 1.07777	valid_1's l2: 1.0496
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.01393	valid_1's l2: 0.986254
[3]	training's l2: 0.957328	valid_1's l2: 0.929677
[4]	training's l2: 0.906105	valid_1's l2: 0.878364
[5]	training's l2: 0.859742	valid_1's l2: 0.832136
[6]	training's l2: 0.816597	valid_1's l2: 0.789246
[7]	training's l2: 0.778627	valid_1's l2: 0.751509
[8]	training's l2: 0.743196	valid_1's l2: 0.716491
[9]	training's l2: 0.711108	valid_1's l2: 0.684702
[10]	training's l2: 0.682738	valid_1's l2: 0.656633
[11]	training's l2: 0.656305	valid_1's l2: 0.630655
[12]	training's l2: 0.632324	valid_1's l2: 0.607142
[13]	training's l2: 0.610536	valid_1's l2: 0.585803
[14]	training's l2: 0.590752	valid_1's l2: 0.566279
[15]	training's l2: 0.572849	valid_1's l2: 0.548717
[16]	training's l2: 0.556692	valid_1's l2: 0.532769
[17]	training's l2: 0.542356	valid_1's l2: 0.518759
[18]	training's l2: 0.528945	valid_1's l2: 0.505667
[19]	training's

 81%|████████▏ | 13/16 [13:23<02:59, 59.82s/it]

[1]	training's l2: 1.03072	valid_1's l2: 0.994269
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.971451	valid_1's l2: 0.935965
[3]	training's l2: 0.917924	valid_1's l2: 0.883168
[4]	training's l2: 0.867699	valid_1's l2: 0.833807
[5]	training's l2: 0.822218	valid_1's l2: 0.789002
[6]	training's l2: 0.781032	valid_1's l2: 0.74853
[7]	training's l2: 0.745071	valid_1's l2: 0.713186
[8]	training's l2: 0.711243	valid_1's l2: 0.680126
[9]	training's l2: 0.681886	valid_1's l2: 0.651453
[10]	training's l2: 0.654031	valid_1's l2: 0.624447
[11]	training's l2: 0.628748	valid_1's l2: 0.600058
[12]	training's l2: 0.605849	valid_1's l2: 0.577736
[13]	training's l2: 0.584998	valid_1's l2: 0.557459
[14]	training's l2: 0.566136	valid_1's l2: 0.539198
[15]	training's l2: 0.549074	valid_1's l2: 0.522583
[16]	training's l2: 0.533404	valid_1's l2: 0.507565
[17]	training's l2: 0.519311	valid_1's l2: 0.493899
[18]	training's l2: 0.506444	valid_1's l2: 0.481608
[19]	training

 88%|████████▊ | 14/16 [14:23<01:59, 59.98s/it]

[1]	training's l2: 1.05389	valid_1's l2: 1.00847
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.989743	valid_1's l2: 0.946425
[3]	training's l2: 0.93163	valid_1's l2: 0.890158
[4]	training's l2: 0.881542	valid_1's l2: 0.841019
[5]	training's l2: 0.836168	valid_1's l2: 0.796772
[6]	training's l2: 0.792457	valid_1's l2: 0.754546
[7]	training's l2: 0.753063	valid_1's l2: 0.716597
[8]	training's l2: 0.717177	valid_1's l2: 0.681957
[9]	training's l2: 0.686633	valid_1's l2: 0.652303
[10]	training's l2: 0.65701	valid_1's l2: 0.623893
[11]	training's l2: 0.631831	valid_1's l2: 0.599705
[12]	training's l2: 0.609073	valid_1's l2: 0.577735
[13]	training's l2: 0.58851	valid_1's l2: 0.55787
[14]	training's l2: 0.567943	valid_1's l2: 0.538382
[15]	training's l2: 0.549325	valid_1's l2: 0.520888
[16]	training's l2: 0.53242	valid_1's l2: 0.50505
[17]	training's l2: 0.517157	valid_1's l2: 0.490712
[18]	training's l2: 0.504471	valid_1's l2: 0.478642
[19]	training's l2:

 94%|█████████▍| 15/16 [15:26<01:00, 60.90s/it]

[1]	training's l2: 0.946809	valid_1's l2: 0.938802
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.89314	valid_1's l2: 0.885753
[3]	training's l2: 0.84468	valid_1's l2: 0.837507
[4]	training's l2: 0.800844	valid_1's l2: 0.793835
[5]	training's l2: 0.761128	valid_1's l2: 0.754249
[6]	training's l2: 0.725177	valid_1's l2: 0.718492
[7]	training's l2: 0.692724	valid_1's l2: 0.686148
[8]	training's l2: 0.663482	valid_1's l2: 0.657051
[9]	training's l2: 0.637764	valid_1's l2: 0.631341
[10]	training's l2: 0.613623	valid_1's l2: 0.607376
[11]	training's l2: 0.591662	valid_1's l2: 0.585676
[12]	training's l2: 0.571836	valid_1's l2: 0.565907
[13]	training's l2: 0.553897	valid_1's l2: 0.548055
[14]	training's l2: 0.537691	valid_1's l2: 0.531899
[15]	training's l2: 0.5229	valid_1's l2: 0.517251
[16]	training's l2: 0.509462	valid_1's l2: 0.503944
[17]	training's l2: 0.497286	valid_1's l2: 0.491953
[18]	training's l2: 0.486204	valid_1's l2: 0.481017
[19]	training's

100%|██████████| 16/16 [16:32<00:00, 62.28s/it]


#### Generate submission

In [32]:
# val_pred holds one array per forecast day; transpose it to rows x days so it lines up with y_val.
val_matrix = np.array(val_pred).T
mse = mean_squared_error(y_val, val_matrix)
mse

0.3511710550491811

In [34]:
# Scratch calculation; presumably leaderboard rank / number of teams behind the
# '15%' private-rank figure logged to MLflow — TODO confirm.
252/1672

0.1507177033492823

In [35]:
# Validation error over all 16 forecast days (predictions transposed to rows x days).
mse = mean_squared_error(y_val, np.array(val_pred).T)

mlflow.set_experiment('grocery forecasting')
with mlflow.start_run(run_name='lgbm'):
    mlflow.log_param('model', 'lgbm')
    mlflow.log_param('train starts', train_start)
    mlflow.log_params(params)

    # Feature configuration and the hand-entered leaderboard results for this run.
    run_details = [
        ('lagging', LAG_DICT.values()),
        ('slidingWindows', SLIDING_DICT.values()),
        ('item_info', 'Yes'),
        ('store_info', 'Yes'),
        ('private score', 0.52207),
        ('private rank', '15%'),
        ('public score', 0.51485),
    ]
    for param_name, param_value in run_details:
        mlflow.log_param(param_name, param_value)

    mlflow.log_metric('mse', mse)

print("Validation mse:", mse)

  from collections import (


Validation mse: 0.3511710550491811


In [33]:
print("Making submission...")

# test_pred holds one array per forecast day; transpose to rows x days.
y_test = np.array(test_pred).T
forecast_days = pd.date_range("2017-08-16", periods=16)

# Long format: one row per (store, item, date) with the predicted value.
df_preds = (
    pd.DataFrame(y_test, index=df_train.index, columns=forecast_days)
    .stack()
    .to_frame("unit_sales")
    .rename_axis(["store_nbr", "item_nbr", "date"])
)

# Attach predictions to the test ids; test rows without a prediction get 0.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p applied to unit_sales at load time and clamp to [0, 1000].
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lgb.csv', float_format='%.4f', index=None)

Making submission...
