In [1]:
from datetime import date, timedelta
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm, tnrange

from sklearn.metrics import mean_squared_error
import lightgbm as lgb

import mlflow
import mlflow.sklearn

from config import (
    RAW_DATA_DIR,
    FEATURE_DIR,
    LAG_DICT,
    SLIDING_DICT
)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Workaround for a LightGBM error on macOS (per the original note).
# NOTE(review): the variable name suggests it tolerates a duplicated OpenMP
# (KMP) runtime library -- confirm against the OpenMP runtime documentation.
import os
os.environ.update(KMP_DUPLICATE_LIB_OK='True')

In [3]:
# Load the raw CSV inputs.
def _to_log_sales(raw):
    """Return log1p of a positive unit_sales value, and 0 for anything else."""
    units = float(raw)
    return np.log1p(units) if units > 0 else 0


# The leading data rows are skipped so loading starts later in the file
# (original note on the row count: 2016-01-01).
df_train = pd.read_csv(
    RAW_DATA_DIR + 'train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _to_log_sales},
    parse_dates=["date"],
    skiprows=range(1, 66458909),
)

# The test frame is keyed by (store, item, date) so it can be unstacked and
# joined against predictions later on.
df_test = pd.read_csv(
    RAW_DATA_DIR + 'test.csv',
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

# Item and store metadata, indexed by their identifiers.
items = pd.read_csv(RAW_DATA_DIR + 'items.csv')
items = items.set_index("item_nbr")

stores = pd.read_csv(RAW_DATA_DIR + 'stores.csv')
stores = stores.set_index("store_nbr")

### Test Period

2017-08-16 to 2017-08-31

In [4]:
# Test window: 16 consecutive days, 2017-08-16 through 2017-08-31.
test_start = date(year=2017, month=8, day=16)
test_end = test_start + timedelta(days=15)

In [5]:
# Begin 16 days before the test window, then step back one day at a time until
# the weekday matches the weekday of test_start.
valid_start = test_start - timedelta(days=16)
while valid_start.weekday() != test_start.weekday():
    valid_start -= timedelta(days=1)
# The validation window spans 16 days, like the test window.
valid_end = valid_start + timedelta(days=15)
print('valid starts from {} to {}'.format(valid_start, valid_end))

valid starts from 2017-07-26 to 2017-08-10


### Valid Period

Since more recent sales periods are likely to have more in common with the test period, it is better to use the nearest period as the validation period.

Based on the earlier analysis, we assume the sales data is periodic with a period of 7 days, so we want to keep the same day-of-week alignment
in the train, validation and test periods.

So finally, we choose valid period:

2017-07-26 to 2017-08-10


In [6]:
# Validation window pinned to the dates printed by the search cell above.
valid_start = date(year=2017, month=7, day=26)
valid_end = valid_start + timedelta(days=15)

### Filter Period

#### An earthquake happened on April 16, 2016. It may have affected sales for the following several weeks.

In [7]:
# Drop the four weeks that follow the 2016-04-16 earthquake.
filter_date = date(2016, 4, 16) + timedelta(weeks=4)
# Longest look-back (in days) that the feature windows need.
lag_max = 140
train_start = filter_date + timedelta(days=lag_max)

# Always advance at least one day, then keep going until the weekday matches
# the weekday of valid_start.
train_start += timedelta(days=1)
while train_start.weekday() != valid_start.weekday():
    train_start += timedelta(days=1)
print('train datasets starts from {}'.format(train_start))

train datasets starts from 2016-10-05


In [8]:
# Manual override of the first training window start; this replaces the value
# computed by the search loop in the previous cell.
train_start = date(year=2017, month=4, day=5)

### Wages in the public sector are paid every two weeks, on the 15th and on the last day of the month. Supermarket sales could be affected by this.


In [9]:
# Keep only rows on or after filter_date. The bound is converted to a
# pd.Timestamp first: comparing a datetime64 column with a plain datetime.date
# is deprecated in pandas and is announced to raise TypeError in the future.
df_train = df_train[df_train['date'] >= pd.Timestamp(filter_date)]

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  """Entry point for launching an IPython kernel.


#### Promo feature

In [10]:
# Pivot the training promotions into a (store, item) x date matrix; pairs with
# no row on a given day are treated as "not on promotion".
promo_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
# Keep only the date level of the two-level column index.
promo_train.columns = promo_train.columns.get_level_values(1)

In [11]:
# Same pivot for the test period; missing entries count as "not on promotion".
promo_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
promo_test.columns = promo_test.columns.get_level_values(1)
# Align rows with the training matrix: pairs only present in the test data are
# dropped, pairs only present in the training data get False.
promo_test = promo_test.reindex(promo_train.index).fillna(False)

In [12]:
# Place the training and test promotion calendars side by side (dates are
# columns), then release the two intermediate frames.
promo_features = pd.concat((promo_train, promo_test), axis=1)
del promo_train, promo_test

## Label

In [13]:
# Label matrix: one row per (store, item), one column per date, holding the
# transformed unit_sales; days without a row are filled with 0.
df_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)

### Item

In [14]:
# Repeat the item metadata so it has one row per (store, item) row of df_train.
# NOTE(review): this overwrites `items`, so the cell is not safe to run twice.
items = items.reindex(df_train.index.get_level_values("item_nbr"))

#### Item Family

In [15]:
# Encode the item family as integer category codes.
items['family'] = items['family'].astype('category')
item_family_features = items['family'].cat.codes.to_numpy()

#### Item's class

In [16]:
# Encode the item class as integer category codes.
items['class'] = items['class'].astype('category')
item_class_features = items['class'].cat.codes.to_numpy()

### Store

In [17]:
# Repeat the store metadata so it has one row per (store, item) row of df_train.
# NOTE(review): this overwrites `stores`, so the cell is not safe to run twice.
stores = stores.reindex(df_train.index.get_level_values("store_nbr"))

#### Store's city

In [18]:
# Encode the store city as integer category codes.
stores['city'] = stores['city'].astype('category')
store_city_features = stores['city'].cat.codes.to_numpy()

#### Store's state

In [19]:
# Encode the store state as integer category codes.
stores['state'] = stores['state'].astype('category')
store_state_features = stores['state'].cat.codes.to_numpy()

#### Store's type

In [20]:
# Encode the store type as integer category codes.
stores['type'] = stores['type'].astype('category')
store_type_features = stores['type'].cat.codes.to_numpy()

#### Store's cluster

In [21]:
# Encode the store cluster as integer category codes.
stores['cluster'] = stores['cluster'].astype('category')
store_cluster_features = stores['cluster'].cat.codes.to_numpy()

In [22]:
# Flatten the two-level column index left by unstack() down to its date level.
df_train.columns = df_train.columns.get_level_values("date")

#### Filling missing date

In [23]:
# Add an all-zero column for every calendar day between filter_date and the
# day before test_start that has no column in the sales matrix.
date_list = df_train.columns
obj_list = pd.date_range(filter_date, test_start - timedelta(days=1))
# Sorted so that the missing days are printed and inserted in chronological
# order; iterating a bare set difference gives an arbitrary order.
diff_list = sorted(set(obj_list) - set(date_list))
for missing_day in diff_list:
    print(missing_day)
    df_train[missing_day] = 0

2016-12-25 00:00:00


In [24]:
# Add a "no promotion" column for every calendar day between filter_date and
# test_end that has no column in the promotion matrix.
date_list = promo_features.columns
obj_list = pd.date_range(filter_date, test_end)
# Sorted so that the missing days are printed and inserted in chronological
# order; iterating a bare set difference gives an arbitrary order.
diff_list = sorted(set(obj_list) - set(date_list))
for missing_day in diff_list:
    print(missing_day)
    # False (not 0) matches the boolean fill value used when the promotion
    # matrices were built.
    promo_features[missing_day] = False

2016-12-25 00:00:00


#### Lagging and sliding windows

In [25]:
# NOTE(review): these assignments shadow the LAG_DICT, SLIDING_DICT and
# RAW_DATA_DIR names imported from `config` in the first cell.

# Day offsets used for the single-day lag features, per source column.
LAG_DICT = {
    'unit_sales': list(range(1, 17)) + [21, 30, 60],
    'onpromotion': list(range(2, 8)) + [14, 21],
}

# Window lengths (in days) used for the rolling statistics.
SLIDING_DICT = {
    'unit_sales': list(range(3, 8)) + [14, 21, 30, 60],
}

# initialise dirs
RAW_DATA_DIR = 'datasets/'

## Feature Engineering

In [39]:
def get_timespan(df,
                 start_time,
                 minus,
                 periods,
                 freq='D'):
    """Select the date columns of ``df`` for a window defined relative to ``start_time``.

    The window begins ``minus`` days before ``start_time`` and contains
    ``periods`` dates spaced ``freq`` apart.

    Args:
        df: DataFrame whose columns are dates.
        start_time: Reference date.
        minus: Number of days before ``start_time`` at which the window begins.
        periods: Number of dates in the window.
        freq: pandas frequency string giving the spacing between dates.

    Returns:
        The sub-frame of ``df`` restricted to the window's columns.
    """
    window_start = start_time - timedelta(days=minus)
    wanted_columns = pd.date_range(window_start, periods=periods, freq=freq)
    return df[wanted_columns]

def gen_dataset(df,
                promo_features,
                item_family_features,
                item_class_features,
                store_city_features,
                store_state_features,
                store_type_features,
                store_cluster_features,
                start_time,
                is_train=True):
    """Build the feature matrix (and optionally the labels) for one 16-day window.

    Every feature is derived from columns strictly before ``start_time``, except
    the promotion features, which also use the 16 days starting at
    ``start_time``.

    Args:
        df: Sales matrix, one row per series and one column per date.
        promo_features: Promotion matrix with the same row order as ``df`` and
            one column per date, covering the 16 days from ``start_time`` too.
        item_family_features, item_class_features, store_city_features,
        store_state_features, store_type_features, store_cluster_features:
            One value per row of ``df``; stored as-is in the output.
        start_time: First day of the window to predict (``datetime.date``,
            ``datetime.datetime`` or ``pd.Timestamp``).
        is_train: When True, also return the labels.

    Returns:
        ``(X, y)`` when ``is_train`` is True, where ``y`` holds the 16 columns of
        ``df`` starting at ``start_time``; otherwise just ``X``.
    """
    # Normalise once so that a plain datetime.date can be used both for the
    # date arithmetic and as a column key against datetime-typed columns.
    start_time = pd.Timestamp(start_time)

    # Features are collected in a dict and turned into a DataFrame in a single
    # step, which keeps the column order and avoids repeated column insertion.
    features = {}

    # Sales on single days before the window.
    for lag in LAG_DICT['unit_sales']:
        features['lag_{}_sales'.format(lag)] = get_timespan(df, start_time, lag, 1).values.ravel()

    # Rolling statistics over the last `span` days (window selected once).
    for span in SLIDING_DICT['unit_sales']:
        recent = get_timespan(df, start_time, span, span)
        features["mean_{}_sales".format(span)] = recent.mean(axis=1).values
        features["std_{}_sales".format(span)] = recent.std(axis=1).values
        features["min_{}_sales".format(span)] = recent.min(axis=1).values
        features["max_{}_sales".format(span)] = recent.max(axis=1).values
        features["median_{}_sales".format(span)] = recent.median(axis=1).values

    # Averages over dates spaced 7 days apart (4 and 20 dates), one feature per
    # day offset 0..6.
    for dow in range(7):
        features['mean_4_dow{}_2017'.format(dow)] = get_timespan(df, start_time, 28 - dow, 4, freq='7D').mean(axis=1).values
        features['mean_20_dow{}_2017'.format(dow)] = get_timespan(df, start_time, 140 - dow, 20, freq='7D').mean(axis=1).values

    # Promotion flag on single days before the window (one-day window, so the
    # row sum is just that day's value).
    for lag in LAG_DICT['onpromotion']:
        features['sum_{}_promo'.format(lag)] = get_timespan(promo_features, start_time, lag, 1).sum(axis=1).values

    # Number of promotion days in the first `days` days after start_time
    # (the window begins at start_time + 1 day).
    horizon_anchor = start_time + timedelta(days=16)
    for days in range(1, 16):
        features['sum_{}_promo_test'.format(days)] = get_timespan(promo_features, horizon_anchor, 15, days).sum(axis=1).values

    # Promotion flag for each of the 16 days to predict.
    for offset in range(16):
        features["promo_{}".format(offset)] = promo_features[start_time + timedelta(days=offset)].values.astype(np.uint8)

    for span in [7, 14, 30, 60, 140]:
        positions = np.arange(span)

        sold = get_timespan(df, start_time, span, span) > 0
        features['has_sales_days_in_last_{}'.format(span)] = sold.sum(axis=1).values
        # span minus the position of the most recent day with sales; rows with
        # no sales at all get `span` (same value as a sale on the first day).
        features['last_has_sales_day_in_last_%s' % span] = span - (sold * positions).max(axis=1).values

        promoted = get_timespan(promo_features, start_time, span, span) > 0
        features['has_promo_days_in_last_%s' % span] = promoted.sum(axis=1).values
        features['last_has_promo_day_in_last_%s' % span] = span - (promoted * positions).max(axis=1).values

    # item, store information
    features['item_family_features'] = np.asarray(item_family_features)
    features['item_class_features'] = np.asarray(item_class_features)
    features['store_city_features'] = np.asarray(store_city_features)
    features['store_state_features'] = np.asarray(store_state_features)
    features['store_type_features'] = np.asarray(store_type_features)
    features['store_cluster_features'] = np.asarray(store_cluster_features)

    X = pd.DataFrame(features)

    if is_train:
        y = df[pd.date_range(start_time, periods=16)].values
        return X, y
    return X


#### Generate train, valid and test sets

In [40]:
print("Preparing dataset...")

# Number of whole weeks between the first training window and the validation
# window; one training sample set is generated per week.
nbr_weeks = int((valid_start - train_start).days / 7)

# NOTE(review): the last weekly windows start fewer than 16 days before
# valid_start, so their 16-day labels overlap the validation labels -- confirm
# that this overlap is intended.
X_l, y_l = [], []

for week in tqdm(range(nbr_weeks), desc='No. of week'):
    X_week, y_week = gen_dataset(
        df=df_train,
        promo_features=promo_features,
        item_family_features=item_family_features,
        item_class_features=item_class_features,
        store_city_features=store_city_features,
        store_state_features=store_state_features,
        store_type_features=store_type_features,
        store_cluster_features=store_cluster_features,
        start_time=train_start + timedelta(weeks=week),
    )
    X_l.append(X_week)
    y_l.append(y_week)

No. of week:   0%|          | 0/16 [00:00<?, ?it/s]

Preparing dataset...


No. of week: 100%|██████████| 16/16 [01:30<00:00,  5.71s/it]


In [41]:
# Stack the weekly windows row-wise into one training set, then release the
# per-week pieces.
y_train = np.concatenate(y_l, axis=0)
X_train = pd.concat(X_l, axis=0)
del X_l, y_l

In [43]:

# Validation set: features and labels for the window starting at valid_start.
X_val, y_val = gen_dataset(
    df=df_train,
    promo_features=promo_features,
    item_family_features=item_family_features,
    item_class_features=item_class_features,
    store_city_features=store_city_features,
    store_state_features=store_state_features,
    store_type_features=store_type_features,
    store_cluster_features=store_cluster_features,
    start_time=valid_start,
)

# Test set: features only, for the window starting at test_start.
X_test = gen_dataset(
    df=df_train,
    promo_features=promo_features,
    item_family_features=item_family_features,
    item_class_features=item_class_features,
    store_city_features=store_city_features,
    store_state_features=store_state_features,
    store_type_features=store_type_features,
    store_cluster_features=store_cluster_features,
    start_time=test_start,
    is_train=False,
)

#### Train Model

In [46]:
print("Training and predicting models...")

# LightGBM parameters passed unchanged to lgb.train for every horizon.
params = dict(
    num_leaves=2 ** 8 - 1,
    objective='regression_l2',
    max_depth=8,
    min_data_in_leaf=50,
    learning_rate=0.05,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=1,
    metric='l2',
    num_threads=4,
)

# Upper bound on boosting rounds per model.
MAX_ROUNDS = 200

# Per-horizon predictions, appended to by the training cell.
val_pred = []
test_pred = []

# Feature columns that LightGBM should treat as categorical.
cate_vars = [
    'item_family_features',
    'item_class_features',
    'store_city_features',
    'store_state_features',
    'store_type_features',
    'store_cluster_features',
]

Training and predicting models...


In [47]:
# Start from empty prediction lists so that re-running this cell does not
# append a second set of 16 columns to the results of an earlier run.
val_pred = []
test_pred = []

# Sample weights, 1 + 0.25 * perishable, computed once instead of per horizon.
# The training weights repeat the per-row weights once per weekly window, in
# the same order in which the windows were stacked into X_train.
val_weight = items["perishable"] * 0.25 + 1
train_weight = pd.concat([val_weight] * nbr_weeks)

# One model per forecast horizon (day 0 .. day 15 of the window).
for i in tqdm(range(16)):
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=val_weight,
        categorical_feature=cate_vars)

    bst = lgb.train(
        params,
        dtrain,
        num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50)
    # Feature importances by gain, largest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    # Predict with the best iteration found by early stopping, falling back to
    # MAX_ROUNDS when no best iteration was recorded.
    best_rounds = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=best_rounds))
    test_pred.append(bst.predict(X_test, num_iteration=best_rounds))
    




[1]	training's l2: 1.04308	valid_1's l2: 1.00166
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.971876	valid_1's l2: 0.932673
[3]	training's l2: 0.909421	valid_1's l2: 0.872389
[4]	training's l2: 0.851101	valid_1's l2: 0.816019
[5]	training's l2: 0.798314	valid_1's l2: 0.764964
[6]	training's l2: 0.750631	valid_1's l2: 0.718945
[7]	training's l2: 0.707583	valid_1's l2: 0.677254
[8]	training's l2: 0.668575	valid_1's l2: 0.639617
[9]	training's l2: 0.634347	valid_1's l2: 0.606684
[10]	training's l2: 0.602286	valid_1's l2: 0.575777
[11]	training's l2: 0.573257	valid_1's l2: 0.5478
[12]	training's l2: 0.546986	valid_1's l2: 0.522542
[13]	training's l2: 0.523325	valid_1's l2: 0.499736
[14]	training's l2: 0.501798	valid_1's l2: 0.479107
[15]	training's l2: 0.482315	valid_1's l2: 0.460431
[16]	training's l2: 0.465276	valid_1's l2: 0.444262
[17]	training's l2: 0.449904	valid_1's l2: 0.429631
[18]	training's l2: 0.435273	valid_1's l2: 0.415667
[19]	training's

  6%|▋         | 1/16 [02:28<37:01, 148.09s/it]

[1]	training's l2: 0.941742	valid_1's l2: 0.923414
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.882799	valid_1's l2: 0.865569
[3]	training's l2: 0.82957	valid_1's l2: 0.81349
[4]	training's l2: 0.781429	valid_1's l2: 0.766275
[5]	training's l2: 0.737955	valid_1's l2: 0.723719
[6]	training's l2: 0.698657	valid_1's l2: 0.685127
[7]	training's l2: 0.663155	valid_1's l2: 0.650251
[8]	training's l2: 0.630953	valid_1's l2: 0.618623
[9]	training's l2: 0.601902	valid_1's l2: 0.590107
[10]	training's l2: 0.575501	valid_1's l2: 0.564239
[11]	training's l2: 0.551655	valid_1's l2: 0.5409
[12]	training's l2: 0.530085	valid_1's l2: 0.519782
[13]	training's l2: 0.510609	valid_1's l2: 0.50062
[14]	training's l2: 0.49297	valid_1's l2: 0.483426
[15]	training's l2: 0.476968	valid_1's l2: 0.467783
[16]	training's l2: 0.462499	valid_1's l2: 0.453626
[17]	training's l2: 0.44945	valid_1's l2: 0.44085
[18]	training's l2: 0.437575	valid_1's l2: 0.429268
[19]	training's l2:

 12%|█▎        | 2/16 [04:47<33:57, 145.52s/it]

[1]	training's l2: 1.04939	valid_1's l2: 1.05788
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.981186	valid_1's l2: 0.989747
[3]	training's l2: 0.919235	valid_1's l2: 0.927866
[4]	training's l2: 0.863157	valid_1's l2: 0.871762
[5]	training's l2: 0.812775	valid_1's l2: 0.821429
[6]	training's l2: 0.767725	valid_1's l2: 0.776221
[7]	training's l2: 0.726372	valid_1's l2: 0.734886
[8]	training's l2: 0.68876	valid_1's l2: 0.697186
[9]	training's l2: 0.65494	valid_1's l2: 0.663249
[10]	training's l2: 0.624298	valid_1's l2: 0.632503
[11]	training's l2: 0.596324	valid_1's l2: 0.604548
[12]	training's l2: 0.571092	valid_1's l2: 0.579292
[13]	training's l2: 0.548192	valid_1's l2: 0.556343
[14]	training's l2: 0.527553	valid_1's l2: 0.535632
[15]	training's l2: 0.509003	valid_1's l2: 0.516978
[16]	training's l2: 0.491866	valid_1's l2: 0.499768
[17]	training's l2: 0.476408	valid_1's l2: 0.484243
[18]	training's l2: 0.462359	valid_1's l2: 0.470189
[19]	training's

 19%|█▉        | 3/16 [07:07<31:08, 143.76s/it]

[1]	training's l2: 1.17203	valid_1's l2: 1.15877
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.09391	valid_1's l2: 1.08168
[3]	training's l2: 1.02258	valid_1's l2: 1.01141
[4]	training's l2: 0.958095	valid_1's l2: 0.947773
[5]	training's l2: 0.900062	valid_1's l2: 0.890534
[6]	training's l2: 0.847409	valid_1's l2: 0.838491
[7]	training's l2: 0.799796	valid_1's l2: 0.791564
[8]	training's l2: 0.756913	valid_1's l2: 0.749177
[9]	training's l2: 0.718318	valid_1's l2: 0.710971
[10]	training's l2: 0.683075	valid_1's l2: 0.676145
[11]	training's l2: 0.65156	valid_1's l2: 0.645196
[12]	training's l2: 0.62297	valid_1's l2: 0.617186
[13]	training's l2: 0.596763	valid_1's l2: 0.591385
[14]	training's l2: 0.573037	valid_1's l2: 0.568113
[15]	training's l2: 0.551638	valid_1's l2: 0.546979
[16]	training's l2: 0.532181	valid_1's l2: 0.527905
[17]	training's l2: 0.514607	valid_1's l2: 0.51069
[18]	training's l2: 0.498661	valid_1's l2: 0.495124
[19]	training's l2: 

 25%|██▌       | 4/16 [09:29<28:38, 143.17s/it]

[1]	training's l2: 1.23693	valid_1's l2: 1.22198
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.15367	valid_1's l2: 1.13821
[3]	training's l2: 1.07776	valid_1's l2: 1.06208
[4]	training's l2: 1.01025	valid_1's l2: 0.995128
[5]	training's l2: 0.948629	valid_1's l2: 0.934066
[6]	training's l2: 0.892952	valid_1's l2: 0.878122
[7]	training's l2: 0.842555	valid_1's l2: 0.827656
[8]	training's l2: 0.797044	valid_1's l2: 0.782036
[9]	training's l2: 0.756232	valid_1's l2: 0.741623
[10]	training's l2: 0.718944	valid_1's l2: 0.704381
[11]	training's l2: 0.684717	valid_1's l2: 0.670219
[12]	training's l2: 0.653995	valid_1's l2: 0.640131
[13]	training's l2: 0.625821	valid_1's l2: 0.612089
[14]	training's l2: 0.600519	valid_1's l2: 0.587185
[15]	training's l2: 0.577395	valid_1's l2: 0.564123
[16]	training's l2: 0.556704	valid_1's l2: 0.543808
[17]	training's l2: 0.537793	valid_1's l2: 0.525112
[18]	training's l2: 0.520592	valid_1's l2: 0.508141
[19]	training's l2

 31%|███▏      | 5/16 [11:50<26:10, 142.74s/it]

[1]	training's l2: 1.06806	valid_1's l2: 1.09463
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.00114	valid_1's l2: 1.02645
[3]	training's l2: 0.940061	valid_1's l2: 0.964023
[4]	training's l2: 0.884831	valid_1's l2: 0.907403
[5]	training's l2: 0.834853	valid_1's l2: 0.856009
[6]	training's l2: 0.789766	valid_1's l2: 0.809629
[7]	training's l2: 0.749192	valid_1's l2: 0.768041
[8]	training's l2: 0.712739	valid_1's l2: 0.730594
[9]	training's l2: 0.6793	valid_1's l2: 0.696007
[10]	training's l2: 0.649041	valid_1's l2: 0.664575
[11]	training's l2: 0.621694	valid_1's l2: 0.636208
[12]	training's l2: 0.597203	valid_1's l2: 0.610831
[13]	training's l2: 0.574773	valid_1's l2: 0.587481
[14]	training's l2: 0.554379	valid_1's l2: 0.566167
[15]	training's l2: 0.535974	valid_1's l2: 0.546838
[16]	training's l2: 0.519307	valid_1's l2: 0.529408
[17]	training's l2: 0.504235	valid_1's l2: 0.51353
[18]	training's l2: 0.490552	valid_1's l2: 0.499126
[19]	training's l2

 38%|███▊      | 6/16 [14:10<23:39, 141.92s/it]

[1]	training's l2: 1.01571	valid_1's l2: 1.18135
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.951557	valid_1's l2: 1.11297
[3]	training's l2: 0.893616	valid_1's l2: 1.05106
[4]	training's l2: 0.841219	valid_1's l2: 0.99463
[5]	training's l2: 0.793872	valid_1's l2: 0.943464
[6]	training's l2: 0.751122	valid_1's l2: 0.89694
[7]	training's l2: 0.712909	valid_1's l2: 0.855233
[8]	training's l2: 0.677817	valid_1's l2: 0.816777
[9]	training's l2: 0.646127	valid_1's l2: 0.781668
[10]	training's l2: 0.617871	valid_1's l2: 0.750163
[11]	training's l2: 0.591853	valid_1's l2: 0.721284
[12]	training's l2: 0.568692	valid_1's l2: 0.69526
[13]	training's l2: 0.547352	valid_1's l2: 0.671192
[14]	training's l2: 0.52796	valid_1's l2: 0.649176
[15]	training's l2: 0.510391	valid_1's l2: 0.629093
[16]	training's l2: 0.494702	valid_1's l2: 0.610981
[17]	training's l2: 0.480675	valid_1's l2: 0.594597
[18]	training's l2: 0.467573	valid_1's l2: 0.579398
[19]	training's l2:

 44%|████▍     | 7/16 [16:18<20:37, 137.51s/it]

[1]	training's l2: 1.04132	valid_1's l2: 1.16007
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.973592	valid_1's l2: 1.08952
[3]	training's l2: 0.913922	valid_1's l2: 1.02713
[4]	training's l2: 0.85955	valid_1's l2: 0.970072
[5]	training's l2: 0.809384	valid_1's l2: 0.917241
[6]	training's l2: 0.763974	valid_1's l2: 0.869469
[7]	training's l2: 0.724016	valid_1's l2: 0.826863
[8]	training's l2: 0.68786	valid_1's l2: 0.788386
[9]	training's l2: 0.653996	valid_1's l2: 0.752427
[10]	training's l2: 0.624289	valid_1's l2: 0.720486
[11]	training's l2: 0.596422	valid_1's l2: 0.690568
[12]	training's l2: 0.571213	valid_1's l2: 0.663504
[13]	training's l2: 0.549162	valid_1's l2: 0.639412
[14]	training's l2: 0.529119	valid_1's l2: 0.617433
[15]	training's l2: 0.510218	valid_1's l2: 0.596794
[16]	training's l2: 0.49317	valid_1's l2: 0.577985
[17]	training's l2: 0.477646	valid_1's l2: 0.56092
[18]	training's l2: 0.463565	valid_1's l2: 0.545328
[19]	training's l2:

 50%|█████     | 8/16 [18:42<18:37, 139.66s/it]

[1]	training's l2: 0.942409	valid_1's l2: 0.996946
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.887145	valid_1's l2: 0.940171
[3]	training's l2: 0.836046	valid_1's l2: 0.888165
[4]	training's l2: 0.789824	valid_1's l2: 0.840822
[5]	training's l2: 0.74882	valid_1's l2: 0.798619
[6]	training's l2: 0.711056	valid_1's l2: 0.759692
[7]	training's l2: 0.676977	valid_1's l2: 0.724682
[8]	training's l2: 0.646623	valid_1's l2: 0.693087
[9]	training's l2: 0.618704	valid_1's l2: 0.664036
[10]	training's l2: 0.593982	valid_1's l2: 0.638157
[11]	training's l2: 0.570938	valid_1's l2: 0.614161
[12]	training's l2: 0.550582	valid_1's l2: 0.592788
[13]	training's l2: 0.531616	valid_1's l2: 0.572895
[14]	training's l2: 0.514414	valid_1's l2: 0.554818
[15]	training's l2: 0.499241	valid_1's l2: 0.538753
[16]	training's l2: 0.485076	valid_1's l2: 0.523728
[17]	training's l2: 0.472703	valid_1's l2: 0.510489
[18]	training's l2: 0.461036	valid_1's l2: 0.498058
[19]	trainin

 56%|█████▋    | 9/16 [20:58<16:08, 138.38s/it]

[1]	training's l2: 1.05382	valid_1's l2: 1.06948
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.987887	valid_1's l2: 1.00346
[3]	training's l2: 0.929007	valid_1's l2: 0.944419
[4]	training's l2: 0.874617	valid_1's l2: 0.889932
[5]	training's l2: 0.825749	valid_1's l2: 0.840966
[6]	training's l2: 0.781307	valid_1's l2: 0.796271
[7]	training's l2: 0.741333	valid_1's l2: 0.756239
[8]	training's l2: 0.704819	valid_1's l2: 0.719549
[9]	training's l2: 0.67184	valid_1's l2: 0.686208
[10]	training's l2: 0.641999	valid_1's l2: 0.656092
[11]	training's l2: 0.615509	valid_1's l2: 0.629265
[12]	training's l2: 0.591002	valid_1's l2: 0.604465
[13]	training's l2: 0.569248	valid_1's l2: 0.582483
[14]	training's l2: 0.549689	valid_1's l2: 0.562726
[15]	training's l2: 0.531284	valid_1's l2: 0.544105
[16]	training's l2: 0.514516	valid_1's l2: 0.527136
[17]	training's l2: 0.499395	valid_1's l2: 0.511774
[18]	training's l2: 0.485648	valid_1's l2: 0.497904
[19]	training's

 62%|██████▎   | 10/16 [23:14<13:46, 137.69s/it]

[1]	training's l2: 1.17427	valid_1's l2: 1.14304
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.09915	valid_1's l2: 1.0693
[3]	training's l2: 1.03122	valid_1's l2: 1.00278
[4]	training's l2: 0.968941	valid_1's l2: 0.941831
[5]	training's l2: 0.912926	valid_1's l2: 0.887319
[6]	training's l2: 0.862059	valid_1's l2: 0.837545
[7]	training's l2: 0.815938	valid_1's l2: 0.792704
[8]	training's l2: 0.77501	valid_1's l2: 0.752676
[9]	training's l2: 0.737277	valid_1's l2: 0.715872
[10]	training's l2: 0.703224	valid_1's l2: 0.682668
[11]	training's l2: 0.673102	valid_1's l2: 0.653425
[12]	training's l2: 0.64546	valid_1's l2: 0.626634
[13]	training's l2: 0.619978	valid_1's l2: 0.601897
[14]	training's l2: 0.596858	valid_1's l2: 0.57955
[15]	training's l2: 0.576517	valid_1's l2: 0.559796
[16]	training's l2: 0.557818	valid_1's l2: 0.541784
[17]	training's l2: 0.540688	valid_1's l2: 0.525164
[18]	training's l2: 0.525055	valid_1's l2: 0.510143
[19]	training's l2: 0

 69%|██████▉   | 11/16 [25:29<11:25, 137.11s/it]

[1]	training's l2: 1.23812	valid_1's l2: 1.21076
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.15723	valid_1's l2: 1.12949
[3]	training's l2: 1.08404	valid_1's l2: 1.05646
[4]	training's l2: 1.01908	valid_1's l2: 0.992334
[5]	training's l2: 0.959767	valid_1's l2: 0.933843
[6]	training's l2: 0.90561	valid_1's l2: 0.87985
[7]	training's l2: 0.856922	valid_1's l2: 0.831627
[8]	training's l2: 0.812964	valid_1's l2: 0.788122
[9]	training's l2: 0.773152	valid_1's l2: 0.749031
[10]	training's l2: 0.736719	valid_1's l2: 0.712901
[11]	training's l2: 0.704051	valid_1's l2: 0.680776
[12]	training's l2: 0.674491	valid_1's l2: 0.652403
[13]	training's l2: 0.647363	valid_1's l2: 0.625723
[14]	training's l2: 0.62305	valid_1's l2: 0.60216
[15]	training's l2: 0.600712	valid_1's l2: 0.580345
[16]	training's l2: 0.580469	valid_1's l2: 0.560639
[17]	training's l2: 0.562394	valid_1's l2: 0.543137
[18]	training's l2: 0.545745	valid_1's l2: 0.527118
[19]	training's l2: 0.

 75%|███████▌  | 12/16 [27:52<09:14, 138.69s/it]

[1]	training's l2: 1.07569	valid_1's l2: 1.04812
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 1.00984	valid_1's l2: 0.983346
[3]	training's l2: 0.950262	valid_1's l2: 0.924853
[4]	training's l2: 0.897304	valid_1's l2: 0.872533
[5]	training's l2: 0.848483	valid_1's l2: 0.824651
[6]	training's l2: 0.804476	valid_1's l2: 0.781409
[7]	training's l2: 0.764679	valid_1's l2: 0.742495
[8]	training's l2: 0.728542	valid_1's l2: 0.706946
[9]	training's l2: 0.696005	valid_1's l2: 0.674902
[10]	training's l2: 0.666491	valid_1's l2: 0.645878
[11]	training's l2: 0.639743	valid_1's l2: 0.619655
[12]	training's l2: 0.615475	valid_1's l2: 0.595771
[13]	training's l2: 0.593829	valid_1's l2: 0.574537
[14]	training's l2: 0.574266	valid_1's l2: 0.555418
[15]	training's l2: 0.556492	valid_1's l2: 0.538062
[16]	training's l2: 0.540376	valid_1's l2: 0.522377
[17]	training's l2: 0.525474	valid_1's l2: 0.507851
[18]	training's l2: 0.511873	valid_1's l2: 0.494688
[19]	training'

 81%|████████▏ | 13/16 [30:11<06:56, 138.98s/it]

[1]	training's l2: 1.02872	valid_1's l2: 0.992827
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.965812	valid_1's l2: 0.931644
[3]	training's l2: 0.909921	valid_1's l2: 0.877363
[4]	training's l2: 0.859483	valid_1's l2: 0.828186
[5]	training's l2: 0.812743	valid_1's l2: 0.782884
[6]	training's l2: 0.770535	valid_1's l2: 0.741863
[7]	training's l2: 0.732254	valid_1's l2: 0.704837
[8]	training's l2: 0.697625	valid_1's l2: 0.671341
[9]	training's l2: 0.666401	valid_1's l2: 0.64094
[10]	training's l2: 0.638155	valid_1's l2: 0.613544
[11]	training's l2: 0.612485	valid_1's l2: 0.588732
[12]	training's l2: 0.589249	valid_1's l2: 0.566248
[13]	training's l2: 0.568217	valid_1's l2: 0.545899
[14]	training's l2: 0.549142	valid_1's l2: 0.527565
[15]	training's l2: 0.531796	valid_1's l2: 0.510928
[16]	training's l2: 0.516317	valid_1's l2: 0.496201
[17]	training's l2: 0.502425	valid_1's l2: 0.482991
[18]	training's l2: 0.489721	valid_1's l2: 0.470962
[19]	training

 88%|████████▊ | 14/16 [32:34<04:40, 140.02s/it]

[1]	training's l2: 1.05347	valid_1's l2: 1.00764
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.986536	valid_1's l2: 0.943406
[3]	training's l2: 0.926143	valid_1's l2: 0.885523
[4]	training's l2: 0.873163	valid_1's l2: 0.834608
[5]	training's l2: 0.823558	valid_1's l2: 0.787126
[6]	training's l2: 0.778815	valid_1's l2: 0.744241
[7]	training's l2: 0.738265	valid_1's l2: 0.705547
[8]	training's l2: 0.701468	valid_1's l2: 0.670493
[9]	training's l2: 0.668337	valid_1's l2: 0.638902
[10]	training's l2: 0.638168	valid_1's l2: 0.610242
[11]	training's l2: 0.610931	valid_1's l2: 0.584392
[12]	training's l2: 0.587018	valid_1's l2: 0.561721
[13]	training's l2: 0.564485	valid_1's l2: 0.540367
[14]	training's l2: 0.544958	valid_1's l2: 0.521893
[15]	training's l2: 0.526437	valid_1's l2: 0.504539
[16]	training's l2: 0.509671	valid_1's l2: 0.488756
[17]	training's l2: 0.494415	valid_1's l2: 0.474485
[18]	training's l2: 0.480492	valid_1's l2: 0.461526
[19]	training

 94%|█████████▍| 15/16 [34:57<02:20, 140.94s/it]

[1]	training's l2: 0.944863	valid_1's l2: 0.937437
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.889543	valid_1's l2: 0.883054
[3]	training's l2: 0.839565	valid_1's l2: 0.833871
[4]	training's l2: 0.794435	valid_1's l2: 0.789408
[5]	training's l2: 0.753568	valid_1's l2: 0.749311
[6]	training's l2: 0.717537	valid_1's l2: 0.713594
[7]	training's l2: 0.684167	valid_1's l2: 0.68089
[8]	training's l2: 0.653887	valid_1's l2: 0.651103
[9]	training's l2: 0.626668	valid_1's l2: 0.624152
[10]	training's l2: 0.601894	valid_1's l2: 0.599711
[11]	training's l2: 0.5794	valid_1's l2: 0.577615
[12]	training's l2: 0.559624	valid_1's l2: 0.558001
[13]	training's l2: 0.541063	valid_1's l2: 0.539712
[14]	training's l2: 0.524836	valid_1's l2: 0.523712
[15]	training's l2: 0.509567	valid_1's l2: 0.508758
[16]	training's l2: 0.496129	valid_1's l2: 0.495556
[17]	training's l2: 0.483562	valid_1's l2: 0.483258
[18]	training's l2: 0.47215	valid_1's l2: 0.472097
[19]	training's

100%|██████████| 16/16 [37:17<00:00, 140.71s/it]


#### Generate submission

In [48]:
# Unweighted MSE over all 16 horizons; val_pred holds one array per horizon,
# so it is transposed to match the layout of y_val.
val_pred_matrix = np.array(val_pred).T
mse = mean_squared_error(y_val, val_pred_matrix)
mse

0.34589913886584017

In [34]:
# NOTE(review): scratch arithmetic with no stated meaning -- presumably a
# leaderboard rank divided by the number of teams; confirm or remove.
252/1672

0.1507177033492823

In [50]:
# Weighted RMSE on the transformed target: squared errors are summed over the
# 16 horizons per row, weighted by 1 + 0.25 * perishable, and normalised by
# the total weight times 16.
weight = items["perishable"] * 0.25 + 1
squared_err = (y_val - np.array(val_pred).T) ** 2
weighted_err = squared_err.sum(axis=1) * weight
err = np.sqrt(weighted_err.sum() / weight.sum() / 16)
print('nwrmsle = {}'.format(err)) #nwrmsle

nwrmsle = 0.5876048429200267


In [49]:
print("Making submission...")

# One row per (store, item), one column per test date.
y_test = np.array(test_pred).T
df_preds = pd.DataFrame(
    y_test,
    index=df_train.index,
    columns=pd.date_range("2017-08-16", periods=16),
)
# Reshape to long format, keyed the same way as df_test.
df_preds = df_preds.stack().to_frame("unit_sales")
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Test rows without a prediction get 0; predictions are mapped back from the
# log1p scale with expm1 and clipped to [0, 1000].
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.expm1(submission["unit_sales"]).clip(0, 1000)
submission.to_csv('lgb.csv', float_format='%.4f', index=None)

Making submission...


In [51]:
# NOTE(review): scratch arithmetic with no stated meaning -- presumably a
# leaderboard rank divided by the number of teams; confirm or remove.
32/1674

0.019115890083632018

In [52]:
# Recomputed here so the logged metric always matches the current predictions.
mse = mean_squared_error(y_val, np.array(val_pred).transpose())

mlflow.set_experiment('grocery forecasting')
with mlflow.start_run(run_name='lgbm'):
    mlflow.log_param('model', 'lgbm')
    mlflow.log_param('train starts', train_start)
    mlflow.log_params(params)
    # Log the full mappings: .values() alone is recorded as the opaque text
    # "dict_values([...])" and drops the column name each list belongs to.
    mlflow.log_param('lagging', dict(LAG_DICT))
    mlflow.log_param('featureEng', 'add: min, max, median, promo featureEng')
    mlflow.log_param('slidingWindows', dict(SLIDING_DICT))
    mlflow.log_param('item_info', 'Yes')
    mlflow.log_param('store_info', 'Yes')
    # Leaderboard results are entered by hand after submitting.
    # NOTE(review): a scratch cell in this notebook computes 32/1674 ~= 1.9%,
    # which does not match the "TOP 1%" label -- confirm the wording.
    mlflow.log_param('private score', 0.51737)
    mlflow.log_param('private rank', 'rank 32: TOP 1%')
    mlflow.log_param('public score', 0.51359)

    mlflow.log_metric('mse', mse)

print("Validation mse:", mse)

  from collections import (


Validation mse: 0.34589913886584017
