In [1]:
from datetime import date, timedelta
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm, tnrange

from sklearn.metrics import mean_squared_error
import lightgbm as lgb

import mlflow
import mlflow.sklearn

from config import (
    RAW_DATA_DIR,
    FEATURE_DIR,
    LAG_DICT,
    SLIDING_DICT
)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Workaround for the LightGBM error seen on macOS: going by the variable name,
# this tells the OpenMP runtime to tolerate a duplicate library being loaded.
import os
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

In [3]:
# load data
def _log1p_positive(raw_value):
    """Converter for ``unit_sales``: log1p of positive values, 0 for the rest."""
    sales = float(raw_value)
    return np.log1p(sales) if sales > 0 else 0


# Training rows: columns 1-5 (by position) only; the leading rows are skipped
# so that reading starts at the row the original note marks as 2016-01-01.
df_train = pd.read_csv(
    RAW_DATA_DIR+'train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_positive},
    parse_dates=["date"],
    skiprows=range(1, 66458909),  # 2016-01-01
)

# Test rows, indexed by (store, item, date) so they can be pivoted and joined later.
df_test = pd.read_csv(
    RAW_DATA_DIR+'test.csv',
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],  # , date_parser=parser
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

# Item and store metadata, keyed by their identifiers.
items = pd.read_csv(RAW_DATA_DIR+'items.csv').set_index("item_nbr")

stores = pd.read_csv(RAW_DATA_DIR+'stores.csv').set_index("store_nbr")

### Test Period

2017-08-16 to 2017-08-31

In [4]:
# First and last day of the 16-day period to forecast.
test_start, test_end = date(2017, 8, 16), date(2017, 8, 31)

In [5]:
# Start 16 days before the test period and walk backwards one day at a time
# until the validation period begins on the same weekday as the test period.
valid_start = test_start - timedelta(days=16)
while valid_start.weekday() != test_start.weekday():
    valid_start -= timedelta(days=1)
# The validation period covers 16 days, like the test period.
valid_end = valid_start + timedelta(days=15)
print('valid starts from {} to {}'.format(valid_start, valid_end))

valid starts from 2017-07-26 to 2017-08-10


### Valid Period

Since more recent periods of sales data are likely to have more in common with the test period, it is better to use the nearest period as the validation period.

Based on the earlier analysis, we assume the sales data is periodic with a period of 7 days, so we want to keep the same weekday alignment
in the train, validation and test periods.

So finally, we choose valid period:

2017-07-26 to 2017-08-10


In [6]:
# Pin the validation period chosen above as explicit constants.
valid_start = date(year=2017, month=7, day=26)
valid_end = date(year=2017, month=8, day=10)

### Filter Period

#### An earthquake happened on April 16, 2016. It may have affected sales for the following several weeks.

In [7]:
# Drop the four weeks that follow the 2016-04-16 earthquake.
filter_date = date(2016, 4, 16) + timedelta(weeks=4)
# Longest look-back window (in days) used by the features.
lag_max = 140

# The first candidate is the day after the longest look-back window ends;
# advance until the weekday matches the validation period's first day.
train_start = filter_date + timedelta(days=lag_max + 1)
while train_start.weekday() != valid_start.weekday():
    train_start += timedelta(days=1)
print('train datasets starts from {}'.format(train_start))

train datasets starts from 2016-10-05


In [8]:
# Manual override of the train_start computed in the previous cell:
# 2017-04-05 is 112 days (16 weeks) before 2017-07-26, so it falls on the
# same weekday as the validation start.
train_start = date(2017, 4, 5)

### Wages in the public sector are paid every two weeks, on the 15th and on the last day of the month. Supermarket sales could be affected by this.


In [9]:
# Keep only rows on or after the post-earthquake cut-off date.
# The cut-off is a datetime.date; convert it explicitly so the comparison with
# the datetime64 column does not rely on pandas' deprecated implicit coercion.
df_train = df_train[df_train['date'] >= pd.Timestamp(filter_date)]

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  """Entry point for launching an IPython kernel.


#### Promo feature

In [10]:
# Pivot the training promotion flags to one row per (store, item) and one
# column per date; combinations with no record count as "not on promotion".
promo_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
# Keep only the date level of the column index.
promo_train.columns = promo_train.columns.get_level_values(1)

In [11]:
# Pivot the test promotion flags the same way; missing entries count as
# "not on promotion".
promo_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
promo_test.columns = promo_test.columns.get_level_values(1)
# Align rows with promo_train: pairs that only exist in the test set are
# dropped, and pairs missing from the test set are filled with False.
promo_test = promo_test.reindex(promo_train.index).fillna(False)

In [12]:
# Put train and test promotion flags side by side (dates as columns), then
# release the two intermediate frames.
promo_features = pd.concat([promo_train, promo_test], axis='columns')
del promo_train, promo_test

## Label

In [13]:
# label: pivot unit_sales to one row per (store, item) and one column per
# date; days without a record are treated as zero sales.
df_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)

### Item

In [14]:
# Align item metadata with df_train: one row per (store, item) pair, taken
# from the item_nbr level of the index, in the same order as df_train's rows.
items = items.reindex(df_train.index.get_level_values(1))

#### Item Family

In [15]:
# Encode the item family as integer category codes, one per row of ``items``.
family_as_category = items['family'].astype('category')
items['family'] = family_as_category
item_family_features = family_as_category.cat.codes.values

#### Item's class

In [16]:
# Encode the item class as integer category codes, one per row of ``items``.
class_as_category = items['class'].astype('category')
items['class'] = class_as_category
item_class_features = class_as_category.cat.codes.values

### Store

In [17]:
# Align store metadata with df_train: one row per (store, item) pair, taken
# from the store_nbr level of the index, in the same order as df_train's rows.
stores = stores.reindex(df_train.index.get_level_values(0))

#### Store's city

In [18]:
# Encode the store city as integer category codes, one per row of ``stores``.
city_as_category = stores['city'].astype('category')
stores['city'] = city_as_category
store_city_features = city_as_category.cat.codes.values

#### Store's state

In [19]:
# Encode the store state as integer category codes, one per row of ``stores``.
state_as_category = stores['state'].astype('category')
stores['state'] = state_as_category
store_state_features = state_as_category.cat.codes.values

#### Store's type

In [20]:
# Encode the store type as integer category codes, one per row of ``stores``.
type_as_category = stores['type'].astype('category')
stores['type'] = type_as_category
store_type_features = type_as_category.cat.codes.values

#### Store's cluster

In [21]:
# Encode the store cluster as integer category codes, one per row of ``stores``.
cluster_as_category = stores['cluster'].astype('category')
stores['cluster'] = cluster_as_category
store_cluster_features = cluster_as_category.cat.codes.values

In [22]:
# Drop the outer "unit_sales" column level so that columns are labelled by
# date only (level 1 of the column index produced by the unstack above).
df_train.columns = df_train.columns.get_level_values(1)

#### Filling missing date

In [23]:
# Add an all-zero sales column for every calendar day between the cut-off and
# the day before the test period that has no column yet, so that every
# date-based window lookup can find its column.
date_list = df_train.columns
obj_list = pd.date_range(filter_date, test_start-timedelta(1))
# sorted() makes the print/insert order deterministic; plain set order is not.
diff_list = sorted(set(obj_list) - set(date_list))
for i in diff_list:
    print(i)
    df_train[i] = 0

2016-12-25 00:00:00


In [24]:
# Add a "no promotion" column for every calendar day between the cut-off and
# the end of the test period that has no column yet.
date_list = promo_features.columns
obj_list = pd.date_range(filter_date, test_end)
# sorted() makes the print/insert order deterministic; plain set order is not.
diff_list = sorted(set(obj_list) - set(date_list))
for i in diff_list:
    print(i)
    # False (rather than 0) matches the boolean flags used to fill the other columns.
    promo_features[i] = False

2016-12-25 00:00:00


#### Lagging and sliding windows

In [25]:
# NOTE(review): these assignments rebind LAG_DICT, SLIDING_DICT and
# RAW_DATA_DIR, which the first cell already imports from ``config``.

# Single-day lags (days back from the window start) for each signal.
LAG_DICT = {
    'unit_sales': list(range(1, 17)) + [21, 30, 60],
    'onpromotion': list(range(2, 8)) + [14, 21],
}

# Sliding-window lengths (in days) for the sales statistics.
SLIDING_DICT = {
    'unit_sales': list(range(3, 8)) + [14, 21, 30, 60],
}

# initialise dirs
RAW_DATA_DIR = 'datasets/'

In [None]:

    ### Feature block that goes beyond the features of the NN model:
    # [Time window]
    for i in [3, 7, 14, 30, 60, 140]:
        tmp1 = get_timespan(df, t2017, i, i)
        tmp2 = (get_timespan(promo_df, t2017, i, i) > 0) * 1
        # [Feature] Mean sales on promotion days
        X['has_promo_mean_%s' % i] = (tmp1 * tmp2.replace(0, np.nan)).mean(axis=1).values
        # [Feature] Sum of sales on promotion days with increasing weights; np.power(0.9, np.arange(i)[::-1]) produces exponentially increasing weights
        X['has_promo_mean_%s_decay' % i] = (tmp1 * tmp2.replace(0, np.nan) * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
        # [Feature] The same statistics for non-promotion days
        X['no_promo_mean_%s' % i] = (tmp1 * (1 - tmp2).replace(0, np.nan)).mean(axis=1).values
        X['no_promo_mean_%s_decay' % i] = (tmp1 * (1 - tmp2).replace(0, np.nan) * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values



In [31]:
# Scratch check: the date 16 days after the first training window start.
train_start + timedelta(days=16)

datetime.date(2017, 4, 21)

In [39]:
def get_timespan(df, 
                 start_time,
                 minus,
                 periods,
                 freq='D'):
    """Select a run of date-labelled columns from a wide frame.

    The run starts ``minus`` days before ``start_time`` and contains
    ``periods`` dates spaced ``freq`` apart.

    Args:
        df: frame whose column labels are dates.
        start_time: reference date (``datetime.date`` or timestamp).
        minus: number of days to step back from ``start_time``.
        periods: number of dates (columns) to select.
        freq: pandas frequency string giving the spacing between dates.

    Returns:
        The sub-frame of ``df`` holding only the selected date columns.

    Raises:
        KeyError: if a requested date is not a column of ``df``.
    """
    window_start = start_time - timedelta(days=minus)
    return df[pd.date_range(window_start, periods=periods, freq=freq)]


def gen_dataset(df, 
                promo_features,
                item_family_features,
                item_class_features,
                store_city_features,
                store_state_features,
                store_type_features,
                store_cluster_features,
                start_time,
                is_train=True,
                lag_dict=None,
                sliding_dict=None):
    """Build the feature matrix (and optionally labels) for one forecast origin.

    All look-back features are computed from columns strictly before
    ``start_time``; the labels are the 16 columns of ``df`` starting at
    ``start_time``.

    Args:
        df: wide sales frame, one row per series and one column per date.
        promo_features: wide promotion frame with the same rows and date columns
            that also cover the 16 days from ``start_time`` onwards.
        item_family_features, item_class_features, store_city_features,
        store_state_features, store_type_features, store_cluster_features:
            one encoded value per row of ``df``; used positionally.
        start_time: first day of the 16-day period to predict.
        is_train: when True, also return the labels.
        lag_dict: mapping with ``'unit_sales'`` and ``'onpromotion'`` lists of
            single-day lags. Defaults to the module-level ``LAG_DICT``.
        sliding_dict: mapping with a ``'unit_sales'`` list of window lengths.
            Defaults to the module-level ``SLIDING_DICT``.

    Returns:
        ``(X, y)`` when ``is_train`` is True, where ``y`` has one column per
        forecast day; otherwise just ``X``.

    Raises:
        KeyError: if a required date column is missing from ``df`` or
            ``promo_features``.
    """
    if lag_dict is None:
        lag_dict = LAG_DICT
    if sliding_dict is None:
        sliding_dict = SLIDING_DICT

    # Collect all columns first and build the frame in one go: inserting well
    # over a hundred columns one at a time fragments the frame.
    features = {}

    # Sales on single days before the forecast origin.
    for i in lag_dict['unit_sales']:
        features['lag_{}_sales'.format(i)] = get_timespan(df, start_time, i, 1).values.ravel()

    # Promotion flag on single days before the forecast origin.
    for i in lag_dict['onpromotion']:
        features['sum_{}_promo'.format(i)] = get_timespan(promo_features, start_time, i, 1).sum(axis=1).values

    # Number of promotion days in the first i days after the forecast origin
    # (the window starts at start_time + 1 day; i = 0 gives an empty window).
    for i in range(16):
        features['sum_{}_promo_test'.format(i)] = get_timespan(
            promo_features, start_time + timedelta(days=16), 15, i).sum(axis=1).values

    # Summary statistics over the last i days of sales (window fetched once).
    for i in sliding_dict['unit_sales']:
        window = get_timespan(df, start_time, i, i)
        features["mean_{}_sales".format(i)] = window.mean(axis=1).values
        features["std_{}_sales".format(i)] = window.std(axis=1).values
        features["min_{}_sales".format(i)] = window.min(axis=1).values
        features["max_{}_sales".format(i)] = window.max(axis=1).values
        features["median_{}_sales".format(i)] = window.median(axis=1).values

    # Mean sales on the same weekday over the last 4 and 20 weeks.
    for i in range(7):
        features['mean_4_dow{}_2017'.format(i)] = get_timespan(df, start_time, 28-i, 4, freq='7D').mean(axis=1).values
        features['mean_20_dow{}_2017'.format(i)] = get_timespan(df, start_time, 140-i, 20, freq='7D').mean(axis=1).values

    # for the next to-predict 16 days
    for i in range(16):
        # Look the column up with a Timestamp: a plain datetime.date key is not
        # accepted by every pandas version for a DatetimeIndex.
        promo_day = pd.Timestamp(start_time + timedelta(days=i))
        features["promo_{}".format(i)] = promo_features[promo_day].values.astype(np.uint8)

    # Activity counts and recency of the last active day in several windows.
    for i in [7, 14, 30, 60, 140]:
        tmp = get_timespan(df, start_time, i, i)
        features['has_sales_days_in_last_{}'.format(i)] = (tmp > 0).sum(axis=1).values
        features['last_has_sales_day_in_last_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values

        tmp = get_timespan(promo_features, start_time, i, i)
        features['has_promo_days_in_last_%s' % i] = (tmp > 0).sum(axis=1).values
        features['last_has_promo_day_in_last_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values

    # Static categorical descriptors, taken positionally (row k belongs to row k of df).
    features['item_family_features'] = np.asarray(item_family_features)
    features['item_class_features'] = np.asarray(item_class_features)
    features['store_city_features'] = np.asarray(store_city_features)
    features['store_state_features'] = np.asarray(store_state_features)
    features['store_type_features'] = np.asarray(store_type_features)
    features['store_cluster_features'] = np.asarray(store_cluster_features)

    X = pd.DataFrame(features)

    if is_train:
        y = df[pd.date_range(start_time, periods=16)].values
        return X, y
    return X


#### Generate train, valid and test sets

In [40]:
print("Preparing dataset...")

# Number of weekly training windows between the first training window start
# and the validation start.
nbr_weeks = int((valid_start - train_start).days/7)

# One (features, labels) pair per weekly window, each shifted by 7 days.
X_l, y_l = [], []
for week in tqdm(range(nbr_weeks), desc = 'No. of week'):
    window_start = train_start + timedelta(days=7 * week)
    X_week, y_week = gen_dataset(
        df_train,
        promo_features,
        item_family_features,
        item_class_features,
        store_city_features,
        store_state_features,
        store_type_features,
        store_cluster_features,
        window_start,
    )
    X_l.append(X_week)
    y_l.append(y_week)

No. of week:   0%|          | 0/16 [00:00<?, ?it/s]

Preparing dataset...


No. of week: 100%|██████████| 16/16 [01:30<00:00,  5.71s/it]


In [41]:
# Stack the weekly windows row-wise into one training set, then free the parts.
X_train = pd.concat(X_l)
y_train = np.concatenate(y_l)
del X_l, y_l

In [43]:

# Inputs shared by the validation and test feature builds.
_shared_inputs = (
    df_train,
    promo_features,
    item_family_features,
    item_class_features,
    store_city_features,
    store_state_features,
    store_type_features,
    store_cluster_features,
)

# Validation set: features and labels for the window starting at valid_start.
X_val, y_val = gen_dataset(*_shared_inputs, valid_start)
# Test set: features only, since the labels are what we want to predict.
X_test = gen_dataset(*_shared_inputs, test_start, is_train=False)

#### Train Model

In [46]:
print("Training and predicting models...")

# LightGBM settings shared by every per-day model.
params = dict(
    num_leaves=2 ** 8 - 1,
    objective='regression_l2',
    max_depth=8,
    min_data_in_leaf=50,
    learning_rate=0.05,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=1,
    metric='l2',
    num_threads=4,
)

# Upper bound on boosting rounds per model.
MAX_ROUNDS = 200
# Per-day prediction buffers, filled by the training loop.
val_pred = []
test_pred = []
# Feature columns that LightGBM should treat as categorical.
cate_vars = [
    '{}_features'.format(prefix)
    for prefix in ('item_family', 'item_class', 'store_city',
                   'store_state', 'store_type', 'store_cluster')
]

Training and predicting models...


In [None]:
# Train one LightGBM model per forecast day and collect its predictions.

# Start from empty buffers so that re-running this cell does not append a
# second set of predictions to the ones left over from an earlier run.
val_pred = []
test_pred = []

# Sample weights are the same for every forecast day, so build them once:
# rows with perishable == 1 get weight 1.25, the others 1.0.
train_weight = pd.concat([items["perishable"]] * nbr_weeks) * 0.25 + 1
val_weight = items["perishable"] * 0.25 + 1

for i in tqdm(range(16)):
    # Column i of the label matrix is the target for forecast day i.
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=val_weight,
        categorical_feature=cate_vars)

    # NOTE(review): newer LightGBM releases expect early stopping to be passed
    # as a callback rather than this keyword; confirm against the installed version.
    bst = lgb.train(
        params,
        dtrain,
        num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50)
    # Report features ordered by total gain, most important first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    # Predict with the best iteration found, or all rounds if none was recorded.
    val_pred.append(bst.predict(
        X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(
        X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))
    




[1]	training's l2: 1.04308	valid_1's l2: 1.00166
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.971876	valid_1's l2: 0.932673
[3]	training's l2: 0.909421	valid_1's l2: 0.872389
[4]	training's l2: 0.851101	valid_1's l2: 0.816019
[5]	training's l2: 0.798314	valid_1's l2: 0.764964
[6]	training's l2: 0.750631	valid_1's l2: 0.718945
[7]	training's l2: 0.707583	valid_1's l2: 0.677254
[8]	training's l2: 0.668575	valid_1's l2: 0.639617
[9]	training's l2: 0.634347	valid_1's l2: 0.606684
[10]	training's l2: 0.602286	valid_1's l2: 0.575777
[11]	training's l2: 0.573257	valid_1's l2: 0.5478
[12]	training's l2: 0.546986	valid_1's l2: 0.522542
[13]	training's l2: 0.523325	valid_1's l2: 0.499736
[14]	training's l2: 0.501798	valid_1's l2: 0.479107
[15]	training's l2: 0.482315	valid_1's l2: 0.460431
[16]	training's l2: 0.465276	valid_1's l2: 0.444262
[17]	training's l2: 0.449904	valid_1's l2: 0.429631
[18]	training's l2: 0.435273	valid_1's l2: 0.415667
[19]	training's

  6%|▋         | 1/16 [02:28<37:01, 148.09s/it]

[1]	training's l2: 0.941742	valid_1's l2: 0.923414
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.882799	valid_1's l2: 0.865569
[3]	training's l2: 0.82957	valid_1's l2: 0.81349
[4]	training's l2: 0.781429	valid_1's l2: 0.766275
[5]	training's l2: 0.737955	valid_1's l2: 0.723719
[6]	training's l2: 0.698657	valid_1's l2: 0.685127
[7]	training's l2: 0.663155	valid_1's l2: 0.650251
[8]	training's l2: 0.630953	valid_1's l2: 0.618623
[9]	training's l2: 0.601902	valid_1's l2: 0.590107
[10]	training's l2: 0.575501	valid_1's l2: 0.564239
[11]	training's l2: 0.551655	valid_1's l2: 0.5409
[12]	training's l2: 0.530085	valid_1's l2: 0.519782
[13]	training's l2: 0.510609	valid_1's l2: 0.50062
[14]	training's l2: 0.49297	valid_1's l2: 0.483426
[15]	training's l2: 0.476968	valid_1's l2: 0.467783
[16]	training's l2: 0.462499	valid_1's l2: 0.453626
[17]	training's l2: 0.44945	valid_1's l2: 0.44085
[18]	training's l2: 0.437575	valid_1's l2: 0.429268
[19]	training's l2:

 12%|█▎        | 2/16 [04:47<33:57, 145.52s/it]

[1]	training's l2: 1.04939	valid_1's l2: 1.05788
Training until validation scores don't improve for 50 rounds.
[2]	training's l2: 0.981186	valid_1's l2: 0.989747
[3]	training's l2: 0.919235	valid_1's l2: 0.927866
[4]	training's l2: 0.863157	valid_1's l2: 0.871762
[5]	training's l2: 0.812775	valid_1's l2: 0.821429
[6]	training's l2: 0.767725	valid_1's l2: 0.776221
[7]	training's l2: 0.726372	valid_1's l2: 0.734886
[8]	training's l2: 0.68876	valid_1's l2: 0.697186
[9]	training's l2: 0.65494	valid_1's l2: 0.663249
[10]	training's l2: 0.624298	valid_1's l2: 0.632503
[11]	training's l2: 0.596324	valid_1's l2: 0.604548
[12]	training's l2: 0.571092	valid_1's l2: 0.579292
[13]	training's l2: 0.548192	valid_1's l2: 0.556343
[14]	training's l2: 0.527553	valid_1's l2: 0.535632
[15]	training's l2: 0.509003	valid_1's l2: 0.516978
[16]	training's l2: 0.491866	valid_1's l2: 0.499768
[17]	training's l2: 0.476408	valid_1's l2: 0.484243
[18]	training's l2: 0.462359	valid_1's l2: 0.470189
[19]	training's

#### Generate submission

In [32]:
# Validation MSE across all forecast days: the per-day predictions are
# transposed into the same (rows, days) layout as y_val.
val_matrix = np.array(val_pred).transpose()
mse = mean_squared_error(y_val, val_matrix)
mse

0.3511710550491811

In [34]:
# Scratch arithmetic (about 0.15): presumably leaderboard rank over number of
# teams, matching the '15%' private rank logged to mlflow -- TODO confirm.
252/1672

0.1507177033492823

In [35]:
# Validation MSE over all forecast days (predictions transposed to rows x days).
mse = mean_squared_error(y_val, np.array(val_pred).transpose())

# Record this run's configuration and score in the mlflow experiment.
mlflow.set_experiment('grocery forecasting')
with mlflow.start_run(run_name='lgbm'):
    mlflow.log_param('model', 'lgbm')
    mlflow.log_param('train starts', train_start)
    mlflow.log_params(params)
    # Only the dict values are passed here, so the logged value does not
    # carry the 'unit_sales' / 'onpromotion' keys.
    mlflow.log_param('lagging', LAG_DICT.values())
    mlflow.log_param('slidingWindows', SLIDING_DICT.values())
    mlflow.log_param('item_info', 'Yes')
    mlflow.log_param('store_info', 'Yes')
    # The scores and rank below are hard-coded literals, not computed in this notebook.
    mlflow.log_param('private score', 0.52207)
    mlflow.log_param('private rank', '15%')
    mlflow.log_param('public score', 0.51485)

    mlflow.log_metric('mse', mse)
    
print("Validation mse:", mse)

  from collections import (


Validation mse: 0.3511710550491811


In [33]:
print("Making submission...")
# Rows are (store, item) pairs, columns are forecast days.
y_test = np.array(test_pred).transpose()
# Label the columns from test_start and the actual number of predicted days,
# so the submission dates always match the period the test features were built for.
df_preds = pd.DataFrame(
    y_test, index=df_train.index,
    columns=pd.date_range(test_start, periods=y_test.shape[1])
).stack().to_frame("unit_sales")
df_preds.index = df_preds.index.set_names(["store_nbr", "item_nbr", "date"])

# Test rows without a prediction (pairs absent from the training data) get 0.
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p applied to unit_sales at load time and clip to [0, 1000].
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lgb.csv', float_format='%.4f', index=None)

Making submission...
