In [13]:
import pandas as pd
from tqdm import tqdm

In [14]:
# def gen_datasets(df,
#                  dataset_dir='tmp_data/',
#                  PREDICT_LENGTH=16,
#                  changing_cols=['onpromotion']
#                  ):
#     """ Generate train test sets for PREDICT_LENGTH of models.
#     Args:
#         df: a DataFrame
#         dataset_dir: String, a dir to record train test sets.

#     Returns:
#         train_frame_dict: a Dict: {Int: Dataframe}
#     """

#     storeId_list = df['store_nbr'].unique()
#     train_frame_dict = {}

#     for i in tqdm(range(PREDICT_LENGTH), desc = '1st loop'):
#         # i_frame: ith model
#         count = 0
#         for store_nbr in tqdm(storeId_list):
#             store_df = df[df['store_nbr'] == store_nbr]
#             item_list = store_df['item_nbr'].unique()
#             for item_nbr in item_list:
#                 item_df = store_df[store_df['item_nbr']==item_nbr]    
#                 item_df[changing_cols] = item_df[changing_cols].shift(-i)
#                 item_df=item_df.dropna()
#                 if count == 0:
#                     i_frame = item_df
#                 else:
#                     i_frame = pd.concat([i_frame, item_df])            
#                 count += 1

#         i_frame = i_frame.dropna()
#         i_frame.to_csv(dataset_dir + str(i) + '.csv', index=False)
#         train_frame_dict[i] = i_frame
#     return train_frame_dict

In [57]:
def gen_datasets(df,
                 dataset_dir='tmp_data/',
                 PREDICT_LENGTH=16,
                 changing_cols=['onpromotion'],
                 group_cols=('store_nbr', 'item_nbr')
                 ):
    """ Generate one shifted training set per forecast step and write each to CSV.

    For every step ``i`` in ``range(PREDICT_LENGTH)`` a copy of ``df`` is built in
    which ``changing_cols`` hold the values from ``i`` rows ahead and a new
    ``label`` column holds ``unit_sales`` from ``i`` rows ahead. Rows whose shifted
    values fall off the end of a series contain NaN; they are NOT dropped.

    Args:
        df: a DataFrame containing ``unit_sales`` and every column in
            ``changing_cols``. It is expected to be sorted so that the rows of
            each series are contiguous and in time order. ``df`` is not modified.
        dataset_dir: String, path prefix for the output files; step ``i`` is
            written to ``dataset_dir + str(i) + '.csv'``. The containing
            directory is created when missing.
        PREDICT_LENGTH: Int, number of forecast steps (one dataset per step).
        changing_cols: list of column names to take from the future row. The
            default list is never mutated.
        group_cols: column names identifying one series. Shifting is done inside
            each group so values never leak from one store/item into another.
            Names absent from ``df`` are ignored; if none are present (or the
            argument is empty) the whole frame is shifted as a single series.

    Returns:
        train_frame_dict: a Dict: {Int: Dataframe}, keyed by step ``i``.
    """
    import os

    changing_cols = list(changing_cols)
    shift_cols = changing_cols + ['unit_sales']
    keys = [col for col in (group_cols or ()) if col in df.columns]

    # to_csv does not create directories, so make sure the target exists.
    parent_dir = os.path.dirname(dataset_dir + '0.csv')
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    train_frame_dict = {}

    for i in tqdm(range(PREDICT_LENGTH), desc = '1st loop'):
        if keys:
            # Shift within each series; a plain shift would pull the first rows
            # of the next store/item into the tail of the previous one.
            shifted = df.groupby(keys, sort=False)[shift_cols].shift(-i)
        else:
            shifted = df[shift_cols].shift(-i)

        i_frame = df.copy()
        # Assign positionally (.values): the shifted frame is row-aligned with
        # df, and this avoids label alignment on a non-unique index.
        for col in changing_cols:
            i_frame[col] = shifted[col].values
        i_frame['label'] = shifted['unit_sales'].values

        i_frame.to_csv(dataset_dir + str(i) + '.csv', index=False)
        train_frame_dict[i] = i_frame

    return train_frame_dict

In [39]:
import os 



In [40]:
# os.listdir returns entries in arbitrary order; sort so the concatenation order
# (and therefore the resulting row order) is the same on every run.
files = sorted(os.listdir('feature_sets/'))

In [None]:
# Read every feature file first and concatenate once: calling pd.concat inside
# the loop re-copies all previously loaded rows on each iteration.
# The original per-file index is kept, as before (no ignore_index).
feature_frames = [pd.read_csv('feature_sets/{}'.format(f)) for f in files]
df = pd.concat(feature_frames)
del feature_frames

In [34]:
# Drop the leftover index column written by an earlier to_csv. errors='ignore'
# keeps the cell idempotent: re-running it after the column is gone is a no-op
# instead of a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [35]:
# Order rows by store, then item, then date.
sort_keys = ['store_nbr', 'item_nbr', 'date']
df = df.sort_values(by=sort_keys)

### Test Period

2017-08-16 to 2017-08-31

### Valid Period

2017-07-31 to 2017-08-15

In [None]:
# Bind the result to its own name: assigning it back to `df` would discard the
# feature frame, so the cell could not be re-run without reloading the data.
train_frame_dict = gen_datasets(df)



1st loop:   0%|          | 0/16 [00:00<?, ?it/s][A[A

1st loop:   6%|▋         | 1/16 [00:21<05:18, 21.21s/it][A[A

In [5]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [44]:

def _log1p_unit_sales(raw):
    """Parse a raw `unit_sales` field and return log1p of it, or 0 when it is not positive."""
    value = float(raw)
    return np.log1p(value) if value > 0 else 0


# Training data: columns 1-5 only, with unit_sales converted while parsing.
# The skipped row range is annotated "2016-01-01" in the original cell.
df_train = pd.read_csv(
    'datasets/train.csv',
    usecols=[1, 2, 3, 4, 5],
    parse_dates=["date"],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_unit_sales},
    skiprows=range(1, 66458909)  # 2016-01-01
)

# Test data: columns 0-4, indexed by (store_nbr, item_nbr, date).
test_index_cols = ['store_nbr', 'item_nbr', 'date']
df_test = pd.read_csv(
    'datasets/test.csv',
    usecols=[0, 1, 2, 3, 4],
    parse_dates=["date"],
    dtype={'onpromotion': bool}
).set_index(test_index_cols)

# Item metadata keyed by item number.
items = pd.read_csv('datasets/items.csv').set_index("item_nbr")


In [45]:

# Keep only the 77 consecutive days (7 * 11) starting at 2017-05-31, then
# release the full training frame.
window_2017 = pd.date_range("2017-05-31", periods=7 * 11)
df_2017 = df_train.loc[df_train["date"].isin(window_2017)].copy()
del df_train

In [46]:
# Long-format promotion flags indexed by (store_nbr, item_nbr, date).
promo_index_cols = ["store_nbr", "item_nbr", "date"]
promo_2017_train = df_2017.set_index(promo_index_cols).loc[:, ["onpromotion"]]

In [49]:
# Preview the first rows of the training promotion table.
# NOTE(review): this cell's execution count (49) is later than the unstack cell
# that follows it (48), and the saved output shows one column per date, so the
# preview reflects the frame AFTER unstacking, not the long-format frame built
# in the previous cell.
promo_2017_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
Unnamed: 0_level_1,date,2017-05-31,2017-06-01,2017-06-02,2017-06-03,2017-06-04,2017-06-05,2017-06-06,2017-06-07,2017-06-08,2017-06-09,...,2017-08-06,2017-08-07,2017-08-08,2017-08-09,2017-08-10,2017-08-11,2017-08-12,2017-08-13,2017-08-14,2017-08-15
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [48]:
# Pivot the last index level into columns; combinations with no row in the
# source become False.
promo_2017_train = (
    promo_2017_train
    .unstack(level=-1)
    .fillna(False)
)

In [16]:
# Flatten the two-level column header down to its second level.
promo_train_dates = promo_2017_train.columns.get_level_values(1)
promo_2017_train.columns = promo_train_dates

In [17]:
# Preview the training promotion table after its column header was flattened.
promo_2017_train.head()

Unnamed: 0_level_0,date,2017-05-31 00:00:00,2017-06-01 00:00:00,2017-06-02 00:00:00,2017-06-03 00:00:00,2017-06-04 00:00:00,2017-06-05 00:00:00,2017-06-06 00:00:00,2017-06-07 00:00:00,2017-06-08 00:00:00,2017-06-09 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
# Same reshaping for the test period: pivot the last index level of df_test
# into columns, fill the gaps with False, then flatten the column header.
promo_2017_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
promo_test_dates = promo_2017_test.columns.get_level_values(1)
promo_2017_test.columns = promo_test_dates



In [22]:
# Preview the wide test-period promotion table (one column per test date).
promo_2017_test.head()

Unnamed: 0_level_0,date,2017-08-16 00:00:00,2017-08-17 00:00:00,2017-08-18 00:00:00,2017-08-19 00:00:00,2017-08-20 00:00:00,2017-08-21 00:00:00,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103501,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [23]:
# Align the test table to the training rows: rows absent from the training
# index are dropped, training rows absent from the test table are filled with False.
train_row_index = promo_2017_train.index
promo_2017_test = promo_2017_test.reindex(train_row_index).fillna(False)

In [24]:
# Put training and test date columns side by side in one table, then free the
# two partial tables.
promo_2017 = pd.concat((promo_2017_train, promo_2017_test), axis="columns")
del promo_2017_train, promo_2017_test

In [26]:
# Reshape sales to wide format: one row per (store_nbr, item_nbr), one column
# per date; combinations with no row in the source become 0.
sales_index_cols = ["store_nbr", "item_nbr", "date"]
df_2017 = (
    df_2017
    .set_index(sales_index_cols)[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)

In [27]:
# Flatten the sales column header to its second level.
sales_dates = df_2017.columns.get_level_values(1)
df_2017.columns = sales_dates

# Repeat the item metadata so it has one row per row of df_2017, in the same
# order (looked up by the second index level of df_2017).
row_item_ids = df_2017.index.get_level_values(1)
items = items.reindex(row_item_ids)

In [28]:

def get_timespan(df, dt, minus, periods):
    """Select a run of consecutive daily columns from a date-columned frame.

    Args:
        df: DataFrame whose column labels are dates.
        dt: reference date.
        minus: number of days before ``dt`` at which the window starts.
        periods: number of consecutive days in the window.

    Returns:
        The sub-frame of ``df`` holding the ``periods`` daily columns that
        start at ``dt - minus`` days.
    """
    window_start = dt - timedelta(days=minus)
    window_days = pd.date_range(window_start, periods=periods)
    return df[window_days]

def prepare_dataset(t2017, is_train=True, horizon=16):
    """Build the feature matrix (and optionally the targets) for one forecast origin.

    Reads the module-level wide tables ``df_2017`` (sales) and ``promo_2017``
    (promotion flags), both with one column per date.

    Features, one row per row of ``df_2017``:
        mean_3_2017 / mean_7_2017 / mean_14_2017: mean sales over the 3 / 7 / 14
            days immediately before ``t2017``.
        promo_14_2017: sum of promotion flags over the 14 days before ``t2017``.
        promo_0 .. promo_{horizon-1}: promotion flag on ``t2017 + i`` days, as uint8.

    Args:
        t2017: first day of the forecast window (``datetime.date``,
            ``datetime.datetime`` or ``pd.Timestamp``).
        is_train: when True also return the targets.
        horizon: number of forecast days (default 16, the original fixed value).

    Returns:
        ``(X, y)`` when ``is_train`` is True, where ``y`` is an array with one
        column per forecast day taken from ``df_2017``; otherwise just ``X``.
    """
    # Normalise to Timestamp: newer pandas releases raise KeyError when a
    # DatetimeIndex is looked up with a plain datetime.date key.
    t2017 = pd.Timestamp(t2017)

    X = pd.DataFrame({
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values
    })
    for i in range(horizon):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=horizon)
        ].values
        return X, y
    return X

In [29]:
print("Preparing dataset...")

# First forecast origin for the training windows.
t2017 = date(2017, 6, 21)

# Accumulators for the per-window feature frames and target arrays.
X_l = []
y_l = []


Preparing dataset...


In [30]:
# Build four training windows whose origins are one week apart, starting at t2017.
for week in range(4):
    window_origin = t2017 + timedelta(days=7 * week)
    X_week, y_week = prepare_dataset(window_origin)
    X_l.append(X_week)
    y_l.append(y_week)

In [31]:
# Stack the weekly windows row-wise into the training set, then free the lists.
X_train = pd.concat(X_l, axis="index")
y_train = np.concatenate(y_l)
del X_l, y_l

# Validation window starts at 2017-07-26 (features and targets).
val_origin = date(2017, 7, 26)
X_val, y_val = prepare_dataset(val_origin)

# Test window starts at 2017-08-16 (features only).
test_origin = date(2017, 8, 16)
X_test = prepare_dataset(test_origin, is_train=False)

In [32]:
print("Training and predicting models...")

# LightGBM training parameters shared by every per-day model.
params = dict(
    num_leaves=2**5 - 1,
    objective='regression_l2',
    max_depth=8,
    min_data_in_leaf=50,
    learning_rate=0.05,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=1,
    metric='l2',
    num_threads=4,
)

# Upper bound on boosting rounds.
MAX_ROUNDS = 1000

# Per-day prediction accumulators and the (empty) list of categorical features.
val_pred, test_pred, cate_vars = [], [], []

Training and predicting models...


In [None]:
# Sample weights do not depend on the forecast day, so build them once instead
# of on every iteration: 1 + 0.25 * items["perishable"].
val_weight = items["perishable"] * 0.25 + 1
# X_train is a stack of equally sized windows, each with one row per row of
# `items`; derive the repeat count from the data so the weights cannot drift
# out of step with the number of training windows (4 in this notebook).
n_train_windows = len(X_train) // max(len(items), 1)
train_weight = pd.concat([items["perishable"]] * n_train_windows) * 0.25 + 1

# Train one model per forecast day: column i of y_train / y_val is the target.
for i in range(y_train.shape[1]):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=val_weight,
        categorical_feature=cate_vars)
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are accepted by the
    # LightGBM version this notebook was run with; newer LightGBM releases are
    # reported to expect callbacks instead - confirm against the installed version.
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=50, verbose_eval=50
    )
    # Feature importances by gain, largest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    # Predict with the best iteration; fall back to MAX_ROUNDS when
    # best_iteration is falsy.
    best_rounds = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=best_rounds))
    test_pred.append(bst.predict(X_test, num_iteration=best_rounds))