In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

# Custom import
from multiprocessing import Pool        # Multiprocess Runs

warnings.filterwarnings('ignore')

In [18]:
# Directory holding the competition input files; the export cell reads
# "sample_submission.csv" from here. "/PATH/" is a placeholder -- point it at
# your local data folder before running.
DataPath = "/PATH/"

In [2]:
# Set seed
def seed_all(seed = 0):
    """Seed Python's `random` module and NumPy's global RNG with the same value.

    Args:
        seed: integer seed handed to both generators (defaults to 0).
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Multiprocess runs
def df_paral_run(func, t_split):
    """Apply `func` to every element of `t_split` in a process pool and join the results column-wise.

    Args:
        func: picklable callable that takes one element of `t_split` and
            returns a DataFrame.
        t_split: sequence of work items; one task is submitted per item.

    Returns:
        A DataFrame made by concatenating the per-item results along axis 1,
        in the order of `t_split`.

    The pool size is the smaller of the module-level `N_cores` and the number
    of work items.
    """
    num_cores = min(N_cores, len(t_split))
    # The context manager tears the pool down even when a worker raises, so
    # failed runs do not leave orphaned worker processes behind.
    with Pool(num_cores) as pool:
        parts = pool.map(func, t_split)
    return pd.concat(parts, axis = 1)

In [3]:
# Model parameters
import lightgbm as lgb
# LightGBM settings shared by every per-store model.
# The 'seed' entry is added later by the configuration cell.
lgb_params = dict(
    # objective / evaluation
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    # row sampling: half of the rows, re-drawn every iteration
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.03,
    # tree shape: 2047 leaves, at least 4095 rows per leaf
    num_leaves=2**11 - 1,
    min_data_in_leaf=2**12 - 1,
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=1400,
    boost_from_average=False,
    verbose=-1,
)

In [8]:
# Variables
Ver = 3 # Version 3 -- used in the model and submission file names
Seed = 42
seed_all(Seed)
lgb_params['seed'] = Seed
# psutil.cpu_count() returns None when the CPU count cannot be determined;
# fall back to one worker so the pool-size computation in df_paral_run still works.
N_cores = psutil.cpu_count() or 1

Target = "sales"     # name of the label column
First_train = 0      # rows with d < First_train are dropped when loading a store
Last_train = 1941    # last day index used for training
Pred_len = 28        # number of days in the validation window

In [5]:
# Remove features that will result in overfitting
# Columns that are never used as model inputs: identifiers, raw date keys,
# a few price/lag columns, and the target itself.
remove_features = [
    "id", "state_id", "store_id",
    "date", "wm_yr_wk", "d", "month", "year",
    "price_nunique", "price_momentum_y",
    "sales_lag_34", "sales_lag_36",
    Target,
]

# Columns taken from the mean-encoding pickle (mean and std per grouping).
mean_features = [
    "enc_cat_id_mean",  "enc_cat_id_std",
    "enc_dept_id_mean", "enc_dept_id_std",
    "enc_item_id_mean", "enc_item_id_std",
]

In [6]:
# Path for features
# Pickles concatenated column-wise into the basic grid by get_data_by_store
Base, Price, Calendar = "M5_part_1.pkl", "M5_part_2.pkl", "M5_part_3.pkl"
# Pickles holding the lag features and the mean-encoding features
Lags, Mean_enc = "lags_df_28.pkl", "mean_encoding_df.pkl"

In [9]:
#Stores ids
# One model is trained per entry; order determines training and prediction order.
Stores_IDs = [
    'CA_1', 'CA_2', 'CA_3', 'CA_4',   # four CA stores
    'TX_1', 'TX_2', 'TX_3',           # three TX stores
    'WI_1', 'WI_2', 'WI_3',           # three WI stores
]

In [10]:
# Splits for lags creation
Shift_day  = 28   # smallest lag, in days
N_lags     = 15   # number of consecutive lags starting at Shift_day
# Plain lags: Shift_day, Shift_day + 1, ..., Shift_day + N_lags - 1
Lags_split = list(range(Shift_day, Shift_day + N_lags))
# [shift, window] pairs for the rolling means rebuilt at prediction time.
# A comprehension keeps the loop variables out of the notebook namespace.
Rols_split = [[shift, window] for shift in [1,7,14] for window in [7,14,28,56]]

In [11]:
# Functions to load data by store_id
def get_data_by_store(store):
    """Assemble the modelling grid for a single store.

    Reads the base, price and calendar pickles, keeps the rows of `store`,
    then attaches the mean-encoding and lag columns for those rows.

    Args:
        store: value matched against the "store_id" column.

    Returns:
        (grid, features): `grid` holds "id", "d", the target and every feature
        column, restricted to rows with d >= First_train and re-indexed from 0;
        `features` lists the columns not named in `remove_features`.
    """
    # Read and concatenate the basic features; the first two columns of the
    # price and calendar pickles are skipped.
    grid = pd.concat(
        [
            pd.read_pickle(Base),
            pd.read_pickle(Price).iloc[:, 2:],
            pd.read_pickle(Calendar).iloc[:, 2:],
        ],
        axis=1,
    )

    # Keep this store's rows only
    grid = grid[grid["store_id"] == store]

    # Mean-encoding columns, restricted to the surviving row index
    enc_part = pd.read_pickle(Mean_enc)[mean_features]
    enc_part = enc_part[enc_part.index.isin(grid.index)]

    # Lag columns (first three columns skipped), same row restriction
    lag_part = pd.read_pickle(Lags).iloc[:, 3:]
    lag_part = lag_part[lag_part.index.isin(grid.index)]

    # Attach one piece at a time and drop it right away to stay under the memory limit
    grid = pd.concat([grid, enc_part], axis=1)
    del enc_part

    grid = pd.concat([grid, lag_part], axis=1)
    del lag_part

    # Everything that is not explicitly excluded becomes a model feature
    features = [name for name in grid.columns if name not in remove_features]
    grid = grid[["id", "d", Target] + features]

    # Drop the days before the training window starts
    grid = grid[grid["d"] >= First_train].reset_index(drop = True)

    return grid, features

In [12]:
# Recombine test set after training
def get_base_test():
    """Load the per-store "test_<store>.pkl" frames and stack them into one frame.

    Each frame gets a "store_id" column set to its store before stacking.

    Returns:
        A DataFrame with the rows of every store in `Stores_IDs` order and a
        fresh 0..n-1 index; an empty DataFrame when `Stores_IDs` is empty.
    """
    # Collect the frames first and concatenate once: growing a frame with
    # pd.concat inside the loop re-copies all previous rows on every step.
    store_frames = [
        pd.read_pickle("test_" + store_id + ".pkl").assign(store_id=store_id)
        for store_id in Stores_IDs
    ]
    if not store_frames:
        return pd.DataFrame()

    return pd.concat(store_frames, ignore_index=True)

In [13]:
# Make dynamic lags and rolling lags
def make_lag(lag_day):
    """Build one lag feature of the target from the global `base_test` frame.

    Args:
        lag_day: number of rows to shift the target by within each "id" group.

    Returns:
        A single-column DataFrame named "sales_lag_<lag_day>" (float16), indexed
        like `base_test`.
    """
    col_name = "sales_lag_" + str(lag_day)
    # Compute the series directly instead of assigning onto a column-slice of
    # base_test, which triggered SettingWithCopyWarning and an extra copy.
    lagged = base_test.groupby(["id"])[Target].shift(lag_day).astype(np.float16)
    return lagged.to_frame(col_name)

def make_lag_roll(lag_day):
    """Build one shifted rolling-mean feature of the target from the global `base_test` frame.

    Args:
        lag_day: a [shift, window] pair; the target is shifted by `shift` rows
            within each "id" group, then averaged over a rolling `window`.

    Returns:
        A single-column DataFrame named "rolling_mean_tmp_<shift>_<window>",
        indexed like `base_test`.
    """
    shift_day, roll_win = lag_day[0], lag_day[1]
    col_name = "rolling_mean_tmp_" + str(shift_day) + "_" + str(roll_win)
    # Compute the series directly instead of assigning onto a column-slice of
    # base_test, which triggered SettingWithCopyWarning and an extra copy.
    rolled = base_test.groupby(["id"])[Target].transform(
        lambda x: x.shift(shift_day).rolling(roll_win).mean()
    )
    return rolled.to_frame(col_name)

In [14]:
## Train models
for store_id in Stores_IDs:
    print("Train", store_id)
    
    grid_df, feature_cols = get_data_by_store(store_id)
    
    # Row masks on the day index "d":
    #   train_m - every day up to and including Last_train
    #   valid_m - the last Pred_len training days; this is a subset of train_m,
    #             so the logged rmse is an in-sample figure, not a hold-out score
    #   pred_m  - every row after day Last_train - 100: enough history for the
    #             rolling features (largest pair in Rols_split is shift 14 + window 56)
    train_m = grid_df["d"] <= Last_train
    valid_m = train_m & (grid_df["d"] > (Last_train - Pred_len))
    pred_m = grid_df["d"] > (Last_train - 100)
    
    # A leftover file from an interrupted run must never be picked up as this
    # store's training data, so clear it before saving.
    train_bin = 'train_data.bin'
    if os.path.exists(train_bin):
        os.remove(train_bin)
    
    # Apply masks and save the lgb dataset as a binary file, then reload it,
    # to reduce memory spikes during dtype conversions
    train_data = lgb.Dataset(grid_df[train_m][feature_cols], label = grid_df[train_m][Target])
    train_data.save_binary(train_bin)
    train_data = lgb.Dataset(train_bin)
    
    valid_data = lgb.Dataset(grid_df[valid_m][feature_cols], label = grid_df[valid_m][Target])
    
    # Save the part of the dataset needed for later predictions,
    # without the "_tmp_" features that are recalculated recursively
    grid_df = grid_df[pred_m].reset_index(drop = True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]
    grid_df.to_pickle("test_" + store_id + ".pkl")
    del grid_df
    
    seed_all(Seed)
    # NOTE(review): `verbose_eval` was removed in LightGBM 4.0 in favour of
    # callbacks=[lgb.log_evaluation(100)] -- confirm against the installed version.
    estimator = lgb.train(lgb_params,
                          train_data,
                          valid_sets = [valid_data],
                          verbose_eval = 100,
                          )
    
    # Persist the model; the context manager closes the file handle
    model_name = 'lgb_model_' + store_id + '_v' + str(Ver) + '.bin'
    with open(model_name, 'wb') as model_file:
        pickle.dump(estimator, model_file)
    
    # Remove temporary files and objects to free some hdd space and ram memory
    # (os.remove instead of the IPython-only `!rm`, so the cell is plain Python)
    if os.path.exists(train_bin):
        os.remove(train_bin)
    del train_data, valid_data, estimator
    gc.collect()
    
    # "Keep" the model's features for predictions (same list for every store)
    Model_features = feature_cols

Train CA_1
[100]	valid_0's rmse: 2.01759
[200]	valid_0's rmse: 1.98668
[300]	valid_0's rmse: 1.977
[400]	valid_0's rmse: 1.97009
[500]	valid_0's rmse: 1.9639
[600]	valid_0's rmse: 1.95737
[700]	valid_0's rmse: 1.95216
[800]	valid_0's rmse: 1.94751
[900]	valid_0's rmse: 1.9427
[1000]	valid_0's rmse: 1.93834
[1100]	valid_0's rmse: 1.9341
[1200]	valid_0's rmse: 1.92947
[1300]	valid_0's rmse: 1.92524
[1400]	valid_0's rmse: 1.92111
Train CA_2
[100]	valid_0's rmse: 1.94771
[200]	valid_0's rmse: 1.89386
[300]	valid_0's rmse: 1.87876
[400]	valid_0's rmse: 1.87048
[500]	valid_0's rmse: 1.86254
[600]	valid_0's rmse: 1.85543
[700]	valid_0's rmse: 1.84901
[800]	valid_0's rmse: 1.84367
[900]	valid_0's rmse: 1.8387
[1000]	valid_0's rmse: 1.83437
[1100]	valid_0's rmse: 1.8302
[1200]	valid_0's rmse: 1.82552
[1300]	valid_0's rmse: 1.82139
[1400]	valid_0's rmse: 1.81753
Train CA_3
[100]	valid_0's rmse: 2.38517
[200]	valid_0's rmse: 2.34201
[300]	valid_0's rmse: 2.32452
[400]	valid_0's rmse: 2.31584
[500

In [15]:
## Predict
all_preds = pd.DataFrame()

# Join back the Test dataset with a small part of the training data to make recursive features
base_test = get_base_test()

# Timer to measure predictions time
main_time = time.time()

# Predict one day at a time: each day's predictions are written into base_test
# so that the next day's rolling features are computed on top of them.
for pred_day in range(1, Pred_len + 1):
    print("Predict | Day:", pred_day)
    start_time = time.time()
    
    # Make temporary grid to calculate rolling lags
    grid_df = base_test.copy()
    grid_df = pd.concat([grid_df, df_paral_run(make_lag_roll, Rols_split)], axis=1)
    
    # Rows of the day being predicted; identical for every store, so build it once
    day_m = base_test['d'] == (Last_train + pred_day)
    
    for store_id in Stores_IDs:
        # Models are reloaded per store rather than kept resident; the context
        # manager closes the file handle after loading.
        model_path = "lgb_model_" + store_id + "_v" + str(Ver) + ".bin" 
        with open(model_path, "rb") as model_file:
            estimator = pickle.load(model_file)
        
        m = day_m & (base_test['store_id'] == store_id)
        # Single .loc assignment: the chained form base_test[Target][m] = ...
        # writes to a temporary and is a silent no-op under pandas Copy-on-Write.
        base_test.loc[m, Target] = estimator.predict(grid_df.loc[m, Model_features])
        
    # Make column name and add to all_preds DF
    temp_df = base_test.loc[day_m, ["id", Target]]
    temp_df.columns = ["id", "F" + str(pred_day)]
    if "id" in list(all_preds):
        all_preds = all_preds.merge(temp_df, on = ["id"], how = "left")
    else:
        all_preds = temp_df.copy()
        
    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
              ' %0.2f min total |' % ((time.time() - main_time) / 60),
              ' %0.2f day sales |' % (temp_df['F' + str(pred_day)].sum()))
    del temp_df
    
all_preds = all_preds.reset_index(drop=True)
all_preds

Predict | Day: 1
##########  0.64 min round |  0.64 min total |  39873.24 day sales |
Predict | Day: 2
##########  0.65 min round |  1.29 min total |  37168.73 day sales |
Predict | Day: 3
##########  0.66 min round |  1.96 min total |  37038.54 day sales |
Predict | Day: 4
##########  0.72 min round |  2.67 min total |  37049.32 day sales |
Predict | Day: 5
##########  0.66 min round |  3.34 min total |  42008.78 day sales |
Predict | Day: 6
##########  0.67 min round |  4.01 min total |  50262.51 day sales |
Predict | Day: 7
##########  0.66 min round |  4.66 min total |  51103.06 day sales |
Predict | Day: 8
##########  0.65 min round |  5.31 min total |  44977.70 day sales |
Predict | Day: 9
##########  0.68 min round |  5.99 min total |  39273.24 day sales |
Predict | Day: 10
##########  0.66 min round |  6.65 min total |  44033.57 day sales |
Predict | Day: 11
##########  0.64 min round |  7.29 min total |  45016.40 day sales |
Predict | Day: 12
##########  0.67 min round |  7.96

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,0.894156,0.797862,0.757838,0.815370,1.053641,1.251544,1.214042,1.084730,0.885424,...,1.014416,1.306888,1.140455,0.938197,0.875599,0.869725,0.859978,1.089290,1.307882,1.100659
1,HOBBIES_1_002_CA_1_evaluation,0.246908,0.216749,0.207193,0.200287,0.220388,0.289456,0.350437,0.262191,0.244119,...,0.298507,0.384425,0.427430,0.277320,0.287314,0.293951,0.294857,0.331525,0.410798,0.451908
2,HOBBIES_1_003_CA_1_evaluation,0.535192,0.472436,0.492068,0.492642,0.682019,0.913400,0.813372,0.465840,0.506559,...,0.756575,0.880776,0.824205,0.505758,0.419118,0.469078,0.511557,0.760734,0.867550,0.840639
3,HOBBIES_1_004_CA_1_evaluation,1.584041,1.354480,1.336962,1.448628,1.829020,2.696900,3.012109,1.856297,1.371992,...,1.738260,2.537276,3.017198,1.766089,1.453292,1.528347,1.550095,1.900432,2.549529,2.699758
4,HOBBIES_1_005_CA_1_evaluation,1.117710,0.991719,0.945132,0.996627,1.111623,1.400331,1.463146,1.117716,0.952329,...,1.241647,1.545267,1.517596,1.102651,0.954741,1.008502,1.023550,1.306502,1.504063,1.397712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,0.551792,0.514148,0.496788,0.541394,0.559960,0.624951,0.726713,0.627854,0.582946,...,0.639011,0.759081,0.857963,0.599906,0.641171,0.613696,0.521367,0.524332,0.639211,0.755465
30486,FOODS_3_824_WI_3_evaluation,0.271574,0.250420,0.246872,0.246384,0.232829,0.269247,0.322886,0.304320,0.279523,...,0.288918,0.418360,0.459928,0.377799,0.429828,0.433534,0.300204,0.269644,0.316149,0.370912
30487,FOODS_3_825_WI_3_evaluation,0.663182,0.522572,0.481299,0.435530,0.501278,0.616317,0.690022,0.668807,0.494482,...,0.835760,1.245322,1.346179,0.970036,1.016039,0.991759,0.715699,0.733611,0.791815,0.954163
30488,FOODS_3_826_WI_3_evaluation,0.960298,1.063197,1.034226,0.922244,1.094934,1.182236,1.221578,1.284041,1.122904,...,1.074147,1.492655,1.586092,1.230416,1.420511,1.370374,1.154457,1.197202,1.268506,1.401272


In [19]:
# Export the data
# Take the id column of the sample submission as the row template, attach the
# predicted F-columns by id, and fill ids that received no prediction with 0.
submission = (
    pd.read_csv(DataPath + "/sample_submission.csv")[["id"]]
    .merge(all_preds, on=['id'], how = "left")
    .fillna(0)
)
submission.to_csv('submission_v' + str(Ver) + '.csv', index = False)

In [63]:
# Evaluation metric
from typing import Union

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm

In [65]:
class WRMSSEEvaluator(object):
    """Weighted RMSSE scorer over twelve aggregation levels.

    At construction time, for every entry of `group_ids` (level 1 = everything,
    ..., level 12 = item x store) the evaluator stores as attributes:

    * `lv{k}_train_df` / `lv{k}_valid_df` -- training / validation day columns
      summed per group,
    * `lv{k}_scale`  -- per group, the mean squared day-to-day difference of the
      training series, counted from its first non-zero day,
    * `lv{k}_weight` -- per group, the share of (units x sell price) over the
      weight columns (the last 28 training day columns).

    `score` then aggregates predictions the same way and returns the mean over
    levels of the weighted RMSSE.
    """

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, 
                 calendar: pd.DataFrame, prices: pd.DataFrame):
        """Precompute per-level scales, weights and aggregated actuals.

        Args:
            train_df: one row per series; day columns start with 'd_', all
                other columns are treated as id columns. Must contain the
                columns named in `group_ids` (except 'all_id', added here).
            valid_df: validation day columns ('d_' prefix); the id columns are
                copied over from `train_df` when any of them is missing.
            calendar: needs columns 'd' and 'wm_yr_wk'.
            prices: needs columns 'item_id', 'store_id', 'wm_yr_wk', 'sell_price'.
        """
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        # Work on a shallow copy so that the helper column below is not added
        # to the caller's DataFrame (the data itself is not duplicated).
        train_df = train_df.copy(deep=False)
        train_df['all_id'] = 'all'  # for lv1 aggregation

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')]\
                     .columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')]\
                               .columns.tolist()

        # Bring the id columns over when valid_df only carries day columns
        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], 
                                 axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        # Aggregation levels; position k in this tuple is level k + 1
        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                # Drop the leading zeros: the series starts at its first non-zero day
                series = row.values[np.argmax(row.values != 0):]
                # Mean squared one-step difference of the remaining series
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)\
                    [valid_target_columns].sum())

            # Group weight = its share of the level's total over the weight columns
            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        """Return units x sell price for the weight columns, one row per training series.

        The result has the id columns of `train_df` followed by one column per
        weight day, with rows in the same order as `train_df`.
        """
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        # Long format: one row per (item, store, day)
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns]\
                    .set_index(['item_id', 'store_id'])
        weight_df = weight_df.stack().reset_index()\
                   .rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        # Prices are keyed by item, store and week
        weight_df = weight_df.merge(self.prices, how='left',
                                    on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        # Back to wide format, re-ordered to match the row order of train_df
        row_order = list(zip(self.train_df.item_id, self.train_df.store_id))
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd'])\
                    .unstack(level=2)['value']\
                    .loc[row_order, :]\
                    .reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns],
                               weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        """Return the RMSSE of every group at level `lv`.

        Args:
            valid_preds: predictions already aggregated to level `lv`
                (same index and day columns as `lv{lv}_valid_df`).
            lv: 1-based level number.

        Returns:
            Series indexed by group: sqrt(mean squared error / training scale).
        """
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt) 

    def score(self, valid_preds: Union[pd.DataFrame, 
                                       np.ndarray]) -> float:
        """Return the WRMSSE of `valid_preds`.

        Args:
            valid_preds: predictions with the same shape as the validation day
                columns. An ndarray is labelled with the validation day
                columns; a DataFrame is joined to the id columns by index, so
                its index and column names are expected to match `valid_df`.

        Returns:
            Mean over the aggregation levels of the weight-summed RMSSE.

        Side effects:
            Stores `lv{k}_valid_preds`, `lv{k}_scores` and `all_scores` on the
            instance.

        Raises:
            AssertionError: when the shape does not match.
        """
        assert self.valid_df[self.valid_target_columns].shape \
               == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, 
                                       columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], 
                                 valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):

            valid_preds_grp = valid_preds.groupby(group_id)[self.valid_target_columns].sum()
            setattr(self, f'lv{i + 1}_valid_preds', valid_preds_grp)
            
            lv_scores = self.rmsse(valid_preds_grp, i + 1)
            setattr(self, f'lv{i + 1}_scores', lv_scores)
            
            # Weight each group's RMSSE and add up within the level
            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1, 
                                  sort=False).prod(axis=1)
            
            all_scores.append(lv_scores.sum())
            
        self.all_scores = all_scores

        return np.mean(all_scores)