In [1]:
import time
import warnings
import gc
import pickle
import math, decimal
import shutil

from math import ceil
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from typing import Union
from tqdm.auto import tqdm as tqdm
import logging
import datetime
import lightgbm as lgb
import random
import os
import psutil
import argparse
from pathlib import Path


warnings.filterwarnings('ignore')
dec = decimal.Decimal

Please go through the following on __Kaggle__ for more implementation details: 
* https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/163216
* https://www.kaggle.com/kyakovlev/m5-simple-fe
* https://www.kaggle.com/kyakovlev/m5-lags-features
* https://www.kaggle.com/kyakovlev/m5-custom-features
* https://www.kaggle.com/kyakovlev/m5-three-shades-of-dark-darker-magic

In [2]:
def set_params(setting, seed, raw_train_file, raw_price_file, raw_calendar_file, raw_submission_file):
    '''
    Create the run's directory tree and derive the configuration values used by the other functions.

    Parameters:
        setting: mapping read with the keys 'data_dir_path', 'output_name', 'fold_id_list_csv'
            (comma-separated integers), 'prediction_horizon_list_csv' (comma-separated integers)
            and 'mean_features'.
        seed: value forwarded to set_seed().
        raw_train_file, raw_price_file, raw_calendar_file, raw_submission_file: file names that
            are joined onto <data_dir_path>/raw.

    Side effects:
        Creates <data_dir_path>/raw and <data_dir_path>/output/<output_name>/{result,work,model},
        plus one sub-directory per fold id under result, work and model. Calls set_seed(seed).
        Reads the module-level name `target`.

    Returns:
        A 15-tuple, in this order: raw_dir_path, output_dir_path, result_dir_path, work_dir_path,
        model_dir_path, end_train_day_x_list, end_train_day_default, prediction_horizon_list,
        raw_train_path, raw_price_path, raw_calendar_path, raw_submission_path, remove_features,
        enable_features (always None here), mean_features.
    '''
    def _ensure_dir(dir_path):
        # Create the directory (and any missing parents) and hand the path back.
        dir_path.mkdir(parents=True, exist_ok=True)
        return dir_path

    data_dir_path = Path(setting['data_dir_path'])

    raw_dir_path = _ensure_dir(data_dir_path / 'raw')
    output_dir_path = _ensure_dir(data_dir_path / 'output' / Path(setting['output_name']))
    result_dir_path = _ensure_dir(output_dir_path / 'result')
    work_dir_path = _ensure_dir(output_dir_path / 'work')
    model_dir_path = _ensure_dir(output_dir_path / 'model')

    set_seed(seed)

    # Each fold id is the last training day of that fold; every fold gets its own sub-directories.
    end_train_day_x_list = [int(fold_id) for fold_id in setting['fold_id_list_csv'].split(',')]
    # NOTE(review): 1941 presumably is the last day of the full training data - confirm.
    end_train_day_default = 1941

    for fold_day in end_train_day_x_list:
        for parent_dir in (result_dir_path, work_dir_path, model_dir_path):
            _ensure_dir(parent_dir / str(fold_day))

    prediction_horizon_list = [int(horizon) for horizon in
                               setting['prediction_horizon_list_csv'].split(',')]

    raw_train_path = raw_dir_path / raw_train_file
    raw_price_path = raw_dir_path / raw_price_file
    raw_calendar_path = raw_dir_path / raw_calendar_file
    raw_submission_path = raw_dir_path / raw_submission_file

    # These features lead to overfitting or are not present in the test set, so the model is
    # trained without them.
    remove_features = ['id', 'state_id', 'store_id', 'date', 'wm_yr_wk', 'd', target]
    enable_features = None
    mean_features = setting['mean_features']

    return (raw_dir_path, output_dir_path, result_dir_path, work_dir_path, model_dir_path,
            end_train_day_x_list, end_train_day_default, prediction_horizon_list,
            raw_train_path, raw_price_path, raw_calendar_path, raw_submission_path,
            remove_features, enable_features, mean_features)

In [3]:
def reduce_mem_usage(df, verbose=False):
    '''
    Downcast the numeric columns of ``df`` to the narrowest dtype that holds their value range.

    The frame is modified in place and also returned. Only columns whose dtype is one of
    int16/int32/int64/float16/float32/float64 are touched.

    Integer columns get the smallest of int8/int16/int32/int64 whose [min, max] range
    (bounds included) contains the column's min and max. Float columns are cast to float16 or
    float32 when their min/max lie strictly inside that type's finite range, otherwise float64.
    Note that the float downcast only checks the range, so precision can be lost.

    Parameters:
        df: pandas DataFrame to shrink.
        verbose: when True, print the resulting memory usage and the percentage saved.

    Returns:
        The same DataFrame object that was passed in.
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits in that type.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches when min/max are NaN (empty column); leave it unchanged.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: fall back to float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (
                start_mem - end_mem) / start_mem))
    return df

In [4]:
def get_memory_usage():
    '''
    Return the first field of psutil's memory_info() for the current process, divided by 2**30
    and rounded to two decimals.
    '''
    # NOTE(review): the first memory_info() field is presumably the resident set size in bytes,
    # which would make the result GiB - confirm against the psutil documentation.
    current_process = psutil.Process(os.getpid())
    first_field = current_process.memory_info()[0]
    return np.round(first_field / 2. ** 30, 2)

In [5]:
def merge_by_concat(df1, df2, merge_on):
    '''
    Left-join the columns of ``df2`` onto ``df1`` without touching ``df1``'s own columns.

    Only the key columns of ``df1`` take part in the merge; the new columns produced by the merge
    are then concatenated to ``df1`` side by side, so ``df1``'s existing columns keep their dtypes.

    Parameters:
        df1: left DataFrame; its rows and row order are kept.
        df2: DataFrame providing the extra columns; expected to have one row per key.
        merge_on: list of key column names present in both frames.

    Returns:
        A new DataFrame with ``df1``'s columns followed by the non-key columns of ``df2``.
    '''
    merged = df1[merge_on].merge(df2, on=merge_on, how='left')
    new_columns = [col for col in merged.columns if col not in merge_on]
    extra = merged[new_columns]
    # merge() hands back a fresh 0..n-1 index while concat(axis=1) aligns on index labels, so
    # re-attach df1's index; otherwise a df1 with a non-default index ends up with NaN-padded rows.
    # When df2 has duplicate keys the row counts differ and the rows cannot be mapped 1:1;
    # the original behaviour is left untouched in that case.
    if len(extra) == len(df1):
        extra.index = df1.index
    return pd.concat([df1, extra], axis=1)

In [6]:
def set_seed(seed):
    '''
    Seed Python's ``random`` module and NumPy's global RNG with ``seed`` and store its string
    form in the PYTHONHASHSEED environment variable. Returns None.
    '''
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [7]:
def log_evaluation(period=100, show_stdv=True, level=logging.INFO):
    '''
    Build a training callback that prints the evaluation results every ``period`` iterations.

    The returned callable takes an ``env`` object and reads ``env.iteration`` and
    ``env.evaluation_result_list``; its ``order`` attribute is set to 10. ``level`` is only
    printed in front of each line, it is not passed to a logger.
    '''
    # NOTE: this notebook defines log_evaluation a second time further down; once that cell has
    # run, the later definition replaces this one.
    def _callback(env):
        if period <= 0 or not env.evaluation_result_list:
            return
        iteration = env.iteration + 1
        if iteration % period != 0:
            return
        formatted = [lgb.callback._format_eval_result(entry, show_stdv)
                     for entry in env.evaluation_result_list]
        print(level, '[{}]\t{}'.format(iteration, '\t'.join(formatted)))

    _callback.order = 10
    return _callback

In [8]:
def load_data():
    '''
    Read the four raw CSV files named by the module-level paths raw_train_path, raw_price_path,
    raw_calendar_path and raw_submission_path, print each frame's shape, and return them as
    (train_df, prices_df, calendar_df, submission_df).
    '''
    print('#################### load_data #######################')
    train_df = pd.read_csv(raw_train_path)
    prices_df = pd.read_csv(raw_price_path)
    calendar_df = pd.read_csv(raw_calendar_path)
    submission_df = pd.read_csv(raw_submission_path)

    shape_report = (
        ('train_df.shape', train_df),
        ('prices_df.shape', prices_df),
        ('calendar_df.shape', calendar_df),
        ('submission_df.shape', submission_df),
    )
    for label, frame in shape_report:
        print(label, frame.shape)

    return train_df, prices_df, calendar_df, submission_df


In [9]:
def generate_grid_base(train_df, prices_df, calendar_df):
    '''
    Convert the horizontal (one column per day) sales table to vertical data, split off the
    holdout window, append the rows to be predicted, add the base features and save as pickle.

    The saved grid holds the days d_1 .. d_(end_train_day_x + prediction_horizon). E.g. with
    end_train_day_x = 1913 and prediction_horizon = 7 it covers d_1 to d_1920: the rows up to
    d_1913 carry the real target and are used for training, the last 7 days are added with a NaN
    target and are what the trained model predicts. The real values of the holdout window
    (end_train_day_x < d <= end_train_day_x + prediction_horizon) are written to holdout_path so
    that the forecast can be compared with them. The price and calendar grids are later built
    with the same rows so they can be concatenated column-wise.

    Parameters:
        train_df: wide sales frame with the columns id, item_id, dept_id, cat_id, store_id,
            state_id and one column per day named 'd_<n>'.
        prices_df: frame with at least store_id, item_id and wm_yr_wk.
        calendar_df: frame with at least wm_yr_wk and d.

    Reads the module-level names target, end_train_day_x, prediction_horizon, main_index_list,
    holdout_path and grid_base_path, and calls merge_by_concat().

    Side effects:
        Writes holdout_path (CSV) and grid_base_path (pickle). Returns None.
    '''
    print('################# generate_grid_base ###################')
    print('melting to convert horizontal to vertical data')
    index_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    # After melting, every row of the horizontal data becomes one row per day column.
    grid_df = pd.melt(train_df, id_vars=index_columns, var_name='d', value_name=target)
    print('grid_df.shape after melting (vertical data)', grid_df.shape)

    print('generate holdout data for validation by removing days before end_train_day_x')
    num_before = grid_df.shape[0]
    # Temporarily replace the 'd_<n>' label by its number so days can be compared numerically;
    # the original label is parked in 'd_org' and restored below.
    grid_df['d_org'] = grid_df['d']
    grid_df['d'] = grid_df['d'].str[2:].astype(np.int16)

    # Days after end_train_day_x up to (end_train_day_x + prediction_horizon) are saved as the
    # holdout dataset, which is used to assess the model and could be used for tuning.
    holdout_df = grid_df[(grid_df['d'] > end_train_day_x) &
                         (grid_df['d'] <= end_train_day_x + prediction_horizon)][main_index_list + [target]]

    holdout_df.to_csv(holdout_path, index=False)

    grid_df = grid_df[grid_df['d'] <= end_train_day_x]
    grid_df['d'] = grid_df['d_org']
    grid_df = grid_df.drop('d_org', axis=1)
    num_after = grid_df.shape[0]
    print(num_before, '-->', num_after)

    # The days of the prediction horizon are appended as test rows. Their target is NaN because
    # it is what the trained model will predict.
    print('add test days')
    # The id rows are the same for every test day, so build them once and collect one frame per
    # day; everything is concatenated in a single call instead of growing a frame in the loop.
    id_rows = train_df[index_columns].drop_duplicates()
    test_day_frames = [
        id_rows.assign(**{'d': 'd_' + str(end_train_day_x + day_offset + 1), target: np.nan})
        for day_offset in range(prediction_horizon)
    ]

    grid_df = pd.concat([grid_df] + test_day_frames)
    grid_df = grid_df.reset_index(drop=True)

    del id_rows, test_day_frames
    del train_df

    # These features should be treated as categorical variables in lightgbm because of their
    # category dtype, but it is still a good idea to pass them explicitly as categorical_feature.
    print('convert to category')
    for col in index_columns:
        grid_df[col] = grid_df[col].astype('category')

    # The release week is the first wm_yr_wk for which a store/item pair has a price row.
    print('calc release week')
    release_df = prices_df.groupby(['store_id', 'item_id'])['wm_yr_wk'].agg(['min']).reset_index()
    release_df.columns = ['store_id', 'item_id', 'release']
    grid_df = merge_by_concat(grid_df, release_df, ['store_id', 'item_id'])
    del release_df
    grid_df = merge_by_concat(grid_df, calendar_df[['wm_yr_wk', 'd']], ['d'])
    grid_df = grid_df.reset_index(drop=True)

    # Store the release week relative to the smallest release week in the data; the smaller
    # value range lets it be represented as int16, which reduces memory usage.
    print('convert release to int16')
    grid_df['release'] = grid_df['release'] - grid_df['release'].min()
    grid_df['release'] = grid_df['release'].astype(np.int16)

    print('save grid_base')
    grid_df.to_pickle(grid_base_path)

    print('grid_df.shape', grid_df.shape)
    return

In [10]:
def generate_grid_price(prices_df, calendar_df):
    '''
    Build the price feature grid and save it as a pickle file.

    Loads the base grid from grid_base_path, derives the price features on ``prices_df``, joins
    them to the grid on (store_id, item_id, wm_yr_wk), keeps main_index_list plus the new
    columns, passes the frame through reduce_mem_usage() and writes it to grid_price_path.

    Note: the per-item statistics (price_max .. item_nunique) are written into the caller's
    ``prices_df`` in place; the later columns are added to a merged copy only.

    Parameters:
        prices_df: frame with store_id, item_id, wm_yr_wk and sell_price.
        calendar_df: frame with wm_yr_wk, month and year.
    '''
    print('################## generate_grid_price #################')
    print('loading grid_base')
    grid_df = pd.read_pickle(grid_base_path)

    item_key = ['store_id', 'item_id']

    # Max/min/std/mean price of an item in a store across all of its price rows.
    for stat_column, stat_name in (('price_max', 'max'), ('price_min', 'min'),
                                   ('price_std', 'std'), ('price_mean', 'mean')):
        prices_df[stat_column] = prices_df.groupby(item_key)['sell_price'].transform(stat_name)
    prices_df['price_norm'] = prices_df['sell_price'] / prices_df['price_max']
    # Number of distinct prices per item, and number of distinct items sharing a price in a store.
    prices_df['price_nunique'] = prices_df.groupby(item_key)['sell_price'].transform('nunique')
    prices_df['item_nunique'] = prices_df.groupby(['store_id', 'sell_price'])['item_id'].transform('nunique')

    # One (month, year) pair per wm_yr_wk: the first calendar row of each week wins.
    week_calendar = calendar_df[['wm_yr_wk', 'month', 'year']].drop_duplicates(subset=['wm_yr_wk'])
    prices_df = prices_df.merge(week_calendar, on=['wm_yr_wk'], how='left')
    del week_calendar

    # Price relative to the previous price row of the same store/item, and relative to the mean
    # price of the same store/item within the same month / year value.
    previous_price = prices_df.groupby(item_key)['sell_price'].transform(lambda x: x.shift(1))
    prices_df['price_momentum'] = prices_df['sell_price'] / previous_price
    for momentum_column, period_column in (('price_momentum_m', 'month'),
                                           ('price_momentum_y', 'year')):
        period_mean = prices_df.groupby(item_key + [period_column])['sell_price'].transform('mean')
        prices_df[momentum_column] = prices_df['sell_price'] / period_mean

    # Fractional part of the price / max price / min price.
    for cent_column, source_column in (('sell_price_cent', 'sell_price'),
                                       ('price_max_cent', 'price_max'),
                                       ('price_min_cent', 'price_min')):
        prices_df[cent_column] = [math.modf(price)[0] for price in prices_df[source_column]]

    for calendar_column in ('month', 'year'):
        del prices_df[calendar_column]

    print('merge prices')
    base_columns = set(grid_df.columns)
    grid_df = grid_df.merge(prices_df, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
    price_columns = [col for col in grid_df.columns if col not in base_columns]
    grid_df = reduce_mem_usage(grid_df[main_index_list + price_columns])

    print('save grid_price')
    grid_df.to_pickle(grid_price_path)
    del prices_df
    return


In [11]:
 def generate_grid_calendar(calendar_df):
    '''
    Generates dataframe with date, event and promotion related features and save as pickle file.

    Loads the base grid from grid_base_path, keeps main_index_list, joins the calendar columns
    on 'd', derives the date-part features and writes the result to grid_calendar_path.

    Note: a 'moon' column is added to the caller's ``calendar_df`` in place.

    Parameters:
        calendar_df: frame with the columns date ('%Y-%m-%d' strings), d, event_name_1,
            event_type_1, event_name_2, event_type_2, snap_CA, snap_TX and snap_WI.
    '''
    print('################## generate_grid_calendar ####################')
    grid_df = pd.read_pickle(grid_base_path)
    grid_df = grid_df[main_index_list]

    # Moon phase is one feature which has been found to give some improvement;
    # perhaps it affects shopping habits somehow.
    # This code produces the moon phase coded 0 to 7:
    def get_moon_phase(d):  # 0=new, 4=full; 4 days/phase
        diff = datetime.datetime.strptime(d, '%Y-%m-%d') - datetime.datetime(2001, 1, 1)
        days = dec(diff.days) + (dec(diff.seconds) / dec(86400))
        lunations = dec("0.20439731") + (days * dec("0.03386319269"))
        phase_index = math.floor((lunations % dec(1) * dec(8)) + dec('0.5'))
        return int(phase_index) & 7

    calendar_df['moon'] = calendar_df.date.apply(get_moon_phase)

    # Merge calendar partly
    icols = ['date', 'd', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'moon']

    grid_df = grid_df.merge(calendar_df[icols], on=['d'], how='left')
    # These features should be treated as categorical variables in lightgbm because of their
    # category dtype, but it is still a good idea to pass them explicitly as categorical_feature.
    icols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI']
    for col in icols:
        grid_df[col] = grid_df[col].astype('category')

    grid_df['date'] = pd.to_datetime(grid_df['date'])
    date_parts = grid_df['date'].dt

    grid_df['tm_d'] = date_parts.day.astype(np.int8)  # day of the month
    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0; isocalendar().week yields
    # the same ISO week number. Fall back to .week on pandas versions without isocalendar().
    if hasattr(date_parts, 'isocalendar'):
        iso_week = date_parts.isocalendar().week
    else:
        iso_week = date_parts.week
    grid_df['tm_w'] = iso_week.astype(np.int8)  # week of the year
    grid_df['tm_m'] = date_parts.month.astype(np.int8)  # month of the year
    grid_df['tm_y'] = date_parts.year  # year
    grid_df['tm_y'] = (grid_df['tm_y'] - grid_df['tm_y'].min()).astype(np.int8)  # year relative to the first year in the data
    grid_df['tm_wm'] = np.ceil(grid_df['tm_d'] / 7).astype(np.int8)  # week of the month

    grid_df['tm_dw'] = date_parts.dayofweek.astype(np.int8)  # day of the week
    grid_df['tm_w_end'] = (grid_df['tm_dw'] >= 5).astype(np.int8)  # is_weekend
    del grid_df['date']

    grid_df.to_pickle(grid_calendar_path)

    return

In [12]:
def modify_grid_base():
    '''
    Rewrite the pickle at grid_base_path in place: the 'd' labels lose their first two
    characters (the 'd_' prefix) and are stored as int16 for easier comparison, and the
    'wm_yr_wk' column is removed.
    '''
    print('############## modify_grid_base ####################')
    base_grid = pd.read_pickle(grid_base_path)
    day_numbers = base_grid['d'].map(lambda label: label[2:]).astype(np.int16)
    base_grid['d'] = day_numbers

    base_grid.pop('wm_yr_wk')
    base_grid.to_pickle(grid_base_path)

In [13]:
def update_file_path():
    '''
    Build the file paths for the current work/result directories and prediction horizon.

    Reads the module-level names work_dir_path, result_dir_path and prediction_horizon; nothing
    is created on disk and no module-level name is assigned here.

    Returns:
        (grid_base_path, grid_price_path, grid_calendar_path, holdout_path, lag_feature_path,
        target_encoding_feature_path, result_submission_path)
    '''
    def work_file(stem):
        # Work files carry the prediction horizon in their name.
        return work_dir_path / f'{stem}_{prediction_horizon}.pkl'

    grid_base_path = work_file('grid_base')
    grid_price_path = work_file('grid_price')
    grid_calendar_path = work_file('grid_calendar')
    holdout_path = result_dir_path / 'holdout.csv'
    lag_feature_path = work_file('lag_feature')
    target_encoding_feature_path = work_file('target_encoding')
    result_submission_path = result_dir_path / 'submission.csv'
    return (grid_base_path, grid_price_path, grid_calendar_path, holdout_path,
            lag_feature_path, target_encoding_feature_path, result_submission_path)

In [14]:
def reset_dir_path():
    '''
    Point the module-level result_dir_path, work_dir_path and model_dir_path back at the
    'result', 'work' and 'model' directories directly under output_dir_path, creating them if
    they do not exist. Returns None.
    '''
    # Declared global so the assignments rebind the module-level names instead of locals.
    global result_dir_path, work_dir_path, model_dir_path
    result_dir_path = output_dir_path / 'result'
    result_dir_path.mkdir(parents=True, exist_ok=True)

    work_dir_path = output_dir_path / 'work'
    work_dir_path.mkdir(parents=True, exist_ok=True)

    # The model directory is 'model' (as in set_params), not 'work'.
    model_dir_path = output_dir_path / 'model'
    model_dir_path.mkdir(parents=True, exist_ok=True)
    return


In [15]:
def update_predict_horizon():
    '''
    Build the lag list and the (shift, rolling-window) pairs for the current prediction horizon.

    Reads the module-level names prediction_horizon and num_lag_day.

    Returns:
        (num_lag_day_list, num_shift_rolling_day_list) where num_lag_day_list is
        prediction_horizon .. prediction_horizon + num_lag_day - 1 and
        num_shift_rolling_day_list holds [shift, window] for every shift in (1, 7, 14) and
        window in (7, 14, 30, 60).
    '''
    # The return value is discarded here; as defined in this notebook, update_file_path() only
    # builds and returns paths.
    update_file_path()

    # The lags start at the prediction horizon because, for the days being forecast, sales for
    # any shorter lag would themselves lie in the future. E.g. with prediction_horizon = 28 the
    # lags run 28, 29, ... rather than 1, 2, ...; each horizon-specific model therefore gets
    # its own lag range.
    num_lag_day_list = list(range(prediction_horizon, prediction_horizon + num_lag_day))

    num_shift_rolling_day_list = [
        [num_shift_day, num_rolling_day]
        for num_shift_day in [1, 7, 14]
        for num_rolling_day in [7, 14, 30, 60]
    ]
    return (num_lag_day_list, num_shift_rolling_day_list)

In [16]:
def generate_grid_full(train_df, prices_df, calendar_df):
    '''
    Run the four grid-building steps in order: base grid, price grid, calendar grid, and the
    final clean-up of the base grid. Each step persists its own output; nothing is returned.
    '''
    # Step 1: base grid from the sales, price and calendar frames.
    generate_grid_base(train_df, prices_df, calendar_df)
    # Step 2: price features.
    generate_grid_price(prices_df, calendar_df)
    # Step 3: calendar features.
    generate_grid_calendar(calendar_df)
    # Step 4: post-process the base grid once the other grids have been built from it.
    modify_grid_base()
    return None

In [17]:
def generate_lag_feature(grid_base_path, lag_feature_path, num_lag_day_list, num_rolling_day_list, recursive_feature_flag, num_shift_rolling_day_list):
    '''
    Generate lag and rolling (mean/std) features of the target per 'id' and save as pickle file.

    Parameters:
        grid_base_path: pickle holding at least the columns 'id', 'd' and the target column.
        lag_feature_path: destination pickle.
        num_lag_day_list: lags (in rows per id) for the '<target>_lag_<n>' columns.
        num_rolling_day_list: window sizes for the 'rolling_mean_<n>' / 'rolling_std_<n>' columns.
        recursive_feature_flag: when truthy, also add the 'rolling_mean_tmp_<shift>_<window>' columns.
        num_shift_rolling_day_list: sequence of [shift, window] pairs for the recursive features.

    Reads the module-level names target and prediction_horizon. All generated columns are
    stored as float16. The saved frame starts with 'id', 'd' and the target column.
    '''
    print('################ generate_lag features ################')
    print('load grid_base')
    grid_df = pd.read_pickle(grid_base_path)

    # Use `target` rather than a hard-coded column name, consistent with the rest of the function.
    grid_df = grid_df[['id', 'd', target]]
    target_by_id = grid_df.groupby(['id'])[target]

    print('creating lags')
    grid_df = grid_df.assign(**{
        '{}_lag_{}'.format(target, lag): target_by_id.shift(lag).astype(np.float16)
        for lag in num_lag_day_list
    })

    print('create rolling aggs (mean/std)')
    # Why shift by prediction_horizon before taking the rolling mean/std? For the days being
    # forecast the sales inside the horizon are not known, so the window has to end
    # prediction_horizon days earlier. E.g. with prediction_horizon = 28 a 7-day window covers
    # the days 28..34 days back. Each horizon-specific model shifts by its own horizon.
    for num_rolling_day in num_rolling_day_list:
        print('rolling period', num_rolling_day)
        grid_df['rolling_mean_' + str(num_rolling_day)] = target_by_id.transform(
            lambda x: x.shift(prediction_horizon).rolling(num_rolling_day).mean()).astype(np.float16)
        grid_df['rolling_std_' + str(num_rolling_day)] = target_by_id.transform(
            lambda x: x.shift(prediction_horizon).rolling(num_rolling_day).std()).astype(np.float16)

    # It is better not to use recursive features since they introduce leakage in some way.
    if recursive_feature_flag:
        for num_shift_rolling_day in num_shift_rolling_day_list:
            num_shift_day = num_shift_rolling_day[0]
            num_rolling_day = num_shift_rolling_day[1]
            col_name = 'rolling_mean_tmp_' + str(num_shift_day) + '_' + str(num_rolling_day)
            grid_df[col_name] = target_by_id.transform(
                lambda x: x.shift(num_shift_day).rolling(num_rolling_day).mean()).astype(np.float16)

    print('save lag_feature')
    grid_df.to_pickle(lag_feature_path)

    return

In [18]:
def generate_target_encoding_feature(grid_base_path):
    '''
    Calculate mean/std encoding features of categorical variables or pairs and save as pickle file.

    Parameters:
        grid_base_path: pickle holding 'id', 'd' (numeric day), the target column and the
            columns state_id, store_id, cat_id, dept_id and item_id.

    Reads the module-level names seed, target, end_train_day_x, prediction_horizon and
    target_encoding_feature_path, and calls set_seed(seed).

    Side effects:
        Writes a frame with 'id', 'd' and the 'enc_<cols>_mean' / 'enc_<cols>_std' columns
        (float16) to target_encoding_feature_path. Returns None.
    '''
    print('################ generate_target_encoding_feature ################')
    set_seed(seed)

    grid_df = pd.read_pickle(grid_base_path)
    # To be sure there is no leakage into the validation set, the target of every row after
    # (end_train_day_x - prediction_horizon) is blanked, so the encodings are computed from the
    # earlier rows only and the same encoded value is filled in for the later rows.
    # A single .loc assignment is used because the chained form df[col][mask] = value can write
    # to a temporary copy and leave the frame unchanged.
    grid_df.loc[grid_df['d'] > (end_train_day_x - prediction_horizon), target] = np.nan
    base_cols = list(grid_df)

    # Because of memory issues we can't use many features.
    icols = [
        ['state_id'],
        ['store_id'],
        ['cat_id'],
        ['dept_id'],
        ['state_id', 'cat_id'],
        ['state_id', 'dept_id'],
        ['store_id', 'cat_id'],
        ['store_id', 'dept_id'],
        ['item_id'],
        ['item_id', 'state_id'],
        ['item_id', 'store_id']
    ]
    # There are several ways to do "mean" encoding: 1) K-fold scheme 2) LOO (leave one out)
    # 3) smoothed/regularized 4) Catboost encoder etc.
    # Here a simple target encoding by mean and std aggregation is used.
    for col in icols:
        print('encoding', col)
        col_name = '_' + '_'.join(col) + '_'
        target_by_group = grid_df.groupby(col)[target]
        grid_df['enc' + col_name + 'mean'] = target_by_group.transform('mean').astype(np.float16)
        grid_df['enc' + col_name + 'std'] = target_by_group.transform('std').astype(np.float16)

    keep_cols = [col for col in list(grid_df) if col not in base_cols]
    grid_df = grid_df[['id', 'd'] + keep_cols]

    print('save target_encoding_feature')
    grid_df.to_pickle(target_encoding_feature_path)
    return

In [19]:
def load_grid_full():
    '''
    Load the base, price and calendar grids (from grid_base_path, grid_price_path and
    grid_calendar_path) and return them concatenated column-wise in one dataframe.
    '''
    print('############ load_grid_full #####################')
    base_part = pd.read_pickle(grid_base_path)
    # The first two columns of the price and calendar grids are ['id', 'd'], which the base grid
    # already provides, so they are skipped.
    price_part = pd.read_pickle(grid_price_path).iloc[:, 2:]
    calendar_part = pd.read_pickle(grid_calendar_path).iloc[:, 2:]
    full_grid = pd.concat([base_part, price_part, calendar_part], axis=1)
    return full_grid

In [20]:
# grid_df_full = load_grid_full()
# grid_df_full.columns

In [21]:
# train_df.store_id.unique()

In [22]:
def load_grid_by_store(store_id):
    '''
    Assemble all features (base, price, calendar, mean encoding and lag) for one store.

    Parameters:
        store_id: value matched against the 'store_id' column, or 'all' to keep every store.

    Reads the module-level names target_encoding_feature_path, mean_features, lag_feature_path,
    remove_features, target and start_train_day_x, and calls load_grid_full().

    Returns:
        (df, enable_features): df holds 'id', 'd', the target and the enabled feature columns
        for the rows with d >= start_train_day_x (index reset); enable_features lists every
        column that is not in remove_features.
    '''
    print("Loading the data for store: {}".format(store_id))
    store_grid = load_grid_full()

    if store_id != 'all':
        store_grid = store_grid[store_grid['store_id'] == store_id]

    # With memory limits the lag and mean encoding features are read separately and cut down to
    # the rows that are needed. The feature grids are row-aligned, so the index can be used to
    # pick the rows, and concat uses less memory than merge.
    encoding_part = pd.read_pickle(target_encoding_feature_path)[mean_features]
    # The encoding file covers all stores; keep only the rows of the selected store.
    encoding_part = encoding_part[encoding_part.index.isin(store_grid.index)]

    # Drop the first three columns of the lag file; only the feature columns after them are kept.
    lag_part = pd.read_pickle(lag_feature_path).iloc[:, 3:]
    lag_part = lag_part[lag_part.index.isin(store_grid.index)]

    store_grid = pd.concat([store_grid, encoding_part], axis=1)
    del encoding_part

    store_grid = pd.concat([store_grid, lag_part], axis=1)
    del lag_part

    # The model trains only with the columns that are not listed in remove_features.
    enable_features = [col for col in store_grid.columns if col not in remove_features]
    selected_columns = ['id', 'd', target] + enable_features
    store_grid = store_grid[selected_columns]

    from_start_day = store_grid['d'] >= start_train_day_x
    store_grid = store_grid[from_start_day].reset_index(drop=True)

    return store_grid, enable_features

In [23]:
# df, enable_features= load_grid_by_store("CA_1")
# df.columns
# enable_features

In [24]:
def load_base_test(store_id_set_list):
    '''
    Load the per-store test pickles and return them stacked in one dataframe.

    For every store id the file work_dir_path / 'test_<store_id>_<prediction_horizon>.pkl' is
    read and a 'store_id' column holding that id is set on it. The frames are stacked in the
    order of ``store_id_set_list`` and the index is reset.

    Parameters:
        store_id_set_list: iterable of store ids.

    Returns:
        The combined DataFrame; an empty DataFrame when no store ids are given.
    '''
    store_frames = []
    for store_id in store_id_set_list:
        store_df = pd.read_pickle(
            work_dir_path / f'test_{store_id}_{prediction_horizon}.pkl')
        store_df['store_id'] = store_id
        store_frames.append(store_df)

    # pd.concat() rejects an empty list, so keep returning an empty frame for no stores.
    if not store_frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the accumulated frame for every store.
    return pd.concat(store_frames).reset_index(drop=True)

In [25]:
# base_test = load_base_test(train_df.store_id.unique().tolist())
# base_test.columns

In [26]:
def log_evaluation(period=100, show_stdv=True, level=logging.INFO):
    '''
    Create a callback that prints the evaluation metrics while training.

    A banner line is printed when the callback is created. The callback itself takes an ``env``
    object and, when ``period`` is positive, ``env.evaluation_result_list`` is non-empty and
    ``env.iteration + 1`` is a multiple of ``period``, prints ``level`` followed by the
    iteration number and the tab-separated formatted results. Its ``order`` attribute is 10.
    '''
    print('#################### log_evaluation #######################')

    def _callback(env):
        results = env.evaluation_result_list if period > 0 else None
        if not results:
            return
        current_iteration = env.iteration + 1
        if current_iteration % period == 0:
            parts = []
            for entry in results:
                parts.append(lgb.callback._format_eval_result(entry, show_stdv))
            print(level, '[{}]\t{}'.format(current_iteration, '\t'.join(parts)))

    _callback.order = 10
    return _callback


In [27]:
def train_and_predict(train_df, calendar_df, prices_df, submission_df):
    '''
    trains one LightGBM model per store for the current prediction_horizon and predicts the
    days prediction_horizon_prev+1 .. prediction_horizon following end_train_day_x.

    The function reads its configuration from notebook globals: seed, target, main_index_list,
    end_train_day_x, prediction_horizon, prediction_horizon_prev, prediction_horizon_list,
    export_all_flag, recursive_feature_flag, num_shift_rolling_day_list (only when
    recursive_feature_flag is set) and the result/work/model directory paths.

    train_df: raw sales dataframe; only its `store_id` column is used (to list the stores).
    calendar_df, prices_df, submission_df: not used inside this function.

    side effects: writes the per-store test pickle into work_dir_path, the pickled models into
    model_dir_path and the feature importance / pred_h / pred_v csv files into result_dir_path.

    returns: (pred_h_df, pred_v_df)
        pred_h_df - horizontal predictions, one row per id and one `F{day}` column per predicted day.
        pred_v_df - vertical predictions, main_index_list + [target] for the predicted days.
    '''
    print('######################################## train_and_predict #####################################################')
    #following parameters have been found after CV.
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric': 'rmse',
        'subsample': 0.5,
        'subsample_freq': 1,
        'learning_rate': 0.03,
        'num_leaves': 2 ** 11 - 1,
        'min_data_in_leaf': 2 ** 12 - 1,
        'feature_fraction': 0.5,
        'max_bin': 100,
        # NOTE(review): a single boosting round looks like a quick-run setting and the trailing
        # value like the intended one - confirm before a real training run.
        'n_estimators': 1, #1400
        'boost_from_average': False,
        'verbose': -1,
    }

    set_seed(seed)
    lgb_params['seed'] = seed
    #load the list of all distinct stores.
    store_id_set_list = list(train_df['store_id'].unique())

    feature_importance_all_df = pd.DataFrame()
    for store_index, store_id in enumerate(store_id_set_list):
        print('train', store_id)
        # Get grid (all features) for current store
        grid_df, enable_features = load_grid_by_store(store_id)
        #partition train, valid and test (dummy) data
        # the validation rows are the last prediction_horizon days of the training rows
        train_mask = grid_df['d'] <= end_train_day_x
        valid_mask = train_mask & (grid_df['d'] > (end_train_day_x - prediction_horizon))
        # the last 100 training days are kept with the test rows (history for recursive features)
        preds_mask = grid_df['d'] > (end_train_day_x - 100)
        print("train_mask horizon: d_{}-d_{}".format(min(grid_df[train_mask]["d"]), max(grid_df[train_mask]["d"])))
        print("valid_mask horizon: d_{}-d_{}".format(min(grid_df[valid_mask]["d"]), max(grid_df[valid_mask]["d"])))
        print("pred_mask horizon: d_{}-d_{}".format(min(grid_df[preds_mask]["d"]), max(grid_df[preds_mask]["d"])))
        print('[{3} - {4}] train {0}/{1} {2}'.format(
            store_index + 1, len(store_id_set_list), store_id,
            end_train_day_x, prediction_horizon))
        if export_all_flag:
            print('export train')
            grid_df[train_mask].to_csv(
                result_dir_path / ('exp_train_' + store_id + '.csv'), index=False)
        train_data = lgb.Dataset(grid_df[train_mask][enable_features],
                                 label=grid_df[train_mask][target])

        if export_all_flag:
            print('export valid')
            grid_df[valid_mask].to_csv(
                result_dir_path / ('exp_valid_' + store_id + '.csv'), index=False)
        # built once; grid_df is not modified by the exports below
        valid_data = lgb.Dataset(grid_df[valid_mask][enable_features],
                                 label=grid_df[valid_mask][target])

        if export_all_flag:
            print('export test')
            grid_df[preds_mask].to_csv(
                result_dir_path / ('exp_test_' + store_id + '.csv'), index=False)

        if export_all_flag:
            print('export train_valid_test')
            grid_df[train_mask | valid_mask | preds_mask].to_csv(
                result_dir_path / ('exp_train_valid_test_' + store_id + '.csv'), index=False)

        # Saving part of the dataset (dummy test data) for later predictions
        # Removing features that we need to calculate recursively
        grid_df = grid_df[preds_mask].reset_index(drop=True)
        if recursive_feature_flag:
            keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
            grid_df = grid_df[keep_cols]
        #this "test_{store_id}_{prediction_horizon}.pkl" file will be loaded later and go through
        #prediction and prediction will be saved as pred_h_df and pred_v_df
        grid_df.to_pickle(work_dir_path / f'test_{store_id}_{prediction_horizon}.pkl')
        del grid_df

        set_seed(seed)
        #lightgbm model training starts here
        estimator = lgb.train(lgb_params,
                              train_data,
                              valid_sets=[valid_data],
                              verbose_eval=True, #False
                              callbacks=[log_evaluation(period=100)],
                              )

        model_name = str(
            model_dir_path / f'lgb_model_{store_id}_{prediction_horizon}.bin')
        #feature importance for the store it is running currently
        feature_importance_store_df = pd.DataFrame(sorted(zip(enable_features, estimator.feature_importance())),
                                                   columns=['feature_name', 'importance'])
        feature_importance_store_df = feature_importance_store_df.sort_values('importance', ascending=False)
        feature_importance_store_df['store_id'] = store_id
        #save the feature importance for a given store and prediction_horizon
        feature_importance_store_df.to_csv(
            result_dir_path / ('feature_importance_{0}_{1}.csv'.format(
                store_id, prediction_horizon)), index=False)
        #concat to store the feature importance for all the stores and a given prediction_horizon
        feature_importance_all_df = pd.concat([feature_importance_all_df, feature_importance_store_df])
        #save the model for the given store and prediction_horizon.
        # Save model - it's not real '.bin' but a pickle file
        # estimator = lgb.Booster(model_file='model.txt') can only predict with the best iteration (or the saving iteration)
        # pickle.dump gives us more flexibility like estimator.predict(TEST, num_iteration=100)
        # num_iteration - number of iteration want to predict with, NULL or <= 0 means use best iteration
        # the context manager closes the file handle even if pickling fails
        with open(model_name, 'wb') as model_file:
            pickle.dump(estimator, model_file)

        del train_data, valid_data, estimator
        gc.collect()

    print('aggregate feature importance')
    feature_importance_all_df.to_csv(result_dir_path / 'feature_importance_all_{0}.csv'.format(
        prediction_horizon), index=False)
    #aggregating (mean/std) the importance of all the features (for a given prediction_horizon) across all the stores.
    feature_importance_agg_df = feature_importance_all_df.groupby(
        'feature_name')['importance'].agg(['mean', 'std']).reset_index()
    feature_importance_agg_df.columns = ['feature_name', 'importance_mean', 'importance_std']
    feature_importance_agg_df = feature_importance_agg_df.sort_values('importance_mean', ascending=False)
    feature_importance_agg_df.to_csv(result_dir_path / 'feature_importance_agg_{0}.csv'.format(
        prediction_horizon), index=False)

    print('load base_test')
    #load the base_test data (which was saved as grid_df[preds_mask]) to perform the prediction using trained model.
    base_test = load_base_test(store_id_set_list)

    if export_all_flag:
        base_test.to_csv(
            result_dir_path / 'exp_base_test_{0}_a.csv'.format(prediction_horizon),
            index=False)
    #now we are populating the pred_v_df and pred_h_df for the test data for a given prediction_horizon
    if prediction_horizon_prev > 0:
        # predictions of the earlier horizons are filled into the target column of base_test
        pred_v_prev_df = pd.concat(
            [pd.read_csv(result_dir_path / 'pred_v_{}.csv'.format(ph))
             for ph in prediction_horizon_list if ph <= prediction_horizon_prev])
        for predict_day in range(1, prediction_horizon_prev + 1):
            # .loc writes into base_test itself; chained indexing may write into a temporary copy.
            # the values are assigned by position, so both frames must list the ids in the same order
            base_test.loc[base_test['d'] == (end_train_day_x + predict_day), target] = \
                pred_v_prev_df.loc[
                    pred_v_prev_df['d'] == (end_train_day_x + predict_day), target].values

    if export_all_flag:
        base_test.to_csv(
            result_dir_path / 'exp_base_test_{0}_b.csv'.format(prediction_horizon),
            index=False)

    pred_h_df = pd.DataFrame()
    #looping for each predict_day 1-by-1. when prediction_horizon=7 then prediction_horizon_prev will be 0 and it will
    #loop for 1-7 predict day.
    #when prediction_horizon=14 then prediction_horizon_prev will be 7 and it will loop for 8-14 predict_day
    for predict_day in range(prediction_horizon_prev + 1, prediction_horizon + 1):
        print('predict day{:02d}'.format(predict_day))
        grid_df = base_test.copy()
        #usually we set recursive_feature_flag to False.
        if recursive_feature_flag:
            print('[{0} - {1}] calculate recursive features'.format(
                end_train_day_x, prediction_horizon))
            for num_shift_rolling_day in num_shift_rolling_day_list:
                num_shift_day = num_shift_rolling_day[0]
                num_rolling_day = num_shift_rolling_day[1]
                # explicit copy: a new column is added to this frame below
                lag_df = base_test[['id', 'd', target]].copy()
                col_name = 'rolling_mean_tmp_' + str(num_shift_day) + '_' + str(num_rolling_day)
                lag_df[col_name] = lag_df.groupby(['id'])[target].transform(
                    lambda x: x.shift(num_shift_day).rolling(num_rolling_day).mean())
                grid_df = pd.concat([grid_df, lag_df[[col_name]]], axis=1)

        day_mask = base_test['d'] == (end_train_day_x + predict_day)
        if export_all_flag:
            print('export recursive_features')
            grid_df[day_mask].to_csv(result_dir_path / 'exp_recursive_features_{0}_{1}.csv'.format(
                prediction_horizon, predict_day), index=False)
        for store_index, store_id in enumerate(store_id_set_list):
            # Read all our models and make predictions for each day/store pairs
            print('[{3} - {4}] predict {0}/{1} {2} day {5}'.format(
                store_index + 1, len(store_id_set_list), store_id,
                end_train_day_x, prediction_horizon, predict_day))

            model_path = str(
                model_dir_path / f'lgb_model_{store_id}_{prediction_horizon}.bin')
            #loading the model for a given store and prediction_horizon
            # NOTE: pickle must only be used on model files written by this notebook
            with open(model_path, 'rb') as model_file:
                estimator = pickle.load(model_file)
            #selecting the test_data for given store_id and given predict_day
            if store_id != 'all':
                store_mask = base_test['store_id'] == store_id
                mask = (day_mask) & (store_mask)
            else:
                mask = day_mask

            if export_all_flag:
                print('export pred')
                grid_df[mask].to_csv(
                    result_dir_path / (
                            'exp_pred_' + store_id + '_day_' + str(predict_day) + '.csv'), index=False)
            #predict the base_test for a given predict_day and store
            # enable_features is the list returned for the last store in the training loop above
            base_test.loc[mask, target] = estimator.predict(grid_df[mask][enable_features])

        temp_df = base_test[day_mask][['id', target]]
        temp_df.columns = ['id', 'F' + str(predict_day)]
        #merge the dataframe to populate the prediction columns for different predict_days (1-by-1) for a given prediction_horizon
        if 'id' in list(pred_h_df):
            pred_h_df = pred_h_df.merge(temp_df, on=['id'], how='left')
        else:
            pred_h_df = temp_df.copy()

        del temp_df

    if export_all_flag:
        base_test.to_csv(
            result_dir_path / 'exp_base_test_{0}_c.csv'.format(prediction_horizon),
            index=False)
    #save pred_h_df for a given prediction_horizon
    pred_h_df.to_csv(result_dir_path / 'pred_h_{}.csv'.format(
        prediction_horizon), index=False)
    #populate the prediction in vertical pred df using the prediction output on base_test for all stores and given prediction_horizon.
    horizon_mask = (base_test['d'] >= end_train_day_x + prediction_horizon_prev + 1) & \
                   (base_test['d'] < end_train_day_x + prediction_horizon + 1)
    pred_v_df = base_test[horizon_mask][main_index_list + [target]]
    #save pred_v_df for a given prediction_horizon
    pred_v_df.to_csv(result_dir_path / 'pred_v_{}.csv'.format(prediction_horizon),
                     index=False)

    return pred_h_df, pred_v_df

In [28]:
class WRMSSEEvaluator(object):
    '''
    this class is for WRMSSE calculation

    The constructor aggregates the train and validation data on each of the 12 levels listed in
    `group_ids` and stores, per level k (1-based), the attributes lv{k}_train_df, lv{k}_valid_df,
    lv{k}_scale and lv{k}_weight. `score` then aggregates the predictions in the same way and
    returns the mean over the levels of the weighted RMSSE.
    '''

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame,
                 calendar: pd.DataFrame, prices: pd.DataFrame):
        '''
        train_df: id columns plus the `d_*` columns of the training period (one row per series).
        valid_df: `d_*` columns of the validation period; the id columns are taken from
                  train_df (joined on the index) when they are missing.
        calendar: needs the columns `d` and `wm_yr_wk`.
        prices: needs the columns `item_id`, `store_id`, `wm_yr_wk` and `sell_price`.
        '''
        # shallow copy: the helper column added below must not appear in the caller's
        # dataframe; the underlying data is shared, not duplicated
        train_df = train_df.copy(deep=False)

        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        # the weights are computed from the last 28 training days
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        train_df['all_id'] = 'all'  # for lv1 aggregation

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')] \
            .columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')] \
            .columns.tolist()

        if not all(c in valid_df.columns for c in id_columns):
            valid_df = pd.concat([train_df[id_columns], valid_df],
                                 axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                # drop the leading zeros, then take the mean squared day-to-day difference
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id) \
                [valid_target_columns].sum())

            # weight of a group = its share of the summed weight values on this level
            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        '''
        returns a dataframe with the id columns followed by one column per weight day holding
        units sold * sell_price for that day, with rows in the order of self.train_df.
        '''
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns] \
            .set_index(['item_id', 'store_id'])
        # wide -> long: one row per (item_id, store_id, day)
        weight_df = weight_df.stack().reset_index() \
            .rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left',
                                    on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        # long -> wide again; unstack sorts the rows, so they are put back into train_df order
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']) \
                        .unstack(level=2)['value'] \
                        .loc[list(zip(self.train_df.item_id, self.train_df.store_id)), :] \
            .reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns],
                               weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        '''
        RMSSE of every series on level `lv` (1-based).

        valid_preds: predictions aggregated on that level, indexed and labelled like lv{lv}_valid_df.
        returns: sqrt(mean squared error over the validation days / scale), one value per series.
        '''
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        # scale is a plain array: it is matched to the series by position
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame,
                                       np.ndarray]) -> float:
        '''
        valid_preds: predictions for the validation days, one row per series, same shape as the
                     validation targets; a dataframe is joined to the id columns on its index.
        returns: mean over all levels of the weighted sum of the RMSSE values.
        raises: AssertionError when the shape differs from the validation targets.

        The per-level predictions and scores are also stored as lv{k}_valid_preds / lv{k}_scores
        and the list of level scores as self.all_scores.
        '''
        assert self.valid_df[self.valid_target_columns].shape \
               == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds,
                                       columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns],
                                 valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            valid_preds_grp = valid_preds.groupby(group_id)[self.valid_target_columns].sum()
            setattr(self, f'lv{i + 1}_valid_preds', valid_preds_grp)

            lv_scores = self.rmsse(valid_preds_grp, i + 1)
            setattr(self, f'lv{i + 1}_scores', lv_scores)

            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1,
                                  sort=False).prod(axis=1)

            # infinite scores (zero scale) are left out of the level total
            all_scores.append(lv_scores[~lv_scores.isin([np.inf])].sum())
        self.all_scores = all_scores

        return np.mean(all_scores)

In [29]:
def calc_wrmsse(train_df, prices_df, calendar_df, submission_df, all_preds):
    '''
    calculates the Weighted Root Mean Squared Scaled Error (WRMSSE) of `all_preds` for the current fold.

    The day columns lying after end_train_day_x + prediction_horizon are cut off train_df, the
    last 28 remaining columns are used as actuals and everything before them as training data.
    The three frames handed to the evaluator are also written to result_dir_path as csv files.

    train_df: raw sales dataframe (id columns followed by the day columns).
    prices_df, calendar_df: passed on to WRMSSEEvaluator.
    submission_df: sample submission; its ids containing 'evaluation' fix the row order of the predictions.
    all_preds: horizontal predictions with an `id` column; ids without a prediction are filled with 0.
    returns: the wrmsse value.
    '''
    print('############################# calc_wrmsse ###################################')
    print('adjust end of train period')
    shape_before = train_df.shape
    surplus_days = end_train_day_default - end_train_day_x - prediction_horizon
    # only trim when there really are day columns beyond the end of this fold
    fold_df = train_df.iloc[:, :-surplus_days] if surplus_days > 0 else train_df
    print(shape_before, '-->', fold_df.shape)

    train_fold_df = fold_df.iloc[:, :-28]
    valid_fold_df = fold_df.iloc[:, -28:].copy()

    evaluation_ids = submission_df.loc[submission_df['id'].str.contains('evaluation'), ['id']]
    valid_preds = evaluation_ids.merge(all_preds, on=['id'], how='left') \
        .fillna(0) \
        .drop('id', axis=1)
    # label the prediction columns with the days of the actuals
    valid_preds.columns = valid_fold_df.columns

    exports = (
        ('eval_wrmsse_train.csv', train_fold_df),
        ('eval_wrmsse_test.csv', valid_fold_df),
        ('eval_wrmsse_pred.csv', valid_preds),
    )
    for file_name, frame in exports:
        frame.to_csv(result_dir_path / file_name, index=False)

    wrmsse = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar_df, prices_df).score(valid_preds)
    print('wrmsse', wrmsse)

    return wrmsse

In [31]:
####### Global variables
# Root folder of the project. The original location stays the default; set the M5_BASE_DIR
# environment variable to run the notebook elsewhere without editing this cell.
# os.path.join(..., "") guarantees the trailing separator which the later string
# concatenation (base_dir + "Data/") relies on.
base_dir = os.path.join(os.environ.get("M5_BASE_DIR", "/home/jupyter/m5-forecasting-kaggle/"), "")
# names of the raw input files
raw_train_file = "sales_train_evaluation.csv"
raw_price_file = "sell_prices.csv"
raw_calendar_file = "calendar.csv"
raw_submission_file = "sample_submission.csv"
main_index_list = ['id', 'd'] #key columns kept next to the target in the vertical predictions
target= "sales" #target variable which needs to be forecasted
start_train_day_x = 1 # We can skip some rows (Nans/faster training)
recursive_feature_flag = False #whether we want to use recursive features or not. Usually we avoid using recursive features to avoid data leakage.
seed = 42 # We want all things to be as deterministic as possible
export_all_flag = False #whether we want to save some intermediate files and results or not (usually False)

num_lag_day = 15 #number of lag features to be created
num_rolling_day_list = [7, 14, 30, 60, 180] #different window size for rolling mean/std features

In [32]:
# The remaining global settings, collected in one dict.
setting = {
    'data_dir_path': base_dir + "Data/",
    'output_name': "default",
    # End days of the train set, one per fold. Several folds give several validation sets for a
    # more robust evaluation of the model; they can also be utilised for tuning the hyper-parameters.
    'fold_id_list_csv': '1941,1913,1885,1857,1829,1577',
    # Prediction horizons. One model is built for each of the 4 weeks (1-7, 8-14, 15-21, 22-28 days).
    'prediction_horizon_list_csv': '7,14,21,28',
    # Because of memory issues only a few encoding features can be used.
    # The model will train only with the following encoding features.
    'mean_features': [
        'enc_cat_id_mean', 'enc_cat_id_std',
        'enc_dept_id_mean', 'enc_dept_id_std',
        'enc_item_id_mean', 'enc_item_id_std',
    ],
}


In [33]:
# Derive the remaining global variables (directories, fold/horizon lists, raw file paths,
# feature lists) from the settings through set_params().
(
    raw_dir_path, output_dir_path, result_dir_path, work_dir_path, model_dir_path,
    end_train_day_x_list, end_train_day_default,
    prediction_horizon_list,
    raw_train_path, raw_price_path, raw_calendar_path, raw_submission_path,
    remove_features, enable_features, mean_features,
) = set_params(setting, seed, raw_train_file, raw_price_file, raw_calendar_file, raw_submission_file)

# Show the derived values (everything except end_train_day_default) as the cell output.
(
    raw_dir_path, output_dir_path, result_dir_path, work_dir_path, model_dir_path,
    end_train_day_x_list,
    prediction_horizon_list,
    raw_train_path, raw_price_path, raw_calendar_path, raw_submission_path,
    remove_features, enable_features, mean_features,
)

(PosixPath('/home/jupyter/m5-forecasting-kaggle/Data/raw'),
 PosixPath('/home/jupyter/m5-forecasting-kaggle/Data/output/default'),
 PosixPath('/home/jupyter/m5-forecasting-kaggle/Data/output/default/result'),
 PosixPath('/home/jupyter/m5-forecasting-kaggle/Data/output/default/work'),
 PosixPath('/home/jupyter/m5-forecasting-kaggle/Data/output/default/model'),
 [1941, 1913, 1885, 1857, 1829, 1577],
 [7, 14, 21, 28],
 PosixPath('/home/jupyter/m5-forecasting-kaggle/Data/raw/sales_train_evaluation.csv'),
 PosixPath('/home/jupyter/m5-forecasting-kaggle/Data/raw/sell_prices.csv'),
 PosixPath('/home/jupyter/m5-forecasting-kaggle/Data/raw/calendar.csv'),
 PosixPath('/home/jupyter/m5-forecasting-kaggle/Data/raw/sample_submission.csv'),
 ['id', 'state_id', 'store_id', 'date', 'wm_yr_wk', 'd', 'sales'],
 None,
 ['enc_cat_id_mean',
  'enc_cat_id_std',
  'enc_dept_id_mean',
  'enc_dept_id_std',
  'enc_item_id_mean',
  'enc_item_id_std'])

In [34]:
print('main')
#loading raw data
train_df, prices_df, calendar_df, submission_df = load_data()
#saving original result, work and model directory which will be changed as we iterate for different end_train_day_x.
result_dir_org_path = result_dir_path
work_dir_org_path = work_dir_path
model_dir_org_path = model_dir_path
result_summary_all_df = pd.DataFrame()
#iterating for different end_train_day_x to evaluate in different time horizons
#(end_train_day_x is a global which the functions called below read)
for end_train_day_x in end_train_day_x_list:
    # updating result, work and model directory depending on the end_train_day_x it is running for
    result_dir_path = result_dir_org_path / str(end_train_day_x)
    work_dir_path = work_dir_org_path / str(end_train_day_x)
    model_dir_path = model_dir_org_path / str(end_train_day_x)
    grid_base_path, grid_price_path, grid_calendar_path, holdout_path, \
    lag_feature_path, target_encoding_feature_path, result_submission_path =update_file_path()

    pred_h_all_df = pd.DataFrame()
    pred_v_all_df = pd.DataFrame()
    prediction_horizon_prev = 0 #initially prediction_horizon_prev is zero.
    # iterate for different prediction_horizons [7, 14, 21, 28] since we are building different models for different prediction_horizon.
    for predict_horizon in prediction_horizon_list:
        print('-----------------', 'fold_id', end_train_day_x, 'predict_horizon', predict_horizon)
        prediction_horizon = predict_horizon
        # get the different lags to generate the feature for for different prediction_horizon.
        num_lag_day_list, num_shift_rolling_day_list = update_predict_horizon()
        #generate base, price and calendar related features
        generate_grid_full(train_df, prices_df, calendar_df)
        # generate lag features
        generate_lag_feature(grid_base_path, lag_feature_path, num_lag_day_list, num_rolling_day_list, recursive_feature_flag, num_shift_rolling_day_list)
        #generate target encoding for categorical variables/pairs
        generate_target_encoding_feature(grid_base_path)
        #train and get prediction with different models for each prediction_horizon and stores.
        pred_h_df, pred_v_df = train_and_predict(train_df, calendar_df, prices_df, submission_df)
        # append pred_h and pred_v for [7, 14, 21, 28] prediction_horizon to get the overall pred_h_all and pred_v_all
        if pred_h_all_df.shape[1] == 0:
            pred_h_all_df = pred_h_df
        else:
            pred_h_all_df = pred_h_all_df.merge(pred_h_df, on='id')
        pred_v_all_df = pd.concat([pred_v_all_df, pred_v_df], axis=0)
        prediction_horizon_prev = predict_horizon

        try:
            print('clear work_dir')
            shutil.rmtree(work_dir_path)
            os.mkdir(work_dir_path)
        except Exception:
            # best effort clean-up: record the failure with its traceback and carry on
            logging.exception('failed to clear work_dir %s', work_dir_path)
    #save pred_h_all and pred_v_all
    pred_h_all_df.to_csv(result_dir_path / 'pred_h_all.csv', index=False)
    pred_v_all_df.to_csv(result_dir_path / 'pred_v_all.csv', index=False)
    #read the holdout data for different end_train_day_x and prediction_horizon which was saved before.
    holdout_df = pd.read_csv(holdout_path)
    print('holdout_df.shape', holdout_df.shape)
    print('pred_v_all_df.shape', pred_v_all_df.shape)

    #following if condition will correspond to final submission holdout data.
    if holdout_df.shape[0] == 0:
        print('no holdout')
        print('generate submission')
        pred_h_all_df = pred_h_all_df.reset_index(drop=True)
        submission = pd.read_csv(raw_submission_path)[['id']]
        submission = submission.merge(pred_h_all_df, on=['id'], how='left').fillna(0)
        #submission csv file is saved in result folder in the output directory
        submission.to_csv(result_submission_path, index=False)
        result_summary_df = None
    else:
        print('calc metrics')
        #merge the actual holdout data and predictions for it to get the evaluation metrics.
        result_df = holdout_df.merge(pred_v_all_df, on=['id', 'd'], how='inner')
        # renaming by position: the merged frame has to consist of exactly these four columns
        result_df.columns = ['id', 'd', 'y_test', 'y_pred']
        print('result_df.shape', result_df.shape)
        result_df.to_csv(result_dir_path / 'result.csv', index=False)
        # calculate the value of evaluation metrics on holdout actuals and preds.
        wrmsse = calc_wrmsse(train_df, prices_df, calendar_df, submission_df, pred_h_all_df)

        rmse = np.sqrt(mean_squared_error(result_df['y_test'], result_df['y_pred']))
        #save these metrics value in a dataframe corresponding to different end_train_day_x
        result_summary_df = pd.DataFrame(
            [
                [end_train_day_x, 'wrmsse', wrmsse],
                [end_train_day_x, 'rmse', rmse],
            ],
            columns=['fold_id', 'metric_name', 'metric_value'])
        print(result_summary_df)
        result_summary_df.to_csv(result_dir_path / 'result_summary.csv', index=False)
        result_summary_all_df = pd.concat([result_summary_all_df, result_summary_df])

#get the aggregated metrics (mean and median) across different end_train_day_x.
#nothing to report when no fold had holdout data
if result_summary_all_df.shape[0] > 0:
    print(result_summary_all_df)
    print(result_summary_all_df.groupby('metric_name')['metric_value'].agg(['mean', 'median']))
    result_summary_all_df.to_csv(result_dir_org_path / 'result_summary_all.csv', index=False)
reset_dir_path()

main
#################### load_data #######################
train_df.shape (30490, 1947)
prices_df.shape (6841121, 4)
calendar_df.shape (1969, 14)
submission_df.shape (60980, 29)
----------------- fold_id 1941 predict_horizon 7
################# generate_grid_base ###################
melting to convert horizontal to vertical data
grid_df.shape after melting (vertical data) (59181090, 8)
generate holdout data for validation by removing days before end_train_day_x
59181090 --> 59181090
add test days
convert to category
calc release week
convert release to int16
save grid_base
grid_df.shape (59394520, 10)
################## generate_grid_price #################
loading grid_base
merge prices
save grid_price
################## generate_grid_calendar ####################
############## modify_grid_base ####################
################ generate_lag features ################
load grid_base
creating lags
create rolling aggs (mean/std)
rolling period 7
save lag_feature
################ gen

A Jupyter Widget


wrmsse 2.5256279621941418
   fold_id metric_name  metric_value
0     1913      wrmsse      2.525628
1     1913        rmse      3.639888
----------------- fold_id 1885 predict_horizon 7
################# generate_grid_base ###################
melting to convert horizontal to vertical data
grid_df.shape after melting (vertical data) (59181090, 8)
generate holdout data for validation by removing days before end_train_day_x
59181090 --> 57473650
add test days
convert to category
calc release week
convert release to int16
save grid_base
grid_df.shape (57687080, 10)
################## generate_grid_price #################
loading grid_base
merge prices
save grid_price
################## generate_grid_calendar ####################
############## modify_grid_base ####################
################ generate_lag features ################
load grid_base
creating lags
create rolling aggs (mean/std)
rolling period 7
save lag_feature
################ generate_target_encoding_feature ###########

A Jupyter Widget


wrmsse 2.426944112530406
   fold_id metric_name  metric_value
0     1885      wrmsse      2.426944
1     1885        rmse      3.586252
----------------- fold_id 1857 predict_horizon 7
################# generate_grid_base ###################
melting to convert horizontal to vertical data
grid_df.shape after melting (vertical data) (59181090, 8)
generate holdout data for validation by removing days before end_train_day_x
59181090 --> 56619930
add test days
convert to category
calc release week
convert release to int16
save grid_base
grid_df.shape (56833360, 10)
################## generate_grid_price #################
loading grid_base
merge prices
save grid_price
################## generate_grid_calendar ####################
############## modify_grid_base ####################
################ generate_lag features ################
load grid_base
creating lags
create rolling aggs (mean/std)
rolling period 7
save lag_feature
################ generate_target_encoding_feature ############

A Jupyter Widget


wrmsse 2.410408997233953
   fold_id metric_name  metric_value
0     1857      wrmsse      2.410409
1     1857        rmse      3.640562
----------------- fold_id 1829 predict_horizon 7
################# generate_grid_base ###################
melting to convert horizontal to vertical data
grid_df.shape after melting (vertical data) (59181090, 8)
generate holdout data for validation by removing days before end_train_day_x
59181090 --> 55766210
add test days
convert to category
calc release week
convert release to int16
save grid_base
grid_df.shape (55979640, 10)
################## generate_grid_price #################
loading grid_base
merge prices
save grid_price
################## generate_grid_calendar ####################
############## modify_grid_base ####################
################ generate_lag features ################
load grid_base
creating lags
create rolling aggs (mean/std)
rolling period 7
save lag_feature
################ generate_target_encoding_feature ############

A Jupyter Widget


wrmsse 2.395455234896503
   fold_id metric_name  metric_value
0     1829      wrmsse      2.395455
1     1829        rmse      3.698429
----------------- fold_id 1577 predict_horizon 7
################# generate_grid_base ###################
melting to convert horizontal to vertical data
grid_df.shape after melting (vertical data) (59181090, 8)
generate holdout data for validation by removing days before end_train_day_x
59181090 --> 48082730
add test days
convert to category
calc release week
convert release to int16
save grid_base
grid_df.shape (48296160, 10)
################## generate_grid_price #################
loading grid_base
merge prices
save grid_price
################## generate_grid_calendar ####################
############## modify_grid_base ####################
################ generate_lag features ################
load grid_base
creating lags
create rolling aggs (mean/std)
rolling period 7
save lag_feature
################ generate_target_encoding_feature ############

A Jupyter Widget


wrmsse 2.1684427821158345
   fold_id metric_name  metric_value
0     1577      wrmsse      2.168443
1     1577        rmse      3.697607
   fold_id metric_name  metric_value
0     1913      wrmsse      2.525628
1     1913        rmse      3.639888
0     1885      wrmsse      2.426944
1     1885        rmse      3.586252
0     1857      wrmsse      2.410409
1     1857        rmse      3.640562
0     1829      wrmsse      2.395455
1     1829        rmse      3.698429
0     1577      wrmsse      2.168443
1     1577        rmse      3.697607
                 mean    median
metric_name                    
rmse         3.652548  3.640562
wrmsse       2.385376  2.410409


In [1]:
# pred_h_df, pred_v_df= train_and_predict(train_df, calendar_df, prices_df, submission_df)

In [2]:
# def main():
#     print('main')
#     global raw_dir_path, output_dir_path, result_dir_path, work_dir_path, model_dir_path, end_train_day_x_list, \
#            prediction_horizon_list, raw_train_path, raw_price_path, raw_calendar_path, raw_submission_path, \
#            remove_features, enable_features, mean_features, grid_base_path, grid_price_path, grid_calendar_path, holdout_path, \
#             lag_feature_path, target_encoding_feature_path, result_submission_path
    
#     train_df, prices_df, calendar_df, submission_df = load_data()
#     result_dir_org_path = result_dir_path
#     work_dir_org_path = work_dir_path
#     model_dir_org_path = model_dir_path
#     result_summary_all_df = pd.DataFrame()
#     for end_train_day_x in end_train_day_x_list:
#         end_train_day_x = end_train_day_x
#         result_dir_path = result_dir_org_path / str(end_train_day_x)
#         work_dir_path = work_dir_org_path / str(end_train_day_x)
#         model_dir_path = model_dir_org_path / str(end_train_day_x)
#         grid_base_path, grid_price_path, grid_calendar_path, holdout_path, \
#         lag_feature_path, target_encoding_feature_path, result_submission_path = update_file_path()

#         pred_h_all_df = pd.DataFrame()
#         pred_v_all_df = pd.DataFrame()
#         prediction_horizon_prev = 0
#         for predict_horizon in prediction_horizon_list:
#             print('-----------------', 'fold_id', end_train_day_x, 'predict_horizon', predict_horizon)
#             prediction_horizon = predict_horizon
#             num_lag_day_list, num_shift_rolling_day_list = update_predict_horizon()
#             generate_grid_full(train_df, prices_df, calendar_df)
#             generate_lag_feature(grid_base_path, lag_feature_path, num_lag_day_list, num_rolling_day_list, recursive_feature_flag, num_shift_rolling_day_list)
#             generate_target_encoding_feature(grid_base_path)
#             pred_h_df, pred_v_df = train_and_predict(train_df, calendar_df, prices_df, submission_df)
#             if pred_h_all_df.shape[1] == 0:
#                 pred_h_all_df = pred_h_df
#             else:
#                 pred_h_all_df = pred_h_all_df.merge(pred_h_df, on='id')
#             pred_v_all_df = pd.concat([pred_v_all_df, pred_v_df], axis=0)
#             prediction_horizon_prev = predict_horizon

#             try:
#                 print('clear work_dir')
#                 shutil.rmtree(work_dir_path)
#                 os.mkdir(work_dir_path)
#             except Exception:
#                 log.exception()

#         pred_h_all_df.to_csv(result_dir_path / 'pred_h_all.csv', index=False)
#         pred_v_all_df.to_csv(result_dir_path / 'pred_v_all.csv', index=False)

#         holdout_df = pd.read_csv(holdout_path)
#         print('holdout_df.shape', holdout_df.shape)
#         print('pred_v_all_df.shape', pred_v_all_df.shape)

#         if holdout_df.shape[0] == 0:
#             print('no holdout')
#             print('generate submission')
#             pred_h_all_df = pred_h_all_df.reset_index(drop=True)
#             submission = pd.read_csv(raw_submission_path)[['id']]
#             submission = submission.merge(pred_h_all_df, on=['id'], how='left').fillna(0)
#             submission.to_csv(result_submission_path, index=False)
#             result_summary_df = None
#         else:
#             print('calc metrics')
#             result_df = holdout_df.merge(pred_v_all_df, on=['id', 'd'], how='inner')
#             result_df.columns = ['id', 'd', 'y_test', 'y_pred']
#             print('result_df.shape', pred_v_all_df.shape)
#             result_df.to_csv(result_dir_path / 'result.csv', index=False)

#             wrmsse = calc_wrmsse(train_df, prices_df, calendar_df, submission_df, pred_h_all_df)

#             rmse = np.sqrt(mean_squared_error(result_df['y_test'], result_df['y_pred']))

#             result_summary_df = pd.DataFrame(
#                 [
#                     [end_train_day_x, 'wrmsse', wrmsse],
#                     [end_train_day_x, 'rmse', rmse],
#                 ],
#                 columns=['fold_id', 'metric_name', 'metric_value'])
#             print(result_summary_df)
#             result_summary_df.to_csv(result_dir_path / 'result_summary.csv', index=False)
#             result_summary_all_df = pd.concat([result_summary_all_df, result_summary_df])

#     if result_summary_all_df.shape[0] == 0:
#         pass
#     else:
#         print(result_summary_all_df)
#         print(result_summary_all_df.groupby('metric_name')['metric_value'].agg(['mean', 'median']))
#         result_summary_all_df.to_csv(result_dir_org_path / 'result_summary_all.csv', index=False)
#     reset_dir_path()
#     return

In [3]:
# grid_df, enable_features = load_grid_by_store("CA_1")

In [4]:
# grid_df.dtypes[1:50]

In [48]:
# base_dir = "/home/jupyter/m5-forecasting-kaggle/"
# raw_train_file = "sales_train_evaluation.csv"
# raw_price_file = "sell_prices.csv"
# raw_calendar_file = "calendar.csv"
# raw_submission_file = "sample_submission.csv"
# end_train_day_x = 1913 #till d_1913 will go to train data and last 28 days will go into validation data. --re

# main_index_list = ['id', 'd']
# target= "sales"
# start_train_day_x = 1
# prediction_horizon= 7 #--re
# num_lag_day = 15
# recursive_feature_flag = False
# seed = 42
# export_all_flag = False
# recursive_feature_flag = False

# num_lag_day = 15
# num_rolling_day_list = [7, 14, 30, 60, 180]

# prediction_horizon_prev = 0

In [52]:
# setting = dict()
# setting['data_dir_path'] =  base_dir +"Data/"
# setting['output_name'] = "default"
# setting['fold_id_list_csv'] = "1913"
# setting['prediction_horizon_list_csv']= "7"

In [76]:
# raw_dir_path, output_dir_path, result_dir_path, work_dir_path, model_dir_path, end_train_day_x_list, \
#            prediction_horizon_list, raw_train_path, raw_price_path, raw_calendar_path, raw_submission_path, \
#            remove_features, enable_features, mean_features = set_params(setting, seed, raw_train_file, raw_price_file, raw_calendar_file, raw_submission_file)

# raw_dir_path, output_dir_path, result_dir_path, work_dir_path, model_dir_path, end_train_day_x_list, \
#            prediction_horizon_list, raw_train_path, raw_price_path, raw_calendar_path, raw_submission_path, \
#            remove_features, enable_features, mean_features

In [5]:
# train_df, prices_df, calendar_df, submission_df = load_data(raw_train_path, raw_price_path, raw_calendar_path, raw_submission_path)
# train_df.head()

In [6]:
# result_dir_org_path = result_dir_path
# work_dir_org_path = work_dir_path
# model_dir_org_path = model_dir_path
# result_summary_all_df = pd.DataFrame()
# end_train_day_x = end_train_day_x
# result_dir_path = result_dir_org_path / str(end_train_day_x)
# work_dir_path = work_dir_org_path / str(end_train_day_x)
# model_dir_path = model_dir_org_path / str(end_train_day_x)
# grid_base_path, grid_price_path, grid_calendar_path, holdout_path, \
# lag_feature_path, target_encoding_feature_path, result_submission_path = update_file_path()
# grid_base_path, grid_price_path, grid_calendar_path, holdout_path, \
# lag_feature_path, target_encoding_feature_path, result_submission_path

In [8]:
# # grid_df will have data up to d_(end_train_day_x + prediction_horizon).
# # e.g. if end_train_day_x = 1913 and prediction_horizon = 7, then d_1 to d_1920 data is there:
# # d_1 to d_1913 is there for training and the last 7 days of data are there for holdout/validation.
# # After training with data up to d_1913, we will generate the forecast for d_1914 to d_1920 and compare it
# # with the actual sales numbers.
# # The same goes for the price and calendar grids. They will have the same number of rows, which will be concatenated
# # along the columns later to get all the base features and the price and calendar related features.
# grid_df= generate_grid_base(train_df, prices_df, calendar_df)
# # grid_df.head

In [9]:
# grid_df.columns, prices_df.columns, calendar_df.columns, grid_price_df.columns, grid_calendar_df.columns

In [10]:
# grid_price_df = generate_grid_price(prices_df, calendar_df)
# grid_price_df.head()

In [11]:
# grid_calendar_df= generate_grid_calendar(calendar_df)
# grid_calendar_df.head()

In [12]:
# modify_grid_base()

In [13]:
# num_lag_day_list, num_shift_rolling_day_list = update_predict_horizon(num_lag_day)
# num_lag_day_list, num_shift_rolling_day_list

In [14]:
# generate_grid_full(train_df, prices_df, calendar_df)

In [15]:
# grid_df = generate_lag_feature(grid_base_path, lag_feature_path, num_lag_day_list, num_rolling_day_list, recursive_feature_flag, num_shift_rolling_day_list)
# grid_df.loc[grid_df["id"] == "HOBBIES_2_006_CA_1_evaluation"].head(50)

In [16]:
# print(grid_df.columns)

In [17]:
# grid_df= generate_target_encoding_feature(grid_base_path)
# print(grid_df.columns)
# grid_df.head()

In [19]:
# grid_calendar_df.tm_d.unique()

In [20]:
# train_df = pd.read_csv("Data/sales_train_evaluation.csv")
# prices_df = pd.read_csv("Data/sell_prices.csv")
# calendar_df = pd.read_csv("Data/calendar.csv")
# submission_df = pd.read_csv("Data/sample_submission.csv")
# print(train_df.shape, prices_df.shape, calendar_df.shape, submission_df.shape)
# train_df.head()

In [21]:
# print('generate_grid_base')
# index_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
# grid_df = pd.melt(train_df, id_vars=index_columns, var_name='d', value_name= target)
# print(grid_df.shape)
# assert grid_df.shape[0] == train_df.shape[0]*1941
# grid_df.head()

In [22]:
# print('remove days before end_train_day_x / generate holdout')
# num_before = grid_df.shape[0]
# grid_df['d_org'] = grid_df['d']
# grid_df['d'] = grid_df['d'].apply(lambda x: x[2:]).astype(np.int16)
# grid_df.head()

In [23]:
# holdout_df = grid_df[(grid_df['d'] > end_train_day_x) & \
#                              (grid_df['d'] <= end_train_day_x + prediction_horizon)][main_index_list + [target]]

In [24]:
# holdout_df.head()

In [25]:
# holdout_df.to_csv("Data/holdout_df_1914_1941_28.csv", index=False)

In [26]:
# grid_df = reduce_mem_usage(grid_df)

In [27]:
# grid_df = grid_df[grid_df['d'] <= end_train_day_x]
# grid_df['d'] = grid_df['d_org']
# grid_df = grid_df.drop('d_org', axis=1)
# num_after = grid_df.shape[0]
# print("Number of train data after separating holdout set: ", num_after)

In [28]:
# add_grid = pd.DataFrame()
# for i in range(prediction_horizon):
#     temp_df = train_df[index_columns]
#     temp_df = temp_df.drop_duplicates()
#     temp_df['d'] = 'd_' + str(end_train_day_x + i + 1)
#     temp_df[target] = np.nan
#     add_grid = pd.concat([add_grid, temp_df])
# print(add_grid.shape)
# add_grid.head()

In [29]:
# grid_df = pd.concat([grid_df, add_grid])
# grid_df = grid_df.reset_index(drop=True)

# del temp_df, add_grid
# del train_df
# gc.collect()
# print(grid_df.shape)
# grid_df.head()

In [30]:
# print('convert to category')
# for col in index_columns:
#     grid_df[col] = grid_df[col].astype('category')

In [31]:
# grid_df.dtypes

In [32]:
# grid_df = reduce_mem_usage(grid_df)

In [33]:
# print('calc release week') # the week in which this item was sold in a store for the first time
# release_df = prices_df.groupby(['store_id', 'item_id'])['wm_yr_wk'].agg(['min']).reset_index()
# release_df.columns = ['store_id', 'item_id', 'release']
# grid_df = merge_by_concat(grid_df, release_df, ['store_id', 'item_id'])
# del release_df
# gc.collect()
# grid_df = merge_by_concat(grid_df, calendar_df[['wm_yr_wk', 'd']], ['d'])
# grid_df = grid_df.reset_index(drop=True)

# print('convert release to int16')
# grid_df['release'] = grid_df['release'] - grid_df['release'].min()
# grid_df['release'] = grid_df['release'].astype(np.int16)

# print('save grid_base')
# grid_df.to_pickle("Data/grid_base_1914_1941_28.pkl")

# print('grid_df.shape', grid_df.shape)
# print(grid_df.head())

In [34]:
# print(grid_df.shape)
# print(grid_df.head())

In [35]:
# print('generate_grid_price')
# print('load grid_base')
# grid_df = pd.read_pickle("Data/grid_base_1914_1941_28.pkl")

# prices_df['price_max'] = prices_df.groupby(['store_id', 'item_id'])['sell_price'].transform('max')
# prices_df['price_min'] = prices_df.groupby(['store_id', 'item_id'])['sell_price'].transform('min')
# prices_df['price_std'] = prices_df.groupby(['store_id', 'item_id'])['sell_price'].transform('std')
# prices_df['price_mean'] = prices_df.groupby(['store_id', 'item_id'])['sell_price'].transform('mean')
# prices_df['price_norm'] = prices_df['sell_price'] / prices_df['price_max']
# prices_df['price_nunique'] = prices_df.groupby(['store_id', 'item_id'])['sell_price'].transform('nunique')
# prices_df['item_nunique'] = prices_df.groupby(['store_id', 'sell_price'])['item_id'].transform('nunique')

In [36]:
# calendar_prices = calendar_df[['wm_yr_wk', 'month', 'year']]
# calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk'])
# prices_df = prices_df.merge(calendar_prices[['wm_yr_wk', 'month', 'year']], on=['wm_yr_wk'], how='left')
# del calendar_prices

# prices_df['price_momentum'] = prices_df['sell_price'] / prices_df.groupby(['store_id', 'item_id'])[
#     'sell_price'].transform(lambda x: x.shift(1))
# prices_df['price_momentum_m'] = prices_df['sell_price'] / prices_df.groupby(['store_id', 'item_id', 'month'])[
#     'sell_price'].transform('mean')
# prices_df['price_momentum_y'] = prices_df['sell_price'] / prices_df.groupby(['store_id', 'item_id', 'year'])[
#     'sell_price'].transform('mean')

In [37]:
# prices_df['sell_price_cent'] = [math.modf(p)[0] for p in prices_df['sell_price']]
# prices_df['price_max_cent'] = [math.modf(p)[0] for p in prices_df['price_max']]
# prices_df['price_min_cent'] = [math.modf(p)[0] for p in prices_df['price_min']]

# del prices_df['month'], prices_df['year']

In [38]:
# prices_df= reduce_mem_usage(prices_df)

In [39]:
# print('merge prices')
# original_columns = list(grid_df)
# grid_df = grid_df.merge(prices_df, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
# keep_columns = [col for col in list(grid_df) if col not in original_columns]
# grid_df = grid_df[main_index_list + keep_columns]
# grid_df = reduce_mem_usage(grid_df)

# print('save grid_price')
# grid_df.to_pickle("Data/grid_price_1914_1941_28.pkl")
# del prices_df

In [38]:
# grid_df.loc[grid_df["id"] == "HOBBIES_2_006_CA_1_evaluation"].head(50)

In [40]:
# def get_moon_phase(d):  # 0=new, 4=full; 4 days/phase
#     diff = datetime.datetime.strptime(d, '%Y-%m-%d') - datetime.datetime(2001, 1, 1)
#     days = dec(diff.days) + (dec(diff.seconds) / dec(86400))
#     lunations = dec("0.20439731") + (days * dec("0.03386319269"))
#     phase_index = math.floor((lunations % dec(1) * dec(8)) + dec('0.5'))
#     return int(phase_index) & 7

In [41]:
# print('generate_grid_calendar')
# grid_df = pd.read_pickle("Data/grid_price_1914_1941_28.pkl")
# grid_df = grid_df[main_index_list]
# grid_df.head()

In [42]:
# calendar_df['moon'] = calendar_df.date.apply(get_moon_phase)
# calendar_df.head()

In [43]:
# # Merge calendar partly
# icols = ['date', 'd','event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI','moon']

In [44]:
# grid_df = grid_df.merge(calendar_df[icols], on=['d'], how='left')
# grid_df.head()

In [45]:
# icols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI']
# for col in icols:
#     grid_df[col] = grid_df[col].astype('category')

In [46]:
# grid_df['date'] = pd.to_datetime(grid_df['date'])

# grid_df['tm_d'] = grid_df['date'].dt.day.astype(np.int8)
# grid_df['tm_w'] = grid_df['date'].dt.week.astype(np.int8)
# grid_df['tm_m'] = grid_df['date'].dt.month.astype(np.int8)
# grid_df['tm_y'] = grid_df['date'].dt.year
# grid_df['tm_y'] = (grid_df['tm_y'] - grid_df['tm_y'].min()).astype(np.int8)
# grid_df['tm_wm'] = grid_df['tm_d'].apply(lambda x: ceil(x / 7)).astype(np.int8)

In [47]:
# grid_df.head()

In [48]:
# grid_df['tm_dw'] = grid_df['date'].dt.dayofweek.astype(np.int8)
# grid_df['tm_w_end'] = (grid_df['tm_dw'] >= 5).astype(np.int8)
# del grid_df['date']

# grid_df.to_pickle("Data/grid_calendar_1914_1941_28.pkl")

# del calendar_df
# del grid_df
# gc.collect()

In [49]:
# print('modify_grid_base')
# grid_df = pd.read_pickle("Data/grid_base_1914_1941_28.pkl")
# grid_df['d'] = grid_df['d'].apply(lambda x: x[2:]).astype(np.int16)

# del grid_df['wm_yr_wk']
# grid_df.to_pickle("Data/grid_base_1914_1941_28.pkl")

# del grid_df

In [50]:
# print('load_grid_full')
# grid_df = pd.concat([pd.read_pickle("Data/grid_base_1914_1941_28.pkl"),
#                      pd.read_pickle("Data/grid_price_1914_1941_28.pkl").iloc[:, 2:],
#                      pd.read_pickle("Data/grid_calendar_1914_1941_28.pkl").iloc[:, 2:]],
#                     axis=1)

In [51]:
# grid_df.head()

In [52]:
# print('generate_lag')
# print('load grid_base')
# grid_df = pd.read_pickle("Data/grid_base_1914_1941_28.pkl")

# grid_df = grid_df[['id', 'd', 'sales']]

# start_time = time.time()
# print('create lags')

In [53]:
# # Why lags 28-42 and not 1-15? Because for the test data (future dates) we won't have the sales numbers for lags 1-27, since these lags
# # would themselves be in the future for a 1-28 days out forecast.
# # The lags are different for each week model. With num_lag_day = 15, model w1's (1-7 days out) lags are {7, 8, 9, ..., 21}.
# # Model w4's (22-28 days out) lags are {28, 29, 30, ..., 42}.
# # Why is the target shifted by "prediction_horizon" before taking the rolling mean/std? For the test data (future dates), during our forecast horizon
# # we won't have the sales numbers to take the rolling mean/std; that's why we first have to shift by "prediction_horizon" and then take the rolling
# # mean/std. So for a prediction_horizon of 28, a 7-day rolling mean/std is effectively computed over the window from 34 to 28 days before the target day.
# # For each week model you have to shift those features forward by 7 days for the first week (1-7 days), 14 days for the second (8-14 days), etc.
# num_lag_day_list = []
# for col in range(prediction_horizon, prediction_horizon + num_lag_day):
#     num_lag_day_list.append(col)
# print(num_lag_day_list)

In [54]:
# grid_df = grid_df.assign(**{
#     '{}_lag_{}'.format(col, l): grid_df.groupby(['id'])[col].transform(lambda x: x.shift(l))
#     for l in num_lag_day_list
#     for col in [target]
# })

In [55]:
# grid_df.loc[grid_df["id"] == "HOBBIES_1_001_CA_1_evaluation"].head(50)

In [56]:
# num_rolling_day_list = [7, 14, 30, 60, 180]

# num_shift_rolling_day_list = []
# for num_shift_day in [1, 7, 14]:
#     for num_rolling_day in [7, 14, 30, 60]:
#         num_shift_rolling_day_list.append([num_shift_day, num_rolling_day])
# num_shift_rolling_day_list

In [57]:
# for col in list(grid_df):
#     if 'lag' in col:
#         grid_df[col] = grid_df[col].astype(np.float16)

# start_time = time.time()
# print('create rolling aggs')
# # Why is the target shifted by "prediction_horizon" before taking the rolling mean/std? For the test data (future dates), during our forecast horizon
# # we won't have the sales numbers to take the rolling mean/std; that's why we first have to shift by "prediction_horizon" and then take the rolling
# # mean/std. So for a prediction_horizon of 28, a 7-day rolling mean/std is effectively computed over the window from 34 to 28 days before the target day.
# # For each week model you have to shift those features forward by 7 days for the first week (1-7 days), 14 days for the second (8-14 days), etc.
# for num_rolling_day in num_rolling_day_list:
#     print('rolling period', num_rolling_day)
#     grid_df['rolling_mean_' + str(num_rolling_day)] = grid_df.groupby(['id'])[target].transform(
#         lambda x: x.shift(prediction_horizon).rolling(num_rolling_day).mean()).astype(np.float16)
#     grid_df['rolling_std_' + str(num_rolling_day)] = grid_df.groupby(['id'])[target].transform(
#         lambda x: x.shift(prediction_horizon).rolling(num_rolling_day).std()).astype(np.float16)
# grid_df.head()

In [58]:
# grid_df.loc[grid_df["id"] == "HOBBIES_1_001_CA_1_evaluation"].head(60)

In [59]:
# if recursive_feature_flag:
#     for num_shift_rolling_day in num_shift_rolling_day_list:
#         num_shift_day = num_shift_rolling_day[0]
#         num_rolling_day = num_shift_rolling_day[1]
#         col_name = 'rolling_mean_tmp_' + str(num_shift_day) + '_' + str(num_rolling_day)
#         grid_df[col_name] = grid_df.groupby(['id'])[target].transform(
#             lambda x: x.shift(num_shift_day).rolling(num_rolling_day).mean()).astype(np.float16)

# print('save lag_feature')
# grid_df.to_pickle("Data/lag_feature_1914_1941_28.pkl")

In [60]:
# set_seed(seed)
# grid_df = pd.read_pickle("Data/grid_base_1914_1941_28.pkl")
# grid_df[target][grid_df['d'] > (end_train_day_x - prediction_horizon)] = np.nan
# base_cols = list(grid_df)

# icols = [['state_id'], ['store_id'], ['cat_id'], ['dept_id'], ['state_id', 'cat_id'], ['state_id', 'dept_id'], ['store_id', 'cat_id'],
#          ['store_id', 'dept_id'], ['item_id'], ['item_id', 'state_id'], ['item_id', 'store_id']]

# for col in icols:
#     print('encoding', col)
#     col_name = '_' + '_'.join(col) + '_'
#     grid_df['enc' + col_name + 'mean'] = grid_df.groupby(col)[target].transform('mean').astype(np.float16)
#     grid_df['enc' + col_name + 'std'] = grid_df.groupby(col)[target].transform('std').astype(np.float16)

# keep_cols = [col for col in list(grid_df) if col not in base_cols]
# grid_df = grid_df[['id', 'd'] + keep_cols]

# print('save target_encoding_feature')
# grid_df.to_pickle("Data/target_encoding_1914_1941_28.pkl")

In [61]:
# grid_df.head()

In [62]:
# def load_grid_by_store(store_id):
#     print('load_grid_full')
#     df = pd.concat([pd.read_pickle("Data/grid_base_1914_1941_28.pkl"),
#                     pd.read_pickle("Data/grid_price_1914_1941_28.pkl").iloc[:, 2:],
#                     pd.read_pickle("Data/grid_calendar_1914_1941_28.pkl").iloc[:, 2:]],
#                     axis=1)
#     store_id = "all"
#     if store_id != 'all':
#         df = df[df['store_id'] == store_id]
#     mean_features = ['enc_cat_id_mean', 'enc_cat_id_std', 'enc_dept_id_mean', 'enc_dept_id_std', 'enc_item_id_mean', 'enc_item_id_std']
#     df2 = pd.read_pickle("Data/target_encoding_1914_1941_28.pkl")[mean_features]
#     df2 = df2[df2.index.isin(df.index)]

#     df3 = pd.read_pickle("Data/lag_feature_1914_1941_28.pkl").iloc[:, 3:]
#     df3 = df3[df3.index.isin(df.index)]

#     df = pd.concat([df, df2], axis=1)
#     del df2

#     df = pd.concat([df, df3], axis=1)
#     del df3

#     remove_features = ['id', 'state_id', 'store_id', 'date', 'wm_yr_wk', 'd', target]
#     enable_features = [col for col in list(df) if col not in remove_features]
#     df = df[['id', 'd', target] + enable_features]

#     df = df[df['d'] >= start_train_day_x].reset_index(drop=True)
#     df= reduce_mem_usage(df)
#     return df, enable_features

In [63]:
# def load_base_test(store_id_set_list):
#     base_test = pd.DataFrame()

#     for store_id in store_id_set_list:
#         temp_df = pd.read_pickle("Data/"f'test_1914_1941_{store_id}_{prediction_horizon}.pkl')
#         temp_df['store_id'] = store_id
#         base_test = pd.concat([base_test, temp_df]).reset_index(drop=True)
#     return base_test

In [16]:
# base_test = pd.DataFrame()

# for store_id in store_id_set_list:
#     temp_df = pd.read_pickle(work_dir_path / f'test_{store_id}_{prediction_horizon}.pkl')
#     temp_df['store_id'] = store_id
#     base_test = pd.concat([base_test, temp_df]).reset_index(drop=True)

In [64]:
# lgb_params = {
#     'boosting_type': 'gbdt',
#     'objective': 'tweedie',
#     'tweedie_variance_power': 1.1,
#     'metric': 'rmse',
#     'subsample': 0.5,
#     'subsample_freq': 1,
#     'learning_rate': 0.03,
#     'num_leaves': 2 ** 11 - 1,
#     'min_data_in_leaf': 2 ** 12 - 1,
#     'feature_fraction': 0.5,
#     'max_bin': 100,
#     'n_estimators': 1400,
#     'boost_from_average': False,
#     'verbose': -1,
# }

# set_seed(seed)
# lgb_params['seed'] = seed

# store_id_set_list = list(train_df['store_id'].unique())
# store_id_set_list

In [65]:
# store_index, store_id = 0, store_id_set_list[0]
# grid_df, enable_features = load_grid_by_store(store_id)
# enable_features = enable_features

# train_mask = grid_df['d'] <= end_train_day_x
# valid_mask = train_mask & (grid_df['d'] > (end_train_day_x - prediction_horizon))
# preds_mask = grid_df['d'] > (end_train_day_x - 100)

In [66]:
# grid_df = reduce_mem_usage(grid_df)

In [67]:
# print('[{3} - {4}] train {0}/{1} {2}'.format(store_index + 1, len(store_id_set_list), store_id, end_train_day_x, prediction_horizon))
# train_data = lgb.Dataset(grid_df[train_mask][enable_features],
#                          label=grid_df[train_mask][target])

# valid_data = lgb.Dataset(grid_df[valid_mask][enable_features],
#                          label=grid_df[valid_mask][target])

# valid_data = lgb.Dataset(grid_df[valid_mask][enable_features],
#                          label=grid_df[valid_mask][target])

# # Saving part of the dataset for later predictions
# # Removing features that we need to calculate recursively
# grid_df = grid_df[preds_mask].reset_index(drop=True)
# grid_df.to_pickle("Data/"f'test_{store_id}_1914_1941_{prediction_horizon}.pkl')

# del grid_df
# gc.collect()

In [68]:
# train_data= reduce_mem_usage(train_data)
# valid_data= reduce_mem_usage(valid_data)

In [69]:
# feature_importance_all_df = pd.DataFrame()
# set_seed(seed)
# estimator = lgb.train(lgb_params, train_data, valid_sets=[valid_data], verbose_eval=False,
#                       callbacks=[log_evaluation(period=100)])

# model_name = str("Data/"f'lgb_model_{store_id}_1914_1941_{prediction_horizon}.bin')
# feature_importance_store_df = pd.DataFrame(sorted(zip(enable_features, estimator.feature_importance())),
#                                            columns=['feature_name', 'importance'])
# feature_importance_store_df = feature_importance_store_df.sort_values('importance', ascending=False)
# feature_importance_store_df['store_id'] = store_id
# feature_importance_all_df = pd.concat([feature_importance_all_df, feature_importance_store_df])
# pickle.dump(estimator, open(model_name, 'wb'))
# del train_data, valid_data, estimator
# gc.collect()

In [70]:
# print('aggregate feature importance')
# feature_importance_all_df.to_csv("Data/feature_importance_1914_1941_{0}_{1}.csv".format(store_id, prediction_horizon), index=False)
# feature_importance_agg_df = feature_importance_all_df.groupby('feature_name')['importance'].agg(['mean', 'std']).reset_index()
# feature_importance_agg_df.columns = ['feature_name', 'importance_mean', 'importance_std']
# feature_importance_agg_df = feature_importance_agg_df.sort_values('importance_mean', ascending=False)
# feature_importance_agg_df.to_csv("Data/feature_importance_agg_1914_1941_{0}_{1}.csv".format(store_id, prediction_horizon), index=False)

In [71]:
# print('load base_test')
# store_id_set_list= list(store_id)
# base_test = load_base_test(store_id_set_list)

In [72]:
# main_time = time.time()
# pred_h_df = pd.DataFrame()
# for predict_day in range(prediction_horizon_prev + 1, prediction_horizon + 1):
#     print('predict day{:02d}'.format(predict_day))
#     start_time = time.time()
#     grid_df = base_test.copy()

#     if self.params.recursive_feature_flag:
#         self.log.info('[{0} - {1}] calculate recursive features'.format(
#             self.params.end_train_day_x, self.params.prediction_horizon))
#         for num_shift_rolling_day in self.params.num_shift_rolling_day_list:
#             num_shift_day = num_shift_rolling_day[0]
#             num_rolling_day = num_shift_rolling_day[1]
#             lag_df = base_test[['id', 'd', self.params.target]]
#             col_name = 'rolling_mean_tmp_' + str(num_shift_day) + '_' + str(num_rolling_day)
#             lag_df[col_name] = lag_df.groupby(['id'])[self.params.target].transform(
#                 lambda x: x.shift(num_shift_day).rolling(num_rolling_day).mean())
#             grid_df = pd.concat([grid_df, lag_df[[col_name]]], axis=1)

#     day_mask = base_test['d'] == (self.params.end_train_day_x + predict_day)
#     if self.params.export_all_flag:
#         self.log.info('export recursive_features')
#         grid_df[day_mask].to_csv(self.params.result_dir_path / 'exp_recursive_features_{0}_{1}.csv'.format(
#             self.params.prediction_horizon, predict_day), index=False)
#     for store_index, store_id in enumerate(store_id_set_list):
#         self.log.info('[{3} - {4}] predict {0}/{1} {2} day {5}'.format(
#             store_index + 1, len(store_id_set_list), store_id,
#             self.params.end_train_day_x, self.params.prediction_horizon, predict_day))

#         model_path = str(
#             self.params.model_dir_path / f'lgb_model_{store_id}_{self.params.prediction_horizon}.bin')

#         estimator = pickle.load(open(model_path, 'rb'))
#         if store_id != 'all':
#             store_mask = base_test['store_id'] == store_id
#             mask = (day_mask) & (store_mask)
#         else:
#             mask = day_mask

#         if self.params.export_all_flag:
#             self.log.info('export pred')
#             grid_df[mask].to_csv(
#                 self.params.result_dir_path / (
#                         'exp_pred_' + store_id + '_day_' + str(predict_day) + '.csv'), index=False)
#         base_test[self.params.target][mask] = estimator.predict(grid_df[mask][self.params.enable_features])


In [None]:
# if  prediction_horizon_prev > 0:
#     pred_v_prev_df = None
#     for ph in prediction_horizon_list:
#         if ph <= prediction_horizon_prev:
#             pred_v_temp_df = pd.read_csv(result_dir_path / 'pred_v_{}.csv'.format(ph))
#             pred_v_prev_df = pd.concat([pred_v_prev_df, pred_v_temp_df])
#     for predict_day in range(1, prediction_horizon_prev + 1):
#         base_test[target][base_test['d'] == (end_train_day_x + predict_day)] = \
#             pred_v_prev_df[target][
#                 pred_v_prev_df['d'] == (end_train_day_x + predict_day)].values

In [73]:
# model_name = str("Data/"f'lgb_model_{store_id}_1914_1941_{prediction_horizon}.bin')
# model_name

In [74]:
# feature_importance_all_df.head()

In [14]:
# print(train_mask[-100:])
# print(valid_mask[-100:])
# print(preds_mask[-100:])

In [75]:
# NOTE(review): this whole cell is disabled. It looks like the body of the method named on the
# next line pasted at notebook top level: it still references self.params, self.log,
# self.load_base_test and Util.set_seed, none of which are defined in this cell, so it would
# not run as-is if uncommented.
# Stages, in order:
#   1. per-store LightGBM training, model pickling and feature-importance export
#   2. aggregation of feature importance across stores
#   3. loading base_test and back-filling days already predicted by earlier horizons
#   4. day-by-day prediction, optionally recomputing recursive rolling-mean features
#   5. export of the horizontal (pred_h) and vertical (pred_v) prediction files
# # train_and_predict(self, train_df, calendar_df, prices_df, submission_df):
# feature_importance_all_df = pd.DataFrame()
# for store_index, store_id in enumerate(store_id_set_list):
#     print('train', store_id)

#     grid_df, enable_features = load_grid_by_store(store_id)
# NOTE(review): the next assignment is a no-op (the name is assigned to itself).
#     enable_features = enable_features

# NOTE(review): valid_mask is a subset of train_mask, so the validation rows are also part of
# the training data built below. preds_mask keeps the 100 days before end_train_day_x plus
# everything after it; the 100 is a bare literal here.
#     train_mask = grid_df['d'] <= end_train_day_x
#     valid_mask = train_mask & (grid_df['d'] > (end_train_day_x - prediction_horizon))
#     preds_mask = grid_df['d'] > (end_train_day_x - 100)

#     self.log.info('[{3} - {4}] train {0}/{1} {2}'.format(
#         store_index + 1, len(store_id_set_list), store_id,
#         self.params.end_train_day_x, self.params.prediction_horizon))
#     if self.params.export_all_flag:
#         self.log.info('export train')
#         grid_df[train_mask].to_csv(
#             self.params.result_dir_path / ('exp_train_' + store_id + '.csv'), index=False)
#     train_data = lgb.Dataset(grid_df[train_mask][enable_features],
#                              label=grid_df[train_mask][self.params.target])

#     if self.params.export_all_flag:
#         self.log.info('export valid')
#         grid_df[valid_mask].to_csv(
#             self.params.result_dir_path / ('exp_valid_' + store_id + '.csv'), index=False)
#     valid_data = lgb.Dataset(grid_df[valid_mask][enable_features],
#                              label=grid_df[valid_mask][self.params.target])

#     if self.params.export_all_flag:
#         self.log.info('export test')
#         grid_df[preds_mask].to_csv(
#             self.params.result_dir_path / ('exp_test_' + store_id + '.csv'), index=False)

#     if self.params.export_all_flag:
#         self.log.info('export train_valid_test')
#         grid_df[train_mask | valid_mask | preds_mask].to_csv(
#             self.params.result_dir_path / ('exp_train_valid_test_' + store_id + '.csv'), index=False)
# NOTE(review): valid_data is built a second time here with the same arguments as above.
#     valid_data = lgb.Dataset(grid_df[valid_mask][enable_features],
#                              label=grid_df[valid_mask][self.params.target])

#     # Saving part of the dataset for later predictions
#     # Removing features that we need to calculate recursively
#     grid_df = grid_df[preds_mask].reset_index(drop=True)
#     if self.params.recursive_feature_flag:
#         keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
#         grid_df = grid_df[keep_cols]
#     grid_df.to_pickle(self.params.work_dir_path / f'test_{store_id}_{self.params.prediction_horizon}.pkl')
#     del grid_df

# NOTE(review): lgb_params is not defined in this cell. verbose_eval is passed together with a
# logging callback; newer LightGBM releases reportedly dropped verbose_eval from lgb.train --
# confirm against the installed version.
#     Util.set_seed(self.params.seed)
#     estimator = lgb.train(lgb_params,
#                           train_data,
#                           valid_sets=[valid_data],
#                           verbose_eval=False,
#                           callbacks=[self.log.log_evaluation(period=100)],
#                           )

# The importance table is first built sorted by feature name (sorted(zip(...))) and then
# re-sorted by importance, descending, before being written per store.
#     model_name = str(
#         self.params.model_dir_path / f'lgb_model_{store_id}_{self.params.prediction_horizon}.bin')
#     feature_importance_store_df = pd.DataFrame(sorted(zip(enable_features, estimator.feature_importance())),
#                                                columns=['feature_name', 'importance'])
#     feature_importance_store_df = feature_importance_store_df.sort_values('importance', ascending=False)
#     feature_importance_store_df['store_id'] = store_id
#     feature_importance_store_df.to_csv(
#         self.params.result_dir_path / ('feature_importance_{0}_{1}.csv'.format(
#             store_id, self.params.prediction_horizon)), index=False)
#     feature_importance_all_df = pd.concat([feature_importance_all_df, feature_importance_store_df])
# NOTE(review): the file opened for pickle.dump is never explicitly closed; a with-block would
# guarantee the model is flushed to disk.
#     pickle.dump(estimator, open(model_name, 'wb'))

#     del train_data, valid_data, estimator
#     gc.collect()

# Mean and standard deviation of importance per feature across all stores.
# self.log.info('aggregate feature importance')
# feature_importance_all_df.to_csv(self.params.result_dir_path / 'feature_importance_all_{0}.csv'.format(
#     self.params.prediction_horizon), index=False)
# feature_importance_agg_df = feature_importance_all_df.groupby(
#     'feature_name')['importance'].agg(['mean', 'std']).reset_index()
# feature_importance_agg_df.columns = ['feature_name', 'importance_mean', 'importance_std']
# feature_importance_agg_df = feature_importance_agg_df.sort_values('importance_mean', ascending=False)
# feature_importance_agg_df.to_csv(self.params.result_dir_path / 'feature_importance_agg_{0}.csv'.format(
#     self.params.prediction_horizon), index=False)

# self.log.info('load base_test')
# base_test = self.load_base_test(store_id_set_list)

# if self.params.export_all_flag:
#     base_test.to_csv(
#         self.params.result_dir_path / 'exp_base_test_{0}_a.csv'.format(self.params.prediction_horizon),
#         index=False)
# Back-fill: copy predictions saved by earlier horizons ('pred_v_{ph}.csv') into base_test.
# NOTE(review): base_test[col][mask] = ... is chained indexing and may write to a temporary
# copy; base_test.loc[mask, col] = ... is the reliable form. The copy also goes through
# .values, so it presumably relies on matching row order in both frames -- confirm.
# if self.params.prediction_horizon_prev > 0:
#     pred_v_prev_df = None
#     for ph in self.params.prediction_horizon_list:
#         if ph <= self.params.prediction_horizon_prev:
#             pred_v_temp_df = pd.read_csv(self.params.result_dir_path / 'pred_v_{}.csv'.format(ph))
#             pred_v_prev_df = pd.concat([pred_v_prev_df, pred_v_temp_df])
#     for predict_day in range(1, self.params.prediction_horizon_prev + 1):
#         base_test[self.params.target][base_test['d'] == (self.params.end_train_day_x + predict_day)] = \
#             pred_v_prev_df[self.params.target][
#                 pred_v_prev_df['d'] == (self.params.end_train_day_x + predict_day)].values

# if self.params.export_all_flag:
#     base_test.to_csv(
#         self.params.result_dir_path / 'exp_base_test_{0}_b.csv'.format(self.params.prediction_horizon),
#         index=False)

# Day-by-day prediction for the days after prediction_horizon_prev, up to prediction_horizon.
# NOTE(review): main_time and start_time are assigned but never read in this cell.
# main_time = time.time()
# pred_h_df = pd.DataFrame()
# for predict_day in range(self.params.prediction_horizon_prev + 1, self.params.prediction_horizon + 1):
#     self.log.info('predict day{:02d}'.format(predict_day))
#     start_time = time.time()
#     grid_df = base_test.copy()

# Recursive features: shifted rolling means of the target per id, recomputed over the whole
# of base_test on every predict_day iteration so that earlier predictions feed later days.
# NOTE(review): lag_df is a column selection of base_test that then gets a new column
# assigned, which normally triggers pandas' SettingWithCopyWarning.
#     if self.params.recursive_feature_flag:
#         self.log.info('[{0} - {1}] calculate recursive features'.format(
#             self.params.end_train_day_x, self.params.prediction_horizon))
#         for num_shift_rolling_day in self.params.num_shift_rolling_day_list:
#             num_shift_day = num_shift_rolling_day[0]
#             num_rolling_day = num_shift_rolling_day[1]
#             lag_df = base_test[['id', 'd', self.params.target]]
#             col_name = 'rolling_mean_tmp_' + str(num_shift_day) + '_' + str(num_rolling_day)
#             lag_df[col_name] = lag_df.groupby(['id'])[self.params.target].transform(
#                 lambda x: x.shift(num_shift_day).rolling(num_rolling_day).mean())
#             grid_df = pd.concat([grid_df, lag_df[[col_name]]], axis=1)

#     day_mask = base_test['d'] == (self.params.end_train_day_x + predict_day)
#     if self.params.export_all_flag:
#         self.log.info('export recursive_features')
#         grid_df[day_mask].to_csv(self.params.result_dir_path / 'exp_recursive_features_{0}_{1}.csv'.format(
#             self.params.prediction_horizon, predict_day), index=False)
#     for store_index, store_id in enumerate(store_id_set_list):
#         self.log.info('[{3} - {4}] predict {0}/{1} {2} day {5}'.format(
#             store_index + 1, len(store_id_set_list), store_id,
#             self.params.end_train_day_x, self.params.prediction_horizon, predict_day))

#         model_path = str(
#             self.params.model_dir_path / f'lgb_model_{store_id}_{self.params.prediction_horizon}.bin')

# NOTE(review): the model is unpickled again for every (day, store) pair and the file handle
# is never explicitly closed. pickle.load must only be used on files this pipeline wrote.
#         estimator = pickle.load(open(model_path, 'rb'))
# A store_id of 'all' means a single model covers every store, so only the day mask applies.
#         if store_id != 'all':
#             store_mask = base_test['store_id'] == store_id
#             mask = (day_mask) & (store_mask)
#         else:
#             mask = day_mask

#         if self.params.export_all_flag:
#             self.log.info('export pred')
#             grid_df[mask].to_csv(
#                 self.params.result_dir_path / (
#                         'exp_pred_' + store_id + '_day_' + str(predict_day) + '.csv'), index=False)
# NOTE(review): chained-indexing write again (see the back-fill note above). The feature list
# comes from self.params.enable_features, not the per-store enable_features used for training.
#         base_test[self.params.target][mask] = estimator.predict(grid_df[mask][self.params.enable_features])

# Collect this day's predictions as column 'F<predict_day>' of the horizontal frame, keyed by id.
#     temp_df = base_test[day_mask][['id', self.params.target]]
#     temp_df.columns = ['id', 'F' + str(predict_day)]
#     if 'id' in list(pred_h_df):
#         pred_h_df = pred_h_df.merge(temp_df, on=['id'], how='left')
#     else:
#         pred_h_df = temp_df.copy()

#     del temp_df

# if self.params.export_all_flag:
#     base_test.to_csv(
#         self.params.result_dir_path / 'exp_base_test_{0}_c.csv'.format(self.params.prediction_horizon),
#         index=False)
# pred_h_df.to_csv(self.params.result_dir_path / 'pred_h_{}.csv'.format(
#     self.params.prediction_horizon), index=False)

# Vertical export: rows of base_test for the days predicted in this run, restricted to the
# main index columns plus the target.
# NOTE(review): the two boolean masks are combined with `*`; `&` is the conventional
# element-wise AND for boolean Series.
# pred_v_df = base_test[
#     (base_test['d'] >= self.params.end_train_day_x + self.params.prediction_horizon_prev + 1) *
#     (base_test['d'] < self.params.end_train_day_x + self.params.prediction_horizon + 1)
#     ][
#     self.params.main_index_list + [self.params.target]
#     ]
# pred_v_df.to_csv(self.params.result_dir_path / 'pred_v_{}.csv'.format(self.params.prediction_horizon),
#                  index=False)
