# Objective
Forecast sales (no. of items sold) for each item and store for the next 28 days

20200527: Diff from m5-forecasting_v1.ipynb: Here we do predictions for 1 state at a time

20200608: Diff from m5-forecasting_v2.ipynb: Here we do hyperparam tuning

20200616: Diff from m5-forecasting_v5.ipynb: Here we remove those rows where the sell_price is NaN, as this means the item is not yet being sold, so the target value of 0 is meaningless

In [548]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import gc
import matplotlib
import numpy as np # linear algebra
import operator as op
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time

from collections import OrderedDict
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [549]:
##### Input params ########
state = 'WI'
store = 'WI_3' # CA_1, CA_2, CA_3, CA_4, TX_1, TX_2, TX_3, WI_1, WI_2, WI_3

model_seed = 102
sample_seed = 103              # for sampling the dataframe

n_estimators = 100             # for the initial model before tuning. default = 100
max_depth = 3                  # for the initial model before tuning. default = 3
learning_rate = 0.1            # for the initial model before tuning. default = 0.1
min_child_weight = 1           # for the initial model before tuning. default = 1
subsample = 1                  # for the initial model before tuning. default = 1
colsample_bytree = 1           # for the initial model before tuning. default = 1
colsample_bylevel = 1          # for the initial model before tuning. default = 1
train_test_split_seed = 111    # 111
model_seed = 100

# train_start = 0
# train_end = 1884               # use date_block_num from train_start to train_end as train set
# val_date_block_num = 1885      # date_block_num for validation set
# pred_start = 1886              # do pred starting from this day
# pred_end = 1913                # do pred until and include this day

train_start = 0
train_end = 1912               # use date_block_num from train_start to train_end as train set
val_date_block_num = 1913      # date_block_num for validation set
pred_start = 1914            # do pred starting from this day
pred_end = 1941              # do pred until and include this day

shift_range = [1, 2, 3, 4, 5, 6, 7, 30, 365] # Use values from last 7 days, from 30 days ago, and from 365 days ago

fontsize = 14
ticklabelsize = 14
###########################

In [550]:
ticStart = time.time()

# Common functions

In [551]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int16`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int16)
    
    return df

def get_lags(index_cols, cols_to_rename, all_data, shift_range):
    """
    For a dataframe all_data, get the lags specified in cols_to_rename
    Input
        index_cols    : e.g. ['item_id', 'store_id', 'date_block_num']. These are the cols to do merging with the
                        lagged dataset
        cols_to_rename: e.g. ['target']. Based on this example, all_data will be returned with columns target_lag_1, 
                        target_lag_2, ..., target_lag_365.
        all_data      : need to have a column 'date_block_num' to indicate chronological order
        shift_range   : e.g. [1, 2, 3, 4, 5, 6, 7, 30, 365]
    Output
        all_data
    """
    for day_shift in shift_range:
        train_shift = all_data[index_cols + cols_to_rename].copy()
    
        # E.g. variable of 0 becomes 1, for day_shift = 1.
        # So when this is merged with variable of 1 in all_data, this will represent lag of 1.
        train_shift['date_block_num'] = train_shift['date_block_num'] + day_shift
    
        foo = lambda x: '{}_lag_{}'.format(x, day_shift) if x in cols_to_rename else x
        train_shift = train_shift.rename(columns=foo)

        all_data = pd.merge(all_data, train_shift, on=index_cols, how='left')
    
        # Fill the NaNs with 0
        for x in cols_to_rename:
            all_data[x + '_lag_' + str(day_shift)].fillna(0, inplace=True)
    
    del train_shift
    gc.collect();

    return all_data

def get_params(error_rate):
    metric1_opt = list(error_rate.items())[0][0]
    metric2_opt = list(error_rate.items())[0][1][0][0]
    mse_min = list(error_rate.items())[0][1][0][1]
    for item in error_rate:
        metric2, mse = min(error_rate[item], key=op.itemgetter(1))
        if mse < mse_min:
            metric1_opt = item
            metric2_opt = metric2
            mse_min = mse

    return (metric1_opt, metric2_opt, mse_min)


# Load data

In [552]:
sell_prices = pd.read_csv("./data/sell_prices.csv")
sell_prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


File 2: “sell_prices.csv”

Contains information about the price of the products sold per store and date.
* store_id: The id of the store where the product is sold. 
* item_id: The id of the product.
* wm_yr_wk: The id of the week.
* sell_price: The price of the product for the given week/store. The price is provided per week (average across seven days). If not available, this means that the product was not sold during the examined week. Note that although prices are constant at weekly basis, they may change through time (both training and test set).

In [553]:
calendar = pd.read_csv("./data/calendar.csv")
calendar

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1
5,2011-02-03,11101,Thursday,6,2,2011,d_6,,,,,1,1,1
6,2011-02-04,11101,Friday,7,2,2011,d_7,,,,,1,0,0
7,2011-02-05,11102,Saturday,1,2,2011,d_8,,,,,1,1,1
8,2011-02-06,11102,Sunday,2,2,2011,d_9,SuperBowl,Sporting,,,1,1,1
9,2011-02-07,11102,Monday,3,2,2011,d_10,,,,,1,1,0


File 1: “calendar.csv” 

Contains information about the dates the products are sold.
* date: The date in a “y-m-d” format.
* wm_yr_wk: The id of the week the date belongs to.
* weekday: The type of the day (Saturday, Sunday, …, Friday).
* wday: The id of the weekday, starting from Saturday.
* month: The month of the date.
* year: The year of the date.
* event_name_1: If the date includes an event, the name of this event.
* event_type_1: If the date includes an event, the type of this event.
* event_name_2: If the date includes a second event, the name of this event.
* event_type_2: If the date includes a second event, the type of this event.
* snap_CA, snap_TX, and snap_WI: A binary variable (0 or 1) indicating whether the stores of CA, TX or WI allow SNAP purchases on the examined date. 1 indicates that SNAP purchases are allowed.


In [554]:
sales_tr_val = pd.read_csv("./data/sales_train_validation.csv")
print("len(sales_tr_val) = " + str(len(sales_tr_val)))
sales_tr_val.head()

len(sales_tr_val) = 30490


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


For each (item_id, store_id), we have the number of items sold for 1913 days or 5.24 years.

There are 30,490 distinct id's, and 1913 days which gives 30,490*1913 = 58,327,370

In [555]:
# Keep only data from one state
sales_tr_val = sales_tr_val[(sales_tr_val['state_id']==state) & (sales_tr_val['store_id']==store)]
print("len(sales_tr_val) = " + str(len(sales_tr_val)))

len(sales_tr_val) = 3049


In [556]:
sample_sub = pd.read_csv("./data/sample_submission.csv")
print(len(sample_sub))
sample_sub.head()

60980


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Provide the forecast for each id for the next 28 days. For the validation rows, this corresponds to d_1914 - d_1941, and for the evaluation rows, this corresponds to d_1942 - d_1969. (Note: a month before the competition close, the ground truth for the validation rows will be provided.)

Note 30,490*2 = 60,980 which is the number of rows in the submission file.

# Pre-process data

In [557]:
print(calendar.memory_usage().sum())

# # In calendar 'd' column, change 'd_1' to 1
# calendar['d'] = calendar.apply(lambda row: row['d'].split('_')[1], axis=1)

# Replance NaN with nil
calendar = calendar.fillna(value = 'nil')

# Downcast dtypes from 64 bit to save memory
calendar= downcast_dtypes(calendar)

# Change to category dtype
calendar[["year", "event_name_1", "event_type_1", "event_name_2", "event_type_2"]] = \
    calendar[["year", "event_name_1", "event_type_1", "event_name_2", "event_type_2"]] .astype("category")

# Drop weekday column since its not used
calendar.drop(['weekday'], axis=1, inplace=True)

print(calendar.memory_usage().sum())

calendar.head()

220608
67297


Unnamed: 0,date,wm_yr_wk,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,1,1,2011,d_1,nil,nil,nil,nil,0,0,0
1,2011-01-30,11101,2,1,2011,d_2,nil,nil,nil,nil,0,0,0
2,2011-01-31,11101,3,1,2011,d_3,nil,nil,nil,nil,0,0,0
3,2011-02-01,11101,4,2,2011,d_4,nil,nil,nil,nil,1,1,0
4,2011-02-02,11101,5,2,2011,d_5,nil,nil,nil,nil,1,0,1


In [558]:
# Downcast dtypes from 64 bit to save memory
print(sell_prices.memory_usage().sum())
sell_prices = downcast_dtypes(sell_prices)

sell_prices[["store_id", "item_id"]] = \
    sell_prices[["store_id","item_id"]].astype("category")

print(sell_prices.memory_usage().sum())

218915952
61676881


In [559]:
# Downcast dtypes from 64 bit to save memory
print(sales_tr_val.memory_usage().sum())
sales_tr_val = downcast_dtypes(sales_tr_val)

sales_tr_val[["item_id", "dept_id", "cat_id", "store_id", "state_id"]] = \
    sales_tr_val[["item_id", "dept_id", "cat_id", "store_id", "state_id"]].astype("category")

print(sales_tr_val.memory_usage().sum())

46832640
11839520


In [560]:
# Melt sales_tr_val to switch from a wide to a long dataframe
tic = time.time()
value_vars = ['d_' + str(x) for x in range(1, 1914)]
sale_tr_val_melt = pd.melt(sales_tr_val, 
                           id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], 
                           value_vars=value_vars)

toc = time.time()
print("Time taken = " + str(toc-tic) + ' s')
sale_tr_val_melt.head()

Time taken = 13.031988143920898 s


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,variable,value
0,HOBBIES_1_001_WI_3_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0
1,HOBBIES_1_002_WI_3_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0
2,HOBBIES_1_003_WI_3_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0
3,HOBBIES_1_004_WI_3_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,WI_3,WI,d_1,4
4,HOBBIES_1_005_WI_3_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0


In [561]:
del sales_tr_val
gc.collect();
sales_tr_val = pd.DataFrame()

In [562]:
# Merge with calendar to get events info
sale_tr_val_melt_merged = sale_tr_val_melt.merge(calendar, 
                                                 left_on='variable', 
                                                 right_on='d', 
                                                 how='left')

# Drop unneccesary columns
sale_tr_val_melt_merged.drop(columns=['d'], inplace=True)

sale_tr_val_melt_merged.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,variable,value,date,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001_WI_3_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0,2011-01-29,11101,1,1,2011,nil,nil,nil,nil,0,0,0
1,HOBBIES_1_002_WI_3_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0,2011-01-29,11101,1,1,2011,nil,nil,nil,nil,0,0,0
2,HOBBIES_1_003_WI_3_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0,2011-01-29,11101,1,1,2011,nil,nil,nil,nil,0,0,0
3,HOBBIES_1_004_WI_3_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,WI_3,WI,d_1,4,2011-01-29,11101,1,1,2011,nil,nil,nil,nil,0,0,0
4,HOBBIES_1_005_WI_3_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0,2011-01-29,11101,1,1,2011,nil,nil,nil,nil,0,0,0


In [563]:
# Merge with sell_prices to get price info
sale_tr_val_melt_merged2 = sale_tr_val_melt_merged.merge(sell_prices, 
                                                         left_on=['store_id', 'item_id', 'wm_yr_wk'], 
                                                         right_on=['store_id', 'item_id', 'wm_yr_wk'], 
                                                         how='left')
sale_tr_val_melt_merged2

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,variable,value,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_WI_3_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,
1,HOBBIES_1_002_WI_3_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,
2,HOBBIES_1_003_WI_3_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,
3,HOBBIES_1_004_WI_3_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,WI_3,WI,d_1,4,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,4.34
4,HOBBIES_1_005_WI_3_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,
5,HOBBIES_1_006_WI_3_validation,HOBBIES_1_006,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,
6,HOBBIES_1_007_WI_3_validation,HOBBIES_1_007,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,
7,HOBBIES_1_008_WI_3_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,WI_3,WI,d_1,6,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,0.46
8,HOBBIES_1_009_WI_3_validation,HOBBIES_1_009,HOBBIES_1,HOBBIES,WI_3,WI,d_1,1,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,1.56
9,HOBBIES_1_010_WI_3_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,WI_3,WI,d_1,0,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,3.17


In [564]:
# Check whether there are rows where value != 0 and sell_price is null
sale_tr_val_melt_merged2[(sale_tr_val_melt_merged2['value']!=0) & (sale_tr_val_melt_merged2['sell_price'].isnull())]

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,variable,value,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price


There are no rows where value != 0 and sell_price is null

In [565]:
# Remove those rows where sell price is NaN and value=0 as this means the item is not being sold yet
print("Before dropna, len(sale_tr_val_melt_merged2) = " + str(len(sale_tr_val_melt_merged2)))
sale_tr_val_melt_merged2.dropna(axis=0, how='any', subset=['sell_price'], inplace=True)
print("After dropna, len(sale_tr_val_melt_merged2) = " + str(len(sale_tr_val_melt_merged2)))

Before dropna, len(sale_tr_val_melt_merged2) = 5832737
After dropna, len(sale_tr_val_melt_merged2) = 4686669


In [566]:
# del sell_prices
# gc.collect();
# sell_prices = pd.DataFrame()

del sale_tr_val_melt_merged
gc.collect();
sale_tr_val_melt_merged = pd.DataFrame()

In [567]:
# Get no. of distinct values in each column
print("item_id uniques = " + str(sale_tr_val_melt_merged2['item_id'].nunique()))
print("dept_id uniques = " + str(sale_tr_val_melt_merged2['dept_id'].nunique()))
print("cat_id uniques = " + str(sale_tr_val_melt_merged2['cat_id'].nunique()))
print("store_id uniques = " + str(sale_tr_val_melt_merged2['store_id'].nunique()))
print("state_id uniques = " + str(sale_tr_val_melt_merged2['state_id'].nunique()))

item_id uniques = 3049
dept_id uniques = 7
cat_id uniques = 3
store_id uniques = 1
state_id uniques = 1


In [568]:
print(sale_tr_val_melt_merged2['dept_id'].unique())

[HOBBIES_1, HOBBIES_2, HOUSEHOLD_1, HOUSEHOLD_2, FOODS_1, FOODS_2, FOODS_3]
Categories (7, object): [HOBBIES_1, HOBBIES_2, HOUSEHOLD_1, HOUSEHOLD_2, FOODS_1, FOODS_2, FOODS_3]


In [569]:
print(sale_tr_val_melt_merged2['cat_id'].unique())

[HOBBIES, HOUSEHOLD, FOODS]
Categories (3, object): [HOBBIES, HOUSEHOLD, FOODS]


In [570]:
print(sale_tr_val_melt_merged2['store_id'].unique())

['WI_3']


In [571]:
print(sale_tr_val_melt_merged2['state_id'].unique())

[WI]
Categories (1, object): [WI]


In [572]:
# Convert variable to a numerical column ie. d_1 to 1
sale_tr_val_melt_merged2['variable'] = sale_tr_val_melt_merged2['variable'].str.split('_').str[1]
sale_tr_val_melt_merged2

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,variable,value,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
3,HOBBIES_1_004_WI_3_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,WI_3,WI,1,4,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,4.34
7,HOBBIES_1_008_WI_3_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,WI_3,WI,1,6,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,0.46
8,HOBBIES_1_009_WI_3_validation,HOBBIES_1_009,HOBBIES_1,HOBBIES,WI_3,WI,1,1,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,1.56
9,HOBBIES_1_010_WI_3_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,WI_3,WI,1,0,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,3.17
11,HOBBIES_1_012_WI_3_validation,HOBBIES_1_012,HOBBIES_1,HOBBIES,WI_3,WI,1,3,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,5.98
14,HOBBIES_1_015_WI_3_validation,HOBBIES_1_015,HOBBIES_1,HOBBIES,WI_3,WI,1,4,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,0.70
15,HOBBIES_1_016_WI_3_validation,HOBBIES_1_016,HOBBIES_1,HOBBIES,WI_3,WI,1,1,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,0.70
16,HOBBIES_1_017_WI_3_validation,HOBBIES_1_017,HOBBIES_1,HOBBIES,WI_3,WI,1,4,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,1.93
19,HOBBIES_1_020_WI_3_validation,HOBBIES_1_020,HOBBIES_1,HOBBIES,WI_3,WI,1,0,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,10.67
21,HOBBIES_1_022_WI_3_validation,HOBBIES_1_022,HOBBIES_1,HOBBIES,WI_3,WI,1,3,2011-01-29,11101,...,1,2011,nil,nil,nil,nil,0,0,0,7.33


In [573]:
# Convert variable to int
sale_tr_val_melt_merged2['variable'] = sale_tr_val_melt_merged2['variable'].astype('int16')

# Create features

In [574]:
all_data = sale_tr_val_melt_merged2
all_data = all_data.rename(columns={'variable': 'date_block_num', 
                                    'value': 'target'})

In [575]:
# Remove unnecessaray column
all_data.drop(['date'], axis=1, inplace=True)

In [576]:
# Convert features to categorical
columns_to_be_labeled = [
    'year',
    'event_name_1',
    'event_type_1',
    'event_name_2',
    'event_type_2',
    'wm_yr_wk',
    'item_id',
    'store_id',
    'dept_id',
    'cat_id',
    'state_id'
]
for column in columns_to_be_labeled:
    all_data.loc[:, column] = LabelEncoder().fit_transform(all_data[column]) # fit_transform: Fit label encoder and return encoded labels. Note the encoded labels are integers!!

all_data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,date_block_num,target,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
3,HOBBIES_1_004_WI_3_validation,1440,3,1,0,0,1,4,0,1,1,0,30,4,4,2,0,0,0,4.34
7,HOBBIES_1_008_WI_3_validation,1444,3,1,0,0,1,6,0,1,1,0,30,4,4,2,0,0,0,0.46
8,HOBBIES_1_009_WI_3_validation,1445,3,1,0,0,1,1,0,1,1,0,30,4,4,2,0,0,0,1.56
9,HOBBIES_1_010_WI_3_validation,1446,3,1,0,0,1,0,0,1,1,0,30,4,4,2,0,0,0,3.17
11,HOBBIES_1_012_WI_3_validation,1448,3,1,0,0,1,3,0,1,1,0,30,4,4,2,0,0,0,5.98


In [577]:
index_cols = ['item_id', 'store_id', 'date_block_num']

cols_to_rename = ['target']

for day_shift in shift_range:
    print(day_shift)
    train_shift = all_data[index_cols + cols_to_rename].copy()
    
    # E.g. variable of 0 becomes 1, for day_shift = 1.
    # So when this is merged with variable of 1 in all_data, this will represent lag of 1.
    train_shift['date_block_num'] = train_shift['date_block_num'] + day_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, day_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left')
    
#     # Fill the NaNs with 0
#     for x in cols_to_rename:
#         all_data[x + '_lag_' + str(day_shift)].fillna(0, inplace=True)
    
del train_shift
gc.collect();

all_data

1
2
3
4
5
6
7
30
365


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,date_block_num,target,wm_yr_wk,wday,...,sell_price,target_lag_1,target_lag_2,target_lag_3,target_lag_4,target_lag_5,target_lag_6,target_lag_7,target_lag_30,target_lag_365
0,HOBBIES_1_004_WI_3_validation,1440,3,1,0,0,1,4,0,1,...,4.34,,,,,,,,,
1,HOBBIES_1_008_WI_3_validation,1444,3,1,0,0,1,6,0,1,...,0.46,,,,,,,,,
2,HOBBIES_1_009_WI_3_validation,1445,3,1,0,0,1,1,0,1,...,1.56,,,,,,,,,
3,HOBBIES_1_010_WI_3_validation,1446,3,1,0,0,1,0,0,1,...,3.17,,,,,,,,,
4,HOBBIES_1_012_WI_3_validation,1448,3,1,0,0,1,3,0,1,...,5.98,,,,,,,,,
5,HOBBIES_1_015_WI_3_validation,1451,3,1,0,0,1,4,0,1,...,0.70,,,,,,,,,
6,HOBBIES_1_016_WI_3_validation,1452,3,1,0,0,1,1,0,1,...,0.70,,,,,,,,,
7,HOBBIES_1_017_WI_3_validation,1453,3,1,0,0,1,4,0,1,...,1.93,,,,,,,,,
8,HOBBIES_1_020_WI_3_validation,1456,3,1,0,0,1,0,0,1,...,10.67,,,,,,,,,
9,HOBBIES_1_022_WI_3_validation,1458,3,1,0,0,1,3,0,1,...,7.33,,,,,,,,,


In [578]:
# Remove those rows where target_lag_x is NaN
lag_cols = []
for day_shift in shift_range[:-1]:         # ignore NaN for target_lag_365, else too many rows will be removed
    lag_cols.append('target_lag_'+str(day_shift))

print("Before dropna, len(all_data) = " + str(len(all_data)))
all_data.dropna(axis=0, how='any', subset=lag_cols, inplace=True)
print("After dropna, len(all_data) = " + str(len(all_data)))

Before dropna, len(all_data) = 4686669
After dropna, len(all_data) = 4595199


In [579]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4595199 entries, 38412 to 4686668
Data columns (total 29 columns):
id                object
item_id           int64
dept_id           int64
cat_id            int64
store_id          int64
state_id          int64
date_block_num    int16
target            int16
wm_yr_wk          int64
wday              int16
month             int16
year              int64
event_name_1      int64
event_type_1      int64
event_name_2      int64
event_type_2      int64
snap_CA           int16
snap_TX           int16
snap_WI           int16
sell_price        float32
target_lag_1      float64
target_lag_2      float64
target_lag_3      float64
target_lag_4      float64
target_lag_5      float64
target_lag_6      float64
target_lag_7      float64
target_lag_30     float64
target_lag_365    float64
dtypes: float32(1), float64(9), int16(7), int64(11), object(1)
memory usage: 850.2+ MB


# Split into train, val, test sets

We use 30 days as validation data

In [580]:
to_drop_cols = ['id', 'target', 'date_block_num', 'store_id', 'state_id']

In [581]:
X_train = all_data[(all_data['date_block_num']>=train_start) & 
                   (all_data['date_block_num']<=train_end)].drop(to_drop_cols, axis = 1)
X_cv = all_data[all_data['date_block_num']==val_date_block_num].drop(to_drop_cols, axis = 1)
X_train_cv = all_data[(all_data['date_block_num']>=train_start) & 
                      (all_data['date_block_num']<=val_date_block_num)].drop(to_drop_cols, axis = 1)

y_train = all_data[(all_data['date_block_num']>=train_start) & 
                   (all_data['date_block_num']<=train_end)][['id', 'target']]
y_cv = all_data[all_data['date_block_num']==val_date_block_num][['id', 'target']]
y_train_cv = all_data[(all_data['date_block_num']>=train_start) &
                      (all_data['date_block_num']<=val_date_block_num)][['id', 'target']]

# Train the model and get prediction for day val_date_block_num

In [582]:
# Create the model
model = XGBRegressor(seed=model_seed,
                      n_estimators=n_estimators,
                      max_depth=max_depth,
                      learning_rate=learning_rate,
                      min_child_weight=min_child_weight)

# Train the regressor
model.fit(X_train, y_train['target'])




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=100, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=100, subsample=1,
             tree_method='approx', validate_parameters=1, verbosity=None)

In [583]:
# View a list of the features and their importance scores
imp = list(zip(X_train.columns, model.feature_importances_))
imp.sort(key=lambda tup: tup[1], reverse=False) 
imp

[('cat_id', 0.0),
 ('year', 0.0),
 ('event_name_2', 0.0),
 ('event_type_2', 0.0),
 ('snap_TX', 0.0),
 ('snap_CA', 0.0005112946),
 ('event_type_1', 0.0006217556),
 ('event_name_1', 0.0006976856),
 ('wm_yr_wk', 0.0021909513),
 ('dept_id', 0.0023557765),
 ('month', 0.0034413165),
 ('sell_price', 0.0065370942),
 ('target_lag_365', 0.0066472148),
 ('wday', 0.008295924),
 ('snap_WI', 0.008683728),
 ('item_id', 0.014257864),
 ('target_lag_30', 0.025488589),
 ('target_lag_4', 0.052244473),
 ('target_lag_5', 0.06401203),
 ('target_lag_6', 0.0997067),
 ('target_lag_2', 0.11718076),
 ('target_lag_3', 0.1432134),
 ('target_lag_7', 0.15836455),
 ('target_lag_1', 0.2855489)]

In [584]:
# Do prediction on train set
pred = model.predict(X_train).astype('int')

# Calculate RMSE
print("Train set MSE = " + str(mean_squared_error(y_train['target'], pred)))

# Do prediction on val set
pred = model.predict(X_cv).astype('int')

# Calculate RMSE
print("Val set MSE = " + str(mean_squared_error(y_cv['target'], pred)))

Train set MSE = 5.572780505863267
Val set MSE = 3.438504427681207


In [585]:
y_cv['pred'] = pred
y_cv

Unnamed: 0,id,target,pred
4683620,HOBBIES_1_001_WI_3_validation,0,0
4683621,HOBBIES_1_002_WI_3_validation,0,0
4683622,HOBBIES_1_003_WI_3_validation,1,0
4683623,HOBBIES_1_004_WI_3_validation,2,0
4683624,HOBBIES_1_005_WI_3_validation,0,0
4683625,HOBBIES_1_006_WI_3_validation,2,0
4683626,HOBBIES_1_007_WI_3_validation,0,0
4683627,HOBBIES_1_008_WI_3_validation,3,2
4683628,HOBBIES_1_009_WI_3_validation,0,0
4683629,HOBBIES_1_010_WI_3_validation,0,0


# Hyperparameter tuning

## Plot the error rate versus n_estimators, varying max_depth (size of tree, default = 3)

In [586]:
# ensemble_clfs = [
#     (3,
#         XGBRegressor(seed=model_seed,
#                       max_depth=3,
#                       learning_rate=learning_rate,
#                       min_child_weight=min_child_weight)),
#     (5,
#         XGBRegressor(seed=model_seed,
#                       max_depth=5,
#                       learning_rate=learning_rate,
#                       min_child_weight=min_child_weight)),
#     (7,
#         XGBRegressor(seed=model_seed,
#                       max_depth=7,
#                       learning_rate=learning_rate,
#                       min_child_weight=min_child_weight)),
#     (9,
#         XGBRegressor(seed=model_seed,
#                       max_depth=9,
#                       learning_rate=learning_rate,
#                       min_child_weight=min_child_weight))
# ]

# # Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
# error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# # Range of `n_estimators` values to explore.
# estimators_list = [10]
# estimators_list.extend(range(100, 500, 200))

# tic = time.time()
# for label, clf in ensemble_clfs:
#     print(label)
#     for i in estimators_list:
#         print(i)
#         tic1 = time.time()
#         clf.set_params(n_estimators=i)
#         clf.fit(X_train, y_train['target'])
        
#         pred = clf.predict(X_cv).astype('int')
#         error_rate[label].append((i, mean_squared_error(y_cv['target'], pred)))
#         toc1 = time.time()
#         print("Time taken = " + str((toc1-tic1)/60.0) + " mins")
        
# toc = time.time()
# print("Minutes taken = " + str((toc-tic)/60.0))

In [587]:
# # Generate the "error rate" vs. "n_estimators" plot.
# plt.figure(figsize=(12, 8), dpi=80)
# for label, clf_err in error_rate.items():
#     xs, ys = zip(*clf_err)
#     plt.plot(xs, ys, label=label, marker='x')

# plt.xlabel("n_estimators")
# plt.ylabel("MSE")
# plt.legend(loc="lower right")
# plt.grid()
# matplotlib.rcParams.update({'font.size': fontsize})
# matplotlib.rcParams['xtick.labelsize'] = ticklabelsize
# matplotlib.rcParams['ytick.labelsize'] = ticklabelsize

In [588]:
# error_rate

In [589]:
# max_depth_opt, n_estimators_opt, mse_min = get_params(error_rate)
# max_depth_opt, n_estimators_opt, mse_min

## Plot the error rate versus n_estimators, varying learning_rate (default = 0.1)

In [590]:
# ensemble_clfs = [
#     (0.1,
#         XGBRegressor(seed=model_seed,
#                       max_depth=max_depth_opt,
#                       learning_rate=0.1,
#                       min_child_weight=min_child_weight)),
#     (0.01,
#         XGBRegressor(seed=model_seed,
#                       max_depth=max_depth_opt,
#                       learning_rate=0.01,
#                       min_child_weight=min_child_weight)),
#     (0.001,
#         XGBRegressor(seed=model_seed,
#                       max_depth=max_depth_opt,
#                       learning_rate=0.001,
#                       min_child_weight=min_child_weight))
# ]

# # Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
# error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# # Range of `n_estimators` values to explore.
# estimators_list = [10]
# estimators_list.extend(range(100, 500, 200))

# tic = time.time()
# for label, clf in ensemble_clfs:
#     print(label)
#     for i in estimators_list:
#         print(i)
#         tic1 = time.time()
#         clf.set_params(n_estimators=i)
#         clf.fit(X_train, y_train['target'])
        
#         pred = clf.predict(X_cv).astype('int')
#         error_rate[label].append((i, mean_squared_error(y_cv['target'], pred)))
#         toc1 = time.time()
#         print("Time taken = " + str((toc1-tic1)/60.0) + " mins")

# toc = time.time()
# print("Minutes taken = " + str((toc-tic)/60.0))

In [591]:
# # Generate the "error rate" vs. "n_estimators" plot.
# plt.figure(figsize=(12, 8), dpi=80)
# for label, clf_err in error_rate.items():
#     xs, ys = zip(*clf_err)
#     plt.plot(xs, ys, label=label, marker='x')

# plt.xlabel("n_estimators")
# plt.ylabel("MSE")
# plt.legend(loc="upper right")
# plt.grid()
# matplotlib.rcParams.update({'font.size': fontsize})
# matplotlib.rcParams['xtick.labelsize'] = ticklabelsize
# matplotlib.rcParams['ytick.labelsize'] = ticklabelsize

In [592]:
# error_rate

In [593]:
# learning_rate_opt, n_estimators_opt, mse_min = get_params(error_rate)
# learning_rate_opt, n_estimators_opt, mse_min

## Plot the error rate versus n_estimators, varying min_child_weight (default = 1)

In [594]:
# ensemble_clfs = [
#     (1,
#         XGBRegressor(seed=model_seed,
#                       max_depth=max_depth_opt,
#                       learning_rate=learning_rate_opt,
#                       min_child_weight=1)),
#     (10,
#         XGBRegressor(seed=model_seed,
#                       max_depth=max_depth_opt,
#                       learning_rate=learning_rate_opt,
#                       min_child_weight=10)),
#      (50,
#         XGBRegressor(seed=model_seed,
#                       max_depth=max_depth_opt,
#                       learning_rate=learning_rate_opt,
#                       min_child_weight=10)),
# ]

# # Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
# error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# # Range of `n_estimators` values to explore.
# estimators_list = [10]
# estimators_list.extend(range(100, 400, 200))

# tic = time.time()
# for label, clf in ensemble_clfs:
#     print(label)
#     for i in estimators_list:
#         print(i)
#         tic1 = time.time()
#         clf.set_params(n_estimators=i)
#         clf.fit(X_train, y_train['target'])
        
#         pred = clf.predict(X_cv).astype('int')
#         error_rate[label].append((i, mean_squared_error(y_cv['target'], pred)))
#         toc1 = time.time()
#         print("Time taken = " + str((toc1-tic1)/60.0) + " mins")

# toc = time.time()
# print("Minutes taken = " + str((toc-tic)/60.0))

In [595]:
# # Generate the "error rate" vs. "n_estimators" plot.
# plt.figure(figsize=(12, 8), dpi=80)
# for label, clf_err in error_rate.items():
#     xs, ys = zip(*clf_err)
#     plt.plot(xs, ys, label=label, marker='x')

# plt.xlabel("n_estimators")
# plt.ylabel("MSE")
# plt.legend(loc="upper right")
# plt.grid()
# matplotlib.rcParams.update({'font.size': fontsize})
# matplotlib.rcParams['xtick.labelsize'] = ticklabelsize
# matplotlib.rcParams['ytick.labelsize'] = ticklabelsize

In [596]:
# error_rate

In [597]:
# min_child_weight_opt, n_estimators_opt, mse_min = get_params(error_rate)
# min_child_weight_opt, n_estimators_opt, mse_min

In [598]:
n_estimators_opt = 300
max_depth_opt = 9
learning_rate_opt = 0.01
min_child_weight_opt = 1

# Retrain model with optimum hyperparameters

In [599]:
# Create the model
model = XGBRegressor(seed=model_seed,
                      n_estimators=n_estimators_opt,
                      max_depth=max_depth_opt,
                      learning_rate=learning_rate_opt,
                      min_child_weight=min_child_weight_opt)

# # Train the regressor
# model.fit(X_train, y_train['target'])

# # Do prediction on train set
# pred = model.predict(X_train).astype('int')

# # Calculate RMSE
# print("Train set MSE = " + str(mean_squared_error(y_train['target'], pred)))

# # Do prediction on val set
# pred = model.predict(X_cv).astype('int')

# # Calculate RMSE
# print("Val set MSE = " + str(mean_squared_error(y_cv['target'], pred)))

By tuning hyperparams, MSE of the val set dropped from 5.41 to 4.98

# Retrain model on train+val set

In [600]:
# Train the regressor
model.fit(X_train_cv, y_train_cv['target'])



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=9,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=300, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=100, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=100, subsample=1,
             tree_method='approx', validate_parameters=1, verbosity=None)

# Get predictions for validation set 28 days

In [601]:
# Rename columns
sale_tr_val_melt = sale_tr_val_melt.rename(columns={'variable': 'date_block_num', 
                                                    'value': 'target'})

In [602]:
# Convert variable to a numerical column ie. d_1 to 1
sale_tr_val_melt['date_block_num'] = sale_tr_val_melt['date_block_num'].str.split('_').str[1].astype('int16')
sale_tr_val_melt.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,date_block_num,target
5832732,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,1913,1
5832733,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,1913,0
5832734,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,1913,0
5832735,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,1913,3
5832736,FOODS_3_827_WI_3_validation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,1913,0


In [603]:
# Convert variable to a numerical column ie. d_1 to 1
calendar['d'] = calendar['d'].str.split('_').str[1].astype('int16')
calendar.head()

Unnamed: 0,date,wm_yr_wk,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,1,1,2011,1,nil,nil,nil,nil,0,0,0
1,2011-01-30,11101,2,1,2011,2,nil,nil,nil,nil,0,0,0
2,2011-01-31,11101,3,1,2011,3,nil,nil,nil,nil,0,0,0
3,2011-02-01,11101,4,2,2011,4,nil,nil,nil,nil,1,1,0
4,2011-02-02,11101,5,2,2011,5,nil,nil,nil,nil,1,0,1


In [604]:
# Do prediction on the test set
results = y_cv[['id']].copy()
sale_tr_val_melt2 = sale_tr_val_melt[sale_tr_val_melt['date_block_num']<pred_start]
mse = []

for i in range(pred_start, pred_end+1):
    print("\nday = " + str(i))
    
    # Create the test set here
    temp = sale_tr_val_melt2[sale_tr_val_melt2['date_block_num']==1].copy()
    temp.loc[:, 'target'] = 0
    temp.loc[:, 'date_block_num'] = i
    sale_tr_val_melt2 = sale_tr_val_melt2.append(temp)
    
    # Merge with calendar to get events info
    sale_tr_val_melt_merged = sale_tr_val_melt2.merge(calendar, 
                                                     left_on='date_block_num', 
                                                     right_on='d', 
                                                     how='left')

    # Drop unneccesary columns
    sale_tr_val_melt_merged.drop(columns=['d'], inplace=True)
    
    # Merge with sell_prices to get price info
    all_data = sale_tr_val_melt_merged.merge(sell_prices, 
                                             left_on=['store_id', 'item_id', 'wm_yr_wk'], 
                                             right_on=['store_id', 'item_id', 'wm_yr_wk'], 
                                             how='left')
    
    # Remove those rows where sell price is NaN and value=0 as this means the item is not being sold yet
    all_data.dropna(axis=0, how='any', subset=['sell_price'], inplace=True)

    # Remove unnecessaray column
    all_data.drop(['date'], axis=1, inplace=True)
    
    # Convert features to categorical 
    for column in columns_to_be_labeled:
        all_data.loc[:, column] = LabelEncoder().fit_transform(all_data[column]) # fit_transform: Fit label encoder and return encoded labels. Note the encoded labels are integers!!

    # Get the lag features
    all_data = get_lags(index_cols, cols_to_rename, all_data, shift_range)
    
    # Remove those rows where target_lag_x is NaN
    all_data.dropna(axis=0, how='any', subset=lag_cols, inplace=True)
    
    # Get the test set
    X_test = all_data[all_data['date_block_num']==i].drop(to_drop_cols, axis = 1)
    y_test = all_data[all_data['date_block_num']==i][['id', 'target']]
    
    # Do prediction on test set
    pred = model.predict(X_test).astype('int')
    print("No. of zeros = " + str(len(pred[pred==0])))
    
    # Add preds to dataset
    sale_tr_val_melt2.loc[sale_tr_val_melt2['date_block_num']==i, 'target'] = pred 

    # Calculate RMSE
    print("MSE = " + str(mean_squared_error(y_test['target'], pred)))
    mse.append(mean_squared_error(y_test['target'], pred))
    
    results['F'+str(i-pred_start+1)] = pred
    
print("mse mean = " + str(np.mean(mse)))
results


day = 1914
No. of zeros = 2301
MSE = 6.4286651361102

day = 1915
No. of zeros = 2385
MSE = 5.019678583142014

day = 1916
No. of zeros = 2437
MSE = 4.375532961626763

day = 1917
No. of zeros = 2471
MSE = 4.467366349622827

day = 1918
No. of zeros = 2475
MSE = 7.12856674319449

day = 1919
No. of zeros = 2459
MSE = 11.591669399803214

day = 1920
No. of zeros = 2480
MSE = 8.950475565759266

day = 1921
No. of zeros = 2561
MSE = 5.620859298130535

day = 1922
No. of zeros = 2607
MSE = 4.934732699245655

day = 1923
No. of zeros = 2678
MSE = 3.461134798294523

day = 1924
No. of zeros = 2675
MSE = 4.604460478845523

day = 1925
No. of zeros = 2672
MSE = 7.672679567071171

day = 1926
No. of zeros = 2673
MSE = 9.619547392587734

day = 1927
No. of zeros = 2669
MSE = 9.208592981305346

day = 1928
No. of zeros = 2694
MSE = 5.323384716300426

day = 1929
No. of zeros = 2724
MSE = 3.6894063627418827

day = 1930
No. of zeros = 2729
MSE = 4.387340111511971

day = 1931
No. of zeros = 2733
MSE = 4.665792062

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
4683620,HOBBIES_1_001_WI_3_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4683621,HOBBIES_1_002_WI_3_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4683622,HOBBIES_1_003_WI_3_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4683623,HOBBIES_1_004_WI_3_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4683624,HOBBIES_1_005_WI_3_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4683625,HOBBIES_1_006_WI_3_validation,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4683626,HOBBIES_1_007_WI_3_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4683627,HOBBIES_1_008_WI_3_validation,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
4683628,HOBBIES_1_009_WI_3_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4683629,HOBBIES_1_010_WI_3_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [605]:
# Store results as csv
results.to_csv("./out/v6_results-" + state + "-" + store + "-" + str(pred_start) + "-" + str(pred_end) + ".csv", index=False)

In [606]:
tocEnd = time.time()
print("Total time = " + str((tocEnd-ticStart)/60.0) + " mins")

Total time = 48.31221963564555 mins


# Combine all predictions together and prepare submission file

In [609]:
# Combine all predictions together
pred_start = 1914              # do pred starting from this day
pred_end = 1941                # do pred until and include this day

state_store_list = [('CA', 'CA_1'), ('CA', 'CA_2'), ('CA', 'CA_3'), ('CA', 'CA_4'),
                    ('TX', 'TX_1'), ('TX', 'TX_2'), ('TX', 'TX_3'), 
                    ('WI', 'WI_1'), ('WI', 'WI_2'), ('WI', 'WI_3')]

state_store = state_store_list[0]
results_tot = pd.read_csv("./out/v5_results-" + state_store[0] + "-" + state_store[1] + "-" + str(pred_start) + "-" + str(pred_end) + ".csv")


for state_store in state_store_list[1:]:
    results = pd.read_csv("./out/v6_results-" + state_store[0] + "-" + state_store[1] + "-" + str(pred_start) + "-" + str(pred_end) + ".csv")
    results_tot = results_tot.append(results)
    
results_tot

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,2,1,1,1,1,2,2,1,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,1,1,1,1,1,2,1,1,1,...,0,0,0,0,0,0,0,0,0,0
5,HOBBIES_1_006_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,HOBBIES_1_007_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,HOBBIES_1_008_CA_1_validation,6,5,6,6,8,7,9,9,8,...,10,11,10,8,7,9,9,10,11,10
8,HOBBIES_1_009_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,HOBBIES_1_010_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [610]:
# Prepare submission file
submission = pd.merge(sample_sub[['id']], results_tot, on='id', how='left')
submission.fillna(value=0, inplace=True)
submission.to_csv("./out/v6_submission.csv", index=False)
submission

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,HOBBIES_1_002_CA_1_validation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_validation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,HOBBIES_1_004_CA_1_validation,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,HOBBIES_1_005_CA_1_validation,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,HOBBIES_1_006_CA_1_validation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,HOBBIES_1_007_CA_1_validation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,HOBBIES_1_008_CA_1_validation,6.0,5.0,6.0,6.0,8.0,7.0,9.0,9.0,8.0,...,10.0,11.0,10.0,8.0,7.0,9.0,9.0,10.0,11.0,10.0
8,HOBBIES_1_009_CA_1_validation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,HOBBIES_1_010_CA_1_validation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


This obtained weighted root mean squared scaled error (RMSSE) of 3.53072 on the validation set

In [616]:
pd.set_option('display.max_columns', 500)
sales_tr_val = pd.read_csv("./data/sales_train_validation.csv")
print("len(sales_tr_val) = " + str(len(sales_tr_val)))
sales_tr_val

len(sales_tr_val) = 30490


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,d_35,d_36,d_37,d_38,d_39,d_40,d_41,d_42,d_43,d_44,d_45,d_46,d_47,d_48,d_49,d_50,d_51,d_52,d_53,d_54,d_55,d_56,d_57,d_58,d_59,d_60,d_61,d_62,d_63,d_64,d_65,d_66,d_67,d_68,d_69,d_70,d_71,d_72,d_73,d_74,d_75,d_76,d_77,d_78,d_79,d_80,d_81,d_82,d_83,d_84,d_85,d_86,d_87,d_88,d_89,d_90,d_91,d_92,d_93,d_94,d_95,d_96,d_97,d_98,d_99,d_100,d_101,d_102,d_103,d_104,d_105,d_106,d_107,d_108,d_109,d_110,d_111,d_112,d_113,d_114,d_115,d_116,d_117,d_118,d_119,d_120,d_121,d_122,d_123,d_124,d_125,d_126,d_127,d_128,d_129,d_130,d_131,d_132,d_133,d_134,d_135,d_136,d_137,d_138,d_139,d_140,d_141,d_142,d_143,d_144,d_145,d_146,d_147,d_148,d_149,d_150,d_151,d_152,d_153,d_154,d_155,d_156,d_157,d_158,d_159,d_160,d_161,d_162,d_163,d_164,d_165,d_166,d_167,d_168,d_169,d_170,d_171,d_172,d_173,d_174,d_175,d_176,d_177,d_178,d_179,d_180,d_181,d_182,d_183,d_184,d_185,d_186,d_187,d_188,d_189,d_190,d_191,d_192,d_193,d_194,d_195,d_196,d_197,d_198,d_199,d_200,d_201,d_202,d_203,d_204,d_205,d_206,d_207,d_208,d_209,d_210,d_211,d_212,d_213,d_214,d_215,d_216,d_217,d_218,d_219,d_220,d_221,d_222,d_223,d_224,d_225,d_226,d_227,d_228,d_229,d_230,d_231,d_232,d_233,d_234,d_235,d_236,d_237,d_238,d_239,d_240,d_241,d_242,d_243,d_244,...,d_1664,d_1665,d_1666,d_1667,d_1668,d_1669,d_1670,d_1671,d_1672,d_1673,d_1674,d_1675,d_1676,d_1677,d_1678,d_1679,d_1680,d_1681,d_1682,d_1683,d_1684,d_1685,d_1686,d_1687,d_1688,d_1689,d_1690,d_1691,d_1692,d_1693,d_1694,d_1695,d_1696,d_1697,d_1698,d_1699,d_1700,d_1701,d_1702,d_1703,d_1704,d_1705,d_1706,d_1707,d_1708,d_1709,d_1710,d_1711,d_1712,d_1713,d_1714,d_1715,d_1716,d_1717,d_1718,d_1719,d_1720,d_1721,d_1722,d_1723,d_1724,d_1725,d_1726,d_1727,d_1728,d_1729,d_1730,d_1731,d_1732,d_1733,d_1734,d_1735,d_1736,d_1737,d_1738,d_1739,d_1740,d_1741,d_1742,d_1743,d_1744,d_1745,d_1746,d_1747,d_1748,d_1749,d_1750,d_1751,d_1752,d_1753,d_1754,d_1755,d_1756,d_1757,d_1758,d_1759,d_1760,d_1761,d_1762,d_1763,d_1764,d_1765,d_1766,d_1767,d_1768,d_1769,d_1770,d_1771,d_1772,d_1773,d_1774,d_1775,d_1776,d_1777,d_1778,d_1779,d_1780,d_1781,d_1782,d_1783,d_1784,d_1785,d_1786,d_1787,d_1788,d_1789,d_1790,d_1791,d_1792,d_1793,d_1794,d_1795,d_1796,d_1797,d_1798,d_1799,d_1800,d_1801,d_1802,d_1803,d_1804,d_1805,d_1806,d_1807,d_1808,d_1809,d_1810,d_1811,d_1812,d_1813,d_1814,d_1815,d_1816,d_1817,d_1818,d_1819,d_1820,d_1821,d_1822,d_1823,d_1824,d_1825,d_1826,d_1827,d_1828,d_1829,d_1830,d_1831,d_1832,d_1833,d_1834,d_1835,d_1836,d_1837,d_1838,d_1839,d_1840,d_1841,d_1842,d_1843,d_1844,d_1845,d_1846,d_1847,d_1848,d_1849,d_1850,d_1851,d_1852,d_1853,d_1854,d_1855,d_1856,d_1857,d_1858,d_1859,d_1860,d_1861,d_1862,d_1863,d_1864,d_1865,d_1866,d_1867,d_1868,d_1869,d_1870,d_1871,d_1872,d_1873,d_1874,d_1875,d_1876,d_1877,d_1878,d_1879,d_1880,d_1881,d_1882,d_1883,d_1884,d_1885,d_1886,d_1887,d_1888,d_1889,d_1890,d_1891,d_1892,d_1893,d_1894,d_1895,d_1896,d_1897,d_1898,d_1899,d_1900,d_1901,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,2,0,2,2,0,0,0,1,1,0,2,0,1,1,2,0,1,0,0,0,2,1,0,1,1,2,0,0,0,0,0,0,0,1,0,1,0,1,0,3,1,1,0,1,1,2,0,0,0,0,1,1,0,0,0,0,3,0,1,0,0,0,0,1,1,1,0,1,0,2,0,0,0,0,2,0,0,0,0,1,1,2,0,0,0,0,2,0,0,1,1,1,1,0,0,0,0,0,1,2,2,0,1,0,0,0,0,1,2,1,0,0,0,0,0,1,0,3,0,1,2,1,0,3,0,0,0,1,0,2,2,1,0,0,1,2,0,1,0,1,4,0,0,5,0,0,0,0,0,0,2,1,2,1,0,0,0,1,1,1,0,0,1,1,1,1,1,0,0,0,2,2,0,0,1,4,0,0,0,0,1,1,2,0,4,0,1,0,1,4,2,0,2,0,1,1,0,1,0,0,1,1,3,0,0,0,1,1,1,3,1,3,1,2,2,0,1,1,1,1,0,0,0,0,0,1,0,4,2,3,0,1,2,0,0,0,1,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,1,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,...,1,0,0,1,1,0,0,0,3,4,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,2,1,0,0,1,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,0,2,1,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,1,0,1,1,0,3,0,0,0,0,0,0,0,1,0,0,0,0,0,2,1,0,0,1,1,0,2,0,1,0,2,1,1,5,0,1,0,3,5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,2,1,0,3,0,1,2,0,3,1,0,0,1,0,1,0,0,0,0,2,0,1,0,1,0,1,1,0,1,0,1,0,0,0,1,2,0,0,0,1,0,1,1,1,1,0,0,0,0,0,0,2,0,1,0,0,2,0,0,0,1,0,0,1,0,0,2,0,0,0,0,0,0,0,0,2,0,2,3,0,1,3,1,2,2,3,0,1,1,0,0,0,0,2,3,1,1,4,3,2,1,2,2,0,1,5,2,0,1,2,3,0,1,2,1,3,0,1,1,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,1,1,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,6,1,1,2,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,2,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,2,0,1,0,0,0,0,0,0,2,0,1,0,0,1,1,1,0,2,3,1,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,2,3,0,2,0,0,2,2,0,0,2,1,2,1,1,1,2,0,0,1,0,1,3,1,0,0,0,0,3,5,2,2,1,1,1,1,1,1,0,0,2,1,1,1,2,0,0,0,2,5,6,0,0,0,0,0,0,0,0,0,0,0,2,1,2,0,1,0,2,0,0,5,1,0,0,1,3,1,3,5,1,3,0,3,4,4,0,0,1,3,1,4,0,0,2,0,2,0,1,4,2,1,0,2,1,3,6,1,1,2,1,2,3,1,2,0,0,0,3,4,5,1,0,0,1,0,1,4,6,3,1,1,0,1,4,5,1,1,4,0,0,0,1,2,2,1,1,6,2,4,4,0,0,0,2,2,0,1,1,3,1,2,4,2,1,3,2,...,0,0,0,11,2,2,1,1,2,1,1,2,1,1,0,3,0,2,14,0,0,0,3,3,1,1,1,1,0,3,3,1,7,3,1,0,0,1,0,1,1,0,0,2,1,4,4,3,0,2,0,0,1,3,3,0,2,1,2,4,7,0,2,1,0,5,5,2,2,4,1,0,0,3,1,0,0,0,3,1,3,3,0,0,4,1,1,1,1,3,3,1,0,3,0,1,3,3,3,2,2,2,4,3,0,5,1,3,3,2,0,0,1,1,0,2,2,2,3,2,1,2,0,5,0,1,0,0,0,3,4,0,0,1,5,3,2,2,0,1,1,0,2,1,0,2,4,0,0,0,3,2,4,3,1,2,3,0,8,2,1,2,2,5,2,6,1,0,3,5,1,1,6,4,3,2,2,3,2,1,0,0,0,2,0,5,4,2,1,1,2,3,0,6,0,0,0,1,0,1,5,3,1,0,0,0,1,2,3,0,1,3,4,2,1,4,1,3,5,0,6,6,0,0,0,0,3,1,2,1,3,1,0,2,5,4,2,0,3,0,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,6,0,3,2,3,5,3,1,0,0,1,0,2,2,4,0,0,3,1,1,1,2,2,0,0,0,0,0,0,3,7,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,2,0,0,0,0,0,0,0,2,0,6,2,4,2,0,0,3,2,4,1,0,0,0,0,1,1,1,2,0,0,0,0,0,0,0,0,2,6,7,9,4,7,6,3,4,2,0,1,5,2,2,0,0,0,0,0,0,1,3,...,2,0,5,2,0,0,1,0,3,1,0,1,0,2,1,0,0,1,3,1,1,0,4,0,2,1,2,4,0,1,0,0,3,1,2,2,0,1,1,0,4,0,0,4,0,2,2,2,1,2,1,1,1,4,0,2,1,2,0,0,1,0,1,1,2,2,3,1,0,2,3,0,1,1,4,0,3,2,1,2,1,2,2,1,2,0,1,1,2,0,2,0,0,0,4,2,1,2,0,0,0,0,0,2,1,0,0,1,2,0,1,2,1,2,1,2,3,3,0,3,1,5,3,2,1,2,3,4,0,0,1,0,0,1,0,0,1,0,0,0,0,2,0,0,3,0,0,1,2,2,0,1,0,0,0,1,0,0,3,0,0,1,1,0,3,1,0,4,1,2,0,0,0,1,1,2,0,0,5,2,2,2,1,0,0,0,3,0,0,0,3,1,1,1,1,2,1,0,0,1,0,2,1,1,0,3,1,1,2,1,1,0,3,2,2,2,3,1,0,0,0,0,1,0,4,4,0,1,4,0,1,0,1,0,1,1,2,0,1,1,2,1,1,0,1,1,2,2,2,4
5,HOBBIES_1_006_CA_1_validation,HOBBIES_1_006,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,2,0,0,0,1,2,1,5,0,0,0,0,0,0,0,2,6,0,0,0,0,0,1,0,1,2,0,0,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,3,4,0,0,0,8,0,2,0,1,0,0,1,0,3,0,0,0,1,3,0,0,1,0,0,3,3,0,0,0,0,3,0,2,2,1,0,0,0,0,9,1,0,0,0,0,1,0,2,1,3,0,0,0,1,3,0,4,0,0,0,1,5,0,1,0,0,2,0,3,1,1,2,0,1,3,0,0,0,0,0,0,3,0,1,0,0,1,4,0,0,0,1,0,1,0,0,0,2,0,0
6,HOBBIES_1_007_CA_1_validation,HOBBIES_1_007,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,3,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,2,0,0,0,0,2,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,1,0,2,1,0,0,0,0,0,0,0,0,1,1,0,0,0,2,2,0,2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,2,1,0,0,1,0,2,0,0,1,0,0,1,0,0,0,1,2,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,1,0,0,1,1
7,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,12,15,0,0,0,4,6,5,7,0,13,2,2,20,9,3,0,1,0,0,0,10,4,4,2,6,2,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,12,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,0,2,13,13,0,0,0,8,6,6,0,0,0,0,0,5,23,0,0,16,9,10,8,1,0,0,6,13,0,0,16,6,0,0,0,0,27,0,5,0,0,0,0,0,0,0,0,0,4,8,5,7,6,3,13,6,4,1,25,0,0,13,0,0,0,24,0,0,15,4,18,4,0,2,14,23,0,0,0,0,0,14,10,1,22,0,0,0,0,0,0,3,6,14,9,5,14,19,4,10,11,21,1,5,4,11,16,9,11,7,24,9,22,0,6,18,0,3,4,0,7,37,5,16,3,19,3,0,13,27,4,4,10,2,2,5,9,3,0,23,0,0,0,12,2,4,3,4,6,8,15,15,6,0,2,2,26,2,0,0,0,10,8,11,5,5,1,6,14,11,7,1,6,17,10,4,2,5,23,...,8,30,9,10,14,28,10,20,0,0,11,3,8,14,10,27,4,10,6,6,12,10,3,9,15,7,4,15,0,57,34,21,2,4,4,13,6,6,13,6,5,4,7,5,47,29,0,6,1,8,5,2,10,2,10,6,6,3,36,17,1,10,3,43,19,14,9,1,5,8,3,6,38,19,2,7,4,20,41,15,3,5,0,8,8,5,9,40,13,10,5,7,34,15,26,6,3,6,8,0,0,17,4,27,28,0,5,0,6,8,20,12,7,27,49,0,0,7,0,6,5,11,25,0,8,19,5,9,0,17,0,7,2,9,7,7,30,7,2,1,2,75,8,15,1,17,5,8,18,6,47,5,48,4,16,13,13,14,4,3,0,8,1,1,23,12,4,5,44,32,9,37,0,5,18,4,0,7,27,23,1,9,2,24,11,10,11,34,0,10,4,12,17,0,0,0,0,0,0,0,0,0,4,27,1,2,19,28,3,1,2,0,17,11,0,2,13,10,2,0,0,0,0,11,5,2,12,38,22,2,0,10,26,8,12,4,5,2,8,4,0,0,1,37,3,4,6,3,2,1
8,HOBBIES_1_009_CA_1_validation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,2,0,7,3,0,2,3,9,0,0,4,3,1,0,4,4,1,2,0,1,1,5,2,2,2,3,0,0,8,0,0,0,0,0,2,7,1,8,6,0,0,0,4,2,6,2,0,0,0,1,4,0,9,1,1,0,1,0,0,2,0,4,0,4,0,1,1,3,0,2,0,2,2,0,0,4,1,6,1,2,8,0,2,2,0,9,0,0,3,3,4,1,0,5,1,1,1,0,0,0,1,1,0,4,8,1,2,8,1,0,0,0,7,2,7,0,8,0,0,1,2,0,0,1,3,0,2,0,0,5,0,0,2,6,0,1,0,0,2,8,2,0,0,0,2,5,1,2,2,4,2,3,5,0,8,0,2,2,0,18,1,7,0,0,0,0,0,0,4,3,0,3,2,1,1,4,0,2,2,0,4,1,0,4,2,7,0,0,3,2,7,2,0,1,3,5,0,1,0,0,2,0,0,2,0,3,0,1,1,1,7,1,2,0,0,0,1,0,1,0,0,1,0,0,0,3,0,0,7,0,0,0,0,0,0,0,0,1,0,0,0,1,6,4,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,2,0,3,0,0,0,0,0,1,1,0,7,0,0,0,0,1,1,6,0,0,0,0,0
9,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,1,1,0,0,1,1,1,2,1,1,2,0,0,1,0,2,1,1,1,2,0,0,2,1,2,0,0,1,1,0,0,0,0,1,1,2,0,1,0,0,0,1,0,0,0,1,0,0,0,2,0,2,4,0,2,0,0,0,0,1,0,1,1,4,1,0,1,0,0,0,1,0,0,2,0,1,0,0,0,0,2,3,0,1,0,0,3,2,2,0,0,4,0,1,1,1,0,2,2,1,1,1,2,0,1,2,0,0,0,1,...,1,0,1,0,0,0,1,0,2,1,2,0,0,0,1,0,2,0,1,0,0,0,1,1,0,3,2,0,1,0,1,0,1,1,0,0,0,0,0,1,0,1,0,1,0,1,2,0,0,1,0,2,0,0,1,2,1,0,1,0,1,0,0,0,0,1,0,1,3,1,0,0,0,0,1,0,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0,1,1,2,1,1,1,0,0,0,0,2,1,0,1,0,3,1,1,2,1,0,0,0,1,0,1,1,0,1,0,0,3,1,0,0,0,1,0,0,3,0,2,0,0,1,0,1,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,4,0,0,1,1,1,2,1,1,0,0,0,1,1,1,0,1,3,0,0,0,0,1,1,1,2,0,0,0,0,1,2,0,2,0,0,1,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,2,3,0,0,0,0,0,0,0,0,1,1,1,1,0,3,2,2,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,2,0,2
