In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from datetime import datetime, timedelta
import gc
import lightgbm as lgb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Column dtypes requested when reading calendar.csv, grouped by target dtype.
CAL_COL_TYPES = {
    **dict.fromkeys(['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'weekday'], 'category'),
    **dict.fromkeys(['wm_yr_wk', 'wday', 'month', 'year'], 'int16'),
    **dict.fromkeys(['snap_CA', 'snap_TX', 'snap_WI'], 'float32'),
}

# Column dtypes requested when reading sell_prices.csv.
SELLP_COL_TYPES = {
    'store_id': 'category',
    'item_id': 'category',
    'wm_yr_wk': 'int16',
    'sell_price': 'float32',
}

In [None]:
# Load the calendar and price tables, applying the compact dtypes from
# CAL_COL_TYPES / SELLP_COL_TYPES at parse time.
INPUT_DIR='../input/m5-forecasting-accuracy/'
cal_data, sellp_data = (
    pd.read_csv(f'{INPUT_DIR}{csv_name}', dtype=col_types)
    for csv_name, col_types in (
        ('calendar.csv', CAL_COL_TYPES),
        ('sell_prices.csv', SELLP_COL_TYPES),
    )
)

In [None]:
# Quick look at the calendar table: its (rows, columns) shape and first rows.
print(cal_data.shape)
cal_data.head()

In [None]:
# Show the calendar rows from index label 1913 to the end of the table.
# NOTE(review): presumably these are the days after the last training day
# (tr_last = 1913), assuming the default 0-based row index — confirm.
cal_data.loc[1913:]

In [None]:
# Quick look at the price table: its (rows, columns) shape and first rows.
print(sellp_data.shape)
sellp_data.head()

In [None]:
# Constants shared by the data-building and forecasting cells.
#   h        - presumably the forecast horizon in days (TODO confirm; only
#              referenced from commented-out code in the forecasting cell)
#   max_lags - number of trailing days of history kept when features are
#              computed for a forecast day
#   tr_last  - number of the last d_<N> column read from the training file
h, max_lags, tr_last = 28, 57, 1913
# First calendar date that the forecasting loop predicts.
fday = datetime(year=2016, month=4, day=25)
fday

In [None]:
# Display the dtypes the price table ended up with after loading.
sellp_data.dtypes

In [None]:
def create_df(is_train=True, nrows=None, first_day=1200):
    """Build the long-format sales frame (one row per series per day).

    Reads the wide ``sales_train_validation.csv``, melts its ``d_<N>`` columns
    into rows and joins calendar and price information taken from the
    module-level ``cal_data`` and ``sellp_data`` frames (both are copied, never
    modified). Every categorical column is replaced by int16 codes.

    Parameters
    ----------
    is_train : bool, default True
        If True, load days ``first_day`` .. ``tr_last``. If False, load at most
        the trailing ``max_lags`` days before ``tr_last`` (never earlier than
        ``first_day``) and append ``h`` future day columns filled with NaN as
        placeholders for the forecasts.
    nrows : int or None, default None
        Forwarded to ``pd.read_csv`` to read only the first ``nrows`` series.
    first_day : int, default 1200
        Lower bound for the first ``d_<N>`` column that is loaded.

    Returns
    -------
    pandas.DataFrame
        Melted sales with calendar and price columns attached. Both merges use
        the pandas default (inner) join, so rows without a matching calendar
        day or price entry are dropped.
    """
    # Convert categorical column values to numerical codes for sell_prices.csv.
    # Subtracting the minimum makes the smallest code 0 (cat.codes uses -1 for
    # missing values). The offset is remembered so the sales table can apply
    # exactly the same encoding to the shared merge keys below.
    sellp_data_change = sellp_data.copy()
    key_offsets = {}
    for col, col_dtype in SELLP_COL_TYPES.items():
        if col_dtype == 'category':
            sellp_data_change[col] = sellp_data_change[col].cat.codes.astype('int16')
            key_offsets[col] = sellp_data_change[col].min()
            sellp_data_change[col] -= key_offsets[col]

    cal_data_change = cal_data.copy()
    cal_data_change['date'] = pd.to_datetime(cal_data_change['date'])
    # Convert categorical column values to numerical codes for calendar.csv.
    for col, col_dtype in CAL_COL_TYPES.items():
        if col_dtype == 'category':
            cal_data_change[col] = cal_data_change[col].cat.codes.astype('int16')
            cal_data_change[col] -= cal_data_change[col].min()

    start_day = max(1 if is_train else tr_last-max_lags, first_day)
    dcols = [f'd_{day}' for day in range(start_day, tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    dtype = {dcol : 'float32' for dcol in dcols}
    dtype.update({col:'category' for col in catcols if col!='id'})
    salestv_data=pd.read_csv(INPUT_DIR+'sales_train_validation.csv', nrows=nrows, usecols= catcols+dcols, dtype=dtype)

    # Convert categorical column values to numerical codes for
    # sales_train_validation.csv.
    for col in catcols:
        if col == 'id':
            continue
        if col in key_offsets:
            # Merge key shared with the price table: encode it against the
            # price table's category list instead of this file's own. Encoding
            # each file independently only yields matching codes when both
            # files contain exactly the same set of values, which is not the
            # case when `nrows` truncates the sales file.
            salestv_data[col] = salestv_data[col].astype(sellp_data[col].dtype)
            salestv_data[col] = salestv_data[col].cat.codes.astype('int16') - key_offsets[col]
        else:
            salestv_data[col] = salestv_data[col].cat.codes.astype('int16')
            salestv_data[col] -= salestv_data[col].min()

    if not is_train:
        # Placeholder columns for the days to forecast; their sales are filled
        # in later by the prediction loop.
        for day in range(tr_last+1, tr_last+h+1):
            salestv_data[f'd_{day}'] = np.nan

    # Unpivot the dataframe along the d_ columns.
    df = pd.melt(salestv_data,
                 id_vars=catcols,
                 value_vars=[col for col in salestv_data.columns if col.startswith('d_')],
                 var_name='d',
                 value_name='sales'
                )
    df = df.merge(cal_data_change, on='d', copy=False)
    df = df.merge(sellp_data_change, on=['store_id', 'item_id', 'wm_yr_wk'], copy=False)
    return df

In [None]:
%%time 

#df_test = create_df()

In [None]:
#df_test.shape

In [None]:
#print(df_test.info())

In [None]:
#df_test.head()

In [None]:
#del df_test

In [None]:
def create_fea(df):
    """Add lag, rolling-mean and calendar features to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``sales`` and ``date`` (datetime).
        Lags are taken by row position within each ``id`` group.
        NOTE(review): this equals a lag in days only if every id has one row
        per consecutive day, in date order — confirm against the caller.

    Returns
    -------
    None
        The new columns are written directly into ``df``:
        ``d_7`` / ``d_28`` (sales shifted by 7 / 28 rows per id),
        ``rmean_<lag>_<win>`` (rolling mean over ``win`` rows of the lag
        column, per id) and the int16 date parts ``wday``, ``week``, ``month``,
        ``quarter``, ``year``, ``mday``.
    """
    lags =[7, 28]
    lag_cols = [f'd_{lag}' for lag in lags]

    # Shift the sales by each lag value and store the result as a new column.
    for lag, lag_col in zip(lags, lag_cols):
        df[lag_col] = df[['id', 'sales']].groupby('id')['sales'].shift(lag)

    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            df[f'rmean_{lag}_{win}'] = df[['id', lag_col]].groupby('id')[lag_col].transform(lambda x: x.rolling(win).mean())

    # Feature name -> attribute of the `.dt` accessor that produces it.
    date_features = {
        'wday':'weekday',
        'week':'weekofyear',
        'month':'month',
        'quarter':'quarter',
        'year':'year',
        'mday':'day'
    }

    for date_feature_name, date_feature_func in date_features.items():
        if date_feature_name in df.columns:
            # Already supplied (e.g. by the calendar table): only normalise the dtype.
            values = df[date_feature_name]
        elif date_feature_func == 'weekofyear' and hasattr(df['date'].dt, 'isocalendar'):
            # `.dt.weekofyear` was removed in pandas 2.0; isocalendar().week
            # returns the same ISO week number on every release that has it.
            values = df['date'].dt.isocalendar().week
        else:
            values = getattr(df['date'].dt, date_feature_func)
        df[date_feature_name] = values.astype('int16')

In [None]:
# First day number (the N of the d_<N> columns) loaded for the training frame;
# passed to create_df() as `first_day`.
FIRST_DAY=1000

In [None]:
# Display the price table dtypes again before the training frame is built
# (create_df() works on a copy of this table).
sellp_data.dtypes

In [None]:
%%time

# Build the long-format training frame starting at day FIRST_DAY and report
# its (rows, columns) shape.
df = create_df(is_train=True, first_day = FIRST_DAY)
print(df.shape)

In [None]:
%%time

# Add the lag, rolling-mean and date features to df in place (create_fea
# returns nothing), then report the new shape.
create_fea(df)
print(df.shape)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that only appends a stray "None" line).
df.info()

In [None]:
# Preview the first rows of the feature-enriched training frame.
df.head()

In [None]:
# Keep only complete rows: any row with at least one NaN is removed, e.g. the
# leading rows of each series where the lag / rolling features are undefined.
df = df.dropna()
df.shape

In [None]:
# Do NOT delete sellp_data / cal_data here: create_df() reads both module-level
# frames every time it runs, including the later create_df(False) call that
# builds the forecasting frame. Deleting them makes that call fail with a
# NameError on a top-to-bottom run.
gc.collect()

In [None]:
# Integer-coded columns that LightGBM should treat as categorical.
cat_features = [
    'item_id', 'store_id', 'cat_id', 'dept_id', 'state_id',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
]
# Columns excluded from the feature matrix (identifiers, the target, raw
# date/day keys).
useless_cols = ['id', 'date', 'sales', 'd', 'wm_yr_wk', 'weekday']
# Every remaining column, in the frame's own column order, is a feature.
train_cols = df.columns[~df.columns.isin(useless_cols)]
X_train, y_train = df[train_cols], df['sales']

In [None]:
%%time

np.random.seed(777)

# The hold-out set is a plain random sample of rows, not a time-based split,
# so it only serves as a rough monitor during training.
fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace=False) # Validation dataset
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds) # Training dataset

# Both LightGBM datasets share the same construction options.
dataset_options = dict(categorical_feature=cat_features, free_raw_data=False)
train_data = lgb.Dataset(
    X_train.loc[train_inds],
    label=y_train.loc[train_inds],
    **dataset_options,
)
fake_valid_data = lgb.Dataset(
    X_train.loc[fake_valid_inds],
    label=y_train.loc[fake_valid_inds],
    **dataset_options,
)

In [None]:
# The LightGBM datasets hold their own copies of the data, so the pandas
# objects used to build them can be released.
del df, X_train, y_train, fake_valid_inds, train_inds
gc.collect()

In [None]:
# LightGBM training parameters

params={
#    'device':'gpu',
    'objective':'poisson',
    'metric':['rmse'],
    'force_row_wise':True,
    'learning_rate':0.075,
    'sub_row': 0.75,
    'bagging_freq': 1,
    # L2 regularisation. The key must be spelled "lambda_l2" (letter l, not the
    # digit 1): an unrecognised key such as "lambda_12" is ignored by LightGBM,
    # which silently trains without the regularisation.
    'lambda_l2':0.1,
    'verbosity':1,
    'num_iterations':1200,
    'num_leaves':2**11-1,
    'min_data_in_leaf':2**12-1
}

In [None]:
%%time

# Train the booster, logging the validation metric every 20 iterations.
# The `verbose_eval` argument was removed from lgb.train() in LightGBM 4.0;
# releases that ship the log_evaluation callback use it instead, older ones
# keep the original keyword.
if hasattr(lgb, 'log_evaluation'):
    m_lgb = lgb.train(params, train_data, valid_sets=[fake_valid_data],
                      callbacks=[lgb.log_evaluation(period=20)])
else:
    m_lgb = lgb.train(params, train_data, valid_sets=[fake_valid_data], verbose_eval=20)

In [None]:
# Persist the trained booster to model.lgb in the working directory.
m_lgb.save_model("model.lgb")

In [None]:
%%time

# One complete 28-day forecast is produced per multiplier in `alphas`; the
# per-alpha forecasts are then averaged with equal weights.
alphas = [1.028, 1.023, 1.018]
weights = [1/len(alphas)]*len(alphas)
sub = 0.  # placeholder; replaced by the first forecast frame below

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    # Rebuild the forecasting frame for every alpha: predictions are written
    # back into te["sales"] and feed the lag features of later days, so the
    # frame cannot be shared between passes.
    te = create_df(False)
    cols = [f"F{i}" for i in range(1,29)]

    # Forecast one day at a time, in date order.
    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        # Only the trailing max_lags days are needed to compute the features for `day`.
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        # Write the scaled prediction back so that later days can use it as lagged sales.
        te.loc[te.date == day, "sales"] = alpha*m_lgb.predict(tst) # magic multiplier by kyakovlev



    # Reshape the forecast rows into the wide submission layout: one row per
    # id, columns F1..F28.
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
#     te_sub.loc[te.date >= fday+ timedelta(days=h), "id"] = te_sub.loc[te.date >= fday+timedelta(days=h), 
#                                                                           "id"].str.replace("validation$", "evaluation")
    # Number each id's rows 1, 2, ... in row order to obtain the F<k> column label.
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    # Sorting by id and resetting the index gives every pass the same row
    # order, which the element-wise blend below relies on.
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    # Keep the unblended forecast of this pass as well.
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        # First pass: start the weighted sum (sub is the same object as te_sub).
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


# The submission file needs every forecast row twice: once under its
# "..._validation" id and once under the matching "..._evaluation" id.
sub2 = sub.copy()
# regex=True is required here: "$" anchors the match to the end of the id, and
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# leave the ids unchanged and produce duplicated validation rows.
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission.csv",index=False)