# Libraries

In [86]:
import warnings
warnings.filterwarnings("ignore")


import numpy as np
import pandas as pd
import datetime, random, math
from catboost import CatBoostClassifier
import lightgbm as lgb
from time import time
from tqdm import tqdm
from collections import Counter
from scipy import stats
import gc, pickle
import ast
from typing import Union

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, log_loss
from sklearn.linear_model import Ridge,Lasso, BayesianRidge
from sklearn.svm import LinearSVR
from sklearn.preprocessing import minmax_scale
from sklearn.cluster import KMeans
%matplotlib inline

# Preprocessing

In [87]:
def create_is_sell_data(sell_prices_df, calendar_df, train_df):
    """Expand weekly sell prices to one value per calendar day and per series id.

    Side effect: an ``id`` column (``<item_id>_<store_id>_evaluation``) is
    added to ``sell_prices_df`` in place.

    Returns
    -------
    (price_data, is_sell) : tuple of pandas.DataFrame
        Both frames carry the id hierarchy columns of ``train_df`` followed by
        one column per ``calendar_df.d`` value.  ``price_data`` holds the
        price (0 where no price exists); ``is_sell`` holds 1.0 where a price
        exists and 0.0 otherwise.
    """
    id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    sell_prices_df['id'] = (
        sell_prices_df['item_id'].astype('str') + '_' + sell_prices_df['store_id'] + '_evaluation'
    )

    # Only keep price rows whose week appears in the calendar.
    known_weeks = calendar_df.wm_yr_wk.unique()
    prices_in_calendar = sell_prices_df[sell_prices_df.wm_yr_wk.isin(known_weeks)].reset_index(drop=True)

    # {series id -> {week -> price}}
    week_to_price = prices_in_calendar.groupby(['id'])[['wm_yr_wk', 'sell_price']].apply(
        lambda grp: grp.set_index('wm_yr_wk')['sell_price'].to_dict()
    ).to_dict()

    # Map every calendar day's week onto the price of each series.
    calendar_weeks = calendar_df.wm_yr_wk
    daily_prices = pd.DataFrame({
        series_id: calendar_weeks.map(week_to_price[series_id])
        for series_id in tqdm(train_df.id.unique())
    })
    daily_prices.index = calendar_df.d

    is_sell = daily_prices.notnull().astype(float).T
    price_data = daily_prices.fillna(0).T

    is_sell = pd.concat([train_df[id_cols], is_sell], axis=1)
    price_data = pd.concat([train_df[id_cols], price_data], axis=1)

    return price_data, is_sell

def set_index(df, name):
    """Build a column-rename mapping by inspecting the first row of ``df``.

    Despite its name this does not set an index; the result is meant to be
    passed to ``DataFrame.rename(columns=...)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be recognised from their first value.
    name : str
        Name given to every column whose first value is not a string.

    Returns
    -------
    dict
        ``{column: 'id'}`` when the first value contains ``'_evaluation'``,
        ``{column: 'd'}`` when it contains ``'d_'`` (checked second, so it wins
        if both match), ``{column: name}`` for non-string values.  String
        values matching neither pattern get no entry.
    """
    mapping = {}
    for col, value in df.iloc[0, :].items():
        # Explicit type check instead of relying on a bare ``except`` to catch
        # the TypeError that ``'...' in <number>`` raises.
        if not isinstance(value, str):
            mapping[col] = name
            continue
        if '_evaluation' in value:
            mapping[col] = 'id'
        if 'd_' in value:
            mapping[col] = 'd'
    return mapping

def dcol2int(col):
    """Turn a ``'d_<n>'`` label into the integer ``n``; return anything else unchanged."""
    if col[:2] != 'd_':
        return col
    return int(col.replace('d_', ''))
    
def str_category_2_int(data):
    """Replace every object-dtype column except ``id`` and ``d`` by integer codes.

    Codes come from ``pd.factorize``; its ``-1`` marker for missing values is
    turned back into NaN.  ``data`` is modified in place and also returned.
    """
    object_cols = [name for name in data.columns if data[name].dtype == object]
    for name in object_cols:
        if name in ('id', 'd'):
            continue
        codes = pd.Series(pd.factorize(data[name])[0], index=data.index)
        data[name] = codes.replace(-1, np.nan)
    return data

def select_near_event(x, event_name):
    """Join the entries of ``x`` that occur in ``event_name``, each followed by ``'_'``.

    Returns NaN when nothing matches.
    """
    matched = [token for token in x if token in event_name]
    if not matched:
        return np.nan
    return ''.join(token + '_' for token in matched)
    
def sort_d_cols(d_cols):
    """Return the ``'d_<n>'`` labels ordered by their numeric part (not lexically)."""
    day_numbers = sorted(int(label.replace('d_', '')) for label in d_cols)
    return [f'd_{number}' for number in day_numbers]

In [88]:
def preprocessing(path, d_cols, train_d_cols):
    """Load the competition CSVs and build the long-format modelling table.

    Parameters
    ----------
    path : str
        Prefix the file names are appended to with ``+``; it therefore has to
        end with a path separator.
    d_cols : list of str
        Day columns the wide sales frame is reindexed to.  ``reindex`` creates
        any column missing from the sales file as NaN.
    train_d_cols : list of str
        Subset of ``d_cols`` that is stacked into rows.

    Returns
    -------
    pandas.DataFrame
        One row per (id, d).  ``TARGET`` is the value from the sales file
        multiplied by the daily sell price (not the raw sales value); the
        other columns are the id hierarchy, ``snap``, ``price``, calendar and
        event features, and ``snap_-1`` / ``snap_1``.
    """
    train_df = pd.read_csv(path+'sales_train_evaluation.csv')
    calendar_df = pd.read_csv(path+'calendar.csv')
    sell_prices_df = pd.read_csv(path+'sell_prices.csv')
    # NOTE(review): sample_submission_df is not referenced again in this function.
    sample_submission_df = pd.read_csv(path+'sample_submission.csv')
    
    # Index both frames by their natural key so later dict/map lookups work by label.
    train_df.index = train_df.id
    calendar_df['date']=pd.to_datetime(calendar_df.date)
    calendar_df.index = calendar_df.d
    price_data, is_sell = create_is_sell_data(sell_prices_df, calendar_df, train_df)
    
    # Keep every column whose name contains 'id', followed by the requested day columns.
    str_cols = [ col for col in train_df.columns if 'id' in str(col)]
    new_columns = str_cols+d_cols
    train_df = train_df.reindex(columns=new_columns)
    
    # Replace each day value by value * price for that day.
    train_df = pd.concat([
        train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']],
        train_df.loc[train_df.index,d_cols]*price_data.loc[train_df.index,d_cols]
    ], axis=1)
    
    # Wide -> long; set_index() names the stacked columns from their first values.
    data = train_df[train_d_cols].stack(dropna=False).reset_index()
    data = data.rename(columns=set_index(data, 'TARGET'))
    data.reset_index(drop=True, inplace=True)
    
    # NOTE(review): reduce_mem_usage may downcast TARGET to float16 when its
    # range allows, which limits precision -- confirm this is acceptable.
    data = reduce_mem_usage(data)
    
    # Attach the hierarchy columns by looking each id up in train_df.
    for key, value in train_df[['dept_id', 'cat_id', 'state_id', 'store_id', 'item_id']].to_dict().items():
        data[key] = data.id.map(value)
    
    #snap_data
    # One snap flag per row: the snap_<STATE> calendar column matching the row's state_id.
    snap_data = calendar_df[['snap_CA', 'snap_WI', 'snap_TX', 'd']]
    snap_data.set_index('d', inplace=True)
    data[f'snap']=0
    for key, value in snap_data.to_dict().items():
        k = key.replace('snap_', '')
        data.loc[data.state_id==k,'snap'] = data.loc[data.state_id==k, 'd'].map(value).fillna(0)
        
    #price_data
    # Long-format price per (d, id), merged onto the main table.
    price_data = price_data[train_d_cols].T.astype(float)
    price_data = price_data.stack(dropna=False).reset_index()
    price_data.rename(columns=set_index(price_data, 'price'), inplace=True)
    data = pd.merge(data, price_data, on=['d', 'id'], how='left')
    
    event_name = ['SuperBowl', 'ValentinesDay', 'PresidentsDay', 'LentStart', 'LentWeek2', 'StPatricksDay', 'Purim End', 
              'OrthodoxEaster', 'Pesach End', 'Cinco De Mayo', "Mother's day", 'MemorialDay', 'NBAFinalsStart', 'NBAFinalsEnd',
              "Father's day", 'IndependenceDay', 'Ramadan starts', 'Eid al-Fitr', 'LaborDay', 'ColumbusDay', 'Halloween', 
              'EidAlAdha', 'VeteransDay', 'Thanksgiving', 'Christmas', 'Chanukah End', 'NewYear', 'OrthodoxChristmas', 
              'MartinLutherKingDay', 'Easter']
    event_type = ['Sporting', 'Cultural', 'National', 'Religious']
    event_names = {'event_name_1':event_name, 'event_type_1':event_type}
    # For each calendar row, collect the event values of the next w rows
    # (shift(-1) .. shift(-w)) and keep those that are known names/types.
    # Assumes calendar rows are in chronological order -- TODO confirm.
    # Note: the loop variable below rebinds `event_name` defined above.
    for event, event_name in event_names.items():
        for w in [4]:
            calendar_df[f'new_{event}_{w}']=''
            for i in range(-1,-(w+1),-1):
                calendar_df[f'new_{event}_{w}'] += calendar_df[event].shift(i).astype(str)+'|'
            calendar_df[f'new_{event}_{w}'] = calendar_df[f'new_{event}_{w}'].apply(lambda x: x.split('|'))
            calendar_df[f'new_{event}_{w}'] = calendar_df[f'new_{event}_{w}'].apply(lambda x: select_near_event(x, event_name))

    #calendar_dict
    # Copy the calendar features onto every row via its day label.
    cols = ['new_event_name_1_4', 'new_event_type_1_4', 'wday', 'month', 'year', 'event_name_1','event_type_1']
    for key, value in calendar_df[cols].to_dict().items():
        data[key] = data.d.map(value)
    # snap_-1 is the next row's snap within the same id, snap_1 the previous row's.
    for shift in [-1,1]:
        data[f'snap_{shift}'] = data.groupby(['id'])['snap'].shift(shift)
    
    return data

# Feature engineering

In [89]:
def make_roll_data(data, win):
    """Add ``roll_<win>_mean``: per-id rolling mean of ``TARGET`` lagged by one row.

    The window covers the ``win`` rows before the current one (``shift(1)``),
    with ``min_periods=1`` so partial windows still produce a value.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``id`` and ``TARGET``; modified in place.
    win : int
        Rolling window length in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the new column.
    """
    # group_keys=False keeps the original row index on the result.  From
    # pandas 2.0 the default prepends the group key, and the assignment below
    # would then no longer align with ``data``.
    data[f'roll_{win}_mean'] = data.groupby(['id'], group_keys=False)['TARGET'].apply(
        lambda s: s.shift(1).rolling(win, min_periods=1).mean()
    )

    return data

def shift_diff_data(data):
    """Add ``shift_diff``: the per-id 7-row difference of ``TARGET``, lagged by 7 rows.

    NOTE(review): the loop index is never used, so the four accumulated terms
    are identical and the result equals ``diff(7).shift(7)``.  Several
    different lags may have been intended -- confirm before enabling this
    feature.

    ``data`` needs the columns ``id`` and ``TARGET``; it is modified in place
    and returned.
    """
    # Computed once instead of on every pass.  group_keys=False keeps the
    # original row index (pandas >= 2.0 would otherwise prepend the group key
    # and break the alignment of the addition below).
    quarter_term = data.groupby(['id'], group_keys=False)['TARGET'].apply(
        lambda s: s.diff(7).shift(7)
    ) / 4

    data[f'shift_diff'] = 0
    for _ in range(4):
        data[f'shift_diff'] += quarter_term

    return data
    

def make_lag_roll_data(data, lag):
    """Add ``lag<lag>_roll_14_mean``: per-id rolling mean of ``TARGET`` behind a lag.

    The column name says ``14`` but the window is 28 rows; the name is kept
    unchanged because downstream code selects features by column name.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``id`` and ``TARGET``; modified in place.
    lag : int
        Number of rows ``TARGET`` is shifted by before the rolling mean.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the new column.
    """
    # group_keys=False keeps the original row index on the result.  From
    # pandas 2.0 the default prepends the group key, and the assignment would
    # then no longer align with ``data``.
    data[f'lag{lag}_roll_14_mean'] = data.groupby(['id'], group_keys=False)['TARGET'].apply(
        lambda s: s.shift(lag).rolling(28, min_periods=1).mean()
    )

    return data

def make_shift_data(data):
    """Add ``shift_7`` and ``shift_28``: ``TARGET`` of the same id 7 and 28 rows earlier.

    ``data`` is modified in place and returned.
    """
    for weeks in (1, 4):
        lag = 7 * weeks
        data[f'shift_{lag}'] = data.groupby(['id'])['TARGET'].shift(lag)

    return data


def fe(data):
    """Run the active feature builders on ``data`` and return the result.

    Order: lag-1 rolling means (windows 7 and 28), lagged rolling means
    (lags 28 and 7), then the plain 7/28-row shifts.  ``shift_diff_data`` is
    defined in this notebook but deliberately not called here.
    """
    for window in (7, 28):
        data = make_roll_data(data, window)

    for lag in (28, 7):
        data = make_lag_roll_data(data, lag)

    return make_shift_data(data)

### reduce_mem_usage

In [90]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    Only columns whose dtype is listed in ``numerics`` are touched.  A dtype
    "fits" when the column minimum and maximum lie strictly inside its range.
    Integer columns try int8 -> int16 -> int32 -> int64 and stay unchanged if
    none fits; other numeric columns try float16 -> float32 and otherwise
    become float64.  Prints the memory saving when ``verbose`` is true and
    returns ``df``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float64 is the fallback, e.g. when min/max are NaN.
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

### WRMSSEEvaluator

In [91]:
## evaluation metric
## from https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/133834 and edited to get scores at all levels
class WRMSSEEvaluator(object):
    """Weighted RMSSE scorer over the twelve aggregation levels in ``group_ids``.

    Construction precomputes, for every level ``i`` (1-based), the attributes
    ``lv{i}_scale``, ``lv{i}_train_df``, ``lv{i}_valid_df`` and
    ``lv{i}_weight``; ``score`` then evaluates a prediction frame against them.
    """

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame):
        """Precompute scales, aggregated targets and weights for every level.

        Parameters
        ----------
        train_df : wide frame with id columns and ``d_*`` columns.  NOTE: an
            ``all_id`` column is added to this object in place.
        valid_df : wide frame with the ``d_*`` columns to score against; the
            id columns are copied from ``train_df`` when missing.
        calendar : frame with at least ``d`` and ``wm_yr_wk``.
        prices : frame merged on ``item_id``, ``store_id``, ``wm_yr_wk`` that
            provides ``sell_price``.
        """
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        # Weights are computed from the last 28 day columns of the training frame.
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        train_df['all_id'] = 0  # for lv1 aggregation

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        # Grouping keys of the aggregation levels, in level order (lv1 .. lv12).
        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                # Drop everything before the first non-zero value, then take
                # the mean squared day-to-day difference.  For an all-zero row
                # argmax returns 0, so nothing is dropped and the scale is 0.
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            # Level weight = share of the summed weight values within this level.
            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        """Return the id columns plus value * sell_price for each weight column.

        Rows follow the order of ``self.train_df`` (re-selected by
        ``(item_id, store_id)`` and re-indexed positionally).
        """
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        # Long format so each day can be joined to its weekly price.
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        # Back to wide format, one column per day.
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        weight_df = weight_df.loc[zip(self.train_df.item_id, self.train_df.store_id), :].reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        """Return sqrt(mean squared error / scale) per aggregated series of level ``lv``.

        ``valid_preds`` must already be aggregated to that level.
        """
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> tuple:
        """Score predictions at every aggregation level.

        ``valid_preds`` must have the same shape as the validation target
        block; an ndarray is given the validation column names.

        Returns
        -------
        (group_ids, all_scores) : two lists of equal length -- the grouping
            key of each level and that level's weighted RMSSE sum.  The levels
            are not averaged here.
        """
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        # Attach the id columns (aligned on the row index) so predictions can be grouped.
        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        group_ids = []
        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            # weight * rmsse per series, summed over the level.
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            group_ids.append(group_id)
            all_scores.append(lv_scores.sum())

        return group_ids, all_scores

# LightGBM model utilities

In [92]:
# Earlier hyper-parameter set, kept only as an inert string literal for
# reference; it is never assigned or used.
"""PARAMS = {'boosting_type': 'gbdt', 'objective': 'tweedie', 'metric': 'rmse', 'max_bin': 100, 
          'n_estimators': 2000, 'boost_from_average': False, 'verbose': -1, 'random_state': 2020,
          'tweedie_variance_power': 1.141893486974509, 'subsample': 0.8710431222390667, 
          'subsample_freq': 0.5692738176797527, 'learning_rate': 0.10957379305366494, 'num_leaves': 8,  
          'feature_fraction': 0.45380044045308154, 'bagging_freq': 4, 'min_child_samples': 5, 
          'lambda_l1': 7.510525772813387e-06, 'lambda_l2': 4.1004528526443944e-07,
          'device_type':'cpu'}"""

# Active LightGBM parameters; train_predict_RE uses them as the default for
# its `params` argument.  Per the LightGBM parameter documentation, "sub_row"
# is an alias of bagging_fraction (row subsampling, applied every
# `bagging_freq` iterations).
PARAMS = {'boosting_type': 'gbdt', 
          'objective' : 'tweedie', 'tweedie_variance_power': 1.141893486974509,
          "metric" :"rmse", "force_row_wise" : True, "learning_rate" : 0.075, "sub_row" : 0.75, "bagging_freq" : 1,
          "lambda_l2" : 0.1, 'verbosity': 1, 'num_iterations' : 2500}

In [93]:
def plot_importance(models, col):
    """Return a frame of min-max scaled gain importances summed over ``models``.

    Despite the name nothing is plotted.  ``col`` supplies the feature names
    (column ``col``); column ``importance`` holds the scaled totals.
    """
    gain_total = np.zeros(len(col))
    for booster in models:
        gain_total += booster.feature_importance(importance_type='gain')

    return pd.DataFrame({'col': col, 'importance': minmax_scale(gain_total)})

def predict_cv(x_val, models):
    """Average the predictions of ``models`` on ``x_val``.

    Returns an array of ``len(x_val)`` values; all zeros when ``models`` is
    empty.
    """
    averaged = np.zeros(len(x_val))
    n_models = len(models)
    for fitted in models:
        averaged += fitted.predict(x_val) / n_models
    return averaged


def kmean_cluster(X):
    """Add a ``group`` column holding KMeans cluster labels for every row of ``X``.

    The model (60 clusters, on ``TARGET``, ``wday``, ``month``) is fitted on a
    small sample only: the validation part of the first GroupKFold split
    (groups = wday_month), and within it the validation part of the first
    stratified split.  Both loops ``break`` after one pass.  A distribution
    plot of the sampled ``TARGET`` values is shown and a length is printed as
    side effects.  ``X`` is modified in place and returned.

    NOTE(review): ``X.loc[val, ...]`` uses positional fold indices as labels,
    so ``X`` presumably needs a default 0..n-1 index -- confirm at the caller.
    """
    gk = GroupKFold(n_splits=10)
    group = X['wday'].astype(str)+'_'+X['month'].astype(str)
    for trn, val in gk.split(X,groups=group):
        tmp_X = X.loc[val, ['TARGET', 'wday', 'month']]
        km = KMeans(n_clusters=60, random_state=2020)
        k = StratifiedKFold(n_splits=10, random_state=2020, shuffle=True)
        tmp_X.reset_index(drop=True, inplace=True)
        # Stratify on the integer part of TARGET and keep only the first fold.
        for _trn, _val in k.split(tmp_X,y=tmp_X['TARGET'].astype(int)):
            sns.distplot(tmp_X.loc[_val,'TARGET'].values)
            plt.show()
            km.fit(tmp_X.loc[_val,['TARGET', 'wday', 'month']])
            # Prints the size of the outer fold (`val`), not of the fitted sample (`_val`).
            print(len(val))
            break
        break
    
    # Label every row of X with the model fitted on the sample.
    X['group'] = km.predict(X[['TARGET', 'wday', 'month']])
    
    return X

def train_predict_RE(data,path,params=PARAMS):
    """Train a 5-fold LightGBM ensemble and forecast the last 28 days recursively.

    The last 28 days found in ``data.d`` are the forecast days; all earlier
    days are used for training.  Forecasting is recursive: each forecast day
    is predicted in turn, its prediction overwrites ``TARGET``, and the
    features are rebuilt by ``fe`` before the next day.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format table with at least ``id``, ``d``, ``TARGET``, ``price``
        and the columns listed in ``cat_cols`` below.
    path : str
        Unused; kept so existing callers keep working.
    params : dict
        Parameters handed to ``lgb.train``.

    Returns
    -------
    pandas.DataFrame
        One row per id with columns ``F1`` .. ``F28`` holding
        ``TARGET / price`` for the forecast days.  Side effects: writes
        ``train_0_importance.csv`` and ``all_result.csv``.

    NOTE(review):
    * ``dropna`` below removes every training row with a NaN in *any* column,
      not only in the lag features.  If the event columns are NaN on days
      without an event, most rows are discarded -- confirm this is intended.
    * ``TARGET / price`` is not guarded against ``price == 0``.
    * ``early_stopping_rounds`` / ``verbose_eval`` are keyword arguments of
      older LightGBM releases; newer releases expect callbacks instead.
    """
    horizon = 28
    # fe() builds its deepest feature with make_lag_roll_data(data, 28): a
    # 28-row lag followed by a 28-row window.  Rebuilding that feature for a
    # forecast day therefore needs 56 rows of history in front of the
    # forecast block; keeping fewer silently shrinks the window at prediction
    # time and no longer matches what the model was trained on.
    max_lookback = 28 + 28

    days = sort_d_cols(sorted(data.d.unique()))
    tmp_days = days[-(horizon + max_lookback):]
    trn_days = days[:-horizon]
    val_days = days[-horizon:]

    for i in range(horizon):
        data = fe(data)
        if i==0:
            shift_cols = [col for col in data.columns if 'shift' in col]
            roll_cols = [col for col in data.columns if 'roll' in col]
            cat_cols = ['item_id', 'store_id', 'snap', 'snap_-1', 'snap_1',
                        'price', 'new_event_type_1_4', 'event_name_1', 'event_type_1',
                        'wday', 'month', 'year']
            features=cat_cols+shift_cols+roll_cols

            print(f' FEATIRE LEN {len(features)}')
            models=[]
            # One aligned mask instead of indexing the filtered frame with a
            # mask built on the unfiltered one.
            X = data[data.d.isin(trn_days) & data.TARGET.notnull()]
            # axis passed by keyword: pandas >= 2.0 no longer accepts it positionally.
            X = X.dropna(axis=0)
            X.reset_index(drop=True, inplace=True)
            k = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
            # Stratify on TARGET binned into 0..500 after min-max scaling.
            y = (500*minmax_scale(X['TARGET'])).astype(int)
            for trn, val in k.split(X,y=y):
                train_set = lgb.Dataset(X.loc[trn,features], X.loc[trn,'TARGET'])
                val_set = lgb.Dataset(X.loc[val,features], X.loc[val,'TARGET'])
                model = lgb.train(train_set=train_set, valid_sets=[train_set, val_set], params=params,
                                  early_stopping_rounds=100, verbose_eval=500, categorical_feature=cat_cols)
                models.append(model)
            # From here on only the forecast block and the history its features need.
            data = data[data.d.isin(tmp_days)]

            importance = plot_importance(models, features)
            importance.to_csv(f'train_{i}_importance.csv', index=False)

        val_day = val_days[i]
        predict_data = data[data.d==val_day]
        preds = predict_cv(predict_data[features], models)

        # Feed the prediction back so later days' lag features are built on it.
        data.loc[data.d==val_day, 'TARGET'] = preds

    sub = data[data.d.isin(val_days)][['id', 'd', 'TARGET', 'price']]
    sub.to_csv('all_result.csv', index=False)
    sub['TARGET'] = sub['TARGET']/sub['price']
    # Long -> wide: one row per id, one column per forecast day, in day order.
    sub = sub.groupby(['id'])[['d', 'TARGET']].apply(lambda x: x.set_index('d')['TARGET'].T)[val_days]
    sub.columns=[f'F{i}' for i in range(1,29)]

    return sub

In [94]:
def all_flow(path, d_cols, train_d_cols):
    """Run preprocessing, training and recursive forecasting; return a filled submission.

    The first 63 days of the preprocessed table are dropped before training.
    The forecasts are written into the rows of ``sample_submission.csv``
    whose id matches; every other row keeps its sample value.
    """
    # (An earlier, disabled variant built the features and selected a fixed
    # column list here before encoding.)
    data = str_category_2_int(preprocessing(path, d_cols, train_d_cols))

    ordered_days = sort_d_cols(data.d.unique().tolist())
    kept_days = ordered_days[63:]
    data = data[data.d.isin(kept_days)]
    gc.collect()

    mem = data.memory_usage().sum()/1024**2
    print(f"""
    DATA SHAPE   {data.shape}
    MEMORY USAGE   {mem:.2f}MB
    DATA COLUMNS  {data.columns.tolist()}
    """)

    gc.collect()
    sub = train_predict_RE(data=data, path=path)

    submission = pd.read_csv(path+'sample_submission.csv').set_index('id')
    submission.loc[sub.index, sub.columns] = sub.values

    return submission.reset_index()

In [None]:
%%time
# Directory holding the competition CSVs (must end with a path separator).
path = '../input/m5-forecasting-accuracy/'

# Day labels d_1 .. d_1969; the modelling window is the 530 days that end
# 56 days before d_1969.
d_cols=[f'd_{i+1}' for i in range(1969)]
train_d_cols = d_cols[:-56]
train_d_cols = train_d_cols[-530:]

sub = all_flow(path, d_cols, train_d_cols)
sub.to_csv('submission.csv', index=False)


# Inputs for the WRMSSE evaluator.  Each CSV is read exactly once.
df_train_full = pd.read_csv(path+"sales_train_evaluation.csv")
df_calendar = pd.read_csv(path+"calendar.csv")
df_prices = pd.read_csv(path+"sell_prices.csv")
df_sample_submission = pd.read_csv(path+"sample_submission.csv")
# Remember the submission row order so predictions can be re-sorted later.
df_sample_submission["order"] = range(df_sample_submission.shape[0])

# Hold out the 28 columns that precede the final 28 as the validation block;
# everything before them is the evaluator's training history.
df_train = df_train_full.iloc[:, :-56]
df_valid = df_train_full.iloc[:, -56:-56+28]

evaluator = WRMSSEEvaluator(df_train, df_valid, df_calendar, df_prices)

100%|██████████| 30490/30490 [00:28<00:00, 1079.57it/s]


Mem. usage decreased to 277.40 Mb (25.0% reduction)

    DATA SHAPE   (14238830, 19)
    MEMORY USAGE   2091.20MB
    DATA COLUMNS  ['id', 'd', 'TARGET', 'dept_id', 'cat_id', 'state_id', 'store_id', 'item_id', 'snap', 'price', 'new_event_name_1_4', 'new_event_type_1_4', 'wday', 'month', 'year', 'event_name_1', 'event_type_1', 'snap_-1', 'snap_1']
    


In [None]:
# This cell rewrites `sub` in memory only; submission.csv was already written
# by the previous cell and is not updated here.
#sub = pd.read_csv('../input/create-m5-all-flow-11/submission.csv')
validation = sub[sub.id.str.contains('_validation')]
evaluation = sub[sub.id.str.contains('_evaluation')]
# Rename the evaluation rows to their *_validation ids and order them like
# the validation rows.
evaluation.id = evaluation.id.str.replace('_evaluation', '_validation')
evaluation = evaluation.set_index('id').loc[validation.id]
# Copy the forecasts into the validation rows.  The assignment aligns on the
# row index, so it relies on the validation rows being labelled 0..n-1 in
# `sub` -- TODO confirm against the sample submission layout.
sub[sub.id.str.contains('_validation')] = evaluation.reset_index()
# Zero out the evaluation rows.
sub.loc[sub.id.str.contains('_evaluation'),[f'F{i}' for i in range(1,29)]]=0

preds_valid = sub#pd.read_csv("../input/submission-m5-ver3/submission.csv")
preds_valid = preds_valid[preds_valid.id.str.contains("validation")]
# Restore sample-submission order, then drop the helper columns.
preds_valid = preds_valid.merge(df_sample_submission[["id", "order"]], on = "id").sort_values("order").drop(["id", "order"], axis = 1)
# F1..F28 -> the last 28 training day labels, which the evaluator expects as column names.
preds_valid.rename(columns = {f'F{i+1}':d for i, d in enumerate(train_d_cols[-28:])}, inplace = True)

# score() returns one weighted RMSSE per aggregation level.
groups, scores = evaluator.score(preds_valid)

# Overall score = plain mean over the levels.
score_public_lb = np.mean(scores)

for i in range(len(groups)):
    print(f"Score for group {groups[i]}: {round(scores[i], 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")