In [1]:
import warnings
warnings.filterwarnings("ignore")


import numpy as np
import pandas as pd
import datetime
from catboost import CatBoostClassifier
import lightgbm as lgb
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

import ast

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
from sklearn import linear_model

from tqdm import tqdm_notebook as tqdm
import gc, pickle

import datetime
from catboost import CatBoostClassifier
from time import time
from collections import Counter
from scipy import stats

from sklearn import preprocessing

In [3]:
def create_is_sell_data(sell_prices_df, calendar_df, train_df):
    sell_prices_df['id'] = sell_prices_df['item_id'].astype('str')+'_'+sell_prices_df['store_id']+'_validation'
    sell_prices_data = sell_prices_df[sell_prices_df.wm_yr_wk.isin(calendar_df.wm_yr_wk.unique())]
    sell_prices_data.reset_index(drop=True, inplace=True)
    tmp = sell_prices_data.groupby(['id'])[['wm_yr_wk', 'sell_price']].apply(
        lambda x: x.set_index('wm_yr_wk')['sell_price'].to_dict()
    ).to_dict()
    d = calendar_df.d
    wm_yr_wk = calendar_df.wm_yr_wk
    price_data = {}
    for col in tqdm(train_df.id.unique()):
        price_data[col] = wm_yr_wk.map(tmp[col])
    price_data = pd.DataFrame(price_data)
    price_data.index = d
    is_sell = price_data.notnull().astype(float).T
    price_data = price_data.fillna(0)
    
    is_sell.index=train_df.id
    train_df.index=train_df.id
    is_sell = pd.concat([
        train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']], is_sell
    ], axis=1)
    price_data = pd.concat([
        train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']], price_data.T  
    ], axis=1)
    
    return price_data, is_sell

def set_index(df, name):
    d = {}
    for col, value in df.iloc[0,:].items():
        if type(col)==str:
            if type(df[col].values[0])!=str:
                v = 'd'
            else:
                v='id'
        else:
            v=name
        d[col]=v
    return d

def dcol2int(col):
    if col[:2]=='d_':
        return int(col.replace('d_', ''))
    else:
        return col

In [4]:
def create_category_data(data, dfT):
    C = ['item_id', 'cat_id', 'dept_id', 'store_id', 'state_id']
    tmp = dfT[C]
    tmp = tmp.to_dict()
    for key, value in tmp.items():
        data[key] = data.id.map(value)
    return data
def create_calendar_data_dict(calendar_df, use_d_cols, _state):
    calendar_data_dict = {}
    calendar_data_dict.update(
        calendar_df[calendar_df.d.isin(use_d_cols)][['wday', 'month', f'snap_{_state}']].to_dict()
    )
    
    for shift in [0,1]:
        tmp = calendar_df[calendar_df.d.isin(use_d_cols)][[f'snap_{_state}']].shift(shift)
        tmp=tmp.rename(columns={f'snap_{_state}':f'snap_{_state}_shift{shift}'})
        calendar_data_dict.update(tmp.to_dict())
        
    return calendar_data_dict
def create_groupby_price_data(data, store_price_data, use_d_cols, groups=['dept_id', 'cat_id']):
    for group in groups:
        tmp = store_price_data[use_d_cols]/store_price_data.groupby([group])[use_d_cols].transform('mean')
        tmp = tmp.T.astype(float)
        tmp = tmp.rolling(7, min_periods=1).mean().stack(dropna=False).reset_index()
        tmp = tmp.rename(columns=set_index(tmp, f'price_mean_{group}'))
        data = pd.merge(data, tmp, on=['d', 'id'], how='left')
    
    return data

In [5]:
def create_event_data(data, name):
    a = pd.concat([
        train_df.groupby(['item_id'])[d_cols].agg('mean').T.astype(float),
        calendar_df[name]
    ],axis=1)
    
    cols = a.columns[:-1]
    a[name]=a[name].fillna('NAN')
    a[cols] = a.groupby([name])[cols].transform(
        lambda x:x.sort_index().rolling(len(x), min_periods=1).mean().shift(1)
    )/train_df.groupby(['item_id'])[d_cols].agg('mean').T.astype(float).shift(1).rolling(100, min_periods=1).mean()[cols]
    
    a.loc[a[name]=='NAN', cols]=0
    a.loc[a[name]=='NAN', name]=np.nan
    a.drop(name, axis=1, inplace=True)
    
    for shift in [-3,-2,-1,0,1,2,3]:
        tmp_a = a.shift(shift).stack().reset_index()
        tmp_a = tmp_a.rename(columns=set_index(tmp_a, 'value'))
        tmp_a.index = tmp_a.id.astype(str)+'_'+tmp_a.d.astype(str)
        tmp_a = tmp_a[tmp_a.index.isin(data.d.unique().tolist())]
        tmp_a.drop(columns=['d', 'id'], inplace=True)
        data[f'{name}_shift{shift}'] = (data.item_id.astype(str)+'_'+data.d.astype(str)).map(tmp_a['value']).apply(
            lambda x: float(x) if float(x)>0 else 0
        )
    del a, tmp_a; gc.collect()
    return data

In [16]:
def create_roll_data(data, df, wins, use_d_cols):
    for win in wins:
        for agg in ['mean', 'std', 'skew']:
            tmp = df.rolling(window=win, min_periods=1).agg(agg).loc[use_d_cols].stack(dropna=False).reset_index()
            tmp = tmp.rename(columns=set_index(tmp, f'_diff_roll{win}_{agg}'))
            data = pd.merge(data, tmp, on=['d', 'id'], how='left')
    return data

def create_diff_data(data, df, i, wins, use_d_cols, name=None):
    a = df.diff(i)
    for win in wins:
        #for agg in ['mean', 'std', 'skew']:
        for agg in ['mean']:
            tmp = a.rolling(window=win, min_periods=1).agg(agg).loc[use_d_cols].stack(dropna=False).reset_index()
            if name is None:
                tmp = tmp.rename(columns=set_index(tmp, f'diff_{i}_roll{win}_{agg}'))
            else:
                tmp = tmp.rename(columns=set_index(tmp, f'{name}_diff_{i}_roll{win}_{agg}'))
            data = pd.merge(data, tmp, on=['d', 'id'], how='left')
    a = abs(a)
    for win in wins:
        #for agg in ['mean', 'std', 'skew']:
        for agg in ['mean']:
            tmp = a.rolling(window=win, min_periods=1).agg(agg).loc[use_d_cols].stack(dropna=False).reset_index()
            if name is None:
                tmp = tmp.rename(columns=set_index(tmp, f'abs_diff_{i}_roll{win}_{agg}'))
            else:
                tmp = tmp.rename(columns=set_index(tmp, f'{name}_abs_diff_{i}_roll{win}_{agg}'))
            data = pd.merge(data, tmp, on=['d', 'id'], how='left')
    return data

def create_group_sell_data(data, dfT, use_d_cols, groups=['dept_id', 'cat_id']):
    for group in groups:
        tmp = dfT[use_d_cols]/ dfT.groupby([group])[use_d_cols].transform('mean')
        tmp = tmp.T.astype(float)
        data = create_diff_data(data, tmp, 7, wins=[360, 30, 28, 7], use_d_cols=use_d_cols, name=f'By_{group}')
        data = create_diff_data(data, tmp, 1, wins=[360, 7], use_d_cols=use_d_cols, name=f'By_{group}')
        data = create_diff_data(data, tmp, 28, wins=[360, 7], use_d_cols=use_d_cols, name=f'By_{group}')
    return data


def create_shift_data(data, df, shifts, use_d_cols, name=None):
    tmp_df = df[data.id.unique()]+df[data.id.unique()].shift(7)+df[data.id.unique()].shift(14)+df[data.id.unique()].shift(21)
    
    for i, shift in enumerate(shifts):
        tmp = tmp_df.shift(shift).loc[use_d_cols].stack(dropna=False).reset_index()
        if name is None:
            tmp = tmp.rename(columns=set_index(tmp, f'shift_no{i+1}'))
        else:
            tmp = tmp.rename(columns=set_index(tmp, f'{name}_shift_no{i+1}'))
        data = pd.merge(data, tmp, on=['d', 'id'], how='left')
    return data

def create_group_shift_data(data, df, dfT, shifts, use_d_cols):
    for group in ['dept_id', 'cat_id']:
        _tmp = dfT.groupby([group])[use_d_cols].transform('mean').T.astype(float)
        data = create_shift_data(data, _tmp, shifts, use_d_cols, name=f'By_{group}')
    return data

In [17]:
def make_input(train_d_cols, _state):
    use_d_cols = [i for i in range(train_d_cols[0]-380,train_d_cols[0])]+train_d_cols
    ids = train_df[train_df.state_id==_state].id.unique().tolist()
    
    df = train_df.loc[ids,:]
    df = df.T.loc[d_cols]
    df = pd.DataFrame(df.values.astype(float), index=d_cols, columns=df.columns)

    df['d']=df.index
    calendar_dict = calendar_df[[
        'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'd', 'event_name_1',
        'event_type_1', 'event_name_2', 'event_type_2',f'snap_{_state}'
    ]].set_index('d').to_dict()

    for key, value in calendar_dict.items():
        df[key] = df['d'].map(value)
    df.drop('d', axis=1, inplace=True)
    
    store_price_data = price_data.loc[ids,:]
    store_is_sell = is_sell.loc[ids,:]
    store_price_data[[col for col in store_price_data.columns if type(col)!=str]] =\
            store_price_data[[col for col in store_price_data.columns if type(col)!=str]].replace(0, np.nan)
    
    
    df = df.loc[use_d_cols]
    dfT = train_df.loc[ids,['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']+use_d_cols]
    store_is_sell = store_is_sell[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']+use_d_cols]
    store_price_data = store_price_data[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']+use_d_cols]
    
    data = df.loc[train_d_cols, ids].stack(dropna=False).reset_index().rename(columns={'level_0':'d', 'id':'id', 0:'TARGET'})
    
    
    data = fe(data, ids, df, dfT, store_price_data, calendar_df, use_d_cols, train_d_cols, _state)
    
    return data

def fe(data, ids, df, dfT, store_price_data, calendar_df, use_d_cols, train_d_cols, _state):
    data = create_category_data(data, dfT)
    
    calendar_data_dict = create_calendar_data_dict(calendar_df, use_d_cols, _state)
    for key, values in calendar_data_dict.items():
        data[key] = data.d.map(values)
        
        
    for name in ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']:
        data = create_event_data(data, name)
    
    data = create_groupby_price_data(data, store_price_data, use_d_cols)
    
    data = create_diff_data(data, df[ids], 7, wins=[30], use_d_cols=train_d_cols)
    data = create_diff_data(data, df[ids], 1, wins=[360, 30, 7], use_d_cols=train_d_cols)
    data = create_diff_data(data, df[ids], 28, wins=[30], use_d_cols=train_d_cols)
    #data = create_roll_data(data, df[ids], [7,28,56], use_d_cols)
    
    categories = [c for c in data.columns if data[c].dtype==object]
    print(categories)
    for c in categories:
        if c=='id':
            pass
        else:
            data[c] = pd.factorize(data[c])[0]
    
    return data, df, dfT

In [18]:
params = {
            'n_estimators':2000,
            'boosting_type': 'gbdt',
            'objective': 'poisson',
            'metric': 'rmse',
            'subsample': 0.75,
            'subsample_freq': 1,
            'learning_rate': 0.07,
            'feature_fraction': 0.85,
            'max_depth': 15,
            'lambda_l1': 1,  
            'lambda_l2': 1,
            'verbose': 100,
            'random_state':123,
            }

def train(data, df, dfT):
    train_d_cols = data.d.unique().tolist()
    use_d_cols = [i for i in range(train_d_cols[0]-380,train_d_cols[0])]+train_d_cols
    log = []
    for for_predict in range(28):
        print("""
        ###########################################
         ############## {} - {}  ##############
        ###########################################""".format(_state, for_predict+1))
        
        g_for_predict = for_predict//7 + 1
        shifts = [7*i for i in [g_for_predict, g_for_predict+2, g_for_predict+3, g_for_predict+4]]
        if for_predict==0:
                  split=int(0.3*len(train_d_cols[shifts[0]:]))
        
        if for_predict%7==0:
            if for_predict>0:
                data.drop(columns=shift_cols, inplace=True)
            data = create_shift_data(data, df, shifts, use_d_cols)
            data = create_group_shift_data(data, df, dfT, shifts, use_d_cols)
                
        
        diff_cols = [col for col in data.columns if ('diff' in col)]
        shift_cols = [col for col in data.columns if ('shift' in col) and (not 'diff' in col)and (not 'snap' in col)]
        data = pd.concat([
            data.drop(columns=diff_cols),
            data[diff_cols+['id']].groupby(['id']).transform(
                lambda x: x.sort_index().shift(1)
            )
        ], axis=1)
        
        data =pd.concat([
            data['id'],
            data[[c for c in data.columns if c!='id']].astype(float)
        ], axis=1)
        
        y = data[data.TARGET.notnull()][['TARGET']+['d', 'id']]
        X = data[data.TARGET.notnull()].drop(columns=['id',  'TARGET', 'item_id','state_id']).astype(float)
        
        
        split=int(0.3*len(train_d_cols[shifts[0]:]))
        x_train, x_val = X[X.d.isin(train_d_cols[shifts[0]:-split])], X[X.d.isin(train_d_cols[-split:])]
        y_train, y_val = y[y.d.isin(train_d_cols[shifts[0]:-split])], y[y.d.isin(train_d_cols[-split:])]
        
        #x_train, x_val = X[X.d.isin(train_d_cols[:-3])], X[X.d.isin(train_d_cols[-3:])]
        #y_train, y_val = y[y.d.isin(train_d_cols[:-3])], y[y.d.isin(train_d_cols[-3:])]
        
        x_train.drop('d', axis=1, inplace=True)
        x_val.drop('d', axis=1, inplace=True)
        y_train = y_train['TARGET'].astype(float)
        y_val = y_val['TARGET'].astype(float)
        if for_predict==0:
            print(x_train.shape)
            print(x_val.shape)
            
        train_set = lgb.Dataset(x_train, np.log1p(y_train))
        val_set = lgb.Dataset(x_val, np.log1p(y_val))
        
        categories = ['cat_id', 'dept_id', 'store_id']
        
        model = lgb.train(
                    train_set=train_set, 
                    valid_sets=[train_set, val_set],
                    params=params, num_boost_round=3000, early_stopping_rounds=100, verbose_eval=500,
            categorical_feature=categories+['wday', 'month']
        )
        
        
        pred = model.predict(x_train)
        pred = np.e**(pred)-1
        print(f'TRAIN    {np.sqrt(mean_squared_error(pred, y_train))}')
        
        pred = model.predict(x_val)
        pred = np.e**(pred)-1
        print(f'VAL    {np.sqrt(mean_squared_error(pred, y_val))}')
        
        importance = pd.DataFrame()
        importance['importance'] = model.feature_importance(importance_type='gain')
        importance['importance'] = preprocessing.minmax_scale(importance.importance)
        importance['columns'] = x_val.columns
        log.append([importance])
        gc.collect()
        del train_set, val_set, X, x_train, x_val, y_train, y_val;gc.collect()
    gc.collect()
    
    with open(f'train_log_{_state}.pickle', 'wb') as f:
        pickle.dump(log, f)
        
def plot_importance(model, col, num):
    importance = pd.DataFrame()
    importance['importance'] = model.feature_importance(importance_type='gain')
    importance['importance'] = preprocessing.minmax_scale(importance.importance)
    importance.index = col
    importance = importance['importance']

    plt.figure(figsize=(10,30))
    importance.sort_values( ascending=True).plot('barh')
    plt.title(f'for {num}')
    
def train_predict(data, df, dfT, _store):
    train_d_cols = data[data.TARGET.notnull()].d.unique().tolist()
    use_d_cols = [i for i in range(train_d_cols[0]-380,train_d_cols[0])]+data.d.unique().tolist()
    ids = data.id.unique().tolist()
    
    sub_df = pd.DataFrame()
    #sub_df.index = ids
    models = {}
    for for_predict in range(28):
        print("""
        ############################################
         ############## {} - {}  ##############
        ############################################""".format(_state,for_predict+1))
        
        g_for_predict = for_predict//7 + 1
        shifts = [7*i for i in [g_for_predict, g_for_predict+3]]
        
        if for_predict%7==0:
            if for_predict>0:
                data.drop(columns=shift_cols, inplace=True)
                
            data = create_shift_data(data, df, shifts, use_d_cols)
            data = create_group_shift_data(data, df, dfT, shifts, use_d_cols)
                
        
        diff_cols = [col for col in data.columns if ('diff' in col)]
        shift_cols = [col for col in data.columns if ('shift' in col) and (not 'diff' in col)and (not 'snap' in col)]
        data = pd.concat([
            data.drop(columns=diff_cols),
            data[diff_cols+['id']].groupby(['id']).transform(
                lambda x: x.sort_index().shift(1)
            )
        ], axis=1)
        
        data =pd.concat([
            data['id'],
            data[[c for c in data.columns if c!='id']].astype(float)
        ], axis=1)
        
        sub_df = pd.concat([
            sub_df,
            data[data.d==TARGET_D_COLS[for_predict]]
        ],axis=0)
        
        y = data[data.TARGET.notnull()][['TARGET']+['d']].astype(float)
        X = data[data.TARGET.notnull()].drop(columns=['id', 'TARGET', 'item_id','state_id']).astype(float)
        
        split=int(0.3*len(train_d_cols[shifts[0]:]))
        x_train, x_val = X[X.d.isin(train_d_cols[shifts[0]:-split])], X[X.d.isin(train_d_cols[-split:])]
        y_train, y_val = y[y.d.isin(train_d_cols[shifts[0]:-split])], y[y.d.isin(train_d_cols[-split:])]
        
        #x_train, x_val = X[X.d.isin(train_d_cols[:-3])], X[X.d.isin(train_d_cols[-3:])]
        #y_train, y_val = y[y.d.isin(train_d_cols[:-3])], y[y.d.isin(train_d_cols[-3:])]
        
        x_train.drop('d', axis=1, inplace=True)
        x_val.drop('d', axis=1, inplace=True)
        y_train = y_train['TARGET']
        y_val = y_val['TARGET']
        if for_predict==0:
            print(x_train.shape)
            print(x_val.shape)
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)
        
        categories = ['cat_id', 'dept_id', 'store_id']
        
        model = lgb.train(
                    train_set=train_set, 
                    valid_sets=[train_set, val_set],
                    params=params, num_boost_round=3000, early_stopping_rounds=100, verbose_eval=500,
            categorical_feature=categories+['wday', 'month']
        )
        
        models[f'{_state}_model_{for_predict+1}']=model
        
        plot_importance(model, x_train.columns, for_predict+1)
        gc.collect()
        
        
        del train_set, val_set, X, y, x_train, x_val, y_train, y_val;gc.collect()
    
    gc.collect()
    
    sub_df.reset_index(drop=True, inplace=True)
    sub_df.to_pickle(f'{_state}_csv.pickle')
    with open(f'models_{_state}.pickle', 'wb') as f:
        pickle.dump(models, f)

In [9]:
%%time
#path = '/Users/kanoumotoharu/Downloads/m5-forecasting-accuracy/'
#path = '/Users/abcdm/Downloads/m5-forecasting-accuracy/'
path = '../input/m5-forecasting-accuracy/'

train_df = pd.read_csv(path+'sales_train_validation.csv')
calendar_df = pd.read_csv(path+'calendar.csv')
sell_prices_df = pd.read_csv(path+'sell_prices.csv')
sample_submission_df = pd.read_csv(path+'sample_submission.csv')


calendar_df['d'] = calendar_df.d.str.replace('d_', '').astype(int)
cols = train_df.columns
cols = [dcol2int(col) for col in cols]
train_df.columns=cols
calendar_df['date']=pd.to_datetime(calendar_df.date)
calendar_df.index = calendar_df.d
price_data, is_sell = create_is_sell_data(sell_prices_df, calendar_df, train_df)

d_cols = [ col for col in train_df.columns if type(col)!=str ]
for i in range(1,29):
    train_df[d_cols[-1]+i]=np.nan
d_cols = [ col for col in train_df.columns if type(col)!=str ]

train_df = pd.concat([
    train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']],
    train_df[d_cols]+is_sell[d_cols].replace(0, np.nan).replace(1, 0)
], axis=1)

HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


CPU times: user 1min, sys: 6.11 s, total: 1min 6s
Wall time: 1min 6s


In [10]:
#_state = 'CA'
#lag=28
#train_d_cols = d_cols[-(lag+60+56):-lag]
#data, df, dfT = make_input(train_d_cols, _state)
#train(data, df, dfT)

['id', 'item_id', 'cat_id', 'dept_id', 'store_id', 'state_id']


In [None]:
#train(data, df, dfT)


        ###########################################
         ############## CA - 1  ##############
        ###########################################
(938804, 60)
(390272, 60)
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.481739	valid_1's rmse: 0.498395
[1000]	training's rmse: 0.477831	valid_1's rmse: 0.498122
Early stopping, best iteration is:
[1153]	training's rmse: 0.476766	valid_1's rmse: 0.498057
TRAIN    2.0793618724227554


In [None]:
#(939092, 83)
#(390272, 83)
#train(data, df, dfT)

In [None]:
#data[data.d.isin(train_d_cols[-7:])].shape

In [None]:
#a = train(data[data.d.isin(train_d_cols[-7:])], df, dfT)

In [None]:
#data[data.d.isin(train_d_cols[-7:][:-3])]['TARGET']

In [None]:
#np.sqrt(mean_squared_error(a, data[data.d.isin(train_d_cols[-7:][:-3])]['TARGET']))

In [None]:
#plt.hist(a)

In [None]:
#lag=28
#TARGET_D_COLS = d_cols[-lag:]

#train_d_cols = d_cols[-(lag+160+56):-lag]#+TARGET_D_COLS

In [None]:
def run_all(_state):
    lag=28
    train_d_cols = d_cols[-(lag+160+56):-lag]#+TARGET_D_COLS
    data, df, dfT = make_input(train_d_cols, _state)
    train(data, df, dfT)
    
for _state in ['CA', 'WI', 'TX']:
    run_all(_state)
    gc.collect()

In [None]:
#for _store in ['TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']:
 #   run_all(_store)
  #  gc.collect()

In [None]:
#'CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3'
#_store='CA_1'
#data, df, dfT = make_input(train_d_cols, _store)
#train(data, df, dfT)