In [1]:
import warnings
warnings.filterwarnings("ignore")


import numpy as np
import pandas as pd
import datetime
from catboost import CatBoostClassifier
import lightgbm as lgb
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

import ast

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import linear_model

from tqdm import tqdm_notebook as tqdm
import gc, pickle

import datetime
from catboost import CatBoostClassifier
from time import time
from collections import Counter
from scipy import stats

from sklearn import preprocessing

In [2]:
def create_is_sell_data(sell_prices_df, calendar_df, train_df):
    """Build wide per-item price and availability tables aligned to the calendar.

    Side effects (later cells of this notebook read both):
      * an ``id`` column (``<item_id>_<store_id>_validation``) is added to
        ``sell_prices_df``;
      * ``train_df`` is re-indexed by its ``id`` column.

    Parameters
    ----------
    sell_prices_df : DataFrame
        Needs ``item_id``, ``store_id``, ``wm_yr_wk`` and ``sell_price`` columns.
    calendar_df : DataFrame
        Needs ``d`` and ``wm_yr_wk`` columns.
    train_df : DataFrame
        Needs ``id`` plus the five hierarchy columns.

    Returns
    -------
    (price_data, is_sell) : tuple of DataFrame
        Both hold the hierarchy columns followed by one column per calendar
        ``d``. ``price_data`` carries the weekly price (0 where none exists);
        ``is_sell`` carries 1.0 where a price exists and 0.0 otherwise.
    """
    hierarchy_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    sell_prices_df['id'] = sell_prices_df['item_id'].astype('str')+'_'+sell_prices_df['store_id']+'_validation'

    # Keep only price rows whose week appears in the calendar.
    in_calendar = sell_prices_df.wm_yr_wk.isin(calendar_df.wm_yr_wk.unique())
    sell_prices_data = sell_prices_df[in_calendar].reset_index(drop=True)

    # {id: {wm_yr_wk: sell_price}}
    week_to_price = sell_prices_data.groupby(['id'])[['wm_yr_wk', 'sell_price']].apply(
        lambda grp: grp.set_index('wm_yr_wk')['sell_price'].to_dict()
    ).to_dict()

    # One column per item, one row per calendar day (price of that day's week).
    weeks = calendar_df.wm_yr_wk
    price_data = pd.DataFrame({
        item: weeks.map(week_to_price[item]) for item in tqdm(train_df.id.unique())
    })
    price_data.index = calendar_df.d

    # Availability must be derived before the missing prices are zero-filled.
    is_sell = price_data.notnull().astype(float).T
    price_data = price_data.fillna(0)

    is_sell.index = train_df.id
    train_df.index = train_df.id

    id_frame = train_df[hierarchy_cols]
    is_sell = pd.concat([id_frame, is_sell], axis=1)
    price_data = pd.concat([id_frame, price_data.T], axis=1)

    return price_data, is_sell

def set_index(df, name):
    """Derive a column-rename mapping from the first row of ``df``.

    For every column label:
      * a non-``str`` label maps to ``name``;
      * a ``str`` label whose first-row value is a ``str`` maps to ``'id'``;
      * any other ``str`` label maps to ``'d'``.

    Returns the mapping as a dict suitable for ``DataFrame.rename(columns=...)``.
    """
    first_row = df.iloc[0, :]
    mapping = {}
    for label, cell in first_row.items():
        if type(label) is not str:
            mapping[label] = name
        elif type(cell) is str:
            mapping[label] = 'id'
        else:
            mapping[label] = 'd'
    return mapping

def dcol2int(col):
    """Convert a day label such as ``'d_12'`` to the integer ``12``.

    Any label that is not a string starting with ``'d_'`` is returned
    unchanged. Non-string labels (for example a day column that has already
    been converted to ``int``) are passed through instead of raising
    ``TypeError`` on the slice, so applying the conversion twice is harmless.
    """
    if isinstance(col, str) and col[:2] == 'd_':
        return int(col.replace('d_', ''))
    return col

In [3]:
def c_diff_data(data, _diff, wins):
    """Add per-item rolling means of the ``_diff``-row difference of TARGET.

    For each window ``win`` in ``wins`` two columns are written to ``data``
    (which is modified in place and also returned):
      * ``diff{_diff}_{win}``     - rolling mean of the difference;
      * ``abs_diff{_diff}_{win}`` - rolling mean of its absolute value.

    A scratch column named ``tmp`` is created and removed again.
    """
    def _lagged_change(series):
        return series.sort_index().diff(_diff)

    data['tmp'] = data.groupby(['id'])['TARGET'].transform(_lagged_change)

    for win in wins:
        signed_name = f'diff{_diff}_{win}'
        magnitude_name = f'abs_diff{_diff}_{win}'
        data[signed_name] = data.groupby(['id'])['tmp'].transform(
            lambda s: s.rolling(win, min_periods=1).mean()
        )
        data[magnitude_name] = data.groupby(['id'])['tmp'].transform(
            lambda s: s.abs().rolling(win, min_periods=1).mean()
        )

    del data['tmp']
    return data

def c_event_data(data, _state):
    """Add per-item lagged copies of the event and SNAP columns.

    Each of the four event columns and ``snap_{_state}`` is shifted by 1, 2, 3
    and 7 rows within every ``id``; the result is stored as
    ``'{lag}_{column}'``. ``data`` is modified in place and returned.
    """
    source_cols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', f'snap_{_state}']
    lags = (1, 2, 3, 7)

    for source in source_cols:
        per_item = data.groupby('id')[source]
        for lag in lags:
            target_name = f'{abs(lag)}_{source}'
            data[target_name] = per_item.transform(
                lambda s: s.sort_index().shift(lag)
            )
    return data


def c_diffshift_data(data, wins):
    """Add per-item rolling mean / std / skew of TARGET.

    For every window in ``wins`` the columns ``diffmean_roll{win}``,
    ``diffstd_roll{win}`` and ``diffskew_roll{win}`` are written to ``data``,
    which is modified in place and returned.
    """
    statistics = ['mean', 'std', 'skew']
    for win in wins:
        for agg in statistics:
            column = f'diff{agg}_roll{win}'
            per_item_target = data.groupby(['id'])['TARGET']
            data[column] = per_item_target.transform(
                lambda s: s.sort_index().rolling(win, min_periods=1).agg(agg)
            )
    return data


def c_diff_data_g(data, _diff, wins, group):
    """Group-level counterpart of ``c_diff_data``.

    TARGET is first averaged over all rows sharing the same ``d`` and
    ``group`` values; that aggregate is differenced by ``_diff`` rows within
    each ``id`` and rolling means of the difference and of its absolute value
    are stored as ``diff{_diff}_{win}_by{name}`` / ``abs_diff{_diff}_{win}_by{name}``
    where ``name`` is the group keys joined by underscores.

    ``group`` must be a list of column names. ``data`` is modified in place
    (a scratch ``tmp`` column is created and removed) and returned.
    """
    data['tmp'] = data.groupby(['d']+group)['TARGET'].transform('mean')
    data['tmp'] = data.groupby(['id'])['tmp'].transform(
        lambda s: s.sort_index().diff(_diff)
    )

    suffix = '_'.join(key.replace(',', '_') for key in group)

    def _rolling_mean(series, window):
        return series.sort_index().rolling(window, min_periods=1).mean()

    for win in wins:
        data[f'diff{_diff}_{win}_by{suffix}'] = data.groupby(['id'])['tmp'].transform(
            lambda s: _rolling_mean(s, win)
        )
        data[f'abs_diff{_diff}_{win}_by{suffix}'] = data.groupby(['id'])['tmp'].transform(
            lambda s: _rolling_mean(abs(s), win)
        )

    del data['tmp']
    return data

def c_diffshift_data_g(data, wins, group):
    """Group-level counterpart of ``c_diffshift_data``.

    TARGET is averaged over all rows sharing the same ``d`` and ``group``
    values, then rolling mean / std / skew of that aggregate are computed
    within each ``id`` and stored as ``diff{agg}_roll{win}_by{name}``, where
    ``name`` is the group keys joined by underscores.

    ``group`` must be a list of column names. ``data`` is modified in place
    (a scratch ``tmp`` column is created and removed) and returned.
    """
    data['tmp'] = data.groupby(['d']+group)['TARGET'].transform('mean')

    suffix = '_'.join(key.replace(',', '_') for key in group)
    statistics = ['mean', 'std', 'skew']

    for win in wins:
        for agg in statistics:
            column = f'diff{agg}_roll{win}_by{suffix}'
            per_item_level = data.groupby(['id'])['tmp']
            data[column] = per_item_level.transform(
                lambda s: s.sort_index().rolling(win, min_periods=1).agg(agg)
            )

    del data['tmp']
    return data

In [4]:
def c_shift_data(data, shifts):
    """Add per-item lagged copies of the ``todel_shift`` column.

    The ``i``-th entry of ``shifts`` produces column ``shift_no{i}``, holding
    ``todel_shift`` shifted by that many rows within each ``id``.
    ``data`` is modified in place and returned.
    """
    for position in range(len(shifts)):
        lag = shifts[position]
        per_item = data.groupby(['id'])['todel_shift']
        data[f'shift_no{position}'] = per_item.transform(
            lambda s: s.sort_index().shift(lag)
        )
    return data
def c_shift_data_g(data, group, shifts):
    """Group-level counterpart of ``c_shift_data``.

    ``todel_shift`` is averaged over all rows sharing the same ``d`` and
    ``group`` values; the ``i``-th entry of ``shifts`` then produces column
    ``shift_no{i}_by{name}`` holding that aggregate shifted by that many rows
    within each ``id``.

    ``name`` is the group keys joined by underscores, matching the naming used
    by the other ``*_g`` helpers. The previous implementation interpolated the
    list itself, giving names like ``shift_no0_by['store_id', 'dept_id']``;
    brackets, quotes and commas in feature names are rejected by recent
    LightGBM releases (NOTE(review): confirm against the installed version).

    ``group`` must be a list of column names. ``data`` is modified in place
    (a scratch ``tmp`` column is created and removed) and returned.
    """
    data['tmp'] = data.groupby(['d']+group)['todel_shift'].transform('mean')
    name = '_'.join(group)
    for i, shift in enumerate(shifts):
        data[f'shift_no{i}_by{name}'] = data.groupby(['id'])['tmp'].transform(
            lambda x: x.sort_index().shift(shift)
        )
    del data['tmp']
    return data

In [11]:
%%time
#path = '/Users/kanoumotoharu/Downloads/m5-forecasting-accuracy/'
path = '/Users/abcdm/Downloads/m5-forecasting-accuracy/'
#path = '../input/m5-forecasting-accuracy/'

train_df = pd.read_csv(path+'sales_train_validation.csv')
calendar_df = pd.read_csv(path+'calendar.csv')
sell_prices_df = pd.read_csv(path+'sell_prices.csv')
sample_submission_df = pd.read_csv(path+'sample_submission.csv')

calendar_df['d'] = calendar_df.d.str.replace('d_', '').astype(int)
cols = train_df.columns
cols = [dcol2int(col) for col in cols]
train_df.columns=cols
calendar_df['date']=pd.to_datetime(calendar_df.date)
calendar_df.index = calendar_df.d
price_data, is_sell = create_is_sell_data(sell_prices_df, calendar_df, train_df)

d_cols = [ col for col in train_df.columns if type(col)!=str ]
for i in range(1,29):
    train_df[d_cols[-1]+i]=np.nan
d_cols = [ col for col in train_df.columns if type(col)!=str ]

train_df = pd.concat([
    train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']],
    train_df[d_cols]+is_sell[d_cols].replace(0, np.nan).replace(1, 0)
], axis=1)

HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))


Wall time: 34.8 s


In [12]:
# Run a garbage-collection pass; the value returned by gc.collect() is shown as the cell output.
gc.collect()

20

In [13]:
# Number of most recent labelled days that train() splits into fit / validation.
t=60
# Days kept in the long frame: 360 + t days of history, excluding the last 28
# entries of d_cols (the empty day columns appended when the data was loaded).
# NOTE(review): the 360 presumably provides warm-up for the longest rolling window - confirm.
train_d_cols = d_cols[-(360+t+28):-28]
_state='CA'

# Restrict everything to one state.
train_df = train_df[train_df.state_id==_state];gc.collect()
ids = train_df.id.unique().tolist()

# Per-day calendar attributes: {column: {d: value}}.
calendar_dict = calendar_df[[
    'wday', 'month', 'year', 'd', 'event_name_1',
    'event_type_1', 'event_name_2', 'event_type_2',f'snap_{_state}'
]].set_index('d').to_dict()

# Day -> week lookup used only to build the price key further down.
# FIX: the cell previously read `data.wm_yr_wk`, but `wm_yr_wk` is not among the
# calendar columns mapped onto `data`, so that attribute access failed. A
# dedicated lookup keeps the feature set exactly as listed in calendar_dict.
wm_yr_wk_by_d = calendar_df.set_index('d')['wm_yr_wk'].to_dict()

# Per-item hierarchy attributes: {column: {id: value}} (train_df is indexed by id
# and has already been filtered to `_state` above).
cat_dict = train_df[['item_id',  'dept_id',   'cat_id', 'store_id', 'state_id']].to_dict()

# Price lookup keyed by '<wm_yr_wk>_<id>'.
sell_prices_df['key'] = sell_prices_df['wm_yr_wk'].astype(str)+'_'+sell_prices_df['id'].astype(str)
sell_prices_dict = sell_prices_df.set_index('key')['sell_price'].to_dict()

# The wide source frames are no longer needed once the lookups exist.
del calendar_df, sell_prices_df, is_sell; gc.collect()

# Wide -> long: one row per (id, d) with the sales value in TARGET.
# stack() drops NaN cells by default, so blanked-out days produce no rows.
train_df = train_df.loc[ids,d_cols].stack().reset_index()
train_df = train_df.rename(columns=set_index(train_df, 'TARGET'))
train_df.sort_values('d', inplace=True)
train_df.reset_index(drop=True, inplace=True)

# Materialise the selection as its own frame (instead of mutating a slice in
# place) so the column assignments below write to an independent object.
data = train_df[train_df.d.isin(train_d_cols)].reset_index(drop=True)

del train_df;gc.collect()

for key, value in calendar_dict.items():
    data[key] = data['d'].map(value)

for key, value in cat_dict.items():
    data[key] = data['id'].map(value)

data['sell_price'] = (
    data['d'].map(wm_yr_wk_by_d).astype(str)+'_'+data.id.astype(str)
).map(sell_prices_dict)

In [14]:
# Differencing lag (rows) -> rolling-window lengths applied to that difference.
diff_feature_cols = dict([
    (1, [7, 30, 360]),
    (7, [7]),
    (28, [7]),
])

In [15]:
%%time
step_n=1

data['price_dept'] = data['sell_price']/data.groupby(['store_id', 'd', 'dept_id'])['sell_price'].transform('mean')
data['price_cat'] = data['sell_price']/data.groupby(['store_id', 'd', 'cat_id'])['sell_price'].transform('mean')
print(f'step_{step_n}   complate');step_n+=1

data = c_event_data(data, _state)
print(f'step_{step_n}   complate');step_n+=1

for _diff, wins in diff_feature_cols.items():
    data = c_diff_data(data, _diff, wins)
    print(f'step_{step_n}   complate');step_n+=1
    for group in [['store_id', 'dept_id'], ['cat_id','store_id']]:
        data = c_diff_data_g(data, _diff, wins, group)
        print(f'step_{step_n}   complate');step_n+=1
        
data = c_diffshift_data(data, [7, 28, 60])
print(f'step_{step_n}   complate');step_n+=1
for group in [['store_id', 'dept_id'], ['cat_id','store_id']]:
    data = c_diffshift_data_g(data, [7, 28, 60], group)
    print(f'step_{step_n}   complate');step_n+=1

step_1   complate
step_2   complate
step_3   complate
step_4   complate
step_5   complate
step_6   complate
step_7   complate
step_8   complate
step_9   complate
step_10   complate
step_11   complate
step_12   complate


KeyboardInterrupt: 

In [None]:
# Per item and weekday: rolling mean of TARGET over up to 4 consecutive rows of
# that weekday. train() lags this column to build its shift features.
data['todel_shift'] = (
    data.groupby(['id', 'wday'])['TARGET']
        .transform(lambda s: s.sort_index().rolling(4, min_periods=1).mean())
)
gc.collect()

In [None]:
#data.to_csv(f'{_state}_data.csv', index=False)

In [None]:
# Columns treated as categorical: the hierarchy ids, the four event columns and
# the 1/2/3/7-row lagged copies of each event column.
category_f = (
    ['item_id', 'state_id', 'dept_id', 'cat_id', 'store_id']
    + ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    + [
        f'{lag}_{event_col}'
        for event_col in ('event_name_1', 'event_type_1', 'event_name_2', 'event_type_2')
        for lag in (1, 2, 3, 7)
    ]
)

In [None]:
# Replace every categorical column by integer codes (pd.factorize returns the
# codes as the first element of its result).
for col in category_f:
    codes = pd.factorize(data[col])
    data[col] = codes[0]

In [None]:
# LightGBM parameters shared by every per-horizon model fitted in train().
params = dict(
    n_estimators=2000,
    boosting_type='gbdt',
    objective='poisson',
    metric='rmse',
    subsample=0.75,
    subsample_freq=1,
    learning_rate=0.07,
    feature_fraction=0.85,
    max_depth=15,
    lambda_l1=1,
    lambda_l2=1,
    verbose=100,
    random_state=123,
)

def train(data, train_d_cols, t):
    """Fit one LightGBM model per forecast horizon (1..28) and pickle the feature importances.

    Relies on notebook-level names: ``_state``, ``params``, ``category_f``,
    ``c_shift_data`` and ``c_shift_data_g``.

    Parameters
    ----------
    data : DataFrame
        Long feature frame with at least ``id``, ``d``, ``TARGET``,
        ``todel_shift``, ``item_id``, ``state_id``, ``store_id``, ``dept_id``
        and ``cat_id``; all columns except ``id`` must be castable to float.
        The caller's frame gains the first week's shift columns in place
        (the helpers mutate their argument before ``data`` is rebound here).
    train_d_cols : sequence
        Ordered day numbers; the last ``t`` of them are used.
    t : int
        Size of the fit + validation window. The most recent 30% of it is the
        validation part.

    Returns
    -------
    None. A list with one ``[importance_frame]`` entry per horizon is written
    to ``train_log_{_state}.pickle``.

    Raises
    ------
    ValueError
        If ``t`` is too small for the 30% validation part to contain a day.
    """
    log = []

    # Validation = most recent 30% of the t-day window.
    split = int(0.3*len(train_d_cols[-t:]))
    if split < 1:
        # A zero split would turn the slices below into "no fit days / all days validation".
        raise ValueError(f't={t} leaves no validation days (need int(0.3*t) >= 1)')
    fit_days = train_d_cols[-t:-split]
    val_days = train_d_cols[-split:]

    # Older LightGBM exposes the logging callback as print_evaluation.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or getattr(lgb, 'print_evaluation')

    shift_cols = []
    for for_predict in range(28):
        print("""
        ###########################################
         ############## {} - {}  ##############
        ###########################################""".format(_state, for_predict+1))

        # Week index of the horizon (1..4) decides how far the weekly features are lagged.
        g_for_predict = for_predict//7 + 1
        shifts = [7*i for i in [g_for_predict, g_for_predict+3]]

        # Rebuild the shift features at the start of every horizon week.
        if for_predict%7==0:
            if for_predict>0:
                data = data.drop(columns=shift_cols)
            data = c_shift_data(data, shifts)
            for group in [['store_id', 'dept_id'], ['cat_id','store_id']]:
                data = c_shift_data_g(data, group, shifts)

        diff_cols = [col for col in data.columns if 'diff' in col]
        # FIX: 'todel_shift' is the *source* of the shift features. It used to
        # match this filter, was dropped at for_predict == 7 and the following
        # c_shift_data call then failed with a KeyError.
        shift_cols = [
            col for col in data.columns
            if 'shift' in col and 'diff' not in col and 'snap' not in col
            and col != 'todel_shift'
        ]

        # Lag every diff/rolling feature by one more row per horizon step
        # (the shift accumulates across iterations).
        data = pd.concat([
            data.drop(columns=diff_cols),
            data.groupby(['id'])[diff_cols].transform(
                lambda x: x.sort_index().shift(1)
            )
        ], axis=1)

        data = pd.concat([
            data['id'],
            data[[c for c in data.columns if c!='id']].astype(float)
        ], axis=1)

        X = data.drop(columns=['id',  'TARGET', 'item_id','state_id', 'todel_shift']).astype(float)

        in_fit = X.d.isin(fit_days)
        in_val = X.d.isin(val_days)

        # Non-inplace drops: X[in_fit] is a selection of X.
        x_train = X[in_fit].drop(columns='d')
        x_val = X[in_val].drop(columns='d')
        y_train = data.loc[in_fit, 'TARGET'].astype(float)
        y_val = data.loc[in_val, 'TARGET'].astype(float)
        if for_predict==0:
            print(x_train.shape)
            print(x_val.shape)

        # FIX: only declare categorical columns that are actually in the
        # design matrix ('item_id' and 'state_id' are dropped above);
        # LightGBM raises on unknown categorical feature names.
        cat_features = [col for col in category_f+['wday', 'month'] if col in x_train.columns]

        train_set = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
        val_set = lgb.Dataset(x_val, y_val, reference=train_set)

        # Early stopping / periodic logging are passed as callbacks: the
        # early_stopping_rounds / verbose_eval keywords were removed from
        # lgb.train in LightGBM 4, and earlier versions translated them into
        # these same callbacks.
        # NOTE(review): params also sets n_estimators, an alias of the round
        # count - confirm whether 2000 or 3000 rounds is the intended cap.
        model = lgb.train(
            params=params,
            train_set=train_set,
            valid_sets=[train_set, val_set],
            num_boost_round=3000,
            callbacks=[lgb.early_stopping(stopping_rounds=100), log_evaluation(period=500)],
        )

        importance = pd.DataFrame()
        importance['importance'] = model.feature_importance(importance_type='gain')
        importance['importance'] = preprocessing.minmax_scale(importance.importance)
        importance['columns'] = x_val.columns
        log.append([importance])

        del train_set, val_set, X, x_train, x_val, y_train, y_val
        gc.collect()
    gc.collect()

    with open(f'train_log_{_state}.pickle', 'wb') as f:
        pickle.dump(log, f)

In [None]:
# Run the 28-horizon training loop; train() writes the collected feature importances to a pickle file.
train(data, train_d_cols, t)