In [1]:
import warnings
warnings.filterwarnings("ignore")


import numpy as np
import pandas as pd
import datetime
from catboost import CatBoostClassifier
import lightgbm as lgb
from time import time
from tqdm import tqdm
from collections import Counter
from scipy import stats
import gc, pickle
import ast

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold,TimeSeriesSplit, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, log_loss
from sklearn.linear_model import Ridge,Lasso, BayesianRidge
from sklearn.svm import LinearSVR
from sklearn.preprocessing import minmax_scale

In [2]:
PARAMS = {
    'boosting_type': 'gbdt',
    'objective': 'tweedie',
    'tweedie_variance_power': 1.1,
    'metric': 'rmse',
    'subsample': 0.5,
    'subsample_freq': 1,
    'learning_rate': 0.03,
    'num_leaves': 2**11-1,
    'min_data_in_leaf': 2**12-1,
    'feature_fraction': 0.5,
    'max_bin': 100,
    'n_estimators': 1400,
    'boost_from_average': False,
    'verbose': 1,
    
    'random_state':2020
    } 

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                       df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
def create_is_sell_data(sell_prices_df, calendar_df, train_df):
    sell_prices_df['id'] = sell_prices_df['item_id'].astype('str')+'_'+sell_prices_df['store_id']+'_evaluation'
    sell_prices_data = sell_prices_df[sell_prices_df.wm_yr_wk.isin(calendar_df.wm_yr_wk.unique())]
    sell_prices_data.reset_index(drop=True, inplace=True)
    tmp = sell_prices_data.groupby(['id'])[['wm_yr_wk', 'sell_price']].apply(
        lambda x: x.set_index('wm_yr_wk')['sell_price'].to_dict()
    ).to_dict()
    d = calendar_df.d
    wm_yr_wk = calendar_df.wm_yr_wk
    price_data = {}
    for col in tqdm(train_df.id.unique()):
        price_data[col] = wm_yr_wk.map(tmp[col])
    price_data = pd.DataFrame(price_data)
    price_data.index = d
    is_sell = price_data.notnull().astype(float).T
    price_data = price_data.fillna(0)
    
    is_sell.index=train_df.id
    train_df.index=train_df.id
    is_sell = pd.concat([
        train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']], is_sell
    ], axis=1)
    price_data = pd.concat([
        train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']], price_data.T  
    ], axis=1)
    
    return price_data, is_sell

def set_index(df, name):
    d = {}
    for col, value in df.iloc[0,:].items():
        if type(col)==str:
            if type(df[col].values[0])!=str:
                v = 'd'
            else:
                v='id'
        else:
            v=name
        d[col]=v
    return d

def dcol2int(col):
    if col[:2]=='d_':
        return int(col.replace('d_', ''))
    else:
        return col
    
def create_event_data(train_df, calendar_df):
    new_df = pd.DataFrame()
    D_COLS = [d for d in train_df.columns if type(d)!=str]
    for event_name in ['event_name_1', 'event_name_2']:
        tmp_df = pd.concat([
            train_df.groupby(['dept_id'])[D_COLS].mean().T.astype(float),
            train_df.groupby(['cat_id'])[D_COLS].mean().T.astype(float),
            calendar_df.loc[D_COLS,event_name].replace(np.nan, 'NAN')
        ],axis=1)

        dept_id_cols = train_df.dept_id.unique().tolist()
        cat_id_cols = train_df.cat_id.unique().tolist()

        tmp_df = pd.concat([
            tmp_df[[event_name]],
            tmp_df.groupby([event_name])[dept_id_cols].transform(
            lambda x: x.shift(1).rolling(len(x), min_periods=1).mean()
            ),
            tmp_df.groupby([event_name])[cat_id_cols].transform(
            lambda x: x.shift(1).rolling(len(x), min_periods=1).mean()
            )
        ], axis=1)

        tmp_df[dept_id_cols] = tmp_df[dept_id_cols]/tmp_df[dept_id_cols].rolling(56, min_periods=1).mean().shift(1)
        tmp_df[cat_id_cols] = tmp_df[cat_id_cols]/tmp_df[cat_id_cols].rolling(56, min_periods=1).mean().shift(1)
        tmp_df.loc[tmp_df[event_name]=='NAN', dept_id_cols+cat_id_cols]=1
        
        tmp_df.columns=[f'{event_name}_{col}' for col in tmp_df.columns]
        
        new_df = pd.concat([
            new_df, tmp_df
        ] ,axis=1)
    new_df.index=D_COLS
    return new_df


def create_metadata(path, d_cols, submmit=True):
    train_df = pd.read_csv(path+'sales_train_evaluation.csv')
    calendar_df = pd.read_csv(path+'calendar.csv')
    sell_prices_df = pd.read_csv(path+'sell_prices.csv')
    #sample_submission_df = pd.read_csv(path+'sample_submission.csv')

    calendar_df['d'] = calendar_df.d.str.replace('d_', '').astype(int)
    cols = train_df.columns
    cols = [dcol2int(col) for col in cols]
    train_df.columns=cols
    calendar_df['date']=pd.to_datetime(calendar_df.date)
    calendar_df.index = calendar_df.d
    price_data, is_sell = create_is_sell_data(sell_prices_df, calendar_df, train_df)
    
    str_cols = [ col for col in train_df.columns if 'id' in str(col)]
    new_columns = str_cols+d_cols
    train_df = train_df.reindex(columns=new_columns)
    
    
    train_df = pd.concat([
        train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']],
        train_df.loc[:,d_cols]+is_sell[d_cols].replace(0, np.nan).replace(1, 0)
    ], axis=1)
    train_df.index = train_df.id
    del is_sell;gc.collect()
    
    df = train_df.loc[:,d_cols].T.astype(float)
    a = df.loc[d_cols[28:-56]].rolling(28, min_periods=1).sum().replace(0,np.nan)+df.loc[d_cols[28:-56]][::-1].rolling(28, min_periods=1).sum()[::-1].replace(0,np.nan)
    a[a.notnull()]=0
    df.loc[d_cols[28:-56]] += a
    df = df.loc[d_cols,:].T.astype(float)
    del a;gc.collect()
    
    #snap_data
    snap_data = calendar_df[['snap_CA', 'snap_WI', 'snap_TX', 'd']]
    snap_data.set_index('d', inplace=True)
    
    #dept_id_price
    dept_id_price = price_data[d_cols]/price_data.groupby(['dept_id', 'store_id'])[d_cols].transform('mean')
    dept_id_price = dept_id_price.T.astype(float)
    #dept_id_price['d'] = dept_id_price.index
    dept_id_price = dept_id_price.replace(0,np.nan)
    
    #cat_id_price
    cat_id_price = price_data[d_cols]/price_data.groupby(['cat_id', 'store_id'])[d_cols].transform('mean')
    cat_id_price = cat_id_price.T.astype(float)
    #cat_id_price['d'] = cat_id_price.index
    cat_id_price = cat_id_price.replace(0,np.nan)
    
    #price_data
    price_data = price_data[d_cols].T
    price_data.replace(0,np.nan, inplace=True)
    #price_data['d']=price_data.index
    
    #event_df
    event_df = create_event_data(train_df, calendar_df)
    #event_df.reset_index(inplace=True)
    
    #calendar_dict
    calendar_dict = calendar_df[['wday', 'month']].to_dict()
    
    return train_df, snap_data, dept_id_price, cat_id_price, price_data, event_df, calendar_dict, df

In [5]:
def make_shift_roll_data(data, shift, agg={'mean', 'std'}):
    data_2 = data.groupby(['id'])['TARGET'].apply(
        lambda x:
        x.shift(shift).rolling(14, min_periods=1).agg(agg)
    )
    for col in data_2.columns:
        data[f'shift_roll_14_{col}'] = data_2[col]
        
    data_2 = data.groupby(['id'])['TARGET'].apply(
        lambda x:
        x.shift(shift).rolling(28, min_periods=1).agg(agg)
    )
    for col in data_2.columns:
        data[f'shift_roll_28_{col}'] = data_2[col]
        
    return data


def make_roll_data(data, win, agg={'mean', 'std'}):
    data_2 = data.groupby(['id'])['TARGET'].apply(
        lambda x:
        x.shift(1).rolling(win, min_periods=1).agg(agg)
    )
    for col in data_2.columns:
        data[f'roll_{win}_{col}'] = data_2[col]
        
    return data

def make_diff_data(data, win):
    data[f'diff_mean_{win}_1'] = data.groupby(['id'])['TARGET'].apply(
        lambda x:
        abs(x.shift(1).diff(1)).rolling(win, min_periods=1).agg('mean')
    ) 
    
    data[f'diff_mean_{win}_7'] = data.groupby(['id'])['TARGET'].apply(
        lambda x:
        abs(x.shift(1).diff(7)).rolling(win, min_periods=1).agg('mean')
    ) 
    
    diff_data = data.groupby(['id'])['TARGET'].apply(
        lambda x:
        x.shift(1).diff(7).rolling(win, min_periods=1).agg({'min', 'max'})
    ) 
    for col in diff_data.columns:
        data[f'diff_{col}_{win}_7'] = diff_data[col]
        
    diff_data = data.groupby(['id'])['TARGET'].apply(
        lambda x:
        x.shift(1).diff(1).rolling(win, min_periods=1).agg({'min', 'max'})
    ) 
    for col in diff_data.columns:
        data[f'diff_{col}_{win}_1'] = diff_data[col]
    
    return data

def make_shift_data(data):
    shift=7
    for i, p in  enumerate(range(4)):
        data[f'shift_{i+1}'] = data.groupby(['id'])['TARGET'].shift(shift+7*p)
    return data

def create_sale_feature(data):
    cols = data.columns.tolist()
    
    data = make_shift_roll_data(data=data, shift=14, agg={'mean'})
    data = make_roll_data(data=data,win=28,agg={'mean', 'min', 'max'})
    data = make_roll_data(data=data,win=7,agg={'mean', 'min', 'max'})
    data = make_diff_data(data=data, win=28)
    one_cols = [col for col in data.columns if not col in cols]
    cols = data.columns.tolist()
    
    data = make_shift_data(data=data)
    seven_cols = [col for col in data.columns if not col in cols]
    
    return data, one_cols, seven_cols

In [6]:
def preprocessing_3(path,state,d_cols,test):
    train_df, snap_data, dept_id_price, cat_id_price, price_data, event_df, calendar_dict, df = create_metadata(path, d_cols)
    
    train_df[train_df.state_id==state]
    if test:
        train_df = train_df[train_df.id.isin(train_df.id.unique()[:2000])]
    
    data = train_df[d_cols].stack(dropna=False).reset_index()
    data = data.rename(columns=set_index(data, 'TARGET'))
    data.sort_values('d', inplace=True)
    data.reset_index(drop=True, inplace=True)
    data = reduce_mem_usage(data)
    gc.collect()


    for key, value in train_df[['dept_id', 'cat_id', 'state_id', 'store_id']].to_dict().items():
        data[key] = data.id.map(value)
    
    data[f'snap']=0
    for key, value in snap_data.to_dict().items():
        k = key.replace('snap_', '')
        data.loc[data.state_id==k,'snap'] = data.loc[data.state_id==k, 'd'].map(value).fillna(0)
    for shift in [-3,-2,-1,1,2,3]:
        data[f'snap_{shift}'] = data.groupby(['id'])['snap'].shift(shift).fillna(0)
        
    data.drop('state_id', axis=1, inplace=True)

    dept_id_price = dept_id_price.stack(dropna=False).reset_index()
    cat_id_price = cat_id_price.stack(dropna=False).reset_index()

    dept_id_price.rename(columns=set_index(dept_id_price, 'dept_id_price'), inplace=True)
    cat_id_price.rename(columns=set_index(cat_id_price, 'cat_id_price'), inplace=True)

    data = pd.merge(
        data, dept_id_price, on=['d', 'id'], how='left'
    )
    data = pd.merge(
        data, cat_id_price, on=['d', 'id'], how='left'
    )


    del dept_id_price,cat_id_price;gc.collect()

    price_data = price_data.stack(dropna=False).reset_index()
    price_data.rename(columns=set_index(price_data, 'price'), inplace=True)
    data = pd.merge(
        data, price_data, on=['d', 'id'], how='left'
    )
    del price_data;gc.collect()

    data['wday'] = data.d.map(calendar_dict['wday'])
    data['month'] = data.d.map(calendar_dict['month'])
    del calendar_dict;gc.collect()


    tmp_dic = event_df.to_dict()
    data[f'dept_id_event_name_1']=1
    data[f'cat_id_event_name_1']=1
    for key, value in tmp_dic.items():
        if 'event_name_1' in key:
            if key[13:] in train_df.dept_id.unique().tolist():
                data.loc[data.dept_id==key[13:], f'dept_id_{key[:12]}']=data.loc[data.dept_id==key[13:], 'd'].map(value).fillna(1)
            if key[13:] in train_df.cat_id.unique().tolist():
                data.loc[data.cat_id==key[13:], f'cat_id_{key[:12]}']=data.loc[data.cat_id==key[13:], 'd'].map(value).fillna(1)
    for shift in [-7,-1,1]:
        for event_name in ['dept_id_event_name_1', 'cat_id_event_name_1']:
            data[f'{event_name}_shift{shift}'] = data.groupby(['id'])[event_name].shift(shift).fillna(1)

    categories = [c for c in data.columns if data[c].dtype==object]
    print(categories)
    for c in categories:
        if c=='id':
            pass
        else:
            data[c] = pd.factorize(data[c])[0]
    
    data['TARGET'] = data['TARGET']*data['price']
    data, one_cols, seven_cols = create_sale_feature(data)
    
    #data = data[data.d.isin(d_cols[-150:])]
    
    return data, one_cols, seven_cols


def predict_cv(x_val, models):
    preds = np.zeros(len(x_val))
    for model in models:
        pred = model.predict(x_val)
        #pred = np.e**(pred)-1
        preds+=pred/len(models)
    return preds

def train(data, params=PARAMS):
    data = data[data.TARGET.notnull()]
    data = data.reset_index(drop=True)
    models = []
    k = GroupKFold(n_splits=6)
    categories = ['cat_id', 'dept_id', 'store_id']

    y = data['TARGET']
    group = data['group']
    data = data.drop(columns=['id', 'd', 'TARGET', 'month', 'group'], axis=1)
    cols = data.columns.tolist()

    stop_cnt = 0
    for trn_indx, val_indx in k.split(data, groups=group):
        train_set = lgb.Dataset(data.loc[trn_indx,:], y.loc[trn_indx])
        val_set = lgb.Dataset(data.loc[val_indx,:], y.loc[val_indx])

        
        model = lgb.train(
            train_set=train_set, 
            valid_sets=[train_set, val_set],
            params=params, num_boost_round=3000, early_stopping_rounds=100, verbose_eval=500,
            categorical_feature=categories+['wday']
        )
        
        models.append(model)
        gc.collect()
        stop_cnt+=1
        if stop_cnt==3:
            break
    return models, cols


def main():
    #path = '/Users/kanoumotoharu/Downloads/m5-forecasting-accuracy/'
    path = '/Users/abcdm/Downloads/m5-forecasting-accuracy/'
    #path = '../input/m5-forecasting-accuracy/'
    
    d_cols=[i+1 for i in range(1969)]
    test = False
    state='CA'
    data, one_cols, seven_cols = preprocessing_3(path, state, d_cols[-200:], test=test)
    
    
    data['group'] = data['wday'].astype(str)+'_'+data['dept_id'].astype(str)
    print(data.shape)
    day_one = d_cols[-28]
    sub_df = pd.DataFrame()
    for i in range(28):
        models, cols = train(data[data.d.isin(d_cols[-150:])], params=PARAMS)
        pred_X=data[data.d==day_one+i]
        pred = predict_cv(pred_X[cols], models)
        pred_X['pred'] = pred/pred_X['price']
        sub_df = pd.concat([
            sub_df, pred_X[['id', 'd', 'pred']]
        ], axis=0)
        if i%7==0 and i>0:
            data[seven_cols] = data.groupby(['id'])[seven_cols].shift(7)
        data[one_cols] = data.groupby(['id'])[one_cols].shift(1)
    return sub_df

In [7]:
main()

100%|██████████| 30490/30490 [00:10<00:00, 2818.30it/s]


Mem. usage decreased to 69.79 Mb (50.0% reduction)
['id', 'dept_id', 'cat_id', 'store_id']
(6098000, 45)
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 6.891	valid_1's rmse: 6.43902
[1000]	training's rmse: 6.72468	valid_1's rmse: 6.33564
Did not meet early stopping. Best iteration is:
[1400]	training's rmse: 6.65464	valid_1's rmse: 6.30155
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 6.67164	valid_1's rmse: 7.76577
[1000]	training's rmse: 6.50353	valid_1's rmse: 7.60402
Did not meet early stopping. Best iteration is:
[1400]	training's rmse: 6.43002	valid_1's rmse: 7.55199
Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 6.83794	valid_1's rmse: 6.91931
[1000]	training's rmse: 6.65473	valid_1's rmse: 6.86148
Early stopping, best iteration is:
[1187]	training's rmse: 6.61743	valid_1's rmse: 6.85208


TypeError: can only concatenate list (not "int") to list