In [1]:
# Directory that holds the M5 competition CSV files. Later cells build file names
# with `path + '<file>.csv'`, so the value must end with a path separator.
# Set the M5_DATA_DIR environment variable to point at another location without
# editing the notebook; when it is unset (or empty) the original default is used.
import os

path = os.environ.get('M5_DATA_DIR') or '/Users/kanoumotoharu/Downloads/m5-forecasting-accuracy/'
if not path.endswith(('/', os.sep)):
    path += '/'
#path = '/Users/abcdm/Downloads/m5-forecasting-accuracy/'
#path = '../input/m5-forecasting-accuracy/'

In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os, gc
import termcolor
from typing import Union

import math, random
import pickle
import datetime, time
from tqdm import tqdm_notebook as tqdm

import torch 
from torch import nn
from torch import optim

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.cluster import KMeans

%matplotlib inline

In [3]:
def Preprocessing(train_df, calendar_df, sell_prices_df):
    """Build the calendar, price and availability tables used as model inputs.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Sales table with an ``id`` column.
    calendar_df : pandas.DataFrame
        Calendar table. The columns ``d``, ``wm_yr_wk``, ``wday``, ``month``,
        ``snap_CA``, ``snap_TX``, ``snap_WI``, ``event_type_1`` and
        ``event_type_2`` are read.
    sell_prices_df : pandas.DataFrame
        Price table with ``item_id``, ``store_id``, ``wm_yr_wk`` and
        ``sell_price``. NOTE: an ``id`` column is added to this frame in place.

    Returns
    -------
    tuple
        ``(train_df, calendar_df, calendar_data, price_data, is_sell)`` where

        * ``train_df`` is a copy of the input whose index holds the ``id`` values
          (the ``id`` column itself is kept),
        * ``calendar_df`` is returned untouched,
        * ``calendar_data`` has one row per calendar feature and one column per ``d``,
        * ``price_data`` has one row per ``d`` and one column per id, with missing
          prices replaced by 0,
        * ``is_sell`` has one row per id and one column per ``d``; 1.0 where a price
          exists and 0.0 otherwise.

    Raises
    ------
    KeyError
        If an id in ``train_df`` has no row in the filtered price table.
    """
    sell_prices_df['id'] = sell_prices_df['item_id'].astype('str')+'_'+sell_prices_df['store_id']+'_validation'

    event_type_1 = pd.get_dummies(calendar_df.event_type_1)
    # The second event slot must be encoded from its own column (the original code
    # encoded event_type_1 twice). It is re-indexed onto the union of both category
    # sets so every '<type>_event_type_2' row that previously existed is still
    # present (all zeros when that type never occurs in the second slot).
    event_type_2 = pd.get_dummies(calendar_df.event_type_2)
    event_type_2 = event_type_2.reindex(
        columns=event_type_1.columns.union(event_type_2.columns),
        fill_value=0
    )
    event_type_1.columns = [f'{col}_event_type_1' for col in event_type_1.columns]
    event_type_2.columns = [f'{col}_event_type_2' for col in event_type_2.columns]
    calendar_data = pd.concat([
        calendar_df[['wday', 'd','month','snap_CA', 'snap_TX', 'snap_WI']],
        event_type_1,
        event_type_2
    ], axis=1)
    # Features become rows and days become columns.
    calendar_data = calendar_data.set_index('d').T

    # Keep only price rows for weeks that appear in the calendar. Chaining
    # reset_index avoids mutating a filtered slice in place.
    sell_prices_data = sell_prices_df[
        sell_prices_df.wm_yr_wk.isin(calendar_df.wm_yr_wk.unique())
    ].reset_index(drop=True)
    # id -> {wm_yr_wk: sell_price}
    tmp = sell_prices_data.groupby('id')[['wm_yr_wk', 'sell_price']].apply(lambda x: x.set_index('wm_yr_wk')['sell_price'].to_dict()).to_dict()
    d = calendar_df.d
    wm_yr_wk = calendar_df.wm_yr_wk
    price_data = {}
    for col in tqdm(train_df.id.unique()):
        # Expand the weekly price to one value per calendar day (NaN when unknown).
        price_data[col] = wm_yr_wk.map(tmp[col])
    price_data = pd.DataFrame(price_data)
    price_data.index = d

    # Availability flag must be taken before the NaN prices are filled with 0.
    is_sell = price_data.notnull().astype(float).T
    price_data = price_data.fillna(0)

    # Index the sales table by id. Assigning the index directly (instead of the
    # former transpose / rename / transpose) keeps the numeric column dtypes and
    # leaves the index unnamed, so it cannot clash with the 'id' column.
    train_df = train_df.copy()
    train_df.index = train_df['id'].values

    return train_df, calendar_df, calendar_data, price_data, is_sell


def make_calendar_data(calendar_data, train_cols):
    """Assemble the calendar feature matrix shared by every series.

    Rows are ``wday``, ``month`` and the eight event-type indicators, followed by
    the same eight indicators shifted 3, 7, 14 and 28 columns to the left (i.e.
    each day sees the indicator of a later day).

    Parameters
    ----------
    calendar_data : pandas.DataFrame
        Feature-by-day table (features as index, days as columns).
    train_cols : list
        Day columns to keep, in output order.

    Returns
    -------
    torch.FloatTensor
        Tensor with one row per feature and one column per entry of ``train_cols``.
    """
    event_rows = [
        'Cultural_event_type_1', 'National_event_type_1', 'Religious_event_type_1', 'Sporting_event_type_1',
        'Cultural_event_type_2', 'National_event_type_2', 'Religious_event_type_2', 'Sporting_event_type_2'
    ]
    base = calendar_data.loc[['wday', 'month'] + event_rows, :]

    pieces = [base]
    for shift in [3, 7, 14, 28]:
        # Negative shift along the day axis pulls later days' events forward.
        ahead = base.loc[event_rows, :].T.shift(-shift).T
        ahead.index = [f'{col}_shift{shift}' for col in ahead.index]
        pieces.append(ahead)

    stacked = pd.concat(pieces, axis=0)
    selected = stacked[train_cols]
    return torch.FloatTensor(selected.values.astype(float))

def make_data(train_cols, state, train_df, calendar_data, price_data, is_sell_data, sample_submission_df):
    """Build the per-series feature tensor for every ``_validation`` id of one state.

    For each selected id the feature rows are stacked in this order:

    1. sales,
    2. ``snap_<state>`` and its copies lagged by 3, 7, 14 and 28 days,
    3. price and its copies lagged by 3, 7 and 14 days,
    4. the price-available flag and its copies lagged by 3, 7 and 14 days.

    Lags that reach before the first available day are NaN.

    Parameters
    ----------
    train_cols : list
        Day columns to keep, in output order.
    state : str
        State code; used both as a pattern matched against submission ids and to
        pick the ``snap_<state>`` calendar row.
    train_df : pandas.DataFrame
        Sales table indexed by id, containing the id columns and ``train_cols``.
    calendar_data : pandas.DataFrame
        Feature-by-day table (features as index, days as columns).
    price_data : pandas.DataFrame
        Day-by-id price table.
    is_sell_data : pandas.DataFrame
        Id-by-day availability table.
    sample_submission_df : pandas.DataFrame
        Frame whose ``id`` column decides which series are selected and in which order.

    Returns
    -------
    torch.FloatTensor
        Tensor of shape ``(n_series, 14, len(train_cols))``.
    """
    def as_tensor(frame):
        # Shared conversion: numeric frame -> float32 tensor.
        return torch.FloatTensor(frame.values.astype(float))

    data_train = train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']+train_cols]
    train_product = sample_submission_df[(sample_submission_df.id.str.contains(state))&(sample_submission_df.id.str.contains('_validation'))].id.values
    #train_product = data_train[data_train.state_id==state]['id'].unique()

    data = as_tensor(data_train.loc[train_product,train_cols])

    # SNAP indicator of the state plus its past values (positive shift = look back).
    snap_row = calendar_data.loc[[f'snap_{state}'],:]
    snap_parts = [snap_row]
    for shift in [3, 7, 14, 28]:
        tmp_calendar = snap_row.T.shift(shift).T
        tmp_calendar.index = [f'{col}_shift{shift}' for col in tmp_calendar.index]
        snap_parts.append(tmp_calendar)
    calendar = as_tensor(pd.concat(snap_parts, axis=0)[train_cols])

    price = as_tensor(price_data.T[train_cols].loc[train_product,:])
    past_prices = [
        as_tensor(price_data.loc[:,train_product].shift(lag).T[train_cols])
        for lag in (3, 7, 14)
    ]

    is_sell = as_tensor(is_sell_data[train_cols].loc[train_product,:])
    past_is_sells = [
        as_tensor(is_sell_data.T.shift(lag).T.loc[train_product, train_cols])
        for lag in (3, 7, 14)
    ]

    # Assemble all series at once. The previous implementation concatenated one
    # series at a time and round-tripped every value through Python lists, which
    # produces the same numbers but is far slower and more memory hungry.
    n_series = data.shape[0]
    per_series = [price] + past_prices + [is_sell] + past_is_sells
    return torch.cat(
        [data.unsqueeze(1), calendar.unsqueeze(0).expand(n_series, -1, -1)]
        + [block.unsqueeze(1) for block in per_series],
        dim=1
    )

In [4]:
def mish(input):
    """Apply the Mish activation, ``x * tanh(softplus(x))``, element-wise."""
    gate = torch.tanh(nn.functional.softplus(input))
    return input * gate


class Mish(nn.Module):
    """Module wrapper around :func:`mish` so it can be placed in ``nn.Sequential``."""

    def forward(self, input):
        return mish(input)

In [5]:
class residual_conv1d(nn.Module):
    """Residual block of two kernel-size-1 convolutions with Mish activations.

    The channel count is unchanged: ``out = mish(x + conv(mish(conv(x))))``.
    """

    def __init__(self, in_channel):
        super().__init__()

        self.mish = Mish()
        # Both convolutions keep `in_channel` channels so the skip connection
        # can be added without a projection.
        self.layer = nn.Sequential(
            nn.Conv1d(in_channel, in_channel, 1),
            Mish(),
            nn.Conv1d(in_channel, in_channel, 1)
        )

    def forward(self, x):
        return self.mish(x + self.layer(x))

class Conv_1d_Net(nn.Module):
    """Point-wise 1-D convolutional network producing 28 outputs per sample.

    Three stages widen the channels to 2x, 4x and 8x ``in_channel``; each stage is
    a kernel-size-1 convolution, dropout, Mish and a residual block. The result is
    average-pooled over the last axis and passed through a three-layer MLP.
    """

    def __init__(self, in_channel):
        super().__init__()

        self.layer_1 = self._make_stage(in_channel, 2*in_channel)
        self.layer_2 = self._make_stage(2*in_channel, 4*in_channel)
        self.layer_3 = self._make_stage(4*in_channel, 8*in_channel)

        # Collapses the last axis to length 1 before the fully connected head.
        self.avgpool1d = nn.AdaptiveAvgPool1d(1)

        self.fc = nn.Sequential(
            nn.Linear(8*in_channel, 8*in_channel),
            nn.Dropout(0.1),
            Mish(),
            nn.Linear(8*in_channel, 16*in_channel),
            nn.Dropout(0.1),
            Mish(),
            nn.Linear(16*in_channel, 28)
        )

    @staticmethod
    def _make_stage(n_in, n_out):
        """Return one widening stage: conv(k=1) -> dropout -> Mish -> residual block."""
        return nn.Sequential(
            nn.Conv1d(n_in, n_out, 1),
            nn.Dropout(0.2),
            Mish(),
            residual_conv1d(n_out)
        )

    def forward(self, x):
        for stage in (self.layer_1, self.layer_2, self.layer_3):
            x = stage(x)
        pooled = self.avgpool1d(x)
        features = torch.flatten(pooled, 1)
        return self.fc(features)

In [6]:
from torch.optim.optimizer import Optimizer
import math

class RAdam(Optimizer):
    """Rectified Adam optimizer.

    Keeps Adam-style running averages of the gradient and squared gradient. While
    the approximated SMA length ``N_sma`` is below 5 the update is a bias-corrected
    momentum step; afterwards the adaptive (divided by ``sqrt(v)``) step is used
    with a rectification factor.

    Parameters
    ----------
    params : iterable
        Parameters or parameter-group dicts to optimize.
    lr : float
        Learning rate.
    betas : tuple of float
        Decay rates ``(beta1, beta2)`` of the two running averages.
    eps : float
        Added to the denominator of the adaptive step.
    weight_decay : float
        Decay coefficient; applied as ``p -= weight_decay * lr * p``.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        # Ring buffer of 10 slots caching [key, N_sma, step_size] so parameters that
        # share the same step and hyper-parameters do not recompute them.
        self.buffer = [[None, None, None] for ind in range(10)]
        super(RAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RAdam, self).__setstate__(state)

    def step(self, closure=None):
        """Perform one optimization step.

        Parameters
        ----------
        closure : callable, optional
            Re-evaluates the model and returns the loss.

        Returns
        -------
        The value returned by ``closure``, or ``None`` when no closure is given.

        Raises
        ------
        RuntimeError
            If a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                # All arithmetic is done on an fp32 copy and written back at the end.
                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                # Keyword forms replace the deprecated positional-scalar overloads.
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                state['step'] += 1
                step = state['step']

                # The cached step size depends on lr and betas, so they are part of
                # the cache key. Keying on the step alone made a second parameter
                # group silently reuse the first group's learning rate.
                cache_key = (step, group['lr'], beta1, beta2)
                buffered = self.buffer[int(step % 10)]
                if buffered[0] == cache_key:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = cache_key
                    beta2_t = beta2 ** step
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * step * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** step)
                    else:
                        step_size = group['lr'] / (1 - beta1 ** step)
                    buffered[2] = step_size

                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
                else:
                    p_data_fp32.add_(exp_avg, alpha=-step_size)

                p.data.copy_(p_data_fp32)

        return loss

In [7]:
class Mydatasets(torch.utils.data.Dataset):
    """Dataset that appends a shared calendar block to each series' features.

    Parameters
    ----------
    data : tensor
        Indexed as ``data[idx, :, :]``; one entry per series.
    calendar : tensor
        Feature rows common to all series, concatenated below each sample along dim 0.
    """

    def __init__(self, data, calendar):
        self.data = data
        self.calendar = calendar
        self.datanum = len(data)

    def __len__(self):
        return self.datanum

    def __getitem__(self, idx):
        sample = self.data[idx, :, :]
        return torch.cat((sample, self.calendar), dim=0)

In [8]:
# Load the four competition tables from the directory given by `path`.
original_train_df = pd.read_csv(f'{path}sales_train_validation.csv')
calendar_df = pd.read_csv(f'{path}calendar.csv')
sell_prices_df = pd.read_csv(f'{path}sell_prices.csv')
sample_submission_df = pd.read_csv(f'{path}sample_submission.csv')

In [9]:
# Column sets that define the 12 aggregation levels. Later cells enumerate this
# tuple and name their per-level results 'lv{i + 1}_...', so the order matters.
# 'all_id' is a constant column added to original_train_df so that the first
# level groups every row together.
group_ids = (
    ['all_id'],
    ['state_id'],
    ['store_id'],
    ['cat_id'],
    ['dept_id'],
    ['state_id', 'cat_id'],
    ['state_id', 'dept_id'],
    ['store_id', 'cat_id'],
    ['store_id', 'dept_id'],
    ['item_id'],
    ['item_id', 'state_id'],
    ['item_id', 'store_id']
)

In [10]:
original_train_df.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'd_1904', 'd_1905', 'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910',
       'd_1911', 'd_1912', 'd_1913'],
      dtype='object', length=1919)

In [13]:
# Constant key: grouping by 'all_id' puts every series into a single group.
original_train_df['all_id'] = 0
# Keep each row's index label as a column so a groupby can collect member rows.
original_train_df['index'] = original_train_df.index
# Filled by the aggregation loop: str(group_id) -> list of that level's group keys.
group_id_dict={}

In [14]:
# Build one averaging row per group of every aggregation level: the row has
# 1 / group_size at the positions of the group's member series and 0 elsewhere,
# so each row sums to 1.
# NOTE(review): sum_M is a list of Python lists with one full-length row per
# group; with many groups this is very memory hungry - consider a tensor.
n_series = len(original_train_df)  # replaces the hard-coded 30490
sum_M = []
for group_id in group_ids:
    a = original_train_df.groupby(group_id)['index'].unique().to_dict()
    group_id_dict[f'{group_id}'] = list(a.keys())
    for key in tqdm(group_id_dict[f'{group_id}']):
        v = a[key]
        zeros = torch.zeros(n_series)
        zeros[v] = 1
        zeros = zeros/zeros.sum()
        sum_M += [zeros.tolist()]

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=21), HTML(value='')))




HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




HBox(children=(IntProgress(value=0, max=70), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3049), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9147), HTML(value='')))




HBox(children=(IntProgress(value=0, max=30490), HTML(value='')))




In [None]:
gc.collect()

In [308]:
zeros.sum()

tensor(1.)

In [309]:
zeros[zeros>0.1]

tensor([1.])

In [311]:
zeros[zeros>0]

tensor([1.])

In [314]:
v

array([29052])

In [313]:
torch.argmax(zeros)

tensor(29052)

In [265]:
group_id_dict

{"['all_id']": [0],
 "['state_id']": ['CA', 'TX', 'WI'],
 "['store_id']": ['CA_1',
  'CA_2',
  'CA_3',
  'CA_4',
  'TX_1',
  'TX_2',
  'TX_3',
  'WI_1',
  'WI_2',
  'WI_3'],
 "['cat_id']": ['FOODS', 'HOBBIES', 'HOUSEHOLD'],
 "['dept_id']": ['FOODS_1',
  'FOODS_2',
  'FOODS_3',
  'HOBBIES_1',
  'HOBBIES_2',
  'HOUSEHOLD_1',
  'HOUSEHOLD_2'],
 "['state_id', 'cat_id']": [('CA', 'FOODS'),
  ('CA', 'HOBBIES'),
  ('CA', 'HOUSEHOLD'),
  ('TX', 'FOODS'),
  ('TX', 'HOBBIES'),
  ('TX', 'HOUSEHOLD'),
  ('WI', 'FOODS'),
  ('WI', 'HOBBIES'),
  ('WI', 'HOUSEHOLD')],
 "['state_id', 'dept_id']": [('CA', 'FOODS_1'),
  ('CA', 'FOODS_2'),
  ('CA', 'FOODS_3'),
  ('CA', 'HOBBIES_1'),
  ('CA', 'HOBBIES_2'),
  ('CA', 'HOUSEHOLD_1'),
  ('CA', 'HOUSEHOLD_2'),
  ('TX', 'FOODS_1'),
  ('TX', 'FOODS_2'),
  ('TX', 'FOODS_3'),
  ('TX', 'HOBBIES_1'),
  ('TX', 'HOBBIES_2'),
  ('TX', 'HOUSEHOLD_1'),
  ('TX', 'HOUSEHOLD_2'),
  ('WI', 'FOODS_1'),
  ('WI', 'FOODS_2'),
  ('WI', 'FOODS_3'),
  ('WI', 'HOBBIES_1'),
  ('WI', '

In [238]:
group_id_dict.keys()

dict_keys(["['all_id']", "['state_id']", "['store_id']", "['cat_id']", "['dept_id']", "['state_id', 'cat_id']", "['state_id', 'dept_id']", "['store_id', 'cat_id']", "['store_id', 'dept_id']", "['item_id']", "['item_id', 'state_id']", "['item_id', 'store_id']"])

In [240]:
len(a)

30490

In [247]:
zeros[a[('FOODS_1_001', 'CA_1')]] = 1

In [250]:
zeros

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [186]:
# Split the day columns into a training window (first 1500 days) and a
# validation window (the rest), then pre-compute the summed sales of every
# group at every aggregation level as float tensors.
d_cols = [f'd_{i}' for i in range(1,1914)]
train_target_columns = d_cols[:1500]
valid_target_columns = d_cols[1500:]

original_train_df['all_id'] = 0
original_train_df['index'] = original_train_df.index

target_id_columns = ['all_id', 'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
train_df = original_train_df[target_id_columns+train_target_columns]
valid_df = original_train_df[target_id_columns+valid_target_columns]


def _level_totals(frame, group_id, day_columns):
    """Return {group key: FloatTensor of per-day summed sales} for one level."""
    totals = frame.groupby(group_id)[day_columns].sum()
    return {
        key: torch.FloatTensor(row.tolist())
        for key, row in zip(totals.index, totals.values)
    }


train_target_lv = {}
valid_target_lv = {}
for i, group_id in enumerate(tqdm(group_ids)):
    print(group_id)
    train_target_lv[f'lv{i + 1}_train_df'] = _level_totals(train_df, group_id, train_target_columns)
    # The validation window previously skipped the transpose / to_dict step and
    # therefore iterated over day columns instead of groups; it now has the same
    # {group key: tensor} layout as the training window.
    valid_target_lv[f'lv{i + 1}_valid_df'] = _level_totals(valid_df, group_id, valid_target_columns)

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

['all_id']
['state_id']
['store_id']
['cat_id']
['dept_id']
['state_id', 'cat_id']
['state_id', 'dept_id']
['store_id', 'cat_id']
['store_id', 'dept_id']
['item_id']
['item_id', 'state_id']
['item_id', 'store_id']



In [187]:
train_target_lv.keys()

dict_keys(['lv1_train_df', 'lv2_train_df', 'lv3_train_df', 'lv4_train_df', 'lv5_train_df', 'lv6_train_df', 'lv7_train_df', 'lv8_train_df', 'lv9_train_df', 'lv10_train_df', 'lv11_train_df', 'lv12_train_df'])

In [189]:
train_target_lv['lv1_train_df']

{0: tensor([32631., 31749., 23783.,  ..., 35937., 44229., 47379.])}

In [18]:
original_train_df['index'] = original_train_df.index

In [21]:
original_train_df.groupby(['state_id'])['index'].unique()

state_id
CA    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
TX    [12196, 12197, 12198, 12199, 12200, 12201, 122...
WI    [21343, 21344, 21345, 21346, 21347, 21348, 213...
Name: index, dtype: object

In [22]:
original_train_df.groupby(['all_id'])['index'].unique()

all_id
0    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
Name: index, dtype: object

In [24]:
original_train_df.groupby( ['state_id', 'cat_id'])['index'].unique().to_dict()

{('CA', 'FOODS'): array([ 1612,  1613,  1614, ..., 12193, 12194, 12195]),
 ('CA', 'HOBBIES'): array([   0,    1,    2, ..., 9709, 9710, 9711]),
 ('CA', 'HOUSEHOLD'): array([  565,   566,   567, ..., 10756, 10757, 10758]),
 ('TX', 'FOODS'): array([13808, 13809, 13810, ..., 21340, 21341, 21342]),
 ('TX', 'HOBBIES'): array([12196, 12197, 12198, ..., 18856, 18857, 18858]),
 ('TX', 'HOUSEHOLD'): array([12761, 12762, 12763, ..., 19903, 19904, 19905]),
 ('WI', 'FOODS'): array([22955, 22956, 22957, ..., 30487, 30488, 30489]),
 ('WI', 'HOBBIES'): array([21343, 21344, 21345, ..., 28003, 28004, 28005]),
 ('WI', 'HOUSEHOLD'): array([21908, 21909, 21910, ..., 29050, 29051, 29052])}

In [36]:
def get_weight_df(train_df, calendar, prices):
    """Compute per-series dollar sales (units * sell_price) for the weighting days.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Sales table with ``item_id``, ``store_id`` and the columns ``d_1906`` to
        ``d_1913``; every column not starting with ``d_`` is treated as an id column.
    calendar : pandas.DataFrame
        Calendar table with ``d`` and ``wm_yr_wk`` columns.
    prices : pandas.DataFrame
        Price table with ``item_id``, ``store_id``, ``wm_yr_wk`` and ``sell_price``.

    Returns
    -------
    pandas.DataFrame
        The id columns of ``train_df`` followed by one dollar-sales column per
        weighting day, with rows in the same order as ``train_df``. Days without a
        matching price are NaN.
    """
    # NOTE(review): d_1906..d_1913 is 8 days, whereas WRMSSEEvaluator in this
    # notebook takes the last 28 day columns - confirm the intended window.
    weight_columns = [f'd_{i}' for i in range(1906, 1914)]
    id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()

    day_to_week = calendar.set_index('d')['wm_yr_wk'].to_dict()
    # Wide -> long: one row per (item, store, day).
    weight_df = train_df[['item_id', 'store_id'] + weight_columns].set_index(['item_id', 'store_id'])
    weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
    weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

    weight_df = weight_df.merge(prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
    weight_df['value'] = weight_df['value'] * weight_df['sell_price']
    # Long -> wide again; unstack sorts the rows, so they are put back into the
    # order of train_df. A list is passed because .loc needs a re-iterable key.
    weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
    weight_df = weight_df.loc[list(zip(train_df.item_id, train_df.store_id)), :].reset_index(drop=True)
    # This is a plain function, so the id columns come from the `train_df`
    # argument (the former `self.train_df[self.id_columns]` raised NameError).
    # Resetting the index keeps the two frames row-aligned for any input index.
    weight_df = pd.concat([train_df[id_columns].reset_index(drop=True), weight_df], axis=1, sort=False)
    return weight_df

In [191]:
day_to_week = calendar_df.set_index('d')['wm_yr_wk'].to_dict()

In [192]:
#day_to_week

In [193]:
weight_df = original_train_df[['item_id', 'store_id'] + [f'd_{i}' for i in range(1,1000)]].set_index(['item_id', 'store_id'])

In [194]:
weight_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_990,d_991,d_992,d_993,d_994,d_995,d_996,d_997,d_998,d_999
item_id,store_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
HOBBIES_1_001,CA_1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
HOBBIES_1_002,CA_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HOBBIES_1_003,CA_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HOBBIES_1_004,CA_1,0,0,0,0,0,0,0,0,0,0,...,1,2,4,3,1,1,2,2,1,2
HOBBIES_1_005,CA_1,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,4,1,0,0,0


In [195]:
weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})

In [196]:
weight_df.head()

Unnamed: 0,item_id,store_id,d,value
0,HOBBIES_1_001,CA_1,d_1,0
1,HOBBIES_1_001,CA_1,d_2,0
2,HOBBIES_1_001,CA_1,d_3,0
3,HOBBIES_1_001,CA_1,d_4,0
4,HOBBIES_1_001,CA_1,d_5,0


In [197]:
weight_df.tail()

Unnamed: 0,item_id,store_id,d,value
30459505,FOODS_3_827,WI_3,d_995,0
30459506,FOODS_3_827,WI_3,d_996,0
30459507,FOODS_3_827,WI_3,d_997,0
30459508,FOODS_3_827,WI_3,d_998,0
30459509,FOODS_3_827,WI_3,d_999,0


In [198]:
weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

In [199]:
weight_df.head()

Unnamed: 0,item_id,store_id,d,value,wm_yr_wk
0,HOBBIES_1_001,CA_1,d_1,0,11101
1,HOBBIES_1_001,CA_1,d_2,0,11101
2,HOBBIES_1_001,CA_1,d_3,0,11101
3,HOBBIES_1_001,CA_1,d_4,0,11101
4,HOBBIES_1_001,CA_1,d_5,0,11101


In [200]:
weight_df = weight_df.merge(sell_prices_df, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])

In [201]:
weight_df.sell_price.notnull().sum()

19427871

In [202]:
weight_df[weight_df.sell_price.notnull()].head()

Unnamed: 0,item_id,store_id,d,value,wm_yr_wk,sell_price
896,HOBBIES_1_001,CA_1,d_897,0,11325,9.58
897,HOBBIES_1_001,CA_1,d_898,0,11325,9.58
898,HOBBIES_1_001,CA_1,d_899,0,11325,9.58
899,HOBBIES_1_001,CA_1,d_900,0,11325,9.58
900,HOBBIES_1_001,CA_1,d_901,0,11325,9.58


In [203]:
weight_df['value'] = weight_df['value'] * weight_df['sell_price']
weight_df

Unnamed: 0,item_id,store_id,d,value,wm_yr_wk,sell_price
0,HOBBIES_1_001,CA_1,d_1,,11101,
1,HOBBIES_1_001,CA_1,d_2,,11101,
2,HOBBIES_1_001,CA_1,d_3,,11101,
3,HOBBIES_1_001,CA_1,d_4,,11101,
4,HOBBIES_1_001,CA_1,d_5,,11101,
5,HOBBIES_1_001,CA_1,d_6,,11101,
6,HOBBIES_1_001,CA_1,d_7,,11101,
7,HOBBIES_1_001,CA_1,d_8,,11102,
8,HOBBIES_1_001,CA_1,d_9,,11102,
9,HOBBIES_1_001,CA_1,d_10,,11102,


In [204]:
weight_df[weight_df.sell_price.notnull()].head()

Unnamed: 0,item_id,store_id,d,value,wm_yr_wk,sell_price
896,HOBBIES_1_001,CA_1,d_897,0.0,11325,9.58
897,HOBBIES_1_001,CA_1,d_898,0.0,11325,9.58
898,HOBBIES_1_001,CA_1,d_899,0.0,11325,9.58
899,HOBBIES_1_001,CA_1,d_900,0.0,11325,9.58
900,HOBBIES_1_001,CA_1,d_901,0.0,11325,9.58


In [205]:
weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']

In [206]:
weight_df.head()

Unnamed: 0_level_0,d,d_1,d_10,d_100,d_101,d_102,d_103,d_104,d_105,d_106,d_107,...,d_990,d_991,d_992,d_993,d_994,d_995,d_996,d_997,d_998,d_999
item_id,store_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
FOODS_1_001,CA_1,6.0,0.0,0.0,0.0,2.0,4.0,0.0,2.0,6.0,2.0,...,0.0,0.0,2.24,0.0,4.48,0.0,0.0,0.0,2.24,0.0
FOODS_1_001,CA_2,4.0,2.0,10.0,0.0,16.0,0.0,0.0,10.0,4.0,2.0,...,6.72,2.24,4.48,0.0,6.72,4.48,2.24,2.24,2.24,0.0
FOODS_1_001,CA_3,2.0,2.0,2.0,10.0,0.0,8.0,6.0,18.0,6.0,10.0,...,0.0,0.0,2.24,0.0,15.68,4.48,0.0,0.0,0.0,0.0
FOODS_1_001,CA_4,0.0,0.0,0.0,4.0,2.0,0.0,2.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.24
FOODS_1_001,TX_1,0.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,0.0,2.0,...,2.24,2.24,0.0,2.24,0.0,0.0,0.0,2.24,0.0,0.0


In [207]:
weight_df

Unnamed: 0_level_0,d,d_1,d_10,d_100,d_101,d_102,d_103,d_104,d_105,d_106,d_107,...,d_990,d_991,d_992,d_993,d_994,d_995,d_996,d_997,d_998,d_999
item_id,store_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
FOODS_1_001,CA_1,6.00,0.00,0.00,0.00,2.00,4.00,0.00,2.00,6.00,2.00,...,0.00,0.00,2.24,0.00,4.48,0.00,0.00,0.00,2.24,0.00
FOODS_1_001,CA_2,4.00,2.00,10.00,0.00,16.00,0.00,0.00,10.00,4.00,2.00,...,6.72,2.24,4.48,0.00,6.72,4.48,2.24,2.24,2.24,0.00
FOODS_1_001,CA_3,2.00,2.00,2.00,10.00,0.00,8.00,6.00,18.00,6.00,10.00,...,0.00,0.00,2.24,0.00,15.68,4.48,0.00,0.00,0.00,0.00
FOODS_1_001,CA_4,0.00,0.00,0.00,4.00,2.00,0.00,2.00,0.00,0.00,8.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2.24
FOODS_1_001,TX_1,0.00,2.00,2.00,0.00,0.00,2.00,2.00,2.00,0.00,2.00,...,2.24,2.24,0.00,2.24,0.00,0.00,0.00,2.24,0.00,0.00
FOODS_1_001,TX_2,0.00,4.00,2.00,2.00,0.00,8.00,0.00,2.00,2.00,4.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
FOODS_1_001,TX_3,,0.00,0.00,0.00,2.00,4.00,0.00,2.00,0.00,2.00,...,0.00,0.00,0.00,0.00,8.96,4.48,0.00,0.00,0.00,0.00
FOODS_1_001,WI_1,0.00,2.00,2.00,2.00,0.00,0.00,2.00,2.00,2.00,2.00,...,0.00,0.00,2.24,0.00,0.00,0.00,2.24,0.00,4.48,2.24
FOODS_1_001,WI_2,0.00,2.00,0.00,0.00,2.00,2.00,2.00,2.00,4.00,6.00,...,2.24,0.00,0.00,0.00,2.24,0.00,0.00,2.24,0.00,2.24
FOODS_1_001,WI_3,0.00,6.00,0.00,0.00,2.00,2.00,6.00,2.00,0.00,2.00,...,0.00,0.00,0.00,2.24,4.48,8.96,0.00,0.00,0.00,0.00


In [208]:
original_train_df.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910', 'd_1911', 'd_1912',
       'd_1913', 'all_id', 'index'],
      dtype='object', length=1921)

In [209]:
weight_df = weight_df.loc[zip(train_df.item_id, train_df.store_id), :].reset_index(drop=True)

In [210]:
weight_df.head()

d,d_1,d_10,d_100,d_101,d_102,d_103,d_104,d_105,d_106,d_107,...,d_990,d_991,d_992,d_993,d_994,d_995,d_996,d_997,d_998,d_999
0,,,,,,,,,,,...,8.26,0.0,0.0,0.0,0.0,8.26,0.0,0.0,0.0,0.0
1,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,,,,,,,,,,...,,,,,,,,,,
3,,,13.02,4.34,0.0,0.0,0.0,0.0,13.02,21.7,...,4.64,9.28,18.56,13.92,4.64,4.64,9.28,9.28,4.64,9.28
4,,,,,,,,,,,...,3.08,3.08,3.08,3.08,3.08,12.32,3.08,0.0,0.0,0.0


In [211]:
weight_df = pd.concat([original_train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',  'all_id']], weight_df], axis=1, sort=False)

In [212]:
weight_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,all_id,d_1,d_10,d_100,...,d_990,d_991,d_992,d_993,d_994,d_995,d_996,d_997,d_998,d_999
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,,,,...,8.26,0.0,0.0,0.0,0.0,8.26,0.0,0.0,0.0,0.0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,,,,...,,,,,,,,,,
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,,,13.02,...,4.64,9.28,18.56,13.92,4.64,4.64,9.28,9.28,4.64,9.28
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,,,,...,3.08,3.08,3.08,3.08,3.08,12.32,3.08,0.0,0.0,0.0


In [217]:
# For every aggregation level, map each group key to its share of the summed
# values in weight_df over the listed day columns; shares within a level sum to 1.
weight_columns = [f'd_{i}' for i in range(1,1000)]
weight_lv = {}

for level, group_id in enumerate(tqdm(group_ids), start=1):
    group_totals = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
    group_shares = group_totals / group_totals.sum()
    weight_lv[f'lv{level}_weight'] = group_shares.to_dict()

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




In [218]:
weight_lv.keys()

dict_keys(['lv1_weight', 'lv2_weight', 'lv3_weight', 'lv4_weight', 'lv5_weight', 'lv6_weight', 'lv7_weight', 'lv8_weight', 'lv9_weight', 'lv10_weight', 'lv11_weight', 'lv12_weight'])

In [219]:
weight_lv['lv2_weight']

{'CA': 0.45241719209612724,
 'TX': 0.29416325144917316,
 'WI': 0.2534195564546997}

In [220]:
weight_lv['lv3_weight']

{'CA_1': 0.12076551360362014,
 'CA_2': 0.09504995638643435,
 'CA_3': 0.17281619577314036,
 'CA_4': 0.0637855263329302,
 'TX_1': 0.0854982443698652,
 'TX_2': 0.11890010985573843,
 'TX_3': 0.08976489722357042,
 'WI_1': 0.06709067177937054,
 'WI_2': 0.0819128253610134,
 'WI_3': 0.10441605931431697}

In [223]:
train_target_lv['lv9_train_df']

{('CA_1', 'FOODS_1'): tensor([297., 284., 214.,  ..., 315., 396., 357.]),
 ('CA_1', 'FOODS_2'): tensor([674., 655., 396.,  ..., 515., 655., 742.]),
 ('CA_1', 'FOODS_3'): tensor([2268., 2198., 1398.,  ..., 2067., 2485., 2841.]),
 ('CA_1', 'HOBBIES_1'): tensor([528., 489., 409.,  ..., 542., 584., 548.]),
 ('CA_1', 'HOBBIES_2'): tensor([28.,  9.,  6.,  ..., 39., 43., 60.]),
 ('CA_1',
  'HOUSEHOLD_1'): tensor([ 361.,  350.,  279.,  ...,  625.,  809., 1162.]),
 ('CA_1', 'HOUSEHOLD_2'): tensor([181., 170., 114.,  ..., 228., 262., 261.]),
 ('CA_2', 'FOODS_1'): tensor([406., 408., 238.,  ..., 264., 336., 311.]),
 ('CA_2', 'FOODS_2'): tensor([212., 227., 138.,  ...,  75., 145., 116.]),
 ('CA_2', 'FOODS_3'): tensor([1575., 1286.,  913.,  ...,  957., 1176., 1146.]),
 ('CA_2', 'HOBBIES_1'): tensor([522., 381., 352.,  ..., 327., 390., 408.]),
 ('CA_2', 'HOBBIES_2'): tensor([16., 16., 16.,  ..., 26., 36., 55.]),
 ('CA_2', 'HOUSEHOLD_1'): tensor([529., 461., 306.,  ..., 448., 705., 894.]),
 ('CA_2', 

In [221]:
weight_lv['lv9_weight']

{('CA_1', 'FOODS_1'): 0.006992931298772223,
 ('CA_1', 'FOODS_2'): 0.01658326681459951,
 ('CA_1', 'FOODS_3'): 0.05163868432299457,
 ('CA_1', 'HOBBIES_1'): 0.014685270556275174,
 ('CA_1', 'HOBBIES_2'): 0.0005218871848964872,
 ('CA_1', 'HOUSEHOLD_1'): 0.020686573383048235,
 ('CA_1', 'HOUSEHOLD_2'): 0.009656900043034032,
 ('CA_2', 'FOODS_1'): 0.011376350997552765,
 ('CA_2', 'FOODS_2'): 0.0043442169908170574,
 ('CA_2', 'FOODS_3'): 0.030245402040931058,
 ('CA_2', 'HOBBIES_1'): 0.010929659655600214,
 ('CA_2', 'HOBBIES_2'): 0.0006382728582529142,
 ('CA_2', 'HOUSEHOLD_1'): 0.023185682215444696,
 ('CA_2', 'HOUSEHOLD_2'): 0.014330371627835688,
 ('CA_3', 'FOODS_1'): 0.01055066599428751,
 ('CA_3', 'FOODS_2'): 0.023350666118721334,
 ('CA_3', 'FOODS_3'): 0.06818043403971,
 ('CA_3', 'HOBBIES_1'): 0.0158194961367096,
 ('CA_3', 'HOBBIES_2'): 0.0008164255808013017,
 ('CA_3', 'HOUSEHOLD_1'): 0.0375989993240668,
 ('CA_3', 'HOUSEHOLD_2'): 0.01649950857884374,
 ('CA_4', 'FOODS_1'): 0.004252331787487271,
 ('C

In [190]:
weight_lv

{'lv1_weight': all_id
 0    1.0
 dtype: float64, 'lv2_weight': state_id
 CA    0.452417
 TX    0.294163
 WI    0.253420
 dtype: float64, 'lv3_weight': store_id
 CA_1    0.120766
 CA_2    0.095050
 CA_3    0.172816
 CA_4    0.063786
 TX_1    0.085498
 TX_2    0.118900
 TX_3    0.089765
 WI_1    0.067091
 WI_2    0.081913
 WI_3    0.104416
 dtype: float64, 'lv4_weight': cat_id
 FOODS        0.595326
 HOBBIES      0.112930
 HOUSEHOLD    0.291743
 dtype: float64, 'lv5_weight': dept_id
 FOODS_1        0.069331
 FOODS_2        0.130689
 FOODS_3        0.395307
 HOBBIES_1      0.107239
 HOBBIES_2      0.005691
 HOUSEHOLD_1    0.209403
 HOUSEHOLD_2    0.082340
 dtype: float64, 'lv6_weight': state_id  cat_id   
 CA        FOODS        0.263062
           HOBBIES      0.051968
           HOUSEHOLD    0.137387
 TX        FOODS        0.179102
           HOBBIES      0.032671
           HOUSEHOLD    0.082390
 WI        FOODS        0.153162
           HOBBIES      0.028292
           HOUSEHOLD    

In [24]:
class WRMSSEEvaluator(object):
    """Weighted RMSSE scorer over twelve aggregation levels of a wide sales table.

    On construction the training and validation frames are summed per group for
    every entry of ``self.group_ids`` and stored as ``lv{N}_train_df`` /
    ``lv{N}_valid_df`` attributes, together with a normalised ``lv{N}_weight``
    Series. ``score`` then aggregates predictions the same way and averages the
    weighted RMSSE of all levels.
    """

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame) -> None:
        """Precompute per-level aggregates and weights.

        Args:
            train_df: wide frame with id columns plus ``d_*`` day columns.
                NOTE: an ``all_id`` column is added to this frame in place.
            valid_df: frame with the ``d_*`` columns of the evaluation window;
                id columns are prepended from ``train_df`` when missing.
            calendar: frame with at least ``d`` and ``wm_yr_wk`` columns.
            prices: frame merged on ``item_id``, ``store_id``, ``wm_yr_wk``;
                must provide ``sell_price``.
        """
        # Every column whose name starts with 'd_' is treated as a daily target.
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        # The last 28 training days drive the weights.
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        # Constant key so that grouping by it yields a single total-level group.
        # This mutates the caller's DataFrame and must happen before id_columns
        # is computed so that 'all_id' is picked up as an id column.
        train_df['all_id'] = 0  # for lv1 aggregation

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        # concat(axis=1) aligns on index labels, so valid_df is assumed to share
        # train_df's index -- TODO confirm at the call site.
        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        # Relies on the attributes assigned above.
        weight_df = self.get_weight_df()

        # Grouping keys for levels 1..12, in order.
        self.group_ids = (
            ['all_id'],
            ['state_id'],
            ['store_id'],
            ['cat_id'],
            ['dept_id'],
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            ['item_id'],
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            # Per-group sums of the raw series for this level.
            setattr(self, f'lv{i + 1}_train_df', train_df.groupby(group_id)[train_target_columns].sum())
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            # Per-group total of the weight columns, normalised to sum to 1 within the level.
            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        """Return id columns plus units * sell_price for each weight column.

        The weight columns are melted to long form, joined to weekly prices via
        the calendar's ``d`` -> ``wm_yr_wk`` mapping, multiplied, and pivoted
        back to one row per (item_id, store_id) in ``train_df`` row order.
        """
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        # Long form: one row per (item_id, store_id, day) with the unit count in 'value'.
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        # Left join keeps rows without a matching price; their product becomes NaN.
        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        # Back to wide form, keeping only the 'value' block.
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        # Reorder rows to follow train_df's row order, then drop the MultiIndex.
        # NOTE(review): a raw zip iterator is passed to .loc; confirm the installed
        # pandas accepts it (wrapping it in list(...) is the conservative form).
        weight_df = weight_df.loc[zip(self.train_df.item_id, self.train_df.store_id), :].reset_index(drop=True)
        # Column-wise concat aligns on index, so train_df is assumed to carry a
        # default 0..n-1 index here -- TODO confirm.
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        """Return the RMSSE of every group at aggregation level ``lv``.

        Args:
            valid_preds: predictions already aggregated to level ``lv``.
            lv: 1-based level number selecting the ``lv{lv}_*`` attributes.
        """
        train_y = getattr(self, f'lv{lv}_train_df')
        valid_y = getattr(self, f'lv{lv}_valid_df')
        # Mean squared error per group over the validation columns.
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        # Mean squared day-to-day difference over all training columns.
        # There is no guard for a zero scale (constant training series).
        scale = ((train_y.iloc[:, 1:].values - train_y.iloc[:, :-1].values) ** 2).mean(axis=1)
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        """Return the mean over all levels of the weight-summed RMSSE.

        Args:
            valid_preds: series-level predictions with the same shape as the
                validation target columns (checked by an assertion).
        """
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        # Attach the id columns so predictions can be grouped like the targets
        # (index-aligned concat).
        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            # Align weight and RMSSE on the group index and multiply them row-wise.
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            all_scores.append(lv_scores.sum())

        return np.mean(all_scores)

In [None]:
def get_rmsse(self, valid_preds) -> pd.Series:
    """Return one RMSSE value per row of ``self.valid_series``.

    The row-wise mean squared error between ``self.valid_series`` and
    ``valid_preds`` is divided by ``self.scale`` and square-rooted.

    Side effect: ``self.scale`` is overwritten with a copy in which every
    zero entry has been replaced by 1, so the division never hits zero.
    (The original note says this covers all 42840 series.)
    """
    squared_error = (self.valid_series - valid_preds) ** 2
    mse_per_series = squared_error.mean(axis=1)

    # Swap zero scales for 1 and keep the patched array on the instance.
    patched_scale = np.where(self.scale != 0, self.scale, 1)
    self.scale = patched_scale

    return (mse_per_series / patched_scale).map(np.sqrt)

In [130]:
# Scratch tensor with 30000 rows and 28 columns of random values, used for the shape check below.
a = torch.rand(30000,28)

In [133]:
# Flattening collapses both dimensions into one: 30000 * 28 = 840000 elements.
a.flatten().size()

torch.Size([840000])

In [110]:
# Two toy frames sharing the default 0..9 index, used to check how pd.concat(axis=1) aligns rows.
df = pd.DataFrame({'a':[ i for i in range(10)]})
df2 = pd.DataFrame({'b':[i*10 for i in range(10)]})

In [116]:
# Shuffle df2's rows positionally; each row keeps its original index label.
# Note: this rebinds df2, so re-running the cell shuffles the already-shuffled frame.
df2 = df2.iloc[[1,0, 3, 5, 2, 6, 7, 9, 8, 4],:]

In [118]:
# Column-wise concat aligns on index labels, not on row position: in the output below
# the shuffled df2 lines up with df again (b == 10 * a on every row).
pd.concat([
    df,df2
],axis=1)

Unnamed: 0,a,b
0,0,0
1,1,10
2,2,20
3,3,30
4,4,40
5,5,50
6,6,60
7,7,70
8,8,80
9,9,90


In [None]:
# Scratch prototype: for every aggregation level, sum the train / valid day columns
# per group and store the result as {group_key: FloatTensor of daily totals}.
# Expects `group_ids`, `train_df`, `valid_df`, `train_target_columns`,
# `valid_target_columns` and the dicts `train_target_lv` / `valid_target_lv`
# to exist already -- none of them is created in this cell.
for i, group_id in enumerate(tqdm(group_ids)):
    print(group_id)

    # .T.to_dict() yields {group_key: {day_column: total}}; the inner dict keeps column order.
    a = train_df.groupby(group_id)[train_target_columns].sum().T.to_dict()
    for key, value in a.items():
        value = list(value.values())
        a[key] = torch.FloatTensor(value)
    train_target_lv[f'lv{i + 1}_train_df'] = a

    # Same conversion as the train branch. Previously the grouped DataFrame was
    # iterated directly, which walks its *columns* and writes tensors back into
    # the frame instead of building a per-group dict of tensors.
    a = valid_df.groupby(group_id)[valid_target_columns].sum().T.to_dict()
    for key, value in a.items():
        value = list(value.values())
        a[key] = torch.FloatTensor(value)
    valid_target_lv[f'lv{i + 1}_valid_df'] = a

In [120]:
class WRMSSE(nn.Module):
    """Scaffold for a WRMSSE loss: precomputes per-level groupings, targets and weights.

    ``prepare_metrics`` fills four dicts keyed by level name:

    * ``lv_index_dict['lv{N}_index']``      -> {group_key: list of row labels in the group}
    * ``lv_train_df_dict['lv{N}_train_df']`` -> {group_key: FloatTensor of summed train days}
    * ``lv_valid_df_dict['lv{N}_valid_df']`` -> {group_key: FloatTensor of summed valid days}
    * ``lv_weight_df_dict['lv{N}_weight']``  -> {group_key: normalised weight}

    The loss itself (``forward``) is not implemented yet.
    """

    def __init__(self, df, calendar, prices):
        """Store the inputs and define the twelve grouping keys.

        Args:
            df: wide sales frame containing the columns listed in
                ``self.id_columns`` (except ``all_id``) plus the ``d_*`` day columns.
            calendar: frame with ``d`` and ``wm_yr_wk`` columns.
            prices: frame with ``item_id``, ``store_id``, ``wm_yr_wk``, ``sell_price``.
        """
        super(WRMSSE, self).__init__()
        # Work on a private copy so the helper columns added below do not leak
        # into the caller's DataFrame.
        self.df = df.copy()
        self.df['all_id'] = 0  # for lv1 aggregation
        self.df['index'] = self.df.index  # row labels, collected per group in prepare_metrics
        self.id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'all_id']
        self.calendar = calendar
        self.prices = prices

        # Grouping keys for levels 1..12, in order.
        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

    @staticmethod
    def _rows_to_tensors(frame):
        """Map each index entry of ``frame`` to a float32 tensor holding that row."""
        rows = torch.as_tensor(frame.values, dtype=torch.float32)
        return dict(zip(frame.index, rows))

    def prepare_metrics(self, train_d_cols, valid_d_cols):
        """Split the data and precompute every per-level lookup.

        Args:
            train_d_cols: list of day columns forming the training window; its
                last 28 entries are used as the weight columns.
            valid_d_cols: list of day columns forming the validation window.
        """
        self.train_d_cols = train_d_cols
        self.valid_d_cols = valid_d_cols
        self.weight_columns = train_d_cols[-28:]
        self.split_train_valid_data()
        self.get_weight()

        self.lv_index_dict = {}
        self.lv_train_df_dict = {}
        self.lv_valid_df_dict = {}
        self.lv_weight_df_dict = {}
        for i, group_id in enumerate(tqdm(self.group_ids)):
            lv = i + 1

            # Group on self.df: it is the frame that carries the 'index' column
            # (train_df only holds id and day columns). unique() returns one
            # ndarray per group, converted here to a plain list.
            members = self.df.groupby(group_id)['index'].unique()
            self.lv_index_dict[f'lv{lv}_index'] = {
                key: rows.tolist() for key, rows in members.items()
            }

            train_sum = self.train_df.groupby(group_id)[self.train_d_cols].sum()
            self.lv_train_df_dict[f'lv{lv}_train_df'] = self._rows_to_tensors(train_sum)

            valid_sum = self.valid_df.groupby(group_id)[self.valid_d_cols].sum()
            self.lv_valid_df_dict[f'lv{lv}_valid_df'] = self._rows_to_tensors(valid_sum)

            # Per-group total of the weight columns, normalised to sum to 1 within the level.
            lv_weight = self.weight_df.groupby(group_id)[self.weight_columns].sum().sum(axis=1)
            self.lv_weight_df_dict[f'lv{lv}_weight'] = (lv_weight / lv_weight.sum()).to_dict()

        # Placeholder dict; nothing in this class fills it yet.
        self.denominator = {}

    def split_train_valid_data(self):
        """Slice ``self.df`` into id+train-day and id+valid-day frames."""
        self.train_df = self.df[self.id_columns+self.train_d_cols]
        self.valid_df = self.df[self.id_columns+self.valid_d_cols]

    def get_weight(self):
        """Build ``self.weight_df``: id columns plus units * sell_price per weight column."""
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        # Long form: one row per (item_id, store_id, day) with the unit count in 'value'.
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        # Left join keeps rows without a matching price; their product becomes NaN.
        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']

        # Reorder rows to follow train_df; a materialised list of tuples is used
        # because .loc does not reliably accept a bare zip iterator.
        row_keys = list(zip(self.train_df.item_id, self.train_df.store_id))
        weight_df = weight_df.loc[row_keys, :]
        # Reuse train_df's index so the column-wise concat lines rows up even
        # when that index is not the default 0..n-1 range.
        weight_df.index = self.train_df.index
        self.weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)

    def forward(self, pred):
        """Not implemented: the loss computation has not been written yet."""
        # The previous body returned an undefined name; fail with an explicit message instead.
        raise NotImplementedError('WRMSSE.forward is not implemented yet')

In [124]:
# Scratch tensor with 1000 rows and 28 columns for the reduction check below.
a = torch.rand(1000,28)

In [128]:
# Summing over dim 0 collapses the rows and leaves one value per column (28).
len(a.sum(0))

28

In [10]:
%%time
# Run the preprocessing step defined near the top of the notebook.
# NOTE(review): `calendar_df` is both an input and an output here, so re-running
# this cell feeds the already-processed calendar back in -- confirm that is safe.
train_df, calendar_df, calendar_data, price_data, is_sell = Preprocessing(original_train_df, calendar_df, sell_prices_df)

HBox(children=(IntProgress(value=0, max=30490), HTML(value='')))


CPU times: user 51.4 s, sys: 3.89 s, total: 55.2 s
Wall time: 55.5 s


In [11]:
#'snap_CA', 'snap_TX', 'snap_WI'

In [12]:
%%time
# Day-column names 'd_1' .. 'd_1913' (range end is exclusive).
d_cols = [f'd_{i}' for i in range(1,1914)]

CPU times: user 691 µs, sys: 47 µs, total: 738 µs
Wall time: 784 µs


In [13]:
# Pick the GPU when one is available, otherwise fall back to the CPU; read as a global by train_model.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

## Cross-validation (CV)

In [14]:
def _create_CvData(cols, train_df, calendar_data, price_data, is_sell, sample_submission_df):
    """Build the training and validation DataLoaders for one CV window.

    The training window is ``cols`` without its last 28 entries; the validation
    window is all of ``cols``. For each window, ``make_data`` is called once per
    state (CA, TX, WI, in that order) and the results are concatenated along
    dim 0, and ``make_calendar_data`` is called for the same columns.

    Args:
        cols: ordered day-column names of the window.
        train_df, calendar_data, price_data, is_sell, sample_submission_df:
            passed through unchanged to ``make_data`` / ``make_calendar_data``.

    Returns:
        (trn_loader, val_loader): shuffled loader with batch size 200 and
        unshuffled loader with batch size 50. Both datasets are created with
        ``train=True``.
    """
    def _stack_states(day_cols):
        # One make_data call per state; the list goes out of scope on return,
        # so the per-state tensors can be freed right after concatenation.
        per_state = [
            make_data(day_cols, state, train_df, calendar_data, price_data, is_sell, sample_submission_df)
            for state in ('CA', 'TX', 'WI')
        ]
        return torch.cat(per_state, dim=0)

    trn_cols = cols[:-28]
    val_cols = cols

    trn_data = _stack_states(trn_cols)
    # Fixed: this previously referenced the undefined name `train_cols`.
    trn_calendar = make_calendar_data(calendar_data, trn_cols)
    gc.collect()

    val_data = _stack_states(val_cols)
    val_calendar = make_calendar_data(calendar_data, val_cols)
    gc.collect()

    print(val_data.size()[1]+ val_calendar.size()[0])
    trn_data_set = Mydatasets(trn_data, trn_calendar, train = True)
    trn_loader = torch.utils.data.DataLoader(trn_data_set, batch_size = 200, shuffle = True)

    val_data_set = Mydatasets(val_data, val_calendar, train = True)
    val_loader = torch.utils.data.DataLoader(val_data_set, batch_size = 50, shuffle = False)

    return trn_loader,  val_loader



def Nest_cv(d_cols=[f'd_{i}' for i in range(900,1914)], cv=5):
    """Build nested time-ordered train/validation column splits.

    Fold ``i`` uses the columns from offset ``i * stride`` to the end, where
    ``stride = int(len(d_cols) / cv + 1)``. Each fold is cut 75% / 25% into a
    'holdout' split, and the holdout training part is cut 75% / 25% again into
    an 'inner' split.

    Returns:
        list of ``cv`` dicts shaped like
        ``{'holdout': {'trn': [...], 'val': [...]}, 'inner': {'trn': [...], 'val': [...]}}``.
    """
    def _cut(columns):
        # First 75% (rounded down) for training, the remainder for validation.
        boundary = int(len(columns) * 0.75)
        return {'trn': columns[:boundary], 'val': columns[boundary:]}

    stride = int((len(d_cols) / cv) + 1)
    folds = []
    for fold_no in range(cv):
        window = d_cols[stride * fold_no:]
        outer = _cut(window)
        nested = _cut(outer['trn'])
        folds.append({'holdout': outer, 'inner': nested})
    return folds

In [27]:
# Select the i-th block of n day columns counted from the end of d_cols
# (i == 0 is the most recent block and needs an open-ended slice).
n = 200
i = 0
cols = d_cols[-n*(i+1):] if i == 0 else d_cols[-n*(i+1):-n*i]

# One make_data call per state, in the order CA, TX, WI, concatenated along dim 0.
state_tensors = []
for state in ('CA', 'TX', 'WI'):
    state_tensors.append(
        make_data(cols, state, train_df, calendar_data, price_data, is_sell, sample_submission_df)
    )

data = torch.cat(tuple(state_tensors), dim=0)
calendar = make_calendar_data(calendar_data, cols)

# Drop the per-state tensors and reclaim memory.
del state_tensors
gc.collect()

70

In [28]:
# Wrap the stacked tensor and calendar features in the project's Dataset class.
data_set=Mydatasets(data, calendar)
# NOTE(review): the loader below is commented out, yet the next cell iterates
# `data_loader` -- that only works with kernel state left over from an earlier run.
#data_loader = torch.utils.data.DataLoader(data_set, batch_size = 200, shuffle = True)

In [35]:
# Print the shape of the first five batches; the loop still walks the whole
# loader, so `c` ends up holding the total number of batches.
c=0
for x in data_loader:
    if c<5:
        print(x.size())
    c+=1

torch.Size([200, 56, 200])
torch.Size([200, 56, 200])
torch.Size([200, 56, 200])
torch.Size([200, 56, 200])
torch.Size([200, 56, 200])


In [66]:
def train_model(model, data_set):
    """Train ``model`` on ``data_set`` with RMSE loss and early stopping.

    Every batch is a tensor whose last axis is time and whose channel 0 holds
    the target series. Training predicts steps ``[-56:-28]`` from everything
    before them; validation reuses the same dataset and predicts the final 28
    steps from everything before those, so the validation input is 28 steps
    longer than the training input.

    Args:
        model: module mapping the input tensor to predictions that match the
            28-step target after ``squeeze(1)``.
        data_set: dataset yielding the tensors described above.

    Returns:
        (model, best_score): the model with the weights of the best validation
        epoch loaded, and that epoch's average validation RMSE. The same
        weights are written to 'net.pt'.
    """
    import copy  # local import: only needed to snapshot the best weights

    num_epochs = 40
    lr = 1e-4
    eta_min = 1e-3
    t_max = 10

    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = RAdam(params=model.parameters(), lr=lr)
    # NOTE(review): eta_min (1e-3) is larger than lr (1e-4) and the scheduler is
    # stepped once per batch with T_max=10 -- confirm this schedule is intended.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max, eta_min=eta_min)

    # The loaders do not change between epochs; a shuffling loader still draws
    # a fresh order every time it is iterated.
    train_loader = torch.utils.data.DataLoader(data_set, batch_size = 200, shuffle = True)
    valid_loader = torch.utils.data.DataLoader(data_set, batch_size = 100, shuffle = False)

    best_epoch = -1
    best_score = 10000
    early_stoppping_cnt = 0
    best_state = None  # deep copy of the weights at the best validation epoch

    for epoch in range(num_epochs):
        start_time = time.time()

        model.train()
        avg_loss = 0.
        for x_batch in tqdm(train_loader):
            optimizer.zero_grad()

            # Target: channel 0 of the second-to-last 28 steps; input: all earlier steps.
            y_batch = x_batch[:,0,-28*2:-28]
            x_batch = x_batch[:,:,:-28*2]

            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            preds = model(x_batch)
            loss = torch.sqrt(criterion(preds.squeeze(1), y_batch))

            loss.backward()
            optimizer.step()
            scheduler.step()

            avg_loss += loss.item() / len(train_loader)

        model.eval()
        avg_val_loss = 0.
        # No gradients are needed for evaluation; skipping them saves memory and time.
        with torch.no_grad():
            for x_batch in valid_loader:
                # Target: channel 0 of the last 28 steps; input: all earlier steps.
                y_batch = x_batch[:,0,-28:]
                x_batch = x_batch[:,:,:-28]

                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)

                preds = model(x_batch)
                loss = torch.sqrt(criterion(preds.squeeze(1), y_batch))

                avg_val_loss += loss.item() / len(valid_loader)

        elapsed = time.time() - start_time
        if best_score>avg_val_loss:
            best_score = avg_val_loss
            early_stoppping_cnt=0
            best_epoch=epoch
            # Snapshot the weights: keeping a reference to `model` would just
            # alias the object that later epochs continue to update.
            best_state = copy.deepcopy(model.state_dict())
            p_avg_val_loss = termcolor.colored(np.round(avg_val_loss, 4),"red")

            print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {p_avg_val_loss} time: {elapsed:.0f}s')
        else:
            early_stoppping_cnt+=1
            print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')

        # Stop once more than 7 consecutive epochs failed to improve, but never before epoch index 11.
        if (epoch>10) and (early_stoppping_cnt>7):
            break

    print(f'best_score : {best_score}    best_epoch : {best_epoch}')
    # Restore the best weights so that both the checkpoint and the returned
    # model correspond to best_score rather than to the last epoch.
    if best_state is not None:
        model.load_state_dict(best_state)
    torch.save(model.state_dict(), 'net.pt')

    return model, best_score

In [72]:
# 56 matches the size of dim 1 in the batch shapes printed earlier ([200, 56, 200]);
# presumably that is the number of input channels -- confirm against Conv_1d_Net.
in_size=56
model = Conv_1d_Net(in_size)

In [74]:
# The training call is disabled; only garbage collection runs in this cell.
#best_model, best_score = train_model(model, data_set)
gc.collect()

558

In [75]:
# Second collection pass; the 0 below means nothing further was unreachable.
gc.collect()

0