In [1]:
#path = '/Users/kanoumotoharu/Downloads/m5-forecasting-accuracy/'
path = '/Users/abcdm/Downloads/m5-forecasting-accuracy/'
#path = '../input/m5-forecasting-accuracy/'

In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os, gc
import termcolor

import math, random
import pickle
import datetime, time
from tqdm import tqdm_notebook as tqdm

import torch 
from torch import nn
from torch import optim

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.cluster import KMeans

%matplotlib inline

In [3]:
def Preprocessing(train_df, calendar_df, sell_prices_df):
    sell_prices_df['id'] = sell_prices_df['item_id'].astype('str')+'_'+sell_prices_df['store_id']+'_validation'
    d_cols = [f'd_{i}' for i in range(1,1914)]
    
    event_type_1 = pd.get_dummies(calendar_df.event_type_1)
    event_type_1.columns = [f'{col}_event_type_1' for col in event_type_1.columns]
    event_type_2 = pd.get_dummies(calendar_df.event_type_1)
    event_type_2.columns = [f'{col}_event_type_2' for col in event_type_2.columns]
    calendar_data = pd.concat([
        calendar_df.drop(columns=['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'])[['wday', 'd','month','snap_CA', 'snap_TX', 'snap_WI']],
        event_type_1,
        event_type_2
    ], axis=1)
    calendar_data = calendar_data.set_index('d').T
    
    
    
    sell_prices_data = sell_prices_df[sell_prices_df.wm_yr_wk.isin(calendar_df.wm_yr_wk.unique())]
    sell_prices_data.reset_index(drop=True, inplace=True)
    tmp = sell_prices_data.groupby(['id'])[['wm_yr_wk', 'sell_price']].apply(lambda x: x.set_index('wm_yr_wk')['sell_price'].to_dict()).to_dict()
    d = calendar_df.d
    wm_yr_wk = calendar_df.wm_yr_wk
    price_data = {}
    for col in tqdm(train_df.id.unique()):
        price_data[col] = wm_yr_wk.map(tmp[col])
    price_data = pd.DataFrame(price_data)
    price_data.index = d
    
    
    is_sell = price_data.notnull().astype(float).T
    price_data = price_data.fillna(0)
    
    train_df = train_df.T
    train_df.columns = train_df.loc['id', :].values
    train_df = train_df.T
    
    return train_df, calendar_df, calendar_data, price_data, is_sell


def make_calendar_data(calendar_data, train_cols):
    calendar_index = [
        'wday', 'month',
        'Cultural_event_type_1', 'National_event_type_1', 'Religious_event_type_1', 'Sporting_event_type_1',
        'Cultural_event_type_2', 'National_event_type_2', 'Religious_event_type_2', 'Sporting_event_type_2'
    ]
    calendar = calendar_data.loc[calendar_index,:]
    event_index = [
        'Cultural_event_type_1', 'National_event_type_1', 'Religious_event_type_1', 'Sporting_event_type_1',
        'Cultural_event_type_2', 'National_event_type_2', 'Religious_event_type_2', 'Sporting_event_type_2'
    ]
    for shift in [3, 7, 14, 28]:
        tmp_calendar = calendar.loc[event_index, :]
        tmp_calendar = tmp_calendar.T.shift(-shift).T
        tmp_calendar.index = [f'{col}_shift{shift}' for col in tmp_calendar.index]
        calendar = pd.concat([
            calendar,
            tmp_calendar
        ], axis=0)
    calendar = calendar[train_cols]
    calendar = torch.FloatTensor(calendar.values.astype(float))
    return calendar

def make_data(train_cols, state, train_df, calendar_data, price_data, is_sell_data, sample_submission_df):
    data_train = train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']+train_cols]
    train_product = sample_submission_df[(sample_submission_df.id.str.contains(state))&(sample_submission_df.id.str.contains('_validation'))].id.values
    #train_product = data_train[data_train.state_id==state]['id'].unique()
    
    data = data_train.loc[train_product,train_cols]
    
    calendar_index = [ f'snap_{state}']
    event_index = [ f'snap_{state}']
    calendar = calendar_data.loc[calendar_index,:]
    for shift in [3, 7, 14, 28]:
        tmp_calendar = calendar.loc[event_index, :]
        tmp_calendar = tmp_calendar.T.shift(shift).T
        tmp_calendar.index = [f'{col}_shift{shift}' for col in tmp_calendar.index]
        calendar = pd.concat([
            calendar,
            tmp_calendar
        ], axis=0)
    calendar = calendar[train_cols]
    
    price = price_data.T[train_cols].loc[train_product,:]
    past_price_1 = price_data.loc[:,train_product].shift(3).T[train_cols]
    past_price_2 = price_data.loc[:,train_product].shift(7).T[train_cols]
    past_price_3 = price_data.loc[:,train_product].shift(14).T[train_cols]
    
    
    is_sell = is_sell_data[train_cols].loc[train_product,:]
    past_is_sell_1 = is_sell_data.T.shift(3).T.loc[train_product, train_cols]
    past_is_sell_2 = is_sell_data.T.shift(7).T.loc[train_product, train_cols]
    past_is_sell_3 = is_sell_data.T.shift(14).T.loc[train_product, train_cols]

    data = torch.FloatTensor(data.values.astype(float))
    
    calendar = torch.FloatTensor(calendar.values.astype(float))
    
    price = torch.FloatTensor(price.values.astype(float))
    
    past_price_1 = torch.FloatTensor(past_price_1.values.astype(float))
    past_price_2 = torch.FloatTensor(past_price_2.values.astype(float))
    past_price_3 = torch.FloatTensor(past_price_3.values.astype(float))
    
    is_sell = torch.FloatTensor(is_sell.values.astype(float))
    past_is_sell_1 = torch.FloatTensor(past_is_sell_1.values.astype(float))
    past_is_sell_2 = torch.FloatTensor(past_is_sell_2.values.astype(float))
    past_is_sell_3 = torch.FloatTensor(past_is_sell_3.values.astype(float))
    
    data_list = []
    for idx in range(len(data)):
        _data = data[[idx],:]
        _price = price[[idx],:]
        
        _past_price_1 = past_price_1[[idx],:]
        _past_price_2 = past_price_2[[idx],:]
        _past_price_3 = past_price_3[[idx],:]
        
        _is_sell = is_sell[[idx],:]
        
        _past_is_sell_1 = past_is_sell_1[[idx],:]
        _past_is_sell_2 = past_is_sell_2[[idx],:]
        _past_is_sell_3 = past_is_sell_3[[idx],:]
        
        x = torch.cat((
            _data, calendar,
            _price,
            _past_price_1, _past_price_2, _past_price_3,
            _is_sell,
            _past_is_sell_1, _past_is_sell_2, _past_is_sell_3
        ), dim=0)
        data_list.append(x.tolist())
    data_list = torch.FloatTensor(data_list)
    return data_list

In [4]:
def mish(input):
    return input * torch.tanh(nn.functional.softplus(input))

class Mish(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return mish(input)

In [5]:
class residual_conv1d(nn.Module):

    def __init__(self, in_channel):
        super(residual_conv1d, self).__init__()
        
        self.mish = Mish()
        self.layer = nn.Sequential(
            nn.Conv1d(in_channel, in_channel, 1),
            Mish(),
            nn.Conv1d(in_channel, in_channel, 1)
        )

    def forward(self, x):
        x = x+self.layer(x)
        x = self.mish(x)
        return x

class Conv_1d_Net(nn.Module):

    def __init__(self, in_channel):
        super(Conv_1d_Net, self).__init__()
        
        self.layer_1 = nn.Sequential(
            nn.Conv1d(in_channel, 2*in_channel, 1),
            nn.Dropout(0.2),
            Mish(),
            residual_conv1d(2*in_channel)
        )
        
        self.layer_2 = nn.Sequential(
            nn.Conv1d(2*in_channel, 4*in_channel, 1),
            nn.Dropout(0.2),
            Mish(),
            residual_conv1d(4*in_channel)
        )
        
        self.layer_3 = nn.Sequential(
            nn.Conv1d(4*in_channel, 8*in_channel, 1),
            nn.Dropout(0.2),
            Mish(),
            residual_conv1d(8*in_channel)
        )
       
         
        self.avgpool1d = nn.AdaptiveAvgPool1d(1)
        
        self.fc = nn.Sequential(
            nn.Linear(8*in_channel, 8*in_channel),
            nn.Dropout(0.1),
            Mish(),
            nn.Linear(8*in_channel, 16*in_channel),
            nn.Dropout(0.1),
            Mish(),
            nn.Linear(16*in_channel, 28)
        ) 

    def forward(self, x):
        #_in = x.size()[1]
        x = self.layer_1(x)
        x = self.layer_2(x)
        x = self.layer_3(x)
        #x = self.layer_4(x)
        x = self.avgpool1d(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

In [6]:
from torch.optim.optimizer import Optimizer
import math

class RAdam(Optimizer):

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        self.buffer = [[None, None, None] for ind in range(10)]
        super(RAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RAdam, self).__setstate__(state)

    def step(self, closure=None):

        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                exp_avg.mul_(beta1).add_(1 - beta1, grad)

                state['step'] += 1
                buffered = self.buffer[int(state['step'] % 10)]
                if state['step'] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    else:
                        step_size = group['lr'] / (1 - beta1 ** state['step'])
                    buffered[2] = step_size

                if group['weight_decay'] != 0:
                    p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)

                # more conservative since it's an approximated value
                if N_sma >= 5:            
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
                else:
                    p_data_fp32.add_(-step_size, exp_avg)

                p.data.copy_(p_data_fp32)

        return loss

In [7]:
class Mydatasets(torch.utils.data.Dataset):
    def __init__(self, data, calendar):
        self.data = data
        self.calendar = calendar
        self.datanum = len(data)

    def __len__(self):
        return self.datanum

    def __getitem__(self, idx):
        _data = self.data[idx, :, :]
        x = torch.cat((_data, self.calendar), dim=0)
        return x

In [8]:
class WRMSSE(nn.Module):
    def __init__(self, df, calendar, prices):
        super(WRMSSE, self).__init__()
        self.df = df
        self.df['all_id'] = 0  # for lv1 aggregation
        self.df['index'] = self.df.index
        self.id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'all_id', 'index']
        self.calendar = calendar
        self.prices = prices
        
        self.group_ids = (
            ['all_id'],
            ['state_id'],
            ['store_id'],
            ['cat_id'],
            ['dept_id'],
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            ['item_id'],
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )
        
        
    def prepare_metrics(self, valid_d_cols):
        max_n = int(valid_d_cols[-1].replace('d_', ''))
        self.train_d_cols = [f'd_{i}' for i in range(1,max_n)]  #<-- コンペが last １ヶ月になったら 914 -->1942 変える
        #self.train_d_cols = [f'd_{i}' for i in range(1000,1914) if f'd_{i}' in valid_d_cols]  #<-- コンペが last １ヶ月になったら 914 -->1942 変える
        self.valid_d_cols = valid_d_cols
        self.weight_columns = self.train_d_cols[-28:]
        self.split_train_valid_data()
        self.get_weight()
        
        self.tensor_index = {}
        self.index_len = {}
        self.denominator = {}
        self.True_y = {}
        self.weight = {}
        for i, group_id in enumerate(tqdm(self.group_ids)):
            #  index dict
            self.tensor_index.update(self.df[['index']+group_id].groupby(group_id)['index'].unique().to_dict())
            self.index_len.update(self.df[['index']+group_id].groupby(group_id)['index'].nunique().to_dict())
            
            # denominator
            tmp_a = self.train_df.groupby(group_id)[self.train_d_cols].sum().T.to_dict()
            a = {}
            for key, value in tmp_a.items() :
                value = np.array(list(value.values()))
                value = ((value[1:]-value[:-1])**2).mean()
                if value<=0:
                    value=1
                a[key] = value
            self.denominator.update(a)
            
            #weight 
            lv_weight = self.weight_df.groupby(group_id)[self.weight_columns].sum().sum(axis=1)
            self.weight.update((lv_weight / lv_weight.sum()).to_dict())
            
        self.tensor_index = np.array(list(self.tensor_index.values()))
        self.index_len = torch.FloatTensor(list(self.index_len.values()))
        self.denominator = torch.FloatTensor(list(self.denominator.values()))
        self.True_y = torch.FloatTensor(list(self.df[self.valid_d_cols].values))
        self.weight = torch.FloatTensor(list(self.weight.values()))
        
        
    def split_train_valid_data(self):
        self.train_df = self.df[self.id_columns+self.train_d_cols]
        self.valid_df = self.df[self.id_columns+self.valid_d_cols]
        
    def get_weight(self):
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        weight_df = weight_df.loc[zip(self.train_df.item_id, self.train_df.store_id), :].reset_index(drop=True)
        self.weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)        
    
    def forward(self, pred, y):
        Loss=0
        for i in range(len(valid_criterion.index_len)):
            lv_tensor_index = self.tensor_index[i]
            d = 1/self.denominator[i]
            w = self.weight[i]
            Loss+=w*d*nn.MSELoss(pred[lv_tensor_index,:].sum(0),self.True_y[lv_tensor_index,:].sum(0))
        return Loss

In [9]:
%%time
original_train_df = pd.read_csv(path+'sales_train_validation.csv')
calendar_df = pd.read_csv(path+'calendar.csv')
sell_prices_df = pd.read_csv(path+'sell_prices.csv')
sample_submission_df = pd.read_csv(path+'sample_submission.csv')

Wall time: 4.53 s


In [10]:
train_criterion = WRMSSE(df=original_train_df, calendar=calendar_df, prices=sell_prices_df)
valid_criterion = WRMSSE(df=original_train_df, calendar=calendar_df, prices=sell_prices_df)

In [11]:
train_df, calendar_df, calendar_data, price_data, is_sell = Preprocessing(original_train_df, calendar_df, sell_prices_df)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))




'cuda'

In [12]:
d_cols = [f'd_{i}' for i in range(1,1914)]
n=200
i=0
if i==0:
    cols = d_cols[-n*(i+1):]
else:
    cols = d_cols[-n*(i+1):-n*i]

state='CA'
data_ca = make_data(cols, state, train_df, calendar_data, price_data, is_sell, sample_submission_df)
state='TX'
data_tx = make_data(cols, state, train_df, calendar_data, price_data, is_sell, sample_submission_df)
state='WI'
data_wi = make_data(cols, state, train_df, calendar_data, price_data, is_sell, sample_submission_df)


data = torch.cat(
    (data_ca, data_tx, data_wi),
    dim=0
)
calendar = make_calendar_data(calendar_data, cols)
del data_ca, data_tx, data_wi
gc.collect()

0

In [13]:
data_set=Mydatasets(data, calendar)
data_loader = torch.utils.data.DataLoader(data_set, batch_size = 200, shuffle = False)

In [14]:
for i, x in enumerate(data_loader):
    if i<5:
        print(x.size())
    pass

torch.Size([200, 56, 200])
torch.Size([200, 56, 200])
torch.Size([200, 56, 200])
torch.Size([200, 56, 200])
torch.Size([200, 56, 200])


In [23]:
in_size=56
model = Conv_1d_Net(in_size)

In [16]:
train_criterion.prepare_metrics(valid_d_cols=cols[-(28*2):-28])
gc.collect()

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




3

In [17]:
valid_criterion.prepare_metrics(valid_d_cols=cols[-28:])

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [21]:
def train_model(model, data_set, criterion_train, criterion_valid):
    num_epochs = 40
    lr = 1e-4
    eta_min = 1e-3
    t_max = 10
    numclass = 5
    
    model = model.to(device)
    #criterion = nn.MSELoss()
    optimizer = RAdam(params=model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=t_max, eta_min=eta_min)


    best_epoch = -1
    best_score = 10000
    early_stoppping_cnt = 0
    best_model = model
    
    for epoch in range(num_epochs):
        start_time = time.time()
        
        model.train()
        avg_loss = 0.
        
        
        train_pred = torch.tensor([])
        for i, x_batch in tqdm(enumerate(data_loader), total=len(data_loader)):
            optimizer.zero_grad()
            
            
            y_batch = x_batch[:,0,-28*2:-28]
            x_batch = x_batch[:,:,:-28*2]
            x_batch = x_batch.to(device)
            preds = model(x_batch)
            train_pred = torch.cat(
                (train_pred, preds.cpu()), dim=0
            )
            
            del preds; gc.collect()
            
        loss = criterion_train(train_pred)
        loss.backward()
        optimizer.step()
        avg_loss = loss.items()
        
        model.eval()
        
        valid_pred = torch.tensor([])
        for i, x_batch in enumerate(data_loader):
            y_batch = x_batch[:,0,-28:]
            x_batch = x_batch[:,:,:-28]
            x_batch = x_batch.to(device)
            preds = model(x_batch)
            valid_pred = torch.cat(
                (valid_pred, preds.cpu()),
                dim=0
            ) 
            del preds; gc.collect()
            
        loss = criterion_valid(valid_pred)
        avg_val_loss = loss.items()
        
        if best_score>avg_val_loss:
            best_score = avg_val_loss
            early_stoppping_cnt=0
            best_epoch=epoch
            best_model = model
            elapsed = time.time() - start_time
            p_avg_val_loss = termcolor.colored(np.round(avg_val_loss, 4),"red")
            
            print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {p_avg_val_loss} time: {elapsed:.0f}s')
        else:
            early_stoppping_cnt+=1
            elapsed = time.time() - start_time
            print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
        
        if (epoch>10) and (early_stoppping_cnt>7):
                break
    
    print(f'best_score : {best_score}    best_epoch : {best_epoch}')
    torch.save(model.state_dict(), 'net.pt')
    
    return best_model, best_score

In [26]:
gc.collect()

20

In [27]:
best_model, best_score = train_model(model, data_set, train_criterion, valid_criterion)

HBox(children=(FloatProgress(value=0.0, max=153.0), HTML(value='')))

RuntimeError: CUDA out of memory. Tried to allocate 50.00 MiB (GPU 0; 6.00 GiB total capacity; 3.91 GiB already allocated; 6.91 MiB free; 407.61 MiB cached)

In [None]:
class Loss(nn.Module):
    def __init__(self, tensor_index, denominator, weight, True_y):
        super(Loss, self).__init__()
        self.tensor_index = tensor_index
        self.denominator = denominator
        self.weight = weight
        self.mse = nn.MSELoss()
        self.True_y = True_y
    def forward(self, preds):
        Loss=0
        for i in range(len(valid_criterion.index_len)):
            lv_tensor_index = self.tensor_index[i]
            d = 1/self.denominator[i]
            w = self.weight[i]
            Loss+=w*d*mse(pred[lv_tensor_index,:].sum(0),self.True_y[lv_tensor_index,:].sum(0))
        return Loss