In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os, gc
import math, random
import pickle
import datetime, time
from tqdm import tqdm_notebook as tqdm

import torch 
from torch import nn
from torch import optim

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

%matplotlib inline

In [2]:
def Preprocessing(train_df, calendar_df, sell_prices_df):
    """Derive the calendar, price and availability tables from the raw frames.

    Parameters
    ----------
    train_df : pandas.DataFrame
        One row per series; must contain an ``id`` column.
    calendar_df : pandas.DataFrame
        Must contain ``d``, ``wm_yr_wk``, ``wday``, ``month``, ``snap_CA``,
        ``snap_TX``, ``snap_WI``, ``event_type_1`` and ``event_type_2``.
    sell_prices_df : pandas.DataFrame
        Must contain ``item_id``, ``store_id``, ``wm_yr_wk`` and
        ``sell_price``. NOTE: an ``id`` column is added to this frame in
        place (``<item_id>_<store_id>_validation``).

    Returns
    -------
    tuple
        ``(train_df, calendar_df, calendar_data, price_data, is_sell)`` where

        * ``train_df`` is a copy of the input whose index holds the ``id``
          values (the ``id`` column is kept),
        * ``calendar_df`` is the input object, returned unchanged,
        * ``calendar_data`` has one row per calendar feature and one column
          per ``d`` label,
        * ``price_data`` has one row per ``d`` label and one column per
          series id, with missing prices replaced by 0,
        * ``is_sell`` has one row per series id and one column per ``d``
          label; 1.0 where a price exists, 0.0 otherwise.

    Raises
    ------
    KeyError
        If an id of ``train_df`` has no row in ``sell_prices_df``.
    """
    sell_prices_df['id'] = sell_prices_df['item_id'].astype('str')+'_'+sell_prices_df['store_id']+'_validation'

    # One-hot encode each event slot from ITS OWN column. Both frames are then
    # aligned on the union of the observed categories so that every
    # '<type>_event_type_1' row has a matching '<type>_event_type_2' row even
    # when a type never occurs in one of the slots.
    event_type_1 = pd.get_dummies(calendar_df.event_type_1)
    event_type_2 = pd.get_dummies(calendar_df.event_type_2)
    event_types = event_type_1.columns.union(event_type_2.columns)
    event_type_1 = event_type_1.reindex(columns=event_types, fill_value=0).astype(int)
    event_type_2 = event_type_2.reindex(columns=event_types, fill_value=0).astype(int)
    event_type_1.columns = [f'{col}_event_type_1' for col in event_type_1.columns]
    event_type_2.columns = [f'{col}_event_type_2' for col in event_type_2.columns]

    calendar_data = pd.concat([
        calendar_df[['wday', 'd', 'month', 'snap_CA', 'snap_TX', 'snap_WI']],
        event_type_1,
        event_type_2
    ], axis=1)
    # Features become rows, day labels become columns.
    calendar_data = calendar_data.set_index('d').T

    # Keep only the price rows whose week appears in the calendar.
    known_weeks = calendar_df.wm_yr_wk.unique()
    sell_prices_data = sell_prices_df[sell_prices_df.wm_yr_wk.isin(known_weeks)].reset_index(drop=True)

    # id -> {week -> price}
    weekly_prices = {
        series_id: group.set_index('wm_yr_wk')['sell_price'].to_dict()
        for series_id, group in sell_prices_data.groupby('id')
    }

    # Expand the weekly prices to one value per calendar day; weeks without a
    # price map to NaN.
    week_of_day = calendar_df.wm_yr_wk
    price_data = {}
    for col in tqdm(train_df.id.unique()):
        price_data[col] = week_of_day.map(weekly_prices[col])
    price_data = pd.DataFrame(price_data)
    price_data.index = calendar_df.d

    # Availability flag must be taken before the NaNs are replaced by 0.
    is_sell = price_data.notnull().astype(float).T
    price_data = price_data.fillna(0)

    # Index the training frame by id while keeping the 'id' column. Done on a
    # copy so the caller's frame is untouched and column dtypes are preserved.
    train_df = train_df.copy()
    train_df.index = train_df['id'].values

    return train_df, calendar_df, calendar_data, price_data, is_sell


def make_calendar_data(calendar_data, train_cols):
    """Assemble the calendar feature tensor for the requested day columns.

    The result stacks, row-wise, the ``wday`` and ``month`` rows, the eight
    event-type rows, and one shifted copy of the eight event-type rows for
    each offset in (-14, -7, 7, 14, 28, 56). A row labelled ``*_shift{k}``
    holds, at each day, the value the source row has ``k`` columns later
    (earlier for negative ``k``); positions shifted in from outside the
    table are NaN.

    Parameters
    ----------
    calendar_data : pandas.DataFrame
        Features as rows, day labels as columns.
    train_cols : list
        Day labels to keep, in output column order.

    Returns
    -------
    torch.FloatTensor
        Shape ``(58, len(train_cols))``.
    """
    event_rows = [
        'Cultural_event_type_1', 'National_event_type_1', 'Religious_event_type_1', 'Sporting_event_type_1',
        'Cultural_event_type_2', 'National_event_type_2', 'Religious_event_type_2', 'Sporting_event_type_2'
    ]
    base = calendar_data.loc[['wday', 'month'] + event_rows, :]
    events = base.loc[event_rows, :]

    blocks = [base]
    for shift in (-14, -7, 7, 14, 28, 56):
        # Shift along the day axis: transpose so days are rows, shift, restore.
        moved = events.T.shift(-shift).T
        moved.index = [f'{col}_shift{shift}' for col in moved.index]
        blocks.append(moved)

    stacked = pd.concat(blocks, axis=0)
    selected = stacked[train_cols]
    return torch.FloatTensor(selected.values.astype(float))

def make_data(train_cols, state, train_df, calendar_data, price_data, is_sell_data):
    """Build the per-product feature tensor for all products of one state.

    For every product whose ``state_id`` equals ``state`` the result holds 16
    feature rows over ``train_cols``, in this order:

    0      sales values from ``train_df``
    1      ``snap_<state>`` row of ``calendar_data``
    2-7    that row shifted by 1, 3, 5, 7, 14 and 28 columns
    8      price
    9-11   price shifted by 3, 7 and 14 rows of ``price_data``
    12     availability flag from ``is_sell_data``
    13-15  availability flag shifted by 3, 7 and 14 columns

    Parameters
    ----------
    train_cols : list
        Day labels to use, in output order (must be a ``list``; it is
        concatenated with the id column names).
    state : str
        Value matched against ``train_df.state_id`` and used to pick the
        ``snap_<state>`` calendar row.
    train_df : pandas.DataFrame
        Must be indexed by the values of its ``id`` column (products are
        looked up with ``.loc`` on those ids).
    calendar_data : pandas.DataFrame
        Features as rows, day labels as columns.
    price_data : pandas.DataFrame
        Day labels as rows, product ids as columns.
    is_sell_data : pandas.DataFrame
        Product ids as rows, day labels as columns.

    Returns
    -------
    torch.Tensor
        float32 tensor of shape ``(n_products, 16, len(train_cols))``.

    Side effects
    ------------
    Prints the number of selected products, then the NaN counts of the three
    shifted price tables and of the three shifted availability tables.
    """
    def as_tensor(frame):
        # float32 tensor with the frame's row/column layout.
        return torch.tensor(frame.values.astype(float), dtype=torch.float32)

    data_train = train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']+train_cols]

    train_product = data_train[data_train.state_id==state]['id'].unique()
    print(len(train_product))

    data = data_train.loc[train_product,train_cols]

    # SNAP flag of this state plus its shifted copies (shared by all products).
    snap_row = calendar_data.loc[[f'snap_{state}'], :]
    snap_blocks = [snap_row]
    for shift in [1, 3, 5, 7, 14, 28]:
        lagged = snap_row.T.shift(shift).T
        lagged.index = [f'{col}_shift{shift}' for col in lagged.index]
        snap_blocks.append(lagged)
    calendar = pd.concat(snap_blocks, axis=0)[train_cols]

    # Select this state's products BEFORE shifting/transposing so the work is
    # proportional to the state, not to the whole table.
    state_prices = price_data.loc[:, train_product]
    price = state_prices.loc[train_cols].T
    past_price_1 = state_prices.shift(3).loc[train_cols].T
    past_price_2 = state_prices.shift(7).loc[train_cols].T
    past_price_3 = state_prices.shift(14).loc[train_cols].T

    state_is_sell = is_sell_data.loc[train_product, :]
    is_sell = state_is_sell[train_cols]
    past_is_sell_1 = state_is_sell.T.shift(3).T[train_cols]
    past_is_sell_2 = state_is_sell.T.shift(7).T[train_cols]
    past_is_sell_3 = state_is_sell.T.shift(14).T[train_cols]

    # Shifts introduce NaN at the start of the table; report how many ended
    # up inside the selected window.
    print(
        past_price_1.isnull().sum().sum(), 
        past_price_2.isnull().sum().sum(), 
        past_price_3.isnull().sum().sum()
    )
    print(
        past_is_sell_1.isnull().sum().sum(),
        past_is_sell_2.isnull().sum().sum(),
        past_is_sell_3.isnull().sum().sum()
    )

    n_products = data.shape[0]

    def per_product(frame):
        # (n_products, T) -> (n_products, 1, T): one feature row per product.
        return as_tensor(frame).unsqueeze(1)

    # (7, T) -> (n_products, 7, T): same calendar rows repeated per product.
    calendar_block = as_tensor(calendar).unsqueeze(0).expand(n_products, -1, -1)

    # Single concatenation along the feature axis instead of building nested
    # Python lists product by product.
    return torch.cat((
        per_product(data), calendar_block,
        per_product(price),
        per_product(past_price_1), per_product(past_price_2), per_product(past_price_3),
        per_product(is_sell),
        per_product(past_is_sell_1), per_product(past_is_sell_2), per_product(past_is_sell_3)
    ), dim=1)

In [3]:
# NOTE(review): a function with this same name is already defined in the
# previous cell; this later definition is the one in effect once the cell runs.
def make_calendar_data(calendar_data, train_cols):
    """Assemble the calendar feature tensor for the requested day columns.

    The result stacks, row-wise, the ``wday`` and ``month`` rows, the eight
    event-type rows, and one shifted copy of the eight event-type rows for
    each offset in (-14, -7, 7, 14, 28, 56). A row labelled ``*_shift{k}``
    holds, at each day, the value the source row has ``k`` columns later
    (earlier for negative ``k``); positions shifted in from outside the
    table are NaN.

    Parameters
    ----------
    calendar_data : pandas.DataFrame
        Features as rows, day labels as columns.
    train_cols : list
        Day labels to keep, in output column order.

    Returns
    -------
    torch.FloatTensor
        Shape ``(58, len(train_cols))``.
    """
    event_rows = [
        'Cultural_event_type_1', 'National_event_type_1', 'Religious_event_type_1', 'Sporting_event_type_1',
        'Cultural_event_type_2', 'National_event_type_2', 'Religious_event_type_2', 'Sporting_event_type_2'
    ]
    base = calendar_data.loc[['wday', 'month'] + event_rows, :]
    events = base.loc[event_rows, :]

    blocks = [base]
    for shift in (-14, -7, 7, 14, 28, 56):
        # Shift along the day axis: transpose so days are rows, shift, restore.
        moved = events.T.shift(-shift).T
        moved.index = [f'{col}_shift{shift}' for col in moved.index]
        blocks.append(moved)

    stacked = pd.concat(blocks, axis=0)
    selected = stacked[train_cols]
    return torch.FloatTensor(selected.values.astype(float))

In [4]:
# Directory containing the competition CSV files. Set the M5_DATA_DIR
# environment variable to point somewhere else; otherwise the original local
# path is used.
path = os.environ.get('M5_DATA_DIR', '/Users/kanoumotoharu/Downloads/m5-forecasting-accuracy/')
#path = '/Users/abcdm/Downloads/m5-forecasting-accuracy/'
#path = '../input/m5-forecasting-accuracy/'

# os.path.join tolerates a directory given with or without a trailing slash.
original_train_df = pd.read_csv(os.path.join(path, 'sales_train_validation.csv'))
calendar_df = pd.read_csv(os.path.join(path, 'calendar.csv'))
sell_prices_df = pd.read_csv(os.path.join(path, 'sell_prices.csv'))
sample_submission_df = pd.read_csv(os.path.join(path, 'sample_submission.csv'))

In [5]:
%%time
# Derive the id-indexed training frame and the calendar / price / availability
# tables from the raw CSV frames.
(train_df, calendar_df, calendar_data, price_data, is_sell) = Preprocessing(
    original_train_df,
    calendar_df,
    sell_prices_df,
)

HBox(children=(IntProgress(value=0, max=30490), HTML(value='')))


CPU times: user 54.1 s, sys: 4.25 s, total: 58.4 s
Wall time: 58.8 s


<p>過去58日分のデータですべて入る。</p>

In [6]:
%%time
# Day labels d_1 .. d_1913.
d_cols = ['d_' + str(day) for day in range(1, 1914)]

# Use the last `n` days as the training window.
n = 200
train_cols = d_cols[-n:]
null_check_cols = d_cols[-(n + 14):]

#'snap_CA', 'snap_TX', 'snap_WI'

# Build one feature tensor per state, in the order CA, TX, WI.
state_tensors = []
for state in ('CA', 'TX', 'WI'):
    state_tensors.append(
        make_data(train_cols, state, train_df, calendar_data, price_data, is_sell)
    )
data_ca, data_tx, data_wi = state_tensors
# Drop the temporary list so it does not keep the tensors alive.
del state_tensors

calendar = make_calendar_data(calendar_data, train_cols)

12196
0 0 0
0 0 0


HBox(children=(IntProgress(value=0, max=12196), HTML(value='')))


9147
0 0 0
0 0 0


HBox(children=(IntProgress(value=0, max=9147), HTML(value='')))


9147
0 0 0
0 0 0


HBox(children=(IntProgress(value=0, max=9147), HTML(value='')))


CPU times: user 21.5 s, sys: 5.83 s, total: 27.3 s
Wall time: 27.9 s


In [7]:
# Display the shapes of the three per-state tensors and of the calendar tensor.
data_ca.size(), data_tx.size(), data_wi.size(), calendar.size()

(torch.Size([12196, 16, 200]),
 torch.Size([9147, 16, 200]),
 torch.Size([9147, 16, 200]),
 torch.Size([58, 200]))

In [8]:
# Stack the per-state tensors along the product axis (CA, then TX, then WI).
per_state_tensors = (data_ca, data_tx, data_wi)
data = torch.cat(per_state_tensors, dim=0)
del per_state_tensors

In [9]:
# Display the shape of the combined tensor.
data.size()

torch.Size([30490, 16, 200])

In [10]:
# Display the shape of the training frame returned by Preprocessing.
train_df.shape

(30490, 1919)

In [11]:
# The per-state tensors have been concatenated into `data`; drop their names
# and run a garbage collection pass (the cell displays gc.collect()'s result).
del data_ca, data_tx, data_wi
gc.collect()

101

In [12]:
# Display the combined tensor's shape again after deleting the per-state tensors.
data.size()

torch.Size([30490, 16, 200])

In [17]:
# Count the NaN entries in the combined tensor.
torch.isnan(data).sum()

tensor(0)