In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os, gc
import math, random
import pickle
import datetime, time
from tqdm import tqdm_notebook as tqdm

import torch 
from torch import nn
from torch import optim

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

%matplotlib inline

In [2]:
def Preprocessing(train_df, calendar_df, sell_prices_df):
    """Build calendar features, a daily price table and price-masked sales.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Wide sales table with an ``id`` column and one ``d_<n>`` column per
        day. It is not modified.
    calendar_df : pandas.DataFrame
        One row per day. Must provide ``d``, ``wm_yr_wk``, ``wday``, ``month``,
        ``snap_CA``, ``snap_TX``, ``snap_WI``, ``event_type_1`` and
        ``event_type_2``.
    sell_prices_df : pandas.DataFrame
        Weekly prices with ``item_id``, ``store_id``, ``wm_yr_wk`` and
        ``sell_price``. NOTE: an ``id`` column
        (``<item_id>_<store_id>_validation``) is added to this frame in place.

    Returns
    -------
    train_df : pandas.DataFrame
        Copy of the input indexed by ``id``. The day columns are float and are
        NaN on every day for which the item has no price.
    calendar_df : pandas.DataFrame
        The calendar frame that was passed in, unchanged.
    calendar_data : pandas.DataFrame
        Feature rows (``wday``, ``month``, ``snap_*``, ``<type>_event_type_1``,
        ``<type>_event_type_2``) by day columns (labelled with ``d``).
    price_data : pandas.DataFrame
        Daily price per item: index is ``d``, columns are the ids of
        ``train_df`` in order of first appearance. Ids without any price row
        yield an all-NaN column.

    Raises
    ------
    KeyError
        If a day column of ``train_df`` is missing from ``calendar_df['d']``.
    """
    # Key every price row with the same id that train_df uses.
    sell_prices_df['id'] = sell_prices_df['item_id'].astype('str')+'_'+sell_prices_df['store_id']+'_validation'
    d_cols = [col for col in train_df.columns if str(col).startswith('d_')]

    # --- Calendar features -------------------------------------------------
    # Both event columns are encoded over the union of their categories, so
    # every `<type>_event_type_1` row has a matching `<type>_event_type_2` row
    # even when a type never occurs as a second event. Each set of dummies is
    # built from its own column.
    event_columns = ['event_type_1', 'event_type_2']
    event_categories = sorted(set().union(*(calendar_df[col].dropna() for col in event_columns)))
    event_dtype = pd.api.types.CategoricalDtype(categories=event_categories)
    event_frames = []
    for col in event_columns:
        dummies = pd.get_dummies(calendar_df[col].astype(event_dtype))
        dummies.columns = [f'{category}_{col}' for category in dummies.columns]
        event_frames.append(dummies)

    calendar_data = pd.concat(
        [calendar_df[['wday', 'd', 'month', 'snap_CA', 'snap_TX', 'snap_WI']]] + event_frames,
        axis=1,
    )
    calendar_data = calendar_data.set_index('d').T

    # --- Daily prices --------------------------------------------------------
    # Prices are weekly: pivot to (week x id), then repeat each week's row for
    # every calendar day that belongs to it.
    calendar_weeks = calendar_df['wm_yr_wk']
    sell_prices_data = sell_prices_df[sell_prices_df['wm_yr_wk'].isin(calendar_weeks.unique())]
    weekly_prices = (
        sell_prices_data
        # If an (id, week) pair is repeated, the last row wins.
        .drop_duplicates(subset=['id', 'wm_yr_wk'], keep='last')
        .pivot(index='wm_yr_wk', columns='id', values='sell_price')
    )
    price_data = weekly_prices.reindex(
        index=calendar_weeks.values,
        columns=train_df['id'].unique(),
    )
    price_data.index = calendar_df['d'].values
    price_data.index.name = 'd'
    price_data.columns.name = None

    # --- Sales masked by price availability ---------------------------------
    sales = train_df.copy()
    sales.index = sales['id'].values
    if d_cols:
        sales = sales.astype({col: float for col in d_cols})
        # (items x days) mask, aligned by label with the sales rows/columns.
        price_missing = price_data.loc[d_cols, sales.index].isnull().values.T
        sales[d_cols] = sales[d_cols].mask(price_missing).values

    return sales, calendar_df, calendar_data, price_data



In [12]:
def make_data(train_cols, null_check_cols, state, train_df, calendar_data, price_data):
    """Assemble the sales, calendar and price tensors for one state.

    Parameters
    ----------
    train_cols : list of str
        Day columns (e.g. ``'d_1714'``) to export, in order.
    null_check_cols : list of str
        Day columns that must all be non-null for an item to be kept.
    state : str
        State code; selects items whose ``state_id`` equals it and the
        ``snap_<state>`` calendar row.
    train_df : pandas.DataFrame
        Sales table whose index labels are the item ids (rows are looked up by
        id) and which has the meta columns ``id``, ``item_id``, ``dept_id``,
        ``cat_id``, ``store_id``, ``state_id`` plus the day columns.
    calendar_data : pandas.DataFrame
        Calendar features as rows, days as columns.
    price_data : pandas.DataFrame
        Daily prices, days as index and item ids as columns.

    Returns
    -------
    tuple of torch.FloatTensor
        ``(data, calendar, price, price_1, price_2, price_3,
        past_price_1, past_price_2, past_price_3)``.
        ``data`` and every price tensor are (items x len(train_cols));
        ``calendar`` is (features x len(train_cols)).
        ``price_1/2/3`` hold the price 3/7/14 rows later in ``price_data``,
        ``past_price_1/2/3`` the price 3/7/14 rows earlier.
    """
    meta_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # Keep only items with complete sales over the null-check window.
    complete = train_df[null_check_cols].notnull().all(axis=1)
    data_train = train_df.loc[complete, meta_cols + train_cols]

    train_product = data_train.loc[data_train['state_id'] == state, 'id'].unique()
    data = data_train.loc[train_product, train_cols]

    event_index = [
        'Cultural_event_type_1', 'National_event_type_1', 'Religious_event_type_1', 'Sporting_event_type_1',
        'Cultural_event_type_2', 'National_event_type_2', 'Religious_event_type_2', 'Sporting_event_type_2'
    ]
    calendar_index = ['wday', 'month', f'snap_{state}'] + event_index

    base_calendar = calendar_data.loc[calendar_index, :]
    events = base_calendar.loc[event_index, :]
    # Row `<event>_shift{k}` carries the event flag found k columns (days)
    # later; negative k looks back. Days shifted in from outside the calendar
    # are NaN. All pieces are collected first and concatenated once.
    calendar_parts = [base_calendar]
    for shift in [-14, -7, 7, 14, 28, 56]:
        shifted_events = events.T.shift(-shift).T
        shifted_events.index = [f'{col}_shift{shift}' for col in shifted_events.index]
        calendar_parts.append(shifted_events)
    calendar = pd.concat(calendar_parts, axis=0)[train_cols]

    # Restrict to the selected products before shifting/transposing so the
    # full price table is never copied.
    product_prices = price_data.loc[:, train_product]

    def shifted_price(periods):
        # Shift along the day axis, then return (items x train_cols).
        return product_prices.shift(periods).T[train_cols]

    price = product_prices.T[train_cols]
    price_1 = shifted_price(-3)
    price_2 = shifted_price(-7)
    price_3 = shifted_price(-14)

    past_price_1 = shifted_price(3)
    past_price_2 = shifted_price(7)
    past_price_3 = shifted_price(14)

    # Diagnostic: number of missing values in the future / past price windows.
    print(
        price_1.isnull().sum().sum(),
        price_2.isnull().sum().sum(),
        price_3.isnull().sum().sum())
    print(
        past_price_1.isnull().sum().sum(), 
        past_price_2.isnull().sum().sum(), 
        past_price_3.isnull().sum().sum()
    )

    def to_tensor(frame):
        return torch.FloatTensor(frame.values.astype(float))

    frames = (
        data, calendar, price, price_1, price_2, price_3,
        past_price_1, past_price_2, past_price_3,
    )
    return tuple(to_tensor(frame) for frame in frames)
    

In [4]:
# Directory that holds the M5 competition CSV files. Set the M5_DATA_DIR
# environment variable to point at your own copy; the previous hard-coded
# location is kept as the default. `path` always ends with a path separator,
# so `path + 'file.csv'` keeps working.
path = os.path.join(
    os.environ.get('M5_DATA_DIR', '/Users/kanoumotoharu/Downloads/m5-forecasting-accuracy/'),
    '',
)

original_train_df = pd.read_csv(path+'sales_train_validation.csv')
calendar_df = pd.read_csv(path+'calendar.csv')
sell_prices_df = pd.read_csv(path+'sell_prices.csv')
sample_submission_df = pd.read_csv(path+'sample_submission.csv')

In [5]:
%%time
# Build the modelling inputs from the raw CSV frames:
#   train_df      - sales indexed by item id, NaN on days without a price
#   calendar_data - calendar features as rows, days as columns
#   price_data    - daily price per item (days as index, item ids as columns)
# `calendar_df` is rebound to the calendar frame returned by Preprocessing.
train_df, calendar_df, calendar_data, price_data = Preprocessing(original_train_df, calendar_df, sell_prices_df)

HBox(children=(IntProgress(value=0, max=30490), HTML(value='')))




In [13]:
# Day column labels d_1 ... d_1913.
d_cols = [f'd_{i}' for i in range(1,1914)]
# Export the most recent 200 days.
train_cols = d_cols[-200:]
# Null-check window: the 200 exported days plus the 14 days before them.
null_check_cols = d_cols[-214:]
# `state` picks the calendar SNAP row `snap_<state>`; the calendar provides
# 'snap_CA', 'snap_TX' and 'snap_WI'.
state='CA'
data, calendar, price, price_1, price_2, price_3, past_price_1, past_price_2, past_price_3 = make_data(train_cols, null_check_cols, state, train_df, calendar_data, price_data)


0 0 0
0 0 0


In [18]:
# Sanity check: print the NaN entries of every tensor; an empty tensor means
# that tensor contains no NaN.
for tnsr in [data, calendar, price, price_1, price_2, price_3, past_price_1, past_price_2, past_price_3]:
    print(tnsr[torch.isnan(tnsr)])

tensor([])
tensor([])
tensor([])
tensor([])
tensor([])
tensor([])
tensor([])
tensor([])
tensor([])
