In [1]:
import os
import sys
import warnings
from pathlib import Path
from typing import Optional

parent_dir = 'Predict-Future-Sales'


def find_project_root(start: str, marker: str) -> Optional[str]:
    """Return the outermost ancestor of ``start`` whose name equals ``marker``.

    Args:
        start: A filesystem path (absolute or relative). It is only parsed,
            never touched on disk.
        marker: Exact directory name to look for among the path components.

    Returns:
        The path up to and including the first component equal to ``marker``,
        with a trailing separator, or ``None`` when no component matches
        (this includes the empty string, which has no components).
    """
    parts = Path(start).parts
    if marker not in parts:
        return None
    # tuple.index() yields the first match, i.e. the component closest to the
    # filesystem root, which is where a left-to-right scan would stop.
    root = Path(*parts[:parts.index(marker) + 1])
    # Joining with '' appends the platform separator.
    return os.path.join(str(root), '')


# Point the first import location at the repository root so that the
# `module.*` imports below resolve when the notebook lives in a sub-directory.
p_sub = sys.path[0]
ride = find_project_root(p_sub, parent_dir)
if ride is None:
    # Leave sys.path alone: rewriting an empty or unrelated entry would either
    # install the filesystem root or merely append a separator.
    warnings.warn(
        f"'{parent_dir}' not found in sys.path[0] ({p_sub!r}); sys.path left unchanged"
    )
    ride = p_sub
else:
    sys.path[0] = ride

import numpy as np
import pandas as pd
from pandas import DatetimeIndex
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset

from module.lino_module.preprocess import _mode_of_freq, _expand_and_split,\
                                         _time_delay_embedding, _src_tgt_split, _to_torch_dataset
from typing import Tuple, Optional
from numpy import ndarray
from pandas import DataFrame, Series
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


### Time Delay Embedding に対応させた曜日と月時情報をd_modelにconcatしたデータセットを出力する

In [2]:
from module.lino_module.preprocess import weekly_monthly_tde_dataset

# Settings for the demo run; they are passed positionally to the builder below.
seq = 7
d_model = 4          # the recorded batch has 12 features -- presumably 3 * d_model; confirm
dilation = 0
src_tgt_seq = (6, 2)
batch_size = 64

# Raw sales table, read relative to the notebook's working directory.
df = pd.read_csv('../data/sales_train.csv')

train, test = weekly_monthly_tde_dataset(
    df, seq, d_model, dilation, src_tgt_seq, batch_size
)

# Pull a single batch from the training loader and display the three shapes.
src, tgt, y = next(iter(train))
tuple(batch_part.shape for batch_part in (src, tgt, y))

(torch.Size([64, 6, 12]), torch.Size([64, 2, 12]), torch.Size([64, 1]))

In [3]:
def _delay_embeddings(x: np.ndarray,
                      y: np.ndarray,
                      index: DatetimeIndex,
                      d_model: int,
                      dilation: int,
                      seq: int,
                      weekly=True,
                      monthly=True) -> Tuple[np.ndarray, ]:
    """Time-delay-embed the series and append embedded weekday / month labels.

    ``x`` and ``y`` are embedded with ``_time_delay_embedding``. For every
    enabled flag, a label sequence with one entry per element of ``index`` is
    windowed by ``_expand_and_split(..., seq)``, embedded with the same
    ``d_model`` / ``dilation``, and concatenated to the features on axis 2.

    Args:
        x: Input windows forwarded to ``_time_delay_embedding``.
        y: Targets forwarded to ``_time_delay_embedding``.
        index: Datetime index of the source series; its length drives the
            weekday cycle and its ``month`` attribute drives the month label.
        d_model: Forwarded to ``_time_delay_embedding``.
        dilation: Forwarded to ``_time_delay_embedding``.
        seq: Window length forwarded to ``_expand_and_split`` for the labels.
        weekly: Append the 7-step cyclic label when true.
        monthly: Append the scaled month label when true.

    Returns:
        ``(features, label)`` where ``label`` is the second value returned by
        ``_time_delay_embedding(x, y, ...)``, passed through unchanged.
    """

    def _embed_label(sequence):
        # Window the label sequence, then embed it; the second value returned
        # by `_expand_and_split` is not needed for labels.
        windows, _ = _expand_and_split(sequence, seq)
        return _time_delay_embedding(windows, None, d_model, dilation)

    # Time Delay Embedding of the series itself.
    features, label = _time_delay_embedding(x, y, d_model, dilation)

    # Weekday label
    if weekly:
        # Seven evenly spaced values in [0, 1], scaled for positional encoding.
        cycle = list(np.linspace(0, 1, 7))
        # The cycle is repeated by position only: entry i gets cycle[i % 7];
        # the calendar weekday of `index` is not consulted.
        weekday_label = [cycle[pos % 7] for pos in range(len(index))]
        features = np.concatenate((features, _embed_label(weekday_label)), axis=2)

    # Month label
    if monthly:
        # Map month 1..12 onto [0, 1], scaled for positional encoding.
        month_label = (index.month - 1) / 11
        features = np.concatenate((features, _embed_label(month_label)), axis=2)

    return features, label

In [4]:
def weekly_monthly_tde_dataset(data: DataFrame,
                               seq: int,
                               d_model: int,
                               dilation: int,
                               src_tgt_seq: Tuple[int, int],
                               batch_size: int,
                               trg_column='item_cnt_day') -> Tuple[DataLoader, DataLoader]:
    """Build train/test loaders with time-delay-embedded weekday and month labels.

    Pipeline: aggregate ``data`` with ``_mode_of_freq``, pick ``trg_column``,
    standardise it, window it with ``_expand_and_split``, embed it together
    with the weekday / month labels via ``_delay_embeddings``, split the
    result with ``_src_tgt_split`` and wrap it with ``_to_torch_dataset``.

    Args:
        data: Raw table handed to ``_mode_of_freq``.
        seq: Window length forwarded to ``_expand_and_split`` and
            ``_delay_embeddings``.
        d_model: Forwarded to ``_delay_embeddings``.
        dilation: Forwarded to ``_delay_embeddings``.
        src_tgt_seq: Two integers unpacked as the trailing arguments of
            ``_src_tgt_split``.
        batch_size: Forwarded to ``_to_torch_dataset``.
        trg_column: Name of the column to model.

    Returns:
        The ``(train, test)`` pair produced by ``_to_torch_dataset``.

    Raises:
        KeyError: If ``trg_column`` is not a column of the aggregated frame.
    """
    # Select by key rather than by attribute: attribute access silently
    # returns a DataFrame method for names such as 'count' or 'size' and
    # cannot express names that are not valid identifiers.
    target = _mode_of_freq(data)[trg_column]
    index = target.index

    # NOTE(review): the scaler is fit on the whole series before the
    # train/test loaders are produced; if `_to_torch_dataset` splits the
    # samples, test statistics leak into the scaling -- confirm.
    scaled = StandardScaler().fit_transform(target.values.reshape(-1, 1))
    scaled = scaled.reshape(-1)

    x, y = _expand_and_split(scaled, seq)
    tded, label = _delay_embeddings(
        x, y,
        index,
        d_model,
        dilation,
        seq,
        weekly=True, monthly=True)
    src, tgt = _src_tgt_split(tded, *src_tgt_seq)
    train, test = _to_torch_dataset(src, tgt, label, batch_size)
    return train, test