In [1]:
from typing import List, Tuple
import torch
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def mode_of_freq(data: pd.DataFrame,
                 key='date',
                 freq='D',
                 mode='sum'
                 ) -> pd.DataFrame:
    """データを基本統計量で統合する
    引数:
        data: 対象を含むオリジナルデータ
        key: 時間軸のカラム名
        freq: グループ単位（D: 日ごと, M: 月ごと, Y: 年ごと）
        mode: 統計量（sum, mean, etc）
    """
    # 日付をobjectからdate_time型に変更
    data[key] = pd.to_datetime(data[key], format=('%d.%m.%Y'))
    # 時系列(key)についてグループ単位(freq)の売上数の基本統計量(mode)で出力
    mode_of_key = getattr(data.groupby(pd.Grouper(key=key, freq=freq)), mode)
    return mode_of_key()

In [3]:
def expand_and_split(ds: pd.Series, seq: int) -> Tuple[np.ndarray]:
    """2次元にd_modelずらしたデータと正解データを作成する
    引数:
        ds: 単変量時系列データ
        seq: transformerのシーケンス
    """
    endpoint = len(ds) - (seq + 1)
    expanded = np.stack([ds[i: i + seq + 1] for i in range(0, endpoint)])
    x = expanded[:, :-1]
    y = expanded[:, -1]
    return x, y #,expanded  # for debag

In [4]:
def time_delay_embedding(x: np.ndarray,
                            y:np.ndarray,
                            d_model=32,
                            dilation=1) -> Tuple[np.ndarray]:
    """Time Delay Embedding
    引数:
        x: 訓練データ
        y: 正解データ
        d_model: エンべディング次元数
        dilation: エンべディングの膨張率 
    """
    endpoint = x.shape[0] - d_model * dilation
    span = d_model * dilation

    tded = [x[i: i + span: dilation, :].T for i in range(endpoint)] 
    y = y[span - dilation:]
    return np.array(tded), np.array(y)

## for debag
# i = 0
# print(expanded[i: i + span: dilation, :][-1,   -2:])
# print(tded[i][-1, -1], y_[i])

In [5]:
def src_tgt_split(tded: np.ndarray,
                   src_seq: int,
                   tgt_seq: int) -> Tuple[np.ndarray]:
    """エンコーダ入力とデコーダ入力への分割"""
    src = tded[:, :src_seq]
    tgt = tded[:, -tgt_seq:]
    return src, tgt

In [8]:
def to_torch_dataset(src: np.ndarray,
                     tgt: np.ndarray,
                     label: np.ndarray,
                     batch_size: int) -> object:
    """Pytorch用のデータセットへの変換
    引数:
        src: エンコーダ入力データ
        tgt: デコーダ入力データ
        label: 正解データ
        batch_size: ミニバッチのバッチサイズ
    """
    label = label.reshape(-1, 1)[:len(src)]
    pack = (src, tgt , label)
    pack = [torch.from_numpy(i.astype(np.float32)).clone() for i in pack]
    dataset = torch.utils.data.TensorDataset(*pack)
    dataset = torch.utils.data.DataLoader(dataset, batch_size, shuffle=False)
    return dataset

In [9]:
def time_series_dataset(data,
                        trg_column = 'item_cnt_day',
                        seq=7,
                        d_model=32,
                        dilation=1,
                        src_tgt_seq=(6, 2),
                        batch_size=64):
    data = getattr(mode_of_freq(data), trg_column)
    x, y = expand_and_split(data, seq=7)
    tded, label = time_delay_embedding(x, y, d_model=32, dilation=1)
    src, tgt = src_tgt_split(tded, *src_tgt_seq)
    dataset = to_torch_dataset(src, tgt, label, batch_size=64)
    return dataset

In [1]:

from module.lino import time_series_dataset

data = pd.read_csv('../data/sales_train.csv')
dataset = time_series_dataset(data)

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'pd' is not defined

In [2]:
sys.path

['/Users/lino/Desktop/Predict-Future-Sales/osada_notebook',
 '/Users/lino/.vscode/extensions/ms-toolsai.jupyter-2022.11.1003412109/pythonFiles',
 '/Users/lino/.vscode/extensions/ms-toolsai.jupyter-2022.11.1003412109/pythonFiles/lib/python',
 '/Users/lino/opt/anaconda3/envs/datascience/lib/python39.zip',
 '/Users/lino/opt/anaconda3/envs/datascience/lib/python3.9',
 '/Users/lino/opt/anaconda3/envs/datascience/lib/python3.9/lib-dynload',
 '',
 '/Users/lino/opt/anaconda3/envs/datascience/lib/python3.9/site-packages']

In [None]:
import torch.nn as nn

In [None]:
d_model = 32
encoder_layer = nn.TransformerEncoderLayer(d_model=32, nhead=8, batch_first=True)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(64,7,32)
memory = transformer_encoder(src)
print(memory.shape)

In [None]:
decoder_layer = nn.TransformerDecoderLayer(d_model, nhead=8, batch_first=True)
transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
tgt = torch.rand(64, 2,32)
out = transformer_decoder(tgt, memory)
linear = nn.Linear(32, 1)
y = linear(out)
y.shape