In [1]:
import copy
from pathlib import Path
import warnings

import torch
import numpy as np
import pandas as pd
from glob import glob

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

In [2]:
# Load the M5 frame from a local pickle.
# NOTE(review): unpickling can execute arbitrary code, so only point this at a
# file you produced yourself or otherwise trust.
data = pd.read_pickle('../M5_Dataset/level_12.pkl')

In [3]:
# Show the schema, dtypes and memory footprint of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 20 columns):
 #   Column        Dtype  
---  ------        -----  
 0   id            int16  
 1   item_id       int16  
 2   dept_id       int8   
 3   cat_id        int8   
 4   store_id      int8   
 5   state_id      int8   
 6   d             int16  
 7   sold          int16  
 8   weekday       int8   
 9   wday          int8   
 10  month         int8   
 11  year          int16  
 12  event_name_1  int8   
 13  event_type_1  int8   
 14  event_name_2  int8   
 15  event_type_2  int8   
 16  snap_CA       int8   
 17  snap_TX       int8   
 18  snap_WI       int8   
 19  sell_price    float16
dtypes: float16(1), int16(5), int8(14)
memory usage: 1.9 GB


In [4]:
# Preview the first five rows.
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,14370,1437,3,1,0,0,1,0,2,1,1,2011,-1,-1,-1,-1,0,0,0,0.0
1,14380,1438,3,1,0,0,1,0,2,1,1,2011,-1,-1,-1,-1,0,0,0,0.0
2,14390,1439,3,1,0,0,1,0,2,1,1,2011,-1,-1,-1,-1,0,0,0,0.0
3,14400,1440,3,1,0,0,1,0,2,1,1,2011,-1,-1,-1,-1,0,0,0,0.0
4,14410,1441,3,1,0,0,1,0,2,1,1,2011,-1,-1,-1,-1,0,0,0,0.0


In [5]:
# The identifier and calendar columns arrive as integer codes. Cast each one to a
# string-valued pandas categorical, one column at a time.
for column in (
    "id",
    "item_id",
    "dept_id",
    "cat_id",
    "store_id",
    "state_id",
    "weekday",
    "wday",
    "month",
):
    data[column] = data[column].astype(str).astype("category")

In [6]:
# Column groups: event descriptors and per-state SNAP flags. Each group is cast to
# categoricals and passed to the dataset as one entry of `variable_groups`.
special_days = [
    'event_name_1',
    'event_type_1',
    'event_name_2',
    'event_type_2',
]
snap_days = [
    'snap_CA',
    'snap_TX',
    'snap_WI',
]

In [7]:
# Cast both column groups to string-valued categoricals, one whole group per assignment.
for group in (snap_days, special_days):
    data[group] = data[group].astype(str).astype("category")

In [8]:
# Chronological split on the time index `d`: rows before step 1912 form the
# training frame, rows from step 1912 onwards are held out.
is_holdout = data["d"] >= 1912
train_dataset = data[~is_holdout]
test_dataset = data[is_holdout]

In [9]:
# Window sizes, measured in steps of the time index `d`.
max_encoder_length = 90     # history the model sees
max_prediction_length = 30  # horizon the model forecasts

# Training rows stop `max_prediction_length` steps before the end of the training frame.
last_train_step = train_dataset["d"].max()
training_cutoff = last_train_step - max_prediction_length

In [10]:
# Take the last `max_encoder_length` steps of the training frame. They are the
# history that is prepended to the hold-out period.
encoder_start = train_dataset["d"].max() - max_encoder_length
test_encoder = train_dataset[train_dataset["d"] > encoder_start]

In [11]:
# Prepend the encoder history to the hold-out rows.
#
# The hold-out slice is rebuilt from `data` (every step after the last encoder step)
# instead of reading `test_dataset` itself. The original cell fed its own output back
# in, so each re-run prepended the encoder window again and duplicated rows. The
# first-run result is unchanged.
holdout_rows = data[data["d"] > test_encoder["d"].max()]
test_dataset = pd.concat([test_encoder, holdout_rows], ignore_index=True)

In [12]:
# Time index that lies `max_prediction_length` steps before the end of the test frame.
test_cutoff = test_dataset["d"].max() - max_prediction_length

In [1]:
# NOTE(review): stale duplicate of the earlier `data.info()` cell. Its execution
# count (1) and the NameError output show it was last run on a fresh kernel before
# `data` was loaded. Re-run the notebook top to bottom, or delete this cell.
data.info()

NameError: name 'data' is not defined

In [None]:
# Build the training TimeSeriesDataSet from every row up to `training_cutoff`.
#
# Rows are selected from `data` rather than from the `train_dataset` frame.
# `training_cutoff` lies `max_prediction_length` steps before the end of that frame,
# so the selected rows are the same. The difference is that this cell no longer reads
# the name it overwrites, so it can be re-run without failing.
train_dataset = TimeSeriesDataSet(
    data[lambda x: x.d <= training_cutoff],
    time_idx="d",
    target="sold",
    group_ids=["id"],
    min_encoder_length=max_encoder_length // 2,  # keep encoder length long (as it is in the validation set)
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=['item_id', "store_id", "dept_id", "state_id", 'cat_id'],
    # each variable is listed exactly once ("month" used to appear twice)
    time_varying_known_categoricals=["special_days", "month", "weekday", "wday", "snap_days"],
    variable_groups={"special_days": special_days, "snap_days": snap_days},  # group of categorical variables can be treated as one variable
    time_varying_known_reals=["sell_price"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=["sold"],

    add_relative_time_idx=False,
    add_target_scales=False,
    add_encoder_length=False
)

In [None]:
# create dataloaders for model
batch_size = 128

# `test_dataset` is still a plain DataFrame at this point and has no `to_dataloader`.
# Wrap it in a TimeSeriesDataSet that reuses the training set's configuration
# (encoders, window lengths, variable roles). `predict=True` yields one sample per
# series that forecasts the final `max_prediction_length` steps.
val_dataset = TimeSeriesDataSet.from_dataset(
    train_dataset,
    test_dataset,
    predict=True,
    stop_randomization=True,
)

train_dataloader = train_dataset.to_dataloader(train=True, batch_size=batch_size, num_workers=8, shuffle=True)
val_dataloader = val_dataset.to_dataloader(train=False, batch_size=batch_size, num_workers=8)

In [None]:
# Seed every RNG, then build the trainer and network used for the learning-rate search.
pl.seed_everything(22)

trainer = pl.Trainer(
    gpus=0,
    # Gradient clipping is itself a hyperparameter; it keeps the gradients of
    # recurrent networks from diverging.
    gradient_clip_val=0.1,
)

lr_search_hparams = dict(
    learning_rate=0.03,  # irrelevant for the learning-rate search, very important otherwise
    hidden_size=16,  # most important hyperparameter apart from the learning rate
    attention_head_size=4,  # number of attention heads; up to 4 for large datasets
    dropout=0.3,  # values between 0.1 and 0.3 work well
    hidden_continuous_size=8,  # keep <= hidden_size
    output_size=7,  # 7 quantiles by default
    loss=QuantileLoss(),
    reduce_on_plateau_patience=4,  # reduce the learning rate after this many epochs without validation improvement
)
tft = TemporalFusionTransformer.from_dataset(train_dataset, **lr_search_hparams)

print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

In [None]:
# Sweep learning rates between `min_lr` and `max_lr` and report the suggested value.
res = trainer.tuner.lr_find(
    tft,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
    min_lr=1e-6,
    max_lr=10.0,
)

suggested_lr = res.suggestion()
print(f"suggested learning rate: {suggested_lr}")

# Plot the sweep with the suggested point marked.
fig = res.plot(show=True, suggest=True)
fig.show()

In [None]:
# Callbacks and logger for the real training run.
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    mode="min",
    min_delta=1e-4,
    patience=20,
    verbose=False,
)
lr_logger = LearningRateMonitor()  # records the learning rate
logger = TensorBoardLogger("lightning_logs")  # writes results for TensorBoard

trainer = pl.Trainer(
    max_epochs=200,
    gpus=0,
    weights_summary="top",
    gradient_clip_val=0.1,
    limit_train_batches=30,  # run validation after every 30 training batches
    # fast_dev_run=True,  # enable to check that the network and dataset have no serious bugs
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
)

# Same architecture as in the learning-rate search, now trained at the suggested rate.
training_hparams = dict(
    learning_rate=res.suggestion(),
    hidden_size=16,
    attention_head_size=4,
    dropout=0.3,
    hidden_continuous_size=8,
    output_size=7,  # 7 quantiles by default
    loss=QuantileLoss(),
    log_interval=10,  # log every 10 batches
    reduce_on_plateau_patience=4,
)
tft = TemporalFusionTransformer.from_dataset(train_dataset, **training_hparams)

print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

In [None]:
# Train the network, validating on the hold-out dataloader.
trainer.fit(tft, train_dataloader=train_dataloader, val_dataloaders=val_dataloader)