<a href="https://colab.research.google.com/github/Mark-THU/load_forecast/blob/main/TransformerEncoder_LSTMDecoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# prediction by seq2seq model which consists of TransformerEncoder-LSTMDecoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pdb
import random
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sqlalchemy import create_engine
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Select the compute device: the first CUDA GPU when one is available, otherwise the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

In [3]:
# Load the tab-separated dataset, index it by its 'time' column and keep only
# the modelling columns, with 'load' as the last column.
# Alternative dataset, kept for reference:
# url = 'https://raw.githubusercontent.com/Mark-THU/load_forecast/main/integrate_0101510000.csv'
url = 'https://raw.githubusercontent.com/Mark-THU/load_forecast/main/dataset.csv'
data = pd.read_csv(url, sep='\t', index_col='time')[
    ['tem', 'tembody', 'month_of_year', 'is_holiday', 'day_of_week', 'load']
]

In [4]:
# Normalization
def normalization(data):
    """
    Min-max scale every column and also fit a separate scaler on 'load' alone.

    data: original DataFrame, including the 'load' column
    return: (normalized data as returned by MinMaxScaler.fit_transform,
             MinMaxScaler fitted on the 'load' column only)
    """
    # Dedicated scaler for the target column, so that values expressed in
    # normalized 'load' units can be mapped back with inverse_transform.
    scaler_y = MinMaxScaler().fit(data[['load']])
    normalized_data = MinMaxScaler().fit_transform(data)
    return normalized_data, scaler_y

In [5]:
# build supervised data
def Series_To_Supervise(data, seq_len, target_len, y_col_index):
    """
    Slice a 2-D series into overlapping input windows and their label vectors.

    :param data: 2-D array indexed as (time step, feature)
    :param seq_len: number of consecutive rows in each input window
    :param target_len: number of label values per sample
    :param y_col_index: index of the column used as the label
    :return: (x, y) with x of shape (n, seq_len, features), y of shape
        (n, target_len) and n = data.shape[0] - seq_len
    """
    n_samples = data.shape[0] - seq_len
    n_features = data.shape[1]
    x = np.zeros((n_samples, seq_len, n_features))
    y = np.zeros((n_samples, target_len))
    for start in range(n_samples):
        stop = start + seq_len
        x[start] = data[start:stop]
        # The label slice ends at row `stop` inclusive, i.e. one row past the
        # end of the input window.
        # NOTE(review): with target_len > 1 the labels are therefore shifted by
        # one row relative to the last target_len rows of x - confirm intended.
        y[start] = data[stop + 1 - target_len:stop + 1, y_col_index]
    print("shape of x: {}, shape of y: {}".format(x.shape, y.shape))
    return x, y

In [6]:
# 5-fold cross-validation
def split_dataset(X, Y, n_split=5):
    """
    Build train/validation/test splits for every fold of a shuffled K-fold.

    The held-out part of each fold is split in half (fixed random_state): one
    half becomes the validation set, the other the test set.

    X: features, indexed by sample along the first axis
    Y: labels aligned with X along the first axis
    n_split: number of folds
    return: six lists with one entry per fold, in the order
        train_x, valid_x, test_x, train_y, valid_y, test_y
    """
    folds = KFold(n_splits=n_split, shuffle=True, random_state=1)
    train_x_list, valid_x_list, test_x_list = [], [], []
    train_y_list, valid_y_list, test_y_list = [], [], []
    for train_index, held_out_index in folds.split(X):
        valid_x, test_x, valid_y, test_y = train_test_split(
            X[held_out_index], Y[held_out_index], test_size=0.5, random_state=1)
        train_x_list.append(X[train_index])
        train_y_list.append(Y[train_index])
        valid_x_list.append(valid_x)
        valid_y_list.append(valid_y)
        test_x_list.append(test_x)
        test_y_list.append(test_y)
    return train_x_list, valid_x_list, test_x_list, train_y_list, valid_y_list, test_y_list

In [7]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor, then apply dropout."""

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        """
        :param d_model: size of the last (feature) axis of the inputs; odd values are supported
        :param dropout: dropout probability applied after the encoding is added
        :param max_len: number of positions for which encodings are precomputed
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so the angle table is trimmed to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch axis.
        # Registered as a buffer: it follows the module across devices and is
        # saved in the state dict under the key 'pe', but is not trained.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """
        :param x: tensor of shape (seq_len, batch_size, d_model)
        :return: x plus the encodings of its first seq_len positions, after dropout
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [8]:
# define encoder
class Encoder(nn.Module):
    """Transformer encoder: linear embedding, positional encoding, stacked encoder layers."""

    def __init__(self, input_size=13, d_model=64, nhead=8, n_layers=4, dim_feedward=256):
        """
        :param input_size: number of input features per time step
        :param d_model: embedding width used throughout the transformer
        :param nhead: number of attention heads per encoder layer
        :param n_layers: number of stacked encoder layers
        :param dim_feedward: width of the feed-forward sub-layer
        """
        super().__init__()

        self.embedding = nn.Linear(input_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model=d_model)
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=dim_feedward)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=n_layers)

    def forward(self, x):
        """
        :param x: tensor of shape (batch_size, seq_len, input_size)
        :return: tensor of shape (batch_size, seq_len, d_model)
        """
        # The transformer layers are built without batch_first, so switch
        # (batch_size, seq_len, features) -> (seq_len, batch_size, features).
        seq_first = x.transpose(0, 1)
        encoded = self.transformer_encoder(self.pos_encoder(self.embedding(seq_first)))
        # Back to (batch_size, seq_len, features) for the caller.
        return encoded.transpose(0, 1)

In [9]:
# define attn_lstm_decoder
class AttnLSTMDecoder(nn.Module):
    """
    One-step LSTM decoder that attends over the encoder outputs.

    Each call to forward consumes one time step of input features plus the
    current LSTM state and produces one output step and the updated state.
    """

    def __init__(self, input_size, hidden_dim, output_size=1, n_layers=1, drop_prob=0.1, seq_len=48):
        """
        :param input_size: number of features in one decoder input step
        :param hidden_dim: LSTM hidden size; the encoder outputs must have this
            same feature size, because attn_combine takes
            hidden_dim * (n_layers + 1) input features
        :param output_size: number of values predicted per step
        :param n_layers: number of stacked LSTM layers
        :param drop_prob: probability of the dropout layer (see note below)
        :param seq_len: number of encoder time steps attended over; the
            encoder outputs passed to forward must have exactly this length
        """
        super(AttnLSTMDecoder, self).__init__()
        self.input_size = input_size
        self.hidden_dim = hidden_dim
        self.output_size = output_size
        self.n_layers = n_layers
        self.drop_prob = drop_prob
        self.seq_len = seq_len

        self.embedding = nn.Linear(self.input_size, self.hidden_dim * self.n_layers)
        self.attn = nn.Linear(self.hidden_dim * self.n_layers * 2, self.seq_len)
        self.attn_combine = nn.Linear(self.hidden_dim * (self.n_layers + 1), self.hidden_dim)
        # NOTE(review): this dropout layer is created but never applied in forward.
        self.dropout = nn.Dropout(self.drop_prob)
        self.lstm = nn.LSTM(self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True)
        self.out = nn.Linear(self.hidden_dim, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """
        :param input: tensor of shape (batch_size, 1, input_size)
        :param hidden: LSTM state tuple (h, c), each of shape
            (n_layers, batch_size, hidden_dim)
        :param encoder_outputs: tensor of shape (batch_size, seq_len, hidden_dim)
        :return: (output of shape (batch_size, 1, output_size), updated (h, c))
        """
        embedded = self.embedding(input)

        # Flatten the per-layer hidden states h into one vector per sample:
        # (n_layers, batch, hidden) -> (batch, 1, n_layers * hidden).
        hidden_flat = hidden[0].transpose(0, 1)
        hidden_flat = hidden_flat.contiguous().view(hidden_flat.shape[0], 1, -1)

        # One attention weight per encoder time step, normalized over time.
        attn_weights = F.softmax(self.attn(torch.cat((embedded, hidden_flat), 2)), dim=2)
        # Weighted sum of the encoder outputs: (batch, 1, hidden_dim).
        attn_applied = torch.bmm(attn_weights, encoder_outputs)

        output = torch.cat((embedded, attn_applied), 2)
        output = self.attn_combine(output)
        output = F.relu(output)

        output, hidden = self.lstm(output, hidden)

        output = self.out(output)

        return output, hidden

    def init_hidden(self, batch_size):
        """
        Return an all-zero LSTM state (h, c) for a batch.

        The tensors are created with the device and dtype of this module's own
        parameters, so the state always matches the module regardless of any
        module-level device setting.

        :param batch_size: number of sequences in the batch
        :return: tuple (h, c), each of shape (n_layers, batch_size, hidden_dim)
        """
        reference = self.out.weight
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return reference.new_zeros(shape), reference.new_zeros(shape)

In [15]:
# train the model
def _decode_sequence(decoder, decoder_input, decoder_hidden, encoder_outputs,
                     future_inputs, teacher_steps):
    """
    Run the decoder once per entry of teacher_steps and collect its predictions.

    :param decoder: the attention decoder; called one step at a time
    :param decoder_input: first decoder input, shape (batch_size, 1, features)
    :param decoder_hidden: initial decoder state
    :param encoder_outputs: encoder outputs attended over by the decoder
    :param future_inputs: rows of the target window, shape
        (batch_size, steps, features), with the predicted quantity as last feature
    :param teacher_steps: one boolean per decoding step; True feeds the full
        ground-truth row as the next input, False feeds the ground-truth
        features with the last feature replaced by the (detached) prediction
    :return: predictions of shape (batch_size, len(teacher_steps))
    """
    step_outputs = []
    for t, use_ground_truth in enumerate(teacher_steps):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
        # decoder_output is (batch_size, 1, 1); keep one value per sample.
        step_outputs.append(decoder_output[:, 0, 0])
        if use_ground_truth:
            decoder_input = future_inputs[:, t, :].unsqueeze(1)
        else:
            decoder_input = torch.cat(
                (future_inputs[:, t, :-1].unsqueeze(1), decoder_output.detach()), 2)
    return torch.stack(step_outputs, dim=1)


def train_model(train_x, train_y, valid_x, valid_y, input_size=13, hidden_dim=64, n_layers=1,
                seq_len=48, target_len=24, number_epochs=80, batch_size=512, teacher_forcing_ratio=0.5,
                lr=0.01, training_prediction='teacher_forcing', dynamic_tf=False):
    """
    Train a TransformerEncoder / attention-LSTM-decoder pair with early stopping.

    Every 5th training batch the model is evaluated on the validation set with
    fully recursive decoding. Whenever the mean validation loss improves, both
    state dicts are written to 'encoder_state_dict.pt' and
    'decoder_state_dict.pt' in the working directory. Training stops early once
    more than 30 consecutive evaluations bring no improvement.

    :param train_x: training inputs, shape (samples, seq_len + target_len, features),
        with the predicted quantity as the last feature
    :param train_y: training labels, shape (samples, target_len)
    :param valid_x: validation inputs, same layout as train_x
    :param valid_y: validation labels, same layout as train_y
    :param input_size: number of features per time step
    :param hidden_dim: decoder hidden size, also used as the encoder d_model
        (the decoder requires both to be equal)
    :param n_layers: number of decoder LSTM layers
    :param seq_len: number of leading time steps fed to the encoder
    :param target_len: number of steps decoded
    :param number_epochs: maximum number of epochs
    :param batch_size: batch size of both data loaders (incomplete batches are dropped)
    :param teacher_forcing_ratio: probability of feeding ground truth to the decoder
    :param lr: Adam learning rate for both optimizers
    :param training_prediction: 'recursive' (always feed predictions back),
        'teacher_forcing' (one random choice per batch) or
        'mixed_teacher_forcing' (one random choice per decoding step)
    :param dynamic_tf: accepted for compatibility; not used
    :return: (encoder, decoder) in their final in-memory state, which is not
        necessarily the best checkpoint saved to disk
    :raises ValueError: if training_prediction is unknown, or if either data
        set holds fewer than batch_size samples
    """
    valid_modes = ('recursive', 'teacher_forcing', 'mixed_teacher_forcing')
    if training_prediction not in valid_modes:
        # Previously an unknown mode trained on all-zero outputs until backward() failed.
        raise ValueError("training_prediction must be one of {}, got {!r}".format(
            valid_modes, training_prediction))

    # The decoder concatenates attended encoder outputs with its own embedding
    # and expects them to be hidden_dim wide, so the encoder width follows
    # hidden_dim; the decoder attends over exactly seq_len encoder steps.
    encoder = Encoder(input_size, d_model=hidden_dim)
    decoder = AttnLSTMDecoder(input_size, hidden_dim, n_layers=n_layers, seq_len=seq_len)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    train_dataset = TensorDataset(torch.FloatTensor(train_x), torch.FloatTensor(train_y))
    valid_dataset = TensorDataset(torch.FloatTensor(valid_x), torch.FloatTensor(valid_y))
    criterion = nn.MSELoss()
    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=lr)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    if len(train_loader) == 0 or len(valid_loader) == 0:
        # With drop_last=True a set smaller than batch_size yields no batches:
        # nothing would be trained or validated and no checkpoint written.
        raise ValueError("training and validation sets must each hold at least batch_size samples")

    train_losses = list()  # per-batch training losses; collected but not returned
    valid_loss_min = np.inf
    num_without_imp = 0

    #train
    for epoch in range(1, number_epochs + 1):
        for i, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            # encoder
            encoder_outputs = encoder(inputs[:, 0:seq_len, :])
            # decoder: start from the last encoder time step
            decoder_input = inputs[:, seq_len-1, :].unsqueeze(1)
            decoder_hidden = decoder.init_hidden(decoder_input.shape[0])

            if training_prediction == 'recursive':
                teacher_steps = [False] * target_len
            elif training_prediction == 'teacher_forcing':
                # one draw decides for the whole batch
                teacher_steps = [random.random() < teacher_forcing_ratio] * target_len
            else:
                # mixed teacher forcing: an independent draw for every step
                teacher_steps = [random.random() < teacher_forcing_ratio for _ in range(target_len)]

            outputs = _decode_sequence(decoder, decoder_input, decoder_hidden, encoder_outputs,
                                       inputs[:, seq_len:, :], teacher_steps)
            loss = criterion(outputs, labels)
            train_losses.append(loss.item())
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            # eval
            if i % 5 == 0:
                if num_without_imp > 30:
                    return encoder, decoder
                num_without_imp = num_without_imp + 1
                valid_losses = list()
                encoder.eval()
                decoder.eval()
                # No gradients are needed for validation.
                with torch.no_grad():
                    for inp, lab in valid_loader:
                        inp = inp.to(device)
                        lab = lab.to(device)
                        # encoder
                        encoder_outputs = encoder(inp[:, 0:seq_len, :])
                        # decoder
                        decoder_input = inp[:, seq_len-1, :].unsqueeze(1)
                        decoder_hidden = decoder.init_hidden(decoder_input.shape[0])
                        # predict recursively
                        out = _decode_sequence(decoder, decoder_input, decoder_hidden, encoder_outputs,
                                               inp[:, seq_len:, :], [False] * target_len)
                        valid_losses.append(criterion(out, lab).item())
                encoder.train()
                decoder.train()
                mean_valid_loss = np.mean(valid_losses)
                print("Epoch: {}/{}...".format(epoch, number_epochs),
                     "Step: {}/{}...".format(i+1, len(train_dataset)//batch_size),
                     "Loss: {}...".format(loss.item()),
                     "Valid Loss: {}...".format(mean_valid_loss))
                if mean_valid_loss < valid_loss_min:
                    num_without_imp = 0
                    torch.save(encoder.state_dict(), "encoder_state_dict.pt")
                    torch.save(decoder.state_dict(), "decoder_state_dict.pt")
                    print("Valid loss decreased ({:.6f} --> {:.6f}).  Saving model ...".format(valid_loss_min, mean_valid_loss))
                    valid_loss_min = mean_valid_loss
    return encoder, decoder

In [11]:
# test the model
def test_model(encoder, decoder, test_x, test_y, batch_size, seq_len, target_len, scaler):
    """
    Score the saved best checkpoint on a test set and return its MAPE.

    The weights in 'encoder_state_dict.pt' and 'decoder_state_dict.pt' are
    loaded into the given modules, which are then switched to evaluation mode
    (and left in it) so that dropout does not perturb the predictions.
    Decoding is fully recursive. Every test sample is scored, including the
    final partial batch.

    :param encoder: encoder module matching the saved encoder checkpoint
    :param decoder: decoder module matching the saved decoder checkpoint
    :param test_x: test inputs, shape (samples, seq_len + target_len, features),
        with the predicted quantity as the last feature
    :param test_y: test labels, shape (samples, target_len)
    :param batch_size: evaluation batch size
    :param seq_len: number of leading time steps fed to the encoder
    :param target_len: number of steps decoded
    :param scaler: fitted scaler whose inverse_transform maps normalized values
        back to the original units
    :return: mean absolute percentage error, as a fraction, in original units
    :raises ValueError: if the test set is empty
    """
    test_dataset = TensorDataset(torch.FloatTensor(test_x), torch.FloatTensor(test_y))
    if len(test_dataset) == 0:
        raise ValueError("test set is empty")
    # drop_last=False: the final partial batch is scored too.
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
    # map_location lets a checkpoint written on one device load on another.
    encoder.load_state_dict(torch.load('encoder_state_dict.pt', map_location=device))
    decoder.load_state_dict(torch.load('decoder_state_dict.pt', map_location=device))
    encoder.eval()
    decoder.eval()
    pred_batches = list()
    true_batches = list()
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device=device)
            # encoder
            future_inputs = inputs[:, seq_len:, :]
            encoder_outputs = encoder(inputs[:, 0:seq_len, :])
            # decoder: start from the last encoder time step
            decoder_input = inputs[:, seq_len-1, :].unsqueeze(1)
            decoder_hidden = decoder.init_hidden(decoder_input.shape[0])
            # predict recursively: known future features plus the previous prediction
            step_outputs = list()
            for t in range(target_len):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
                # decoder_output is (batch, 1, 1); keep one value per sample.
                step_outputs.append(decoder_output[:, 0, 0])
                decoder_input = torch.cat((future_inputs[:, t, :-1].unsqueeze(1), decoder_output.detach()), 2)
            pred_batches.append(torch.stack(step_outputs, dim=1).reshape(-1).cpu().numpy())
            true_batches.append(labels.reshape(-1).numpy())
    y_true = np.concatenate(true_batches)
    y_pred = np.concatenate(pred_batches)
    load_true = scaler.inverse_transform(np.expand_dims(y_true, axis=1))
    load_pred = scaler.inverse_transform(np.expand_dims(y_pred, axis=1))
    # NOTE(review): a true value of exactly 0 makes this ratio infinite.
    MAPE = np.mean(np.abs(load_true-load_pred)/load_true)
    return MAPE

In [12]:
# model configs
def Model_Configs():
    """
    Enumerate the hyper-parameter grid.

    return: list of [batch_size, n_layers, lr, hidden_dim] lists, one per
        combination, with batch_size varying slowest and hidden_dim fastest
    """
    batch_sizes = [512]
    n_layers = [1]
    lrs = [0.01]
    hidden_dims = [64]
    return [
        [batch_size, n_layer, lr, hidden_dim]
        for batch_size in batch_sizes
        for n_layer in n_layers
        for lr in lrs
        for hidden_dim in hidden_dims
    ]

In [13]:
def main(n_split=5, seq_len=48, target_len=24, mape_threshold=0.1, max_attempts=None):
    """
    Run the cross-validated experiment for every model configuration.

    For each configuration and fold, a model is trained and scored on that
    fold's test set; training is repeated until the test MAPE falls below
    mape_threshold or max_attempts is reached.

    NOTE(review): accepting a run based on its test MAPE selects on the test
    set, so the reported MAPE is optimistic - confirm this is intended.

    :param n_split: number of cross-validation folds
    :param seq_len: number of time steps fed to the encoder
    :param target_len: number of time steps predicted
    :param mape_threshold: a fold's run is accepted once its test MAPE is below this value
    :param max_attempts: maximum number of training runs per fold; None (the
        default) retries without limit
    :return: list with one entry per configuration, each a list of the
        accepted (or last attempted) test MAPE of every fold
    """
    normalized_data, scaler_y = normalization(data)
    num_features = normalized_data.shape[1]
    # the target is the last column
    y_index = num_features - 1
    # Each sample window holds the encoder steps followed by the target steps.
    x, y = Series_To_Supervise(normalized_data, seq_len=seq_len + target_len, target_len=target_len, y_col_index=y_index)
    train_x_list, valid_x_list, test_x_list, train_y_list, valid_y_list, test_y_list = split_dataset(x, y, n_split=n_split)
    print("model configs set.")
    configs = Model_Configs()
    MAPE_list = list()
    for batch_size, n_layer, lr, hidden_dim in configs:
        print("Config: batch_size--{}, n_layer--{}, lr--{}, hidden_dims--{}".format(batch_size, n_layer, lr, hidden_dim))
        fold_mapes = list()
        for i in range(n_split):
            attempt = 0
            while True:
                attempt += 1
                encoder, decoder = train_model(train_x_list[i], train_y_list[i], valid_x_list[i], valid_y_list[i],
                                               input_size=num_features, hidden_dim=hidden_dim, batch_size=batch_size,
                                               n_layers=n_layer, lr=lr, seq_len=seq_len, target_len=target_len,
                                               training_prediction='teacher_forcing')
                MAPE = test_model(encoder, decoder, test_x_list[i], test_y_list[i], batch_size=batch_size, seq_len=seq_len,
                                  target_len=target_len, scaler=scaler_y)
                if MAPE < mape_threshold:
                    break
                if max_attempts is not None and attempt >= max_attempts:
                    # Give up on this fold and record the last result.
                    break
            fold_mapes.append(MAPE)
        MAPE_list.append(fold_mapes)
    return MAPE_list

In [None]:
# Run the whole experiment with the default settings; the result holds one
# list of per-fold test MAPE values for each model configuration.
MAPE_list = main()

shape of x: (22745, 72, 6), shape of y: (22745, 24)
model configs set.
Config: batch_size--512, n_layer--1, lr--0.01, hidden_dims--64
Epoch: 1/80... Step: 1/35... Loss: 0.16790780425071716... Valid Loss: 0.03976811654865742...
Valid loss decreased (inf --> 0.039768).  Saving model ...
Epoch: 1/80... Step: 6/35... Loss: 0.025201275944709778... Valid Loss: 0.021147733088582754...
Valid loss decreased (0.039768 --> 0.021148).  Saving model ...
Epoch: 1/80... Step: 11/35... Loss: 0.02159234508872032... Valid Loss: 0.02076515508815646...
Valid loss decreased (0.021148 --> 0.020765).  Saving model ...
Epoch: 1/80... Step: 16/35... Loss: 0.022524867206811905... Valid Loss: 0.020570220425724983...
Valid loss decreased (0.020765 --> 0.020570).  Saving model ...
Epoch: 1/80... Step: 21/35... Loss: 0.020397160202264786... Valid Loss: 0.020654250402003527...
Epoch: 1/80... Step: 26/35... Loss: 0.021410010755062103... Valid Loss: 0.02046036208048463...
Valid loss decreased (0.020570 --> 0.020460). 

In [None]:
# Report the mean of all collected MAPE values.
print("MAPE: {:.6f}".format(np.mean(MAPE_list)))