# Sequence Processing

Source:

* https://www.kaggle.com/tartakovsky/pytorch-lightning-lstm-timeseries-clean-code
* https://keras.io/examples/timeseries/timeseries_weather_forecasting/

We will use the Jena Climate dataset recorded by the Max Planck Institute for Biogeochemistry. The dataset consists of 14 features, such as temperature, pressure and humidity, recorded once every 10 minutes.

Location: Weather Station, Max Planck Institute for Biogeochemistry in Jena, Germany

Time frame considered: January 1, 2009 - December 31, 2016

# Imports

In [None]:
! pip install pytorch-lightning

In [None]:
# Re-loads all imports every time the cell is ran. 
%load_ext autoreload
%autoreload 2

from time import time

import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.5f}'.format
from IPython.display import display

# Sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Neural Networks
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers.csv_logs import CSVLogger

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
plt.style.use('bmh')
mpl.rcParams['figure.figsize'] = 18, 8

# Data


### Load

In [None]:
! wget -nc "https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip"

In [None]:
from zipfile import ZipFile

# Extract the CSV; the context manager closes the archive handle even if
# extraction fails.
with ZipFile("jena_climate_2009_2016.csv.zip") as zip_file:
    zip_file.extractall()
csv_path = "jena_climate_2009_2016.csv"

data = pd.read_csv(csv_path)
# Keep only every 200th row. `.copy()` detaches the subsample from the full
# frame, so the column assignment in the next cell does not write into a
# slice of the original frame (which can trigger SettingWithCopyWarning).
data = data.iloc[::200].copy()

data.head()

In [None]:
date_time_key = "Date Time"
# Parse with an explicit day-first format instead of letting pandas guess:
# a string such as "01.02.2009" is ambiguous, and a guessed month-first format
# either fails on days > 12 or silently swaps day and month, which would
# scramble the order after sort_index().
# NOTE(review): the format assumes "dd.mm.YYYY HH:MM:SS" timestamps — confirm
# against the output of data.head() above.
data[date_time_key] = pd.to_datetime(data[date_time_key], format="%d.%m.%Y %H:%M:%S")
data.set_index(date_time_key, inplace=True)
data.sort_index(inplace=True)
data.head()

In [None]:
data.shape

### Visualization

Let's see what data we got as a result, and also make visualizations.

In [None]:
# Human-readable names, index-aligned with `feature_keys` below.
titles = [
    "Pressure",
    "Temperature",
    "Temperature in Kelvin",
    "Temperature (dew point)",
    "Relative Humidity",
    "Saturation vapor pressure",
    "Vapor pressure",
    "Vapor pressure deficit",
    "Specific humidity",
    "Water vapor concentration",
    "Air density",  # rho (g/m**3); was mislabelled "Airtight"
    "Wind speed",
    "Maximum wind speed",
    "Wind direction in degrees",
]

# Column names of the 14 features as they appear in the CSV.
feature_keys = [
    "p (mbar)",
    "T (degC)",
    "Tpot (K)",
    "Tdew (degC)",
    "rh (%)",
    "VPmax (mbar)",
    "VPact (mbar)",
    "VPdef (mbar)",
    "sh (g/kg)",
    "H2OC (mmol/mol)",
    "rho (g/m**3)",
    "wv (m/s)",
    "max. wv (m/s)",
    "wd (deg)",
]

# Plot colours; there are fewer colours than features, so they are cycled.
colors = [
    "blue",
    "orange",
    "green",
    "red",
    "purple",
    "brown",
    "pink",
    "gray",
    "olive",
    "cyan",
]



def show_raw_visualization(data):
    """Plot every column listed in ``feature_keys`` against the frame's index.

    One subplot is drawn per feature in a two-column grid. Each subplot is
    titled "<title> - <column name>" and coloured by cycling through
    ``colors``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing all columns named in ``feature_keys``; its index is
        used as the x-axis.
    """
    n_cols = 2
    # Ceil-divide so the grid always has room for every feature
    # (7 rows for the 14 features defined above).
    n_rows = max(1, -(-len(feature_keys) // n_cols))
    fig, axes = plt.subplots(
        nrows=n_rows,
        ncols=n_cols,
        figsize=(15, 20),
        dpi=80,
        facecolor="w",
        edgecolor="k",
        squeeze=False,  # keep `axes` 2-D even for a single row
    )
    for i, (key, title) in enumerate(zip(feature_keys, titles)):
        # A column selected from `data` already carries the frame's index,
        # so it can be plotted directly.
        ax = data[key].plot(
            ax=axes[i // n_cols, i % n_cols],
            color=colors[i % len(colors)],
            title="{} - {}".format(title, key),
            rot=25,
        )
        ax.legend([title])
    plt.tight_layout()


show_raw_visualization(data)

### Choose data

Further experiments will be carried out with the temperature in degrees Celsius.

In [None]:
data_name = "T (degC)"

# Time Series: play with data

Let's take a look at how data can be decomposed into components:

- **Trend**: The increasing or decreasing value in the series;
- **Seasonality**: The repeating relatively short-term cycle in the series;
- **Noise**: The random deviations in the series.

And also draw the [autocorrelation graph](https://otexts.com/fpp2/non-seasonal-arima.html#acf-and-pacf-plots).

### Trend and seasonality

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# `period` is the current name of the seasonal-cycle length argument; the
# older `freq` keyword was deprecated and is rejected by recent statsmodels.
# NOTE(review): the period is counted in samples. With one sample every
# 200 * 10 minutes, a calendar year spans about 263 samples, not 365 —
# confirm which cycle length is intended.
decomposition = seasonal_decompose(data[data_name], model='additive', period=365)
decomposition.plot()
plt.show()

### Autocorrelation

In [None]:
# lag_plot is imported but not used in this cell — presumably it is meant for
# the y(t) vs y(t+1) scatter plot of Task 1.
from pandas.plotting import lag_plot

import pandas as pd
from pandas.plotting import autocorrelation_plot

# Autocorrelation of the selected series as a function of the lag.
autocorrelation_plot(data[data_name])
plt.title(f'{data_name} autocorrelation.')
plt.show()

In [None]:
print('autocorrelation coef:', data[data_name].autocorr())

# Task 1

Plot the correlation of $y_{t}$ with $y_{t+1}$ and calculate the correlation coefficient (see the example in the lecture). What conclusions can we draw from the results?

In [None]:
# Exercise cell: fill in the placeholders before running.
y_curr = # your code here
y_next = # your code here

# your code here (scatter plot)

plt.title('Correlation of the previous value y(t) with the next one y(t+1).')
plt.xlabel('y(t)')
plt.ylabel('y(t+1)')
plt.show()

In [None]:
# Exercise cell: compute the correlation coefficient between y(t) and y(t+1).
corrcoef = # your code here
print('Correlation coefficient:', corrcoef)

Conclusion: we see a rather strong positive correlation of the previous step with the next one.

# AR model for time series forecast


Let $y_1, y_2, ..., y_i, ..., y_N$ be the observations of a time series. An autoregressive (AR) model for time series forecasting assumes the following:

$$ \hat{y}_{i+m} = f( y_{i}, y_{i-1}, ... y_{i-k+1} ) $$

where $\hat{y}_{i+m}$ is a predicted value.

In matrix form, we define this model as:

$$\hat{Y} = f(X)$$

where
$$X = \left(
\begin{array}{cccc}
y_{k} & y_{k-1} & \ldots & y_{1}\\
\vdots & \vdots & \ddots & \vdots\\
y_{k+j} & y_{k+j-1} & \ldots & y_{j+1}\\
\vdots & \vdots & \ddots & \vdots
\end{array}
\right)$$

$$Y = \left(
\begin{array}{c}
y_{k+m} \\
\vdots\\
y_{k+j+m}\\
\vdots
\end{array}
\right)$$

### Preprocessing

In [None]:
# Selecting with a list of columns keeps the series as a 2-D, single-column array.
X = data[[data_name]].values

# Standardise the series with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the whole series, including the half
# that later becomes the test split, so test statistics leak into the
# scaling — consider fitting on the training part only.
ss = StandardScaler()
X = ss.fit_transform(X)

In [None]:
plt.plot(X)
plt.title(f'Preprocessed "{data_name}" data.')

In [None]:
def AR_matrices(Y, k, m):
    """Build the lagged inputs and targets of an autoregressive model.

    For every anchor position ``t`` that has ``k`` observations ending at
    ``t`` and a target ``m`` steps ahead, one input row holds
    ``Y[t-k+1], ..., Y[t]`` (flattened, oldest value first) and the matching
    target is ``Y[t+m]``.

    Parameters
    ----------
    Y : np.ndarray
        Observations, indexed along the first axis.
    k : int
        Number of past observations per input row.
    m : int
        Forecast horizon, in steps.

    Returns
    -------
    tuple of np.ndarray
        ``(X_AR, Y_AR)`` with one entry per usable anchor position.
    """
    n_obs = len(Y)
    # First anchor that has k observations behind it; last anchor whose
    # target index t + m is still inside the series.
    first_anchor = max(k - 1, 0)
    stop_anchor = min(n_obs, n_obs - m)
    anchors = range(first_anchor, stop_anchor)

    windows = [Y[t + 1 - k:t + 1].reshape(-1) for t in anchors]
    targets = [Y[t + m] for t in anchors]

    return np.array(windows), np.array(targets)

In [None]:
# prepare X and Y matrices
k = 1
X_AR, Y_AR = AR_matrices(X, k=k, m=1)
print('X shape:', X.shape)
print('X_AR shape: ', X_AR.shape)
print('Y_AR shape: ', Y_AR.shape)

### Train / test split

In [None]:
N = len(X)//2

X_AR_train, X_AR_test = X_AR[:N], X_AR[N:]
Y_AR_train, Y_AR_test = Y_AR[:N], Y_AR[N:]

### Model

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(X_AR_train, Y_AR_train)

### Prediction

In [None]:
Y_pred_test = model.predict(X_AR_test)

In [None]:
# Compare the true test targets with the k=1 linear AR predictions.
plt.figure(figsize=(12, 5))
plt.plot(Y_AR_test, label='True', alpha=1.)
plt.plot(Y_pred_test, label='Prediction', alpha=1.)
plt.xticks(size=14)
plt.yticks(size=14)
plt.legend(loc='best', fontsize=14)
# Pass the on/off flag positionally: the `b=` keyword was renamed to
# `visible=` and later removed, so `grid(b=1)` fails on current matplotlib.
plt.grid(True)
plt.show()

In [None]:
# `mse` has to be the mean *squared* error: it was bound to
# mean_absolute_error, so every "MSE loss" printed in this notebook was an MAE.
from sklearn.metrics import mean_squared_error as mse
print('MSE loss:', mse(Y_AR_test, Y_pred_test))

# Task 2

Repeat the experiment with a larger value of k -- take $k=30$:

* make new datasets;
* train new model;
* compare graphs and MSE losses.

Verify that the AR model performs better at higher values of k.

In [None]:
new_k = 30

# prepare X and Y matrices
X_AR_new_k, Y_AR_new_k = AR_matrices(X, k=new_k, m=1)

assert X_AR_new_k.shape == (2073, 30)
assert Y_AR_new_k.shape == (2073, 1)

print('X shape:', X.shape)
print('X_AR shape: ', X_AR_new_k.shape)
print('Y_AR shape: ', Y_AR_new_k.shape)

X_AR_train_new_k, X_AR_test_new_k = X_AR_new_k[:N], X_AR_new_k[N:]
Y_AR_train_new_k, Y_AR_test_new_k = Y_AR_new_k[:N], Y_AR_new_k[N:]

assert X_AR_train_new_k.shape == (1051, 30)
assert X_AR_test_new_k.shape == (1022, 30)
assert Y_AR_train_new_k.shape == (1051, 1)
assert Y_AR_test_new_k.shape == (1022, 1)

In [None]:
# Fit the linear AR model on the k=30 design matrix and predict the test split.
model = LinearRegression().fit(X_AR_train_new_k, Y_AR_train_new_k)

Y_pred_test_new_k = model.predict(X_AR_test_new_k)

assert Y_pred_test_new_k.shape == (1022, 1)

# The k=1 test split is longer than the k=30 one; compare both models on the
# last `n_test_new_k` targets so the curves cover the same time steps.
n_test_new_k = Y_AR_test_new_k.shape[0]

# Iterate over the target columns of the array that is actually indexed below.
for i in range(Y_AR_test_new_k.shape[1]):
    plt.figure(figsize=(12, 5))
    plt.plot(Y_AR_test_new_k[:, i], label='True', alpha=1.)
    plt.plot(Y_pred_test_new_k[:, i], label=f'Prediction, k={new_k}', alpha=.7, color='C4')
    plt.plot(Y_pred_test[-n_test_new_k:], label='Prediction, k=1', alpha=1.)
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.legend(loc='best', fontsize=14)
    # Positional flag: the `b=` keyword is not accepted by current matplotlib.
    plt.grid(True)
    plt.show()

In [None]:
print('MSE loss k=1:', mse(Y_AR_test[-Y_AR_test_new_k.shape[0]:], Y_pred_test[-Y_AR_test_new_k.shape[0]:]))
print(f'MSE loss k={new_k}:', mse(Y_AR_test_new_k, Y_pred_test_new_k))

# LSTM model for time series forecast

Now we use a recurrent neural network to solve the problem (note that for such a simple dataset as it is now, it will most likely be more efficient to use a simple model - linear or polynomial).

### Dataset

In [None]:
class TimeseriesDataset(Dataset):
    '''
    Rolling-window view over a feature array and a target array.

    Item ``i`` is the pair ``(X[i:i+seq_len], y[i+seq_len-1])``: a window of
    ``seq_len`` consecutive rows of ``X`` and the target aligned with the
    window's last row. Both arrays are converted to float tensors on
    construction.

    A DataLoader over this dataset yields batches of windows, i.e. inputs of
    shape `(batch_size, seq_len, n_features)` when ``X`` is 2-D, which is the
    layout expected by a batch-first RNN.

    Args:
        X: feature array, indexed along the first axis.
        y: target array with the same first-axis length as ``X``.
        seq_len: number of consecutive rows per window.
    '''
    def __init__(self, X: np.ndarray, y: np.ndarray, seq_len: int = 1):
        self.X = torch.tensor(X).float()
        self.y = torch.tensor(y).float()
        self.seq_len = seq_len

    def __len__(self) -> int:
        # Number of complete windows. Clamp at zero: when the series is
        # shorter than one window the difference is negative, and len()
        # raises ValueError on a negative result.
        return max(0, len(self.X) - (self.seq_len - 1))

    def __getitem__(self, index):
        window_end = index + self.seq_len
        return (self.X[index:window_end], self.y[window_end - 1])

In [None]:
class DataModule(pl.LightningDataModule):
    '''
    LightningDataModule serving the AR train/test arrays as rolling windows:
    https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html

    The test arrays double as the validation set. Every loader wraps a
    `TimeseriesDataset` and keeps the temporal order (no shuffling).

    Args:
        seq_len: window length handed to `TimeseriesDataset`.
        batch_size: batch size of every DataLoader.
        num_workers: number of DataLoader worker processes.
    '''
    def __init__(self, seq_len = 1, batch_size = 128, num_workers=0):
        super().__init__()
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.num_workers = num_workers
        # All data attributes start as None and are filled in by setup().
        self.X_train = None
        self.y_train = None
        self.X_val = None
        self.y_val = None
        self.X_test = None
        self.y_test = None
        self.columns = None
        self.preprocessing = None

    def prepare_data(self):
        pass

    def setup(
        self,
        stage=None, 
        X_train=X_AR_train,
        y_train=Y_AR_train,
        X_test=X_AR_test,
        y_test=Y_AR_test,
    ):
        # NOTE: the array defaults are bound once, when this class definition
        # is executed; re-run this cell after rebuilding the X_AR_* / Y_AR_*
        # arrays to pick up new data.
        # Assign train/test datasets for use in dataloaders
        if stage == 'fit' or stage is None:
            self.X_train = X_train
            self.y_train = y_train.reshape((-1, 1))
            # The test arrays are reused as the validation set.
            self.X_val = X_test
            self.y_val = y_test.reshape((-1, 1))
        if stage == 'test' or stage is None:
            self.X_test = X_test
            self.y_test = y_test.reshape((-1, 1))

    def _make_loader(self, X, y):
        # Shared loader construction: rolling-window dataset, original order.
        dataset = TimeseriesDataset(X, y, seq_len=self.seq_len)
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          shuffle=False,
                          num_workers=self.num_workers)

    def train_dataloader(self):
        return self._make_loader(self.X_train, self.y_train)

    def val_dataloader(self):
        return self._make_loader(self.X_val, self.y_val)

    def test_dataloader(self):
        return self._make_loader(self.X_test, self.y_test)

### LSTM model

Our model will consist of several LSTM-layers.

In [None]:
class LSTMRegressor(pl.LightningModule):
    '''
    LSTM followed by a linear head that maps the last time step to one value.

    Standard PyTorch Lightning module:
    https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html

    Args:
        n_features: size of each time step's feature vector (LSTM input size).
        hidden_size: LSTM hidden state size.
        seq_len: window length; stored only, not used by the network itself.
        batch_size: batch size; stored only.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability between stacked LSTM layers.
        learning_rate: Adam learning rate.
        criterion: loss callable applied as ``criterion(y_hat, y)``.
    '''
    def __init__(self, 
                 n_features, 
                 hidden_size, 
                 seq_len, 
                 batch_size,
                 num_layers, 
                 dropout, 
                 learning_rate,
                 criterion):
        super().__init__()
        self.n_features = n_features
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.criterion = criterion
        self.learning_rate = learning_rate

        # nn.LSTM applies dropout only between stacked layers, so with a
        # single layer a non-zero value has no effect and merely triggers a
        # warning; pass 0 in that case.
        self.lstm = nn.LSTM(input_size=n_features, 
                            hidden_size=hidden_size,
                            num_layers=num_layers, # number of LSTM-layers.
                            dropout=dropout if num_layers > 1 else 0.0, 
                            batch_first=True)
        self.linear = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        # lstm_out = (batch_size, seq_len, hidden_size)
        lstm_out, _ = self.lstm(x)
        # Regress from the hidden state of the last time step only.
        return self.linear(lstm_out[:, -1])
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def _shared_step(self, batch, log_name):
        # Common body of the train/val/test steps: forward pass, loss, logging.
        x, y = batch
        loss = self.criterion(self(x), y)
        self.log(log_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        return self._shared_step(batch, 'train_loss')
    
    # since the dataset is rather small, we use test data for validation
    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        return self._shared_step(batch, 'test_loss')

We collect all hyperparameters in one place -- a single dict.
This makes it easy to report the parameters of an experiment to experiment-tracking software.

For clarity and reproducibility, researchers often store all hyperparameters
in dedicated YAML configuration files.

You could read more here: [Data Science in Production — Advanced Python Best Practices](https://medium.com/bcggamma/data-science-python-best-practices-fdb16fdedf82).

In [None]:
p = dict(
    seq_len = 1,
    batch_size = 70, 
    criterion = nn.MSELoss(),
    max_epochs = 20,
    n_features = 1,
    hidden_size = 100,
    num_layers = 1,
    dropout = 0.2,
    learning_rate = 0.001,
)

In [None]:
model = LSTMRegressor(
    n_features = p['n_features'],
    hidden_size = p['hidden_size'],
    seq_len = p['seq_len'],
    batch_size = p['batch_size'],
    criterion = p['criterion'],
    num_layers = p['num_layers'],
    dropout = p['dropout'],
    learning_rate = p['learning_rate']
)

In [None]:
# NOTE(review): the model is constructed in the previous cell, i.e. before
# this seed is set, so its initial weights are not covered by the seed.
seed_everything(1)

# No trailing comma after the call: it silently wrapped the logger in a
# one-element tuple.
csv_logger = CSVLogger('./', name='lstm', version='0')

trainer = Trainer(
    max_epochs=p['max_epochs'],
    logger=csv_logger,
    # Use one GPU when available and fall back to CPU otherwise, instead of
    # crashing on CPU-only machines.
    # NOTE(review): `gpus` and `progress_bar_refresh_rate` appear to be
    # removed in newer pytorch-lightning releases — confirm against the
    # installed version (the pip install above is unpinned).
    gpus=int(torch.cuda.is_available()),
    log_every_n_steps=1,
    progress_bar_refresh_rate=2,
)

In [None]:
dm = DataModule(
    seq_len = p['seq_len'],
    batch_size = p['batch_size']
)

In [None]:
trainer.fit(model, dm)
trainer.test(model, datamodule=dm)

In [None]:
metrics = pd.read_csv('./lstm/0/metrics.csv')
train_loss = metrics[['train_loss', 'step', 'epoch']][~np.isnan(metrics['train_loss'])]
val_loss = metrics[['val_loss', 'epoch']][~np.isnan(metrics['val_loss'])]
test_loss = metrics['test_loss'].iloc[-1]

fig, axes = plt.subplots(1, 2, figsize=(16, 5), dpi=100)
axes[0].set_title('Train loss per batch')
axes[0].plot(train_loss['step'], train_loss['train_loss'])
axes[1].set_title('Validation loss per epoch')
axes[1].plot(val_loss['epoch'], val_loss['val_loss'], color='orange')
plt.show(block = True)

print('MSE:')
print(f"Train loss: {train_loss['train_loss'].iloc[-1]:.3f}")
print(f"Val loss:   {val_loss['val_loss'].iloc[-1]:.3f}")
print(f'Test loss:  {test_loss:.3f}')

### Prediction

In [None]:
# Kept so that any later cell still referencing `Variable` keeps working; it
# is no longer needed here.
from torch.autograd import Variable 

# Plain tensors are enough: `Variable` is a deprecated wrapper that simply
# returns a tensor. Reshape to (n_samples, seq_len=1, n_features=1), the
# batch-first layout the LSTM expects.
X_test_lstm = torch.tensor(X_AR_test, dtype=torch.float).view(-1, 1, 1)
X_test_lstm.shape

In [None]:
# Inference only: switch to eval mode, skip gradient tracking, run on the
# device the model lives on, and bring the result back to the CPU before
# converting it to NumPy.
model.eval()
with torch.no_grad():
    Y_pred_test_lstm = model(X_test_lstm.to(model.device)).cpu().numpy()

### Prediction

In [None]:
# Compare the true test targets with the seq_len=1 LSTM predictions.
plt.figure(figsize=(12, 5))
plt.plot(Y_AR_test, label='True', alpha=1.)
plt.plot(Y_pred_test_lstm, label='Prediction', alpha=1.)
plt.xticks(size=14)
plt.yticks(size=14)
plt.legend(loc='best', fontsize=14)
# Positional flag: the `b=` keyword is not accepted by current matplotlib.
plt.grid(True)
plt.show()

In [None]:
print('MSE loss:', mse(Y_AR_test, Y_pred_test_lstm))

# Task 3

Do the same experiment with LSTM for large k value -- take $k=20$ (in the case of LSTM, this is the length of the sequence):

* train new model;
* compare graphs and MSE losses.

In [None]:
# Exercise cell: hyperparameters for the longer-window LSTM run.
seq_len = 20

p = dict(
    seq_len = # your code here (keep a trailing comma after the value)
    batch_size = 50, 
    criterion = nn.MSELoss(),
    max_epochs = 20,
    n_features = 1,
    hidden_size = 20,
    num_layers = 1,
    dropout = 0.2,
    learning_rate = 0.001,
)

In [None]:
dm = DataModule(
    seq_len = p['seq_len'],
    batch_size = p['batch_size']
)

In [None]:
model = LSTMRegressor(
    n_features = p['n_features'],
    hidden_size = p['hidden_size'],
    seq_len = p['seq_len'],
    batch_size = p['batch_size'],
    criterion = p['criterion'],
    num_layers = p['num_layers'],
    dropout = p['dropout'],
    learning_rate = p['learning_rate']
)

In [None]:
# NOTE(review): the model is constructed in the previous cell, i.e. before
# this seed is set, so its initial weights are not covered by the seed.
seed_everything(1)

# No trailing comma after the call: it silently wrapped the logger in a
# one-element tuple.
# NOTE(review): name/version are the same as in the first LSTM run, so both
# runs log into ./lstm/0 — confirm that reusing the directory is intended.
csv_logger = CSVLogger('./', name='lstm', version='0')

trainer = Trainer(
    max_epochs=p['max_epochs'],
    logger=csv_logger,
    # Use one GPU when available and fall back to CPU otherwise, instead of
    # crashing on CPU-only machines.
    # NOTE(review): `gpus` and `progress_bar_refresh_rate` appear to be
    # removed in newer pytorch-lightning releases — confirm against the
    # installed version (the pip install above is unpinned).
    gpus=int(torch.cuda.is_available()),
    log_every_n_steps=1,
    progress_bar_refresh_rate=2,
)

In [None]:
trainer.fit(model, dm)
trainer.test(model, datamodule=dm)

In [None]:
metrics = pd.read_csv('./lstm/0/metrics.csv')
train_loss = metrics[['train_loss', 'step', 'epoch']][~np.isnan(metrics['train_loss'])]
val_loss = metrics[['val_loss', 'epoch']][~np.isnan(metrics['val_loss'])]
test_loss = metrics['test_loss'].iloc[-1]

fig, axes = plt.subplots(1, 2, figsize=(16, 5), dpi=100)
axes[0].set_title('Train loss per batch')
axes[0].plot(train_loss['step'], train_loss['train_loss'])
axes[1].set_title('Validation loss per epoch')
axes[1].plot(val_loss['epoch'], val_loss['val_loss'], color='orange')
plt.show(block = True)

print('MSE:')
print(f"Train loss: {train_loss['train_loss'].iloc[-1]:.3f}")
print(f"Val loss:   {val_loss['val_loss'].iloc[-1]:.3f}")
print(f'Test loss:  {test_loss:.3f}')

### Prediction

In [None]:
X_AR_test_new_k_lstm = # your code here
assert X_AR_test_new_k_lstm.shape == torch.Size([1022, 30, 1])

In [None]:
Y_pred_test_new_k_lstm = # your code here

In [None]:
# The long-window predictions cover only the last len(Y_AR_test_new_k) test
# targets. Plot every curve over that same tail (as in Task 2) so that the
# x positions refer to the same time steps.
n_test_new_k = Y_AR_test_new_k.shape[0]
for i in range(Y_AR_test.shape[1]):
    plt.figure(figsize=(12, 5))
    plt.plot(Y_AR_test_new_k, label='True', alpha=1.)
    plt.plot(Y_pred_test_new_k_lstm, label=f'Prediction, k={new_k}', alpha=.7, color='C4')
    plt.plot(Y_pred_test_lstm[-n_test_new_k:], label='Prediction, k=1', alpha=1.)
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.legend(loc='best', fontsize=14)
    # Positional flag: the `b=` keyword is not accepted by current matplotlib.
    plt.grid(True)
    plt.show()

In [None]:
# Score both models on the same test window (the last len(Y_AR_test_new_k)
# targets), mirroring the comparison in Task 2; arguments are (true, predicted).
n_test_new_k = Y_AR_test_new_k.shape[0]
print('MSE loss k=1:', mse(Y_AR_test[-n_test_new_k:], Y_pred_test_lstm[-n_test_new_k:]))
print(f'MSE loss k={seq_len}:', mse(Y_AR_test_new_k, Y_pred_test_new_k_lstm))

# Conclusion

In this notebook, we have learned to predict a rather toy-like time series using linear models and recurrent neural networks.