# Predicting Time Sequences with Recurrent Neural Networks

DATA PREPARATION

In [None]:
import pandas as pd

The dataset is based on the U.S. Census Bureau's Monthly Retail Trade time series: [census.gov/retail/marts/www/timeseries.html](https://www.census.gov/retail/marts/www/timeseries.html)

In [None]:
# Read the whitespace-separated values of a text file into a flat list of floats.
def read_text_file(filename: str) -> list:
    """Read the numeric values from a whitespace-delimited text file.

    The first line is treated as a header and skipped. On every remaining
    line the first whitespace-separated token is discarded (presumably a row
    label such as the year -- confirm against the data file) and the rest are
    parsed as floats.

    Args:
        filename: The name of the text file to read in.

    Returns:
        All parsed values as one flat list of floats, in file order. An empty
        file yields an empty list.

    Raises:
        ValueError: If a value token cannot be converted to float.
    """
    values = []
    with open(filename, encoding='utf-8') as f:
        # Skip the header line; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            values.extend(line.split()[1:])
    return [float(value) for value in values]

In [None]:
# Load the historical sales figures from the text file as a flat list of floats.
sales_data = read_text_file('historical_sales_data.txt')

In [None]:
# Convert the list to a pandas DataFrame indexed by month-end dates
# (January 1992 through September 2021).
# The dates are written in ISO format (YYYY-MM-DD) so they cannot be misread
# as month-first.
# NOTE: pandas >= 2.2 prefers the alias 'ME' for month-end; 'M' is kept here
# so the notebook also runs on older pandas versions.
historical_sales = pd.DataFrame(
    sales_data,
    index=pd.date_range(start='1992-01-31', end='2021-09-30', freq='M'),
    columns=['sales'],
)
historical_sales.plot()

In [None]:
# For learning purposes, the data will be truncated just before March 2020. (COVID-19)
# Slice by month ('2020-02') rather than by day: the index holds month-end
# dates and 2020 is a leap year, so February 2020 is stamped 2020-02-29 and a
# '2020-02-28' cutoff would silently drop it.
historical_sales[:'2020-02'].plot()

In [None]:
# Remove COVID-19 era data (March 2020 onwards) and convert to a 1-D numpy array.
# The cutoff is the month '2020-02', not the day '2020-02-28': the index holds
# month-end dates and February 2020 ends on the 29th (leap year), so a
# day-level cutoff on the 28th would drop the last pre-COVID month.
sales = historical_sales[:'2020-02'].values.reshape(-1)
sales.shape

TENSORFLOW

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import SimpleRNN
import logging

In [None]:
# Number of training epochs.
EPOCHS = 100
# Mini-batch size used during training.
BATCH_SIZE = 16
# Fraction of the months (from the start of the series) used for training;
# the remainder becomes the test split.
TRAIN_TEST_SPLIT = 0.8
# Minimum number of months of history in an input sequence; the first
# prediction target is therefore month index MIN of each split.
MIN = 12

In [None]:
# Chronological train/test split: the first TRAIN_TEST_SPLIT share of the
# months is used for training, the remaining months are held out for testing.
months = len(sales)
split = int(TRAIN_TEST_SPLIT * months)
train_sales = sales[:split]
test_sales = sales[split:]

In [None]:
# Naive baseline that simply predicts what was observed in the previous month.
# The first MIN test months are skipped, so test_output[k] is month MIN+k of
# the test split and naive_prediction[k] is the month right before it.
test_output = test_sales[MIN:]
naive_prediction = test_sales[MIN-1:-1]

In [None]:
# Plot the actual test months against the naive previous-month prediction.
x = range(len(test_output))
plt.plot(x, test_output, 'g-', label='test_output')  # label typo fixed
plt.plot(x, naive_prediction, 'm-', label='naive prediction')
plt.title('Historical sales')
plt.xlabel('months')
plt.ylabel('Monthly historical sales')
plt.legend()
plt.show()

In [None]:
# Standardize both splits with the mean and standard deviation of the
# training split only, so no statistics from the test months are used.
mean = train_sales.mean()
stddev = train_sales.std()
train_sales_std, test_sales_std = (
    (part - mean) / stddev for part in (train_sales, test_sales)
)

In [None]:
# Recast each standardized series as supervised examples (X -> y).
# Example i takes the first i+MIN months as input, right-aligned in a
# zero-padded sequence, and the month that follows as the target.
train_months = len(train_sales)
train_examples = train_months - MIN
train_X = np.zeros((train_examples, train_months - 1, 1))
train_y = np.zeros((train_examples, 1))
for i in range(train_examples):
    history = i + MIN
    train_X[i, -history:, 0] = train_sales_std[:history]
    train_y[i, 0] = train_sales_std[history]

# Same construction for the test split.
test_months = len(test_sales)
test_examples = test_months - MIN
test_X = np.zeros((test_examples, test_months - 1, 1))
test_y = np.zeros((test_examples, 1))
for i in range(test_examples):
    history = i + MIN
    test_X[i, -history:, 0] = test_sales_std[:history]
    test_y[i, 0] = test_sales_std[history]

In [None]:
# The model: one recurrent layer over variable-length sequences with a single
# feature per step, followed by a linear unit that outputs the prediction.
model = Sequential([
    SimpleRNN(128, activation='relu', input_shape=(None, 1)),
    Dense(1, activation='linear'),
])
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)
model.summary()

In [None]:
# Train the network, using the test examples as the validation data.
history = model.fit(
    train_X,
    train_y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    validation_data=(test_X, test_y),
    verbose=2,
)

In [None]:
# Score the naive previous-month baseline on the standardized test data so
# its errors are on the same scale as the model's loss and metric.
test_output = test_sales_std[MIN:]
naive_prediction = test_sales_std[MIN-1:-1]
naive_errors = naive_prediction - test_output
mean_squared_error = np.mean(np.square(naive_errors))
mean_abs_error = np.mean(np.abs(naive_errors))
print('naive test mse: ', mean_squared_error)
print('naive test mean abs: ', mean_abs_error)

In [None]:
# Run the trained model over all test examples in a single batch.
predicted_test = model.predict(test_X, batch_size=len(test_X))
# Flatten to one value per example and undo the standardization.
predicted_test = predicted_test.reshape(len(predicted_test))
predicted_test = mean + stddev * predicted_test

# Plot the predictions against the actual test months.
n_points = len(test_sales) - MIN
x = range(n_points)
plt.plot(x, predicted_test, 'm-', label='predicted test_output')
plt.plot(x, test_sales[-n_points:], 'g-', label='actual test_output')
plt.title('Historical sales')
plt.xlabel('months')
plt.ylabel('Predicted historical sales')
plt.legend()
plt.show()

PYTORCH

In [None]:
import torch
import torch.nn as nn 
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from utils import train_model

In [None]:
# Pick the compute device: the first CUDA GPU when available, else the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [None]:
def _make_examples(series, min_history):
    """Build zero-padded supervised examples (X -> y) from a 1-D series.

    Example i uses the first ``i + min_history`` values of ``series`` as its
    input, right-aligned in a sequence of length ``len(series) - 1`` that is
    zero-padded on the left, and the value that follows as its target.

    Args:
        series: 1-D sequence of values (here: standardized monthly sales).
        min_history: Number of values in the shortest input sequence.

    Returns:
        A tuple ``(X, y)`` of float32 arrays with shapes
        ``(len(series) - min_history, len(series) - 1, 1)`` and
        ``(len(series) - min_history, 1)``.
    """
    n_months = len(series)
    n_examples = n_months - min_history
    X = np.zeros((n_examples, n_months - 1, 1), dtype=np.float32)
    y = np.zeros((n_examples, 1), dtype=np.float32)
    for i in range(n_examples):
        history = i + min_history
        X[i, -history:, 0] = series[:history]
        y[i, 0] = series[history]
    return X, y


# Create train examples.
train_months = len(train_sales)
train_X, train_y = _make_examples(train_sales_std, MIN)

# Create test examples.
test_months = len(test_sales)
test_X, test_y = _make_examples(test_sales_std, MIN)

# Create Dataset objects.
trainset = TensorDataset(torch.from_numpy(train_X).clone(), torch.from_numpy(train_y))
testset = TensorDataset(torch.from_numpy(test_X).clone(), torch.from_numpy(test_y))

In [None]:
# Custom layer that keeps only the final hidden state of a preceding RNN layer.
class LastTimestep(nn.Module):
    """Pick the first entry of the second element of the incoming tuple.

    Intended to follow ``nn.RNN``, whose forward pass returns
    ``(output, h_n)``; for a single-layer, unidirectional RNN ``h_n[0]`` is
    the hidden state after the last time step.
    """

    def forward(self, inputs):
        hidden_states = inputs[1]
        return hidden_states[0]

# Build the network: a ReLU RNN over one input feature per step, reduced to
# its final hidden state, then a linear layer producing a single value.
model = nn.Sequential(
    nn.RNN(input_size=1, hidden_size=128, nonlinearity='relu', batch_first=True),
    LastTimestep(),
    nn.Linear(in_features=128, out_features=1),
)

# Mean-squared-error loss, optimized with Adam at its default settings.
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Train with the helper imported from utils.
train_model(
    model, device, EPOCHS, BATCH_SIZE, trainset, testset,
    optimizer, loss_function, 'mae',
)

In [None]:

# Score the naive previous-month baseline on the standardized test data so
# its errors are on the same scale as the values the model is trained on.
test_output = test_sales_std[MIN:]
naive_prediction = test_sales_std[MIN-1:-1]
naive_errors = naive_prediction - test_output
mean_squared_error = np.mean(np.square(naive_errors))
mean_abs_error = np.mean(np.abs(naive_errors))
print('naive test mse: ', mean_squared_error)
print('naive test mean abs: ', mean_abs_error)

In [None]:
# Use trained model to predict the test data.
# NOTE(review): assumes train_model left the model on `device` -- confirm.
inputs = torch.from_numpy(test_X)
inputs = inputs.to(device)
# Inference only: disable autograd so no computation graph is recorded.
with torch.no_grad():
    outputs = model(inputs)
predicted_test = outputs.cpu().numpy()

# De-standardize output.
predicted_test = np.reshape(predicted_test,
                            (len(predicted_test)))
predicted_test = predicted_test * stddev + mean

# Plot test prediction.
x = range(len(test_sales)-MIN)
plt.plot(x, predicted_test, 'm-', label='predicted test_output')
plt.plot(x, test_sales[-(len(test_sales)-MIN):], 'g-', label='actual test_output')
plt.title('Historical sales')
plt.xlabel('months')
plt.ylabel('Predicted historical sales')
plt.legend()
plt.show()