# Timeseries classification with a Transformer model


# URL and Information about the dataset
url = 'http://timeseriesclassification.com/description.php?Dataset=FordA'

Train size = 3601, test size = 1320, series length = 500, number of classes = 2, number of dimensions = 1.

In [None]:
import os
import pathlib
import random
import shutil
import urllib
import zipfile

# Keras picks its backend from KERAS_BACKEND when it is first imported, so the
# variable has to be set BEFORE any keras import; setting it afterwards (as
# this cell used to) has no effect on the already-loaded backend.
os.environ["KERAS_BACKEND"] = "jax"

import keras
# NOTE(review): this rebinds the name `keras` imported on the previous line to
# `tensorflow.keras` -- confirm which of the two imports is actually intended.
from tensorflow import keras
from keras import layers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Load the train and test splits with pandas.
# The files are read as whitespace-delimited tables without a header row. The
# separator is a regular expression, so it is written as a raw string: in a
# plain string literal "\s" is an invalid escape sequence.
train = pd.read_table('./FordA/FordA_TRAIN.txt', sep=r'\s+', header=None)
test = pd.read_table('./FordA/FordA_TEST.txt', sep=r'\s+', header=None)

# First column is the label and the rest of the columns are the timestamps of each signal
train.head()

# Convert to numpy arrays: column 0 -> integer labels, remaining columns -> signal values
x_train, y_train = train.iloc[:, 1:].values, train.iloc[:, 0].values.astype('int64')
x_test, y_test = test.iloc[:, 1:].values, test.iloc[:, 0].values.astype('int64')

x_train.shape

In [None]:
# Plot a couple of signals.
# Each row of x_train is one measured time series.

time_steps = np.arange(x_train.shape[1])

fig, ax = plt.subplots(figsize=(12, 9))
# The legend labels are read from y_train so they always match the rows that
# are drawn, instead of assuming rows 0 and 1 belong to classes -1 and 1.
ax.plot(time_steps, x_train[0, :], label=f'Class {y_train[0]}', c='r', marker='o', ls='--')
ax.plot(time_steps, x_train[1, :], label=f'Class {y_train[1]}', c='g', marker='*', ls='-')
plt.legend(loc='best')
plt.grid(True)
plt.show()

In [None]:
# Give every series an explicit trailing axis of size 1:
# (samples, time steps) -> (samples, time steps, 1).
x_train = x_train.reshape(*x_train.shape[:2], 1)
x_test = x_test.reshape(*x_test.shape[:2], 1)

# Number of distinct labels found in the training split.
n_classes = np.unique(y_train).size

# Shuffle the training samples, applying the same order to signals and labels.
idx = np.random.permutation(x_train.shape[0])
x_train, y_train = x_train[idx], y_train[idx]

# Remap label -1 to 0 in both splits (modifies the label arrays in place).
for labels in (y_train, y_test):
    labels[labels == -1] = 0

# Build Model 1. Transformer with Conv 1D

Our model processes a tensor of shape (batch size, sequence length, features), where sequence length is the number of time steps and features is the number of values recorded at each time step (1 here, because each input time series is one-dimensional).

You can replace your classification RNN layers with this one: the inputs are fully compatible!

We include residual connections, layer normalization, and dropout. The resulting layer can be stacked multiple times.

The projection layers are implemented through keras.layers.Conv1D.

In [None]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one Transformer encoder block to ``inputs``.

    The block has two stages, each ending in dropout/layer normalization and a
    residual addition: multi-head self-attention, then a two-layer feed-forward
    projection built from kernel-size-1 ``Conv1D`` layers.

    Args:
        inputs: Keras tensor to encode; assumed to be
            (batch, sequence length, features) -- TODO confirm against caller.
            The size of its last axis is the channel count of the output.
        head_size: ``key_dim`` passed to ``MultiHeadAttention``.
        num_heads: Number of attention heads.
        ff_dim: Number of filters in the hidden feed-forward ``Conv1D``.
        dropout: Dropout rate used inside the attention layer and in both
            ``Dropout`` layers.

    Returns:
        The output tensor of the block, with the same last-axis size as
        ``inputs``.
    """
    n_features = inputs.shape[-1]

    # Stage 1: self-attention (query and value are both `inputs`),
    # then dropout, layer normalization and the first residual addition.
    self_attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=head_size, dropout=dropout
    )
    attended = self_attention(inputs, inputs)
    attended = layers.Dropout(dropout)(attended)
    attended = layers.LayerNormalization(epsilon=1e-6)(attended)
    attention_residual = attended + inputs

    # Stage 2: feed-forward part. Expand to ff_dim channels with ReLU, then
    # project back to the input channel count so the residual shapes match.
    expand = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")
    hidden = expand(attention_residual)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Conv1D(filters=n_features, kernel_size=1)(hidden)
    projected = layers.LayerNormalization(epsilon=1e-6)(projected)
    return projected + attention_residual

In [None]:
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0,
    num_classes=None,
):
    """Build the Transformer-based time series classifier.

    The model stacks ``num_transformer_blocks`` encoder blocks, averages over
    the time axis, runs an MLP head and ends in a softmax layer.

    Args:
        input_shape: Shape of one sample (without the batch axis).
        head_size: ``key_dim`` of each attention layer.
        num_heads: Number of attention heads per encoder block.
        ff_dim: Hidden filter count of each block's feed-forward part.
        num_transformer_blocks: How many encoder blocks to stack.
        mlp_units: Iterable of ``Dense`` layer widths for the MLP head.
        dropout: Dropout rate used inside the encoder blocks.
        mlp_dropout: Dropout rate applied after each MLP ``Dense`` layer.
        num_classes: Width of the softmax output layer. When ``None`` the
            module-level ``n_classes`` is used, which keeps existing calls
            working unchanged.

    Returns:
        An uncompiled ``keras.Model`` mapping ``input_shape`` samples to
        per-class probabilities.
    """
    if num_classes is None:
        # Backward-compatible fallback to the global computed from y_train.
        num_classes = n_classes

    inputs = keras.Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    # Collapse the time axis so the MLP head sees one vector per sample.
    x = layers.GlobalAveragePooling1D(data_format="channels_last")(x)
    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(mlp_dropout)(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)
    return keras.Model(inputs, outputs)

In [None]:
# Shape of a single sample: everything except the leading sample axis.
input_shape = x_train.shape[1:]

# Architecture hyperparameters forwarded to build_model.
hyperparameters = {
    "head_size": 256,
    "num_heads": 4,
    "ff_dim": 4,
    "num_transformer_blocks": 4,
    "mlp_units": [128],
    "mlp_dropout": 0.4,
    "dropout": 0.25,
}
model = build_model(input_shape, **hyperparameters)

# Integer labels + softmax output -> sparse categorical loss and accuracy.
optimizer = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)
model.summary()

# Stop after 10 epochs without improvement and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
)
callbacks = [early_stopping]
# Optional: uncomment to draw the model graph.
# keras.utils.plot_model(model, show_shapes=True)

In [None]:

# Train on the shuffled training data; 20% of it is held out for validation,
# and the callbacks defined above may end training before 150 epochs.
training_options = {
    "validation_split": 0.2,
    "epochs": 150,
    "batch_size": 64,
    "callbacks": callbacks,
}
model.fit(x_train, y_train, **training_options)

# Score the trained model on the test split.
model.evaluate(x_test, y_test, verbose=1)

Conclusions

In about 110-120 epochs (25s each on Colab), the model reaches a training accuracy of ~0.95, a validation accuracy of ~0.84 and a test accuracy of ~0.85, without hyperparameter tuning. And that is for a model with fewer than 100k parameters. Of course, parameter count and accuracy could be improved by a hyperparameter search and a more sophisticated learning rate schedule, or a different optimizer.