# A sequence-to-sequence prediction for YAB and St James' Contact data

### MFK, AMW, MLG

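Next, we'll define a function that preprocesses the data: it integer-encodes each surface, slices the contact sequence into fixed-length input windows whose label is the one-hot-encoded next contact, and splits the windows chronologically (first 80% / last 20%) into training and test datasets: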

In [5]:
import numpy as np

from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_data(data, sequence_length):
    """Turn a sequence of surface contacts into next-contact training data.

    Each distinct surface is mapped to an integer (in sorted order). The
    encoded sequence is then cut into overlapping windows of
    ``sequence_length`` consecutive contacts; the label for a window is the
    contact that immediately follows it, one-hot encoded.

    Parameters
    ----------
    data : str or sequence of hashable, sortable items
        The observed surface contacts, in order.
    sequence_length : int
        Number of consecutive contacts in each input window (must be >= 1).

    Returns
    -------
    x_train, x_test : numpy.ndarray, int32, shape (n, sequence_length)
        Integer-encoded input windows.
    y_train, y_test : numpy.ndarray, float, shape (n, n_surfaces)
        One-hot encoded next contact for each window.
    surface_to_int : dict
        Mapping from surface to its integer code.

    The split is chronological, not shuffled: the first 80% of the windows
    (rounded down) form the training set and the remainder the test set.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1, or ``data`` is too short to
        yield at least one (window, next contact) pair.
    """
    if sequence_length < 1:
        raise ValueError(
            f"sequence_length must be at least 1, got {sequence_length}"
        )
    if len(data) <= sequence_length:
        raise ValueError(
            f"data must contain more than sequence_length ({sequence_length}) "
            f"items to form at least one window, got {len(data)}"
        )

    # Create a mapping from surface to integer
    surfaces = sorted(set(data))
    surface_to_int = {surface: i for i, surface in enumerate(surfaces)}

    # Convert the data to integers
    data_int = [surface_to_int[surface] for surface in data]

    # Split the data into input windows and the contact following each one.
    # Every window has exactly `sequence_length` items, so no padding is
    # needed; the array is built directly (int32, as pad_sequences returned).
    n_windows = len(data_int) - sequence_length
    sequences = np.array(
        [data_int[i: i + sequence_length] for i in range(n_windows)],
        dtype=np.int32,
    )

    # One-hot encode the labels: row k of the identity matrix encodes class k,
    # and the label of window i is data_int[i + sequence_length].
    labels = np.eye(len(surfaces))[data_int[sequence_length:]]

    # Split the data into training and test sets (chronological 80/20)
    split_index = int(0.8 * n_windows)
    x_train, x_test = sequences[:split_index], sequences[split_index:]
    y_train, y_test = labels[:split_index], labels[split_index:]

    return x_train, x_test, y_train, y_test, surface_to_int



Now we can define a function to build and train the RNN model:

In [6]:
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential

def build_and_train_model(x_train, y_train, x_test, y_test, surface_to_int, epochs=100):
    """Build an Embedding -> LSTM -> softmax classifier and fit it.

    The embedding vocabulary size is the number of entries in
    ``surface_to_int``, the expected input length is the number of columns of
    ``x_train`` and the number of output classes is the number of columns of
    ``y_train``. The test split is passed to ``fit`` as validation data, so
    the validation metrics reported during training are measured on it.

    Returns the fitted model together with ``surface_to_int`` (passed through
    unchanged).
    """
    vocabulary_size = len(surface_to_int)
    window_length = x_train.shape[1]
    n_classes = y_train.shape[1]

    # Assemble the layer stack in one go
    model = Sequential([
        Embedding(input_dim=vocabulary_size, output_dim=10, input_length=window_length),
        LSTM(units=50),
        Dense(units=n_classes, activation='softmax'),
    ])

    # Labels are one-hot vectors, hence categorical cross-entropy
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Fit on the training windows, validating on the held-out windows
    model.fit(
        x_train,
        y_train,
        validation_data=(x_test, y_test),
        epochs=epochs,
    )

    return model, surface_to_int


Finally, we can put everything together and use the model to make predictions on a new sequence of surface contacts:

In [7]:
# Toy dataset of surface contacts: one character per contact, in order
data = 'AAABBCCCCDDDDEEEEEE'

# Encode the contacts and cut them into windows of 3 with next-contact labels
(x_train, x_test,
 y_train, y_test,
 surface_to_int) = preprocess_data(data, sequence_length=3)


In [9]:
# Inspect the train/test inputs, their one-hot labels and the surface mapping
(x_train, x_test, y_train, y_test, surface_to_int)

(array([[0, 0, 0],
        [0, 0, 1],
        [0, 1, 1],
        [1, 1, 2],
        [1, 2, 2],
        [2, 2, 2],
        [2, 2, 2],
        [2, 2, 3],
        [2, 3, 3],
        [3, 3, 3],
        [3, 3, 3],
        [3, 3, 4]], dtype=int32),
 array([[3, 4, 4],
        [4, 4, 4],
        [4, 4, 4],
        [4, 4, 4]], dtype=int32),
 array([[0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.]]),
 array([[0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.]]),
 {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4})

In [10]:
# Build the network and fit it on the training windows
# (the test windows are used as validation data; epochs keeps its default)
model, surface_to_int = build_and_train_model(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    surface_to_int=surface_to_int,
)

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-12-27 20:26:37.696127: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-27 20:26:37.696736: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


NotImplementedError: Cannot convert a symbolic Tensor (lstm/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported

In [None]:
# Predict the next surface contact
# The network was built for windows of x_train.shape[1] contacts, so feed it a
# full-length window (here: the most recent contacts were all 'E') rather than
# a single contact.
window = np.full((1, x_train.shape[1]), surface_to_int['E'])
next_contact_probs = model.predict(window)

# The model outputs one probability per surface; map the most probable class
# index back to its surface label.
int_to_surface = {index: surface for surface, index in surface_to_int.items()}
int_to_surface[int(np.argmax(next_contact_probs[0]))]