In [None]:
# needed libraries
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Conv2D, Conv2DTranspose
from tensorflow.keras.layers import Flatten, Reshape, Dropout, BatchNormalization, Activation, LeakyReLU

# utilities
import os
from glob import glob
import matplotlib.pyplot as plt
import pathlib
import time
import datetime

from IPython import display

# Ask TensorFlow which physical GPU devices it can see and show the list
# (an empty list means no GPU is visible to this kernel).
gpu_available = tf.config.list_physical_devices('GPU')
print(gpu_available)

[]


In [None]:
def get_angles(pos, i, d_model):
  """Return the raw (pre-sin/cos) angles of the sinusoidal positional encoding.

  Args:
    pos: position index, scalar or array.
    i: feature index, scalar or array (broadcast against ``pos``).
    d_model: feature width; divides the exponent of the 10000 base.

  Returns:
    ``pos / 10000 ** (2 * (i // 2) / d_model)``, broadcast by NumPy rules.
  """
  # Feature indices 2k and 2k+1 share one frequency, hence the floor division.
  exponent = (2 * (i // 2)) / np.float32(d_model)
  inverse_frequency = 1 / np.power(10000, exponent)
  return pos * inverse_frequency

In [None]:
def positional_encoding(max_position, d_model):
  """Build the sinusoidal positional-encoding table.

  Args:
    max_position: number of positions (rows) to encode.
    d_model: number of features (columns) per position.

  Returns:
    A float32 tensor of shape (1, max_position, d_model); even feature
    columns hold the sine of the angle, odd feature columns the cosine.
  """
  positions = np.arange(max_position)[:, np.newaxis]      # (max_position, 1)
  feature_ids = np.arange(d_model)[np.newaxis, :]         # (1, d_model)
  angles = get_angles(positions, feature_ids, d_model)    # (max_position, d_model)

  # sin on even feature indices (2i), cos on odd feature indices (2i+1)
  on_even_column = (feature_ids % 2) == 0
  table = np.where(on_even_column, np.sin(angles), np.cos(angles))

  # leading axis of size 1 so the table broadcasts over the batch
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [None]:
def ScaledDotProduct(Q, K, V, mask=None):
    """Scaled dot-product attention over the last two axes of the inputs.

    Args:
        Q: queries, shape (..., q_len, depth).
        K: keys, shape (..., k_len, depth).
        V: values, shape (..., k_len, depth_v).
        mask: optional tensor broadcastable to (..., q_len, k_len). It is
            multiplied by -1e9 and added to the logits, so entries equal to 1
            are effectively removed from the softmax.

    Returns:
        A tuple ``(output, weights)``: ``output`` has shape
        (..., q_len, depth_v) and ``weights`` (the softmax over the key axis)
        has shape (..., q_len, k_len).
    """
    dk = tf.cast(tf.shape(K)[-1], tf.float32)

    # transpose_b swaps only the last two axes of K, so the function is not
    # tied to rank-5 inputs the way a hard-coded permutation would be.
    attention = tf.matmul(Q, K, transpose_b=True) / tf.sqrt(dk)

    # mask if necessary
    if mask is not None:
        attention += (mask * -1e9)

    # normalise over the key axis, then take the weighted sum of the values
    weights = tf.nn.softmax(attention, axis=-1)
    output = tf.matmul(weights, V)

    return output, weights

In [None]:
class MultiHeadAttention(keras.layers.Layer):
  """Multi-head attention for inputs laid out as (batch, seq, neighbors, features).

  After the heads are split, the tensors are (batch, head, seq, neighbors,
  features_by_head), so the attention scores are computed between entries of
  the ``neighbors`` axis for every (batch, head, seq) index.

  Args:
    dk: width of the Q/K/V projections and of the output projection.
      Must be divisible by ``num_heads``.
    num_heads: number of attention heads.

  Raises:
    ValueError: if ``dk`` is not divisible by ``num_heads``.
  """

  def __init__(self, dk=256, num_heads=8):
    super(MultiHeadAttention, self).__init__()

    # Without this check the reshape in `splitheads` could silently mix
    # features of different neighbors instead of failing.
    if dk % num_heads != 0:
      raise ValueError(f"dk ({dk}) must be divisible by num_heads ({num_heads})")

    # params
    self.num_heads = num_heads
    self.dk = dk
    self.dk_by_head = dk//num_heads

    # layers
    self.WQ = keras.layers.Dense(dk, use_bias=False)
    self.WK = keras.layers.Dense(dk, use_bias=False)
    self.WV = keras.layers.Dense(dk, use_bias=False)
    self.dense = keras.layers.Dense(dk, use_bias=False)

  def splitheads(self, x):
    """Split the last axis of ``x`` into ``num_heads`` heads.

    Reads the static shape of ``x`` for the first two axes.

    Returns:
      Tensor of shape (batch, head, seq, neighbors, features_by_head).
    """
    batch_size, seq_length = x.shape[0:2]

    # spliting the heads done by reshaping last dimension
    x = tf.reshape(x, (batch_size, seq_length, -1, self.num_heads, self.dk_by_head))      #(batch, seq, neighbors, head, features_by_head)
    return tf.transpose(x, (0, 3, 1, 2, 4))                                               #(batch, head, seq, neighbors, features_by_head)

  def call(self, q, k, v, mask=None):
    """Project q/k/v, attend per head and merge the heads again.

    Args:
      q: query tensor, (batch, seq, neighbors, features).
      k: key tensor, same layout as ``q``.
      v: value tensor, same layout as ``q``.
      mask: optional mask forwarded to ``ScaledDotProduct``.

    Returns:
      A tuple ``(output, attention)``: ``output`` is (batch, seq, neighbors, dk)
      and ``attention`` holds the per-head softmax weights.
    """
    batch_size, seq_length = q.shape[0:2]

    # projections: each input goes through its own weight matrix
    q = self.WQ(q)
    k = self.WK(k)
    v = self.WV(v)

    # split heads
    q = self.splitheads(q)
    k = self.splitheads(k)
    v = self.splitheads(v)

    # compute attention and merge heads
    attn_output, attention = ScaledDotProduct(q, k, v, mask)                              #(batch, head, seq, neighbors, features_by_head)
    attn_output = tf.transpose(attn_output, (0, 2, 3, 1, 4))                              #(batch, seq, neighbors, head, features_by_head)
    concat_output = tf.reshape(attn_output, (batch_size, seq_length, -1, self.dk))        #(batch, seq, neighbors, features)
    output = self.dense(concat_output)

    return output, attention


In [None]:
def get_ffn(d_model, hidden_size, act_func='relu'):
  """Build the two-layer feed-forward network used inside each block.

  Args:
    d_model: output width of the network.
    hidden_size: width of the hidden Dense layer.
    act_func: activation of the hidden layer; the output layer is linear.

  Returns:
    A ``keras.models.Sequential`` of Dense(hidden_size) -> Dense(d_model).
  """
  hidden = keras.layers.Dense(hidden_size, activation=act_func)
  projection = keras.layers.Dense(d_model)
  return keras.models.Sequential([hidden, projection])

In [None]:
class EncoderLayer(keras.layers.Layer):
  """One encoder block: multi-head self-attention followed by a feed-forward net.

  Each sub-block is followed by (optional) dropout, a residual connection and
  layer normalization.

  Args:
    dk: feature width of the input/output and of the attention projections
      (the residual additions require the input's last axis to be ``dk``).
    num_heads: number of attention heads.
    hidden_layer_size: width of the hidden layer of the feed-forward net.
    use_dropout: if False, dropout is never applied.
    drop_rate: rate of both dropout layers.
  """

  def __init__(self, dk=256, num_heads=8, hidden_layer_size=256, use_dropout=True, drop_rate=0.1):
    super(EncoderLayer, self).__init__()
    # params
    self.use_dropout = use_dropout

    # layers
    self.MH = MultiHeadAttention(dk, num_heads)
    # the hidden width comes from `hidden_layer_size`, not from `dk`
    self.ffn = get_ffn(dk, hidden_layer_size, 'relu')
    self.normLayer1 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.normLayer2 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = keras.layers.Dropout(drop_rate)
    self.dropout2 = keras.layers.Dropout(drop_rate)

  def call(self, x, training, mask=None):
    """Run the block.

    Args:
      x: input tensor whose last axis has size ``dk``.
      training: truthy to enable dropout (when ``use_dropout`` is set).
      mask: optional attention mask forwarded to the attention layer.

    Returns:
      Tensor with the same shape as ``x``.
    """
    # multihead self-attention (queries, keys and values are all `x`)
    attn_output, _ = self.MH(x, x, x, mask)

    # dropout layer
    if self.use_dropout and training:
      attn_output = self.dropout1(attn_output)

    # residual + normalization, then feed forward
    z = self.normLayer1(x + attn_output)
    output = self.ffn(z)

    # dropout layer
    if self.use_dropout and training:
      output = self.dropout2(output)

    # residual + normalization
    output = self.normLayer2(z + output)

    return output

In [None]:
# Encoder layer with default hyper-parameters, used for a quick shape check.
sample_encoder_layer = EncoderLayer()

In [None]:
# Run the encoder layer in training mode on random input and show the output shape.
samp_inp = tf.random.uniform((3, 20, 6, 256))
out = sample_encoder_layer(samp_inp, True)
out.shape

TensorShape([3, 20, 6, 256])

In [None]:
class DecoderLayer(keras.layers.Layer):
  """One decoder block: self-attention, encoder-decoder attention, feed-forward.

  Each of the three sub-blocks is followed by (optional) dropout, a residual
  connection and layer normalization.

  Args:
    dk: feature width of the input/output and of the attention projections.
    num_heads: number of heads of both attention layers.
    hidden_layer: width of the hidden layer of the feed-forward net.
    use_dropout: if False, dropout is never applied.
    drop_rate: rate of the three dropout layers.
  """

  def __init__(self, dk=256, num_heads=8, hidden_layer=256, use_dropout=True, drop_rate=0.1):
    super(DecoderLayer, self).__init__()

    # params
    self.use_dropout = use_dropout

    # layers
    self.SAMH = MultiHeadAttention(dk, num_heads)   # self-attention
    self.EDMH = MultiHeadAttention(dk, num_heads)   # encoder-decoder attention
    self.ffn = get_ffn(dk, hidden_layer)

    self.normLayer1 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.normLayer2 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.normLayer3 = keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = keras.layers.Dropout(drop_rate)
    self.dropout2 = keras.layers.Dropout(drop_rate)
    self.dropout3 = keras.layers.Dropout(drop_rate)

  def call(self, x, enc_output, training, mask=None):
    """Run the block.

    Args:
      x: decoder input whose last axis has size ``dk``.
      enc_output: encoder output, used as keys and values of the second
        attention layer.
      training: truthy to enable dropout (when ``use_dropout`` is set).
      mask: optional mask, applied to the self-attention only.

    Returns:
      A tuple ``(output, self_attn, enc_dec_attn)``: the block output and the
      attention weights of the two attention layers.
    """
    # self attention computation
    self_attn_out, self_attn = self.SAMH(x, x, x, mask)

    if self.use_dropout and training:
      self_attn_out = self.dropout1(self_attn_out)

    z = self.normLayer1(x + self_attn_out)

    # encoder decoder computation: queries from the decoder, keys/values from the encoder
    enc_dec_out, enc_dec_attn = self.EDMH(z, enc_output, enc_output)

    if self.use_dropout and training:
      enc_dec_out = self.dropout2(enc_dec_out)

    z = self.normLayer2(z + enc_dec_out)

    # feed forward computation
    output = self.ffn(z)

    if self.use_dropout and training:
      output = self.dropout3(output)

    output = self.normLayer3(z + output)

    return output, self_attn, enc_dec_attn


In [None]:
# Decoder layer with default hyper-parameters, used for a quick shape check.
sample_decoder_layer = DecoderLayer()

In [None]:
# Feed random decoder input plus the encoder-layer output `out` through the
# decoder layer in training mode; element 0 of the returned tuple is the block output.
dec_inp = tf.random.uniform((3, 20, 6, 256))
out2 = sample_decoder_layer(dec_inp, out, True)
out2[0].shape

TensorShape([3, 20, 6, 256])

In [None]:
class Encoder(keras.layers.Layer):
  """Stack of ``EncoderLayer`` blocks on top of a Dense input projection.

  Args:
    features_size: currently unused; the input is projected with a Dense
      layer rather than looked up in an Embedding.
    max_size: number of positions covered by the positional encoding.
    dk_model: feature width used throughout the stack.
    num_heads: attention heads per encoder layer.
    num_encoders: number of stacked encoder layers.
    enc_hidden_size: hidden width of each layer's feed-forward net.
    use_pos_emb: add the sinusoidal positional encoding when True.
    use_dropout: enable dropout (only active while training).
    drop_rate: dropout rate.
  """

  def __init__(self, features_size, max_size, dk_model=256, num_heads=8, num_encoders=6,
               enc_hidden_size=256, use_pos_emb=True, use_dropout=True, drop_rate=0.1):
    super(Encoder, self).__init__()

    # params
    self.dk_model = dk_model
    self.max_size = max_size
    self.use_dropout = use_dropout
    self.use_pos_emb = use_pos_emb
    self.enc_hidden_size = enc_hidden_size
    self.num_encoders = num_encoders

    # The positional table does not depend on the input, so it is built once
    # here instead of being recomputed with NumPy on every forward pass.
    self.pos_encoding = positional_encoding(max_size, dk_model)   # (1, max_size, dk_model)

    # layers
    self.embedding = keras.layers.Dense(dk_model)
    self.encoders_stack = [EncoderLayer(dk_model, num_heads, enc_hidden_size, use_dropout, drop_rate) for _ in range(num_encoders)]
    self.dropout = tf.keras.layers.Dropout(drop_rate)

  def call(self, x, training):
    """Project ``x``, optionally add positions, and run the encoder stack.

    Args:
      x: input tensor; positions are taken along its second-to-last axis.
      training: truthy to enable dropout (when ``use_dropout`` is set).

    Returns:
      Tensor shaped like ``x`` except that the last axis has size ``dk_model``.
    """
    x = self.embedding(x)
    x *= tf.math.sqrt(tf.cast(self.dk_model, tf.float32))

    if self.use_pos_emb:
      # Slice to the actual length so inputs shorter than `max_size` also work.
      x += self.pos_encoding[:, :tf.shape(x)[-2], :]

    if self.use_dropout and training:
      x = self.dropout(x)

    for encoder_layer in self.encoders_stack:
      x = encoder_layer(x, training)

    return x

In [None]:
# Smoke-test the full Encoder (max_size=20, dk_model=256) in training mode on
# random input and show the output shape.
samp_inp = tf.random.uniform((3, 6, 20, 256))
encoder = Encoder(256, 20, 256)
out = encoder(samp_inp, True)
out.shape

TensorShape([3, 6, 20, 256])

In [None]:
class Decoder(keras.layers.Layer):
  """Stack of ``DecoderLayer`` blocks on top of a Dense input projection.

  Args:
    features_size: accepted but not used by this layer.
    max_size: number of positions covered by the positional encoding.
    dk_model: feature width used throughout the stack.
    num_heads: attention heads per decoder layer.
    num_decoders: number of stacked decoder layers.
    dec_hidden_size: hidden width of each layer's feed-forward net.
    use_pos_emb: add the sinusoidal positional encoding when True.
    use_dropout: enable dropout (only active while training).
    drop_rate: dropout rate.
  """

  def __init__(self, features_size, max_size, dk_model=256, num_heads=8, num_decoders=6,
               dec_hidden_size=256, use_pos_emb=True, use_dropout=True, drop_rate=0.1):

    super(Decoder, self).__init__()

    # params
    self.dk_model = dk_model
    self.max_size = max_size
    self.use_dropout = use_dropout
    self.use_pos_emb = use_pos_emb
    self.dec_hidden_size = dec_hidden_size
    self.num_decoders = num_decoders

    # The positional table does not depend on the input, so it is built once
    # here instead of being recomputed with NumPy on every forward pass.
    self.pos_encoding = positional_encoding(max_size, dk_model)   # (1, max_size, dk_model)

    # layers
    self.embedding = keras.layers.Dense(dk_model)
    self.decoders_stack = [DecoderLayer(dk_model, num_heads, dec_hidden_size, use_dropout, drop_rate) for _ in range(num_decoders)]
    self.dropout = tf.keras.layers.Dropout(drop_rate)

  # `call` must be a method of the class (not a function nested in __init__),
  # otherwise Keras never runs the decoder stack.
  def call(self, x, enc_output, training):
    """Project ``x``, optionally add positions, and run the decoder stack.

    Args:
      x: decoder input; positions are taken along its second-to-last axis.
      enc_output: encoder output handed to every decoder layer.
      training: truthy to enable dropout (when ``use_dropout`` is set).

    Returns:
      Output of the last decoder layer (the attention weights are discarded).
    """
    x = self.embedding(x)
    x *= tf.math.sqrt(tf.cast(self.dk_model, tf.float32))

    if self.use_pos_emb:
      # Slice to the actual length so inputs shorter than `max_size` also work.
      x += self.pos_encoding[:, :tf.shape(x)[-2], :]

    if self.use_dropout and training:
      x = self.dropout(x)

    for decoder_layer in self.decoders_stack:
      x, _, _ = decoder_layer(x, enc_output, training)

    return x


In [None]:
# Smoke-test the Decoder (max_size=20, dk_model=256) in training mode, passing
# the Encoder output `out` as the encoder context, and show the output shape.
samp_inp = tf.random.uniform((3, 6, 20, 256))
decoder = Decoder(256, 20, 256)
out2 = decoder(samp_inp, out, True)
out2.shape

TensorShape([3, 6, 20, 256])

In [None]:
class STTransformer(keras.layers.Layer):
  """Spatial encoder followed by a temporal encoder.

  The input is laid out as (batch, seq, neighbors, features). The spatial
  encoder runs on that layout without positional encoding; its output is
  transposed to (batch, neighbors, seq, features) and fed to the temporal
  encoder, which adds positional encoding along ``seq``.

  The two decoders are constructed but not used by ``call`` yet.

  Args:
    features_size: forwarded to the encoders/decoders.
    max_seq_size: positional-encoding length of the temporal blocks.
    max_neighbors_size: positional-encoding length of the spatial blocks.
    sp_dk, tm_dk: feature width of the spatial / temporal blocks.
    sp_enc_heads, sp_dec_heads, tm_enc_heads, tm_dec_heads: attention heads.
    sp_num_encoders, sp_num_decoders, tm_num_encoders, tm_num_decoders:
      depth of each stack.
    dec_hidden_size: feed-forward hidden width of both decoders.
    use_dropout: enable dropout in every sub-layer (only active while training).
    drop_rate: dropout rate of every sub-layer.
  """

  def __init__(self, features_size, max_seq_size, max_neighbors_size,
               sp_dk=256, sp_enc_heads=8, sp_dec_heads=8, sp_num_encoders=6, sp_num_decoders=6,
               tm_dk=256, tm_enc_heads=8, tm_dec_heads=8, tm_num_encoders=6, tm_num_decoders=6,
               dec_hidden_size=256, use_dropout=True, drop_rate=0.1):

    super(STTransformer, self).__init__()

    # layers -- every hyper-parameter of the signature is forwarded so that
    # non-default values actually take effect
    self.sp_encoder = Encoder(features_size, max_neighbors_size, sp_dk,
                              num_heads=sp_enc_heads, num_encoders=sp_num_encoders,
                              use_pos_emb=False, use_dropout=use_dropout, drop_rate=drop_rate)
    self.sp_decoder = Decoder(features_size, max_neighbors_size, sp_dk,
                              num_heads=sp_dec_heads, num_decoders=sp_num_decoders,
                              dec_hidden_size=dec_hidden_size,
                              use_pos_emb=False, use_dropout=use_dropout, drop_rate=drop_rate)
    self.tm_encoder = Encoder(features_size, max_seq_size, tm_dk,
                              num_heads=tm_enc_heads, num_encoders=tm_num_encoders,
                              use_dropout=use_dropout, drop_rate=drop_rate)
    self.tm_decoder = Decoder(features_size, max_seq_size, tm_dk,
                              num_heads=tm_dec_heads, num_decoders=tm_num_decoders,
                              dec_hidden_size=dec_hidden_size,
                              use_dropout=use_dropout, drop_rate=drop_rate)

  def call(self, inputs, training=None):
    """Encode ``inputs`` spatially, then temporally.

    Args:
      inputs: tensor laid out as (batch, seq, neighbors, features).
      training: truthy to enable dropout; optional so the layer can be called
        with the input alone.

    Returns:
      Temporal encoder output, (batch, neighbors, seq, <time attn features>).
    """
    x = inputs
    sp_enc_out = self.sp_encoder(x, training)                               #(batch, seq, neighbors, <spatial attn features>)
    out = tf.transpose(sp_enc_out, [0, 2, 1, 3])                            #(batch, neighbors, seq, <spatial attn features>)
    tm_enc_out = self.tm_encoder(out, training)                             #(batch, neighbors, seq, <time attn features>)

    # TODO: wire in the decoders once their input shapes are reviewed:
    #tm_dec_out = self.tm_decoder(targets, tm_enc_out, training)
    #sp_dec_out = self.sp_decoder(tm_dec_out, sp_enc_out, training)
    return tm_enc_out

In [None]:
# features_size=100, max_seq_size=20, max_neighbors_size=6; everything else default.
model = STTransformer(100, 20, 6)

In [None]:
# Random input laid out as (batch=3, seq=20, neighbors=6, features=256).
# NOTE(review): the name `input` shadows the Python builtin; it is kept because
# the following cell reads this variable.
input = tf.random.uniform((3, 20, 6, 256))

In [None]:
# Forward pass without an explicit `training` argument.
o = model(input)

In [None]:
# Show the shape of the model output.
o.shape

TensorShape([3, 6, 20, 256])