In [1]:
# needed libraries
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Conv2D, Conv2DTranspose
from tensorflow.keras.layers import Flatten, Reshape, Dropout, BatchNormalization, Activation, LeakyReLU

# utilities
import os
from glob import glob
import matplotlib.pyplot as plt
import pathlib
import time
import datetime

from IPython import display

gpu_available = tf.config.list_physical_devices('GPU')
print(gpu_available)

[]


In [None]:
from google.colab import drive
drive._mount('/content/drive')

ValueError: ignored

In [None]:
!pip3 install pickle5
import pickle5 as pickle

# store processed data in pkl files
def save_pkl_data(data, filename):
    with open(filename, 'wb') as file:
        pickle.dump(data, file, pickle.HIGHEST_PROTOCOL)
        print("data stored succesfully to: ", filename)


# read processed data in pkl files
def load_pkl_data(filename):
    with open(filename, 'rb') as file:
        data = pickle.load(file)
    return data

Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[?25l[K     |█▎                              | 10 kB 22.1 MB/s eta 0:00:01[K     |██▋                             | 20 kB 29.5 MB/s eta 0:00:01[K     |███▉                            | 30 kB 23.0 MB/s eta 0:00:01[K     |█████▏                          | 40 kB 18.6 MB/s eta 0:00:01[K     |██████▍                         | 51 kB 14.5 MB/s eta 0:00:01[K     |███████▊                        | 61 kB 14.8 MB/s eta 0:00:01[K     |█████████                       | 71 kB 12.5 MB/s eta 0:00:01[K     |██████████▎                     | 81 kB 13.6 MB/s eta 0:00:01[K     |███████████▌                    | 92 kB 14.0 MB/s eta 0:00:01[K     |████████████▉                   | 102 kB 12.2 MB/s eta 0:00:01[K     |██████████████                  | 112 kB 12.2 MB/s eta 0:00:01[K     |███████████████▍                | 122 kB 12.2 MB/s eta 0:00:01[K     |████████████████▋ 

In [None]:
cubes = load_pkl_data('nusc_inps.pkl') 

# Masking

In [2]:
def get_look_ahead_mask(input):
  input_shape = list(input.shape)[:-1]
  input_shape.insert(-1, input_shape[-1])
  input_shape.insert(1, 1)
  mask = 1 - tf.linalg.band_part(tf.ones(input_shape), -1, 0)
  return mask

In [3]:
def adapt_spatial_mask(mask):
  return mask[np.newaxis, : , np.newaxis, : ]         #(1 (head), seq, 1 (neighbor), neighbors) to broadcast when doing addition in the attention layer

In [4]:
def adapt_seq_mask(mask):
  return mask[np.newaxis, np.newaxis, np.newaxis, : ]   #(1 (head), 1(neighbors), 1(seq), seq)

# Positional Encoding

In [None]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates

In [None]:
def positional_encoding(max_position, d_model):
  angle_rads = get_angles(np.arange(max_position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)

  # apply sin to even indices in the array; 2i
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

  # apply cos to odd indices in the array; 2i+1
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

  pos_encoding = angle_rads[np.newaxis, ...]

  return tf.cast(pos_encoding, dtype=tf.float32)

# Attention

In [None]:
def ScaledDotProduct(Q, K, V, mask=None):
    dk = tf.cast(tf.shape(K)[-1], tf.float32)

    # compute attention 
    KT = tf.transpose(K, [0, 1, 2, 4, 3])                 
    attention = tf.matmul(Q, KT)/tf.sqrt(dk)

    # mask if necessary
    if mask is not None:
      #print(attention.shape)
      attention += (mask * -1e9)

    # compute values and weighted sum of their attention
    weights = tf.nn.softmax(attention, axis=-1)
    output = tf.matmul(weights, V)

    return output, weights 

In [None]:
class MultiHeadAttention(keras.layers.Layer):
  def __init__(self, dk=256, num_heads=8):
    super(MultiHeadAttention, self).__init__()
    
    # params
    self.num_heads = num_heads
    self.dk = dk
    self.dk_by_head = dk//num_heads

    # layers
    self.WQ = keras.layers.Dense(dk)
    self.WK = keras.layers.Dense(dk)
    self.WV = keras.layers.Dense(dk)
    self.dense = keras.layers.Dense(dk)
    
  def splitheads(self, x):
    batch_size, seq_length = x.shape[0:2]

    # spliting the heads done by reshaping last dimension
    x = tf.reshape(x, (batch_size, seq_length, -1, self.num_heads, self.dk_by_head))      #(batch, seq, neighbors, head, features_by_head)
    return tf.transpose(x, (0, 3, 1, 2, 4))                                               #(batch, head, seq, neighbors, features_by_head)

  def call(self, q, k, v, mask=None):
    batch_size, seq_length = q.shape[0:2]

    # projections
    q = self.WQ(q)
    k = self.WK(v)
    v = self.WV(k)

    # split heads
    q = self.splitheads(q)
    k = self.splitheads(k)
    v = self.splitheads(v)

    # compute attention and merge heads
    attn_output, attention = ScaledDotProduct(q, k, v, mask)                              #(batch, head, seq, neighbors, features_by_head)
    attn_output = tf.transpose(attn_output,  (0, 2, 3, 1, 4))                             #(batch, seq, neighbors, head, features_by_head)
    concat_output = tf.reshape(attn_output, (batch_size, seq_length, -1, self.dk))        #(batch, seq, neighbors, features)
    output = self.dense(concat_output)

    return output, attention


# Transformer Architecture

In [None]:
def get_ffn(d_model, hidden_size, act_func='relu'):
  return keras.models.Sequential([
                                  keras.layers.Dense(hidden_size, activation=act_func),
                                  keras.layers.Dense(d_model)
  ], name='SEQ')

In [None]:
class EncoderLayer(keras.layers.Layer):
  def __init__(self, dk=256, num_heads=8, hidden_layer_size=256, use_dropout=True, drop_rate=0.1):
    super(EncoderLayer, self).__init__()
    # params
    self.use_dropout = use_dropout

    # layers
    self.MH = MultiHeadAttention(dk, num_heads)
    self.ffn = get_ffn(dk, dk, 'relu')
    self.normLayer1 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.normLayer2 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = keras.layers.Dropout(drop_rate)
    self.dropout2 = keras.layers.Dropout(drop_rate)

  def call(self, x, training, mask):
    # multihead attention
    attn_output, _ = self.MH(x, x, x, mask)

    # dropout layer
    if self.use_dropout and training:
      attn_output = self.dropout1(attn_output)
    
    # normalization and feed forward layers
    z = self.normLayer1(x + attn_output)
    output = self.ffn(z)

    # dropout layer
    if self.use_dropout and training:
      output = self.dropout2(output)
    
    # normalization layer
    output = self.normLayer2(z + output)

    return output 

In [None]:
sample_encoder_layer = EncoderLayer()

In [None]:
samp_inp = tf.random.uniform((3, 20, 6, 256))
out = sample_encoder_layer(samp_inp, True, None)
out.shape

TensorShape([3, 20, 6, 256])

In [None]:
class DecoderLayer(keras.layers.Layer):
  def __init__(self, dk=256, num_heads=8, hidden_layer=256, use_dropout=True, drop_rate=0.1):
    super(DecoderLayer, self).__init__()

    #params
    self.use_dropout = use_dropout

    # layers
    self.SAMH = MultiHeadAttention(dk, num_heads)
    self.EDMH = MultiHeadAttention(dk, num_heads)
    self.ffn = get_ffn(dk, hidden_layer)

    self.normLayer1 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.normLayer2 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.normLayer3 = keras.layers.LayerNormalization(epsilon=1e-6)\

    self.dropout1 = keras.layers.Dropout(drop_rate)
    self.dropout2 = keras.layers.Dropout(drop_rate)
    self.dropout3 = keras.layers.Dropout(drop_rate)
  
  def call(self, x, enc_output, training, look_ahead_mask, padding_mask):

    # self attention computation
    self_attn_out, self_attn = self.SAMH(x, x, x, look_ahead_mask)

    if self.use_dropout and training:
      self_attn_out = self.dropout1(self_attn_out)
    
    z = self.normLayer1(x + self_attn_out) 

    # encoder decoder computation
    enc_dec_out, enc_dec_attn = self.EDMH(z, enc_output, enc_output, padding_mask)

    if self.use_dropout and training:
      enc_dec_out = self.dropout2(enc_dec_out)
    
    z = self.normLayer2(z + enc_dec_out)

    # feed forward computation
    output = self.ffn(z)

    if self.use_dropout and training:
      output = self.dropout3(output)
    
    output = self.normLayer3(z + output)

    return output, self_attn, enc_dec_attn


In [None]:
sample_decoder_layer = DecoderLayer()

In [None]:
dec_inp = tf.random.uniform((3, 20, 6, 256))
out2 = sample_decoder_layer(dec_inp, out, True, None, None)
out2[0].shape

TensorShape([3, 20, 6, 256])

In [None]:
class Encoder(keras.layers.Layer):
  def __init__(self, features_size, max_size, dk_model=256, num_heads=8, num_encoders=6, 
               enc_hidden_size=256, use_pos_emb=True, use_dropout=True, drop_rate=0.1):
    super(Encoder, self).__init__()

    # params
    self.dk_model = dk_model
    self.max_size = max_size
    self.use_dropout = use_dropout
    self.use_pos_emb = use_pos_emb
    self.enc_hidden_size = enc_hidden_size
    self.num_encoders = num_encoders

    # layers
    #self.embedding = keras.layers.Embedding(features_size, dk_model)
    self.embedding = keras.layers.Dense(dk_model)
    self.encoders_stack = [EncoderLayer(dk_model, num_heads, enc_hidden_size, use_dropout, drop_rate) for _ in range(num_encoders)]
    self.dropout = tf.keras.layers.Dropout(drop_rate)
  
  def call(self, x, padding_mask, training):
    x = self.embedding(x)
    x *= tf.math.sqrt(tf.cast(self.dk_model, tf.float32))

    if self.use_pos_emb:
      x += positional_encoding(self.max_size, self.dk_model)
    
    if self.use_dropout and training:
      x = self.dropout(x)
    
    for encoder_layer in self.encoders_stack:
      x = encoder_layer(x, training, padding_mask)
    
    return x

In [None]:
samp_inp = tf.random.uniform((3, 6, 20, 256))
encoder = Encoder(256, 20, 256)
out = encoder(samp_inp, None, True)
out.shape

HI


TensorShape([3, 6, 20, 256])

In [None]:
class Decoder(keras.layers.Layer):
  def __init__(self, features_size, max_size, dk_model=256, num_heads=8, num_decoders=6, 
               dec_hidden_size=256, use_pos_emb=True, use_dropout=True, drop_rate=0.1):
    
    super(Decoder, self).__init__()

    # params
    self.dk_model = dk_model
    self.max_size = max_size
    self.use_dropout = use_dropout
    self.use_pos_emb = use_pos_emb
    self.dec_hidden_size = dec_hidden_size
    self.num_decoders = num_decoders

    # layers
    self.embedding = keras.layers.Dense(dk_model)
    self.decoders_stack = [DecoderLayer(dk_model, num_heads, dec_hidden_size, use_dropout, drop_rate) for _ in range(num_decoders)]
    self.dropout = tf.keras.layers.Dropout(drop_rate)

  def call(self, x, enc_output, look_ahead_mask, padding_mask, training):
    #print(x)
    x = self.embedding(x)
    #print(x)
    x *= tf.math.sqrt(tf.cast(self.dk_model, tf.float32))
    if self.use_pos_emb:
      x += positional_encoding(self.max_size, self.dk_model)
    
    #print(x)
    if self.use_dropout and training:
      x = self.dropout(x)
    
    for decoder_layer in self.decoders_stack:
      x, attn1, attn2, = decoder_layer(x, enc_output, training, look_ahead_mask, padding_mask)
    
    return x


In [None]:
samp_inp = tf.random.uniform((3, 6, 20, 256))
decoder = Decoder(256, 20, 256)
out2 = decoder(samp_inp, out, None, None, True)
out2.shape

HI
xs:  (3, 6, 20, 256)
pe:  (1, 20, 256)
xs:  (3, 6, 20, 256)


TensorShape([3, 6, 20, 256])

In [None]:
class STTransformer(keras.Model):
  def __init__(self, features_size, max_seq_size, max_neighbors_size, 
               sp_dk=256, sp_enc_heads=8, sp_dec_heads=8, sp_num_encoders=6, sp_num_decoders=6, 
               tm_dk=256, tm_enc_heads=8, tm_dec_heads=8, tm_num_encoders=6, tm_num_decoders=6, 
               dec_hidden_size=256, use_dropout=True, drop_rate=0.1):
    
    super(STTransformer, self).__init__()

    # layers
    self.sp_encoder = Encoder(features_size, max_neighbors_size, sp_dk, use_pos_emb=False)
    self.sp_decoder = Decoder(features_size, max_neighbors_size, sp_dk, use_pos_emb=False)
    self.tm_encoder = Encoder(features_size, max_seq_size, tm_dk)
    self.tm_decoder = Decoder(features_size, max_seq_size, tm_dk)
    self.linear = tf.keras.layers.Dense(3, name='Linear_Trans')

    
  def call(self, inputs, training):
    inp, inp_masks, seq_inp_masks, targets, tar_masks, seq_tar_masks = inputs
    
    sp_enc_out = self.sp_encoder(inp,  inp_masks, training)                               #(batch, seq, neighbors, <spatial attn features>)
    out = tf.transpose(sp_enc_out, [0, 2, 1, 3])                                          #(batch, neighbors, seq, <spatial attn features>)
    tm_enc_out = self.tm_encoder(out, seq_inp_masks, training)                            #(batch, neighbots, seq, <time attn features>)
    
    # decode time
    targets = tf.transpose(targets, [0, 2, 1, 3])                                         #(batch, neighbors, seq, features)
    look_mask = get_look_ahead_mask(targets)
    tm_dec_out = self.tm_decoder(targets, tm_enc_out, look_mask, seq_tar_masks, training) 
    out2 = tf.transpose(tm_dec_out, [0, 2, 1, 3])                                         #(batch, seq, neighbors, features)
    sp_dec_out = self.sp_decoder(out2, sp_enc_out, None, tar_masks, training)
    
    # linear projection
    output = self.linear(sp_dec_out)
    return output

In [None]:
def buildDataSet(input, batch_size):
  input_ds = tf.data.Dataset.from_tensor_slices([x[0].astype(np.float32) for x in input])
  inpMask_ds = tf.data.Dataset.from_tensor_slices([ adapt_spatial_mask(x[1].astype(np.float32)) for x in input])
  seq_inpMask_ds = tf.data.Dataset.from_tensor_slices([adapt_seq_mask(x[2].astype(np.float32)) for x in input])

  target_ds = tf.data.Dataset.from_tensor_slices([x[3].astype(np.float32) for x in input])
  tarMask_ds = tf.data.Dataset.from_tensor_slices([adapt_spatial_mask(x[4].astype(np.float32)) for x in input])
  seq_tarMask_ds = tf.data.Dataset.from_tensor_slices([adapt_seq_mask(x[5].astype(np.float32)) for x in input])

  dataset = tf.data.Dataset.zip((input_ds, inpMask_ds, seq_inpMask_ds, target_ds, tarMask_ds, seq_tarMask_ds))
  dataset = dataset.shuffle(100)
  dataset = dataset.batch(batch_size)
  return dataset

In [None]:
BATCH_SIZE = 8

In [None]:
dataset = buildDataSet(cubes, BATCH_SIZE)

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
learning_rate = CustomSchedule(16)
optimizer = optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [None]:
feat_size = 32
seq_size = 8
neigh_size = 10
dk = 16
n_heads = 4
model = STTransformer(feat_size, seq_size, neigh_size, 
                      sp_dk=dk, sp_enc_heads=n_heads, sp_dec_heads=n_heads,
                      tm_dk=dk, tm_enc_heads=n_heads, tm_dec_heads=n_heads,
                      sp_num_encoders=3, sp_num_decoders=3, tm_num_encoders=3, tm_num_decoders=3)

In [None]:
loss_object = tf.keras.losses.MeanSquaredError()

In [None]:
def loss_function(real, pred, seq_mask_array, neighbors_mask):
  seq_mask_array = 1 - seq_mask_array
  neighbors_mask = 1 - neighbors_mask

  seq_mask_array = seq_mask_array[:, :, np.newaxis, np.newaxis]
  neighbors_mask = neighbors_mask[:, :, :, np.newaxis]

  pred_masked = pred * seq_mask_array
  pred_masked = pred_masked * neighbors_mask
  loss_ = loss_object(real, pred_masked)

  return loss_

In [None]:
#@tf.function
def train_step(zipped_input, losses):

  # all model inputs
  inputs = zipped_input[0]
  neigh_inp_masks = zipped_input[1]
  seq_inp_masks = zipped_input[2]

  # targets
  tar = zipped_input[3]
  neigh_tar_masks = zipped_input[4]
  seq_tar_masks = zipped_input[5]

  seq_out_masks = tf.squeeze(seq_tar_masks)
  neigh_out_masks = tf.squeeze(neigh_tar_masks)

  
  # get only x, y, and rotation
  targets = tar[:, :, :, :3]                                            

  with tf.GradientTape() as tape:
    predictions = model((inputs, neigh_inp_masks, seq_inp_masks, tar, neigh_tar_masks, seq_tar_masks), training=True)
    loss = loss_function(targets, predictions, seq_out_masks, neigh_tar_masks)

  #print(predictions)
  print('loss: ', loss)
  losses.append(loss)
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))
  return losses, loss

In [None]:
def eval_step(zipped_input):
  # all model inputs
  inputs = zipped_input[0]
  neigh_inp_masks = zipped_input[1]
  seq_inp_masks = zipped_input[2]

  # targets
  tar = zipped_input[3]
  neigh_tar_masks = zipped_input[4]
  seq_tar_masks = zipped_input[5]

  seq_out_masks = tf.squeeze(seq_tar_masks)
  neigh_out_masks = tf.squeeze(neigh_tar_masks)

  targets = tf.transpose(tar[:, :, :, :2], [0, 2, 1, 3])     
  preds = model((inputs, neigh_inp_masks, seq_inp_masks, tar, neigh_tar_masks, seq_tar_masks), training=False)

  neigh_out_masks = 1 - neigh_out_masks
  seq_out_masks = 1 - seq_out_masks
  seq_out_masks = seq_out_masks[:, :, np.newaxis, np.newaxis]

  # masking
  preds = preds * seq_out_masks
  preds = preds * neigh_out_masks[:, :, :, np.newaxis]
  preds = preds[:, :, :, :2]

  # sequence with feats dimension
  preds = tf.transpose(preds, [0, 2, 1, 3])

  # reshape to remove batch
  targets = tf.reshape(targets, (-1, 8, 2))
  preds = tf.reshape(preds, (-1, 8, 2))
  
  return ADE(targets.numpy(), preds.numpy())

In [None]:
model.summary()

In [None]:
final_file = '/content/drive/MyDrive/Colab_Notebooks/pesos_transformer'

final_checkpoint = tf.train.Checkpoint(model=model)

# Load Model If necessary

In [None]:
final_checkpoint.read(final_file).assert_consumed()

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f6a873cf2d0>

In [None]:
for epoch in range(10):
  print('epoch: ', epoch)
  losses = []
  for batch in dataset:
    losses, loss = train_step(batch, losses)
    if np.isnan(loss.numpy()):
      break;
  
  l_ade = []
  for batch in dataset:
    ade = eval_step(batch)
    l_ade.append(ade)

  print('ade: ', np.mean(np.array(l_ade)))  

  print("avg loss", tf.reduce_mean(losses)) 

epoch:  0
loss:  tf.Tensor(176.25484, shape=(), dtype=float32)
loss:  tf.Tensor(118.58099, shape=(), dtype=float32)
loss:  tf.Tensor(195.1319, shape=(), dtype=float32)
loss:  tf.Tensor(79.62045, shape=(), dtype=float32)
loss:  tf.Tensor(357.97433, shape=(), dtype=float32)
loss:  tf.Tensor(127.219955, shape=(), dtype=float32)
loss:  tf.Tensor(143.39894, shape=(), dtype=float32)
loss:  tf.Tensor(156.43394, shape=(), dtype=float32)
loss:  tf.Tensor(136.0303, shape=(), dtype=float32)
loss:  tf.Tensor(132.96176, shape=(), dtype=float32)
loss:  tf.Tensor(178.79048, shape=(), dtype=float32)
loss:  tf.Tensor(198.95853, shape=(), dtype=float32)
loss:  tf.Tensor(188.69234, shape=(), dtype=float32)
loss:  tf.Tensor(166.93082, shape=(), dtype=float32)
loss:  tf.Tensor(111.72363, shape=(), dtype=float32)
loss:  tf.Tensor(223.94678, shape=(), dtype=float32)
loss:  tf.Tensor(228.81241, shape=(), dtype=float32)
loss:  tf.Tensor(208.09143, shape=(), dtype=float32)
loss:  tf.Tensor(269.48404, shape=(), 

In [None]:
final_checkpoint.write(final_file)

'/content/drive/MyDrive/Colab_Notebooks/pesos_transformer'

In [None]:
ld = list(dataset)

In [None]:
inputs = ld[0][0]
mask = ld[0][1]
s_mask = ld[0][2]
tar = ld[0][3]
t_mask = ld[0][4]
st_mask = ld[0][5]
seq_mask = ld[0][6]
neig_mask = ld[0][7]

# divide input as the trajectory input, and target (basically past and future to predict) 
out = model((inputs, mask, s_mask, tar, t_mask, st_mask), training=False)

In [None]:
def ADE(real, pred):
    diff_sq = (real - pred)**2
    diff_sq = np.sum(diff_sq, axis=2)
    diff_sq = np.sqrt(diff_sq)
    mean_diff = np.mean(diff_sq)
    return mean_diff

In [None]:
tar[0, :, :, :3].shape

TensorShape([8, 10, 3])

In [None]:
neigh_mask = 1-neig_mask
pred = out[0] * neigh_mask[0][:, :, np.newaxis]

In [None]:
tar_t = tf.transpose(tar[0,:,:, :3], [1, 0, 2])[:, :, :2]
pred_t = tf.transpose(pred, [1,0,2])[:, :, :2]

In [None]:
ADE(tar_t.numpy(), pred_t.numpy())

6.1653643

# Testing functions

In [None]:
for i in range(len(cubes)):
  for j in range(len(cubes[i][0])):
    for k in range(len(cubes[i][0][j])):
      for l in range(len(cubes[i][0][j][k])):
        if np.isnan(cubes[i][0][j][k][l]):
          cubes[i][0][j][k][l] = 0.0      


In [None]:
all_inps = [x[3] for x in cubes]
for inp in all_inps:
  for face in inp:
    for row in face:
      for el in row:
        if np.isnan(el):
          print('WHAAAAT')

In [None]:
np.arange(10)[:, np.newaxis]

array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]])

In [None]:
t = tf.constant(np.arange(3 * 4 * 3 * 5 * 5)) + 1    
t = tf.reshape(t, (3, 4, 3, 5, 5))             #(batch, head, seq, N, N)
t = tf.cast(t, tf.float32)
t2 = np.random.choice([0, 1], (3, 3, 5)) * 0.5

In [None]:
t2 = tf.reshape(t2, (3, 1, 3, 1, 5))          # (batch, 1, seq, 1, N)
t2 = tf.cast(t2, tf.float32)

In [None]:
t2

<tf.Tensor: shape=(3, 1, 3, 1, 5), dtype=float32, numpy=
array([[[[[0. , 0. , 0. , 0.5, 0. ]],

         [[0. , 0.5, 0.5, 0.5, 0.5]],

         [[0. , 0.5, 0. , 0.5, 0. ]]]],



       [[[[0.5, 0. , 0.5, 0. , 0. ]],

         [[0.5, 0. , 0. , 0. , 0.5]],

         [[0. , 0. , 0.5, 0.5, 0.5]]]],



       [[[[0. , 0. , 0.5, 0. , 0. ]],

         [[0. , 0. , 0. , 0.5, 0. ]],

         [[0.5, 0.5, 0.5, 0. , 0. ]]]]], dtype=float32)>

In [None]:
t + t2

<tf.Tensor: shape=(3, 4, 3, 5, 5), dtype=float32, numpy=
array([[[[[  1. ,   2. ,   3. ,   4.5,   5. ],
          [  6. ,   7. ,   8. ,   9.5,  10. ],
          [ 11. ,  12. ,  13. ,  14.5,  15. ],
          [ 16. ,  17. ,  18. ,  19.5,  20. ],
          [ 21. ,  22. ,  23. ,  24.5,  25. ]],

         [[ 26. ,  27.5,  28.5,  29.5,  30.5],
          [ 31. ,  32.5,  33.5,  34.5,  35.5],
          [ 36. ,  37.5,  38.5,  39.5,  40.5],
          [ 41. ,  42.5,  43.5,  44.5,  45.5],
          [ 46. ,  47.5,  48.5,  49.5,  50.5]],

         [[ 51. ,  52.5,  53. ,  54.5,  55. ],
          [ 56. ,  57.5,  58. ,  59.5,  60. ],
          [ 61. ,  62.5,  63. ,  64.5,  65. ],
          [ 66. ,  67.5,  68. ,  69.5,  70. ],
          [ 71. ,  72.5,  73. ,  74.5,  75. ]]],


        [[[ 76. ,  77. ,  78. ,  79.5,  80. ],
          [ 81. ,  82. ,  83. ,  84.5,  85. ],
          [ 86. ,  87. ,  88. ,  89.5,  90. ],
          [ 91. ,  92. ,  93. ,  94.5,  95. ],
          [ 96. ,  97. ,  98. ,  99.5, 100

In [None]:
mask = np.random.choice([0, 1], size=(3, 5))

In [None]:
mask

array([[0, 1, 0, 0, 1],
       [1, 1, 0, 0, 1],
       [0, 1, 0, 1, 0]])

In [None]:
adapt_spatial_mask(mask).shape

(1, 3, 1, 5)

In [None]:
inp = np.arange(4 * 5 * 2 * 3) + 1
inp = np.reshape(inp, (4, 5, 2, 3))

In [None]:
inp

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16]])

In [None]:
mask = 1 - np.array([[0, 0, 0, 1, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 0], [0, 0, 0, 0, 1]])
mask = mask[:, :, np.newaxis, np.newaxis]
inp * mask

array([[[[  1,   2,   3],
         [  4,   5,   6]],

        [[  7,   8,   9],
         [ 10,  11,  12]],

        [[ 13,  14,  15],
         [ 16,  17,  18]],

        [[  0,   0,   0],
         [  0,   0,   0]],

        [[  0,   0,   0],
         [  0,   0,   0]]],


       [[[ 31,  32,  33],
         [ 34,  35,  36]],

        [[ 37,  38,  39],
         [ 40,  41,  42]],

        [[ 43,  44,  45],
         [ 46,  47,  48]],

        [[ 49,  50,  51],
         [ 52,  53,  54]],

        [[  0,   0,   0],
         [  0,   0,   0]]],


       [[[ 61,  62,  63],
         [ 64,  65,  66]],

        [[ 67,  68,  69],
         [ 70,  71,  72]],

        [[ 73,  74,  75],
         [ 76,  77,  78]],

        [[ 79,  80,  81],
         [ 82,  83,  84]],

        [[ 85,  86,  87],
         [ 88,  89,  90]]],


       [[[ 91,  92,  93],
         [ 94,  95,  96]],

        [[ 97,  98,  99],
         [100, 101, 102]],

        [[103, 104, 105],
         [106, 107, 108]],

        [[109, 110, 11

In [None]:
inp = np.array([[[1.0, 2.0, 3.0], [1.0, 1.0, 2.0]], [[1.0, 2.0, 3.0], [1.0, 1.0, 2.0]]])


tar = np.array([[[2.0, 4.0, 2.0], [1.0, 3.0, 2.0]], [[2.0, 4.0, 2.0], [1.0, 3.0, 2.0]]])

loss_object(inp,tar)

<tf.Tensor: shape=(), dtype=float64, numpy=1.6666667461395264>

In [None]:
np.sum((inp - tar)**2)/12

1.6666666666666667

In [None]:
tar

array([[2., 4., 2.],
       [1., 3., 2.]])

In [None]:
5/3.0 + 4.0/3

3.0