In [1]:
import os
# Blank out CUDA_VISIBLE_DEVICES for this process. This is done before
# tensorflow is imported in the next cell (presumably to force CPU-only
# execution -- confirm against your setup).
os.environ.update(CUDA_VISIBLE_DEVICES="")

In [3]:
import tensorflow as tf
import numpy as np

In [7]:
# this just shows how to do the positional encoding
def positional_encoding(input_t, n_freqs):
    """Encode time values with sines and cosines at power-of-two frequencies.

    Args:
        input_t: time values, documented by the original notes as b x 1.
        n_freqs: scalar number of frequencies; frequency k is 2**k for
            k = 0, ..., n_freqs - 1.

    Returns:
        Tensor of shape b x (2*n_freqs): the n_freqs sine features first,
        followed by the n_freqs cosine features.
    """
    # 2*pi * 2**k for every k -> n_freqs vector
    angular_freqs = 2*np.pi*tf.pow(2., tf.range(n_freqs, dtype=tf.float32))
    # broadcasting the frequency vector against b x 1 gives b x n_freqs
    phases = angular_freqs*input_t
    features = [tf.math.sin(phases), tf.math.cos(phases)]
    return tf.concat(features, axis=-1)  # b x 2*n_freqs


# usage example for positional_encoding

t_max = 1000
batch_size = 32
# in reality, times needs to be a batch_size vector of sampled time steps,
# just like we sample a sigma_i in score-based models.
# The upper bound is t_max (rather than a second hard-coded 1000) so that the
# sampled steps and the normalization below cannot drift apart.
times = np.random.randint(0, t_max, size=(batch_size, 1))
# randint excludes the upper bound, so the normalized values lie in [0, 1).
times_normalized = times.astype(np.float32) / t_max

# how many frequencies to choose?
# generally, we have a sampling rate of t_max for the normalized t.
# the highest frequency that can be expressed (Nyquist frequency) is just below t_max/2.
# e.g. for t_max = 1000, frequencies of 500 Hz or higher are lost/aliased.
# -> stay below 500. 2**9 is 512, so rather use 2**8 = 256 as highest frequency.
# i.e. 2**0, 2**1, 2**2, ..., 2**8.
n_freqs = 9
times_encoded = positional_encoding(times_normalized, n_freqs)

times_encoded.shape  # batch_size x (n_freqs * 2) for sine and cosine waves of each frequency.

TensorShape([32, 18])

In [None]:
# Template for a model that receives both an image and a time step.
# `Network`, `input_one` and `input_two` are placeholders to be supplied by you.

# use whatever input shape is appropriate for your data
inp = tf.keras.Input((64, 64, 3))  # this is the image input

t_input = tf.keras.Input((1,))
# None, None adds axes for width and height.
t_encoded = positional_encoding(t_input, n_freqs=9)[:, None, None, :]
# repeat t over the whole image; the spatial size is read from `inp` so that
# changing the input shape above does not leave a stale 64 x 64 here.
t_broadcast = tf.tile(t_encoded, [1, inp.shape[1], inp.shape[2], 1])

# `tfkl` is not defined in the visible cells, so the layer is referenced
# through the `tf` import instead.
combined = tf.keras.layers.Concatenate()([inp, t_broadcast])  # put this into your network

output = Network(combined) # output of your network, same shape as inp


model = tf.keras.Model([inp, t_input], output)  # model that takes two inputs

# keep in mind that you need to call multi-input models like this
model([input_one, input_two])
# NOT like this (left as a comment so the cell does not execute the wrong call):
# model(input_one, input_two)

# in principle you could also handle all the t stuff outside the actual model
# and just give the already concatenated tensor as a single input to the model.


# Finally: The papers recommend inserting t not only in the input layer, but also
# giving it to hidden layers directly. To do that, you will need to create t_broadcast
# for each of those layers with the appropriate size, tiling over the width and height
# of the respective feature maps.

In [None]:
# forward process
# schedule constants below are the ones given in the paper.
tmax = 1000
# tmax evenly spaced beta values from 1e-4 up to 2e-2, stored as float32
betas = np.linspace(1e-4, 2e-2, num=tmax, dtype=np.float32)
alphas = 1.0 - betas
# running product: alphas_bar[i] = alphas[0] * ... * alphas[i]
alphas_bar = np.cumprod(alphas, axis=0)


# running the forward process, for example, is then
# (`data` is a placeholder for your own array of clean samples)
t = 200 # whatever time step; used directly as an index into alphas_bar
noise_scale = np.sqrt(1 - alphas_bar[t])
# The noise term uses `noise_scale` from the line above; the name `scale`
# used here before is not defined anywhere in the visible cells.
noisy_data = np.sqrt(alphas_bar[t]) * data + noise_scale*np.random.normal(size=data.shape)

# all required formulas can be found in the paper!!