# Chapter 12: Custom Models and Training with TensorFlow

In [1]:
import tensorflow as tf
import numpy as np

2026-01-17 14:25:29.330526: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768659929.555866      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768659929.620269      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768659930.116014      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768659930.116066      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768659930.116101      55 computation_placer.cc:177] computation placer alr

## Using TensorFlow like NumPy
### Tensors and Operations

In [2]:
t = tf.constant([[1, 2, 3], [5, 6, 6]], dtype= 'float32')

2026-01-17 14:25:46.608296: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [3]:
t

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[1., 2., 3.],
       [5., 6., 6.]], dtype=float32)>

In [4]:
t.shape

TensorShape([2, 3])

In [5]:
t.dtype

tf.float32

In [6]:
t[:, :1]

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[1.],
       [5.]], dtype=float32)>

In [7]:
t[..., 1]

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2., 6.], dtype=float32)>

In [8]:
t[..., 1, tf.newaxis]

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[2.],
       [6.]], dtype=float32)>

In [9]:
t + 10

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[11., 12., 13.],
       [15., 16., 16.]], dtype=float32)>

In [10]:
tf.square(t)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[ 1.,  4.,  9.],
       [25., 36., 36.]], dtype=float32)>

### Tensors and NumPy

In [11]:
a = np.array([2, 4, 5])

In [12]:
a

array([2, 4, 5])

In [13]:
t1 = tf.constant(a)

In [14]:
t1

<tf.Tensor: shape=(3,), dtype=int64, numpy=array([2, 4, 5])>

In [15]:
t1.numpy()

array([2, 4, 5])

### Type Conversions

In [16]:
# tf.constant(2.) + tf.constant(12)
# InvalidArgumentError: cannot compute AddV2 as input #1(zero-based) was expected to 
# be a float tensor but is a int32 tensor [Op:AddV2] name: 

In [17]:
t2 = tf.constant(40., dtype= tf.float64)
tf.constant(0.2) + tf.cast(t2, tf.float32)

<tf.Tensor: shape=(), dtype=float32, numpy=40.20000076293945>

### Variables and Other Data Structures

In [18]:
v = tf.Variable([[1, 2, 3], [8, 6, 9]], dtype= tf.float32)

In [19]:
v

<tf.Variable 'Variable:0' shape=(2, 3) dtype=float32, numpy=
array([[1., 2., 3.],
       [8., 6., 9.]], dtype=float32)>

In [20]:
v.assign(2 * v)

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[ 2.,  4.,  6.],
       [16., 12., 18.]], dtype=float32)>

In [21]:
v

<tf.Variable 'Variable:0' shape=(2, 3) dtype=float32, numpy=
array([[ 2.,  4.,  6.],
       [16., 12., 18.]], dtype=float32)>

In [22]:
v[0, 1].assign(42)

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[ 2., 42.,  6.],
       [16., 12., 18.]], dtype=float32)>

## Customizing Models and Training Algorithms
### Custom Loss Function
Huber Loss:
$$
L_{\delta}(y, \hat{y}) =
\begin{cases}
\frac{1}{2}(y - \hat{y})^{2} & \text{if } |y - \hat{y}| \leq \delta \\
\delta |y - \hat{y}| - \frac{1}{2}\delta^{2} & \text{otherwise}
\end{cases}
$$

In [23]:
def create_huber(delta=1.0):
    """Build a Huber loss function with a configurable threshold.

    Args:
        delta: Absolute-error threshold at which the loss switches from
            the quadratic branch to the linear branch.

    Returns:
        A function ``huber_loss(y_true, y_pred)`` that computes the
        element-wise Huber loss for the given ``delta``.
    """
    def huber_loss(y_true, y_pred):
        residual = y_true - y_pred
        magnitude = tf.abs(residual)
        is_small = magnitude <= delta
        quadratic = 0.5 * tf.square(residual)
        linear = delta * magnitude - 0.5 * delta ** 2
        # Quadratic where the error is within delta, linear elsewhere.
        return tf.where(is_small, quadratic, linear)

    return huber_loss

## This can be used simply like
# model.compile(loss= create_huber(2.0), ...)

## For loading model with custom objects
# model = tf.keras.models.load_model(..., custom_objects= {'huber_loss': create_huber(2.0)})

In [24]:
class HuberLoss(tf.keras.losses.Loss):
    """Huber loss as a Keras ``Loss`` subclass with a configurable threshold.

    The threshold is included in ``get_config()`` alongside the base
    class configuration.
    """

    def __init__(self, threshold=1.0, **kwargs):
        # Any remaining keyword arguments are forwarded to the base class.
        super().__init__(**kwargs)
        self.threshold = threshold

    def call(self, y_true, y_pred):
        """Return the element-wise Huber loss between targets and predictions."""
        limit = self.threshold
        residual = y_true - y_pred
        magnitude = tf.abs(residual)
        quadratic = tf.square(residual) / 2
        linear = limit * magnitude - (limit ** 2) / 2
        # Quadratic where the error is within the threshold, linear elsewhere.
        return tf.where(magnitude <= limit, quadratic, linear)

    def get_config(self):
        """Return the base config extended with the ``threshold`` entry."""
        config = dict(super().get_config())
        config['threshold'] = self.threshold
        return config

## usage
# model.compile(loss= HuberLoss(2.), ...)
# model = tf.keras.models.load_model(..., custom_objects= {'HuberLoss': HuberLoss}) # loading rebuilds the loss from its saved get_config() output, so the threshold is restored automatically

### Custom Activation, Initializer, Regularizer, and Constraint

In [25]:
def my_softplus(x):
    """Softplus activation, computed as ``log(1 + exp(x))``."""
    exp_x = tf.exp(x)
    return tf.math.log(1.0 + exp_x)

def my_glorot_initializer(shape, dtype= tf.float32):
    """Glorot (Xavier) normal initializer.

    Args:
        shape: Shape of the tensor to create; ``shape[0]`` and ``shape[1]``
            are treated as fan-in and fan-out.
        dtype: dtype of the returned tensor.

    Returns:
        A tensor of the requested shape sampled from a normal distribution
        with standard deviation ``sqrt(2 / (fan_in + fan_out))``.
    """
    fan_in, fan_out = shape[0], shape[1]
    # The sum must be parenthesized: 2 / (fan_in + fan_out),
    # not (2 / fan_in) + fan_out.
    stddev = tf.sqrt(2. / (fan_in + fan_out))
    return tf.random.normal(shape, stddev= stddev, dtype= dtype)

def my_l1_regularizer(weights):
    """L1 penalty: the sum of the absolute values of ``0.01 * weights``."""
    scaled = 0.01 * weights
    return tf.reduce_sum(tf.abs(scaled))

def my_positive_weights(weights):
    """Constraint that replaces every negative weight with zero."""
    is_negative = weights < 0.
    zeros = tf.zeros_like(weights)
    return tf.where(is_negative, zeros, weights)

## We can also create classes of all these, just like we did for the loss function.

In [26]:
# Single-unit Dense layer wired up with the custom activation, initializer,
# regularizer and constraint functions defined in this notebook.
layer = tf.keras.layers.Dense(
    units=1,
    activation=my_softplus,
    kernel_initializer=my_glorot_initializer,
    kernel_regularizer=my_l1_regularizer,
    kernel_constraint=my_positive_weights,
)

Using the same subclassing approach, we can also create custom metrics and custom layers. For layers that have no weights, we can simply use a `Lambda` layer.

## Custom Model (basically using the subclassing API)

In [27]:
class ResidualBlock(tf.keras.layers.Layer):
    """Stack of Dense layers whose output is added back to the block's input.

    Args:
        n_layers: Number of Dense layers in the stack.
        n_neurons: Number of units in each Dense layer.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, n_layers, n_neurons, **kwargs):
        super().__init__(**kwargs)
        dense_stack = []
        for _ in range(n_layers):
            dense_stack.append(
                tf.keras.layers.Dense(
                    n_neurons,
                    activation='relu',
                    kernel_initializer='he_normal',
                )
            )
        self.hidden = dense_stack

    def call(self, inputs):
        """Run the inputs through every hidden layer, then add the skip connection."""
        outputs = inputs
        for dense in self.hidden:
            outputs = dense(outputs)
        # Skip connection: the original inputs are added to the stack's output.
        return inputs + outputs

In [28]:
class ResidualRegressor(tf.keras.Model):
    """Model built from a Dense layer, two residual blocks and a Dense output.

    Args:
        output_dim: Number of units in the final Dense layer.
        **kwargs: Forwarded to the base ``Model`` constructor.
    """

    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)
        dense = tf.keras.layers.Dense
        self.hidden1 = dense(30, activation='relu', kernel_initializer='he_normal')
        self.block1 = ResidualBlock(2, 30)
        self.block2 = ResidualBlock(2, 30)
        self.out = dense(output_dim)

    def call(self, inputs):
        """Forward pass: hidden1 -> block1 (applied 4 times) -> block2 -> out."""
        features = self.hidden1(inputs)

        # The same block1 instance is applied 1 + 3 = 4 times in a row.
        repeats = 1 + 3
        for _ in range(repeats):
            features = self.block1(features)

        features = self.block2(features)
        return self.out(features)

## Computing Gradients using AutoDiff
Let $f(w_1, w_2) = 3w_1^2 + 2w_1 w_2$ be a simple function standing in for a very small neural network; of course, a real neural network has many more parameters.

In [29]:
def f(w1, w2):
    """Evaluate ``3 * w1**2 + 2 * w1 * w2``."""
    quadratic_term = 3 * w1 ** 2
    cross_term = 2 * w1 * w2
    return quadratic_term + cross_term

For this specific function, the gradient can be derived analytically as follows:

In [30]:
def grad_f(w1, w2):
    """Return ``[6 * w1 + 2 * w2, 2 * w1]``.

    These are the partial derivatives of ``3 * w1**2 + 2 * w1 * w2`` with
    respect to ``w1`` and ``w2``, in that order.
    """
    return [6 * w1 + 2 * w2, 2 * w1]

In [31]:
w1, w2 = 5, 3

In [32]:
grad_f(w1, w2)

[36, 10]

But deriving the gradient by hand like this is impractical for large neural networks. One alternative is a simple numerical approximation: we estimate each partial derivative by measuring how much the function's output changes when we tweak the corresponding parameter by a tiny amount.

In [33]:
eps = 1e-6

In [34]:
(f(w1 + eps, w2) - f(w1, w2)) / eps

36.000003007075065

In [35]:
(f(w1, w2 + eps) - f(w1, w2)) / eps

10.000000003174137

Having to call f() at least once per parameter makes this approach intractable for large neural networks. So instead we use reverse-mode autodiff.

In [36]:
w1 = tf.Variable(5.)
w2 = tf.Variable(3.)

# Evaluate f inside the tape context so the computation is recorded.
with tf.GradientTape() as tape:
    z = f(w1, w2)

# Gradients of z with respect to each variable, in the order [w1, w2].
gradients = tape.gradient(z, [w1, w2])

In [37]:
gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

In [38]:
# tape.gradient(z, [w1, w2])
# cannot call this twice, RuntimeError: A non-persistent GradientTape can only be used to compute one set of gradients (or jacobians)

In [39]:
# With persistent=True the tape can be asked for gradients more than once
# without raising an error.
with tf.GradientTape(persistent=True) as tape:
    z = f(w1, w2)

tape.gradient(z, [w1, w2])
tape.gradient(z, w1)
tape.gradient(z, w2)

# Drop the reference to the persistent tape once we are done with it.
del tape

In [41]:
# numerically stable gradients
@tf.custom_gradient
def my_softplus(z):
    """Softplus with a hand-written, numerically stable gradient.

    Args:
        z: Input tensor.

    Returns:
        A pair ``(result, grad_fn)``: the softplus of ``z`` computed as
        ``log(1 + exp(-|z|)) + max(0, z)``, and the function used to
        backpropagate gradients through it.
    """
    def my_softplus_gradients(grads):           # grads = backproped from upper layer
        # d/dz softplus(z) = 1 - 1 / (1 + exp(z)). The denominator must be
        # parenthesized; without it the expression reduces to exp(z),
        # which is wrong and overflows for large z.
        return grads * (1 - 1 / (1 + tf.exp(z)))

    result = tf.math.log(1 + tf.exp(-tf.abs(z))) + tf.maximum(0., z)
    return result, my_softplus_gradients

## TensorFlow Functions and Graphs

In [42]:
def cube(x):
    """Return ``x`` raised to the third power."""
    cubed = x ** 3
    return cubed

In [43]:
cube(3)

27

In [44]:
cube(tf.constant(3))

<tf.Tensor: shape=(), dtype=int32, numpy=27>

In [45]:
tf_cube = tf.function(cube)
tf_cube

<tensorflow.python.eager.polymorphic_function.polymorphic_function.Function at 0x7c46a90731a0>

In [46]:
@tf.function
def cube(x):
    """Return ``x`` raised to the third power (wrapped by ``tf.function``)."""
    cubed = x ** 3
    return cubed

In [47]:
cube(3)

<tf.Tensor: shape=(), dtype=int32, numpy=27>

In [48]:
cube(tf.constant(3))

<tf.Tensor: shape=(), dtype=int32, numpy=27>

In [49]:
cube.python_function(3)

27