##### Copyright 2019 The TensorFlow Authors.

In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Writing custom layers and models with Keras

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/guide/keras/custom_layers_and_models"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/guide/keras/custom_layers_and_models.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/guide/keras/custom_layers_and_models.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/docs/site/en/guide/keras/custom_layers_and_models.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

### Setup

In [3]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

tf.keras.backend.clear_session()  # For easy reset of notebook state.

## The Layer class



### Layers encapsulate a state (weights) and some computation

The main data structure you'll work with is the `Layer`.
A layer encapsulates both a state (the layer's "weights")
and a transformation from inputs to outputs (a "call", the layer's
forward pass).

Here's a densely-connected layer. It has a state: the variables `w` and `b`.


In [4]:
from tensorflow.keras import layers


class Linear(layers.Layer):

    def __init__(self, units=32, input_dim=32):
        super(Linear, self).__init__()
        w_init = tf.random_normal_initializer()
        self.w = tf.Variable(initial_value=w_init(shape=(input_dim, units),
                                                  dtype='float32'),
                             trainable=True)
        b_init = tf.zeros_initializer()
        self.b = tf.Variable(initial_value=b_init(shape=(units,),
                                                  dtype='float32'),
                             trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

x = tf.ones((2, 2))
linear_layer = Linear(4, 2)
y = linear_layer(x)
print(y)

tf.Tensor(
[[-0.02889749 -0.10398108 -0.00575986  0.01895376]
 [-0.02889749 -0.10398108 -0.00575986  0.01895376]], shape=(2, 4), dtype=float32)


In [8]:
linear_layer.weights

[<tf.Variable 'Variable:0' shape=(2, 4) dtype=float32, numpy=
 array([[ 0.09302854,  0.007962  ,  0.01599212,  0.05290163],
        [-0.05164344,  0.03467279, -0.02845521,  0.02873094]],
       dtype=float32)>,
 <tf.Variable 'Variable:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>]

In [9]:
[linear_layer.w, linear_layer.b]

[<tf.Variable 'Variable:0' shape=(2, 4) dtype=float32, numpy=
 array([[ 0.09302854,  0.007962  ,  0.01599212,  0.05290163],
        [-0.05164344,  0.03467279, -0.02845521,  0.02873094]],
       dtype=float32)>,
 <tf.Variable 'Variable:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>]

In [19]:
import inspect
import pprint

In [8]:
linear_layer.weights

[<tf.Variable 'Variable:0' shape=(2, 4) dtype=float32, numpy=
 array([[ 0.02054681, -0.03626327, -0.0118773 , -0.02946744],
        [-0.0011623 , -0.04729664,  0.06191743, -0.0300195 ]],
       dtype=float32)>,
 <tf.Variable 'Variable:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>]

Note that the weights `w` and `b` are automatically tracked by the layer upon
being set as layer attributes:

In [6]:
assert linear_layer.weights == [linear_layer.w, linear_layer.b]

Note you also have access to a quicker shortcut for adding weight to a layer: the `add_weight` method:



In [3]:
class Linear(layers.Layer):

    def __init__(self, units=32, input_dim=32):
        super(Linear, self).__init__()
        self.w = self.add_weight(shape=(input_dim, units),
                                 initializer='random_normal',
                                 trainable=True)
        self.b = self.add_weight(shape=(units,),
                                 initializer='zeros',
                                 trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

x = tf.ones((2, 2))
linear_layer = Linear(4, 2)
y = linear_layer(x)
print(y)

tf.Tensor(
[[-0.04952537  0.10652478  0.01390582 -0.07114404]
 [-0.04952537  0.10652478  0.01390582 -0.07114404]], shape=(2, 4), dtype=float32)


In [30]:
linear_layer.weights

[<tf.Variable 'Variable:0' shape=(2, 4) dtype=float32, numpy=
 array([[-0.03324789, -0.04835039,  0.01064808,  0.00119833],
        [-0.01639398, -0.03614883, -0.00842738,  0.04886182]],
       dtype=float32)>,
 <tf.Variable 'Variable:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>]

#### Layers can have non-trainable weights

Besides trainable weights, you can add non-trainable weights to a layer as well.
Such weights are meant not to be taken into account during backpropagation,
when you are training the layer.

Here's how to add and use a non-trainable weight:

In [39]:
class ComputeSum(layers.Layer):

    def __init__(self, input_dim):
        super(ComputeSum, self).__init__()
        self.total = tf.Variable(initial_value=tf.zeros((input_dim,)),
                                 trainable=False)

    def call(self, inputs):
        self.total.assign_add(tf.reduce_sum(inputs, axis=0))
        return self.total

x = tf.ones((2, 2))
my_sum = ComputeSum(2)
y = my_sum(x)
print(y.numpy())
y = my_sum(x)
print(y.numpy())

[2. 2.]
[4. 4.]


In [12]:
class ComputeSum(layers.Layer):

    def __init__(self, input_dim):
        super(ComputeSum, self).__init__()
        self.total = tf.Variable(initial_value=tf.zeros((input_dim,)),
                                trainable = False)

    def call(self, inputs):
        self.total.assign_add(tf.reduce_sum(inputs, axis=0))
        return self.total

x = tf.ones((4, 2))
my_sum = ComputeSum(2)
y = my_sum(x)
print(y.numpy())
y = my_sum(x)
print(y.numpy())

[4. 4.]
[8. 8.]


It's part of `layer.weights`, but it gets categorized as a non-trainable weight:

In [13]:
print('weights:', len(my_sum.weights))
print('non-trainable weights:', len(my_sum.non_trainable_weights))

# It's not included in the trainable weights:
print('trainable_weights:', my_sum.trainable_weights)

weights: 1
non-trainable weights: 1
trainable_weights: []


### Best practice: deferring weight creation until the shape of the inputs is known

In the logistic regression example above, our `Linear` layer took an `input_dim` argument
that was used to compute the shape of the weights `w` and `b` in `__init__`:

In [14]:
class Linear(layers.Layer):

    def __init__(self, units=32, input_dim=32):
        super(Linear, self).__init__()
        self.w = self.add_weight(shape=(input_dim, units),
                                initializer='random_normal',
                                trainable=True)
        self.b = self.add_weight(shape=(units,),
                                initializer='zeros',
                                trainable=True)

In many cases, you may not know in advance the size of your inputs, and you would
like to lazily create weights when that value becomes known,
some time after instantiating the layer.

In the Keras API, we recommend creating layer weights in the `build(inputs_shape)` method of your layer.
Like this:

In [7]:
class Linear(layers.Layer):

    def __init__(self, units=32):
        super(Linear, self).__init__()
        self.units = units

    def build(self, input_shape):
        self.w = self.add_weight(shape=(input_shape[-1], self.units),
                                 initializer='random_normal',
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer='random_normal',
                                 trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

In [43]:
class Linear_a(layers.Layer):

    def __init__(self, units=32):
        super(Linear_a, self).__init__()
        self.units = units
        print("init")

    def build(self, a):
        self.w = self.add_weight(shape=(a[-1], self.units),
                                 initializer='random_normal',
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer='random_normal',
                                 trainable=True)
        print("build")

    def call(self, abc):
        print("call")
        return tf.matmul(abc, self.w) + self.b

In [44]:
linear_layer_a = Linear_a(32)
linear_layer_a(x)

init
build
call


<tf.Tensor: id=475, shape=(2, 32), dtype=float32, numpy=
array([[-0.05962064,  0.05254285,  0.04065034, -0.02519245, -0.00860368,
         0.00766941, -0.0459495 , -0.01660534, -0.16014819, -0.16301665,
        -0.034472  ,  0.18118721, -0.0761043 , -0.09365515,  0.04958824,
         0.01287301,  0.04132933, -0.06629962, -0.02524884,  0.176397  ,
         0.14697832,  0.10170928,  0.0081564 ,  0.00733461,  0.06406184,
         0.00845177,  0.08103774, -0.04317323,  0.04183863,  0.0579142 ,
        -0.04370681, -0.166964  ],
       [-0.05962064,  0.05254285,  0.04065034, -0.02519245, -0.00860368,
         0.00766941, -0.0459495 , -0.01660534, -0.16014819, -0.16301665,
        -0.034472  ,  0.18118721, -0.0761043 , -0.09365515,  0.04958824,
         0.01287301,  0.04132933, -0.06629962, -0.02524884,  0.176397  ,
         0.14697832,  0.10170928,  0.0081564 ,  0.00733461,  0.06406184,
         0.00845177,  0.08103774, -0.04317323,  0.04183863,  0.0579142 ,
        -0.04370681, -0.166964  

In [45]:
linear_layer_a.w

<tf.Variable 'linear_a_6/Variable:0' shape=(2, 32) dtype=float32, numpy=
array([[-0.10247622,  0.05028255, -0.01717131, -0.00418463,  0.03603247,
         0.02031094,  0.075741  , -0.02484997,  0.01153527, -0.03117465,
        -0.0557558 ,  0.06679498,  0.01571436, -0.03417717, -0.04535168,
         0.05258175,  0.02755596,  0.06353343,  0.04188916, -0.00650458,
         0.06517434,  0.01466364, -0.00290745, -0.04499962, -0.0052455 ,
        -0.0838006 ,  0.05636636, -0.00766802, -0.00542857, -0.00263645,
         0.00603967, -0.03574076],
       [ 0.02222881,  0.0104343 ,  0.09268503,  0.01864413,  0.02977577,
         0.00432332, -0.0498291 ,  0.03441909, -0.08387384, -0.04579395,
         0.00448197,  0.04648512, -0.02968965, -0.05317941,  0.12420692,
         0.0055695 , -0.02151216, -0.01883881, -0.06344761,  0.04110399,
         0.01639282,  0.02360121,  0.02974799,  0.05391997,  0.06019765,
         0.0913583 , -0.06671597,  0.00603999,  0.07442065,  0.00870804,
        -0.05419

In [46]:
linear_layer_a.trainable_weights

[<tf.Variable 'linear_a_6/Variable:0' shape=(2, 32) dtype=float32, numpy=
 array([[-0.10247622,  0.05028255, -0.01717131, -0.00418463,  0.03603247,
          0.02031094,  0.075741  , -0.02484997,  0.01153527, -0.03117465,
         -0.0557558 ,  0.06679498,  0.01571436, -0.03417717, -0.04535168,
          0.05258175,  0.02755596,  0.06353343,  0.04188916, -0.00650458,
          0.06517434,  0.01466364, -0.00290745, -0.04499962, -0.0052455 ,
         -0.0838006 ,  0.05636636, -0.00766802, -0.00542857, -0.00263645,
          0.00603967, -0.03574076],
        [ 0.02222881,  0.0104343 ,  0.09268503,  0.01864413,  0.02977577,
          0.00432332, -0.0498291 ,  0.03441909, -0.08387384, -0.04579395,
          0.00448197,  0.04648512, -0.02968965, -0.05317941,  0.12420692,
          0.0055695 , -0.02151216, -0.01883881, -0.06344761,  0.04110399,
          0.01639282,  0.02360121,  0.02974799,  0.05391997,  0.06019765,
          0.0913583 , -0.06671597,  0.00603999,  0.07442065,  0.00870804,
  

In [55]:
class Linear3(layers.Layer):

    def __init__(self, units=32):
        super(Linear3, self).__init__()
        self.units = units

    def build(self):
        self.w = self.add_weight(shape=(input_shape[-1], self.units),
                                 initializer='random_normal',
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer='random_normal',
                                 trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

In [8]:
import random

In [65]:
class Linear5(layers.Layer):

    def __init__(self, units=32):
        super(Linear5, self).__init__()
        self.units = units
        print("init")

    def build(self, input_shape):
        print('build')
    def call(self, inputs):
        print("call")
        return tf.matmul(inputs, self.weights[0][0]) + self.b

In [66]:

linear_layer5 = Linear5(32)

linear_layer5(x)

init
build
call


IndexError: list index out of range

In [67]:
linear_layer5.weights

[]

In [57]:
linear_layer5.weights

[<tf.Variable 'linear5_40/Variable:0' shape=(2, 32) dtype=float32, numpy=
 array([[-0.07900801, -0.01455089, -0.04886997,  0.00849197,  0.04231484,
          0.00241421, -0.06481494, -0.00893692, -0.01957316, -0.00694142,
          0.04600096,  0.01236336,  0.08572765, -0.0362857 ,  0.00192356,
         -0.04226952, -0.05431823, -0.0406916 ,  0.01047941,  0.05529768,
          0.05197164, -0.10651936, -0.05965484, -0.06841718, -0.03939276,
          0.04163815,  0.01368979, -0.06939536, -0.01541925, -0.05483044,
          0.04326602, -0.01088315],
        [-0.06114447,  0.01810847,  0.05237279, -0.0096415 ,  0.04506547,
          0.05059414,  0.05961224,  0.01779534, -0.06088741, -0.03586103,
          0.0339044 ,  0.06761163, -0.01116273, -0.05239036,  0.02479657,
          0.0050546 , -0.02378914, -0.07779925, -0.02153455,  0.06172351,
         -0.15516023,  0.01414393,  0.12694876, -0.03168827,  0.0452406 ,
          0.00567624, -0.03010116,  0.04501907,  0.00605122,  0.00128576,
  

In [14]:
linear_layer5.calling(x)

AttributeError: 'Linear5' object has no attribute 'calling'

In [63]:
class Linear4(layers.Layer):

    def __init__(self, units=32):
        super(Linear4, self).__init__()
        self.units = units

    def build(self, a):
        self.w = self.add_weight(shape=(a[-1], self.units),
                                 initializer='random_normal',
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer='random_normal',
                                 trainable=True)

    #def call(self, inputs):
    #    return tf.matmul(inputs, self.w) + self.b

In [33]:
linear_layer_a = Linear_a(32)

init


In [34]:
linear_layer_a(x)

build
call


<tf.Tensor: id=375, shape=(4, 32), dtype=float32, numpy=
array([[-0.02419248,  0.03397185, -0.00721853, -0.03902208,  0.04057902,
         0.08895342, -0.05935798,  0.15014562,  0.06483946, -0.05875566,
        -0.17688638, -0.08090846,  0.00874879,  0.05344076,  0.11211172,
        -0.02501448,  0.0373958 ,  0.00749506,  0.00742238,  0.10471216,
        -0.04898893,  0.06628293, -0.10775416,  0.07727765, -0.10830477,
         0.19399542, -0.0445637 , -0.07546984,  0.15959659, -0.03410715,
         0.07729881,  0.06042442],
       [-0.02419248,  0.03397185, -0.00721853, -0.03902208,  0.04057902,
         0.08895342, -0.05935798,  0.15014562,  0.06483946, -0.05875566,
        -0.17688638, -0.08090846,  0.00874879,  0.05344076,  0.11211172,
        -0.02501448,  0.0373958 ,  0.00749506,  0.00742238,  0.10471216,
        -0.04898893,  0.06628293, -0.10775416,  0.07727765, -0.10830477,
         0.19399542, -0.0445637 , -0.07546984,  0.15959659, -0.03410715,
         0.07729881,  0.06042442

The `__call__` method of your layer will automatically run `build` the first time it is called.
You now have a layer that's lazy and easy to use:

In [30]:
linear_layer = Linear(32)  # At instantiation, we don't know on what inputs this is going to get called
linear_layer(x)  # The layer's weights are created dynamically the first time the layer is called

<tf.Tensor: id=319, shape=(4, 32), dtype=float32, numpy=
array([[-0.1274697 , -0.0630742 , -0.01142713,  0.05119533, -0.02470569,
         0.05852924, -0.087138  ,  0.08649664,  0.04938974, -0.06793442,
        -0.03455362,  0.02894306,  0.01817703,  0.02836488, -0.21493   ,
         0.04592641, -0.0460737 , -0.05568846,  0.08812203, -0.16927117,
        -0.12181335, -0.01016857,  0.09846251,  0.05953536, -0.04449662,
         0.00659457,  0.02993122,  0.15447405, -0.07759735,  0.02221101,
         0.02020011, -0.01443632],
       [-0.1274697 , -0.0630742 , -0.01142713,  0.05119533, -0.02470569,
         0.05852924, -0.087138  ,  0.08649664,  0.04938974, -0.06793442,
        -0.03455362,  0.02894306,  0.01817703,  0.02836488, -0.21493   ,
         0.04592641, -0.0460737 , -0.05568846,  0.08812203, -0.16927117,
        -0.12181335, -0.01016857,  0.09846251,  0.05953536, -0.04449662,
         0.00659457,  0.02993122,  0.15447405, -0.07759735,  0.02221101,
         0.02020011, -0.01443632

In [23]:
linear_layer_a.w

AttributeError: 'Linear_a' object has no attribute 'w'

In [24]:
linear_layer.w

<tf.Variable 'linear_3/Variable:0' shape=(2, 32) dtype=float32, numpy=
array([[-0.01024805, -0.12468822, -0.03961597, -0.03007054, -0.04628066,
         0.04873963,  0.0056522 , -0.01616198, -0.00550074,  0.00910311,
         0.05721447, -0.00032047,  0.02677893, -0.01647685,  0.02558899,
         0.08199804,  0.01251215, -0.04133772,  0.00167426, -0.09411531,
        -0.06747368, -0.03606477,  0.01028839, -0.09276644, -0.01903689,
         0.00494483, -0.05151172, -0.10299905,  0.02904531,  0.00276534,
         0.00856061, -0.03546946],
       [ 0.09505925, -0.00867851, -0.00139748,  0.08212978,  0.1330924 ,
        -0.02217632,  0.06656325, -0.0485464 , -0.04153347, -0.04293505,
         0.01865369,  0.09250673, -0.03356451,  0.08733492, -0.03331181,
         0.00075316,  0.11341518,  0.01956592, -0.02479024, -0.04311122,
         0.03003897,  0.04169715,  0.12543605, -0.014     ,  0.00981854,
         0.08322202,  0.08934832,  0.01648777,  0.0848069 , -0.02683881,
         0.0658551

In [61]:
linear_layer_a = Linear_a(32)
y = linear_layer_a(x)
linear_layer_a.w

<tf.Variable 'linear_a_1/Variable:0' shape=(2, 32) dtype=float32, numpy=
array([[-0.02224136,  0.10893494,  0.01650461,  0.04596733, -0.00191518,
        -0.00289454,  0.01686579, -0.03833527, -0.05119411, -0.0350647 ,
         0.00085375,  0.04506344,  0.0066293 ,  0.04897112,  0.0204167 ,
         0.02053897,  0.00276471, -0.07860785,  0.05259438, -0.0493194 ,
        -0.12971416, -0.02451086,  0.01026036,  0.07726505,  0.03485465,
         0.02859635, -0.07592668, -0.03255811,  0.0422284 ,  0.08190619,
        -0.0173349 , -0.0686255 ],
       [ 0.02413127, -0.01974991,  0.03357071,  0.04227452, -0.06001312,
        -0.05494372,  0.05127419, -0.00933078,  0.01657465, -0.05440009,
         0.03437327,  0.01573649,  0.00602964,  0.02063557,  0.00344559,
        -0.00656494,  0.09053495,  0.10931852, -0.1267864 ,  0.02341956,
        -0.08000993, -0.04601313, -0.05010744,  0.01067231, -0.00597965,
         0.07272794, -0.04900721, -0.00134051,  0.02059883, -0.06399708,
        -0.04932

In [53]:
linear_layer3 = Linear3(32)
y = linear_layer3(x)

TypeError: build() takes 1 positional argument but 2 were given

In [64]:
linear_layer4 = Linear4(32)
y = linear_layer4(x)
linear_layer4.w

<tf.Variable 'linear4_4/Variable:0' shape=(2, 32) dtype=float32, numpy=
array([[ 0.02944901,  0.08169571,  0.08607452,  0.09525241,  0.03352685,
         0.00728847, -0.04876156, -0.07203896,  0.03152307,  0.02647154,
        -0.09575149, -0.03901853, -0.11773364,  0.02425112, -0.00497574,
         0.00120795, -0.02768264, -0.05510433,  0.05402781, -0.0268576 ,
        -0.0396144 , -0.07633492, -0.01152158,  0.03471149, -0.04223849,
         0.03975738,  0.09698174, -0.03637597,  0.00842177, -0.03485252,
        -0.05574738,  0.04095755],
       [-0.05322326, -0.03908886,  0.04748499,  0.01411388,  0.00665069,
         0.08479541,  0.05209041, -0.04581574,  0.0448983 ,  0.00983533,
         0.05158487,  0.04326131,  0.02078844,  0.06170132, -0.04357408,
         0.04804098,  0.02756624,  0.08641037, -0.00463737, -0.0491388 ,
        -0.05058886,  0.0049554 , -0.0373957 ,  0.00160119,  0.0137968 ,
        -0.08824806, -0.01923079,  0.09739023,  0.06208529, -0.03312061,
         0.025549

In [15]:
linear_layer.inputs_shape

AttributeError: 'Linear' object has no attribute 'inputs_shape'


### Layers are recursively composable

If you assign a Layer instance as attribute of another Layer,
the outer layer will start tracking the weights of the inner layer.

We recommend creating such sublayers in the `__init__` method (since the sublayers will typically have a `build` method, they will be built when the outer layer gets built).

In [68]:
# Let's assume we are reusing the Linear class
# with a `build` method that we defined above.
class MLPBlock(layers.Layer):

    def __init__(self):
        super(MLPBlock, self).__init__()
        self.linear_1 = Linear(32)
        self.linear_2 = Linear(32)
        self.linear_3 = Linear(2)

    def call(self, inputs):
        x = self.linear_1(inputs)
        x = tf.nn.relu(x)
        x = self.linear_2(x)
        x = tf.nn.relu(x)
        return self.linear_3(x)

mlp = MLPBlock()
y = mlp(tf.ones(shape=(3, 64)))  # The first call to the `mlp` will create the weights
print('weights:', len(mlp.weights))
print('trainable weights:', len(mlp.trainable_weights))

init
init
init
build
call
build
call
build
call
weights: 6
trainable weights: 6


In [8]:
class MLPBlock(layers.Layer):

    def __init__(self):
        super(MLPBlock, self).__init__()
        
    def build(self,input_shape):
        self.linear_1 = Linear(32)
        self.linear_2 = Linear(32)
        self.linear_3 = Linear(2)

    def call(self, inputs):
        x = self.linear_1(inputs)
        x = tf.nn.relu(x)
        x = self.linear_2(x)
        x = tf.nn.relu(x)
        return self.linear_3(x)

mlp = MLPBlock()
y = mlp(tf.ones(shape=(3, 64)))  # The first call to the `mlp` will create the weights
print('weights:', len(mlp.weights))
print('trainable weights:', len(mlp.trainable_weights))

weights: 6
trainable weights: 6


In [69]:
mlp.weights

[<tf.Variable 'mlp_block/linear_7/Variable:0' shape=(64, 32) dtype=float32, numpy=
 array([[ 0.0180655 ,  0.02374338, -0.05606483, ..., -0.01315719,
         -0.03572294, -0.01693827],
        [ 0.00323544, -0.04751184,  0.07704943, ...,  0.01181477,
         -0.00795043, -0.00286257],
        [ 0.0608725 ,  0.05619473,  0.09616824, ..., -0.04533891,
          0.02907863,  0.11917132],
        ...,
        [ 0.0464141 ,  0.04623416,  0.0678204 , ...,  0.07510582,
         -0.02357451, -0.05712678],
        [ 0.04974559,  0.00750397, -0.02929542, ...,  0.04593847,
          0.02657567,  0.0264814 ],
        [ 0.01923304,  0.01592487,  0.02636321, ..., -0.008432  ,
          0.00454803,  0.0552357 ]], dtype=float32)>,
 <tf.Variable 'mlp_block/linear_7/Variable:0' shape=(32,) dtype=float32, numpy=
 array([-0.02736011,  0.03980467,  0.00797743,  0.05097401,  0.09139656,
         0.08187187, -0.00299348,  0.00110982,  0.04128009, -0.03663872,
        -0.10554752, -0.00448293, -0.00175833,  

In [81]:
print(y,y + mlp.weights[5])

tf.Tensor(
[[ 0.09285626 -0.02166161]
 [ 0.09285626 -0.02166161]
 [ 0.09285626 -0.02166161]], shape=(3, 2), dtype=float32) tf.Tensor(
[[ 0.17987949 -0.04317725]
 [ 0.17987949 -0.04317725]
 [ 0.17987949 -0.04317725]], shape=(3, 2), dtype=float32)


In [79]:
x

<tf.Tensor: id=427, shape=(4, 2), dtype=float32, numpy=
array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]], dtype=float32)>

In [82]:
mlp.weights[5]

<tf.Variable 'mlp_block_1/linear_11/Variable:0' shape=(2,) dtype=float32, numpy=array([ 0.08702324, -0.02151564], dtype=float32)>

In [60]:
a = Linear(32)
b = Linear()
z = a(tf.ones(shape=(3, 64)))
a.input_shape

AttributeError: The layer has never been called and thus has no defined input shape.

### Layers recursively collect losses created during the forward pass

When writing the `call` method of a layer, you can create loss tensors that you will want to use later, when writing your training loop. This is doable by calling `self.add_loss(value)`:


In [4]:
# A layer that creates an activity regularization loss
class ActivityRegularizationLayer(layers.Layer):

    def __init__(self, rate=1e-2):
        super(ActivityRegularizationLayer, self).__init__()
        self.rate = rate

    def call(self, inputs):
        self.add_loss(self.rate * tf.reduce_sum(inputs))
        return inputs

These losses (including those created by any inner layer) can be retrieved via `layer.losses`.
This property is reset at the start of every `__call__` to the top-level layer, so that `layer.losses` always contains the loss values created during the last forward pass.

In [5]:
class OuterLayer(layers.Layer):

    def __init__(self):
        super(OuterLayer, self).__init__()
        self.activity_reg = ActivityRegularizationLayer(1e-2)

    def call(self, inputs):
        return self.activity_reg(inputs)


layer = OuterLayer()
assert len(layer.losses) == 0  # No losses yet since the layer has never been called
_ = layer(tf.zeros(1, 1))
assert len(layer.losses) == 1  # We created one loss value

# `layer.losses` gets reset at the start of each __call__
_ = layer(tf.zeros(1, 1))
assert len(layer.losses) == 1  # This is the loss created during the call above

In [8]:
print(layer.losses)
len(layer.losses)

[<tf.Tensor: id=45, shape=(), dtype=float32, numpy=0.0>]


1

In [10]:
_ = layer(tf.ones((2,2)))
layer.losses

[<tf.Tensor: id=52, shape=(), dtype=float32, numpy=0.04>]

In [12]:
layer.weights

[]

In addition, the `loss` property also contains regularization losses created for the weights of any inner layer:

In [70]:
class OuterLayer(layers.Layer):

    def __init__(self):
        super(OuterLayer, self).__init__()
        self.dense = layers.Dense(32, kernel_regularizer=tf.keras.regularizers.l2(1e-3))

    def call(self, inputs):
        return self.dense(inputs)


layer = OuterLayer()
_ = layer(tf.zeros((1, 1)))

# This is `1e-3 * sum(layer.dense.kernel ** 2)`,
# created by the `kernel_regularizer` above.
print(layer.losses)

[<tf.Tensor: id=1329, shape=(), dtype=float32, numpy=0.0017187007>]


In [71]:
class test(layers.Layer):
    
    def __init__(self):
        super(test, self).__init__()
        self.dense = layers.Dense(32)
        
    def call(self, inputs):
        return self.dense(inputs)
    
layer_test = test()

In [72]:
layer_test.weights

[]

In [84]:
_ = layer_test(tf.ones((1,1)))
print(layer_test.weights[0].initializer)

None


In [28]:
layer.weights[0]

<tf.Variable 'outer_layer_1/dense/kernel:0' shape=(1, 32) dtype=float32, numpy=
array([[ 0.20301062,  0.41947216, -0.3077877 ,  0.3320356 , -0.04775584,
         0.09079772, -0.24240516,  0.29237407, -0.01459622,  0.1882621 ,
        -0.21656553, -0.28707606, -0.20224887,  0.32503098, -0.39885205,
         0.09833658, -0.17787234, -0.36508054,  0.3141998 ,  0.24522626,
         0.03932247,  0.12195104, -0.26490855,  0.2911014 ,  0.0182344 ,
        -0.21591367, -0.38121134,  0.3878811 ,  0.21664739, -0.28069466,
         0.40381426, -0.36430547]], dtype=float32)>

These losses are meant to be taken into account when writing training loops, like this:


```python

# Instantiate an optimizer.
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Iterate over the batches of a dataset.
for x_batch_train, y_batch_train in train_dataset:
  with tf.GradientTape() as tape:
    logits = layer(x_batch_train)  # Logits for this minibatch
    # Loss value for this minibatch
    loss_value = loss_fn(y_batch_train, logits)
    # Add extra losses created during this forward pass:
    loss_value += sum(model.losses)

  grads = tape.gradient(loss_value, model.trainable_weights)
  optimizer.apply_gradients(zip(grads, model.trainable_weights))

```

For a detailed guide about writing training loops, see the second section of the [guide to training and evaluation](./train_and_evaluate.ipynb).

In [89]:
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Preprocess the data (these are Numpy arrays)
x_train = x_train.reshape(60000, 784).astype('float32') / 255
x_test = x_test.reshape(10000, 784).astype('float32') / 255

y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

# Reserve 10,000 samples for validation
x_val = x_train[-10000:]
y_val = y_train[-10000:]
x_train = x_train[:-10000]
y_train = y_train[:-10000]

In [86]:
from tensorflow import keras
def get_uncompiled_model():
    inputs = keras.Input(shape=(784,), name='digits')
    x = layers.Dense(64, activation='relu', name='dense_1')(inputs)
    x = layers.Dense(64, activation='relu', name='dense_2')(x)
    outputs = layers.Dense(10, name='predictions')(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model
def get_compiled_model():
    model = get_uncompiled_model()
    model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=1e-3),
                loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['sparse_categorical_accuracy'])
    return model

In [90]:
model = get_compiled_model()
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(64)


In [42]:
layer(tf.ones((1,1)))

<tf.Tensor: id=655, shape=(1, 32), dtype=float32, numpy=
array([[ 0.20301062,  0.41947216, -0.3077877 ,  0.3320356 , -0.04775584,
         0.09079772, -0.24240516,  0.29237407, -0.01459622,  0.1882621 ,
        -0.21656553, -0.28707606, -0.20224887,  0.32503098, -0.39885205,
         0.09833658, -0.17787234, -0.36508054,  0.3141998 ,  0.24522626,
         0.03932247,  0.12195104, -0.26490855,  0.2911014 ,  0.0182344 ,
        -0.21591367, -0.38121134,  0.3878811 ,  0.21664739, -0.28069466,
         0.40381426, -0.36430547]], dtype=float32)>

In [93]:
?layer

In [91]:
# Instantiate an optimizer
optimizer = tf.keras.optimizers.SGD(learning_rate = 1e-3)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Iterate over the batches of a dataset.
for x_batch_train, y_batch_train in train_dataset:
    with tf.GradientTape() as tape:
        logits = layer(x_batch_train) # Logits for this minibatch
        # Loss value for this minibatch
        loss_value = loss_fn(y_batch_train, logits)
        # Add extra losses created during this forward pass:
        loss_value += sum(model.losses)
        
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))

InvalidArgumentError: Matrix size-incompatible: In[0]: [64,784], In[1]: [1,32] [Op:MatMul]

### You can optionally enable serialization on your layers

If you need your custom layers to be serializable as part of a [Functional model](./functional.ipynb), you can optionally implement a `get_config` method:


In [95]:
class Linear(layers.Layer):

    def __init__(self, units = 32):
        super(Linear, self).__init__()
        self.units = units

    def build(self, input_shape):
        self.w = self.add_weight(shape = (input_shape[-1], self.units),
                                 initializer = 'random_normal',
                                 trainable = True)
        self.b = self.add_weight(shape = (self.units,),
                                 initializer = 'random_normal',
                                 trainable = True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

    def get_config(self):
        return {'units': self.units}


# Now you can recreate the layer from its config:
layer = Linear(64)
config = layer.get_config()
print(config)
new_layer = Linear.from_config(config)

{'units': 64}


Note that the `__init__` method of the base `Layer` class takes some keyword arguments, in particular a `name` and a `dtype`. It's good practice to pass these arguments to the parent class in `__init__` and to include them in the layer config:

In [97]:
class Linear(layers.Layer):

    def __init__(self, units=32, **kwargs):
        super(Linear, self).__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        self.w = self.add_weight(shape = (input_shape[-1], self.units),
                                 initializer = 'random_normal',
                                 trainable = True)
        self.b = self.add_weight(shape = (self.units,),
                                 initializer = 'random_normal',
                                 trainable = True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

    def get_config(self):
        config = super(Linear, self).get_config()
        config.update({'units': self.units})
        return config


layer = Linear(64)
config = layer.get_config()
print(config)
#new_layer = Linear.from_config(config)

{'name': 'linear_14', 'trainable': True, 'dtype': 'float32', 'units': 64}


If you need more flexibility when deserializing the layer from its config, you can also override the `from_config` class method. This is the base implementation of `from_config`:

```python
def from_config(cls, config):
  return cls(**config)
```

To learn more about serialization and saving, see the complete [Guide to Saving and Serializing Models](./save_and_serialize.ipynb).

In [98]:
def from_config(cls, config):
    return cls(**config)

### Privileged `training` argument in the `call` method


Some layers, in particular the `BatchNormalization` layer and the `Dropout` layer, have different behaviors during training and inference. For such layers, it is standard practice to expose a `training` (boolean) argument in the `call` method.

By exposing this argument in `call`, you enable the built-in training and evaluation loops (e.g. `fit`) to correctly use the layer in training and inference.


In [71]:
class CustomDropout(layers.Layer):

    def __init__(self, rate, **kwargs):
        super(CustomDropout, self).__init__(**kwargs)
        self.rate = rate

    def call(self, inputs, training=None):
        if training:
            return tf.nn.dropout(inputs, rate=self.rate)
        return inputs

## Building Models



### The Model class

In general, you will use the `Layer` class to define inner computation blocks,
and will use the `Model` class to define the outer model -- the object you will train.

For instance, in a ResNet50 model, you would have several ResNet blocks subclassing `Layer`,
and a single `Model` encompassing the entire ResNet50 network.

The `Model` class has the same API as `Layer`, with the following differences:

- It exposes built-in training, evaluation, and prediction loops (`model.fit()`, `model.evaluate()`, `model.predict()`).
- It exposes the list of its inner layers, via the `model.layers` property.
- It exposes saving and serialization APIs.

Effectively, the "Layer" class corresponds to what we refer to in the literature
as a "layer" (as in "convolution layer" or "recurrent layer") or as a "block" (as in "ResNet block" or "Inception block").

Meanwhile, the "Model" class corresponds to what is referred to in the literature
as a "model" (as in "deep learning model") or as a "network" (as in "deep neural network").

For instance, we could take our mini-resnet example above, and use it to build a `Model` that we could
train with `fit()`, and that we could save with `save_weights`:

```python
class ResNet(tf.keras.Model):

    def __init__(self):
        super(ResNet, self).__init__()
        self.block_1 = ResNetBlock()
        self.block_2 = ResNetBlock()
        self.global_pool = layers.GlobalAveragePooling2D()
        self.classifier = Dense(num_classes)

    def call(self, inputs):
        x = self.block_1(inputs)
        x = self.block_2(x)
        x = self.global_pool(x)
        return self.classifier(x)


resnet = ResNet()
dataset = ...
resnet.fit(dataset, epochs=10)
resnet.save_weights(filepath)
```


In [100]:
class ResNet(tf.keras.Model):
    
    def __init__(self):
        super(ResNet, self).__init__()
        self.block_1 = ResNetBlock()
        self.block_2 = ResnetBlock()
        self.global_pool = layers.GlobalAveragePooling2D()
        self.classifier = Dense(num_classes)
        
    def call(self, inputs):
        x = self.block_1(inputs)
        x = self.block_2(x)
        x = self.global_pool(x)
        return self.classifier(x)
    
resnet = ResNet()
dataset = ...
resnet.fit(dataset, epochs = 10)
resnet.save_weights(filepath)

NameError: name 'Dense' is not defined

### Putting it all together: an end-to-end example

Here's what you've learned so far:

- A `Layer` encapsulate a state (created in `__init__` or `build`) and some computation (in `call`).
- Layers can be recursively nested to create new, bigger computation blocks.
- Layers can create and track losses (typically regularization losses).
- The outer container, the thing you want to train, is a `Model`. A `Model` is just like a `Layer`, but with added training and serialization utilities.

Let's put all of these things together into an end-to-end example: we're going to implement a Variational AutoEncoder (VAE). We'll train it on MNIST digits.

Our VAE will be a subclass of `Model`, built as a nested composition of layers that subclass `Layer`. It will feature a regularization loss (KL divergence).

In [73]:
class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape = (batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon


class Encoder(layers.Layer):
    """Maps MNIST digits to a triplet (z_mean, z_log_var, z)."""

    def __init__(self,
                latent_dim = 32,
                intermediate_dim = 64,
                name = 'encoder',
                **kwargs):
        super(Encoder, self).__init__(name = name, **kwargs)
        self.dense_proj = layers.Dense(intermediate_dim, activation = 'relu')
        self.dense_mean = layers.Dense(latent_dim)
        self.dense_log_var = layers.Dense(latent_dim)
        self.sampling = Sampling()

    def call(self, inputs):
        x = self.dense_proj(inputs)
        z_mean = self.dense_mean(x)
        z_log_var = self.dense_log_var(x)
        z = self.sampling((z_mean, z_log_var))
        return z_mean, z_log_var, z

class Decoder(layers.Layer):
    """Converts z, the encoded digit vector, back into a readable digit."""

    def __init__(self,
                original_dim,
                intermediate_dim = 64,
                name = 'decoder',
                **kwargs):
        super(Decoder, self).__init__(name = name, **kwargs)
        self.dense_proj = layers.Dense(intermediate_dim, activation = 'relu')
        self.dense_output = layers.Dense(original_dim, activation = 'sigmoid')

    def call(self, inputs):
        x = self.dense_proj(inputs)
        return self.dense_output(x)


class VariationalAutoEncoder(tf.keras.Model):
    """Combines the encoder and decoder into an end-to-end model for training."""

    def __init__(self,
                original_dim,
                intermediate_dim = 64,
                latent_dim = 32,
                name = 'autoencoder',
                **kwargs):
        super(VariationalAutoEncoder, self).__init__(name = name, **kwargs)
        self.original_dim = original_dim
        self.encoder = Encoder(latent_dim = latent_dim,
                               intermediate_dim = intermediate_dim)
        self.decoder = Decoder(original_dim, intermediate_dim = intermediate_dim)

    def call(self, inputs):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)
        # Add KL divergence regularization loss.
        kl_loss = - 0.5 * tf.reduce_mean(
            z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1)
        self.add_loss(kl_loss)
        return reconstructed

In [74]:
original_dim = 784
vae = VariationalAutoEncoder(original_dim, 64, 32)

optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3)
mse_loss_fn = tf.keras.losses.MeanSquaredError()

loss_metric = tf.keras.metrics.Mean()

(x_train, _), _ = tf.keras.datasets.mnist.load_data()
x_train = x_train.reshape(60000, 784).astype('float32') / 255

train_dataset = tf.data.Dataset.from_tensor_slices(x_train)
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(64)

epochs = 3

# Iterate over epochs.
for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))

    # Iterate over the batches of the dataset.
    for step, x_batch_train in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            reconstructed = vae(x_batch_train)
            # Compute reconstruction loss
            loss = mse_loss_fn(x_batch_train, reconstructed)
            loss += sum(vae.losses)  # Add KLD regularization loss

        grads = tape.gradient(loss, vae.trainable_weights)
        optimizer.apply_gradients(zip(grads, vae.trainable_weights))

        loss_metric(loss)

        if step % 100 == 0:
            print('step %s: mean loss = %s' % (step, loss_metric.result()))


Start of epoch 0
step 0: mean loss = tf.Tensor(0.34416658, shape=(), dtype=float32)
step 100: mean loss = tf.Tensor(0.12549832, shape=(), dtype=float32)
step 200: mean loss = tf.Tensor(0.09920556, shape=(), dtype=float32)
step 300: mean loss = tf.Tensor(0.08919379, shape=(), dtype=float32)
step 400: mean loss = tf.Tensor(0.084206045, shape=(), dtype=float32)
step 500: mean loss = tf.Tensor(0.08087842, shape=(), dtype=float32)
step 600: mean loss = tf.Tensor(0.07874563, shape=(), dtype=float32)
step 700: mean loss = tf.Tensor(0.07715198, shape=(), dtype=float32)
step 800: mean loss = tf.Tensor(0.075982735, shape=(), dtype=float32)
step 900: mean loss = tf.Tensor(0.07496637, shape=(), dtype=float32)
Start of epoch 1
step 0: mean loss = tf.Tensor(0.074673206, shape=(), dtype=float32)
step 100: mean loss = tf.Tensor(0.07402148, shape=(), dtype=float32)
step 200: mean loss = tf.Tensor(0.073512234, shape=(), dtype=float32)
step 300: mean loss = tf.Tensor(0.07303131, shape=(), dtype=float32)


In [105]:
(x_train, _), _ = tf.keras.datasets.mnist.load_data()
x_train = x_train.reshape(60000, 784).astype('float32') / 255
train_dataset = tf.data.Dataset.from_tensor_slices(x_train)
#train_dataset = train_dataset.shuffle(buffer_size=1024).batch(64)
x_train
train_dataset

<TensorSliceDataset shapes: (784,), types: tf.float32>

Note that since the VAE is subclassing `Model`, it features built-in training loops. So you could also have trained it like this:

In [75]:
vae = VariationalAutoEncoder(784, 64, 32)

optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3)

vae.compile(optimizer, loss=tf.keras.losses.MeanSquaredError())
vae.fit(x_train, x_train, epochs = 3, batch_size = 64)

Train on 60000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x238e0eff6c8>

### Beyond object-oriented development: the Functional API

Was this example too much object-oriented development for you? You can also build models using [the Functional API](./functional.ipynb). Importantly, choosing one style or another does not prevent you from leveraging components written in the other style: you can always mix-and-match.

For instance, the Functional API example below reuses the same `Sampling` layer we defined in the example above.

In [21]:
original_dim = 784
intermediate_dim = 64
latent_dim = 32

# Define encoder model.
original_inputs = tf.keras.Input(shape = (original_dim,), name = 'encoder_input')
x = layers.Dense(intermediate_dim, activation = 'relu')(original_inputs)
z_mean = layers.Dense(latent_dim, name = 'z_mean')(x)
z_log_var = layers.Dense(latent_dim, name = 'z_log_var')(x)
z = Sampling()((z_mean, z_log_var))
encoder = tf.keras.Model(inputs = original_inputs, outputs = z, name = 'encoder')

# Define decoder model.
latent_inputs = tf.keras.Input(shape = (latent_dim,), name = 'z_sampling')
x = layers.Dense(intermediate_dim, activation = 'relu')(latent_inputs)
outputs = layers.Dense(original_dim, activation = 'sigmoid')(x)
decoder = tf.keras.Model(inputs = latent_inputs, outputs = outputs, name = 'decoder')

# Define VAE model.
outputs = decoder(z)
vae = tf.keras.Model(inputs = original_inputs, outputs = outputs, name = 'vae')

# Add KL divergence regularization loss.
kl_loss = - 0.5 * tf.reduce_mean(
    z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1)
vae.add_loss(kl_loss)

# Train.
optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3)
vae.compile(optimizer, loss = tf.keras.losses.MeanSquaredError())
vae.fit(x_train, x_train, epochs = 3, batch_size = 64)

Train on 60000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fa0d0323198>