In [1]:
import tensorflow as tf
from datetime import datetime

%load_ext tensorboard

# 构建tf model

1、所有的model(layer)都继承tf.Module类

In [2]:
# Eager execution (dynamic graph).
class SimpleModule(tf.Module):
    """Minimal module computing ``a_variable * x + non_trainable_varibale``.

    Owns one trainable and one non-trainable scalar variable, both
    initialised to 5.0.
    """

    def __init__(self, name=None):
        """Create the variables the forward pass needs.

        Args:
            name: Optional module name, forwarded to ``tf.Module``.
        """
        super().__init__(name=name)
        self.a_variable = tf.Variable(5.0, name='train_me')
        # The attribute spelling ("varibale") is kept unchanged so that
        # existing references to it keep working.
        self.non_trainable_varibale = tf.Variable(
            5.0,
            trainable=False,
            name="do_not_train_me",
        )

    def __call__(self, x):
        """Forward pass: scale ``x`` by the trainable variable, then add the offset."""
        scaled = self.a_variable * x
        return scaled + self.non_trainable_varibale

# Instantiate the module and run one forward pass on the scalar 5.0.
simple_module = SimpleModule(name="simple")
simple_module(tf.constant(5.0))

2022-01-20 00:21:45.111252: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-20 00:21:45.125611: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fbc771973c0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-01-20 00:21:45.125624: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


<tf.Tensor: shape=(), dtype=float32, numpy=30.0>

如何获得静态图？  
在tfnotes_graph中学到，用tf.function修饰函数usr_def_func，修饰器会捕捉函数usr_def_func中的所有tf.Op和python logic，构成（概念上的）计算图。第一次调用该函数的时候，会执行trace，在内存中真正创建tf.Graph（其中的python控制流由AutoGraph转换成图中的操作；AutoGraph是转换工具，并不等同于tf.Graph）  
usr_def_func.get_concrete_function(input_data).graph.as_graph_def()，可以打印计算图  
  
在模型的class代码中，用tf.function修饰前向过程（即__call__函数）。第一次开始前向过程的时候，会执行trace，构建tf.Graph  
可以使用  tfmodel.\_\_call\_\_.get_concrete_function(input_data).graph.as_graph_def()  的方法，打印计算图

In [3]:
# Graph execution (static graph).
class SimpleModule(tf.Module):
    """Same computation as ``a_variable * x + non_trainable_varibale``,
    but with the forward pass wrapped in ``tf.function``.
    """

    def __init__(self, name=None):
        """Create one trainable and one non-trainable scalar variable (both 5.0).

        Args:
            name: Optional module name, forwarded to ``tf.Module``.
        """
        super().__init__(name=name)
        self.a_variable = tf.Variable(5.0, name='train_me')
        # The attribute spelling ("varibale") is kept unchanged so that
        # existing references to it keep working.
        self.non_trainable_varibale = tf.Variable(
            5.0,
            trainable=False,
            name="do_not_train_me",
        )

    @tf.function
    def __call__(self, x):
        """Forward pass; decorating it with tf.function makes this a graph-mode model."""
        scaled = self.a_variable * x
        return scaled + self.non_trainable_varibale

# Instantiate the module; the name `simple_module` is rebound to this new instance.
simple_module = SimpleModule(name='simplegraph')

In [4]:
# Print the graph structure: obtain the concrete function of __call__ for a
# scalar float input and dump its graph as a GraphDef.
simple_module.__call__.get_concrete_function(tf.constant(5.0)).graph.as_graph_def()

node {
  name: "x"
  op: "Placeholder"
  attr {
    key: "_user_specified_name"
    value {
      s: "x"
    }
  }
  attr {
    key: "dtype"
    value {
      type: DT_FLOAT
    }
  }
  attr {
    key: "shape"
    value {
      shape {
      }
    }
  }
}
node {
  name: "ReadVariableOp/resource"
  op: "Placeholder"
  attr {
    key: "dtype"
    value {
      type: DT_RESOURCE
    }
  }
  attr {
    key: "shape"
    value {
      shape {
      }
    }
  }
}
node {
  name: "ReadVariableOp"
  op: "ReadVariableOp"
  input: "ReadVariableOp/resource"
  attr {
    key: "dtype"
    value {
      type: DT_FLOAT
    }
  }
}
node {
  name: "mul"
  op: "Mul"
  input: "ReadVariableOp"
  input: "x"
  attr {
    key: "T"
    value {
      type: DT_FLOAT
    }
  }
}
node {
  name: "add/ReadVariableOp/resource"
  op: "Placeholder"
  attr {
    key: "dtype"
    value {
      type: DT_RESOURCE
    }
  }
  attr {
    key: "shape"
    value {
      shape {
      }
    }
  }
}
node {
  name: "add/ReadVariab

2、用model构建model

In [2]:
class Dense(tf.Module):
    """Fully connected layer followed by ReLU: ``relu(x @ w + b)``."""

    def __init__(self, in_features, out_features, name=None):
        """Create the weight matrix and the bias vector.

        Args:
            in_features: Size of the input's last dimension.
            out_features: Size of the output's last dimension.
            name: Optional module name, forwarded to ``tf.Module``.
        """
        super().__init__(name=name)
        # Weights start from a standard-normal draw, the bias from zeros.
        initial_weights = tf.random.normal([in_features, out_features])
        self.w = tf.Variable(initial_weights, name='w')
        self.b = tf.Variable(tf.zeros([out_features]), name='b')

    def __call__(self, x):
        """Forward pass.

        Args:
            x: Input tensor; its last dimension must match the first
                dimension of ``w`` for the matmul to succeed.

        Returns:
            The ReLU-activated affine transform of ``x``.
        """
        return tf.nn.relu(tf.matmul(x, self.w) + self.b)

In [5]:
class SequentialModule(tf.Module):
    """Two ``Dense`` layers applied one after the other (3 -> 3 -> 2 features)."""

    def __init__(self, name=None):
        """Create the two sub-layers.

        Args:
            name: Optional module name, forwarded to ``tf.Module``.
        """
        super().__init__(name=name)
        self.dense_1 = Dense(in_features=3, out_features=3)
        self.dense_2 = Dense(in_features=3, out_features=2)

    @tf.function
    def __call__(self, x):
        """Forward pass through ``dense_1`` and then ``dense_2``."""
        # Binding the intermediate result to a Python name is plain Python;
        # presumably it adds no graph node of its own (author's note, not
        # verified here).
        hidden = self.dense_1(x)
        return self.dense_2(hidden)

# Instantiate the two-layer model.
my_model = SequentialModule(name="mlp_2layer")

In [6]:
# Obtain the concrete function of __call__ for a (1, 3) float input and dump
# its graph as a GraphDef.
my_model.__call__.get_concrete_function(tf.constant([[1., 2., 3.]])).graph.as_graph_def()

node {
  name: "x"
  op: "Placeholder"
  attr {
    key: "_user_specified_name"
    value {
      s: "x"
    }
  }
  attr {
    key: "dtype"
    value {
      type: DT_FLOAT
    }
  }
  attr {
    key: "shape"
    value {
      shape {
        dim {
          size: 1
        }
        dim {
          size: 3
        }
      }
    }
  }
}
node {
  name: "MatMul/ReadVariableOp/resource"
  op: "Placeholder"
  attr {
    key: "dtype"
    value {
      type: DT_RESOURCE
    }
  }
  attr {
    key: "shape"
    value {
      shape {
      }
    }
  }
}
node {
  name: "MatMul/ReadVariableOp"
  op: "ReadVariableOp"
  input: "MatMul/ReadVariableOp/resource"
  attr {
    key: "dtype"
    value {
      type: DT_FLOAT
    }
  }
}
node {
  name: "MatMul"
  op: "MatMul"
  input: "x"
  input: "MatMul/ReadVariableOp"
  attr {
    key: "T"
    value {
      type: DT_FLOAT
    }
  }
  attr {
    key: "transpose_a"
    value {
      b: false
    }
  }
  attr {
    key: "transpose_b"
    value {
      b: f

tf.Module的实例带有属性（property，如 variables、trainable_variables、submodules），可以展示它自动收集的tf.Variable实例和tf.Module实例。初始化一个实例之后就可以看到。

In [7]:
# Show the sub-modules that tf.Module collected for this instance.
my_model.submodules

2022-01-20 00:22:02.143245: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


(<__main__.Dense at 0x7fbc837425e0>, <__main__.Dense at 0x7fbc83628fa0>)

In [8]:
# Print every variable collected from the model and its sub-modules,
# separated by blank lines.
for var in my_model.variables:
    print(var, '\n')

<tf.Variable 'b:0' shape=(3,) dtype=float32, numpy=array([0., 0., 0.], dtype=float32)> 

<tf.Variable 'w:0' shape=(3, 3) dtype=float32, numpy=
array([[ 1.2392718 ,  1.1902896 , -1.2150244 ],
       [-2.1295872 , -0.5849101 ,  1.2354501 ],
       [ 1.953534  ,  0.79020137,  2.3628767 ]], dtype=float32)> 

<tf.Variable 'b:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)> 

<tf.Variable 'w:0' shape=(3, 2) dtype=float32, numpy=
array([[-0.502447  ,  0.67322606],
       [ 1.5595437 , -0.8026046 ],
       [-1.2609985 , -0.96091294]], dtype=float32)> 



In [9]:
# All trainable variables of simple_module (the one created with
# trainable=False is not listed).
simple_module.trainable_variables

(<tf.Variable 'train_me:0' shape=() dtype=float32, numpy=5.0>,)

In [10]:
# All variables of simple_module, trainable or not.
simple_module.variables

(<tf.Variable 'train_me:0' shape=() dtype=float32, numpy=5.0>,
 <tf.Variable 'do_not_train_me:0' shape=() dtype=float32, numpy=5.0>)

3、延迟构建（deferred）  
即不是在初始化实例的时候构建内部参数变量，而是在第一次前向过程的时候，构建内部变量  
好处是等拿到input之后，才开始构建内部参数变量。所以内部参数变量可以根据第一次input来创建

In [11]:
class FlexibleDense(tf.Module):
    """Dense + ReLU layer whose variables are created lazily on the first call.

    The input feature count is not passed to the constructor; it is read from
    the last dimension of the first input seen by ``__call__``.
    """

    def __init__(self, out_features, name=None):
        """Store the output width; no variables are created yet.

        Args:
            out_features: Size of the output's last dimension.
            name: Optional module name, forwarded to ``tf.Module``.
        """
        super().__init__(name=name)
        # Flips to True once `w` and `b` have been created in __call__.
        self.is_built = False
        # Kept so that __call__ can size the variables later.
        self.out_features = out_features

    def __call__(self, x):
        """Forward pass; creates ``w`` and ``b`` the first time it runs."""
        if not self.is_built:
            # in_features is taken from the input's last dimension.
            in_features = x.shape[-1]
            self.w = tf.Variable(tf.random.normal([in_features, self.out_features]), name='w')
            self.b = tf.Variable(tf.zeros([self.out_features]), name='b')
            # Mark as built so later calls reuse the same variables.
            self.is_built = True

        return tf.nn.relu(tf.matmul(x, self.w) + self.b)

class MySeqModule(tf.Module):
    """Two ``FlexibleDense`` layers in sequence (out_features 3, then 2)."""

    def __init__(self, name=None):
        """Create the two lazily-built sub-layers.

        Args:
            name: Optional module name, forwarded to ``tf.Module``.
        """
        super().__init__(name=name)
        self.dense_1 = FlexibleDense(out_features=3)
        self.dense_2 = FlexibleDense(out_features=2)

    def __call__(self, x):
        """Forward pass through ``dense_1`` and then ``dense_2``."""
        hidden = self.dense_1(x)
        return self.dense_2(hidden)

# Instantiate the model and run it once on a (1, 4) input. The layers create
# their variables during this first call, sized from the input's last dimension.
my_model = MySeqModule(name='flex_mlp_2l')
my_model(tf.constant([[1., 2., 3., 4.]]))

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[4.1062264, 0.       ]], dtype=float32)>

# 保存tf model

1、保存参数  
tfmodel保存的参数又叫做checkpoint

In [12]:
# Write the variables tracked under `my_model` to a checkpoint at chkp_path.
chkp_path = "../../model/my_checkpoint"
checkpoint = tf.train.Checkpoint(model=my_model)
checkpoint.write(chkp_path)

'../../model/my_checkpoint'

列出以my_checkpoint命名的文件:  
.data-00000-of-00001文件是数据分片，储存了weights的值  
.index文件是索引文件，记录了每个weight存放在哪个数据分片中

In [15]:
ls ../../model/my_checkpoint*

../../model/my_checkpoint.data-00000-of-00001
../../model/my_checkpoint.index


In [16]:
# List the variables stored in the checkpoint (name and shape).
tf.train.list_variables(chkp_path)

[('_CHECKPOINTABLE_OBJECT_GRAPH', []),
 ('model/dense_1/b/.ATTRIBUTES/VARIABLE_VALUE', [3]),
 ('model/dense_1/w/.ATTRIBUTES/VARIABLE_VALUE', [4, 3]),
 ('model/dense_2/b/.ATTRIBUTES/VARIABLE_VALUE', [2]),
 ('model/dense_2/w/.ATTRIBUTES/VARIABLE_VALUE', [3, 2])]

2、读取checkpoints  
读取checkpoints前，需要从original python code中，实例化一个新的model  
然后使用  tf.train.Checkpoint(model=new_model).restore(chkp_path)  方法来加载checkpoint

In [17]:
new_model = MySeqModule() # Instantiate a fresh model of the same class
new_checkpoint = tf.train.Checkpoint(model=new_model)
new_checkpoint.restore(chkp_path) # Restore the checkpoint stored at chkp_path into new_model

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fbc83753040>

In [18]:
# Quick check: run the restored model on a (1, 4) input.
new_model(tf.constant([[1., 2., 3., 4.]]))

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[4.1062264, 0.       ]], dtype=float32)>

2、保存函数  
tf model可以在没有python源代码的情况下使用，只需计算图被建构好。  
所以保存模型 即 保存计算图，即 使用静态图模型  
  
定义模型的时候，要定义静态图的模型，即用tf.function修饰__call__()前向函数

In [3]:
class MySeqModel(tf.Module):
    """Graph-mode two-layer model: ``Dense`` 3 -> 3 followed by ``Dense`` 3 -> 2."""

    def __init__(self, name=None):
        """Create the two sub-layers.

        Args:
            name: Optional module name, forwarded to ``tf.Module``.
        """
        super().__init__(name=name)
        self.dense_1 = Dense(in_features=3, out_features=3)
        self.dense_2 = Dense(in_features=3, out_features=2)

    # tf.function makes this a static-graph model; the graph is created on
    # the first forward pass.
    @tf.function
    def __call__(self, x):
        """Forward pass through ``dense_1`` and then ``dense_2``."""
        hidden = self.dense_1(x)
        return self.dense_2(hidden)


# Instantiate the graph-mode model.
my_model = MySeqModel(name='mlp_2l')

2022-01-20 02:43:19.494711: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-20 02:43:19.512268: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fa5c95a6650 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-01-20 02:43:19.512282: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


In [4]:
# Call the model with a (1, 3) float input.
my_model(tf.constant([[2., 2., 2.]]))

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[1.1390929 , 0.21892878]], dtype=float32)>

In [5]:
# Call the model with a (2, 3) float input.
my_model(tf.constant([[2., 2., 2.], [2., 2., 2.]]))

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[1.1390929 , 0.21892878],
       [1.1390929 , 0.21892878]], dtype=float32)>

In [6]:
# Call the model with a (1, 2, 3) float input.
my_model(tf.constant([[[2., 2., 2.],
                       [2., 2., 2.]],
                       ]))

<tf.Tensor: shape=(1, 2, 2), dtype=float32, numpy=
array([[[1.1390929 , 0.21892878],
        [1.1390929 , 0.21892878]]], dtype=float32)>

In [7]:
# Print the signatures of the concrete functions that my_model.__call__ holds.
print(my_model.__call__.pretty_printed_concrete_signatures())

__call__(x)
  Args:
    x: float32 Tensor, shape=(2, 3)
  Returns:
    float32 Tensor, shape=(2, 2)

__call__(x)
  Args:
    x: float32 Tensor, shape=(1, 2, 3)
  Returns:
    float32 Tensor, shape=(1, 2, 2)

__call__(x)
  Args:
    x: float32 Tensor, shape=(1, 3)
  Returns:
    float32 Tensor, shape=(1, 2)


3、保存模型  
最好的办法是直接保存模型，这样就会同时保存函数（graphs）和数据（weights）  

In [10]:
# Export my_model as a SavedModel directory at the given path.
tf.saved_model.save(my_model, '../../model/mlp_2l_saved_model')

INFO:tensorflow:Assets written to: ../../model/mlp_2l_saved_model/assets


2022-01-20 02:09:28.298319: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


生成了一个指定路径的文件夹，打印文件夹里的所有内容，可以看到三个固定命名的条目（assets和variables是文件夹，saved_model.pb是文件）：  
assets  
variables  
saved_model.pb

In [11]:
ls -l ../../model/mlp_2l_saved_model

total 32
drwxr-xr-x  2 lhy  staff     64  1 20 01:30 assets/
-rw-r--r--  1 lhy  staff  14131  1 20 02:09 saved_model.pb
drwxr-xr-x  4 lhy  staff    128  1 20 02:09 variables/


saved_model.pb文件是protocol buffer（protobuf）格式的、语言中性的序列化文件，记录了计算图graph

variables文件夹记录了模型的checkpoints文件（data shard和index文件）

In [12]:
ls -l ../../model/mlp_2l_saved_model/variables

total 16
-rw-r--r--  1 lhy  staff  408  1 20 02:09 variables.data-00000-of-00001
-rw-r--r--  1 lhy  staff  356  1 20 02:09 variables.index


4、加载模型  
从saved_model文件中加载模型，不需要python源代码，甚至不需要python解释器  
因为.pb文件是语言中性的定义graph文件，所以会直接根据.pb文件在内存中创建计算图  
所以:  
  加载的模型不是原模型class的实例instance  
  加载的模型，只能使用already-defined签名  

In [13]:
# Load the SavedModel from disk; `new_model` is rebound to the loaded object.
new_model = tf.saved_model.load('../../model/mlp_2l_saved_model')

In [14]:
# Try an input that matches one of the saved signatures: shape (2, 3).
new_model(tf.constant([[2., 2., 2.], [2., 2., 2.]]))

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.09257771, 0.2780159 ],
       [0.09257771, 0.2780159 ]], dtype=float32)>

In [18]:
# Try an input that matches none of the saved signatures: shape (2, 2, 3).
# The loaded model only accepts inputs matching its already-defined signatures,
# so this call is expected to raise ValueError. The error is caught and printed
# so that the demonstration stays visible while "Restart & Run All" still
# runs to completion.
unseen_shape_input = tf.constant([[[2., 2., 2.],
                                   [2., 2., 2.]],
                                  [[2., 2., 2.],
                                   [2., 2., 2.]]])
try:
    result = new_model(unseen_shape_input)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")
else:
    # Not expected here; show the result if the call does succeed.
    print(result)

ValueError: Could not find matching function to call loaded from the SavedModel. Got:
  Positional arguments (1 total):
    * Tensor("x:0", shape=(2, 2, 3), dtype=float32)
  Keyword arguments: {}

Expected these arguments to match one of the following 3 option(s):

Option 1:
  Positional arguments (1 total):
    * TensorSpec(shape=(1, 3), dtype=tf.float32, name='x')
  Keyword arguments: {}

Option 2:
  Positional arguments (1 total):
    * TensorSpec(shape=(2, 3), dtype=tf.float32, name='x')
  Keyword arguments: {}

Option 3:
  Positional arguments (1 total):
    * TensorSpec(shape=(1, 2, 3), dtype=tf.float32, name='x')
  Keyword arguments: {}

# 计算图Graph可视化

用tf.function修饰前向过程__call__的模型，会在第一次前向过程中，在内存中生成静态图tf.Graph  
生成静态图tf.Graph的过程叫做trace  
  
retrace  
    不同数据signature（shape和dtype定义了signature）的input会导致retrace  
    传入取值不同的python原生参数（非Tensor的python args）会导致retrace：每出现一个新的取值就会重新trace一次  
    @tf.function写在了循环内部  

（解释下为什么@tf.function写在了循环内部会导致retrace：  
for in loop:  
----@tf.function # 修饰  
----def usr_def_func: code # 定义  
----usr_def_func(input)  # 调用  
在循环内部，每一次定义usr_def_func时，都会申请一块新内存，来存储修饰好的usr_def_func。而上一次循环中申请的内存，存储着的usr_def_func，丢了python名字，但因为调用过，所以trace建立了计算图，所以内存还没有释放。  
也就是说，在新一次循环中，虽然函数名字没变，但它已经指向了新的内存块。此时的函数调用，其实是对新内存块中的函数的第一次调用，于是就会retrace。）


若同一个tf.function频繁retrace（例如最近若干次调用中有5次触发了retrace），tensorflow会报warning，提示tracing开销很大

### tensorboard