# 保存.h5模型，这里使用的是MobileNetV2，而且是在ImageNet上预训练过的

In [1]:
import os
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2 as Net

# Make sure the output directory exists before anything is written into it.
os.makedirs('./model', exist_ok=True)

# Build MobileNetV2 with its ImageNet weights and store it as a single .h5 file.
model = Net(weights='imagenet')
model.save("./model/mobilenetv2.h5")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# 将.h5文件转化成frozen graph，并保存

In [2]:
import tensorflow as tf
from tensorflow.python.framework import graph_io
from tensorflow.keras.models import load_model
from tensorflow.python.compiler.tensorrt import trt_convert as trt


# Keras checkpoint to load, and the directory that receives the frozen graph.
model_fname = './model/mobilenetv2.h5'
save_pb_dir = './model'

# Reset the Keras backend session left over from earlier cells.
tf.keras.backend.clear_session()
def freeze_graph(graph, session, output, save_pb_dir='.', save_pb_name='fp32_frozen_graph.pb', save_pb_as_text=False):
    """Freeze ``graph`` into a constants-only GraphDef and write it to disk.

    Args:
        graph: Graph whose definition is frozen.
        session: Session handed to ``convert_variables_to_constants`` so the
            variable values can be read.
        output: List of output node names that must survive freezing.
        save_pb_dir: Directory the graph file is written to.
        save_pb_name: File name of the written graph.
        save_pb_as_text: Forwarded as ``as_text`` to ``graph_io.write_graph``.

    Returns:
        The frozen GraphDef that was written.
    """
    # The bare tf.graph_util.* aliases are deprecated; TensorFlow's own
    # deprecation notice points to the tf.compat.v1 endpoints used here.
    graph_util = tf.compat.v1.graph_util
    with graph.as_default():
        # Protect the requested outputs so the pruning pass can never drop a
        # node that convert_variables_to_constants is about to look up by name.
        graphdef_inf = graph_util.remove_training_nodes(
            graph.as_graph_def(), protected_nodes=output)
        graphdef_frozen = graph_util.convert_variables_to_constants(
            session, graphdef_inf, output)
        graph_io.write_graph(graphdef_frozen, save_pb_dir, save_pb_name, as_text=save_pb_as_text)
        return graphdef_frozen

# Put Keras into inference mode (learning phase 0); this has to happen
# before the model is loaded.
tf.keras.backend.set_learning_phase(0)

model = load_model(model_fname)
session = tf.keras.backend.get_session()

# Graph node names of the model's inputs and outputs. Later cells refer to
# these nodes by name, so print them for reference.
input_names = list(map(lambda tensor: tensor.op.name, model.inputs))
output_names = list(map(lambda tensor: tensor.op.name, model.outputs))
print(input_names, output_names)

fp32_frozen_graph = freeze_graph(
    session.graph,
    session,
    output_names,
    save_pb_dir=save_pb_dir,
)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
['input_1'] ['Logits/Softmax']
Instructions for updating:
Use `tf.compat.v1.graph_util.remove_training_nodes`
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 262 variables.
INFO:tensorflow:Converted 262 variables to const ops.


# 将frozen graph转化成FP32的.tflite

In [3]:
import tensorflow as tf

graph_def_file = "./model/fp32_frozen_graph.pb"
input_arrays = ["input_1"]
output_arrays = ['Logits/Softmax']

# Convert the frozen graph to a TFLite flatbuffer; no optimizations are set here.
converter = tf.lite.TFLiteConverter.from_frozen_graph(
  graph_def_file, input_arrays, output_arrays)
tflite_model = converter.convert()

# Write through a context manager so the handle is flushed and closed right
# away instead of whenever the anonymous file object gets garbage-collected.
with open("./model/fp32_frozen_graph.tflite", "wb") as tflite_file:
    tflite_file.write(tflite_model)

# Size in bytes of the serialized model, shown as the cell's result.
len(tflite_model)

13973776

# 读取一张图片用于calibration

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input, decode_predictions
# Sample picture used below; it is resized to the spatial part of image_size.
image_size = [224, 224, 3]
img_path = './elephant.jpg'

img = image.load_img(img_path, target_size=image_size[:2])
# Image -> float array -> add a leading batch axis -> MobileNetV2 preprocessing.
x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))

# 转化成.tflite过程中引入integer quantization，此处是FP32->INT8的转化，所以要引入calibration

In [5]:
num_calibration_batches = 2
BATCH_SIZE = 1
# Pre-allocated float32 buffer with one (224, 224, 3) slot per calibration sample.
batched_input = np.zeros((BATCH_SIZE * num_calibration_batches, 224, 224, 3), dtype=np.float32)

# `x` is already a NumPy batch, so it is copied straight into the buffer.
# Routing it through a tf.Session (constant -> sess.run) only returned the
# same values at the cost of building a graph and a session.
for i in range(num_calibration_batches):
    batch_slice = slice(i * BATCH_SIZE, (i + 1) * BATCH_SIZE)
    print(batched_input[batch_slice, :].shape, x.shape)
    batched_input[batch_slice, :] = x

print('Calibration data shape: ', batched_input.shape)

def calibration_input_fn_gen():
    """Yield the calibration data one batch at a time.

    Each yielded item is a one-element list holding a slice of
    ``batched_input`` that is ``BATCH_SIZE`` samples long;
    ``num_calibration_batches`` items are produced in total.
    """
    start = 0
    for _ in range(num_calibration_batches):
        stop = start + BATCH_SIZE
        yield [batched_input[start:stop, :]]
        start = stop

# Generator instance created from the function above.
calibration_input_fn = calibration_input_fn_gen()

(1, 224, 224, 3) (1, 224, 224, 3)
(1, 224, 224, 3) (1, 224, 224, 3)
Calibration data shape:  (2, 224, 224, 3)


In [6]:
import tensorflow as tf

graph_def_file = "./model/fp32_frozen_graph.pb"
input_arrays = ["input_1"]
output_arrays = ['Logits/Softmax']

converter = tf.lite.TFLiteConverter.from_frozen_graph(
  graph_def_file, input_arrays, output_arrays)
# Enable the default optimizations and give the converter the generator
# function that supplies the calibration samples.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = calibration_input_fn_gen
tflite_model = converter.convert()

# Write through a context manager so the handle is flushed and closed right
# away instead of whenever the anonymous file object gets garbage-collected.
with open("./model/int8_frozen_graph.tflite", "wb") as tflite_file:
    tflite_file.write(tflite_model)

# Size in bytes of the serialized model, shown as the cell's result.
len(tflite_model)

3979896

# 封装好的Inference函数

In [7]:
def tfLiteInference(input_details, interpreter, output_details, x):
    """Run one inference pass on ``interpreter`` and return its first output.

    Args:
        input_details: Sequence of input descriptors; the ``'index'`` entry of
            the first one selects the tensor that receives ``x``.
        interpreter: Object exposing ``set_tensor``, ``invoke`` and ``get_tensor``.
        output_details: Sequence of output descriptors; the ``'index'`` entry of
            the first one selects the tensor that is read back.
        x: Input data written into the input tensor.

    Returns:
        Whatever ``interpreter.get_tensor`` returns for the first output.
    """
    feed_slot = input_details[0]['index']
    interpreter.set_tensor(feed_slot, x)
    interpreter.invoke()

    # get_tensor() hands back a copy of the tensor data; tensor() would give
    # a pointer to the tensor instead.
    fetch_slot = output_details[0]['index']
    return interpreter.get_tensor(fetch_slot)

# 测试FP32的.tflite

In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input, decode_predictions

# Create the interpreter for the FP32 TFLite file and allocate its tensors.
interpreter = tf.lite.Interpreter(model_path="./model/fp32_frozen_graph.tflite")
interpreter.allocate_tensors()

# Descriptors of the model's input and output tensors.
input_details, output_details = (
    interpreter.get_input_details(),
    interpreter.get_output_details(),
)

# Test picture, resized to the spatial part of image_size.
image_size = [224, 224, 3]
img_path = './elephant.jpg'
img = image.load_img(img_path, target_size=image_size[:2])

# Float array -> leading batch axis -> MobileNetV2 preprocessing.
x = image.img_to_array(img)
x = preprocess_input(np.expand_dims(x, axis=0))

In [9]:
# test FP32 CPU
import time
times = []

# One call outside the timed loop, used to check the prediction.
output_data = tfLiteInference(input_details, interpreter, output_details, x)

# decode the results into a list of tuples (class, description, probability)
# (one such list for each sample in the batch)
print('Predicted:', decode_predictions(output_data, top=3)[0])

# time.perf_counter() is the clock intended for measuring short durations:
# it is monotonic and uses the highest available resolution, whereas
# time.time() is wall-clock time that can jump and may be coarse.
for i in range(2000):
    start_time = time.perf_counter()
    output_data = tfLiteInference(input_details, interpreter, output_details, x)
    delta = (time.perf_counter() - start_time)
    times.append(delta)
mean_delta = np.array(times).mean()
fps = 1 / mean_delta
print('average(sec):{:.2f},fps:{:.2f}'.format(mean_delta, fps))

Predicted: [('n02504458', 'African_elephant', 0.40645647), ('n02504013', 'Indian_elephant', 0.26055372), ('n01871265', 'tusker', 0.14443965)]
average(sec):0.02,fps:64.40


我们可以看到FP32模型的FPS约为64

# 测试INT8的.tflite

In [10]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input, decode_predictions

# Create the interpreter for the INT8 TFLite file and allocate its tensors.
interpreter = tf.lite.Interpreter(model_path="./model/int8_frozen_graph.tflite")
interpreter.allocate_tensors()

# Descriptors of the model's input and output tensors.
input_details, output_details = (
    interpreter.get_input_details(),
    interpreter.get_output_details(),
)

# Test picture, resized to the spatial part of image_size.
image_size = [224, 224, 3]
img_path = './elephant.jpg'
img = image.load_img(img_path, target_size=image_size[:2])

# Float array -> leading batch axis -> MobileNetV2 preprocessing.
x = image.img_to_array(img)
x = preprocess_input(np.expand_dims(x, axis=0))

In [11]:
# test INT8 CPU
import time
times = []

# One call outside the timed loop, used to check the prediction.
output_data = tfLiteInference(input_details, interpreter, output_details, x)

# decode the results into a list of tuples (class, description, probability)
# (one such list for each sample in the batch)
print('Predicted:', decode_predictions(output_data, top=3)[0])

# time.perf_counter() is the clock intended for measuring short durations:
# it is monotonic and uses the highest available resolution, whereas
# time.time() is wall-clock time that can jump and may be coarse.
for i in range(20):
    start_time = time.perf_counter()
    output_data = tfLiteInference(input_details, interpreter, output_details, x)
    delta = (time.perf_counter() - start_time)
    times.append(delta)
mean_delta = np.array(times).mean()
fps = 1 / mean_delta
print('average(sec):{:.2f},fps:{:.2f}'.format(mean_delta, fps))

Predicted: [('n02504458', 'African_elephant', 0.45703125), ('n02504013', 'Indian_elephant', 0.18359375), ('n01871265', 'tusker', 0.13671875)]
average(sec):0.91,fps:1.10


我们可以看到INT8模型的FPS约为1

INT8模型的FPS（约1）远低于FP32模型的FPS（约64），与量化后推理应当更快的预期相反；造成这一差距的原因还需要进一步深入分析。