In [1]:
import tensorflow as tf
converter = tf.lite.TFLiteConverter.from_saved_model("saved_model/mnist_cnn")
converter.optimizations = [tf.lite.Optimize.DEFAULT]
quant_model = converter.convert()
with open("model_quant.tflite", "wb") as f:
    f.write(quant_model)

In [3]:
from keras.layers import TFSMLayer
model = TFSMLayer("saved_model/mnist_cnn", call_endpoint="serving_default")

In [5]:
converter = tf.lite.TFLiteConverter.from_saved_model("saved_model/mnist_cnn")
converter.optimizations = [tf.lite.Optimize.DEFAULT]
quant_model = converter.convert()

In [7]:
with open("model_fp32.tflite", "wb") as f: f.write(tf.lite.TFLiteConverter.from_saved_model("saved_model/mnist_cnn").convert())
with open("model_quant.tflite", "wb") as f: f.write(quant_model)

In [9]:
import os
print("FP32 Size:", os.path.getsize("model_fp32.tflite"))
print("Quantized Size:", os.path.getsize("model_quant.tflite"))

FP32 Size: 1391304
Quantized Size: 353696


In [11]:
interpreter_fp32 = tf.lite.Interpreter(model_path="model_fp32.tflite")
interpreter_quant = tf.lite.Interpreter(model_path="model_quant.tflite")
interpreter_fp32.allocate_tensors(); interpreter_quant.allocate_tensors()

In [14]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_test = x_test.astype("float32") / 255.0
x_test = x_test[..., None] 
y_test = to_categorical(y_test)

In [18]:
import time
import numpy as np
def evaluate_model(interpreter, test_images, test_labels):
    input_index = interpreter.get_input_details()[0]['index']
    output_index = interpreter.get_output_details()[0]['index']
    correct = 0
    total_time = 0
    for i in range(len(test_images)):
        img = np.expand_dims(test_images[i], axis=0).astype(np.float32)
        interpreter.set_tensor(input_index, img)
        start = time.time()
        interpreter.invoke()
        total_time += time.time() - start
        output = interpreter.get_tensor(output_index)
        if np.argmax(output) == np.argmax(test_labels[i]):
            correct += 1
    return correct / len(test_images), total_time / len(test_images)

acc_fp32, time_fp32 = evaluate_model(interpreter_fp32, x_test, y_test)
acc_quant, time_quant = evaluate_model(interpreter_quant, x_test, y_test)


In [20]:
print(f"{'Model':<15}{'Size (KB)':<15}{'Accuracy':<15}{'Time (s/sample)':<20}")
print(f"{'FP32':<15}{os.path.getsize('model_fp32.tflite')//1024:<15}{acc_fp32:<15.4f}{time_fp32:<20.4f}")
print(f"{'Quantized':<15}{os.path.getsize('model_quant.tflite')//1024:<15}{acc_quant:<15.4f}{time_quant:<20.4f}")

Model          Size (KB)      Accuracy       Time (s/sample)     
FP32           1358           0.9840         0.0001              
Quantized      345            0.9840         0.0000              
