# Description

This notebook simply demonstrates how to perform the post-training quantization technique.

In [8]:
import os
# Set before the TensorFlow import below so the native logger picks it up.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import random
random.seed(42)
import tensorflow as tf

# Enable on-demand GPU memory allocation. The setting has to be identical on
# every visible GPU, so apply it to all of them rather than only the first.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Raised when the GPUs were already initialised; report and continue.
        print(e)

# Seed Python, NumPy and TensorFlow together so weight initialisation and
# data shuffling are repeatable across runs (random.seed alone is not enough).
tf.keras.utils.set_random_seed(42)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Layer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Flatten, Conv2D, MaxPooling2D, Dropout

In [6]:
# Download (or read from the local Keras cache) the MNIST train/test splits.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Scale pixel intensities into [0, 1] and append a trailing channel axis,
# giving arrays of shape (num_samples, 28, 28, 1).
x_train = (x_train / 255.0).reshape(-1, 28, 28, 1)
x_test = (x_test / 255.0).reshape(-1, 28, 28, 1)

In [14]:
# Custom Dense Layer
class CustomDense(Layer):
    """Fully connected layer whose kernel can be quantized after training.

    While the float kernel ``w`` exists the layer computes
    ``activation(inputs @ w + b)``. ``post_training_quantization`` stores a
    quantized copy of ``w`` together with its scale and zero point, deletes
    ``w``, and from then on ``call`` de-quantizes the stored kernel on the fly.

    Args:
        units: Number of output features.
        activation: Identifier resolved through ``tf.keras.activations.get``.
        quantized_type: Integer ``tf.DType`` used to store the quantized kernel.
        quantized_technique: ``'symmetric'`` or ``'asymmetric'``.
        **kwargs: Forwarded unchanged to ``Layer.__init__`` (e.g. ``name``).
    """

    def __init__(self, units, activation=None, quantized_type=tf.int8, quantized_technique='asymmetric', **kwargs):
        super(CustomDense, self).__init__(**kwargs)
        self.units = units
        self.activation = tf.keras.activations.get(activation)
        self.quantized_type = quantized_type
        self.quantized_technique = quantized_technique  # symmetric or asymmetric

    @staticmethod
    def _safe_scale(scale):
        """Return ``scale``, replaced by 1.0 when it is not strictly positive.

        A zero scale (all-zero or constant tensor) would otherwise make the
        divisions in the quantizers produce NaN/inf values.
        """
        return tf.where(scale > 0.0, scale, tf.ones_like(scale))

    def get_symmetric_quantize_matrix(self, x, target_dtype):
        """Quantize ``x`` with a zero point fixed at 0.

        Args:
            x: Float tensor to quantize.
            target_dtype: Integer ``tf.DType`` of the result.

        Returns:
            Tuple ``(quantized, scale, zero_point)`` where ``quantized`` has
            dtype ``target_dtype`` and ``x ~= quantized * scale``.
        """
        # Define the range for target type
        qmin = tf.constant(target_dtype.min, dtype=tf.float32)
        qmax = tf.constant(target_dtype.max, dtype=tf.float32)

        # Compute the scale factor (guarded against an all-zero tensor)
        max_abs_val = tf.reduce_max(tf.abs(x))
        scale = self._safe_scale(max_abs_val / qmax)
        zero_point = 0.0

        # Quantization and Clip to ensure quantized values are within the range
        quantized = tf.round(x / scale)
        quantized = tf.clip_by_value(quantized, qmin, qmax)

        return tf.cast(quantized, target_dtype), scale, zero_point


    def get_asymmetric_quantize_matrix(self, x, target_dtype):
        """Quantize ``x`` by mapping ``[min(x), max(x)]`` onto the full integer range.

        Args:
            x: Float tensor to quantize.
            target_dtype: Integer ``tf.DType`` of the result.

        Returns:
            Tuple ``(quantized, scale, zero_point)`` where ``quantized`` has
            dtype ``target_dtype`` and
            ``x ~= (quantized - zero_point) * scale``. The zero point is kept
            as an un-rounded float.
        """
        # Define the range for the target type
        qmin = tf.constant(target_dtype.min, dtype=tf.float32)
        qmax = tf.constant(target_dtype.max, dtype=tf.float32)

        # Compute the min and max for the actual tensor
        x_min = tf.reduce_min(x)
        x_max = tf.reduce_max(x)

        # Scale and zero point calculations (guarded against a constant tensor)
        scale = self._safe_scale((x_max - x_min) / (qmax - qmin))
        zero_point = tf.cast(qmin - x_min / scale, tf.float32)

        # Quantization and Clip to ensure quantized values are within the range
        quantized = tf.round((x - x_min) / scale + qmin)
        quantized = tf.clip_by_value(quantized, qmin, qmax)

        return tf.cast(quantized, target_dtype), scale, zero_point


    def build(self, input_shape):
        """Create the float kernel/bias and the storage for the quantized kernel."""
        # Initialize weights and bias
        self.w = self.add_weight(shape=(input_shape[-1], self.units), initializer='random_normal', trainable=True)
        self.b = self.add_weight(shape=(self.units,), initializer='random_normal', trainable=True)

        # Non-trainable slots filled in by post_training_quantization().
        self.w_quantized = self.add_weight(shape=(input_shape[-1], self.units), dtype=self.quantized_type, initializer='zeros', trainable=False)
        self.quantized_scale = self.add_weight(shape=(), initializer='ones', trainable=False)
        self.zero_point = self.add_weight(shape=(), initializer='ones', trainable=False)

        super(CustomDense, self).build(input_shape)  # Be sure to call this at the end


    def post_training_quantization(self):
        """Quantize the trained kernel and drop the float copy.

        Intended to be called once training has finished. Calling it again on
        an already quantized layer is a no-op.

        Raises:
            ValueError: If ``quantized_technique`` is neither ``'symmetric'``
                nor ``'asymmetric'``.
        """
        # Already quantized: the float kernel is gone but its quantized copy exists.
        if hasattr(self, 'w_quantized') and not hasattr(self, 'w'):
            return

        # Pick the quantizer
        if self.quantized_technique == 'symmetric':
            quantize = self.get_symmetric_quantize_matrix
        elif self.quantized_technique == 'asymmetric':
            quantize = self.get_asymmetric_quantize_matrix
        else:
            raise ValueError(
                "quantized_technique must be 'symmetric' or 'asymmetric', "
                f"got {self.quantized_technique!r}"
            )

        w_quantized, quantized_scale, zero_point = quantize(self.w, target_dtype=self.quantized_type)

        self.w_quantized.assign(w_quantized)
        self.quantized_scale.assign(quantized_scale)
        self.zero_point.assign(zero_point)

        # Removing the attribute is what switches call() to the quantized path.
        del self.w

    def call(self, inputs):
        """Apply the dense transformation using the float or the quantized kernel."""
        # NOTE(review): hasattr() is a Python-level check, so a graph traced
        # before quantization may keep using the float kernel - confirm that
        # evaluate/predict functions are (re)built after quantization.
        if hasattr(self, 'w'):
            z = tf.matmul(inputs, self.w) + self.b  # Linear transformation
        else:
            # De-quantize: w ~= (w_quantized - zero_point) * scale
            w_dequantized = tf.cast(self.w_quantized, tf.float32) - self.zero_point
            w_dequantized = tf.multiply(w_dequantized, self.quantized_scale)
            z = tf.matmul(inputs, w_dequantized) + self.b

        if self.activation:
            z = self.activation(z)
        return z

In [22]:
# Start from a clean Keras state so layer names are not suffixed on re-runs.
tf.keras.backend.clear_session()
model = Sequential([
    # Declared shape matches the (28, 28, 1) arrays produced by the
    # preprocessing cell instead of relying on an implicit shape adjustment.
    Flatten(input_shape=(28, 28, 1)),  # Flatten 28x28x1 images to 1D vectors
    CustomDense(128, activation='relu'),  # Add a fully connected layer
    CustomDense(10, activation='softmax'),  # Output layer with 10 neurons for classification
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 custom_dense (CustomDense)  (None, 128)               200834    
                                                                 
 custom_dense_1 (CustomDens  (None, 10)                2572      
 e)                                                              
                                                                 
Total params: 203406 (496.80 KB)
Trainable params: 101770 (397.54 KB)
Non-trainable params: 101636 (99.27 KB)
_________________________________________________________________


In [23]:
class Post_Quantization_Callback(tf.keras.callbacks.Callback):
    """Callback that quantizes every CustomDense layer once training ends."""

    def __init__(self):
        super().__init__()

    def on_train_end(self, logs=None):
        """Keras hook invoked at the end of training; runs the quantization pass."""
        self.quantization_layer()

    def quantization_layer(self):
        """Call post_training_quantization() on each CustomDense found in the model."""
        dense_layers = [
            module for module in self.model.submodules
            if isinstance(module, CustomDense)
        ]
        for dense in dense_layers:
            dense.post_training_quantization()

In [24]:
# Integer class labels, so use the sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# The callback quantizes the CustomDense layers when fit() finishes.
callbacks = [Post_Quantization_Callback()]

model.fit(x_train, y_train, epochs=5, callbacks=callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f1498124970>

In [25]:
# Summary after training: the on_train_end callback has already run
# post_training_quantization() on each CustomDense layer, which deletes the
# float kernel `w`, so the parameter counts differ from the first summary.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 custom_dense (CustomDense)  (None, 128)               100482    
                                                                 
 custom_dense_1 (CustomDens  (None, 10)                1292      
 e)                                                              
                                                                 
Total params: 101774 (99.80 KB)
Trainable params: 138 (552.00 Byte)
Non-trainable params: 101636 (99.27 KB)
_________________________________________________________________


In [26]:
# Evaluate the trained (and now quantized) model on the held-out test split;
# the result lists the loss followed by the compiled 'accuracy' metric.
model.evaluate(x_test, y_test)



[0.0762646421790123, 0.9775999784469604]