## Setup

In [1]:
! pip install -q tensorflow
! pip install -q tensorflow-model-optimization

In [2]:
import tensorflow as tf
tf.__version__

'2.8.0'

In [3]:
# Load the Fashion-MNIST train/test split shipped with Keras.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# Scale both image arrays by 1/255 before they are fed to the network.
x_train, x_test = x_train / 255., x_test / 255.

x_train.shape, x_test.shape

((60000, 28, 28), (10000, 28, 28))

In [4]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, InputLayer, Reshape, Conv2D, Flatten, BatchNormalization

### BatchNorm should come RIGHT AFTER the Conv layer (i.e. before the activation)

In [5]:
from tensorflow.keras import layers
from tensorflow.keras import activations

In [6]:
def conv_block(model,filters):
    """Append a Conv2D -> BatchNormalization -> ReLU stack to ``model`` in place.

    Args:
        model: object exposing ``add`` (a Keras ``Sequential`` in this notebook).
        filters: number of output filters of the 3x3 convolution.

    Returns:
        None; ``model`` is mutated.
    """
    # The convolution has no activation of its own, so BatchNorm directly
    # follows it and ReLU is appended as a separate layer afterwards.
    stack = (
        Conv2D(filters=filters, kernel_size=(3, 3)),
        BatchNormalization(),
        layers.Activation(activations.relu),
    )
    for piece in stack:
        model.add(piece)

### Note the Sequential-API model declaration kept as a string below; we'll investigate how that variant behaves later on

In [7]:
"""
model = Sequential([
    InputLayer(input_shape=(28,28)),
    # RESHAPE LAYER IS IMPORTANT FOR QUANTIZATION
    Reshape(target_shape=(28,28,1)),

    Conv2D(filters=8, kernel_size=(3, 3),activation='relu'),
    BatchNormalization(),
    ReLU(),

    Conv2D(filters=8, kernel_size=(3, 3),activation='relu'),
    BatchNormalization(),

    Flatten(),
    Dense(units=15, activation='relu'),
    Dense(units=10, activation="softmax"),    
])
"""
# Baseline network, assembled layer by layer:
#   input (28, 28) -> reshape to (28, 28, 1) -> two conv blocks -> flatten
#   -> Dense(15) + ReLU -> Dense(10) + softmax.
model = Sequential()

model.add(InputLayer(input_shape=(28,28)))
# Add the trailing channel axis expected by Conv2D.
model.add(Reshape(target_shape=(28,28,1)))

conv_block(model,filters=3)

conv_block(model,filters=8)

model.add(Flatten())

# Activations are kept as separate layers rather than fused into Dense.
model.add(Dense(15))
model.add(layers.Activation(activations.relu))

model.add(Dense(10))
model.add(layers.Activation(activations.softmax))

# The build shape includes the batch axis, matching the other build() calls
# in this notebook; (28, 28) alone would read as "batch of 28 vectors".
model.build(input_shape=(None,28,28))
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 28, 28, 1)         0         
                                                                 
 conv2d (Conv2D)             (None, 26, 26, 3)         30        
                                                                 
 batch_normalization (BatchN  (None, 26, 26, 3)        12        
 ormalization)                                                   
                                                                 
 activation (Activation)     (None, 26, 26, 3)         0         
                                                                 
 conv2d_1 (Conv2D)           (None, 24, 24, 8)         224       
                                                                 
 batch_normalization_1 (Batc  (None, 24, 24, 8)        32        
 hNormalization)                                        

In [8]:
# Compile and train the float baseline on the full training set,
# holding out the last 20% of it for validation.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# The object returned by fit() is not used afterwards.
_ = model.fit(
    x_train,
    y_train,
    epochs=5,
    validation_split=0.2,
    batch_size=64,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
# Helper to quickly evaluate a given model on the test split.
def model_eval(model):
    """Evaluate ``model`` on the notebook-level test data and print the result.

    Reads the global ``x_test`` / ``y_test`` arrays.

    Args:
        model: compiled model whose ``evaluate`` yields a ``(loss, accuracy)`` pair.

    Returns:
        The test accuracy.
    """
    metrics = model.evaluate(x_test,y_test)
    test_loss, test_accuracy = metrics
    print("test loss: %.4f, test accuracy: %.4f" % (test_loss, test_accuracy))
    return test_accuracy

In [10]:
# Keep the float baseline accuracy; it is printed next to the quantized accuracy later.
test_acc = model_eval(model)

test loss: 0.3574, test accuracy: 0.8758


### For later use in the experiments, we define the following helper functions

In [11]:
!pip install pyyaml h5py 



In [12]:
import tempfile

def get_model():
  """Build a fresh, uncompiled copy of the baseline architecture.

  The layer stack repeats the baseline model defined earlier in the notebook:
  input (28, 28) -> reshape (28, 28, 1) -> conv_block(3) -> conv_block(8)
  -> flatten -> Dense(15) + ReLU -> Dense(10) + softmax.

  Returns:
      A built ``Sequential`` model with newly initialised weights.
  """
  model = Sequential()

  model.add(InputLayer(input_shape=(28,28)))
  # Add the trailing channel axis expected by Conv2D.
  model.add(Reshape(target_shape=(28,28,1)))

  conv_block(model,filters=3)

  conv_block(model,filters=8)

  model.add(Flatten())

  model.add(Dense(15))
  model.add(layers.Activation(activations.relu))

  model.add(Dense(10))
  model.add(layers.Activation(activations.softmax))

  # The build shape includes the batch axis, matching the other build()
  # calls in this notebook; (28, 28) alone would read as "batch of 28 vectors".
  model.build(input_shape=(None,28,28))

  return model

# Reserve a temporary path for the trained weights. Only the path is needed,
# so the handle is closed right away (tempfile.mkstemp would leave an open
# OS-level file descriptor behind); delete=False keeps the file on disk.
with tempfile.NamedTemporaryFile(suffix='.tf', delete=False) as _weights_file:
    pretrained_weights = _weights_file.name

# Snapshot the weights of the trained baseline so setup_model() can restore them.
model.save_weights(pretrained_weights)

In [13]:
def setup_model():
  """Return a compiled copy of the baseline carrying the saved weights.

  Builds the architecture with ``get_model()``, loads the weights stored at
  the global ``pretrained_weights`` path and compiles with the same
  optimizer / loss / metric settings used for the baseline.
  """
  restored = get_model()
  restored.load_weights(pretrained_weights)
  restored.compile(
      optimizer=tf.keras.optimizers.Adam(1e-3),
      loss='sparse_categorical_crossentropy',
      metrics=['accuracy'],
  )
  return restored

# Sanity check: rebuild the network from the saved weights and evaluate it.
_model = setup_model()
_ = model_eval(_model) # the recorded loss/accuracy match the baseline evaluation above

test loss: 0.3574, test accuracy: 0.8758


### We'll conduct quantization-aware training below
### First we make the model "quantization aware", then re-compile and fine-tune it. Finally we compare the accuracy and storage size

In [14]:
import tensorflow_model_optimization as tfmot

### Since BatchNorm is not supported by default, we need to pass it to the quantized model by hand

In [15]:
# Short aliases for the tfmot annotation / scope helpers used in the cells below.
quantize_annotate_layer = tfmot.quantization.keras.quantize_annotate_layer
quantize_annotate_model = tfmot.quantization.keras.quantize_annotate_model
quantize_scope = tfmot.quantization.keras.quantize_scope

In [16]:
# Short aliases for the two tfmot quantizer classes used by the QuantizeConfig classes below.
LastValueQuantizer = tfmot.quantization.keras.quantizers.LastValueQuantizer
MovingAverageQuantizer = tfmot.quantization.keras.quantizers.MovingAverageQuantizer

class BNQuantizeConfig(tfmot.quantization.keras.QuantizeConfig):
    """QuantizeConfig for BatchNormalization layers: quantizes gamma and beta only.

    Both parameters get an 8-bit, symmetric ``MovingAverageQuantizer``
    (``narrow_range=False``, ``per_axis=False``). No activation or output
    quantizers are attached.
    """

    def get_weights_and_quantizers(self, layer):
        """Pair ``layer.gamma`` and ``layer.beta`` (in that order) with a quantizer each."""
        quantizer_kwargs = dict(num_bits=8, symmetric=True, narrow_range=False, per_axis=False)
        gamma_quantizer = MovingAverageQuantizer(**quantizer_kwargs)
        beta_quantizer = MovingAverageQuantizer(**quantizer_kwargs)
        return [(layer.gamma, gamma_quantizer), (layer.beta, beta_quantizer)]

    def get_activations_and_quantizers(self, layer):
        """No activations are quantized."""
        return list()

    def set_quantize_weights(self, layer, quantize_weights):
        """Write the quantized tensors back, in the order used by ``get_weights_and_quantizers``."""
        layer.gamma = quantize_weights[0]
        layer.beta = quantize_weights[1]
        #print(quantize_weights,"\n",quantize_weights[0]) ## un-comment this to see how BatchNorm behaves

    def set_quantize_activations(self, layer, quantize_activations):
        """Nothing to set because no activation quantizers are declared."""
        return None

    def get_output_quantizers(self, layer):
        """Leave the layer output unquantized by returning an empty list."""
        return list()

    def get_config(self):
        """The config has no constructor arguments, so there is nothing to serialize."""
        return dict()


#### Weights would be the same when converting layers to "quant aware" layers

In [17]:
def quantized_model(model):
    """Turn ``model`` into a quantization-aware model and print its summary.

    BatchNormalization layers are annotated with ``BNQuantizeConfig``; all other
    layers are handed to ``quantize_annotate_model`` unchanged. The layer
    objects of ``model`` are passed on as-is (they are not copied here).

    Args:
        model: Keras model whose ``layers`` are re-assembled into a ``Sequential``.

    Returns:
        The model returned by ``tfmot.quantization.keras.quantize_apply``.
    """
    annotated_layers = []
    for layer in model.layers:
        if isinstance(layer, BatchNormalization):
            layer = quantize_annotate_layer(layer, BNQuantizeConfig())
        annotated_layers.append(layer)

    q_model = quantize_annotate_model(Sequential(annotated_layers))

    # Build with a free batch axis followed by the shape of one training sample,
    # so that the summary can be printed.
    sample_shape = x_train[0].shape
    q_model.build(input_shape=(None, *sample_shape))

    # `quantize_apply` needs the custom config class to be visible through
    # `quantize_scope` under its name.
    scope_objects = {'BNQuantizeConfig': BNQuantizeConfig}
    with quantize_scope(scope_objects):
        q_aware = tfmot.quantization.keras.quantize_apply(q_model)

    q_aware.summary()

    return q_aware


In [18]:
# Quantization-aware version of the trained baseline (quantized_model also prints its summary).
q_aware = quantized_model(model)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 quantize_layer (QuantizeLay  (None, 28, 28)           3         
 er)                                                             
                                                                 
 quant_reshape (QuantizeWrap  (None, 28, 28, 1)        1         
 perV2)                                                          
                                                                 
 quant_conv2d (QuantizeWrapp  (None, 26, 26, 3)        39        
 erV2)                                                           
                                                                 
 quant_batch_normalization (  (None, 26, 26, 3)        17        
 QuantizeWrapperV2)                                              
                                                                 
 quant_activation (QuantizeW  (None, 26, 26, 3)       

In [19]:
# Randomly choose 1000 distinct images out of the training dataset.
import numpy as np

# A seeded generator makes the subset identical on every run, and
# replace=False guarantees no image is drawn twice (duplicates could otherwise
# end up on both sides of the validation split used during fine-tuning).
_subset_rng = np.random.default_rng(0)
idx = _subset_rng.choice(len(x_train), size=1000, replace=False)

x_train_, y_train_ = x_train[idx], y_train[idx]

x_train_.shape, y_train_.shape

((1000, 28, 28), (1000,))

In [20]:
# Training helper for quantization-aware models,
# kept separate from the training code of the float models.
def fit_q_aware(q_aware):
  """Compile ``q_aware``, fine-tune it for one epoch and return its test accuracy.

  Training uses the global subset ``x_train_`` / ``y_train_`` (batch size 256,
  20% validation split); evaluation uses the global ``x_test`` / ``y_test``.

  Args:
      q_aware: quantization-aware Keras model; it is compiled and trained in place.

  Returns:
      The accuracy reported by ``q_aware.evaluate`` on the test data.
  """
  q_aware.compile(
      optimizer=tf.keras.optimizers.Adam(1e-3),
      loss='sparse_categorical_crossentropy',
      metrics=['accuracy'],
  )

  print("training")
  q_aware.fit(x_train_, y_train_, batch_size=256, epochs=1, validation_split=0.2)

  print("\ntesting")
  test_metrics = q_aware.evaluate(x_test, y_test)
  _, q_test_acc = test_metrics
  return q_test_acc
  

#### Since BatchNorm layer is not supported for q-aware training, the accuracy might be a little unstable (could be off by 1~3%)

In [21]:
# Fine-tune q_aware on the 1000-image subset and keep its test accuracy.
q_test_acc = fit_q_aware(q_aware)

# Print the float baseline accuracy next to the quantization-aware one.
print("\nBaseline accuracy: %.4f\nQuantized accuracy: %.4f" % (test_acc,q_test_acc))

training

testing

Baseline accuracy: 0.8758
Quantized accuracy: 0.8593


In [22]:
# Convert a quantization-aware Keras model to a TFLite model.
def quant_to_lite(q_aware):
  """Convert ``q_aware`` with the TFLite converter using ``Optimize.DEFAULT``.

  Args:
      q_aware: Keras model to convert.

  Returns:
      Whatever ``TFLiteConverter.convert()`` returns (the notebook later writes
      it to a file in binary mode and feeds it to ``tf.lite.Interpreter``).
  """
  lite_converter = tf.lite.TFLiteConverter.from_keras_model(q_aware)
  lite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
  converted = lite_converter.convert()
  return converted

#### Is this the answer to the storage size problem?
#### Maybe not, since an identical warning also appears in the example

In [23]:
# TFLite conversion of the fine-tuned quantization-aware model.
lite_model = quant_to_lite(q_aware)



In [24]:
def evaluate_model(interpreter, images=None, labels=None):
    """Run a TFLite interpreter sample by sample and return its accuracy.

    Args:
        interpreter: object with the ``tf.lite.Interpreter`` methods used here
            (``get_input_details``, ``get_output_details``, ``set_tensor``,
            ``invoke``, ``tensor``); its tensors must already be allocated.
        images: iterable of single input samples. Defaults to the global
            ``x_test`` so existing ``evaluate_model(interpreter)`` calls
            behave as before.
        labels: array of expected class indices, one per sample. Defaults to
            the global ``y_test``.

    Returns:
        Fraction of samples whose arg-max output equals the label.
    """
    if images is None:
        images = x_test
    if labels is None:
        labels = y_test

    input_index = interpreter.get_input_details()[0]["index"]
    output_index = interpreter.get_output_details()[0]["index"]

    # Run predictions on every sample of the dataset.
    predictions = np.zeros(len(images))
    for i, image in enumerate(images):

        if i % 1000 == 999:
            print('Evaluated on %5d results so far.' % (i+1))

        # Pre-processing: add batch dimension and convert to float32
        # to match with the model's input data format
        interpreter.set_tensor(input_index,np.expand_dims(image, axis=0).astype(np.float32))

        # Run inference.
        interpreter.invoke()

        # Post-processing: remove batch dimension
        # and pick the class with the highest score
        output = interpreter.tensor(output_index)
        predictions[i] = np.argmax(output()[0])

    # Calculate accuracy against the labels
    accuracy = (predictions == labels).mean()
    return accuracy


In [25]:
def evaluate_lite_model(lite_model):
  """Load ``lite_model`` into a TFLite interpreter and return its accuracy.

  Args:
      lite_model: serialized TFLite model passed as ``model_content``.

  Returns:
      The value returned by ``evaluate_model`` for the allocated interpreter.
  """
  lite_interpreter = tf.lite.Interpreter(model_content=lite_model)
  lite_interpreter.allocate_tensors()
  accuracy = evaluate_model(lite_interpreter)
  return accuracy

# Accuracy of the converted TFLite model, printed next to the accuracy of the
# quantization-aware Keras model it was converted from.
lite_test_accuracy = evaluate_lite_model(lite_model)
print('\nQuant TFLite test_accuracy: %.4f\nQuant TF test accuracy: %10.4f'%(lite_test_accuracy, q_test_acc))

Evaluated on  1000 results so far.
Evaluated on  2000 results so far.
Evaluated on  3000 results so far.
Evaluated on  4000 results so far.
Evaluated on  5000 results so far.
Evaluated on  6000 results so far.
Evaluated on  7000 results so far.
Evaluated on  8000 results so far.
Evaluated on  9000 results so far.
Evaluated on 10000 results so far.

Quant TFLite test_accuracy: 0.8595
Quant TF test accuracy:     0.8593


In [26]:
# Create a float TFLite model (no converter optimizations are set).
def model_to_float(model):
  """Convert ``model`` to TFLite with the converter's default settings.

  Args:
      model: Keras model to convert.

  Returns:
      Whatever ``TFLiteConverter.convert()`` returns for ``model``.
  """
  converted = tf.lite.TFLiteConverter.from_keras_model(model).convert()
  return converted

In [27]:
import os

# Float TFLite conversion of `model`, used as the reference in the size comparison below.
float_lite_model = model_to_float(model)

def write_file(model_):
    """Write serialized model bytes to a new temporary ``.tflite`` file and print its size.

    Args:
        model_: bytes-like object to write (here: the output of a TFLite converter).

    Returns:
        None. The temporary file is left on disk.
    """
    # mkstemp() returns an already-open OS-level file descriptor together with
    # the path. Wrapping that descriptor with os.fdopen() means it is closed
    # when the `with` block exits, instead of leaking one descriptor per call.
    fd, model_file = tempfile.mkstemp('.tflite')
    with os.fdopen(fd, 'wb') as f:
        f.write(model_)
    # File size in bytes divided by 2 ** 20 = 1048576
    print(" model in Mb:", os.path.getsize(model_file) / 1048576.)
    
# Each label is printed without a newline so that it prefixes the
# " model in Mb: ..." line emitted by write_file().
print("\nFloat",end="")
write_file(float_lite_model)

print("Quantized",end="")
write_file(lite_model)

INFO:tensorflow:Assets written to: /tmp/tmplmyci1mf/assets


INFO:tensorflow:Assets written to: /tmp/tmplmyci1mf/assets



Float model in Mb: 0.2691459655761719
Quantized model in Mb: 0.0740509033203125


## To see if BatchNorm affects the storage size, I did the following experiment

In [28]:
def model_without_BN(model):
  """Return a ``Sequential`` made of ``model``'s layers minus BatchNormalization.

  The remaining layer objects are passed on as-is (they are not copied here).

  Args:
      model: Keras model whose ``layers`` are filtered.

  Returns:
      A new ``Sequential`` containing every non-BatchNormalization layer, in order.
  """
  kept_layers = []
  for layer in model.layers:
    if isinstance(layer, BatchNormalization):
      continue
    kept_layers.append(layer)
  return Sequential(kept_layers)

In [29]:
# Variant of the baseline with every BatchNormalization layer dropped.
q2_model = model_without_BN(model)

In [30]:
# Build with a free batch axis so that the summary can be printed.
q2_model.build(input_shape=(None,28,28))
q2_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 28, 28, 1)         0         
                                                                 
 conv2d (Conv2D)             (None, 26, 26, 3)         30        
                                                                 
 activation (Activation)     (None, 26, 26, 3)         0         
                                                                 
 conv2d_1 (Conv2D)           (None, 24, 24, 8)         224       
                                                                 
 activation_1 (Activation)   (None, 24, 24, 8)         0         
                                                                 
 flatten (Flatten)           (None, 4608)              0         
                                                                 
 dense (Dense)               (None, 15)               

### Can we really train for an epoch here after popping out BN?

In [31]:
# Train the BN-free variant for one epoch on the first 100 samples of the subset.
# NOTE(review): q2_model was assembled from the layer objects of `model`
# (see model_without_BN), so this fit presumably also updates the weights
# held by `model` — confirm before relying on `model` afterwards.
q2_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),loss='sparse_categorical_crossentropy',metrics=['accuracy'])
_ = q2_model.fit(x_train_[:100],y_train_[:100],epochs=1)



In [32]:
# Quantization-aware version of the BN-free variant (quantized_model also prints its summary).
q2_aware = quantized_model(q2_model)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 quantize_layer_1 (QuantizeL  (None, 28, 28)           3         
 ayer)                                                           
                                                                 
 quant_reshape (QuantizeWrap  (None, 28, 28, 1)        1         
 perV2)                                                          
                                                                 
 quant_conv2d (QuantizeWrapp  (None, 26, 26, 3)        31        
 erV2)                                                           
                                                                 
 quant_activation (QuantizeW  (None, 26, 26, 3)        3         
 rapperV2)                                                       
                                                                 
 quant_conv2d_1 (QuantizeWra  (None, 24, 24, 8)       

In [33]:
# Fine-tune the BN-free quantization-aware model and keep its test accuracy.
q2_test_acc = fit_q_aware(q2_aware)

training

testing


In [34]:
# Convert the BN-free quantization-aware model to TFLite and measure its accuracy.
lite2_test_acc = evaluate_lite_model(quant_to_lite(q2_aware))



INFO:tensorflow:Assets written to: /tmp/tmpnn3hxamk/assets


INFO:tensorflow:Assets written to: /tmp/tmpnn3hxamk/assets


Evaluated on  1000 results so far.
Evaluated on  2000 results so far.
Evaluated on  3000 results so far.
Evaluated on  4000 results so far.
Evaluated on  5000 results so far.
Evaluated on  6000 results so far.
Evaluated on  7000 results so far.
Evaluated on  8000 results so far.
Evaluated on  9000 results so far.
Evaluated on 10000 results so far.


In [35]:
# Display the TFLite accuracy of the BN-free model.
lite2_test_acc

0.7827

In [36]:
# Evaluate the float BN-free model (q2_model) on the test data.
# NOTE(review): this overwrites `q2_test_acc`, which until now held the
# accuracy of the quantization-aware q2_aware model — confirm this is intended.
q2_test_acc = model_eval(q2_model)

test loss: 1.3352, test accuracy: 0.6793


In [37]:
# Storage size of the TFLite model converted from q_aware (the model with BatchNorm).
write_file(quant_to_lite(q_aware))



 model in Mb: 0.0740509033203125


In [38]:
# Storage size of the TFLite model converted from q2_aware (the model without BatchNorm).
write_file(quant_to_lite(q2_aware))



INFO:tensorflow:Assets written to: /tmp/tmplzdn3nch/assets


INFO:tensorflow:Assets written to: /tmp/tmplzdn3nch/assets


 model in Mb: 0.0713958740234375


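### It turned out that if we build the model layer by layer with separate activation layers (Conv → BatchNorm → ReLU, as above), the model is compressed correctly

### However, if we use the Sequential declaration shown earlier (ReLU fused into Conv2D, followed by BatchNorm) ...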

In [39]:
# Alternative declaration: the whole layer list is passed to Sequential at once,
# and ReLU is fused into each Conv2D, so BatchNormalization comes AFTER the
# activation here (Conv+ReLU -> BN) rather than between Conv and ReLU.
# Both convolutions use 8 filters in this variant.
model_ = Sequential([
    InputLayer(input_shape=(28,28)),
    # RESHAPE LAYER IS IMPORTANT FOR QUANTIZATION
    Reshape(target_shape=(28,28,1)),

    Conv2D(filters=8, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),

    Conv2D(filters=8, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),

    Flatten(),
    Dense(units=15, activation='relu'),
    Dense(units=10, activation="softmax"),    
])

# model_ is neither trained nor fine-tuned in this cell: it is made
# quantization aware, converted to TFLite, and only its file size is reported.
q3_aware = quantized_model(model_)
q3_lite = quant_to_lite(q3_aware)
write_file(q3_lite)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 quantize_layer_2 (QuantizeL  (None, 28, 28)           3         
 ayer)                                                           
                                                                 
 quant_reshape_2 (QuantizeWr  (None, 28, 28, 1)        1         
 apperV2)                                                        
                                                                 
 quant_conv2d_4 (QuantizeWra  (None, 26, 26, 8)        99        
 pperV2)                                                         
                                                                 
 quant_batch_normalization_4  (None, 26, 26, 8)        37        
  (QuantizeWrapperV2)                                            
                                                                 
 quant_conv2d_5 (QuantizeWra  (None, 24, 24, 8)       



 model in Mb: 0.27286529541015625


### The storage size shows little to no decrease (the size is about 0.273Mb)
### I believe this should be better documented to increase the accessibility of the TensorFlow library

## Different QuantConfig() settings

#### Default setting of Dense layer

In [40]:
class DefaultDenseQuantizeConfig(tfmot.quantization.keras.QuantizeConfig):
    """8-bit QuantizeConfig for Dense layers (kernel + activation).

    The kernel gets a symmetric ``LastValueQuantizer``; the activation gets an
    asymmetric ``MovingAverageQuantizer``. Outputs are not quantized separately.
    """

    def get_weights_and_quantizers(self, layer):
        """Pair the Dense kernel with its 8-bit quantizer."""
        kernel_quantizer = LastValueQuantizer(num_bits=8, symmetric=True, narrow_range=False, per_axis=False)
        return [(layer.kernel, kernel_quantizer)]

    def get_activations_and_quantizers(self, layer):
        """Pair the layer activation with its 8-bit quantizer."""
        activation_quantizer = MovingAverageQuantizer(num_bits=8, symmetric=False, narrow_range=False, per_axis=False)
        return [(layer.activation, activation_quantizer)]

    def set_quantize_weights(self, layer, quantize_weights):
        """Write back one entry per item of ``get_weights_and_quantizers``, in the same order."""
        layer.kernel = quantize_weights[0]

    def set_quantize_activations(self, layer, quantize_activations):
        """Write back one entry per item of ``get_activations_and_quantizers``, in the same order."""
        layer.activation = quantize_activations[0]

    def get_output_quantizers(self, layer):
        """No dedicated output quantizers."""
        return list()

    def get_config(self):
        """The config has no constructor arguments, so there is nothing to serialize."""
        return dict()

#### Define a new function for custom changes in QuantizeConfig()

In [41]:
def quantized_custom_model(model,DenseConfig=DefaultDenseQuantizeConfig,DenseConfig_name='DefaultDenseQuantizeConfig',BNConfig=BNQuantizeConfig,BNConfig_name='BNQuantizeConfig'):
    """Make ``model`` quantization aware with custom configs for Dense and BatchNorm.

    Args:
        model: Keras model whose ``layers`` are re-assembled into a ``Sequential``.
            The layer objects are passed on as-is (they are not copied here).
        DenseConfig: QuantizeConfig class instantiated for every Dense layer.
        DenseConfig_name: name under which ``DenseConfig`` is registered in the scope.
        BNConfig: QuantizeConfig class instantiated for every BatchNormalization layer.
        BNConfig_name: name under which ``BNConfig`` is registered in the scope.

    Returns:
        The model returned by ``tfmot.quantization.keras.quantize_apply``.
    """
    model_list = []
    for layer in model.layers:
      if isinstance(layer,BatchNormalization):
        model_list.append(quantize_annotate_layer(layer,BNConfig()))
      elif isinstance(layer,Dense):
        model_list.append(quantize_annotate_layer(layer,DenseConfig()))
      else:
        model_list.append(layer)

    # Layers other than Dense and BatchNorm are covered by the model-level annotation.
    q_model = quantize_annotate_model(Sequential(model_list))

    q_model.build(input_shape=(None,28,28))

    # Register each config class under its own class name as well, so that a
    # caller who passes a custom class but forgets (or mistypes) the matching
    # *_name string still gets a resolvable scope. The caller-supplied names
    # are written last, so they keep the precedence they had before.
    scope_objects = {
        DenseConfig.__name__: DenseConfig,
        BNConfig.__name__: BNConfig,
    }
    scope_objects[DenseConfig_name] = DenseConfig
    scope_objects[BNConfig_name] = BNConfig

    with quantize_scope(scope_objects):
      q_aware = tfmot.quantization.keras.quantize_apply(q_model)

    return q_aware


#### 4 bit on Dense layer only

In [42]:
class Dense4bitQuantizeConfig(DefaultDenseQuantizeConfig):
    """Dense config identical to the parent except that both quantizers use 4 bits."""

    def get_weights_and_quantizers(self, layer):
        """Pair the Dense kernel with a 4-bit symmetric ``LastValueQuantizer``."""
        kernel_quantizer = LastValueQuantizer(num_bits=4, symmetric=True, narrow_range=False, per_axis=False)
        return [(layer.kernel, kernel_quantizer)]

    def get_activations_and_quantizers(self, layer):
        """Pair the layer activation with a 4-bit asymmetric ``MovingAverageQuantizer``."""
        activation_quantizer = MovingAverageQuantizer(num_bits=4, symmetric=False, narrow_range=False, per_axis=False)
        return [(layer.activation, activation_quantizer)]

## Was the weight in the original model changed?

#### To address this concern, we defined the get_model() function, and we restore the pretrained model every time we try a new configuration

In [43]:
# fine-tune a q_aware model, convert it to a lite model and report accuracy and storage size at once
def quant_lite_write(qaw):
  """Fine-tune ``qaw``, convert it to TFLite once, then report accuracy and size.

  The conversion happens after fine-tuning, and the same converted model is
  used for both the accuracy evaluation and the size measurement, so the two
  reported numbers describe the same artifact.

  Args:
      qaw: quantization-aware Keras model; it is compiled and trained in place.

  Returns:
      None. Results are printed.
  """
  qwac = fit_q_aware(qaw)
  qlite = quant_to_lite(qaw)
  print("q_aware accuracy: %.4f\nlite model accuracy: %.4f" % (qwac,evaluate_lite_model(qlite)))
  write_file(qlite)

In [44]:
# Start from a fresh copy of the pretrained baseline and confirm its accuracy.
model = setup_model()
_ = model_eval(model)
print(_)
# 4-bit config on the Dense layers; BatchNorm keeps the default BNQuantizeConfig.
q_d4_aware = quantized_custom_model(model,Dense4bitQuantizeConfig,"Dense4bitQuantizeConfig")
quant_lite_write(q_d4_aware)

test loss: 0.3574, test accuracy: 0.8758
0.8758000135421753




training

testing




Evaluated on  1000 results so far.
Evaluated on  2000 results so far.
Evaluated on  3000 results so far.
Evaluated on  4000 results so far.
Evaluated on  5000 results so far.
Evaluated on  6000 results so far.
Evaluated on  7000 results so far.
Evaluated on  8000 results so far.
Evaluated on  9000 results so far.
Evaluated on 10000 results so far.
q_aware accuracy: 0.8533
lite model accuracy: 0.8611
 model in Mb: 0.07410430908203125


#### 4 bit on BatchNorm only

#### Ignore the checkpoint warning since we'd never use any checkpoints

In [45]:
class BN4bitQuantizeConfig(BNQuantizeConfig):
    """BatchNorm config whose gamma quantizer uses 4 bits.

    NOTE(review): beta is still quantized with 8 bits here — confirm that
    leaving it at 8 bits is intended for the "4 bit on BatchNorm" experiment.
    """

    def get_weights_and_quantizers(self, layer):
        """Pair gamma with a 4-bit and beta with an 8-bit ``MovingAverageQuantizer``."""
        gamma_quantizer = MovingAverageQuantizer(num_bits=4, symmetric=True, narrow_range=False, per_axis=False)
        beta_quantizer = MovingAverageQuantizer(num_bits=8, symmetric=True, narrow_range=False, per_axis=False)
        return [(layer.gamma, gamma_quantizer), (layer.beta, beta_quantizer)]


In [46]:
# Fresh copy of the pretrained baseline; custom config on BatchNorm only,
# Dense layers keep DefaultDenseQuantizeConfig.
model = setup_model()
q_b4_aware = quantized_custom_model(model,BNConfig=BN4bitQuantizeConfig,BNConfig_name="BN4bitQuantizeConfig")
quant_lite_write(q_b4_aware)



training

testing
























































































































Evaluated on  1000 results so far.
Evaluated on  2000 results so far.
Evaluated on  3000 results so far.
Evaluated on  4000 results so far.
Evaluated on  5000 results so far.
Evaluated on  6000 results so far.
Evaluated on  7000 results so far.
Evaluated on  8000 results so far.
Evaluated on  9000 results so far.
Evaluated on 10000 results so far.
q_aware accuracy: 0.8091
lite model accuracy: 0.8087
 model in Mb: 0.07416534423828125


#### 4 bit on BatchNorm and Dense

In [47]:
# Fresh copy of the pretrained baseline; custom configs on both Dense and BatchNorm layers.
model = setup_model()
q_b4_d4_aware = quantized_custom_model(model,Dense4bitQuantizeConfig,"Dense4bitQuantizeConfig",BNConfig=BN4bitQuantizeConfig,BNConfig_name="BN4bitQuantizeConfig")
quant_lite_write(q_b4_d4_aware)

























































































































training

testing




Evaluated on  1000 results so far.
Evaluated on  2000 results so far.
Evaluated on  3000 results so far.
Evaluated on  4000 results so far.
Evaluated on  5000 results so far.
Evaluated on  6000 results so far.
Evaluated on  7000 results so far.
Evaluated on  8000 results so far.
Evaluated on  9000 results so far.
Evaluated on 10000 results so far.
q_aware accuracy: 0.8010
lite model accuracy: 0.8086
 model in Mb: 0.07419586181640625


#### Fixed-range algorithm on Dense layer

In [50]:
class FixedRangeQuantizer(tfmot.quantization.keras.quantizers.Quantizer):
  """Quantizer that clips its input tensor to the range [-1, 1]."""

  def build(self, tensor_shape, name, layer):
    """Create no variables: the clipping range is fixed, so no state is needed."""
    return dict()

  def __call__(self, inputs, training, weights, **kwargs):
    """Return ``inputs`` clipped to [-1, 1]; ``training`` and ``weights`` are unused."""
    lower, upper = -1.0, 1.0
    return tf.keras.backend.clip(inputs, lower, upper)

  def get_config(self):
    """There are no ``__init__`` parameters, so there is nothing to serialize."""
    return dict()


class ModifiedDenseQuantizeConfig(DefaultDenseQuantizeConfig):
    """Dense config whose kernel uses ``FixedRangeQuantizer`` instead of the default quantizer.

    Only the weight quantizer is replaced; the activation handling is
    inherited from ``DefaultDenseQuantizeConfig``.
    """

    def get_weights_and_quantizers(self, layer):
        """Pair the Dense kernel with the custom fixed-range (clipping) quantizer."""
        kernel_quantizer = FixedRangeQuantizer()
        return [(layer.kernel, kernel_quantizer)]

In [51]:
# Fresh copy of the pretrained baseline; Dense kernels use the fixed-range quantizer,
# BatchNorm keeps the default BNQuantizeConfig.
# NOTE(review): despite the "d4" in the variable name, ModifiedDenseQuantizeConfig
# does not set a 4-bit quantizer for the kernel — confirm the intended naming.
model = setup_model()
q_d4_aware_fix = quantized_custom_model(model,ModifiedDenseQuantizeConfig,"ModifiedDenseQuantizeConfig")
quant_lite_write(q_d4_aware_fix)



training

testing




Evaluated on  1000 results so far.
Evaluated on  2000 results so far.
Evaluated on  3000 results so far.
Evaluated on  4000 results so far.
Evaluated on  5000 results so far.
Evaluated on  6000 results so far.
Evaluated on  7000 results so far.
Evaluated on  8000 results so far.
Evaluated on  9000 results so far.
Evaluated on 10000 results so far.
q_aware accuracy: 0.8607
lite model accuracy: 0.8598
 model in Mb: 0.074127197265625
