# Quantization

Quantization is a technique to reduce the computational and memory costs of running inference by representing the weights and activations with low-precision data types like 8-bit integer (int8) instead of the usual 32-bit floating point (float32).

# Original Model

In [18]:
from grammar_models.grammar_checker import GrammarChecker # T5 prithivida grammar corrector model selected for project
gc = GrammarChecker(model_path="../models/prithivida_grammar_error_correcter_v1")

In [19]:
model = gc.get_model()

In [20]:
# Helper to compute the size of a model
def size_model(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024**2
    print('model size: {:.3f}MB'.format(size_all_mb))

In [21]:
size_model(model)

model size: 850.310MB


# Quantization

In [22]:
model.half()  # convert all the model parameters to 16 bits half precision

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [23]:
size_model(model)

model size: 425.155MB


In [24]:
# save the model
model.save_pretrained("../models/prithivida_grammar_error_correcter_v1_fp16")
gc.get_tokenizer().save_pretrained("../models/prithivida_grammar_error_correcter_v1_fp16")
print("Model saved locally at ./models/prithivida_grammar_error_correcter_v1_fp16")

Model saved locally at ./models/prithivida_grammar_error_correcter_v1_fp16


In [25]:
gc.set_model(model)

In [26]:
print(gc.summary())

Model Summary:
- Total Parameters: 222,903,552
- Trainable Parameters: 222,903,552


# Test Quantizied Model

In [27]:
influent_sentences = [
        "He are moving here.",
        "I am doing fine. How is you?",
        "How is they?",
        "Matt like fish",
        "the collection of letters was original used by the ancient Romans",
        "We enjoys horror movies",
        "Anna and Mike is going skiing",
        "I walk to the store and I bought milk",
        " We all eat the fish and then made dessert",
        "I will eat fish for dinner and drink milk",
        "what be the reason for everyone leave the company",
]   

    # Perform grammar correction
for input_text in influent_sentences:
    corrected_text = gc.correct(input_text)
    print("Original:", input_text)
    print("Corrected:", corrected_text)
    print("----------\n")

Original: He are moving here.
Corrected: He is moving here.
----------

Original: I am doing fine. How is you?
Corrected: I am doing fine. How are you?
----------

Original: How is they?
Corrected: How are they?
----------

Original: Matt like fish
Corrected: Matt likes fish.
----------

Original: the collection of letters was original used by the ancient Romans
Corrected: the collection of letters was originally used by the ancient Romans
----------

Original: We enjoys horror movies
Corrected: We enjoy horror movies.
----------

Original: Anna and Mike is going skiing
Corrected: Anna and Mike are going skiing.
----------

Original: I walk to the store and I bought milk
Corrected: I walked to the store and I bought milk.
----------

Original:  We all eat the fish and then made dessert
Corrected: We all ate the fish and then made dessert.
----------

Original: I will eat fish for dinner and drink milk
Corrected: I will eat fish for dinner and drink milk.
----------

Original: what be t

# Other Quantization Technics

based on code from [source](https://odsc.medium.com/optimizing-your-model-for-inference-with-pytorch-quantization-1a25073ba318)

In [28]:
from grammar_models.grammar_checker import GrammarChecker
gc = GrammarChecker(model_path="../models/prithivida_grammar_error_correcter_v1")
grammar_model = gc.get_model()

In [29]:
print("Original Model:")
print(gc.summary())

Original Model:
Model Summary:
- Total Parameters: 222,903,552
- Trainable Parameters: 222,903,552


In [30]:
# Quantize the model
gc.quantize()

Model quantized successfully.


In [31]:
# Print the quantized model summary
print("Quantized Model:")
print(gc.summary())

Quantized Model:
Model Summary:
- Total Parameters: 24,722,688
- Trainable Parameters: 24,722,688


In [32]:
# Test with example input
input_text = "He are moving here."
corrected_text = gc.correct(input_text)
print("Original:", input_text)
print("Corrected:", corrected_text)

Original: He are moving here.
Corrected: He is moving here.


In [33]:
size_model(gc.get_model())

model size: 94.310MB


In [34]:
import torch
# Save the quantized model
torch.save(gc.get_model(), "../models/quantized_grammar_checker_int8.pth")