## Model Optimization And Quantization

In [6]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoModelForSequenceClassification
import torch
from transformers import AutoTokenizer

In [7]:
# Directory holding the fine-tuned PyTorch checkpoint (input to the steps below).
model_path = "./tinybert-sentiment"
# Directory that receives the exported ONNX model and its tokenizer files.
onnx_path = "./tinybert-sentiment-onnx"

In [8]:
# Convert to ONNX format: load the checkpoint stored at model_path with
# export=True and select the CPU execution provider.
model = ORTModelForSequenceClassification.from_pretrained(
    model_path, export=True, provider="CPUExecutionProvider"
)

In [9]:
# Save ONNX model and tokenizer; both are written to the same directory (onnx_path).
model.save_pretrained(onnx_path)
# The tokenizer is read from the original checkpoint directory (model_path).
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Keep this call as the last expression of the cell: its return value is what
# the cell displays as output.
tokenizer.save_pretrained(onnx_path)

('./tinybert-sentiment-onnx/tokenizer_config.json',
 './tinybert-sentiment-onnx/special_tokens_map.json',
 './tinybert-sentiment-onnx/vocab.txt',
 './tinybert-sentiment-onnx/added_tokens.json',
 './tinybert-sentiment-onnx/tokenizer.json')

In [6]:
# Make sure you're loading a pure PyTorch model, not an ONNX model: the
# dynamic-quantization step that follows hands this object to
# torch.quantization.quantize_dynamic and targets its torch.nn.Linear modules.
# NOTE: this rebinds `model`, which the ONNX export cell bound to the
# ORTModelForSequenceClassification instance.
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [7]:
# Dynamic quantization for PyTorch (alternative to the ONNX export):
# only torch.nn.Linear modules are selected, quantized with dtype torch.qint8.
quantized_model = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8
)

In [8]:
# Persist only the quantized model's state dict (not the whole module object).
quantized_path = "./tinybert-sentiment-quantized.pt"
torch.save(quantized_model.state_dict(), quantized_path)