
# Evaluation, optimization and Quantization
source: https://www.youtube.com/watch?v=_AKFDOnrZz8

credits: Julien Simon, Huggingface, source: https://gitlab.com/juliensimon/huggingface-demos/-/blob/main/optimum/onnx/optimize_onnx.py

### Huggingface's evaluation model

🤗 Evaluate's main methods are:

- evaluate.list_evaluation_modules() to list the available metrics, comparisons and measurements
- evaluate.load(module_name, **kwargs) to instantiate an evaluation module
- results = module.compute(*kwargs) to compute the result of an evaluation module

In [1]:
!pip install evaluate



In [6]:
!pip install optimum

Collecting optimum
  Downloading optimum-1.17.1-py3-none-any.whl.metadata (18 kB)
Downloading optimum-1.17.1-py3-none-any.whl (407 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m407.1/407.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: optimum
Successfully installed optimum-1.17.1


# Part 1: Julien Simon tutorial on youtube > 1 old & outdated

In [2]:
import datasets
import evaluate
import transformers

2024-03-18 09:26:27.301918: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
#settings
task_type = "text-classification"

model_id = "juliensimon/distilbert-amazon-shoe-reviews"

dataset_id = "juliensimon/amazon-shoe-reviews"
label_column = "labels"
label_mapping = {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
}
data = datasets.load_dataset(dataset_id, split="test")
print(data)

Downloading readme:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/10.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/90000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'text'],
    num_rows: 10000
})


In [4]:
# evaluation of the original model
metric = evaluate.load("accuracy")
evaluator = evaluate.evaluator(task_type)


def evaluate_pipeline(pipeline):
    results = evaluator.compute(
        model_or_pipeline=pipeline,
        data=data,
        metric=metric,
        label_column=label_column,
        label_mapping=label_mapping,
    )
    return results


print("*** Original model")
classifier = transformers.pipeline(task_type, model_id)
results = evaluate_pipeline(classifier)
print(results)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

*** Original model


Downloading config.json:   0%|          | 0.00/845 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

{'accuracy': 0.5778, 'total_time_in_seconds': 375.2978071240068, 'samples_per_second': 26.645506075914202, 'latency_in_seconds': 0.03752978071240068}


In [8]:
#import optimum 
from optimum.onnxruntime import ORTModelForSequenceClassification
from optimum.pipelines import pipeline

In [12]:
model_id = "juliensimon/distilbert-amazon-shoe-reviews"
model = ORTModelForSequenceClassification.from_pretrained(model_id)
#tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
#model.save_pretrained("./model_onnx")
#tokenizer.save_pretrained("./model_onnx")
#classifier_onnx = pipeline(task_type, model=model, tokenizer=tokenizer)
#results = evaluate_pipeline(classifier_onnx)
print("*** ONNX")
#print(results)

FileNotFoundError: Could not find any ONNX model file in juliensimon/distilbert-amazon-shoe-reviews

In [None]:
print("*** ONNX optimizer")

from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

optimizer = ORTOptimizer.from_pretrained(model)
optimizer.optimize(
    OptimizationConfig(
        optimization_level=99, # 1, 2 or 99
    ),
    save_dir="./model_onnx",
)
model_optimized = ORTModelForSequenceClassification.from_pretrained(
    "./model_onnx", file_name="model_optimized.onnx"
)
classifier_optimized = pipeline(task_type, model=model_optimized, tokenizer=tokenizer)
results = evaluate_pipeline(classifier_optimized)
print(results)

In [None]:
print("*** ONNX quantizer")

from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

quantizer = ORTQuantizer.from_pretrained(model)
qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
quantizer.quantize(save_dir="./model_onnx", quantization_config=qconfig)
model_quantized = ORTModelForSequenceClassification.from_pretrained(
    "./model_onnx", file_name="model_quantized.onnx"
)
classifier_quantized = pipeline(task_type, model=model_quantized, tokenizer=tokenizer)
results = evaluate_pipeline(classifier_quantized)
print(results)

# Part 2: Huggingface tutorial (new)

In [13]:
from optimum.pipelines import pipeline

classifier = pipeline(task="text-classification", accelerator="ort")

Downloading config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Framework not specified. Using pt to export the model.


Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.0.0


verbose: False, log level: Level.ERROR



In [14]:
classifier("I like you. I love you.")

[{'label': 'POSITIVE', 'score': 0.9998763799667358}]

In [17]:
from optimum.pipelines import pipeline

onnx_qa = pipeline("question-answering", model="deepset/roberta-base-squad2", accelerator="ort")
question = "What's my name?"
context = "My name is Philipp and I live in Nuremberg."

pred = onnx_qa(question=question, context=context)
pred

Framework not specified. Using pt to export the model.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.0.0
Overriding 1 configuration item(s)
	- use_cache -> False


verbose: False, log level: Level.ERROR



{'score': 0.9041661620140076, 'start': 11, 'end': 18, 'answer': 'Philipp'}

In [20]:
from transformers import AutoTokenizer
from optimum.onnxruntime import (
    AutoQuantizationConfig,
    ORTModelForSequenceClassification,
    ORTQuantizer
)
from optimum.pipelines import pipeline

# Load the tokenizer and export the model to the ONNX format
model_id = "distilbert-base-uncased-finetuned-sst-2-english"
save_dir = "distilbert_quantized"

model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True)

# Load the quantization configuration detailing the quantization we wish to apply
qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
quantizer = ORTQuantizer.from_pretrained(model)

# Apply dynamic quantization and save the resulting model
quantizer.quantize(save_dir=save_dir, quantization_config=qconfig)
# Load the quantized model from a local repository
model = ORTModelForSequenceClassification.from_pretrained(save_dir)

# Create the transformers pipeline
onnx_clx = pipeline("text-classification", model=model, accelerator="ort")
text = "I like the new ORT pipeline"
pred = onnx_clx(text)
print(pred)
# [{'label': 'POSITIVE', 'score': 0.9974810481071472}]

# Save and push the model to the hub (in practice save_dir could be used here instead)
#model.save_pretrained("new_path_for_directory")
#model.push_to_hub("new_path_for_directory", repository_id="my-onnx-repo", use_auth_token=True)

Framework not specified. Using pt to export the model.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.0.0
  mask, torch.tensor(torch.finfo(scores.dtype).min)


verbose: False, log level: Level.ERROR



Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: True)
Quantizing model...
Saving quantized model at: distilbert_quantized (external data format: False)
Configuration saved in distilbert_quantized/ort_config.json


[{'label': 'NEGATIVE', 'score': 0.5954363346099854}]
