In [3]:
from pathlib import Path

# Whisper checkpoint identifier; every IR path below is derived from it.
model_name = "small.en"

# Original-precision OpenVINO IR files for the encoder and the decoder.
WHISPER_ENCODER_OV = Path("models", f"whisper-{model_name}", f"whisper_{model_name}_encoder.xml")
WHISPER_DECODER_OV = Path("models", f"whisper-{model_name}", f"whisper_{model_name}_decoder.xml")

# Destinations for the INT8 IR files written by the quantization step.
WHISPER_ENCODER_OV_INT8 = Path("models", f"whisper-{model_name}", f"whisper_{model_name}_encoder_int8.xml")
WHISPER_DECODER_OV_INT8 = Path("models", f"whisper-{model_name}", f"whisper_{model_name}_decoder_int8.xml")

In [4]:
import openvino as ov
import whisper
from util import patch_whisper_for_ov_inference, OpenVINOAudioEncoder, OpenVINOTextDecoder

# A single OpenVINO runtime core is handed to both wrapper constructors below.
core = ov.Core()
device = "CPU"

# Load the Whisper checkpoint on the CPU in eval mode, then apply the patch
# helper imported from `util` before swapping in the OpenVINO-backed modules.
model_fp32 = whisper.load_model(model_name, device="cpu").eval()
patch_whisper_for_ov_inference(model_fp32)

# Replace encoder and decoder with wrappers built from the IR files.
model_fp32.encoder = OpenVINOAudioEncoder(
    core,
    WHISPER_ENCODER_OV,
    device=device,
)
model_fp32.decoder = OpenVINOTextDecoder(
    core,
    WHISPER_DECODER_OV,
    device=device,
)

In [5]:
from contextlib import contextmanager
from functools import partial
from typing import Optional

import torch

# Module-level switch read by the patched forward functions: while it is True,
# every encoder/decoder call also records its inputs in the buffers below.
COLLECT_CALIBRATION_DATA = False

# Buffers that accumulate the recorded encoder and decoder inputs.
encoder_calibration_data, decoder_calibration_data = [], []


@contextmanager
def calibration_data_collection():
    """Enable calibration-data recording for the duration of a ``with`` block.

    Sets the global ``COLLECT_CALIBRATION_DATA`` flag to True on entry and
    resets it to False on exit, including when the body raises.
    """
    global COLLECT_CALIBRATION_DATA
    COLLECT_CALIBRATION_DATA = True
    try:
        yield
    finally:
        # Always switch recording off again, even on error.
        COLLECT_CALIBRATION_DATA = False


def encoder_forward(self, mel: torch.Tensor):
    """Run the compiled encoder on ``mel``, optionally recording the input.

    Args:
        self: encoder wrapper exposing ``compiled_model`` and ``output_blob``.
        mel: input tensor passed unchanged to the compiled model.

    Returns:
        The ``self.output_blob`` entry of the inference result, wrapped as a
        torch tensor via ``torch.from_numpy``.
    """
    if COLLECT_CALIBRATION_DATA:
        # Keep the raw input so it can later be used as a calibration sample.
        encoder_calibration_data.append(mel)
    inference_result = self.compiled_model(mel)
    return torch.from_numpy(inference_result[self.output_blob])

def decoder_forward(self, x: torch.Tensor, xa: torch.Tensor, kv_cache: Optional[dict] = None):
    """Run the compiled decoder, optionally recording its full input dict.

    Args:
        self: decoder wrapper exposing ``preprocess_kv_cache_inputs``,
            ``compiled_model`` and ``postprocess_outputs``.
        x: tensor fed to the model under the ``'x'`` key.
        xa: tensor fed to the model under the ``'xa'`` key.
        kv_cache: optional cache forwarded to ``preprocess_kv_cache_inputs``.

    Returns:
        Whatever ``self.postprocess_outputs`` returns for the inference result.
    """
    model_inputs = self.preprocess_kv_cache_inputs(
        {'x': ov.Tensor(x.numpy()), 'xa': ov.Tensor(xa.numpy())},
        kv_cache,
    )
    if COLLECT_CALIBRATION_DATA:
        # Record the inputs exactly as they are passed to the compiled model.
        decoder_calibration_data.append(model_inputs)
    return self.postprocess_outputs(self.compiled_model(model_inputs))

# Override `forward` on both wrapper instances with the recording-aware
# functions; `partial` pre-binds each instance as the `self` argument.
model_fp32.encoder.forward, model_fp32.decoder.forward = (
    partial(encoder_forward, model_fp32.encoder),
    partial(decoder_forward, model_fp32.decoder),
)

In [6]:
from datasets import load_dataset
from tqdm import tqdm

# Number of utterances transcribed to gather calibration inputs.
CALIBRATION_DATASET_SIZE = 30

# Stream the dataset and keep only the first CALIBRATION_DATASET_SIZE items
# instead of downloading the whole split.
calibration_dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True).take(CALIBRATION_DATASET_SIZE)

# Start from empty buffers so that re-running this cell does not append a
# second copy of every sample (which would also inflate the `subset_size`
# later derived from the buffer lengths).
encoder_calibration_data.clear()
decoder_calibration_data.clear()

# Every transcribe() call inside this block goes through the patched forward
# functions, which record their inputs while collection is enabled.
with calibration_data_collection():
    for data_item in tqdm(calibration_dataset, desc="Collecting calibration data", total=CALIBRATION_DATASET_SIZE):
        model_fp32.transcribe(data_item["audio"]["array"].astype("float32"), task="transcribe")

Downloading builder script:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Collecting calibration data: 100%|██████████| 30/30 [02:09<00:00,  4.32s/it]


In [7]:
import nncf
from openvino.runtime import serialize

def _quantize_and_save(ov_model, calibration_data, smooth_quant_alpha, output_path):
    """Quantize one model with NNCF and serialize the result to ``output_path``.

    Args:
        ov_model: model object passed to ``nncf.quantize`` (here the ``.model``
            attribute of an encoder/decoder wrapper).
        calibration_data: list of recorded model inputs used for calibration;
            all of them are used (``subset_size`` is the list length).
        smooth_quant_alpha: alpha value for the Smooth Quant algorithm.
        output_path: destination ``.xml`` path for the serialized model.

    Returns:
        The quantized model returned by ``nncf.quantize``.

    Raises:
        ValueError: if ``calibration_data`` is empty, i.e. the calibration
            collection step has not been run.
    """
    if not calibration_data:
        raise ValueError(
            "No calibration data collected; run the calibration data collection cell first."
        )
    quantized_model = nncf.quantize(
        model=ov_model,
        calibration_dataset=nncf.Dataset(calibration_data),
        subset_size=len(calibration_data),
        model_type=nncf.ModelType.TRANSFORMER,
        advanced_parameters=nncf.AdvancedQuantizationParameters(
            # Smooth Quant algorithm reduces activation quantization error.
            smooth_quant_alpha=smooth_quant_alpha
        ),
        target_device=nncf.TargetDevice.GPU,  # required when inference will run on a GPU
    )
    serialize(quantized_model, output_path)
    return quantized_model


print("Quantizing encoder...")
quantized_encoder = _quantize_and_save(
    model_fp32.encoder.model,
    encoder_calibration_data,
    0.5,  # per the original author: optimal alpha value obtained through grid search
    WHISPER_ENCODER_OV_INT8,
)
print(f"Saved quantized encoder at ./{WHISPER_ENCODER_OV_INT8}")

print("Quantizing decoder...")
quantized_decoder = _quantize_and_save(
    model_fp32.decoder.model,
    decoder_calibration_data,
    0.95,  # per the original author: optimal alpha value obtained through grid search
    WHISPER_DECODER_OV_INT8,
)
print(f"Saved quantized decoder at ./{WHISPER_DECODER_OV_INT8}")


INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino
Quantizing encoder...


Output()

Output()

INFO:nncf:36 ignored nodes were found by name in the NNCFGraph
INFO:nncf:48 ignored nodes were found by name in the NNCFGraph


Output()

Output()

In [None]:
# Fresh runtime core and a second Whisper instance whose sub-models are
# compiled for the GPU device.
core = ov.Core()
device = "GPU"

model = whisper.load_model(model_name, device="cpu").eval()
patch_whisper_for_ov_inference(model)

# NOTE(review): these are the original IR paths (WHISPER_*_OV), not the
# *_INT8 files written by the quantization cell - confirm this is intended.
model.encoder = OpenVINOAudioEncoder(
    core,
    WHISPER_ENCODER_OV,
    device=device,
)
model.decoder = OpenVINOTextDecoder(
    core,
    WHISPER_DECODER_OV,
    device=device,
)

In [None]:
# Load the dummy LibriSpeech validation split and take the audio array of its
# first item, cast to float32, as the test input.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
)
sample = ds[0]["audio"]["array"].astype("float32")

In [None]:
# Transcribe the sample as English with verbose output enabled.
result = model.transcribe(
    sample,
    language="en",
    verbose=True,
)