In [5]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import time
import os
import soundfile as sf

  from .autonotebook import tqdm as notebook_tqdm


### Load and store model in the dir

In [2]:


save_dir = "models"
os.makedirs(save_dir, exist_ok=True)

models = {

    "large_960h_lv60": "facebook/wav2vec2-large-960h-lv60-self",

}

for name, model_id in models.items():
    print(f"Loading and saving {model_id}...")
    processor = Wav2Vec2Processor.from_pretrained(model_id,use_safetensors=True)
    model = Wav2Vec2ForCTC.from_pretrained(model_id,use_safetensors=True)

    # Create subfolder per model inside models folder
    model_save_path = os.path.join(save_dir, name)
    os.makedirs(model_save_path, exist_ok=True)

    # Save processor and model locally
    processor.save_pretrained(model_save_path)
    model.save_pretrained(model_save_path)

print("All models loaded and saved successfully!")


Loading and saving facebook/wav2vec2-large-960h-lv60-self...


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


All models loaded and saved successfully!


In [7]:
from jiwer import wer

models_dir = "models"

test_data_dir = "test_data"
transcribs_dir = "ground_truth"

### Load model from the dir

In [4]:
model_names = [name for name in os.listdir(models_dir) if os.path.isdir(os.path.join(models_dir, name))]


### load wav file and resample if needed

In [5]:
def load_audio(path, target_sr = 16000):
    waveform, sr = torchaudio.load(path)
    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(sr, target_sr)
        waveform = resampler(waveform)
    return waveform.squeeze()


### Transcribe audio with given model and processor

In [6]:
def transcribe(waveform, processor, model):
        # Ensure mono audio: get only one channel if multiple
        if waveform.ndim == 2:
            waveform = waveform[0]  # take the first channel
        
        # Ensure it's a tensor of shape (1, time)
        input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values

        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        return transcription.lower()


### Results

In [7]:
import soundfile as sf

results = []

for model_name in model_names:
    print(f"\n--- Using model: {model_name} ---")
    model_path = os.path.join(models_dir, model_name)
    
    # Load processor and model
    processor = Wav2Vec2Processor.from_pretrained(model_path)
    model = Wav2Vec2ForCTC.from_pretrained(model_path)
    model.eval()

    # Loop through all wav files
    for wav_file in sorted(os.listdir(test_data_dir)):
        if not wav_file.endswith(".wav"):
            continue
        
        wav_path = os.path.join(test_data_dir, wav_file)
        txt_file = wav_file.replace(".wav", ".txt")
        txt_path = os.path.join(transcribs_dir, txt_file)

        if not os.path.isfile(txt_path):
            print(f"Transcription file missing for {wav_file}, skipping...")
            continue
        
        # Load audio to get duration
        speech_array, sample_rate = sf.read(wav_path)
        audio_duration = len(speech_array) / sample_rate
        
        # Load audio waveform for model input
        waveform = load_audio(wav_path)  # Your existing function
        
        with open(txt_path, "r", encoding="utf-8") as f:
            ground_truth = f.read().strip().lower()

        # Transcribe & measure time
        start_time = time.time()
        pred_text = transcribe(waveform, processor, model)
        transcription_time = time.time() - start_time

        # Calculate WER
        error_rate = wer(ground_truth, pred_text)

        # Print results
        print(f"{model_name} | {wav_file} | Audio Duration: {audio_duration:.2f}s | Transcription Time: {transcription_time:.2f}s | WER: {error_rate:.3f}")
        print(f"Transcription: {pred_text}")
        print(f"Ground Truth : {ground_truth}\n")

        # Store result including audio duration and transcription time
        results.append({
            "model": model_name,
            "audio_file": wav_file,
            "audio_duration_sec": round(audio_duration, 2),
            "transcription_time_sec": round(transcription_time, 2),
            "wer": error_rate,
            "prediction": pred_text,
            "ground_truth": ground_truth
        })



--- Using model: large_960h_lv60 ---
large_960h_lv60 | 1.wav | Audio Duration: 8.18s | Transcription Time: 7.00s | WER: 0.375
Transcription: this is the test now there is a noise finished
Ground Truth : this is test now there is noise finised

large_960h_lv60 | 10.wav | Audio Duration: 7.75s | Transcription Time: 2.22s | WER: 0.222
Transcription: tence design to be oven clearly each word is easy to pronounce and understand when rets aloud slowly
Ground Truth : sentence designed to be spoken clearly each word is easy to pronounce and understand when read aloud slowly

large_960h_lv60 | 2.wav | Audio Duration: 14.72s | Transcription Time: 4.47s | WER: 0.842
Transcription: sometimes life tis onas e if hi truns life antns not even i the e of confusionisas evrystep evente sloone e sentid ito soe stronger an misel
Ground Truth : sometimes life does not go as planned we face struggles doubts and delays but even in the middle of confusion your path has purpose every step even the slow ones is

### Average Error per word of each model

In [8]:
def summarize_model_wer(results):
    from collections import defaultdict

    model_stats = defaultdict(lambda: {
        "total_wer": 0.0,
        "total_audio_duration": 0.0,
        "total_transcription_time": 0.0,
        "count": 0
    })

    for result in results:
        model = result["model"]
        model_stats[model]["total_wer"] += result["wer"]
        model_stats[model]["total_audio_duration"] += result.get("audio_duration_sec", 0)
        model_stats[model]["total_transcription_time"] += result.get("transcription_time_sec", 0)
        model_stats[model]["count"] += 1

    print("\n=== Average Metrics per Model ===")
    for model, stats in model_stats.items():
        count = stats["count"]
        avg_wer = stats["total_wer"] / count if count > 0 else 0
        avg_audio_duration = stats["total_audio_duration"] / count if count > 0 else 0
        avg_transcription_time = stats["total_transcription_time"] / count if count > 0 else 0

        print(f"Model: {model}")
        print(f"  Number of files        : {count}")
        print(f"  Average WER            : {avg_wer:.4f}")
        print(f"  Average Audio Duration : {avg_audio_duration:.2f} sec")
        print(f"  Average Transcription Time : {avg_transcription_time:.2f} sec\n")


In [9]:
summarize_model_wer(results)



=== Average Metrics per Model ===
Model: large_960h_lv60
  Number of files        : 10
  Average WER            : 0.3603
  Average Audio Duration : 27.21 sec
  Average Transcription Time : 9.28 sec



## Export large model into ONNX 

In [9]:
# model_dir = "models/large_960h_lv60/"
model_onx = "models/large_960h_lv60/"
onx_model = "large_960h_lv60.onnx"
test_data = "test_data"

In [10]:
def convert_model_to_onnx(model_onx, onx_model):
    print(f"Converting mode {model_onx}")
    model = Wav2Vec2ForCTC.from_pretrained(model_onx)
    model.eval()

    audio_len = 16000
    dummy_input = torch.randn(1, audio_len)

    torch.onnx.export(
        model,     
        dummy_input,   
        onx_model,             
        export_params=True,     
        opset_version=14,         
        do_constant_folding=True,    
        input_names=["input_values"],
        output_names=["logits"],     
        dynamic_axes={
            "input_values": {1: "sequence_length"},
            "logits": {1: "sequence_length"}
        },                          # allow variable-length audio sequences
    )
    print(f"ONNX model saved at: {onx_model}")

In [11]:
convert_model_to_onnx(model_onx, onx_model)

Converting mode models/large_960h_lv60/


  is_causal = query.shape[2] > 1 and attention_mask is None and getattr(module, "is_causal", True)


ONNX model saved at: large_960h_lv60.onnx


In [12]:
processor = Wav2Vec2Processor.from_pretrained(model_onx)
    


### Load ONNX model with optimized session (model less than 2GB)

In [13]:
import onnxruntime as rt

import numpy as np

sess_options = rt.SessionOptions()
sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
session = rt.InferenceSession(onx_model, sess_options)

In [14]:
def transcribe_from_file(filepath):
    # Load audio
    speech_array, sr = sf.read(filepath)

    # If not mono, convert to mono
    if len(speech_array.shape) > 1:
        speech_array = np.mean(speech_array, axis=1)

    # Resample if needed
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        speech_array = resampler(torch.tensor(speech_array).float()).numpy()
        sr = 16000

    # Prepare input for ONNX
    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt")
    input_values = inputs.input_values.numpy()

    # Run ONNX inference
    ort_inputs = {session.get_inputs()[0].name: input_values}
    ort_outputs = session.run(None, ort_inputs)[0]

    # Decode prediction
    predicted_ids = np.argmax(ort_outputs, axis=-1)
    transcription = processor.decode(predicted_ids[0])

    return transcription.lower()

In [16]:
results_onnx = []

for wav_file in sorted(os.listdir(test_data_dir)):
    if not wav_file.endswith(".wav"):
        continue

    wav_path = os.path.join(test_data_dir, wav_file)
    txt_file = wav_file.replace(".wav", ".txt")
    txt_path = os.path.join(transcribs_dir, txt_file)

    if not os.path.isfile(txt_path):
        print(f"Ground truth missing for {wav_file}, skipping...")
        continue

    # Read ground truth transcription
    with open(txt_path, "r", encoding="utf-8") as f:
        ground_truth = f.read().strip().lower()

    # Load audio to get duration
    speech_array, sr = sf.read(wav_path)
    audio_duration = len(speech_array) / sr  # audio duration in seconds

    # Transcribe with ONNX model and measure time
    start_time = time.time()
    try:
        prediction = transcribe_from_file(wav_path)
    except Exception as e:
        print(f"Error transcribing {wav_file}: {e}")
        continue
    transcription_time = time.time() - start_time

    # Compute WER
    error_rate = wer(ground_truth, prediction)

    # Print result
    print(f"{wav_file} | Audio Duration: {audio_duration:.2f}s | Transcription Time: {transcription_time:.2f}s | WER: {error_rate:.3f}")
    print(f"Prediction   : {prediction}")
    print(f"Ground Truth : {ground_truth}\n")

    # Save result with updated keys
    results_onnx.append({
        "model": "onnx_large_960h_lv60",
        "audio_file_name": wav_file,
        "audio_duration_sec": round(audio_duration, 2),
        "transcription_time_sec": round(transcription_time, 2),
        "wer": round(error_rate, 4),
        "prediction": prediction,
        "ground_truth": ground_truth
    })


1.wav | Audio Duration: 8.18s | Transcription Time: 2.10s | WER: 0.375
Prediction   : this is the test now there is a noise finished
Ground Truth : this is test now there is noise finised

10.wav | Audio Duration: 7.75s | Transcription Time: 2.00s | WER: 0.222
Prediction   : tence design to be oven clearly each word is easy to pronounce and understand when rets aloud slowly
Ground Truth : sentence designed to be spoken clearly each word is easy to pronounce and understand when read aloud slowly

2.wav | Audio Duration: 14.72s | Transcription Time: 3.89s | WER: 0.711
Prediction   : sometimes life te sontis tine if hi true life snts not eveny teien of confusion  isas every step even the stoon sepingnito so stronger and wiser
Ground Truth : sometimes life does not go as planned we face struggles doubts and delays but even in the middle of confusion your path has purpose every step even the slow ones is shaping you into someone stronger and wiser

3.wav | Audio Duration: 32.62s | Transcrip

In [17]:
def summarize_onnx_wer_o(results_onnx):
    from collections import defaultdict

    model_wer_stats = defaultdict(lambda: {"total_wer": 0.0, "count": 0, "total_audio_duration": 0.0, "total_transcription_time": 0.0})

    for result in results_onnx:
        model = result["model"]
        model_wer_stats[model]["total_wer"] += result["wer"]
        model_wer_stats[model]["count"] += 1
        model_wer_stats[model]["total_audio_duration"] += result.get("audio_duration_sec", 0)
        model_wer_stats[model]["total_transcription_time"] += result.get("transcription_time_sec", 0)

    print("\n=== ONNX Model Summary ===")
    for model, stats in model_wer_stats.items():
        count = stats["count"]
        avg_wer = stats["total_wer"] / count if count > 0 else 0
        avg_audio_dur = stats["total_audio_duration"] / count if count > 0 else 0
        avg_trans_time = stats["total_transcription_time"] / count if count > 0 else 0

        print(f"Model: {model}")
        print(f"  Number of files     : {count}")
        print(f"  Average WER         : {avg_wer:.4f}")
        print(f"  Average Audio Length: {avg_audio_dur:.2f} sec")
        print(f"  Average Transcription Time: {avg_trans_time:.2f} sec\n")


In [18]:
summarize_onnx_wer_o(results_onnx)


=== ONNX Model Summary ===
Model: onnx_large_960h_lv60
  Number of files     : 10
  Average WER         : 0.3421
  Average Audio Length: 27.21 sec
  Average Transcription Time: 10.89 sec



### Converting into OpenVino

In [19]:
import openvino as ov


In [20]:
example = torch.randn(1, 160000)
open_vino_model = ov.convert_model(model, example_input = (example,))
core = ov.Core()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


In [21]:
compiled_model  = core.compile_model(open_vino_model, 'CPU')


In [22]:
def transcribe_from_file_ov(filepath):
    speech_array, sr = sf.read(filepath)

    if len(speech_array.shape) > 1:
        speech_array = np.mean(speech_array, axis=1)

    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        speech_array = resampler(torch.tensor(speech_array).float()).numpy()

    inputs = processor(speech_array, sampling_rate=16000, return_tensors="np")
    input_values = inputs["input_values"]

    ov_inputs = {0: input_values}
    ov_outputs = compiled_model(ov_inputs)
    logits = list(ov_outputs.values())[0]

    predicted_ids = np.argmax(logits, axis=-1)
    transcription = processor.decode(predicted_ids[0])

    return transcription.lower()

In [23]:


results_openvino = []

for wav_file in sorted(os.listdir(test_data)):
    if not wav_file.endswith(".wav"):
        continue

    wav_path = os.path.join(test_data, wav_file)
    txt_file = wav_file.replace(".wav", ".txt")
    txt_path = os.path.join(transcribs_dir, txt_file)

    if not os.path.isfile(txt_path):
        print(f"❌ Ground truth missing for {wav_file}, skipping...")
        continue

    with open(txt_path, "r", encoding="utf-8") as f:
        ground_truth = f.read().strip().lower()

    speech_array, sr = sf.read(wav_path)
    audio_duration = len(speech_array) / sr

    try:
        start_time = time.time()
        prediction = transcribe_from_file_ov(wav_path)
        transcription_time = time.time() - start_time
    except Exception as e:
        print(f"❌ Error transcribing {wav_file}: {e}")
        continue

    error_rate = wer(ground_truth, prediction)

    print(f"{wav_file} | Audio Duration: {audio_duration:.2f}s | Transcription Time: {transcription_time:.2f}s | WER: {error_rate:.3f}")
    print(f"🔊 Prediction   : {prediction}")
    print(f"📜 Ground Truth : {ground_truth}\n")

    results_openvino.append({
        "model": "openvino_large_960h_lv60",
        "audio_file_name": wav_file,
        "audio_duration_sec": round(audio_duration, 2),
        "transcription_time_sec": round(transcription_time, 2),
        "wer": round(error_rate, 4),
        "prediction": prediction,
        "ground_truth": ground_truth
    })

1.wav | Audio Duration: 8.18s | Transcription Time: 1.35s | WER: 0.375
🔊 Prediction   : this is the test now there is a noise finished
📜 Ground Truth : this is test now there is noise finised

10.wav | Audio Duration: 7.75s | Transcription Time: 0.98s | WER: 0.222
🔊 Prediction   : tence design to be oven clearly each word is easy to pronounce and understand when rets aloud slowly
📜 Ground Truth : sentence designed to be spoken clearly each word is easy to pronounce and understand when read aloud slowly

2.wav | Audio Duration: 14.72s | Transcription Time: 1.75s | WER: 0.711
🔊 Prediction   : sometimes life te sontis tine if hi true life snts not eveny teien of confusion  isas every step even the stoon sepingnito so stronger and wiser
📜 Ground Truth : sometimes life does not go as planned we face struggles doubts and delays but even in the middle of confusion your path has purpose every step even the slow ones is shaping you into someone stronger and wiser

3.wav | Audio Duration: 32.62s

In [24]:
def summarize_onnx_wer(results_openvino):
    from collections import defaultdict

    model_wer_stats = defaultdict(lambda: {"total_wer": 0.0, "count": 0, "total_audio_duration": 0.0, "total_transcription_time": 0.0})

    for result in results_openvino:
        model = result["model"]
        model_wer_stats[model]["total_wer"] += result["wer"]
        model_wer_stats[model]["count"] += 1
        model_wer_stats[model]["total_audio_duration"] += result.get("audio_duration_sec", 0)
        model_wer_stats[model]["total_transcription_time"] += result.get("transcription_time_sec", 0)

    print("\n=== Openvino Model Summary ===")
    for model, stats in model_wer_stats.items():
        count = stats["count"]
        avg_wer = stats["total_wer"] / count if count > 0 else 0
        avg_audio_dur = stats["total_audio_duration"] / count if count > 0 else 0
        avg_trans_time = stats["total_transcription_time"] / count if count > 0 else 0

        print(f"Model: {model}")
        print(f"  Number of files     : {count}")
        print(f"  Average WER         : {avg_wer:.4f}")
        print(f"  Average Audio Length: {avg_audio_dur:.2f} sec")
        print(f"  Average Transcription Time: {avg_trans_time:.2f} sec\n")


In [25]:
summarize_onnx_wer(results_openvino)


=== Openvino Model Summary ===
Model: openvino_large_960h_lv60
  Number of files     : 10
  Average WER         : 0.3421
  Average Audio Length: 27.21 sec
  Average Transcription Time: 4.22 sec



In [26]:
summarize_onnx_wer_o(results_onnx)


=== ONNX Model Summary ===
Model: onnx_large_960h_lv60
  Number of files     : 10
  Average WER         : 0.3421
  Average Audio Length: 27.21 sec
  Average Transcription Time: 10.89 sec



In [27]:
summarize_model_wer(results)


=== Average Metrics per Model ===
Model: large_960h_lv60
  Number of files        : 10
  Average WER            : 0.3603
  Average Audio Duration : 27.21 sec
  Average Transcription Time : 9.28 sec



### Checking ONNX model size (should be less than 2GB to optimize)

In [28]:
import os

model_path = "large_960h_lv60.onnx"
size_bytes = os.path.getsize(model_path)
size_mb = size_bytes / (1024 * 1024)
size_gb = size_mb / 1024

print(f"Model size: {size_bytes} bytes")
print(f"Model size: {size_mb:.2f} MB")
print(f"Model size: {size_gb:.2f} GB")

if size_gb > 2:
    print("Model size is GREATER than 2GB")
else:
    print("Model size is LESS than or equal to 2GB")

Model size: 1262392509 bytes
Model size: 1203.91 MB
Model size: 1.18 GB
Model size is LESS than or equal to 2GB


### Dynamic Quantizing large960h lv60 pytorch model

In [29]:
from torch.quantization import quantize_dynamic

In [30]:
model.eval()


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

### Dynamic quantizing the linear layers

In [31]:
dynamic_quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

### save the quantized model

In [32]:
quantized_path = os.path.join(models_dir, "dynamic_quantized_model_state_dict.pth")
torch.save(dynamic_quantized_model.state_dict(), quantized_path)
print(f"Quantized model state_dict saved to: {quantized_path}")
#to load later|
#quantized_model.load_state_dict(torch.load(quantized_path))


Quantized model state_dict saved to: models\dynamic_quantized_model_state_dict.pth


### Dynamic quantizing ONNX model (large960h-lv60 model )

In [1]:
import onnx

model_path = "large_960h_lv60.onnx"
model = onnx.load(model_path)

In [2]:
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxruntime.quantization.shape_inference import quant_pre_process

#### Pre-processing model before quantizing

In [35]:
quant_pre_process(
    input_model="large_960h_lv60.onnx",
    output_model_path="large_960h_lv60_optimized.onnx",
    save_as_external_data=True,
    all_tensors_to_one_file=True,               
    external_data_location="large_960h_lv60.data"  
)

DecodeError: Error parsing message with type 'onnx.ModelProto'

In [None]:
quantize_dynamic(
    model_input="large_960h_lv60_optimized.onnx",  
    model_output="large_960h_lv60_quantized.onnx", 
    weight_type=QuantType.QInt8,
)

MemoryError: 

### Transcribe audio using quantized pytorch model large 960h lv60

In [36]:
state_dict = torch.load(quantized_path)
dynamic_quantized_model.load_state_dict(state_dict)

  device=storage.device,


<All keys matched successfully>

In [37]:
dynamic_quantized_model.eval()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [38]:
def transcribe_with_quantized_model(wav_path):
    # Load audio
    waveform, sr = torchaudio.load(wav_path)

    # Convert stereo to mono if needed
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample if needed
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        waveform = resampler(waveform)

    # Remove batch/channel dimension
    waveform = waveform.squeeze()

    # Preprocess
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)

    # Inference
    with torch.no_grad():
        outputs = dynamic_quantized_model(**inputs)
        logits = outputs.logits
        predicted_ids = torch.argmax(logits, dim=-1)

    # Decode
    transcription = processor.decode(predicted_ids[0])

    return transcription.lower()

In [39]:
results_quantized = []

for wav_file in sorted(os.listdir(test_data_dir)):
    if not wav_file.endswith(".wav"):
        continue

    wav_path = os.path.join(test_data_dir, wav_file)
    txt_file = wav_file.replace(".wav", ".txt")
    txt_path = os.path.join(transcribs_dir, txt_file)

    if not os.path.isfile(txt_path):
        print(f"Ground truth missing for {wav_file}, skipping...")
        continue

    # Read ground truth transcription
    with open(txt_path, "r", encoding="utf-8") as f:
        ground_truth = f.read().strip().lower()

    # Load audio to get duration
    speech_array, sr = sf.read(wav_path)
    audio_duration = len(speech_array) / sr  # audio duration in seconds

    # Transcribe with ONNX model and measure time
    start_time = time.time()
    try:
        prediction = transcribe_with_quantized_model(wav_path)
    except Exception as e:
        print(f"Error transcribing {wav_file}: {e}")
        continue
    transcription_time = time.time() - start_time

    # Compute WER
    error_rate = wer(ground_truth, prediction)

    # Print result
    print(f"{wav_file} | Audio Duration: {audio_duration:.2f}s | Transcription Time: {transcription_time:.2f}s | WER: {error_rate:.3f}")
    print(f"Prediction   : {prediction}")
    print(f"Ground Truth : {ground_truth}\n")

    # Save result with updated keys
    results_quantized.append({
        "model": dynamic_quantized_model,
        "audio_file_name": wav_file,
        "audio_duration_sec": round(audio_duration, 2),
        "transcription_time_sec": round(transcription_time, 2),
        "wer": round(error_rate, 4),
        "prediction": prediction,
        "ground_truth": ground_truth
    })


1.wav | Audio Duration: 8.18s | Transcription Time: 1.32s | WER: 0.375
Prediction   : this is the test now there is a noise finished
Ground Truth : this is test now there is noise finised

10.wav | Audio Duration: 7.75s | Transcription Time: 0.84s | WER: 0.222
Prediction   : tene design to be oven clearly each word is easy to pronounce and understand when its aloud slowly
Ground Truth : sentence designed to be spoken clearly each word is easy to pronounce and understand when read aloud slowly

2.wav | Audio Duration: 14.72s | Transcription Time: 1.84s | WER: 0.895
Prediction   : sometimes life disots i if his to life nas not eveni teiin of confusionasas evrystep even te oo  sapintoa stonganise
Ground Truth : sometimes life does not go as planned we face struggles doubts and delays but even in the middle of confusion your path has purpose every step even the slow ones is shaping you into someone stronger and wiser

3.wav | Audio Duration: 32.62s | Transcription Time: 4.97s | WER: 0.857


In [40]:
def summarize_model_wer_quantized(results_quantized):
    from collections import defaultdict

    model_stats = defaultdict(lambda: {
        "total_wer": 0.0,
        "total_audio_duration": 0.0,
        "total_transcription_time": 0.0,
        "count": 0
    })

    for result in results_quantized:
        model = result["model"]
        model_stats[model]["total_wer"] += result["wer"]
        model_stats[model]["total_audio_duration"] += result.get("audio_duration_sec", 0)
        model_stats[model]["total_transcription_time"] += result.get("transcription_time_sec", 0)
        model_stats[model]["count"] += 1

    print("\n=== Average Metrics per Model ===")
    for model, stats in model_stats.items():
        count = stats["count"]
        avg_wer = stats["total_wer"] / count if count > 0 else 0
        avg_audio_duration = stats["total_audio_duration"] / count if count > 0 else 0
        avg_transcription_time = stats["total_transcription_time"] / count if count > 0 else 0

        print(f"Model: {model}")
        print(f"  Number of files        : {count}")
        print(f"  Average WER            : {avg_wer:.4f}")
        print(f"  Average Audio Duration : {avg_audio_duration:.2f} sec")
        print(f"  Average Transcription Time : {avg_transcription_time:.2f} sec\n")


In [41]:
summarize_model_wer_quantized(results_quantized)


=== Average Metrics per Model ===
Model: Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-

In [42]:
summarize_model_wer(results)


=== Average Metrics per Model ===
Model: large_960h_lv60
  Number of files        : 10
  Average WER            : 0.3603
  Average Audio Duration : 27.21 sec
  Average Transcription Time : 9.28 sec



### transcribe audio using quantized ONNX model

In [15]:
results_onnx_quantized = []

for wav_file in sorted(os.listdir(test_data_dir)):
    if not wav_file.endswith(".wav"):
        continue

    wav_path = os.path.join(test_data_dir, wav_file)
    txt_file = wav_file.replace(".wav", ".txt")
    txt_path = os.path.join(transcribs_dir, txt_file)

    if not os.path.isfile(txt_path):
        print(f"Ground truth missing for {wav_file}, skipping...")
        continue

    # Read ground truth transcription
    with open(txt_path, "r", encoding="utf-8") as f:
        ground_truth = f.read().strip().lower()

    # Load audio to get duration
    speech_array, sr = sf.read(wav_path)
    audio_duration = len(speech_array) / sr  # audio duration in seconds

    # Transcribe with ONNX model and measure time
    start_time = time.time()
    try:
        prediction = transcribe_from_file(wav_path)
    except Exception as e:
        print(f"Error transcribing {wav_file}: {e}")
        continue
    transcription_time = time.time() - start_time

    # Compute WER
    error_rate = wer(ground_truth, prediction)

    # Print result
    print(f"{wav_file} | Audio Duration: {audio_duration:.2f}s | Transcription Time: {transcription_time:.2f}s | WER: {error_rate:.3f}")
    print(f"Prediction   : {prediction}")
    print(f"Ground Truth : {ground_truth}\n")

    # Save result with updated keys
    results_onnx_quantized.append({
        "model": "large_960h_lv60_quantized.onnx",
        "audio_file_name": wav_file,
        "audio_duration_sec": round(audio_duration, 2),
        "transcription_time_sec": round(transcription_time, 2),
        "wer": round(error_rate, 4),
        "prediction": prediction,
        "ground_truth": ground_truth
    })


1.wav | Audio Duration: 8.18s | Transcription Time: 1.64s | WER: 0.375
Prediction   : this is the test now there is a noise finished
Ground Truth : this is test now there is noise finised

10.wav | Audio Duration: 7.75s | Transcription Time: 1.30s | WER: 0.222
Prediction   : tence design to be oven clearly each word is easy to pronounce and understand when rets aloud slowly
Ground Truth : sentence designed to be spoken clearly each word is easy to pronounce and understand when read aloud slowly

2.wav | Audio Duration: 14.72s | Transcription Time: 3.03s | WER: 0.711
Prediction   : sometimes life te sontis tine if hi true life snts not eveny teien of confusion  isas every step even the stoon sepingnito so stronger and wiser
Ground Truth : sometimes life does not go as planned we face struggles doubts and delays but even in the middle of confusion your path has purpose every step even the slow ones is shaping you into someone stronger and wiser

3.wav | Audio Duration: 32.62s | Transcrip

In [16]:
def summarize_onnx_wer_quantized(results_onnx_quantized):
    from collections import defaultdict

    model_wer_stats = defaultdict(lambda: {"total_wer": 0.0, "count": 0, "total_audio_duration": 0.0, "total_transcription_time": 0.0})

    for result in results_onnx_quantized:
        model = result["model"]
        model_wer_stats[model]["total_wer"] += result["wer"]
        model_wer_stats[model]["count"] += 1
        model_wer_stats[model]["total_audio_duration"] += result.get("audio_duration_sec", 0)
        model_wer_stats[model]["total_transcription_time"] += result.get("transcription_time_sec", 0)

    print("\n=== Quantized ONNX Model Summary ===")
    for model, stats in model_wer_stats.items():
        count = stats["count"]
        avg_wer = stats["total_wer"] / count if count > 0 else 0
        avg_audio_dur = stats["total_audio_duration"] / count if count > 0 else 0
        avg_trans_time = stats["total_transcription_time"] / count if count > 0 else 0

        print(f"Model: {model}")
        print(f"  Number of files     : {count}")
        print(f"  Average WER         : {avg_wer:.4f}")
        print(f"  Average Audio Length: {avg_audio_dur:.2f} sec")
        print(f"  Average Transcription Time: {avg_trans_time:.2f} sec\n")


In [20]:
summarize_onnx_wer_quantized(results_onnx_quantized)


=== Quantized ONNX Model Summary ===
Model: large_960h_lv60_quantized.onnx
  Number of files     : 10
  Average WER         : 0.3421
  Average Audio Length: 27.21 sec
  Average Transcription Time: 9.85 sec



In [19]:
summarize_onnx_wer_o(results_onnx)

NameError: name 'summarize_onnx_wer_o' is not defined

### Quantizing OpenVINO model using POT (Post-Training Optimization Toolkit)