In [4]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoProcessor, Wav2Vec2FeatureExtractor,Wav2Vec2CTCTokenizer
import time
import os

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
save_dir_new = "finedtunemodels"
os.makedirs(save_dir_new, exist_ok=True)

In [5]:
# model_id = "facebook/wav2vec2-large-xlsr-53"
# print(f"Loading model {model_id}...")

# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)  
# model = Wav2Vec2ForCTC.from_pretrained(model_id, use_safetensors=True)  
# tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")  
# model.resize_token_embeddings(len(tokenizer))

# processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# model_save_path_2 = os.path.join(save_dir_new, "multilingual_model")
# os.makedirs(model_save_path_2, exist_ok=True)

# model_id = "kiranpantha/wav2vec2-large-xls-r-300m-nepali"
# print(f"Loading model {model_id}...")


# processor = Wav2Vec2Processor.from_pretrained(model_id)  # no use_safetensors here
# model = Wav2Vec2ForCTC.from_pretrained(model_id, use_safetensors=True)  # yes here

# model_save_path = os.path.join(save_dir_new, "fine_tuned_model_kiran")
# os.makedirs(model_save_path, exist_ok=True)

# processor.save_pretrained(model_save_path)
# model.save_pretrained(model_save_path, use_safetensors=True)
# print(f"Saved fine_tuned_model to {model_save_path}")

# print("All models loaded and saved successfully!")




# model_id_1 = "anish-shilpakar/wav2vec2-nepali"
# print(f"Loading model {model_id_1}...")

# processor_1 = Wav2Vec2Processor.from_pretrained(model_id_1)  # no use_safetensors here
# model_1 = Wav2Vec2ForCTC.from_pretrained(model_id_1, use_safetensors=True)  # yes here

# model_save_path_1 = os.path.join(save_dir_new, "fine_tuned_model")
# os.makedirs(model_save_path_1, exist_ok=True)

# processor_1.save_pretrained(model_save_path_1)
# model_1.save_pretrained(model_save_path_1, use_safetensors=True)
# print(f"Saved fine_tuned_model to {model_save_path_1}")

# print("All models loaded and saved successfully!")


In [17]:
from jiwer import wer

models_dir = "finedtunemodels"
test_data_dir = "nepali_audios"
transcribs_dir = "transcribs"

In [7]:
model_names = [name for name in os.listdir(models_dir) if os.path.isdir(os.path.join(models_dir, name))]

In [19]:
import soundfile as sf
import torchaudio.transforms as T
import torch
def load_audio(path, target_sr=16000):
    waveform, sr = sf.read(path)
    if sr != target_sr:
        waveform = torch.tensor(waveform).unsqueeze(0)
        resampler = T.Resample(orig_freq=sr, new_freq=target_sr)
        waveform = resampler(waveform)
        waveform = waveform.squeeze(0).numpy()
    return waveform

In [9]:
def transcribe(waveform, processor, model):
        if waveform.ndim == 2:
            waveform = waveform[0]  
        
        input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values

        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        return transcription.lower()


In [10]:
import torchaudio
torchaudio.set_audio_backend("soundfile")
print(torchaudio.list_audio_backends())  # should now show: ['soundfile']


['soundfile']


  torchaudio.set_audio_backend("soundfile")


In [11]:
results = []

for model_name in model_names:
    print(f"\n--- Using model: {model_name} ---")
    model_path = os.path.join(models_dir, model_name)
    
    # Load processor and model
    processor = Wav2Vec2Processor.from_pretrained(model_path)
    model = Wav2Vec2ForCTC.from_pretrained(model_path)
    model.eval()

    # Loop through all wav files
    for wav_file in sorted(os.listdir(test_data_dir)):
        if not wav_file.endswith(".wav"):
            continue
        
        wav_path = os.path.join(test_data_dir, wav_file)
        txt_file = wav_file.replace(".wav", ".txt")
        txt_path = os.path.join(transcribs_dir, txt_file)

        if not os.path.isfile(txt_path):
            print(f"Transcription file missing for {wav_file}, skipping...")
            continue
        
        # Load audio & ground truth text
        waveform, sample_rate = torchaudio.load(wav_path)
        audio_duration = waveform.shape[1] / sample_rate  # in seconds

        with open(txt_path, "r", encoding="utf-8") as f:
            ground_truth = f.read().strip().lower()

        # Transcribe & measure time
        start_time = time.time()
        pred_text = transcribe(waveform, processor, model)
        duration = time.time() - start_time

        # Calculate WER
        error_rate = wer(ground_truth, pred_text)

        # Print results
        print(f"{model_name} | {wav_file} | Transcribed Time: {duration:.2f}s | Audio Duration: {audio_duration:.2f}s | WER: {error_rate:.3f}")
        print(f"Transcription: {pred_text}")
        print(f"Ground Truth : {ground_truth}\n")

        # Store result
        results.append({
            "model": model_name,
            "audio_file": wav_file,
            "transcribed_time_sec": duration,
            "audio_duration_sec": audio_duration,
            "wer": error_rate,
            "prediction": pred_text,
            "ground_truth": ground_truth,
        })



--- Using model: fine_tuned_model ---
fine_tuned_model | eight.wav | Transcribed Time: 8.44s | Audio Duration: 10.30s | WER: 0.467
Transcription: सहकार्य र सहयोग मात्र समाजमा शान्ति र सम्तृति आउछ आमि सबैदले माया जमझधारी बढाउनुपर्छ।
Ground Truth : सहकार्य र सहयोगले मात्र समाजमा शान्ति र समृद्धि आउँछ हामी सबैले माया र समझदारी बढाउनुपर्छ।

fine_tuned_model | five.wav | Transcribed Time: 3.45s | Audio Duration: 9.70s | WER: 0.375
Transcription: स्वच्छता र स्वास्थ्य पुख्याल राख्नु हरेक नागरिको कर्त ब्यउ सफा वातावरणले मात्र स्वास्थ्य समाज सम्भव हुन्छ।
Ground Truth : स्वच्छता र स्वास्थ्यको ख्याल राख्नु हरेक नागरिकको कर्तव्य हो सफा वातावरणले मात्र स्वस्थ समाज सम्भव हुन्छ।

fine_tuned_model | four.wav | Transcribed Time: 2.16s | Audio Duration: 9.38s | WER: 0.500
Transcription: हाम्नो देशको समनीति हाम र एकताको बलमा निर्भन गर्छ हामीले एक भएर मात्रै उजुवल भविषय बनाउन सक्छ।
Ground Truth : हाम्रो देशको समृद्धि हाम्रो एकताको बलमा निर्भर गर्छ हामीले एक भएर मात्रै उज्जवल भविष्य बनाउन सक्छौं।

fine_tu

In [12]:
def summarize_model_wer(results):
    from collections import defaultdict

    model_wer_stats = defaultdict(lambda: {
        "total_wer": 0.0,
        "count": 0,
        "total_audio_duration": 0.0,
        "total_transcription_time": 0.0
    })

    for result in results:
        model = result["model"]
        model_wer_stats[model]["total_wer"] += result["wer"]
        model_wer_stats[model]["count"] += 1
        model_wer_stats[model]["total_audio_duration"] += result.get("audio_duration_sec", 0)
        model_wer_stats[model]["total_transcription_time"] += result.get("transcribed_time_sec", 0)

    print("\n=== Model Summary ===")
    for model, stats in model_wer_stats.items():
        count = stats["count"]
        avg_wer = stats["total_wer"] / count if count > 0 else 0
        avg_audio_dur = stats["total_audio_duration"] / count if count > 0 else 0
        avg_trans_time = stats["total_transcription_time"] / count if count > 0 else 0

        print(f"Model: {model}")
        print(f"  Number of files     : {count}")
        print(f"  Average WER         : {avg_wer:.4f}")
        print(f"  Average Audio Length: {avg_audio_dur:.2f} sec")
        print(f"  Average Transcription Time: {avg_trans_time:.2f} sec\n")


In [5]:
model_onx = "finedtunemodels/fine_tuned_model/"
onx_model = "fine_tuned_models.onnx"

In [6]:
def convert_model_to_onnx(model_onx, onx_model):
    print(f"Converting mode {model_onx}")
    model = Wav2Vec2ForCTC.from_pretrained(model_onx)
    model.eval()

    audio_len = 16000   
    dummy_input = torch.randn(1, audio_len)

    torch.onnx.export(
        model,     
        dummy_input,   
        onx_model,             
        export_params=True,     
        opset_version=14,         
        do_constant_folding=True,    
        input_names=["input_values"],
        output_names=["logits"],     
        dynamic_axes={
            "input_values": {1: "sequence_length"},
            "logits": {1: "sequence_length"}
        },                         
    )
    print(f"ONNX model saved at: {onx_model}")

In [7]:
convert_model_to_onnx(model_onx, onx_model)
processor = Wav2Vec2Processor.from_pretrained(model_onx)

Converting mode finedtunemodels/fine_tuned_model/


  is_causal = query.shape[2] > 1 and attention_mask is None and getattr(module, "is_causal", True)


ONNX model saved at: fine_tuned_models.onnx


In [8]:
import onnxruntime as rt

import numpy as np

sess_options = rt.SessionOptions()
sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
session = rt.InferenceSession(onx_model, sess_options)

In [24]:
def transcribe_from_file(filepath):
    # Load audio
    speech_array, sr = sf.read(filepath)

    # If not mono, convert to mono
    if len(speech_array.shape) > 1:
        speech_array = np.mean(speech_array, axis=1)

    # Resample if needed
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        speech_array = resampler(torch.tensor(speech_array).float()).numpy()
        sr = 16000

    # Prepare input for ONNX
    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt")
    input_values = inputs.input_values.numpy()

    # Run ONNX inference
    ort_inputs = {session.get_inputs()[0].name: input_values}
    ort_outputs = session.run(None, ort_inputs)[0]

    # Decode prediction
    predicted_ids = np.argmax(ort_outputs, axis=-1)
    transcription = processor.decode(predicted_ids[0])

    return transcription.lower()

In [25]:
results_onnx = []

for wav_file in sorted(os.listdir(test_data_dir)):
    if not wav_file.endswith(".wav"):
        continue

    wav_path = os.path.join(test_data_dir, wav_file)
    txt_file = wav_file.replace(".wav", ".txt")
    txt_path = os.path.join(transcribs_dir, txt_file)

    if not os.path.isfile(txt_path):
        print(f"Ground truth missing for {wav_file}, skipping...")
        continue

    # Read ground truth transcription
    with open(txt_path, "r", encoding="utf-8") as f:
        ground_truth = f.read().strip().lower()

    # Load audio to get duration
    speech_array, sr = sf.read(wav_path)
    audio_duration = len(speech_array) / sr  # audio duration in seconds

    # Transcribe with ONNX model and measure time
    start_time = time.time()
    try:
        prediction = transcribe_from_file(wav_path)
    except Exception as e:
        print(f"Error transcribing {wav_file}: {e}")
        continue
    transcription_time = time.time() - start_time

    # Compute WER
    error_rate = wer(ground_truth, prediction)

    # Print result
    print(f"{wav_file} | Audio Duration: {audio_duration:.2f}s | Transcription Time: {transcription_time:.2f}s | WER: {error_rate:.3f}")
    print(f"Prediction   : {prediction}")
    print(f"Ground Truth : {ground_truth}\n")

    # Save result with updated keys
    results_onnx.append({
        "model": "onnx_FineTuned",
        "audio_file_name": wav_file,
        "audio_duration_sec": round(audio_duration, 2),
        "transcription_time_sec": round(transcription_time, 2),
        "wer": round(error_rate, 4),
        "prediction": prediction,
        "ground_truth": ground_truth
    })


eight.wav | Audio Duration: 10.30s | Transcription Time: 2.33s | WER: 0.467
Prediction   : सहकार्य र सहयोग मात्र समाजमा शान्ति र सम्तृति आउछ आमि सबैदले माया जमझधारी बढाउनुपर्छ।
Ground Truth : सहकार्य र सहयोगले मात्र समाजमा शान्ति र समृद्धि आउँछ हामी सबैले माया र समझदारी बढाउनुपर्छ।

five.wav | Audio Duration: 9.70s | Transcription Time: 1.65s | WER: 0.375
Prediction   : स्वच्छता र स्वास्थ्य पुख्याल राख्नु हरेक नागरिको कर्त ब्यउ सफा वातावरणले मात्र स्वास्थ्य समाज सम्भव हुन्छ।
Ground Truth : स्वच्छता र स्वास्थ्यको ख्याल राख्नु हरेक नागरिकको कर्तव्य हो सफा वातावरणले मात्र स्वस्थ समाज सम्भव हुन्छ।

four.wav | Audio Duration: 9.38s | Transcription Time: 1.55s | WER: 0.500
Prediction   : हाम्नो देशको समनीति हाम र एकताको बलमा निर्भन गर्छ हामीले एक भएर मात्रै उजुवल भविषय बनाउन सक्छ।
Ground Truth : हाम्रो देशको समृद्धि हाम्रो एकताको बलमा निर्भर गर्छ हामीले एक भएर मात्रै उज्जवल भविष्य बनाउन सक्छौं।

nine.wav | Audio Duration: 9.91s | Transcription Time: 1.47s | WER: 0.467
Prediction   : कामभू ने

In [26]:
def summarize_onnx_wer_o(results_onnx):
    from collections import defaultdict

    model_wer_stats = defaultdict(lambda: {"total_wer": 0.0, "count": 0, "total_audio_duration": 0.0, "total_transcription_time": 0.0})

    for result in results_onnx:
        model = result["model"]
        model_wer_stats[model]["total_wer"] += result["wer"]
        model_wer_stats[model]["count"] += 1
        model_wer_stats[model]["total_audio_duration"] += result.get("audio_duration_sec", 0)
        model_wer_stats[model]["total_transcription_time"] += result.get("transcription_time_sec", 0)

    print("\n=== ONNX Model Summary ===")
    for model, stats in model_wer_stats.items():
        count = stats["count"]
        avg_wer = stats["total_wer"] / count if count > 0 else 0
        avg_audio_dur = stats["total_audio_duration"] / count if count > 0 else 0
        avg_trans_time = stats["total_transcription_time"] / count if count > 0 else 0

        print(f"Model: {model}")
        print(f"  Number of files     : {count}")
        print(f"  Average WER         : {avg_wer:.4f}")
        print(f"  Average Audio Length: {avg_audio_dur:.2f} sec")
        print(f"  Average Transcription Time: {avg_trans_time:.2f} sec\n")


In [20]:
import openvino as ov

In [21]:
example = torch.randn(1, 160000)
open_vino_model = ov.convert_model(model, example_input = (example,))
core = ov.Core()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


In [22]:
compiled_model  = core.compile_model(open_vino_model, 'CPU')

In [23]:
def transcribe_from_file_ov(filepath):
    speech_array, sr = sf.read(filepath)

    if len(speech_array.shape) > 1:
        speech_array = np.mean(speech_array, axis=1)

    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        speech_array = resampler(torch.tensor(speech_array).float()).numpy()

    inputs = processor(speech_array, sampling_rate=16000, return_tensors="np")
    input_values = inputs["input_values"]

    ov_inputs = {0: input_values}
    ov_outputs = compiled_model(ov_inputs)
    logits = list(ov_outputs.values())[0]

    predicted_ids = np.argmax(logits, axis=-1)
    transcription = processor.decode(predicted_ids[0])

    return transcription.lower()

In [24]:


results_openvino = []

for wav_file in sorted(os.listdir(test_data_dir)):
    if not wav_file.endswith(".wav"):
        continue

    wav_path = os.path.join(test_data_dir, wav_file)
    txt_file = wav_file.replace(".wav", ".txt")
    txt_path = os.path.join(transcribs_dir, txt_file)

    if not os.path.isfile(txt_path):
        print(f"❌ Ground truth missing for {wav_file}, skipping...")
        continue

    with open(txt_path, "r", encoding="utf-8") as f:
        ground_truth = f.read().strip().lower()

    speech_array, sr = sf.read(wav_path)
    audio_duration = len(speech_array) / sr

    try:
        start_time = time.time()
        prediction = transcribe_from_file_ov(wav_path)
        transcription_time = time.time() - start_time
    except Exception as e:
        print(f"❌ Error transcribing {wav_file}: {e}")
        continue

    error_rate = wer(ground_truth, prediction)

    print(f"{wav_file} | Audio Duration: {audio_duration:.2f}s | Transcription Time: {transcription_time:.2f}s | WER: {error_rate:.3f}")
    print(f"🔊 Prediction   : {prediction}")
    print(f"📜 Ground Truth : {ground_truth}\n")

    results_openvino.append({
        "model": "openvino_FineTuned",
        "audio_file_name": wav_file,
        "audio_duration_sec": round(audio_duration, 2),
        "transcription_time_sec": round(transcription_time, 2),
        "wer": round(error_rate, 4),
        "prediction": prediction,
        "ground_truth": ground_truth
    })

eight.wav | Audio Duration: 10.30s | Transcription Time: 2.29s | WER: 0.467
🔊 Prediction   : सहकार्य र सहयोग मात्र समाजमा शान्ति र सम्तृति आउछ आमि सबैदले माया जमझधारी बढाउनुपर्छ।
📜 Ground Truth : सहकार्य र सहयोगले मात्र समाजमा शान्ति र समृद्धि आउँछ हामी सबैले माया र समझदारी बढाउनुपर्छ।

five.wav | Audio Duration: 9.70s | Transcription Time: 2.13s | WER: 0.375
🔊 Prediction   : स्वच्छता र स्वास्थ्य पुख्याल राख्नु हरेक नागरिको कर्त ब्यउ सफा वातावरणले मात्र स्वास्थ्य समाज सम्भव हुन्छ।
📜 Ground Truth : स्वच्छता र स्वास्थ्यको ख्याल राख्नु हरेक नागरिकको कर्तव्य हो सफा वातावरणले मात्र स्वस्थ समाज सम्भव हुन्छ।

four.wav | Audio Duration: 9.38s | Transcription Time: 1.40s | WER: 0.500
🔊 Prediction   : हाम्नो देशको समनीति हाम र एकताको बलमा निर्भन गर्छ हामीले एक भएर मात्रै उजुवल भविषय बनाउन सक्छ।
📜 Ground Truth : हाम्रो देशको समृद्धि हाम्रो एकताको बलमा निर्भर गर्छ हामीले एक भएर मात्रै उज्जवल भविष्य बनाउन सक्छौं।

nine.wav | Audio Duration: 9.91s | Transcription Time: 1.46s | WER: 0.467
🔊 Predictio

In [25]:
def summarize_onnx_wer_open(results_openvino):
    from collections import defaultdict

    model_wer_stats = defaultdict(lambda: {"total_wer": 0.0, "count": 0, "total_audio_duration": 0.0, "total_transcription_time": 0.0})

    for result in results_openvino:
        model = result["model"]
        model_wer_stats[model]["total_wer"] += result["wer"]
        model_wer_stats[model]["count"] += 1
        model_wer_stats[model]["total_audio_duration"] += result.get("audio_duration_sec", 0)
        model_wer_stats[model]["total_transcription_time"] += result.get("transcription_time_sec", 0)

    print("\n=== Openvino Model Summary ===")
    for model, stats in model_wer_stats.items():
        count = stats["count"]
        avg_wer = stats["total_wer"] / count if count > 0 else 0
        avg_audio_dur = stats["total_audio_duration"] / count if count > 0 else 0
        avg_trans_time = stats["total_transcription_time"] / count if count > 0 else 0

        print(f"Model: {model}")
        print(f"  Number of files     : {count}")
        print(f"  Average WER         : {avg_wer:.4f}")
        print(f"  Average Audio Length: {avg_audio_dur:.2f} sec")
        print(f"  Average Transcription Time: {avg_trans_time:.2f} sec\n")


In [26]:
summarize_onnx_wer_open(results_openvino)


=== Openvino Model Summary ===
Model: openvino_FineTuned
  Number of files     : 10
  Average WER         : 0.3819
  Average Audio Length: 9.62 sec
  Average Transcription Time: 1.46 sec



In [27]:
summarize_onnx_wer_o(results_onnx)


=== ONNX Model Summary ===
Model: onnx_FineTuned
  Number of files     : 10
  Average WER         : 0.3819
  Average Audio Length: 9.62 sec
  Average Transcription Time: 1.92 sec



In [28]:
summarize_model_wer(results)


=== Model Summary ===
Model: fine_tuned_model
  Number of files     : 10
  Average WER         : 0.3819
  Average Audio Length: 9.62 sec
  Average Transcription Time: 2.92 sec



### QUANTIZING pytorch fine tuned nepali model

In [29]:
from torch.quantization import quantize_dynamic

In [30]:
model.eval()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [31]:
dyanamic_quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype = torch.qint8)

In [32]:
quantized_path = os.path.join(models_dir, "dynamic_quantized_finedTuned_model_state_dict.pth")
torch.save(dyanamic_quantized_model.state_dict(), quantized_path)
print(f"Quantized model state_dict saved to: {quantized_path}")

Quantized model state_dict saved to: finedtunemodels\dynamic_quantized_finedTuned_model_state_dict.pth


In [9]:
import os

model_path = "fine_tuned_models.onnx"
size_bytes = os.path.getsize(model_path)
size_mb = size_bytes / (1024 * 1024)
size_gb = size_mb / 1024

print(f"Model size: {size_bytes} bytes")
print(f"Model size: {size_mb:.2f} MB")
print(f"Model size: {size_gb:.2f} GB")

if size_gb > 2:
    print("Model size is GREATER than 2GB")
else:
    print("Model size is LESS than or equal to 2GB")

Model size: 1262519609 bytes
Model size: 1204.03 MB
Model size: 1.18 GB
Model size is LESS than or equal to 2GB


### Dynamic quantizing ONNX fine tuned model 

In [10]:
import onnx

model_path = "fine_tuned_models.onnx"
model = onnx.load(model_path)

In [11]:
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxruntime.quantization.shape_inference import quant_pre_process

In [12]:
quant_pre_process(
    input_model="fine_tuned_models.onnx",
    output_model_path="fine_tuned_models_optimized.onnx",
    save_as_external_data=True,
    all_tensors_to_one_file=True,               
    external_data_location="fine_tuned_models.data"  
)

In [14]:
quantize_dynamic(
    model_input="fine_tuned_models_optimized.onnx",  
    model_output="fine_tuned_models_quantized.onnx", 
    weight_type=QuantType.QInt8,
)

### Transcribe using quantized finetuned pytorch model

In [41]:
state_dict = torch.load(quantized_path)
dyanamic_quantized_model.load_state_dict(state_dict)

  device=storage.device,


<All keys matched successfully>

In [42]:
dyanamic_quantized_model.eval()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [43]:
def transcribe_with_quantized_model(wav_path):
    # Load audio
    waveform, sr = torchaudio.load(wav_path)

    # Convert stereo to mono if needed
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample if needed
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        waveform = resampler(waveform)

    # Remove batch/channel dimension
    waveform = waveform.squeeze()

    # Preprocess
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)

    # Inference
    with torch.no_grad():
        outputs = dyanamic_quantized_model(**inputs)
        logits = outputs.logits
        predicted_ids = torch.argmax(logits, dim=-1)

    # Decode
    transcription = processor.decode(predicted_ids[0])

    return transcription.lower()

In [44]:
results_quantized = []

for wav_file in sorted(os.listdir(test_data_dir)):
    if not wav_file.endswith(".wav"):
        continue

    wav_path = os.path.join(test_data_dir, wav_file)
    txt_file = wav_file.replace(".wav", ".txt")
    txt_path = os.path.join(transcribs_dir, txt_file)

    if not os.path.isfile(txt_path):
        print(f"Ground truth missing for {wav_file}, skipping...")
        continue

    # Read ground truth transcription
    with open(txt_path, "r", encoding="utf-8") as f:
        ground_truth = f.read().strip().lower()

    # Load audio to get duration
    speech_array, sr = sf.read(wav_path)
    audio_duration = len(speech_array) / sr  # audio duration in seconds

    # Transcribe with ONNX model and measure time
    start_time = time.time()
    try:
        prediction = transcribe_with_quantized_model(wav_path)
    except Exception as e:
        print(f"Error transcribing {wav_file}: {e}")
        continue
    transcription_time = time.time() - start_time

    # Compute WER
    error_rate = wer(ground_truth, prediction)

    # Print result
    print(f"{wav_file} | Audio Duration: {audio_duration:.2f}s | Transcription Time: {transcription_time:.2f}s | WER: {error_rate:.3f}")
    print(f"Prediction   : {prediction}")
    print(f"Ground Truth : {ground_truth}\n")

    # Save result with updated keys
    results_quantized.append({
        "model": dyanamic_quantized_model,
        "audio_file_name": wav_file,
        "audio_duration_sec": round(audio_duration, 2),
        "transcription_time_sec": round(transcription_time, 2),
        "wer": round(error_rate, 4),
        "prediction": prediction,
        "ground_truth": ground_truth
    })


eight.wav | Audio Duration: 10.30s | Transcription Time: 1.66s | WER: 0.467
Prediction   : सहकार्य र सहयोग मात्र समाजमा शान्ति र सम्धृति आउछ आमि सबैदले माया जमजधारी बढाउनुपर्छ।
Ground Truth : सहकार्य र सहयोगले मात्र समाजमा शान्ति र समृद्धि आउँछ हामी सबैले माया र समझदारी बढाउनुपर्छ।

five.wav | Audio Duration: 9.70s | Transcription Time: 1.14s | WER: 0.375
Prediction   : स्वच्छता र स्वास्थ्य पुख्याल राख्नु हरेक नागरिको कर्त ब्यउु सफा वातावरणले मात्र स्वास्थ्य समाज सम्भव हुन्छ।
Ground Truth : स्वच्छता र स्वास्थ्यको ख्याल राख्नु हरेक नागरिकको कर्तव्य हो सफा वातावरणले मात्र स्वस्थ समाज सम्भव हुन्छ।

four.wav | Audio Duration: 9.38s | Transcription Time: 1.08s | WER: 0.438
Prediction   : हाम्नो देशको संनीति हाम रो एकताको बलमा निर्भन गर्छ हामीले एक भएर मात्रै उजुवल भविष्य बनाउन सक्छ।
Ground Truth : हाम्रो देशको समृद्धि हाम्रो एकताको बलमा निर्भर गर्छ हामीले एक भएर मात्रै उज्जवल भविष्य बनाउन सक्छौं।

nine.wav | Audio Duration: 9.91s | Transcription Time: 1.10s | WER: 0.400
Prediction   : कामभू

In [45]:
def summarize_model_wer_quantized(results_quantized):
    from collections import defaultdict

    model_stats = defaultdict(lambda: {
        "total_wer": 0.0,
        "total_audio_duration": 0.0,
        "total_transcription_time": 0.0,
        "count": 0
    })

    for result in results_quantized:
        model = result["model"]
        model_stats[model]["total_wer"] += result["wer"]
        model_stats[model]["total_audio_duration"] += result.get("audio_duration_sec", 0)
        model_stats[model]["total_transcription_time"] += result.get("transcription_time_sec", 0)
        model_stats[model]["count"] += 1

    print("\n=== Average Metrics per Model ===")
    for model, stats in model_stats.items():
        count = stats["count"]
        avg_wer = stats["total_wer"] / count if count > 0 else 0
        avg_audio_duration = stats["total_audio_duration"] / count if count > 0 else 0
        avg_transcription_time = stats["total_transcription_time"] / count if count > 0 else 0

        print(f"Model: {model}")
        print(f"  Number of files        : {count}")
        print(f"  Average WER            : {avg_wer:.4f}")
        print(f"  Average Audio Duration : {avg_audio_duration:.2f} sec")
        print(f"  Average Transcription Time : {avg_transcription_time:.2f} sec\n")


In [46]:
summarize_model_wer_quantized(results_quantized)


=== Average Metrics per Model ===
Model: Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-

In [47]:
summarize_model_wer(results)


=== Model Summary ===
Model: fine_tuned_model
  Number of files     : 10
  Average WER         : 0.3819
  Average Audio Length: 9.62 sec
  Average Transcription Time: 2.92 sec



### Transcription using finetuned ONNX model

In [15]:
def transcribe_from_file(filepath):
    # Load audio
    speech_array, sr = sf.read(filepath)

    # If not mono, convert to mono
    if len(speech_array.shape) > 1:
        speech_array = np.mean(speech_array, axis=1)

    # Resample if needed
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        speech_array = resampler(torch.tensor(speech_array).float()).numpy()
        sr = 16000

    # Prepare input for ONNX
    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt")
    input_values = inputs.input_values.numpy()

    # Run ONNX inference
    ort_inputs = {session.get_inputs()[0].name: input_values}
    ort_outputs = session.run(None, ort_inputs)[0]

    # Decode prediction
    predicted_ids = np.argmax(ort_outputs, axis=-1)
    transcription = processor.decode(predicted_ids[0])

    return transcription.lower()

In [27]:
results_onnx_quantized = []

for wav_file in sorted(os.listdir(test_data_dir)):
    if not wav_file.endswith(".wav"):
        continue

    wav_path = os.path.join(test_data_dir, wav_file)
    txt_file = wav_file.replace(".wav", ".txt")
    txt_path = os.path.join(transcribs_dir, txt_file)

    if not os.path.isfile(txt_path):
        print(f"Ground truth missing for {wav_file}, skipping...")
        continue

    # Read ground truth transcription
    with open(txt_path, "r", encoding="utf-8") as f:
        ground_truth = f.read().strip().lower()

    # Load audio to get duration
    speech_array, sr = sf.read(wav_path)
    audio_duration = len(speech_array) / sr  # audio duration in seconds

    # Transcribe with ONNX model and measure time
    start_time = time.time()
    try:
        prediction = transcribe_from_file(wav_path)
    except Exception as e:
        print(f"Error transcribing {wav_file}: {e}")
        continue
    transcription_time = time.time() - start_time

    # Compute WER
    error_rate = wer(ground_truth, prediction)

    # Print result
    print(f"{wav_file} | Audio Duration: {audio_duration:.2f}s | Transcription Time: {transcription_time:.2f}s | WER: {error_rate:.3f}")
    print(f"Prediction   : {prediction}")
    print(f"Ground Truth : {ground_truth}\n")

    # Save result with updated keys
    results_onnx_quantized.append({
        "model": "fine_tuned_models_quantized.onnx",
        "audio_file_name": wav_file,
        "audio_duration_sec": round(audio_duration, 2),
        "transcription_time_sec": round(transcription_time, 2),
        "wer": round(error_rate, 4),
        "prediction": prediction,
        "ground_truth": ground_truth
    })


eight.wav | Audio Duration: 10.30s | Transcription Time: 1.65s | WER: 0.467
Prediction   : सहकार्य र सहयोग मात्र समाजमा शान्ति र सम्तृति आउछ आमि सबैदले माया जमझधारी बढाउनुपर्छ।
Ground Truth : सहकार्य र सहयोगले मात्र समाजमा शान्ति र समृद्धि आउँछ हामी सबैले माया र समझदारी बढाउनुपर्छ।

five.wav | Audio Duration: 9.70s | Transcription Time: 1.52s | WER: 0.375
Prediction   : स्वच्छता र स्वास्थ्य पुख्याल राख्नु हरेक नागरिको कर्त ब्यउ सफा वातावरणले मात्र स्वास्थ्य समाज सम्भव हुन्छ।
Ground Truth : स्वच्छता र स्वास्थ्यको ख्याल राख्नु हरेक नागरिकको कर्तव्य हो सफा वातावरणले मात्र स्वस्थ समाज सम्भव हुन्छ।

four.wav | Audio Duration: 9.38s | Transcription Time: 1.45s | WER: 0.500
Prediction   : हाम्नो देशको समनीति हाम र एकताको बलमा निर्भन गर्छ हामीले एक भएर मात्रै उजुवल भविषय बनाउन सक्छ।
Ground Truth : हाम्रो देशको समृद्धि हाम्रो एकताको बलमा निर्भर गर्छ हामीले एक भएर मात्रै उज्जवल भविष्य बनाउन सक्छौं।

nine.wav | Audio Duration: 9.91s | Transcription Time: 1.53s | WER: 0.467
Prediction   : कामभू ने

In [28]:
def summarize_onnx_wer_quantized(results_onnx_quantized):
    from collections import defaultdict

    model_wer_stats = defaultdict(lambda: {"total_wer": 0.0, "count": 0, "total_audio_duration": 0.0, "total_transcription_time": 0.0})

    for result in results_onnx_quantized:
        model = result["model"]
        model_wer_stats[model]["total_wer"] += result["wer"]
        model_wer_stats[model]["count"] += 1
        model_wer_stats[model]["total_audio_duration"] += result.get("audio_duration_sec", 0)
        model_wer_stats[model]["total_transcription_time"] += result.get("transcription_time_sec", 0)

    print("\n=== Quantized ONNX Model Summary ===")
    for model, stats in model_wer_stats.items():
        count = stats["count"]
        avg_wer = stats["total_wer"] / count if count > 0 else 0
        avg_audio_dur = stats["total_audio_duration"] / count if count > 0 else 0
        avg_trans_time = stats["total_transcription_time"] / count if count > 0 else 0

        print(f"Model: {model}")
        print(f"  Number of files     : {count}")
        print(f"  Average WER         : {avg_wer:.4f}")
        print(f"  Average Audio Length: {avg_audio_dur:.2f} sec")
        print(f"  Average Transcription Time: {avg_trans_time:.2f} sec\n")


In [29]:
summarize_onnx_wer_quantized(results_onnx_quantized)


=== Quantized ONNX Model Summary ===
Model: fine_tuned_models_quantized.onnx
  Number of files     : 10
  Average WER         : 0.3819
  Average Audio Length: 9.62 sec
  Average Transcription Time: 1.48 sec



In [30]:
summarize_onnx_wer_o(results_onnx)


=== ONNX Model Summary ===
Model: onnx_FineTuned
  Number of files     : 10
  Average WER         : 0.3819
  Average Audio Length: 9.62 sec
  Average Transcription Time: 1.57 sec

