# Wav2vec2Bert

In [None]:
import librosa
import soundfile as sf
import os
import numpy as np
from IPython.display import Audio, display

In [None]:
def load_audio(file_paths):
    """Read every audio file with librosa and collect the decoded results.

    Each file is loaded with ``sr=None`` and ``mono=False``, and a short
    summary (path, array shape, sample rate) is printed per file.

    Args:
        file_paths: Iterable of paths to the audio files to load.

    Returns:
        A list of ``(file_path, audio, sr)`` tuples, in input order.
    """
    results = []

    for path in file_paths:
        samples, rate = librosa.load(path, sr=None, mono=False)
        results.append((path, samples, rate))

        # Per-file summary, one item per output line
        print(
            f"Loaded {path}",
            f"Shape: {samples.shape}, Sample rate: {rate}",
            "---",
            sep="\n",
        )

    return results

In [None]:
from google.colab import files

# Ask for the audio files through the Colab upload widget.
uploaded = files.upload()  # This will prompt you to upload files

# The keys of the upload result are used as the paths handed to load_audio.
file_paths = list(uploaded.keys())

Saving common_voice_hi_31998548.mp3 to common_voice_hi_31998548.mp3
Saving common_voice_hi_31998549.mp3 to common_voice_hi_31998549.mp3
Saving common_voice_hi_31998550.mp3 to common_voice_hi_31998550.mp3


In [None]:
# Load audio files; each entry of loaded_files is a (file_path, audio, sr) tuple
loaded_files = load_audio(file_paths)

Loaded common_voice_hi_31998548.mp3
Shape: (199296,), Sample rate: 32000
---
Loaded common_voice_hi_31998549.mp3
Shape: (182016,), Sample rate: 32000
---
Loaded common_voice_hi_31998550.mp3
Shape: (185472,), Sample rate: 32000
---


In [None]:
def preprocess_audio(loaded_files, target_sr=16000, mono=True):
    """Resample and optionally downmix loaded clips, with before/after players.

    For every clip an audio player is displayed for the untouched signal and
    another one for the processed signal, followed by a printed summary.

    Args:
        loaded_files: Iterable of ``(file_path, audio, sr)`` tuples.
        target_sr: Sample rate the audio is resampled to when it differs
            from the clip's own rate.
        mono: When truthy, audio with more than one dimension is passed
            through ``librosa.to_mono``.

    Returns:
        A list of ``(file_path, audio, target_sr)`` tuples, in input order.
    """
    processed_files = []

    for entry in loaded_files:
        file_path, waveform, source_sr = entry
        clip_name, _ = os.path.splitext(os.path.basename(file_path))

        # Player for the untouched signal
        print(f"Original audio: {clip_name}")
        display(Audio(waveform, rate=source_sr))

        # Bring the clip to the requested rate only when it is not there yet
        if source_sr != target_sr:
            waveform = librosa.resample(
                y=waveform, orig_sr=source_sr, target_sr=target_sr
            )

        # Downmix on request; single-dimension audio is left as it is
        wants_downmix = mono and waveform.ndim > 1
        if wants_downmix:
            waveform = librosa.to_mono(waveform)

        processed_files.append((file_path, waveform, target_sr))

        # Player for the processed signal
        print(f"Processed audio: {clip_name}")
        display(Audio(waveform, rate=target_sr))

        print(
            f"Processed {file_path}",
            f"Shape: {waveform.shape}, Sample rate: {target_sr}",
            "---",
            sep="\n",
        )

    return processed_files

In [None]:
# Process loaded audio files: resample to 16 kHz and downmix multi-channel audio to mono
processed_files = preprocess_audio(loaded_files, target_sr=16000, mono=True)

Original audio: common_voice_hi_31998548


Processed audio: common_voice_hi_31998548


Processed common_voice_hi_31998548.mp3
Shape: (99648,), Sample rate: 16000
---
Original audio: common_voice_hi_31998549


Processed audio: common_voice_hi_31998549


Processed common_voice_hi_31998549.mp3
Shape: (91008,), Sample rate: 16000
---
Original audio: common_voice_hi_31998550


Processed audio: common_voice_hi_31998550


Processed common_voice_hi_31998550.mp3
Shape: (92736,), Sample rate: 16000
---


In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face Hub login widget in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoProcessor, Wav2Vec2BertForCTC
import torch
import numpy as np

# Initialize the processor and model from one checkpoint id so the two can
# never drift apart when the checkpoint is changed.
# NOTE(review): the checkpoint name ends in "-en" while the uploaded clips are
# named common_voice_hi_*; confirm that this checkpoint is the intended one
# for this audio.
WAV2VEC2BERT_CHECKPOINT = "hf-audio/wav2vec2-bert-CV16-en"
wav2vec2bert_processor = AutoProcessor.from_pretrained(WAV2VEC2BERT_CHECKPOINT)
wav2vec2bert_model = Wav2Vec2BertForCTC.from_pretrained(WAV2VEC2BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/299 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/369 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/32.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/544 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.42G [00:00<?, ?B/s]

In [None]:
import time
import psutil
def transcribe_with_wav2vec2Bert(processed_files):
    """Transcribe preprocessed clips and record time, CPU and memory per clip.

    Relies on the module-level ``wav2vec2bert_processor`` and
    ``wav2vec2bert_model``. Clips whose sample rate is not 16000 Hz are
    reported and skipped.

    Args:
        processed_files: Iterable of ``(file_path, audio, sr)`` tuples.

    Returns:
        A 4-tuple of dicts keyed by file path:
        ``(transcriptions, transcription_times, cpu_usages, mem_usages)``.
        Times are in seconds, CPU usage is the ``psutil.cpu_percent`` reading
        covering the transcription of that clip, and memory usage is the
        change of this process's RSS across the transcription, in MB.
    """
    transcriptions = {}
    transcription_times = {}
    cpu_usages = {}
    mem_usages = {}

    process = psutil.Process()

    for file_path, audio, sr in processed_files:
        if sr != 16000:
            # Wav2Vec2 requires 16kHz; say so instead of dropping the clip silently
            print(f"Skipping {file_path}: sample rate {sr} Hz, expected 16000 Hz")
            continue

        # cpu_percent(interval=None) reports utilisation since the previous
        # call, so this throw-away call starts the measurement window here.
        # Subtracting two readings (as was done before) is meaningless and
        # produced negative "usage" values.
        psutil.cpu_percent(interval=None)
        mem_before = process.memory_info().rss
        start_time = time.perf_counter()  # monotonic clock for elapsed time

        # Preprocess audio
        inputs = wav2vec2bert_processor(audio, sampling_rate=sr, return_tensors="pt")
        with torch.no_grad():
            # Pass the entire 'inputs' dictionary to the model
            logits = wav2vec2bert_model(**inputs).logits
            predicted_ids = torch.argmax(logits, dim=-1)
        transcription = wav2vec2bert_processor.batch_decode(predicted_ids)[0]

        # Record timings and resource usage for exactly this clip
        elapsed_time = time.perf_counter() - start_time
        cpu_usage = psutil.cpu_percent(interval=None)
        mem_after = process.memory_info().rss

        transcription_times[file_path] = elapsed_time
        cpu_usages[file_path] = cpu_usage
        mem_usages[file_path] = (mem_after - mem_before) / (1024 * 1024)  # Convert bytes to MB
        transcriptions[file_path] = transcription

        print(f"Transcribed {file_path}")
        print(f"Transcription: {transcription}")
        print(f"Time taken: {elapsed_time:.2f} seconds")
        print(f"CPU usage: {cpu_usages[file_path]:.2f} %")
        print(f"Memory usage: {mem_usages[file_path]:.2f} MB")
        print("---")

    return transcriptions, transcription_times, cpu_usages, mem_usages

In [None]:
# Transcribe every preprocessed clip; the call returns the transcripts plus
# per-file timing, CPU and memory dictionaries.
wav2vec2Bert_transcriptions, transcription_times, cpu_usages, mem_usages = transcribe_with_wav2vec2Bert(processed_files)

# Print the raw per-file measurement dictionaries.
print("Transcription Times:", transcription_times)
print("CPU Usages:", cpu_usages)
print("Memory Usages:", mem_usages)

Transcribed common_voice_hi_31998548.mp3
Transcription: me ik natak padhraha hung
Time taken: 11.82 seconds
CPU usage: 41.00 %
Memory usage: 132.39 MB
---
Transcribed common_voice_hi_31998549.mp3
Transcription: bhartie sansad videshi wishwa vit dala me padhange
Time taken: 7.18 seconds
CPU usage: -29.00 %
Memory usage: 1.36 MB
---
Transcribed common_voice_hi_31998550.mp3
Transcription: shertokeulanghan ki karan kochi teskers eye piels in i lumbit
Time taken: 9.57 seconds
CPU usage: 25.50 %
Memory usage: 0.00 MB
---
Transcription Times: {'common_voice_hi_31998548.mp3': 11.824863195419312, 'common_voice_hi_31998549.mp3': 7.1815009117126465, 'common_voice_hi_31998550.mp3': 9.569878101348877}
CPU Usages: {'common_voice_hi_31998548.mp3': 41.00000000000001, 'common_voice_hi_31998549.mp3': -29.000000000000007, 'common_voice_hi_31998550.mp3': 25.500000000000007}
Memory Usages: {'common_voice_hi_31998548.mp3': 132.390625, 'common_voice_hi_31998549.mp3': 1.36328125, 'common_voice_hi_31998550.mp3

In [None]:
# Show the transcription dict (file name -> model output) as the cell result.
wav2vec2Bert_transcriptions

{'common_voice_hi_31998548.mp3': 'me ik natak padhraha hung',
 'common_voice_hi_31998549.mp3': 'bhartie sansad videshi wishwa vit dala me padhange',
 'common_voice_hi_31998550.mp3': 'shertokeulanghan ki karan kochi teskers eye piels in i lumbit'}

WER (word error rate) and CER (character error rate) are the standard metrics for evaluating transcription accuracy.
BLEU is helpful for assessing the quality of longer transcriptions.
PESQ, STOI, and eSTOI measure the perceptual quality and intelligibility of speech audio, so they are useful only if you are also interested in the audio signal itself rather than the transcribed text.
SNR (signal-to-noise ratio) and PER (phoneme error rate) are valuable for specific scenarios where noise and phoneme-level accuracy are important.

In [None]:
from google.colab import files

# Upload the reference file
uploaded = files.upload()

# Get the name of the uploaded file (the first one if several were uploaded)
reference_file_path = next(iter(uploaded))

# Load the reference transcriptions from the .txt file.
# Each usable line has the form "<file name>\t<transcription>".
reference_dict = {}

# "utf-8-sig" reads plain UTF-8 as well, but also drops a leading byte-order
# mark that would otherwise end up inside the first file name.
with open(reference_file_path, 'r', encoding='utf-8-sig') as file:
    for line_number, line in enumerate(file, start=1):
        parts = line.strip().split('\t')
        if len(parts) == 2:
            file_name, transcription = parts
            reference_dict[file_name] = transcription
        elif line.strip():
            # Report malformed lines instead of dropping them silently
            print(f"Skipping line {line_number}: expected 2 tab-separated fields, found {len(parts)}")

print("Loaded reference transcriptions:", reference_dict)


Saving Hindi_language_reference.txt to Hindi_language_reference.txt
Loaded reference transcriptions: {'common_voice_hi_31998548.mp3': 'मैं एक नाटक पढ़ रहा हूँ।', 'common_voice_hi_31998549.mp3': 'भारतीय सांसद विदेशी विश्वविद्यालय में पढ़ेंगे!', 'common_voice_hi_31998550.mp3': 'शर्तों के उल्लंघन के कारण कोच्चि टस्कर्स आईपीएल से निलंबित', 'common_voice_hi_31998551.mp3': 'तमांग की हत्या का मुख्य आरोपी फरार', 'common_voice_hi_31998552.mp3': 'दिल्ली को वर्ल्ड क्लास बनाना चाहते हैं, विकासपुरी से विकास नदारद'}


Translator

In [None]:
# Install the pinned googletrans release candidate used by the translation cell below.
!pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.8.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googl

In [None]:
from googletrans import Translator

# One shared googletrans client, reused by every translate_text call
translator = Translator()


def translate_text(text, src_language='auto', dest_language='en'):
    """Translate ``text`` with the shared ``translator`` and return the result text.

    Args:
        text: The text to translate.
        src_language: Source language code passed as ``src``; defaults to 'auto'.
        dest_language: Destination language code passed as ``dest``; defaults to 'en'.

    Returns:
        The ``text`` attribute of the translation result.
    """
    return translator.translate(text, src=src_language, dest=dest_language).text


# Translate the reference transcriptions to English (source language auto-detected)
translated_reference_dict = {file_name: translate_text(text) for file_name, text in reference_dict.items()}
# Send the model's Latin-script output through the translator as English -> Hindi.
# NOTE(review): this is translation, not transliteration, and the recorded output
# shows some entries coming back mostly unchanged; confirm this is the intended
# way to obtain Hindi-script text for scoring.
translated_to_hindi_dict = {file_name: translate_text(text, src_language='en', dest_language='hi') for file_name, text in wav2vec2Bert_transcriptions.items()}

# Print inputs and outputs side by side for a manual sanity check.
print("Model transcriptions:", wav2vec2Bert_transcriptions)
print("Translated model sentences to Hindi:", translated_to_hindi_dict)
print("Reference transcriptions:", reference_dict)
print("Translated reference transcriptions:", translated_reference_dict)

Model transcriptions: {'common_voice_hi_31998548.mp3': 'me ik natak padhraha hung', 'common_voice_hi_31998549.mp3': 'bhartie sansad videshi wishwa vit dala me padhange', 'common_voice_hi_31998550.mp3': 'shertokeulanghan ki karan kochi teskers eye piels in i lumbit'}
Translated model sentences to Hindi: {'common_voice_hi_31998548.mp3': 'मुझे एक नताक पड राहा हुन', 'common_voice_hi_31998549.mp3': 'Bhartiya Sansad videshi vishwa vit dala me padhenge', 'common_voice_hi_31998550.mp3': 'Shertok Ulanghan ka karan kochi tuskers आई पिक्सेल i lumbit'}
Reference transcriptions: {'common_voice_hi_31998548.mp3': 'मैं एक नाटक पढ़ रहा हूँ।', 'common_voice_hi_31998549.mp3': 'भारतीय सांसद विदेशी विश्वविद्यालय में पढ़ेंगे!', 'common_voice_hi_31998550.mp3': 'शर्तों के उल्लंघन के कारण कोच्चि टस्कर्स आईपीएल से निलंबित', 'common_voice_hi_31998551.mp3': 'तमांग की हत्या का मुख्य आरोपी फरार', 'common_voice_hi_31998552.mp3': 'दिल्ली को वर्ल्ड क्लास बनाना चाहते हैं, विकासपुरी से विकास नदारद'}
Translated reference

In [None]:
!pip install jiwer
!pip install pystoi

Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.4 rapidfuzz-3.9.6
Collecting pystoi
  Downloading pystoi-0.4.1-py2.py3-none-any.whl.metadata (4.0 kB)
Downloading pystoi-0.4.1-py2.py3-none-any.whl (8.2 kB)
Installing collected packages: pystoi
Successfully installed pystoi-0.4.1


In [None]:
import jiwer

def evaluate_wer(transcriptions, reference_dict):
    """Compute the word error rate for every transcription that has a reference.

    The reference is looked up by the part of the path after the last '/'.
    Paths without a matching reference are left out of the result.

    Args:
        transcriptions: Mapping of file path to hypothesis text.
        reference_dict: Mapping of file name to reference text.

    Returns:
        Dict mapping file path to ``jiwer.wer(reference, hypothesis)``.
    """
    # (path, bare file name, hypothesis) triples, produced lazily
    candidates = (
        (path, path.rsplit('/', 1)[-1], hypothesis)
        for path, hypothesis in transcriptions.items()
    )
    return {
        path: jiwer.wer(reference_dict[name], hypothesis)
        for path, name, hypothesis in candidates
        if name in reference_dict
    }

In [None]:
def evaluate_cer(transcriptions, reference_dict):
    """Compute the character error rate for every transcription that has a reference.

    The reference is looked up by the part of the path after the last '/'.
    Paths without a matching reference are left out of the result.

    Args:
        transcriptions: Mapping of file path to hypothesis text.
        reference_dict: Mapping of file name to reference text.

    Returns:
        Dict mapping file path to ``jiwer.cer(reference, hypothesis)``.
    """
    scores = {}

    for path, hypothesis in transcriptions.items():
        name = path.rsplit('/', 1)[-1]
        if name not in reference_dict:
            continue  # nothing to compare this file against
        scores[path] = jiwer.cer(reference_dict[name], hypothesis)

    return scores

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

def evaluate_bleu(transcriptions, reference_dict):
    """Compute a smoothed sentence-level BLEU score per transcription.

    Texts are tokenised on whitespace. The reference is looked up by the part
    of the path after the last '/'; paths without a matching reference are
    left out of the result.

    Smoothing is applied because these sentences are short: without it, any
    n-gram order with zero overlap collapses the whole score to (near) zero
    and NLTK emits a warning for each such case.

    Args:
        transcriptions: Mapping of file path to hypothesis text.
        reference_dict: Mapping of file name to reference text.

    Returns:
        Dict mapping file path to its BLEU score.
    """
    # Local import: the notebook's import cell only brings in sentence_bleu.
    from nltk.translate.bleu_score import SmoothingFunction

    smoothing = SmoothingFunction().method1
    bleu_scores = {}

    for file_path, transcription in transcriptions.items():
        file_name = file_path.split('/')[-1]
        if file_name in reference_dict:
            # sentence_bleu expects a list of tokenised references
            reference_tokens = [reference_dict[file_name].split()]
            hypothesis_tokens = transcription.split()

            bleu_scores[file_path] = sentence_bleu(
                reference_tokens,
                hypothesis_tokens,
                smoothing_function=smoothing,
            )

    return bleu_scores

In [None]:
# Evaluate metrics: score the Hindi-translated model output against the
# reference transcriptions. Only files present in both dicts receive a score.
wer_scores = evaluate_wer(translated_to_hindi_dict, reference_dict)
cer_scores = evaluate_cer(translated_to_hindi_dict, reference_dict)
bleu_scores = evaluate_bleu(translated_to_hindi_dict, reference_dict)

# Print the per-file score dictionaries.
print("WER Scores:", wer_scores)
print("CER Scores:", cer_scores)
print("BLEU Scores:", bleu_scores)


WER Scores: {'common_voice_hi_31998548.mp3': 0.8333333333333334, 'common_voice_hi_31998549.mp3': 1.3333333333333333, 'common_voice_hi_31998550.mp3': 1.0}
CER Scores: {'common_voice_hi_31998548.mp3': 0.4583333333333333, 'common_voice_hi_31998549.mp3': 1.0217391304347827, 'common_voice_hi_31998550.mp3': 0.8793103448275862}
BLEU Scores: {'common_voice_hi_31998548.mp3': 1.1640469867513693e-231, 'common_voice_hi_31998549.mp3': 0, 'common_voice_hi_31998550.mp3': 0}


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Performance Test