<a href="https://colab.research.google.com/github/Kabin119245/AI-ML/blob/main/EvaluatingWhisperModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets evaluate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.4 rapidfuzz-3.9.5


In [6]:
import os
from getpass import getpass

import evaluate
import librosa
import torch
from datasets import load_dataset
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Function to log in to Hugging Face
from huggingface_hub import login

# SECURITY: never embed an access token in a notebook. Read it from the HF_TOKEN
# environment variable and fall back to a hidden interactive prompt.
# The token that used to be hard-coded here was committed with this notebook and
# must be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Load the processor and model from the fine-tuned checkpoint directory.
# NOTE(review): the path points into Google Drive; presumably Drive has to be
# mounted at /content/drive before this cell runs. Confirm.
local_model_path = "/content/drive/MyDrive/MODEL/whisper-small-ne-NP/checkpoint-400"
processor = AutoProcessor.from_pretrained(local_model_path)
model = AutoModelForSpeechSeq2Seq.from_pretrained(local_model_path).to(device)

# Load the WER metric
wer_metric = evaluate.load("wer")

# Function to preprocess audio and transcribe
def transcribe(audio_array, sampling_rate):
    """Transcribe a single audio clip with the loaded model.

    Args:
        audio_array: Raw waveform samples of the clip.
        sampling_rate: Sampling rate of ``audio_array`` in Hz. Anything other
            than 16000 is resampled to 16000 with librosa first.

    Returns:
        The decoded text of the first (only) generated sequence, with special
        tokens stripped.
    """
    target_rate = 16000
    if sampling_rate != target_rate:
        audio_array = librosa.resample(audio_array, orig_sr=sampling_rate, target_sr=target_rate)
        sampling_rate = target_rate

    # Turn the waveform into model input features and move them to the model's device.
    encoded = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt")
    features = encoded.input_features.to(device)

    # Generation only; gradients are not needed.
    with torch.no_grad():
        generated_ids = model.generate(features)

    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

# Preprocess the dataset for evaluation
def prepare_dataset(batch):
    """Add model inputs and tokenized labels to one dataset example.

    Args:
        batch: A single example with an ``"audio"`` entry (holding ``"array"``
            and ``"sampling_rate"``) and a ``"sentence"`` entry.

    Returns:
        The same example with two added keys: ``"input_features"`` (processor
        output for the clip, resampled to 16000 Hz when needed) and
        ``"labels"`` (tokenizer input ids of the sentence).
    """
    clip = batch["audio"]
    waveform = clip["array"]
    rate = clip["sampling_rate"]

    # Bring the clip to 16,000 Hz before feature extraction.
    if rate != 16000:
        waveform = librosa.resample(waveform, orig_sr=rate, target_sr=16000)
        rate = 16000

    features = processor(waveform, sampling_rate=rate, return_tensors="pt").input_features
    # The processor returns a batch of one; keep just that item.
    batch["input_features"] = features[0]
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

def compute_metrics(pred):
    """Decode predicted and reference token ids and compute the word error rate.

    Args:
        pred: Mapping with ``"predictions"`` and ``"label_ids"`` entries, each a
            batch of token-id sequences. ``"label_ids"`` is expected to be a
            tensor or array that supports boolean-mask assignment.

    Returns:
        dict: ``{"wer": value}`` where ``value`` is the metric result times 100.
    """
    pred_ids = pred["predictions"]

    # Work on a copy so the caller's label ids are not overwritten in place
    # (torch tensors expose ``clone``, numpy arrays expose ``copy``).
    label_ids = pred["label_ids"]
    label_ids = label_ids.clone() if hasattr(label_ids, "clone") else label_ids.copy()

    # replace -100 with the pad_token_id so the ids can be decoded
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

if __name__ == "__main__":
    # Load the test dataset.
    # SECURITY: `token=True` reuses the credential stored by the `login(...)` call
    # earlier in this notebook instead of embedding the secret in the source.
    common_voice = load_dataset("mozilla-foundation/common_voice_17_0", "ne-NP", split="test", token=True)

    # Print the first example to inspect the keys
    print(common_voice[0])

    # Prepare the test dataset
    # NOTE(review): the "input_features" and "labels" columns added here are not
    # read by the loop below, which re-processes the raw audio in `transcribe`.
    # Confirm whether this map step is still needed.
    common_voice = common_voice.map(prepare_dataset, remove_columns=["client_id", "up_votes", "down_votes", "age", "gender", "accent", "locale", "segment", "variant"], num_proc=2)

    # Evaluate the model: transcribe every clip and keep the reference sentence.
    predictions = []
    references = []

    for example in common_voice:
        audio = example["audio"]
        predictions.append(transcribe(audio["array"], audio["sampling_rate"]))
        references.append(example["sentence"])

    # Compute WER over the entire dataset.
    # The token ids are decoded straight back to text inside `compute_metrics`, so
    # they stay on the CPU; copying them to the GPU served no purpose.
    # NOTE(review): tokenizing the strings only to decode them again is a round
    # trip; calling `wer_metric.compute` on `predictions`/`references` directly
    # would be simpler. Kept as-is so the reported number stays comparable.
    pred = {
        "predictions": processor.tokenizer(predictions, return_tensors="pt", padding=True).input_ids,
        "label_ids": processor.tokenizer(references, return_tensors="pt", padding=True).input_ids
    }
    metrics = compute_metrics(pred)
    print("WER:", metrics["wer"])


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful
cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'client_id': '35519c95f34c573e87b8a04e6da4786df9e7e554b894e5389ff92dc7fde42f30066dc11b30292ee1d2493dbf331cc7e34585594898d1ad5aac06fd0e3b6f1cd9', 'path': '/root/.cache/huggingface/datasets/downloads/extracted/8c452239408c9b2b0f0af1336080825fb42e13bf0efaa731d0e6649055d2e232/ne-NP_test_0/common_voice_ne-NP_39608586.mp3', 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/8c452239408c9b2b0f0af1336080825fb42e13bf0efaa731d0e6649055d2e232/ne-NP_test_0/common_voice_ne-NP_39608586.mp3', 'array': array([-1.42108547e-14, -3.55271368e-14, -6.03961325e-14, ...,
        2.85540659e-06,  1.08017466e-05,  7.92140327e-06]), 'sampling_rate': 48000}, 'sentence': 'यहाँहरुको दिन सुखद अनि फलदायी होस् ।', 'up_votes': 2, 'down_votes': 0, 'age': 'twenties', 'gender': 'male_masculine', 'accent': '', 'locale': 'ne-NP', 'segment': '', 'variant': ''}


  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/217 [00:00<?, ? examples/s]

WER: 65.29745042492918
