In [1]:
import pandas as pd
import os

In [2]:
data_dir = 'D:/Common Voice/'

cv_valid_test = pd.read_csv(os.path.join(data_dir, 'cv-valid-test.csv'))

In [3]:
cv_valid_test

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-valid-test/sample-000000.mp3,without the dataset the article is useless,1,0,,,,
1,cv-valid-test/sample-000001.mp3,i've got to go to him,1,0,twenties,male,,
2,cv-valid-test/sample-000002.mp3,and you know it,1,0,,,,
3,cv-valid-test/sample-000003.mp3,down below in the darkness were hundreds of pe...,4,0,twenties,male,us,
4,cv-valid-test/sample-000004.mp3,hold your nose to keep the smell from disablin...,2,0,,,,
...,...,...,...,...,...,...,...,...
3990,cv-valid-test/sample-003990.mp3,the old man opened his cape and the boy was st...,1,0,,,,
3991,cv-valid-test/sample-003991.mp3,in alchemy it's called the soul of the world,2,1,,,,
3992,cv-valid-test/sample-003992.mp3,at that point in their lives everything is cle...,3,0,,,,
3993,cv-valid-test/sample-003993.mp3,he told them all to be seated,3,0,,,,


In [4]:
cv_valid_test = cv_valid_test[['filename', 'text']]

In [27]:
import torch
from transformers import (
    Wav2Vec2ForCTC, Wav2Vec2Processor,
    WhisperProcessor, WhisperForConditionalGeneration
)
import librosa 


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'facebook/wav2vec2-base-960h'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
model.eval()

In [12]:
from tqdm import tqdm

In [13]:
audio_dir = 'D:/Common Voice/cv-valid-test/'

preds = []
refs = []

for i, row in tqdm(cv_valid_test.iterrows()):
    filename = str(row['filename'])
    file_path =  os.path.join(audio_dir, filename)

    origin_text = str(row['text'])
    
    audio, sr = librosa.load(file_path, sr=16000, mono=True)

    input_values = processor(
        audio,
        sampling_rate=16000,
        return_tensors='pt'
    ).input_values

    with torch.no_grad():
        logits = model(input_values.to(device)).logits

    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(pred_ids)[0]

    preds.append(transcription)
    refs.append(origin_text)

3995it [01:58, 33.81it/s]


In [18]:
import evaluate
from jiwer import mer

In [21]:
preds = [pred.lower() for pred in preds]

In [28]:
wer = evaluate.load("wer")
cer = evaluate.load("cer")

print("WER:", wer.compute(predictions=preds, references=refs))
print("CER:", cer.compute(predictions=preds, references=refs))
print(mer(refs, preds))

WER: 0.1367444564843936
CER: 0.05801926049221258
0.13477819166948865


In [30]:
processor_wh = WhisperProcessor.from_pretrained("openai/whisper-base")
model_wh = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

In [33]:
model.config.forced_decoder_ids = processor_wh.get_decoder_prompt_ids(language="english", task="transcribe")
model_wh.to(device)

preds_wh = []
refs_wh = []

for i, row in cv_valid_test.iterrows():
    filename = str(row['filename'])
    file_path =  os.path.join(audio_dir, filename)

    origin_text = str(row['text'])
    
    audio, sr = librosa.load(file_path, sr=16000, mono=True)

    input_features = processor_wh(
        audio,
        sampling_rate=16000,
        return_tensors='pt'
    ).input_features.to(device)

    with torch.no_grad():
        predicted_ids = model_wh.generate(input_features)

    transcription = processor_wh.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    preds_wh.append(transcription)
    refs_wh.append(origin_text)

Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [37]:
import re

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower().strip()

preds_wh = [clean_text(pred) for pred in preds]

In [39]:
print("WER:", wer.compute(predictions=preds_wh, references=refs_wh))
print("CER:", cer.compute(predictions=preds_wh, references=refs_wh))
print(mer(refs_wh, preds_wh))

WER: 0.15466342469011815
CER: 0.06158084909512905
0.15243950089869493
