In [36]:
from pathlib import Path

import jiwer
import librosa
import IPython.display as idp
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Prepare and use pretrained ASR (speech-to-text) model

In [37]:
# Load the processor and the model from the same pretrained checkpoint.
WHISPER_CHECKPOINT = "openai/whisper-medium"
processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT)
model = WhisperForConditionalGeneration.from_pretrained(WHISPER_CHECKPOINT)

# Drop any decoder prompt ids stored on the model config; the prompt is
# passed explicitly to `generate` later in the notebook.
model.config.forced_decoder_ids = None

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
# Decoder prompt ids that request Russian-language transcription.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="russian", task="transcribe")

In [39]:
# Locate the sample clip at <parent of the working directory>/content/test_audio.wav.
# `cwd` is a classmethod, so it is called on `Path` itself instead of on a
# throwaway `Path()` instance.
path_to_content = Path.cwd().parent / "content"
path_to_audio = path_to_content / "test_audio.wav"

# Load the waveform, requesting a 16 kHz sampling rate from librosa.
test_audio, sr = librosa.load(path_to_audio, sr=16_000)

# Last expression of the cell: show an inline audio player for the clip.
idp.Audio(test_audio, rate=sr)

In [40]:
# Run the processor on the waveform (`return_tensors="pt"` asks for PyTorch
# tensors) and keep the feature tensor the model consumes.
encoded_audio = processor(test_audio, sampling_rate=sr, return_tensors="pt")
input_features = encoded_audio.input_features

# Generate token ids, steering the decoder with the prompt built earlier.
predicted_ids = model.generate(
    input_features,
    forced_decoder_ids=forced_decoder_ids,
)

In [41]:
# Convert the generated token ids back to text, omitting special tokens.
recognized = processor.batch_decode(
    predicted_ids,
    skip_special_tokens=True,
)
# Bare expression so the decoded list is shown as the cell output.
recognized

[' Мы уже обработали кривые гис и смоделировали 3D модель пласта.']

# Compute metrics

In [42]:
# Reference transcript that the recognized text is scored against.
path_to_transcription = path_to_content / "audio_transcription.txt"

# Decode explicitly instead of relying on the platform default encoding, which
# is not UTF-8 everywhere and would garble or reject the Cyrillic text.
# NOTE(review): assumes the transcript file is saved as UTF-8 — confirm.
with open(path_to_transcription, encoding="utf-8") as f:
    # One list element per line of the file (line endings, if any, are kept).
    reference = f.readlines()

reference

['Мы уже обработали кривые гис и смоделировали 3D модель пласта.']

In [43]:
# Text normalization applied identically to the reference and the hypothesis
# before scoring, so that case, spacing and punctuation differences are not
# counted as word errors. The final step converts each sentence into a list of
# words, which is the form the WER computation works on.
transforms = jiwer.Compose(
    [
        jiwer.RemoveEmptyStrings(),
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.RemovePunctuation(),
        jiwer.ReduceToListOfListOfWords(),
    ]
)

# NOTE(review): newer jiwer releases appear to name the first transform keyword
# `reference_transform`; `truth_transform` is kept here because it is what this
# notebook was executed with — confirm against the installed jiwer version.
wer = jiwer.wer(
    reference,
    recognized,
    truth_transform=transforms,
    hypothesis_transform=transforms,
)
# The value is interpolated so the f-string actually has a placeholder; the
# printed text is the same as before.
print(f"Word Error Rate (WER) : {wer}")

Word Error Rate (WER) : 0.0
