In [27]:
from pathlib import Path

import jiwer
import librosa
import IPython.display as idp
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Prepare and use pretrained ASR (speech-to-text) model

In [28]:
# Load the pretrained Whisper checkpoint: the processor first (used below to
# build input features and to decode token ids), then the generation model.
# Both are loaded from the same checkpoint name.
WHISPER_CHECKPOINT = "openai/whisper-medium"
processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT)
model = WhisperForConditionalGeneration.from_pretrained(WHISPER_CHECKPOINT)

# Reset the decoder prompt ids stored on the model config; the prompt ids are
# passed explicitly to `model.generate` later in the notebook.
model.config.forced_decoder_ids = None

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [29]:
# Decoder prompt ids requesting a Russian transcription; they are handed to
# `model.generate` when the audio is recognised.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="russian", task="transcribe")

In [30]:
# The notebook is expected to run from a sibling directory of `content/`,
# so the content folder is resolved relative to the current working directory.
path_to_content = Path.cwd().parent / "content"
path_to_audio = path_to_content / "test_audio.wav"

# Load the waveform at a 16 kHz sampling rate; `sr` is reused below when the
# audio is passed to the processor.
test_audio, sr = librosa.load(path_to_audio, sr=16_000)

# Last expression of the cell: renders an inline audio player.
idp.Audio(test_audio, rate=sr)

In [31]:
# Turn the raw waveform into model input features (PyTorch tensors).
input_features = processor(
    test_audio,
    sampling_rate=sr,
    return_tensors="pt",
).input_features

# Run generation, using the decoder prompt ids prepared earlier.
predicted_ids = model.generate(
    input_features,
    forced_decoder_ids=forced_decoder_ids,
)

In [32]:
# Convert the generated token ids back to text, dropping special tokens.
# The result is a list of strings; it is shown as the cell output.
recognized = processor.batch_decode(
    predicted_ids,
    skip_special_tokens=True,
)
recognized

[' Я провела анализ и обработку ГИС и успешно рассчитала 3D-модель пласта. Это позволило нам получить подробное представление о структуре и характеристиках пласта, что, безусловно, имеет важное значение для разработки нефтяных месторождений. Наша 3D-модель дает нам возможность прогнозировать потенциальные места добычи нефти, оптимизировать работу скважин и принимать обоснованные решения на основе точных данных.']

In [33]:
# Save the recognised text next to the other notebook artefacts. The path is
# built from `path_to_content` (like the audio and reference paths) instead of
# a separate hard-coded "../content/..." string, so all files share one root.
path_to_doc = path_to_content / "recognized_transcription.txt"

# `recognized` is a list of decoded strings; the first entry is written.
# UTF-8 is set explicitly because the transcription is Cyrillic text.
with open(path_to_doc, "w", encoding="utf-8") as f:
    f.write(recognized[0])

# Compute metrics

In [34]:
# Ground-truth transcription used as the reference for the WER computation.
path_to_transcription = path_to_content / "audio_transcription.txt"

# Read explicitly as UTF-8: without `encoding=` the platform default is used,
# which can fail or garble Cyrillic text on systems whose default is not UTF-8.
with open(path_to_transcription, encoding="utf-8") as f:
    # The hypothesis is a single string for the whole audio, so the reference
    # must be a single string as well. Line breaks and repeated whitespace in
    # the file are collapsed to single spaces; the result is kept as a
    # one-element list to mirror the shape of `recognized`.
    reference = [" ".join(f.read().split())]

reference

FileNotFoundError: [Errno 2] No such file or directory: '/Users/v.a.dyakov/study/NLP_SIRIUS/content/audio_transcription.txt'

In [None]:
# Text normalisation applied identically to the reference and the hypothesis
# before scoring: drop empty strings, lowercase, collapse repeated spaces,
# strip the ends, remove punctuation, and finally split into lists of words.
transforms = jiwer.Compose(
    [
        jiwer.RemoveEmptyStrings(),
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.RemovePunctuation(),
        jiwer.ReduceToListOfListOfWords(),
    ]
)

# NOTE(review): newer jiwer releases appear to name the first keyword
# `reference_transform` instead of `truth_transform` - confirm against the
# installed jiwer version before upgrading.
wer = jiwer.wer(
    reference,
    recognized,
    truth_transform=transforms,
    hypothesis_transform=transforms,
)

# Plain string instead of an f-string: there are no placeholders to format.
print("Word Error Rate (WER) :", wer)