# Importing libraries

In [14]:
import torch
import whisper
import torchaudio
import numpy as np

In [6]:
# Pick the compute device once: use the GPU when PyTorch reports CUDA support,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

## Testing audio with different frequencies

Using a speech excerpt cut from the "Just Do It" video, saved at two sample rates:
1. `speech16k` – 16,000 Hz
2. `speech44k` – 44,100 Hz

In [5]:
# Locations of the two WAV files used in this section.
speech16k_file, speech44k_file = (
    "./audio/just_do_it_16000.wav",
    "./audio/just_do_it_44100.wav",
)

# Load both clips in order. Each result is kept as the pair returned by
# `torchaudio.load`; later cells index it with [0] (audio) and [1] (rate).
speech16k, speech44k = (
    torchaudio.load(wav_path) for wav_path in (speech16k_file, speech44k_file)
)

In [11]:
# Sanity check: element [1] of each loaded pair must equal the rate named in
# the corresponding file (16000 and 44100 respectively).
assert (speech16k[1], speech44k[1]) == (16000, 44100)

In [12]:
# Whisper's preprocessing takes a 1-D waveform. `torchaudio.load` yields a
# (channels, frames) tensor, so downmix by averaging over the channel axis.
# For a mono file this gives exactly the samples `.flatten()` would, while for
# a multi-channel file it avoids concatenating the channels end to end.
# `pad_or_trim` then fits the waveform to Whisper's fixed input length before
# it is moved to DEVICE and converted to a log-mel spectrogram.
audio16k = whisper.pad_or_trim(speech16k[0].mean(dim=0)).to(DEVICE)
mel16k = whisper.log_mel_spectrogram(audio16k)

# NOTE(review): the 44.1 kHz clip is passed through WITHOUT resampling, which
# is presumably the point of this section (comparing sample rates) — confirm.
# Whisper's feature extraction assumes 16 kHz input, so outside of this
# experiment the waveform should first be resampled, e.g. with
# `torchaudio.functional.resample(waveform, 44100, 16000)`.
audio44k = whisper.pad_or_trim(speech44k[0].mean(dim=0)).to(DEVICE)
mel44k = whisper.log_mel_spectrogram(audio44k)

## Loading model

In [15]:
# Load the "base.en" checkpoint onto the same device the mel spectrograms were
# moved to, instead of relying on the library's own default device choice.
model = whisper.load_model("base.en", device=DEVICE)

# Total number of parameters: `numel()` returns each tensor's element count
# directly, so no NumPy round-trip over the shapes is needed.
n_params = sum(p.numel() for p in model.parameters())
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {n_params:,} parameters."
)

Model is English-only and has 71,825,408 parameters.


In [16]:
# Decoding options shared by both clips. `DecodingOptions` enables fp16 by
# default; since DEVICE may fall back to "cpu", only request half precision
# when running on the GPU so decoding also works on CPU-only machines.
options = whisper.DecodingOptions(language="en", fp16=(DEVICE == "cuda"))

## Decoding speech

In [17]:
# Decode the spectrogram of the 16 kHz clip with the shared options; the bare
# trailing expression makes the notebook display the transcript.
speech16k_result = whisper.decode(model, mel16k, options)
speech16k_result.text

"Some people dream of success while you're gonna wake up and work hard at it. Nothing is impossible. You should get to the point where anyone else would quit and you're not gonna stop there. No, what are you waiting for? Do it! Just do it! Yes you can! Just do it!"

In [18]:
# Decode the spectrogram built from the 44.1 kHz samples with the same options.
# NOTE(review): the transcript recorded for this cell does not match the 16 kHz
# one — presumably because the audio was never resampled to the rate Whisper
# expects; confirm.
speech44k_result = whisper.decode(model, mel44k, options)
speech44k_result.text

"Scent of false sense, you have only got it, and you're bigger, and you're some pure something."