In [16]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoTokenizer

# Pick the execution device and a matching precision in one place:
# half precision on the first CUDA device, full precision on CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v2"

# Main Whisper checkpoint used for every transcription run in this notebook.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    attn_implementation="sdpa",
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
)
model.to(device)

# The processor is loaded from the same checkpoint id as the model; later cells
# use it to prepare audio inputs and to decode generated token ids.
processor = AutoProcessor.from_pretrained(model_id)

In [17]:
from datasets import load_dataset

# Test split of the "fi" configuration; later cells read the "audio" and
# "normalized_text" columns of each sample.
dataset = load_dataset(
    path="mpanda27/voxpopuli_fi_pseudo_labelled",
    name="fi",
    split="test",
)

In [None]:
# Export audio samples from the dataset as .wav files (e.g. for manual listening).
#
# `os` and `sf` are imported here because no visible cell brings them into
# scope; without these imports the cell fails with NameError on a fresh kernel.
# NOTE(review): `sf` is assumed to be the `soundfile` package, which matches the
# `sf.write(path, data, samplerate)` call below — confirm it is installed.
import os

import soundfile as sf

output_dir = "voxpopuli_audio_samples"
# How many samples to export; set to None to export every sample in the split.
max_samples = 1

os.makedirs(output_dir, exist_ok=True)

# Extract and save audio samples
for idx, sample in enumerate(dataset):
    if max_samples is not None and idx >= max_samples:
        break

    # Assuming the audio column is named 'audio'
    audio_data = sample["audio"]["array"]
    sample_rate = sample["audio"]["sampling_rate"]

    # Save audio to a .wav file
    output_file = os.path.join(output_dir, f"sample_{idx}.wav")
    sf.write(output_file, audio_data, sample_rate)
    print(f"Saved: {output_file}")

In [18]:
import time

def generate_with_time(model, inputs, **kwargs):
    """Run ``model.generate`` and measure how long the call takes.

    Args:
        model: Object exposing a ``generate(**kwargs)`` method.
        inputs: Mapping that is unpacked into ``model.generate`` as keyword
            arguments.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        A ``(outputs, generation_time)`` tuple, where ``outputs`` is whatever
        ``model.generate`` returned and ``generation_time`` is the elapsed
        time in seconds (float).
    """
    # CUDA work is queued asynchronously, so drain the queue before starting
    # and before stopping the clock; otherwise pending kernels from earlier
    # steps (or still-running ones from this call) skew the measurement.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    # perf_counter is monotonic and high-resolution, unlike wall-clock
    # time.time(), which can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    outputs = model.generate(**inputs, **kwargs)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    generation_time = time.perf_counter() - start_time
    return outputs, generation_time

In [19]:
from tqdm import tqdm

# Baseline run: transcribe every sample with the main model only, accumulating
# the total generation time and collecting normalized predictions/references
# for WER scoring in a later cell.
all_time = 0
predictions = []
references = []

for sample in tqdm(dataset):
    audio = sample["audio"]
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")
    # Cast to the dtype the model was loaded with (`torch_dtype`) instead of a
    # hard-coded float16, so inputs and weights also match on the float32 CPU
    # fallback.
    inputs = inputs.to(device=device, dtype=torch_dtype)

    output, gen_time = generate_with_time(model, inputs)
    all_time += gen_time
    # NOTE(review): `normalize=True` is documented as the English text
    # normalizer; for Finnish `basic_normalize=True` may be the better fit —
    # confirm before comparing WER against other reports.
    predictions.append(processor.batch_decode(output, skip_special_tokens=True, normalize=True)[0])
    # Public `normalize` replaces the deprecated private `_normalize`, which
    # forwards to it, so the reference text is normalized the same way.
    references.append(processor.tokenizer.normalize(sample["normalized_text"]))

print(all_time)

100%|██████████| 199/199 [10:37<00:00,  3.20s/it]

620.0537178516388





In [5]:
from evaluate import load

# Word-error-rate metric object; it is reused by a later cell as well.
wer = load("wer")

# Score the baseline transcriptions against the normalized reference texts.
print(
    wer.compute(
        references=references,
        predictions=predictions,
    )
)

0.1504890895410083


### Assisted generation with Whisper tiny as the assistant model

In [20]:
assistant_model_id = "openai/whisper-tiny"

# Smaller Whisper checkpoint that is passed to `generate` as `assistant_model`
# in the cells below. It is loaded with the same options as the main model.
assistant_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    assistant_model_id,
    attn_implementation="sdpa",
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
)

# The trailing semicolon keeps the notebook from echoing the module repr.
assistant_model.to(device);

In [21]:
def assisted_generate_with_time(model, inputs, **kwargs):
    """Run ``model.generate`` with the notebook's assistant model and time it.

    The module-level ``assistant_model`` is always passed to ``generate`` as the
    ``assistant_model`` keyword, so it must be defined before this is called
    and must not be supplied again through ``kwargs``.

    Args:
        model: Object exposing a ``generate(**kwargs)`` method.
        inputs: Mapping that is unpacked into ``model.generate`` as keyword
            arguments.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        A ``(outputs, generation_time)`` tuple, where ``outputs`` is whatever
        ``model.generate`` returned and ``generation_time`` is the elapsed
        time in seconds (float).
    """
    # CUDA work is queued asynchronously, so drain the queue before starting
    # and before stopping the clock to time only this call's work.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    # perf_counter is monotonic and high-resolution, unlike wall-clock
    # time.time(), which can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    outputs = model.generate(**inputs, assistant_model=assistant_model, **kwargs)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    generation_time = time.perf_counter() - start_time
    return outputs, generation_time

In [22]:
# Assisted run: same evaluation as the baseline loop, but generation goes
# through `assisted_generate_with_time`, which adds the assistant model.
# `all_time` and `references` are reset and rebuilt so this cell is
# self-contained for the WER computation that follows.
all_time = 0
predictions_distilled = []
references = []

for sample in tqdm(dataset):
    audio = sample["audio"]
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")
    # Cast to the dtype the models were loaded with (`torch_dtype`) instead of
    # a hard-coded float16, so inputs and weights also match on the float32
    # CPU fallback.
    inputs = inputs.to(device=device, dtype=torch_dtype)

    output, gen_time = assisted_generate_with_time(model, inputs)
    all_time += gen_time
    predictions_distilled.append(processor.batch_decode(output, skip_special_tokens=True, normalize=True)[0])
    # Public `normalize` replaces the deprecated private `_normalize`, which
    # forwards to it, so the reference text is normalized the same way.
    references.append(processor.tokenizer.normalize(sample["normalized_text"]))

print(all_time)

  0%|          | 0/199 [00:00<?, ?it/s]From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
100%|██████████| 199/199 [09:14<00:00,  2.79s/it]

538.9085578918457





In [23]:
# Score the assisted-generation transcriptions against the same references.
print(
    wer.compute(
        references=references,
        predictions=predictions_distilled,
    )
)

0.1619011788312014
