In [None]:
import itertools
from datasets import load_dataset
from IPython.display import Audio
import matplotlib.pyplot as plt
import numpy as np
import librosa
import librosa.display

from transformers import pipeline

## Task

In [None]:
# Open the train split of the English ("en") VoxPopuli configuration.
# streaming=True requests a streamed dataset that is consumed by iteration.
data = load_dataset(
    "facebook/voxpopuli",
    name="en",
    split="train",
    streaming=True,
)

In [None]:
# Pick the third example from the stream: skip indices 0 and 1 and take
# index 2. The slice is bounded to that single element, which is all that
# next() consumes.
sample = next(itertools.islice(data, 2, 3))
sample

In [None]:
# Render an audio player for the selected sample, played back at the
# sampling rate stored alongside the waveform.
Audio(
    data=sample["audio"]["array"],
    rate=sample["audio"]["sampling_rate"],
)

In [None]:
# Plot the waveform of the selected sample on a 12-inch-wide figure.
waveform_fig = plt.figure()
waveform_fig.set_figwidth(12)
librosa.display.waveshow(
    y=sample["audio"]["array"],
    sr=sample["audio"]["sampling_rate"],
)

In [None]:
# Spectrogram
# Short-time Fourier transform of the waveform; D holds complex STFT values.
D = librosa.stft(sample["audio"]["array"])
# Convert magnitudes to decibels relative to the loudest bin (ref=np.max),
# so the peak is 0 dB and everything else is negative.
S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)

plt.figure().set_figwidth(12)
# Pass the sample's own sampling rate: specshow otherwise falls back to its
# default `sr`, which mis-scales both the time and the Hz axis whenever the
# audio was recorded at a different rate.
librosa.display.specshow(
    S_db,
    sr=sample["audio"]["sampling_rate"],
    x_axis="time",
    y_axis="hz",
)
# Label colorbar ticks in dB to match the amplitude_to_db conversion above.
plt.colorbar(format="%+2.0f dB");

In [None]:
%%time
# Speech recognition
# Build an automatic-speech-recognition pipeline around the
# distil-whisper/distil-large-v2 checkpoint.
asr = pipeline("automatic-speech-recognition", model="distil-whisper/distil-large-v2")

# Hand the waveform to the pipeline together with its sampling rate. A bare
# array is assumed to already be at the rate the model expects and is not
# checked; the dict form lets the pipeline verify (and resample if needed).
# A fresh dict is built so that sample["audio"] itself is never modified.
asr_input = {
    "raw": sample["audio"]["array"],
    "sampling_rate": sample["audio"]["sampling_rate"],
}
res = asr(asr_input)

# Show the model transcription next to the dataset's reference text.
print(f"Pred: {res['text']}", end="\n\n")
print(f"Real data: {sample['raw_text']}")