# Hands-On Exercise

In [2]:
from datasets import load_dataset

# Open the "de" configuration of VoxPopuli as a stream (streaming=True).
# trust_remote_code=True is forwarded to `datasets` unchanged — only keep it
# enabled for repositories you trust.
ds = load_dataset(
    "facebook/voxpopuli", name="de", streaming=True, trust_remote_code=True
)

In [3]:
# Restrict the training split to its first three examples
train_split = ds["train"]
ds_head = train_split.take(3)

In [None]:
# Materialise the three-example slice and inspect its last (third) entry
examples = list(ds_head)
example = examples[-1]
example

In [None]:
# Listen to the example's audio
from IPython.display import Audio

# Use the sampling rate stored with the clip instead of a hard-coded 16000,
# so playback speed stays correct whatever rate the audio was decoded at.
Audio(example["audio"]["array"], rate=example["audio"]["sampling_rate"])

In [None]:
# Waveform of the selected example
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Samples and their sampling rate; both names are reused by later cells.
audio = example["audio"]
array, sampling_rate = audio["array"], audio["sampling_rate"]

# Widen the figure before drawing so the waveform is not squeezed.
fig = plt.figure()
fig.set_figwidth(12)
librosa.display.waveshow(array, sr=sampling_rate)

In [None]:
# Spectrogram
import numpy as np

# Short-time Fourier transform of the waveform; magnitudes are converted to
# decibels using the maximum magnitude as the reference (ref=np.max).
D = librosa.stft(array)
S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)

plt.figure().set_figwidth(12)
# Pass the clip's own sampling rate: without `sr`, specshow falls back to its
# default of 22050 Hz and would mislabel both the time and frequency axes.
librosa.display.specshow(S_db, sr=sampling_rate, x_axis="time", y_axis="hz")
plt.colorbar()

In [None]:
from transformers import pipeline

# Build the speech-recognition pipeline used for the transcription below
asr = pipeline(
    task="automatic-speech-recognition",
    model="facebook/wav2vec2-large-xlsr-53-german",
)

In [None]:
# Compare the model's transcription with the dataset's reference text.
# Give the pipeline the sampling rate together with the samples: a bare array
# is taken to already be at the model's expected rate, whereas the dict form
# lets the pipeline check the rate and resample when it differs.
# A fresh dict is built so the example itself is never modified by the call.
asr_input = {
    "raw": example["audio"]["array"],
    "sampling_rate": example["audio"]["sampling_rate"],
}
print(asr(asr_input))
print(example["raw_text"])

In [None]:
from IPython.display import Audio

# Play the same example again, using the sampling rate stored with the clip
# instead of a hard-coded 16000.
Audio(example["audio"]["array"], rate=example["audio"]["sampling_rate"])