In [2]:
from datasets import load_dataset
from IPython.display import Audio, display
from typing import Dict

  from .autonotebook import tqdm as notebook_tqdm


## Data Exploration

### Data Fields
- "audio": a datasets.Audio representation of the spoken utterance
- "text": a datasets.Value string representation of spoken utterance
- "labels": a datasets.ClassLabel representation of the emotion label
- "speaker_id": a datasets.Value string representation of the speaker ID
- "speaker_gender": a datasets.Value string representation of the speaker gender

### Sampling Rate
48kHz.wav

The sampling rate of 48kHz (48,000 samples per second) tells us about the quality and frequency range of the audio data in your dataset. Here's what it means:
Quality of Audio: A higher sampling rate generally means better audio quality. This is because a higher sampling rate can capture more details of the audio signal, especially the high-frequency components.
Frequency Range: The Nyquist-Shannon sampling theorem states that the sampling rate must be at least twice the highest frequency present in the signal. In the case of 48kHz, it means that the audio can accurately represent frequencies up to 24kHz. Human hearing typically ranges from 20Hz to 20kHz, so a 48kHz sampling rate is more than sufficient to capture all audible frequencies.
File Size: Higher sampling rates result in larger file sizes. This is because more samples are taken per second, which requires more storage space.
In the context of your dataset, a 48kHz sampling rate suggests that the audio data is of high quality and can accurately represent the full range of human speech and emotion.

In [3]:
dataset = load_dataset("narad/ravdess", split="train")
dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['audio', 'text', 'labels', 'speaker_id', 'speaker_gender'],
    num_rows: 1440
})

In [4]:
audio_array_wrapper = dataset[0]["audio"]
audio_array_wrapper


{'path': '/Users/hannahmanfredi/.cache/huggingface/datasets/downloads/extracted/f99008b92e4fb133dc86d3d6479d9ad013cdafafa0b32c1e76fb7c2b95726fe6/Actor_16/03-01-05-01-02-01-16.wav',
 'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -3.05175781e-05, -6.10351562e-05, -3.05175781e-05]),
 'sampling_rate': 48000}

In [5]:
display(Audio(audio_array_wrapper['array'], rate=audio_array_wrapper['sampling_rate']))



In [6]:
labels = dataset.features['labels'].names
labels


['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

In [7]:
text = dataset[0]["text"]
text


'Dogs are sitting by the door'

In [8]:
label = dataset[0]["labels"]
label

4

In [9]:
gender = dataset[0]['speaker_gender']
gender

'female'

In [10]:
labels[label]

'angry'

### Convenience mapping:

In [11]:
labels = dataset.features["labels"].names
label2id: Dict[str, str] = dict()
id2label: Dict[str, str] = dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label
    

## PreProcess Data

In [12]:
# the model we are going to fine-tune is already trained on the following sampling rate so we need to resample the audio at that rate
sampling_rate = 16000

# remove all columns we don't need for our current task:
processed_dataset = dataset.remove_columns(["text", "speaker_id", "speaker_gender"])
processed_dataset
processed_dataset = processed_dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))


TypeError: Audio.__init__() got an unexpected keyword argument 'sampling_rate'