## Get raw data sample from validation set

In [95]:
import os

import datasets

import pandas as pd

import numpy as np

Set the path to the raw dataset directory.

In [96]:
# ROOT_DIR is the parent of the current working directory;
# the raw dataset is expected under <ROOT_DIR>/data/raw.
path = os.getcwd()
ROOT_DIR = os.path.split(path)[0]
RAW_DATA_DIR = os.path.join(ROOT_DIR, *('data', 'raw'))

Load dataset and get a sample of the raw data.

In [97]:
# Load the dataset saved on disk and keep only row 333 of the validation split.
audio_dataset = datasets.load_from_disk(RAW_DATA_DIR)
sample = audio_dataset["validation"].select([333])

# Print, one per line: the sample summary, its file name, the type of the
# decoded audio array, the audio array itself, and the label.
print(
    sample,
    sample['file'],
    type(sample['audio'][0]['array']),
    sample['audio'][0]['array'],
    sample['label'],
    sep='\n',
)

Dataset({
    features: ['file', 'audio', 'label', 'is_unknown', 'speaker_id', 'utterance_id'],
    num_rows: 1
})
['bed/605ed0ff_nohash_0.wav']
<class 'numpy.ndarray'>
[-0.00247192 -0.00704956 -0.00418091 ... -0.0005188   0.00076294
 -0.00012207]
[20]


Store the sample as a pickled pandas DataFrame. Pickle is chosen because it preserves Python data structures such as the NumPy audio array, which a CSV file would not.

In [101]:
# Wrap each field in a list so the DataFrame has exactly one row.
# Note: sample['file'] and sample['label'] are already one-element lists,
# so those two cells hold lists (see the printed output), not scalars.
d = {'file': [sample['file']], 'audio_array': [sample['audio'][0]['array']], 'label': [sample['label']]}
df = pd.DataFrame(data=d)
RAW_DATA_SAMPLE = os.path.join(ROOT_DIR, 'data', 'raw_sample_example', 'sample_example.pkl')
# to_pickle does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(RAW_DATA_SAMPLE), exist_ok=True)
df.to_pickle(RAW_DATA_SAMPLE)
print(df)

                          file  \
0  [bed/605ed0ff_nohash_0.wav]   

                                         audio_array label  
0  [-0.002471923828125, -0.007049560546875, -0.00...  [20]  


Read the pickle file back and check that the data was stored correctly.

In [99]:
# Read back the pickle file from the same path it was saved to.
# A bare 'sample_example.pkl' would be resolved against the current working
# directory, which is not where the file was written.
sample_df = pd.read_pickle(RAW_DATA_SAMPLE)
print(sample_df)

                          file  \
0  [bed/605ed0ff_nohash_0.wav]   

                                         audio_array label  
0  [-0.002471923828125, -0.007049560546875, -0.00...  [20]  


We can see that the audio array is still a NumPy array after being saved and reloaded:

In [100]:
# Show the type of the audio cell in row 0 of the reloaded frame.
print(type(sample_df.loc[0, "audio_array"]))

<class 'numpy.ndarray'>
