In [1]:
import h5py
import numpy as np
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [2]:
# Read the metadata CSV and pull its `id` column out as a standalone 1-D NumPy array.
df_md = pd.read_csv("audio_metadata - filtered.csv")
df_md_id = df_md["id"].to_numpy(copy=True)

In [3]:
# Show the first five ids as a quick check of what was loaded.
df_md_id[:5]

array([3843, 3443, 3440, 3439, 3454])

In [4]:
# Accumulators filled by the loading loop: one waveform and one sample rate per id.
wavs, srs = [], []

def load_sample(f, sample_id):
    """Read one sample's waveform and sample rate from an open container.

    Args:
        f: Open HDF5 file (or any nested mapping) exposing an ``"audio_data"``
            group whose children are keyed by the string form of each sample id.
        sample_id: Identifier of the sample; converted with ``str`` for lookup.

    Returns:
        Tuple ``(waveform, sample_rate)``, each materialised with ``np.array``.

    Raises:
        KeyError: Propagated from the lookup when the id or one of its two
            entries is missing (this is what dicts and h5py groups raise).
    """
    entry = f["audio_data"][str(sample_id)]
    return np.array(entry["waveform"]), np.array(entry["sample_rate"])

# Read every sample named in the metadata from the source file, keeping metadata order.
with h5py.File("audio_data.h5", "r") as h5_source:
    for sample_id in df_md_id:
        sample_wav, sample_sr = load_sample(h5_source, sample_id)
        wavs.append(sample_wav)
        srs.append(sample_sr)


In [5]:
# Every requested id must have produced exactly one waveform and one sample rate;
# otherwise the columns assembled from these sequences would be misaligned.
assert len(wavs) == len(srs) == len(df_md_id), (
    f"loaded {len(wavs)} waveforms and {len(srs)} sample rates "
    f"for {len(df_md_id)} metadata ids"
)

In [6]:
# One row per sample: its id, its sample rate, and its waveform array.
df_ar = pd.DataFrame(dict(id=df_md_id, sample_rate=srs, waveform=wavs))

In [7]:
# Preview the first five rows (five is the default for `head`).
df_ar.head()

Unnamed: 0,id,sample_rate,waveform
0,3843,44100,"[5.164626e-05, -0.000103295286, 2.5824858e-05,..."
1,3443,44100,"[-0.0004272461, -0.0016479492, -0.007232666, -..."
2,3440,44100,"[-0.0009765625, -0.0026550293, -0.009033203, -..."
3,3439,44100,"[-0.00039672852, -0.00045776367, -0.0014953613..."
4,3454,44100,"[-0.0029296875, -0.010894775, -0.051208496, -0..."


In [8]:
# Write each row of `df_ar` into a new HDF5 file, one group per sample id.
# Mode "w" creates the file, truncating it if it already exists.
with h5py.File("filtered_audio_data.h5", "w") as f:
    audio_data = f.create_group('audio_data')

    def save_sample(row):
        """Store one sample under ``audio_data/<id>``.

        Args:
            row: Mapping-like record providing ``'id'``, ``'waveform'`` and
                ``'sample_rate'``.
        """
        sample_group = audio_data.create_group(str(row['id']))
        # Only the waveform is gzip-compressed; the sample rate is stored as given.
        sample_group.create_dataset('waveform', data=row['waveform'], compression='gzip')
        sample_group.create_dataset('sample_rate', data=row['sample_rate'])

    # Write sequentially: h5py serialises HDF5 calls behind a global lock, so a
    # thread pool adds no parallelism here, while a plain loop keeps the write
    # order deterministic and stops at the first failing sample.
    for _, row in tqdm(df_ar.iterrows(), total=len(df_ar)):
        save_sample(row)
    

100%|██████████| 2047/2047 [00:08<00:00, 241.17it/s]
