In [1]:
import json
from pathlib import Path

In [2]:
# Root directory holding the LibriSpeech index files. The path is relative to
# the notebook's working directory, so fail fast with the resolved location
# when it is missing (the usual cause is launching Jupyter from another folder).
librispeech = Path('data/datasets/librispeech')
assert librispeech.is_dir(), (
    f'LibriSpeech directory not found: {librispeech.resolve()}'
)

In [4]:
# Dataset splits to inspect; each one is expected to have a
# `<split>_index.json` file directly under `librispeech`.
names = ['dev-clean', 'train-clean-100', 'test-clean', 'test-other']

# Parsed index of every split, stored in the same order as `names`.
index = []
for name in names:
    index_path = librispeech / f'{name}_index.json'
    assert index_path.exists(), f'Missing index file: {index_path}'
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which is locale-dependent and may not be UTF-8.
    with open(index_path, 'r', encoding='utf-8') as f:
        index.append(json.load(f))

In [9]:
import pandas as pd
import numpy as np

# Quantile levels (in percent) reported alongside the maxima.
q = [99, 95]
# The same levels as fractions, computed once instead of on every iteration.
quantile_fracs = [q_i / 100 for q_i in q]

# Column order of the summary table.
# NOTE: the `max_text_len_{q}%` columns hold quantiles of the text length (not
# maxima); the labels are kept as-is so existing references keep working.
stat_columns = (
    ['n_samples', 'max_audio_len'] +
    [f'audio_len_{q_i}%' for q_i in q] +
    ['max_text_len'] +
    [f'max_text_len_{q_i}%' for q_i in q] +
    ['n_hours']
)

# Gather one row of statistics per split and build the frame in a single step:
# assigning rows one by one into an empty frame leaves every column with
# `object` dtype, whereas constructing it once yields proper numeric columns.
stat_rows = {}
for name, ind in zip(names, index):
    # Each entry of a split index provides an 'audio_len' and a 'text' field.
    audio_len = [x['audio_len'] for x in ind]
    text_len = [len(x['text']) for x in ind]  # len() of the 'text' field

    stat_rows[name] = [
        len(ind),
        max(audio_len),
        *np.quantile(audio_len, quantile_fracs),
        max(text_len),
        *np.quantile(text_len, quantile_fracs),
        # assumes audio_len is expressed in seconds — TODO confirm
        sum(audio_len) / 60 / 60,
    ]

df = pd.DataFrame.from_dict(stat_rows, orient='index', columns=stat_columns)
df

Unnamed: 0,n_samples,max_audio_len,audio_len_99%,audio_len_95%,max_text_len,max_text_len_99%,max_text_len_95%,n_hours
dev-clean,2703,32.645,23.755,16.4135,516,366.96,256.9,5.387811
train-clean-100,28539,24.525,16.7031,16.085,398,289.0,262.0,100.59088
test-clean,2620,34.955,25.47575,17.842,576,363.05,261.0,5.403467
test-other,2939,34.51,21.4248,15.761,618,320.24,226.0,5.341547
