In [48]:
import os
import tqdm
import json
import shutil
import fnmatch
import pathlib
import subprocess
import numpy as np

In [37]:
# Root directory holding the LibriTTS download and everything derived from it.
# Override with the LIBRITTS_BASE environment variable to run this notebook on
# another machine; the default keeps the original author's location.
BASE = pathlib.Path(os.environ.get('LIBRITTS_BASE', '/home/stanislavv/data/libritts'))
# Source corpus; `process` globs `<part>/*/*/*.wav` underneath it.
LOCAL_DATA_ROOT = BASE / 'LibriTTS'
# Flat per-split copy of the audio files, written by `process`.
NGC_DATA_ROOT = BASE / 'LibriTTS_ngc'
# Output directories for the manifests that point at the local / NGC audio paths.
LOCAL_MANIFEST_ROOT = BASE / 'local'
NGC_MANIFEST_ROOT = BASE / 'ngc'

In [50]:
def process(part):
    """Copy one LibriTTS split into a flat directory and write its two manifests.

    Every file matching ``*/*/*.wav`` under ``LOCAL_DATA_ROOT / part`` is copied
    to ``NGC_DATA_ROOT / part`` under its bare file name. For each file one
    manifest entry is produced with the keys ``audio_filepath``, ``duration``,
    ``normalized_text``, ``original_text`` and ``speaker``. Two JSON-lines
    manifests named ``<part>.json`` are written:

    * ``LOCAL_MANIFEST_ROOT / <part>.json`` -- ``audio_filepath`` is the
      original path of the wav file.
    * ``NGC_MANIFEST_ROOT / <part>.json`` -- ``audio_filepath`` is
      ``/data/libritts/<part>/<file name>``.

    Files are processed in sorted path order so repeated runs produce
    identical manifests.

    Args:
        part: Name of the split directory, e.g. ``'dev-clean'``.

    Raises:
        FileNotFoundError: If a transcript file is missing or the ``soxi``
            executable cannot be found.
        subprocess.CalledProcessError: If ``soxi`` exits with an error.
        ValueError: If ``soxi`` output is not a number or the file name does
            not start with an integer followed by ``_``.
    """

    def read_text(file):
        # Explicit encoding so the result does not depend on the locale.
        with open(file, 'r', encoding='utf-8') as f:
            return f.read()

    def write_manifest(data, file):
        # One JSON object per line.
        with open(file, 'w', encoding='utf-8') as f:
            for example in data:
                f.write(json.dumps(example) + '\n')

    ngc_data_root = NGC_DATA_ROOT / part
    ngc_data_root.mkdir(parents=True, exist_ok=True)

    # glob() yields paths in filesystem order; sort for reproducible manifests.
    wavs = sorted((LOCAL_DATA_ROOT / part).glob('*/*/*.wav'))
    local_data, ngc_data = [], []
    for wav in tqdm.tqdm(wavs):
        shutil.copy(wav, ngc_data_root / wav.name)

        # with_suffix swaps only the trailing '.wav', giving '<stem>.normalized.txt'.
        normalized_text = read_text(wav.with_suffix('.normalized.txt'))
        original_text = read_text(wav.with_suffix('.original.txt'))
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to soxi verbatim.
        duration = float(subprocess.check_output(['soxi', '-D', str(wav)]))
        # The leading '_'-separated field of the file name is used as speaker id.
        speaker = int(wav.name.split('_')[0])

        local_entry = dict(
            audio_filepath=str(wav),
            duration=duration,
            normalized_text=normalized_text,
            original_text=original_text,
            speaker=speaker,
        )
        local_data.append(local_entry)

        # Same entry, but pointing at the flat copy under /data/libritts
        # (presumably where the data is mounted on NGC -- confirm).
        ngc_data.append(
            dict(
                local_entry,
                audio_filepath=str(pathlib.Path('/data/libritts') / part / wav.name),
            )
        )

    LOCAL_MANIFEST_ROOT.mkdir(parents=True, exist_ok=True)
    write_manifest(local_data, LOCAL_MANIFEST_ROOT / f'{part}.json')

    NGC_MANIFEST_ROOT.mkdir(parents=True, exist_ok=True)
    write_manifest(ngc_data, NGC_MANIFEST_ROOT / f'{part}.json')


# Run a single split on its own before looping over the remaining ones.
process('dev-clean')

100%|██████████| 5736/5736 [00:44<00:00, 128.77it/s]


In [51]:
# Remaining splits; 'dev-clean' is handled by its own process() call.
for part in (
    'dev-other',
    'test-clean',
    'test-other',
    'train-clean-100',
    'train-clean-360',
    'train-other-500',
):
    print(part)
    process(part)

  0%|          | 13/4613 [00:00<00:35, 127.91it/s]

dev-other


100%|██████████| 4613/4613 [00:32<00:00, 142.37it/s]
  0%|          | 14/4837 [00:00<00:36, 132.77it/s]

test-clean


100%|██████████| 4837/4837 [00:37<00:00, 130.14it/s]
  0%|          | 14/5120 [00:00<00:37, 135.68it/s]

test-other


100%|██████████| 5120/5120 [00:38<00:00, 131.62it/s]


train-clean-100


100%|██████████| 33236/33236 [04:31<00:00, 122.43it/s]


train-clean-360


100%|██████████| 116500/116500 [19:11<00:00, 101.19it/s]


train-other-500


100%|██████████| 205044/205044 [39:53<00:00, 85.67it/s]


In [53]:
# Ship the corpus-level metadata files alongside both sets of manifests.
for manifest_root in (LOCAL_MANIFEST_ROOT, NGC_MANIFEST_ROOT):
    # Create the target so this cell also works when run before process().
    manifest_root.mkdir(parents=True, exist_ok=True)
    for metadata_name in ('eval_sentences10.tsv', 'speakers.tsv'):
        shutil.copy(LOCAL_DATA_ROOT / metadata_name, manifest_root / metadata_name)

PosixPath('/home/stanislavv/data/libritts/ngc/speakers.tsv')

In [54]:
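# Optional: concatenate the three training manifests into a single file.
# The paths are relative, so run this from inside a manifest directory.
# !cat train-clean-100.json train-clean-360.json train-other-500.json >train-all.json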