In [2]:
import os
import tqdm
import json
import nemo
import shutil
import string
import fnmatch
import pathlib
import subprocess
import numpy as np
import pandas as pd

from nemo.collections import asr as nemo_asr

In [2]:
# Root of the LibriTTS working directory; every other location hangs off it.
BASE = pathlib.Path('/home/stanislavv/data/libritts')

# Source corpus and the flattened copy of its audio.
LOCAL_DATA_ROOT = BASE.joinpath('LibriTTS')
NGC_DATA_ROOT = BASE.joinpath('LibriTTS_ngc')

# Output directories for the two sets of manifests.
LOCAL_MANIFEST_ROOT = BASE.joinpath('local')
NGC_MANIFEST_ROOT = BASE.joinpath('ngc')

In [50]:
def process(part):
    """Copy one LibriTTS subset into the NGC layout and write its manifests.

    Every wav matching ``LOCAL_DATA_ROOT / part / '*/*/*.wav'`` is copied into
    the flat directory ``NGC_DATA_ROOT / part``. For each wav the sibling
    ``.normalized.txt`` and ``.original.txt`` transcripts are read, the
    duration is queried with ``soxi -D`` and the speaker id is taken from the
    file-name prefix before the first underscore.

    Two JSON-lines manifests named ``{part}.json`` are written: one under
    ``LOCAL_MANIFEST_ROOT`` pointing at the original wav paths, and one under
    ``NGC_MANIFEST_ROOT`` pointing at ``/data/libritts/{part}/{wav name}``.

    Args:
        part: Name of the subset directory, e.g. ``'dev-clean'``.

    Raises:
        subprocess.CalledProcessError: If ``soxi`` exits with a non-zero status.
        OSError: If a transcript is missing or ``soxi`` cannot be executed.
    """

    def read_text(file):
        # The transcripts contain non-ASCII characters, so do not rely on the
        # locale's default encoding (files are assumed to be UTF-8).
        with open(file, 'r', encoding='utf-8') as f:
            return f.read()

    def write_manifest(data, file):
        # One JSON object per line.
        with open(file, 'w', encoding='utf-8') as f:
            for example in data:
                f.write(json.dumps(example) + '\n')

    ngc_data_root = NGC_DATA_ROOT / part
    ngc_data_root.mkdir(parents=True, exist_ok=True)

    # Sorted so that the manifest order does not depend on directory listing order.
    wavs = sorted((LOCAL_DATA_ROOT / part).glob('*/*/*.wav'))
    local_data, ngc_data = [], []
    for wav in tqdm.tqdm(wavs):
        shutil.copy(wav, ngc_data_root / wav.name)

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to soxi verbatim.
        duration = float(subprocess.check_output(['soxi', '-D', str(wav)]))

        # Fields shared by both manifests; only the audio path differs.
        common = dict(
            duration=duration,
            normalized_text=read_text(wav.with_suffix('.normalized.txt')),
            original_text=read_text(wav.with_suffix('.original.txt')),
            speaker=int(wav.name.split('_')[0]),
        )

        local_data.append(dict(audio_filepath=str(wav), **common))
        ngc_data.append(
            dict(
                audio_filepath=str(pathlib.Path('/data/libritts') / part / wav.name),
                **common,
            )
        )

    LOCAL_MANIFEST_ROOT.mkdir(parents=True, exist_ok=True)
    write_manifest(local_data, LOCAL_MANIFEST_ROOT / f'{part}.json')

    NGC_MANIFEST_ROOT.mkdir(parents=True, exist_ok=True)
    write_manifest(ngc_data, NGC_MANIFEST_ROOT / f'{part}.json')


# Copy the 'dev-clean' subset and write its local and NGC manifests.
process('dev-clean')

100%|██████████| 5736/5736 [00:44<00:00, 128.77it/s]


In [51]:
# Every subset except 'dev-clean', which is processed by its own call.
remaining_parts = (
    'dev-other',
    'test-clean',
    'test-other',
    'train-clean-100',
    'train-clean-360',
    'train-other-500',
)
for part in remaining_parts:
    print(part)
    process(part)

  0%|          | 13/4613 [00:00<00:35, 127.91it/s]

dev-other


100%|██████████| 4613/4613 [00:32<00:00, 142.37it/s]
  0%|          | 14/4837 [00:00<00:36, 132.77it/s]

test-clean


100%|██████████| 4837/4837 [00:37<00:00, 130.14it/s]
  0%|          | 14/5120 [00:00<00:37, 135.68it/s]

test-other


100%|██████████| 5120/5120 [00:38<00:00, 131.62it/s]


train-clean-100


100%|██████████| 33236/33236 [04:31<00:00, 122.43it/s]


train-clean-360


100%|██████████| 116500/116500 [19:11<00:00, 101.19it/s]


train-other-500


100%|██████████| 205044/205044 [39:53<00:00, 85.67it/s]


In [53]:
# Copy the two dataset-level TSV files next to the local manifests...
shutil.copy(LOCAL_DATA_ROOT / 'eval_sentences10.tsv', LOCAL_MANIFEST_ROOT / 'eval_sentences10.tsv')
shutil.copy(LOCAL_DATA_ROOT / 'speakers.tsv', LOCAL_MANIFEST_ROOT / 'speakers.tsv')
# ...and next to the NGC manifests. The return value of this last call
# (the destination path) is what the cell displays.
shutil.copy(LOCAL_DATA_ROOT / 'eval_sentences10.tsv', NGC_MANIFEST_ROOT / 'eval_sentences10.tsv')
shutil.copy(LOCAL_DATA_ROOT / 'speakers.tsv', NGC_MANIFEST_ROOT / 'speakers.tsv')

PosixPath('/home/stanislavv/data/libritts/ngc/speakers.tsv')

In [54]:
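# The analysis cells below read `train-all.json`, the concatenation of the three
# train manifests. It is not produced by `process`; create it in the manifest
# directory (e.g. LOCAL_MANIFEST_ROOT) by running the following command there:
# !cat train-clean-100.json train-clean-360.json train-other-500.json >train-all.json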

## Analysis

In [7]:
# Collect every character that occurs in any normalized transcript.
alphabet = set()
analysis_parts = ('dev-clean', 'dev-other', 'test-clean', 'test-other', 'train-all')
for part in analysis_parts:
    print(part)
    manifest = LOCAL_MANIFEST_ROOT / f'{part}.json'
    with open(manifest, 'r') as f:
        # Each manifest line is one JSON object.
        for record in map(json.loads, f):
            alphabet |= set(record['normalized_text'])

dev-clean
dev-other
test-clean
test-other
train-all


In [25]:
# Despite its name, `non_ascii` holds every observed character that is not an
# ASCII letter, so ASCII punctuation is included as well.
alpha = string.ascii_letters
non_ascii = alphabet.difference(alpha)
print(*non_ascii)

æ ( ê ô . ! { ] ' ) } [ ; — ò é : œ ¯ "   ñ , ? - è /


In [24]:
# Remove the known punctuation (and the space); whatever is left needs a decision.
punct = '()[]{}!?,.;:-/\'" '
left = non_ascii.difference(punct)
print(*left)

æ — ò ê ô é œ è ¯ ñ


In [26]:
# One line per character group: letters, punctuation, leftovers.
for chars in (alpha, punct, left):
    print(*chars)

a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
( ) [ ] { } ! ? , . ; : - / ' "  
æ — ò ê ô é œ è ¯ ñ


In [4]:
# Distinct speaker ids appearing in the combined train manifest.
TRAIN_DATA = '/home/stanislavv/data/libritts/local/train-all.json'
train_all_sp = {
    entry['speaker']
    for entry in nemo_asr.parts.manifest.item_iter(TRAIN_DATA)
}
len(train_all_sp)

2311

In [32]:
# Distinct speaker ids appearing in the dev-clean manifest.
DATA = '/home/stanislavv/data/libritts/local/dev-clean.json'
dev_clean_sp = {
    entry['speaker']
    for entry in nemo_asr.parts.manifest.item_iter(DATA)
}
len(dev_clean_sp)

40

In [34]:
# Distinct speaker ids appearing in the dev-other manifest.
DATA = '/home/stanislavv/data/libritts/local/dev-other.json'
dev_other_sp = {
    entry['speaker']
    for entry in nemo_asr.parts.manifest.item_iter(DATA)
}
len(dev_other_sp)

33

In [6]:
# Speakers shared by train-all and dev-clean (an empty set means no overlap).
train_all_sp.intersection(dev_clean_sp)

set()

In [8]:
# Speakers shared by dev-clean and dev-other (an empty set means no overlap).
dev_clean_sp.intersection(dev_other_sp)

set()

In [9]:
# Speakers shared by train-all and dev-other (an empty set means no overlap).
train_all_sp.intersection(dev_other_sp)

set()

In [12]:
# NOTE(review): `items` is not assigned in any visible cell, so this fails on a
# fresh kernel. It is presumably a list of manifest entries (dicts with a
# 'speaker' key) -- confirm and define it explicitly before this cell.
for item in items:
    if item['speaker'] != 6358:
        continue
    print(item)

{'audio_file': '/home/stanislavv/data/libritts/LibriTTS/train-other-500/6358/73329/6358_73329_000053_000006.wav', 'duration': 3.62, 'text': 'A doorway cut near the fireplace showed there was probably an inner chamber.', 'offset': None, 'speaker': 6358, 'id': 188015}
{'audio_file': '/home/stanislavv/data/libritts/LibriTTS/train-other-500/6358/73329/6358_73329_000051_000002.wav', 'duration': 10.07, 'text': 'Accustomed to the life of a convent, they could not conceive of any other; and when one morning their bars and gratings were flung down, they had shuddered at finding themselves free.', 'offset': None, 'speaker': 6358, 'id': 188016}
{'audio_file': '/home/stanislavv/data/libritts/LibriTTS/train-other-500/6358/73329/6358_73329_000079_000001.wav', 'duration': 17.09, 'text': "Beneath the dome of saint Peter's in Rome, God had never seemed more majestic to man than he did now in this refuge of poverty and to the eyes of these Christians,--so true is it that between man and God all mediatio

In [11]:
# Sorted list of the distinct speaker ids found in `items`.
# NOTE(review): `items` is not assigned in any visible cell -- confirm where it comes from.
speakers = sorted({item['speaker'] for item in items})
6358 in speakers

True

In [15]:
SPEAKER_TSV = '/home/stanislavv/data/libritts/local/speakers.tsv'
speakers_frame = pd.read_csv(SPEAKER_TSV, sep='\t')
# NOTE(review): the values displayed for the 'READER' column are 'F'/'M', while
# the integer ids show up in the frame's index. This looks like the header
# having one field fewer than the data rows -- confirm against the file.
tsv_speakers = list(speakers_frame['READER'])
tsv_speakers[:10]

['F', 'F', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'M']

In [18]:
# Inspect the index that pandas builds for the speakers table.
speakers_table = pd.read_csv(SPEAKER_TSV, sep='\t')
speakers_table.index

Int64Index([  14,   16,   17,   19,   20,   22,   23,   25,   26,   27,
            ...
            8867, 8875, 8879, 8887, 8897, 8975, 9000, 9022, 9023, 9026],
           dtype='int64', length=2484)

# Embeddings

In [45]:
# Width of the per-speaker accumulator rows.
EMB_SIZE = 256
# Directory tree that is searched for `.npy` embedding files.
EMB_ROOT = '/home/stanislavv/data/libritts/speaker-embeddings-16k'
# Speakers table whose index defines the row order of the aggregated matrix.
SPEAKERS_TABLE = '/home/stanislavv/data/libritts/ngc/speakers.tsv'

In [46]:
# Map every value of the table's index to its row position.
speaker_ids = pd.read_csv(SPEAKERS_TABLE, sep='\t').index
sid_to_i = dict(zip(speaker_ids, range(len(speaker_ids))))
len(sid_to_i)

2484

In [47]:
# Accumulators: one row of summed embeddings and one file counter per speaker.
n_speakers = len(sid_to_i)
embs = np.zeros((n_speakers, EMB_SIZE))
nums = np.zeros(n_speakers)

In [48]:
# Sum every embedding file into its speaker's row and count the files.
# `embs` and `nums` are updated in place, so re-running this cell without
# re-creating them adds every file a second time.
path = pathlib.Path(EMB_ROOT)
files = []
for f in tqdm.tqdm(path.glob('*/*/*/*.npy')):
    # The speaker id is the file-name prefix before the first underscore.
    speaker_id = int(f.name.split('_')[0])
    arr = np.load(str(f))

    row = sid_to_i[speaker_id]
    embs[row] += arr
    nums[row] += 1

375086it [02:34, 2427.98it/s]


In [49]:
# Per-speaker mean. Speakers with no files keep an all-zero row: their count is
# replaced by 1 so the division is well defined.
safe_counts = np.where(nums == 0.0, 1, nums)
sembs = embs / safe_counts[:, np.newaxis]
sembs.shape

(2484, 256)

In [50]:
# Value range of the averaged embeddings.
lowest, highest = sembs.min(), sembs.max()
lowest, highest

(0.0, 0.3968268036842346)

In [51]:
# NOTE(review): this writes under '.../data/librimeta/', not the
# '.../data/libritts/' tree used everywhere else -- confirm this is intended.
sembs_path = '/home/stanislavv/data/librimeta/speaker-embs/16k.npy'
np.save(sembs_path, sembs)