# MOS

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import nemo
import tqdm
import torch
import random
import shutil
import pathlib
import librosa
import argparse
import numpy as np
import pandas as pd
import scipy.stats as st

from ruamel import yaml
from nemo.collections import asr as nemo_asr
from nemo.collections import tts as nemo_tts

In [6]:
DATA = pathlib.Path('/home/stanislavv/data')
LJSPEECH, MOS = DATA / 'ljspeech/local', DATA / 'mos'
MODELS, BATCHES = MOS / 'models', MOS / 'batches'

## Meta

### TalkNet part

In [249]:
LJ = pathlib.Path('/home/stanislavv/data/ljspeech')

In [250]:
def load_local(file):
    data = {}
    for e in nemo_asr.parts.manifest.item_iter([file]):
        name, text = pathlib.Path(e['audio_file']).stem, e['text'].strip()

        data[name] = text
    
    return data


local_eval = load_local(LJSPEECH / 'split3/eval.json')
local_test = load_local(LJSPEECH / 'split3/test.json')
local = {**local_eval, **local_test}
len(local)

600

### Tacotron 2 part

In [251]:
def local_t2(file):
    data = {}
    with open(file) as f:
        for line in f:
            name, text = line.split('|')
            name, text = pathlib.Path(name).stem, text.strip()

            data[name] = text

    return data


t2_eval = local_t2(MOS / 'tacotron2/ljs_audio_text_val_filelist.txt')
t2_test = local_t2(MOS / 'tacotron2/ljs_audio_text_test_filelist.txt')
t2 = {**t2_eval, **t2_test}
len(t2)

600

### Merge

In [252]:
len(local.keys() - t2.keys()), len(local.keys() & t2.keys()), len(t2.keys() - local.keys())

(576, 24, 576)

In [256]:
def sample(keys, k, data):
    keys = list(keys)
    random.shuffle(keys)

    return {k: data[k] for k in keys[:k]}


local_part = sample(local.keys() - t2.keys(), 250, local)
t2_part = sample(t2.keys() - local.keys(), 250, t2)
test = {**local_part, **t2_part}
len(test)

500

### Dumping

In [293]:
df = pd.DataFrame(list(test.items()), columns=['name', 'text'])
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle
df.to_csv(MOS / 'meta.csv', index=False)
pd.read_csv(MOS / 'meta.csv').head()

Unnamed: 0,name,text
0,LJ012-0054,"Solomons, while waiting to appear in court, pe..."
1,LJ035-0187,he also directed that the back door be sealed ...
2,LJ035-0148,She testified that she heard three shots which...
3,LJ031-0008,"Meanwhile, Chief Curry ordered the police base..."
4,LJ021-0139,There should be at least a full and fair trial...


In [117]:
df = pd.read_csv(MOS / 'meta.csv')
df.head()

Unnamed: 0,name,text
0,LJ012-0054,"Solomons, while waiting to appear in court, pe..."
1,LJ035-0187,he also directed that the back door be sealed ...
2,LJ035-0148,She testified that she heard three shots which...
3,LJ031-0008,"Meanwhile, Chief Curry ordered the police base..."
4,LJ021-0139,There should be at least a full and fair trial...


## Models

### Ground Truth

In [120]:
GT = MODELS / 'ground-truth'

In [121]:
GT.mkdir(parents=True, exist_ok=True)
for i, e in df.iterrows():
    name, text = e['name'], e['text']
    shutil.copy(LJSPEECH / f'wavs/{name}.wav', GT / f'{i:03}.wav')

### Mel + WaveGlow

In [8]:
MEL_WG = MODELS / 'mel-waveglow'
CONFIG = pathlib.Path('/home/stanislavv/src/tts/NeMo/examples/tts/configs/fasterspeech-mels-lj.yaml')
WG_CODE = pathlib.Path('../../waveglow')
WG_CKPT = DATA / 'ckpts/waveglow.pth'

In [9]:
yaml_loader = yaml.YAML(typ='safe')
with open(CONFIG) as f:
    config = argparse.Namespace(**yaml_loader.load(f))
nemo.core.NeuralModuleFactory();
data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
    **config.AudioToMelSpectrogramPreprocessor
)
wf = nemo_asr.parts.features.WaveformFeaturizer(sample_rate=config.sample_rate)
wg = nemo_tts.WaveGlowInference(str(WG_CODE), WG_CKPT)

MEL_WG.mkdir(parents=True, exist_ok=True)

for i, e in tqdm.notebook.tqdm(df.iterrows(), total=len(df)):
    name, text = e['name'], e['text']
    audio = wf.process(LJSPEECH / f'wavs/{name}.wav')
    audio_len = torch.tensor(audio.shape[0]).long()
    mel = data_preprocessor.forward(audio.cuda().unsqueeze_(0), audio_len.cuda().unsqueeze_(0))[0][0]
    audio = wg(mel, sigma=0.666, denoiser=0.01, norm=False)
    librosa.output.write_wav(MEL_WG / f'{i:03}.wav', audio, config.sample_rate, norm=False)

[NeMo I 2020-05-01 00:31:56 features:144] PADDING: 1
[NeMo I 2020-05-01 00:31:56 features:152] STFT using conv




HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))

  mel = torch.tensor(mel, device='cuda').unsqueeze(0)





### Tacotron 2 + WaveGlow

In [36]:
# Procced to MOS / 'tacotron2'.

### TalkNet

In [38]:
# Procced to separate nb.

## Index

In [218]:
AUDIO_URL = 'https://acb2d2f5.ngrok.io'
MODELS_NAMES = [d.name for d in MODELS.iterdir()]
MODELS_NAMES.remove('hard50')
MODELS_NAMES

['mel-waveglow',
 'ground-truth',
 'talknet-waveglow_unbiased-0.05',
 'talknet-waveglow_better',
 'tacotron2-waveglow',
 'talknet-waveglow_unbiased-0.03-drop-0.05',
 'talknet-waveglow']

In [219]:
sample = pd.read_csv(MOS / 'sample.csv')
sample.head()

Unnamed: 0,audio_url
0,Hit1_audio_url_data
1,Hit2_audio_url_data
2,Hit3_audio_url_data


In [221]:
def make_index(models, k, file):
    urls = []
    for model in models:
        for i in range(k):
            urls.append(f'{AUDIO_URL}/{model}/{i:03}.wav')

    index = pd.DataFrame(urls, columns=['audio_url'])
    index = index.sample(frac=1).reset_index(drop=True)  # Shuffle

    index.to_csv(file, index=False)

make_index(MODELS_NAMES, 100, MOS / 'indexes/mem.csv')
!ls $MOS/indexes

big100.csv  biggest100.csv  last50.csv	  medium.csv  onemore50.csv  tundra.csv
big50.csv   kek.csv	    medium-2.csv  mem.csv     small.csv      whole.csv


## Resutls

In [15]:
MARKS = {
    'Excellent - Completely natural speech': 5,
    'Good - Mostly natural speech': 4,
    'Fair - Equally natural and unnatural speech': 3,
    'Poor - Mostly unnatural speech': 2,
    'Bad - Completely unnatural speech': 1,
}
MARKS_NEW = {
    'Excellent': 5,
    'Good': 4,
    'Fair': 3,
    'Poor': 2,
    'Bad': 1,
}
MARKS_NEW_2 = {
    '3.5': 3.5,
    '1.0 - Bad': 1.0,
    '2.0': 2.0,
    '2.5': 2.5,
    '5.0 - Excellent': 5.0, 
    '1.5': 1.5,
    '3.0': 3.0,
    '4.5': 4.5,
    '4.0': 4.0,
}

In [17]:
def process_batch(batch, marks=MARKS, min_seconds=0):
    batch = pd.read_csv(BATCHES / f'{batch}.csv')
    
    def parse(url):
        *_, model, example = url.split('/')
        example = example.split('.')[0]
        return model, example

    data = []
    for _, e in batch.iterrows():
        mark = marks.get(e['Answer.audio-naturalness.label'])
        model, example = parse(e['Input.audio_url'])
        time = int(e['WorkTimeInSeconds'])

        if time >= min_seconds:
            data.append((model, example, mark))
    
    df = pd.DataFrame(data, columns=['model', 'example', 'mark'])

    return df


df = process_batch('mem', MARKS_NEW_2)
df.groupby(['model']).mean().sort_values('mark')

Unnamed: 0_level_0,mark
model,Unnamed: 1_level_1
talknet-waveglow,3.3285
talknet-waveglow_better,3.3295
talknet-waveglow_unbiased-0.03-drop-0.05,3.3695
talknet-waveglow_unbiased-0.05,3.3995
tacotron2-waveglow,3.8035
mel-waveglow,3.9955
ground-truth,4.2795


In [215]:
df[df.model == 'talknet-waveglow'].groupby(['example']).mean().sort_values('mark').index[:10]

Index(['086', '050', '033', '097', '022', '036', '001', '067', '009', '077'], dtype='object', name='example')

In [153]:
!ls $MOS/batches

big50.csv	last50.csv    medium.csv      onemore50.csv  tundra.csv
biggest100.csv	medium-2.csv  medium-new.csv  small.csv      whole.csv


In [200]:
def collect_all(batches=None):
    dfs = []
    for batch in (batches or BATCHES.iterdir()):
        dfs.append(process_batch(batch))
    
#     dfs.append(process_batch('kek', MARKS_NEW))
    
    df = pd.concat(dfs)
    return df


df = collect_all([
    'big50', 'biggest100', 'last50',
    'onemore50', 'tundra', 'whole',
])
df.head()

Unnamed: 0,model,example,mark
0,talknet-waveglow_l2-log-400,32,3
1,talknet-waveglow_l2-log-400,32,4
2,talknet-waveglow_l2-log-400,32,4
3,talknet-waveglow_l2-log-400,22,3
4,talknet-waveglow_l2-log-400,22,2


In [194]:
df.groupby(['model']).mean().sort_values('mark')

Unnamed: 0_level_0,mark
model,Unnamed: 1_level_1
talknet-waveglow,3.101
tacotron2-waveglow,3.502
mel-waveglow,4.018
ground-truth,4.369


In [82]:
df = process_batch('medium-2', min_seconds=10)
df.groupby(['model']).mean().sort_values('mark')

Unnamed: 0_level_0,mark
model,Unnamed: 1_level_1
talknet-megatron-xe,3.529412
talknet-megatron-l2log,3.546875
talknet-megatron-l2log-better,3.745902
tacotron2-waveglow,3.848485
mel-waveglow,4.03876
ground-truth,4.087302


In [84]:
df.groupby('model').std()

Unnamed: 0_level_0,mark
model,Unnamed: 1_level_1
ground-truth,0.829649
mel-waveglow,0.887439
tacotron2-waveglow,0.795837
talknet-megatron-l2log,0.912377
talknet-megatron-l2log-better,0.777381
talknet-megatron-xe,0.811188


In [18]:
def conf95(a):
    l, r = st.t.interval(0.95, len(a) - 1, loc=np.mean(a), scale=st.sem(a))
    return (l + r) / 2, (r - l) / 2

In [19]:
df.groupby('model')['mark'].apply(conf95)

model
ground-truth                                (4.2795, 0.053358699210656724)
mel-waveglow                                (3.9955, 0.056365839386855576)
tacotron2-waveglow                           (3.8035, 0.06007056476917061)
talknet-waveglow                             (3.3285, 0.07018263872182962)
talknet-waveglow_better                      (3.3295, 0.06651233510197319)
talknet-waveglow_unbiased-0.03-drop-0.05     (3.3695, 0.06797522693855784)
talknet-waveglow_unbiased-0.05                (3.3995, 0.0666851026350721)
Name: mark, dtype: object

In [219]:
ii = df[df.model == 'talknet-waveglow'].groupby('example').mean().sort_values('mark')[:25].index.tolist()
print(*ii)

12 89 80 66 44 81 55 64 71 75 58 57 16 11 51 28 54 5 53 69 60 8 83 91 95
