In [1]:
import csv
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.colheader_justify', 'center')

In [3]:
SAMPLE_AUDIO_PATH = "generated_hifi/esd_tune_advloss0/g_3164999/0020/surprise/9.wav"
EVAL_MAP_PATH = "reports/eval_map.csv"
SCORES_PATH = "data/scores_results.xlsx"

In [4]:
## NOTE: all model keys need to be composed of a single letter followed by a number
model2folder = {
    "m1": 'generated_hifi/esd_tune/g_3164999',
    "m2": 'generated_hifi/esd_tune_reversal/g_3164999',
    "m3": 'generated_hifi/esd_tune_advloss0/g_3164999',
}

In [5]:
folder2model = {v: k for k, v in model2folder.items()}
folder2model

{'generated_hifi/esd_tune/g_3164999': 'm1',
 'generated_hifi/esd_tune_reversal/g_3164999': 'm2',
 'generated_hifi/esd_tune_advloss0/g_3164999': 'm3'}

In [6]:
df_map = pd.read_csv(
    EVAL_MAP_PATH,
    sep = "\t",
    header = None,
    quoting = csv.QUOTE_NONE,
)

df_map

Unnamed: 0,0,1,2,3,4,5
0,-,-,id_1,id_2,id_3,-
1,"We got few vegetables and fruits , and became fish eaters .",0011,generated_hifi/esd_tune_reversal/g_3164999/0011/angry/1.wav,generated_hifi/esd_tune/g_3164999/0011/angry/1.wav,generated_hifi/esd_tune_advloss0/g_3164999/0011/angry/1.wav,"We got few vegetables and fruits , and became fish eaters ."
2,-,-,id_4,id_5,id_6,-
3,Humans also judge distance by using the relative sizes of objects.,0011,generated_hifi/esd_tune_reversal/g_3164999/0011/angry/10.wav,generated_hifi/esd_tune_advloss0/g_3164999/0011/angry/10.wav,generated_hifi/esd_tune/g_3164999/0011/angry/10.wav,Humans also judge distance by using the relative sizes of objects.
4,-,-,id_7,id_8,id_9,-
...,...,...,...,...,...,...
2695,The Claudine was leaving next morning for Honolulu .,0020,generated_hifi/esd_tune_reversal/g_3164999/0020/surprise/7.wav,generated_hifi/esd_tune/g_3164999/0020/surprise/7.wav,generated_hifi/esd_tune_advloss0/g_3164999/0020/surprise/7.wav,The Claudine was leaving next morning for Honolulu .
2696,-,-,id_4045,id_4046,id_4047,-
2697,Prosecutors have opened a massive investigation into allegations of fixing games and illegal betting.,0020,generated_hifi/esd_tune/g_3164999/0020/surprise/8.wav,generated_hifi/esd_tune_reversal/g_3164999/0020/surprise/8.wav,generated_hifi/esd_tune_advloss0/g_3164999/0020/surprise/8.wav,Prosecutors have opened a massive investigation into allegations of fixing games and illegal betting.
2698,-,-,id_4048,id_4049,id_4050,-


In [7]:
def get_audio_details(f):
    f_path = Path(f)
    model_ = folder2model[str(f_path.parent.parent.parent)]
    basename_ = f_path.name
    sentence_id = int(f_path.stem)
    speaker = f_path.parent.parent.name
    accent = None
    return {
        'model': model_,
        'sent_id': sentence_id,
        'speaker': speaker,
        'accent': accent
    }


get_audio_details(SAMPLE_AUDIO_PATH)

{'model': 'm3', 'sent_id': 9, 'speaker': '0020', 'accent': None}

In [8]:
def process_cell(f):
    if Path(f).exists():
        return get_audio_details(f)
    return (f)

In [9]:
df_detailed = df_map.applymap(lambda f: process_cell(f))
df_detailed.columns = ['text_left', 'speaker', 'm1', 'm2', 'm3', 'text_right']
df_detailed

Unnamed: 0,text_left,speaker,m1,m2,m3,text_right
0,-,-,id_1,id_2,id_3,-
1,"We got few vegetables and fruits , and became fish eaters .",0011,"{'model': 'm2', 'sent_id': 1, 'speaker': '0011', 'accent': None}","{'model': 'm1', 'sent_id': 1, 'speaker': '0011', 'accent': None}","{'model': 'm3', 'sent_id': 1, 'speaker': '0011', 'accent': None}","We got few vegetables and fruits , and became fish eaters ."
2,-,-,id_4,id_5,id_6,-
3,Humans also judge distance by using the relative sizes of objects.,0011,"{'model': 'm2', 'sent_id': 10, 'speaker': '0011', 'accent': None}","{'model': 'm3', 'sent_id': 10, 'speaker': '0011', 'accent': None}","{'model': 'm1', 'sent_id': 10, 'speaker': '0011', 'accent': None}",Humans also judge distance by using the relative sizes of objects.
4,-,-,id_7,id_8,id_9,-
...,...,...,...,...,...,...
2695,The Claudine was leaving next morning for Honolulu .,0020,"{'model': 'm2', 'sent_id': 7, 'speaker': '0020', 'accent': None}","{'model': 'm1', 'sent_id': 7, 'speaker': '0020', 'accent': None}","{'model': 'm3', 'sent_id': 7, 'speaker': '0020', 'accent': None}",The Claudine was leaving next morning for Honolulu .
2696,-,-,id_4045,id_4046,id_4047,-
2697,Prosecutors have opened a massive investigation into allegations of fixing games and illegal betting.,0020,"{'model': 'm1', 'sent_id': 8, 'speaker': '0020', 'accent': None}","{'model': 'm2', 'sent_id': 8, 'speaker': '0020', 'accent': None}","{'model': 'm3', 'sent_id': 8, 'speaker': '0020', 'accent': None}",Prosecutors have opened a massive investigation into allegations of fixing games and illegal betting.
2698,-,-,id_4048,id_4049,id_4050,-


In [10]:
total_K_models = len(model2folder)
total_K_models

3

In [11]:
total_wavs_per_model = int(len(df_map) / 2)
total_wavs_per_model

1350

In [12]:
x_range = np.array(list(range(total_wavs_per_model)))
x_indices = x_range * 2
y_indices = x_range * 2 + 1

In [13]:
m1_indices = df_detailed['m1'].iloc[x_indices]
m2_indices = df_detailed['m2'].iloc[x_indices]
m3_indices = df_detailed['m3'].iloc[x_indices]
m1_indices

0          id_1
2          id_4
4          id_7
6         id_10
8         id_13
         ...   
2690    id_4036
2692    id_4039
2694    id_4042
2696    id_4045
2698    id_4048
Name: m1, Length: 1350, dtype: object

In [14]:
meta = {}

for model_ in folder2model.values():
    meta.update(
        dict(zip(df_detailed[model_].iloc[x_indices], df_detailed[model_].iloc[y_indices]))
    )

meta

{'id_1': {'model': 'm2', 'sent_id': 1, 'speaker': '0011', 'accent': None},
 'id_4': {'model': 'm2', 'sent_id': 10, 'speaker': '0011', 'accent': None},
 'id_7': {'model': 'm1', 'sent_id': 11, 'speaker': '0011', 'accent': None},
 'id_10': {'model': 'm1', 'sent_id': 12, 'speaker': '0011', 'accent': None},
 'id_13': {'model': 'm3', 'sent_id': 13, 'speaker': '0011', 'accent': None},
 'id_16': {'model': 'm3', 'sent_id': 14, 'speaker': '0011', 'accent': None},
 'id_19': {'model': 'm1', 'sent_id': 15, 'speaker': '0011', 'accent': None},
 'id_22': {'model': 'm3', 'sent_id': 16, 'speaker': '0011', 'accent': None},
 'id_25': {'model': 'm1', 'sent_id': 17, 'speaker': '0011', 'accent': None},
 'id_28': {'model': 'm3', 'sent_id': 18, 'speaker': '0011', 'accent': None},
 'id_31': {'model': 'm1', 'sent_id': 19, 'speaker': '0011', 'accent': None},
 'id_34': {'model': 'm3', 'sent_id': 2, 'speaker': '0011', 'accent': None},
 'id_37': {'model': 'm3', 'sent_id': 20, 'speaker': '0011', 'accent': None},
 'id

In [15]:
assert(len(meta) == total_K_models*total_wavs_per_model)

In [16]:
sentences = {}

for model_ in folder2model.values():
    sentences.update(
        dict(zip(df_detailed[model_].iloc[x_indices], df_detailed['text_left'].iloc[y_indices]))
    )

for k, v in sentences.items():
    sentences[k] = {'text': v}

sentences

{'id_1': {'text': 'We got few vegetables and fruits , and became fish eaters .'},
 'id_4': {'text': 'Humans also judge distance by using the relative sizes of objects.'},
 'id_7': {'text': 'If this is true then those who tend to think creatively, really are somehow different.'},
 'id_10': {'text': 'But really in the grand scheme of things, this information is insignificant.'},
 'id_13': {'text': 'He had a private jet with three king-size beds, expensive rugs, porcelain vases and a dining area.'},
 'id_16': {'text': 'When I reached Atlanta my steadily increasing disappointment was not lessened. I found it a big, dull, red town.'},
 'id_19': {'text': 'She woke Meg with a "Merry Christmas", and bade her see what was under her pillow. A green–covered book appeared, with the same picture inside, and a few words written by their mother, which made their one present very precious in their eyes.'},
 'id_22': {'text': "Does Jane know about your new job? No, and don't you dare tell her! She will

In [17]:
def fillna_with_random_scores(df):
    val_range = [1, 2, 3]
    ranval_array = np.random.choice(val_range, size=(df.shape[0], df.shape[1]))
    ranval_df = pd.DataFrame(ranval_array, columns=df.columns, index=df.index)
    return df.fillna(ranval_df)


df_scores = pd.read_excel(SCORES_PATH)
df_scores = fillna_with_random_scores(df_scores)  # If scores are unfilled
df_scores

Unnamed: 0.1,Unnamed: 0,m1_text,m1_accent_speaker,m1,m2,m3,m2_text
0,0,-,-,id_1,id_2,id_3,-
1,1,"We got few vegetables and fruits , and became fish eaters .",0011,2,2,1,"We got few vegetables and fruits , and became fish eaters ."
2,2,-,-,id_4,id_5,id_6,-
3,3,Humans also judge distance by using the relative sizes of objects.,0011,3,3,3,Humans also judge distance by using the relative sizes of objects.
4,4,-,-,id_7,id_8,id_9,-
...,...,...,...,...,...,...,...
2695,2695,The Claudine was leaving next morning for Honolulu .,0020,2,2,3,The Claudine was leaving next morning for Honolulu .
2696,2696,-,-,id_4045,id_4046,id_4047,-
2697,2697,Prosecutors have opened a massive investigation into allegations of fixing games and illegal betting.,0020,1,2,3,Prosecutors have opened a massive investigation into allegations of fixing games and illegal betting.
2698,2698,-,-,id_4048,id_4049,id_4050,-


In [18]:
scores = {}

for model_ in folder2model.values():
    scores.update(
        dict(zip(df_scores[model_].iloc[x_indices], df_scores[model_].iloc[y_indices]))
    )

for k,v in scores.items():
    scores[k] = {'nmistakes': v}

scores

{'id_1': {'nmistakes': 2},
 'id_4': {'nmistakes': 3},
 'id_7': {'nmistakes': 3},
 'id_10': {'nmistakes': 3},
 'id_13': {'nmistakes': 2},
 'id_16': {'nmistakes': 3},
 'id_19': {'nmistakes': 1},
 'id_22': {'nmistakes': 2},
 'id_25': {'nmistakes': 3},
 'id_28': {'nmistakes': 2},
 'id_31': {'nmistakes': 3},
 'id_34': {'nmistakes': 1},
 'id_37': {'nmistakes': 2},
 'id_40': {'nmistakes': 2},
 'id_43': {'nmistakes': 3},
 'id_46': {'nmistakes': 3},
 'id_49': {'nmistakes': 1},
 'id_52': {'nmistakes': 1},
 'id_55': {'nmistakes': 2},
 'id_58': {'nmistakes': 2},
 'id_61': {'nmistakes': 2},
 'id_64': {'nmistakes': 1},
 'id_67': {'nmistakes': 3},
 'id_70': {'nmistakes': 3},
 'id_73': {'nmistakes': 3},
 'id_76': {'nmistakes': 1},
 'id_79': {'nmistakes': 2},
 'id_82': {'nmistakes': 1},
 'id_85': {'nmistakes': 2},
 'id_88': {'nmistakes': 2},
 'id_91': {'nmistakes': 2},
 'id_94': {'nmistakes': 1},
 'id_97': {'nmistakes': 1},
 'id_100': {'nmistakes': 2},
 'id_103': {'nmistakes': 2},
 'id_106': {'nmistake

In [19]:
pref = {}

for model_ in folder2model.values():
    pref.update(
        dict(zip(df_scores[model_].iloc[x_indices], df_scores[model_].iloc[y_indices]))
    )

for k,v in pref.items():
    pref[k] = {'pref_rank': v}

pref

{'id_1': {'pref_rank': 2},
 'id_4': {'pref_rank': 3},
 'id_7': {'pref_rank': 3},
 'id_10': {'pref_rank': 3},
 'id_13': {'pref_rank': 2},
 'id_16': {'pref_rank': 3},
 'id_19': {'pref_rank': 1},
 'id_22': {'pref_rank': 2},
 'id_25': {'pref_rank': 3},
 'id_28': {'pref_rank': 2},
 'id_31': {'pref_rank': 3},
 'id_34': {'pref_rank': 1},
 'id_37': {'pref_rank': 2},
 'id_40': {'pref_rank': 2},
 'id_43': {'pref_rank': 3},
 'id_46': {'pref_rank': 3},
 'id_49': {'pref_rank': 1},
 'id_52': {'pref_rank': 1},
 'id_55': {'pref_rank': 2},
 'id_58': {'pref_rank': 2},
 'id_61': {'pref_rank': 2},
 'id_64': {'pref_rank': 1},
 'id_67': {'pref_rank': 3},
 'id_70': {'pref_rank': 3},
 'id_73': {'pref_rank': 3},
 'id_76': {'pref_rank': 1},
 'id_79': {'pref_rank': 2},
 'id_82': {'pref_rank': 1},
 'id_85': {'pref_rank': 2},
 'id_88': {'pref_rank': 2},
 'id_91': {'pref_rank': 2},
 'id_94': {'pref_rank': 1},
 'id_97': {'pref_rank': 1},
 'id_100': {'pref_rank': 2},
 'id_103': {'pref_rank': 2},
 'id_106': {'pref_ran

In [20]:
list(df_scores['m1'].iloc[x_indices])[:5]

['id_1', 'id_4', 'id_7', 'id_10', 'id_13']

In [21]:
for wav_id in meta.keys():
    record_ = meta[wav_id]
    record_.update(sentences[wav_id])
    record_.update(scores[wav_id])
    record_.update(pref[wav_id])

meta

{'id_1': {'model': 'm2',
  'sent_id': 1,
  'speaker': '0011',
  'accent': None,
  'text': 'We got few vegetables and fruits , and became fish eaters .',
  'nmistakes': 2,
  'pref_rank': 2},
 'id_4': {'model': 'm2',
  'sent_id': 10,
  'speaker': '0011',
  'accent': None,
  'text': 'Humans also judge distance by using the relative sizes of objects.',
  'nmistakes': 3,
  'pref_rank': 3},
 'id_7': {'model': 'm1',
  'sent_id': 11,
  'speaker': '0011',
  'accent': None,
  'text': 'If this is true then those who tend to think creatively, really are somehow different.',
  'nmistakes': 3,
  'pref_rank': 3},
 'id_10': {'model': 'm1',
  'sent_id': 12,
  'speaker': '0011',
  'accent': None,
  'text': 'But really in the grand scheme of things, this information is insignificant.',
  'nmistakes': 3,
  'pref_rank': 3},
 'id_13': {'model': 'm3',
  'sent_id': 13,
  'speaker': '0011',
  'accent': None,
  'text': 'He had a private jet with three king-size beds, expensive rugs, porcelain vases and a dining

In [22]:
len(pref)

4050

In [23]:
df_meta = pd.DataFrame.from_dict(meta, orient='index')
df_meta['nwords'] = df_meta['text'].apply(lambda x: len(x.split(' ')))
df_meta.sample(5)

Unnamed: 0,model,sent_id,speaker,accent,text,nmistakes,pref_rank,nwords
id_3994,m2,2,20,,Fifty yards ahead of her were the first of the rocks .,3,3,12
id_719,m1,9,12,,"Different telescope designs perform differently, and have different strengths and weaknesses.",3,3,11
id_4012,m3,25,20,,Young people want to feel supported and appreciated by their company and their superiors.,2,2,14
id_3258,m3,14,18,,"When I reached Atlanta my steadily increasing disappointment was not lessened. I found it a big, dull, red town.",3,3,19
id_1331,m2,30,13,,"All this is thanks to his childhood in the mountains and to genetics, but it is his mental strength that sets him apart.",1,1,23


In [24]:
df_meta.loc[df_meta['nmistakes'] == 'FAILED'].sample(3)

Unnamed: 0,model,sent_id,speaker,accent,text,nmistakes,pref_rank,nwords
id_3107,m1,23,17,,For more than two hundred years the pessimists have been winning the public debate.,FAILED,FAILED,14
id_3112,m1,25,17,,Young people want to feel supported and appreciated by their company and their superiors.,FAILED,FAILED,14
id_3108,m2,23,17,,For more than two hundred years the pessimists have been winning the public debate.,FAILED,FAILED,14


In [25]:
# Filter bad speakers?
some_bad_speaker = 'scottish__scottish_male__cmu_us_awb_arctic'
df_failed = df_meta.loc[df_meta['speaker'] == some_bad_speaker]
df_good = df_meta.loc[df_meta['speaker'] != some_bad_speaker]
df_good = df_good.loc[df_good['nmistakes'] != "FAILED"]

In [26]:
df_failed_good_models = df_failed.loc[df_failed['nmistakes'] != 'FAILED', :]
df_failed_good_models

Unnamed: 0,model,sent_id,speaker,accent,text,nmistakes,pref_rank,nwords


In [27]:
df_good['wer'] = df_good['nmistakes'] / df_good['nwords']
df_good.sample(5)

Unnamed: 0,model,sent_id,speaker,accent,text,nmistakes,pref_rank,nwords,wer
id_682,m2,25,12,,Young people want to feel supported and appreciated by their company and their superiors.,1,1,14,0.071429
id_1709,m2,9,14,,"Different telescope designs perform differently, and have different strengths and weaknesses.",2,2,11,0.181818
id_3289,m2,24,18,,It's wearing me out trying to juggle work with looking after my children and my family.,2,2,16,0.125
id_3131,m1,30,17,,"All this is thanks to his childhood in the mountains and to genetics, but it is his mental strength that sets him apart.",2,2,23,0.086957
id_807,m3,8,12,,Prosecutors have opened a massive investigation into allegations of fixing games and illegal betting.,2,2,14,0.142857


In [28]:
df_full = df_good

In [29]:
mean_wer = {}
std_wer = {}
stats = {}
for m, df in df_full.groupby('model'):
    mean_wer[m] = df['wer'].mean()
    std_wer[m] = df['wer'].std()
    stats[m] = (mean_wer[m], std_wer[m])

In [30]:
mean_wer

{'m1': 0.15581893610424408, 'm2': 0.1551033686858993, 'm3': 0.1505934661339411}

In [31]:
stats

{'m1': (0.15581893610424408, 0.0917370138172076),
 'm2': (0.1551033686858993, 0.08791329488553278),
 'm3': (0.1505934661339411, 0.08370192086698483)}

In [32]:
conf_int = {}
for m in stats.keys():
    conf_int[m] = (stats[m][0] - stats[m][1], stats[m][0] + stats[m][1])

conf_int

{'m1': (0.06408192228703648, 0.2475559499214517),
 'm2': (0.06719007380036653, 0.2430166635714321),
 'm3': (0.06689154526695626, 0.23429538700092592)}

In [33]:
model2folder

{'m1': 'generated_hifi/esd_tune/g_3164999',
 'm2': 'generated_hifi/esd_tune_reversal/g_3164999',
 'm3': 'generated_hifi/esd_tune_advloss0/g_3164999'}

In [34]:
mean_wer = {}
std_wer = {}
stats = {}
conf_int = {}
meta_speaker = {}
for speaker, df1 in df_full.groupby('speaker'):
    for m, df in df1.groupby('model'):
        mean_wer[m] = df['wer'].mean()
        std_wer[m] = df['wer'].std()
        stats[m] = (mean_wer[m], std_wer[m])
        conf_int[m] = (stats[m][0] - stats[m][1], stats[m][0] + stats[m][1])
        meta_speaker[speaker + '_' + m] = {
            'mean': mean_wer[m],
            'std': std_wer[m],
            'conf_int': conf_int[m]
        }

meta_speaker

{'0011_m1': {'mean': 0.1649417562959747,
  'std': 0.09848066669836777,
  'conf_int': (0.06646108959760694, 0.2634224229943425)},
 '0011_m2': {'mean': 0.15374506591006587,
  'std': 0.09372400140491297,
  'conf_int': (0.060021064505152905, 0.24746906731497884)},
 '0011_m3': {'mean': 0.15757343048134242,
  'std': 0.09647052824930616,
  'conf_int': (0.061102902232036266, 0.25404395873064856)},
 '0012_m1': {'mean': 0.15808598946517752,
  'std': 0.08908074500705088,
  'conf_int': (0.06900524445812664, 0.2471667344722284)},
 '0012_m2': {'mean': 0.15953635163248103,
  'std': 0.09717079605964918,
  'conf_int': (0.062365555572831854, 0.2567071476921302)},
 '0012_m3': {'mean': 0.1525969552657367,
  'std': 0.07814707979240157,
  'conf_int': (0.07444987547333512, 0.23074403505813826)},
 '0013_m1': {'mean': 0.16239193389292073,
  'std': 0.09757158871850868,
  'conf_int': (0.06482034517441204, 0.2599635226114294)},
 '0013_m2': {'mean': 0.16063079414164727,
  'std': 0.08754696864487832,
  'conf_int': 

In [35]:
df_meta_speakers = pd.DataFrame.from_dict(meta_speaker, orient='index')
df_meta_speakers['speaker_model'] = df_meta_speakers.index
df_meta_speakers['speaker'] = df_meta_speakers['speaker_model'].apply(lambda x: x.split('_')[0])
df_meta_speakers

Unnamed: 0,mean,std,conf_int,speaker_model,speaker
0011_m1,0.164942,0.098481,"(0.06646108959760694, 0.2634224229943425)",0011_m1,11
0011_m2,0.153745,0.093724,"(0.060021064505152905, 0.24746906731497884)",0011_m2,11
0011_m3,0.157573,0.096471,"(0.061102902232036266, 0.25404395873064856)",0011_m3,11
0012_m1,0.158086,0.089081,"(0.06900524445812664, 0.2471667344722284)",0012_m1,12
0012_m2,0.159536,0.097171,"(0.062365555572831854, 0.2567071476921302)",0012_m2,12
0012_m3,0.152597,0.078147,"(0.07444987547333512, 0.23074403505813826)",0012_m3,12
0013_m1,0.162392,0.097572,"(0.06482034517441204, 0.2599635226114294)",0013_m1,13
0013_m2,0.160631,0.087547,"(0.07308382549676895, 0.2481777627865256)",0013_m2,13
0013_m3,0.151915,0.08349,"(0.06842450305797412, 0.23540548757671007)",0013_m3,13
0014_m1,0.153306,0.087367,"(0.06593880323377925, 0.24067260289293907)",0014_m1,14


In [36]:
model2folder

{'m1': 'generated_hifi/esd_tune/g_3164999',
 'm2': 'generated_hifi/esd_tune_reversal/g_3164999',
 'm3': 'generated_hifi/esd_tune_advloss0/g_3164999'}