In [27]:
from loadfile import *
from phonemes import *
from preprocess import *
from evaluate import *
from google_kb_z_speaker.main import *
from tqdm.auto import tqdm
from experiment_helpers import *

# Text processing

In [28]:
# Speaker
speakers = ["z21", "z20", "z28", "z29"]

# One transcription file per speaker for each source: Google ASR, reference, KB.
google_fnames = [f"transcriptions_new/{s}.googleasr" for s in speakers]
correct_fnames = [f"transcriptions_new/{s}.correct" for s in speakers]
kb_fnames = [f"transcriptions_new/{s}.kb" for s in speakers]

# Read every file; each variable holds one entry per speaker at this point.
google_lines = list(map(get_fname_lines, google_fnames))
correct_lines = list(map(get_fname_lines, correct_fnames))
kb_lines = list(map(get_fname_lines, kb_fnames))

# fix_lines is called once per speaker as fix_lines(correct, google, kb);
# its return values are not kept.
list(map(fix_lines, correct_lines, google_lines, kb_lines))

# Concatenate the per-speaker entries into one flat collection per source.
google_lines = reduce(lambda acc, part: acc + part, google_lines)
correct_lines = reduce(lambda acc, part: acc + part, correct_lines)
kb_lines = reduce(lambda acc, part: acc + part, kb_lines)

google_lines = transcriptions(google_lines)
correct_lines = transcriptions(correct_lines)
kb_lines = transcriptions(kb_lines)

google_lines = list(map(preprocess_text, google_lines))
correct_lines = list(map(preprocess_text, correct_lines))
kb_lines = list(map(preprocess_text, kb_lines))

# Order used by get_triad / plot_triad: Google, correct, KB.
text_data_bunch = [google_lines, correct_lines, kb_lines]
phonemizer = init_phonemizer("cuda", "./models/deep-phonemizer-se.pt")

In [29]:
#for x in idxs:
#    print("=====")
#    print(google_lines[x])
#    print(correct_lines[x])
#    print(kb_lines[x])

In [30]:
def singular_phonemes(txt : str):
    """
    Breaks the output of get_swedish_phonemes into a flat list of
    individual phonemes; "_" separates phonemes just like whitespace does.
    """
    return [phoneme for chunk in txt.split("_") for phoneme in chunk.split()]
def set_of_all_phonemes(google_lines, correct_lines, kb_lines):
    """
    Collects every distinct phoneme found in the three line collections.

    Each line is tokenised with singular_phonemes. The collections are
    walked independently of one another, so every line contributes even
    when the collections differ in length (zipping them would silently
    skip the tail of the longer ones).

    Returns a set of phoneme strings.
    """
    phonemes_set = set()
    for lines in (google_lines, correct_lines, kb_lines):
        for line in lines:
            phonemes_set.update(singular_phonemes(line))
    return phonemes_set
def singular_phonemes_preprocess(google_lines, correct_lines, kb_lines):
    """
    Rewrites every line as space-separated single phonemes and builds a
    phoneme vocabulary.

    Returns a five-element list:
        [google lines, correct lines, kb lines, mapping, id2w]
    where the first three are lists of strings, mapping is whatever w2id
    returns for the full phoneme vocabulary (presumably phoneme -> id;
    confirm against w2id), and id2w is that mapping with keys and values
    swapped.
    """
    st = set_of_all_phonemes(google_lines, correct_lines, kb_lines)
    # A set of strings can iterate in a different order on every interpreter
    # run; sorting makes the text handed to w2id (and therefore the mapping)
    # reproducible.
    mapping = w2id(" ".join(sorted(st)))
    id2w = {v: k for k, v in mapping.items()}

    def line2singular_phonemes_str(x):
        # Lines stay textual; encoding them with `mapping` is left to the caller.
        return " ".join(singular_phonemes(x))

    return [
        [line2singular_phonemes_str(x) for x in google_lines],
        [line2singular_phonemes_str(x) for x in correct_lines],
        [line2singular_phonemes_str(x) for x in kb_lines],
        mapping, id2w
    ]

# Experiments

In [31]:
def evaluate_lines(google_lines,correct_lines,kb_lines, _filter=None, phoneme_words=False, singular_phonemes=False, preprocess_hook=None): 
    """
    Scores the Google and KB transcriptions against the correct ones.

    Parameters:
        google_lines, correct_lines, kb_lines: the three transcription
            collections; entries are paired up position by position.
        _filter: None for no filtering. A dict whose only key is "agreement"
            has that value handed to filter_lines_only_on_agreement; any
            other dict is expanded as keyword arguments to filter_lines.
        phoneme_words: when true, every line is replaced by
            preprocess_phonemes(get_swedish_phonemes(line, phonemizer)).
        singular_phonemes: when true, every line is run through
            get_swedish_phonemes and then singular_phonemes_preprocess.
            Inside this function the name refers to this flag, not to the
            module-level helper of the same name.
        preprocess_hook: optional callable, called once with every local
            variable of this function as a keyword argument, just before
            the result dict is assembled.

    Both conversions use the module-level `phonemizer`. The two flags are
    checked independently, so setting both applies both conversions in turn.

    Returns a dict with google_wer, kb_wer, the figures reported by
    percentage_of_agreement, and error_index_overlap: the number of error
    indices (from error_idxs) shared by Google and KB divided by the number
    flagged for either of them, or 0.0 when nothing is flagged at all.
    """
    if phoneme_words:
        google_lines = [preprocess_phonemes(get_swedish_phonemes(x, phonemizer))
                for x in tqdm(google_lines)]
        correct_lines = [preprocess_phonemes(get_swedish_phonemes(x, phonemizer))
                         for x in tqdm(correct_lines)]
        kb_lines = [preprocess_phonemes(get_swedish_phonemes(x, phonemizer))
                    for x in tqdm(kb_lines)]
    if singular_phonemes:
        google_lines = [get_swedish_phonemes(x, phonemizer)
                    for x in tqdm(google_lines)]
        correct_lines = [get_swedish_phonemes(x, phonemizer)
                         for x in tqdm(correct_lines)]
        kb_lines = [get_swedish_phonemes(x, phonemizer)
                    for x in tqdm(kb_lines)]
        # mapping and id2w are not used below, but they become visible to
        # preprocess_hook through locals().
        google_lines, correct_lines, kb_lines, mapping,id2w = singular_phonemes_preprocess(google_lines, correct_lines, kb_lines)

    if _filter is not None:
        if len(_filter) == 1 and "agreement" in _filter:
            correct_lines, google_lines, kb_lines = filter_lines_only_on_agreement(correct_lines, google_lines,
                                                                                kb_lines, _filter["agreement"])
        else:
            correct_lines, google_lines, kb_lines = filter_lines(correct_lines,google_lines,
                                                               kb_lines,**_filter)

    # Not read below; kept because every local is forwarded to preprocess_hook.
    filter_bunch = [google_lines, correct_lines, kb_lines]
    
    # Running totals of error indices: KB, overlap (both), Google, total (either).
    kei = 0
    oei = 0
    gei = 0
    tei = 0

    for c_l, g_l, kb_l in zip(correct_lines, google_lines, kb_lines):
        g_s = set(error_idxs(c_l, g_l))
        k_s = set(error_idxs(c_l, kb_l))

        gei += len(g_s)
        kei += len(k_s)
        oei += len(g_s & k_s)
        tei += len(g_s | k_s)

    # With no flagged index at all (empty input, or every line transcribed
    # correctly by both systems) the ratio is undefined; report 0.0 instead
    # of raising ZeroDivisionError.
    error_index_overlap = oei / tei if tei else 0.0
   
    agreement, g_correct_kb_not, kb_correct_g_not, agreement_not_correct, agreement_correct,\
    both_incorrect_disagreement =\
        percentage_of_agreement(correct_lines, google_lines, kb_lines)

    if preprocess_hook is not None:
        # The hook receives every local defined so far as a keyword argument,
        # so local names in this function are effectively part of its contract.
        preprocess_hook(**locals())
    return {
        "google_wer" : wer(correct_lines, google_lines),
        "kb_wer" : wer(correct_lines, kb_lines),
        "agreement" : agreement,
        # The trailing space in the next two keys is kept on purpose:
        # existing consumers of this dict see exactly these key strings.
        "g_correct_kb_not " : g_correct_kb_not,
        "kb_correct_g_not " : kb_correct_g_not,
        "agreement_not_correct" :  agreement_not_correct,
        "both_incorrect_disagreement" : both_incorrect_disagreement,
        "error_index_overlap": error_index_overlap
    }

In [32]:
# Shared accumulator: locate_empty_lines appends to it on every call and
# never clears it, so entries from earlier calls stay in place.
lines = []


def locate_empty_lines(**kwargs):
    """
    Hook for evaluate_lines: records every position whose correct line is
    the empty string.

    Reads the "google_lines", "correct_lines" and "kb_lines" keyword
    arguments, appends (index, (google, correct, kb)) to the module-level
    `lines` for each match, prints the accumulated list and returns it.
    """
    aligned = zip(kwargs["google_lines"], kwargs["correct_lines"], kwargs["kb_lines"])
    lines.extend(
        (position, (google_text, correct_text, kb_text))
        for position, (google_text, correct_text, kb_text) in enumerate(aligned)
        if correct_text == ""
    )
    print("lines ", lines)
    return lines
        

In [33]:
# Sanity check: display the first Google transcription held in google_lines.
google_lines[0]

'ja det har jag också startat'

In [38]:
# Display the Google ASR transcription file paths (one per speaker).
google_fnames

['transcriptions_new/z21.googleasr',
 'transcriptions_new/z20.googleasr',
 'transcriptions_new/z28.googleasr',
 'transcriptions_new/z29.googleasr']

In [34]:
# Run on the plain text lines: no filtering, no phoneme conversion, no hook.
experiment = evaluate_lines
kwargs = dict(
    _filter=None,
    phoneme_words=False,
    singular_phonemes=False,
    preprocess_hook=None,
)
# The 5 is presumably the number of repeats (the progress bar counts to 5).
res = experiment_repeats(experiment, 5, google_lines, correct_lines, kb_lines, **kwargs)
print_experiment_report(res)

  0%|          | 0/5 [00:00<?, ?it/s]

google_wer: 0.1983669548511047±0.0
kb_wer: 0.19132244636567403±0.0
agreement: 0.36553238199780463±0.0
g_correct_kb_not : 0.2768166089965398±0.0
kb_correct_g_not : 0.15051903114186851±0.0
agreement_not_correct: 0.0990990990990991±0.0
both_incorrect_disagreement: 0.5726643598615917±0.0
error_index_overlap: 0.30330982094411285±0.0


In [None]:
# Index of the example to inspect; the plotting cell further down reassigns it to 0.
idx = 43
# Font size that plot_triad passes to every ax.text call (read as a global there).
size = 10
def get_triad(idx, bunch):
    """
    Picks entry idx from each of the first three sequences in bunch.

    Returns a tuple in bunch order: (google, correct, kb).
    """
    return tuple(bunch[source][idx] for source in range(3))

def plot_triad(triad,ax):
    """
    Draws the three texts of a (google, correct, kb) triad on ax as
    rounded boxes stacked top to bottom: google, kb, correct.

    The font size comes from the module-level `size`. Both axis ranges are
    set to [-10, 10] afterwards.
    """
    def rgb(red, green, blue):
        # 0-255 channel values -> the 0-1 floats used for ec/fc.
        return (red / 255, green / 255, blue / 255)

    # (index into triad, x, y, edge colour, face colour), in drawing order.
    boxes = (
        (0, 0.0, 0.0, (1., 0.5, 0.5), (1., 0.8, 0.8)),
        (2, 0, -2.5, rgb(153, 51, 0), rgb(255, 153, 102)),
        (1, 0, -5, rgb(42, 162, 42), rgb(133, 224, 133)),
    )

    for member, x_pos, y_pos, edge_colour, face_colour in boxes:
        box_style = {"boxstyle": "round", "ec": edge_colour, "fc": face_colour}
        ax.text(x_pos, y_pos, triad[member], size=size, rotation=0,
                ha="center", va="center", bbox=box_style)

    ax.set_ylim(-10, 10)
    ax.set_xlim(-10, 10)

In [None]:
# Draw the same example twice, side by side: from text_data_bunch on the left
# axis and from phoneme_data_bunch on the right axis.
# NOTE(review): no visible cell assigns `phoneme_data_bunch`, and `plt` is not
# imported by name (it may arrive through one of the star imports) — confirm
# this cell survives Restart Kernel -> Run All.
fig, axs = plt.subplots(1,2, figsize=(15, 5))
idx = 0
plot_triad(get_triad(idx, text_data_bunch) ,axs[0])
plot_triad(get_triad(idx, phoneme_data_bunch) ,axs[1])

In [None]:

def print_triad_correctness(triad):
    """
    Prints, for one (google, correct, kb) triad, whether each system matches
    the correct text exactly, whether the two systems agree with each other,
    and the WER of each system against the correct text.
    """
    # Positions inside the triad, as laid out by get_triad.
    google_pos, correct_pos, kb_pos = 0, 1, 2

    print("Google correct ", triad[correct_pos] == triad[google_pos])
    print("KB correct ", triad[correct_pos] == triad[kb_pos])
    print("Agreement ", triad[google_pos] == triad[kb_pos])

    print("Google WER ", wer(triad[correct_pos], triad[google_pos]))
    print("KB WER ", wer(triad[correct_pos], triad[kb_pos]))