### Regress reduced encoder representations to WERs

The goal is to assess how reliably the reduced (t-SNE) encoder representations predict the word error rate (WER) of individual samples.

In [69]:
import json
import os

import numpy as np
from omegaconf import OmegaConf
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [26]:
# CONFIG
# Experiment folders; both should come from evaluating the same model.
tsne_exp_dir = "version_11"  # folder holding the tsne data
wers_exp_dir = "version_48"  # folder holding the wers data
# Encoder block whose vectors are used for the regression.
enc_block = 16

In [25]:
# Base log directory, relative to the location of this notebook.
log_dir = "../../../logs/de"
# WER evaluation results: <log_dir>/asr/evaluate/<wers_exp_dir>
wers_dir = os.path.join(log_dir, "asr", "evaluate", wers_exp_dir)
# Encoder vector analysis artefacts: <log_dir>/analysis/<tsne_exp_dir>
vecs_dir = os.path.join(log_dir, "analysis", tsne_exp_dir)

In [14]:
# The config stored next to the vecs records how many samples per accent
# were used; read that number so the WERs can be truncated to match.
config_path = os.path.join(vecs_dir, "config.yaml")
config = OmegaConf.load(config_path)
n_samples = config.components.EncoderViz.n_samples
n_samples

100

The EncoderViz class computes the encoder representations for the first `n_samples` samples of each accent.

In [45]:
# Collect the WERs of the first `n_samples` samples of each accent.
wers_file = os.path.join(wers_dir, "results.json")
# Open the results through a context manager so the file handle is closed
# right after parsing instead of being left to the garbage collector.
with open(wers_file) as results_file:
    all_wers = json.load(results_file)

# Build the per-accent WER arrays directly under their final keys. The
# top-level keys of the results are file names: the accent name is obtained by
# dropping the extension and the first five characters of the file name.
wers = {}
for accent, samples in all_wers.items():
    pretty_accent = os.path.splitext(accent)[0][5:]
    # Keep only the first `n_samples` entries, in the order stored in the file.
    first_samples = list(samples.values())[:n_samples]
    wers[pretty_accent] = np.array([sample["wer"] for sample in first_samples])

# Print the mean WER (in percent) and the number of samples for each accent.
for acc, vals in wers.items():
    print(acc, np.round(np.mean(vals)*100, 2), len(vals))

de_ni 0.88 100
ch 4.35 100
de_al 0.96 100
it 2.21 100
fr 3.91 100
de 3.8 100
ru 2.63 100
us 5.09 100
gb 4.72 100
at 1.38 100
ca 3.29 100


In [82]:
# Load the reduced encoder vectors of the selected encoder block, the label
# of every vector, and the list that maps a label index to its accent name.
# The JSON file is read through a context manager so its handle is closed.
with open(os.path.join(vecs_dir, "langs.json")) as langs_file:
    vec_langs = json.load(langs_file)
# Both arrays live in the same per-block folder; build that path only once.
block_dir = os.path.join(vecs_dir, "tsne", f"b{enc_block}")
block_vecs = np.load(os.path.join(block_dir, "vecs.npy"))
vec_labels = np.load(os.path.join(block_dir, "labels.npy"))

In [83]:
# Order the WERs in the same order as the vecs.
# The labels are walked in chunks of `n_samples`; the accent of each chunk is
# taken from its first label and that accent's WERs are appended as a whole.
wer_chunks = []
for i in range(0, len(vec_labels), n_samples):
    # The chunking is only valid if every chunk holds a single accent;
    # otherwise the WERs would silently be paired with the wrong vectors.
    assert np.all(vec_labels[i:i + n_samples] == vec_labels[i]), (
        f"labels {i}..{i + n_samples - 1} do not all belong to one accent"
    )
    accent = vec_langs[int(vec_labels[i])]
    wer_chunks.append(wers[accent])
# Concatenate once instead of growing the array on every iteration. The
# leading empty float array keeps the result a float array even when there
# are no chunks at all.
ordered_wers = np.concatenate([np.array([])] + wer_chunks)
assert len(ordered_wers) == len(vec_labels), (
    f"{len(ordered_wers)} WERs for {len(vec_labels)} encoder vectors"
)
ordered_wers.shape

(1100,)

In [84]:
# Ordinary least-squares regression from the encoder vectors to the WERs.
# The displayed score is computed on the same data the model was fitted on,
# so it is an in-sample value.
linear_model = LinearRegression()
linear_model.fit(block_vecs, ordered_wers)
linear_model.score(block_vecs, ordered_wers)

0.07475050788077864

In [85]:
# Same regression with Ridge at its default settings; the score is again
# computed on the data used for fitting.
ridge_model = Ridge()
ridge_model.fit(block_vecs, ordered_wers)
ridge_model.score(block_vecs, ordered_wers)

0.03675192636572633

In [86]:
# Same regression with Lasso at its default settings; the score is again
# computed on the data used for fitting.
# NOTE(review): a score of ~0 is what a constant prediction yields, so the
# default regularisation strength is presumably too strong for targets on
# the WER scale - consider tuning `alpha` before comparing the models.
lasso_model = Lasso()
lasso_model.fit(block_vecs, ordered_wers)
lasso_model.score(block_vecs, ordered_wers)

-8.881784197001252e-16

In [104]:
# Evaluate the fitted linear regression model (the best of the three) on the
# samples of one accent at a time and print the rounded score per accent.
for idx, lang in enumerate(vec_langs):
    # positions of all vectors that carry this accent's label
    acc_rows = np.flatnonzero(vec_labels == idx)
    acc_vecs = block_vecs[acc_rows]
    r2 = linear_model.score(acc_vecs, wers[lang])
    print(lang, np.round(r2, 3))

de 0.033
at -0.06
ch 0.12
ca 0.058
it 0.165
ru 0.096
us 0.107
gb 0.054
fr 0.088
de_al -0.561
de_ni -0.605
