# LibriCrowd Evaluate

In [2]:
import datetime, os, re, string
import numpy as np
import pandas as pd
import jiwer
# wer is same as https://code.amazon.com/packages/DeepADSPhase1/blobs/6930484f38136e49395dcb3e7df8baaec97e40c9/--/src/deep_ads_phase1/sentence_utils_wlm.py#L230, replace_cost=3
from utils.agreement import normalize
from utils.rover import ROVER

In [3]:
folder = './data/LibriSpeechCrowd'
dataset = 'train-mixed-10h' # test-other, dev-other, test-clean, dev-clean, train-mixed-10h, train-other-10h
goldpath = os.path.join(folder, 'librispeech_{:s}_trans.csv'.format(dataset.replace('-','_')))
crowdpath = os.path.join(folder, 'librispeech_{:s}_crowd.csv'.format(dataset.replace('-','_')))
crowdpath_5 = os.path.join(folder, 'librispeech_{:s}_crowd_5.csv'.format(dataset.replace('-','_')))
crowdpath_raw = os.path.join(folder, 'librispeech_{:s}_crowd_raw.csv'.format(dataset.replace('-','_')))

### Load Gold and Std

In [4]:
def find_csv_filenames(path_to_dir, suffix='.csv'):
    filenames = os.listdir(path_to_dir)
    filepaths = [os.path.join(path_to_dir, filename) for filename in filenames if filename.endswith(suffix)]
    return filepaths

In [5]:
def audio2utt(audio_url):
    audio = audio_url.split('.')[0]
    book = audio.split('-')[0]
    chap = audio.split('-')[1]
    try:
        uttr = audio.split('-')[2]
    except:
        print(audio_url)
    utt = book + '_' + chap + '_' + str(int(uttr))
    return utt


num2words = {1: 'One', 2: 'Two', 3: 'Three', 4: 'Four', 5: 'Five', \
             6: 'Six', 7: 'Seven', 8: 'Eight', 9: 'Nine', 10: 'Ten', \
            11: 'Eleven', 12: 'Twelve', 13: 'Thirteen', 14: 'Fourteen', \
            15: 'Fifteen', 16: 'Sixteen', 17: 'Seventeen', 18: 'Eighteen', \
            19: 'Nineteen', 20: 'Twenty', 30: 'Thirty', 40: 'Forty', \
            50: 'Fifty', 60: 'Sixty', 70: 'Seventy', 80: 'Eighty', \
            90: 'Ninety', 0: 'Zero'}

strnum2word = dict()
for k in num2words:
    strnum2word[str(k)] = num2words[k]


def normalize_word(word):
    word = strnum2word.get(word, word)
    if word in ('okay', 'ok'): return 'OK'
    if word in ('mr', 'mr.'): return 'Mister'
    if word in ('ms', 'ms.'): return 'Miss'
    if word == 'mrs.': return 'Misses'
    if word == 'ya': return 'you'
    if word in ('cos','coz'): return 'because'
    if word in ('ah','eh','hm','um','er','ahhh'): return ''
    if word == "there's": return 'there is'
    if word == "i'm": return 'i am'
    if word == "where's": return 'where is'
    if word == "what's": return 'what is'
    if word == "when's": return 'when is'
    if word == "why's": return 'why is'
    if word == "who's": return 'who is'
    if word == "you're": return 'you are'
    if word == 'wanna': return 'want to'
    if word == 'k': return 'OK'
    if word == 'anytime': return 'any time'
    if word[-1] == '.' and word not in ('a.m.','p.m.'): return word[:-1]
    return word.lower()


def normalize_trans(trans):
    trans = trans.lower()
    trans = trans.replace('?','').replace(',','').replace('.','')
    words = trans.split()
    for i in range(len(words)):
        words[i] = normalize_word(words[i])
    trans = ' '.join(words)
#     res = jiwer.ExpandCommonEnglishContractions()(trans)
    return trans

In [6]:
def SubmitTime2DateTime(s):
    '''
    s = 'Wed Dec 28 17:35:02 PST 2022'
    t = '2022-12-28 17:35:02'
    '''
    e = '%a %b %d %H:%M:%S %Z %Y'
    d = datetime.datetime.strptime(s, e)
    t = datetime.datetime.strftime(d, '%Y-%m-%d %H:%M:%S')
    return t

In [7]:
df_gold = pd.read_csv(goldpath)
df_gold.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2763 entries, 0 to 2762
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   utt         2763 non-null   object
 1   gold_trans  2763 non-null   object
 2   std_trans   2763 non-null   object
dtypes: object(3)
memory usage: 64.9+ KB


In [8]:
filepaths = find_csv_filenames(os.path.join(folder, dataset))
df_submits = []
for filepath in filepaths:
    df_submits.append(pd.read_csv(filepath))
df_submit = pd.concat(df_submits)
df_submit.dropna(subset=['Input.audio_url'], inplace=True)
df_submit = df_submit[df_submit['Input.audio_url'].apply(lambda x:'.txt' not in x)].reset_index(drop=True)
df_submit['taskId'] = df_submit['HITId'] + '|' + df_submit['WorkerId']

In [9]:
# merge std with gold and rename columns
df_submit['utt'] = df_submit['Input.audio_url'].apply(lambda x: audio2utt(x))
df_all = df_submit.merge(df_gold[['utt','gold_trans']], on = ['utt'], how = 'left')
df_all = df_all.rename(columns={'Answer.transcription':'std_trans'})
df_all['std_trans'] = df_all['std_trans'].fillna('ImputedNA')
df_all['num_q_mark'] = df_all['std_trans'].apply(lambda x: x.count('?'))
df_all['submittime'] = df_all['SubmitTime'].apply(lambda x: SubmitTime2DateTime(x))
# normalize trans
df_all['gold_trans'] = df_all['gold_trans'].apply(lambda x: normalize_trans(x))
df_all['std_trans'] = df_all['std_trans'].apply(lambda x: normalize_trans(x))
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14231 entries, 0 to 14230
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   HITId                        14231 non-null  object 
 1   HITTypeId                    14231 non-null  object 
 2   Title                        14231 non-null  object 
 3   Description                  14231 non-null  object 
 4   Keywords                     14231 non-null  object 
 5   Reward                       14231 non-null  object 
 6   CreationTime                 14231 non-null  object 
 7   MaxAssignments               14231 non-null  int64  
 8   RequesterAnnotation          14231 non-null  object 
 9   AssignmentDurationInSeconds  14231 non-null  int64  
 10  AutoApprovalDelayInSeconds   14231 non-null  int64  
 11  Expiration                   14231 non-null  object 
 12  NumberOfSimilarHITs          0 non-null      float64
 13  LifetimeInSecond

### Get WER

In [10]:
def cost_edit_distance(source: list, target: list, insert_cost=1, delete_cost=1, replace_cost=1):
    """
    Given two lists of tokens, calculate weighted edit distance
    https://code.amazon.com/packages/DeepADSPhase1/blobs/6930484f38136e49395dcb3e7df8baaec97e40c9/--/src/deep_ads_phase1/sentence_utils_wlm.py#L230
    """
    m = len(source)
    n = len(target)
    if target == source:
        return 0
    if m == 0:
        return n
    if n == 0:
        return m
    d = np.zeros((n + 1, m + 1))
    f = np.zeros((n + 1, m + 1))
    d[0, 0] = 0
    f[0, 0] = 0
    for i in range(1, n + 1):
        d[i, 0] = d[i - 1, 0] + insert_cost
        f[i, 0] = f[i - 1, 0] + 1
    for j in range(1, m + 1):
        d[0, j] = d[0, j - 1] + delete_cost
        f[0, j] = f[0, j - 1] + 1
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            multiplier = int(not target[i - 1] == source[j - 1])
            d_costs = [d[i - 1, j] + insert_cost,
                       d[i - 1, j - 1] + multiplier * replace_cost,
                       d[i, j - 1] + delete_cost]
            f_costs = [f[i - 1, j] + 1,
                       f[i - 1, j - 1] + multiplier,
                       f[i, j - 1] + 1]
            idx = int(np.argmin(d_costs))
            assert isinstance(idx, int)
            d[i, j] = d_costs[idx]
            f[i, j] = f_costs[idx]
    return f[n, m]

def compute_wer(t1, t2):
    """
    Given two transcriptions, calculate Word Error Rate (WER)
    https://code.amazon.com/packages/DeepADSPhase1/blobs/6930484f38136e49395dcb3e7df8baaec97e40c9/--/src/deep_ads_phase1/sentence_utils_wlm.py#L174
    """
    word_error_count = cost_edit_distance(t1.split(), t2.split())
    return word_error_count

In [11]:
def get_wer(df, std_trans='std_trans', reuse=True):
    if reuse == False or 'std_trans_wec' not in df.columns:
        print('compute std_trans_wec ...', end='\r')
        df['gold_trans_stc'] = df['gold_trans'].apply(lambda x: len(x.split()))
        df['std_trans_stc']  = df[std_trans].apply(lambda x: len(x.split()))
        df['std_trans_wec']  = df.apply(lambda x: compute_wer(x.gold_trans, x[std_trans]), axis=1)
    df1 = df.groupby('utt').agg({'std_trans_wec': ['mean', 'min', 'max']})
    df2 = df.groupby('utt').agg({'gold_trans_stc': ['min']})
    wer_avg = df1[('std_trans_wec', 'mean')].sum() / df2[('gold_trans_stc', 'min')].sum()
    wer_min = df1[('std_trans_wec', 'min')].sum()  / df2[('gold_trans_stc', 'min')].sum()
    wer_max = df1[('std_trans_wec', 'max')].sum()  / df2[('gold_trans_stc', 'min')].sum()
    return wer_avg, wer_min, wer_max

In [12]:
def compute_metrics(gold_trans, std_trans):
    metrics = jiwer.compute_measures(gold_trans, std_trans)
    return metrics

In [13]:
df_all['gold_trans_stc'] = df_all['gold_trans'].apply(lambda x: len(x.split()))
df_all['std_trans_stc']  = df_all['std_trans'].apply(lambda x: len(x.split()))
df_all['std_trans_wec']  = df_all.apply(lambda x: compute_wer(x.gold_trans, x.std_trans), axis=1)

In [14]:
df_approved = df_all[df_all['AssignmentStatus'] == 'Approved'].reset_index(drop=True)
df_rejected = df_all[df_all['AssignmentStatus'] == 'Rejected'].reset_index(drop=True)
wer_avg_approve, wer_min_approve, wer_max_approve = get_wer(df_approved)
wer_avg_reject,  wer_min_reject,  wer_max_reject  = get_wer(df_rejected)
print('Approved: count = {:5d}, wer_avg = {:.4f}, wer_min = {:.4f}, wer_max = {:.4f}'.format(len(df_approved), wer_avg_approve, wer_min_approve, wer_max_approve))
print('Rejected: count = {:5d}, wer_avg = {:.4f}, wer_min = {:.4f}, wer_max = {:.4f}'.format(len(df_rejected), wer_avg_reject, wer_min_reject, wer_max_reject))

Approved: count = 13803, wer_avg = 0.0489, wer_min = 0.0205, wer_max = 0.1053
Rejected: count =   428, wer_avg = 0.6368, wer_min = 0.6088, wer_max = 0.6676


### Error Type Statistics for Approved and Rejected Transcriptions

In [24]:
def error_type(df, std_trans='std_trans'):
    df[std_trans] = df[std_trans].fillna('')
    df['std_trans_metrics'] = df.apply(lambda x: compute_metrics(x.gold_trans, x[std_trans]), axis=1)
    df['gold_trans_stc'] = df['gold_trans'].apply(lambda x: len(x.split()))
    df['std_trans_stc'] = df[std_trans].apply(lambda x: len(x.split()))
    df['std_trans_del'] = df['std_trans_metrics'].apply(lambda x: x['deletions'])
    df['std_trans_ins'] = df['std_trans_metrics'].apply(lambda x: x['insertions'])
    df['std_trans_sub'] = df['std_trans_metrics'].apply(lambda x: x['substitutions'])
    df['std_trans_wec'] = df['std_trans_metrics'].apply(lambda x: x['deletions'] + x['insertions'] + x['substitutions'])
    res = dict()
    res['CNT'] = len(df)
    res['LEN'] = df['std_trans_stc'].sum() / len(df)
    res['DEL'] = df['std_trans_del'].sum() / df['gold_trans_stc'].sum() * 100
    res['INS'] = df['std_trans_ins'].sum() / df['gold_trans_stc'].sum() * 100
    res['SUB'] = df['std_trans_sub'].sum() / df['gold_trans_stc'].sum() * 100
    res['WER'] = df['std_trans_wec'].sum() / df['gold_trans_stc'].sum() * 100
    return res

In [15]:
res = error_type(df_approved)
print('dataset = {:s}, approved: count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:.2f}%, sub = {:.2f}%, wer = {:.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_rejected)
print('dataset = {:s}, rejected: count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:.2f}%, sub = {:.2f}%, wer = {:.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))

dataset = train-mixed-10h, approved: count = 13803, len = 35.2, del = 0.87%, ins = 0.44%, sub = 3.58%, wer = 4.89%
dataset = train-mixed-10h, rejected: count = 428, len = 17.1, del = 44.72%, ins = 1.74%, sub = 15.40%, wer = 61.85%


### Save Data

In [16]:
cols = ['HITId','HITTypeId','Title','Description','Keywords','Reward','CreationTime','MaxAssignments','RequesterAnnotation','AssignmentDurationInSeconds','AutoApprovalDelayInSeconds','Expiration','AssignmentId','WorkerId','AssignmentStatus','AcceptTime','SubmitTime','AutoApprovalTime','ApprovalTime','RejectionTime','RequesterFeedback','WorkTimeInSeconds','LifetimeApprovalRate','Last30DaysApprovalRate','Last7DaysApprovalRate','Input.audio_url','std_trans','utt','gold_trans','num_q_mark','submittime','gold_trans_stc','std_trans_stc','std_trans_wec','taskId']
df_crowd_raw = df_all[cols].reset_index(drop=True)
df_crowd_raw.to_csv(crowdpath_raw, index=False)

### Relabeling

In [17]:
cols = ['utt','submittime','AssignmentId']
df = df_all[cols].sort_values(by=['submittime','utt']).reset_index(drop=True)

In [18]:
df1 = df.groupby('utt', as_index=False).apply(lambda x: x if len(x)==1 else x.iloc[[0]]).reset_index(level=0, drop=True)
df2 = df.groupby('utt', as_index=False).apply(lambda x: x if len(x)==1 else x.iloc[[1]]).reset_index(level=0, drop=True)
df3 = df.groupby('utt', as_index=False).apply(lambda x: x if len(x)==1 else x.iloc[[2]]).reset_index(level=0, drop=True)
df4 = df.groupby('utt', as_index=False).apply(lambda x: x if len(x)==1 else x.iloc[[3]]).reset_index(level=0, drop=True)
df5 = df.groupby('utt', as_index=False).apply(lambda x: x if len(x)==1 else x.iloc[[4]]).reset_index(level=0, drop=True)

In [19]:
df_5 = pd.concat([df1, df2, df3, df4, df5])
df_before_relabel = df_all.merge(df_5[['AssignmentId']], on = ['AssignmentId'], how='right')
df_after_relabel  = df_approved.copy()

In [20]:
wer_avg_before, wer_min_before, wer_max_before = get_wer(df_before_relabel)
wer_avg_after, wer_min_after, wer_max_after = get_wer(df_after_relabel)
print('Before Relabeling: wer_avg = {:.4f}, wer_min = {:.4f}, wer_max = {:.4f}'.format(wer_avg_before, wer_min_before, wer_max_before))
print('After  Relabeling: wer_avg = {:.4f}, wer_min = {:.4f}, wer_max = {:.4f}'.format(wer_avg_after, wer_min_after, wer_max_after))
relabel_rate = len(df_rejected) / len(df_approved)
num_worker_before = len(set(df_before_relabel['WorkerId']))
num_worker_after  = len(set(df_all['WorkerId']))
print('Relabeling percent = {:.2f}%, WER improve = {:.0f} bps'.format(100*relabel_rate, 1e4*(wer_avg_before - wer_avg_after)))
print('Number of workers: before = {:d}, after = {:d}'.format(num_worker_before, num_worker_after))

Before Relabeling: wer_avg = 0.0597, wer_min = 0.0209, wer_max = 0.1541
After  Relabeling: wer_avg = 0.0489, wer_min = 0.0205, wer_max = 0.1053
Relabeling percent = 3.10%, WER improve = 108 bps
Number of workers: before = 581, after = 616


## Aggregation

### Random and Oracle

In [21]:
fn1 = lambda x: x.loc[np.random.choice(x.index, size=1, replace=False) if len(x) >= 4 else np.random.choice(x.index, 4, True),:]
fn2 = lambda x: x.loc[np.random.choice(x.index, size=2, replace=False) if len(x) >= 4 else np.random.choice(x.index, 4, True),:]
fn3 = lambda x: x.loc[np.random.choice(x.index, size=3, replace=False) if len(x) >= 4 else np.random.choice(x.index, 4, True),:]
fn4 = lambda x: x.loc[np.random.choice(x.index, size=4, replace=False) if len(x) >= 4 else np.random.choice(x.index, 4, True),:]

In [22]:
df_before_random_1 = df_before_relabel.groupby('utt', as_index=False).apply(fn1).reset_index(drop=True)
df_before_random_2 = df_before_relabel.groupby('utt', as_index=False).apply(fn2).reset_index(drop=True)
df_before_random_3 = df_before_relabel.groupby('utt', as_index=False).apply(fn3).reset_index(drop=True)
df_before_random_4 = df_before_relabel.groupby('utt', as_index=False).apply(fn4).reset_index(drop=True)
df_after_random_1  = df_after_relabel.groupby('utt', as_index=False).apply(fn1).reset_index(drop=True)
df_after_random_2  = df_after_relabel.groupby('utt', as_index=False).apply(fn2).reset_index(drop=True)
df_after_random_3  = df_after_relabel.groupby('utt', as_index=False).apply(fn3).reset_index(drop=True)
df_after_random_4  = df_after_relabel.groupby('utt', as_index=False).apply(fn4).reset_index(drop=True)

In [23]:
wer_random_before, _, _  = get_wer(df_before_random_1)
wer_random_after, _, _   = get_wer(df_after_random_1)
_, wer_oracle2_before, _ = get_wer(df_before_random_2)
_, wer_oracle2_after, _  = get_wer(df_after_random_2)
_, wer_oracle3_before, _ = get_wer(df_before_random_3)
_, wer_oracle3_after, _  = get_wer(df_after_random_3)
_, wer_oracle4_before, _ = get_wer(df_before_random_4)
_, wer_oracle4_after, _  = get_wer(df_after_random_4)
wer_oracle5_before       = wer_min_before
wer_oracle5_after        = wer_min_after

In [24]:
print('wer_random1_before = {:.4f}, wer_random1_after = {:.4f}'.format(wer_random_before, wer_random_after))
print('wer_oracle2_before = {:.4f}, wer_oracle2_after = {:.4f}'.format(wer_oracle2_before, wer_oracle2_after))
print('wer_oracle3_before = {:.4f}, wer_oracle3_after = {:.4f}'.format(wer_oracle3_before, wer_oracle3_after))
print('wer_oracle4_before = {:.4f}, wer_oracle4_after = {:.4f}'.format(wer_oracle4_before, wer_oracle4_after))
print('wer_oracle5_before = {:.4f}, wer_oracle5_after = {:.4f}'.format(wer_oracle5_before, wer_oracle5_after))

wer_random1_before = 0.0591, wer_random1_after = 0.0487
wer_oracle2_before = 0.0307, wer_oracle2_after = 0.0286
wer_oracle3_before = 0.0246, wer_oracle3_after = 0.0247
wer_oracle4_before = 0.0224, wer_oracle4_after = 0.0220
wer_oracle5_before = 0.0209, wer_oracle5_after = 0.0205


### Longest std_trans

In [25]:
def f_longest(x):
    ref = -1
    res = -1
    for idx, std_trans_stc in zip(x.index, x.std_trans_stc):
        if std_trans_stc > ref:
            ref = std_trans_stc
            res = idx
    return x.loc[res,:]

In [26]:
df_longest_before = df_before_relabel.groupby('utt', as_index=False).apply(f_longest).reset_index(drop=True)
df_longest_after  = df_after_relabel.groupby('utt', as_index=False).apply(f_longest).reset_index(drop=True)

### Highest approve_rate

In [27]:
df_before_relabel['approve_rate'] = df_before_relabel['LifetimeApprovalRate'].apply(lambda x: int(x.split('%')[0])/100)
df_after_relabel['approve_rate']  = df_after_relabel['LifetimeApprovalRate'].apply(lambda x: int(x.split('%')[0])/100)

In [28]:
def f_highest(x):
    ref = -1
    res = -1
    for idx, approve_rate in zip(x.index, x.approve_rate):
        if approve_rate > ref:
            ref = approve_rate
            res = idx
    return x.loc[res,:]

In [29]:
df_highest_before = df_before_relabel.groupby('utt', as_index=False).apply(f_highest).reset_index(drop=True)
df_highest_after  = df_after_relabel.groupby('utt', as_index=False).apply(f_highest).reset_index(drop=True)

### Oracle Worst

In [34]:
def f_best(x):
    ref = 9999
    res = -1
    for idx, wec in zip(x.index, x.std_trans_wec):
        if wec < ref:
            ref = wec
            res = idx
    return x.loc[res,:]

In [35]:
def f_worst(x):
    ref = -1
    res = -1
    for idx, wec in zip(x.index, x.std_trans_wec):
        if wec > ref:
            ref = wec
            res = idx
    return x.loc[res,:]

In [36]:
df_best_before = df_before_relabel.groupby('utt', as_index=False).apply(f_best).reset_index(drop=True)
df_best_after  = df_after_relabel.groupby('utt', as_index=False).apply(f_best).reset_index(drop=True)
df_worst_before = df_before_relabel.groupby('utt', as_index=False).apply(f_worst).reset_index(drop=True)
df_worst_after  = df_after_relabel.groupby('utt', as_index=False).apply(f_worst).reset_index(drop=True)

### Auto Correction

In [37]:
rover_before = ROVER().fit_predict(df_before_relabel[['utt','std_trans']].rename(columns={"utt": "task", "std_trans": "output"}))

  0%|          | 0/2763 [00:00<?, ?it/s]

In [38]:
rover_after = ROVER().fit_predict(df_after_relabel[['utt','std_trans']].rename(columns={"utt": "task", "std_trans": "output"}))

  0%|          | 0/2762 [00:00<?, ?it/s]

In [39]:
df_eval_before = df_before_relabel.merge(rover_before.reset_index(), left_on='utt', right_on='task').groupby('utt').agg(lambda x: x.iloc[0]).reset_index()
df_eval_before.rename(columns={'output': 'std_trans_rover'}, inplace=True)
df_eval_after = df_after_relabel.merge(rover_after.reset_index(), left_on='utt', right_on='task').groupby('utt').agg(lambda x: x.iloc[0]).reset_index()
df_eval_after.rename(columns={'output': 'std_trans_rover'}, inplace=True)

In [40]:
# wer_correct_before, _, _  = get_wer(df_eval_before, std_trans='std_trans_rover', reuse=False)
# wer_correct_after, _, _   = get_wer(df_eval_after, std_trans='std_trans_rover', reuse=False)
# print('wer_correct_before = {:.4f}, wer_correct_after = {:.4f}'.format(wer_correct_before, wer_correct_after))

In [41]:
# res = error_type(df_eval_before, std_trans='std_trans_rover')
# print('dataset = {:s}, rover_before: count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
#       .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
# res = error_type(df_eval_after, std_trans='std_trans_rover')
# print('dataset = {:s}, rover_after : count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
#       .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))

### Print

In [59]:
res = error_type(df_before_relabel)
print('dataset = {:s}, raw_before   : count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_worst_before)
print('dataset = {:s}, worst_before  : count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_before_random_1)
print('dataset = {:s}, random_before : count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_longest_before)
print('dataset = {:s}, longest_before: count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_highest_before)
print('dataset = {:s}, highest_before: count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_eval_before, std_trans='std_trans_rover')
print('dataset = {:s}, correct_before: count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_best_before)
print('dataset = {:s}, best_before   : count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
print('-'*122)
res = error_type(df_after_relabel)
print('dataset = {:s}, raw_after    : count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_worst_after)
print('dataset = {:s}, worst_after   : count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_after_random_1)
print('dataset = {:s}, random_after  : count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_longest_after)
print('dataset = {:s}, longest_after : count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_highest_after)
print('dataset = {:s}, highest_after : count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_eval_after, std_trans='std_trans_rover')
print('dataset = {:s}, correct_after : count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))
res = error_type(df_best_after)
print('dataset = {:s}, best_after    : count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
      .format(dataset, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))

dataset = train-mixed-10h, raw_before   : count = 13815, len = 34.9, del = 1.78%, ins =  0.47%, sub =  3.72%, wer =  5.97%
dataset = train-mixed-10h, worst_before  : count = 2763, len = 33.3, del = 6.86%, ins =  1.04%, sub =  7.51%, wer = 15.41%
dataset = train-mixed-10h, random_before : count = 2763, len = 34.9, del = 1.68%, ins =  0.48%, sub =  3.75%, wer =  5.91%
dataset = train-mixed-10h, longest_before: count = 2763, len = 35.6, del = 0.21%, ins =  1.07%, sub =  3.75%, wer =  5.03%
dataset = train-mixed-10h, highest_before: count = 2763, len = 35.2, del = 0.76%, ins =  0.42%, sub =  3.32%, wer =  4.50%
dataset = train-mixed-10h, correct_before: count = 2763, len = 35.3, del = 0.31%, ins =  0.30%, sub =  2.42%, wer =  3.04%
dataset = train-mixed-10h, best_before   : count = 2763, len = 35.3, del = 0.22%, ins =  0.16%, sub =  1.71%, wer =  2.09%
--------------------------------------------------------------------------------------------------------------------------
dataset = train-

### Save data

In [48]:
df_crowd = df_gold.copy()
df_crowd['gold_trans'] = df_crowd['gold_trans'].apply(lambda x: normalize_trans(x))
df_crowd = df_crowd.rename(columns={'std_trans':'std_trans_synthetic'})
# random before
df_merge_random_before = df_before_random_1[['utt','std_trans']].drop_duplicates(subset=['utt'])
df_crowd = df_crowd.merge(df_merge_random_before, on=['utt'], how='left')
df_crowd = df_crowd.rename(columns={'std_trans':'std_trans_random_before'})
# random after
df_merge_random_after = df_after_random_1[['utt','std_trans']].drop_duplicates(subset=['utt'])
df_crowd = df_crowd.merge(df_merge_random_after, on=['utt'], how='left')
df_crowd = df_crowd.rename(columns={'std_trans':'std_trans_random_after'})
df_crowd['std_trans_random_after'] = df_crowd['std_trans_random_after'].fillna(df_crowd['std_trans_random_before'])
# longest before
df_merge_longest_before = df_longest_before[['utt','std_trans']].drop_duplicates(subset=['utt'])
df_crowd = df_crowd.merge(df_merge_longest_before, on=['utt'], how='left')
df_crowd = df_crowd.rename(columns={'std_trans':'std_trans_longest_before'})
df_crowd['std_trans_longest_before'] = df_crowd['std_trans_longest_before'].fillna(df_crowd['std_trans_random_before'])
# longest after
df_merge_longest_after = df_longest_after[['utt','std_trans']].drop_duplicates(subset=['utt'])
df_crowd = df_crowd.merge(df_merge_longest_after, on=['utt'], how='left')
df_crowd = df_crowd.rename(columns={'std_trans':'std_trans_longest_after'})
df_crowd['std_trans_longest_after'] = df_crowd['std_trans_longest_after'].fillna(df_crowd['std_trans_random_after'])
# highest before
df_merge_highest_before = df_highest_before[['utt','std_trans']].drop_duplicates(subset=['utt'])
df_crowd = df_crowd.merge(df_merge_highest_before, on=['utt'], how='left')
df_crowd = df_crowd.rename(columns={'std_trans':'std_trans_highest_before'})
df_crowd['std_trans_highest_before'] = df_crowd['std_trans_highest_before'].fillna(df_crowd['std_trans_random_before'])
# highest after
df_merge_highest_after = df_highest_after[['utt','std_trans']].drop_duplicates(subset=['utt'])
df_crowd = df_crowd.merge(df_merge_highest_after, on=['utt'], how='left')
df_crowd = df_crowd.rename(columns={'std_trans':'std_trans_highest_after'})
df_crowd['std_trans_highest_after'] = df_crowd['std_trans_highest_after'].fillna(df_crowd['std_trans_random_after'])
# correct before
df_merge_correct_before = df_eval_before.reset_index()[['utt','std_trans_rover']].drop_duplicates(subset=['utt','std_trans_rover'])
df_crowd = df_crowd.merge(df_merge_correct_before, on=['utt'], how='left')
df_crowd['std_trans_rover'] = df_crowd['std_trans_rover'].fillna(df_crowd['std_trans_random_before'])
df_crowd = df_crowd.rename(columns={'std_trans_rover':'std_trans_correct_before'})
# correct after
df_merge_correct_after = df_eval_after.reset_index()[['utt','std_trans_rover']].drop_duplicates(subset=['utt','std_trans_rover'])
df_crowd = df_crowd.merge(df_merge_correct_after, on=['utt'], how='left')
df_crowd['std_trans_rover'] = df_crowd['std_trans_rover'].fillna(df_crowd['std_trans_random_after'])
df_crowd = df_crowd.rename(columns={'std_trans_rover':'std_trans_correct_after'})
# best before
df_merge_best_before = df_best_before[['utt','std_trans']].drop_duplicates(subset=['utt'])
df_crowd = df_crowd.merge(df_merge_best_before, on=['utt'], how='left')
df_crowd = df_crowd.rename(columns={'std_trans':'std_trans_best_before'})
df_crowd['std_trans_best_before'] = df_crowd['std_trans_best_before'].fillna(df_crowd['std_trans_random_before'])
# best after
df_merge_best_after = df_best_after[['utt','std_trans']].drop_duplicates(subset=['utt'])
df_crowd = df_crowd.merge(df_merge_best_after, on=['utt'], how='left')
df_crowd = df_crowd.rename(columns={'std_trans':'std_trans_best_after'})
df_crowd['std_trans_best_after'] = df_crowd['std_trans_best_after'].fillna(df_crowd['std_trans_random_after'])
# worst before
df_merge_worst_before = df_worst_before[['utt','std_trans']].drop_duplicates(subset=['utt'])
df_crowd = df_crowd.merge(df_merge_worst_before, on=['utt'], how='left')
df_crowd = df_crowd.rename(columns={'std_trans':'std_trans_worst_before'})
df_crowd['std_trans_worst_before'] = df_crowd['std_trans_worst_before'].fillna(df_crowd['std_trans_random_before'])
# worst after
df_merge_worst_after = df_worst_after[['utt','std_trans']].drop_duplicates(subset=['utt'])
df_crowd = df_crowd.merge(df_merge_worst_after, on=['utt'], how='left')
df_crowd = df_crowd.rename(columns={'std_trans':'std_trans_worst_after'})
df_crowd['std_trans_worst_after'] = df_crowd['std_trans_worst_after'].fillna(df_crowd['std_trans_random_after'])


In [49]:
df_crowd.info()
df_crowd.to_csv(crowdpath, index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2763 entries, 0 to 2762
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   utt                       2763 non-null   object
 1   gold_trans                2763 non-null   object
 2   std_trans_synthetic       2763 non-null   object
 3   std_trans_random_before   2763 non-null   object
 4   std_trans_random_after    2763 non-null   object
 5   std_trans_longest_before  2763 non-null   object
 6   std_trans_longest_after   2763 non-null   object
 7   std_trans_highest_before  2763 non-null   object
 8   std_trans_highest_after   2763 non-null   object
 9   std_trans_correct_before  2763 non-null   object
 10  std_trans_correct_after   2763 non-null   object
 11  std_trans_best_before     2763 non-null   object
 12  std_trans_best_after      2763 non-null   object
 13  std_trans_worst_before    2763 non-null   object
 14  std_trans_worst_after   

In [50]:
df_crowd_before = df_before_relabel[['utt','gold_trans','std_trans','LifetimeApprovalRate','taskId']].rename(columns={'std_trans':'std_trans_crowd'})
df_crowd_before['relabel'] = 0
df_crowd_before['approve_rate'] = df_crowd_before['LifetimeApprovalRate'].apply(lambda x: int(x.split('%')[0])/100)
df_crowd_after  = df_after_relabel[['utt','gold_trans','std_trans','LifetimeApprovalRate','taskId']].rename(columns={'std_trans':'std_trans_crowd'})
df_crowd_after['relabel'] = 1
df_crowd_after['approve_rate'] = df_crowd_before['LifetimeApprovalRate'].apply(lambda x: int(x.split('%')[0])/100)

In [51]:
df_crowd_before['len_gold_trans'] = df_crowd_before['gold_trans'].apply(lambda x:len(x.split()))
df1 = df_crowd_before.groupby(['utt'], as_index=False).nth(0).sort_values(by='len_gold_trans')
df2 = df_crowd_before.groupby(['utt'], as_index=False).nth(1).sort_values(by='len_gold_trans')
df3 = df_crowd_before.groupby(['utt'], as_index=False).nth(2).sort_values(by='len_gold_trans')
df4 = df_crowd_before.groupby(['utt'], as_index=False).nth(3).sort_values(by='len_gold_trans')
df5 = df_crowd_before.groupby(['utt'], as_index=False).nth(4).sort_values(by='len_gold_trans')
df_crowd_before = pd.concat([df1,df2,df3,df4,df5]).reset_index(drop=True)

In [52]:
df_crowd_after['len_gold_trans'] = df_crowd_after['gold_trans'].apply(lambda x:len(x.split()))
df1 = df_crowd_after.groupby(['utt'], as_index=False).nth(0).sort_values(by='len_gold_trans')
df2 = df_crowd_after.groupby(['utt'], as_index=False).nth(1).sort_values(by='len_gold_trans')
df3 = df_crowd_after.groupby(['utt'], as_index=False).nth(2).sort_values(by='len_gold_trans')
df4 = df_crowd_after.groupby(['utt'], as_index=False).nth(3).sort_values(by='len_gold_trans')
df5 = df_crowd_after.groupby(['utt'], as_index=False).nth(4).sort_values(by='len_gold_trans')
df_crowd_after = pd.concat([df1,df2,df3,df4,df5]).reset_index(drop=True)

In [53]:
cols = ['utt','taskId','gold_trans','std_trans_crowd','approve_rate','relabel']
df_crowd_5 = pd.concat([df_crowd_before[cols], df_crowd_after[cols]]).reset_index(drop=True)
df_crowd_5.info()
df_crowd_5.to_csv(crowdpath_5, index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27614 entries, 0 to 27613
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   utt              27614 non-null  object 
 1   taskId           27614 non-null  object 
 2   gold_trans       27614 non-null  object 
 3   std_trans_crowd  27614 non-null  object 
 4   approve_rate     27614 non-null  float64
 5   relabel          27614 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 1.3+ MB


### Check

In [54]:
df_check = pd.read_csv('./data/LibriSpeechCrowd/librispeech_{:s}_crowd.csv'.format(dataset.replace('-','_')))
df_check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2763 entries, 0 to 2762
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   utt                       2763 non-null   object
 1   gold_trans                2763 non-null   object
 2   std_trans_synthetic       2763 non-null   object
 3   std_trans_random_before   2762 non-null   object
 4   std_trans_random_after    2763 non-null   object
 5   std_trans_longest_before  2763 non-null   object
 6   std_trans_longest_after   2763 non-null   object
 7   std_trans_highest_before  2763 non-null   object
 8   std_trans_highest_after   2763 non-null   object
 9   std_trans_correct_before  2763 non-null   object
 10  std_trans_correct_after   2763 non-null   object
 11  std_trans_best_before     2763 non-null   object
 12  std_trans_best_after      2763 non-null   object
 13  std_trans_worst_before    2755 non-null   object
 14  std_trans_worst_after   

In [56]:
for col in ['std_trans_worst_before','std_trans_random_before','std_trans_longest_before','std_trans_highest_before','std_trans_correct_before','std_trans_best_before',
            'std_trans_worst_after','std_trans_random_after','std_trans_longest_after','std_trans_highest_after','std_trans_correct_after','std_trans_best_after']:
    res = error_type(df_check, std_trans=col)
    print('dataset = {:s}, col = {:24s}: count = {:d}, len = {:.1f}, del = {:.2f}%, ins = {:5.2f}%, sub = {:5.2f}%, wer = {:5.2f}%'\
          .format(dataset, col, res['CNT'], res['LEN'], res['DEL'], res['INS'], res['SUB'], res['WER']))

dataset = train-mixed-10h, col = std_trans_worst_before  : count = 2763, len = 33.3, del = 6.86%, ins =  1.04%, sub =  7.51%, wer = 15.41%
dataset = train-mixed-10h, col = std_trans_random_before : count = 2763, len = 34.9, del = 1.68%, ins =  0.48%, sub =  3.75%, wer =  5.91%
dataset = train-mixed-10h, col = std_trans_longest_before: count = 2763, len = 35.6, del = 0.21%, ins =  1.07%, sub =  3.75%, wer =  5.03%
dataset = train-mixed-10h, col = std_trans_highest_before: count = 2763, len = 35.2, del = 0.76%, ins =  0.42%, sub =  3.32%, wer =  4.50%
dataset = train-mixed-10h, col = std_trans_correct_before: count = 2763, len = 35.3, del = 0.31%, ins =  0.30%, sub =  2.42%, wer =  3.04%
dataset = train-mixed-10h, col = std_trans_best_before   : count = 2763, len = 35.3, del = 0.22%, ins =  0.16%, sub =  1.71%, wer =  2.09%
dataset = train-mixed-10h, col = std_trans_worst_after   : count = 2763, len = 34.8, del = 2.45%, ins =  0.97%, sub =  7.11%, wer = 10.53%
dataset = train-mixed-10h, 

### Result

In [44]:
### test-other
# Approved: count = 14655, wer_avg = 0.1248, wer_min = 0.0500, wer_max = 0.2323
# Rejected: count =  1295, wer_avg = 0.7215, wer_min = 0.6877, wer_max = 0.7559

# dataset = test-other, approved: count = 14655, len = 17.6, del = 2.77%, ins = 1.33%, sub = 8.34%, wer = 12.43%
# dataset = test-other, rejected: count = 1295, len = 9.3, del = 49.31%, ins = 3.91%, sub = 19.34%, wer = 72.56%

# Before Relabeling: wer_avg = 0.1661, wer_min = 0.0532, wer_max = 0.3737
# After  Relabeling: wer_avg = 0.1248, wer_min = 0.0500, wer_max = 0.2323
# Relabeling percent = 8.84%, WER improve = 413 bps
# Number of workers: before = 934, after = 989

# wer_random1_before = 0.1650, wer_random1_after = 0.1280
# wer_oracle2_before = 0.0919, wer_oracle2_after = 0.0812
# wer_oracle3_before = 0.0689, wer_oracle3_after = 0.0651
# wer_oracle4_before = 0.0591, wer_oracle4_after = 0.0565
# wer_oracle5_before = 0.0532, wer_oracle5_after = 0.0500

# dataset = test-other, raw_before   : count = 14695, len = 17.0, del = 6.31%, ins =  1.45%, sub =  8.85%, wer = 16.61%
# dataset = test-other, random_before : count = 2939, len = 17.0, del = 5.96%, ins =  1.51%, sub =  9.03%, wer = 16.50%
# dataset = test-other, longest_before: count = 2939, len = 18.4, del = 0.53%, ins =  3.69%, sub = 10.16%, wer = 14.38%
# dataset = test-other, highest_before: count = 2939, len = 17.6, del = 2.60%, ins =  1.22%, sub =  7.83%, wer = 11.65%
# dataset = test-other, correct_before: count = 2939, len = 17.6, del = 2.16%, ins =  0.86%, sub =  5.20%, wer =  8.22%
# ---------------------------------------------------------------------------------------------------------------------
# dataset = test-other, raw_after    : count = 14655, len = 17.6, del = 2.77%, ins =  1.33%, sub =  8.34%, wer = 12.43%
# dataset = test-other, random_after  : count = 2977, len = 17.4, del = 2.92%, ins =  1.33%, sub =  8.76%, wer = 13.01%
# dataset = test-other, longest_after : count = 2938, len = 18.3, del = 0.47%, ins =  2.97%, sub =  8.98%, wer = 12.42%
# dataset = test-other, highest_after : count = 2938, len = 17.6, del = 2.45%, ins =  1.21%, sub =  7.63%, wer = 11.30%
# dataset = test-other, correct_after : count = 2938, len = 17.8, del = 1.38%, ins =  0.97%, sub =  5.16%, wer =  7.51%

In [45]:
### dev-other
# Approved: count = 14237, wer_avg = 0.0959, wer_min = 0.0413, wer_max = 0.1769
# Rejected: count =   998, wer_avg = 0.7465, wer_min = 0.7177, wer_max = 0.7776

# dataset = dev-other, approved: count = 14237, len = 17.7, del = 1.67%, ins = 0.95%, sub = 6.90%, wer = 9.52%
# dataset = dev-other, rejected: count = 998, len = 8.9, del = 48.94%, ins = 3.54%, sub = 20.47%, wer = 72.95%

# Before Relabeling: wer_avg = 0.1269, wer_min = 0.0429, wer_max = 0.2974
# After  Relabeling: wer_avg = 0.0959, wer_min = 0.0413, wer_max = 0.1769
# Relabeling percent = 7.01%, WER improve = 310 bps
# Number of workers: before = 572, after = 620

# wer_random1_before = 0.1220, wer_random1_after = 0.0971
# wer_oracle2_before = 0.0700, wer_oracle2_after = 0.0626
# wer_oracle3_before = 0.0551, wer_oracle3_after = 0.0522
# wer_oracle4_before = 0.0474, wer_oracle4_after = 0.0456
# wer_oracle5_before = 0.0429, wer_oracle5_after = 0.0413

# dataset = dev-other, raw_before   : count = 14320, len = 17.2, del = 4.30%, ins =  1.05%, sub =  7.34%, wer = 12.69%
# dataset = dev-other, random_before : count = 2864, len = 17.3, del = 3.74%, ins =  1.02%, sub =  7.43%, wer = 12.20%
# dataset = dev-other, longest_before: count = 2864, len = 18.2, del = 0.32%, ins =  2.60%, sub =  7.97%, wer = 10.90%
# dataset = dev-other, highest_before: count = 2864, len = 17.7, del = 1.59%, ins =  0.95%, sub =  6.78%, wer =  9.32%
# dataset = dev-other, correct_before: count = 2864, len = 17.8, del = 0.91%, ins =  0.75%, sub =  5.03%, wer =  6.69%
# --------------------------------------------------------------------------------------------------------------------
# dataset = dev-other, raw_after    : count = 14237, len = 17.7, del = 1.67%, ins =  0.95%, sub =  6.90%, wer =  9.52%
# dataset = dev-other, random_after  : count = 2943, len = 17.6, del = 1.71%, ins =  1.06%, sub =  7.43%, wer = 10.20%
# dataset = dev-other, longest_after : count = 2862, len = 18.1, del = 0.36%, ins =  2.20%, sub =  7.63%, wer = 10.20%
# dataset = dev-other, highest_after : count = 2862, len = 17.7, del = 1.80%, ins =  0.95%, sub =  6.76%, wer =  9.51%
# dataset = dev-other, correct_after : count = 2862, len = 17.8, del = 0.71%, ins =  0.76%, sub =  4.98%, wer =  6.45%

In [46]:
### test-clean
# Approved: count = 13097, wer_avg = 0.0578, wer_min = 0.0188, wer_max = 0.1255
# Rejected: count =   490, wer_avg = 0.7550, wer_min = 0.7353, wer_max = 0.7754

# dataset = test-clean, approved: count = 13097, len = 20.0, del = 1.17%, ins = 0.48%, sub = 4.12%, wer = 5.78%
# dataset = test-clean, rejected: count = 490, len = 9.4, del = 57.22%, ins = 2.43%, sub = 15.12%, wer = 74.77%

# Before Relabeling: wer_avg = 0.0823, wer_min = 0.0198, wer_max = 0.2259
# After  Relabeling: wer_avg = 0.0578, wer_min = 0.0188, wer_max = 0.1255
# Relabeling percent = 3.74%, WER improve = 245 bps
# Number of workers: before = 515, after = 527

# wer_random1_before = 0.0871, wer_random1_after = 0.0574
# wer_oracle2_before = 0.0365, wer_oracle2_after = 0.0318
# wer_oracle3_before = 0.0263, wer_oracle3_after = 0.0252
# wer_oracle4_before = 0.0217, wer_oracle4_after = 0.0213
# wer_oracle5_before = 0.0198, wer_oracle5_after = 0.0188

# dataset = test-clean, raw_before   : count = 13100, len = 19.5, del = 3.23%, ins =  0.55%, sub =  4.45%, wer =  8.23%
# dataset = test-clean, random_before : count = 2620, len = 19.5, del = 3.47%, ins =  0.65%, sub =  4.60%, wer =  8.71%
# dataset = test-clean, longest_before: count = 2620, len = 20.3, del = 0.21%, ins =  1.47%, sub =  4.44%, wer =  6.11%
# dataset = test-clean, highest_before: count = 2620, len = 20.0, del = 1.04%, ins =  0.42%, sub =  3.63%, wer =  5.09%
# dataset = test-clean, correct_before: count = 2620, len = 20.0, del = 0.55%, ins =  0.25%, sub =  2.48%, wer =  3.27%
# ---------------------------------------------------------------------------------------------------------------------
# dataset = test-clean, raw_after    : count = 13097, len = 20.0, del = 1.17%, ins =  0.48%, sub =  4.12%, wer =  5.78%
# dataset = test-clean, random_after  : count = 2629, len = 19.9, del = 1.19%, ins =  0.45%, sub =  4.14%, wer =  5.77%
# dataset = test-clean, longest_after : count = 2620, len = 20.3, del = 0.25%, ins =  1.31%, sub =  4.73%, wer =  6.30%
# dataset = test-clean, highest_after : count = 2620, len = 20.0, del = 1.12%, ins =  0.45%, sub =  3.61%, wer =  5.17%
# dataset = test-clean, correct_after : count = 2620, len = 20.0, del = 0.45%, ins =  0.26%, sub =  2.34%, wer =  3.05%

In [47]:
# Approved: count = 13476, wer_avg = 0.0502, wer_min = 0.0169, wer_max = 0.1079
# Rejected: count =   518, wer_avg = 0.5365, wer_min = 0.5053, wer_max = 0.5702

# dataset = dev-clean, approved: count = 13476, len = 20.1, del = 0.91%, ins = 0.42%, sub = 3.68%, wer = 5.01%
# dataset = dev-clean, rejected: count = 518, len = 11.5, del = 33.46%, ins = 2.71%, sub = 17.49%, wer = 53.66%

# Before Relabeling: wer_avg = 0.0610, wer_min = 0.0180, wer_max = 0.1509
# After  Relabeling: wer_avg = 0.0502, wer_min = 0.0169, wer_max = 0.1079
# Relabeling percent = 3.84%, WER improve = 108 bps
# Number of workers: before = 498, after = 523

# wer_random1_before = 0.0642, wer_random1_after = 0.0489
# wer_oracle2_before = 0.0302, wer_oracle2_after = 0.0293
# wer_oracle3_before = 0.0235, wer_oracle3_after = 0.0223
# wer_oracle4_before = 0.0204, wer_oracle4_after = 0.0192
# wer_oracle5_before = 0.0180, wer_oracle5_after = 0.0169

# dataset = dev-clean, raw_before   : count = 13515, len = 19.9, del = 1.73%, ins =  0.47%, sub =  3.89%, wer =  6.10%
# dataset = dev-clean, random_before : count = 2703, len = 19.8, del = 1.97%, ins =  0.50%, sub =  3.96%, wer =  6.42%
# dataset = dev-clean, longest_before: count = 2703, len = 20.3, del = 0.23%, ins =  1.24%, sub =  4.04%, wer =  5.51%
# dataset = dev-clean, highest_before: count = 2703, len = 20.0, del = 0.86%, ins =  0.36%, sub =  3.44%, wer =  4.66%
# dataset = dev-clean, correct_before: count = 2703, len = 20.1, del = 0.44%, ins =  0.26%, sub =  2.24%, wer =  2.94%
# --------------------------------------------------------------------------------------------------------------------
# dataset = dev-clean, raw_after    : count = 13476, len = 20.1, del = 0.91%, ins =  0.42%, sub =  3.68%, wer =  5.01%
# dataset = dev-clean, random_after  : count = 2715, len = 20.0, del = 0.85%, ins =  0.42%, sub =  3.66%, wer =  4.94%
# dataset = dev-clean, longest_after : count = 2697, len = 20.4, del = 0.22%, ins =  1.15%, sub =  4.45%, wer =  5.83%
# dataset = dev-clean, highest_after : count = 2697, len = 20.1, del = 0.87%, ins =  0.35%, sub =  3.49%, wer =  4.72%
# dataset = dev-clean, correct_after : count = 2697, len = 20.1, del = 0.35%, ins =  0.24%, sub =  2.18%, wer =  2.76%

In [48]:
### train-mixed-10h
# Approved: count = 13803, wer_avg = 0.0489, wer_min = 0.0205, wer_max = 0.1053
# Rejected: count =   428, wer_avg = 0.6368, wer_min = 0.6088, wer_max = 0.6676

# dataset = train-mixed-10h, approved: count = 13803, len = 35.2, del = 0.87%, ins = 0.44%, sub = 3.58%, wer = 4.89%
# dataset = train-mixed-10h, rejected: count = 428, len = 17.1, del = 44.72%, ins = 1.74%, sub = 15.40%, wer = 61.85%

# Before Relabeling: wer_avg = 0.0597, wer_min = 0.0209, wer_max = 0.1541
# After  Relabeling: wer_avg = 0.0489, wer_min = 0.0205, wer_max = 0.1053
# Relabeling percent = 3.10%, WER improve = 108 bps
# Number of workers: before = 581, after = 616

# wer_random1_before = 0.0576, wer_random1_after = 0.0495
# wer_oracle2_before = 0.0301, wer_oracle2_after = 0.0288
# wer_oracle3_before = 0.0249, wer_oracle3_after = 0.0242
# wer_oracle4_before = 0.0224, wer_oracle4_after = 0.0222
# wer_oracle5_before = 0.0209, wer_oracle5_after = 0.0205

# dataset = train-mixed-10h, raw_before   : count = 13815, len = 34.9, del = 1.78%, ins =  0.47%, sub =  3.72%, wer =  5.97%
# dataset = train-mixed-10h, worst_before  : count = 2763, len = 33.3, del = 6.86%, ins =  1.04%, sub =  7.51%, wer = 15.41%
# dataset = train-mixed-10h, random_before : count = 2763, len = 34.9, del = 1.68%, ins =  0.48%, sub =  3.75%, wer =  5.91%
# dataset = train-mixed-10h, longest_before: count = 2763, len = 35.6, del = 0.21%, ins =  1.07%, sub =  3.75%, wer =  5.03%
# dataset = train-mixed-10h, highest_before: count = 2763, len = 35.2, del = 0.76%, ins =  0.42%, sub =  3.32%, wer =  4.50%
# dataset = train-mixed-10h, correct_before: count = 2763, len = 35.3, del = 0.31%, ins =  0.30%, sub =  2.42%, wer =  3.04%
# dataset = train-mixed-10h, best_before   : count = 2763, len = 35.3, del = 0.22%, ins =  0.16%, sub =  1.71%, wer =  2.09%
# --------------------------------------------------------------------------------------------------------------------------
# dataset = train-mixed-10h, raw_after    : count = 13803, len = 35.2, del = 0.87%, ins =  0.44%, sub =  3.58%, wer =  4.89%
# dataset = train-mixed-10h, worst_after   : count = 2762, len = 34.8, del = 2.45%, ins =  0.97%, sub =  7.10%, wer = 10.53%
# dataset = train-mixed-10h, random_after  : count = 2771, len = 35.1, del = 0.88%, ins =  0.46%, sub =  3.58%, wer =  4.92%
# dataset = train-mixed-10h, longest_after : count = 2762, len = 35.6, del = 0.21%, ins =  1.01%, sub =  3.91%, wer =  5.12%
# dataset = train-mixed-10h, highest_after : count = 2762, len = 35.2, del = 0.79%, ins =  0.40%, sub =  3.40%, wer =  4.60%
# dataset = train-mixed-10h, correct_after : count = 2762, len = 35.3, del = 0.30%, ins =  0.30%, sub =  2.40%, wer =  3.00%
# dataset = train-mixed-10h, best_after    : count = 2762, len = 35.3, del = 0.23%, ins =  0.16%, sub =  1.67%, wer =  2.05%

In [49]:
### train-other-10h
# Approved: count = 15987, wer_avg = 0.0687, wer_min = 0.0285, wer_max = 0.1387
# Rejected: count =  2706, wer_avg = 0.8302, wer_min = 0.6832, wer_max = 1.0355

# dataset = train-other-10h, approved: count = 15987, len = 31.7, del = 1.53%, ins = 0.54%, sub = 4.76%, wer = 6.83%
# dataset = train-other-10h, rejected: count = 2706, len = 25.1, del = 22.59%, ins = 20.71%, sub = 34.89%, wer = 78.19%

# Before Relabeling: wer_avg = 0.1571, wer_min = 0.0344, wer_max = 0.4540
# After  Relabeling: wer_avg = 0.0687, wer_min = 0.0285, wer_max = 0.1387
# Relabeling percent = 16.93%, WER improve = 884 bps
# Number of workers: before = 1127, after = 1258

# wer_random1_before = 0.1550, wer_random1_after = 0.0695
# wer_oracle2_before = 0.0679, wer_oracle2_after = 0.0424
# wer_oracle3_before = 0.0514, wer_oracle3_after = 0.0341
# wer_oracle4_before = 0.0426, wer_oracle4_after = 0.0312
# wer_oracle5_before = 0.0344, wer_oracle5_after = 0.0285

# dataset = train-other-10h, raw_before   : count = 21715, len = 31.2, del = 4.86%, ins =  4.39%, sub = 10.02%, wer = 19.26%
# dataset = train-other-10h, random_before : count = 3165, len = 31.9, del = 3.60%, ins =  3.42%, sub =  8.49%, wer = 15.50%
# dataset = train-other-10h, longest_before: count = 3165, len = 36.4, del = 0.42%, ins = 14.16%, sub = 18.25%, wer = 32.83%
# dataset = train-other-10h, highest_before: count = 3165, len = 31.5, del = 1.90%, ins =  0.58%, sub =  5.10%, wer =  7.57%
# dataset = train-other-10h, correct_before: count = 3165, len = 31.7, del = 1.43%, ins =  0.45%, sub =  3.69%, wer =  5.56%
# --------------------------------------------------------------------------------------------------------------------------
# dataset = train-other-10h, raw_after    : count = 15987, len = 31.7, del = 1.53%, ins =  0.54%, sub =  4.76%, wer =  6.83%
# dataset = train-other-10h, random_after  : count = 3237, len = 31.2, del = 1.62%, ins =  0.54%, sub =  5.09%, wer =  7.25%
# dataset = train-other-10h, longest_after : count = 3165, len = 32.2, del = 0.37%, ins =  1.22%, sub =  5.33%, wer =  6.92%
# dataset = train-other-10h, highest_after : count = 3165, len = 31.6, del = 1.79%, ins =  0.54%, sub =  5.02%, wer =  7.35%
# dataset = train-other-10h, correct_after : count = 3165, len = 31.9, del = 0.68%, ins =  0.34%, sub =  3.12%, wer =  4.14%

### Debug missing approve

In [50]:
# df = df_approved[cols].sort_values(by=['submittime','utt']).reset_index(drop=True)
# tmp = df.groupby('utt', as_index=False).size().reset_index(name='counts')
# tmp[tmp['counts'] < 5]
# df7902 = pd.read_csv('./data/LibriSpeechCrowd/test-other/7902.csv')
# df7902['AssignmentStatus'][df7902['Input.audio_url'] == '7902-96594-0015.flac']