# Crowdsource LibriSpeech Relabeling

In [1]:
import os, re, string
import numpy as np
import pandas as pd

In [2]:
dataset_name = 'train_other_60h'
folder = '/Users/gajian/Dropbox/Academia/Database/'
process = 'process' # process, process2, process5
data_dir = os.path.join(folder, 'LibriSpeech', 'batch_result', process)

In [3]:
batch_nums = sorted([int(file.split('.csv')[0]) for file in os.listdir(data_dir) if '.csv' in file and '_out' not in file])
print('batch_nums =', batch_nums)

batch_nums = [60007, 60008, 60012]


In [4]:
def audio2utt(audio_url):
    if '/' in audio_url:
        audio_url = audio_url.split('/')[-1]
    audio = audio_url.split('.')[0]
    book = audio.split('-')[0]
    chap = audio.split('-')[1]
    uttr = audio.split('-')[2]
    utt = book + '_' + chap + '_' + str(int(uttr))
    return utt


num2words = {1: 'One', 2: 'Two', 3: 'Three', 4: 'Four', 5: 'Five', \
             6: 'Six', 7: 'Seven', 8: 'Eight', 9: 'Nine', 10: 'Ten', \
            11: 'Eleven', 12: 'Twelve', 13: 'Thirteen', 14: 'Fourteen', \
            15: 'Fifteen', 16: 'Sixteen', 17: 'Seventeen', 18: 'Eighteen', \
            19: 'Nineteen', 20: 'Twenty', 30: 'Thirty', 40: 'Forty', \
            50: 'Fifty', 60: 'Sixty', 70: 'Seventy', 80: 'Eighty', \
            90: 'Ninety', 0: 'Zero'}

strnum2word = dict()
for k in num2words:
    strnum2word[str(k)] = num2words[k]


def normalize_word(word):
    word = strnum2word.get(word, word)
    if word in ('okay', 'ok'): return 'OK'
    if word in ('mr', 'mr.'): return 'Mister'
    if word in ('ms', 'ms.'): return 'Miss'
    if word == 'mrs.': return 'Misses'
    if word == 'ya': return 'you'
    if word in ('cos','coz'): return 'because'
    if word in ('ah','eh','hm','um','er','ahhh'): return ''
    if word == "there's": return 'there is'
    if word == "i'm": return 'i am'
    if word == "i'll": return 'i will'
    if word == "it's": return 'it is'
    if word == "she's": return 'she is'
    if word == "he's": return 'he is'
    if word == "where's": return 'where is'
    if word == "what's": return 'what is'
    if word == "when's": return 'when is'
    if word == "why's": return 'why is'
    if word == "who's": return 'who is'
    if word == "you're": return 'you are'
    if word == "you've": return 'you have'
    if word == "don't": return 'do not'
    if word == "didn't": return 'did not'
    if word == "couldn't": return 'could not'
    if word == "won't": return 'will not'
    if word == "wasn't": return 'was not'
    if word == "o'clock": return 'oclock'
    if word == "dwarf's": return 'dwarfs'
    if word == 'wanna': return 'want to'
    if word == 'k': return 'OK'
    if word == 'anytime': return 'any time'
    if word[-1] == '.' and word not in ('a.m.','p.m.'): return word[:-1]
    return word.lower()


def normalize_trans(trans):
    trans = trans.lower()
    trans = trans.replace('?','').replace(',','').replace('.','')
    words = trans.split()
    for i in range(len(words)):
        words[i] = normalize_word(words[i])
    return ' '.join(words)

### Load Gold and Std

In [5]:
df_gold = pd.read_csv(os.path.join(folder, 'LibriSpeech', 'librispeech_{:s}_trans.csv'.format(dataset_name)))

In [6]:
df_submits = []
df_submit_aggs = []
for batch_num in batch_nums:
    df_submit = pd.read_csv(os.path.join(folder, 'LibriSpeech', 'batch_result/{:s}/{:d}.csv'.format(process, batch_num)))
    ori_cols = df_submit.columns
    df_submit_agg = df_submit
    df_submit = df_submit[df_submit['Input.audio_url'].apply(lambda x: '.txt' not in x)]
    df_submits.append(df_submit)
    df_submit_agg['batch_num'] = batch_num
    df_submit_aggs.append(df_submit_agg)
df_submit_all = pd.concat(df_submit_aggs).reset_index(drop=True)
df_submit_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5188 entries, 0 to 5187
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   HITId                        5188 non-null   object 
 1   HITTypeId                    5188 non-null   object 
 2   Title                        5188 non-null   object 
 3   Description                  5188 non-null   object 
 4   Keywords                     5188 non-null   object 
 5   Reward                       5188 non-null   object 
 6   CreationTime                 5188 non-null   object 
 7   MaxAssignments               5188 non-null   int64  
 8   RequesterAnnotation          5188 non-null   object 
 9   AssignmentDurationInSeconds  5188 non-null   int64  
 10  AutoApprovalDelayInSeconds   5188 non-null   int64  
 11  Expiration                   5188 non-null   object 
 12  NumberOfSimilarHITs          0 non-null      float64
 13  LifetimeInSeconds 

In [7]:
df_submit_all = df_submit_all[df_submit_all['AssignmentStatus'] == 'Submitted']
df_submit_all['utt'] = df_submit_all['Input.audio_url'].apply(lambda x: audio2utt(x))
df_check = df_submit_all.merge(df_gold[['utt','gold_trans']], on = ['utt'], how = 'left')
df_check = df_check.rename(columns={'Answer.transcription':'std_trans'}).reset_index(drop=True)
df_check['std_trans'] = df_check['std_trans'].fillna('ImputedNA')
df_check['num_q_mark'] = df_check['std_trans'].apply(lambda x: x.count('?'))
df_check['gold_trans'] = df_check['gold_trans'].apply(lambda x: normalize_trans(x))
df_check['std_trans'] = df_check['std_trans'].apply(lambda x: normalize_trans(x))
if len(df_check) == 0:
    print('No pending task in batch_nums:', batch_nums)

### Get WER

In [8]:
def cost_edit_distance(source: list, target: list, insert_cost=3, delete_cost=3, replace_cost=4):
    """
    Given two lists of tokens, calculate weighted edit distance
    https://code.amazon.com/packages/DeepADSPhase1/blobs/6930484f38136e49395dcb3e7df8baaec97e40c9/--/src/deep_ads_phase1/sentence_utils_wlm.py#L230
    """
    m = len(source)
    n = len(target)
    if target == source:
        return 0
    if m == 0:
        return n
    if n == 0:
        return m
    d = np.zeros((n + 1, m + 1))
    f = np.zeros((n + 1, m + 1))
    d[0, 0] = 0
    f[0, 0] = 0
    for i in range(1, n + 1):
        d[i, 0] = d[i - 1, 0] + insert_cost
        f[i, 0] = f[i - 1, 0] + 1
    for j in range(1, m + 1):
        d[0, j] = d[0, j - 1] + delete_cost
        f[0, j] = f[0, j - 1] + 1
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            multiplier = int(not target[i - 1] == source[j - 1])
            d_costs = [d[i - 1, j] + insert_cost,
                       d[i - 1, j - 1] + multiplier * replace_cost,
                       d[i, j - 1] + delete_cost]
            f_costs = [f[i - 1, j] + 1,
                       f[i - 1, j - 1] + multiplier,
                       f[i, j - 1] + 1]
            idx = int(np.argmin(d_costs))
            assert isinstance(idx, int)
            d[i, j] = d_costs[idx]
            f[i, j] = f_costs[idx]
    return f[n, m]

def compute_wer(t1, t2):
    """
    Given two transcriptions, calculate Word Error Rate (WER)
    https://code.amazon.com/packages/DeepADSPhase1/blobs/6930484f38136e49395dcb3e7df8baaec97e40c9/--/src/deep_ads_phase1/sentence_utils_wlm.py#L174
    """
    word_error_count = cost_edit_distance(t1.split(), t2.split())
    return word_error_count

In [9]:
df_check['gold_trans_stc'] = df_check['gold_trans'].apply(lambda x: len(x.split()))
df_check['std_trans_stc'] = df_check['std_trans'].apply(lambda x: len(x.split()))
df_check['std_trans_wec'] = df_check.apply(lambda x: compute_wer(x.gold_trans, x.std_trans), axis=1)
# df_check.info()

In [10]:
df6 = df_check.groupby('utt').agg({'std_trans_wec': ['mean', 'min', 'max']})
df7 = df_check.groupby('utt').agg({'gold_trans_stc': ['min']})
wer_mean = df6[('std_trans_wec', 'mean')].sum() / df7[('gold_trans_stc', 'min')].sum()
wer_max = df6[('std_trans_wec', 'max')].sum() / df7[('gold_trans_stc', 'min')].sum()
wer_oracle = df6[('std_trans_wec', 'min')].sum() / df7[('gold_trans_stc', 'min')].sum()
print('wer_mean = {:.4f}, wer_oracle = {:.4f}, wer_max = {:.4f}'.format(wer_mean, wer_oracle, wer_max))

wer_mean = 0.1658, wer_oracle = 0.1658, wer_max = 0.1658


In [11]:
df_report = df_check[['gold_trans_stc','std_trans_wec','batch_num']].groupby(by=['batch_num']).sum()
df_report['wer'] = df_report['std_trans_wec'] / df_report['gold_trans_stc']
df_report.loc['All'] = [df_report['gold_trans_stc'].sum(), df_report['std_trans_wec'].sum(), df_report['std_trans_wec'].sum() / df_report['gold_trans_stc'].sum()] 
df_report

Unnamed: 0_level_0,gold_trans_stc,std_trans_wec,wer
batch_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
60007,328.0,102.0,0.310976
60008,670.0,203.0,0.302985
60012,3985.0,521.0,0.13074
All,4983.0,826.0,0.165764


### Predict WER and Make Decision

In [12]:
df_check['wer'] = df_check['std_trans_wec'] / df_check['gold_trans_stc']
df_check['wec'] = df_check['std_trans_wec'] - df_check['num_q_mark']

In [13]:
relabeling = 0.2
threshold = max(0.2, np.quantile(df_check['wer'], 1-relabeling))
need_relabeling = (df_check['wer'] > threshold) & ((df_check['wec'] > 2) | (df_check['num_q_mark'] > df_check['gold_trans_stc'] * 0.5))
print('wer threshold = {:.2f}, need relabeling: count = {:d}, percent = {:.2f}'.format(threshold, sum(need_relabeling), sum(need_relabeling)/len(df_check)))

wer threshold = 0.32, need relabeling: count = 31, percent = 0.19


In [14]:
def reject_reason(df, batch_num):
    gold_trans, std_trans = df.gold_trans, df.std_trans
    gold_trans = gold_trans.replace("'", '')
    missing = (df.std_trans_stc < df.gold_trans_stc * 0.7)
    wec = df.std_trans_wec
    if missing:
        reason = 'the transcription is not correct, missing a lot (batch id: {:d}). your answer: {:s}, groundtruth: {:s}. #errors = {:.0f}'\
        .format(batch_num, std_trans, gold_trans, wec)
    else:
        reason = 'the transcription is not correct, many errors (batch id: {:d}). your answer: {:s}, groundtruth: {:s}. #errors = {:.0f}'\
        .format(batch_num, std_trans, gold_trans, wec)
    return reason

In [15]:
df_check['Reject'][need_relabeling] = df_check.apply(lambda x: reject_reason(x, x['batch_num']), axis=1)
df_check['Approve'][df_check['Reject'].isnull()] = 'x'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_check['Reject'][need_relabeling] = df_check.apply(lambda x: reject_reason(x, x['batch_num']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_check['Approve'][df_check['Reject'].isnull()] = 'x'


In [16]:
# double check
if process == 'process':
    cols = ['Input.audio_url','Approve','Reject','std_trans','gold_trans','std_trans_wec','wer']
    df8 = df_check[need_relabeling].reset_index(drop=True)
    i = 0
    for col in cols:
        if i < len(df8):
            print('{:s} = {:s}'.format(col, str(df8[col][i])))

Input.audio_url = 5735/48575/5735-48575-0001.flac
Approve = nan
Reject = the transcription is not correct, missing a lot (batch id: 60007). your answer: there is no help horsemen, groundtruth: there is no help for it but thou send out horsemen. #errors = 6
std_trans = there is no help horsemen
gold_trans = there is no help for it but thou send out horsemen
std_trans_wec = 6.0
wer = 0.5454545454545454


### Generate CSV file

In [17]:
cols = ['HITId','WorkerId','Approve','Reject']
all_reject_cnt = 0
for i, batch_num in enumerate(batch_nums):
    df_submit = df_submits[i]
    df_out = df_submit[ori_cols].drop(columns=['Approve','Reject']).merge(df_check[cols], on = ['HITId','WorkerId'])
    df_out.to_csv(os.path.join(folder, 'LibriSpeech', 'batch_result/{:s}/{:d}_out.csv'.format(process, batch_num)), index=False)
#     df_out.info()
    n_reject = sum(~df_out['Reject'].isnull())
    rep_rate  = n_reject / len(df_out)
    all_reject_cnt += n_reject
    print('batch_num = {:04d}, n_reject = {:02d}, rep_rate = {:.2f}'.format(batch_num, n_reject, rep_rate))
print('batch_num = all, n_reject = {:02d}, rep_rate = {:.2f}'.format(all_reject_cnt, all_reject_cnt/len(df_check)))

batch_num = 60007, n_reject = 08, rep_rate = 0.53
batch_num = 60008, n_reject = 10, rep_rate = 0.43
batch_num = 60012, n_reject = 13, rep_rate = 0.10
batch_num = all, n_reject = 31, rep_rate = 0.19


### Gold may have error

In [18]:
# dev-other: 
# 1585-131718-0017.flac
# 1585-157660-0009.flac
# 1650-167613-0006.flac hard and need domain knowledge