In [1]:
from swda import Transcript
import glob, os
import numpy as np
from tqdm import tqdm as tqdm
from collections import namedtuple

DATA_FOLDER = '/n/sd7/trung/csp/data/swbd'
from pydub import AudioSegment

In [2]:
# Parse every SwDA transcript CSV below DATA_FOLDER/swda into a list of
# utterances, keyed by the dialog id taken from the file name.
swda_root = os.path.join(DATA_FOLDER, "swda")
metadata_csv = os.path.join(swda_root, 'swda-metadata.csv')

dlgs = {}
for file in tqdm(glob.glob(os.path.join(swda_root, '**/*.csv')), desc="Load Dialogs"):
    # The dialog id is the third "_"-separated field of the base name, up to its first ".".
    dlgid = os.path.basename(file).split('_')[2].split('.')[0]
    dlgs[dlgid] = list(Transcript(file, metadata_csv).utterances)

print("Dialog Count:", len(dlgs))

Load Dialogs: 100%|██████████| 1155/1155 [04:46<00:00,  4.04it/s]

Dialog Count: 1155





In [29]:
# Count how often each DAMSL act tag occurs over all dialogs, then give every
# distinct tag an integer id in first-seen order.
da_tags = {}
for utts in dlgs.values():
    for utt in utts:
        act = utt.damsl_act_tag()
        da_tags[act] = da_tags.get(act, 0) + 1

print(len(da_tags))
da_tagids = dict(zip(da_tags, range(len(da_tags))))
print(da_tagids.keys())

43
dict_keys(['fo_o_fw_"_by_bc', 'qw', 'h', 'sd', 'sv', 'b', 'x', '%', '+', 'qy', 'qrr', 'na', 'bk', 'ba', 'ny', '^q', 'aa', 'nn', 'fc', 'ad', 'qo', 'qh', 'no', 'ng', '^2', 'bh', 'qy^d', 'br', 'b^m', '^h', 'bf', 'fa', 'oo_co_cc', 'ar', 'bd', 't1', 'arp_nd', 't3', 'ft', '^g', 'qw^d', 'fp', 'aap_am'])


In [4]:
import re
from nltk.tokenize import word_tokenize

punctuations = ['?', '.', ',', ';', '!', ':']
def split_words(sent):
    """Tokenise one annotated utterance string into lower-case words.

    Steps, in order: cut the text at the first " /", lower-case it, drop every
    "{" together with the character that follows it, drop anything between
    "[[" and "]]", remove the remaining markup characters, detach punctuation
    marks into their own tokens, and split on spaces that are not enclosed in
    brackets. Empty tokens, tokens starting with "<" and the literal token
    "typo" are discarded.

    Returns the list of remaining tokens.
    """
    cut = sent.find(' /')
    if cut != -1:
        sent = sent[:cut]
    sent = sent.lower()
    sent = re.sub(r'\{.', '', sent)
    sent = re.sub(r'\[\[.*\]\]', '', sent)
    # The trailing space makes the splitter below emit the final token.
    sent += ' '
    for markup in '[]()#}"\\/*=+&':
        sent = sent.replace(markup, '')
    for mark in punctuations:
        sent = sent.replace(mark, ' ' + mark)

    # Split on spaces at bracket depth zero so "<...>" spans stay in one token.
    tokens = []
    depth = 0
    token_start = 0
    for pos, ch in enumerate(sent):
        if ch in '[<{(':
            depth += 1
        elif ch in ']>})':
            depth -= 1
        elif ch == ' ' and depth == 0:
            tokens.append(sent[token_start:pos])
            token_start = pos + 1

    return [tok for tok in tokens
            if tok and not tok.startswith('<') and tok != 'typo']

# Collect the distinct tokens produced by split_words over every utterance.
vocab = set()
for dlgid in tqdm(dlgs, desc="Build Vocab List"):
    for utt in dlgs[dlgid]:
        vocab.update(split_words(utt.text))

# Iterating a set of strings yields a different order on every interpreter run
# (string hashing is randomised), so ids and the vocabulary file are derived
# from a sorted copy to keep them reproducible.
sorted_vocab = sorted(vocab)
word_ids = {word: i for i, word in enumerate(sorted_vocab)}

with open(os.path.join(DATA_FOLDER, "vocab", "words_swda_raw.txt"), "w") as f:
    f.write("\n".join(sorted_vocab))

print("Word Count:", len(vocab))
# Show the tokens containing a character other than a-z, punctuation, "-" or "'".
allowed_extra = punctuations + ['-', "'"]
print([word for word in sorted_vocab
       if any(not ('a' <= ch <= 'z' or ch in allowed_extra) for ch in word)])

Build Vocab List: 100%|██████████| 1155/1155 [00:05<00:00, 204.31it/s]

Word Count: 21707
['child_talking', "fianc3ee's", 'appliqu3ed', 'fianc3ee', 'fianc3e', 'blas3e']





In [5]:
# Hold out the first 20 dialog ids for testing; of the remaining ids the first
# 40 form the development set and the rest the training set.
_keys = list(dlgs)
dlgs_test, dlgs_train = _keys[:20], _keys[20:]

dlgs_dev_set, dlgs_train_set = dlgs_train[:40], dlgs_train[40:]

In [6]:
# Group the word-level alignment rows of every "*.text" file by utterance id.
trans_utts = {}  # dialog id -> {utterance id -> dict(words=[...], caller=...)}
tmp = set()      # dialog ids whose first row names another caller than the file name

Utt = namedtuple("Utt", "id, caller, start, end, act_tag, words, trans_words, npy")

# Tokens that are left out of the word lists (checked after '"' and '.' are stripped).
ignored_ls = ['[silence]', '[noise]', '[laughter]', '[vocalized-noise]', '---', '+++', '<e_aside>', '<b_aside>', '-h', '-s']

for transfile in tqdm(list(glob.glob(os.path.join(DATA_FOLDER, "ptree_transcripts", "alignments", "*.text")))):
    file_name = os.path.basename(transfile)
    dlgid = file_name[2:6]
    # Caller letter taken from the file name. It gets its own name so that no
    # inner loop variable can overwrite it: the previous code reused `c` for the
    # character-stripping loop and therefore stored "." as every caller.
    file_caller = file_name[6]

    # exist_ok=True replaces the exists()/mkdir() pair and also creates a missing parent.
    os.makedirs(os.path.join(DATA_FOLDER, "wav", dlgid), exist_ok=True)

    dlg_trans = trans_utts.setdefault(dlgid, {})

    with open(transfile) as f:
        rows = [row.split('\t') for row in f.read().split('\n')]

    # Only rows with exactly seven tab-separated fields are used.
    # NOTE(review): start/end are presumably seconds scaled to 1/100 s units — confirm.
    lines = [dict(
        start=int(float(line[2]) * 100 + 0.05), #start
        end=int(float(line[3]) * 100 + 0.05), #end
        id=int(line[1].split('.')[-1]), #id
        word=line[5].lower(), #word
        caller=line[1].split('.')[0] #caller
    ) for line in rows if len(line) == 7]
    if not lines:
        # Nothing usable in this file; indexing lines[0] would raise.
        continue
    if lines[0]['caller'] != file_caller: tmp.add(dlgid)

    cur, cur_id = None, None
    for line in lines:
        word = line['word']
        for ch in ['"', "."]: word = word.replace(ch, "")
        if word in ignored_ls: continue
        entry = dict(start=line['start'], end=line['end'], word=word)
        if cur is not None and line['id'] == cur_id:
            cur['words'].append(entry)
        else:
            # A new utterance id starts: store the finished one first.
            if cur is not None:
                dlg_trans[cur_id] = cur
            cur = dict(words=[entry], caller=file_caller)
            cur_id = line['id']

    # Store the utterance still open at end of file. When every word was
    # ignored there is nothing to store (previously None was written under
    # whatever id was left over from an earlier file).
    if cur is not None:
        dlg_trans[cur_id] = cur
print(len(trans_utts))

print("----- List of dialog with inconsistent caller id -----")
print(len(tmp))

100%|██████████| 2252/2252 [00:29<00:00, 76.48it/s]

1126
----- List of dialog with inconsistent caller id -----
351





In [7]:
# Keep only the dialogs for which word alignments were loaded into trans_utts.
dlgs = {dlgid: utts for dlgid, utts in dlgs.items() if dlgid in trans_utts}

In [8]:
# Export acoustic features

# Pairs of tokens that find_word accepts as equivalent, in either order, when it
# compares an aligned transcript word with the last word of a DA segment.
# NOTE(review): find_word lower-cases the transcript word before comparing, so the
# upper-case pair ("A", "A's") looks unmatchable — confirm whether it should be lower-case.
equi_pairs = [("that's", "that"), ("n't", "wouldn't"), ("it's", "it"), ("there", "there's"),
             ("i", "i've"), ("you", "you're"), ("i", "i'm"), ("he", "he's"), ("not", "cannot"), ("A", "A's"),
             ("m-'n", "'n"), ("twentys", "twenty's"), ("hinckleys", "hinckley's"), ("your", "you're"), ("it's", "its"),
             ("the'vette", "'vette"), ("i'd", "'d"), ("watch'em", "'em"), ("brother's", "brothers")]

def preproc_pos_words(ls):
    """Normalise a token list so it can be compared with aligned transcript words.

    Tokens are lower-cased; empty tokens and tokens whose first character is
    neither a-z nor an apostrophe are dropped. The survivors are joined with
    spaces, " n't" and " '" are glued onto the preceding token (so "do", "n't"
    becomes "don't"), and the result is split on spaces again.

    Returns a list of strings. When nothing survives the filter the result is
    [''] (splitting an empty string), not an empty list.
    """
    kept = []
    for word in ls:
        word = word.lower()
        if word == '': continue
        first = word[0]
        if not ('a' <= first <= 'z' or first == "'"): continue
        kept.append(word)
    sent = ' '.join(kept)
    for src, tgt in [(" n't", "n't"), (" '", "'")]:
        sent = sent.replace(src, tgt)
    # The former second `return ret` after this line was unreachable and has been removed.
    return sent.split(' ')

def find_word(s, pos_words, pos):
    """Locate the last word of a DA segment in an aligned word list.

    Args:
        s: list of dicts, each with a 'word' entry (the aligned utterance).
        pos_words: non-empty list of the segment's words; the last one is the
            search target. A trailing "mumblex" is replaced by the word before
            it when there is one.
        pos: index in `s` where the target is expected.

    The search probes `pos`, then pos-1 / pos+1, ... up to a distance of 4 and
    returns the first index whose word matches the target exactly, through
    `equi_pairs`, or through one of the loose prefix/suffix rules below.

    Returns the matching index, or None when no probed position matches.
    """
    word = pos_words[-1].replace("''", "")
    # Fall back to the previous word only when there is one; a lone "mumblex"
    # used to raise IndexError here.
    if word == "mumblex" and len(pos_words) > 1: word = pos_words[-2]
    for offset in range(5):
        # Offset 0 names a single position; probing it twice was redundant.
        candidates = [pos] if offset == 0 else [pos - offset, pos + offset]
        for idx in candidates:
            if idx < 0 or idx > len(s) - 1: continue
            w = s[idx]['word'].lower().replace('"', '')
            if w == word: return idx
            if any((w == a and word == b) or (w == b and word == a) for a, b in equi_pairs):
                return idx
            is_prefix = w.startswith(word)
            # Target covers more than half of the transcript word.
            if is_prefix and len(word) * 2 > len(w): return idx
            if word == "n't" and w[-3:] == "n't": return idx
            if word == "'d" and w[-2:] == "'d": return idx
            # endswith() instead of word[-1]: the target may be empty after the
            # "''" removal above, and indexing it raised IndexError.
            if w[-2:] == "'s" and word.endswith("s"): return idx
            # Transcript word is the target followed only by dashes.
            if is_prefix and all(ch == '-' for ch in w[len(word):]): return idx
            if is_prefix and w[-3:] in ["'re", "'ve", "'ll"]: return idx
    return None

# dlg_utts maps a dialog id to a list of dicts, one per dialog-act (DA) segment,
# holding the segment's start/end taken from the aligned transcript words.
dlg_utts = {}

# Dialog with annotation
for dlgid in tqdm(dlgs):
    #if dlgid != '2495': continue
    dlg = dlgs[dlgid]
    # Process segments in transcript order (this sorts the list stored in dlgs in place).
    dlg.sort(key=lambda utt: utt.transcript_index)
    #print([utt.transcript_index for utt in dlg])
    # Dialogs without alignment data are reported and skipped.
    if dlgid not in trans_utts:
        print(dlgid)
        continue
    dlg_utts[dlgid] = []
    i = 0
    while i < len(dlg): # loop through utterance in da
        utt = dlg[i]
        id = utt.utterance_index
        # No aligned words for this utterance index: skip the segment.
        if id not in trans_utts[dlgid]:
            i += 1
            continue
        trans_utt = trans_utts[dlgid][id]['words']
        # Index of the first aligned word not yet assigned to a segment.
        trans_cur_pos = 0
        # Consecutive segments sharing this utterance index consume the aligned
        # words of the same utterance from left to right.
        while utt.utterance_index == id:
            # Hard-coded exception: this one segment is skipped.
            # NOTE(review): reason not visible here; dlg[i] raises IndexError if it is the last segment.
            if dlgid == '3825' and utt.transcript_index == 5: 
                i += 1; utt = dlg[i]; continue
            pos_words = preproc_pos_words(split_words(utt.text))
            # Segments without any usable word end the processing of this utterance index.
            if len(pos_words) == 0: i += 1; break
            if len(pos_words) == 1 and pos_words[0] in ["", "mumblex"]: i += 1; break
            # Expected position of the segment's last word: current position plus segment length - 1.
            last_pos = find_word(trans_utt, pos_words, trans_cur_pos + len(pos_words) - 1)
                
            # Diagnostics for a segment whose last word could not be located.
            # NOTE(review): the final print evaluates `last_pos + 1` with last_pos None and
            # raises TypeError, so execution does not continue past this block.
            if last_pos is None:
                print(dlgid)
                print(utt.caller, utt.utterance_index, utt.transcript_index)
                print('-->', [w['word'] for w in trans_utt])
                print(trans_cur_pos + len(pos_words), "/", len(trans_utt))
                
                # print(dlg[i - 2].transcript_index, dlg[i - 2].act_tag, dlg[i - 2].pos_words())
                print(dlg[i - 1].transcript_index, dlg[i - 1].act_tag, preproc_pos_words(dlg[i - 1].pos_words()))
                print(dlg[i].transcript_index, dlg[i].act_tag, pos_words)
                if i + 1 < len(dlg):
                    print(dlg[i + 1].transcript_index, dlg[i + 1].act_tag, preproc_pos_words(dlg[i + 1].pos_words()))
                print(id, [w['word'] for w in trans_utt[trans_cur_pos:last_pos + 1]])
            
            # Record the segment; its time span runs from the first unconsumed aligned
            # word to the located last word.
            dlg_utts[dlgid].append(dict(
                id=utt.transcript_index,
                utt_id=utt.utterance_index,
                caller=utt.caller,
                start=trans_utt[trans_cur_pos]['start'],#start=trans_utts[dlgid][cur_id]['start'], 
                end=trans_utt[last_pos]['end'],#end=trans_utts[dlgid][cur_id]['end'],
                act_tag=utt.damsl_act_tag(),
                words=pos_words,
                pos_from=trans_cur_pos,
                pos_to=last_pos,
                trans_words=[w['word'] for w in trans_utt[trans_cur_pos:last_pos + 1]],
                #npy=os.path.join(DATA_FOLDER, "features", "npy", dlgid, "%s_%s.npy" % (cur_id, caller))
            ))
            
            # Report segments whose word count differs from the aligned span by more than two.
            # NOTE(review): the [-3] and [-2] lookups raise IndexError while fewer than
            # three segments have been recorded for this dialog.
            if abs(len(pos_words) - (last_pos - trans_cur_pos + 1)) > 2:
                print('-->', [w['word'] for w in trans_utt])
                print("%s\n%s" % (dlg_utts[dlgid][-3]['words'], [w for w in dlg_utts[dlgid][-3]['trans_words']]))
                print("%s\n%s" % (dlg_utts[dlgid][-2]['words'], [w for w in dlg_utts[dlgid][-2]['trans_words']]))
                print("%s %d\n%s" % (dlg_utts[dlgid][-1]['words'], dlg_utts[dlgid][-1]['pos_from'], [w for w in dlg_utts[dlgid][-1]['trans_words']]))
                print(trans_cur_pos)
            
            # The next segment starts right after the word just matched.
            trans_cur_pos = last_pos + 1
            i += 1
            if i < len(dlg): utt = dlg[i]
            else: break
        #print(da_utts[longid].keys())
    #da_utts[dlgid][cur_id] = cur

    #dlg_utts[dlgid].sort(key=lambda utt: utt.id)

# Dialog without annotation

print(len(dlg_utts))

100%|██████████| 1126/1126 [00:11<00:00, 101.75it/s]

1126





In [22]:
import copy
# Merge consecutive DA segments of the same caller into one speaker turn.
# Each turn keeps the first segment's fields, extends 'end' to the last merged
# segment, collects the act tags in a list and appends a "</da_TAG>" marker to
# 'trans_words' after the words of every segment.
dlg_utts_seg = {}
for dlgid in tqdm(dlg_utts.keys()):
    turns = []
    cur = None
    for utt in dlg_utts[dlgid]:
        da_marker = '</da_%s>' % utt['act_tag']
        if cur is not None and cur['caller'] == utt['caller']:
            cur['end'] = utt['end']
            cur['act_tag'].append(utt['act_tag'])
            cur['trans_words'] += utt['trans_words'] + [da_marker]
        else:
            # Caller changed: close the previous turn and open a new one.
            if cur is not None: turns.append(cur)
            # Deep copy so that the lists inside dlg_utts are not modified.
            cur = copy.deepcopy(utt)
            cur['trans_words'].append(da_marker)
            cur['act_tag'] = [cur['act_tag']]
    # Close the turn still open at the end of the dialog. Without this the
    # final turn of every dialog was silently dropped.
    if cur is not None: turns.append(cur)
    dlg_utts_seg[dlgid] = turns

100%|██████████| 1126/1126 [00:03<00:00, 372.87it/s]


In [71]:
from htk import read as read_htk
sil_duration = 25  # maximum number of extra frames kept before and after a turn
PREFIX = "swda_seg_padding25_speaker_norm"

total_frame_num = 0
reverse_caller = ["3061", "2064", "2854", "2968", "2960", "2794", "2954", "2543", "3077"] # these dialogs have incorrect caller annotation


def _padded_extents(utterances, num_frames, pad):
    """Return one (start, end) frame range per turn, widened by up to `pad` frames.

    The start is moved back by `pad` frames but never before frame 0 (first
    turn) or before the midpoint between the previous turn's end and this
    turn's start (other turns). The end is moved forward by `pad` frames but
    never past the midpoint to the next turn's start, or past `num_frames` for
    the last turn; it is never earlier than the turn's own end.
    """
    extents = []
    last = len(utterances) - 1
    end_frame_pre = 0
    for i, utt in enumerate(utterances):
        start_frame, end_frame = utt['start'], utt['end']
        if i == 0:
            start_ext = max(start_frame - pad, 0)
        else:
            start_ext = max(start_frame - pad, (start_frame + end_frame_pre) // 2)
        # Testing "last" independently of "first" also covers a caller with a
        # single turn, which used to raise IndexError on utterances[i + 1].
        if i == last:
            end_ext = max(end_frame, min(end_frame + pad, num_frames))
        else:
            start_frame_next = utterances[i + 1]['start']
            if end_frame > start_frame_next:
                print("Warning: utterances are overlapping.")
            end_ext = max(end_frame, min(end_frame + pad, (start_frame_next + end_frame) // 2))
        end_frame_pre = end_frame
        extents.append((start_ext, end_ext))
    return extents


# For every dialog and caller: cut the padded turns out of the caller's HTK
# feature file, normalise them with the mean/std of all of that caller's cut
# frames, and save one .npy file per turn.
for dlgid in tqdm(dlg_utts_seg):
    out_dir = os.path.join(DATA_FOLDER, "feature", "numpy", PREFIX, dlgid)
    os.makedirs(out_dir, exist_ok=True)
    for caller in ['A', 'B']:
        utterance_dict = [utt for utt in dlg_utts_seg[dlgid] if utt['caller'] == caller]
        if not utterance_dict:
            # Nothing to export; the statistics below would divide by zero.
            continue
        # Dialogs listed in reverse_caller read the other caller's feature file.
        audio_caller = caller if dlgid not in reverse_caller else ("A" if caller == "B" else "B")
        audio_path = os.path.join(DATA_FOLDER, "htk", "swbd", "sw0%s-%s.htk" % (dlgid, audio_caller))

        input_data, _, _ = read_htk(audio_path)
        feature_dim = input_data.shape[1]
        segments = [input_data[lo:hi]
                    for lo, hi in _padded_extents(utterance_dict, input_data.shape[0], sil_duration)]

        # Frame count is taken once, from the rows actually sliced. It used to
        # keep growing in every pass over the turns, so the standard deviation
        # was divided by roughly 2N - 1 instead of N - 1.
        total_frame_num = sum(len(seg) for seg in segments)
        if total_frame_num < 2:
            print("Warning: too few frames to normalise %s-%s" % (dlgid, caller))
            continue

        # Pass 1: per-dimension mean over all cut frames.
        input_data_utt_sum = np.zeros((feature_dim,), dtype=np.float32)
        for seg in segments:
            input_data_utt_sum += np.sum(seg, axis=0)
        global_mean = input_data_utt_sum / total_frame_num

        # Pass 2: per-dimension sample standard deviation around that mean.
        input_data_utt_std = np.zeros((feature_dim,), dtype=np.float32)
        for seg in segments:
            input_data_utt_std += np.sum(np.abs(seg - global_mean) ** 2, axis=0)
        global_std = np.sqrt(input_data_utt_std / (total_frame_num - 1))

        # Pass 3: save every turn normalised with the caller-level statistics.
        for utt, seg in zip(utterance_dict, segments):
            np.save(os.path.join(out_dir, "%s%s.npy" % (utt['id'], caller)), (seg - global_mean) / global_std)

100%|██████████| 1126/1126 [23:27<00:00,  1.25s/it]


In [26]:
# Load the base vocabulary: one word per line, the line number is the word id.
# The file is opened in a `with` block so the handle is closed again.
with open(os.path.join(DATA_FOLDER, "vocab", "words_swda_full_old.txt")) as f:
    words_full = {word: i for i, word in enumerate(f.read().split('\n'))}
# Append one "</da_TAG>" entry per dialog-act tag after the base vocabulary.
# NOTE(review): `l` is the number of distinct lines; duplicate lines in the file
# would make the new ids collide with existing ones — confirm the file has none.
l = len(words_full)
for da in da_tagids.keys(): words_full['</da_%s>' % da] = l + da_tagids[da]
with open(os.path.join(DATA_FOLDER, "vocab", "words_swda_full_old_da.txt"), 'w') as f:
    f.write('\n'.join(words_full))

# for speech recogniton
PREFIX = "swda_seg_da_full_vocab_old"

dlgs_dev_set = dlgs_train[:40]
dlgs_train_set = dlgs_train[40:]
headers = ['dialog_id', 'sound', 'start', 'end', 'sound_len', 'caller', 'dialog_act', 'text', 'target', 'predicted_text']
split_dialogs = {"train": dlgs_train_set, "test": dlgs_test, "dev": dlgs_dev_set}
# Write one tab-separated file per split with one row per speaker turn.
for mode in ["train", "test", "dev"]:
    with open(os.path.join(DATA_FOLDER, '%s_split20_%s.csv' % (PREFIX, mode)), 'w') as fo:
        fo.write('\t'.join(headers) + '\n')
        for dlgid in tqdm(split_dialogs[mode], desc=mode):
            # Dialogs without merged turns (no alignment data) are skipped.
            if dlgid not in dlg_utts_seg: continue
            for utt in dlg_utts_seg[dlgid]:
                if len(utt['trans_words']) == 0: print(dlgid)
                # Skip turns spanning no more than five frames.
                if utt['start'] >= utt['end'] - 5: continue
                # Normalise each word once; unknown words map to id 0.
                tokens = [word.lower().replace('-', '') for word in utt['trans_words']]
                target = ' '.join(str(words_full.get(token, 0)) for token in tokens)
                fo.write('\t'.join([
                    dlgid,
                    os.path.join(DATA_FOLDER, "feature", "numpy", "swda_seg_padding25_speaker_norm", dlgid, "%d%s.npy" % (utt['id'], utt['caller'])),
                    str(utt['start']), str(utt['end']),
                    str(utt['end'] - utt['start']),
                    utt['caller'],
                    ','.join(str(da_tagids[tag]) for tag in utt['act_tag']),
                    ' '.join(tokens),
                    target,  # 'target' column
                    target   # 'predicted_text' column carries the same ids
                ]) + '\n')
#print(([word for word in oov if '-' not in word]))

train: 100%|██████████| 1095/1095 [00:03<00:00, 354.11it/s]
test: 100%|██████████| 20/20 [00:00<00:00, 243.63it/s]
dev: 100%|██████████| 40/40 [00:00<00:00, 268.75it/s]


In [28]:
# Report the number of entries currently held in the words_full mapping.
print(len(words_full))

27328


In [38]:
# Merge consecutive DA segments of the same caller into one speaker turn and
# build 'da_tag_seq', a per-word label sequence: the last word of every DA
# segment carries that segment's tag id, every other word carries 43.
# 43 is the first id not used by da_tagids for the 43 tags counted above (ids 0..42).
dlg_utts_seg = {}
for dlgid in tqdm(dlg_utts.keys()):
    turns = []
    cur = None
    for utt in dlg_utts[dlgid]:
        # NOTE(review): a segment without words still yields one label, which
        # breaks the length check made by the export cells — confirm none exist.
        tag_seq = [43] * (len(utt['trans_words']) - 1) + [da_tagids[utt['act_tag']]]
        if cur is not None and cur['caller'] == utt['caller']:
            cur['end'] = utt['end']
            cur['act_tag'].append(utt['act_tag'])
            cur['trans_words'] += utt['trans_words']
            cur['da_tag_seq'] += tag_seq
        else:
            # Caller changed: close the previous turn and open a new one.
            if cur is not None: turns.append(cur)
            # Deep copy so that the lists inside dlg_utts are not modified.
            cur = copy.deepcopy(utt)
            cur['da_tag_seq'] = tag_seq
            cur['act_tag'] = [cur['act_tag']]
    # Close the turn still open at the end of the dialog. Without this the
    # final turn of every dialog was silently dropped.
    if cur is not None: turns.append(cur)
    dlg_utts_seg[dlgid] = turns

100%|██████████| 1126/1126 [00:03<00:00, 314.70it/s]


In [40]:
# Load the base vocabulary: one word per line, the line number is the word id.
# The file is opened in a `with` block so the handle is closed again.
with open(os.path.join(DATA_FOLDER, "vocab", "words_swda_full_old.txt")) as f:
    words_full = {word: i for i, word in enumerate(f.read().split('\n'))}

# for speech recogniton
PREFIX = "swda_seg_da_seq_full_vocab_old"

dlgs_dev_set = dlgs_train[:40]
dlgs_train_set = dlgs_train[40:]
headers = ['dialog_id', 'sound', 'start', 'end', 'sound_len', 'caller', 'dialog_act', 'text', 'target', 'predicted_text']
split_dialogs = {"train": dlgs_train_set, "test": dlgs_test, "dev": dlgs_dev_set}
# Write one tab-separated file per split with one row per speaker turn; the
# 'dialog_act' column holds the per-word label sequence of the turn.
for mode in ["train", "test", "dev"]:
    with open(os.path.join(DATA_FOLDER, '%s_split20_%s.csv' % (PREFIX, mode)), 'w') as fo:
        fo.write('\t'.join(headers) + '\n')
        for dlgid in tqdm(split_dialogs[mode], desc=mode):
            # Dialogs without merged turns (no alignment data) are skipped.
            if dlgid not in dlg_utts_seg: continue
            for utt in dlg_utts_seg[dlgid]:
                # Every word must have exactly one label.
                assert len(utt['da_tag_seq']) == len(utt['trans_words'])
                if len(utt['trans_words']) == 0: print(dlgid)
                # Skip turns spanning no more than five frames.
                if utt['start'] >= utt['end'] - 5: continue
                # Normalise each word once; unknown words map to id 0.
                tokens = [word.lower().replace('-', '') for word in utt['trans_words']]
                target = ' '.join(str(words_full.get(token, 0)) for token in tokens)
                fo.write('\t'.join([
                    dlgid,
                    os.path.join(DATA_FOLDER, "feature", "numpy", "swda_seg_padding25_speaker_norm", dlgid, "%d%s.npy" % (utt['id'], utt['caller'])),
                    str(utt['start']), str(utt['end']),
                    str(utt['end'] - utt['start']),
                    utt['caller'],
                    ','.join(str(tag) for tag in utt['da_tag_seq']),
                    ' '.join(tokens),
                    target,  # 'target' column
                    target   # 'predicted_text' column carries the same ids
                ]) + '\n')
#print(([word for word in oov if '-' not in word]))

train: 100%|██████████| 1095/1095 [00:03<00:00, 350.08it/s]
test: 100%|██████████| 20/20 [00:00<00:00, 265.20it/s]
dev: 100%|██████████| 40/40 [00:00<00:00, 266.14it/s]


In [51]:
# Load the base vocabulary: one word per line, the line number is the word id.
# The file is opened in a `with` block so the handle is closed again.
with open(os.path.join(DATA_FOLDER, "vocab", "words_swda_full_old.txt")) as f:
    words_full = {word: i for i, word in enumerate(f.read().split('\n'))}

# for speech recogniton
PREFIX = "swda_seg_only_da_seq_full_vocab_old"

# Label that 'da_tag_seq' gives to words that do not end a DA segment. The
# sequences are built with 43 as this filler; the previous comparison against
# 44 never matched, so every word was written as a boundary (2).
NO_BOUNDARY_TAG = 43

dlgs_dev_set = dlgs_train[:40]
dlgs_train_set = dlgs_train[40:]
headers = ['dialog_id', 'sound', 'start', 'end', 'sound_len', 'caller', 'dialog_act', 'text', 'target', 'predicted_text']
split_dialogs = {"train": dlgs_train_set, "test": dlgs_test, "dev": dlgs_dev_set}
# Write one tab-separated file per split with one row per speaker turn; the
# 'dialog_act' column holds 1 for a word inside a DA segment and 2 for the
# last word of a DA segment.
for mode in ["train", "test", "dev"]:
    with open(os.path.join(DATA_FOLDER, '%s_split20_%s.csv' % (PREFIX, mode)), 'w') as fo:
        fo.write('\t'.join(headers) + '\n')
        for dlgid in tqdm(split_dialogs[mode], desc=mode):
            # Dialogs without merged turns (no alignment data) are skipped.
            if dlgid not in dlg_utts_seg: continue
            for utt in dlg_utts_seg[dlgid]:
                # Every word must have exactly one label.
                assert len(utt['da_tag_seq']) == len(utt['trans_words'])
                if len(utt['trans_words']) == 0: print(dlgid)
                # Skip turns spanning no more than five frames.
                if utt['start'] >= utt['end'] - 5: continue
                # Normalise each word once; unknown words map to id 0.
                tokens = [word.lower().replace('-', '') for word in utt['trans_words']]
                target = ' '.join(str(words_full.get(token, 0)) for token in tokens)
                fo.write('\t'.join([
                    dlgid,
                    os.path.join(DATA_FOLDER, "feature", "numpy", "swda_seg_padding25_speaker_norm", dlgid, "%d%s.npy" % (utt['id'], utt['caller'])),
                    str(utt['start']), str(utt['end']),
                    str(utt['end'] - utt['start']),
                    utt['caller'],
                    ','.join(str(1 if tag == NO_BOUNDARY_TAG else 2) for tag in utt['da_tag_seq']),
                    ' '.join(tokens),
                    target,  # 'target' column
                    target   # 'predicted_text' column carries the same ids
                ]) + '\n')
#print(([word for word in oov if '-' not in word]))

train: 100%|██████████| 1095/1095 [00:03<00:00, 346.61it/s]
test: 100%|██████████| 20/20 [00:00<00:00, 261.52it/s]
dev: 100%|██████████| 40/40 [00:00<00:00, 256.20it/s]


In [53]:
# Load the base vocabulary: one word per line, the line number is the word id.
# The file is opened in a `with` block so the handle is closed again.
with open(os.path.join(DATA_FOLDER, "vocab", "words_swda_full_old.txt")) as f:
    words_full = {word: i for i, word in enumerate(f.read().split('\n'))}

# for speech recogniton
PREFIX = "swda_seg_da_seq_insert_bound_full_vocab_old"

# Label that 'da_tag_seq' gives to words that do not end a DA segment. The
# sequences are built with 43 as this filler; the previous comparison against
# 44 never matched, so a boundary id was inserted after every single word.
NO_BOUNDARY_TAG = 43
# Id written after the last word of each DA segment.
# NOTE(review): presumably the vocabulary id reserved for a DA-boundary token —
# confirm against the vocabulary file.
BOUNDARY_WORD_ID = '27287'

dlgs_dev_set = dlgs_train[:40]
dlgs_train_set = dlgs_train[40:]
headers = ['dialog_id', 'sound', 'start', 'end', 'sound_len', 'caller', 'dialog_act', 'text', 'target', 'predicted_text']
split_dialogs = {"train": dlgs_train_set, "test": dlgs_test, "dev": dlgs_dev_set}
# Write one tab-separated file per split with one row per speaker turn; the
# target id sequence gets a boundary id inserted after every DA segment.
for mode in ["train", "test", "dev"]:
    with open(os.path.join(DATA_FOLDER, '%s_split20_%s.csv' % (PREFIX, mode)), 'w') as fo:
        fo.write('\t'.join(headers) + '\n')
        for dlgid in tqdm(split_dialogs[mode], desc=mode):
            # Dialogs without merged turns (no alignment data) are skipped.
            if dlgid not in dlg_utts_seg: continue
            for utt in dlg_utts_seg[dlgid]:
                # Every word must have exactly one label.
                assert len(utt['da_tag_seq']) == len(utt['trans_words'])
                if len(utt['trans_words']) == 0: print(dlgid)
                # Skip turns spanning no more than five frames.
                if utt['start'] >= utt['end'] - 5: continue

                # Normalise each word once; unknown words map to id 0.
                tokens = [word.lower().replace('-', '') for word in utt['trans_words']]
                trans = []
                for token, tag in zip(tokens, utt['da_tag_seq']):
                    trans.append(str(words_full.get(token, 0)))
                    if tag != NO_BOUNDARY_TAG: trans.append(BOUNDARY_WORD_ID)
                target = ' '.join(trans)
                fo.write('\t'.join([
                    dlgid,
                    os.path.join(DATA_FOLDER, "feature", "numpy", "swda_seg_padding25_speaker_norm", dlgid, "%d%s.npy" % (utt['id'], utt['caller'])),
                    str(utt['start']), str(utt['end']),
                    str(utt['end'] - utt['start']),
                    utt['caller'],
                    ','.join(str(da_tagids[tag]) for tag in utt['act_tag']),
                    ' '.join(tokens),
                    target,  # 'target' column
                    target   # 'predicted_text' column carries the same ids
                ]) + '\n')
#print(([word for word in oov if '-' not in word]))

train: 100%|██████████| 1095/1095 [00:12<00:00, 89.92it/s]
test: 100%|██████████| 20/20 [00:00<00:00, 46.19it/s]
dev: 100%|██████████| 40/40 [00:00<00:00, 92.62it/s]
