In [1]:
from swda import Transcript
import glob, os
import numpy as np
from tqdm import tqdm as tqdm
from collections import namedtuple

DATA_FOLDER = '/n/sd7/trung/csp/data/swbd'
from pydub import AudioSegment

### Load data from swda

In [2]:
dlgs = {}
for file in tqdm(glob.glob(os.path.join(DATA_FOLDER, "swda", '**/*.csv')), desc="Load Dialogs"):
    trans = Transcript(file, os.path.join(DATA_FOLDER, "swda", 'swda-metadata.csv'))
    dlgid = os.path.basename(file).split('_')[2].split('.')[0]
    dlgs[dlgid] = list(trans.utterances)
    
print("Dialog Count:", len(dlgs))

Load Dialogs: 100%|██████████| 1155/1155 [04:53<00:00,  3.94it/s]

Dialog Count: 1155





In [3]:
# process act tag
da_tags = {}
for dlgid in dlgs:
    dlg = dlgs[dlgid]
    for utt in dlg:
        tag = utt.damsl_act_tag()
        
        if tag in da_tags: da_tags[tag] += 1
        else: da_tags[tag] = 1
    
print(len(da_tags))
da_tagids = { da: i for i, da in enumerate(list(da_tags.keys())) }
print(da_tagids)

43
{'fo_o_fw_"_by_bc': 0, 'qw': 1, 'h': 2, 'sd': 3, 'sv': 4, 'b': 5, 'x': 6, '%': 7, '+': 8, 'qy': 9, 'qrr': 10, 'na': 11, 'bk': 12, 'ba': 13, 'ny': 14, '^q': 15, 'aa': 16, 'nn': 17, 'fc': 18, 'ad': 19, 'qo': 20, 'qh': 21, 'no': 22, 'ng': 23, '^2': 24, 'bh': 25, 'qy^d': 26, 'br': 27, 'b^m': 28, '^h': 29, 'bf': 30, 'fa': 31, 'oo_co_cc': 32, 'ar': 33, 'bd': 34, 't1': 35, 'arp_nd': 36, 't3': 37, 'ft': 38, '^g': 39, 'qw^d': 40, 'fp': 41, 'aap_am': 42}


In [4]:
_keys = list(dlgs.keys())
dlgs_train = _keys[20:]
dlgs_test = _keys[:20]

In [39]:
vocab = set()
for dlgid in tqdm(dlgs, desc="Build Vocab List"):
    for utt in dlgs[dlgid]:
        for word in utt.pos_words(): vocab.add(word.lower())

word_ids = { word: i for i, word in enumerate(list(vocab)) }

with open(os.path.join(DATA_FOLDER, "vocab", "words_swda_raw.txt"), "w") as f:
    f.write("\n".join(list(vocab)))
    
for mode in ["train", "test", "dev"]:
    _dlgs = dlgs_test if mode == "test" else (dlgs_train[40:] if mode == "train" else dlgs_train[:40])
    with open(os.path.join(DATA_FOLDER, "swda_raw_%s.txt" % mode), "w") as f:
        f.write('\t'.join(["dialog_id", "sound", "caller", "dialog_act", "text", "predicted_text", "target"]) + '\n')
        for dlgid in tqdm(_dlgs, desc="Build Dataset (%s)" % mode):
            for utt in dlgs[dlgid]:
                if len(utt.pos_words()) == 0: continue
                f.write('\t'.join([
                    dlgid, 
                    "none",
                    utt.caller, 
                    str(da_tagids[utt.damsl_act_tag()]), 
                    ' '.join(utt.pos_words()), 
                    ' '.join([str(word_ids[word.lower()]) for word in utt.pos_words()]),
                    ' '.join([str(word_ids[word.lower()]) for word in utt.pos_words()])
                ]))
                f.write('\n')

print("Word Count:", len(vocab))

Build Vocab List: 100%|██████████| 1155/1155 [00:02<00:00, 443.60it/s]
Build Dataset (train): 100%|██████████| 1095/1095 [00:10<00:00, 100.25it/s]
Build Dataset (test): 100%|██████████| 20/20 [00:00<00:00, 92.52it/s]
Build Dataset (dev): 100%|██████████| 40/40 [00:00<00:00, 90.58it/s]

Word Count: 20887





### Load transcript (ms98 Penn Treebank)

In [29]:
trans_utts = {}

Utt = namedtuple("Utt", "id, caller, start, end, act_tag, words, trans_words, npy")

for transfile in tqdm(list(glob.glob(os.path.join(DATA_FOLDER, "ptree_transcripts", "alignments", "*.text")))):
    dlgid = os.path.basename(transfile)[2:6]
    c = os.path.basename(transfile)[6]
    
    for t in [0]:
        if not os.path.exists(os.path.join(DATA_FOLDER, "wav", dlgid)):
            os.mkdir(os.path.join(DATA_FOLDER, "wav", dlgid))
        #transfile = os.path.join(DATA_FOLDER, "ptree_transcripts/alignments/sw%s%s-ms98-a-penn.text" % (dlgid, c))
        #if not os.path.exists(transfile): 
        #    print("(not existed: %s%s)" % (dlgid, c), end=' ')
        #    continue
            
        if dlgid not in trans_utts: trans_utts[dlgid] = {}
            
        with open(transfile) as f:
            lines = f.read().split('\n')
            lines = [line.split('\t') for line in lines]
            lines = [dict(
                start=int(float(line[2]) * 100 + 0.05), #start
                end=int(float(line[3]) * 100 + 0.05), #end
                id=int(line[1].split('.')[-1]), #id
                word=line[5].lower(), #word
                caller=line[1].split('.')[0] #caller 
            ) for line in lines if len(line) == 7]
            
            cur = None
            i = 0
            ignored_ls = ['[silence]', '[noise]', '[laughter]', '[vocalized-noise]', '---', '+++', '<e_aside>', '<b_aside>', '-h', '-s']
            while i < len(lines):
                line = lines[i]
                if line['word'] in ignored_ls: pass
                elif cur is not None and line['id'] == id:
                    cur['words'].append(dict(start=line['start'], end=line['end'], word=line['word']))
                else:
                    if cur is not None:
                        trans_utts[dlgid][id] = cur
                    cur = dict(words=[dict(start=line['start'], end=line['end'], word=line['word'])], caller=c)
                    id = line['id']
                i += 1
                
            trans_utts[dlgid][id] = cur # (start, end, id, text)
print(len(trans_utts))

100%|██████████| 2252/2252 [00:09<00:00, 237.75it/s]

1126





### Align transcript with swda

In [32]:
# Export acoustic features

equi_pairs = [("that's", "that"), ("n't", "wouldn't"), ("it's", "it"), ("there", "there's"),
             ("i", "i've"), ("you", "you're"), ("i", "i'm"), ("he", "he's"), ("not", "cannot"), ("A", "A's"),
             ("m-'n", "'n"), ("twentys", "twenty's"), ("hinckleys", "hinckley's"), ("your", "you're"), ("it's", "its"),
             ("the'vette", "'vette"), ("i'd", "'d"), ("watch'em", "'em"), ("brother's", "brothers")]

def preproc_pos_words(ls):
    ret = []
    for word in ls:
        word = word.lower()
        if word in ['']: continue
        if ('a' > word[0] or word[0] > 'z') and word[0] not in list("'"): continue
        ret.append(word)
    sent = ' '.join(ret)
    rpl = [(" n't", "n't"), (" '", "'")]
    for src, tgt in rpl: sent = sent.replace(src, tgt)
    return sent.split(' ')
    return ret

def find_word(s, pos_words, pos):
    word = pos_words[-1]
    word = word.replace("''", "")
    if word == "mumblex": word = pos_words[-2]
    for i in range(5):
        for id in [pos - i, pos + i]:
            if id < 0 or id > len(s) - 1: continue
            w = s[id]['word'].lower()
            w = ''.join([c for c in w if c not in list('"')])
            # print(w, word)
            if w == word: return id
            for s1, s2 in equi_pairs:
                if w == s1 and word == s2 or w == s2 and word == s1:
                    return id
            if w.startswith(word) and len(word) * 2 > len(w): return id
            if word == "n't" and w[-3:] == "n't": return id
            if word == "'d" and w[-2:] == "'d": return id
            if w[-2:] == "'s" and word[-1] == "s": return id
            if w.startswith(word) and all(w[i] == '-' for i in range(len(word), len(w))): return id
            if w.startswith(word) and w[-3:] in ["'re", "'ve", "'ll"]: return id

dlg_utts = {}

# Dialog with annotation
for dlgid in tqdm(dlgs):
    dlg = dlgs[dlgid]
    dlg.sort(key=lambda utt: utt.transcript_index)
    #print([utt.transcript_index for utt in dlg])
    if dlgid not in trans_utts: continue
    dlg_utts[dlgid] = []
    i = 0
    while i < len(dlg): # loop through utterance in da
        utt = dlg[i]
        id = utt.utterance_index
        if id not in trans_utts[dlgid]:
            i += 1
            continue
        trans_utt = trans_utts[dlgid][id]['words']
        trans_cur_pos = 0
        while utt.utterance_index == id:
            pos_words = preproc_pos_words(utt.pos_words())
            if len(pos_words) == 0: i += 1; break
            if len(pos_words) == 1 and pos_words[0] in ["", "mumblex"]: i += 1; break
            last_pos = find_word(trans_utt, pos_words, trans_cur_pos + len(pos_words) - 1)
            if last_pos is None:
                print(utt.caller, utt.utterance_index, utt.transcript_index)
                print('-->', [w['word'] for w in trans_utt])
                print(trans_cur_pos + len(pos_words), "/", len(trans_utt))
                # print(dlg[i - 2].transcript_index, dlg[i - 2].act_tag, dlg[i - 2].pos_words())
                print(dlg[i - 1].transcript_index, dlg[i - 1].act_tag, preproc_pos_words(dlg[i - 1].pos_words()))
                print(dlg[i].transcript_index, dlg[i].act_tag, pos_words)
                if i + 1 < len(dlg):
                    print(dlg[i + 1].transcript_index, dlg[i + 1].act_tag, preproc_pos_words(dlg[i + 1].pos_words()))
                print(id, [w['word'] for w in trans_utt[trans_cur_pos:last_pos + 1]])
            
            dlg_utts[dlgid].append(dict(
                id=utt.transcript_index,
                utt_id=utt.utterance_index,
                caller=utt.caller,
                start=trans_utt[trans_cur_pos]['start'],#start=trans_utts[dlgid][cur_id]['start'], 
                end=trans_utt[last_pos]['end'],#end=trans_utts[dlgid][cur_id]['end'],
                act_tag=utt.damsl_act_tag(),
                words=pos_words,
                pos_from=trans_cur_pos,
                pos_to=last_pos,
                trans_words=[w['word'] for w in trans_utt[trans_cur_pos:last_pos + 1]],
                #npy=os.path.join(DATA_FOLDER, "features", "npy", dlgid, "%s_%s.npy" % (cur_id, caller))
            ))
            
            if abs(len(pos_words) - (last_pos - trans_cur_pos + 1)) > 2:
                print('-->', [w['word'] for w in trans_utt])
                print("%s\n%s" % (dlg_utts[dlgid][-3]['words'], [w for w in dlg_utts[dlgid][-3]['trans_words']]))
                print("%s\n%s" % (dlg_utts[dlgid][-2]['words'], [w for w in dlg_utts[dlgid][-2]['trans_words']]))
                print("%s %d\n%s" % (dlg_utts[dlgid][-1]['words'], dlg_utts[dlgid][-1]['pos_from'], [w for w in dlg_utts[dlgid][-1]['trans_words']]))
                print(trans_cur_pos)
            
            trans_cur_pos = last_pos + 1
            i += 1
            if i < len(dlg): utt = dlg[i]
            else: break
        #print(da_utts[longid].keys())
    #da_utts[dlgid][cur_id] = cur

    #dlg_utts[dlgid].sort(key=lambda utt: utt.id)

# Dialog without annotation

print(len(dlg_utts))

100%|██████████| 1155/1155 [00:07<00:00, 149.45it/s]

1126





### global padding

In [None]:
#PREFIX = "swda"
PREFIX = "swda_padding15"
INPUT_MEAN_PATH = os.path.join(DATA_FOLDER, "%s_mean.npy" % PREFIX)
INPUT_STD_PATH = os.path.join(DATA_FOLDER, "%s_std.npy" % PREFIX)

global_mean = np.load(INPUT_MEAN_PATH) if os.path.exists(INPUT_MEAN_PATH) else None
global_std = np.load(INPUT_STD_PATH) if os.path.exists(INPUT_STD_PATH) else None
normalize = 'speaker'
global_mean = None
global_std = None
print(global_mean, global_std)

In [None]:
from htk import read as read_htk
sil_duration = 15

total_frame_num = 0
input_data_utt_sum = np.zeros((feature_dim,), dtype=np.float32)
for dlgid in tqdm(dlg_utts):
    #print(dlgid)
    if not os.path.exists(os.path.join(DATA_FOLDER, "feature", "numpy", PREFIX, dlgid)):
        os.mkdir(os.path.join(DATA_FOLDER, "feature", "numpy", PREFIX, dlgid))
    for caller in ['A', 'B']:
        utterance_dict = list(filter(lambda utt: utt['caller'] == caller, dlg_utts[dlgid]))
        audio_path = os.path.join(DATA_FOLDER, "htk", "swbd", "sw0%s-%s.htk" % (dlgid, caller))
        input_data, _, _ = read_htk(audio_path)
        feature_dim = input_data.shape[1]
        input_data_dict = {}
        total_frame_num_file = 0
        end_frame_pre = 0
        mean = None
        input_data_utt_std = np.zeros((feature_dim,), dtype=np.float32)
        
        for i, utt in enumerate(utterance_dict):
            start_frame, end_frame = utt['start'], utt['end']
            if i == 0:
                start_frame_extend = max(start_frame - sil_duration, 0)
                start_frame_next = utterance_dict[i + 1]['start']
                end_frame_extend = max(end_frame, min(end_frame + sil_duration, (start_frame_next + end_frame) // 2))
            elif i == len(utterance_dict) - 1:
                start_frame_extend = max(start_frame - sil_duration, (start_frame + end_frame_pre) // 2)
                end_frame_extend = max(end_frame, min(end_frame + sil_duration, input_data.shape[0]))
            else:
                start_frame_extend = max(start_frame - sil_duration, (start_frame + end_frame_pre) // 2)
                start_frame_next = utterance_dict[i + 1]['start']
                if end_frame > start_frame_next:
                    print("Warning: utterances are overlapping.")
                end_frame_extend = max(end_frame, min(end_frame + sil_duration, (start_frame_next + end_frame) // 2))
                end_frame_pre = end_frame
            
            #print(end_frame_extend - start_frame_extend, end=" ")
            #start_frame_extend, end_frame_extend = start_frame, end_frame
            #print(end_frame_extend - start_frame_extend, end_frame - start_frame)
            input_data_utt = input_data[start_frame_extend:end_frame_extend]
            input_data_utt_sum += np.sum(input_data_utt, axis=0)
            if global_mean is not None:
                if global_std is None:
                    input_data_utt_std += np.sum(np.abs(input_data_utt - global_mean) ** 2, axis=0)
                else: # save
                    input_utt = (input_data_utt - global_mean) / global_std
                    np.save(os.path.join(DATA_FOLDER, "feature", "numpy", PREFIX, dlgid, "%s%s.npy" % (utt['id'], caller)), input_utt)
            #total_frame_num_file += end_frame_extend - start_frame_extend
            input_data_dict[utt['id']] = input_data_utt
            total_frame_num += end_frame_extend - start_frame_extend

In [None]:
if global_mean is not None:
    if global_std is None:
        global_std = np.sqrt(input_data_utt_std / (total_frame_num - 1))
        np.save(INPUT_STD_PATH, global_std)
        print("global_std", global_std)
else:
    global_mean = input_data_utt_sum / total_frame_num
    np.save(INPUT_MEAN_PATH, global_mean)
    print("global_mean", global_mean)

In [40]:
# for speech recogniton
PREFIX = "swda_padding25_speaker_norm"
words = open(os.path.join(DATA_FOLDER, "vocab", "words_swda.txt")).read().split('\n')
words = {word: i for i, word in enumerate(words)}

if True: # build words
    vocab_freq = {}
    for dlgid in tqdm(dlgs_train):
        if dlgid not in dlg_utts: continue
        for utt in dlg_utts[dlgid]:
            for word in utt['trans_words']:
                word = word.lower()
                if word == '': continue
                if not 'a' <= word[0] <= 'z': continue
                if word in vocab_freq: vocab_freq[word] += 1
                else: vocab_freq[word] = 1

    words = list(vocab_freq.keys())
    words.sort(key=lambda word: vocab_freq[word], reverse=True)
    words = words[:-1]
    words = ["<oov>"] + words
    print("Vocab Size:", len(words))
    with open(os.path.join(DATA_FOLDER, "vocab", "words_20.txt"), 'w') as f:
        f.write('\n'.join(words))
    words = { word: i for i, word in enumerate(words) }

headers = ['dialog_id', 'sound', 'sound_len', 'caller', 'dialog_act', 'text', 'target', 'predicted_text']
for mode in ["train", "test", "dev"]:
    with open(os.path.join(DATA_FOLDER, 'inputs_%s_split20_%s.txt' % (PREFIX, mode)), 'w') as fo:
        fo.write('\t'.join(headers) + '\n')
        for dlgid in tqdm(dlgs_test if mode == "test" else (dlgs_train[40:] if mode == "train" else dlgs_train[:40])):
            if dlgid not in dlg_utts: continue
            for utt in dlg_utts[dlgid]:
                if len(utt['trans_words']) == 0: continue
                if utt['start'] >= utt['end'] - 5: continue
                fo.write('%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' % 
                    (dlgid,
                    os.path.join(DATA_FOLDER, "feature", "numpy", PREFIX, dlgid, "%d%s.npy" % (utt['id'], utt['caller'])), 
                    utt['end'] - utt['start'],
                    utt['caller'],
                    #utt['id'], 
                    #utt['end'] - utt['start'],
                    da_tagids[utt['act_tag']],
                    ' '.join([word.lower() for word in utt['trans_words']]),
                    ' '.join([str(words[word.lower()]) if word.lower() in words else '0' for word in utt['trans_words']]),
                    ' '.join([str(words[word.lower()]) if word.lower() in words else '0' for word in utt['trans_words']]),
                    ))

100%|██████████| 1135/1135 [00:00<00:00, 1400.83it/s]
  3%|▎         | 34/1095 [00:00<00:03, 339.77it/s]

Vocab Size: 21314


100%|██████████| 1095/1095 [00:02<00:00, 418.76it/s]
100%|██████████| 20/20 [00:00<00:00, 328.77it/s]
100%|██████████| 40/40 [00:00<00:00, 328.37it/s]


### speaker padding

In [None]:
from htk import read as read_htk
sil_duration = 25
PREFIX = "swda_padding25_speaker_norm"

total_frame_num = 0
for dlgid in tqdm(dlg_utts):
    #print(dlgid)
    if not os.path.exists(os.path.join(DATA_FOLDER, "feature", "numpy", PREFIX, dlgid)):
        os.mkdir(os.path.join(DATA_FOLDER, "feature", "numpy", PREFIX, dlgid))
    for caller in ['A', 'B']:
        utterance_dict = list(filter(lambda utt: utt['caller'] == caller, dlg_utts[dlgid]))
        audio_path = os.path.join(DATA_FOLDER, "htk", "swbd", "sw0%s-%s.htk" % (dlgid, caller))
        input_data, _, _ = read_htk(audio_path)
        feature_dim = input_data.shape[1]
        input_data_dict = {}
        total_frame_num = 0
        end_frame_pre = 0
        global_mean = None
        global_std = None
        input_data_utt_std = np.zeros((feature_dim,), dtype=np.float32)
        input_data_utt_sum = np.zeros((feature_dim,), dtype=np.float32)
        
        for k in range(3):
            for i, utt in enumerate(utterance_dict):
                start_frame, end_frame = utt['start'], utt['end']
                if i == 0:
                    start_frame_extend = max(start_frame - sil_duration, 0)
                    start_frame_next = utterance_dict[i + 1]['start']
                    end_frame_extend = max(end_frame, min(end_frame + sil_duration, (start_frame_next + end_frame) // 2))
                    end_frame_pre = end_frame
                elif i == len(utterance_dict) - 1:
                    start_frame_extend = max(start_frame - sil_duration, (start_frame + end_frame_pre) // 2)
                    end_frame_extend = max(end_frame, min(end_frame + sil_duration, input_data.shape[0]))
                else:
                    start_frame_extend = max(start_frame - sil_duration, (start_frame + end_frame_pre) // 2)
                    start_frame_next = utterance_dict[i + 1]['start']
                    if end_frame > start_frame_next:
                        print("Warning: utterances are overlapping.")
                    end_frame_extend = max(end_frame, min(end_frame + sil_duration, (start_frame_next + end_frame) // 2))
                    end_frame_pre = end_frame
            
                #print(end_frame_extend - start_frame_extend, end=" ")
                #start_frame_extend, end_frame_extend = start_frame, end_frame
                #print(end_frame_extend - start_frame_extend, end_frame - start_frame)
                input_data_utt = input_data[start_frame_extend:end_frame_extend]
                input_data_utt_sum += np.sum(input_data_utt, axis=0)
        
                if global_mean is not None:
                    if global_std is None:
                        input_data_utt_std += np.sum(np.abs(input_data_utt - global_mean) ** 2, axis=0)
                    else: # save
                        input_utt = (input_data_utt - global_mean) / global_std
                        np.save(os.path.join(DATA_FOLDER, "feature", "numpy", PREFIX, dlgid, "%s%s.npy" % (utt['id'], caller)), input_utt)
                #total_frame_num_file += end_frame_extend - start_frame_extend
                #input_data_dict[utt['id']] = input_data_utt
                total_frame_num += end_frame_extend - start_frame_extend
                
            if global_mean is not None:
                if global_std is None:
                    global_std = np.sqrt(input_data_utt_std / (total_frame_num - 1))
                    #np.save(INPUT_STD_PATH, global_std)
                    #print(total_frame_num, "global_std", global_std)
            else:
                global_mean = input_data_utt_sum / total_frame_num
                #np.save(INPUT_MEAN_PATH, global_mean)
                #print(total_frame_num, "global_mean", global_mean)

In [38]:
#dlg_utts = { id: dlg_utts[id] for id in dlg_utts if len(dlg_utts[id]) > 0 }
print("Conversations:", len(dlg_utts))
print("Utterances:", sum([len(dlg_utts[id]) for id in dlg_utts]))
print("Utterances' Length: %.2f hours" % (sum([sum([utt.end - utt.start for utt in dlg_utts[id]]) for id in dlg_utts]) / 3600))
print("Length: %.2f hours" % (sum([dlg_utts[id][-1].end for id in dlg_utts]) / 3600))

Conversations: 1126
Utterances: 214592


AttributeError: 'dict' object has no attribute 'end'

In [20]:
vocab = {}
mode = "train"
dlg_keys = dlgs_test if mode == "test" else dlgs_train
for longid in dlg_keys:
    if longid not in dlg_utts: break
    utts = dlg_utts[longid]
    for utt in utts:
        for word in utt['trans_words']:
            word = word.lower()
            if word in vocab: vocab[word] += 1
            else: vocab[word] = 1
                
# print(len([word for word in vocab if vocab[word] == 1]))
print(len(vocab))
vocab2 = [word for word in vocab if vocab[word] >= 1]
print(len(vocab2))
vocab2.sort()
with open(os.path.join(DATA_FOLDER, "vocab", "words20.txt"), 'w') as f:
    #f.write('<unk>\n')
    f.write('\n'.join(['%s' % (word) for i, word in enumerate(vocab2)]))

18783
18783


In [None]:
from subprocess import call
from struct import unpack, pack

#mean = np.load(os.path.join(DATA_FOLDER, "mean.npy"))
#var = np.load(os.path.join(DATA_FOLDER, "var.npy"))

outputs = []
mean = np.array([0] * 120)
var = np.array([0] * 120)
count = 0

for longid in dlg_keys:
    if longid != '2955': continue
    print(longid, end=' ')
    wav = {}
    for c in ['A', 'B']:
        wavpath = os.path.join(DATA_FOLDER, "wav/sw0%s_%s.wav" % (longid, c))
        wav[c] = AudioSegment.from_wav(wavpath)
    utts = dlg_utts[longid]
    
    if not os.path.exists(os.path.join(DATA_FOLDER, "features", "wav", longid)):
        os.mkdir(os.path.join(DATA_FOLDER, "features", "wav", longid))
    if not os.path.exists(os.path.join(DATA_FOLDER, "features", "npy", longid)):
        os.mkdir(os.path.join(DATA_FOLDER, "features", "npy", longid))
    if not os.path.exists(os.path.join(DATA_FOLDER, "features", "htk", longid)):
        os.mkdir(os.path.join(DATA_FOLDER, "features", "htk", longid))
    # count += 1
    
    for utt in utts:
        id = utt.id
        c = utt.caller
        output_wav = os.path.join(DATA_FOLDER, "features", "wav", longid, "%s_%s.wav" % (id, c))
        output_npy = os.path.join(DATA_FOLDER, "features", "npy", longid, "%s_%s.npy" % (id, c))
        output_htk = os.path.join(DATA_FOLDER, "features", "htk", longid, "%s_%s.htk" % (id, c))
            
        if True: #
            # utt_wav = AudioSegment.silent(500) + wav[c][int(utt.start * 1000):int(utt.end * 1000)] + AudioSegment.silent(500)
            utt_wav = wav[c][int(utt.start * 1000):int(utt.end * 1000)]
            utt_wav.export(output_wav, format='wav', bitrate=16000)

            call([
                "/n/sd7/trung/bin/htk/HTKTools/HCopy",
                output_wav,
                output_htk,
                "-C", "/n/sd7/trung/config.lmfb.40ch"
            ])
        
        fh = open(output_htk, "rb")
        spam = fh.read(12)
        nSamples, sampPeriod, sampSize, parmKind = unpack(">IIHH", spam)
        veclen = int(sampSize / 4)
        fh.seek(12, 0)
            
        dat = np.fromfile(fh, dtype=np.float32)
        dat = dat.reshape(len(dat) // veclen, veclen)
        dat = dat.byteswap()
        # print(utt.start - utt.end, len(dat))
        
        for k in range(len(dat)):
            count += 1
            updated_mean = mean + (dat[k] - mean) / count
            var = var + ((dat[k] - mean) * (dat[k] - updated_mean) - var) / count
            mean = updated_mean
        
        
        #dat = (dat - mean) / np.sqrt(var)
        # print(output_wav, len(dat))
                
        fh.close()
        np.save(output_npy, dat)
    
np.save(os.path.join(DATA_FOLDER, "mean.npy"), mean)
np.save(os.path.join(DATA_FOLDER, "var.npy"), var)

In [27]:
words = open(os.path.join(DATA_FOLDER, "vocab", "words20.txt")).read().split('\n')
words = {word: i for i, word in enumerate(words)}
outputs = []
for longid in dlg_keys:
    utts = dlg_utts[longid]
    for utt in utts:
        if len(utt['trans_words']) > 0 and utt['end'] - utt['start'] < 15:
            outputs.append("%s %s" % (
                utt['npy'],
                ' '.join(['2'] + [words[w.lower()] if w.lower() in words else str(words['<unk>']) for w in utt['trans_words']] + ['1'])
            ))

outputs.sort(key=lambda o: len(o))
print(len(outputs))
with open(os.path.join(DATA_FOLDER, 'inputs_%s.txt' % (mode)), 'w') as fo:
    fo.write('\n'.join(outputs))

KeyError: 'npy'

In [None]:
import numpy as np
import IPython, random
#npy_path = "/n/sd7/trung/csp/data/swb/features/npy/2955/1_B.npy"
npy_path = random.choice(dlg_utts[random.choice(list(dlg_utts.keys()))]).npy
print(npy_path)
longid = npy_path.split('/')[-2]
uttid = npy_path.split('/')[-1].split('.')[0]
utt = [utt for utt in dlg_utts[longid] if utt.id == int(uttid.split('_')[0])][0]
print(utt.caller, utt.id, utt.trans_words)
print(' '.join(['2'] + [words[w.lower()] if w.lower() in words else '0' for w in utt.trans_words] + ['1']))
dat = np.load(npy_path)
print(len(dat))
print(np.sum(dat, axis=0))
IPython.display.Audio(npy_path.replace('npy', 'wav'), autoplay=True)

In [None]:
import IPython
IPython.display.Audio(filename="/n/sd7/trung/csp/data/swb/wav/sw0%s_%s.wav" % ('2955', 'B'))

In [None]:
print(len(glob.glob(os.path.join(DATA_FOLDER, "features", '**/*.npy'))))