In [1]:
import pandas as pd
import os
import json
if 'cd' not in globals():
    os.chdir('..')
    cd = True


In [2]:
with open('scripts/labels.json', 'r') as f:
    labels = json.load(f)

In [3]:
def check_characters(string, labels):
    for c in string:
        if c not in labels:
            return False
    return True

In [4]:
labels

['_',
 "'",
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 ' ']

In [7]:
speech_dir = 'speech/alaw'
trans_dir = 'transcriptions/ast/xsampa'

In [5]:
def extract_trans(strings, start=0.0):
    if not strings:
        return None
    words = []
    types = []
    for s in strings:
        end, _, ph, word, word_type = s.strip().split()
        if word_type != 'common' and word_type != '-':
#             print('error word type', word_type)
            return None
        if '-' in word:
            continue
        words.append(word.replace(';', ' ').upper())
    trans = ' '.join(words)
    return [trans, float(start), float(end)]

In [8]:

file_trans = []

for i, filename in enumerate(os.listdir(f'{speech_dir}')):
    if not filename.endswith('.wav'):
        continue

    if i % 1000 == 0:
        print(f'{i}th file')

    utt, extname = filename.split('.')
    spk = utt[:-3]
    trans_path = f'{trans_dir}/{utt}.t02'
    audio_path = f'{speech_dir}/{utt}.wav'
    with open(trans_path, 'r', encoding="ISO-8859-1") as f:
#         print(audio_path)
        started = False
        start_time = 0.0
        strings = []
        for line in f:
            if 'ETimes' in line:
#                 print('started')
                started = True
                continue

            if started:
                
                end, _, ph, word, word_type = line.strip().split()
#                 print(line)
                if '[' in word or ']' in word:
                    # end of a sentence
                    result = extract_trans(strings, start_time)
#                     print('end of a sentence', trans)
                    start_time = end
                    if result is not None:
                        trans, start, end = result
                        if end-start > 1.2 and len(trans) > 10 and check_characters(trans, labels):
                            file_trans.append([audio_path, trans, spk, start, end])
                    strings = []
                    continue
                    
                elif '-' in word:
                    strings.append(line.strip())
                    continue
                else:
                    strings.append(line.strip())
                    continue


0th file
1000th file
2000th file
4000th file
5000th file
6000th file
7000th file
8000th file
10000th file
11000th file
14000th file
19000th file
20000th file
21000th file


In [9]:
file_trans

[['speech/alaw/AE089LF026.wav',
  "ELEVEN O'CLOCK",
  'AE089LF',
  0.646594,
  2.154468],
 ['speech/alaw/AE050LF036.wav',
  'THEY ALL AGREED THAT THE ESSAY IS BARELY',
  'AE050LF',
  0.662374,
  3.798461],
 ['speech/alaw/AE050LF040.wav',
  'REPEAT THAT PLEASE',
  'AE050LF',
  0.430679,
  2.414111],
 ['speech/alaw/AE372LM027.wav',
  'FIVE ONE THREE TWO SIX EIGHT FOUR SEVEN NINE ZERO',
  'AE372LM',
  0.537883,
  4.017245],
 ['speech/alaw/AE183LM007.wav',
  'ZERO SEVEN ZERO THREE',
  'AE183LM',
  1.913229,
  3.968592],
 ['speech/alaw/AE183LM007.wav',
  'DOUBLE ZERO NINE',
  'AE183LM',
  5.895492,
  7.308557],
 ['speech/alaw/AE029CF030.wav',
  'THREE THOUSAND EIGHT HUNDRED AND THIRTEEN RAND SIXTY SEVEN CENTS',
  'AE029CF',
  0.709564,
  4.522924],
 ['speech/alaw/AE122LM030.wav',
  'FIVE FIVE TWO',
  'AE122LM',
  1.582715,
  2.842661],
 ['speech/alaw/AE122LM030.wav',
  'OH EIGHT ONE THREE',
  'AE122LM',
  3.000154,
  4.417593],
 ['speech/alaw/AE184LM013.wav',
  'NUMBER THIRTY',
  'AE184LM',

In [10]:
len(file_trans)

3799

In [118]:
df = pd.DataFrame(file_trans, columns=['file', 'trans', 'speaker', 'start', 'end'])

In [121]:
n_speaker = len(df.speaker.unique())

In [123]:
all_speakers = sorted(df.speaker.unique())

In [124]:
n_speaker_train = int(n_speaker*0.8)
n_speaker_dev   = int(n_speaker*0.1)
n_speaker_test  = int(n_speaker*0.1)
train_speakers = all_speakers[:n_speaker_train]
dev_speakers   = all_speakers[n_speaker_train:n_speaker_train+n_speaker_dev]
test_speakers  = all_speakers[n_speaker_train+n_speaker_dev:]

In [126]:
len(train_speakers), len(dev_speakers), len(test_speakers)

(188, 23, 24)

In [129]:
df_train = df[df.speaker.isin(train_speakers)]
df_dev   = df[df.speaker.isin(dev_speakers)]
df_test  = df[df.speaker.isin(test_speakers)]

In [130]:
len(df_train), len(df_dev), len(df_test)

(2619, 335, 369)

In [131]:
df.to_csv('scripts/trans_all.csv')
df_train.to_csv('scripts/trans_train.csv')
df_dev.to_csv('scripts/trans_dev.csv')
df_test.to_csv('scripts/trans_test.csv')