In [40]:
from pydub import AudioSegment
import os
from tqdm import tqdm
import glob

In [None]:
stm_dir = 'path/to/your/data/mj output'
audio_dir = 'path/to/your/data//MJ/wav'
output_dir = './mj output/split_audio'

In [None]:
def get_utterances_stm(stm_path):
    utterances = []

    with open(stm_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(maxsplit=6)
            file_id, _, speaker, start, end, _ = parts[:6]
            text = parts[6] if len(parts) > 6 else ''

            utterances.append({
                'id': file_id,
                'text': text,
                'from': float(start),
                'to': float(end),
                'speaker': speaker
            })

    return utterances

def get_utterances_audacity(aud_path):
    with open(aud_path, 'r') as _f:
        lines = _f.readlines()
    utterances = []
    for line in lines:
        components = line.split('\t')
        text = components[2].strip()
        utterances.append({'text': text,
                           'from': float(components[0]),
                           'to': float(components[1])})
    return utterances


def get_partial_audio(audio_file_path, from_sec, to_sec):
    audio = AudioSegment.from_wav(audio_file_path)
    audio = audio.set_channels(1).set_frame_rate(16000)
    segment = audio[from_sec * 1000 : (to_sec * 1000 + 300)] #manually adding 300ms since words tend to get cut off otherwise
    return segment

def write_audio(file_path, audio_segment):
    audio_segment.export(file_path, format="wav")

In [44]:
audio_files = glob.glob(f'{audio_dir}/*.wav')
transcript_files = glob.glob(f'{stm_dir}/*.stm')
all_utterances = {}

for af in audio_files:
    c_id = os.path.splitext(os.path.basename(af))[0]
    single_transcript_path = f'{stm_dir}/{c_id}.stm'
    utterances = get_utterances_stm(single_transcript_path)

    for idx, u in enumerate(utterances):
        utt_id = f"{c_id}_u{idx}"
        all_utterances[utt_id] = {
            'file_id': u['id'],
            'text': u['text'],
            'audio_file': af,
            'from_sec': u['from'],
            'to_sec': u['to'],  
            'speaker': u['speaker']
        }

print('Writing individual utterance audio files...')
utterance_audio_folder = os.path.join(output_dir, 'audio_utterances')
os.makedirs(utterance_audio_folder, exist_ok=True)
for utt_id, utt in tqdm(all_utterances.items()):
    utt_audio = get_partial_audio(utt['audio_file'],
                                    utt['from_sec'],
                                    utt['to_sec'])
    if utt['speaker'] == 'inter_segment_gap':
        utt_id = f'{utt_id}_SIL'
    write_audio(os.path.join(utterance_audio_folder, f'{utt_id}.wav'),
                utt_audio)
    
print('Writing reference transcript file...')
reference_transcript_path = os.path.join(output_dir,
                                            'PR_reference.stm')
with open(reference_transcript_path, 'w') as f:
    for utt_id, utt in all_utterances.items():
        if len(utt['text']) > 0:
            f.write(f"{utt['file_id']} 1 {utt['speaker']} {utt['from_sec']} {utt['to_sec']} <o,f0,unknown> {utt['text']}\n")
        else:
            f.write(f"({utt['file_id']}\n")
print('Done!')

Writing individual utterance audio files...


100%|██████████| 3369/3369 [05:29<00:00, 10.22it/s]

Writing reference transcript file...
Done!



