In [None]:
import os
import re
import pandas as pd
import glob

In [None]:
# Folder scanned for input .ctm files (placeholder path: edit before running).
mj_dir = 'path/to/your/data/MJ/ctm/MJ' ###
# Folder that receives one converted .stm file per input .ctm file.
out_dir = "./mj output"

# Folder whose .stm files are concatenated by the merge step
# (placeholder path: edit before running).
stm_folder = "path/to/your/data/MJ/stm" ###
# Path of the merged reference file; it is placed inside stm_folder.
out_path = os.path.join(stm_folder, "MJ_reference.stm")

In [None]:
def clean_word(word):
    """Wrap specially marked transcript tokens in square brackets.

    A token is treated as marked when, compared case-insensitively, it either
    starts with ``??`` or consists solely of three or more ``x`` characters
    optionally followed by a single period.

    * ``??rest`` becomes ``[rest]`` (the ``??`` prefix is dropped).
    * ``xxx.`` becomes ``[xxx].`` (the period stays outside the brackets).
    * ``xxx`` becomes ``[xxx]``.

    Any other token is returned unchanged.

    NOTE(review): for a ``??`` token that ends with a period the period ends
    up inside the brackets (``??foo.`` -> ``[foo.]``) -- confirm this is
    intended.
    """
    is_marked = re.fullmatch(r'(\?\?.*)|x{3,}\.?', word.lower()) is not None
    if not is_marked:
        return word

    # Uncertain-word marker: strip the two question marks, bracket the rest.
    if word.startswith('??'):
        return "[" + word[2:] + "]"

    # Run of x's: bracket the run itself and keep a trailing period outside.
    if word.endswith('.'):
        return "[" + word[:-1] + "]."
    return "[" + word + "]"

In [None]:
def convert_ctm_file(ctm_path, stm_path, silence_threshold=1):
    """Convert one word-level CTM file into a segment-level STM file.

    Every data line is read as ``<file> <channel> <start> <end> <word> ...``
    (whitespace separated; extra columns are ignored). Words are accumulated
    into a segment that is closed when a word ends with ``.`` or when a long
    silence is encountered.

    NOTE(review): standard NIST CTM stores a *duration* in the fourth column;
    this function treats that column as an end time -- confirm against the
    data being converted.

    Args:
        ctm_path: Path of the input CTM file (read as UTF-8).
        stm_path: Path of the STM file to write (UTF-8, overwritten).
        silence_threshold: A ``sil`` token whose ``end - start`` is at least
            this value closes the current segment and is written as its own
            ``inter_segment_gap`` line; shorter ``sil`` tokens only extend the
            end time of the segment in progress.

    Raises:
        ValueError: If the start or end field of a data line is not numeric.

    Side effects:
        Writes ``stm_path`` and prints a one-line summary.
    """
    # Utterance ids are derived from the CTM file name, not from column 1.
    filename = os.path.splitext(os.path.basename(ctm_path))[0]

    stm_lines = []
    current_words = []
    current_start = None
    current_end = None
    segment_index = 0

    def flush_segment():
        """Emit the buffered words as one STM line, then reset the buffer."""
        nonlocal current_start, current_end, current_words, segment_index
        if current_words and current_start is not None and current_end is not None:
            text = " ".join(current_words)
            utt_id = f"{filename}_u{segment_index}"
            stm_lines.append(f"{utt_id} 1 spk1 {current_start:.2f} {current_end:.2f} <o,f0,unknown> {text}")
            segment_index += 1
        current_words = []
        current_start = None
        current_end = None

    with open(ctm_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            # ';;' introduces a comment line in CTM files; without this guard a
            # comment with five or more tokens would crash the float() parsing.
            if stripped.startswith(';;'):
                continue

            parts = stripped.split()
            if len(parts) < 5:
                continue

            start = float(parts[2])
            end = float(parts[3])
            duration = end - start
            # Zero-length or negative-length tokens are dropped entirely.
            if duration <= 0:
                continue

            word = clean_word(parts[4])

            if word.lower() == "sil":
                if duration >= silence_threshold:
                    # Long pause: close the running segment and record the gap
                    # as its own entry (it consumes a segment index too).
                    flush_segment()
                    utt_id = f"{filename}_u{segment_index}_SIL"
                    stm_lines.append(f"{utt_id} 1 inter_segment_gap {start:.2f} {end:.2f} <o,f0,>")
                    segment_index += 1
                elif current_end is not None:
                    # Short pause inside a segment: stretch the segment over it.
                    current_end = end
                continue

            if current_start is None:
                current_start = start
            current_end = end
            current_words.append(word)

            # A sentence-final period closes the segment.
            if word.endswith('.'):
                flush_segment()

    # Emit whatever is still buffered at end of file.
    flush_segment()

    with open(stm_path, 'w', encoding='utf-8') as out:
        for stm_line in stm_lines:
            out.write(stm_line + '\n')

    print(f"Wrote {len(stm_lines)} segments to {stm_path}")


In [5]:
def ctm_folder_to_stm_folder(ctm_dir, stm_out_dir, silence_threshold=1):
    """Convert every ``.ctm`` file directly inside ``ctm_dir`` to STM.

    The output folder is created when missing. Each ``name.ctm`` is passed to
    ``convert_ctm_file`` together with the target path
    ``stm_out_dir/name.stm`` and the given ``silence_threshold``.

    Args:
        ctm_dir: Folder whose entries ending in ``.ctm`` are converted.
        stm_out_dir: Folder that receives the generated ``.stm`` files.
        silence_threshold: Forwarded unchanged to ``convert_ctm_file``.
    """
    os.makedirs(stm_out_dir, exist_ok=True)

    for entry in os.listdir(ctm_dir):
        # Only entries with the exact (case-sensitive) '.ctm' suffix qualify.
        if not entry.endswith('.ctm'):
            continue
        stem = os.path.splitext(entry)[0]
        convert_ctm_file(
            os.path.join(ctm_dir, entry),
            os.path.join(stm_out_dir, stem + '.stm'),
            silence_threshold,
        )

In [6]:
# Convert every .ctm file found in mj_dir into a matching .stm file in out_dir
# (silence_threshold is left at its default of 1).
ctm_folder_to_stm_folder(mj_dir, out_dir)

Wrote 102 segments to ./mj output\MJ_2016_1.stm
Wrote 101 segments to ./mj output\MJ_2016_2.stm
Wrote 119 segments to ./mj output\MJ_2016_3.stm
Wrote 113 segments to ./mj output\MJ_2016_4.stm
Wrote 90 segments to ./mj output\MJ_2017_1.stm
Wrote 114 segments to ./mj output\MJ_2017_2.stm
Wrote 87 segments to ./mj output\MJ_2017_3.stm
Wrote 103 segments to ./mj output\MJ_2017_5.stm
Wrote 100 segments to ./mj output\MJ_2017_6.stm
Wrote 76 segments to ./mj output\MJ_2017_8.stm
Wrote 100 segments to ./mj output\MJ_2018_1.stm
Wrote 81 segments to ./mj output\MJ_2018_2.stm
Wrote 89 segments to ./mj output\MJ_2018_3.stm
Wrote 104 segments to ./mj output\MJ_2018_4.stm
Wrote 111 segments to ./mj output\MJ_2018_5.stm
Wrote 105 segments to ./mj output\MJ_2018_6.stm
Wrote 75 segments to ./mj output\MJ_2018_7.stm
Wrote 104 segments to ./mj output\MJ_2018_8.stm
Wrote 114 segments to ./mj output\MJ_2019_1.stm
Wrote 88 segments to ./mj output\MJ_2019_2.stm
Wrote 124 segments to ./mj output\MJ_2019_3.stm

In [None]:
def merge_stm_files(stm_folder, output_file):
    """Concatenate every ``*.stm`` file in ``stm_folder`` into ``output_file``.

    Input files are merged in sorted path order so the result is reproducible.
    ``output_file`` itself is never used as an input, even when it lives in
    ``stm_folder`` and matches ``*.stm``; this keeps re-runs from reading the
    file that is being rewritten. A newline is appended after any non-empty
    input that does not already end with one, so the last line of one file
    cannot fuse with the first line of the next.

    Args:
        stm_folder: Folder searched (non-recursively) for ``*.stm`` files.
        output_file: Path of the merged file (UTF-8, overwritten).
    """
    def _canonical(path):
        # Normalised absolute form so the same file compares equal however it
        # was spelled (relative vs absolute, case on Windows).
        return os.path.normcase(os.path.abspath(path))

    output_key = _canonical(output_file)
    stm_files = sorted(
        path
        for path in glob.glob(os.path.join(stm_folder, '*.stm'))
        if _canonical(path) != output_key
    )

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for stm_file in stm_files:
            with open(stm_file, 'r', encoding='utf-8') as in_f:
                content = in_f.read()
            out_f.write(content)
            if content and not content.endswith('\n'):
                out_f.write('\n')

# Merge the .stm files found in stm_folder into the single file at out_path.
# NOTE(review): the conversion step above writes its .stm files to out_dir,
# while this call reads from stm_folder -- confirm the two are meant to differ.
merge_stm_files(stm_folder, out_path)