In [5]:
# Root input folder: the driver cell lists its immediate sub-directories and
# processes the files found inside each of them.
DIR_PATH = '../../data/6_11_2025_tcc'
# NOTE(review): presumably the root folder for the generated dataset; it is not
# referenced by any of the visible cells yet — confirm intended usage.
OUT_PATH = '../../data/6_11_2025_tcc_vc'

In [2]:
import soundfile
import os
import csv

def _ensure_output_dir(output_path):
    """Make sure ``<output_path>/clips`` exists, creating missing parents.

    Calling it again on an existing directory is a no-op.
    """
    os.makedirs(os.path.join(output_path, 'clips'), exist_ok=True)

def _ensure_tsv_header(tsv_path):
    """Create the TSV file with its header row if it does not exist yet.

    An already existing file is left untouched, so the function can be called
    before every append.

    Args:
        tsv_path: Path of the tab-separated dataset file.
    """
    if os.path.exists(tsv_path):
        return

    # newline="" hands line-ending control to the csv module; without it,
    # platforms that translate "\n" would emit "\r\r\n" after every row.
    with open(tsv_path, "w", encoding="utf-8", newline="") as f:
        csv.writer(f, delimiter="\t").writerow(["path", "sentence", "eng", "sw"])

def append_row(tsv_path, audio_path, sentence, eng, sw):
    """Append one record to the dataset TSV.

    The values are written in the header's column order:
    ``path``, ``sentence``, ``eng``, ``sw``.

    Args:
        tsv_path: Path of the tab-separated dataset file (created if missing).
        audio_path: Value for the ``path`` column.
        sentence: Value for the ``sentence`` column.
        eng: Value for the ``eng`` column.
        sw: Value for the ``sw`` column.
    """
    # newline="" hands line-ending control to the csv module; without it,
    # platforms that translate "\n" would emit "\r\r\n" after every row.
    with open(tsv_path, "a", encoding="utf-8", newline="") as f:
        csv.writer(f, delimiter="\t").writerow([audio_path, sentence, eng, sw])

def save_segment(output_dir, segment_id, audio_file, sr, transcript, eng="", sw=""):
    """Write one audio clip and register it in ``<output_dir>/dataset.tsv``.

    The clip is stored as ``<output_dir>/clips/<segment_id>.mp3`` and a row
    ``(clip path, transcript, eng, sw)`` is appended to the TSV, whose header
    is created on first use.

    Args:
        output_dir: Root folder of the dataset being built.
        segment_id: Clip file name without extension.
        audio_file: Audio data handed unchanged to ``soundfile.write``
            (presumably a NumPy sample array — TODO confirm).
        sr: Sample rate passed to ``soundfile.write``.
        transcript: Value for the ``sentence`` column.
        eng: Value for the ``eng`` column; empty when not available.
        sw: Value for the ``sw`` column; empty when not available.
    """
    # The clips folder has to exist before soundfile can write into it.
    _ensure_output_dir(output_dir)
    audio_path = os.path.join(output_dir, 'clips', segment_id + '.mp3')
    # NOTE(review): MP3 output depends on the installed libsndfile build
    # supporting it — confirm in the target environment.
    soundfile.write(audio_path, audio_file, sr)

    tsv_path = os.path.join(output_dir, 'dataset.tsv')
    _ensure_tsv_header(tsv_path)
    # append_row needs all four column values; it was previously called with
    # only two of them, which raised TypeError on every invocation.
    append_row(tsv_path, audio_path, transcript, eng, sw)



In [33]:
import os
from librosa import load
import textgrid
import re

# Maps the first three characters of a file name to the regular expression
# used to select that file's transcription tier(s). The pattern is applied to
# the tier name with re.match, so it is anchored at the start of the name even
# when it has no leading "^".
transcription_tiers_map = {
    "tcc": r"^Asmjeeg$",
    "201": r"^ref",
    "IGS": r"Transcription"
}

def _merge_intervals(intervals, max_duration):
    """Greedily group consecutive intervals into ``(start, end)`` spans.

    A span keeps absorbing the next interval while the distance from the
    span's start to that interval's end stays within ``max_duration``.
    ``intervals`` must be non-empty; a single interval longer than
    ``max_duration`` is kept as its own span rather than being split.
    """
    span_start, span_end = intervals[0].minTime, intervals[0].maxTime
    spans = []
    for interval in intervals[1:]:
        if interval.maxTime - span_start <= max_duration:
            span_end = interval.maxTime
        else:
            spans.append((span_start, span_end))
            # Restart BOTH bounds; keeping the old end would produce a span
            # whose end lies before its start.
            span_start, span_end = interval.minTime, interval.maxTime
    # Flush the span that was still being built when the input ran out.
    spans.append((span_start, span_end))
    return spans


def process(file_name, folder_path, output_path, max_duration=30):
    """Compute the audio segments described by a TextGrid's transcription tier.

    Reads ``<folder_path>/<file_name>.TextGrid``, selects every tier whose
    name matches the pattern registered in ``transcription_tiers_map`` for the
    first three characters of ``file_name``, drops intervals whose mark is
    blank, and groups the remaining ones into spans of at most
    ``max_duration``.

    Args:
        file_name: Base name of the recording, without extension.
        folder_path: Folder containing the ``.TextGrid`` file.
        output_path: Currently unused; kept for the planned export step.
        max_duration: Maximum span length, in the TextGrid's time unit
            (presumably seconds — TODO confirm).

    Returns:
        List of ``(start, end)`` tuples from the matching tiers, in tier
        order. Processing stops at the first matching tier that has no
        non-empty interval; the spans collected so far (possibly none) are
        returned.

    Raises:
        KeyError: If the file-name prefix is not in ``transcription_tiers_map``.
    """
    # TODO: audio loading is still disabled:
    # audio_file, sr = load(os.path.join(folder_path, file_name + '.mp3'))
    tg = textgrid.TextGrid.fromFile(os.path.join(folder_path, file_name + '.TextGrid'))
    tier_name = transcription_tiers_map[file_name[:3]]

    segments = []
    for tier in tg.tiers:
        if not re.match(tier_name, tier.name):
            continue

        non_empty_intervals = [interval for interval in tier.intervals if interval.mark.strip()]
        if not non_empty_intervals:
            print('THERE ARENT ANY NON EMPTY INTERVALS :(')
            return segments

        segments.extend(_merge_intervals(non_empty_intervals, max_duration))

    return segments


In [34]:
# Every immediate sub-directory of DIR_PATH is treated as a batch of recordings.
directories = [d for d in os.listdir(DIR_PATH) if os.path.isdir(os.path.join(DIR_PATH, d))]

# `dir_name` rather than `dir`: a top-level loop variable would otherwise
# shadow the builtin for the rest of the kernel session.
for dir_name in sorted(directories):
    inner_path = os.path.join(DIR_PATH, dir_name)
    # process() opens "<name>.TextGrid", so only base names that actually have
    # such a file are collected. splitext keeps dots inside the base name, and
    # other entries (audio files, dotfiles) no longer trigger a failed open.
    files = {
        os.path.splitext(entry)[0]
        for entry in os.listdir(inner_path)
        if entry.endswith('.TextGrid')
    }

    # Sorted so that repeated runs visit the files in the same order.
    for file in sorted(files):
        process(file, inner_path, '')

        

THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY INTERVALS :(
THERE ARENT ANY NON EMPTY