In [1]:
# Root folder whose immediate sub-directories are scanned for TextGrid files.
DIR_PATH = '../../data/6_11_2025_tcc'
# Output location, kept as a sibling of DIR_PATH by appending a suffix to it.
# NOTE(review): not referenced by the cells visible here — confirm where it is used.
OUT_PATH = DIR_PATH + '_vc'

In [7]:
import soundfile
import os
import csv

def _ensure_output_dir(output_path):
    clips_path = os.path.join(output_path, 'clips')
    os.makedirs(clips_path, exist_ok=True)

def _ensure_tsv_header(tsv_path):
    if not os.path.exists(tsv_path):
        with open(tsv_path, "w", encoding="utf-8") as f:
            w = csv.writer(f, delimiter="\t")
            w.writerow(["path","sentence","eng", "sw"])

def append_row(tsv_path, audio_path, sentence, eng="", sw=""):
    """Append one record to the dataset TSV.

    Every row is written with four fields so it lines up with the
    ``path, sentence, eng, sw`` header; strict CSV readers reject rows whose
    field count differs from the header.

    Args:
        tsv_path: TSV file to append to; it is created if it does not exist.
        audio_path: value for the ``path`` column.
        sentence: value for the ``sentence`` column.
        eng: value for the ``eng`` column; left empty when not supplied.
        sw: value for the ``sw`` column; left empty when not supplied.
    """
    # newline="" hands line-ending control to the csv module; without it,
    # text-mode translation turns the writer's "\r\n" into "\r\r\n" on Windows.
    with open(tsv_path, "a", encoding="utf-8", newline="") as f:
        csv.writer(f, delimiter="\t").writerow([audio_path, sentence, eng, sw])

def save_segment(output_dir, segment_id, audio_file, sr, transcript):
    """Store one segment: write its audio clip and index it in the dataset TSV.

    The clip is written to ``<output_dir>/clips/<segment_id>.mp3`` and a row
    holding that clip path and ``transcript`` is appended to
    ``<output_dir>/dataset.tsv`` (the header is created on first use).

    Args:
        output_dir: root directory of the generated dataset.
        segment_id: string used as the clip's file name, without extension.
        audio_file: audio data handed unchanged to ``soundfile.write``.
        sr: sample rate handed unchanged to ``soundfile.write``.
        transcript: text stored in the TSV row for this clip.
    """
    clip_name = segment_id + '.mp3'
    clip_file = os.path.join(output_dir, 'clips', clip_name)
    _ensure_output_dir(output_dir)
    soundfile.write(clip_file, audio_file, sr)

    # The path recorded in the index is the full joined path, output_dir included.
    index_file = os.path.join(output_dir, 'dataset.tsv')
    _ensure_tsv_header(index_file)
    append_row(index_file, clip_file, transcript)



In [5]:
import os
from librosa import load
import textgrid
import re

# Lookup from the first three characters of a file name to the regex that picks
# that file's transcription tier. Patterns are applied with re.match, so they
# are matched from the start of the tier name.
transcription_tiers_map = dict([
    ("tcc", r"^Asmjeeg$"),
    ("201", r"^ref"),
    ("IGS", r"Transcription"),
])

def get_duration(interval):
    """Return the length of ``interval``: its ``maxTime`` minus its ``minTime``."""
    end = interval.maxTime
    start = interval.minTime
    return end - start

def process(file_name, folder_path, output_path):
    """Scan one TextGrid for transcribed intervals and print them.

    Loads ``<folder_path>/<file_name>.TextGrid``, selects the tiers whose name
    matches the pattern registered in ``transcription_tiers_map`` for the first
    three characters of ``file_name``, and prints every interval of those tiers
    whose mark is not blank.

    Args:
        file_name: file stem, without extension.
        folder_path: directory that contains the TextGrid file.
        output_path: currently unused; kept so existing callers keep working.

    Returns:
        A ``(good, empty)`` pair of counters. ``(1, 0)`` when at least one tier
        matched and none of the matching tiers was blank. ``(0, 1)`` as soon as
        a matching tier has no non-blank interval, or when no tier matched the
        pattern at all.

    Raises:
        KeyError: if the file-name prefix has no entry in
            ``transcription_tiers_map``.
    """
    # Audio loading is not wired in yet:
    # audio_file, sr = load(os.path.join(folder_path, file_name + '.mp3'))
    tg = textgrid.TextGrid.fromFile(os.path.join(folder_path, file_name + '.TextGrid'))

    prefix = file_name[:3]
    try:
        tier_pattern = transcription_tiers_map[prefix]
    except KeyError:
        # Same exception type as before, but now it says which file was at fault.
        raise KeyError(
            f"no transcription tier pattern registered for prefix {prefix!r} "
            f"(file {file_name!r})"
        ) from None

    matched_any = False
    for tier in tg.tiers:
        if not re.match(tier_pattern, tier.name):
            continue
        matched_any = True

        non_empty_intervals = [iv for iv in tier.intervals if iv.mark.strip()]
        if not non_empty_intervals:
            print('THERE ARENT ANY NON EMPTY INTERVALS :(', folder_path, file_name)
            return 0, 1

        for iv in non_empty_intervals:
            print(iv)

    if not matched_any:
        # Without this check a file lacking its transcription tier was counted as good.
        print('NO TIER MATCHES', repr(tier_pattern), folder_path, file_name)
        return 0, 1

    return 1, 0
                


In [6]:
# os.listdir returns entries in arbitrary order; sorting makes the [:1] slice
# below pick the same directory on every run.
directories = sorted(
    d for d in os.listdir(DIR_PATH) if os.path.isdir(os.path.join(DIR_PATH, d))
)
goods, empties = 0, 0

# NOTE(review): only the first directory is processed — this looks like a
# debugging limit; drop the slice for a full run.
for dir_name in directories[:1]:
    inner_path = os.path.join(DIR_PATH, dir_name)
    # process() opens "<stem>.TextGrid", so take stems from TextGrid files only.
    # Splitting on the first dot over every entry fed it dotfiles (empty stem),
    # audio-only recordings and truncated stems of dotted names, all of which
    # fail on the missing TextGrid.
    stems = sorted({
        os.path.splitext(entry)[0]
        for entry in os.listdir(inner_path)
        if os.path.splitext(entry)[1].lower() == '.textgrid'
    })

    for stem in stems:
        good, empty = process(stem, inner_path, '')
        goods += good
        empties += empty

print('good', goods, 'empty', empties)

        

Interval(54.77999, 56.45497, diyayd qaqút)
Interval(57.81832, 59.41539, diyayd gabúr
)
Interval(61.12932, 62.8822, diyayd qoháyd
)
Interval(64.40137, 66.31006, diyayd gokúll
)
Interval(67.75132, 69.5042, diyayd qafách
)
Interval(71.25709, 73.20473, diyayd gahósan
)
Interval(81.23081, 83.30102, gabúr díyáyd 
)
Interval(90.72697, 92.61078, qaqút díyáyd
)
Interval(105.69896, 108.30881, qoháyd díyáyd
)
Interval(109.98379, 111.58086, gokúll díyáyd
)
Interval(112.82735, 114.50233, qafách díyáyd
)
Interval(115.7563, 117.54813, gahósan diyáyd
)
Interval(122.06668, 123.93642, diyayd qabáran
)
Interval(126.70208, 128.49391, diyayd gagódan
)
Interval(131.10376, 133.16826, diyayd qong'ónyan
)
Interval(135.85602, 137.76471, diyayd gongíran
)
Interval(139.98503, 141.93268, diyayd gofkádan
)
Interval(143.80242, 145.82797, diyayd gagássán
)
Interval(148.00934, 150.1907, diyayd gagúrnan
)
Interval(152.53535, 154.28823, diyayd qabáreny
)
Interval(156.43065, 158.49515, diy