Test preprocessing of the dataset at https://www.elararchive.org/uncategorized/SO_a863403a-0a7d-4c07-9252-dac4c6777054/?pg=4

The dataset contains audio files aligned with their transcriptions. Audio files are saved as .mp3 and transcriptions as .TextGrid.

The purpose of this notebook is to iterate through the folders with the data, preprocess each .TextGrid/.mp3 pair, and add it to a dataset that can later be used for Whisper fine-tuning.

Each mp3 recording should be stripped of silence (at the beginning and the end) and then split into clips of at most 30 seconds. The clips and their annotations should then be saved in the following layout:

preprocessed_data:
    clips:
        clip_01.mp3
        clip_02.mp3
        ...
    all.tsv

all.tsv file should contain:
    audio(path)     sentence(transcription)     translation_en      translation_sw


In [1]:
import sys

print('CURRENT KERNEL')
# sys.executable is the interpreter actually running this kernel. A shell
# `which python` only reports the first `python` on the subshell's PATH,
# which can be a different interpreter than the one executing the notebook.
print(sys.executable)

CURRENT KERNEL
/Users/zuzamakowska/Documents/Africa/Project/Low-resource-languages/venv/bin/python


In [2]:
import soundfile
import os
import csv

def _ensure_output_dir(output_path):
    """Make sure the ``clips`` sub-directory of *output_path* exists.

    Missing parent directories are created as well, and calling this again
    on an already prepared directory does nothing.

    Args:
        output_path: Root directory of the preprocessed dataset.
    """
    os.makedirs(os.path.join(output_path, 'clips'), exist_ok=True)

def _ensure_tsv_header(tsv_path):
    """Create the TSV index with its header row unless the file already exists.

    An existing file is left untouched, so rows appended earlier are kept.

    Args:
        tsv_path: Path of the tab-separated dataset index.
    """
    if os.path.exists(tsv_path):
        return
    # The csv module expects its file object to be opened with newline="".
    # Without it the writer's "\r\n" terminator is translated a second time
    # on Windows, producing "\r\r\n" (a blank line after every row).
    with open(tsv_path, "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f, delimiter="\t")
        w.writerow(["path", "sentence", "eng", "sw"])

def append_row(tsv_path, audio_path, sentence, eng, sw):
    """Append one clip record to the tab-separated dataset index.

    The file is opened in append mode, so it is created when missing and
    existing rows are never rewritten.

    Args:
        tsv_path: Path of the TSV index file.
        audio_path: Path of the clip the row describes.
        sentence: Transcription of the clip.
        eng: English translation.
        sw: Swahili translation.
    """
    # newline="" leaves line endings to the csv writer; otherwise its "\r\n"
    # terminator is translated again on Windows and becomes "\r\r\n".
    with open(tsv_path, "a", encoding="utf-8", newline="") as f:
        w = csv.writer(f, delimiter="\t")
        w.writerow([audio_path, sentence, eng, sw])

def save_segment(output_dir, segment_id, audio_file, sr, asmjeeg_str, eng_str, sw_str):
    """Write one audio clip to disk and record it in the dataset index.

    The clip is written to ``<output_dir>/clips/<segment_id>.mp3`` and a row
    holding that path plus the three whitespace-stripped texts is appended
    to ``<output_dir>/dataset.tsv``. The header row is written first when
    the index does not exist yet.

    Args:
        output_dir: Root directory of the preprocessed dataset.
        segment_id: Clip name without extension.
        audio_file: Audio samples handed to ``soundfile.write``.
        sr: Sample rate handed to ``soundfile.write``.
        asmjeeg_str: Transcription text.
        eng_str: English translation text.
        sw_str: Swahili translation text.
    """
    clip_path = os.path.join(output_dir, 'clips', segment_id + '.mp3')
    _ensure_output_dir(output_dir)
    soundfile.write(clip_path, audio_file, sr)

    index_path = os.path.join(output_dir, 'dataset.tsv')
    _ensure_tsv_header(index_path)
    # Surrounding whitespace is dropped from each text before it is stored.
    texts = [text.strip() for text in (asmjeeg_str, eng_str, sw_str)]
    append_row(index_path, clip_path, *texts)




In [3]:
import os
from librosa import load
import tgt

def _group_intervals(intervals, max_length):
    """Merge consecutive intervals into segments spanning at most *max_length* seconds.

    Args:
        intervals: Iterable of objects with ``start_time`` and ``end_time``,
            in chronological order.
        max_length: Maximum span (segment end minus segment start) in seconds.

    Returns:
        A list of dicts with keys ``id`` (1-based), ``start`` and ``end``.
        A single interval longer than *max_length* is kept as its own
        segment, since it cannot be split on an annotation boundary.
    """
    segments = []
    current = None
    for interval in intervals:
        # Measure the span from the segment start so that gaps between
        # intervals count towards the clip length, not only annotated time.
        if current is not None and interval.end_time - current['start'] <= max_length:
            current['end'] = interval.end_time
            continue
        if current is not None:
            segments.append(current)
        # The interval that did not fit opens the next segment instead of
        # being discarded, and every segment gets its id on creation.
        current = {
            'id': len(segments) + 1,
            'start': interval.start_time,
            'end': interval.end_time,
        }
    if current is not None:
        segments.append(current)
    return segments


def process(file, folder_path, output_path, max_clip_seconds=30):
    """Split one recording into annotated clips and add them to the dataset.

    Loads ``<file>.mp3`` and ``<file>.TextGrid`` from *folder_path*, groups
    the intervals of the first tier into segments of at most
    *max_clip_seconds*, and saves each segment's audio and texts through
    ``save_segment`` under the id ``<file>__<n>``.

    Args:
        file: Base name shared by the .mp3 and .TextGrid files (no extension).
        folder_path: Directory containing the input pair.
        output_path: Root directory of the preprocessed dataset.
        max_clip_seconds: Upper bound for the duration of a clip, in seconds.

    Raises:
        IndexError: If the TextGrid has fewer than three tiers.
    """
    audio_file, sr = load(os.path.join(folder_path, file + '.mp3'))
    grid = tgt.read_textgrid(os.path.join(folder_path, file + '.TextGrid'))

    # Tiers are picked by position: transcription, English, Swahili.
    # NOTE(review): assumes every TextGrid uses this tier order - confirm.
    tier_names = grid.get_tier_names()
    asmjeeg_tier = grid.get_tier_by_name(tier_names[0])
    english_tier = grid.get_tier_by_name(tier_names[1])
    swahili_tier = grid.get_tier_by_name(tier_names[2])

    for segment in _group_intervals(asmjeeg_tier, max_clip_seconds):
        start, end = segment['start'], segment['end']
        segment_id = file + '__' + str(segment['id'])

        asmjeeg_annotations = asmjeeg_tier.get_annotations_between_timepoints(start, end)
        english_annotations = english_tier.get_annotations_between_timepoints(start, end)
        swahili_annotations = swahili_tier.get_annotations_between_timepoints(start, end)

        asmjeeg_str = ' '.join(annotation.text for annotation in asmjeeg_annotations)
        eng_str = ' '.join(annotation.text for annotation in english_annotations)
        sw_str = ' '.join(annotation.text for annotation in swahili_annotations)

        # Convert the segment bounds from seconds to sample indices.
        audio_segment = audio_file[int(start * sr):int(end * sr)]

        save_segment(output_path, segment_id, audio_segment, sr, asmjeeg_str, eng_str, sw_str)

        


In [None]:
import os

folder_path = "../../data/6_11_2025_tcc"
output_path = "../../data/6_11_2025_tcc_cv"

# List the folder once and derive both file lists from the same snapshot.
folder_entries = os.listdir(folder_path)
textgrid_files = [file for file in folder_entries if file.endswith('.TextGrid')]
mp3_files = [file for file in folder_entries if file.endswith('.mp3')]
mp3_names = set(mp3_files)

# Keep only base names that have both a .TextGrid and an .mp3. The extension
# is swapped as a suffix so the name checked here is the one process() loads.
# Sorting makes the processing order independent of the arbitrary order
# returned by os.listdir.
textgrid_suffix = '.TextGrid'
corresponding_files = sorted(
    file[:-len(textgrid_suffix)]
    for file in textgrid_files
    if file[:-len(textgrid_suffix)] + '.mp3' in mp3_names
)

for file in corresponding_files:
    process(file, folder_path, output_path)
    