In [1]:
import os
from pathlib import Path

# Create folders
Path("download").mkdir(exist_ok=True)
Path("data").mkdir(exist_ok=True)

# Download archives into download/
os.chdir("download")
!curl --remote-name-all https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1444{/GosVL.TRS.zip,/GosVL.TEI.zip,/GosVL.vert.zip}
!curl --remote-name-all https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1222{/GosVL.wav.0.zip,/GosVL.wav.1.zip,/GosVL.wav.2.zip,/GosVL.wav.3.zip,/GosVL.wav.4.zip,/GosVL.wav.5.zip}
os.chdir("..")

# Extract archives into data/
import zipfile

for zip_file in Path("download").glob("*.zip"):
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall("data")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 65 13.9M   65 9312k    0     0  8662k      0  0:00:01  0:00:01 --:--:-- 8678k
100 13.9M  100 13.9M    0     0  9372k      0  0:00:01  0:00:01 --:--:-- 9382k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 91 5895k   91 5382k    0     0  9654k      0 --:--:-- --:--:-- --:--:-- 9646k
100 5895k  100 5895k    0     0   9.9M      0 --:--:-- --:--:-- --:--:--  9.9M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   

In [2]:
!pip install TTS[all] lxml pandas tqdm pydub unidecode



In [3]:
!python.exe -m pip install --upgrade pip



Split audio clips

In [5]:
import os
import re
import glob
from pydub import AudioSegment

def slovenian_to_ascii_phonetic(_text):
    """Lowercase ``_text`` and spell the letters č, š, ž, ć and đ with ASCII.

    Every other character is returned unchanged (apart from lowercasing).
    """
    ascii_spellings = str.maketrans({
        'č': 'ch',
        'š': 'sh',
        'ž': 'zh',
        'ć': 'c',
        'đ': 'dj',
    })
    lowered = _text.lower()
    return lowered.translate(ascii_spellings)

def normalize_text(_text):
    """Return ``_text`` reduced to lowercase ASCII letters, apostrophes and single spaces.

    The text is first passed through ``slovenian_to_ascii_phonetic``; every
    character that is then not a-z, A-Z, whitespace or an apostrophe is removed,
    whitespace runs become one space, and the result is stripped.
    """
    transliterated = slovenian_to_ascii_phonetic(_text)
    # Drop everything except basic letters, whitespace and apostrophes
    letters_only = re.sub(r"[^a-zA-Z\s']", "", transliterated)
    # Squeeze each run of whitespace into a single space
    single_spaced = re.sub(r"\s+", " ", letters_only)
    trimmed = single_spaced.strip()
    return trimmed.lower()

# Input and output locations for the splitting step
audio_path = 'data/GosVL.wav/'
transcript_path = 'data/GosVL.TRS/'
output_path = 'data/wavs'

# Starts empty; the splitting loop appends the rows of the dataset to it
dataset = list()

# Make sure the output folder exists (no error when it is already there)
os.makedirs(output_path, exist_ok=True)

def convert_audio_format(_audio):
    """Return ``_audio`` converted to 1 channel, 22050 Hz frame rate, 2-byte samples.

    The three setters are applied in that order and each is called on the
    result of the previous one.
    """
    return (
        _audio
        .set_channels(1)         # mono
        .set_frame_rate(22050)   # 22050 Hz
        .set_sample_width(2)     # 2 bytes per sample (16-bit)
    )

# Clip-selection parameters. Everything is expressed in milliseconds because
# pydub measures and slices AudioSegment objects in milliseconds.
GAP_THRESHOLD_MS = 1000.0  # a pause longer than 1 second ends the current sentence
MIN_CLIP_MS = 2000         # discard clips shorter than 2 seconds
MAX_CLIP_MS = 11000        # discard clips longer than 11 seconds


def _export_sentence(_audio, _base_name, _counter, _start_ms, _end_ms, _words):
    """Cut one sentence out of ``_audio`` and register it in ``dataset``.

    The clip spans ``_start_ms`` to ``_end_ms``. Clips whose length is outside
    MIN_CLIP_MS..MAX_CLIP_MS are dropped without being written. Otherwise the
    clip is exported to ``output_path`` as
    ``<_base_name>_sentence_<_counter>.wav`` and a ``[path, text]`` row, with
    the words joined by spaces and passed through ``normalize_text``, is
    appended to ``dataset``.

    Returns True when the clip was exported, False when it was filtered out.
    """
    clip = _audio[int(_start_ms):int(_end_ms)]
    if len(clip) < MIN_CLIP_MS or len(clip) > MAX_CLIP_MS:
        return False

    clip_file_name = f"{_base_name}_sentence_{_counter}.wav"
    clip_path = os.path.join(output_path, clip_file_name)
    clip.export(clip_path, format="wav")
    dataset.append([clip_path, normalize_text(' '.join(_words))])
    return True


# Iterate over all audio files
for audio_file in glob.glob(os.path.join(audio_path, '*.wav')):
    # Derive the corresponding transcript file (<name>_wrd.txt); splitext only
    # touches the extension, unlike str.replace which hits every '.wav' in the name
    file_name = os.path.basename(audio_file)
    base_name = os.path.splitext(file_name)[0]
    transcript_file = os.path.join(transcript_path, base_name + '_wrd.txt')

    # Load the audio file and bring it to the target format
    audio = convert_audio_format(AudioSegment.from_wav(audio_file))

    # Read the transcript
    with open(transcript_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # State of the sentence currently being collected
    sentence_start_time = None  # start of its first word (ms)
    sentence_words = []
    sentence_counter = 1        # numbers every sentence, exported or not
    prev_abs_end = None         # end of the previous spoken word (ms)

    for line in lines:
        parts = line.strip().split()
        # Fields 1, 2 and 5 are read below, so at least six are required
        if len(parts) < 6:
            continue
        # Presumably seconds in the transcript; scaled by 1000 to milliseconds
        abs_start = float(parts[1]) * 1000
        abs_end = float(parts[2]) * 1000
        word = parts[5]

        # Silence / ignore markers are not words; the pause they leave between
        # the surrounding words is what the gap check below reacts to
        if word in ("SILENCE", "IGNORE"):
            continue

        # A long pause BEFORE this word closes the previous sentence. The check
        # runs before the word is added so the finished clip ends at the last
        # word before the pause and does not contain the pause or this word.
        if prev_abs_end is not None and abs_start - prev_abs_end > GAP_THRESHOLD_MS:
            _export_sentence(audio, base_name, sentence_counter,
                             sentence_start_time, prev_abs_end, sentence_words)
            sentence_counter += 1
            sentence_start_time = None
            sentence_words = []

        # Start a new sentence if we don't have one yet
        if sentence_start_time is None:
            sentence_start_time = abs_start

        sentence_words.append(word)
        prev_abs_end = abs_end

    # Flush the sentence that was still open when the transcript ended; it is
    # held to the same length limits as every other clip
    if sentence_words:
        _export_sentence(audio, base_name, sentence_counter,
                         sentence_start_time, prev_abs_end, sentence_words)

# Optionally save the dataset to CSV for tracking
import pandas as pd
df = pd.DataFrame(dataset, columns=["audio_filepath", "text"])
df.to_csv('data/GosVL_sentences.csv', index=False, encoding='utf-8')

print("Audio has been split into sentences, and the metadata has been saved.")

Audio has been split into sentences, and the metadata has been saved.


In [10]:
import pandas as pd
import os

# Define the paths
audio_path = 'data/wavs'  # The path where your clips are stored
csv_path = 'data/GosVL_sentences.csv'  # The file with your audio file names and transcripts
metadata_path = 'data/metadata.csv'  # The path for the metadata file

# Variable to control the percentage of samples in the dataset (e.g., 0.05 for 5%)
sample_percentage = 1.0

# Get the absolute path for the base directory where the audio files are stored
audio_base_path = os.path.abspath(audio_path)

# Read the sentences csv file. dtype=str and keep_default_na=False keep every
# transcript exactly as written: by default pandas turns empty cells and words
# such as "nan" or "null" into NaN, which would later be written out as an
# empty transcript.
df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

# A clip whose transcript is empty (e.g. nothing survived normalization) has
# no text to pair with its audio, so it is left out of the metadata
df = df[df['text'].str.strip() != '']

# Take a reproducible random sample when only part of the data is wanted
if 0 < sample_percentage < 1:
    df = df.sample(frac=sample_percentage, random_state=42)

# Initialize the list to store the metadata
metadata = []

# Iterate through each row in the GosVL_sentences.csv
for _, row in df.iterrows():
    audio_filename = row['audio_filepath']  # Get the audio filename
    text = row['text']  # Get the transcript (text) for the audio

    # Ensure audio_filename is relative to audio_base_path
    if audio_filename.startswith(audio_path):
        # +1 also drops the path separator that follows the 'data/wavs' prefix
        audio_filename = audio_filename[len(audio_path)+1:]

    # Remove the .wav extension
    audio_filename = os.path.splitext(audio_filename)[0]

    # Get the absolute path of the audio file by joining the base path and the relative file path
    audio_filename = os.path.join(audio_base_path, audio_filename)

    metadata.append([audio_filename, text, text])  # We repeat 'text' here as per the required format

# Create a DataFrame from the metadata. The labels are unique so columns can be
# selected unambiguously; they are not written to the file (header=False).
metadata_df = pd.DataFrame(metadata, columns=['audio_filepath', 'text', 'normalized_text'])

# Save the metadata as pipe-separated rows: path|text|text
metadata_df.to_csv(metadata_path, sep='|', index=False, header=False, encoding='utf-8')

print(f"metadata.csv has been created at {metadata_path}")

metadata.csv has been created at data/metadata.csv
