In [None]:
Dataset prefixes used throughout this notebook:

- AS: AudioSet
- CV: Common Voice
- MJ: HoMed Medicijnjournaal
- JA: JASMIN
- PR: PriMock57

In [None]:
# Uncomment the next line if pydub is not installed yet
# (%pip installs into the environment of the running kernel).
#%pip install pydub

In [None]:
import os
import shutil
import subprocess
import wave
from os.path import join

import numpy as np
import pandas as pd
from pydub import AudioSegment

In [None]:
# Root folder that contains one sub-folder per dataset prefix; edit before running.
main_path = r"path/to/your/data"

# Per-dataset folders, each resolved relative to main_path.
as_path = join(main_path, "AS")
cv_path = join(main_path, "CV/cv-corpus-20.0-2024-12-06-nl/cv-corpus-20.0-2024-12-06/nl/clips")
mj_path = join(main_path, "MJ/wav")
pr_path = join(main_path, "PR/transcripts")

# JA audio is split over four sub-folders below a shared wav root.
ja_main_path = join(main_path, "JA/Data/Data/audio/wav")
ja_pnl_path = join(ja_main_path, "comp-p/nl")
ja_pvl_path = join(ja_main_path, "comp-p/vl")
ja_qnl_path = join(ja_main_path, "comp-q/nl")
ja_qvl_path = join(ja_main_path, "comp-q/vl")

In [None]:
#convert AS, CV, MJ to .wav

def convert_mp3_to_wav(path):
    """Convert every ``.mp3`` file directly inside ``path`` to a ``.wav`` next to it.

    Files whose ``.wav`` counterpart already exists are skipped, so the cell can
    be re-run safely. A failure on one file is printed and the loop moves on to
    the next file. The original ``.mp3`` files are kept.

    Args:
        path: Directory to scan (not recursive).

    Raises:
        OSError: If ``path`` itself cannot be listed.
    """
    # sorted() makes the processing order (and the log) reproducible.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".mp3"):
            continue

        mp3_path = os.path.join(path, filename)
        wav_filename = os.path.splitext(filename)[0] + ".wav"
        wav_path = os.path.join(path, wav_filename)

        #skip if wav already exists
        if os.path.exists(wav_path):
            print(f"Skipping {filename} (WAV already exists)")
            continue

        # Export to a scratch file first: writing straight to wav_path would
        # leave a truncated WAV behind when the export fails, and the
        # "already exists" check above would then skip it on every later run.
        partial_path = wav_path + ".part"
        try:
            try:
                #load and export
                audio = AudioSegment.from_mp3(mp3_path)
                # export() returns the output file handle; close it before moving.
                audio.export(partial_path, format="wav").close()
                os.replace(partial_path, wav_path)
            finally:
                # Only present here if the export did not complete.
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            print(f"Converted: {filename} -> {wav_filename}")

            #optional: remove original mp3
            #os.remove(mp3_path)
            #print(f"Deleted original: {filename}")

        except Exception as e:
            print(f"Error converting {filename}: {e}")
            continue
    print("Conversion complete.")

# Run the MP3 -> WAV conversion for AS, CV and MJ, in that order.
for _mp3_dir in (as_path, cv_path, mj_path):
    convert_mp3_to_wav(_mp3_dir)

In [None]:
#convert AS, MJ, CV to 16khz

def get_sample_rate(file_path):
    """Return the sample rate of the first audio stream of ``file_path``.

    The value is obtained by running ``ffprobe`` and parsing its standard
    output as an integer.

    Args:
        file_path: Path of the audio file to inspect.

    Returns:
        The sample rate as an ``int``, or ``None`` when ffprobe's output cannot
        be parsed as an integer (for example when it printed nothing).

    Raises:
        OSError: If the ``ffprobe`` executable cannot be started.
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        "-select_streams", "a:0",
        "-show_entries", "stream=sample_rate",
        "-of", "default=noprint_wrappers=1:nokey=1",
        file_path
    ]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        return int(result.stdout.decode().strip())
    except ValueError:
        # Covers unparsable/empty output and undecodable bytes
        # (UnicodeDecodeError is a ValueError). A bare except would also
        # swallow KeyboardInterrupt and make the notebook hard to stop.
        return None

def _resample_in_place(full_path, sample_rate=16000):
    """Resample one file with ffmpeg and swap it in only if ffmpeg succeeded."""
    # Derive the scratch name from the extension only; str.replace(".wav", ...)
    # would also rewrite a ".wav" that occurs earlier in the path.
    stem, ext = os.path.splitext(full_path)
    temp_path = f"{stem}_tmp{ext}"
    cmd = [
        "ffmpeg",
        "-y",                     #overwrite without asking
        "-i", full_path,          #input file
        "-ar", str(sample_rate),  #sample rate
        temp_path                 #output to temporary file
    ]
    try:
        # check=True: a failed ffmpeg run must raise before os.replace, otherwise
        # a partially written temp file would overwrite the original audio.
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)

        #replace original file
        os.replace(temp_path, full_path)
    finally:
        # Still present only when the run failed or was interrupted.
        if os.path.exists(temp_path):
            os.remove(temp_path)


def to_16khz(path):
    """Downsample, in place, every ``.wav`` file directly inside ``path`` to 16 kHz.

    Files already at 16000 Hz are skipped. Files below 16000 Hz are left
    unchanged (this function only downsamples). Files whose sample rate cannot
    be determined, or whose ffmpeg run fails, are reported and left unchanged.

    Args:
        path: Directory to scan (not recursive).

    Raises:
        OSError: If ``path`` itself cannot be listed.
    """
    # sorted() makes the processing order (and the log) reproducible.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".wav"):
            continue

        full_path = os.path.join(path, filename)
        try:
            # Probe once per file; every probe spawns an ffprobe process.
            sample_rate = get_sample_rate(full_path)

            if sample_rate is None:
                print(f"Error processing {filename}: could not determine sample rate")
                continue

            if sample_rate == 16000:
                # Skip if already 16kHz
                print(f"File already 16kHz: {filename}, skipping...")
                continue

            if sample_rate < 16000:
                print(f"Sample rate below 16kHz ({sample_rate} Hz): {filename}, leaving unchanged...")
                continue

            print(f"Processing {filename}")
            _resample_in_place(full_path)

        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue
    print(f"Conversion to 16kHz complete for {path}.")

In [None]:
# Downsample AS, CV and MJ, in that order.
for _wav_dir in (as_path, cv_path, mj_path):
    to_16khz(_wav_dir)

In [None]:
# Single-channel copies of the AS audio are written to a "mono" sub-folder of as_path.
as_mono_path = join(as_path, "mono")

def AS_to_mono(input_dir, output_dir):
    """Write a single-channel version of every ``.wav`` in ``input_dir`` to ``output_dir``.

    Multi-channel files are downmixed by averaging the channels of each frame.
    Files that are already mono are copied unchanged. ``output_dir`` is created
    if needed. A file that cannot be read or written is reported and skipped so
    the rest of the batch still runs.

    Args:
        input_dir: Directory to scan for ``.wav`` files (not recursive).
        output_dir: Directory receiving the mono files under the same names.

    Raises:
        OSError: If ``input_dir`` cannot be listed or ``output_dir`` cannot be created.
    """
    # Little-endian NumPy dtype per PCM sample width in bytes (8-bit WAV is unsigned).
    pcm_dtypes = {1: np.dtype("u1"), 2: np.dtype("<i2"), 4: np.dtype("<i4")}

    os.makedirs(output_dir, exist_ok=True)

    # sorted() makes the processing order (and the log) reproducible.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.wav'):
            continue

        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)

        try:
            with wave.open(input_path, 'rb') as source_wave:
                n_channels = source_wave.getnchannels()
                sample_width = source_wave.getsampwidth()
                frame_rate = source_wave.getframerate()
                # Only multi-channel files need their samples loaded.
                frames = source_wave.readframes(source_wave.getnframes()) if n_channels > 1 else b""

            if n_channels == 1:
                #if the file is already mono, copy it to the output directory
                # shutil.copy instead of shelling out to `cp`: works with spaces
                # in paths and does not depend on a POSIX shell.
                shutil.copy(input_path, output_path)
                continue

            sample_dtype = pcm_dtypes.get(sample_width)
            if sample_dtype is None:
                print(f"Skipping {filename}: unsupported sample width of {sample_width} bytes")
                continue

            # One row per frame, one column per channel.
            samples = np.frombuffer(frames, dtype=sample_dtype).reshape(-1, n_channels)
            # Sum in int64 and divide: adding the channels in their own integer
            # type wraps around on loud samples, and a plain sum is not an average.
            mono_audio = (samples.sum(axis=1, dtype=np.int64) // n_channels).astype(sample_dtype)

            #create a new mono .wav file
            with wave.open(output_path, 'wb') as mono_wave:
                mono_wave.setnchannels(1)
                mono_wave.setsampwidth(sample_width)
                mono_wave.setframerate(frame_rate)
                mono_wave.writeframes(mono_audio.tobytes())

        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue

    print(f"Conversion complete. Mono files saved to {output_dir}")

# Build the mono copies of the AS audio.
AS_to_mono(input_dir=as_path, output_dir=as_mono_path)

