In [1]:
#set project path from config.yaml
import pathlib
import yaml

# Load the project settings and resolve the dataset root stored under
# project -> root_path.
config_path = pathlib.Path("config.yaml")
with config_path.open("r") as config_file:
    config = yaml.safe_load(config_file)

root_setting = config["project"]["root_path"]
project_root = pathlib.Path(root_setting)

print("Current project path: ", project_root)

Current project path:  D:\Data Repositories\Coughvid


In [2]:
#Dataset currently has 3 file formats
    #-webm
    #-ogg
    #-wav

#WAV is uncompressed and has better compatibility with librosa and pyAudioAnalysis. ogg and webm need different libraries, which would cause
    #a lot of unnecessary trouble during processing. Here we'll convert all the .ogg and .webm files to .wav to make signal analysis and subsequent feature extraction easier.

# Source folders under the project root: one per raw audio format, plus the
# folder holding the per-recording label JSON files.
webm_dir = project_root.joinpath("audio_webm")
ogg_dir = project_root.joinpath("audio_ogg")
wav_dir = project_root.joinpath("audio_wav")
json_dir = project_root.joinpath("labels_json")

In [3]:
#create a new directory to move all the new and old .wav files in 
import pathlib 

# Single destination folder for every standardized .wav file. exist_ok=True
# makes the call harmless when the folder is already there.
wav_fulldset_dir = project_root.joinpath("audio_wav_fulldset")
wav_fulldset_dir.mkdir(parents=True, exist_ok=True)

In [4]:
import os
import json
import subprocess
from pathlib import Path
from tqdm import tqdm
import imageio_ffmpeg as ffbin

# Path of the ffmpeg executable reported by imageio_ffmpeg; every conversion
# cell below shells out to it.
ffmpeg_exe = ffbin.get_ffmpeg_exe()

# Label filter: only recordings whose JSON "status" is one of these values are
# kept. Many recordings are unlabelled; they are excluded from training/eval,
# and each conversion cell prints how many samples it dropped.
target_labels = [
    "healthy",
    "symptomatic",
    "COVID-19",
]

# Folder searched for "<file_id>.json" label files.
labels_path_dir = Path(json_dir)

In [5]:
#---WAV DOWNSAMPLE 8KHZ MONO---
# Re-encode every labelled .wav in wav_dir to 8 kHz, mono, 16-bit PCM and write
# the result to wav_fulldset_dir under the same filename. Files without a label
# JSON, or whose "status" is not in target_labels, are skipped and counted.
# Failures are counted too, so the summary adds up to the number of input files.
os.makedirs(wav_fulldset_dir, exist_ok=True)

#list all wav files in the source directory
files = [f for f in os.listdir(wav_dir) if f.lower().endswith('.wav')]

#counts
processed_count = 0
stats = {
    "missing_json": 0,
    "empty_status": 0,
    "bad_json": 0,  # label file exists but is not valid JSON
    "failed": 0,    # ffmpeg (or any other step) raised for this file
}

for filename in tqdm(files, desc="Standardizing WAVs to 8kHz"):
    src_file = os.path.join(wav_dir, filename)
    dst_file = os.path.join(wav_fulldset_dir, filename)

    # --- METADATA CHECK ---
    file_id = Path(filename).stem
    label_path = labels_path_dir / f"{file_id}.json"

    if not label_path.exists():
        stats["missing_json"] += 1
        continue

    try:
        with open(label_path, 'r') as f:
            data = json.load(f)

        status_value = data.get('status')

        #if no assigned status (or one outside target_labels) skip entry
        if status_value not in target_labels:
            stats["empty_status"] += 1
            continue
        # --- END METADATA CHECK ---

        # -acodec pcm_s16le: Ensures standard 16-bit WAV format
        # stderr is captured (not discarded) so a failure can report ffmpeg's own message.
        subprocess.run([
            ffmpeg_exe, "-y", "-i", src_file,
            "-ar", "8000", # 8KHz regardless to avoid headaches with sampling rates
            "-ac", "1", #explicitly to mono, no need for stereo for training SVMs
            "-acodec", "pcm_s16le",
            dst_file
        ], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, check=True)

        processed_count += 1

    except json.JSONDecodeError:
        stats["bad_json"] += 1
        print(f"Error reading JSON for {filename}")
    except subprocess.CalledProcessError as e:
        stats["failed"] += 1
        # A failed run may leave a truncated file behind; remove it so it
        # cannot end up in the training data.
        try:
            if os.path.exists(dst_file):
                os.remove(dst_file)
        except OSError as cleanup_error:
            print(f"Could not remove partial output {dst_file}: {cleanup_error}")
        ffmpeg_log = (e.stderr or b"").decode(errors="replace").strip()
        ffmpeg_reason = ffmpeg_log.splitlines()[-1] if ffmpeg_log else "no ffmpeg output"
        print(f"Error processing {filename}: {e} ({ffmpeg_reason})")
    except Exception as e:
        stats["failed"] += 1
        print(f"Unexpected error with {filename}: {e}")

print("-" * 30)
print("Processing Complete.")
print(f"Total files standardized and moved to full dataset: {processed_count}")
print(f"Files skipped (JSON missing): {stats['missing_json']}")
print(f"Files skipped (Status empty/invalid): {stats['empty_status']}")
print(f"Files skipped (JSON unreadable): {stats['bad_json']}")
print(f"Files failed (conversion error): {stats['failed']}")

Standardizing WAVs to 8kHz: 100%|██████████| 3309/3309 [01:09<00:00, 47.61it/s]

------------------------------
Processing Complete.
Total files standardized and moved to full dataset: 2202
Files skipped (JSON missing): 0
Files skipped (Status empty/invalid): 1107





In [6]:
#--WEBM FILES CONVERT TO WAV---
# Convert every labelled .webm in webm_dir to an 8 kHz, mono, 16-bit PCM .wav
# in wav_fulldset_dir (same stem, .wav extension). Files without a label JSON,
# or whose "status" is not in target_labels, are skipped and counted. Failures
# are counted too, so the summary adds up to the number of input files.
webm_files = [f for f in os.listdir(webm_dir) if f.lower().endswith('.webm')]

stats = {
    "missing_json": 0,
    "empty_status": 0,
    "converted": 0,
    "bad_json": 0,  # label file exists but is not valid JSON
    "failed": 0,    # ffmpeg (or any other step) raised for this file
}

# Loop through files and convert
for filename in tqdm(webm_files, desc="Converting WebM to WAV"):
    input_path = os.path.join(webm_dir, filename)
    # Change extension to .wav for the output filename
    file_id = Path(filename).stem
    output_filename = file_id + ".wav"
    output_path = os.path.join(wav_fulldset_dir, output_filename)

    # --- START Metadata checking logic ---
    label_path = labels_path_dir / f"{file_id}.json"

    if not label_path.exists():
        stats["missing_json"] += 1
        continue

    try:
        with open(label_path, 'r') as f:
            data = json.load(f)

        status_value = data.get('status')

        # if no assigned status (happens in a lot of cases, skip entry and move to next)
        # or if the status is not in our target balanced list
        if status_value not in target_labels:
            stats["empty_status"] += 1
            continue
        # --- END Metadata checking logic ---

        #-vn: disable video (safety for webm)
        # stderr is captured (not discarded) so a failure can report ffmpeg's own message.
        subprocess.run([
            ffmpeg_exe, "-y", "-i", input_path,
            "-vn", "-acodec", "pcm_s16le", "-ar",
            "8000", #EXPLICITLY subsampling the entire dset to 8khz to save memory and since the
                    #input data is phone recordings which are already low quality, the extra bandwidth won't help much
            "-ac", "1",
            output_path
        ], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, check=True)

        stats["converted"] += 1

    except json.JSONDecodeError:
        stats["bad_json"] += 1
        print(f"Error reading JSON for {filename}")
    except subprocess.CalledProcessError as e:
        stats["failed"] += 1
        # A failed run may leave a truncated file behind; remove it so it
        # cannot end up in the training data.
        try:
            if os.path.exists(output_path):
                os.remove(output_path)
        except OSError as cleanup_error:
            print(f"Could not remove partial output {output_path}: {cleanup_error}")
        ffmpeg_log = (e.stderr or b"").decode(errors="replace").strip()
        ffmpeg_reason = ffmpeg_log.splitlines()[-1] if ffmpeg_log else "no ffmpeg output"
        print(f"Error processing {filename}: {e} ({ffmpeg_reason})")
    except Exception as e:
        stats["failed"] += 1
        print(f"Error processing {filename}: {e}")

print("-" * 30)
print(f"WebM conversion complete. Files saved to: {wav_fulldset_dir}")
print(f"Total converted: {stats['converted']}")
print(f"Skipped (Missing JSON): {stats['missing_json']}")
print(f"Skipped (Invalid Status): {stats['empty_status']}")
print(f"Skipped (Unreadable JSON): {stats['bad_json']}")
print(f"Failed (Conversion error): {stats['failed']}")

Converting WebM to WAV:   0%|          | 0/29348 [00:00<?, ?it/s]

Converting WebM to WAV: 100%|██████████| 29348/29348 [16:06<00:00, 30.36it/s]

------------------------------
WebM conversion complete. Files saved to: D:\Data Repositories\Coughvid\audio_wav_fulldset
Total converted: 17344
Skipped (Missing JSON): 0
Skipped (Invalid Status): 12004





In [7]:
#--OGG FILES CONVERT TO WAV---
# Convert every labelled .ogg in ogg_dir to an 8 kHz, mono, 16-bit PCM .wav
# in wav_fulldset_dir (same stem, .wav extension). Files without a label JSON,
# or whose "status" is not in target_labels, are skipped and counted. Failures
# are counted too, so the summary adds up to the number of input files.
ogg_files = [f for f in os.listdir(ogg_dir) if f.lower().endswith('.ogg')]

stats = {
    "missing_json": 0,
    "empty_status": 0,
    "converted": 0,
    "bad_json": 0,  # label file exists but is not valid JSON
    "failed": 0,    # ffmpeg (or any other step) raised for this file
}

for filename in tqdm(ogg_files, desc="Converting OGG to WAV"):
    input_path = os.path.join(ogg_dir, filename)

    file_id = Path(filename).stem
    output_filename = file_id + ".wav"
    output_path = os.path.join(wav_fulldset_dir, output_filename)

    #--- METADATA CHECK ---
    label_path = labels_path_dir / f"{file_id}.json"

    if not label_path.exists():
        stats["missing_json"] += 1
        continue

    try:
        with open(label_path, 'r') as f:
            data = json.load(f)

        status_value = data.get('status')

        # if no assigned status (happens in a lot of cases, skip entry and move to next)
        # or if the status is not in our target balanced list
        if status_value not in target_labels:
            stats["empty_status"] += 1
            continue
        #--- END METADATA CHECK ---

        # stderr is captured (not discarded) so a failure can report ffmpeg's own message.
        subprocess.run([
            ffmpeg_exe, "-y", "-i", input_path,
            "-acodec", "pcm_s16le",
            "-ar", "8000", # Resample to 8kHz
            "-ac", "1",    # Downmix to mono
            output_path
        ], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, check=True)

        stats["converted"] += 1

    except json.JSONDecodeError:
        stats["bad_json"] += 1
        print(f"Error reading JSON for {filename}")
    except subprocess.CalledProcessError as e:
        stats["failed"] += 1
        # A failed run may leave a truncated file behind; remove it so it
        # cannot end up in the training data.
        try:
            if os.path.exists(output_path):
                os.remove(output_path)
        except OSError as cleanup_error:
            print(f"Could not remove partial output {output_path}: {cleanup_error}")
        ffmpeg_log = (e.stderr or b"").decode(errors="replace").strip()
        ffmpeg_reason = ffmpeg_log.splitlines()[-1] if ffmpeg_log else "no ffmpeg output"
        print(f"Error processing {filename}: {e} ({ffmpeg_reason})")
    except Exception as e:
        stats["failed"] += 1
        print(f"Error processing {filename}: {e}")

print("-" * 30)
print(f"OGG conversion complete. Files saved to: {wav_fulldset_dir}")
print(f"Total converted: {stats['converted']}")
print(f"Skipped (Missing JSON): {stats['missing_json']}")
print(f"Skipped (Invalid Status): {stats['empty_status']}")
print(f"Skipped (Unreadable JSON): {stats['bad_json']}")
print(f"Failed (Conversion error): {stats['failed']}")

Converting OGG to WAV: 100%|██████████| 1777/1777 [01:44<00:00, 16.96it/s]

------------------------------
OGG conversion complete. Files saved to: D:\Data Repositories\Coughvid\audio_wav_fulldset
Total converted: 1118
Skipped (Missing JSON): 0
Skipped (Invalid Status): 659





In [8]:
#delete previous folders with raw audio to save space
import shutil

# Delete the raw-format folders only once the converted dataset is confirmed to
# exist. rmtree is irreversible, and the conversion cells report errors without
# stopping, so an empty output folder would mean the raw audio is the only copy.
has_converted_wavs = wav_fulldset_dir.is_dir() and any(
    entry.suffix.lower() == ".wav" for entry in wav_fulldset_dir.iterdir()
)

if has_converted_wavs:
    for raw_dir in (webm_dir, ogg_dir, wav_dir):
        # is_dir() is False for a missing path, so re-running this cell after
        # the folders are gone does nothing.
        if raw_dir.is_dir():
            shutil.rmtree(raw_dir)
else:
    print(f"No converted .wav files found in {wav_fulldset_dir}; raw audio folders were NOT deleted.")