In [1]:
#set project path from config.yaml
import pathlib
import yaml

# Read the YAML configuration and resolve the project root directory from it.
with open("config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# The root directory is stored under project -> root_path in the config.
root_setting = config["project"]["root_path"]
project_root = pathlib.Path(root_setting)

print("Current project path: ", project_root)

Current project path:  D:\Data Repositories\Coughvid


In [2]:
# Input locations inside the project root: the directory that is globbed for
# *.wav recordings and the directory holding one <file_id>.json label per recording.
dset_wav_dir = project_root.joinpath("audio_wav_fulldset")
labels_dir = project_root.joinpath("labels_json")

In [3]:
#custom feature extraction function using pyaudioanalysis
import numpy as np
from pyAudioAnalysis import MidTermFeatures
from pyAudioAnalysis import audioBasicIO

def extract_features(Fs, x):
    """Summarise an audio signal as one fixed-length pyAudioAnalysis feature vector.

    Mid-term features are extracted with non-overlapping windows whose sizes
    are derived from the sampling rate: 1 second for the mid-term window/step
    and 50 ms for the short-term window/step (all expressed in samples).
    The first 34 rows of the resulting mid-term matrix are then averaged over
    all mid-term windows.

    Args:
        Fs: sampling rate of the signal in Hz.
        x: numpy array holding the audio samples.

    Returns:
        A 1-D numpy array with one value per kept feature row (34 values when
        the mid-term matrix has at least 34 rows), or None when ``x`` is empty.
    """
    # Nothing to analyse for an empty signal.
    if x.size == 0:
        return None

    # Window and step are equal at both levels, i.e. consecutive windows do not overlap.
    mid_len = int(1.0 * Fs)
    short_len = int(0.05 * Fs)
    mid_window, mid_step = mid_len, mid_len
    short_window, short_step = short_len, short_len

    mid_term_matrix, _short_term_matrix, _feature_names = (
        MidTermFeatures.mid_feature_extraction(
            x, Fs, mid_window, mid_step, short_window, short_step
        )
    )

    # Only the first 34 rows are used for now. Per the original author's note
    # these are the per-window means of the 34 base features, with the
    # standard deviations stored in later rows.
    # NOTE(review): confirm the row layout for the installed pyAudioAnalysis version.
    kept_rows = mid_term_matrix[:34, :]

    # Collapse the time axis: one averaged value per feature row.
    return kept_rows.mean(axis=1)



In [4]:
# SVMs cannot be trained with on-the-fly augmentation because the training dataframe has to be static.
# To save on storage space, K augmented versions of every wav are created in memory and only their
# features are added to the final DF. This gives us more control over the creation of the DF, and we
# don't have to go back and manually recreate thousands of files.
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, AddBackgroundNoise

def augment_audio(Fs, x):
    """Return one randomly augmented copy of an audio signal.

    A new audiomentations ``Compose`` pipeline is built on every call from
    three transforms, each constructed with ``p=0.5``:

    * ``AddGaussianNoise`` with amplitude between 0.001 and 0.015
    * ``TimeStretch`` with rate between 0.8 and 1.25
    * ``PitchShift`` between -2 and +2 semitones

    No random seed is set inside this function, so repeated calls on the same
    input generally give different results.

    Args:
        Fs: sampling rate, forwarded to the pipeline as ``sample_rate``.
        x: audio samples, forwarded to the pipeline as ``samples``.
            NOTE(review): audiomentations documents float32 input in [-1, 1];
            confirm what the wav reader actually yields.

    Returns:
        The pipeline's output for ``x``.
    """
    transforms = [
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
        TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
    ]
    pipeline = Compose(transforms)

    return pipeline(samples=x, sample_rate=Fs)


In [5]:
import json
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import random

# ---- Dataset-building configuration and bookkeeping ----

# Flat list of feature rows; filled from class_rows once extraction is done.
all_rows = []

k = 3  # number of augmented instances created per wav file

target_labels = ["healthy", "symptomatic", "COVID-19"]

# 1315 is the number of COVID-19 instances in the dataset and the lowest class count.
# To ensure class balance every class is undersampled to that number of recordings,
# and each recording contributes k augmented rows.
samples_per_class = 1315 * k

# One bucket of rows per target label.
class_rows = {label: [] for label in target_labels}

# Counters for scanned / skipped / extracted files.
stats = {
    "total_wavs_found": 0,
    "missing_json": 0,
    "empty_status": 0,
    "valid_extracted": 0,
    "class_full_skip": 0
}

# Fixed seed so that the undersampled subset is the same on every
# Restart & Run All.
# NOTE(review): audiomentations presumably draws from these same global RNGs,
# which would make the augmentations repeatable too - confirm.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# glob() order depends on the filesystem, so sort first; otherwise the seeded
# shuffle would still produce a different order on a different machine.
wav_files = sorted(dset_wav_dir.glob("*.wav"))
random.shuffle(wav_files)

stats["total_wavs_found"] = len(wav_files)

# Counts recordings/augmentations that could not be turned into a feature row.
# setdefault keeps this cell working when the stats dict lacks the key.
stats.setdefault("extraction_failed", 0)

# The loop can stop as soon as every target class has reached its quota.
total_quota = len(target_labels) * samples_per_class

# NOTE(review): all k augmented rows of one recording go into the same table and
# no file identifier is stored, so a later random train/test split can place
# variants of the same recording on both sides - consider keeping a group id.
for wav_path in tqdm(wav_files, desc="Creating training DF"):
    # Total rows collected so far across all classes
    current_total = sum(len(rows) for rows in class_rows.values())

    if current_total >= total_quota:
        break

    file_id = wav_path.stem
    label_path = labels_dir / f"{file_id}.json"

    if not label_path.exists():
        stats["missing_json"] += 1
        continue

    try:
        with open(label_path, 'r') as f:
            data = json.load(f)

        status_value = data.get('status')

        # Skip entries without a status (happens in a lot of cases) and
        # entries whose status is not one of the balanced target labels.
        if status_value not in target_labels:
            stats["empty_status"] += 1
            continue

        # Skip the file if this class already has enough rows.
        if len(class_rows[status_value]) >= samples_per_class:
            stats["class_full_skip"] += 1
            continue

        # Read the audio file.
        [Fs, x] = audioBasicIO.read_audio_file(str(wav_path))

        # Do not feed an empty signal to the augmentation pipeline.
        if x.size == 0:
            stats["extraction_failed"] += 1
            tqdm.write(f"Skipping {file_id}: no audio samples could be read")
            continue

        for _ in range(k):
            # Check the quota again inside the loop to prevent overfilling.
            if len(class_rows[status_value]) >= samples_per_class:
                break

            # The augmentation introduces randomness on each call.
            augmented_x = augment_audio(Fs, x)

            # Custom feature extraction; returns None for an empty signal.
            pyaudio_features = extract_features(Fs, augmented_x)
            if pyaudio_features is None:
                stats["extraction_failed"] += 1
                tqdm.write(f"Skipping one augmentation of {file_id}: empty signal")
                continue

            # Feature values followed by the label as the last column.
            new_row = list(pyaudio_features) + [status_value]
            class_rows[status_value].append(new_row)
            stats["valid_extracted"] += 1

    except Exception as e:
        # Best effort: a single broken file must not abort the whole run.
        # tqdm.write keeps the progress bar intact, unlike print.
        stats["extraction_failed"] += 1
        tqdm.write(f"Error processing {file_id}: {e}")

if stats["extraction_failed"]:
    print(f"Files/augmentations that failed extraction: {stats['extraction_failed']}")

# Merge the per-class buckets into all_rows, keeping the order of target_labels.
all_rows.extend(row for label in target_labels for row in class_rows[label])

# Final summary of what was scanned, skipped and kept.
summary_lines = [
    "\n--- Processing Summary ---",
    f"Total .wav files scanned: {stats['total_wavs_found']}",
    f"Files skipped (JSON missing): {stats['missing_json']}",
    f"Files skipped (Status empty/invalid): {stats['empty_status']}",
    f"Files skipped (Class quota reached): {stats['class_full_skip']}",
    f"Successfully added to DF: {len(all_rows)}",
]
for line in summary_lines:
    print(line)

# Per-class row counts.
for label in target_labels:
    n_rows = len(class_rows[label])
    print(f" - {label}: {n_rows} samples")

# One row per augmented sample: feature values first, label in the last column.
final_df = pd.DataFrame(all_rows)

Creating training DF: 100%|█████████▉| 20660/20664 [29:44<00:00, 11.58it/s] 


--- Processing Summary ---
Total .wav files scanned: 20664
Files skipped (JSON missing): 0
Files skipped (Status empty/invalid): 0
Files skipped (Class quota reached): 16715
Successfully added to DF: 11835
 - healthy: 3945 samples
 - symptomatic: 3945 samples
 - COVID-19: 3945 samples





In [8]:
# The label is stored in the last column of the table.
status_column = final_df.iloc[:, -1]

# Distinct label values present in the table
print("Unique Status Values:")
print(status_column.unique())

# Row count per label, to check for class imbalance
print("\nValue Counts:")
print(status_column.value_counts())

Unique Status Values:
['healthy' 'symptomatic' 'COVID-19']

Value Counts:
34
healthy        3945
symptomatic    3945
COVID-19       3945
Name: count, dtype: int64


In [9]:
# Persist the feature table to the working directory; the row index is not written.
output_csv = "dset_augmented.csv"
final_df.to_csv(output_csv, index=False)