In [1]:
#set project path from config.yaml
import pathlib
import yaml

with open("config.yaml", "r") as f:
   config = yaml.safe_load(f)

project_root = pathlib.Path(config["project"]["root_path"])

print("Current project path: ", project_root)

Current project path:  C:\Users\Konstantinos\Desktop\Coughvid Data


In [2]:
dset_wav_dir = project_root / "audio_wav_fulldset"
labels_dir = project_root / "labels_json"

In [3]:
import numpy as np
from pyAudioAnalysis import MidTermFeatures
from pyAudioAnalysis import audioBasicIO

def extract_features(path):
    [Fs, x] = audioBasicIO.read_audio_file(str(path))

    if x.size == 0:
        return None 

    #standardization based on sampling rate
    #1 sec mid length and 50ms short length as a safe bet when it comes to steps
    mid_window = int(1.0 * Fs)
    mid_step = int(1.0 * Fs)
    short_window = int(0.05 * Fs)
    short_step = int(0.05 * Fs)


    mt_f, st_f, mt_names = MidTermFeatures.mid_feature_extraction(
        x, Fs, mid_window, mid_step, short_window, short_step
    )

    #68 rows total, 34 means of the features as described in the docs and the other 34 are STDs
        #only using means for now
    feature_vector = np.mean(mt_f[:34, :], axis=1)

    return feature_vector, mt_names



In [4]:
import json
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import random

all_rows = []

# set max len of dataset per class
# TODO ensure class balance - DONE
target_labels = ["healthy", "symptomatic", "COVID-19"]
samples_per_class = 1500
class_rows = {label: [] for label in target_labels}

# counting missing skipped etc
stats = {
    "total_wavs_found": 0,
    "missing_json": 0,
    "empty_status": 0,
    "valid_extracted": 0,
    "class_full_skip": 0
}

wav_files = list(dset_wav_dir.glob("*.wav"))
# Optional: Shuffle to ensure the 1500 samples are random, not just the first 1500 found
random.shuffle(wav_files) 

stats["total_wavs_found"] = len(wav_files)

for wav_path in tqdm(wav_files, desc="Creating training DF"):
    # Exit if all classes have reached the target
    if all(len(rows) >= samples_per_class for rows in class_rows.values()):
        break
        
    file_id = wav_path.stem

    label_path = labels_dir / f"{file_id}.json"
    
    if not label_path.exists():
        stats["missing_json"] += 1
        continue
    
    try:
        with open(label_path, 'r') as f:
            data = json.load(f)
            
        status_value = data.get('status')
        
        # if no assigned status (happens in a lot of cases, skip entry and move to next)
        # or if the status is not in our target balanced list
        if status_value not in target_labels:
            stats["empty_status"] += 1
            continue
            
        # Check if we already have enough for this specific class
        if len(class_rows[status_value]) >= samples_per_class:
            stats["class_full_skip"] += 1
            continue
            
        # call the custom function for mid extraction
        pyaudio_features = extract_features(str(wav_path))
        
        # create the final df row w status 
        new_row = list(pyaudio_features) + [status_value]
        class_rows[status_value].append(new_row)
        stats["valid_extracted"] += 1
        
    except Exception as e:
        print(f"Error processing {file_id}: {e}")

# Flatten the dictionary of lists into the final all_rows list
for label in target_labels:
    all_rows.extend(class_rows[label])

# Final Summary
print("\n--- Processing Summary ---")
print(f"Total .wav files scanned: {stats['total_wavs_found']}")
print(f"Files skipped (JSON missing): {stats['missing_json']}")
print(f"Files skipped (Status empty/invalid): {stats['empty_status']}")
print(f"Files skipped (Class quota reached): {stats['class_full_skip']}")
print(f"Successfully added to DF: {len(all_rows)}")

# Breakdown of final counts
for label in target_labels:
    print(f" - {label}: {len(class_rows[label])} samples")

final_df = pd.DataFrame(all_rows)

Creating training DF: 100%|██████████| 34434/34434 [13:24<00:00, 42.81it/s] 


--- Processing Summary ---
Total .wav files scanned: 34434
Files skipped (JSON missing): 0
Files skipped (Status empty/invalid): 13770
Files skipped (Class quota reached): 16349
Successfully added to DF: 4315
 - healthy: 1500 samples
 - symptomatic: 1500 samples
 - COVID-19: 1315 samples





In [5]:
# Show unique values
print("Unique Status Values:")
print(final_df.iloc[:, -1].unique())

# Better yet, show the count of each to check for class imbalance
print("\nValue Counts:")
print(final_df.iloc[:, -1].value_counts())



Unique Status Values:
['healthy' 'symptomatic' 'COVID-19']

Value Counts:
2
healthy        1500
symptomatic    1500
COVID-19       1315
Name: count, dtype: int64


In [6]:
final_df.to_csv("toy_dset.csv", index = False)