In [1]:
#set project path from config.yaml
import pathlib
import yaml

with open("config.yaml", "r") as f:
   config = yaml.safe_load(f)

project_root = pathlib.Path(config["project"]["root_path"])

print("Current project path: ", project_root)

Current project path:  C:\Users\Konstantinos\Desktop\Coughvid Data


In [3]:
dset_wav_dir = project_root / "audio_wav_fulldset"
labels_dir = project_root / "labels_json"

In [6]:
import numpy as np
from pyAudioAnalysis import MidTermFeatures
from pyAudioAnalysis import audioBasicIO

def extract_features(path):
    [Fs, x] = audioBasicIO.read_audio_file(str(path))

    if x.size == 0:
        return None 

    #standardization based on sampling rate
    #1 sec mid length and 50ms short length as a safe bet when it comes to steps
    mid_window = int(1.0 * Fs)
    mid_step = int(1.0 * Fs)
    short_window = int(0.05 * Fs)
    short_step = int(0.05 * Fs)


    mt_f, st_f, mt_names = MidTermFeatures.mid_feature_extraction(
        x, Fs, mid_window, mid_step, short_window, short_step
    )

    #68 rows total, 34 means of the features as described in the docs and the other 34 are STDs
        #only using means for now
    feature_vector = np.mean(mt_f[:34, :], axis=1)

    return feature_vector, mt_names

In [None]:
import json
from pathlib import Path
from tqdm import tqdm
import pandas as pd

all_rows = []

#set max len of dataset 
    #TODO ensure class balance
dset_size = 10

#counting missing skipped etc
stats = {
    "total_wavs_found": 0,
    "missing_json": 0,
    "empty_status": 0,
    "valid_extracted": 0
}

wav_files = list(dset_wav_dir.glob("*.wav"))
stats["total_wavs_found"] = len(wav_files)

for wav_path in tqdm(wav_files, desc="Creating training DF"):
    if len(all_rows) >= dset_size:
        break
        
    file_id = wav_path.stem

    label_path = labels_dir / f"{file_id}.json"
    
    if not label_path.exists():
        stats["missing_json"] += 1
        continue
    
    try:

        with open(label_path, 'r') as f:
            data = json.load(f)
            
    
        status_value = data.get('status')
        
        #if no assigned status (happens in a lot of cases, skip entry and move to next)
        if status_value is None or str(status_value).strip() == "":
            stats["empty_status"] += 1
            continue
            
        #call the custom function for mid extraction
        pyaudio_features = extract_features(str(wav_path))
        
        #create the final df row w status 
        new_row = list(pyaudio_features) + [status_value]
        all_rows.append(new_row)
        stats["valid_extracted"] += 1
        
    except Exception as e:
        print(f"Error processing {file_id}: {e}")

# Final Summary
print("\n--- Processing Summary ---")
print(f"Total .wav files scanned: {stats['total_wavs_found']}")
print(f"Files skipped (JSON missing): {stats['missing_json']}")
print(f"Files skipped (Status empty): {stats['empty_status']}")
print(f"Successfully added to DF: {stats['valid_extracted']}")

final_df = pd.DataFrame(all_rows)




Creating training DF:   0%|          | 15/34434 [00:02<1:23:51,  6.84it/s]



--- Processing Summary ---
Total .wav files scanned: 34434
Files skipped (JSON missing): 0
Files skipped (Status empty): 5
Successfully added to DF: 10


ValueError: 136 columns passed, passed data had 35 columns

In [None]:
# Show unique values
print("Unique Status Values:")
print(final_df.iloc[:, -1].unique())

# Better yet, show the count of each to check for class imbalance
print("\nValue Counts:")
print(final_df.iloc[:, -1].value_counts())



Unique Status Values:
['healthy' 'COVID-19' 'symptomatic']

Value Counts:
34
healthy        84
symptomatic    12
COVID-19        4
Name: count, dtype: int64


In [15]:
final_df.to_csv("toy_dset.csv", index = False)