In [4]:
#set project path from config.yaml
import pathlib
import yaml

with open("config.yaml", "r") as f:
   config = yaml.safe_load(f)

project_root = pathlib.Path(config["project"]["root_path"])

print("Current project path: ", project_root)

Current project path:  D:\Data Repositories\Coughvid


In [5]:
dset_wav_dir = project_root / "audio_wav_fulldset"
labels_dir = project_root / "labels_json"

In [9]:
import numpy as np
from pyAudioAnalysis import MidTermFeatures
from pyAudioAnalysis import audioBasicIO

def extract_features(path):
    [Fs, x] = audioBasicIO.read_audio_file(str(path))

    if x.size == 0:
        return None 

    # Standardization based on sampling rate
    mid_window = int(1.0 * Fs)
    mid_step = int(1.0 * Fs)
    short_window = int(0.05 * Fs)
    short_step = int(0.05 * Fs)

    # mt_f shape is (136, n_windows)
    # The first 68 rows are means and stds of the 34 short-term features
    mt_f, st_f, mt_names = MidTermFeatures.mid_feature_extraction(
        x, Fs, mid_window, mid_step, short_window, short_step
    )

    # 1. Slice the first 34 rows (the means of the short-term features)
    # 2. Average across the columns (the mid-term windows) 
    # This results in a 1D array of 34 values
    feature_vector = np.mean(mt_f[:34, :], axis=1)

    return feature_vector

In [7]:
#SVMs cannot be trained with on-the-fly augmentation since the dataframe has to be static
    #to save on storage space, K augmented versions of every wav will be created and their features will be added to the final DF 
    #this way we have more control over the creation of the DF and we don't have to go back and manually recreate thousands of files 
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, AddBackgroundNoise

def augment_audio(Fs, x):

    augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
    ]) 

    augmented_x = augment(samples=x, sample_rate=Fs)

    return augmented_x


In [13]:
import json
from pathlib import Path
from tqdm import tqdm
import pandas as pd

all_rows = []
LIMIT = 100

# Diagnostic Counters
stats = {
    "total_wavs_found": 0,
    "missing_json": 0,
    "empty_status": 0,
    "valid_extracted": 0
}

wav_files = list(dset_wav_dir.glob("*.wav"))
stats["total_wavs_found"] = len(wav_files)

for wav_path in tqdm(wav_files, desc="Creating training DF"):
    """"
    if len(all_rows) >= LIMIT:
        break
    """

    file_id = wav_path.stem
    # Changed extension to .json
    label_path = labels_dir / f"{file_id}.json"
    
    # Check 1: Does JSON exist?
    if not label_path.exists():
        stats["missing_json"] += 1
        continue
    
    try:
        # Load JSON file
        with open(label_path, 'r') as f:
            data = json.load(f)
            
        # Check 2: Get "status" from the dictionary
        status_value = data.get('status')
        
        if status_value is None or str(status_value).strip() == "":
            stats["empty_status"] += 1
            continue
            
        # 2. Success: Extract Features
        pyaudio_features = extract_features(str(wav_path))
        
        # Combine features and status
        new_row = list(pyaudio_features) + [status_value]
        all_rows.append(new_row)
        stats["valid_extracted"] += 1
        
    except Exception as e:
        print(f"Error processing {file_id}: {e}")

# Final Summary
print("\n--- Processing Summary ---")
print(f"Total .wav files scanned: {stats['total_wavs_found']}")
print(f"Files skipped (JSON missing): {stats['missing_json']}")
print(f"Files skipped (Status empty): {stats['empty_status']}")
print(f"Successfully added to DF: {stats['valid_extracted']}")

final_df = pd.DataFrame(all_rows)

Creating training DF: 100%|██████████| 20593/20593 [34:42<00:00,  9.89it/s]


--- Processing Summary ---
Total .wav files scanned: 20593
Files skipped (JSON missing): 0
Files skipped (Status empty): 0
Successfully added to DF: 20593





In [14]:
print(final_df.head)

<bound method NDFrame.head of              0         1         2         3         4         5         6   \
0      0.028471  0.002779  3.146846  0.053351  0.043814  0.273602  0.050972   
1      0.133642  0.002768  2.869909  0.309003  0.245174  1.881792  0.012808   
2      0.218935  0.005202  2.897677  0.297827  0.222117  1.665932  0.014579   
3      0.160827  0.027925  3.024841  0.277583  0.234702  1.273127  0.021699   
4      0.217478  0.005380  3.094527  0.390247  0.249535  2.143341  0.037738   
...         ...       ...       ...       ...       ...       ...       ...   
20588  0.301466  0.002803  3.036267  0.402742  0.265695  2.596700  0.006148   
20589  0.225084  0.001692  2.868786  0.343297  0.231645  1.718931  0.017871   
20590  0.275063  0.021999  2.931340  0.328595  0.155308  1.408996  0.126350   
20591  0.219698  0.029114  2.871901  0.301677  0.220506  1.511766  0.016517   
20592  0.291479  0.006313  2.698703  0.358840  0.260377  1.973697  0.011696   

             7       

In [15]:
final_df.to_csv("full_dset.csv", index = False)