In [1]:
#set project path from config.yaml
import pathlib
import yaml

with open("config.yaml", "r") as f:
   config = yaml.safe_load(f)

project_root = pathlib.Path(config["project"]["root_path"])

print("Current project path: ", project_root)

Current project path:  D:\Data Repositories\Coughvid


In [2]:
dset_wav_dir = project_root / "audio_wav_fulldset"
labels_dir = project_root / "labels_json"

In [3]:
#custom feature extraction function using pyaudioanalysis
import numpy as np
from pyAudioAnalysis import MidTermFeatures
from pyAudioAnalysis import audioBasicIO

def extract_features(Fs, x):

    if x.size == 0:
        return None 

    #standardization based on sampling rate
    #1 sec mid length and 50ms short length as a safe bet when it comes to steps

    mid_window = int(1.0 * Fs)
    mid_step = int(1.0 * Fs)
    short_window = int(0.05 * Fs)
    short_step = int(0.05 * Fs)


    mt_f, st_f, mt_names = MidTermFeatures.mid_feature_extraction(
        x, Fs, mid_window, mid_step, short_window, short_step
    )

    #68 rows total, 34 means of the features as described in the docs and the other 34 are STDs
        #only using means for now
    feature_vector = np.mean(mt_f[:34, :], axis=1)

    return feature_vector



In [4]:
#SVMs cannot be trained with on-the-fly augmentation since the dataframe has to be static
    #to save on storage space, K augmented versions of every wav will be created and their features will be added to the final DF 
    #this way we have more control over the creation of the DF and we don't have to go back and manually recreate thousands of files 
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, AddBackgroundNoise

def augment_audio(Fs, x):

    augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
    ]) 

    augmented_x = augment(samples=x, sample_rate=Fs)

    return augmented_x


In [5]:
import json
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import random
from sklearn.model_selection import train_test_split

# --- STEP 1: PRE-SCAN LABELS ---
file_to_status = {}
wav_files = list(dset_wav_dir.glob("*.wav"))

print("Scanning labels for stratification...")
for wav_path in wav_files:
    label_path = labels_dir / f"{wav_path.stem}.json"
    if label_path.exists():
        with open(label_path, 'r') as f:
            data = json.load(f)
            status = data.get('status')
            if status in ["healthy", "symptomatic", "COVID-19"]:
                file_to_status[wav_path.stem] = status

# Convert to Series for easy splitting
ids = list(file_to_status.keys())
labels = [file_to_status[i] for i in ids]

# --- STEP 2: STRATIFIED SPLIT ---
# This ensures valid_ids has the same 96/2/0.8 ratio as the original population
train_ids, valid_ids = train_test_split(
    ids, test_size=0.20, stratify=labels, random_state=42
)
valid_ids_set = set(valid_ids) # faster lookup

# --- STEP 3: CALCULATE TRAINING QUOTAS ---
k = 3
# Undersample training to 80% of the smallest class (COVID-19)
# 1315 * 0.8 = 1052. Then apply augmentation factor k.
samples_per_class = int(1315 * 0.8) * k 

train_rows = {label: [] for label in ["healthy", "symptomatic", "COVID-19"]}
valid_rows = []

stats = {"total": len(wav_files), "valid_added": 0, "train_added": 0}

Scanning labels for stratification...


In [6]:
# --- PHASE 1: VALIDATION COLLECTION ---
valid_rows = []
stats["valid_added"] = 0

# Create a filtered list of wav paths that belong to validation
valid_wav_paths = [p for p in wav_files if p.stem in valid_ids_set]

v_pbar = tqdm(valid_wav_paths, desc="Phase 1: Validation Set")
for wav_path in v_pbar:
    status_value = file_to_status[wav_path.stem]
    
    try:
        [Fs, x] = audioBasicIO.read_audio_file(str(wav_path))
        features = extract_features(Fs, x) 
        valid_rows.append(list(features) + [status_value])
        stats["valid_added"] += 1
        
        # Update progress with class counts for validation
        v_pbar.set_postfix({"Total Val": stats["valid_added"]})
    except Exception as e:
        print(f"Error on {wav_path.stem}: {e}")

df_valid = pd.DataFrame(valid_rows)

Phase 1: Validation Set: 100%|██████████| 4133/4133 [07:39<00:00,  9.00it/s, Total Val=4133]


In [7]:
print(df_valid.head)

<bound method NDFrame.head of             0         1         2         3         4         5         6   \
0     0.133642  0.002768  2.869909  0.309003  0.245174  1.881792  0.012808   
1     0.056516  0.003085  3.055794  0.113271  0.066920  0.613245  0.110051   
2     0.148417  0.024882  2.735781  0.244915  0.185135  1.123642  0.099260   
3     0.240110  0.005646  2.957414  0.369249  0.276009  1.775163  0.009528   
4     0.028960  0.000527  3.171525  0.068770  0.040655  0.364464  0.036303   
...        ...       ...       ...       ...       ...       ...       ...   
4128  0.095542  0.002295  2.968932  0.161171  0.127174  0.725894  0.069822   
4129  0.197306  0.000872  3.032930  0.311191  0.255449  1.814753  0.041129   
4130  0.340376  0.035111  2.946380  0.398258  0.262767  2.126538  0.006993   
4131  0.225084  0.001692  2.868786  0.343297  0.231645  1.718931  0.017871   
4132  0.275063  0.021999  2.931340  0.328595  0.155308  1.408996  0.126350   

            7          8         

In [11]:
# Define your categories and parameters at the top
target_labels = ["healthy", "symptomatic", "COVID-19"]
k = 3 


augmentation = False

# Calculate the quota for training (80% of the minority class 1315)
# Multiplied by k because each file generates k augmented instances
if(augmentation):
    samples_per_class = int(1315 * 0.8) * k
else:
    samples_per_class = int(1315 * 0.8)

In [12]:
 # --- PHASE 2: TRAINING COLLECTION ---

train_rows = {label: [] for label in target_labels}

stats["train_added"] = 0

# Filter for files belonging to training
train_wav_paths = [p for p in wav_files if p.stem in train_ids]
random.shuffle(train_wav_paths) # Shuffle for random undersampling

t_pbar = tqdm(train_wav_paths, desc="Phase 2: Training Set")

for wav_path in t_pbar:
    # Check if ALL class quotas are met to exit early
    current_counts = {l: len(rows) for l, rows in train_rows.items()}
    if all(count >= samples_per_class for count in current_counts.values()):
        print("\nAll training quotas reached. Stopping.")
        break
    # Update TQDM with current training balance
    t_pbar.set_postfix({
        "H": current_counts["healthy"],
        "S": current_counts["symptomatic"],
        "C": current_counts["COVID-19"]
    })

    status_value = file_to_status[wav_path.stem]   
    # Only process if this specific class still needs samples

    if current_counts[status_value] < samples_per_class:

        if(augmentation):
            try:
                [Fs, x] = audioBasicIO.read_audio_file(str(wav_path))
                for i in range(k):
                    if len(train_rows[status_value]) >= samples_per_class:
                        break

                    # Apply augmentation only to training
                    x = augment_audio(Fs, x)
                    features = extract_features(Fs, x)
                    train_rows[status_value].append(list(features) + [status_value])
                    stats["train_added"] += 1

            except Exception as e:
                print(f"Error on {wav_path.stem}: {e}")
        else:
            try:
                [Fs, x] = audioBasicIO.read_audio_file(str(wav_path))
                features = extract_features(Fs, x)
                train_rows[status_value].append(list(features) + [status_value])
                stats["train_added"] += 1   
            except Exception as e:
                print(f"Error on {wav_path.stem}: {e}")



# Flatten and create DataFrame
all_train = [row for label_list in train_rows.values() for row in label_list]

df_train = pd.DataFrame(all_train) 

Phase 2: Training Set: 100%|█████████▉| 16486/16531 [06:53<00:01, 39.89it/s, H=1052, S=1052, C=1051] 


All training quotas reached. Stopping.





In [13]:
print("\nValue Counts:")
print(df_valid.iloc[:, -1].value_counts())

print("\nValue Counts:")
print(df_train.iloc[:, -1].value_counts())


Value Counts:
34
healthy        3095
symptomatic     775
COVID-19        263
Name: count, dtype: int64

Value Counts:
34
healthy        1052
symptomatic    1052
COVID-19       1052
Name: count, dtype: int64


In [14]:
df_train.to_csv("train_df.csv", index = False)
df_valid.to_csv("valid_df.csv", index = False)