In [14]:
import os
import pandas as pd

# Paths
metadata_path = '../data/interim/metadata.csv'
wav_dir = '../data/raw/COUGHVID'
WAV_EXT = '.wav'

# Load raw metadata and derive each recording's ID from its filename.
# Only a trailing ".wav" is removed; an anchored pattern is used so that a
# ".wav" appearing elsewhere in the name is left untouched.
metadata = pd.read_csv(metadata_path)
metadata['id'] = metadata['filename'].str.replace(r'\.wav$', '', regex=True)

# IDs of the .wav files actually present on disk (extension sliced off the end,
# for the same reason as above).
wav_ids = {
    f[:-len(WAV_EXT)]
    for f in os.listdir(wav_dir)
    if f.endswith(WAV_EXT)
}
# Flag metadata rows whose audio file exists in wav_dir.
metadata['has_wav'] = metadata['id'].isin(wav_ids)

print(f"Total entries: {len(metadata)}")
print(f"With .wav files: {metadata['has_wav'].sum()}")


Total entries: 27550
With .wav files: 27550


In [17]:
# Filtering criteria
KEEP_STATUSES = ['COVID-19', 'healthy']  # Binary labels only
MIN_COUGH_DETECTED = 0.3                 # Medium to high-quality coughs

# Row mask: audio file present, label is one of the two kept classes, and
# the cough_detected score meets the threshold (rows with a missing score
# compare as False and are dropped).
keep_mask = (
    metadata['has_wav']
    & metadata['status'].isin(KEEP_STATUSES)
    & (metadata['cough_detected'] >= MIN_COUGH_DETECTED)
)

# Sort by ID for reproducibility, THEN reset the index so that the in-memory
# frame ends up with a clean 0..n-1 index in sorted order. (Resetting before
# sorting would leave the index shuffled.)
filtered_df = (
    metadata[keep_mask]
    .copy()
    .sort_values('id')
    .reset_index(drop=True)
)

# Save to your own clean metadata CSV
out_path = '../data/interim/metadata_clean.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
filtered_df.to_csv(out_path, index=False)

print(f"✅ Saved cleaned metadata to: {out_path}")
print(f"Samples retained: {len(filtered_df)}")


✅ Saved cleaned metadata to: ../data/interim/metadata_clean.csv
Samples retained: 11285


In [20]:
import librosa
import numpy as np
from tqdm import tqdm
import soundfile as sf

# Load cleaned metadata
clean_df = pd.read_csv('../data/interim/metadata_clean.csv')

# Paths
wav_dir = '../data/raw/COUGHVID/'
# NOTE(review): every other path in this notebook is rooted at '../data/',
# but this one is relative to the current working directory ('data/...').
# Confirm this is the intended output location before relying on it.
mfcc_out_dir = 'data/processed/mfccs/'
os.makedirs(mfcc_out_dir, exist_ok=True)

# Settings
SR = 16000      # sample rate passed to librosa.load and librosa.feature.mfcc
DURATION = 4    # seconds; each clip is padded/truncated to SR * DURATION samples
N_MFCC = 40     # n_mfcc passed to librosa.feature.mfcc

# IDs whose audio could not be processed; kept so failures can be inspected
# after the loop instead of being lost in the progress output.
failed_ids = []

# Iterate over the 'id' column directly: only the ID is needed, and this avoids
# building a full row Series (with possible dtype coercion) for every file.
for wav_id in tqdm(clean_df['id'], total=len(clean_df)):
    wav_path = os.path.join(wav_dir, f"{wav_id}.wav")
    out_path = os.path.join(mfcc_out_dir, f"{wav_id}.npy")

    try:
        y, _ = librosa.load(wav_path, sr=SR)
        y = librosa.util.fix_length(y, size=SR * DURATION)  # pad/truncate

        mfcc = librosa.feature.mfcc(y=y, sr=SR, n_mfcc=N_MFCC)
        np.save(out_path, mfcc)
    except Exception as e:
        # Best-effort: a single unreadable file must not abort the whole run,
        # but it is recorded so it can be reported below.
        failed_ids.append(wav_id)
        print(f"⚠️ Skipped {wav_id}: {e}")

# Summarise failures so they are visible even after a long run.
if failed_ids:
    print(f"⚠️ {len(failed_ids)} of {len(clean_df)} files could not be processed")


100%|██████████| 11285/11285 [06:56<00:00, 27.07it/s]
