In [15]:
import os, glob
import numpy as np
import pandas as pd
import librosa, librosa.display
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from tqdm import tqdm
from sklearn.model_selection import GroupShuffleSplit

# reproducibility
torch.manual_seed(0)
np.random.seed(0)

## 2. Paths & Parameters

In [16]:
# ─── Paths (refactored for new folder layout) ──────────────────────────────────
import os

# 1. Base data directory (the folder that contains both your raw download and metadata.csv)
DATA_ROOT       = "data"

# 2. Raw audio + annotation files are in the downloaded ICBHI folder
RAW_DIR         = os.path.join(DATA_ROOT, "ICBHI_final_database")

# 3. Since .wav and .txt both live here, point both to RAW_DIR
AUDIO_DIR       = RAW_DIR
ANNOTATION_DIR  = RAW_DIR

# 4. Metadata CSV lives alongside the download folder
METADATA_FILE   = os.path.join(DATA_ROOT, "metadata.csv")

# 5. Cache directory for spectrogram images or pre-computed features
CACHE_DIR       = os.path.join(DATA_ROOT, "cache_imgs")
os.makedirs(CACHE_DIR, exist_ok=True)

# Sanity checks
assert os.path.isdir(RAW_DIR),        f"❌ RAW_DIR not found: {RAW_DIR}"
assert os.path.isfile(METADATA_FILE), f"❌ metadata.csv not found: {METADATA_FILE}"

In [17]:
# Audio parameters
SR         = 4000    # common resample rate
WIN_SEC    = 3
HOP_SEC    = 3

# STFT params
N_FFT      = 512
HOP_LENGTH = 256

# Image size
IMG_SIZE   = 128

# Training params
BATCH_SIZE = 32
LR         = 1e-4
WEIGHT_DECAY = 1e-5
EPOCHS     = 30
DEVICE     = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [18]:
# Load the provided metadata which contains diagnosis info
df_diag = pd.read_csv('metadata.csv')
# The 'id' column corresponds to the patient ID
df_diag = df_diag.rename(columns={'id': 'patient_id'})

# --- Step 1: Discover audio files and create a full metadata table ---
# Scan the audio directory to find all .wav files
audio_files = glob.glob(os.path.join(AUDIO_DIR, "*.wav"))

if not audio_files:
    raise FileNotFoundError(f"No .wav files found in {AUDIO_DIR}. Please ensure your audio files are in the correct directory.")

# Create a list of metadata from the filenames found
file_metadata = []
for fpath in audio_files:
    fname = os.path.basename(fpath)
    # Extract patient ID from filename like '101_1b1_Al_sc_Meditron.wav'
    patient_id = int(fname.split('_')[0])
    file_metadata.append({
        'patient_id': patient_id,
        'filename': fname
    })
df_files = pd.DataFrame(file_metadata)

# --- Step 2: Merge file info with diagnosis info ---
# This links filenames to their diagnosis via the patient_id
df_meta = pd.merge(df_files, df_diag, on='patient_id', how='left')

# Drop any files that didn't have a matching diagnosis
df_meta = df_meta.dropna(subset=['diagnosis'])

print("--- Successfully created a complete metadata table ---")
print(f"Found {len(df_meta)} recordings with matching diagnoses.")


# --- Step 3: Perform a Patient-Aware Train-Test Split ---
# We split based on unique patient IDs to prevent data leakage
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
groups = df_meta['patient_id']
train_idx, test_idx = next(splitter.split(df_meta, groups=groups))

# Add the 'split' column to our new metadata table
df_meta['split'] = 'train'
df_meta.loc[test_idx, 'split'] = 'test'


# --- Step 4: Display Dataset Overview ---
print("\n--- Dataset Overview ---")
print("Total recordings:", len(df_meta))

print("\nRecordings per split:")
print(df_meta['split'].value_counts())

print("\nPatients per split:")
print(df_meta.groupby('split')['patient_id'].nunique())

print("\nClass distribution (overall):")
print(df_meta['diagnosis'].value_counts())

# Map string labels to integers (as in original code)
labels = sorted(df_meta['diagnosis'].unique())
label2idx = {lbl: i for i, lbl in enumerate(labels)}
print("\nClasses:", labels)

--- Successfully created a complete metadata table ---
Found 920 recordings with matching diagnoses.

--- Dataset Overview ---
Total recordings: 920

Recordings per split:
split
train    706
test     214
Name: count, dtype: int64

Patients per split:
split
test      26
train    100
Name: patient_id, dtype: int64

Class distribution (overall):
diagnosis
COPD              793
Pneumonia          37
Healthy            35
URTI               23
Bronchiectasis     16
Bronchiolitis      13
LRTI                2
Asthma              1
Name: count, dtype: int64

Classes: ['Asthma', 'Bronchiectasis', 'Bronchiolitis', 'COPD', 'Healthy', 'LRTI', 'Pneumonia', 'URTI']


In [19]:
def segment_audio(y, sr, win_sec=3, hop_sec=3):
    win_len = int(win_sec * sr)
    hop_len = int(hop_sec * sr)
    segments = []
    # The last segment is allowed to be shorter than win_len
    for start in range(0, len(y), hop_len):
        end = start + win_len
        segments.append(y[start:end])
    # If the last segment is shorter than win_len, pad it
    if len(segments[-1]) < win_len:
        segments[-1] = np.pad(segments[-1], (0, win_len - len(segments[-1])), 'constant')
    return segments

In [20]:
def create_segment_manifest(df, audio_dir, sr, win_sec, hop_sec):
    manifest = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Creating {row.split} manifest"):
        fname = row.filename
        label = row.diagnosis
        audio_path = os.path.join(audio_dir, fname)
        
        try:
            y, _ = librosa.load(audio_path, sr=sr)
        except Exception as e:
            print(f"Could not load {fname}: {e}")
            continue

        win_len = int(win_sec * sr)
        hop_len = int(hop_sec * sr)
        
        num_segments = (len(y) - win_len) // hop_len + 1
        if num_segments <= 0:
            num_segments = 1
        
        for i in range(num_segments):
            manifest.append({
                "filename": fname,
                "segment_idx": i,
                "diagnosis": label
            })
            
    return pd.DataFrame(manifest)

# Split the master metadata table
df_train_meta = df_meta[df_meta.split == 'train']
df_test_meta  = df_meta[df_meta.split == 'test']

# Create manifests for training and testing sets
df_train = create_segment_manifest(df_train_meta, AUDIO_DIR, SR, WIN_SEC, HOP_SEC)
df_test  = create_segment_manifest(df_test_meta,  AUDIO_DIR, SR, WIN_SEC, HOP_SEC)

print(f"\nTotal training files: {len(df_train_meta)} -> Total training segments: {len(df_train)}")
print(f"Total testing files: {len(df_test_meta)} -> Total testing segments: {len(df_test)}")

UnboundLocalError: cannot access local variable 'row' where it is not associated with a value

In [21]:
def make_spectrogram(y, sr):
    S = np.abs(librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH))
    S = librosa.util.normalize(S)
    S = cv2.resize(S, (IMG_SIZE, IMG_SIZE))
    return (S * 255).astype(np.uint8)

def make_mfcc(y, sr, n_mfcc=13):
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=N_FFT, hop_length=HOP_LENGTH)
    mfcc = librosa.util.normalize(mfcc)
    mfcc = cv2.resize(mfcc, (IMG_SIZE, IMG_SIZE))
    return (mfcc * 255).astype(np.uint8)

def make_chroma(y, sr):
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=HOP_LENGTH)
    chroma = librosa.util.normalize(chroma)
    chroma = cv2.resize(chroma, (IMG_SIZE, IMG_SIZE))
    return (chroma * 255).astype(np.uint8)

In [22]:
class ICBHIDataset(Dataset):
    def __init__(self, df, audio_dir, cache_dir, transform=None):
        self.df = df.reset_index(drop=True)
        self.audio_dir = audio_dir
        self.cache_dir = cache_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.loc[idx]
        fname = row.filename
        seg_idx = row.segment_idx
        label = label2idx[row.diagnosis]
        
        # Cache path is now unique for each segment
        base = os.path.splitext(fname)[0]
        cache_path = os.path.join(self.cache_dir, f"{base}_seg{seg_idx}.npz")

        if os.path.exists(cache_path):
            data = np.load(cache_path)
            spec, mfcc, chroma = data['spec'], data['mfcc'], data['chroma']
        else:
            y, _ = librosa.load(os.path.join(self.audio_dir, fname), sr=SR)
            
            # Segment the full audio and select the correct one
            segs = segment_audio(y, SR, WIN_SEC, HOP_SEC)
            seg = segs[seg_idx]

            spec   = make_spectrogram(seg, SR)
            mfcc   = make_mfcc(seg, SR)
            chroma = make_chroma(seg, SR)
            np.savez(cache_path, spec=spec, mfcc=mfcc, chroma=chroma)

        # Stack into 3-channel float tensor
        img = np.stack([spec, mfcc, chroma], axis=0).astype(np.float32) / 255.0
        if self.transform:
            img = self.transform(torch.from_numpy(img))
        
        return img, label

# Transforms
transform = transforms.Normalize(mean=[0.5]*3, std=[0.2]*3)

# Create Datasets using the new segment manifests
train_ds = ICBHIDataset(df_train, AUDIO_DIR, CACHE_DIR, transform)
test_ds  = ICBHIDataset(df_test,  AUDIO_DIR, CACHE_DIR, transform)

# Create DataLoaders
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2) # num_workers adjusted for broader compatibility
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

NameError: name 'df_train' is not defined

In [23]:
class TripleStreamNet(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.streams = nn.ModuleList()
        for _ in range(3):
            m = models.resnet18(weights=models.ResNet18_Weights.DEFAULT) # Updated to new torchvision API
            m.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
            m.fc    = nn.Identity()
            self.streams.append(m)

        self.classifier = nn.Sequential(
            nn.Linear(3 * 512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        feats = []
        for i in range(3):
            xi = x[:, i:i+1, :, :]
            fi = self.streams[i](xi)
            feats.append(fi)
        f = torch.cat(feats, dim=1)
        return self.classifier(f)

model = TripleStreamNet(num_classes=len(labels)).to(DEVICE)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /home/nhat-minh/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:04<00:00, 10.3MB/s]


In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
criterion = nn.CrossEntropyLoss()

for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0
    for imgs, lbls in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}"):
        imgs, lbls = imgs.to(DEVICE), lbls.to(DEVICE)
        preds = model(imgs)
        loss = criterion(preds, lbls)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"  → Train loss: {total_loss/len(train_loader):.4f}")

In [None]:
model.eval()
all_probs, all_labels = [], []
with torch.no_grad():
    for imgs, lbls in tqdm(test_loader, desc="Evaluating"):
        imgs = imgs.to(DEVICE)
        logits = model(imgs)
        probs = F.softmax(logits, dim=1)
        all_probs.append(probs.cpu().numpy())
        all_labels.append(lbls.numpy())
# 
probs = np.vstack(all_probs)
labels = np.hstack(all_labels)
preds  = np.argmax(probs, axis=1)

acc = (preds == labels).mean()
print(f"\nTest Accuracy (per segment): {acc:.4f}")