In [20]:
import os
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms


In [21]:
AUDIO_DIR = 'audio'
SPEC_DIR = 'spec_images'
META_PATH = 'metadata.csv'

WINDOW_SIZE = 3.0  # seconds
HOP_SIZE = 1.5     # seconds
SR = 22050         # sampling rate
IMG_SIZE = 128     # spectrogram image size


In [None]:
os.makedirs(SPEC_DIR, exist_ok=True)
meta_df = pd.read_csv(META_PATH)  # columns: filename,label

label_map = {label: idx for idx, label in enumerate(meta_df['diagnosis'].unique())}
label_df = meta_df.set_index('id')['diagnosis'].to_dict()

def save_spec_patch(y, sr, out_path):
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=512, n_mels=128)
    S_dB = librosa.power_to_db(S, ref=np.max)
    S_img = Image.fromarray(S_dB).resize((IMG_SIZE, IMG_SIZE))
    S_img = S_img.convert('L')  # grayscale
    S_img.save(out_path)

for fname in tqdm(os.listdir(AUDIO_DIR)):
    if not fname.endswith('.wav'): continue
    file_id = os.path.splitext(fname)[0]
    y, _ = librosa.load(os.path.join(AUDIO_DIR, fname), sr=SR)
    duration = librosa.get_duration(y=y, sr=SR)
    
    win_len = int(WINDOW_SIZE * SR)
    hop_len = int(HOP_SIZE * SR)
    output_dir = os.path.join(SPEC_DIR, file_id)
    os.makedirs(output_dir, exist_ok=True)

    for i, start in enumerate(np.arange(0, duration - WINDOW_SIZE, HOP_SIZE)):
        s = int(start * SR)
        y_win = y[s : s + win_len]
        out_path = os.path.join(output_dir, f"{file_id}_{i}.png")
        save_spec_patch(y_win, SR, out_path)


100%|██████████| 1843/1843 [03:56<00:00,  7.80it/s]


In [28]:
class SpecPatchDataset(Dataset):
    def __init__(self, root_dir, file_ids_to_load, patient_to_diagnosis_map, diagnosis_to_int_map, transform=None):
        """
        Args:
            root_dir (string): Directory with all the spectrogram sub-folders.
            file_ids_to_load (list): List of specific .wav filenames to load (e.g., train_ids or val_ids).
            patient_to_diagnosis_map (dict): Maps patient ID (int) to diagnosis (str).
            diagnosis_to_int_map (dict): Maps diagnosis (str) to a class index (int).
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.samples = []
        self.labels = []
        self.transform = transform or transforms.ToTensor()
        
        # Loop over only the file IDs we're supposed to load
        for file_name in file_ids_to_load:
            file_id_without_ext = os.path.splitext(file_name)[0]
            
            # Extract patient ID to get the correct label
            patient_id = int(file_name.split('_')[0])
            class_label_str = patient_to_diagnosis_map[patient_id]
            label_idx = diagnosis_to_int_map[class_label_str]
            
            # Find all spectrogram patches for this audio file
            spectrogram_folder = os.path.join(root_dir, file_id_without_ext)
            if os.path.isdir(spectrogram_folder):
                for img_name in os.listdir(spectrogram_folder):
                    self.samples.append(os.path.join(spectrogram_folder, img_name))
                    self.labels.append(label_idx)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img = Image.open(self.samples[idx]).convert('L')
        img = self.transform(img)
        label = self.labels[idx]
        return img, label

In [32]:
class SimpleCNN(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        
        # We define the feature extractor part of the model separately
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(16, 32, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        
        # To find the input size for the linear layer, we pass a dummy tensor
        # through the feature extractor once.
        with torch.no_grad():
            # Create a dummy tensor with the same dimensions as one input image
            dummy_input = torch.randn(1, 1, IMG_SIZE, IMG_SIZE)
            # Pass it through the feature extractor
            dummy_output = self.features(dummy_input)
            # Get the size of the flattened output
            flattened_size = dummy_output.view(1, -1).size(1)
            
        # Now we can define the classifier with the correct input size
        self.classifier = nn.Sequential(
            nn.Linear(flattened_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        # Pass input through the feature extractor
        x = self.features(x)
        # Flatten the output for the classifier
        x = x.view(x.size(0), -1)
        # Pass the flattened tensor through the classifier
        x = self.classifier(x)
        return x

In [25]:
print(meta_df['diagnosis'].value_counts())

diagnosis
COPD              64
Healthy           26
URTI              14
Bronchiectasis     7
Bronchiolitis      6
Pneumonia          6
LRTI               2
Asthma             1
Name: count, dtype: int64


In [29]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

# 1. Identify and filter out classes with fewer than 2 samples
class_counts = meta_df['diagnosis'].value_counts()
classes_to_remove = class_counts[class_counts < 2].index.tolist()

if classes_to_remove:
    print(f"Removing rare classes: {classes_to_remove}")
    ids_to_remove = meta_df[meta_df['diagnosis'].isin(classes_to_remove)]['id'].astype(str).tolist()
    all_file_ids = [f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')]
    file_ids = [f for f in all_file_ids if f.split('_')[0] not in ids_to_remove]
else:
    file_ids = [f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')]

# 2. Create labels for the filtered files for stratification
patient_ids = [int(f.split('_')[0]) for f in file_ids]
stratify_labels = [meta_df.loc[meta_df['id'] == pid, 'diagnosis'].values[0] for pid in patient_ids]

# 3. Split the filtered data
train_ids, val_ids = train_test_split(
    file_ids,
    test_size=0.2,
    stratify=stratify_labels,
    random_state=42
)

# 4. Define the necessary dictionaries for the dataset
# This maps patient ID (e.g., 101) to diagnosis (e.g., 'COPD')
patient_to_diagnosis_map = meta_df.set_index('id')['diagnosis'].to_dict()
# This maps diagnosis (e.g., 'COPD') to an integer (e.g., 0)
diagnosis_to_int_map = {label: idx for idx, label in enumerate(meta_df['diagnosis'].unique())}

# 5. Instantiate the datasets using the corrected class
train_ds = SpecPatchDataset(SPEC_DIR, train_ids, patient_to_diagnosis_map, diagnosis_to_int_map, transform)
val_ds   = SpecPatchDataset(SPEC_DIR, val_ids, patient_to_diagnosis_map, diagnosis_to_int_map, transform)

# Create dataloaders
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=32, shuffle=False)

print(f"\\nTraining spectrogram patches: {len(train_ds)}")
print(f"Validation spectrogram patches: {len(val_ds)}")

Removing rare classes: ['Asthma']
\nTraining spectrogram patches: 9431
Validation spectrogram patches: 2494


In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleCNN(num_classes=len(label_map)).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(1, 16):
    model.train()
    total_loss = 0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch:02d} | Train Loss: {total_loss / len(train_loader):.4f}")


Epoch 01 | Train Loss: 0.6465
Epoch 02 | Train Loss: 0.6179
Epoch 03 | Train Loss: 0.6109
Epoch 04 | Train Loss: 0.6066
Epoch 05 | Train Loss: 0.6056
Epoch 06 | Train Loss: 0.6057
Epoch 07 | Train Loss: 0.6037
Epoch 08 | Train Loss: 0.6051
Epoch 09 | Train Loss: 0.6012
Epoch 10 | Train Loss: 0.6029
Epoch 11 | Train Loss: 0.6024
Epoch 12 | Train Loss: 0.5989
Epoch 13 | Train Loss: 0.5980
Epoch 14 | Train Loss: 0.6008
Epoch 15 | Train Loss: 0.5994


In [34]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for imgs, labels in val_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        preds = model(imgs).argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

print(f"Validation Accuracy: {correct / total:.4f}")


Validation Accuracy: 0.8797


In [36]:
#MLP version
class MLP(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        # Calculate the input size from the image dimensions
        input_size = IMG_SIZE * IMG_SIZE

        # Define the sequence of layers
        self.layers = nn.Sequential(
            # This layer flattens the 2D image into a 1D vector
            nn.Flatten(),
            
            # First hidden layer
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            # Second hidden layer
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            # Output layer
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """The forward pass."""
        return self.layers(x)

In [37]:
# Make sure you have defined the MLP class in a cell above this one

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Instantiate the new MLP model
model = MLP(num_classes=len(label_map)).to(device)

print("Using MLP model:")
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(1, 16):
    model.train()
    total_loss = 0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch:02d} | Train Loss: {total_loss / len(train_loader):.4f}")

Using MLP model:
MLP(
  (layers): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=16384, out_features=512, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.3, inplace=False)
    (7): Linear(in_features=256, out_features=8, bias=True)
  )
)
Epoch 01 | Train Loss: 0.7482
Epoch 02 | Train Loss: 0.6436
Epoch 03 | Train Loss: 0.6398
Epoch 04 | Train Loss: 0.6262
Epoch 05 | Train Loss: 0.6201
Epoch 06 | Train Loss: 0.6161
Epoch 07 | Train Loss: 0.6137
Epoch 08 | Train Loss: 0.6060
Epoch 09 | Train Loss: 0.6076
Epoch 10 | Train Loss: 0.6081
Epoch 11 | Train Loss: 0.6075
Epoch 12 | Train Loss: 0.6035
Epoch 13 | Train Loss: 0.6060
Epoch 14 | Train Loss: 0.6004
Epoch 15 | Train Loss: 0.6009


Validation Accuracy: 0.8797
