In [4]:
import os
import math
import numpy as np
import librosa
import torch
from torch.utils.data import Dataset, DataLoader

# Dataset class for audio question detection
class AudioDataset(Dataset):
    """Dataset of ``.wav`` clips labelled as question (1) or other (0).

    Files are discovered under ``<root_dir>/<split>/questions`` and
    ``<root_dir>/<split>/others``; a class directory that does not exist is
    skipped. Each item is the clip's log-Mel spectrogram (128 Mel bins),
    optionally standardised with a global ``mean`` / ``std``.

    Args:
        root_dir: Dataset root directory.
        split: Sub-directory of ``root_dir`` to read (e.g. "train" or "test").
        mean: Global mean subtracted from the dB spectrogram, or None.
        std: Global standard deviation used for scaling, or None.
            Normalisation is applied only when both ``mean`` and ``std`` are given.
    """

    def __init__(self, root_dir, split, mean=None, std=None):
        self.files = []
        self.labels = []
        # Traverse the 'questions' and 'others' subdirectories
        for label_name in ["questions", "others"]:
            class_dir = os.path.join(root_dir, split, label_name)
            if not os.path.isdir(class_dir):
                continue
            # Label: 1 for question, 0 for other
            label = 1 if label_name == "questions" else 0
            # os.listdir returns entries in arbitrary order; sort them so that
            # sample indices are stable across runs and machines.
            for fname in sorted(os.listdir(class_dir)):
                if fname.endswith(".wav"):
                    self.files.append(os.path.join(class_dir, fname))
                    self.labels.append(label)
        self.mean = mean
        self.std = std

    def __len__(self):
        """Return the number of discovered ``.wav`` files."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(spectrogram, label)``.

        Returns:
            spectrogram: float32 tensor of shape (time_frames, 128).
            label: int64 scalar tensor, 1 for question and 0 for other.
        """
        file_path = self.files[idx]
        # Load audio at the file's original sampling rate (sr=None disables resampling)
        waveform, sr = librosa.load(file_path, sr=None)
        # Compute power Mel spectrogram (128 Mel bins)
        S = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=128, power=2.0)
        # Convert to log-scale (dB). ref=np.max sets 0 dB to the peak power in this spectrogram.
        S_db = librosa.power_to_db(S, top_db=80.0, ref=np.max)
        # Normalize using global mean and std if provided
        if self.mean is not None and self.std is not None:
            # The small epsilon guards against division by a zero std
            S_db = (S_db - self.mean) / (self.std + 1e-8)
        # librosa returns shape (128, time_frames); transpose to (time_frames, 128)
        spectrogram = torch.tensor(S_db.T, dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return spectrogram, label

# Collate function to pad variable-length spectrograms in a batch
def collate_fn(batch):
    """
    Pads a list of (spectrogram, label) tuples to the same length.
    Args:
      - batch: non-empty list of (spectrogram, label) pairs; each spectrogram is a
        tensor of shape (T_i, F) and all spectrograms must share the same F.
    Returns:
      - batch_spectrogram: float32 Tensor of shape (B, T_max, F), zero-padded along time
      - batch_labels: Tensor of shape (B,)
      - batch_mask: Boolean mask of shape (B, T_max) where False indicates real data and True indicates padding.
    """
    # Get sequence lengths for each sample
    lengths = [spec.shape[0] for spec, _ in batch]
    max_len = max(lengths)
    batch_size = len(batch)
    # Take the feature width from the data rather than hard-coding 128 Mel bins,
    # so the same collate function works for any spectrogram configuration.
    n_features = batch[0][0].shape[1]
    # Initialize padded tensor and mask
    batch_spectrogram = torch.zeros(
        (batch_size, max_len, n_features), dtype=torch.float32
    )
    # Mask with True for padding positions (will be used to ignore pads in attention)
    batch_mask = torch.ones((batch_size, max_len), dtype=torch.bool)
    batch_labels = torch.zeros((batch_size,), dtype=torch.long)
    for i, ((spec, label), L) in enumerate(zip(batch, lengths)):
        batch_spectrogram[i, :L] = spec        # positions beyond L stay zero
        batch_mask[i, :L] = False             # False means non-padded (actual data)
        batch_labels[i] = label
    return batch_spectrogram, batch_labels, batch_mask

# Load training set without normalization to compute global mean and std
train_dataset_temp = AudioDataset(root_dir="cleaned_dataset", split="train", mean=None, std=None)
# Compute global mean and std across all training spectrogram values in a single
# pass by accumulating the sum and the sum of squares.
sum_val, sum_sq_val, count = 0.0, 0.0, 0
for spectrogram, _ in train_dataset_temp:
    # spectrogram is a tensor shape (T, 128)
    sum_val += spectrogram.sum().item()
    sum_sq_val += (spectrogram ** 2).sum().item()
    count += spectrogram.numel()
if count == 0:
    # Fail with an actionable message instead of a bare ZeroDivisionError below.
    raise RuntimeError(
        "No training spectrogram values found under 'cleaned_dataset/train'; "
        "cannot compute normalization statistics."
    )
global_mean = sum_val / count
# E[x^2] - E[x]^2 can round to a tiny negative number when the data is (nearly)
# constant; clamp at zero so math.sqrt never raises.
global_var = max(sum_sq_val / count - (global_mean ** 2), 0.0)
global_std = math.sqrt(global_var)  # standard deviation

print(f"Global mean: {global_mean:.4f}, Global std: {global_std:.4f}")

# Now create Dataset objects with normalization. The test split reuses the
# training statistics.
train_dataset = AudioDataset(root_dir="cleaned_dataset", split="train", mean=global_mean, std=global_std)
test_dataset  = AudioDataset(root_dir="cleaned_dataset", split="test",  mean=global_mean, std=global_std)

# DataLoaders for batching (only the training loader is shuffled)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)



Global mean: -62.0506, Global std: 19.5367


In [5]:
import torch.nn as nn
import torch.nn.functional as F


# Positional Encoding module (sinusoidal)
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch-first sequence, then applies dropout.

    Args:
        d_model: Embedding width of the input sequence (even or odd).
        max_len: Number of positions precomputed; inputs must not be longer than this.
        dropout: Dropout probability applied after adding the encodings.
    """

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        # Create constant positional encoding matrix (1, max_len, d_model)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # Compute sinusoidal functions of different frequencies for each dimension:
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        # apply sin to even indices in the encoding
        pe[:, 0::2] = torch.sin(position * div_term)
        # apply cos to odd indices; when d_model is odd there is one fewer odd
        # column than even columns, so trim div_term to the matching width
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # shape (1, max_len, d_model)
        self.register_buffer("pe", pe)  # register as buffer so it's not a parameter
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :seq_len])`` for ``x`` of shape (batch_size, seq_len, d_model)."""
        # Add positional encoding up to the sequence length of x
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len].to(x.device)
        return self.dropout(x)


# Transformer-based audio classifier
class AudioTransformerClassifier(nn.Module):
    """Transformer encoder over spectrogram frames with a two-class MLP head.

    Pipeline: per-frame linear projection -> sinusoidal positional encoding ->
    Transformer encoder -> pooling over time -> MLP producing 2 logits.

    Args:
        d_model: Width of the frame embeddings and encoder layers.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each encoder layer's feed-forward block.
        dropout: Dropout probability used in the encoder, positional encoding and head.
        pooling: "mean" (masked average over real frames) or "last" (last real frame).
        input_dim: Number of features per input frame (128 Mel bins by default).
    """

    def __init__(
        self,
        d_model=64,
        nhead=4,
        num_layers=2,
        dim_feedforward=128,
        dropout=0.1,
        pooling="mean",
        input_dim=128,
    ):
        super(AudioTransformerClassifier, self).__init__()
        self.d_model = d_model
        self.pooling = pooling
        # Project each input frame to d_model dimensions
        self.frame_projection = nn.Linear(input_dim, d_model)
        # Positional encoding for sequence ordering
        self.pos_encoder = PositionalEncoding(d_model, dropout=dropout)
        # Transformer Encoder stack
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )  # batch_first allows input as (batch, seq, embed)
        # enable_nested_tensor=False skips the prototype nested-tensor conversion
        # that is otherwise attempted in eval mode when a padding mask is given
        # (the source of the `torch._nested_tensor_from_mask` UserWarning), so
        # training and evaluation both work on ordinary padded tensors.
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers, enable_nested_tensor=False
        )
        # Classification MLP head
        self.fc1 = nn.Linear(d_model, d_model // 2)  # hidden layer (reduce dimension)
        self.fc2 = nn.Linear(d_model // 2, 2)  # 2 output classes
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, src_key_padding_mask=None):
        """
        x: Tensor of shape (batch, seq_len, input_dim) - input log-Mel spectrogram sequence.
        src_key_padding_mask: Boolean mask of shape (batch, seq_len) with True for padded positions.
        Returns: Tensor of shape (batch, 2) with the raw class logits.
        Raises: ValueError if self.pooling is neither "mean" nor "last".
        """
        # 1. Linear projection (embed each frame) and scale by sqrt(d_model)
        x = self.frame_projection(x) * math.sqrt(self.d_model)
        # 2. Add positional encodings
        x = self.pos_encoder(x)
        # 3. Transformer encoder (will attend to non-padded frames based on the mask)
        # src_key_padding_mask: True values are ignored (pad positions)
        encoded_seq = self.transformer_encoder(
            x, src_key_padding_mask=src_key_padding_mask
        )
        # 4. Pooling over time frames to get a single vector representation
        if self.pooling == "mean":
            if src_key_padding_mask is None:
                # If no mask provided, just average across all frames
                pooled = encoded_seq.mean(dim=1)
            else:
                # mask: False for real frames, True for pads. Invert to 1 for real frames.
                mask = (~src_key_padding_mask).unsqueeze(-1).float()  # (batch, seq_len, 1)
                # Zero-out padded frames, then sum over time -> (batch, d_model)
                summed = (encoded_seq * mask).sum(dim=1)
                # Number of real frames per sample, floored at 1 to avoid division by zero
                lengths = mask.sum(dim=1).clamp(min=1.0)  # (batch, 1)
                pooled = summed / lengths
        elif self.pooling == "last":
            # Use the output of the last real frame (for each sequence)
            if src_key_padding_mask is not None:
                # number of real (non-pad) frames for each sample
                lengths = (~src_key_padding_mask).sum(dim=1)
            else:
                lengths = torch.full(
                    (encoded_seq.size(0),),
                    encoded_seq.size(1),
                    dtype=torch.long,
                    device=x.device,
                )
            # Clamp minimum length to 1 to avoid invalid indexing if any sequence is empty
            lengths = torch.clamp(lengths, min=1)
            # Index tensor of shape (batch, 1, d_model) selecting each sample's last real frame
            idx = (lengths - 1).view(-1, 1, 1).expand(-1, 1, encoded_seq.size(2))
            pooled = encoded_seq.gather(dim=1, index=idx).squeeze(1)  # (batch, d_model)
        else:
            raise ValueError(f"Unknown pooling type: {self.pooling}")
        # 5. Classification MLP: [d_model] -> [d_model/2] -> [2]
        x = self.dropout(pooled)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        logits = self.fc2(x)  # raw scores for the two classes
        return logits

In [6]:
import torch.optim as optim
from sklearn.metrics import f1_score, accuracy_score

# Initialize model, optimizer, loss, and scheduler
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioTransformerClassifier(
    d_model=64, nhead=4, num_layers=2, dim_feedforward=128, dropout=0.1, pooling="mean"
).to(device)
criterion = nn.CrossEntropyLoss()  # for binary classification (with logits of size 2)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
# Reduce LR by factor of 0.5 if F1 doesn't improve for 2 epochs
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="max", factor=0.5, patience=2
)

num_epochs = 20
patience = 5  # early stopping patience

# Start below any attainable F1 so the first epoch always writes a checkpoint,
# even if the model never predicts the positive class (F1 == 0).
best_f1 = float("-inf")
epochs_no_improve = 0

# NOTE(review): LR scheduling, early stopping and checkpoint selection below are
# all driven by metrics on the *test* split, so the reported test scores are
# optimistically biased. Use a held-out validation split for model selection.
for epoch in range(1, num_epochs + 1):
    model.train()
    running_loss = 0.0
    # Training loop
    for batch_x, batch_y, batch_mask in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        batch_mask = batch_mask.to(device)
        optimizer.zero_grad()
        # Forward pass
        logits = model(batch_x, src_key_padding_mask=batch_mask)
        loss = criterion(logits, batch_y)
        # Backpropagation
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size for a per-sample average
        running_loss += loss.item() * batch_x.size(0)
    avg_train_loss = running_loss / len(train_loader.dataset)
    # Evaluation on test set
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_x, batch_y, batch_mask in test_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            batch_mask = batch_mask.to(device)
            logits = model(batch_x, src_key_padding_mask=batch_mask)
            # Predicted class is the index of max logit
            preds = torch.argmax(logits, dim=1)
            all_preds.append(preds.cpu().numpy())
            all_labels.append(batch_y.cpu().numpy())
    # Concatenate all batches
    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)
    # Compute accuracy and F1
    epoch_acc = accuracy_score(all_labels, all_preds)
    # Compute F1-score for class "question" (label 1). zero_division=0 reports
    # 0.0 without a warning when no positive is predicted or present.
    epoch_f1 = f1_score(
        all_labels, all_preds, average="binary", pos_label=1, zero_division=0
    )
    # Print epoch summary
    print(
        f"Epoch {epoch:02d}: TrainLoss={avg_train_loss:.4f}, TestAcc={epoch_acc:.3f}, TestF1={epoch_f1:.3f}"
    )
    # Scheduler step on F1 (to maximize, use mode='max')
    scheduler.step(epoch_f1)
    # Early stopping check
    if epoch_f1 > best_f1:
        best_f1 = epoch_f1
        epochs_no_improve = 0
        # Save the best model weights
        torch.save(model.state_dict(), "best_model.pth")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(
                "Early stopping triggered - no improvement in F1 for {} epochs.".format(
                    patience
                )
            )
            break

# After the loop `model` holds the weights of the last epoch run, not the best one.
# Uncomment to restore the best checkpoint for further use:
# model.load_state_dict(torch.load("best_model.pth"))

  output = torch._nested_tensor_from_mask(


Epoch 01: TrainLoss=0.6303, TestAcc=0.850, TestF1=0.127




Epoch 02: TrainLoss=0.6065, TestAcc=0.832, TestF1=0.395




Epoch 03: TrainLoss=0.6000, TestAcc=0.847, TestF1=0.397




Epoch 04: TrainLoss=0.5762, TestAcc=0.637, TestF1=0.419




Epoch 05: TrainLoss=0.5405, TestAcc=0.824, TestF1=0.489




Epoch 06: TrainLoss=0.5275, TestAcc=0.865, TestF1=0.289




Epoch 07: TrainLoss=0.5159, TestAcc=0.869, TestF1=0.502




Epoch 08: TrainLoss=0.5054, TestAcc=0.831, TestF1=0.515




Epoch 09: TrainLoss=0.5002, TestAcc=0.850, TestF1=0.497




Epoch 10: TrainLoss=0.5106, TestAcc=0.529, TestF1=0.371




Epoch 11: TrainLoss=0.4845, TestAcc=0.761, TestF1=0.490




Epoch 12: TrainLoss=0.4680, TestAcc=0.762, TestF1=0.492




Epoch 13: TrainLoss=0.4504, TestAcc=0.773, TestF1=0.497
Early stopping triggered - no improvement in F1 for 5 epochs.


In [7]:
import os
import math
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score


# 1. Dataset and DataLoader setup
class AudioDataset(Dataset):
    """Log-Mel spectrogram dataset for question (1) vs. other (0) audio clips.

    ``.wav`` files are collected from ``<root_dir>/<split>/questions`` and
    ``<root_dir>/<split>/others``; a class directory that does not exist is
    skipped. When both ``mean`` and ``std`` are given, each dB spectrogram is
    standardised with them.
    """

    def __init__(self, root_dir, split, mean=None, std=None):
        self.file_paths = []
        self.labels = []
        for label_name in ["questions", "others"]:
            class_dir = os.path.join(root_dir, split, label_name)
            if not os.path.isdir(class_dir):
                continue
            label = 1 if label_name == "questions" else 0
            # os.listdir order is arbitrary; sorting keeps sample indices
            # reproducible across runs and machines.
            for fname in sorted(os.listdir(class_dir)):
                if fname.endswith(".wav"):
                    self.file_paths.append(os.path.join(class_dir, fname))
                    self.labels.append(label)
        self.mean = mean
        self.std = std

    def __len__(self):
        """Return the number of discovered ``.wav`` files."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label)``: a float32 (T, 128) tensor and an int64 scalar tensor."""
        path = self.file_paths[idx]
        # sr=None keeps the file's native sampling rate
        waveform, sr = librosa.load(path, sr=None)
        S = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=128, power=2.0)
        # dB scale relative to this clip's peak power
        S_db = librosa.power_to_db(S, top_db=80.0, ref=np.max)
        if self.mean is not None and self.std is not None:
            # epsilon guards against division by a zero std
            S_db = (S_db - self.mean) / (self.std + 1e-8)
        spectrogram = torch.tensor(S_db.T, dtype=torch.float32)  # shape (T, 128)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return spectrogram, label


def collate_fn(batch):
    """Zero-pad a non-empty list of ``(spectrogram, label)`` pairs to a common length.

    Each spectrogram has shape (T_i, F); all must share the same F.

    Returns:
        batch_specs: float32 tensor of shape (B, T_max, F).
        batch_labels: int64 tensor of shape (B,).
        batch_mask: bool tensor of shape (B, T_max); True marks padding.
    """
    lengths = [spec.shape[0] for spec, _ in batch]
    max_len = max(lengths)
    batch_size = len(batch)
    # Feature width comes from the data instead of a hard-coded 128
    n_features = batch[0][0].shape[1]
    batch_specs = torch.zeros((batch_size, max_len, n_features), dtype=torch.float32)
    batch_mask = torch.ones((batch_size, max_len), dtype=torch.bool)  # True = PAD
    batch_labels = torch.zeros((batch_size,), dtype=torch.long)
    for i, ((spec, label), L) in enumerate(zip(batch, lengths)):
        batch_specs[i, :L] = spec
        batch_mask[i, :L] = False  # False = actual data
        batch_labels[i] = label
    return batch_specs, batch_labels, batch_mask


# Global normalization statistics, estimated from the un-normalized training split
train_temp = AudioDataset("cleaned_dataset", "train", mean=None, std=None)
# Single pass over the data: keep a running element count, sum and sum of squares
sum_val = 0.0
sum_sq_val = 0.0
count = 0
for spec, _ in train_temp:
    count = count + spec.numel()
    sum_val = sum_val + spec.sum().item()
    sum_sq_val = sum_sq_val + spec.pow(2).sum().item()
global_mean = sum_val / count
# Var = E[x^2] - E[x]^2, floored at zero before taking the square root
global_std = math.sqrt(max(sum_sq_val / count - global_mean**2, 0.0))
print(f"Computed global_mean={global_mean:.4f}, global_std={global_std:.4f}")

# Both splits are standardised with the training statistics
train_dataset = AudioDataset(
    "cleaned_dataset",
    "train",
    mean=global_mean,
    std=global_std,
)
test_dataset = AudioDataset(
    "cleaned_dataset",
    "test",
    mean=global_mean,
    std=global_std,
)
# Batches are padded by collate_fn; only the training loader shuffles
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)


# 2. Model definition
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding for batch-first inputs, followed by dropout.

    Args:
        d_model: Embedding width (even or odd).
        max_len: Number of precomputed positions; inputs must not be longer than this.
        dropout: Dropout probability applied to ``x + pe``.
    """

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # An odd d_model has one fewer odd column than even columns; trim
        # div_term so the cosine assignment matches instead of raising.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # shape (1, max_len, d_model)
        # Buffer: follows the module across devices but is not a trainable parameter
        self.register_buffer("pe", pe)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Add encodings for the first ``x.size(1)`` positions to ``x`` of shape (batch, seq_len, d_model)."""
        x = x + self.pe[:, : x.size(1)].to(x.device)
        return self.dropout(x)


class AudioTransformerClassifier(nn.Module):
    """Frame projection + positional encoding + Transformer encoder + pooled 2-class MLP head.

    Args:
        d_model: Width of the frame embeddings and encoder layers.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each encoder layer's feed-forward block.
        dropout: Dropout probability used throughout the model.
        pooling: "mean" (masked average over real frames) or "last" (last real frame).
        input_dim: Number of features per input frame (128 Mel bins by default).
    """

    def __init__(
        self,
        d_model=64,
        nhead=4,
        num_layers=2,
        dim_feedforward=128,
        dropout=0.1,
        pooling="mean",
        input_dim=128,
    ):
        super(AudioTransformerClassifier, self).__init__()
        self.d_model = d_model
        self.pooling = pooling
        self.frame_projection = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout=dropout)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        # Skip the prototype nested-tensor conversion that is otherwise
        # attempted in eval mode when a padding mask is passed (it emits a
        # UserWarning); train and eval then both use plain padded tensors.
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers, enable_nested_tensor=False
        )
        # Classification MLP
        self.fc1 = nn.Linear(d_model, d_model // 2)
        self.fc2 = nn.Linear(d_model // 2, 2)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, src_key_padding_mask=None):
        """Return logits of shape (batch, 2).

        Args:
            x: Tensor of shape (batch, seq_len, input_dim).
            src_key_padding_mask: Optional bool tensor of shape (batch, seq_len),
                True at padded positions.

        Raises:
            ValueError: If ``self.pooling`` is neither "mean" nor "last".
        """
        # Embed each frame and scale by sqrt(d_model) before adding positions
        x = self.frame_projection(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        encoded = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)
        if self.pooling == "mean":
            if src_key_padding_mask is None:
                pooled = encoded.mean(dim=1)
            else:
                # 1.0 for real frames, 0.0 for padding; shape (batch, seq_len, 1)
                mask = (~src_key_padding_mask).unsqueeze(-1).float()
                summed = (encoded * mask).sum(dim=1)
                # Floor at 1 so a fully padded row does not divide by zero
                lengths = mask.sum(dim=1).clamp(min=1.0)
                pooled = summed / lengths
        elif self.pooling == "last":
            if src_key_padding_mask is not None:
                lengths = (~src_key_padding_mask).sum(dim=1)
            else:
                lengths = torch.full(
                    (encoded.size(0),),
                    encoded.size(1),
                    dtype=torch.long,
                    device=x.device,
                )
            lengths = torch.clamp(lengths, min=1)
            # (batch, 1, d_model) index picking each sample's last real frame
            idx = (lengths - 1).view(-1, 1, 1).expand(-1, 1, encoded.size(2))
            pooled = encoded.gather(dim=1, index=idx).squeeze(1)
        else:
            raise ValueError("Unknown pooling type")
        x = self.dropout(pooled)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        logits = self.fc2(x)
        return logits


# 3. Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioTransformerClassifier(
    d_model=64, nhead=4, num_layers=2, dim_feedforward=128, dropout=0.1, pooling="mean"
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
# Halve the learning rate when F1 has not improved for 2 epochs
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="max", factor=0.5, patience=2
)

num_epochs = 20
patience = 5
# Start below any attainable F1 so the first epoch always writes a checkpoint,
# even if the model never predicts the positive class (F1 == 0).
best_f1 = float("-inf")
epochs_no_improve = 0

# NOTE(review): LR scheduling, early stopping and checkpoint selection are all
# driven by metrics on the *test* split, so the reported test scores are
# optimistically biased. Use a held-out validation split for model selection.
for epoch in range(1, num_epochs + 1):
    model.train()
    total_loss = 0.0
    for batch_x, batch_y, batch_mask in train_loader:
        batch_x, batch_y, batch_mask = (
            batch_x.to(device),
            batch_y.to(device),
            batch_mask.to(device),
        )
        optimizer.zero_grad()
        outputs = model(batch_x, src_key_padding_mask=batch_mask)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size for a per-sample average
        total_loss += loss.item() * batch_x.size(0)
    avg_loss = total_loss / len(train_loader.dataset)
    # Evaluate on test set
    model.eval()
    all_preds, all_targets = [], []
    with torch.no_grad():
        for batch_x, batch_y, batch_mask in test_loader:
            batch_x, batch_y, batch_mask = (
                batch_x.to(device),
                batch_y.to(device),
                batch_mask.to(device),
            )
            outputs = model(batch_x, src_key_padding_mask=batch_mask)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(batch_y.cpu().numpy())
    acc = accuracy_score(all_targets, all_preds)
    # F1 for the "question" class (label 1); zero_division=0 reports 0.0 without
    # a warning when no positive is predicted or present.
    f1 = f1_score(
        all_targets, all_preds, average="binary", pos_label=1, zero_division=0
    )
    print(
        f"Epoch {epoch:02d}: TrainLoss={avg_loss:.4f}, TestAcc={acc:.3f}, TestF1={f1:.3f}"
    )
    scheduler.step(f1)
    # Check for improvement
    if f1 > best_f1:
        best_f1 = f1
        epochs_no_improve = 0
        torch.save(model.state_dict(), "best_model.pth")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping: no improvement after %d epochs." % patience)
            break

# Training complete. The best model is saved in 'best_model.pth'; `model` itself
# still holds the weights of the last epoch that ran.



Computed global_mean=-62.0506, global_std=19.5367
Epoch 01: TrainLoss=0.6336, TestAcc=0.779, TestF1=0.395




Epoch 02: TrainLoss=0.6026, TestAcc=0.784, TestF1=0.426




Epoch 03: TrainLoss=0.5785, TestAcc=0.742, TestF1=0.462




Epoch 04: TrainLoss=0.5459, TestAcc=0.678, TestF1=0.434




Epoch 05: TrainLoss=0.5318, TestAcc=0.780, TestF1=0.490




Epoch 06: TrainLoss=0.5103, TestAcc=0.707, TestF1=0.439




Epoch 07: TrainLoss=0.5176, TestAcc=0.662, TestF1=0.437




Epoch 08: TrainLoss=0.5044, TestAcc=0.787, TestF1=0.508




Epoch 09: TrainLoss=0.4854, TestAcc=0.848, TestF1=0.529




Epoch 10: TrainLoss=0.4700, TestAcc=0.783, TestF1=0.496




Epoch 11: TrainLoss=0.4652, TestAcc=0.835, TestF1=0.532




Epoch 12: TrainLoss=0.4661, TestAcc=0.799, TestF1=0.509




Epoch 13: TrainLoss=0.4554, TestAcc=0.871, TestF1=0.512




Epoch 14: TrainLoss=0.4562, TestAcc=0.506, TestF1=0.368




Epoch 15: TrainLoss=0.4402, TestAcc=0.778, TestF1=0.509




Epoch 16: TrainLoss=0.4136, TestAcc=0.775, TestF1=0.512
Early stopping: no improvement after 5 epochs.
