# Performance of classifier on im2wav mixed audio dataset

In [15]:
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models
import torch
import torchaudio
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import os
import torch
import librosa
import numpy as np
from PIL import Image
from torchvision import transforms
import csv
import tqdm

## Load self-trained best classifier

In [4]:
# Set configurations for the model
class ResNetClassifier(nn.Module):
    # Replacing original ResNet18's last layer with a custom classifier for multi-label classification
    def __init__(self, num_classes, use_dropout=False, dropout_rate=0.3):
        super().__init__()
        # Load a pretrained ResNet18 and modify the first and last layers
        resnet = models.resnet18(pretrained=True)
        resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.backbone = nn.Sequential(*list(resnet.children())[:-1])  # remove last layer(avg pool and fc) because we will use our own mlb classifier

        layers = [
            nn.Flatten(),# (B, 512, 1, 1) -> (B, 512)
            nn.Linear(512, 256),
            nn.ReLU()# apply ReLU activation function to introduce non-linearity
        ]
        if use_dropout:
            layers.append(nn.Dropout(p=dropout_rate))
        layers.append(nn.Linear(256, num_classes))# output logits for BCEWithLogitsLoss
        
        self.classifier = nn.Sequential(*layers)
        
    def forward(self, x):
        x = self.backbone(x)
        x = self.classifier(x)
        return x

In [5]:
# Process the audio data
n_mels = 128
n_fft = 1024
hop_length = 512
sample_rate = 16000
target_duration_sec = 4
target_length = sample_rate * target_duration_sec

mel_transform = torch.nn.Sequential(
    MelSpectrogram(
        sample_rate=sample_rate, 
        n_fft=n_fft, 
        hop_length=hop_length, 
        n_mels=n_mels
    ),
    AmplitudeToDB()
)

def preprocess_audio(filepath):
    waveform, sr = torchaudio.load(filepath)
    
    if waveform.shape[1] < target_length:
        waveform = torch.nn.functional.pad(waveform, (0, target_length - waveform.shape[1]))
    elif waveform.shape[1] > target_length:
        waveform = waveform[:, :target_length]
    
    mel_spec = mel_transform(waveform).squeeze(0)     # [128, Time]
    mel_spec = mel_spec.unsqueeze(0).unsqueeze(0)     # -> [1, 1, 128, Time] for model
    return mel_spec.cuda()

In [None]:
class_names = ['bird', 'cat', 'cow', 'dog', 'elephant', 'horse', 'lion', 'sheep']
num_classes = len(class_names)
test_folder = "E:/LSE/ST311/ST311-Group-Project/audio_output/single_object_audios"
model = ResNetClassifier(num_classes=num_classes, use_dropout=False)
model.load_state_dict(torch.load("E:/LSE/ST311/ST311-Group-Project/experiment/mixed_res/resnet_f1_0.9744.pth"))
model = model.cuda()
model.eval()



ResNetClassifier(
  (backbone): Sequential(
    (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_s

# Performance on Single Animal Audios

In [18]:
threshold = 0.3
results = []

for filename in os.listdir(test_folder):
    if filename.endswith(".wav"):
        path = os.path.join(test_folder, filename)
        input_tensor = preprocess_audio(path)

        with torch.no_grad():
            logits = model(input_tensor)
            probs = torch.sigmoid(logits)
            preds = (probs > threshold).squeeze().cpu().numpy()
            predicted_labels = [class_names[i] for i, p in enumerate(preds) if p]
            results.append([filename] + predicted_labels)
            print(f"{filename}: {predicted_labels}")

# Save results as CSV
csv_path = "inference_results_single.csv"
with open(csv_path, "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["filename"] + [f"tag{i+1}" for i in range(10)])  # header row
    for row in results:
        writer.writerow(row + [""] * (11 - len(row)))  # pad if fewer than 10 tags

print(f"\nResults saved in {csv_path}")

0_cat_bird1_bird1_0.wav: ['bird']
0_cow_dog1_cow0_0.wav: []
0_cow_horse1_cow1_0.wav: ['sheep']
0_cow_sheep2_cow0_0.wav: ['cow']
0_dog_bird1_bird1_0.wav: ['horse']
0_dog_cat2_cat1_0.wav: ['horse']
0_dog_horse2_dog1_0.wav: ['cow']
0_elephant_bird1_bird1_0.wav: ['bird']
0_elephant_lion1_elephant0_0.wav: ['lion']
0_horse_sheep1_horse1_0.wav: ['dog']
1_cat_bird1_cat0_0.wav: ['elephant']
1_cow_dog1_dog1_0.wav: ['horse']
1_cow_horse1_horse0_0.wav: ['cat', 'cow']
1_cow_sheep2_sheep1_0.wav: ['cat']
1_dog_bird1_dog0_0.wav: ['dog']
1_dog_cat2_dog0_0.wav: ['dog']
1_dog_horse2_horse0_0.wav: ['elephant']
1_elephant_bird1_elephant0_0.wav: ['cow', 'sheep']
1_elephant_lion1_lion0_0.wav: ['lion']
1_horse_sheep1_sheep0_0.wav: ['cat', 'sheep']
2_cat_bird2_bird1_0.wav: ['bird']
2_cow_dog2_cow0_0.wav: ['dog', 'elephant']
2_cow_horse2_cow0_0.wav: ['sheep']
2_cow_sheep3_cow0_0.wav: ['dog']
2_dog_cat4_cat0_0.wav: ['elephant']
2_dog_horse3_dog1_0.wav: ['dog']
2_elephant_lion3_elephant0_0.wav: ['sheep']
2_horse_

# Performance on Mixed Audios

In [17]:
threshold = 0.1
test_folder_mixed = "E:/LSE/ST311/ST311-Group-Project/audio_output/mixed_test_audios"
results = []

for filename in os.listdir(test_folder_mixed):
    if filename.endswith(".wav"):
        path = os.path.join(test_folder_mixed, filename)
        input_tensor = preprocess_audio(path)

        with torch.no_grad():
            logits = model(input_tensor)
            probs = torch.sigmoid(logits)
            preds = (probs > threshold).squeeze().cpu().numpy()
            predicted_labels = [class_names[i] for i, p in enumerate(preds) if p]
            results.append([filename] + predicted_labels)
            print(f"{filename}: {predicted_labels}")

# Save results as CSV
csv_path = "inference_results_01.csv"
with open(csv_path, "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["filename"] + [f"tag{i+1}" for i in range(10)])  # header row
    for row in results:
        writer.writerow(row + [""] * (11 - len(row)))  # pad if fewer than 10 tags

print(f"\nResults saved in {csv_path}")


mixed_cat_bird1.wav: ['bird']
mixed_cat_bird2.wav: ['cat', 'sheep']
mixed_cow_dog1.wav: ['dog', 'elephant', 'horse']
mixed_cow_dog2.wav: ['dog', 'elephant']
mixed_cow_dog5.wav: ['cow', 'elephant', 'sheep']
mixed_cow_horse1.wav: ['sheep']
mixed_cow_horse2.wav: ['cat', 'sheep']
mixed_cow_horse3.wav: ['cow', 'lion']
mixed_cow_horse4.wav: ['cat', 'elephant']
mixed_cow_sheep2.wav: ['cow', 'sheep']
mixed_cow_sheep3.wav: ['dog', 'sheep']
mixed_cow_sheep4.wav: ['elephant']
mixed_dog_bird1.wav: ['dog']
mixed_dog_cat2.wav: ['dog']
mixed_dog_cat4.wav: ['dog']
mixed_dog_cat6.wav: ['elephant', 'horse']
mixed_dog_horse2.wav: ['dog', 'elephant']
mixed_dog_horse3.wav: ['dog']
mixed_dog_horse7.wav: ['cat', 'dog']
mixed_elephant_bird1.wav: ['bird']
mixed_elephant_lion1.wav: ['lion']
mixed_elephant_lion3.wav: ['dog', 'lion']
mixed_elephant_lion4.wav: ['elephant']
mixed_horse_sheep1.wav: ['cat', 'dog']
mixed_horse_sheep2.wav: ['cat', 'cow', 'dog']

Results saved in inference_results_01.csv


In [None]:
threshold = 0.3
test_folder_mixed = "E:/LSE/ST311/ST311-Group-Project/audio_output/mixed_test_audios"
results = []

for filename in os.listdir(test_folder_mixed):
    if filename.endswith(".wav"):
        path = os.path.join(test_folder_mixed, filename)
        input_tensor = preprocess_audio(path)

        with torch.no_grad():
            logits = model(input_tensor)
            probs = torch.sigmoid(logits)
            preds = (probs > threshold).squeeze().cpu().numpy()
            predicted_labels = [class_names[i] for i, p in enumerate(preds) if p]
            results.append([filename] + predicted_labels)
            print(f"{filename}: {predicted_labels}")

# Save results as CSV
csv_path = "inference_results_higherth.csv"
with open(csv_path, "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["filename"] + [f"tag{i+1}" for i in range(10)])  # header row
    for row in results:
        writer.writerow(row + [""] * (11 - len(row)))  # pad if fewer than 10 tags

print(f"\nResults saved in {csv_path}")


mixed_cat_bird1.wav: ['bird']
mixed_cat_bird2.wav: ['cat']
mixed_cow_dog1.wav: []
mixed_cow_dog2.wav: ['dog', 'elephant']
mixed_cow_dog5.wav: ['sheep']
mixed_cow_horse1.wav: ['sheep']
mixed_cow_horse2.wav: ['cat', 'sheep']
mixed_cow_horse3.wav: ['lion']
mixed_cow_horse4.wav: ['cat']
mixed_cow_sheep2.wav: ['cow']
mixed_cow_sheep3.wav: ['sheep']
mixed_cow_sheep4.wav: ['elephant']
mixed_dog_bird1.wav: ['dog']
mixed_dog_cat2.wav: ['dog']
mixed_dog_cat4.wav: ['dog']
mixed_dog_cat6.wav: ['horse']
mixed_dog_horse2.wav: ['dog', 'elephant']
mixed_dog_horse3.wav: ['dog']
mixed_dog_horse7.wav: ['dog']
mixed_elephant_bird1.wav: ['bird']
mixed_elephant_lion1.wav: ['lion']
mixed_elephant_lion3.wav: ['lion']
mixed_elephant_lion4.wav: ['elephant']
mixed_horse_sheep1.wav: ['cat', 'dog']
mixed_horse_sheep2.wav: ['cat']

Results saved in inference_results_higherth.csv


## PANN model

We modified PANN's inference file and run the command in local terminal(Windows powershell). Results are saved in csv files.


python pytorch/inference_modified.py `
    --model_type=Cnn14_16k `
    --checkpoint_path=E:/LSE/ST311/ST311-Group-Project/Cnn14_16k_mAP=0.438.pth `
    --audio_folder="E:/LSE/ST311/ST311-Group-Project/audio_output/mixed_test_audios" `
    --cuda

# Evaluate performance

In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score

csv_path = "E:/LSE/ST311/ST311-Group-Project/inference_results_higherth.csv"

y_true_all = []
y_pred_all = []
all_labels_set = set()

with open(csv_path, 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # gain the ground truth labels from filename
        fname = row['filename']
        parts = fname.replace(".wav", "").split("_")[1:]  # delete 'mixed'
        true_labels = set(parts)
        
        # prediction（tag1 ~ tag10）
        pred_labels = set(filter(None, [row[f'tag{i}'] for i in range(1, 11)]))
        
        all_labels_set.update(true_labels)
        all_labels_set.update(pred_labels)

        y_true_all.append(true_labels)
        y_pred_all.append(pred_labels)

# turn into multi-hot vectors
labels_list = sorted(list(all_labels_set))
label_to_idx = {label: i for i, label in enumerate(labels_list)}

def to_multihot(label_set):
    vec = [0] * len(labels_list)
    for label in label_set:
        if label in label_to_idx:
            vec[label_to_idx[label]] = 1
    return vec

y_true_vecs = [to_multihot(s) for s in y_true_all]
y_pred_vecs = [to_multihot(s) for s in y_pred_all]


precision = precision_score(y_true_vecs, y_pred_vecs, average='micro')
recall = recall_score(y_true_vecs, y_pred_vecs, average='micro')
f1 = f1_score(y_true_vecs, y_pred_vecs, average='micro')

print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")


Precision: 0.3214
Recall:    0.1800
F1 Score:  0.2308


In [22]:
from sklearn.metrics import precision_score, recall_score, f1_score

csv_path = "E:/LSE/ST311/ST311-Group-Project/inference_results_single.csv"

y_true_all = []
y_pred_all = []
all_labels_set = set()

with open(csv_path, 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # gain the ground truth labels from filename
        fname = row['filename']
        parts = fname.replace(".wav", "").split("_")[1:]  # delete 'mixed'
        true_labels = set(parts)
        
        # prediction（tag1 ~ tag10）
        pred_labels = set(filter(None, [row[f'tag{i}'] for i in range(1, 11)]))
        
        all_labels_set.update(true_labels)
        all_labels_set.update(pred_labels)

        y_true_all.append(true_labels)
        y_pred_all.append(pred_labels)

# turn into multi-hot vectors
labels_list = sorted(list(all_labels_set))
label_to_idx = {label: i for i, label in enumerate(labels_list)}

def to_multihot(label_set):
    vec = [0] * len(labels_list)
    for label in label_set:
        if label in label_to_idx:
            vec[label_to_idx[label]] = 1
    return vec

y_true_vecs = [to_multihot(s) for s in y_true_all]
y_pred_vecs = [to_multihot(s) for s in y_pred_all]


precision = precision_score(y_true_vecs, y_pred_vecs, average='micro')
recall = recall_score(y_true_vecs, y_pred_vecs, average='micro')
f1 = f1_score(y_true_vecs, y_pred_vecs, average='micro')

print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")


Precision: 0.2034
Recall:    0.0600
F1 Score:  0.0927
