In [9]:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

import pandas as pd

import scipy.io.wavfile as wav
from scipy.signal import spectrogram

import os

import cv2

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import LabelEncoder

import random

from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score


In [5]:
def sauvolaThresholding(intensityNorm, windowSize=15, k=0.5, R=128):
    intensityScaled = (intensityNorm * 255).astype(np.uint8)

    mean = cv2.boxFilter(intensityScaled, ddepth=-1, ksize=(windowSize, windowSize))
    meanSq = cv2.boxFilter(intensityScaled**2, ddepth=-1, ksize=(windowSize, windowSize))
    
    std = np.sqrt(meanSq - mean**2)

    threshold = mean * (1 + k * ((std / R) - 1))

    binarySpectrogram = intensityScaled > threshold

    return binarySpectrogram



def saveBinarySpectrogramAsImage(audioFile, outputPath):
    sampleRate, audioData = wav.read(audioFile)

    # If the audio has multiple channels, use only the first one
    if len(audioData.shape) > 1:
        audioData = audioData[:, 0]

    # Set FFT Window Size and Overlap
    nps = 2048  # Larger FFT window improves frequency resolution
    frequencies, time, intensity = spectrogram(audioData, 
                                               fs=sampleRate, 
                                               nperseg=nps, 
                                               noverlap=(nps // 2))

    mask = (frequencies >= 10000) & (frequencies <= 40000)
    frequencies = frequencies[mask]
    intensity = intensity[mask, :]  

    # Normalize the intensity
    intensityNorm = np.clip(intensity / np.max(intensity), 0, 1)

    binarySpectrogram = sauvolaThresholding(intensityNorm, windowSize=15, k=0.5)

    # Save the binary spectrogram as an image
    plt.figure(figsize=(10, 6))
    plt.pcolormesh(time, frequencies, binarySpectrogram, cmap='gray', shading='gouraud')
    plt.axis('off')  
    plt.tight_layout()
    plt.savefig(outputPath, bbox_inches='tight', pad_inches=0)
    plt.close()  


def processImage(inputImagePath, outputFolder):
    image = cv2.imread(inputImagePath, 0)
    kernel = np.ones((5, 5), np.uint8)

    # Apply median filtering
    medianFilteredImage = cv2.medianBlur(image, 5)

    # Apply dilation
    dilatedImage = cv2.dilate(medianFilteredImage, kernel, iterations=1)

    # Apply erosion
    dilatedThenEroded = cv2.erode(dilatedImage, kernel, iterations=1)

    processedImage = dilatedThenEroded
    baseName = os.path.splitext(os.path.basename(inputImagePath))[0]
    cv2.imwrite(os.path.join(outputFolder, f"{baseName}_processed_img.png"), processedImage)

if __name__ == "__main__":
    # for testing -  testClips  # main folder - LabelledAudioChunks     
    audioFolder = "LabelledAudioChunks"  # Folder containing audio files      # default = LabelledAudioChunks
    binarySpectrogramFolder = "binarySpectrogramImages"  # Folder to save binary spectrogram images     # default = binarySpectrogramImages #test = testBinSpecImgsFolder
    processedImagesFolder = "processedSpectrogramImages"  # Folder to save processed images  # default = processedSpectrogramImages    #test = testProcessedSpecImgsFolder

    # Create output folders if they don't exist
    os.makedirs(binarySpectrogramFolder, exist_ok=True)
    os.makedirs(processedImagesFolder, exist_ok=True)

    # Generate binary spectrograms with progress bar
    audioFiles = [file for file in os.listdir(audioFolder) if file.endswith(".wav")]
    for file in tqdm(audioFiles, desc="Generating Binary Spectrograms"):
        filePath = os.path.join(audioFolder, file)
        outputImagePath = os.path.join(binarySpectrogramFolder, f"{os.path.splitext(file)[0]}_binary.png")

        # Generate and save the binary spectrogram as an image
        saveBinarySpectrogramAsImage(filePath, outputImagePath)

    # Process each generated binary spectrogram with progress bar
    spectrogramFiles = [file for file in os.listdir(binarySpectrogramFolder) if file.endswith(".png")]
    for file in tqdm(spectrogramFiles, desc="Processing Spectrogram Images"):
        inputImagePath = os.path.join(binarySpectrogramFolder, file)

        # Apply image processing and save the results
        processImage(inputImagePath, processedImagesFolder)

Generating Binary Spectrograms: 100%|██████████| 1685/1685 [52:29<00:00,  1.87s/it]
Processing Spectrogram Images: 100%|██████████| 1685/1685 [00:40<00:00, 41.16it/s]


In [10]:
# CNN model

random.seed(123)
np.random.seed(123)
torch.manual_seed(123)
torch.cuda.manual_seed_all(123)

class SpectrogramDataset(Dataset):
    def __init__(self, imageFolder, img_size=(224, 224)):
        self.imageFolder = imageFolder
        self.img_size = img_size
        self.images = []
        self.labels = []
        self.label_encoder = LabelEncoder()
        
        self.loadData()

    def loadData(self):
        imageFiles = [file for file in os.listdir(self.imageFolder) if file.endswith(".png")]
        
        for file in tqdm(imageFiles, desc="Processing Images"):
            filePath = os.path.join(self.imageFolder, file)
            
            image = cv2.imread(filePath, cv2.IMREAD_GRAYSCALE)
            if image is None:
                print(f"Error loading image {filePath}")
                continue
            
            resizedImage = cv2.resize(image, self.img_size)
            self.images.append(resizedImage)
            
            label = self.getSpeciesLabel(file)
            self.labels.append(label)
        
        self.images = np.array(self.images)
        self.labels = self.label_encoder.fit_transform(self.labels)

    def getSpeciesLabel(self, fileName):
        return fileName.split('_')[0]


    # 'dunder' methods (basically just used to make the dataset compatible with PyTorch's DataLoader)
    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx].astype('float32') / 255.0  # normalize image
        image = torch.tensor(image).unsqueeze(0)  
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return image, label



class CNNModel(nn.Module):
    def __init__(self, num_classes):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * 56 * 56, 128) 
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  # flatten 
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


def trainModel(model, train_loader, val_loader, num_epochs):    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)  # learning rate

    for epoch in range(num_epochs):
        model.train()
        runningLoss = 0.0
        correct = 0
        total = 0

        for images, labels in train_loader:
            optimizer.zero_grad()

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            runningLoss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        epochLoss = runningLoss / len(train_loader)
        epochAccuracy = 100 * correct / total

        print(f"\nEpoch {epoch+1}/{num_epochs}, Loss: {epochLoss:.4f}, Accuracy: {epochAccuracy:.2f}%")

        # Validation phase
        model.eval()
        valCorrect = 0
        valTotal = 0
        allPreds = []
        allLabels = []

        with torch.no_grad():
            for images, labels in val_loader:
                outputs = model(images)
                _, predicted = torch.max(outputs, 1)
                valTotal += labels.size(0)
                valCorrect += (predicted == labels).sum().item()

                # Store predictions and labels
                allPreds.extend(predicted.cpu().numpy())
                allLabels.extend(labels.cpu().numpy())

        valAccuracy = 100 * valCorrect / valTotal
        print(f"Validation Accuracy: {valAccuracy:.2f}%")

        # Compute and display confusion matrix
        cm = confusion_matrix(allLabels, allPreds)
        cm_df = pd.DataFrame(cm, index=dataset.label_encoder.classes_, columns=dataset.label_encoder.classes_)
        print(f"\nConfusion Matrix (Epoch {epoch+1}):")
        print(cm_df.to_string())

        # Calculate Precision, Recall, F1 Score
        precision = precision_score(allLabels, allPreds, average='weighted', zero_division=0)
        recall = recall_score(allLabels, allPreds, average='weighted', zero_division=0)
        f1 = f1_score(allLabels, allPreds, average='weighted', zero_division=0)

        print(f"Precision (Weighted): {precision:.4f}")
        print(f"Recall (Weighted):    {recall:.4f}")
        print(f"F1 Score (Weighted):  {f1:.4f}")

        # Optional: Full Classification Report
        print("\nClassification Report:")
        print(classification_report(allLabels, allPreds, target_names=dataset.label_encoder.classes_))


if __name__ == "__main__":
    imageFolder = "processedSpectrogramImages"  # default = processedSpectrogramImages
    imgSize = (224, 224) 
    
    dataset = SpectrogramDataset(imageFolder, imgSize)
    trainSize = int(0.8 * len(dataset))
    valSize = len(dataset) - trainSize
    trainDataset, valDataset = random_split(dataset, [trainSize, valSize])
    
    trainLoader = DataLoader(trainDataset, batch_size=10, shuffle=True)
    valLoader = DataLoader(valDataset, batch_size=10, shuffle=False)
    
    numClasses = len(np.unique(dataset.labels))
    
    model = CNNModel(num_classes=numClasses)
    
    # decide number of epochs
    # on seed 123 the model seems to startt overfitting past epoch 5
    trainModel(model, trainLoader, valLoader, num_epochs=10)

    # to save the model
    #torch.save(model.state_dict(), "cnn_grasshopper_cricket_classifier_26-02-2025.pth")


Processing Images: 100%|██████████| 1685/1685 [00:07<00:00, 237.90it/s]



Epoch 1/10, Loss: 0.4843, Accuracy: 84.79%
Validation Accuracy: 97.92%

Confusion Matrix (Epoch 1):
                      BackgroundNoise  MeadowGrasshopper  NoID1  NoID10  Roesel'sBush-Cricket
BackgroundNoise                   104                  0      0       0                     0
MeadowGrasshopper                   2                 12      0       0                     4
NoID1                               1                  0     15       0                     0
NoID10                              0                  0      0      11                     0
Roesel'sBush-Cricket                0                  0      0       0                   188
Precision (Weighted): 0.9797
Recall (Weighted):    0.9792
F1 Score (Weighted):  0.9775

Classification Report:
                      precision    recall  f1-score   support

     BackgroundNoise       0.97      1.00      0.99       104
   MeadowGrasshopper       1.00      0.67      0.80        18
               NoID1       1.00      