In [None]:
# --- Hyperparameters and Configuration ---
# Everything a reader might want to tune for a run lives in this cell;
# the cells below only read these names.

# Number of passes over the training set (one train + one validation pass per epoch)
epoch_num = 500
# Number of samples per batch, used by both the training and validation DataLoader
batch_size = 40
# Learning rate passed to the AdamW optimizer
learning_rate = 1e-3

# Root data directory; it is expected to contain "train/" and "validation/"
# subfolders, each holding one subfolder per class
train_data_dir = "./train-val-independent/"

# Paths of the CSV listings generated from the train and validation folders
train_fn = "./csv_lists_indep/trainData.csv"
eval_fn = "./csv_lists_indep/valiData.csv"

# Suffix identifying this training run; it is embedded in the output folder names below
str_id = "_msplus_v2_independent"

# Where model checkpoints (.pt state dicts) and TensorBoard event files are written
output_dir = f"./results{str_id}/checkpoints"
log_dir = f"./results{str_id}/logs"


In [None]:
# --- Library Imports ---

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import librosa
import numpy as np
import os
import csv

from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score
from torch.utils.tensorboard import SummaryWriter
from datasets import load_dataset


In [None]:
# --- CSV Generation for Data Loading ---

# Make sure the parent folder of each CSV listing exists before anything is written
for _csv_path in (train_fn, eval_fn):
    os.makedirs(os.path.dirname(_csv_path), exist_ok=True)

def generate_csv(folder_path, output_csv):
    """
    Generates a CSV file from a directory structure.
    Every subdirectory of folder_path is treated as one class label (e.g., a species
    name) and every regular file inside it as one sample of that class.
    The CSV has the columns: Fname, Genera, Species.

    Entries are visited in sorted order, so both the CSV rows and the returned
    label ids are identical on every run. Non-directory entries directly under
    folder_path (e.g. ".DS_Store", stray README files) are ignored and never
    become a class.

    Args:
        folder_path (str): The path to the data directory (e.g., "./data/train/").
        output_csv (str): The path where the output CSV file will be saved.

    Returns:
        dict: A dictionary mapping each species label to a unique integer ID,
            assigned in alphabetical order of the class folder names.
    """
    rows = []
    species_names = []

    # Sort explicitly: os.listdir returns entries in arbitrary order
    for species in sorted(os.listdir(folder_path)):
        species_path = os.path.join(folder_path, species)

        # Only directories are classes; skip loose files in the root folder
        if not os.path.isdir(species_path):
            continue
        species_names.append(species)

        # Collect the regular files of this class
        for fname in sorted(os.listdir(species_path)):
            file_path = os.path.join(species_path, fname)
            if os.path.isfile(file_path):
                rows.append({
                    # Store forward slashes so the listing is portable across OSes
                    "Fname": file_path.replace("\\", "/"),
                    "Genera": "",  # Genera is not used in this script, kept for compatibility
                    "Species": species
                })

    # Write the collected data to a CSV file
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["Fname", "Genera", "Species"])
        writer.writeheader()
        writer.writerows(rows)

    # Deterministic mapping of class names to integer indices
    return {name: idx for idx, name in enumerate(species_names)}

# Build the training listing; its return value is the species -> integer id map
label_list = generate_csv(f"{train_data_dir}train/", train_fn)
print(f"Train CSV created: {train_fn}")

# Build the validation listing (the label map it returns is discarded)
generate_csv(f"{train_data_dir}validation/", eval_fn)
print(f"Validation CSV created: {eval_fn}")

# The number of classes follows from the labels discovered in the training folder
cls_num = len(label_list)
print(f"Generated label map: {label_list}")


In [None]:
# --- Model Definition (MosquitoSongPlus) ---

class MosquitoSongPlus(nn.Module):
    """
    A 1D Convolutional Neural Network for audio classification, inspired by MosquitoSong.

    Architecture: three Conv1d -> ReLU -> MaxPool1d stages, followed by dropout
    and three fully connected layers that produce raw class logits.

    Args:
        input_size (int): Number of samples in each input waveform.
        num_classes (int): Number of output logits (one per class).

    Raises:
        ValueError: If input_size is too short to survive all conv/pool stages.
    """
    def __init__(self, input_size=16000, num_classes=8):
        super(MosquitoSongPlus, self).__init__()

        # Convolutional layers
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=100, stride=4, padding=2)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=64, stride=4, padding=2)
        self.conv3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=64, stride=3, padding=2)

        # Max-pooling layer (stateless, shared by all three stages)
        self.pool = nn.MaxPool1d(kernel_size=3, stride=3)

        # Dropout layer for regularization
        self.dropout = nn.Dropout(0.5)

        # Dynamically calculate the input size for the first fully connected layer.
        # Must run after the conv/pool layers exist because it reads their settings.
        fc_input_size = self.calculate_fc_input_size(input_size)

        # Fully connected layers
        self.fc1 = nn.Linear(fc_input_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, num_classes)

    @staticmethod
    def _scalar(value):
        """Return the int from a hyperparameter stored either as an int or a 1-tuple."""
        return value[0] if isinstance(value, (tuple, list)) else value

    @classmethod
    def _output_length(cls, length, layer):
        """
        Output length of a Conv1d/MaxPool1d layer for an input of `length` samples.
        Uses floor((L_in + 2*padding - kernel_size) / stride) + 1, which assumes
        dilation == 1 (true for every layer defined in __init__).
        """
        kernel = cls._scalar(layer.kernel_size)
        stride = cls._scalar(layer.stride)
        padding = cls._scalar(layer.padding)
        return (length + 2 * padding - kernel) // stride + 1

    def calculate_fc_input_size(self, input_size):
        """
        Calculates the flattened feature size after all conv and pool layers.
        This makes the model adaptable to different input audio lengths.

        The kernel size, stride and padding are read from the layer objects
        themselves, so this calculation cannot drift out of sync with the
        layer definitions in __init__.

        Args:
            input_size (int): Number of samples in the input waveform.

        Returns:
            int: Number of features fed to the first fully connected layer.

        Raises:
            ValueError: If any stage would produce an empty output.
        """
        size = input_size
        for conv in (self.conv1, self.conv2, self.conv3):
            # Each stage in forward() is a convolution followed by the shared pool
            for layer in (conv, self.pool):
                size = self._output_length(size, layer)
                if size < 1:
                    raise ValueError(
                        f"input_size={input_size} is too short for the "
                        "convolution/pooling stack of MosquitoSongPlus"
                    )
        # The final size is multiplied by the number of output channels from the last conv layer
        return size * self.conv3.out_channels

    def forward(self, x):
        """
        Defines the forward pass of the model.

        Args:
            x (torch.Tensor): Input of shape (batch_size, 1, input_size).

        Returns:
            torch.Tensor: Raw logits of shape (batch_size, num_classes).
        """
        # Input shape: (batch_size, 1, input_size)
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)

        # Flatten the features for the fully connected layers
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.fc3(x)

        # We return raw logits because nn.CrossEntropyLoss applies softmax internally.
        return x

# Build a model with one output per discovered class and print its layer summary
model = MosquitoSongPlus(num_classes=cls_num, input_size=16000)
print(model)

In [None]:
# --- Data Loading and Preprocessing ---

# Read both CSV listings with the Hugging Face `datasets` CSV loader;
# the validation listing is exposed under the "test" split name
csv_files = {"train": train_fn, "test": eval_fn}
dataset = load_dataset("csv", data_files=csv_files)
print(dataset)

# Species name -> id comes from the CSV generation step; invert it for id -> species name
label2id = label_list
id2label = dict((class_id, species) for species, class_id in label2id.items())

def preprocess_csv_function(examples):
    """
    Converts one CSV row into the fields used downstream.

    Looks up the row's "Species" string in the module-level `label2id` map to
    get its integer class id, and copies the "Fname" path under a clearer key.

    Returns:
        dict: {"label": <int class id>, "audio_filepath": <path string>}
    """
    species_name = examples["Species"]
    class_id = label2id[species_name]
    return {"label": class_id, "audio_filepath": examples["Fname"]}

# Attach integer labels and audio paths to every row of both splits
csv_processed_dataset = dataset.map(preprocess_csv_function)

# Name the two splits explicitly for the cells that follow
train_dataset = csv_processed_dataset["train"]
validation_dataset = csv_processed_dataset["test"]

# Sanity checks: one processed training row, the validation split summary, the class names
print(train_dataset[0])
print(f"Validation dataset info: {validation_dataset}")
print(f"Class labels (id2label): {id2label}")


In [None]:
def preprocess_audio_function(examples, target_length=16000):
    """
    Turns one dataset row into a fixed-length, normalized waveform plus its label.

    The file at examples['audio_filepath'] is loaded with librosa at 16 kHz,
    passed through librosa.util.normalize, and then forced to exactly
    `target_length` samples: longer clips keep only their first
    `target_length` samples, shorter clips are zero-padded at the end.

    Args:
        examples (dict): A single row with 'audio_filepath' and 'label' keys.
        target_length (int): Number of samples in the returned waveform.

    Returns:
        dict: {"input_values": <waveform array>, "labels": <int class id>}
    """
    # Load at a fixed 16 kHz sample rate; the returned rate is not needed afterwards
    waveform, _ = librosa.load(examples['audio_filepath'], sr=16000)
    waveform = librosa.util.normalize(waveform)

    n_samples = len(waveform)
    if n_samples > target_length:
        # Too long: keep the leading part only
        waveform = waveform[:target_length]
    elif n_samples < target_length:
        # Too short: append zeros up to the target length
        waveform = np.pad(waveform, (0, target_length - n_samples), mode="constant")

    return {"input_values": waveform, "labels": int(examples["label"])}


In [None]:
# Columns inherited from the CSV that are dropped once the waveform has been loaded
raw_csv_columns = ["Fname", "Genera", "Species"]

# Load and encode the audio of every row in both splits
encoded_train_dataset = train_dataset.map(preprocess_audio_function, remove_columns=raw_csv_columns)
encoded_validation_dataset = validation_dataset.map(preprocess_audio_function, remove_columns=raw_csv_columns)


In [None]:
#print(encoded_train_dataset["input_values"][0])
#encoded_train_dataset["input_values"][0]


In [None]:
def collate_fn(batch):
    """
    Assembles a list of dataset items into one batch for the DataLoader.

    Args:
        batch (list[dict]): Items with an "input_values" sequence and an
            integer "labels" entry.

    Returns:
        dict: "input_values" as a float32 tensor with the items stacked along
            a new first dimension, and "labels" as a 1-D long tensor.
    """
    waveforms = []
    targets = []
    for sample in batch:
        # Each waveform becomes its own float32 tensor before stacking
        waveforms.append(torch.tensor(sample["input_values"], dtype=torch.float32))
        targets.append(sample["labels"])

    return {
        "input_values": torch.stack(waveforms),
        "labels": torch.tensor(targets, dtype=torch.long),
    }


In [None]:
# --- Metrics, Training, and Evaluation Functions ---

def compute_metrics(predictions, labels):
    """
    Scores a set of logits against the true class ids.

    Args:
        predictions (torch.Tensor): Logits; the predicted class of each row is
            the argmax along dimension 1.
        labels (torch.Tensor): True integer class ids.

    Returns:
        dict: "accuracy", "precision", "recall", "f1" and "balanced_accuracy".
            Precision, recall and F1 use the 'weighted' average so that class
            imbalance is accounted for, with undefined values reported as 0.
    """
    # Move both to NumPy on the CPU, which is what scikit-learn consumes
    y_pred = torch.argmax(predictions, dim=1).cpu().numpy()
    y_true = labels.cpu().numpy()

    # Shared options of the three per-class averaged scores
    weighted = {"average": "weighted", "zero_division": 0}

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, **weighted),
        "recall": recall_score(y_true, y_pred, **weighted),
        "f1": f1_score(y_true, y_pred, **weighted),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
    }


In [None]:
def train(model, train_loader, criterion, optimizer, device):
    """
    Runs a single training epoch over `train_loader`.

    Args:
        model: The network to optimize; switched to training mode here.
        train_loader: Iterable of batches with "input_values" and "labels" tensors.
        criterion: Loss function called as criterion(outputs, labels).
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        float: Mean of the per-batch loss values over the epoch.
    """
    model.train()
    running_loss = 0

    progress = tqdm(train_loader, desc="Training", leave=False)
    for batch in progress:
        # Insert the channel dimension, (batch, length) -> (batch, 1, length),
        # then move the batch to the target device
        features = batch["input_values"].unsqueeze(1)
        features = features.to(device)
        targets = batch["labels"].to(device)

        # Standard step: clear old gradients, forward, loss, backward, update
        optimizer.zero_grad()
        logits = model(features)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(train_loader)

def evaluate(model, eval_loader, criterion, device):
    """
    Performs evaluation on the validation set.

    Args:
        model: The network to evaluate; switched to evaluation mode here.
        eval_loader: Iterable of batches with "input_values" and "labels" tensors.
        criterion: Loss function called as criterion(outputs, labels).
        device: Device the batch tensors are moved to for the forward pass.

    Returns:
        tuple: (mean of the per-batch loss values, metrics dict produced by
            compute_metrics over all batches together).
    """
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():  # Disable gradient calculation for efficiency
        for batch in tqdm(eval_loader, desc="Evaluating", leave=False):
            # unsqueeze(1) adds the channel dimension: (batch, length) -> (batch, 1, length)
            inputs = batch["input_values"].unsqueeze(1).to(device)
            labels = batch["labels"].to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            # Keep the collected results on the CPU so that accelerator memory
            # is released after every batch instead of growing with the size
            # of the validation set; the metrics are computed on the CPU anyway.
            all_predictions.append(outputs.cpu())
            all_labels.append(labels.cpu())

    # Concatenate all batch predictions and labels
    all_predictions = torch.cat(all_predictions)
    all_labels = torch.cat(all_labels)

    # Compute metrics over the entire validation set
    metrics = compute_metrics(all_predictions, all_labels)

    return total_loss / len(eval_loader), metrics


In [None]:
# --- Main Training Loop ---

# Create output directories and initialize TensorBoard writer
os.makedirs(output_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir)

# Everything that uses the writer runs inside try/finally so that the
# TensorBoard event file is flushed and closed even when training raises or
# is interrupted by hand (likely with a run of this length in a notebook).
try:
    # Set device, initialize model, loss function, and optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MosquitoSongPlus(input_size=16000, num_classes=len(label2id)).to(device)
    criterion = nn.CrossEntropyLoss()
    # AdamW is a variant of Adam with decoupled weight decay regularization
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)

    # Create DataLoader instances for training and evaluation
    # (only the training data is shuffled)
    train_loader = DataLoader(encoded_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    eval_loader = DataLoader(encoded_validation_dataset, batch_size=batch_size, collate_fn=collate_fn)

    # Variables to track the best model.
    # best_model_path stays "" until some epoch exceeds a balanced accuracy of 0.
    best_balanced_acc = 0
    best_model_path = ""

    # Start the training loop
    for epoch in range(epoch_num):
        train_loss = train(model, train_loader, criterion, optimizer, device)
        val_loss, metrics = evaluate(model, eval_loader, criterion, device)

        # Log metrics to TensorBoard for visualization
        writer.add_scalar("Loss/Train", train_loss, epoch)
        writer.add_scalar("Loss/Validation", val_loss, epoch)
        writer.add_scalar("Accuracy/Validation", metrics["accuracy"], epoch)
        writer.add_scalar("BalancedAccuracy/Validation", metrics["balanced_accuracy"], epoch)
        writer.add_scalar("F1/Validation", metrics["f1"], epoch)

        # Print epoch results to the console
        print(f"Epoch {epoch + 1}/{epoch_num}")
        print(f"  Training Loss: {train_loss:.4f}")
        print(f"  Validation Loss: {val_loss:.4f}")
        print(f"  Metrics: {metrics}")

        # Save the model if it has the best balanced accuracy so far.
        # Each improvement is written to its own file named after the epoch,
        # so earlier "best" checkpoints are kept on disk.
        if metrics["balanced_accuracy"] > best_balanced_acc:
            best_balanced_acc = metrics["balanced_accuracy"]
            best_model_path = os.path.join(output_dir, f"best_model_epoch_{epoch + 1}.pt")
            torch.save(model.state_dict(), best_model_path)
            print(f"  ---> New best model saved to {best_model_path}")

    # Save the final model after the last epoch
    last_model_path = os.path.join(output_dir, "last_model.pt")
    torch.save(model.state_dict(), last_model_path)

    print("\n--- Training Finished ---")
    print(f"Best model was saved at: {best_model_path} with Balanced Accuracy: {best_balanced_acc:.4f}")
    print(f"Last model saved at: {last_model_path}")
finally:
    # Close the TensorBoard writer
    writer.close()
