In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.transforms import Resize
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import soundfile as sf
import torchaudio
from scipy.signal import stft
import glob

plt.rcParams['figure.dpi'] = 600


In [17]:
# Takes the directory with the data and returns pandas with metadata
def load_metadata(directory, trim=False):
    if trim:
        df = pd.read_csv(directory+'/train_metadata_trim.csv')
    else:
        df = pd.read_csv(directory+'/train_metadata.csv')
    df['filename'] = directory+"/train_audio/"+df['filename']
    chosen_coloumns = ['latitude', 'longitude', 'common_name', 'rating', 'filename', 'primary_label']
    return df[chosen_coloumns]


# Takes filepath from metadata dataframe and returns audio file
def load_audiofile(filepath):
    audio, sr = sf.read(filepath)
    return audio.astype(np.float32), sr


# Converts ogg audio to waveform and spectrogram. Exact values for melspectrogram function might need to be changed values currently chosen from https://www.kaggle.com/code/awsaf49/birdclef23-pretraining-is-all-you-need-train
# audio -- Can be filepath from metadata dataframe or numpy array with ogg data
def get_melspectrogram(audio, sr=32000, n_mels=128, n_fft=2028, hop_length=512, fmax=16000, fmin=20,power=2.0,top_db=100):
    if type(audio) is str:
        audio, sr = load_audiofile(audio)
    waveform = torch.from_numpy(audio)
    transform = torchaudio.transforms.MelSpectrogram( 
                                    sample_rate=sr, 
                                    n_mels=n_mels,
                                    n_fft=n_fft,
                                    hop_length=hop_length, #base value from function in notebook it is calculated as duration_of_audio*sr//(384-1)
                                    f_max=fmax,
                                    f_min=fmin,
                                    power=2.0
                                    )
    melspectrogram = transform(waveform)

    melspectrogram = torchaudio.transforms.AmplitudeToDB()(melspectrogram)
    melspectrogram = torch.nn.functional.normalize(melspectrogram, p=2, dim=0)
    
    melspectrogram = (melspectrogram * 255)

    return melspectrogram

#Calculates Short Time Fourier Transformation of an audio file
# audio -- Can be filepath from metadata dataframe or numpy array with ogg data
def get_STFT(audio, sr=32000, n_fft=2028, nperseg=512):
    if type(audio) is str:
        audio, sr = load_audiofile(audio)
    stft_audio = stft(audio, nfft=n_fft, nperseg=nperseg)
    return stft_audio



In [21]:
#load data
path = "D:\KU\Masters\AppML\APPML-BirdCLEF\data"
meta_data = load_metadata(path, trim=True)


In [19]:
def load_audiofile2(filepath, cutoff_time):
    #Function that takes a file and cutoff time to create training and validation sets for training
    #If the audioclip is lower than the cutoff_time then the clip is looped untill desired duration reached
    audio, sr = sf.read(filepath)
    duration = len(audio) / sr


    if duration >= cutoff_time:
        training_audio = audio[:int(sr * 15)]
        validation_audio = audio[int(sr * 15):int(sr * 30)]
    else:
        
        loop_count = int(np.ceil(cutoff_time / duration))
        audio = np.tile(audio, loop_count)

        training_audio = audio[:int(sr * 15)]
        validation_audio = audio[int(sr * 15):int(sr * 30)]

    return training_audio.astype(np.float32), validation_audio.astype(np.float32), sr

## Spectrogram extraction

In [22]:
import random
#load one of each birds data
spectrograms = []
selected_spots = random.sample(range(len(meta_data) - 4), 100)  

for common_name in meta_data['primary_label'].unique():
    index = meta_data.loc[meta_data['primary_label'] == common_name].index[0] 
    spectrogram = get_melspectrogram(meta_data['filename'][index])  
    spectrograms.append([spectrogram, common_name])

for i in selected_spots:
    for j in range(i, i + 4):
        spectrograms.append([get_melspectrogram(meta_data['filename'][j]), meta_data['primary_label'][j]])

spectrograms = np.asarray(spectrograms)

  spectrograms = np.asarray(spectrograms)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (664, 2) + inhomogeneous part.

In [None]:
#Split data into validation and test. (Need to look at this again, possible mistake in the validation data creation)
train_ratio = 0.8
train_size = int(train_ratio * len(spectrograms))

train_labels = spectrograms[:,1]

label_mapping = {label: index for index, label in enumerate(set(train_labels))}

spectrograms[:,1] = [label_mapping.get(label, -1)+1 for label in train_labels]

validation_set = []
training_set = []

#Split data 
for i, (spectrogram, label) in enumerate(spectrograms):
    shape = np.shape(spectrogram)
    if shape[1] >= 100:
        validation_data = spectrogram[:, :50]
        validation_set.append([validation_data, label])
        
        remaining_data = spectrogram[:, 50:]
        num_chunks = remaining_data.shape[1] // 50
        if num_chunks > 0:
            chunks = np.split(remaining_data[:, :num_chunks*50], num_chunks, axis=1)
            for chunk in chunks:
                training_set.append([chunk, label])
    else: print(i,label)


605 188
645 95


## Network

### Direct Spectrogram

In [None]:
#Working
class CNN(nn.Module):
    def __init__(self, num_classes):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(12288, num_classes)
        self.fc2 = nn.Linear(num_classes, num_classes)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool(x)
        size = x.shape[1] * x.shape[2] * x.shape[3]
        x = x.view(x.size(0), -1)
        self.fc1 = nn.Linear(size, num_classes)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [None]:
#Working
class CNN(nn.Module):
    def __init__(self, num_classes):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(12288, num_classes)
        self.fc2 = nn.Linear(num_classes, num_classes)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [None]:

#Load data into batches of 32
batch_size = 32
train_loader = DataLoader(training_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)

# Initialize the CNN and use +1 for classes due to "no class" being labeled as -1
num_classes = len(meta_data['primary_label'].unique())+1
cnn = CNN(num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn.parameters(), lr=0.001)

# Training loop and attempt to use cuda
num_epochs = 10
device = torch.device("cpu")
cnn.to(device)

for epoch in range(num_epochs):
    print("epoch : ", epoch)
    cnn.train()
    train_loss = 0.0
    train_correct = 0
    i = 0 
    for images, labels in train_loader:
        #load data onto device, either gpu or cpu
        images = images.unsqueeze(1).to(device) 
        labels = labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = cnn(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs, 1)
        train_correct += (predicted == labels).sum().item()
        i+=1
        
    # Validation loop
    cnn.eval()
    val_loss = 0.0
    val_correct = 0
    
    with torch.no_grad(): #Disables some calculations, used to reduce memory.
        for images, labels in val_loader:
        #load data onto device, either gpu or cpu
            images = images.unsqueeze(1).to(device)
            labels = labels.to(device)
            
            outputs = cnn(images)
            loss = criterion(outputs, labels)
            
            val_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == labels).sum().item()
    
    train_loss /= len(train_loader.dataset)
    train_acc = train_correct / len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    val_acc = val_correct / len(val_loader.dataset)
    
    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

epoch :  0
Epoch 1/10:
Train Loss: 1.8677 | Train Acc: 0.6185
Val Loss: 2.2717 | Val Acc: 0.5076
epoch :  1
Epoch 2/10:
Train Loss: 0.3637 | Train Acc: 0.9002
Val Loss: 1.8863 | Val Acc: 0.6042
epoch :  2
Epoch 3/10:
Train Loss: 0.1959 | Train Acc: 0.9447
Val Loss: 2.1042 | Val Acc: 0.5967
epoch :  3
Epoch 4/10:
Train Loss: 0.1379 | Train Acc: 0.9615
Val Loss: 2.3378 | Val Acc: 0.5770
epoch :  4
Epoch 5/10:
Train Loss: 0.1186 | Train Acc: 0.9662
Val Loss: 2.0243 | Val Acc: 0.6148
epoch :  5
Epoch 6/10:
Train Loss: 0.1042 | Train Acc: 0.9700
Val Loss: 1.7906 | Val Acc: 0.6918
epoch :  6
Epoch 7/10:
Train Loss: 0.0947 | Train Acc: 0.9742
Val Loss: 2.2409 | Val Acc: 0.6329
epoch :  7
Epoch 8/10:
Train Loss: 0.0756 | Train Acc: 0.9787
Val Loss: 2.1307 | Val Acc: 0.6495
epoch :  8
Epoch 9/10:
Train Loss: 0.0713 | Train Acc: 0.9797
Val Loss: 2.4943 | Val Acc: 0.6148
epoch :  9
Epoch 10/10:
Train Loss: 0.0603 | Train Acc: 0.9821
Val Loss: 2.2951 | Val Acc: 0.6631


In [None]:
def split_audio(audio, sr, segment_duration=30):
    segment_length = segment_duration * sr  # Length of each segment in samples
    total_samples = len(audio)
    num_segments = total_samples // segment_length

    segments = []
    for i in range(num_segments):
        segment_start = i * segment_length
        segment_end = segment_start + segment_length
        segment = audio[segment_start:segment_end]
        segments.append([segment,i*segment_duration])
    
    return segments

In [None]:
import os
arr = os.listdir(test_path)

for file in arr:
    print(file)



soundscape_29201.ogg


In [None]:
import csv

test_path = r"C:\Users\zhakk\Desktop\Uni\Kandidat\AML-BirdCLEFproject\data\birdCLEF2023\test_soundscapes"

for file in arr:
    audio,sr = load_audiofile(test_path+"/"+file)
    audio_segment = split_audio(audio,sr)
    spectrograms = []
    for i in audio_segment:
        spectrogram = get_melspectrogram(i[0])  
        spectrograms.append([spectrogram, i[1]])

    spectrograms = np.asarray(spectrograms)
    test_set = []

    for i, (spectrogram, time) in enumerate(spectrograms):
        shape = np.shape(spectrogram)
        if shape[1] >= 100:
            test_data = spectrogram[:, :50]  # Use the first 50 time steps as test data
            test_set.append([test_data, time])
        else:
            print(i, label)

    # Convert the test data to a numpy array for easier manipulation
    test_set = np.array(test_set)


    # Separate the test features and labels
    test_features = test_set[:, 0]
    test_times = test_set[:, 1]
    
    test_loader = DataLoader(test_features, batch_size=batch_size, shuffle=False)
    cnn.eval()

    predictions = []
    with torch.no_grad():
        for images in test_loader:
            images = images.unsqueeze(1).to(device)
            outputs = cnn(images)
            probabilities = torch.softmax(outputs, dim=1)
            predictions.extend(probabilities.tolist())

    test_data = []

    for start_time, probabilities in zip(test_times, predictions):
        row_id = file + '_' + str(start_time)
        test_data_row = [row_id] + probabilities

        test_data.append(test_data_row)

    # Write test data to a CSV file
    output_file = 'submission.csv'
    header = ['row_id'] + list(label_mapping.keys())

    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(test_data)

    print('Test data saved to', output_file)


Test data saved to submission.csv


  spectrograms = np.asarray(spectrograms)
  spectrograms = np.asarray(spectrograms)
  test_set = np.array(test_set)
  test_set = np.array(test_set)
