# Produce a Convolutional Neural Network that can distinguish between different Seal Vocalisations

We have produced 480 npz files containing spectrogram data for different seal vocalisations.

In [15]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from collections import Counter


In [8]:
# Load every .npz file into parallel arrays of spectrograms (x) and call labels (y)

folder_path = 'data/processed/NPZ_files'
# os.listdir returns entries in arbitrary order; sort so the ordering of x/y
# (and therefore every later index-based step and the seeded train/test split)
# is the same on every run and every machine.
files = sorted(
    os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if f.endswith('.npz')
)

# x holds the spectrograms and y the matching call annotations
x = []
y = []

for file in files:
    # np.load on an .npz returns an archive object that keeps a file handle
    # open; the context manager closes it once both arrays have been read.
    with np.load(file) as npz_data:
        x.append(npz_data['spectrogram'])
        y.append(npz_data['annotation'])

x = np.array(x)
y = np.array(y)

x.shape

(480,)

The shape of our data is (480, 1025, 561). This represents dimensions for number of samples, frequency bins and time steps. 

480 - the number of files we have
1025 - The frequency axis has been split into 1025 different bins. This is decided based on the nfft you choose. We used 2048. The formula[[1]](https://dsp.stackexchange.com/questions/26927/what-is-a-frequency-bin) used is nfft/2 + 1, i.e. 2048/2 + 1 = 1025.
561 - The time (x) axis has been split into 561 frames. The number of frames is related to the overlap (nover) used when generating the spectrograms[[2]](https://stackoverflow.com/questions/64136637/time-steps-difference-in-spectrogram). 

Our data is only 3D; it doesn't have a channel dimension. CNNs require a 4D input shape[[3]](https://stackoverflow.com/questions/60157742/convolutional-neural-network-cnn-input-shape), so we need to add a channel dimension to our data. As the training will be done on grayscale spectrograms, we will add a channel dimension of 1.

In [9]:
# Append a trailing channel axis so each sample becomes (height, width, 1)
x = x[..., np.newaxis]  # Shape becomes (480, 1025, 561, 1)
x.shape

(480, 1025, 561, 1)

A greyscale image is represented by an array with values between 0 and 1. We need to check what range our values are in.

In [10]:
# Inspect the current value range of the spectrogram array
min_value, max_value = x.min(), x.max()

print(f"Minimum value: {min_value}")
print(f"Maximum value: {max_value}")

Minimum value: 7.733122287834104e-17
Maximum value: 49.907100677490234


As our values are not in the range [0, 1] at the moment, we need to carry out some normalisation before using the neural network.

In [11]:
# Normalise to [0, 1]

# Take the scale from x as it is right now rather than reusing `max_value` from
# the earlier cell: that variable goes stale once x has been rescaled, so
# re-running this cell would divide by the old maximum again and shrink the
# data further. Dividing by the current maximum makes the cell safe to re-run.
scale = x.max()
if scale > 0:  # guard against an all-zero array (division by zero -> NaN)
    x = x / scale

new_min_value = x.min()
new_max_value = x.max()
print(f"Minimum value: {new_min_value}")
print(f"Maximum value: {new_max_value}")

Minimum value: 1.549503364563041e-18
Maximum value: 1.0


Now our data is in the range [0,1]

In [16]:
# Count the distinct call types that appear in the labels

number_of_calls = len(np.unique(y))
number_of_calls

5

We have 5 different calls in our dataset of 480. If it's unbalanced, it will be harder to train the model. 

In [17]:
# Tally how many samples carry each call label
Counter(y)

Counter({'Rupe A': 342,
         'Rupe B': 121,
         'Rupe C': 8,
         'Guttural rupe': 7,
         'Growl B': 2})

There are 342 Rupe A calls and 121 Rupe B calls, but only 8, 7 and 2 for Rupe C, Guttural rupe and Growl B. 
We will disregard these three classes as there are not enough samples to train the model effectively, and instead focus on Rupe A and Rupe B.
As x does not contain the class labels, we need to filter the samples based on their index in the label array y[[5]](https://stackoverflow.com/questions/72047933/accessing-a-value-by-index-in-enumerate-for-loop)


In [19]:
# Keep only the two well-represented classes, Rupe A and Rupe B
selected_calls = ['Rupe A', 'Rupe B']

# Positions in y whose label is one of the selected calls (plain list of ints)
indices = np.flatnonzero(np.isin(y, selected_calls)).tolist()

# Apply the same positions to both arrays so samples and labels stay aligned
y_filtered = y[indices]
x_filtered = x[indices]

In [25]:
# Sanity check: shapes of the filtered arrays and the distinct labels that remain
x_filtered.shape, y_filtered.shape, np.unique(y_filtered)

((463, 1025, 561, 1), (463,), array(['Rupe A', 'Rupe B'], dtype='<U13'))

Now we have 463 samples between just 2 classes.

Next we will encode our labels and split our data into test and train groups. 

label_encoder and encoded_labels convert our calls (Rupe A/B) into integers (0/1), as neural networks require input and output variables to be numbers [[6]](https://machinelearningmastery.com/how-to-prepare-categorical-data-for-deep-learning-in-python/)

In [27]:

# Encode labels: call names -> integer ids -> one-hot vectors
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y_filtered)
categorical_labels = to_categorical(encoded_labels)

# Stratify on the integer labels so the train and test sets keep the same
# class proportions as the filtered data. With imbalanced classes, a purely
# random 80/20 split can leave the minority class under-represented in the
# test set and make the reported accuracy misleading.
X_train, X_test, y_train, y_test = train_test_split(
    x_filtered,
    categorical_labels,
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels,
)


In [None]:
# Build and compile the CNN.
# Input is everything after the sample axis of X_train: (height, width, channels).
input_shape = X_train.shape[1:]

model = Sequential()

# Convolution block 1: 32 filters, then normalise, downsample and regularise
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
model.add(BatchNormalization())
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.25))

# Convolution block 2: same pattern with 64 filters
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.25))

# Classifier head
# NOTE(review): assuming the (1025, 561, 1) input shown earlier, Flatten yields
# roughly 2.2M features, so the Dense(128) layer alone holds ~287M weights.
# Confirm memory use is acceptable, or pool more before flattening.
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# One softmax unit per one-hot column of the labels
model.add(Dense(categorical_labels.shape[1], activation='softmax'))  # Output layer

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Training settings gathered in one place so they are easy to tune
fit_options = {
    'validation_split': 0.2,  # fraction of the training data held out for validation
    'epochs': 20,
    'batch_size': 32,
    'verbose': 1,
}

# Train the model; `history` records the per-epoch metrics
history = model.fit(X_train, y_train, **fit_options)

https://stackoverflow.com/questions/60157742/convolutional-neural-network-cnn-input-shape