# Produce a Convolutional Neural Network that can distinuish between different Seal Vocalisations

We have produced 480 npz files of spectrogram data for different seal data. 

In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from collections import Counter


  from pandas.core import (


In [2]:
# function to load the npz files

folder_path = 'data/processed/NPZ_files'
files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.npz')]

# create x and y arrays. x being the spectorgram and y being the call annotation
x = []
y = []

for file in files:
    npz_data = np.load(file)
    spectrogram = npz_data['spectrogram']
    call = npz_data['annotation']
    
    x.append(spectrogram)
    y.append(call)

x = np.array(x)
y = np.array(y)

x.shape
#y.shape

(480, 1025, 561)

The shape of our data is (480, 1025, 56). This represents dimensions for number of samples, frequency bins and time steps. 

480 - the number of files we have
1025 - The frequency axis has been split into 1025 different bins. This is decided based on the nfft you choose. We used 2048. The formula[[1]](https://dsp.stackexchange.com/questions/26927/what-is-a-frequency-bin) used is nfft-2+1.
561 - the x axis has been split into frames and is related to the nover used when generating the spectrograms[[2]](https://stackoverflow.com/questions/64136637/time-steps-difference-in-spectrogram). 

Our data is only 3D, it doesnt have a channel dimension. CNN's require a 4D shape[[3]](https://stackoverflow.com/questions/60157742/convolutional-neural-network-cnn-input-shape). We need to add a channel dimension to our data. As the training will be done on grayscale spectrograms, we will add a channel dimension of 1.

In [3]:
# Add a channel dimension to the data
x = np.expand_dims(x, axis=-1)  # Shape becomes (480, 1025, 561, 1)
x.shape

(480, 1025, 561, 1)

A greyscale will have an array between 0 and 1. We need to check what ours are.

In [4]:
min_value = x.min()
max_value = x.max()

print(f"Minimum value: {min_value}")
print(f"Maximum value: {max_value}")

Minimum value: 7.733122287834104e-17
Maximum value: 49.907100677490234


As our images are not greyscale at the moment we need to carry out some normalisation before using the Neural Network.

In [5]:
# Normalise to [0, 1]

x = x / max_value

new_min_value = x.min()
new_max_value = x.max()
print(f"Minimum value: {new_min_value}")
print(f"Maximum value: {new_max_value}")

Minimum value: 1.549503364563041e-18
Maximum value: 1.0


Now our data is in the range [0,1]

In [6]:
# Checking how many calls are present in our dataset 

number_of_calls = np.unique(y).size
number_of_calls

5

We have 5 different calls in our dataset of 480. If its unabalanced it will be harder to train the model. 

In [7]:
Counter(y)

Counter({'Rupe A': 342,
         'Rupe B': 121,
         'Rupe C': 8,
         'Guttural rupe': 7,
         'Growl B': 2})

There are 342 Rupe A calls, 121 Rupe B but only 8, 7 and 2 fro Rupe C, Gutturral Rupe and Growl. 
We will disregard these as there are not enough samples to train the model efficiently and instead focus on A and B.
As x no longer has the class we need to filter them out based on their index in the dataframe[[5]](https://stackoverflow.com/questions/72047933/accessing-a-value-by-index-in-enumerate-for-loop)


In [8]:
# Filter the dataset to only include Rupe A and Rupe B
selected_calls = ['Rupe A', 'Rupe B']

# Filter the dataset
indices = [i for i, label in enumerate(y) if label in selected_calls]
x_filtered = x[indices]
y_filtered = y[indices]

In [9]:
# checking its what we expect
number_of_classes = np.unique(y_filtered).size
x_filtered.shape, y_filtered.shape, number_of_classes

((463, 1025, 561, 1), (463,), 2)

Now we have 463 samples between just 2 classes.

Next we will encode our labels and split our data into test and train groups. 

Neural networks require input and output variable to be numbers[[6]](https://machinelearningmastery.com/how-to-prepare-categorical-data-for-deep-learning-in-python/). 
``label_encoder`` and ``encoded_labels`` convert our calls (Rupe A/B) into integers (0/1). One hot encoding, via ``categorical_labels``, then converts this to a binary vector ``[0,1]`` or ``[1,0]``.



In [10]:

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y_filtered)
categorical_labels = to_categorical(encoded_labels)


X_train, X_test, y_train, y_test = train_test_split(x_filtered, categorical_labels, random_state=42)


The plan was to have an extra Convolutional layer and use a Flatten layer, however this was proving to be computationally demanding for my laptop, so I used ``GlobalAveragePooling`` instead.

In [27]:
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(1025, 561, 1)))
model.add(layers.Conv2D(16, 3, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(32, 3, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.GlobalAveragePooling2D())  # Replace Flatten
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(number_of_classes, activation='softmax'))

In [24]:
model.summary()

In [25]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),  #USed CategoricalCrossentropy as classes are one-hot encoded
    metrics=['accuracy'],
)

In [26]:
history = model.fit(X_train, y_train, epochs=25, validation_split=0.2)

Epoch 1/25
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 17s/step - accuracy: 0.7091 - loss: 0.6883 - val_accuracy: 0.6857 - val_loss: 0.6750
Epoch 2/25
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 16s/step - accuracy: 0.7878 - loss: 0.6565 - val_accuracy: 0.6857 - val_loss: 0.6493
Epoch 3/25
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 16s/step - accuracy: 0.7672 - loss: 0.6155 - val_accuracy: 0.6857 - val_loss: 0.6254
Epoch 4/25
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 14s/step - accuracy: 0.7509 - loss: 0.5779 - val_accuracy: 0.6857 - val_loss: 0.6319
Epoch 5/25
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 14s/step - accuracy: 0.7385 - loss: 0.5759 - val_accuracy: 0.6857 - val_loss: 0.6466
Epoch 6/25
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 14s/step - accuracy: 0.7425 - loss: 0.5745 - val_accuracy: 0.6857 - val_loss: 0.6399
Epoch 7/25
[1m9/9[0m [32m━━━━━━━━━━━━

The scores for this arent great, however the training loss does decrease as it runs through epoch, achieving a best score of 0.5237 at epoch 24. The best validation loss is actually at epoch 2 but it doesnt vary much throughout.  

In [None]:
model.evaluate(X_test, y_test, return_dict=True)

In [None]:
def plot_loss(history, which='loss'):
    plt.plot(history.history[which], label='train')
    try:
        plt.plot(history.history['val_'+which], label='validation')
    except:
        None
    plt.xlabel('Epoch')
    plt.ylabel(which)
    plt.legend()
    plt.grid(True)

In [None]:
plot_loss(history)

In [None]:
model2 = tf.keras.models.Sequential([
    tf.keras.Input(shape=(125, 94, 1)),
    data_augmentation,
    layers.Conv2D(16, 3, padding='same', activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.2),
    layers.Conv2D(32, 3, padding='same', activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, 3, padding='same', activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(number_of_classes, activation='softmax')])

In [None]:
model2.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

In [None]:
history = model2.fit(X_train, y_train, epochs=100, validation_split=0.2)

In [None]:
plot_loss(history)

In [None]:
model2.evaluate(X_test, y_test, return_dict=True)

In [None]:

input_shape = X_train.shape[1:]

model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    
    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(categorical_labels.shape[1], activation='softmax')  # Output layer
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
history = model.fit(X_train, y_train,
                    validation_split=0.2,
                    epochs=20,
                    batch_size=32,
                    verbose=1)

https://stackoverflow.com/questions/60157742/convolutional-neural-network-cnn-input-shape
https://dataheadhunters.com/academy/encoding-categorical-data-one-hot-vs-label-encoding/