In [18]:
import os
import io
import librosa
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

from keras import layers
from keras import models
from keras import Input
from sklearn.model_selection import train_test_split

In [19]:
# Paths to data
gunshot_path = "../data/preprocessed/gunshot"
non_gunshot_path = "../data/preprocessed/not-gunshot"

# Verify that paths exists
print(os.path.exists(gunshot_path))
print(os.path.exists(non_gunshot_path))

True
True


In [20]:
SAMPLE_RATE = 16000
FRAME_SIZE = 2048
HOP_SIZE = 128
SEGMENT_LENGTH = SAMPLE_RATE * 2
SEGMENT_HOP = SEGMENT_LENGTH // 2

In [21]:
# Function to generate numpy array of spectrogram from image
def generate_spectrogram(wav_file_path, model_input_dims = (128, 128)):
    # Load the .wav file
    audio_segment, sr = librosa.load(wav_file_path, sr=SAMPLE_RATE)

    # Compress channels
    if audio_segment.ndim > 1:
        audio_segment = audio_segment.squeeze()

    # Compute the STFT
    stft = librosa.stft(audio_segment, n_fft = FRAME_SIZE, hop_length = HOP_SIZE)
    stft_magnitude = np.abs(stft) ** 2

    # Convert to log-amplitude scale
    stft_db = librosa.amplitude_to_db(stft_magnitude)

    # Create the spectrogram plot
    plt.figure(figsize=(10, 4))
    plt.axis('off')
    librosa.display.specshow(
        stft_db,
        sr=SAMPLE_RATE,
        hop_length=HOP_SIZE,
        x_axis='time',
        y_axis='log',
        cmap='magma'
    )

    # Save the figure into a memory buffer
    buf = io.BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0, transparent=True)
    buf.seek(0)

    # Load the image from the buffer and convert it to RGB
    spectrogram_image = Image.open(buf).convert("RGB")
    spectrogram_image = spectrogram_image.resize((128, 128))  # Resize to (128, 128)

    # Convert to numpy array and normalize the pixel values
    spectrogram_image = np.array(spectrogram_image) / 255.0

    # Close the plot to release memory
    plt.close()

    return np.expand_dims(spectrogram_image, axis = 0)

In [22]:
# Function to load and process images -> 0 for non-gunshot and 1 for gunshot
def load_images(path, label):
    images = []
    labels = []

    for filename in os.listdir(path):

        # Only accept images
        valid_extensions = ['.wav']
        if not any(filename.lower().endswith(ext) for ext in valid_extensions):
            continue

        # Load local path to image
        img_path = os.path.join(path, filename)

        print(f"Attempting to load image {img_path}")

        try:
            # Compute spectrogram
            img = generate_spectrogram(img_path)

            # Append image and corresponding label
            images.append(img)
            labels.append(label)
        except Exception as error:
            print(error)
            continue

    return images, labels

In [23]:
# Load gunshots
gunshot_images, gunshot_labels = load_images(gunshot_path, 1)
non_gunshot_images, non_gunshot_labels = load_images(non_gunshot_path, 0)

Attempting to load image ../data/preprocessed/gunshot/gunshot_0.wav
Attempting to load image ../data/preprocessed/not-gunshot/not_gunshot_0_1.wav
Attempting to load image ../data/preprocessed/not-gunshot/not_gunshot_0_0.wav
Attempting to load image ../data/preprocessed/not-gunshot/not_gunshot_0_2.wav


In [24]:
# Combine gunshots and labels of both types
images = np.array(gunshot_images + non_gunshot_images)
labels = np.array(gunshot_labels + non_gunshot_labels)

In [25]:
# 20% for testing 80# for training
# X_train -> training subset for input features (images)
# X_val   -> validation subset for input features (images)
# y_train -> training subset for labels
# y_val   -> validation subset for labels
X_train, X_val, y_train, y_val = train_test_split(images, labels, test_size=0.2, random_state=42)

In [26]:
# Flatten the list into the required 4D shape (num_samples, 128, 128, 3)
X_train = np.vstack(X_train)
X_val = np.vstack(X_val)

In [27]:
# Define the CNN model
model = models.Sequential()

# Input layer to define the input shape
# Output -> 3D Tensor shape (128, 128, 3) -> RGB image 128x128 pixels
model.add(Input(shape=(128, 128, 3)))

# Applies 32 convolution filters of dimension 3x3 to input
# Output -> (126, 126, 32)
model.add(layers.Conv2D(32, (3, 3), activation='relu'))

# Reduces spatial dimensions by taking max value within overlapping 2x2 regions
# Output -> (63, 63, 32)
model.add(layers.MaxPooling2D((2, 2)))

# Applies 64 convolution filters of dimension 3x3 to input
# Output -> (61, 61, 64)
model.add(layers.Conv2D(64, (3, 3), activation='relu'))

# Reduces spatial dimensions by taking max value within overlapping 2x2 regions
# Output -> (30, 30, 64)
model.add(layers.MaxPooling2D((2, 2)))

# Applies 128 convolution filters of dimension 3x3 to input
# Output -> (28, 28, 128)
model.add(layers.Conv2D(128, (3, 3), activation='relu'))

# Reduces spatial dimensions by taking max value within overlapping 2x2 regions
# Output -> (14, 14, 128)
model.add(layers.MaxPooling2D((2, 2)))

# Converts 3D output from previous layer to 1D vector
# Output -> (25088, )
model.add(layers.Flatten())

# Applies 128 neurons to learn 'high level' features
# Output -> (128, )
model.add(layers.Dense(128, activation='relu'))

# Sigmoid function to determine gunshot or non-gunshot
# 0 = non-gunshot, 1 = gunshot
model.add(layers.Dense(1, activation='sigmoid'))


In [28]:
# Compile the model:
# Optimizer -> adam
# loss function -> binary_crossentropy
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [32]:
# Train the model:
# Epochs -> Total passes through model
# Batch size -> # of samples the model processes before updating weights
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 1.0000 - loss: 4.2756e-19 - val_accuracy: 1.0000 - val_loss: 1.4132e-23
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 1.0000 - loss: 2.3304e-21 - val_accuracy: 1.0000 - val_loss: 3.9004e-26
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 1.0000 - loss: 1.4384e-23 - val_accuracy: 1.0000 - val_loss: 1.2763e-28
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 1.0000 - loss: 1.0413e-25 - val_accuracy: 1.0000 - val_loss: 5.1171e-31
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 1.0000 - loss: 9.0914e-28 - val_accuracy: 1.0000 - val_loss: 2.5745e-33
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 1.0000 - loss: 9.7348e-30 - val_accuracy: 1.0000 - val_loss: 1.6552e-3

In [33]:
# Determine performance metrics
test_loss, test_acc = model.evaluate(X_val, y_val)
print(f"Test accuracy: {test_acc}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 1.0000 - loss: 0.0000e+00
Test accuracy: 1.0


In [31]:
# Save model
model_name = "test_model"
model.save(f'../models/{model_name}.h5')

