# Training and Testing an Audio Classification Model to Detect Sirens

In [26]:
import os
import librosa

# Root directory that holds one sub-folder of recordings per class
sounds_folder = '../Dataset/sounds'

# Largest MFCC frame count (number of columns) seen so far
max_time_steps = 0

# Walk every .wav clip of every class and remember the widest MFCC matrix
for sound_type in ('ambulance', 'firetruck', 'traffic'):
    folder_path = os.path.join(sounds_folder, sound_type)
    wav_files = [name for name in os.listdir(folder_path) if name.endswith('.wav')]
    for file in wav_files:
        file_path = os.path.join(folder_path, file)
        # sr=None keeps the clip at its native sample rate
        signal, sr = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=80)
        n_frames = mfccs.shape[1]
        if n_frames > max_time_steps:
            max_time_steps = n_frames

print(f"Maximum time steps (columns) across all MFCCs: {max_time_steps}")


Maximum time steps (columns) across all MFCCs: 283


In [27]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D

# Dataset root: one sub-folder of recordings per class
sounds_folder = '../Dataset/sounds'

# Class names, which are also the sub-folder names under the dataset root
sound_types = ['ambulance', 'firetruck', 'traffic']

# Fixed MFCC width in frames that every sample is padded to
max_time_steps = 283  # Set this as per your earlier analysis

# Accumulators filled by the extraction loop: one feature matrix and one label per clip
data, labels = [], []

# Function to pad sequences to the maximum time steps
def pad_sequence(sequence, max_len):
    """
    Force a 2-D feature matrix to exactly ``max_len`` columns.

    Parameters:
    - sequence: 2-D array; rows are left untouched, columns are the axis that is resized.
    - max_len: Target number of columns.

    Returns:
    - Array with the same rows and ``max_len`` columns: zero-padded on the right
      when the input is narrower, cut on the right when it is wider.
    """
    n_cols = sequence.shape[1]
    if n_cols > max_len:
        # np.pad raises ValueError on a negative pad width, so wider inputs are truncated instead
        return sequence[:, :max_len]
    return np.pad(sequence, ((0, 0), (0, max_len - n_cols)), mode='constant')

def save_mfcc_image(mfccs, sample_rate, save_path):
    """
    Render an MFCC matrix as a heat-map and write it to an image file.

    Parameters:
    - mfccs: The MFCC array.
    - sample_rate: The sample rate of the audio (used to scale the time axis).
    - save_path: The path where the image will be saved.
    """
    fig = plt.figure(figsize=(10, 4))
    try:
        librosa.display.specshow(mfccs, sr=sample_rate, x_axis='time')
        plt.colorbar()
        plt.title('MFCC')
        plt.tight_layout()
        plt.axis('off')  # Hide axes
        plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Release the figure even when rendering or saving fails; this function is
        # called once per audio file, so leaked figures would pile up in memory
        plt.close(fig)

# Extract MFCCs from each audio file and save as PNG
for sound_type in sound_types:
    folder_path = os.path.join(sounds_folder, sound_type)
    # os.listdir returns entries in arbitrary order; sorting fixes the sample order
    # so the seeded train/test split downstream is reproducible across runs and machines
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith('.wav'):
            continue
        file_path = os.path.join(folder_path, file)
        # sr=None keeps each clip at its native sample rate
        signal, sr = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=80)
        # Pad MFCCs to the maximum time steps
        mfccs_padded = pad_sequence(mfccs, max_time_steps)
        data.append(mfccs_padded)
        labels.append(sound_type)

        # Save the un-padded MFCC image next to the audio file
        image_filename = f"{os.path.splitext(file)[0]}_mfcc.png"
        image_path = os.path.join(folder_path, image_filename)
        save_mfcc_image(mfccs, sr, image_path)

# Fail with a clear message instead of an IndexError on data[0] below
if not data:
    raise FileNotFoundError(f"No .wav files found under {sounds_folder!r}")

# Number of samples and MFCC coefficients
num_samples = len(data)
num_mfcc_coeffs = data[0].shape[0]

# Convert data to numpy array
data = np.array(data)

In [28]:
# Encode the labels as integers (LabelEncoder assigns indices in sorted class order)
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# The prediction cells translate a class index back to a name via sound_types[index],
# so the encoder's class order must be exactly the order of sound_types
assert list(label_encoder.classes_) == list(sound_types), (
    f"Label order {list(label_encoder.classes_)} does not match sound_types {sound_types}"
)

# Split the data into training and test sets; stratify keeps the class
# proportions the same in both parts instead of leaving them to chance
X_train, X_test, y_train, y_test = train_test_split(
    data, labels_encoded, test_size=0.2, random_state=42, stratify=labels_encoded
)

# Expand the dimensions of the input data to add the channel dimension
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# Build the CNN model: one conv/pool stage followed by a small dense classifier
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=X_train.shape[1:]),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(sound_types), activation='softmax')  # Number of sound classes
])

# Compile the model; the sparse loss takes the integer labels directly (no one-hot encoding)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, holding out 20% of the training set for per-epoch validation
model.fit(X_train, y_train, epochs=30, batch_size=20, validation_split=0.2)

# Evaluate the model on the untouched test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Accuracy on test data:', test_acc)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 1s/step - accuracy: 0.5103 - loss: 87.3958 - val_accuracy: 0.9583 - val_loss: 1.9824
Epoch 2/30
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 474ms/step - accuracy: 0.8528 - loss: 9.6736 - val_accuracy: 0.9167 - val_loss: 2.8634
Epoch 3/30
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 450ms/step - accuracy: 0.8854 - loss: 3.2836 - val_accuracy: 0.8958 - val_loss: 0.5923
Epoch 4/30
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 425ms/step - accuracy: 0.8963 - loss: 0.5186 - val_accuracy: 0.9479 - val_loss: 0.1396
Epoch 5/30
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 399ms/step - accuracy: 0.8946 - loss: 1.0554 - val_accuracy: 0.9375 - val_loss: 0.2362
Epoch 6/30
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 443ms/step - accuracy: 0.9237 - loss: 0.3497 - val_accuracy: 0.9167 - val_loss: 0.1711
Epoch 7/30
[1m20/20[0m [32

In [29]:
# Persist the trained network to disk; the prediction cells reload it with
# load_model('audio_model.h5'), so this file name must stay in sync with them
model.save('audio_model.h5')  # Saves the model in HDF5 format




# Prediction

In [1]:
import numpy as np
import librosa
from keras.models import load_model

# Class names in model-output order: predict_siren indexes this list with the
# arg-max of the prediction, so the order must match the training label encoding
sound_types = ['ambulance', 'firetruck', 'traffic']

# Restore the trained classifier from the HDF5 file
model = load_model('audio_model.h5')

def extract_mfcc(file_path, max_time_steps=283):
    """
    Turn an audio file into a fixed-width MFCC matrix.

    The clip is read at its native sample rate, 80 MFCCs are computed per
    frame, and the time axis is forced to ``max_time_steps`` columns.

    Parameters:
    - file_path: Location of the audio file to read.
    - max_time_steps: Number of columns in the returned matrix.

    Returns:
    - mfccs_padded: MFCC matrix, zero-padded on the right when the clip has
      fewer than ``max_time_steps`` frames, otherwise cut to that width.
    """
    audio, rate = librosa.load(file_path, sr=None)
    coeffs = librosa.feature.mfcc(y=audio, sr=rate, n_mfcc=80)

    n_frames = coeffs.shape[1]

    # Long enough already: keep only the leading frames
    if n_frames >= max_time_steps:
        return coeffs[:, :max_time_steps]

    # Too short: append zero columns up to the requested width
    missing = max_time_steps - n_frames
    return np.pad(coeffs, ((0, 0), (0, missing)), mode='constant')

def predict_siren(file_path):
    """
    Classify an audio file and report whether it sounds like a siren.

    Uses the module-level ``model`` and ``sound_types`` and prints the raw
    prediction, the winning class index and the winning class name.

    Parameters:
    - file_path: Path to the audio file.

    Returns:
    - result: 'Siren Detected' when the top class is ambulance or firetruck,
      otherwise 'Siren Not Detected'.
    """
    features = extract_mfcc(file_path)

    # Add a leading batch axis and a trailing channel axis to match the model input
    batch = features[np.newaxis, ..., np.newaxis]

    prediction = model.predict(batch)
    print("prediction",prediction)

    # Index of the highest-scoring class for the single sample in the batch
    predicted_class_index = np.argmax(prediction, axis=1)[0]
    predicted_class = sound_types[predicted_class_index]
    print("predicted class index: ",predicted_class_index)
    print("predicted classs: ",predicted_class)

    siren_classes = ('ambulance', 'firetruck')
    return 'Siren Detected' if predicted_class in siren_classes else 'Siren Not Detected'

# Example usage: classify a clip located next to the notebook and print the verdict
file_path = 'audio.wav'
result = predict_siren(file_path)
print(result)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
prediction [[9.9903572e-01 9.6431060e-04 1.4861967e-25]]
predicted class index:  0
predicted classs:  ambulance
Siren Detected


In [2]:
import numpy as np
import librosa
from keras.models import load_model

# Names for the model's output indices, in the order used when the labels were encoded
sound_types = ['ambulance', 'firetruck', 'traffic']

# Reload the saved classifier from its HDF5 file
model = load_model('audio_model.h5')

def extract_mfcc(file_path, max_time_steps=283):
    """
    Compute a fixed-width MFCC matrix for one audio file.

    Parameters:
    - file_path: Path to the audio file.
    - max_time_steps: Exact number of time steps (columns) in the result.

    Returns:
    - mfccs_padded: 80-coefficient MFCC matrix whose time axis has been
      zero-padded or truncated on the right to ``max_time_steps`` columns.
    """
    # Read the clip at its native sample rate and compute the coefficients
    waveform, sample_rate = librosa.load(file_path, sr=None)
    features = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=80)

    shortfall = max_time_steps - features.shape[1]
    if shortfall > 0:
        # Fewer frames than required: fill the tail with zeros
        mfccs_padded = np.pad(features, ((0, 0), (0, shortfall)), mode='constant')
    else:
        # Enough (or too many) frames: keep the first max_time_steps of them
        mfccs_padded = features[:, :max_time_steps]

    return mfccs_padded

def predict_siren(file_path):
    """
    Classify an audio file and report whether it sounds like a siren.

    Relies on the module-level ``model`` and ``sound_types``; the raw scores,
    the winning index and the winning class name are printed along the way.

    Parameters:
    - file_path: Path to the audio file.

    Returns:
    - result: 'Siren Detected' when the top class is ambulance or firetruck,
      otherwise 'Regular Traffic Noise: Siren Not Detected'.
    """
    features = extract_mfcc(file_path)

    # Wrap the matrix with a batch axis (front) and a channel axis (back) for the model
    model_input = features[np.newaxis, ..., np.newaxis]

    prediction = model.predict(model_input)
    print("prediction",prediction)

    # Highest-scoring class for the one sample in the batch
    predicted_class_index = np.argmax(prediction, axis=1)[0]
    predicted_class = sound_types[predicted_class_index]
    print("predicted class index: ",predicted_class_index)
    print("predicted classs: ",predicted_class)

    # Both emergency-vehicle classes count as a siren
    is_siren = predicted_class in ('ambulance', 'firetruck')
    if not is_siren:
        return 'Regular Traffic Noise: Siren Not Detected'
    return 'Siren Detected'

# Example usage: classify a field recording and print the verdict
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where that volume is mounted; consider a path relative to the notebook
file_path = '/Volumes/Personal Drive/Freelancing/Emergency Vehicle Dtection/emergency_vehicle/Audio Classification/Lane Number 5C.m4a'
result = predict_siren(file_path)
print(result)


  signal, sr = librosa.load(file_path, sr=None)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step
prediction [[8.5115737e-01 1.4884260e-01 8.4013724e-10]]
predicted class index:  0
predicted classs:  ambulance
Siren Detected
