Install librosa

In [1]:
!pip install librosa



import dependencies

In [17]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import os

Mount Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


load audio file and extract MFCC features

In [18]:
def extract_mfcc(audio_path, n_mfcc=20):
    # Load audio file
    y, sr = librosa.load(audio_path, sr=None)

    # Extract MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    return mfccs

In [32]:
def extract_mfccs_from_directory(directory_path, n_mfcc=20):
    mfccs_list = []
    filenames = []

    # Iterate over all files in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith(".wav"):
            file_path = os.path.join(directory_path, filename)

            # Extract MFCCs from the current file
            mfccs = extract_mfcc(file_path, n_mfcc=n_mfcc)

            # Append MFCCs and filename to the lists
            mfccs_list.append(mfccs)
            filenames.append(filename)

    return mfccs_list, filenames


In [33]:
# Replace 'directory_path' with the path to the directory containing the WAV files
directory_path = '/content/drive/MyDrive/db/audios_30'
mfccs_list, filenames = extract_mfccs_from_directory(directory_path, n_mfcc=20)

<font color='red'>Issue 1: MFCC dimension is not uniform</font>

In [39]:
for mfcc, filename in zip(mfccs_list, filenames):
  print(mfcc.shape, filename)

(20, 938) 18-music3.wav
(20, 865) 23-skateboard2.wav
(20, 865) 5-bird.wav
(20, 865) 28-wild animal.wav
(20, 865) 10-dog.wav
(20, 938) 12-fire1.wav
(20, 938) 24-violin1.wav
(20, 951) 4-bell1.wav
(20, 865) 6-cat.wav
(20, 977) 20-noise1.wav
(20, 865) 9-clap2.wav
(20, 977) 3-aircraft3.wav
(20, 938) 19-music4.wav
(20, 939) 16-motoboat.wav
(20, 981) 11-echo1.wav
(20, 947) 21-noise2.wav
(20, 945) 1-aircraft1.wav
(20, 943) 15-helicopter1.wav
(20, 938) 13-fire2.wav
(20, 945) 2-aircraft2.wav
(20, 938) 29-wind1.wav
(20, 938) 30-wind2.wav
(20, 945) 27-whitenoise1.wav
(20, 865) 26-whistle.wav
(20, 865) 22-skateboard.wav
(20, 938) 17-music1.wav
(20, 938) 14-guitar.wav
(20, 865) 7-cat2.wav
(20, 865) 8-clap.wav
(20, 938) 25-water1.wav


Apply padding to MFCC matrices

In [59]:
# Calculate the maximum number of time frames (T) among all MFCCs
max_T = max(mfcc.shape[1] for mfcc in mfccs_list)

# Pad or truncate each MFCC matrix to have the same number of time frames
mfccs_padded = [np.pad(mfcc, ((0, 0), (0, max_T - mfcc.shape[1])), mode='constant') for mfcc in mfccs_list]

# Flat mfccs_padded to two dimensional array
mfccs_flattened = [mfcc.flatten() for mfcc in mfccs_padded]

In [61]:
from sklearn.neighbors import KDTree

# Build KDTree
kdtree = KDTree(mfccs_flattened, leaf_size=10)

<font color='red'>Issue 2: Dimension of query file must be uniform, cannot be guaranteed</font>

In [76]:
from ast import Index
mfcc_query = [mfccs_flattened[0]]

# Perform the query
k = 5  # Number of nearest neighbors to retrieve
distances, indices = kdtree.query(mfcc_query, k=k)
res = [filenames[i] for i in indices[0]]


# Print the indices and distances of the nearest neighbors
print("Indices of nearest neighbors:", indices)
print("Distances to nearest neighbors:", distances)
print(f"Top {k} most similar sounds (descending):", res)

Indices of nearest neighbors: [[ 0 18  5 13  8]]
Distances to nearest neighbors: [[   0.         4320.93534709 4445.20065099 4950.77089993 5309.89491331]]
Top 5 most similar sounds (descending): ['18-music3.wav', '13-fire2.wav', '12-fire1.wav', '16-motoboat.wav', '6-cat.wav']
