In [22]:
!pip install resampy




In [23]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("chrisfilo/urbansound8k")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/urbansound8k


In [24]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dropout, Flatten, Dense
from tensorflow.keras.optimizers import Adam


In [2]:
# ----------------- Imports -----------------
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dropout, Flatten, Dense
from tensorflow.keras.optimizers import Adam

# ----------------- Load Metadata -----------------
metadata_path = '../input/urbansound8k/UrbanSound8K.csv'
df = pd.read_csv(metadata_path)

# ----------------- Feature Extraction -----------------
def extract_features(df, base_path='../input/urbansound8k/'):
    features = []
    labels = []
    
    for i in range(len(df)):
        file_path = os.path.join(base_path, 'fold' + str(df.loc[i, "fold"]), df.loc[i, "slice_file_name"])
        try:
            audio, sr = librosa.load(file_path, res_type='kaiser_fast)
            mels = librosa.feature.melspectrogram(y=audio, sr=sr)
            mels_mean = np.mean(mels.T, axis=0)
            features.append(mels_mean)
            labels.append(df.loc[i, "classID"])
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    
    return np.array(features), np.array(labels)

print("Extracting features (this may take ~30-45 minutes)...")
X, y = extract_features(df)
print("Feature extraction completed.")

# ----------------- Data Preprocessing -----------------
y_cat = to_categorical(y, num_classes=10)

# Padding X to ensure uniform shape (8732, 128)
X_padded = np.zeros((len(X), 128))
for i in range(len(X)):
    padded = np.pad(X[i], (0, 128 - len(X[i])), 'constant') if len(X[i]) < 128 else X[i][:128]
    X_padded[i] = padded

# Reshape for CNN
X_reshaped = X_padded.reshape(X_padded.shape[0], 16, 8, 1)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_cat, test_size=0.25, random_state=1)

# ----------------- CNN Model -----------------
model = Sequential([
    Conv2D(64, (3, 3), padding="same", activation="tanh", input_shape=(16, 8, 1)),
    MaxPool2D(pool_size=(2, 2)),
    Conv2D(128, (3, 3), padding="same", activation="tanh"),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.1),
    Flatten(),
    Dense(1024, activation="tanh"),
    Dense(10, activation="softmax")
])

model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# ----------------- Model Training -----------------
history = model.fit(X_train, y_train, epochs=90, batch_size=50, validation_data=(X_test, y_test))


2025-05-26 06:17:21.943807: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748240242.171356      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748240242.237522      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Extracting features (this may take ~30-45 minutes)...




Feature extraction completed.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1748240946.698562      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1748240946.699250      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Epoch 1/90


I0000 00:00:1748240950.594166     113 service.cc:148] XLA service 0x7faf580052f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1748240950.594873     113 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1748240950.594892     113 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1748240950.927004     113 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 61/131[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.3469 - loss: 1.9520

I0000 00:00:1748240953.973248     113 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 25ms/step - accuracy: 0.4055 - loss: 1.7706 - val_accuracy: 0.5790 - val_loss: 1.2676
Epoch 2/90
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6021 - loss: 1.1691 - val_accuracy: 0.6436 - val_loss: 1.1180
Epoch 3/90
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6752 - loss: 0.9655 - val_accuracy: 0.6587 - val_loss: 1.0593
Epoch 4/90
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7191 - loss: 0.8416 - val_accuracy: 0.6825 - val_loss: 0.9542
Epoch 5/90
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7394 - loss: 0.7721 - val_accuracy: 0.7164 - val_loss: 0.8829
Epoch 6/90
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7776 - loss: 0.6522 - val_accuracy: 0.7197 - val_loss: 0.8803
Epoch 7/90
[1m131/131[0m [32m━━━━━━

In [5]:
model.save("sound_classifier_model_cnn.h5")


In [27]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dropout, Flatten, Dense
from tensorflow.keras.optimizers import Adam

# ----------------- Load Metadata -----------------
metadata_path = '../input/urbansound8k/UrbanSound8K.csv'
df = pd.read_csv(metadata_path)

# ----------------- Feature Extraction -----------------
def extract_features(df, base_path='../input/urbansound8k/'):
    features = []
    labels = []
    
    for i in range(len(df)):
        file_path = os.path.join(base_path, 'fold' + str(df.loc[i, "fold"]), df.loc[i, "slice_file_name"])
        try:
            audio, sr = librosa.load(file_path)
            mels = librosa.feature.melspectrogram(y=audio, sr=sr)
            mels_mean = np.mean(mels.T, axis=0)
            features.append(mels_mean)
            labels.append(df.loc[i, "classID"])
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    
    return np.array(features), np.array(labels)

print("Extracting features (this may take ~30-45 minutes)...")
X, y = extract_features(df)
print("Feature extraction completed.")

Extracting features (this may take ~30-45 minutes)...




Feature extraction completed.


In [28]:

# ----------------- Data Preprocessing -----------------
y_cat = to_categorical(y, num_classes=10)

# Padding X to ensure uniform shape (8732, 128)
X_padded = np.zeros((len(X), 128))
for i in range(len(X)):
    padded = np.pad(X[i], (0, 128 - len(X[i])), 'constant') if len(X[i]) < 128 else X[i][:128]
    X_padded[i] = padded

# Reshape for CNN
X_reshaped = X_padded.reshape(X_padded.shape[0], 16, 8, 1)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_cat, test_size=0.25, random_state=1)


In [29]:
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# ----------------- Load the Saved Model -----------------
model_path = "/kaggle/input/sound-classifier-1/sound_classifier_model_cnn.h5"
loaded_model = load_model(model_path)

# ----------------- Evaluate on Test Data -----------------
test_loss, test_accuracy = loaded_model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
print(f"Test Loss: {test_loss:.4f}")

# ----------------- Classification Report -----------------
y_pred_probs = loaded_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

print("\nClassification Report:")
print(classification_report(y_true, y_pred))

# ----------------- Confusion Matrix -----------------
conf_matrix = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


I0000 00:00:1748335070.634818      97 service.cc:148] XLA service 0x7ae1b0005820 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1748335070.640124      97 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1748335070.640145      97 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1748335071.062099      97 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m50/69[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.8698 - loss: 0.8194

I0000 00:00:1748335073.797771      97 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.8719 - loss: 0.8274
Test Accuracy: 87.59%
Test Loss: 0.8549
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       238
           1       0.96      0.79      0.86       121
           2       0.78      0.76      0.77       233
           3       0.77      0.79      0.78       246
           4       0.94      0.85      0.89       258
           5       0.93      0.97      0.95       255
           6       0.96      0.88      0.92        99
           7       0.91      0.95      0.93       266
           8       0.87      0.95      0.91       223
           9       0.83      0.81      0.82       244

    accuracy                           0.88      2183
   macro avg       0.88      0.87      0.88      2183
weighted avg       0.88      0.88      0.88      21

In [36]:
import librosa
import numpy as np
from tensorflow.keras.models import load_model

# Load the trained model
model = load_model('/kaggle/input/sound-classifier-1/sound_classifier_model_cnn.h5')

# Define class labels from UrbanSound8K
class_labels = [
    "air_conditioner", "car_horn", "children_playing", "dog_bark", "drilling",
    "engine_idling", "gun_shot", "jackhammer", "siren", "street_music"
]

# Load and preprocess test audio file
def preprocess_audio(file_path):
    audio, sr = librosa.load(file_path, duration=4.0)
    mels = librosa.feature.melspectrogram(y=audio, sr=sr)
    mels_db = librosa.power_to_db(mels, ref=np.max)
    mels_mean = np.mean(mels_db.T, axis=0)
    
    if len(mels_mean) < 128:
        mels_mean = np.pad(mels_mean, (0, 128 - len(mels_mean)))
    else:
        mels_mean = mels_mean[:128]
    
    input_data = mels_mean.reshape(1, 16, 8, 1)
    return input_data

# Predict
file_path = "/kaggle/input/small-dog-bark/small_dog_bark_IgU_UsS.wav"
processed_audio = preprocess_audio(file_path)
prediction = model.predict(processed_audio)
predicted_class = class_labels[np.argmax(prediction)]

print(f"Predicted Class: {predicted_class}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step
Predicted Class: car_horn


In [40]:
import librosa
import numpy as np
from tensorflow.keras.models import load_model

# Load your model
model = load_model('/kaggle/input/sound-classifier-1/sound_classifier_model_cnn.h5')

# Load and preprocess the dog bark MP3
def preprocess_audio(file_path):
    audio, sr = librosa.load(file_path, duration=4.0)
    mels = librosa.feature.melspectrogram(y=audio, sr=sr)
    mels_db = librosa.power_to_db(mels, ref=np.max)
    mels_mean = np.mean(mels_db.T, axis=0)
    
    if len(mels_mean) < 128:
        mels_mean = np.pad(mels_mean, (0, 128 - len(mels_mean)))
    else:
        mels_mean = mels_mean[:128]
    
    return mels_mean.reshape(1, 16, 8, 1)

# Predict class ID
file_path = "/kaggle/input/gun-shot-sound/gun-shots6-times-fast-230509.mp3"
processed = preprocess_audio(file_path)
prediction = model.predict(processed)
predicted_class_id = int(np.argmax(prediction))

print(f"Predicted Class ID: {predicted_class_id}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 271ms/step
Predicted Class ID: 1
