In [1]:
# Mount Google Drive at /content/drive (requires the google.colab package);
# the dataset path configured later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import os
import warnings

import numpy as np
import librosa
import librosa.display
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Conv2D, MaxPooling2D, Dropout, Dense,
                                     BatchNormalization, Flatten, GlobalAveragePooling2D)
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.model_selection import train_test_split
from skimage.transform import resize

# Dataset root on the mounted Drive; the loader looks for one sub-folder per
# entry of `classes` directly beneath it.
data_dir = '/content/drive/MyDrive/training_dataset'

# Class (sub-folder) names. A sample's integer label is the position of its
# class in this list, so the order must stay fixed between training and inference.
classes = [
    'cat',
    'dog',
    'bird',
    'cow',
    'lion',
    'sheep',
    'frog',
    'chicken',
    'donkey',
    'monkey',
    'others',
]

# Load and preprocess audio data
def load_and_preprocess_data(data_dir, classes, target_shape=(128, 128)):
    """Load every WAV clip under ``data_dir/<class>`` as a fixed-size log-mel image.

    Each clip is loaded at 22,050 Hz, converted to a 128-band mel spectrogram,
    expressed in dB relative to the clip's own maximum, resized to
    ``target_shape`` and given a trailing channel axis.

    Args:
        data_dir: Root folder containing one sub-folder per class.
        classes: Class (sub-folder) names; a sample's label is the index of
            its class in this sequence.
        target_shape: ``(height, width)`` every spectrogram is resized to.

    Returns:
        ``(data, labels)`` as NumPy arrays. ``data`` stacks the spectrograms
        (each of shape ``target_shape + (1,)``) and ``labels`` holds the
        matching integer class indices. Both are empty when no WAV file is found.

    Warns:
        UserWarning: once for every class whose folder is missing, so that a
            typo in a folder name does not silently produce a model that has
            never seen that class.
    """
    data = []
    labels = []

    for label, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        if not os.path.isdir(class_dir):
            warnings.warn(
                f"Class folder not found, skipping '{class_name}': {class_dir}",
                stacklevel=2,
            )
            continue

        # os.listdir() returns entries in arbitrary order. Sorting keeps the
        # sample order, and therefore any seeded train/test split, identical
        # across runs and filesystems.
        for filename in sorted(os.listdir(class_dir)):
            # Accept '.WAV' / '.Wav' as well as '.wav'.
            if not filename.lower().endswith('.wav'):
                continue

            file_path = os.path.join(class_dir, filename)
            audio_data, sample_rate = librosa.load(file_path, sr=22050)

            # Convert to Mel spectrogram, then to dB scale
            mel_spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate, n_mels=128)
            mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

            # Resize for CNN input and add the channel dimension
            mel_spectrogram = resize(mel_spectrogram, target_shape, mode='constant')
            mel_spectrogram = np.expand_dims(mel_spectrogram, axis=-1)

            data.append(mel_spectrogram)
            labels.append(label)

    return np.array(data), np.array(labels)

# Data Augmentation (Time Shift, Noise Injection, Pitch Shift)
def augment_audio(audio, sr):
    """Return a randomly perturbed copy of a 1-D waveform.

    Three transforms are applied independently, each with probability 0.3:

    * circular time shift by ``int(sr * 0.1)`` samples (``np.roll``),
    * additive Gaussian noise with standard deviation 0.005,
    * pitch shift by a number of semitones drawn uniformly from [-2, 2].

    The input array is never modified: the noise is added out of place, so the
    caller's waveform stays usable (e.g. to build several augmented variants
    from one clip).

    Args:
        audio: 1-D NumPy array of samples.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        The augmented waveform. This is the input object itself when no
        transform was selected.
    """
    audio = np.asarray(audio)

    if np.random.rand() < 0.3:
        audio = np.roll(audio, shift=int(sr * 0.1))  # Time Shift
    if np.random.rand() < 0.3:
        noise = 0.005 * np.random.randn(len(audio))
        if np.issubdtype(audio.dtype, np.floating):
            # Keep the waveform's float precision (e.g. float32) instead of
            # promoting the result to float64.
            noise = noise.astype(audio.dtype, copy=False)
        # Out-of-place add: `audio += noise` would overwrite the caller's
        # array and raises for integer dtypes.
        audio = audio + noise  # Add Gaussian Noise
    if np.random.rand() < 0.3:
        audio = librosa.effects.pitch_shift(audio, sr=sr, n_steps=np.random.uniform(-2, 2))  # Pitch Shift
    return audio

# Load and split data
data, labels = load_and_preprocess_data(data_dir, classes)
if len(data) == 0:
    # Fail here with a clear message instead of an obscure error further down
    # (typical causes: Drive not mounted, wrong data_dir, no .wav files).
    raise ValueError(f"No .wav files found under {data_dir}; check the Drive mount and class folders.")

# Keep the integer class ids for stratification before `labels` becomes one-hot.
class_ids = labels
labels = to_categorical(class_ids, num_classes=len(classes))

# Stratify so train and test keep the same class proportions. scikit-learn can
# only do that when every present class has at least 2 clips and both splits
# are at least as large as the number of classes; otherwise fall back to the
# plain random split.
test_fraction = 0.2
n_test = int(np.ceil(test_fraction * len(class_ids)))
_, class_counts = np.unique(class_ids, return_counts=True)
can_stratify = (
    class_counts.min() >= 2
    and len(class_counts) <= min(n_test, len(class_ids) - n_test)
)
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=test_fraction,
    random_state=42,
    stratify=class_ids if can_stratify else None,
)

# Define CNN model
# Seed Python's `random`, NumPy and TensorFlow so that weight initialisation
# and dropout masks are repeatable from run to run (GPU kernels may still
# introduce small nondeterminism).
tf.keras.utils.set_random_seed(42)

# Per-sample input shape, taken from the training array (drops the batch axis).
input_shape = X_train.shape[1:]
input_layer = Input(shape=input_shape)

# Convolutional trunk: four identical stages that differ only in width and
# dropout rate. Each stage is Conv(3x3, same) -> BatchNorm -> 2x2 MaxPool -> Dropout.
conv_stages = [(64, 0.3), (128, 0.4), (256, 0.4), (512, 0.5)]
x = input_layer
for n_filters, drop_rate in conv_stages:
    x = Conv2D(n_filters, (3, 3), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(drop_rate)(x)

# Collapse each feature map to a single value instead of flattening; this keeps
# the parameter count of the following Dense layer small.
x = GlobalAveragePooling2D()(x)

# Classifier head: L2-regularised hidden layer, dropout, softmax over the classes.
x = Dense(512, activation='relu', kernel_regularizer=l2(0.01))(x)
x = Dropout(0.5)(x)

output_layer = Dense(len(classes), activation='softmax')(x)

model = Model(input_layer, output_layer)

# Compile the model: Adam at a 1e-3 learning rate, categorical cross-entropy
# loss, with accuracy reported alongside the loss.
optimizer = Adam(learning_rate=1e-3)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Callbacks
# Stop once val_loss has not improved for 10 epochs and roll the model back to
# the weights of its best val_loss epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Halve the learning rate after 5 epochs without a val_loss improvement.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=1,
)

# Keep on disk only the model with the highest val_accuracy seen so far.
# NOTE(review): this tracks val_accuracy while early stopping tracks val_loss,
# so the saved file and the restored in-memory weights can come from different epochs.
model_checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
)

# Train the model for up to 50 epochs in mini-batches of 32.
# NOTE(review): (X_test, y_test) is used as the validation set that drives early
# stopping, LR reduction and checkpoint selection, so the reported val_* metrics
# are not an unbiased estimate of performance on unseen data.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=50,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping, reduce_lr, model_checkpoint],
)

Epoch 1/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step - accuracy: 0.3054 - loss: 7.0662



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 257ms/step - accuracy: 0.3072 - loss: 7.0485 - val_accuracy: 0.1200 - val_loss: 11.0291 - learning_rate: 0.0010
Epoch 2/50
[1m37/38[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 47ms/step - accuracy: 0.4725 - loss: 4.9345



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - accuracy: 0.4751 - loss: 4.9173 - val_accuracy: 0.2167 - val_loss: 7.3351 - learning_rate: 0.0010
Epoch 3/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.6087 - loss: 3.7208



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.6086 - loss: 3.7166 - val_accuracy: 0.4467 - val_loss: 3.9287 - learning_rate: 0.0010
Epoch 4/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 0.6721 - loss: 2.8945 - val_accuracy: 0.3600 - val_loss: 3.8440 - learning_rate: 0.0010
Epoch 5/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.7442 - loss: 2.2531



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 61ms/step - accuracy: 0.7444 - loss: 2.2501 - val_accuracy: 0.5067 - val_loss: 2.5925 - learning_rate: 0.0010
Epoch 6/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 57ms/step - accuracy: 0.7789 - loss: 1.7967 - val_accuracy: 0.1767 - val_loss: 6.8427 - learning_rate: 0.0010
Epoch 7/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.8020 - loss: 1.5037



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - accuracy: 0.8021 - loss: 1.5023 - val_accuracy: 0.6333 - val_loss: 1.9537 - learning_rate: 0.0010
Epoch 8/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.8106 - loss: 1.2726 - val_accuracy: 0.5833 - val_loss: 1.8799 - learning_rate: 0.0010
Epoch 9/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 51ms/step - accuracy: 0.8550 - loss: 1.0181 - val_accuracy: 0.3567 - val_loss: 3.3282 - learning_rate: 0.0010
Epoch 10/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step - accuracy: 0.8835 - loss: 0.8705 - val_accuracy: 0.3600 - val_loss: 3.3358 - learning_rate: 0.0010
Epoch 11/50
[1m37/38[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 48ms/step - accuracy: 0.8786 - loss: 0.7690



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 61ms/step - accuracy: 0.8788 - loss: 0.7678 - val_accuracy: 0.7467 - val_loss: 1.2150 - learning_rate: 0.0010
Epoch 12/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 53ms/step - accuracy: 0.9128 - loss: 0.6523 - val_accuracy: 0.6233 - val_loss: 1.6573 - learning_rate: 0.0010
Epoch 13/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step - accuracy: 0.9255 - loss: 0.5871 - val_accuracy: 0.7300 - val_loss: 1.1509 - learning_rate: 0.0010
Epoch 14/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 51ms/step - accuracy: 0.9273 - loss: 0.4936 - val_accuracy: 0.7200 - val_loss: 1.2137 - learning_rate: 0.0010
Epoch 15/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step - accuracy: 0.9182 - loss: 0.5014 - val_accuracy: 0.6733 - val_loss: 1.3653 - learning_rate: 0.0010
Epoch 16



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step - accuracy: 0.9294 - loss: 0.4452 - val_accuracy: 0.8433 - val_loss: 0.6956 - learning_rate: 0.0010
Epoch 17/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.9303 - loss: 0.4377 - val_accuracy: 0.7300 - val_loss: 1.0277 - learning_rate: 0.0010
Epoch 18/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 0.9483 - loss: 0.3921 - val_accuracy: 0.3167 - val_loss: 3.6055 - learning_rate: 0.0010
Epoch 19/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 51ms/step - accuracy: 0.9433 - loss: 0.3842 - val_accuracy: 0.8200 - val_loss: 0.8680 - learning_rate: 0.0010
Epoch 20/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 51ms/step - accuracy: 0.9621 - loss: 0.3077 - val_accuracy: 0.7567 - val_loss: 0.9495 - learning_rate: 0.0010
Epoch 21



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 53ms/step - accuracy: 0.9920 - loss: 0.1678 - val_accuracy: 0.9033 - val_loss: 0.3671 - learning_rate: 5.0000e-04
Epoch 26/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 0.9940 - loss: 0.1479 - val_accuracy: 0.8967 - val_loss: 0.4179 - learning_rate: 5.0000e-04
Epoch 27/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.9920 - loss: 0.1464 - val_accuracy: 0.8667 - val_loss: 0.4653 - learning_rate: 5.0000e-04
Epoch 28/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.9970 - loss: 0.1278 - val_accuracy: 0.8933 - val_loss: 0.4151 - learning_rate: 5.0000e-04
Epoch 29/50
[1m37/38[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 47ms/step - accuracy: 0.9985 - loss: 0.1190



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 54ms/step - accuracy: 0.9986 - loss: 0.1191 - val_accuracy: 0.9300 - val_loss: 0.3439 - learning_rate: 5.0000e-04
Epoch 30/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - accuracy: 0.9914 - loss: 0.1331 - val_accuracy: 0.7867 - val_loss: 0.7490 - learning_rate: 5.0000e-04
Epoch 31/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - accuracy: 0.9921 - loss: 0.1263 - val_accuracy: 0.9133 - val_loss: 0.3245 - learning_rate: 5.0000e-04
Epoch 32/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 50ms/step - accuracy: 0.9913 - loss: 0.1220 - val_accuracy: 0.9167 - val_loss: 0.3888 - learning_rate: 5.0000e-04
Epoch 33/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - accuracy: 0.9943 - loss: 0.1205 - val_accuracy: 0.8767 - val_loss: 0.5149 - learning_rate:



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - accuracy: 0.9954 - loss: 0.1039 - val_accuracy: 0.9333 - val_loss: 0.2847 - learning_rate: 2.5000e-04
Epoch 38/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 0.9995 - loss: 0.0904 - val_accuracy: 0.9167 - val_loss: 0.3191 - learning_rate: 2.5000e-04
Epoch 39/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.9973 - loss: 0.0910 - val_accuracy: 0.8833 - val_loss: 0.4272 - learning_rate: 2.5000e-04
Epoch 40/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 51ms/step - accuracy: 0.9989 - loss: 0.0889 - val_accuracy: 0.8967 - val_loss: 0.3751 - learning_rate: 2.5000e-04
Epoch 41/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 51ms/step - accuracy: 0.9976 - loss: 0.0881 - val_accuracy: 0.9300 - val_loss: 0.2990 - learning_rate:



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 60ms/step - accuracy: 0.9996 - loss: 0.0732 - val_accuracy: 0.9433 - val_loss: 0.2582 - learning_rate: 1.2500e-04
Epoch 46/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 1.0000 - loss: 0.0719 - val_accuracy: 0.9400 - val_loss: 0.2593 - learning_rate: 1.2500e-04
Epoch 47/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 53ms/step - accuracy: 1.0000 - loss: 0.0683 - val_accuracy: 0.9300 - val_loss: 0.2636 - learning_rate: 1.2500e-04
Epoch 48/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.9991 - loss: 0.0699 - val_accuracy: 0.9267 - val_loss: 0.2434 - learning_rate: 1.2500e-04
Epoch 49/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.9998 - loss: 0.0654



[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.9998 - loss: 0.0655 - val_accuracy: 0.9467 - val_loss: 0.2253 - learning_rate: 1.2500e-04
Epoch 50/50
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.9991 - loss: 0.0726 - val_accuracy: 0.9400 - val_loss: 0.2340 - learning_rate: 1.2500e-04
