In [47]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os

In [48]:
# Pretrained YAMNet audio model from TensorFlow Hub. Calling the loaded object
# on a waveform tensor yields a (scores, embeddings, spectrogram) triple.
yamnet_model_handle = "https://tfhub.dev/google/yamnet/1"
yamnet = hub.load(handle=yamnet_model_handle)

In [49]:
def extract_embedding(audio_file):
    """Return one clip-level YAMNet embedding for an audio file.

    The file is decoded to 16 kHz mono with ``librosa.load``, passed through
    the module-level ``yamnet`` model, and the per-frame embeddings are
    averaged over the frame axis.

    Args:
        audio_file: Path to an audio file readable by ``librosa.load``.

    Returns:
        A 1-D array with one value per embedding feature. If any step fails,
        the error is printed and an all-zero float32 vector of length 1024 is
        returned instead of raising; callers that must distinguish failures
        should check for an all-zero result.
    """
    embedding_dim = 1024  # length of the fallback vector returned on failure
    try:
        samples, _ = librosa.load(audio_file, sr=16000, mono=True)
        waveform = tf.convert_to_tensor(samples, dtype=tf.float32)
        _, embeddings, _ = yamnet(waveform)

        frames = embeddings.numpy()
        # The model output is normally (frames, features). If a leading batch
        # axis is present, keep only its first entry.
        if frames.ndim == 3:
            frames = frames[0]
        if frames.shape[0] == 0:
            # Averaging zero frames would produce NaNs with only a warning;
            # treat it as a failure so the fallback below is used instead.
            raise ValueError("no embedding frames were produced")

        # Average over the frame (time) axis -> one vector per clip.
        return np.mean(frames, axis=0)
    except Exception as e:
        print(f"Error processing {audio_file}: {e}")
        # float32 keeps the fallback consistent with the model's embeddings so
        # a single failure does not upcast a whole stacked feature matrix.
        return np.zeros(embedding_dim, dtype=np.float32)

In [50]:
# Dataset location: one sub-folder per class, each containing that class's
# audio clips. Set the AUDIO_DATASET_DIR environment variable to point at a
# different copy of the data; the original local path is kept as the default.
audio_dir = os.environ.get(
    "AUDIO_DATASET_DIR",
    "C:/Users/5A_Traders/Downloads/FYP_ON_DEV/FYP_IntelliTrain/AudioClassification/Dataset/animal_audio/Animal-Soundprepros",
)
# test_audio_dir = "C:/Users/5A_Traders/Downloads/FYP_ON_DEV/FYP_IntelliTrain/AudioClassification/Dataset/archive/DataTest"


In [51]:
def load_data(directory):
    """Build a feature matrix and label vector from a class-per-folder dataset.

    Every sub-directory of ``directory`` is treated as one class, and every
    entry inside it is passed to ``extract_embedding``. Folder and file names
    are visited in sorted order so the result (and any seeded split made from
    it) does not depend on the file system's listing order.

    Files whose embedding could not be computed are skipped rather than added
    to the dataset: ``extract_embedding`` reports failure by returning an
    all-zero vector, and training on such rows would attach real labels to
    meaningless features. Non-finite embeddings are skipped for the same
    reason.

    Args:
        directory: Path of the dataset root.

    Returns:
        ``(features, labels)`` as NumPy arrays; ``labels[i]`` is the name of
        the sub-directory that ``features[i]`` came from.
    """
    labels, features = [], []
    for class_label in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, class_label)
        if not os.path.isdir(class_path):
            continue  # stray files next to the class folders are ignored

        for audio_file in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, audio_file)
            try:
                feature = np.asarray(extract_embedding(file_path))
            except Exception as e:
                print(f"Error processing {file_path}: {e}")
                continue

            # All-zero is the extractor's failure sentinel; NaN/inf would
            # poison the loss during training.
            if not np.any(feature) or not np.all(np.isfinite(feature)):
                print(f"Skipping {file_path}: no usable embedding")
                continue

            features.append(feature)
            labels.append(class_label)
    return np.array(features), np.array(labels)


In [52]:
# Load every clip found under audio_dir as a single dataset; the train/test
# split is made from it afterwards.
X, y = load_data(audio_dir)

# Fail here with a clear message instead of a confusing error further down
# when the folder is empty or no file could be processed.
if len(X) == 0:
    raise ValueError(f"No audio features were loaded from {audio_dir!r}")

# Map the string class names to integer ids. `le` is kept so that predicted
# ids can be turned back into names with le.inverse_transform.
le = LabelEncoder()
labels_encoded = le.fit_transform(y)
X = np.asarray(X)
y = np.asarray(labels_encoded)

In [53]:
# Hold out 20% of the clips; the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [54]:
# Sanity check: expect (n_samples, 1024) for the matrix and (1024,) per row.
print("X_train shape:", np.shape(X_train))
print("Example feature shape:", np.shape(X_train[0]))

X_train shape: (520, 1024)
Example feature shape: (1024,)


In [55]:
# Small dense classifier stacked on top of the clip embeddings.
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))  # one input per embedding feature
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(len(le.classes_), activation='softmax'))  # one unit per class

In [56]:
# Targets are integer class ids (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [57]:
# Fit the classifier; the held-out split doubles as the per-epoch validation set.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=16,
)

Epoch 1/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.2443 - loss: 2.2816 - val_accuracy: 0.6692 - val_loss: 1.2910
Epoch 2/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6586 - loss: 1.2447 - val_accuracy: 0.7538 - val_loss: 0.8284
Epoch 3/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7012 - loss: 0.8989 - val_accuracy: 0.7462 - val_loss: 0.8225
Epoch 4/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7997 - loss: 0.6907 - val_accuracy: 0.7846 - val_loss: 0.6753
Epoch 5/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8579 - loss: 0.5543 - val_accuracy: 0.7846 - val_loss: 0.6733
Epoch 6/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8439 - loss: 0.5052 - val_accuracy: 0.8154 - val_loss: 0.6543
Epoch 7/20
[1m33/33[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x2111b19b760>

In [58]:
# Write the trained classifier to audio_classification_model.h5 in the
# current working directory.
model.save(filepath="audio_classification_model.h5")
print("Model training complete and saved!")



Model training complete and saved!


In [59]:
import joblib
# Store the fitted LabelEncoder so predicted ids can be decoded in a later session.
joblib.dump(value=le, filename="label_encoder.pkl")

['label_encoder.pkl']

In [60]:
import librosa
import numpy as np
from tensorflow.keras.models import load_model
import joblib

# Reload the saved artifacts for inference.
model = load_model(filepath="audio_classification_model.h5")
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load a label_encoder.pkl that you produced yourself.
le = joblib.load(filename="label_encoder.pkl")
yamnet = hub.load(handle="https://tfhub.dev/google/yamnet/1")

def predict_audio_class(audio_path):
    """Predict the class label of a single audio file.

    Uses the module-level ``yamnet``, ``model`` and ``le`` objects: the clip
    is decoded to 16 kHz mono, embedded with YAMNet, averaged over frames,
    classified by ``model``, and the winning index is decoded with ``le``.

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        The decoded class label, or ``None`` (after printing the error) if
        any step fails.
    """
    try:
        # Extract embedding
        samples, _ = librosa.load(audio_path, sr=16000, mono=True)
        waveform = tf.convert_to_tensor(samples, dtype=tf.float32)
        _, embeddings, _ = yamnet(waveform)

        frames = embeddings.numpy()
        # Tolerate a leading batch axis by keeping only its first entry.
        if frames.ndim == 3:
            frames = frames[0]
        # Average over the frame axis so the classifier receives a full
        # embedding vector. Taking frames[0] on a 2-D (frames, features)
        # result would pick one frame and reduce it to a single scalar.
        mean_embedding = np.mean(frames, axis=0)

        # Predict on a batch of one
        embedding = np.expand_dims(mean_embedding, axis=0)
        predictions = model.predict(embedding)
        predicted_idx = np.argmax(predictions)
        return le.inverse_transform([predicted_idx])[0]
    except Exception as e:
        print(f"Prediction error: {e}")
        return None

# Test




In [61]:
def predict_audio_class(audio_path, model, label_encoder):
    """Classify one audio file with the supplied model and label encoder.

    The clip embedding comes from ``extract_embedding``; its shape is printed
    before and after a batch axis is added, then the index with the highest
    predicted score is decoded with ``label_encoder.inverse_transform``.

    Args:
        audio_path: Path of the audio file to classify.
        model: Object whose ``predict`` accepts a ``(1, n_features)`` array.
        label_encoder: Object whose ``inverse_transform`` maps class indices
            back to labels.

    Returns:
        The decoded label for the highest-scoring class.
    """
    features = extract_embedding(audio_path)
    print("Raw embedding shape:", features.shape)  # Should be (1024,)

    # The model expects a batch, so prepend an axis of length one.
    batch = features[np.newaxis, ...]
    print("Reshaped embedding shape:", batch.shape)  # Should be (1, 1024)

    probabilities = model.predict(batch)
    best_index = np.argmax(probabilities)
    return label_encoder.inverse_transform([best_index])[0]

In [68]:
# Smoke test on one horse clip from the separate DataTest folder
# (the recorded run printed "Horse").
print(
    predict_audio_class(
        "C:/Users/5A_Traders/Downloads/FYP_ON_DEV/FYP_IntelliTrain/AudioClassification/Dataset/archive/DataTest/horse/horse3.wav",
        model,
        le,
    )
)


Raw embedding shape: (1024,)
Reshaped embedding shape: (1, 1024)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Horse
