In [56]:
# --- Imports ---
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import librosa
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Bidirectional, LSTM, Reshape
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import os
import audiomentations as A  ## NEW ##
import joblib

In [57]:
# --- Constants ---
# TF-Hub handle from which the YAMNet model is loaded.
YAMNET_HANDLE = "https://tfhub.dev/google/yamnet/1"
# Dataset root passed to load_data(); every sub-directory is read as one class.
# The default below is machine-specific, so it can be overridden by setting
# the ANIMAL_AUDIO_DIR environment variable instead of editing the notebook.
AUDIO_DIR = os.environ.get(
    "ANIMAL_AUDIO_DIR",
    "C:/Users/5A_Traders/Downloads/FYP_ON_DEV/FYP_IntelliTrain/AudioClassification/Dataset/animal_audio/Animal-Soundprepros",
)
# Sample rate (Hz) that librosa.load resamples every clip to.
SAMPLE_RATE = 16000

In [61]:
# --- Data Augmentation Setup ---  ## NEW ##
# Waveform augmentation chain used by extract_embedding(): additive noise,
# time stretching, pitch shifting and time masking, in that order.
augment = A.Compose(
    [
        A.AddGaussianNoise(p=0.5, min_amplitude=0.001, max_amplitude=0.015),
        A.TimeStretch(p=0.5, min_rate=0.8, max_rate=1.2),
        A.PitchShift(p=0.5, min_semitones=-4, max_semitones=4),
        A.TimeMask(min_band_part=0.2, max_band_part=0.5),
    ]
)

In [62]:
# --- Feature Extraction with Augmentation ---
def extract_embedding(audio_file, apply_augmentation=True):
    """Return one 1024-element clip embedding for an audio file.

    The file is loaded as mono audio at SAMPLE_RATE, optionally passed through
    the module-level `augment` pipeline, fed to the module-level `yamnet`
    model, and the resulting embeddings are averaged over their first axis.

    Args:
        audio_file: Path of the audio file to load.
        apply_augmentation: When True, the waveform is augmented before it is
            embedded. Pass False for evaluation and prediction.

    Returns:
        The averaged embedding as a NumPy array. If anything goes wrong
        (unreadable file, empty audio, no embedding frames, model error) the
        error is printed and `np.zeros(1024)` is returned as a failure
        sentinel, so callers should treat an all-zero vector as "no features".
    """
    try:
        y, sr = librosa.load(audio_file, sr=SAMPLE_RATE, mono=True)
        if y.size == 0:
            # Nothing to embed; fail here instead of producing NaNs later.
            raise ValueError("audio file contains no samples")

        if apply_augmentation:
            y = augment(samples=y, sample_rate=sr)

        waveform = tf.convert_to_tensor(y, dtype=tf.float32)
        _, embeddings, _ = yamnet(waveform)

        frames = embeddings.numpy()
        if frames.ndim != 2:
            # Same result as the former expand-dims/[0] round trip: a 2-D
            # output is used as is, anything else is reduced to its first item.
            frames = frames[0]
        if frames.shape[0] == 0:
            # Averaging zero frames would silently yield a NaN vector.
            raise ValueError("model returned no embedding frames")

        return np.mean(frames, axis=0)
    except Exception as e:
        print(f"Error processing {audio_file}: {e}")
        return np.zeros(1024)

In [79]:
# --- Enhanced Model Architecture ---  ## NEW ##
def create_model(num_classes):
    """Build and compile the classifier that sits on top of the embeddings.

    Args:
        num_classes: Width of the final softmax layer.

    Returns:
        A compiled Sequential model taking inputs of shape (1024,), trained
        with sparse categorical cross-entropy and reporting accuracy.
    """
    # Each 1024-element input is reshaped into a one-step sequence so that it
    # can be passed through the two bidirectional LSTM layers.
    layer_stack = [
        Input(shape=(1024,)),
        Reshape((1, 1024)),
        Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(0.001))),
        Bidirectional(LSTM(64, return_sequences=False)),
        Dropout(0.5),
        Dense(256, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.4),
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential(layer_stack)

    # Adam with an exponentially decaying learning rate.
    decay_schedule = ExponentialDecay(
        initial_learning_rate=1e-3, decay_steps=1000, decay_rate=0.9
    )
    adam = tf.keras.optimizers.Adam(learning_rate=decay_schedule)

    model.compile(
        optimizer=adam,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
    )
    return model

In [77]:
# --- Data Loading ---
def load_data(directory, apply_augmentation=True):
    """Load one embedding per audio file from a class-per-folder directory.

    Every sub-directory of `directory` is treated as a class whose name is the
    label; every regular file inside it is passed to extract_embedding().

    Args:
        directory: Root folder containing one sub-directory per class.
        apply_augmentation: Forwarded to extract_embedding(). Defaults to True,
            which matches the previous hard-wired behaviour; pass False to
            load un-augmented features (e.g. for evaluation data).

    Returns:
        A `(features, labels)` pair of NumPy arrays with one row/entry per
        successfully processed file. Both arrays are empty when nothing could
        be loaded.
    """
    features, labels = [], []
    # os.listdir() order is unspecified; sorting keeps the sample order (and
    # therefore any seeded split made afterwards) reproducible.
    for class_label in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, class_label)
        if not os.path.isdir(class_path):
            continue
        for audio_file in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, audio_file)
            if not os.path.isfile(file_path):
                # Nested folders are not audio clips.
                continue
            try:
                feature = extract_embedding(
                    file_path, apply_augmentation=apply_augmentation
                )
            except Exception as e:
                print(f"Error processing {file_path}: {e}")
                continue
            if not np.any(feature):
                # extract_embedding() signals failure with an all-zero vector;
                # keeping it would add a meaningless, labelled training sample.
                print(f"Skipping {file_path}: no usable embedding was extracted")
                continue
            features.append(feature)
            labels.append(class_label)
    return np.array(features), np.array(labels)


In [None]:
# # --- Main Execution ---
# if __name__ == "__main__":
#     # Load YAMNet
#     yamnet = hub.load(YAMNET_HANDLE)
    
#     # Load and preprocess data
#     X, y = load_data(AUDIO_DIR)
#     le = LabelEncoder()
#     y_encoded = le.fit_transform(y)
    
#     # Split data
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
#     )
    
#     # Handle class imbalance  ## NEW ##
#     class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
#     class_weights = dict(enumerate(class_weights))
    
#     # Create and train model
#     model = create_model(len(le.classes_))
#     history = model.fit(
#         X_train, y_train,
#         validation_data=(X_test, y_test),
#         epochs=50,
#         batch_size=32,
#         class_weight=class_weights  ## NEW ##
#     )
    
#     # Save artifacts
#     model.save("enhanced_audio_model.h5")
#     joblib.dump(le, "label_encoder.pkl")
#     print("Model training complete!")

In [65]:
# The stray `model.add(BatchNormalization())` call that used to live in this
# cell was removed: `model` does not exist until create_model() is called in
# the training cell further down, so on a fresh kernel the call raised
# NameError, and on a stale kernel it only modified a model that the training
# cell replaces anyway. Layers belong inside create_model().

In [66]:
 # Load YAMNet
# Bind the loaded model to the global name `yamnet`, which extract_embedding()
# calls directly; this cell therefore has to run before any feature extraction.
yamnet = hub.load(YAMNET_HANDLE)

In [67]:
  # Load and preprocess data
# X: one embedding per file, y: the class-folder name of each file.
# NOTE(review): load_data() extracts features with augmentation enabled, so
# the samples that later end up in the test split are augmented as well.
X, y = load_data(AUDIO_DIR)
# Map the string labels to integer class ids; `le` is kept so predictions can
# be converted back to label names with inverse_transform().
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [68]:
# Split data
# 80/20 split with a fixed seed, stratified on the encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)

In [69]:
# Handle class imbalance  ## NEW ##
# Pair every weight with the label it was computed for. Numbering the weights
# with enumerate() is only correct when the labels present in y_train are
# exactly 0..n-1; zipping with np.unique(y_train) stays correct even if a
# class is missing from the training split.
class_weights = {
    int(label): float(weight)
    for label, weight in zip(
        np.unique(y_train),
        compute_class_weight('balanced', classes=np.unique(y_train), y=y_train),
    )
}

In [70]:
# Quick check that every encoded class id is present in the training split.
print(f"Number of classes: {len(le.classes_)}")
print(f"Unique labels in y_train: {np.unique(y_train)}")

Number of classes: 13
Unique labels in y_train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12]


In [71]:
# Sanity-check the training arrays before fitting:
# features must be (samples, 1024) and labels a flat (samples,) vector.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
assert X_train.shape[1] == 1024
assert y_train.ndim == 1

X_train shape: (520, 1024)
y_train shape: (520,)


In [80]:
# Stop once validation accuracy has not improved for 5 consecutive epochs
# and roll the model back to the best weights seen.
early_stop = EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

In [81]:
# Create and train model
# The test split doubles as the validation set that early stopping monitors.
model = create_model(num_classes=len(le.classes_))
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_data=(X_test, y_test),
    class_weight=class_weights,
    callbacks=[early_stop],
)

Epoch 1/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 136ms/step - accuracy: 0.1553 - loss: 5.2864 - val_accuracy: 0.4077 - val_loss: 4.3182
Epoch 2/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.3548 - loss: 4.0950 - val_accuracy: 0.4692 - val_loss: 3.4380
Epoch 3/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.3737 - loss: 3.2892 - val_accuracy: 0.4769 - val_loss: 2.6803
Epoch 4/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.4295 - loss: 2.5682 - val_accuracy: 0.5538 - val_loss: 2.1008
Epoch 5/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.5751 - loss: 2.0464 - val_accuracy: 0.6077 - val_loss: 1.8553
Epoch 6/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.6002 - loss: 1.8208 - val_accuracy: 0.5846 - val_loss: 1.6796
Epoch 7/50
[1m17/17[0m [32m━━

In [82]:
# Save artifacts
# Persist both objects that predict_audio_class() takes as arguments: the
# trained model and the fitted label encoder. Files are written to the
# current working directory.
model.save("enhanced_audio_model.h5")
joblib.dump(le, "label_encoder.pkl")
print("Model training complete!")



Model training complete!


In [83]:
# --- Enhanced Prediction Function ---  ## NEW ##
def predict_audio_class(audio_path, model, le):
    """Predict the class label of a single audio file.

    Args:
        audio_path: Path of the audio file to classify.
        model: Trained classifier whose `predict` returns one probability
            vector per input row.
        le: Fitted label encoder used to turn the winning class index back
            into its label via `inverse_transform`.

    Returns:
        `(label, confidence)` where `confidence` is the probability of the
        winning class, or `(None, 0.0)` if feature extraction or prediction
        fails (the error is printed).
    """
    try:
        # Extract features without augmentation for prediction
        embedding = extract_embedding(audio_path, apply_augmentation=False)

        # extract_embedding() reports failure by returning an all-zero vector.
        # Classifying that vector would produce a confident-looking but
        # meaningless answer, so treat it as an error instead.
        if not np.any(embedding):
            raise ValueError(f"no embedding could be extracted from {audio_path}")

        embedding = np.expand_dims(embedding, axis=0)

        # Get prediction probabilities
        probs = model.predict(embedding)[0]
        pred_idx = np.argmax(probs)
        confidence = probs[pred_idx]

        return le.inverse_transform([pred_idx])[0], confidence
    except Exception as e:
        print(f"Prediction error: {e}")
        return None, 0.0

In [85]:
# Example call: prints the (predicted_label, confidence) tuple for one clip.
print(predict_audio_class("C:/Users/5A_Traders/Downloads/FYP_ON_DEV/FYP_IntelliTrain/AudioClassification/Dataset/archive/DataTest/lion/lion1.wav",model,le))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
('Bear', 0.5269819)
