Import all the libraries

In [20]:
import os
import pandas as pd
import numpy as np
import librosa
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization, Bidirectional
import tensorflow.keras.layers as layers

Load the YAMNet Model

- Reference - https://www.tensorflow.org/hub/tutorials/yamnet

In [2]:
# TensorFlow Hub handle of the pre-trained YAMNet model (see the reference above).
YAMNET_HANDLE = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(YAMNET_HANDLE)

2025-02-15 01:56:18.942486: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-02-15 01:56:18.942515: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-02-15 01:56:18.942519: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-02-15 01:56:18.942583: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-02-15 01:56:18.942718: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
def extract_yamnet_features(audio_path):
    """Return one clip-level YAMNet embedding vector for an audio file.

    The file is loaded with librosa as 16 kHz mono, cast to float32, and
    passed to the module-level ``yamnet_model``. The embeddings returned by
    the model are averaged over axis 0 (assumed to be the frame axis) so each
    clip yields a single fixed-length vector.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        A 1-D ``numpy.ndarray`` with the averaged embedding, or ``None`` when
        the file cannot be processed (the reason is printed).
    """
    try:
        audio, _ = librosa.load(audio_path, sr=16000, mono=True)
        waveform = audio.astype(np.float32)
        _, embeddings, _ = yamnet_model(waveform)
        frame_embeddings = embeddings.numpy()
        # Averaging zero frames would silently yield a NaN vector (plus a
        # RuntimeWarning) that then poisons the feature table; treat it as a
        # processing failure instead.
        if frame_embeddings.shape[0] == 0:
            raise ValueError("no embedding frames produced (empty or too-short audio)")
        return np.mean(frame_embeddings, axis=0)
    except Exception as e:
        # Best-effort by design: report the problem and let the caller skip the file.
        print(f"Error processing {audio_path}: {e}")
        return None

def extract_all_yamnet_features(data_path):
    """Extract YAMNet features for every ``.wav`` file under ``data_path``.

    ``data_path`` is expected to contain one sub-directory per class; the
    sub-directory name is used as the label for every audio file inside it.
    Files for which ``extract_yamnet_features`` returns ``None`` are skipped.

    Args:
        data_path: Root directory holding the per-genre sub-directories.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays with one entry per
        successfully processed file, in sorted (genre, file name) order.
    """
    data = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore any seeded train/test split) reproducible.
    for genre in sorted(os.listdir(data_path)):
        genre_path = os.path.join(data_path, genre)
        if not os.path.isdir(genre_path):
            continue
        for file in sorted(os.listdir(genre_path)):
            # Accept upper-case extensions such as ".WAV" as well.
            if not file.lower().endswith(".wav"):
                continue
            features = extract_yamnet_features(os.path.join(genre_path, file))
            if features is not None:
                data.append(features)
                labels.append(genre)
    return np.array(data), np.array(labels)

Support Vector Machine

In [4]:
def train_svm(X_train, y_train):
    """Fit an RBF-kernel support vector classifier and return it.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        The fitted ``SVC`` instance (``C=10``, ``gamma='scale'``).
    """
    classifier = SVC(C=10, gamma='scale', kernel='rbf')
    classifier.fit(X_train, y_train)
    return classifier

Convolutional Neural Network

In [5]:
def create_cnn(input_shape, num_classes):
    """Build and compile a 1-D convolutional classifier.

    The network stacks two Conv1D -> BatchNormalization -> MaxPooling1D ->
    Dropout stages (128 and 256 filters, kernel size 5) followed by a
    flattened dense head with a softmax output.

    Args:
        input_shape: Shape of one sample, passed to the first Conv1D layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    conv_stages = [
        Conv1D(128, kernel_size=5, activation='relu', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),
        Conv1D(256, kernel_size=5, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),
    ]
    dense_head = [
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]

    network = Sequential(conv_stages + dense_head)
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network


Long Short-Term Memory

In [7]:
def create_lstm(input_shape, num_classes):
    """Build and compile a stacked bidirectional LSTM classifier.

    Two bidirectional LSTM layers (128 units returning sequences, then 64
    units) feed a dense head with a softmax output.

    Args:
        input_shape: Shape of one sample, e.g. ``(timesteps, features)``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential([
        # Declare the input explicitly. Passing ``input_shape`` to the LSTM
        # wrapped by Bidirectional does not register it on the model's first
        # layer, which leaves the model unbuilt until the first fit call.
        layers.Input(shape=input_shape),
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.3),
        Bidirectional(LSTM(64)),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

Transformer

In [8]:
def create_transformer(input_shape, num_classes):
    """Build and compile a single-block self-attention classifier.

    The input is projected to 128 features, normalised, passed through one
    multi-head self-attention layer with a residual connection, pooled over
    the sequence axis, and classified by a dense softmax head.

    Args:
        input_shape: Shape of one sample, passed to the Keras ``Input`` layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled ``tf.keras.Model`` (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    source = layers.Input(shape=input_shape)

    # Per-position projection followed by normalisation.
    embedded = layers.Dense(128, activation="relu")(source)
    embedded = layers.LayerNormalization()(embedded)

    # Self-attention: the same tensor serves as query and value.
    self_attention = layers.MultiHeadAttention(num_heads=8, key_dim=64)
    attended = self_attention(embedded, embedded)

    # Residual connection, then a second normalisation.
    residual = layers.Add()([embedded, attended])
    residual = layers.LayerNormalization()(residual)

    # Collapse the sequence axis and classify.
    pooled = layers.GlobalAveragePooling1D()(residual)
    hidden = layers.Dense(128, activation='relu')(pooled)
    hidden = layers.Dropout(0.5)(hidden)
    class_probs = layers.Dense(num_classes, activation='softmax')(hidden)

    classifier = tf.keras.Model(inputs=source, outputs=class_probs)
    classifier.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [9]:
def classify_audio(audio_path, model, scaler, label_encoder):
    """Predict the genre label of a single audio file.

    Works both with classifiers whose ``predict`` returns class indices
    directly (the SVM) and with models whose ``predict`` returns one
    probability row per sample (the Keras networks in this notebook).

    Args:
        audio_path: Path of the audio file to classify.
        model: Fitted classifier exposing ``predict``.
        scaler: Fitted scaler exposing ``transform``; applied to the features.
        label_encoder: Fitted encoder exposing ``inverse_transform``.

    Returns:
        The decoded label of the prediction, or the string
        ``"Error processing audio"`` when feature extraction fails.
    """
    features = extract_yamnet_features(audio_path)
    if features is None:
        return "Error processing audio"

    features = scaler.transform([features])

    # The neural networks here are built for (n_features, 1) samples, so add
    # the trailing channel axis when the model advertises a 3-D input.
    expected_shape = getattr(model, "input_shape", None)
    if expected_shape is not None and len(expected_shape) == 3:
        features = np.asarray(features)[..., np.newaxis]

    prediction = np.asarray(model.predict(features))
    # Probability rows must be reduced to class indices before decoding;
    # 1-D outputs (e.g. from SVC.predict) are already class indices.
    if prediction.ndim > 1:
        prediction = np.argmax(prediction, axis=-1)
    return label_encoder.inverse_transform(prediction)[0]

In [10]:
# Load extracted features
# NOTE: this CSV is written by the feature-extraction cell in the
# "YAMNet Testing" section further down; run that cell first on a fresh setup.
df = pd.read_csv("/Users/js/Desktop/Music Genre Classification/Data/yamnet_features.csv")

# Encode the genre names as integer class ids.
label_encoder = LabelEncoder()
df["Genre"] = label_encoder.fit_transform(df["Genre"])

# Every column except the last is a feature; the last column is the label.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Split BEFORE scaling so the scaler's mean/variance are estimated from the
# training rows only. Fitting on the full table leaks test-set statistics
# into training and inflates the reported validation accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Fit the SVM right away; the neural networks are only built and compiled
# here and are trained in the cells that follow.
n_genres = len(label_encoder.classes_)
sample_shape = (X_train.shape[1], 1)

svm_model = train_svm(X_train, y_train)
cnn_model = create_cnn(sample_shape, n_genres)
lstm_model = create_lstm(sample_shape, n_genres)
transformer_model = create_transformer(sample_shape, n_genres)

In [12]:
# Add a trailing channel axis: (samples, features) -> (samples, features, 1).
# The CNN, LSTM and transformer models are all fitted on these arrays.
X_train_r = np.expand_dims(X_train, axis=-1)
X_test_r = np.expand_dims(X_test, axis=-1)

In [13]:
# Train the CNN; the held-out test split doubles as the validation set here.
cnn_model.fit(
    x=X_train_r,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_test_r, y_test),
)

Epoch 1/20


2025-02-15 01:56:46.621026: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2025-02-15 01:56:48.210043: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x32b68c820>

In [14]:
# Train the LSTM; the held-out test split doubles as the validation set here.
lstm_model.fit(
    x=X_train_r,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_test_r, y_test),
)


Epoch 1/20


2025-02-15 01:57:20.016141: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-02-15 01:57:20.434724: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-02-15 01:57:20.454501: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-02-15 01:57:20.641078: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-02-15 01:57:20.660408: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-02-15 01:57:20.850452: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-02-15 01:57:20.877620: I tensorflow/core/grappler/optimizers/cust

 1/23 [>.............................] - ETA: 1:30 - loss: 2.2010 - accuracy: 0.0625

2025-02-15 01:57:21.160291: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-02-15 01:57:21.192272: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2025-02-15 01:57:28.371955: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-02-15 01:57:28.519392: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-02-15 01:57:28.532365: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-02-15 01:57:28.646221: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-02-15 01:57:28.660806: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x32b295310>

In [17]:
# Train the transformer; the held-out test split doubles as the validation set here.
transformer_model.fit(
    x=X_train_r,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_test_r, y_test),
)

Epoch 1/20

2025-02-15 02:01:46.161447: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x3e0868b50>

In [19]:
# Save models
import joblib

MODELS_DIR = "/Users/js/Desktop/Music Genre Classification/Models"

# Make sure the output directory exists before writing; joblib.dump is not
# expected to create missing parent directories and would otherwise fail.
os.makedirs(MODELS_DIR, exist_ok=True)

svm_model_path = os.path.join(MODELS_DIR, "svm_model.pkl")
cnn_model_path = os.path.join(MODELS_DIR, "cnn_model.h5")
lstm_model_path = os.path.join(MODELS_DIR, "lstm_model.h5")
transformer_model_path = os.path.join(MODELS_DIR, "transformer_model.h5")

# The scikit-learn model is pickled with joblib; the Keras models use their
# own save() with the HDF5 file names kept unchanged.
joblib.dump(svm_model, svm_model_path)
cnn_model.save(cnn_model_path)
lstm_model.save(lstm_model_path)
transformer_model.save(transformer_model_path)

print("Music Genre Classification Models Trained and Saved Successfully!")


  saving_api.save_model(


Music Genre Classification Models Trained and Saved Successfully!


--------------------------------------------------------------------------------------------------------------------------------------------

YAMNet Testing

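- For debugging purposes; most of the code below can be ignored. Note, however, that the first cell generates the `yamnet_features.csv` file that the training section above loads.
- Contains an SVM model trained only on YAMNet-extracted features.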

In [23]:
import os

# Root folder of the audio dataset. It is passed to
# extract_all_yamnet_features below, which reads the .wav files found in its
# sub-directories and uses each sub-directory name as the label.
data_path = "/Users/js/Desktop/Music Genre Classification/Data/genres_original"

def extract_all_yamnet_features(data_path):
    """Extract YAMNet features for every ``.wav`` file under ``data_path``.

    ``data_path`` is expected to contain one sub-directory per class; the
    sub-directory name is used as the label for every audio file inside it.
    Files for which ``extract_yamnet_features`` returns ``None`` are skipped.

    Args:
        data_path: Root directory holding the per-genre sub-directories.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays with one entry per
        successfully processed file, in sorted (genre, file name) order.
    """
    data = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the exported CSV (and any seeded split of it) reproducible.
    for genre in sorted(os.listdir(data_path)):
        genre_path = os.path.join(data_path, genre)
        if not os.path.isdir(genre_path):
            continue
        for file in sorted(os.listdir(genre_path)):
            # Accept upper-case extensions such as ".WAV" as well.
            if not file.lower().endswith(".wav"):
                continue
            features = extract_yamnet_features(os.path.join(genre_path, file))
            if features is not None:
                data.append(features)
                labels.append(genre)

    return np.array(data), np.array(labels)

# Run the extraction over the whole dataset and persist the feature columns
# together with a trailing "Genre" label column.
X_yamnet, y_yamnet = extract_all_yamnet_features(data_path)

df_yamnet = pd.DataFrame(X_yamnet).assign(Genre=y_yamnet)
df_yamnet.to_csv(
    "/Users/js/Desktop/Music Genre Classification/Data/yamnet_features.csv",
    index=False,
)

print("YAMNet feature extraction complete!")


2025-02-15 14:32:27.696333: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


YAMNet feature extraction complete!


In [24]:
df = pd.read_csv("/Users/js/Desktop/Music Genre Classification/Data/yamnet_features.csv")

# Encode the genre names as integer class ids.
label_encoder = LabelEncoder()
df["Genre"] = label_encoder.fit_transform(df["Genre"])

# Every column except the last is a feature; the last column is the label.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Split BEFORE scaling so the scaler only sees training rows; fitting it on
# the full table leaks test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

svm_model = SVC(kernel='rbf', C=10, gamma='scale')
svm_model.fit(X_train, y_train)

print("SVM trained on YAMNet features!")


SVM trained on YAMNet features!


In [25]:
def classify_audio(audio_path, model, scaler, label_encoder):
    """Predict the genre label of a single audio file.

    Works both with classifiers whose ``predict`` returns class indices
    directly (the SVM) and with models whose ``predict`` returns one
    probability row per sample (the Keras networks in this notebook).

    Args:
        audio_path: Path of the audio file to classify.
        model: Fitted classifier exposing ``predict``.
        scaler: Fitted scaler exposing ``transform``; applied to the features.
        label_encoder: Fitted encoder exposing ``inverse_transform``.

    Returns:
        The decoded label of the prediction, or the string
        ``"Error processing audio"`` when feature extraction fails.
    """
    features = extract_yamnet_features(audio_path)
    if features is None:
        return "Error processing audio"

    features = scaler.transform([features])

    # The neural networks here are built for (n_features, 1) samples, so add
    # the trailing channel axis when the model advertises a 3-D input.
    expected_shape = getattr(model, "input_shape", None)
    if expected_shape is not None and len(expected_shape) == 3:
        features = np.asarray(features)[..., np.newaxis]

    prediction = np.asarray(model.predict(features))
    # Probability rows must be reduced to class indices before decoding;
    # 1-D outputs (e.g. from SVC.predict) are already class indices.
    if prediction.ndim > 1:
        prediction = np.argmax(prediction, axis=-1)
    return label_encoder.inverse_transform(prediction)[0]


In [26]:
# Classify one sample clip with the SVM trained above.
classify_audio(
    audio_path='/Users/js/Downloads/action-urban-trap-141691.wav',
    model=svm_model,
    scaler=scaler,
    label_encoder=label_encoder,
)


'hiphop'