## Import Libraries

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models
from tensorflow.keras.applications import VGG16

## Load Dataset

In [2]:
data_dir = 'Data/genres_original'
genres = os.listdir(data_dir)

features = []
labels = []

## Extract MFCC Features

In [3]:
for genre in genres:
    genre_path = os.path.join(data_dir, genre)
    for file in os.listdir(genre_path):
        if file.endswith('.wav'):
            file_path = os.path.join(genre_path, file)
            y, sr = librosa.load(file_path)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
            mfcc_mean = np.mean(mfcc.T, axis=0)
            features.append(mfcc_mean)
            labels.append(genre)

## Preprocessing

In [4]:
X = np.array(features)
y = np.array(labels)

le = LabelEncoder()
y = le.fit_transform(y)

scaler = StandardScaler()
X = scaler.fit_transform(X)

## Train Random Forest Classifier

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))

Random Forest Accuracy: 0.615
              precision    recall  f1-score   support

       blues       0.79      0.52      0.63        21
   classical       0.57      1.00      0.73        12
     country       0.75      0.62      0.68        24
       disco       0.60      0.55      0.57        22
      hiphop       0.42      0.53      0.47        15
        jazz       0.89      0.59      0.71        27
       metal       0.65      0.83      0.73        18
         pop       0.62      0.84      0.71        19
      reggae       0.50      0.55      0.52        22
        rock       0.40      0.30      0.34        20

    accuracy                           0.61       200
   macro avg       0.62      0.63      0.61       200
weighted avg       0.64      0.61      0.61       200



## Generate Mel-Spectrogram Images

In [6]:
output_dir = 'spectrograms'
os.makedirs(output_dir, exist_ok=True)

for genre in genres:
    genre_path = os.path.join(data_dir, genre)
    genre_out = os.path.join(output_dir, genre)
    os.makedirs(genre_out, exist_ok=True)

    for file in os.listdir(genre_path):
        if file.endswith('.wav'):
            file_path = os.path.join(genre_path, file)
            y, sr = librosa.load(file_path, duration=30)
            spect = librosa.feature.melspectrogram(y=y, sr=sr)
            spect_db = librosa.power_to_db(spect, ref=np.max)

            plt.figure(figsize=(2, 2))
            librosa.display.specshow(spect_db, sr=sr, cmap='magma')
            plt.axis('off')
            plt.savefig(os.path.join(genre_out, file.replace('.wav', '.png')), bbox_inches='tight', pad_inches=0)
            plt.close()

## Create ImageDataGenerator

In [7]:
datagen = ImageDataGenerator(validation_split=0.2, rescale=1./255)

train_gen_cnn = datagen.flow_from_directory(
    output_dir,
    target_size=(128, 128),
    batch_size=32,
    subset='training'
)

val_gen_cnn = datagen.flow_from_directory(
    output_dir,
    target_size=(128, 128),
    batch_size=32,
    subset='validation'
)

Found 800 images belonging to 10 classes.
Found 199 images belonging to 10 classes.


## Build and Train Basic CNN

In [8]:
cnn_model = models.Sequential([
    layers.Conv2D(32, (3,3), activation='relu', input_shape=(128,128,3)),
    layers.MaxPooling2D(2,2),
    layers.Conv2D(64, (3,3), activation='relu'),
    layers.MaxPooling2D(2,2),
    layers.Conv2D(128, (3,3), activation='relu'),
    layers.MaxPooling2D(2,2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(len(genres), activation='softmax')
])

cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = cnn_model.fit(train_gen_cnn, validation_data=val_gen_cnn, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Build Improved CNN with Dropout

In [9]:
def build_cnn_model(num_classes):
    model = models.Sequential([
        layers.Conv2D(32, (3,3), activation='relu', input_shape=(128,128,3)),
        layers.MaxPooling2D(2,2),
        layers.Conv2D(64, (3,3), activation='relu'),
        layers.MaxPooling2D(2,2),
        layers.Conv2D(128, (3,3), activation='relu'),
        layers.MaxPooling2D(2,2),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

cnn_model = build_cnn_model(len(genres))
history_cnn = cnn_model.fit(train_gen_cnn, validation_data=val_gen_cnn, epochs=20)
cnn_model.save("cnn_genre_model.h5")

val_loss, val_acc = cnn_model.evaluate(val_gen_cnn)
print(f"CNN Validation Accuracy: {val_acc:.2f}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CNN Validation Accuracy: 0.48


## Train VGG16 Transfer Learning Model

In [10]:
train_gen_vgg = datagen.flow_from_directory(
    output_dir,
    target_size=(224, 224),
    batch_size=32,
    subset='training'
)

val_gen_vgg = datagen.flow_from_directory(
    output_dir,
    target_size=(224, 224),
    batch_size=32,
    subset='validation'
)

weights_path = r"vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5"

base_model = VGG16(weights=weights_path, include_top=False, input_shape=(224,224,3))
base_model.trainable = False

vgg_model = models.Sequential([
    base_model,
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(len(genres), activation='softmax')
])

vgg_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history_vgg = vgg_model.fit(train_gen_vgg, validation_data=val_gen_vgg, epochs=10)
vgg_model.save("vgg16_genre_model.h5")

Found 800 images belonging to 10 classes.
Found 199 images belonging to 10 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
