In [1]:
import librosa
import librosa.display
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder

def extract_features(data_path):
    features = []
    labels = []
    genres = os.listdir(data_path)
    for genre in genres:
        genre_path = os.path.join(data_path, genre)
        for file in os.listdir(genre_path):
            if file.endswith(".wav"):
                file_path = os.path.join(genre_path, file)
                y, sr = librosa.load(file_path, duration=30)
                mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
                mfcc_mean = np.mean(mfcc.T, axis=0)
                features.append(mfcc_mean)
                labels.append(genre)
    return np.array(features), np.array(labels)

X, y = extract_features("data/genres")

In [2]:
from sklearn.model_selection import train_test_split

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [3]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [4]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=encoder.classes_))

Accuracy: 0.575
              precision    recall  f1-score   support

       blues       0.71      0.50      0.59        24
   classical       0.81      0.89      0.85        19
     country       0.75      0.46      0.57        26
       disco       0.44      0.58      0.50        19
      hiphop       0.40      0.77      0.53        13
        rock       0.44      0.37      0.40        19

    accuracy                           0.57       120
   macro avg       0.59      0.60      0.57       120
weighted avg       0.61      0.57      0.57       120



In [5]:
import pickle

with open("models/tabular_model.pkl", "wb") as f:
    pickle.dump((rf_model, encoder), f)

### CNN with Spectrograms

In [6]:
import matplotlib.pyplot as plt

def create_spectrogram(file_path, output_path):
    y, sr = librosa.load(file_path)
    plt.figure(figsize=(2.24, 2.24))
    S = librosa.feature.melspectrogram(y=y, sr=sr)
    S_DB = librosa.power_to_db(S, ref=np.max)
    librosa.display.specshow(S_DB, sr=sr, x_axis='time', y_axis='mel')
    plt.axis('off')
    plt.savefig(output_path, bbox_inches='tight', pad_inches=0)
    plt.close()

In [7]:
import os

input_dir = "data/genres"
output_dir = "data/spectrograms"

os.makedirs(output_dir, exist_ok=True)

for genre in os.listdir(input_dir):
    genre_path = os.path.join(input_dir, genre)
    for file in os.listdir(genre_path):
        if file.endswith(".wav"):
            input_path = os.path.join(genre_path, file)
            output_path = os.path.join(output_dir, f"{genre}_{file.replace('.wav', '.png')}")
            create_spectrogram(input_path, output_path)

In [9]:
import os

data_dir = "data/spectrograms"
genres = [f.name for f in os.scandir(data_dir) if f.is_dir()]
print("Genres found:", genres)

for genre in genres:
    count = len(os.listdir(os.path.join(data_dir, genre)))
    print(f"{genre}: {count} images")


Genres found: ['blues', 'classical', 'country', 'disco', 'hiphop', 'rock']
blues: 96 images
classical: 100 images
country: 100 images
disco: 100 images
hiphop: 100 images
rock: 100 images


In [10]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

img_size = (224, 224)
batch_size = 16
data_dir = "data/spectrograms"

# Create ImageDataGenerator with validation split
datagen = ImageDataGenerator(
    rescale=1. / 255,
    validation_split=0.2  # 20% for validation (test)
)

# Training generator (80%)
train_gen = datagen.flow_from_directory(
    data_dir,
    target_size=img_size,
    batch_size=batch_size,
    class_mode='categorical',
    subset='training',
    shuffle=True,
    seed=42
)

# Validation generator (20%)
test_gen = datagen.flow_from_directory(
    data_dir,
    target_size=img_size,
    batch_size=batch_size,
    class_mode='categorical',
    subset='validation',
    shuffle=False,
    seed=42
)

Found 477 images belonging to 6 classes.
Found 119 images belonging to 6 classes.


In [13]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D

base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(64, activation='relu')(x)
predictions = Dense(train_gen.num_classes, activation='softmax')(x)

model_cnn = Model(inputs=base_model.input, outputs=predictions)

for layer in base_model.layers:
    layer.trainable = False

model_cnn.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

model_cnn.fit(train_gen, validation_data=test_gen, epochs=10)

Epoch 1/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 362ms/step - accuracy: 0.3512 - loss: 1.6194 - val_accuracy: 0.5294 - val_loss: 1.1724
Epoch 2/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 286ms/step - accuracy: 0.6097 - loss: 0.9783 - val_accuracy: 0.5798 - val_loss: 0.9998
Epoch 3/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 270ms/step - accuracy: 0.6590 - loss: 0.8335 - val_accuracy: 0.6134 - val_loss: 1.0293
Epoch 4/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 274ms/step - accuracy: 0.7432 - loss: 0.7125 - val_accuracy: 0.6218 - val_loss: 0.9785
Epoch 5/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 270ms/step - accuracy: 0.7640 - loss: 0.5868 - val_accuracy: 0.5966 - val_loss: 0.9883
Epoch 6/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 261ms/step - accuracy: 0.8493 - loss: 0.5024 - val_accuracy: 0.6134 - val_loss: 0.9765
Epoch 7/10
[1m30/30[0m [

<keras.src.callbacks.history.History at 0x1be465d99f0>

In [12]:
model_cnn.save("models/cnn_model.h5")

