In [1]:
import librosa
import matplotlib.pyplot as plt
import os
import numpy as np
from skimage import io
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb

In [None]:
# Generates a mel scaled spectrograms for every 3 seconds part of a .wav file
def generate_spectrograms(wav_file, output_path, display_spectrogram=False):
    # Load the wav file using librosa
    y, sr = librosa.load(wav_file)

    # Calculate the number of samples for 3 seconds
    samples_3s = 3 * sr

    # Divide the wav file into 3-second parts
    for i in range(0, len(y) - samples_3s + 1, samples_3s):
        y_part = y[i:i+samples_3s]

        # Generate the spectrogram
        S = librosa.feature.melspectrogram(y=y_part, sr=sr, n_mels=128)
        S_dB = librosa.power_to_db(S, ref=np.max)

        # Save the spectrogram to the output path
        file_name = f"{os.path.splitext(os.path.basename(wav_file))[0]}_{i//samples_3s}.png"
        os.makedirs(output_path, exist_ok=True)
        output_file = os.path.join(output_path, file_name)

        # Create and save the spectrogram image
        plt.figure()
        librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel')
        plt.axis('off')
        plt.savefig(output_file, bbox_inches='tight', pad_inches=0)
        plt.close()

        # Display the spectrogram
        if display_spectrogram:
            plt.figure()
            librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel')
            plt.title('Mel-frequency spectrogram')
            plt.tight_layout()
            plt.show()




audio_files_root_dir = os.path.join('Data', 'genres_original')
spectrograms_root_dir = os.path.join('Data', 'spectrograms')
display_spectrogram = True

# loop through all wav files
for root, _, files in os.walk(audio_files_root_dir):
    for file in files:
        if not file.lower().endswith('.wav'):
            continue
        wav_file = os.path.join(root, file)
        genre = file.split('.')[0]
        generate_spectrograms(wav_file, os.path.join(spectrograms_root_dir, genre), display_spectrogram)
        display_spectrogram = False

In [2]:
def load_data(spectrograms_root_dir):
    features, labels = [], []
    for root, _, files in os.walk(spectrograms_root_dir):
        for file in files:
            if not file.lower().endswith('.png'):
                continue
            image_path = os.path.join(root, file)
            genre = os.path.basename(root)
            features.append(extract_features(image_path))
            labels.append(genre)
    return np.array(features), np.array(labels)

def extract_features(image_path):
    img = io.imread(image_path, as_gray=True)
    img = img / 255.0  # Normalize pixel values
    features = img.flatten()  # Flatten the image into a 1D array
    return features


spectrograms_root_dir = os.path.join('Data', 'spectrograms')
X, y = load_data(spectrograms_root_dir)

# Encode genre labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train the XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

# Test the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))



Accuracy: 66.75%

Classification Report:
              precision    recall  f1-score   support

       blues       0.74      0.69      0.71       208
   classical       0.80      0.87      0.83       204
     country       0.60      0.64      0.62       186
       disco       0.54      0.54      0.54       195
      hiphop       0.65      0.54      0.59       236
        jazz       0.70      0.70      0.70       189
       metal       0.80      0.83      0.81       210
         pop       0.65      0.80      0.72       197
      reggae       0.61      0.61      0.61       199
        rock       0.51      0.43      0.47       173

    accuracy                           0.67      1997
   macro avg       0.66      0.67      0.66      1997
weighted avg       0.66      0.67      0.66      1997

