## Feature Extraction

### Convolutional Neural Network
CNNs recognize patterns in spatial data, so they work best with images. We will therefore convert the original audio data into spectrograms, which are graphs that visually represent how the frequency content changes over time.<br>
We start with 1000 WAV files as our data and convert them into mel-spectrograms. We chose mel-spectrograms specifically because their y-axis uses the mel scale instead of raw frequency, and the color of each point is based on the decibel scale rather than the amplitude of the wave. These spectrograms focus more on what humans actually hear, which makes them better suited to genre classification.

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import os

def save_mel_spectrogram(y, sr, output_image_path, n_mels=128):
    """Render an audio clip as a mel-spectrogram image and save it to disk.

    The clip is converted to a mel-scaled power spectrogram, expressed in
    decibels relative to its own peak (``ref=np.max``), drawn without axes
    and written to ``output_image_path`` with no padding around the plot.

    Args:
        y: Audio samples of the clip (the conversion loop in this notebook
            passes 3-second segments).
        sr: Sample rate of ``y`` in Hz.
        output_image_path: Destination path of the image file.
        n_mels: Number of mel bands in the spectrogram.
    """
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    fig = plt.figure(figsize=(10, 4))
    try:
        librosa.display.specshow(mel_spec_db, sr=sr, x_axis='time', y_axis='mel')
        plt.axis('off')
        fig.savefig(output_image_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even if drawing or saving fails;
        # otherwise pyplot keeps every figure alive across repeated calls.
        plt.close(fig)

# Paths and conversion settings
PATH_MP3 = "./data/genres_original/"  # source audio, one sub-folder per genre
PATH_IMG = "./data/genres_img/"       # output images, same sub-folder layout
sr = 22050  # sample rate every file is resampled to when loaded
skip = []   # genre folder names to leave out of the conversion

SEGMENT_SECONDS = 3   # length of each clip turned into one image
MAX_SEGMENTS = 10     # at most this many clips are taken per file
segment_length = sr * SEGMENT_SECONDS  # clip length in samples

os.makedirs(PATH_IMG, exist_ok=True)

# Convert every audio file to mel spectrograms (split into 3-second segments)
for genre in os.listdir(PATH_MP3):
    if genre in skip:
        print(f"Skipped: {genre}")
        continue

    genre_path = os.path.join(PATH_MP3, genre)
    if not os.path.isdir(genre_path):
        # Stray files in the dataset root (e.g. OS metadata) are not genres.
        continue

    genre_img_path = os.path.join(PATH_IMG, genre)
    os.makedirs(genre_img_path, exist_ok=True)

    for music in os.listdir(genre_path):
        wav_path = os.path.join(genre_path, music)
        if not os.path.isfile(wav_path):
            continue

        y, sr = librosa.load(wav_path, sr=sr)

        # Strip the extension whatever its length instead of assuming 4 chars.
        track_name = os.path.splitext(music)[0]

        # Only whole segments are exported; a trailing partial one is dropped.
        num_segments = min(MAX_SEGMENTS, len(y) // segment_length)
        for i in range(num_segments):
            start_sample = i * segment_length
            end_sample = start_sample + segment_length
            segment = y[start_sample:end_sample]
            output_img_path = os.path.join(genre_img_path, f"{track_name}_{i}.png")
            save_mel_spectrogram(segment, sr, output_img_path)

    print(f"Finished: {genre}")

## Creating the Model

### Convolutional Neural Network

A model built with this architecture reaches ~84% accuracy.

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class MusicGenreCNN(nn.Module):
    """Four-stage convolutional classifier for 3-channel images.

    Each stage applies a 3x3 convolution (stride 1, padding 1), a ReLU and a
    2x2 max-pool that halves height and width. Channels grow
    3 -> 64 -> 128 -> 256 -> 512, after which two fully connected layers map
    the flattened features to ``num_classes`` raw scores (no softmax).

    ``fc1`` expects a 512 x 8 x 8 feature map, which is what the four
    halvings produce from e.g. a 128 x 128 input; inputs whose pooled
    feature map has a different size will fail at ``fc1``.
    """

    def __init__(self, num_classes):
        """Create the layers; ``num_classes`` is the size of the output."""
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        self.conv1 = nn.Conv2d(3, 64, **conv_kwargs)
        self.conv2 = nn.Conv2d(64, 128, **conv_kwargs)
        self.conv3 = nn.Conv2d(128, 256, **conv_kwargs)
        self.conv4 = nn.Conv2d(256, 512, **conv_kwargs)
        # One parameter-free pooling layer is shared by all four stages.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        flat_features = 512 * 8 * 8
        self.fc1 = nn.Linear(flat_features, 1024)
        self.fc2 = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Return the class scores for a batch of images ``x``."""
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = self.pool(F.relu(conv(x)))
        # Collapse everything except the batch dimension for the dense layers.
        flat = x.view(x.shape[0], -1)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

# NOTE: genre_labels, train_loader and test_loader are not defined in this
# cell; they are expected to come from elsewhere in the notebook.

# Pick the device first: it is needed as soon as the model is created.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model
num_classes = len(genre_labels)  # Number of music genres
model = MusicGenreCNN(num_classes).to(device)

# Define loss function (CrossEntropy for classification)
criterion = nn.CrossEntropyLoss()

# Define optimizer (Adam for better convergence)
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20  # Number of passes over the training data

# ---- Training ----
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backpropagation (clear old gradients before computing new ones)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Guard against an empty loader so the average cannot divide by zero.
    num_batches = max(len(train_loader), 1)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/num_batches:.4f}")

print("Training complete!")

# ---- Evaluation ----
model.eval()
correct = 0
total = 0

with torch.no_grad():  # no gradients are needed for inference
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # The predicted class is the index of the highest score per sample.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Report 0% rather than crashing if the test loader produced no samples.
accuracy = 100 * correct / total if total else 0.0
print(f"Test Accuracy: {accuracy:.2f}%")

# Persist only the learned weights (state_dict), not the whole module object.
torch.save(model.state_dict(), "music_genre_cnn.pth")
print("Model saved successfully")

Found 800 images belonging to 10 classes.
Found 200 images belonging to 10 classes.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation Accuracy: 0.0950


## Testing Model