# CNNs for Music Genre Classification Based on Mel Spectrograms

### Example data from MTG-Jamendo dataset

In [None]:
import copy
from os import getcwd

import pandas as pd
import plotly.express as px
import numpy as np
import os
import tarfile
import requests
import tqdm
import time
import random
import torch
from matplotlib.pyplot import annotate
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader, Subset

In [None]:
# Load data from npy file
example = np.load('assets/1354100.npy')
example.shape

In [None]:
fig = px.imshow(example)
fig.show()

In [None]:
example_2 = np.load('assets/1169700.npy')
example_2.shape

In [None]:
fig = px.imshow(example_2)
fig.show()

As we can see, the data is not cut into 30-second segments, while the CNN needs fixed-length (roughly 30-second) inputs. To cut the spectrograms correctly we first need to answer a question: how was the mel spectrogram obtained? I dug into the code of the MTG-Jamendo dataset and found the following:
```python
def melspectrogram(audio,
                   sampleRate=12000, frameSize=512, hopSize=256,
                   window='hann', zeroPadding=0, center=True,
                   numberBands=96, lowFrequencyBound=0, highFrequencyBound=None,
                   weighting='linear', warpingFormula='slaneyMel',
                   normalize='unit_tri'):

    if highFrequencyBound is None:
        highFrequencyBound = sampleRate/2

    windowing = Windowing(type=window, normalized=False, zeroPadding=zeroPadding)
    spectrum = Spectrum()
    melbands = MelBands(numberBands=numberBands,
                        sampleRate=sampleRate,
                        lowFrequencyBound=lowFrequencyBound,
                        highFrequencyBound=highFrequencyBound,
                        inputSize=(frameSize+zeroPadding)//2+1,
                        weighting=weighting,
                        normalize=normalize,
                        warpingFormula=warpingFormula,
                        type='power')
    amp2db = UnaryOperator(type='lin2db', scale=2)

    pool = essentia.Pool()
    for frame in FrameGenerator(audio,
                                frameSize=frameSize, hopSize=hopSize,
                                startFromZero=not center):
        pool.add('mel', amp2db(melbands(spectrum(windowing(frame)))))

    return pool['mel'].T
```
Let's check whether these parameters are consistent with the mel spectrograms we loaded, by converting their frame counts into durations.

In [None]:
# Parameters taken from the defaults of the melspectrogram function quoted above
sample_rate = 12000  # in Hz
frame_size = 512     # in samples
hop_size = 256       # in samples

# Time step between consecutive frames (the hop duration) in seconds.
# Despite the variable name this is derived from hop_size, not frame_size:
# each new frame starts hop_size samples after the previous one.
frame_duration_seconds = hop_size / sample_rate

# Map frame counts (N) to approximate durations in seconds: the two example
# spectrograms plus the 1376-frame cut length used later in the notebook.
N_values = [example.shape[1], example_2.shape[1], 1376]  # Example N values (number of frames)
durations = [round(N * frame_duration_seconds, 1) for N in N_values]

# Displayed as the cell output: (hop duration, [(frames, seconds), ...])
frame_duration_seconds, list(zip(N_values, durations))

In [None]:
# Based on the data above let's create a function for generating mel spectograms with librosa
import librosa
import librosa.display
import matplotlib.pyplot as plt

def generate_mel_spectrogram(audio_path, sample_rate=12000, frame_size=512, hop_size=256,
                             n_mels=96, output_path=None):
    """
    Generate a log-scaled (dB) mel spectrogram from an audio file.

    The audio is resampled to ``sample_rate`` while loading, converted to a
    power mel spectrogram and then to decibels relative to its own maximum.

    Args:
        audio_path (str): Path to the input audio file.
        sample_rate (int): Sampling rate the audio is resampled to, in Hz.
        frame_size (int): FFT window length in samples.
        hop_size (int): Number of samples between consecutive frames.
        n_mels (int): Number of mel bands. The default of 96 mirrors
            ``numberBands`` in the Essentia snippet quoted above.
        output_path (str | None): If given, the spectrogram is also saved to
            this path as a .npy file.

    Returns:
        np.ndarray: Mel spectrogram in dB with shape (n_mels, number_of_frames).
    """
    # Load the audio file (librosa resamples to `sample_rate`)
    y, sr = librosa.load(audio_path, sr=sample_rate)

    # Power mel spectrogram: STFT -> |.|^2 -> mel filter bank
    mel_power = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=frame_size,
        hop_length=hop_size,
        n_mels=n_mels,
        power=2.0,
    )

    # Convert power to decibels.
    # NOTE(review): ref=np.max makes the values relative to each track's peak;
    # the dB reference and mel filter normalisation may differ from the
    # Essentia pipeline used by the dataset - confirm before mixing both sources.
    log_mel = librosa.power_to_db(mel_power, ref=np.max)

    # Optionally persist the result next to returning it
    if output_path is not None:
        np.save(output_path, log_mel)

    return log_mel




Let's check the duration of the tracks in the dataset:

In [None]:
mtg_genre_tags_filepath = 'assets/autotagging_genre.tsv'
columns = ['track_id', 'artist_id', 'album_id', 'path', 'duration', 'tag1', 'tag2', 'tag3', 'tag4', 'tag5', 'tag6',
           'tag7', 'tag8', 'tag9']

data = pd.read_csv(mtg_genre_tags_filepath, sep='\t', names=columns, engine='python')

In [None]:
data[data['path'].isin(['00/1354100.mp3', '00/1169700.mp3'])][['path', 'duration']]

The calculated durations match the ones listed in the dataset metadata, so we can proceed with cutting the mel spectrograms into 30-second segments.


### Cutting Mel Spectrograms to 30s Segments and Saving Them to the spectograms_data Folder


> With a hop size of 256 samples at 12 kHz, 1376 frames correspond to about 29.4 seconds of audio. The value is also convenient for the CNN: 1376 = 2^5 × 43, so it can be halved by several consecutive 2× pooling layers without a remainder.

In [None]:
def cut_mel_spectrogram_to_30s(input_file_name, output_path, cut_length=1376): # 1376 is shorter than 30 second but easily dividable by 2
    """
    Cut a mel spectrogram to a fixed number of frames, keeping the middle part.

    Spectrograms that are shorter than ``cut_length`` are not written; a
    message is printed instead so a batch run can continue.

    Args:
        input_file_name (str): Path to the input mel spectrogram file in .npy format.
        output_path (str): Path to save the cut mel spectrogram.
        cut_length (int): Number of frames to keep, default is 1376 frames
            (slightly less than 30 seconds).

    Returns:
        None
    """
    # Load the mel spectrogram from the npy file. A ValueError here means the
    # file content could not be read, which is reported as such instead of
    # being mistaken for a too-short spectrogram.
    try:
        mel_spectrogram = np.load(input_file_name)
    except ValueError as err:
        print(f"Could not load the spectrogram from {input_file_name}: {err}")
        return

    # Ensure the spectrogram has enough frames (time is the second axis)
    num_frames = mel_spectrogram.shape[1]
    if num_frames < cut_length:
        print(f"The spectrogram for {input_file_name} is shorter than the required {cut_length} frames.")
        return

    # Centre the window: drop the same number of frames (+/- 1) on both sides
    start_frame = (num_frames - cut_length) // 2
    end_frame = start_frame + cut_length

    # Save the middle segment to the requested location
    np.save(output_path, mel_spectrogram[:, start_frame:end_frame])


For each of the 100 archives (`raw_30s_melspecs-00.tar` … `raw_30s_melspecs-99.tar`): download the tar file from the MTG-Jamendo dataset (unless it is already in the `temp_files` folder), extract it and delete the archive, cut every mel spectrogram whose path (folder + track name) is listed in `assets/genres_one_genre.csv`, and finally delete the extracted folder. Repeat for all tar files.

In [None]:
CHUNK_SIZE = 512 * 1024  # 512 KB
BASE_URL = "https://cdn.freesound.org/mtg-jamendo/raw_30s/melspecs/"

# Function to download tar file from the MTG-Jamendo dataset, track with tqdm
def download_tar_file(tar_file_name, tar_folder_path, timeout=60):
    """
    Download a tar file from the MTG-Jamendo dataset.

    The download is streamed into a temporary ``<name>.part`` file and only
    renamed to its final name once it completed, so an interrupted download is
    never mistaken for a finished archive on the next run.

    Args:
        tar_file_name (str): Name of the tar file to download.
        tar_folder_path (str): Path to save the downloaded tar file. The folder
            is created if it does not exist.
        timeout (float | tuple): Timeout in seconds passed to ``requests.get``.

    Returns:
        None

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Make sure the destination folder exists before touching it
    if tar_folder_path:
        os.makedirs(tar_folder_path, exist_ok=True)

    # Check if the tar file already exists
    tar_path = os.path.join(tar_folder_path, tar_file_name)
    if os.path.exists(tar_path):
        print(f"The tar file {tar_file_name} already exists.")
        return

    # Download the tar file from the MTG-Jamendo dataset
    url = BASE_URL + tar_file_name
    # '.part' suffix keeps the unfinished file out of the '*.tar' listing used for extraction
    partial_path = tar_path + '.part'
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in tqdm.tqdm(r.iter_content(chunk_size=CHUNK_SIZE)):
                    f.write(chunk)
        # Publish the archive under its final name only after a complete download
        os.replace(partial_path, tar_path)
    finally:
        # After a successful rename the partial file is gone; otherwise clean it up
        if os.path.exists(partial_path):
            os.remove(partial_path)

def extract_tar_files(tar_folder_path, extract_folder_path):
    """
    Extract all tar files in the given folder to the extract folder.

    Every archive is deleted right after it has been extracted. Files that do
    not end with '.tar' are ignored.

    Args:
        tar_folder_path (str): Path to the folder containing tar files.
        extract_folder_path (str): Path to the folder to extract the tar files.

    Returns:
        None
    """
    # List all tar archives in the folder (sorted for a reproducible order)
    tar_files = sorted(f for f in os.listdir(tar_folder_path) if f.endswith('.tar'))

    # The archives come from the network: use the 'data' extraction filter when
    # this Python version provides it, so members cannot escape the target folder.
    supports_filter = hasattr(tarfile, 'data_filter')

    # Extract each tar file to the extract folder
    for tar_file in tar_files:
        tar_path = os.path.join(tar_folder_path, tar_file)
        with tarfile.open(tar_path, 'r') as tar:
            if supports_filter:
                tar.extractall(extract_folder_path, filter='data')
            else:
                tar.extractall(extract_folder_path)
        # Delete the tar file after extraction
        os.remove(tar_path)

def cut_mel_spectrograms_from_folder(folder_path, output_folder, genres_csv_path='assets/genres_one_genre.csv'):
    """
    Cut the mel spectrograms of all listed tracks and sort them into genre folders.

    Every sub-folder of ``folder_path`` (e.g. '00') is scanned for .npy files.
    A file '<sub-folder>/<track>.npy' is processed only if the path
    '<sub-folder>/<track>.mp3' appears in the 'path' column of the genre CSV;
    its cut version is written to '<output_folder>/<genre>/<sub-folder>_<track>.npy'.

    Args:
        folder_path (str): Path to the folder containing the extracted
            sub-folders with mel spectrogram files.
        output_folder (str): Path to save the cut mel spectrograms.
        genres_csv_path (str): CSV file with at least the columns 'path' and
            'genre'.

    Returns:
        None
    """
    # Build the path -> genre lookup once instead of scanning the table per file.
    # keep='first' mirrors taking the first matching row for a duplicated path.
    data = pd.read_csv(genres_csv_path)
    unique_rows = data.drop_duplicates(subset='path', keep='first')
    genre_by_path = dict(zip(unique_rows['path'], unique_rows['genre']))

    # Only real directories are extracted dataset folders; stray files
    # (e.g. hidden OS files) are skipped.
    subfolders = sorted(
        entry for entry in os.listdir(folder_path)
        if os.path.isdir(os.path.join(folder_path, entry))
    )

    for subfolder in subfolders:
        subfolder_path = os.path.join(folder_path, subfolder)
        for file in os.listdir(subfolder_path):
            if not file.endswith('.npy'):
                continue

            # The CSV identifies tracks by '<sub-folder>/<track>.mp3'
            track_name = os.path.splitext(file)[0]
            track_path = f'{subfolder}/{track_name}.mp3'
            genre = genre_by_path.get(track_path)
            if genre is None:
                continue

            # One output folder per genre
            genre_folder = os.path.join(output_folder, genre)
            os.makedirs(genre_folder, exist_ok=True)

            cut_mel_spectrogram_to_30s(
                os.path.join(subfolder_path, file),
                os.path.join(genre_folder, f'{subfolder}_{file}'),
            )


def delete_extracted_folders(extract_folder_path):
    """
    Delete all folders in the given path.

    Each sub-folder is removed together with everything inside it. Plain files
    and symbolic links located directly in ``extract_folder_path`` are left
    alone, and an empty ``extract_folder_path`` is a no-op.

    Args:
        extract_folder_path (str): Path to the folder containing folders to delete.

    Returns:
        None
    """
    # Local import: shutil is not part of the notebook's import cell
    import shutil

    for entry in sorted(os.listdir(extract_folder_path)):
        folder_path = os.path.join(extract_folder_path, entry)
        # rmtree refuses symlinks, and regular files are not extracted folders
        if os.path.islink(folder_path) or not os.path.isdir(folder_path):
            continue

        # Delete the folder with its contents, including nested non-empty folders
        shutil.rmtree(folder_path)
        print("Deleted FOLDER_PATH: ", folder_path)


In [None]:
# Download and extract tar files
tar_folder_path = 'temp_files'
extract_folder_path = 'temp_extracted'

In [None]:
# download_tar_file('raw_30s_melspecs-05.tar', tar_folder_path)
# extract_tar_files(tar_folder_path, extract_folder_path)
# cut_mel_spectrograms_from_folder(extract_folder_path, 'spectograms_data')
# delete_extracted_folders(extract_folder_path)

In [None]:
# Check the lenght of example file in the spectograms_data folder
example_cut = np.load('spectograms_data/pop/00_268400.npy')
example_cut.shape

In [None]:
# viusalize the cut mel spectogram
fig = px.imshow(example_cut)
fig.show()

In [None]:
# Check how long does it take to download 00 tar file, extract it and cut the mel spectograms
start_time = time.time()
download_tar_file('raw_30s_melspecs-00.tar', tar_folder_path)
extract_tar_files(tar_folder_path, extract_folder_path)
cut_mel_spectrograms_from_folder(extract_folder_path, 'spectograms_data')
delete_extracted_folders(extract_folder_path)

print(f"Time taken: {time.time() - start_time:.2f} seconds")

In [None]:
# Repeat the download -> extract -> cut -> clean-up pipeline for the remaining tar files, split over several cells

# Batch 1a - tar files 01-04 (range(1, 5) stops before 5)

for i in range(1, 5):
    tar_file_name = f'raw_30s_melspecs-{i:02d}.tar'
    download_tar_file(tar_file_name, tar_folder_path)
    extract_tar_files(tar_folder_path, extract_folder_path)
    cut_mel_spectrograms_from_folder(extract_folder_path, 'spectograms_data')
    delete_extracted_folders(extract_folder_path)

In [None]:
# Repeat the download -> extract -> cut -> clean-up pipeline for the remaining tar files, split over several cells

# Batch 1b - tar files 06-14 (range(6, 15) stops before 15); file 05 appears only in the commented-out cell above
# NOTE(review): no loop in this notebook covers tar files 15 and 16 - confirm they were processed separately
for i in range(6, 15):
    tar_file_name = f'raw_30s_melspecs-{i:02d}.tar'
    download_tar_file(tar_file_name, tar_folder_path)
    extract_tar_files(tar_folder_path, extract_folder_path)
    cut_mel_spectrograms_from_folder(extract_folder_path, 'spectograms_data')
    delete_extracted_folders(extract_folder_path)

In [None]:
# Repeat the download -> extract -> cut -> clean-up pipeline for the remaining tar files, split over several cells

# Batch 1c - tar files 17-24 (range(17, 25) stops before 25)
# TODO: need to cut mel spectograms from files 00-16 to 1376 frames later
for i in range(17, 25):
    tar_file_name = f'raw_30s_melspecs-{i:02d}.tar'
    download_tar_file(tar_file_name, tar_folder_path)
    extract_tar_files(tar_folder_path, extract_folder_path)
    cut_mel_spectrograms_from_folder(extract_folder_path, 'spectograms_data')
    delete_extracted_folders(extract_folder_path)

In [None]:
# Batch 2 - tar files 25-49

for i in range(25, 50):
    tar_file_name = f'raw_30s_melspecs-{i:02d}.tar'
    download_tar_file(tar_file_name, tar_folder_path)
    extract_tar_files(tar_folder_path, extract_folder_path)
    cut_mel_spectrograms_from_folder(extract_folder_path, 'spectograms_data')
    delete_extracted_folders(extract_folder_path)

In [None]:
# Batch 3 (partial) - tar files 71-74 (range(71, 75) stops before 75)
# NOTE(review): tar files 50-70 are not covered by any loop in this notebook - presumably processed in an earlier run; confirm
for i in range(71, 75):
    tar_file_name = f'raw_30s_melspecs-{i:02d}.tar'
    download_tar_file(tar_file_name, tar_folder_path)
    extract_tar_files(tar_folder_path, extract_folder_path)
    cut_mel_spectrograms_from_folder(extract_folder_path, 'spectograms_data')
    delete_extracted_folders(extract_folder_path)


In [None]:
# Batch 4 - tar files 75-99

for i in range(75, 100):
    tar_file_name = f'raw_30s_melspecs-{i:02d}.tar'
    download_tar_file(tar_file_name, tar_folder_path)
    extract_tar_files(tar_folder_path, extract_folder_path)
    cut_mel_spectrograms_from_folder(extract_folder_path, 'spectograms_data')
    delete_extracted_folders(extract_folder_path)

In [None]:
# Tally how many entries each genre sub-folder of spectograms_data holds
genres = ['pop', 'metal', 'rock', 'hiphop', 'rap', 'indie', 'jazz', 'country']
count_dictionary = {}

for genre in genres:
    # Every directory entry is counted, whatever its extension
    count_dictionary[genre] = len(os.listdir(os.path.join('spectograms_data', genre)))

In [None]:
count_dictionary

In [None]:
sum(count_dictionary.values())

In [None]:
# Cut all of the mel spectograms in spectograms_data to 1376 frames (if they are longer)

def cut_all_mel_spectrograms_to_1376_frames(spectograms_data_path, cut_length=1376):
    """
    Cut all mel spectrograms in the given folder to 1376 frames.

    Each genre sub-folder is scanned for .npy files and every file is replaced
    in place by its centred ``cut_length``-frame segment. Spectrograms shorter
    than ``cut_length`` are left untouched (the cutting helper only prints a
    message for them). Entries that are not .npy files are ignored.

    Args:
        spectograms_data_path (str): Path to the folder containing one
            sub-folder of mel spectrogram files per genre.
        cut_length (int): Number of frames to keep, default is 1376 frames.

    Returns:
        None
    """
    genres = sorted(
        d for d in os.listdir(spectograms_data_path)
        if os.path.isdir(os.path.join(spectograms_data_path, d))
    )
    for genre in genres:
        genre_folder = os.path.join(spectograms_data_path, genre)
        for f in sorted(os.listdir(genre_folder)):
            # Same filter as the other directory listings in this notebook:
            # only .npy files are spectrograms
            if not f.endswith('.npy'):
                continue
            input_file = os.path.join(genre_folder, f)
            # Input and output path are identical: the file is overwritten
            cut_mel_spectrogram_to_30s(input_file, input_file, cut_length=cut_length)

# Cut all mel spectograms to 1376 frames
cut_all_mel_spectrograms_to_1376_frames('spectograms_data')

In [None]:
# Check the shape of random mel spectogram from each genre

genres = [d for d in os.listdir('spectograms_data') if os.path.isdir(os.path.join('spectograms_data', d))]
for genre in genres:
    genre_folder = os.path.join('spectograms_data', genre)
    files = [f for f in os.listdir(genre_folder) if f.endswith('.npy')]
    random_file = random.choice(files)
    random_spectrogram = np.load(os.path.join(genre_folder, random_file))
    print(f"Genre: {genre}, File: {random_file}, Shape: {random_spectrogram.shape}")

In [None]:
# Visualize randomly selected mel spectograms from each genre
spectograms_data_path = 'spectograms_data'

# Function to visualize a randomly selected spectrogram from each genre
def visualize_random_spectrograms(spectograms_data_path):
    """
    Show one randomly chosen spectrogram per genre sub-folder.

    For every directory inside ``spectograms_data_path`` the folder path is
    printed, then a random .npy file from it is loaded and displayed with
    plotly. Folders without .npy files only produce a message.

    Args:
        spectograms_data_path (str): Folder containing one sub-folder per genre.

    Returns:
        None
    """
    genre_names = [
        entry for entry in os.listdir(spectograms_data_path)
        if os.path.isdir(os.path.join(spectograms_data_path, entry))
    ]

    for genre in genre_names:
        genre_folder = os.path.join(spectograms_data_path, genre)
        print(genre_folder)

        # Guard: the folder may have disappeared since it was listed
        if not os.path.exists(genre_folder):
            print(f"Genre folder {genre_folder} does not exist")
            continue

        candidates = [name for name in os.listdir(genre_folder) if name.endswith('.npy')]
        if not candidates:
            print(f"No files found in {genre_folder}")
            continue

        chosen = random.choice(candidates)
        spectrogram = np.load(os.path.join(genre_folder, chosen))
        px.imshow(spectrogram, title=f'Random {genre} Spectrogram').show()

# Visualize randomly selected spectrograms
visualize_random_spectrograms(spectograms_data_path)

In [None]:
# Load the cut mel spectogram
example_cut = np.load('spectograms_data/country/24_1128824.npy')
# Check the shape of the mel spectogram
example_cut.shape

In [None]:
# Check how min and max values of the mel spectogram
example_cut.min(), example_cut.max()
# This is scale in dB, so the values are negative

In [None]:
# Build a CNN model for music genre classification using the MEL spectrogram
# First build a class MelSpectogramDataset to load data for torch DataLoader

class MelSpectogramDataset(Dataset):
    """Dataset of mel spectrogram .npy files stored in one sub-folder per genre.

    Note: this class is redefined in the next notebook cell; once that cell has
    run, the later definition is the one in effect.
    """

    def __init__(self, data_path, transform=None, max_samples_for_class_dict=None):
        """
        Index all .npy files below ``data_path``.

        Args:
            data_path (str): Path to the folder containing the mel spectrogram files
                (one sub-folder per genre).
            transform (callable): Optional transform to be applied to the mel spectrogram.
            max_samples_for_class_dict: Stored on the instance; this
                implementation does not use it for sampling.
        """
        self.data_path = data_path
        self.max_samples_for_class_dict = max_samples_for_class_dict
        self.transform = transform

        # Genre names in directory-listing order; the position is the label id
        self.genres = [
            entry for entry in os.listdir(data_path)
            if os.path.isdir(os.path.join(data_path, entry))
        ]

        self.file_paths = []
        self.labels = []
        for label, genre in enumerate(self.genres):
            genre_folder = os.path.join(data_path, genre)
            for name in os.listdir(genre_folder):
                if name.endswith('.npy'):
                    self.file_paths.append(os.path.join(genre_folder, name))
                    self.labels.append(label)

    def __len__(self):
        """Return the number of indexed spectrogram files."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load the spectrogram at ``idx`` and return ``(spectrogram, label)``."""
        # Tensor indices are converted to plain Python values first
        index = idx.tolist() if torch.is_tensor(idx) else idx

        spectrogram = np.load(self.file_paths[index])
        if self.transform:
            spectrogram = self.transform(spectrogram)

        return spectrogram, self.labels[index]




In [None]:
class MelSpectogramDataset(Dataset):
    """Dataset of mel spectrogram .npy files with optional per-genre undersampling.

    Genres and files are indexed in sorted order, so the genre -> label mapping
    and the sample order do not depend on the (arbitrary) order in which the
    operating system lists directory entries.
    """

    def __init__(self, data_path, transform=None, max_samples_dict=None, seed=None):
        """
        Initialize the MelSpectogramDataset class.

        Args:
            data_path (str): Path to the folder containing the mel spectrogram files
                (one sub-folder per genre).
            transform (callable): Optional transform to be applied to the mel spectrogram.
            max_samples_dict (dict): Dictionary specifying the maximum number of samples per genre.
                                     For example: {'rock': 1000, 'pop': 600}.
            seed (int | None): Seed for the undersampling. With None the global
                NumPy random state is used, so ``np.random.seed`` still controls
                the selection.
        """

        self.data_path = data_path
        self.transform = transform
        self.max_samples_dict = max_samples_dict
        # Sorted so that label ids are stable across machines and runs
        self.genres = sorted(
            d for d in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, d))
        )
        self.file_paths = []
        self.labels = []

        # Dedicated generator when a seed is given; global NumPy state otherwise
        rng = np.random if seed is None else np.random.RandomState(seed)

        for i, genre in enumerate(self.genres):
            genre_folder = os.path.join(data_path, genre)
            files = sorted(f for f in os.listdir(genre_folder) if f.endswith('.npy'))

            # Undersample only genres that have a limit and exceed it
            if max_samples_dict and genre in max_samples_dict:
                max_samples = max_samples_dict[genre]
                if len(files) > max_samples:
                    chosen = rng.choice(files, size=max_samples, replace=False).tolist()
                    files = sorted(chosen)

            self.file_paths.extend([os.path.join(genre_folder, f) for f in files])
            self.labels.extend([i] * len(files))

    def __len__(self):
        """Return the number of indexed spectrogram files."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load the spectrogram at ``idx`` and return ``(spectrogram, label)``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        mel_spectrogram = np.load(self.file_paths[idx])
        label = self.labels[idx]

        if self.transform:
            mel_spectrogram = self.transform(mel_spectrogram)

        return mel_spectrogram, label


In [None]:
# Build class CNNModel for CNN model for music genre classification
import torch.nn as nn

class CNNModel(nn.Module):
    """CNN classifier for single-channel mel spectrogram images.

    Four blocks of Conv2d(3x3, padding 1) -> ReLU -> MaxPool2d(2) are followed
    by two fully connected layers. The number of channels doubles in every
    block, starting from ``num_filters``.
    """

    def __init__(self, num_classes, num_filters, input_shape=(96, 1376)):
        """
        Args:
            num_classes (int): Number of output classes (size of the logits).
            num_filters (int): Number of channels produced by the first conv
                block; later blocks use 2x, 4x and 8x this value.
            input_shape (tuple): (height, width) of the input spectrograms.
                The default (96, 1376) yields the 6 x 86 feature map the
                first linear layer was originally sized for.

        Raises:
            ValueError: If the input is too small to survive four 2x poolings.
        """
        super().__init__()

        # The 3x3 convs with padding 1 keep the spatial size; each of the four
        # 2x2 max-pools halves it (rounding down).
        height, width = input_shape
        for _ in range(4):
            height //= 2
            width //= 2
        if height < 1 or width < 1:
            raise ValueError(
                f"input_shape {tuple(input_shape)} is too small for four 2x2 pooling layers"
            )

        self.conv1 = self._conv_block(1, num_filters)
        self.conv2 = self._conv_block(num_filters, num_filters * 2)
        self.conv3 = self._conv_block(num_filters * 2, num_filters * 4)
        self.conv4 = self._conv_block(num_filters * 4, num_filters * 8)
        self.Flatten = nn.Flatten()
        self.linear1 = nn.Linear(num_filters * 8 * height * width, 256)
        # NOTE(review): there is no activation between linear1 and linear2, so
        # the two layers form a single linear map. Left unchanged to keep the
        # outputs of already trained weights identical - confirm whether a
        # non-linearity was intended.
        self.linear2 = nn.Linear(256, num_classes)

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Build one Conv2d -> ReLU -> MaxPool2d block."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

    def forward(self, x):
        """Map a batch of shape (batch, 1, height, width) to class logits."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.Flatten(x)
        x = self.linear1(x)
        x = self.linear2(x)
        return x

In [None]:
# Render model summary

from torchsummary import summary

model = CNNModel(num_classes=8, num_filters=16)
summary(model, (1, 96, 1376))

In [None]:
model = CNNModel(num_classes=8, num_filters=40)
summary(model, (1, 96, 1376))

In [None]:
model.eval()

In [None]:
# Example use for MelSpectogramDataset and DataLoader
# Create a MelSpectogramDataset instance
mel_dataset = MelSpectogramDataset(data_path='spectograms_data')

# Create a DataLoader instance
mel_loader = DataLoader(mel_dataset, batch_size=32, shuffle=True)

# Iterate over the DataLoader
for i, (mel_spectrogram, label) in enumerate(mel_loader):
    print(f"Batch {i + 1}:")
    print("Shape of mel_spectrogram:", mel_spectrogram.shape)
    print("Labels:", label)
    break

In [None]:
# Print one instance of the mel spectogram dataset (with the file name)

mel_spectrogram, label = mel_dataset[1]
print("Shape of mel_spectrogram:", mel_spectrogram.shape)
print("Label:", label)

In [None]:
mel_spectrogram, label = mel_dataset[12000]
print(mel_dataset.file_paths[12000])
print(mel_dataset.labels[12000])
print(mel_dataset.genres[label])

In [None]:
# Print genres with corresponding label
# Label ids are the positions in mel_dataset.genres, so derive them from the
# dataset instead of hard-coding eight ids.
labels = list(range(len(mel_dataset.genres)))

for label in labels:
    print(f"Label: {label}, Genre: {mel_dataset.genres[label]}")

In [None]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

def calculate_metrics(y_true, y_pred_probs, num_classes):
    """
    Calculate AUC, precision, recall, and F1 score for multiclass classification.

    All scores are macro-averaged. The AUC is computed one-vs-rest from the
    score matrix; the other metrics use the arg-max class of each row.

    Args:
        y_true (array-like): True labels, expected to be ids in
            ``range(num_classes)``.
        y_pred_probs (array-like): Predicted class probabilities with one
            column per class (n_samples, num_classes).
        num_classes (int): Number of classes in the classification task.

    Returns:
        dict: A dictionary containing AUC, precision, recall, and F1 scores.
    """
    # Accept plain lists as well as arrays
    y_true = np.asarray(y_true)
    y_pred_probs = np.asarray(y_pred_probs)

    # Convert predicted probabilities to class predictions
    y_pred = y_pred_probs.argmax(axis=1)

    # Tell roc_auc_score explicitly which class each score column belongs to
    class_labels = np.arange(num_classes)

    # Calculate metrics
    metrics = {
        "AUC": roc_auc_score(
            y_true, y_pred_probs, multi_class="ovr", average="macro", labels=class_labels
        ),
        "Precision": precision_score(y_true, y_pred, average="macro", zero_division=1),
        "Recall": recall_score(y_true, y_pred, average="macro"),
        "F1 Score": f1_score(y_true, y_pred, average="macro")
    }
    return metrics


In [None]:
from sklearn.metrics import classification_report
def per_class_metrics(y_true, y_pred, num_classes):
    """
    Prints classification metrics for each class.

    The report always contains one row per class id in ``range(num_classes)``,
    named "Class 0", "Class 1", ..., even if a class occurs in neither
    ``y_true`` nor ``y_pred``.

    Args:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted labels.
        num_classes (int): Number of classes in the classification task.
    """
    class_ids = list(range(num_classes))
    report = classification_report(
        y_true,
        y_pred,
        # Pin the label set so it always matches the number of target names
        labels=class_ids,
        zero_division=1,
        target_names=[f"Class {i}" for i in class_ids]
    )
    print(report)

In [None]:
# Define class EarlyStopping for early stopping in the training loop
import copy

class EarlyStopping:
    """Stop training once the validation loss has stopped improving.

    Call the instance after every epoch with the model and its validation
    loss; it returns True when training should stop. ``status`` holds a short
    description of the most recent decision.
    """

    def __init__(self, patience=7, min_delta=0, restore_best_weights=True):
        """
        Args:
            patience (int): Number of consecutive non-improving calls that
                triggers the stop.
            min_delta (float): Margin by which the loss must drop below the
                best loss to count as an improvement.
            restore_best_weights (bool): Load the best recorded weights into
                the model when stopping.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_model = None
        self.best_loss = None
        self.counter = 0
        self.status = ""

    def _remember(self, model, val_loss):
        """Snapshot the model weights together with their validation loss."""
        self.best_model = copy.deepcopy(model.state_dict())
        self.best_loss = val_loss

    def __call__(self, model, val_loss):
        """Record ``val_loss`` and return True if training should stop."""
        # First call: just take the baseline
        if self.best_model is None:
            self._remember(model, val_loss)
            return False

        # Improvement by more than min_delta: new best, start counting again
        if val_loss < self.best_loss - self.min_delta:
            self._remember(model, val_loss)
            self.counter = 0
            self.status = f"Improvement found, reset counter to {self.counter}"
            return False

        self.counter += 1
        self.status = f"No improvement, counter {self.counter}"
        if self.counter < self.patience:
            return False

        # Patience exhausted
        self.status = f"Stopping early, counter {self.counter}"
        if self.restore_best_weights:
            model.load_state_dict(self.best_model)
        return True



In [None]:
# Create instance of MelSpectogramDataset with undersampling for pop, rock and hiphop genres max 1000 samples
max_samples_dict = {
    "pop": 1000,
    "rock": 1000,
    "hiphop": 1000
}

# Create dataset instance
dataset = MelSpectogramDataset(
    data_path="spectograms_data",
    transform=None,
    max_samples_dict=max_samples_dict
)

# Use DataLoader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Iterate over batches
for mel_spectrogram, label in dataloader:
    print(mel_spectrogram.shape, label)

In [None]:
# Count the number of samples for each genre

genre_counts = {genre: 0 for genre in mel_dataset.genres}
for label in mel_dataset.labels:
    genre_counts[mel_dataset.genres[label]] += 1

genre_counts

In [None]:
mel_dataset.genres

In [None]:
# StratifiedKFold cross-validation for CNN model (10 splits)
from sklearn.model_selection import StratifiedKFold

# Define the number of splits for KFold cross-validation
n_splits = 10

# Model size used for every fold.
# NOTE(review): the original call omitted CNNModel's required num_filters
# argument; 16 matches the first model-summary cell - adjust if another width
# was intended.
cv_num_classes = 8
cv_num_filters = 16

# Create a KFold instance
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Set fixed random number of seed
torch.manual_seed(42)

# Make use of MPS (Apple) or GPU if one is actually usable on this machine.
# is_available() is checked instead of is_built(): a PyTorch build with MPS
# support can run on a machine without an MPS device.
device = (
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print("Using device:", device)

# The per-fold checkpoints are written here
os.makedirs("models", exist_ok=True)

# Iterate over the KFold splits and train the CNN model
for i, (train_index, test_index) in enumerate(kf.split(mel_dataset.file_paths, mel_dataset.labels)):
    print(f"Fold {i + 1}:")

    # Create a CNN model instance
    model = CNNModel(num_classes=cv_num_classes, num_filters=cv_num_filters).to(device)

    # Define the loss function and optimizer with early stopping
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    es = EarlyStopping()

    # The fold indices refer to mel_dataset, so both loaders sample from that
    # same instance instead of re-scanning the data folder for every fold.
    train_loader = DataLoader(mel_dataset, batch_size=32,
                              sampler=torch.utils.data.SubsetRandomSampler(train_index))
    test_loader = DataLoader(mel_dataset, batch_size=32,
                             sampler=torch.utils.data.SubsetRandomSampler(test_index))

    # Train the CNN model
    for epoch in range(5):
        model.train()
        running_loss = 0.0
        train_progress = tqdm.tqdm(train_loader, desc=f"Training Epoch {epoch + 1}", leave=False)

        for mel_spectrogram, label in train_progress:
            # Cast to float32 before moving: MPS cannot hold float64 tensors
            mel_spectrogram, label = mel_spectrogram.float().to(device), label.to(device)
            optimizer.zero_grad()
            # unsqueeze(1) adds the channel dimension expected by Conv2d
            output = model(mel_spectrogram.unsqueeze(1))
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()

            # Update progress bar
            running_loss += loss.item()
            train_progress.set_postfix({"Loss": f"{running_loss / (train_progress.n + 1):.4f}"})

        # Evaluate the CNN model on the held-out fold
        model.eval()
        correct, total = 0, 0
        val_loss_sum = 0.0
        all_labels, all_probs = [], []
        test_progress = tqdm.tqdm(test_loader, desc=f"Testing Epoch {epoch + 1}", leave=False)

        with torch.no_grad():
            for mel_spectrogram, label in test_progress:
                mel_spectrogram, label = mel_spectrogram.float().to(device), label.to(device)
                output = model(mel_spectrogram.unsqueeze(1))

                # Accumulate the sample-weighted loss so the epoch loss is a true mean
                loss = criterion(output, label)
                val_loss_sum += loss.item() * label.size(0)
                test_progress.set_postfix({"Loss": f"{loss.item():.4f}"})

                all_probs.append(torch.softmax(output, dim=1).cpu().numpy())
                all_labels.append(label.cpu().numpy())

                predicted = output.argmax(dim=1)
                total += label.size(0)
                correct += (predicted == label).sum().item()

        accuracy = 100 * correct / total
        val_loss = val_loss_sum / total

        # Concatenate all predictions and true labels
        all_labels = np.concatenate(all_labels)
        all_probs = np.concatenate(all_probs)
        y_pred = all_probs.argmax(axis=1)

        # Per-class metrics
        print(f"Fold {i + 1}, Epoch {epoch + 1}:")
        print("PER CLASS METRICS")
        per_class_metrics(all_labels, y_pred, num_classes=cv_num_classes)
        print("OVERALL METRICS")

        # Calculate metrics
        metrics = calculate_metrics(all_labels, all_probs, num_classes=cv_num_classes)
        print(f"Epoch {epoch + 1}: Accuracy: {accuracy:.2f}% | Metrics: {metrics}")

        # Early stopping on the mean loss of the whole evaluation pass, not on
        # the loss of whichever batch happened to come last.
        # NOTE(review): the same fold is used for early stopping and for the
        # reported metrics - confirm that this is acceptable for the evaluation.
        if es(model, val_loss):
            print("Early stopping")
            break

    # Save the model
    torch.save(model.state_dict(), f"models/model_fold_{i + 1}.pt")

In [None]:
from sklearn.metrics import f1_score

def calculate_f1_score(y_true, y_pred, average='macro'):
    """
    Compute the F1 score of a set of predictions.

    Thin wrapper that forwards its arguments unchanged to
    ``sklearn.metrics.f1_score``.

    Args:
        y_true (array-like): Ground-truth class labels.
        y_pred (array-like): Class labels predicted by the model.
        average (str): Averaging mode passed to ``f1_score``
            ('micro', 'macro', 'weighted', or None). Defaults to 'macro'.

    Returns:
        The value returned by ``f1_score`` for the requested averaging mode.
    """
    score = f1_score(y_true, y_pred, average=average)
    return score

In [None]:
import optuna 
from sklearn.model_selection import train_test_split

def objective(trial):
    """
    Optuna objective: train a CNNModel for a few epochs and score it by F1.

    The trial suggests the learning rate, batch size and number of filters.
    The dataset indices are split 90/10 (stratified on ``mel_dataset.labels``);
    after every epoch the F1 score on the 10% split is reported to Optuna so
    that unpromising trials can be pruned.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.

    Returns:
        The "F1 Score" entry of ``calculate_metrics`` after the last epoch.

    Raises:
        optuna.exceptions.TrialPruned: When Optuna asks to prune the trial.
    """
    # Hyperparameters to optimize
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    num_filters = trial.suggest_int("num_filters", 8, 64, step=8)

    # Create a MelSpectogramDataset instance
    mel_dataset = MelSpectogramDataset(data_path='spectograms_data')

    # Stratified 90/10 split over dataset indices
    train_idx, test_idx = train_test_split(
        np.arange(len(mel_dataset)),
        test_size=0.1,
        stratify=mel_dataset.labels,
        random_state=42
    )

    # Both loaders wrap the same dataset; the samplers restrict each one
    # to its own index subset.
    train_loader = DataLoader(
        mel_dataset,
        batch_size=batch_size,
        sampler=torch.utils.data.SubsetRandomSampler(train_idx)
    )
    test_loader = DataLoader(
        mel_dataset,
        batch_size=batch_size,
        sampler=torch.utils.data.SubsetRandomSampler(test_idx)
    )

    # Fixed seed, set before the model is created
    torch.manual_seed(42)

    # Prefer Apple MPS, then CUDA, then CPU. `is_available()` is used instead
    # of `is_built()`: a build with MPS support does not guarantee that an MPS
    # device can actually be used on this machine.
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    print("Using device:", device)

    # Define the model with dynamic num_filters
    model = CNNModel(num_classes=8, num_filters=num_filters).to(device)

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        train_progress = tqdm.tqdm(train_loader, desc=f"Training Epoch {epoch + 1}", leave=False)

        for mel_spectrogram, label in train_progress:
            mel_spectrogram, label = mel_spectrogram.to(device).float(), label.to(device)
            optimizer.zero_grad()
            # unsqueeze(1) inserts a new dimension at position 1 before the forward pass
            output = model(mel_spectrogram.unsqueeze(1))
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()

            # Update progress bar with the running mean loss
            running_loss += loss.item()
            train_progress.set_postfix({"Loss": f"{running_loss / (train_progress.n + 1):.4f}"})

        # Evaluate the CNN model on the held-out 10%
        model.eval()
        all_labels, all_probs = [], []
        test_progress = tqdm.tqdm(test_loader, desc=f"Testing Epoch {epoch + 1}", leave=False)

        with torch.no_grad():
            for mel_spectrogram, label in test_progress:
                mel_spectrogram, label = mel_spectrogram.to(device).float(), label.to(device)
                output = model(mel_spectrogram.unsqueeze(1))
                probabilities = nn.Softmax(dim=1)(output).cpu().numpy()
                all_probs.append(probabilities)
                all_labels.append(label.cpu().numpy())

        # Concatenate all predictions and true labels
        all_labels = np.concatenate(all_labels)
        all_probs = np.concatenate(all_probs)

        # Calculate metrics
        f1 = calculate_metrics(all_labels, all_probs, num_classes=8)["F1 Score"]

        trial.report(f1, epoch)

        # Prune if needed based on the reported F1 score
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        # Optionally, print the F1 score for each epoch
        print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f}")

    # Return the F1 score after the training loop
    return f1

In [None]:
# Optuna study
import csv
import io

study_name = "cnn_optimization_v1"
# Free-text notes on the dataset and hyperparameters used for this study
study_comment = "Imbalanced full dataset. Optimizing learning rate, batch size, and number of filters for the CNN model. No cross-validation used. 90-10 split for training and testing."
study_comment_file = "study_comment.csv"

# Save name and comment to a CSV file.
# The comment itself contains commas, so the rows are produced by csv.writer,
# which quotes such fields; writing them raw would split the comment across
# several columns.
rows_buffer = io.StringIO()
csv.writer(rows_buffer, lineterminator="\n").writerows(
    [["Study Name", "Study Comment"], [study_name, study_comment]]
)
with open(study_comment_file, "w") as f:
    # No trailing newline: later rows are appended to this file as "\n<row>".
    f.write(rows_buffer.getvalue().rstrip("\n"))

# Create (or reload) the Optuna study stored in the local SQLite database
study = optuna.create_study(direction="maximize", storage="sqlite:///db.sqlite3", study_name=study_name, load_if_exists=True)
study.optimize(objective, n_trials=10)

# Best trial
print("Best trial:")
print(f"F1 Score: {study.best_trial.value}")
print(f"Params: {study.best_trial.params}")

In [None]:
# Five-epoch train/evaluate run on the notebook-level `model`, `optimizer`,
# `criterion`, `train_loader`, `test_loader` and `device`.
for epoch in range(5):
    model.train()
    running_loss = 0.0
    train_progress = tqdm.tqdm(train_loader, desc=f"Training Epoch {epoch + 1}", leave=False)

    for mel_spectrogram, label in train_progress:
        mel_spectrogram, label = mel_spectrogram.to(device).float(), label.to(device)
        optimizer.zero_grad()
        output = model(mel_spectrogram.unsqueeze(1))
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        # Update progress bar with the running mean loss
        running_loss += loss.item()
        train_progress.set_postfix({"Loss": f"{running_loss / (train_progress.n + 1):.4f}"})

    # Evaluate the CNN model
    model.eval()
    correct, total = 0, 0
    all_labels, all_probs = [], []
    all_labels_per_class, all_probs_per_class = [], []
    test_progress = tqdm.tqdm(test_loader, desc=f"Testing Epoch {epoch + 1}", leave=False)

    with torch.no_grad():
        for mel_spectrogram, label in test_progress:
            mel_spectrogram, label = mel_spectrogram.to(device).float(), label.to(device)
            output = model(mel_spectrogram.unsqueeze(1))
            probabilities = nn.Softmax(dim=1)(output).cpu().numpy()
            all_probs.append(probabilities)
            all_labels.append(label.cpu().numpy())
            all_labels_per_class.extend(label.cpu().numpy())
            all_probs_per_class.extend(output.cpu().numpy())
            _, predicted = torch.max(output.data, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()

            # Loss of the current batch, shown on the progress bar
            loss = criterion(output, label)
            test_progress.set_postfix({"Loss": f"{loss.item():.4f}"})

    # Accuracy over the whole evaluation pass
    accuracy = 100 * correct / total

    # Concatenate all predictions and true labels
    all_labels = np.concatenate(all_labels)
    all_probs = np.concatenate(all_probs)

    all_labels_per_class = np.array(all_labels_per_class)
    all_probs_per_class = np.array(all_probs_per_class)
    y_pred = all_probs_per_class.argmax(axis=1)

    # Per-class metrics. The header no longer mentions a fold: this cell has
    # no fold loop, so `i` would only be a leftover from another cell.
    print(f"Epoch {epoch + 1}:")
    print("PER CLASS METRICS")
    per_class_metrics(all_labels, y_pred, num_classes=8)
    print("OVERALL METRICS")

    # Calculate and report the overall metrics announced by the header above
    metrics = calculate_metrics(all_labels, all_probs, num_classes=8)
    print(f"Epoch {epoch + 1}: Accuracy: {accuracy:.2f}% | Metrics: {metrics}")

In [None]:
# Persist the weights of the model currently held in `model`
best_model_path = "models/best_model_v1.pth"
torch.save(model.state_dict(), best_model_path)

In [None]:
from torch.utils.data import WeightedRandomSampler

# Training the model on best parameters + weighted random sampler on whole DS

# Define the model with the best parameters found by the study
best_params = study.best_params
model = CNNModel(num_classes=8, num_filters=best_params["num_filters"]).to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=best_params["learning_rate"])

# Create dataset instance
mel_dataset = MelSpectogramDataset(data_path='spectograms_data')

# Split the dataset into train, validation and test with 90-5-5 split.
# The last split takes the remainder so the three sizes always sum to the
# dataset length.
train_size = int(0.9 * len(mel_dataset))
val_size = (len(mel_dataset) - train_size) // 2
test_size = len(mel_dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    mel_dataset, [train_size, val_size, test_size], generator=torch.Generator().manual_seed(42)  # For reproducibility
)

# Handle the imbalanced dataset with WeightedRandomSampler

# Get train dataset labels.
# NOTE(review): this iterates the whole training subset, so every item is
# fetched once just to read its label.
train_labels = [label for _, label in train_dataset]

# Get train dataset label count for each class
class_counts = np.bincount(train_labels)

# Each sample is weighted by the inverse frequency of its class
class_weights = 1.0 / class_counts
weights = [class_weights[label] for label in train_labels]

# Create a WeightedRandomSampler that draws one epoch's worth of samples
sampler = WeightedRandomSampler(weights, num_samples=len(train_dataset))

# Create DataLoader instances. The batch size is part of the tuned
# hyperparameters, so it is taken from best_params rather than hard-coded.
tuned_batch_size = best_params["batch_size"]
train_loader = DataLoader(train_dataset, batch_size=tuned_batch_size, sampler=sampler)
val_loader = DataLoader(val_dataset, batch_size=tuned_batch_size)
test_loader = DataLoader(test_dataset, batch_size=tuned_batch_size)

In [None]:
# Show the raw per-class counts, then one readable line per class index
print(class_counts)
labels = list(range(8))
genre_names = train_dataset.dataset.genres
for i in labels:
    genre, count = genre_names[i], class_counts[i]
    print(f"Class {i}, Genre {genre} :{count} samples")

In [None]:
# Plain training loop without early stopping: each epoch trains on
# `train_loader` and then reports metrics on `val_loader`.
num_epochs = 6

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    train_progress = tqdm.tqdm(train_loader, desc=f"Training Epoch {epoch + 1}", leave=False)

    for mel_spectrogram, label in train_progress:
        mel_spectrogram, label = mel_spectrogram.to(device).float(), label.to(device)
        optimizer.zero_grad()
        output = model(mel_spectrogram.unsqueeze(1))
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        # Update progress bar with the running mean loss
        running_loss += loss.item()
        train_progress.set_postfix({"Loss": f"{running_loss / (train_progress.n + 1):.4f}"})

    # Evaluate the CNN model on the validation set
    model.eval()
    correct, total = 0, 0
    all_labels, all_probs = [], []
    all_labels_per_class, all_probs_per_class = [], []
    test_progress = tqdm.tqdm(val_loader, desc=f"Testing Epoch {epoch + 1}", leave=False)

    with torch.no_grad():
        for mel_spectrogram, label in test_progress:
            mel_spectrogram, label = mel_spectrogram.to(device).float(), label.to(device)
            output = model(mel_spectrogram.unsqueeze(1))
            probabilities = nn.Softmax(dim=1)(output).cpu().numpy()
            all_probs.append(probabilities)
            all_labels.append(label.cpu().numpy())
            all_labels_per_class.extend(label.cpu().numpy())
            all_probs_per_class.extend(output.cpu().numpy())
            _, predicted = torch.max(output.data, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()

            # Loss of the current batch, shown on the progress bar
            loss = criterion(output, label)
            test_progress.set_postfix({"Loss": f"{loss.item():.4f}"})

    # Validation accuracy for this epoch. It must be computed here: without
    # it the summary line below would print a value left over from an
    # earlier cell (or fail if none exists).
    accuracy = 100 * correct / total

    # Concatenate all predictions and true labels
    all_labels = np.concatenate(all_labels)
    all_probs = np.concatenate(all_probs)

    all_labels_per_class = np.array(all_labels_per_class)
    all_probs_per_class = np.array(all_probs_per_class)
    y_pred = all_probs_per_class.argmax(axis=1)

    # Per-class metrics
    print(f"Epoch {epoch + 1}:")
    print("PER CLASS METRICS")
    per_class_metrics(all_labels, y_pred, num_classes=8)
    print("OVERALL METRICS")

    # Calculate metrics
    metrics = calculate_metrics(all_labels, all_probs, num_classes=8)
    print(f"Epoch {epoch + 1}: Accuracy: {accuracy:.2f}% | Metrics: {metrics}")

In [None]:
# Write the current model weights to disk
weighted_model_path = "models/best_model_v2_weighted.pth"
torch.save(model.state_dict(), weighted_model_path)

In [None]:
# Final evaluation on the held-out test split.

# Rebuild the architecture and restore the weights saved as
# best_model_v2_weighted.pth
test_model = CNNModel(num_classes=8, num_filters=best_params["num_filters"]).to(device)
saved_state = torch.load("models/best_model_v2_weighted.pth")
test_model.load_state_dict(saved_state)

# Loader over the test split
test_loader = DataLoader(test_dataset, batch_size=32)

test_model.eval()

with torch.no_grad():
    correct, total = 0, 0
    all_labels, all_probs = [], []
    all_labels_per_class, all_probs_per_class = [], []
    test_progress = tqdm.tqdm(test_loader, desc="Testing", leave=False)

    for mel_spectrogram, label in test_progress:
        mel_spectrogram = mel_spectrogram.to(device).float()
        label = label.to(device)
        output = test_model(mel_spectrogram.unsqueeze(1))

        # Softmax probabilities feed the overall metrics, raw outputs the argmax
        probabilities = nn.Softmax(dim=1)(output).cpu().numpy()
        all_probs.append(probabilities)
        all_labels.append(label.cpu().numpy())
        all_labels_per_class.extend(label.cpu().numpy())
        all_probs_per_class.extend(output.cpu().numpy())

        # Running accuracy counters
        _, predicted = torch.max(output.data, 1)
        total += label.size(0)
        correct += (predicted == label).sum().item()

        # Batch loss, displayed on the progress bar
        loss = criterion(output, label)
        test_progress.set_postfix({"Loss": f"{loss.item():.4f}"})

    # Flatten the per-batch arrays into one array each
    all_labels = np.concatenate(all_labels)
    all_probs = np.concatenate(all_probs)

    all_labels_per_class = np.array(all_labels_per_class)
    all_probs_per_class = np.array(all_probs_per_class)
    y_pred = all_probs_per_class.argmax(axis=1)

    # Per-class report
    print("PER CLASS METRICS")
    per_class_metrics(all_labels, y_pred, num_classes=8)

    # Overall report
    metrics = calculate_metrics(all_labels, all_probs, num_classes=8)
    print("OVERALL METRICS")

    print(f"Accuracy: {100 * correct / total:.2f}% | Metrics: {metrics}")
    

In [None]:
# Plot the confusion matrix with annotations on test set
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Calculate the confusion matrix.
# `labels` pins the matrix to all 8 classes. Without it, a class that appears
# in neither the true labels nor the predictions is dropped, the matrix gets
# smaller than 8x8, and the fixed-size annotation loop below fails.
conf_matrix = confusion_matrix(all_labels, y_pred, labels=list(range(8)))

# Plot the confusion matrix with annotations
plt.figure(figsize=(10, 8))
plt.imshow(conf_matrix, cmap="Blues")
plt.colorbar()
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.xticks(range(8), mel_dataset.genres, rotation=45)
plt.yticks(range(8), mel_dataset.genres)
# Rows are true classes, columns are predicted classes; text x is the column
for i in range(8):
    for j in range(8):
        plt.text(j, i, conf_matrix[i, j], ha="center", va="center", color="black")
plt.show()

In [None]:
# Row-normalized confusion matrix: every row is divided by its own total
import seaborn as sns

row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_norm = conf_matrix / row_totals

# Heatmap with the genre names on both axes
genre_names = mel_dataset.genres
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix_norm,
    cmap="Blues",
    annot=True,
    fmt=".2f",
    xticklabels=genre_names,
    yticklabels=genre_names,
)
plt.title("Normalized Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

In [None]:
# Pop and hiphop are overwhelming the rest of the classes because of the dataset imbalance.
# Let's try to undersample the over-represented genres and train again.
# max_samples_dict in the objective below caps pop and rock at 2000 samples each; change it there to undersample other genres (e.g. hiphop).


import optuna 
from sklearn.model_selection import train_test_split

def objective(trial):
    """
    Optuna objective with stratified split, undersampling and weighted sampling.

    Pop and rock are capped at 2000 samples each through ``max_samples_dict``.
    The dataset is split 90/5/5 with stratification (only the train and
    validation parts are used here), a WeightedRandomSampler draws the
    training batches, and after every epoch the validation F1 score is
    reported to Optuna so that the trial can be pruned.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.

    Returns:
        The "F1 Score" entry of ``calculate_metrics`` after the last epoch.

    Raises:
        optuna.exceptions.TrialPruned: When Optuna asks to prune the trial.
    """
    from torch.utils.data import WeightedRandomSampler

    # Hyperparameters to optimize
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    num_filters = trial.suggest_int("num_filters", 8, 64, step=8)

    # max_samples_dict to undersample pop and rock genres
    max_samples_dict = {
        "pop": 2000,
        "rock": 2000,
    }

    mel_dataset = MelSpectogramDataset(data_path='spectograms_data', max_samples_dict=max_samples_dict)

    # Split the dataset into train, validation and test with 90-5-5 split.
    # Index positions are split instead of file paths: the stratified split
    # with the same random_state selects the same samples, but no per-path
    # `list.index` lookup is needed (which is quadratic overall and would map
    # duplicate paths to their first occurrence).
    labels = mel_dataset.labels
    all_indices = np.arange(len(mel_dataset.file_paths))

    train_indices, temp_indices, _, temp_labels = train_test_split(
        all_indices, labels, test_size=0.1, random_state=42, stratify=labels
    )

    # Second half of the temporary split is the test part, unused in the study
    val_indices, _ = train_test_split(
        temp_indices, test_size=0.5, random_state=42, stratify=temp_labels
    )

    # Create Datasets for each split
    train_dataset = Subset(mel_dataset, train_indices.tolist())
    val_dataset = Subset(mel_dataset, val_indices.tolist())

    # Handle the imbalanced dataset with WeightedRandomSampler

    # Get train dataset labels.
    # NOTE(review): this iterates the whole training subset on every trial,
    # fetching each item just to read its label.
    train_labels = [label for _, label in train_dataset]

    # Get train dataset label count for each class
    class_counts = np.bincount(train_labels)

    # Each sample is weighted by the inverse frequency of its class
    class_weights = 1.0 / class_counts
    weights = [class_weights[label] for label in train_labels]

    # Create a WeightedRandomSampler
    sampler = WeightedRandomSampler(weights, num_samples=len(train_dataset))

    # Create DataLoader instances
    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Fixed seed, set before the model is created
    torch.manual_seed(42)

    # Prefer Apple MPS, then CUDA, then CPU. `is_available()` is used instead
    # of `is_built()`: a build with MPS support does not guarantee that an MPS
    # device can actually be used on this machine.
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    print("Using device:", device)

    # Define the model with dynamic num_filters
    model = CNNModel(num_classes=8, num_filters=num_filters).to(device)

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        train_progress = tqdm.tqdm(train_loader, desc=f"Training Epoch {epoch + 1}", leave=False)

        for mel_spectrogram, label in train_progress:
            mel_spectrogram, label = mel_spectrogram.to(device).float(), label.to(device)
            optimizer.zero_grad()
            output = model(mel_spectrogram.unsqueeze(1))
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()

            # Update progress bar with the running mean loss
            running_loss += loss.item()
            train_progress.set_postfix({"Loss": f"{running_loss / (train_progress.n + 1):.4f}"})

        # Evaluate the CNN model on the validation split
        model.eval()
        all_labels, all_probs = [], []
        test_progress = tqdm.tqdm(val_loader, desc=f"Testing Epoch {epoch + 1}", leave=False)

        with torch.no_grad():
            for mel_spectrogram, label in test_progress:
                mel_spectrogram, label = mel_spectrogram.to(device).float(), label.to(device)
                output = model(mel_spectrogram.unsqueeze(1))
                probabilities = nn.Softmax(dim=1)(output).cpu().numpy()
                all_probs.append(probabilities)
                all_labels.append(label.cpu().numpy())

        # Concatenate all predictions and true labels
        all_labels = np.concatenate(all_labels)
        all_probs = np.concatenate(all_probs)

        # Calculate metrics
        f1 = calculate_metrics(all_labels, all_probs, num_classes=8)["F1 Score"]

        trial.report(f1, epoch)

        # Prune if needed based on the reported F1 score
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        # Optionally, print the F1 score for each epoch
        print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f}")

    # Return the F1 score after the training loop
    return f1


In [None]:
# Optuna study
import csv
import io

study_name = "cnn_optimization_v3_undersampled_weighted_stratified"
# Free-text notes on the dataset and hyperparameters used for this study
study_comment = "Undersampled pop and rock to 2000 samples. Optimizing learning rate, batch size, and number of filters for the CNN model. No cross-validation used. 90-5-5 split for training and validation and testing. Stratified split used. WeightedRandomSampler used for training."
study_comment_file = "study_comment.csv"

# Append a new line with study name and comment.
# The comment contains commas, so the row is produced by csv.writer, which
# quotes such fields; written raw, the comment would spill into extra columns.
row_buffer = io.StringIO()
csv.writer(row_buffer, lineterminator="\n").writerow([study_name, study_comment])
with open(study_comment_file, "a") as f:
    # Leading newline, no trailing one: same layout as the existing rows
    f.write("\n" + row_buffer.getvalue().rstrip("\n"))

# Create (or reload) the Optuna study stored in the local SQLite database
study = optuna.create_study(direction="maximize", storage="sqlite:///db.sqlite3", study_name=study_name, load_if_exists=True)
study.optimize(objective, n_trials=10)

# Best trial
print("Best trial:")
print(f"F1 Score: {study.best_trial.value}")
print(f"Params: {study.best_trial.params}")


### Train with best parameters and evaluate the model on the test set

In [None]:
# Prepare data

# Refresh the tuned hyperparameters from the current study. Otherwise
# `best_params` still holds the values read from an earlier study.
best_params = study.best_params

# Cap pop and rock at 2000 samples each
max_samples_dict = {
    "pop": 2000,
    "rock": 2000,
}

mel_dataset = MelSpectogramDataset(data_path='spectograms_data', max_samples_dict=max_samples_dict)

# Split the dataset into train, validation and test with 90-5-5 split.
# Index positions are split instead of file paths: the stratified split with
# the same random_state selects the same samples, but no per-path `list.index`
# lookup is needed (which is quadratic overall and would map duplicate paths
# to their first occurrence).
file_paths = mel_dataset.file_paths
labels = mel_dataset.labels
all_indices = np.arange(len(file_paths))

train_indices, temp_indices, _, temp_labels = train_test_split(
    all_indices, labels, test_size=0.1, random_state=42, stratify=labels
)

val_indices, test_indices = train_test_split(
    temp_indices, test_size=0.5, random_state=42, stratify=temp_labels
)

# Create Datasets for each split
train_indices = train_indices.tolist()
val_indices = val_indices.tolist()
test_indices = test_indices.tolist()

train_dataset = Subset(mel_dataset, train_indices)
val_dataset = Subset(mel_dataset, val_indices)
test_dataset = Subset(mel_dataset, test_indices)

# Handle the imbalanced dataset with WeightedRandomSampler

# Get train dataset labels.
# NOTE(review): this iterates the whole training subset, so every item is
# fetched once just to read its label.
train_labels = [label for _, label in train_dataset]

# Get train dataset label count for each class
class_counts = np.bincount(train_labels)

# Each sample is weighted by the inverse frequency of its class
class_weights = 1.0 / class_counts
weights = [class_weights[label] for label in train_labels]

# Create a WeightedRandomSampler
sampler = WeightedRandomSampler(weights, num_samples=len(train_dataset))

# Create DataLoader instances
train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], sampler=sampler)
val_loader = DataLoader(val_dataset, batch_size=best_params['batch_size'])
test_loader = DataLoader(test_dataset, batch_size=best_params['batch_size'])

# Model

# Seed before the model is built so that weight initialization is covered
# by the fixed seed as well.
torch.manual_seed(42)

# Define the model with the tuned num_filters
model = CNNModel(num_classes=8, num_filters=best_params['num_filters']).to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=best_params['learning_rate'])

# Plain training loop without early stopping: each epoch trains on
# `train_loader` and then reports metrics on `val_loader`.
num_epochs = 8

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    train_progress = tqdm.tqdm(train_loader, desc=f"Training Epoch {epoch + 1}", leave=False)

    for mel_spectrogram, label in train_progress:
        mel_spectrogram, label = mel_spectrogram.to(device).float(), label.to(device)
        optimizer.zero_grad()
        output = model(mel_spectrogram.unsqueeze(1))
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        # Update progress bar with the running mean loss
        running_loss += loss.item()
        train_progress.set_postfix({"Loss": f"{running_loss / (train_progress.n + 1):.4f}"})

    # Evaluate the CNN model on the validation set
    model.eval()
    correct, total = 0, 0
    all_labels, all_probs = [], []
    all_labels_per_class, all_probs_per_class = [], []
    test_progress = tqdm.tqdm(val_loader, desc=f"Testing Epoch {epoch + 1}", leave=False)

    with torch.no_grad():
        for mel_spectrogram, label in test_progress:
            mel_spectrogram, label = mel_spectrogram.to(device).float(), label.to(device)
            output = model(mel_spectrogram.unsqueeze(1))
            probabilities = nn.Softmax(dim=1)(output).cpu().numpy()
            all_probs.append(probabilities)
            all_labels.append(label.cpu().numpy())
            all_labels_per_class.extend(label.cpu().numpy())
            all_probs_per_class.extend(output.cpu().numpy())
            _, predicted = torch.max(output.data, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()

            # Loss of the current batch, shown on the progress bar
            loss = criterion(output, label)
            test_progress.set_postfix({"Loss": f"{loss.item():.4f}"})

    # Validation accuracy for this epoch. It must be computed here: without
    # it the summary line below would print a value left over from an
    # earlier cell (or fail if none exists).
    accuracy = 100 * correct / total

    # Concatenate all predictions and true labels
    all_labels = np.concatenate(all_labels)
    all_probs = np.concatenate(all_probs)

    all_labels_per_class = np.array(all_labels_per_class)
    all_probs_per_class = np.array(all_probs_per_class)
    y_pred = all_probs_per_class.argmax(axis=1)

    # Per-class metrics
    print(f"Epoch {epoch + 1}:")
    print("PER CLASS METRICS")
    per_class_metrics(all_labels, y_pred, num_classes=8)
    print("OVERALL METRICS")

    # Calculate metrics
    metrics = calculate_metrics(all_labels, all_probs, num_classes=8)
    print(f"Epoch {epoch + 1}: Accuracy: {accuracy:.2f}% | Metrics: {metrics}")

# Save this model.
# NOTE(review): this is the same path the earlier weighted-sampler run saves
# to, so that checkpoint is overwritten -- confirm this is intended.
torch.save(model.state_dict(), "models/best_model_v2_weighted.pth")
# Final evaluation on the held-out test split.

# Rebuild the architecture and restore the weights saved as
# best_model_v2_weighted.pth
test_model = CNNModel(num_classes=8, num_filters=best_params["num_filters"]).to(device)
saved_state = torch.load("models/best_model_v2_weighted.pth")
test_model.load_state_dict(saved_state)

# The existing `test_loader` is reused as-is
# test_loader = DataLoader(test_dataset, batch_size=32)

test_model.eval()

with torch.no_grad():
    correct, total = 0, 0
    all_labels, all_probs = [], []
    all_labels_per_class, all_probs_per_class = [], []
    test_progress = tqdm.tqdm(test_loader, desc="Testing", leave=False)

    for mel_spectrogram, label in test_progress:
        mel_spectrogram = mel_spectrogram.to(device).float()
        label = label.to(device)
        output = test_model(mel_spectrogram.unsqueeze(1))

        # Softmax probabilities feed the overall metrics, raw outputs the argmax
        probabilities = nn.Softmax(dim=1)(output).cpu().numpy()
        all_probs.append(probabilities)
        all_labels.append(label.cpu().numpy())
        all_labels_per_class.extend(label.cpu().numpy())
        all_probs_per_class.extend(output.cpu().numpy())

        # Running accuracy counters
        _, predicted = torch.max(output.data, 1)
        total += label.size(0)
        correct += (predicted == label).sum().item()

        # Batch loss, displayed on the progress bar
        loss = criterion(output, label)
        test_progress.set_postfix({"Loss": f"{loss.item():.4f}"})

    # Flatten the per-batch arrays into one array each
    all_labels = np.concatenate(all_labels)
    all_probs = np.concatenate(all_probs)

    all_labels_per_class = np.array(all_labels_per_class)
    all_probs_per_class = np.array(all_probs_per_class)
    y_pred = all_probs_per_class.argmax(axis=1)

    # Per-class report
    print("PER CLASS METRICS")
    per_class_metrics(all_labels, y_pred, num_classes=8)

    # Overall report
    metrics = calculate_metrics(all_labels, all_probs, num_classes=8)
    print("OVERALL METRICS")

    print(f"Accuracy: {100 * correct / total:.2f}% | Metrics: {metrics}")

# Plot the confusion matrix with annotations on test set
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Calculate the confusion matrix.
# `labels` pins the matrix to all 8 classes. Without it, a class that appears
# in neither the true labels nor the predictions is dropped, the matrix gets
# smaller than 8x8, and the fixed-size annotation loop below fails.
conf_matrix = confusion_matrix(all_labels, y_pred, labels=list(range(8)))

# Plot the confusion matrix with annotations
plt.figure(figsize=(10, 8))
plt.imshow(conf_matrix, cmap="Blues")
plt.colorbar()
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.xticks(range(8), mel_dataset.genres, rotation=45)
plt.yticks(range(8), mel_dataset.genres)
# Rows are true classes, columns are predicted classes; text x is the column
for i in range(8):
    for j in range(8):
        plt.text(j, i, conf_matrix[i, j], ha="center", va="center", color="black")
plt.show()

# Now plot the normalized confusion matrix
import seaborn as sns

# Normalize the confusion matrix: each row is divided by its own total
conf_matrix_norm = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis]

# Plot the normalized confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix_norm, cmap="Blues", annot=True, fmt=".2f", xticklabels=mel_dataset.genres,
            yticklabels=mel_dataset.genres)
plt.title("Normalized Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

In [None]:
# Restore the checkpoint stored as models/model_fold_1.pt into a fresh CNNModel
model = CNNModel(num_classes=8).to(device)
fold_state = torch.load("models/model_fold_1.pt")
model.load_state_dict(fold_state)

# Shuffled loader over the complete spectrogram dataset, 32 items per batch
full_dataset = MelSpectogramDataset(data_path='spectograms_data', transform=None)
test_loader = DataLoader(full_dataset, batch_size=32, shuffle=True)

# Put the model in evaluation mode
model.eval()


# Use Early Stopping


In [None]:
# Draw confusion matrix
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Calculate the confusion matrix from the `all_labels` / `y_pred` currently
# in the notebook namespace.
# `labels` pins the matrix to all 8 classes, so it is always 8x8 even when a
# class is absent from both the true labels and the predictions.
conf_matrix = confusion_matrix(all_labels, y_pred, labels=list(range(8)))


In [None]:
# Render `conf_matrix` as an image and write each count into its cell

plt.figure(figsize=(10, 8))
plt.imshow(conf_matrix, cmap="Blues")
plt.colorbar()
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
genre_names = mel_dataset.genres
plt.xticks(range(8), genre_names, rotation=45)
plt.yticks(range(8), genre_names)
# Rows index the true class, columns the predicted class
for i in range(8):
    for j in range(8):
        cell_count = conf_matrix[i, j]
        plt.text(j, i, cell_count, ha="center", va="center", color="black")
plt.show()


In [None]:
# We can see that because pop, rock and hiphop dominate the dataset, the model has a hard time classifying other genres.

# Let's try to undersample the dataset genres: pop, rock and hiphop and train again.

In [None]:
# Genres selected for undersampling: pop, rock, and hiphop
undersampled_genres = [
    "pop",
    "rock",
    "hiphop",
]

#


# Train with RESNET 

In [None]:
import torch.nn as nn
import torchvision.models as models
from torchsummary import summary

# ResNet-18 (built without a weights argument) adapted to single-channel
# input and an 8-way output layer.
resnet = models.resnet18()

# Stem convolution taking 1 input channel instead of the default
stem_conv = nn.Conv2d(1, 64, kernel_size=7, stride=(1, 2), bias=True)
resnet.conv1 = stem_conv

# Max-pool with different kernel/stride along the two input axes
resnet.maxpool = nn.MaxPool2d((2, 3), stride=(1, 2))

# Final linear layer with 8 outputs
resnet.fc = nn.Linear(512, 8)

# Layer summary for an input of shape (1, 96, 1376)
summary(resnet, (1, 96, 1376))

In [None]:
# Pop and hiphop are overwhelming the rest of the classes because of the dataset imbalance.
# Let's try to undersample the pop and hiphop genres and train again.
# Change the max_samples_dict to undersample the pop and hiphop genres in the objective and train the model again.


import optuna 
from sklearn.model_selection import train_test_split
from torch.utils.data import WeightedRandomSampler

def objective(trial):
    """Optuna objective: train a ResNet copy briefly and return its validation F1.

    The dataset under 'spectograms_data' is split 90/10 into train and
    validation sets with stratification on the labels. Training batches are
    drawn with a WeightedRandomSampler whose per-sample weight is the inverse
    of the sample's class frequency. No undersampling is performed here.

    Every trial trains its own deep copy of the module-level ``resnet``, so
    weights learned in one trial never leak into the next one and the global
    ``resnet`` object is left untouched.

    Args:
        trial: Optuna trial used to sample ``learning_rate`` and
            ``batch_size`` and to report the per-epoch F1 score.

    Returns:
        The "F1 Score" entry of ``calculate_metrics`` computed on the
        validation set after the last epoch.

    Raises:
        optuna.exceptions.TrialPruned: If Optuna decides to prune the trial
            after an epoch's F1 score has been reported.
    """
    # Hyperparameters to optimize
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])

    # Create dataset instance
    mel_dataset = MelSpectogramDataset(data_path='spectograms_data')
    labels = mel_dataset.labels

    # Stratified 90/10 split done directly on sample indices. The partition
    # depends only on the sample count, the labels and the seed, so it matches
    # a split of the file paths without a per-path list.index() lookup.
    train_indices, val_indices = train_test_split(
        list(range(len(labels))), test_size=0.1, random_state=42, stratify=labels
    )

    train_dataset = Subset(mel_dataset, train_indices)
    val_dataset = Subset(mel_dataset, val_indices)

    # Handle the imbalanced dataset with a WeightedRandomSampler.
    # Labels are read through the dataset so they are exactly what the model
    # is trained on.
    # NOTE(review): this loads every training sample once per trial; if
    # mel_dataset.labels already holds the same integer ids, index it instead.
    train_labels = [label for _, label in train_dataset]

    # Inverse class frequency as the per-sample weight
    class_counts = np.bincount(train_labels)
    class_weights = 1.0 / class_counts
    weights = [class_weights[label] for label in train_labels]

    sampler = WeightedRandomSampler(weights, num_samples=len(train_dataset))

    # Create DataLoader instances
    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Fixed seed so trials are comparable
    torch.manual_seed(42)

    # Prefer MPS (Apple), then CUDA, then CPU. is_available() is used instead
    # of is_built(): a build with MPS support can still run on a machine that
    # has no usable MPS device.
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    print("Using device:", device)

    # Train a private copy: nn.Module.to() works in place, so using the global
    # resnet directly would carry trained weights over into later trials.
    model = copy.deepcopy(resnet).to(device)

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    num_epochs = 5
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        train_progress = tqdm.tqdm(train_loader, desc=f"Training Epoch {epoch + 1}", leave=False)

        for mel_spectrogram, label in train_progress:
            mel_spectrogram, label = mel_spectrogram.to(device).float(), label.to(device)
            optimizer.zero_grad()
            # Add the channel dimension expected by the convolutional stem
            output = model(mel_spectrogram.unsqueeze(1))
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()

            # Show the running mean loss in the progress bar
            running_loss += loss.item()
            train_progress.set_postfix({"Loss": f"{running_loss / (train_progress.n + 1):.4f}"})

        # Evaluate on the validation split
        model.eval()
        all_labels, all_probs = [], []
        test_progress = tqdm.tqdm(val_loader, desc=f"Testing Epoch {epoch + 1}", leave=False)

        with torch.no_grad():
            for mel_spectrogram, label in test_progress:
                mel_spectrogram, label = mel_spectrogram.to(device).float(), label.to(device)
                output = model(mel_spectrogram.unsqueeze(1))
                probabilities = torch.softmax(output, dim=1).cpu().numpy()
                all_probs.append(probabilities)
                all_labels.append(label.cpu().numpy())

        # Concatenate all predictions and true labels
        all_labels = np.concatenate(all_labels)
        all_probs = np.concatenate(all_probs)

        # Calculate metrics
        f1 = calculate_metrics(all_labels, all_probs, num_classes=8)["F1 Score"]

        # Report the intermediate value so Optuna can stop unpromising trials
        trial.report(f1, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f}")

    # F1 score of the last epoch is the trial's objective value
    return f1


In [None]:
# Optuna study
import csv

study_name = "resnet_optimization_v1_weighted_stratified"
# Free-text notes on the dataset and hyperparameters used for this study
study_comment = "Full dataset. Optimizing learning rate, batch size. No cross-validation used. 90-10 split for training and validation. Stratified split used. WeightedRandomSampler used for training."
study_comment_file = "study_comment.csv"

# Append one "name,comment" row to the notes file.
# The comment text contains commas, so it goes through csv.writer, which
# quotes the field and keeps the row at exactly two columns. The leading
# newline (rather than a trailing one) matches how earlier rows were written.
with open(study_comment_file, "a") as f:
    f.write("\n")
    csv.writer(f, lineterminator="").writerow([study_name, study_comment])

# Create the study (or resume it if it already exists in the SQLite storage)
# and run 10 trials of the objective, maximizing the returned F1 score
study = optuna.create_study(direction="maximize", storage="sqlite:///db.sqlite3", study_name=study_name, load_if_exists=True)
study.optimize(objective, n_trials=10)

# Best trial
print("Best trial:")
print(f"F1 Score: {study.best_trial.value}")
print(f"Params: {study.best_trial.params}")
