In [1]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from PIL import Image
import tempfile
import gc


In [None]:
### CREATE DATASET ###

# # We want to create a custom dataset that can be loaded with torchvision's datasets.ImageFolder().
# # 1. Start from .mp3 files organized in one folder per class:
#     - audio_dataset/
#         - class001/
#             - class001.mp3
#             - class001.mp3
#             - ...
#         - class002/
#             - class002.mp3
#             - class002.mp3
#             - ...
#         - ...
# # 2. Convert the .mp3 files into spectrogram .png images, keeping the same folder structure:
#     - spectrogram_dataset/
#         - class001/
#             - class001.png
#             - class001.png
#             - ...
#         - class002/
#             - class002.png
#             - class002.png
#             - ...
#         - ...

In [2]:
def _save_spectrogram_image(S_DB, output_file, sr, fmax, img_size, dpi):
    """Render a dB-scaled mel spectrogram without axes and save it, resized to img_size, as output_file."""
    # Reserve a temporary file name and close the handle right away: matplotlib
    # and Pillow reopen the file by name, and an open file cannot be removed on
    # Windows.
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmpfile:
        tmp_path = tmpfile.name

    try:
        fig = plt.figure(figsize=(10, 4), dpi=dpi)
        try:
            plt.axis('off')  # No ticks, labels or frame in the saved image
            librosa.display.specshow(S_DB, sr=sr, x_axis=None, y_axis=None, fmax=fmax)
            fig.savefig(tmp_path, bbox_inches='tight', pad_inches=0)
        finally:
            # Always release the figure, even when rendering fails, so figures
            # do not pile up in memory while processing many files.
            plt.close(fig)

        # resize() returns a new in-memory image, so the source file handle can
        # be closed before the result is written out.
        with Image.open(tmp_path) as img:
            resized = img.resize(img_size, Image.Resampling.LANCZOS)
        resized.save(output_file)
    finally:
        # Never leave the intermediate PNG behind, whether or not saving succeeded.
        os.remove(tmp_path)


def create_spectrograms(audio_path, output_dir, segment_duration=20, sr=22050, n_mels=128, fmax=8000, img_size=(224, 224), dpi=100):
    """
    Create and save mel-spectrogram images from an audio file.

    The audio is cut into consecutive, non-overlapping segments of
    ``segment_duration`` seconds. A trailing remainder shorter than one full
    segment is discarded, so a file shorter than ``segment_duration`` produces
    no images. Each segment is saved in ``output_dir`` as
    ``<audio base name>_spectrogram_<NNN>.png``, where ``NNN`` is the 1-based,
    zero-padded segment index.

    Any exception raised while processing is caught and reported with
    ``print``; the remaining segments of that file are then skipped.

    Parameters:
    audio_path (str): Path to the audio file.
    output_dir (str): Directory to save the spectrogram images (created if missing).
    segment_duration (int or float, optional): Duration of each segment in seconds. Must be positive. Default is 20.
    sr (int, optional): Sampling rate for loading the audio. Default is 22050.
    n_mels (int, optional): Number of Mel bands to generate. Default is 128.
    fmax (int, optional): Maximum frequency for the Mel spectrogram. Default is 8000.
    img_size (tuple, optional): Size of the output image, passed to PIL's resize. Default is (224, 224).
    dpi (int, optional): Dots per inch for the rendered figure. Default is 100.

    Returns:
    None
    """
    try:
        # Load the audio file (librosa returns the sampling rate actually used)
        y, sr = librosa.load(audio_path, sr=sr)

        # Work in whole samples so slice bounds are always integers, even when
        # segment_duration is a float.
        samples_per_segment = int(segment_duration * sr)
        if samples_per_segment <= 0:
            raise ValueError(f"segment_duration must be positive, got {segment_duration!r}")

        # Only full segments are kept; the trailing remainder is dropped
        num_segments = len(y) // samples_per_segment

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        base_name = os.path.splitext(os.path.basename(audio_path))[0]

        for i in range(num_segments):
            start_sample = i * samples_per_segment
            segment = y[start_sample:start_sample + samples_per_segment]

            # Generate the spectrogram in dB, relative to the segment's peak power
            S = librosa.feature.melspectrogram(y=segment, sr=sr, n_mels=n_mels, fmax=fmax)
            S_DB = librosa.power_to_db(S, ref=np.max)

            output_file = os.path.join(output_dir, f'{base_name}_spectrogram_{i+1:03d}.png')
            _save_spectrogram_image(S_DB, output_file, sr=sr, fmax=fmax, img_size=img_size, dpi=dpi)

    except Exception as e:
        print(f"Error processing {audio_path}: {e}")


def process_genre_folder(genre_folder, spectrograms_dir):
    """
    Process all MP3 files in a genre folder to create spectrograms.

    The folder is walked recursively and every file whose name ends in
    ``.mp3`` (case-insensitive) is passed to ``create_spectrograms``. All
    images for the genre, including those from nested sub-folders, are written
    to ``<spectrograms_dir>/<genre folder name>``.

    Parameters:
    genre_folder (str): Path to the genre folder containing MP3 files. A trailing path separator is accepted.
    spectrograms_dir (str): Directory to save the spectrogram images.

    Returns:
    None
    """
    # normpath strips a trailing separator; without it basename() of
    # "originals/rock/" would be '' and the images would land directly in
    # spectrograms_dir instead of its "rock" sub-folder.
    genre_name = os.path.basename(os.path.normpath(genre_folder))
    output_dir = os.path.join(spectrograms_dir, genre_name)

    for root, _dirs, files in os.walk(genre_folder):
        for file in files:
            if file.lower().endswith(".mp3"):
                audio_path = os.path.join(root, file)
                create_spectrograms(audio_path, output_dir)
                # Explicitly clear memory after processing each MP3 file
                gc.collect()


In [3]:
# Dataset locations, relative to the notebook's working directory
PATH_TO_SPECTROGRAM_DATASET = "DATASET/spectrograms"  # generated spectrogram images are written here
PATH_TO_AUDIO_DATASET = "DATASET/originals"  # source audio, one sub-folder per genre

# Make sure the spectrogram root exists before any processing starts
os.makedirs(PATH_TO_SPECTROGRAM_DATASET, exist_ok=True)

# Display the entries found in the audio dataset root as the cell output
os.listdir(PATH_TO_AUDIO_DATASET)

['hard rock',
 'house',
 'classical',
 'chanson',
 'blues',
 'metal',
 'reggae',
 'rock',
 'dub',
 'jazz',
 'electro',
 'country',
 'funk',
 'hip-hop',
 'rap',
 'pop',
 'folk',
 'dance']

In [None]:
# # Process the genre folder
# genre_folder = "dance"
# process_genre_folder(os.path.join(PATH_TO_AUDIO_DATASET, genre_folder), PATH_TO_SPECTROGRAM_DATASET)

In [5]:
# for root, dirs, files in os.walk(PATH_TO_AUDIO_DATASET):
#     for genre in dirs:
#         genre_folder = os.path.join(root, genre)
#         process_genre_folder(genre_folder, PATH_TO_SPECTROGRAM_DATASET)