<a href="https://drive.google.com/file/d/1-lmvLqHRoVztabnwQ8RbZuDhpsd1kmYY/view?usp=sharing" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Research project** for the course of *Selected Topics in Music and Acoustic Engineering* :

***Music Instrument Classification***
---
Team:
* Andrea Crisafulli
* Marco Porcella
* Giacomo De Toni
* Gianluigi Vecchini

### *Import libraries*:

In [None]:
# === Core Python & Scientific Computing ===
import numpy as np                # Numerical computing
import pandas as pd              # Data handling and manipulation
import matplotlib.pyplot as plt  # Plotting
from pathlib import Path         # File path handling
import scipy.signal as signal    # Signal processing tools

# === Audio Processing ===
import librosa                   # Audio analysis
import librosa.display           # Visualization for librosa outputs
import IPython.display as ipd    # For audio playback in notebooks

# === Scikit-learn: ML & Preprocessing ===
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay  # Evaluation
from sklearn.decomposition import PCA         # Dimensionality reduction
from sklearn.preprocessing import scale, StandardScaler, MultiLabelBinarizer  # Data scaling & encoding
from sklearn.model_selection import train_test_split  # Dataset splitting
from sklearn.svm import SVC                    # Support Vector Classifier
from sklearn.neighbors import KNeighborsClassifier  # k-NN classifier
from sklearn.cluster import KMeans            # Clustering

# === Deep Learning: TensorFlow / Keras ===
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint  # Training utilities
from keras.optimizers import Adam            # Optimizer for model training
from keras import layers, models

# === Optional: PyTorch (if used) ===
#import torch
#import torch.nn as nn
#import torch.nn.functional as F              # Functional API for building models

# === Others ===
import yaml                                   # Parsing metadata in YAML format
from collections import Counter               # Frequency counting for label analysis
from tqdm import tqdm                         # Progress bar for loops

# === Plotting Style ===
plt.style.use("seaborn-v0_8")                 # Set default plotting style

### Import audio data.

In [2]:
basePath = Path("E:/MedleyDB")                         # For windows
#basePath = Path("/Volumes/Extreme SSD/MedleyDB")        # For mac
audioPath = basePath / "Audio"
data = []  # One dict per audio file (raw track, stem or full mix)

# Walk every song directory inside <basePath>/Audio.
# iterdir() already yields paths that start with audioPath, so songDir is used
# as-is: joining it onto audioPath again would duplicate the prefix whenever
# basePath is a relative path.
for songDir in audioPath.iterdir():
    # Skip anything that is not a directory (stray files next to the songs)
    if not songDir.is_dir():
        continue

    songName = songDir.name
    yamlFilePath = songDir / f"{songName}_METADATA.yaml"  # Per-song YAML metadata file

    # Parse the metadata (safe_load: no arbitrary object construction)
    with open(yamlFilePath, "r") as f:
        metadata = yaml.safe_load(f)

    # "stems" maps a stem id to that stem's description
    stemsData = metadata.get("stems", {})
    labelArray = []  # Instruments of this song, used to label the mix

    for stem in stemsData.values():
        instrument = stem.get("instrument")

        # Raw tracks of the stem: each one inherits the stem's instrument label
        for raw in stem.get("raw", {}).values():
            rawPath = songDir / f"{songName}_RAW" / raw.get("filename")

            # Ignore entries whose file name starts with a dot (hidden files)
            if not rawPath.name.startswith("."):
                data.append({
                    "song": songName,
                    "songPath": songDir,
                    "label": instrument,
                    "filePath": rawPath
                })

        # The stem file itself
        data.append({
            "song": songName,
            "songPath": songDir,
            "label": instrument,
            "filePath": songDir / f"{songName}_STEMS" / stem.get("filename")
        })

        labelArray.append(instrument)

    # The full mix carries every distinct instrument of the song,
    # serialized as a sorted "|"-separated string
    labelFormatted = "|".join(sorted(set(labelArray)))

    data.append({
        "song": songName,
        "songPath": songDir,
        "label": labelFormatted,
        "filePath": songDir / f"{songName}_MIX.wav"
    })

# Create DataFrame
df = pd.DataFrame(data)
print(f"Loaded {len(df)} audio files.")

# Turn the "|"-separated label string back into a list of labels
df["labelList"] = df["label"].str.split("|")

# Multi-hot encode the label lists: one row per file, one column per class in mlb.classes_
mlb = MultiLabelBinarizer()
audioLabelsBinary = np.asarray(mlb.fit_transform(df["labelList"]))

Loaded 2879 audio files.


In [3]:
# Preview the first rows of the file table built above
df.head()

Unnamed: 0,song,songPath,label,filePath,labelList
0,AClassicEducation_NightOwl,E:\MedleyDB\Audio\AClassicEducation_NightOwl,electric bass,E:\MedleyDB\Audio\AClassicEducation_NightOwl\A...,[electric bass]
1,AClassicEducation_NightOwl,E:\MedleyDB\Audio\AClassicEducation_NightOwl,electric bass,E:\MedleyDB\Audio\AClassicEducation_NightOwl\A...,[electric bass]
2,AClassicEducation_NightOwl,E:\MedleyDB\Audio\AClassicEducation_NightOwl,electric bass,E:\MedleyDB\Audio\AClassicEducation_NightOwl\A...,[electric bass]
3,AClassicEducation_NightOwl,E:\MedleyDB\Audio\AClassicEducation_NightOwl,drum set,E:\MedleyDB\Audio\AClassicEducation_NightOwl\A...,[drum set]
4,AClassicEducation_NightOwl,E:\MedleyDB\Audio\AClassicEducation_NightOwl,drum set,E:\MedleyDB\Audio\AClassicEducation_NightOwl\A...,[drum set]


In [4]:
# Preview the last rows of the file table (the final row of a song is its mix)
df.tail()

Unnamed: 0,song,songPath,label,filePath,labelList
2874,Wolf_DieBekherte,E:\MedleyDB\Audio\Wolf_DieBekherte,piano,E:\MedleyDB\Audio\Wolf_DieBekherte\Wolf_DieBek...,[piano]
2875,Wolf_DieBekherte,E:\MedleyDB\Audio\Wolf_DieBekherte,piano,E:\MedleyDB\Audio\Wolf_DieBekherte\Wolf_DieBek...,[piano]
2876,Wolf_DieBekherte,E:\MedleyDB\Audio\Wolf_DieBekherte,piano,E:\MedleyDB\Audio\Wolf_DieBekherte\Wolf_DieBek...,[piano]
2877,Wolf_DieBekherte,E:\MedleyDB\Audio\Wolf_DieBekherte,piano,E:\MedleyDB\Audio\Wolf_DieBekherte\Wolf_DieBek...,[piano]
2878,Wolf_DieBekherte,E:\MedleyDB\Audio\Wolf_DieBekherte,female singer|piano,E:\MedleyDB\Audio\Wolf_DieBekherte\Wolf_DieBek...,"[female singer, piano]"


In [5]:
# Column dtypes and non-null counts of the file table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2879 entries, 0 to 2878
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   song       2879 non-null   object
 1   songPath   2879 non-null   object
 2   label      2879 non-null   object
 3   filePath   2879 non-null   object
 4   labelList  2879 non-null   object
dtypes: object(5)
memory usage: 112.6+ KB


In [6]:
# Pull the file paths and the label lists out of the DataFrame as plain Python lists
audioFiles = df["filePath"].tolist()
audioLabels = df["labelList"].tolist()

# Sanity check: both lists must describe the same number of files
if len(audioFiles) != len(audioLabels):
    print("Error in dataset")
else:
    print(f"Extracted files and labels for a total lenght of {len(audioFiles)}")

Extracted files and labels for a total lenght of 2879


In [7]:
n_classes_test = 3  # Number of most frequent labels kept for the test subset

# Flatten all per-file label lists and count how often each label occurs
all_labels = [label for labels in df["labelList"] for label in labels]
label_counts = Counter(all_labels)

# Keep the n_classes_test most frequent labels
top_labels = [label for label, _ in label_counts.most_common(n_classes_test)]
print("Top labels:", top_labels)

# Boolean mask of the rows where at least one label is in top_labels
top_label_set = set(top_labels)
subset_mask = df["labelList"].apply(
    lambda labels: any(label in top_label_set for label in labels)
)
df_subset = df[subset_mask]

# Extract paths, label lists and the matching rows of the multi-hot matrix.
# The numpy matrix is indexed with the positional boolean mask rather than with
# df_subset.index, which holds index *labels* and only matches row positions
# while df keeps its default RangeIndex.
# Note: the matrix keeps one column per class known to mlb, not only the top labels.
audioFilesSubset = df_subset["filePath"].tolist()
audioLabelsSubset = df_subset["labelList"].tolist()
audioLabelsSubsetBinary = np.asarray(audioLabelsBinary[subset_mask.to_numpy()])

if len(audioFilesSubset) == len(audioLabelsSubset):
    print(f"Extracted {len(audioFilesSubset)} samples from top {n_classes_test} labels")
else:
    print("Mismatch in extracted data")

print(f"Subset binary label matrix shape: {audioLabelsSubsetBinary.shape}")

Top labels: ['drum set', 'vocalists', 'electric bass']
Extracted 876 samples from top 3 labels
Subset binary label matrix shape: (876, 82)


In [None]:
# Load data
audioFilesToExtract = audioFilesSubset # TESTING
#audioFilesToExtract = audioFiles # REAL

signals = []  # One 1-D sample array per loaded file

# Only the first 10 files of the selection are decoded to keep test runs short;
# drop the [0:10] slice to load the whole selection.
# The loop reads from audioFilesToExtract so that the TESTING/REAL switch above
# actually takes effect.
for audioFile in tqdm(audioFilesToExtract[0:10], desc="Loading audio files..."):
    samples, _ = librosa.load(audioFile, sr=22050)  # sr=22050 is requested from librosa for every file
    signals.append(samples)

: 

In [None]:
melSpegrams = []  # One dB-scaled mel spectrogram (2-D array) per signal

# Peak-normalize each signal, then compute its mel spectrogram in dB.
# The loop variable is deliberately NOT called `signal`: that name is the
# scipy.signal module imported at the top of the notebook and must not be overwritten.
for audioSignal in tqdm(signals, desc="Processing audio signals..."):
    # Peak normalization (skipped for all-zero signals to avoid dividing by zero)
    peak = np.max(np.abs(audioSignal))
    if peak > 0:
        audioSignal = audioSignal / peak

    # Mel power spectrogram, converted to dB relative to its own maximum
    S = librosa.feature.melspectrogram(y=audioSignal, sr=22050)
    S_dB = librosa.power_to_db(S, ref=np.max)
    melSpegrams.append(S_dB)

In [None]:
# Play and plot roughly 10 evenly spaced examples from the loaded signals.
# range() only accepts integers, so the step uses floor division and is
# clamped to at least 1 (fewer than 10 signals would otherwise give a step of 0).
plotStep = max(1, len(signals) // 10)

for i in range(0, len(signals), plotStep):
    print("\n\n-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-")

    # Audio player for the (un-normalized) signal
    ipd.display(ipd.Audio(signals[i], rate=22050))

    plt.figure(figsize=(10,6))
    librosa.display.specshow(melSpegrams[i], sr=22050, x_axis='time', y_axis='mel', fmax=22050/2)
    plt.clim(-60,None)  # Clip the colour scale at -60 dB from below
    plt.colorbar()

    # File name without its directory, independent of the OS path separator
    filename = Path(audioFilesSubset[i]).name

    plt.title(f'{filename} (data #{i})')
    plt.show()

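Choose which label matrix to use for training: the labels of the full dataset, or the labels of the test subset loaded above. The number of label rows must match the number of loaded audio signals.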

In [None]:
# CHOOSE WHICH DATA TO LOAD
#labelsToLoad = audioLabelsBinary
# Take exactly as many label rows as signals were loaded (the signals come from
# the front of the subset), so features and labels stay aligned even when the
# number of loaded files changes.
labelsToLoad = audioLabelsSubsetBinary[:len(signals)]

In [None]:
# === Longest time axis among all mel spectrograms ===
# Every spectrogram is later padded up to this number of frames
maxLen = max(spec.shape[1] for spec in melSpegrams)
print(f"Max mel spectrogram length detected: {maxLen} frames")

# === Define padding function ===
def pad_mel(mel, targetLen=None, padValue=-80):
    """Right-pad (or crop) a mel spectrogram along its time axis.

    Parameters
    ----------
    mel : np.ndarray
        2-D array whose second axis is padded or cropped.
    targetLen : int, optional
        Desired size of the second axis. When omitted, the notebook-level
        ``maxLen`` is read at call time, so the default always reflects the
        most recently computed value instead of the one that existed when
        the function was defined.
    padValue : float, optional
        Constant written into the padded region (default -80, used here as
        the silence level in dB).

    Returns
    -------
    np.ndarray
        Array with ``targetLen`` columns: ``mel`` followed by ``padValue``
        columns when it is shorter, otherwise its first ``targetLen`` columns.
    """
    if targetLen is None:
        targetLen = maxLen

    padWidth = targetLen - mel.shape[1]
    if padWidth > 0:
        # Pad only at the end of the second axis
        return np.pad(mel, ((0, 0), (0, padWidth)), mode='constant', constant_values=padValue)
    # Already long enough: crop in case the spectrogram is longer than the target
    return mel[:, :targetLen]

# === Bring every mel spectrogram to the common length and stack them ===
# pad_mel is called with its default target length; the stacked array then gets
# a trailing channel axis so it can be fed to a 2-D CNN.
# Resulting layout: (N_samples, n_mels, max_len, 1)
melSpegramsPadded = np.asarray([pad_mel(spec) for spec in melSpegrams])[..., np.newaxis]
print(f"Padded mel spectrogram array shape: {melSpegramsPadded.shape}")

In [None]:
# Split into Train (70%) and Temp (30%)
# X = mel spectrograms
# y = multi-hot label rows
X_train, X_temp, y_train, y_temp = train_test_split(
    melSpegramsPadded, labelsToLoad, test_size=0.30, random_state=42
)

# Split Temp in half: Validation (15%) and Test (15%) of the full data
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Summary
print(f"Train samples:      {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples:       {len(X_test)}")
# This line reports len(y_train), so it is labelled as the training labels
print(f"Train labels:       {len(y_train)}")

In [None]:
# === Define input shape and number of output classes ===
# The input shape is taken from the padded spectrogram array (everything but
# the sample axis), so the model always matches the data it is trained on.
# A fixed (128, 43, 1) only fits spectrograms that are exactly 43 frames long.
inputShape = tuple(melSpegramsPadded.shape[1:])  # (n_mels, time_frames, channels)
numClasses = labelsToLoad.shape[1]  # number of multilabel classes

# === Build CNN model ===
# Four convolutional blocks with a growing number of filters (32 -> 256);
# the first three end with 3x3 max pooling, the last one with global max
# pooling, which collapses the spatial axes into one value per filter.
modelCNN = models.Sequential([

    # Input
    layers.Input(shape=inputShape),

    # === Block 1 ===
    layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((3, 3), padding='same'),
    layers.Dropout(0.25),
    layers.BatchNormalization(),

    # === Block 2 ===
    layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((3, 3), padding='same'),
    layers.Dropout(0.25),
    layers.BatchNormalization(),

    # === Block 3 ===
    layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((3, 3), padding='same'),
    layers.Dropout(0.25),
    layers.BatchNormalization(),

    # === Block 4 ===
    layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
    layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
    layers.GlobalMaxPooling2D(),
    layers.Dropout(0.25),
    layers.BatchNormalization(),

    # === Fully Connected ===
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.5),

    # === Output Layer (sigmoid for multilabel: one independent score per class) ===
    layers.Dense(numClasses, activation='sigmoid')
])

# === Compile the model ===
# binary_crossentropy treats every class as its own yes/no decision
modelCNN.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# === Summary ===
modelCNN.summary()

# Optional: show classes
print(f"Input shape: {inputShape}")
print(f"Number of classes: {numClasses}")
print(f"Class names: {mlb.classes_}")

In [None]:
# CSVLogger, EarlyStopping and ModelCheckpoint are imported once in the
# notebook's import cell; they are not re-imported here.

# Paths for the training log and the saved model
csvLogPath = 'training_log.csv'
checkpointPath = 'best_model.h5'

# CSVLogger: logs every epoch to CSV.
# append=True keeps the existing file, so repeated runs accumulate in one log.
csvLogger = CSVLogger(csvLogPath, append=True)

# EarlyStopping: stop when val_loss has not improved for 100 epochs and
# put the best weights seen so far back into the model
earlyStop = EarlyStopping(
    monitor='val_loss',
    patience=100,
    restore_best_weights=True,
    verbose=1
)

# ModelCheckpoint: write the model to disk whenever val_accuracy improves.
# NOTE: this tracks val_accuracy while early stopping tracks val_loss, so the
# checkpoint on disk and the restored in-memory weights may come from different epochs.
checkpoint = ModelCheckpoint(
    filepath=checkpointPath,
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1
)

# Bundle them in the order they are passed to fit()
callbacks = [csvLogger, earlyStop, checkpoint]

In [None]:
# Re-compile the model with an explicitly configured optimizer.
# Adam comes from the same `keras` namespace as the layers/models used to build
# modelCNN (imported in the notebook's import cell), instead of mixing in
# tf.keras.optimizers.
opt = Adam(learning_rate=0.001)
modelCNN.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

batchSize = 32
epochs = 300  # Upper bound; the early-stopping callback may end training sooner

# verbose=0 silences the per-epoch progress output of fit()
history = modelCNN.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=batchSize,
    epochs=epochs,
    verbose=0,
    callbacks=callbacks,
)