<a href="https://drive.google.com/file/d/1-lmvLqHRoVztabnwQ8RbZuDhpsd1kmYY/view?usp=sharing" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Research project** for the course of *Selected Topics in Music and Acoustic Engineering* :

***Music Instrument Classification***
---
Team:
* Andrea Crisafulli
* Marco Porcella
* Giacomo De Toni
* Gianluigi Vecchini

### *Import libraries*:

In [4]:
# Core Python & Scientific Computing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import scipy.signal as signal

# Audio Processing
import librosa
import librosa.display
import IPython

# Scikit-learn (ML + preprocessing)
import sklearn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

# TensorFlow / Keras (DL)
import tensorflow as tf
from tensorflow.keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint

# PyTorch (opzionale, se usi anche torch)
#import torch
#import torch.nn as nn
#import torch.nn.functional as F

import yaml
from pathlib import Path

import pandas as pd
import yaml
from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer


# Plotting style
plt.style.use("seaborn-v0_8")


In [2]:
class CNN_MFCC(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3)
        self.pool = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3)
        self.fc1 = nn.Linear(32 * 30 * 3, 64)
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x.unsqueeze(1))))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [3]:
def extract_mfcc(file_path, n_mfcc=13, max_len=130):
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return mfcc

### Import audio data.

In [6]:
#basePath = Path("E:/MedleyDB")                          # For windows
basePath = Path("/Volumes/Extreme SSD/MedleyDB")        # For mac
audioPath = basePath / "Audio"
data = []

# Iterates over directories in the melodyDB/Audio folder
for songDir in audioPath.iterdir():
    labelArray = []
    
    # Security check to skip not directory items
    if not songDir.is_dir():
        continue
    
    songName = songDir.name
    yamlFilePath = audioPath / songDir / f"{songName}_METADATA.yaml" # Path to YAML metadata file
    
    # Opens YAML metadata file in read mode
    with open(yamlFilePath, "r") as f:
        metadata = yaml.safe_load(f)
    
    # Recovers stems from metadata and stores in dictionary
    stemsData = metadata.get("stems", {})
    
    # Iterates over stems
    for stemId, stem in stemsData.items():
        instrumentData = [] # Empty data for raw paths
        
        rawData = stem.get("raw", {})
        # Iterates over raw items to store the relative paths
        for rawId, raw in rawData.items():
            rawPath = songDir /  f"{songName}_RAW" / raw.get("filename")
            
            # Checks for valid files
            if(not rawPath.name.startswith(".")):
                newData = {
                    "song": songName,
                    "songPath": audioPath / songDir,
                    "label": stem.get("instrument"),
                    "filePath": rawPath
                }
                
                data.append(newData)
        
        # Creates new data
        newData = {
            "song": songName,
            "songPath": songDir,
            "label": stem.get("instrument"),
            "filePath": songDir / f"{songName}_STEMS" / stem.get("filename")
        }
        
        # Appends to data 
        data.append(newData)
        
        labelArray.append(stem.get("instrument"))
        
    labelFormatted = "|".join(sorted(set(labelArray)))
    
    mixData = {
        "song": songName,
        "songPath": audioPath / songDir,
        "label": labelFormatted,
        "filePath": songDir / f"{songName}_MIX.wav"
    }
    
    data.append(mixData)
# Create DataFrame
df = pd.DataFrame(data)
print(f"Loaded {len(df)} audio files.")

# Converte stringa in lista
df["labelList"] = df["label"].str.split("|")

mlb = MultiLabelBinarizer()
mlb.fit_transform(df["labelList"])

Loaded 2879 audio files.


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
df.head()

Unnamed: 0,song,songPath,label,filePath,labelList
0,AClassicEducation_NightOwl,/Volumes/Extreme SSD/MedleyDB/Audio/AClassicEd...,electric bass,/Volumes/Extreme SSD/MedleyDB/Audio/AClassicEd...,[electric bass]
1,AClassicEducation_NightOwl,/Volumes/Extreme SSD/MedleyDB/Audio/AClassicEd...,electric bass,/Volumes/Extreme SSD/MedleyDB/Audio/AClassicEd...,[electric bass]
2,AClassicEducation_NightOwl,/Volumes/Extreme SSD/MedleyDB/Audio/AClassicEd...,electric bass,/Volumes/Extreme SSD/MedleyDB/Audio/AClassicEd...,[electric bass]
3,AClassicEducation_NightOwl,/Volumes/Extreme SSD/MedleyDB/Audio/AClassicEd...,drum set,/Volumes/Extreme SSD/MedleyDB/Audio/AClassicEd...,[drum set]
4,AClassicEducation_NightOwl,/Volumes/Extreme SSD/MedleyDB/Audio/AClassicEd...,drum set,/Volumes/Extreme SSD/MedleyDB/Audio/AClassicEd...,[drum set]


In [9]:
df.tail()

Unnamed: 0,song,songPath,label,filePath,labelList
2874,Wolf_DieBekherte,/Volumes/Extreme SSD/MedleyDB/Audio/Wolf_DieBe...,piano,/Volumes/Extreme SSD/MedleyDB/Audio/Wolf_DieBe...,[piano]
2875,Wolf_DieBekherte,/Volumes/Extreme SSD/MedleyDB/Audio/Wolf_DieBe...,piano,/Volumes/Extreme SSD/MedleyDB/Audio/Wolf_DieBe...,[piano]
2876,Wolf_DieBekherte,/Volumes/Extreme SSD/MedleyDB/Audio/Wolf_DieBe...,piano,/Volumes/Extreme SSD/MedleyDB/Audio/Wolf_DieBe...,[piano]
2877,Wolf_DieBekherte,/Volumes/Extreme SSD/MedleyDB/Audio/Wolf_DieBe...,piano,/Volumes/Extreme SSD/MedleyDB/Audio/Wolf_DieBe...,[piano]
2878,Wolf_DieBekherte,/Volumes/Extreme SSD/MedleyDB/Audio/Wolf_DieBe...,female singer|piano,/Volumes/Extreme SSD/MedleyDB/Audio/Wolf_DieBe...,"[female singer, piano]"


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2879 entries, 0 to 2878
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   song       2879 non-null   object
 1   songPath   2879 non-null   object
 2   label      2879 non-null   object
 3   filePath   2879 non-null   object
 4   labelList  2879 non-null   object
dtypes: object(5)
memory usage: 112.6+ KB


In [12]:
audioFiles = []
audioLabels = []

# Extract paths and labels
for _, row in df.iterrows():
    audioFiles.append(row["filePath"])
    audioLabels.append(row["labelList"])

# Security check
if(len(audioFiles) == len(audioLabels)):
    print(f"Extracted files and labels for a total lenght of {len(audioFiles)}")
else:
    print("Error in dataset")

Extracted files and labels for a total lenght of 2879


In [13]:
from collections import Counter

# Flatten all labels e conta frequenze
all_labels = sum(df["labelList"], [])
label_counts = Counter(all_labels)

# Prendi le 10 classi più frequenti
top_labels = [label for label, _ in label_counts.most_common(10)]
print("Top labels:", top_labels)

# Filtra righe dove almeno una label è in top_labels
df_subset = df[df["labelList"].apply(lambda labels: any(label in top_labels for label in labels))]

# Estrai audio e label
audioFilesSubset = df_subset["filePath"].tolist()
audioLabelsSubset = df_subset["labelList"].tolist()

if len(audioFilesSubset) == len(audioLabelsSubset):
    print(f"Extracted {len(audioFilesSubset)} samples from top 10 labels")
else:
    print("Mismatch in extracted data")


Top labels: ['drum set', 'vocalists', 'electric bass', 'clean electric guitar', 'piano', 'synthesizer', 'distorted electric guitar', 'male singer', 'fx/processed sound', 'acoustic guitar']
Extracted 1803 samples from top 10 labels


In [27]:
# Load data
audioFilesToExtract = audioFilesSubset # TESTING
#audioFilesToExtract = audioFiles # REAL

print(len(audioFilesSubset))

print(audioFilesSubset[0])

signals, sample_rate = [librosa.load(file, sr=22050) for file in audioFilesSubset]
#signals, sample_rate = librosa.load(audioFilesSubset[0], sr=22050)

print(f"MAX: {max(signals)}")
print(f"min: {min(signals)}")

mel_spegrams = []

# Iterates over signals, normalizes them and computes mel spectrograms via librosa feature
for signal in signals:
    # Normalization of signal n
    if np.max(np.abs(signal)) > 0:
        signal = signal / np.max(np.abs(signal))

    # Creation of mel spectrogram
    S = librosa.feature.melspectrogram(y=signal, sr=22050)
    S_dB = librosa.power_to_db(S, ref=np.max)
    mel_spegrams.append(S_dB)

# Plot of spectrogram n=0
plt.figure(figsize=(15,8))
librosa.display.specshow(mel_spegrams[0], sr=sample_rate, x_axis='time', y_axis='mel', fmax=sample_rate/2)
plt.clim(-40,None)
plt.colorbar()
plt.title('Mel-scale Spectrogram of signal 0 (Librosa) - 40 Mel Filters')

1803
/Volumes/Extreme SSD/MedleyDB/Audio/AClassicEducation_NightOwl/AClassicEducation_NightOwl_RAW/AClassicEducation_NightOwl_RAW_01_01.wav


: 

In [None]:
# Convert list to numpy arrays
AudioData = np.asarray(mel_spegrams)
AudioLabels = np.asarray(audioLabelsSubset)

print('Number of audio data: {} \nNumber of audio labels: {}'.format(AudioData.shape, AudioLabels.shape))

In [None]:
# Plot an histogram to see data distribution over classes
plt.figure(figsize=(10, 6))  
plt.hist(AudioLabels, bins=20, edgecolor='black') 
plt.xlabel('Labels', fontsize=12)
plt.ylabel('Quantity', fontsize=12)
plt.title('Ydata Histogram', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.7)  
plt.tight_layout()
plt.show()

In [None]:
X = df["filepath"]
y = df["instrument"]

# Split into Train (70%) and Temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Split Temp into Validation (15%) and Test (15%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print(f"Train: {len(X_train)} samples")
print(f"Validation: {len(X_val)} samples")
print(f"Test: {len(X_test)} samples")


Train: 1400 samples
Validation: 300 samples
Test: 300 samples
