<a href="https://drive.google.com/file/d/1-lmvLqHRoVztabnwQ8RbZuDhpsd1kmYY/view?usp=sharing" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Research project** for the course of *Selected Topics in Music and Acoustic Engineering* :

***Music Instrument Classification***
---
Team:
* Andrea Crisafulli
* Marco Porcella
* Giacomo De Toni
* Gianluigi Vecchini

### *Import libraries*:

In [25]:
# ✅ Core Python & Scientific Computing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import scipy.signal as signal

# ✅ Audio Processing
import librosa
import librosa.display
import IPython

# ✅ Scikit-learn (ML + preprocessing)
import sklearn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

# ✅ TensorFlow / Keras (DL)
import tensorflow as tf
from tensorflow.keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint

# ✅ PyTorch (opzionale, se usi anche torch)
import torch
import torch.nn as nn
import torch.nn.functional as F

import yaml
from pathlib import Path

# ✅ Plotting style
plt.style.use("seaborn-v0_8")


In [3]:
class CNN_MFCC(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3)
        self.pool = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3)
        self.fc1 = nn.Linear(32 * 30 * 3, 64)
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x.unsqueeze(1))))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [4]:
def extract_mfcc(file_path, n_mfcc=13, max_len=130):
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return mfcc

### Import audio data.

In [None]:
import pandas as pd
import yaml
from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer

basePath = Path("E:/MedleyDB")
audioPath = basePath / "Audio"
data = []

# Iterates over directories in the melodyDB/Audio folder
for songDir in audioPath.iterdir():
    labelArray = []
    
    # Security check to skip not directory items
    if not songDir.is_dir():
        continue
    
    songName = songDir.name
    yamlFilePath = audioPath / songDir / f"{songName}_METADATA.yaml" # Path to YAML metadata file
    
    # Opens YAML metadata file in read mode
    with open(yamlFilePath, "r") as f:
        metadata = yaml.safe_load(f)
    
    # Recovers stems from metadata and stores in dictionary
    stemsData = metadata.get("stems", {})
    
    # Iterates over stems
    for stemId, stem in stemsData.items():
        instrumentData = [] # Empty data for raw paths
        
        rawData = stem.get("raw", {})
        # Iterates over raw items to store the relative paths
        for rawId, raw in rawData.items():
            rawPath = audioPath / songDir /  f"{songDir}_RAW" / raw.get("filename")
            
            # Checks for valid files
            if(not rawPath.name.startswith(".")):
                instrumentData.append(rawPath)
        
        # Also adds stem generic file
        instrumentData.append(audioPath / songDir / f"{songDir}_STEMS" / stem.get("filename"))
        
        # Creates new data
        newData = {
            "song": songName,
            "songPath": audioPath / songDir,
            "label": stem.get("instrument"),
            "filePaths": instrumentData
        }
        
        # Appends to data 
        data.append(newData)
        
        labelArray.append(stem.get("instrument"))
        
    labelFormatted = "|".join(sorted(set(labelArray)))
    
    mixData = {
        "song": songName,
        "songPath": audioPath / songDir,
        "label": labelFormatted,
        "filePaths": audioPath / songDir / f"{songDir}_MIX.wav"
    }
    
    data.append(mixData)
# Create DataFrame
df = pd.DataFrame(data)
print(f"Loaded {len(df)} audio files.")

# Converte stringa in lista
df["label_list"] = df["label"].str.split("|")
df["label_list"] = df["label_list"].apply(lambda label_list: [label for label in label_list if label != "Main System"])

mlb = MultiLabelBinarizer()
mlb.fit_transform(df["label_list"])

Loaded 978 audio files.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 978 entries, 0 to 977
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   song        978 non-null    object
 1   songPath    978 non-null    object
 2   label       978 non-null    object
 3   filePaths   978 non-null    object
 4   label_list  978 non-null    object
dtypes: object(5)
memory usage: 38.3+ KB


In [63]:
df.head()

Unnamed: 0,song,songPath,label,filePaths,label_list
0,AClassicEducation_NightOwl,E:\MedleyDB\Audio\AClassicEducation_NightOwl,electric bass,[E:\MedleyDB\Audio\AClassicEducation_NightOwl_...,[electric bass]
1,AClassicEducation_NightOwl,E:\MedleyDB\Audio\AClassicEducation_NightOwl,drum set,[E:\MedleyDB\Audio\AClassicEducation_NightOwl_...,[drum set]
2,AClassicEducation_NightOwl,E:\MedleyDB\Audio\AClassicEducation_NightOwl,distorted electric guitar,[E:\MedleyDB\Audio\AClassicEducation_NightOwl_...,[distorted electric guitar]
3,AClassicEducation_NightOwl,E:\MedleyDB\Audio\AClassicEducation_NightOwl,distorted electric guitar,[E:\MedleyDB\Audio\AClassicEducation_NightOwl_...,[distorted electric guitar]
4,AClassicEducation_NightOwl,E:\MedleyDB\Audio\AClassicEducation_NightOwl,clean electric guitar,[E:\MedleyDB\Audio\AClassicEducation_NightOwl_...,[clean electric guitar]


In [64]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 978 entries, 0 to 977
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   song        978 non-null    object
 1   songPath    978 non-null    object
 2   label       978 non-null    object
 3   filePaths   978 non-null    object
 4   label_list  978 non-null    object
dtypes: object(5)
memory usage: 38.3+ KB


In [None]:
X = df["filepath"]
y = df["instrument"]

# Split into Train (70%) and Temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Split Temp into Validation (15%) and Test (15%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print(f"Train: {len(X_train)} samples")
print(f"Validation: {len(X_val)} samples")
print(f"Test: {len(X_test)} samples")


Train: 1400 samples
Validation: 300 samples
Test: 300 samples
