<a href="https://drive.google.com/file/d/1-lmvLqHRoVztabnwQ8RbZuDhpsd1kmYY/view?usp=sharing" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Research project** for the course *Selected Topics in Music and Acoustic Engineering*:

***Music Instrument Classification***
---
Team:
* Andrea Crisafulli
* Marco Porcella
* Giacomo De Toni
* Gianluigi Vecchini

### *Import libraries*:

In [16]:
# ✅ Core Python & Scientific Computing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import scipy.signal as signal

# ✅ Audio Processing
import librosa
import librosa.display
import IPython

# ✅ Scikit-learn (ML + preprocessing)
import sklearn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

# ✅ TensorFlow / Keras (DL)
import tensorflow as tf
from tensorflow.keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint

# ✅ PyTorch (optional, only needed if you also use torch)
import torch
import torch.nn as nn
import torch.nn.functional as F

# ✅ Plotting style
# Apply matplotlib's "seaborn-v0_8" style sheet to the figures created from here on.
# NOTE(review): this style name presumably needs matplotlib >= 3.6 — confirm the installed version.
plt.style.use("seaborn-v0_8")


In [11]:
class CNN_MFCC(nn.Module):
    """Small convolutional classifier for 2-D MFCC feature maps.

    Pipeline: Conv2d(1->16, 3x3) -> ReLU -> MaxPool(2) -> Conv2d(16->32, 3x3)
    -> ReLU -> MaxPool(2) -> flatten -> Linear(->64) -> ReLU
    -> Linear(->num_classes). The output is the raw result of the last linear
    layer; no softmax is applied.

    Args:
        num_classes: Size of the output layer.
        input_shape: Optional ``(height, width)`` of a single feature map,
            i.e. the last two dimensions of the batch passed to ``forward``.
            When given, the input size of the first fully connected layer is
            derived from it. When omitted, the legacy hard-coded size
            ``32 * 30 * 3`` is kept for backward compatibility; that size fits
            e.g. 20 x 128 maps but NOT the 13 x 130 maps produced by
            ``extract_mfcc`` with its default arguments (those flatten to
            ``32 * 1 * 31``), so pass ``input_shape=(13, 130)`` for them.

    Raises:
        ValueError: If ``input_shape`` is too small to leave at least one
            value per axis after both conv/pool stages.
    """

    # Flattened feature count that used to be hard-coded in ``fc1``.
    _LEGACY_FLAT_FEATURES = 32 * 30 * 3

    def __init__(self, num_classes, input_shape=None):
        super().__init__()
        # Layers are created in the same order as before so that seeded
        # parameter initialisation is unchanged.
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3)
        self.pool = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3)
        if input_shape is None:
            flat_features = self._LEGACY_FLAT_FEATURES
        else:
            flat_features = self._flat_features(input_shape)
        self.fc1 = nn.Linear(flat_features, 64)
        self.fc2 = nn.Linear(64, num_classes)

    @staticmethod
    def _pooled_size(size):
        """Length of one axis after an unpadded 3x3 convolution and a 2x2 max-pool."""
        return (size - 2) // 2

    def _flat_features(self, input_shape):
        """Number of values per sample that reach ``fc1`` for a given input map size."""
        height, width = (int(dim) for dim in input_shape)
        # Two identical conv + pool stages are applied in ``forward``.
        for _ in range(2):
            height = self._pooled_size(height)
            width = self._pooled_size(width)
        if height < 1 or width < 1:
            raise ValueError(
                f"input_shape {tuple(input_shape)} is too small for two 3x3 conv + 2x2 pool stages"
            )
        return self.conv2.out_channels * height * width

    def forward(self, x):
        """Compute class scores for a batch of maps shaped ``(batch, height, width)``."""
        # unsqueeze(1) inserts the single input channel expected by conv1.
        x = self.pool(F.relu(self.conv1(x.unsqueeze(1))))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten everything except the batch dimension.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [10]:
def extract_mfcc(file_path, n_mfcc=13, max_len=130):
    """Load an audio file and return its MFCC matrix with a fixed number of columns.

    The file is read with ``sr=None`` (no resampling is requested). The MFCC
    matrix is cut to at most ``max_len`` columns and, when it has fewer,
    padded with constant values (numpy's default: zeros) at the end of the
    second axis.

    Args:
        file_path: Audio file location, handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested from librosa.
        max_len: Length of the second axis of the returned array.

    Returns:
        The MFCC array with its second axis truncated or padded to ``max_len``.
    """
    waveform, sample_rate = librosa.load(file_path, sr=None)
    coeffs = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=n_mfcc)

    # Truncate first (a no-op for short clips), then pad whatever is still missing.
    coeffs = coeffs[:, :max_len]
    missing = max_len - coeffs.shape[1]
    if missing > 0:
        coeffs = np.pad(coeffs, ((0, 0), (0, missing)), mode='constant')
    return coeffs

### Import audio data.

In [17]:
# Build the dataset paths with pathlib instead of "\"-separated string literals:
# a backslash is only a path separator on Windows, and "\E", "\_" and "\e" are
# invalid string escape sequences that newer Python versions warn about.
ESC50_ROOT = Path("ESC-50-master")

# Load CSV file
csv_path = ESC50_ROOT / "_meta" / "esc50.csv"
df = pd.read_csv(csv_path)

# Base path to audio files
audio_base_path = ESC50_ROOT / "_audio"
filepaths = df["filename"].apply(lambda x: audio_base_path / x)
labels = df["category"]

# Load the first listed file without resampling (sr=None) as a quick check.
# The waveform is deliberately not stored in `y`: the train/val/test split
# cell binds `y` to the labels, and reusing the name for audio is misleading.
sample_audio, sr = librosa.load(filepaths.iloc[0], sr=None)

In [18]:
# Inputs are the audio file paths, targets are the category labels
X, y = filepaths, labels

# First split: train vs. held-out part (validation + test), stratified by label
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Second split: divide the held-out part in half into validation and test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

print(f"Train: {len(X_train)} samples")
print(f"Validation: {len(X_val)} samples")
print(f"Test: {len(X_test)} samples")


Train: 1400 samples
Validation: 300 samples
Test: 300 samples
