# Lecture 5: Supervised learning with CNNs


We provide a collab with training and testing code for an MLP and a CNN model classifier. 


Task 1: Run both examples and discuss the performance difference.


### Train a small autotagging model
In this section, we will train a small convolutional neural network (CNN) to perform music autotagging using a toy dataset.

#### Steps:
1. Load and preprocess the dataset
2. Define the CNN model
3. Train the model
4. Evaluate the model

In [165]:
!pip install torch
!pip install mirdata
!pip install tqdm
!pip install sklearn
!pip install torchaudio

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[15 lines of output][0m
  [31m   [0m The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  [31m   [0m rather than 'sklearn' for pip commands.
  [31m   [0m 
  [31m   [0m Here is how to fix this error in the main use cases:
  [31m   [0m - use 'pip install scikit-learn' rather than 'pip install sklearn'
  [31m   [0m - replace 'sklearn' by 'scikit-learn' in your pip requirements files
  [31m   [0m   (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  [31m   [0m - if the 'sklearn' package is used by one of your dependencies,
  [31m   [0m   it would be great if you take some time to tra

In [189]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import mirdata

from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader, Dataset
from torchaudio.transforms import MelSpectrogram, Resample
from tqdm import tqdm

### Step 1: Load and preprocess the dataset
We will use a toy dataset (e.g., ESC-10 or a similar audio dataset). The dataset will be preprocessed using PyTorch's DataLoader utilities.


In [75]:
dataset = mirdata.initialize("tinysol")
dataset.download()
splits = dataset.get_random_track_splits([0.8, 0.2])

train_ids = splits[0]
val_ids = splits[1]



In [282]:
class TinySOLDataset(Dataset):
    def __init__(self, mirdata_dataset, ids):


        self.orig_sample_rate = 44100
        self.sample_rate = 16000

        self.audio_duration = 3
        
        self.mirdata_dataset = mirdata_dataset
        self.tids = ids

        # Load audio and labels
        self.audio = {}
        self.natural_labels = {}

        self.resample = Resample(orig_freq=self.orig_sample_rate, new_freq=self.sample_rate)

        n_samples = self.sample_rate * self.audio_duration
        
        for tid in tqdm(ids, desc="Loading audio"):
            track = self.mirdata_dataset.track(tid)
            audio, sr = track.audio

            assert sr == self.orig_sample_rate
            audio = self.resample(torch.Tensor(audio))

            if len(audio) >= n_samples:
                audio = audio[:n_samples]
            else:
                pad_size = n_samples - len(audio)
                audio = torch.cat([audio, torch.zeros(pad_size)])
            
            self.audio[tid] = audio
            self.natural_labels[tid] = track.instrument_full

        # One hot encode labels
        natural_labels = np.array(list(self.natural_labels.values())).reshape(-1, 1)
        ohe = OneHotEncoder()
        one_hot_labels = ohe.fit_transform(natural_labels).toarray()
        self.labels = {k: v for k, v in zip(self.tids, one_hot_labels)}


    def __len__(self):
        return len(self.tids)

    def __getitem__(self, idx, audio_cap=4):
        tid = self.tids[idx]
        audio = self.audio[tid]
        label = self.labels[tid]
        
        return {"audio": audio, "labels": label}

train_dataset = TinySOLDataset(dataset, train_ids)
val_dataset = TinySOLDataset(dataset, val_ids)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

Loading audio: 100%|█████████████████████████████████████████████████████████████████████████████| 2331/2331 [00:05<00:00, 454.48it/s]
Loading audio: 100%|███████████████████████████████████████████████████████████████████████████████| 582/582 [00:01<00:00, 438.50it/s]


### Step 2: Define the CNN model
We will define a simple CNN architecture suitable for music autotagging.


In [293]:
# Feature extractor. Using mel-spectrogram
feature_extractor = MelSpectrogram(n_mels=64, n_fft=1024)

In [297]:
n_classes = len(set(val_dataset.natural_labels.values()))

batch = next(iter(val_loader))
mels = feature_extractor(batch["audio"])

print("Mel-spectrogram shape:", mels.shape)
print("Number of classes:", n_classes)

Mel-spectrogram shape torch.Size([32, 64, 94])
Number of classes 14


In [298]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

dense_size = 128

mlp = MLP( 64 * 94, dense_size, n_classes)

In [299]:
class CNN(nn.Module):
    def __init__(
        self,
        kernel_size: int=3,
        poolin_size: int=4,
        dense_size: int=64,
        n_classes: int=10,
    ):
        super(CNN, self).__init__()
        self.dense_size = dense_size
        self.conv1 = nn.Conv2d(1, dense_size // 4, kernel_size=kernel_size, stride=1, padding=1)
        self.conv2 = nn.Conv2d(dense_size // 4, dense_size// 2, kernel_size=kernel_size, stride=1, padding=1)
        self.conv3 = nn.Conv2d(dense_size // 2, dense_size, kernel_size=kernel_size, stride=1, padding=1)
        
        self.pool = nn.MaxPool2d(kernel_size=poolin_size, stride=poolin_size)
        
        self.fc1 = nn.Linear(dense_size, dense_size)
        self.fc2 = nn.Linear(dense_size, n_classes)  # Assuming 10 classes

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        
        x = x.view(-1, self.dense_size)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Instantiate the model
cnn = CNN(dense_size=dense_size, n_classes=n_classes)

### Step 3: Train the models
Set up the training loop with loss functions and optimizers.


In [305]:
mels.shape

torch.Size([32, 64, 94])

In [307]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
n_epochs = 3
for epoch in range(n_epochs):  # Number of epochs
    model.train()
    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()

        mels = feature_extractor(batch["audio"])
        # Just flatten the input
        mels = mels.reshape(-1, 64 * 94)
        outputs = mlp(mels).softmax(dim=1)

        loss = criterion(outputs, batch["labels"])
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {loss.item():.4f}')

Epoch [1/20], Loss: 76.0935
Epoch [2/20], Loss: 40.3076
Epoch [3/20], Loss: 13.5963
Epoch [4/20], Loss: 21.6461
Epoch [5/20], Loss: 13.2569
Epoch [6/20], Loss: 3.4883
Epoch [7/20], Loss: 10.2740
Epoch [8/20], Loss: 4.9114
Epoch [9/20], Loss: 4.2719
Epoch [10/20], Loss: 9.1220
Epoch [11/20], Loss: 21.3596
Epoch [12/20], Loss: 25.6680
Epoch [13/20], Loss: 4.2586
Epoch [14/20], Loss: 62.4288
Epoch [15/20], Loss: 4.8110
Epoch [16/20], Loss: 26.1160
Epoch [17/20], Loss: 13.6851
Epoch [18/20], Loss: 44.1044
Epoch [19/20], Loss: 17.6623
Epoch [20/20], Loss: 17.7802


In [310]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
n_epochs = 3
for epoch in range(n_epochs):  # Number of epochs
    model.train()
    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()

        mels = feature_extractor(batch["audio"])
        outputs = cnn(mels).softmax(dim=1)

        loss = criterion(outputs, batch["labels"])
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {loss.item():.4f}')

Epoch [1/10], Loss: 2.6619
Epoch [2/10], Loss: 2.6313
Epoch [3/10], Loss: 2.6439
Epoch [4/10], Loss: 2.6755
Epoch [5/10], Loss: 2.6316
Epoch [6/10], Loss: 2.6087
Epoch [7/10], Loss: 2.6670
Epoch [8/10], Loss: 2.6086
Epoch [9/10], Loss: 2.6661


KeyboardInterrupt: 

In [283]:
outputs.shape

torch.Size([11, 14])

In [16]:
2048 / 32

64.0