# Audio Classification using Conformer

This notebook provides a structure to train an audio classification model using Conformer as a feature extractor.

In [1]:

import sys
from pathlib import Path

import librosa
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm


In [2]:
sys.path.append('/Users/jaewone/developer/tensorflow/baby-cry-classification')

In [3]:
from conformer.model import Conformer

In [4]:

def extract_mel_spectrogram(file_path, n_mels=80, sample_rate=16000):
    """Load an audio file and return its log-mel spectrogram.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_mels: Number of mel bands requested from ``librosa.feature.melspectrogram``.
        sample_rate: Rate the audio is resampled to on load. Defaults to
            16000, the value that used to be hard-coded here.

    Returns:
        The dB-scaled mel spectrogram, transposed relative to what
        ``librosa.feature.melspectrogram`` returns (so mel bands end up on
        the last axis).
    """
    waveform, loaded_rate = librosa.load(file_path, sr=sample_rate)
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=loaded_rate, n_mels=n_mels)
    # power_to_db converts the power spectrogram to decibels
    log_mel = librosa.power_to_db(mel_power)
    return log_mel.T


In [5]:
# Get data
from trans_data import get_state_samples, split_dataset

# state_list is the single source of truth for the label set; the class count
# and the name -> index mapping are derived from it so they cannot drift apart.
state_list = ['sleepy', 'uncomfortable', 'diaper', 'awake', 'sad', 'hug', 'hungry']
NUM_CLASSES = len(state_list)
class_map = {state: index for index, state in enumerate(state_list)}

# Machine-specific location of the dataset; adjust when running elsewhere.
DATA_DIR = '/Users/jaewone/developer/tensorflow/baby-cry-classification/data'

# get wav file list
file_list = get_state_samples(DATA_DIR, n_extract=10)

# NOTE(review): stacking with np.array only yields a regular 3-D array when
# every file produces a feature matrix of the same shape -- confirm for new data.
features = np.array([extract_mel_spectrogram(file) for file in file_list])

# The label is the name of the directory that directly contains the file.
# Path handles the platform's separator instead of assuming '/'.
labels = np.array([class_map[Path(file).parent.name] for file in file_list])
print(features.shape)
print(labels.shape)



(70, 63, 80)
(70,)


## Model Definition

In [6]:

class AudioClassifier(nn.Module):
    """Fully connected classification head: input_dim -> 256 -> 128 -> num_classes.

    Each of the two hidden layers is followed by ReLU and dropout (p=0.3);
    the final layer returns raw, un-normalised class scores.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Map a batch of feature vectors to per-class scores."""
        hidden = x
        # Both hidden layers share the same activation + dropout treatment.
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(self.relu(layer(hidden)))
        return self.fc3(hidden)

class AudioClassificationModel(nn.Module):
    """Wraps an encoder and a classifier head into one module.

    The encoder is called as ``conformer(inputs, input_lengths)`` and must
    return a pair; only the first element is used. It is reduced over dim 1
    by both mean and max, the two results are concatenated along dim 1, and
    the concatenation is passed to ``classifier``.
    """

    def __init__(self, conformer, classifier):
        super().__init__()
        self.conformer = conformer
        self.classifier = classifier

    def forward(self, inputs, input_lengths):
        """Return the classifier's output for a batch of inputs."""
        # The second value returned by the encoder is not used here, so the
        # pooling below spans every position along dim 1.
        encoded, _unused = self.conformer(inputs, input_lengths)
        pooled = torch.cat(
            (encoded.mean(dim=1), encoded.max(dim=1).values),
            dim=1,
        )
        return self.classifier(pooled)


## Training

In [7]:

# Convert features and labels to torch tensors
features_tensor = torch.tensor(features, dtype=torch.float32)
labels_tensor = torch.tensor(labels, dtype=torch.long)
# One length per sample: the size of the first axis of its feature matrix
input_lengths_tensor = torch.tensor([f.shape[0] for f in features], dtype=torch.long)

# Create a DataLoader
dataset = TensorDataset(features_tensor, labels_tensor, input_lengths_tensor)

# Seed the split so the same samples land in train / val / test on every
# Restart & Run All; without a generator the partition changes per run and
# reported metrics are not comparable between runs.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [0.7, 0.2, 0.1], generator=split_generator
)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

49
14
7


In [8]:
# Load model
# Use the notebook's NUM_CLASSES constant rather than repeating the literal,
# so the label set and both model heads cannot disagree.
conformer_model = Conformer(num_classes=NUM_CLASSES)
# input_dim has to equal the width of the vector AudioClassificationModel
# feeds the classifier (mean-pooled and max-pooled encoder output concatenated).
# NOTE(review): 1024 presumably is 2 x the encoder output width -- confirm
# against the local conformer.model implementation.
classifier = AudioClassifier(input_dim=1024, num_classes=NUM_CLASSES)
audio_classification_model = AudioClassificationModel(conformer=conformer_model, classifier=classifier)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(audio_classification_model.parameters(), lr=0.001)

In [11]:
# Training loop
def run(audio_classification_model, criterion, num_epochs=1):
    """Train the model and validate it after every epoch.

    The function iterates over the notebook-level ``train_loader`` and
    ``val_loader`` and steps the notebook-level ``optimizer``; all three must
    be defined before it is called.

    Args:
        audio_classification_model: Module called as
            ``model(batch_features, batch_input_lengths)``.
        criterion: Loss called as ``criterion(outputs, batch_labels)``.
        num_epochs: Number of passes over ``train_loader``. Defaults to 1,
            the value that used to be fixed inside the function.

    Returns:
        The ``(audio_classification_model, criterion)`` pair that was passed in.
    """
    for epoch in range(num_epochs):
        epoch_tag = f"Epoch {epoch + 1}/{num_epochs}"

        # Training
        audio_classification_model.train()
        train_loss = 0.0
        # Wrapping the loader lets tqdm advance the bar once per batch.
        train_batches = tqdm(train_loader, desc=f"{epoch_tag} training  ", position=0)
        for batch_features, batch_labels, batch_input_lengths in train_batches:
            optimizer.zero_grad()
            outputs = audio_classification_model(batch_features, batch_input_lengths)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        audio_classification_model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        val_batches = tqdm(val_loader, desc=f"{epoch_tag} validation", position=0)
        with torch.no_grad():
            for batch_features, batch_labels, batch_input_lengths in val_batches:
                outputs = audio_classification_model(batch_features, batch_input_lengths)
                loss = criterion(outputs, batch_labels)
                val_loss += loss.item()
                # Predicted class = index of the largest score in each row
                _, predicted = outputs.max(1)
                total += batch_labels.size(0)
                correct += predicted.eq(batch_labels).sum().item()

        # Losses are averaged per batch, accuracy per sample.
        print(f"Train Loss: {train_loss/len(train_loader):.4f}, Validation Loss: {val_loss/len(val_loader):.4f}")
        print(f"Validation Accuracy: {100 * correct / total:.2f}%\n")

    return audio_classification_model, criterion

# Run the training/validation loop; run() returns the objects it was given,
# so this rebinds the same model and loss names.
audio_classification_model, criterion = run(audio_classification_model, criterion)

Epoch 1/1 training  : 100%|██████████| 25/25 [01:29<00:00,  3.58s/it]
Epoch 1/1 validation: 100%|██████████| 7/7 [00:10<00:00,  1.47s/it]

Train Loss: 2.0281, Validation Loss: 2.1312
Validation Accuracy: 7.14%






In [12]:
def evaluate_model(model, loader, criterion):
    """Score ``model`` on every batch of ``loader`` without tracking gradients.

    Args:
        model: Module called as ``model(features, input_lengths)``; it is put
            into eval mode first.
        loader: Iterable of ``(features, labels, input_lengths)`` batches that
            also supports ``len()``.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        ``(mean_loss, accuracy_percent)``: the per-batch average of the loss
        and the percentage of samples whose highest-scoring class equals the label.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    with torch.no_grad():
        for feats, targets, lengths in loader:
            scores = model(feats, lengths)
            loss_sum += criterion(scores, targets).item()
            guesses = scores.max(1).indices
            n_seen += targets.size(0)
            n_correct += guesses.eq(targets).sum().item()
    mean_loss = loss_sum / len(loader)
    accuracy_percent = 100 * n_correct / n_seen
    return mean_loss, accuracy_percent

# Evaluating the model on the test dataset (after training)
test_loss, test_accuracy = evaluate_model(audio_classification_model, test_loader, criterion)
# Report both metrics explicitly; a bare tuple expression in the middle of a
# cell is never displayed, so the stray one that was here has been dropped.
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.2f}%")

Test Loss: 2.0595
Test Accuracy: 14.29%


In [None]:
# def save_checkpoint(epoch, model, optimizer, path):
#     torch.save({
#         'epoch': epoch,
#         'model_state_dict': model.state_dict(),
#         'optimizer_state_dict': optimizer.state_dict(),
#     }, path)

# best_val_loss = float('inf')
# num_epochs = 10

# for epoch in range(num_epochs):
#     train_loss = train(model, train_data, criterion, optimizer, device)
#     val_loss = validate(model, val_data, criterion, device)

#     print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss}, Validation Loss: {val_loss}")

#     # Save a checkpoint: here we save only when the validation loss is lower than the previous best value.
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         save_checkpoint(epoch, model, optimizer, "/Users/jaewone/developer/tensorflow/baby-cry-classification/model/conformer2/best_model_checkpoint.pth")
