In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.transforms import Compose, ToTensor
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.io import wavfile
import matplotlib.pyplot as plt
import librosa
import os
import sys


In [22]:
torch.cuda.is_available()

False

In [2]:
main_path = '/Users/jaewone/developer/tensorflow/baby-cry-classification'
data_path = os.path.join(main_path, 'data')

work_path = os.path.join(main_path, 'model', 'coAtNet')
AUDIO_DIR = os.path.join(work_path, 'audios')
MODEL_PATH = os.path.join(work_path, 'model.pt')

In [3]:
sys.path.append(main_path)

In [4]:
from trans_data import extract_state_sample, get_state_file_list

In [17]:
if not os.path.exists(AUDIO_DIR):
    extract_state_sample(data_path, AUDIO_DIR, 10, with_dir=False)

In [18]:
class CoAtNet(nn.Module):
    def __init__(self, num_classes=36):
        super(CoAtNet, self).__init__()

        # Convolutional part
        self.conv_layers = nn.Sequential(
            # in_channels = 1 : Number of channels in the input image
            # out_channels = 32 : Number of channels produced by the convolution
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # Transformer part
        encoder_layer = nn.TransformerEncoderLayer(d_model=32, nhead=8)
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=2)

        # Linear classifier
        self.fc = nn.Linear(32, num_classes)

    def forward(self, x):
        x = self.conv_layers(x)

        # Flattening
        x = x.view(x.size(0), -1, x.size(1))

        # Transformer encoder
        x = self.transformer_encoder(x)

        # Max pooling over time
        x, _ = torch.max(x, dim=1)

        # Classifier
        x = self.fc(x)
        return x


In [19]:
file_list = os.listdir(AUDIO_DIR)

In [21]:
class ToMelSpectrogram:
    def __call__(self, samples):
        # print(samples.shape) # (16000,)
        return librosa.feature.melspectrogram(y=samples, sr=16000, n_mels=64, hop_length=225)


# This class is to load audio data and apply the transformation
class AudioDataset(torch.utils.data.Dataset):
    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.file_list = os.listdir(self.data_dir)

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        waveform, _ = librosa.load(os.path.join(self.data_dir, self.file_list[idx]),
                                   sr=None,
                                   duration=1.0,
                                   mono=True)

        # Assuming the file name is 'label_otherInfo.wav'
        label = self.file_list[idx].split("_")[0]

        if self.transform:
            waveform = self.transform(waveform)

        return waveform, label


def train():
    # We will use the transformation to convert the audio into Mel spectrogram
    transform = Compose([ToMelSpectrogram(), ToTensor()])

    dataset = AudioDataset(AUDIO_DIR, transform=transform)
    train_set, val_set = train_test_split(
        dataset, test_size=0.2, stratify=[file.rsplit('_', 1)[0] for file in os.listdir(AUDIO_DIR)])
    train_loader = DataLoader(dataset=train_set, batch_size=16, shuffle=True)
    val_loader = DataLoader(dataset=val_set, batch_size=16, shuffle=True)

    # Assuming we have this class implemented following the paper or using a library
    model = CoAtNet()
    model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    criterion = nn.CrossEntropyLoss().cuda()  

    num_epochs = 500

    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            inputs = inputs.cuda()
            labels = labels.cuda()

            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward and optimize
            loss.backward()
            optimizer.step()

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}")

        # Validation
        if (epoch + 1) % 5 == 0:
            model.eval()
            with torch.no_grad():
                correct = 0
                total = 0
                for inputs, labels in val_loader:
                    inputs = inputs.cuda()
                    labels = labels.cuda()
                    outputs = model(inputs)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

                print(f"Validation Accuracy: {correct/total}")

    torch.save(model.state_dict(), MODEL_PATH)

train()

AssertionError: Torch not compiled with CUDA enabled

In [None]:
# # The following class help transform our input into mel-spectrogram
# class ToMelSpectrogram:
#     def __call__(self, samples):
#         # return librosa.feature.melspectrogram(samples, n_mels=64, length=1024, hop_length=225)
#         return librosa.feature.melspectrogram(y=samples, sr=16000, n_mels=64, hop_length=225)


# # This class is to load audio data and apply the transformation
# class AudioDataset(torch.utils.data.Dataset):
#     def __init__(self, file_list, transform=None):
#         """
#         file_list : wav file path list
#         transform : transformer model
#         """
#         self.file_list = file_list
#         self.transform = transform

#     def __len__(self):
#         return len(self.file_list)

#     def __getitem__(self, idx):
#         waveform, _ = librosa.load(self.file_list[idx],
#                                    sr=None,
#                                    duration=1.0,
#                                    mono=True)

#         # Assuming the file name is 'label/label_randNum.wav'
#         label = self.file_list[idx].rsplit('/', 2)[1]

#         if self.transform:
#             waveform = self.transform(waveform)

#         return waveform, label


# def train():
#     # We will use the transformation to convert the audio into Mel spectrogram
#     transform = Compose([ToMelSpectrogram(), ToTensor()])

#     dataset = AudioDataset(file_list, transform=transform)
#     train_set, val_set = train_test_split(
#         dataset, test_size=0.2, stratify=[file.rsplit('_', 1)[0] for file in os.listdir(os.path.join(AUDIO_DIR, 'hungry'))])
#     train_loader = DataLoader(dataset=train_set, batch_size=16, shuffle=True)
#     val_loader = DataLoader(dataset=val_set, batch_size=16, shuffle=True)

#     print(len(train_set))
#     print(len(val_set))
#     return

#     # Assuming we have this class implemented following the paper or using a library
#     model = CoAtNet()
#     model = model.cuda()
#     optimizer = optim.Adam(model.parameters(), lr=5e-4)
#     criterion = nn.CrossEntropyLoss()

#     num_epochs = 500

#     for epoch in range(num_epochs):
#         model.train()
#         for inputs, labels in train_loader:
#             inputs = inputs.cuda()
#             labels = labels.cuda()

#             optimizer.zero_grad()

#             # Forward pass
#             outputs = model(inputs)
#             loss = criterion(outputs, labels)

#             # Backward and optimize
#             loss.backward()
#             optimizer.step()

#         print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}")

#         # Validation
#         if (epoch + 1) % 5 == 0:
#             model.eval()
#             with torch.no_grad():
#                 correct = 0
#                 total = 0
#                 for inputs, labels in val_loader:
#                     inputs = inputs.cuda()
#                     labels = labels.cuda()
#                     outputs = model(inputs)
#                     _, predicted = torch.max(outputs.data, 1)
#                     total += labels.size(0)
#                     correct += (predicted == labels).sum().item()

#                 print(f"Validation Accuracy: {correct/total}")

#     torch.save(model.state_dict(), MODEL_PATH)


# train()