In [1]:
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt


In [2]:
import torch
import torchaudio
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from PIL import Image
import random
#from plot_audio import plot_specgram, plot_waveform
os.getcwd()

  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



'/Users/jansta/learn/acoustics'

In [3]:
# Location of the pre-computed feature dictionary. Defaults to the original
# absolute path; set the ACOUSTICS_DICT_MATS environment variable to point at a
# copy elsewhere instead of editing the notebook.
DICT_MATS_PATH = os.environ.get(
    'ACOUSTICS_DICT_MATS', '/Users/jansta/learn/acoustics/dict_mats.npy'
)

# SECURITY NOTE: allow_pickle=True unpickles arbitrary Python objects, which can
# execute code while loading. Only load files you produced yourself or trust.
# .item() extracts the single stored object, which the cells below use as a
# nested dict (dict_mats['A'][label][index]).
dict_mats = np.load(DICT_MATS_PATH, allow_pickle=True).item()


In [4]:
# t = dict_mats['A']['can_opening'][2]


# print(t.shape)
# t2 = t / t.max()

# print(np.max(t), np.min(t), np.mean(t), np.std(t))
# #plt.plot(t)
# plt.hist(t, bins=100)




In [5]:
# Bare expression in the middle of a cell: it is evaluated, but its value is
# neither stored nor displayed.
len(dict_mats['A']['can_opening'][3])

# The class names are the keys of the 'A' sub-dictionary, in insertion order.
all_labels = [*dict_mats['A']]
print(all_labels)

['dog', 'chirping_birds', 'vacuum_cleaner', 'thunderstorm', 'door_wood_knock', 'can_opening', 'crow', 'clapping', 'fireworks', 'chainsaw', 'airplane', 'mouse_click', 'pouring_water', 'train', 'sheep', 'water_drops', 'church_bells', 'clock_alarm', 'keyboard_typing', 'wind', 'footsteps', 'frog', 'cow', 'brushing_teeth', 'car_horn', 'crackling_fire', 'helicopter', 'drinking_sipping', 'rain', 'insects', 'laughing', 'hen', 'engine', 'breathing', 'crying_baby', 'hand_saw', 'coughing', 'glass_breaking', 'snoring', 'toilet_flush', 'pig', 'washing_machine', 'clock_tick', 'sneezing', 'rooster', 'sea_waves', 'siren', 'cat', 'door_wood_creaks', 'crickets']


In [6]:
# Keep the labels from position 24 onward and number them consecutively from 0.
chosen_labels = all_labels[24:]
encoded_labels = {label: index for index, label in enumerate(chosen_labels)}

In [7]:
class AudioDataset(Dataset):
    """In-memory dataset of (sample, integer label) pairs.

    Args:
        dict_mats: Mapping from label name to an indexable collection of samples.
        chosen_labels: Label names to keep; every other key of ``dict_mats`` is skipped.
        encoded_labels: Mapping from label name to its integer class id.
        transform: Optional callable applied to each sample tensor on access.
    """

    def __init__(self, dict_mats, chosen_labels, encoded_labels, transform=None):
        self.transform = transform

        samples, targets = [], []
        for name in dict_mats.keys():
            if name not in chosen_labels:
                continue
            clips = dict_mats[name]
            class_id = encoded_labels[name]
            for position in range(len(clips)):
                samples.append(clips[position])
                targets.append(class_id)

        # Stored as numpy arrays so items can be fetched by integer index.
        self.X = np.array(samples)
        self.y = np.array(targets)

    def __len__(self):
        """Number of (sample, label) pairs."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(sample, label)``: a float tensor with a leading channel axis and a long tensor."""
        # Prepend a channel axis, then convert to a float32 tensor.
        features = torch.FloatTensor(np.expand_dims(self.X[idx], axis=0))
        target = torch.tensor(self.y[idx], dtype=torch.long)

        if self.transform:
            features = self.transform(features)

        return features, target

In [8]:
# Preprocessing pipeline passed to AudioDataset: resize to 64 x 431, reduce to a
# single channel, then shift/scale with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize(size=(64, 431)),
    transforms.Grayscale(num_output_channels=1),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

In [9]:
# Create dataset with transform
dataset = AudioDataset(dict_mats['A'], chosen_labels, encoded_labels, transform=transform)

# Split dataset 80/20. A seeded generator makes the train/validation partition
# identical on every Restart & Run All, so metrics from different runs are
# computed on the same held-out samples.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Create dataloaders (only the training loader is shuffled)
batch_size = 4
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)



In [10]:
# Smoke-test the training loader: show the shape and labels of the first batch only.
for batch_number, (inputs, labels) in enumerate(train_loader, start=1):
    print(f"Batch {batch_number}:")
    print(f"Input batch size: {inputs.size()}")
    print(f"Labels: {labels}")
    print("-" * 30)
    break  # one batch is enough for the check

Batch 1:
Input batch size: torch.Size([4, 1, 64, 431])
Labels: tensor([ 0, 21,  4,  1])
------------------------------


Changes to accommodate the full dataset:
- Add more convolutional layers to capture more complex features.
- Add batch normalization layers to stabilize and accelerate training.
- Add dropout layers to prevent overfitting.
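- Use global average pooling before the fully connected layers to reduce the number of parameters. (Note: the model below does not do this yet — it flattens the final 256 × 4 × 26 feature map and feeds it to the first fully connected layer.)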
- Ensure the fully connected layers are appropriately sized for the increased complexity.

In [11]:
# Number of output classes: one per entry in chosen_labels.
n_classes = len(chosen_labels)

class AudioClassifNetBig(nn.Module):
    """Four-stage CNN classifier that outputs per-class log-probabilities.

    Each stage is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU -> MaxPool2d(2, 2),
    with 32, 64, 128 and 256 output channels. The flattened features pass through
    three linear layers (512 -> 256 -> n_classes) with dropout, ending in LogSoftmax.

    The first linear layer is sized for a 256 x 4 x 26 feature map, i.e. an input
    of shape (batch, 1, 64, 431).

    Args:
        n_classes: Number of output classes.
    """

    def __init__(self, n_classes) -> None:
        super().__init__()
        # Convolutional stages, each followed by batch normalization.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(256)
        # One pooling module is shared by all stages; it halves height and width.
        self.pool = nn.MaxPool2d(2, 2)
        # Dropout (p=0.5) is applied before each of the first two linear layers.
        self.dropout = nn.Dropout(0.5)
        self.flatten = nn.Flatten()
        # 256 channels * 4 * 26 spatial positions remain after four poolings.
        self.fc1 = nn.Linear(256 * 4 * 26, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, n_classes)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a (BS, 1, 64, 431) batch to (BS, n_classes) log-probabilities."""
        stages = (
            (self.conv1, self.bn1),  # -> (BS, 32, 32, 215) after pooling
            (self.conv2, self.bn2),  # -> (BS, 64, 16, 107)
            (self.conv3, self.bn3),  # -> (BS, 128, 8, 53)
            (self.conv4, self.bn4),  # -> (BS, 256, 4, 26)
        )
        for conv, norm in stages:
            x = self.pool(self.relu(norm(conv(x))))

        x = self.dropout(self.flatten(x))         # (BS, 256 * 4 * 26)
        x = self.dropout(self.relu(self.fc1(x)))  # (BS, 512)
        x = self.relu(self.fc2(x))                # (BS, 256)
        return self.softmax(self.fc3(x))          # (BS, n_classes)

# Create an instance of the model.
# The only network class defined in this notebook is AudioClassifNetBig, and its
# constructor requires the number of output classes.
model = AudioClassifNetBig(n_classes)

In [12]:
def check_for_nans(tensor, name):
    """Report whether ``tensor`` contains any NaN.

    Prints ``NaNs found in <name>`` when at least one element is NaN.

    Args:
        tensor: Tensor to inspect.
        name: Label used in the printed message.

    Returns:
        True if a NaN was found, otherwise False.
    """
    has_nan = bool(torch.isnan(tensor).any())
    if has_nan:
        print(f"NaNs found in {name}")
    return has_nan



In [13]:
## Create an instance of the model.
# AudioClassifNetBig is the network class defined in this notebook; it takes the
# number of output classes as its only argument.
model = AudioClassifNetBig(n_classes)

In [None]:
# Train a fresh AudioClassifNetBig on the training loader.
model = AudioClassifNetBig(n_classes)

# Loss function and optimizer.
# The network's forward() already ends in LogSoftmax, so the matching criterion
# is NLLLoss. CrossEntropyLoss expects raw logits and applies log-softmax itself;
# on inputs that are already log-probabilities that second pass is a no-op, so
# the loss values are unchanged by this switch.
loss_fn = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training parameters
losses_epoch_mean = []
NUM_EPOCHS = 8000
CLIP = 1.0  # Gradient clipping value (max gradient norm)

nan_detected = False
for epoch in range(NUM_EPOCHS):
    losses_epoch = []
    model.train()  # Enable dropout and batch-statistics BatchNorm

    for i, data in enumerate(train_loader):
        inputs, labels = data

        # Check for NaN in inputs
        if torch.isnan(inputs).any():
            print(f"NaN input at epoch {epoch}, batch {i}")
            nan_detected = True
            break

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)

        # Check for NaN in loss
        if torch.isnan(loss):
            print(f"NaN loss at epoch {epoch}, batch {i}")
            nan_detected = True
            break

        loss.backward()

        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=CLIP)

        optimizer.step()
        losses_epoch.append(loss.item())

    if nan_detected:
        # A bare `break` above only leaves the batch loop. Stop training
        # altogether instead of running the remaining epochs on bad data or a
        # diverged model; the aborted epoch is not recorded.
        break

    epoch_loss = np.mean(losses_epoch)
    losses_epoch_mean.append(epoch_loss)

    if epoch % 100 == 0:
        print(f'Epoch {epoch}/{NUM_EPOCHS}, Loss: {epoch_loss:.6f}')

# Mean training loss per completed epoch.
ax = sns.lineplot(x=list(range(len(losses_epoch_mean))), y=losses_epoch_mean)
ax.set(xlabel='Epoch', ylabel='Mean training loss', title='Training loss per epoch');

Epoch 0/8000, Loss: 3.632030
Epoch 100/8000, Loss: 0.471074
Epoch 200/8000, Loss: 0.398045
Epoch 300/8000, Loss: 0.493930
Epoch 400/8000, Loss: 0.236736
Epoch 500/8000, Loss: 0.494331
Epoch 600/8000, Loss: 0.229933
Epoch 700/8000, Loss: 0.397229
Epoch 800/8000, Loss: 0.471671
Epoch 900/8000, Loss: 0.092682
Epoch 1000/8000, Loss: 0.334093
Epoch 1100/8000, Loss: 0.257831
Epoch 1200/8000, Loss: 0.413166
Epoch 1300/8000, Loss: 0.088354
Epoch 1400/8000, Loss: 0.321582
Epoch 1500/8000, Loss: 0.051388
Epoch 1600/8000, Loss: 0.243373
Epoch 1700/8000, Loss: 0.159811
Epoch 1800/8000, Loss: 0.074493
Epoch 1900/8000, Loss: 0.030372


In [None]:
# Collect ground-truth labels and the model's per-class scores on the validation split.
y_val = []
y_val_hat = []

# Switch to inference mode: disables dropout and makes BatchNorm use its running
# statistics instead of the statistics of each small validation batch.
model.eval()

with torch.no_grad():
    for inputs, y_val_temp in val_loader:
        # Keep the raw log-probabilities. Rounding them (as a binary classifier
        # would) collapses nearby scores to the same value and creates ties
        # that make the later argmax pick the wrong class.
        y_val_hat_temp = model(inputs)

        y_val.extend(y_val_temp.numpy())
        y_val_hat.extend(y_val_hat_temp.numpy())  # one row of n_classes scores per sample

In [None]:

# Predicted class = index of the highest score in each row of y_val_hat.
y_val_pred = np.argmax(y_val_hat, axis=1)

# Accuracy
acc = accuracy_score(y_val, y_val_pred)
print(f'Accuracy: {acc*100:.2f} %')

# Confusion matrix. Passing labels= fixes one row/column per class id
# (0 .. len(chosen_labels) - 1) even when a class is absent from the validation
# split; without it the matrix shrinks and no longer lines up with the tick labels.
cm = confusion_matrix(y_val, y_val_pred, labels=list(range(len(chosen_labels))))
ax = sns.heatmap(cm, annot=True, xticklabels=chosen_labels, yticklabels=chosen_labels)
ax.set(xlabel='Predicted label', ylabel='True label', title='Validation confusion matrix');
