In [1]:
import models
import torchaudio
import pathlib
import pandas as pd
import librosa
import numpy as np
import utils
import importlib
import torch
import torch.nn as nn
from torch.utils.data import random_split
import wandb

In [2]:
from torch.utils.data import Dataset

In [3]:
classes = ['Alarm_bell_ringing', 'Cat', 'Dishes', 'Dog', 'Electric_shaver_toothbrush']
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
config = {
    "sample_rate": 22000,
    "n_mels": 64,
    "n_fft": 1024, 
    "hop_len": None,
    "top_db": 80,
    "n_mfcc": 64,
    "classes": classes,
    "data_path": "../scaper/soundscapes/train",
    "duration": 10000
}

In [5]:
model = models.Cnn14(**config)

In [6]:
model.to(device)

Cnn14(
  (logmel_extractor): MelSpectrogram(
    (spectrogram): Spectrogram()
    (mel_scale): MelScale()
  )
  (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_block1): ConvBlock(
    (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv_block2): ConvBlock(
    (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (co

In [7]:
importlib.reload(utils)

<module 'utils' from '/home/lux_t1/Desktop/endo_audio_scaper/network/utils.py'>

In [8]:
df, classes = utils.get_dataset_dataframe(config["data_path"])

  0%|          | 0/10000 [00:00<?, ?it/s]

In [9]:
df.head()

Unnamed: 0,relative_path,class_ids
0,soundscape_unimodal4241.wav,"[0, 1, 1, 0, 1]"
1,soundscape_unimodal1544.wav,"[0, 0, 1, 1, 0]"
2,soundscape_unimodal5168.wav,"[0, 0, 0, 1, 1]"
3,soundscape_unimodal5422.wav,"[0, 0, 1, 1, 0]"
4,soundscape_unimodal3289.wav,"[0, 0, 0, 0, 0]"


In [10]:
config["classes"] = classes
classes

array(['Alarm_bell_ringing', 'Cat', 'Dishes', 'Dog',
       'Electric_shaver_toothbrush'], dtype=object)

In [11]:
class SoundDataSet(Dataset):
    def __init__(self, df, data_path, duration, sample_rate, top_db, n_mels, hop_len, n_fft,**kwargs):
        self.df = df
        self.data_path = pathlib.Path(data_path)
        self.sr = sample_rate
        self.duration = duration,
        self.top_db = top_db
        self.n_mels = n_mels
        self.hop_len = hop_len
        self.n_fft = n_fft
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.loc[idx, :]
        audiofile_path = self.data_path.joinpath(row["relative_path"])
        class_ids = row["class_ids"]
        
        sig, sr = torchaudio.load(audiofile_path)
        if sr is not self.sr:
            sig, sr = utils.resample((sig, sr), self.sr)
        
#         sig = utils.get_melspec(sig, self.sr, self.n_fft, self.hop_len, self.n_mels, self.top_db)
        return sig, class_ids
        

In [12]:
dataset = SoundDataSet(df, **config)
# Train / Val Split

num_items = len(dataset)
print(num_items)
num_train = round(num_items * 0.9)
num_val = num_items - num_train
batch_size = 100

train_ds, val_ds = random_split(dataset, [num_train, num_val])
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=batch_size, shuffle=False)

10000


In [14]:
def evaluate(model, dl, classes, criterion):
    n_classes = len(classes)
    running_loss = 0.0
    correct_prediction = np.zeros((n_classes))
    total_prediction = 0

    # Repeat for each batch in the training set
    for i, data in enumerate(dl):
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # forward + backward + optimize
        output_dict = model(inputs)
        outputs = output_dict["clipwise_output"]
        loss = criterion(outputs, labels)

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()
        outputs[outputs>0.5] = 1
        outputs[outputs<=0.5] = 0
        prediction = outputs

        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum(axis = 0).cpu().numpy()
        total_prediction += prediction.shape[0]

    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/(total_prediction*n_classes)
    acc_dict = {classes[j]: _ for j, _ in enumerate(acc)}
    return avg_loss, acc_dict

In [15]:
def training(model, train_dl, num_epochs, classes):
    wandb.init(project='endomic', entity='maddonix')
    config = wandb.config
    
    criterion = nn.MultiLabelMarginLoss() # nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')
    n_classes = len(classes)

    # Repeat for each epoch
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_prediction = np.zeros((n_classes))
        total_prediction = 0

        # Repeat for each batch in the training set
        for i, data in enumerate(train_dl):
            model.train()
            # Get the input features and target labels, and put them on the GPU
            inputs, labels = data[0].to(device), data[1].to(device)

            # Normalize the inputs
#             inputs_m, inputs_s = inputs.mean(), inputs.std()
#             inputs = (inputs - inputs_m) / inputs_s

            # Zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            output_dict = model(inputs)
            outputs = output_dict["clipwise_output"]
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Keep stats for Loss and Accuracy
            running_loss += loss.item()

            # Get the predicted classes with threshold > 0.5
            outputs[outputs>0.5] = 1
            outputs[outputs<=0.5] = 0
            prediction = outputs
            
            # Count of predictions that matched the target label
            correct_prediction += (prediction == labels).sum(axis = 0).cpu().numpy()
            total_prediction += prediction.shape[0]

#             if i % 10 == 0 and i > 0:    # print every 10 mini-batches
#                 print(f'Epoch: {epoch}, iteration: {i} loss: {running_loss / i}')

        # Print stats at the end of the epoch
        num_batches = len(train_dl)
        avg_loss = running_loss / num_batches
        acc = correct_prediction/(total_prediction*n_classes)
        acc_dict ={_: acc[i] for i, _ in enumerate(classes)}
        model.eval()
        val_loss, val_acc_dict = evaluate(model, val_dl, classes, criterion)
        wandb.log({
            "train_loss": loss,
            "train_acc": acc_dict,
            "val_loss": val_loss,
            "val_acc": val_acc_dict
        })        
        
        print(f'Epoch: {epoch}')
        print(f'Train: Loss: {avg_loss}, Accuracy: {acc}')
        print(f'Val: Loss: {val_loss}, Accuracy: {val_acc_dict}')
        

    print('Finished Training')

In [16]:
num_epochs = 200
training(model, train_dl, num_epochs, config["classes"])

[34m[1mwandb[0m: Currently logged in as: [33mmaddonix[0m (use `wandb login --relogin` to force relogin)


Epoch: 0
Train: Loss: 0.1358211074852281, Accuracy: [0.0678     0.10911111 0.13426667 0.13408889 0.13468889]
Val: Loss: 0.013339154008362029, Accuracy: {'Alarm_bell_ringing': 0.0666, 'Cat': 0.1192, 'Dishes': 0.1342, 'Dog': 0.1318, 'Electric_shaver_toothbrush': 0.134}
Epoch: 1
Train: Loss: 0.05559374899086025, Accuracy: [0.06715556 0.10706667 0.13422222 0.13408889 0.13471111]
Val: Loss: 0.006723395362496376, Accuracy: {'Alarm_bell_ringing': 0.0666, 'Cat': 0.1136, 'Dishes': 0.1342, 'Dog': 0.1318, 'Electric_shaver_toothbrush': 0.134}
Epoch: 2
Train: Loss: 0.05591534619323081, Accuracy: [0.06715556 0.10677778 0.13422222 0.13408889 0.13471111]
Val: Loss: 0.00793910428053803, Accuracy: {'Alarm_bell_ringing': 0.0666, 'Cat': 0.115, 'Dishes': 0.1342, 'Dog': 0.1318, 'Electric_shaver_toothbrush': 0.134}
Epoch: 3
Train: Loss: 0.04590388112184074, Accuracy: [0.06715556 0.10804444 0.13422222 0.13408889 0.13471111]
Val: Loss: 0.005208510677847597, Accuracy: {'Alarm_bell_ringing': 0.0666, 'Cat': 0.110

In [17]:
# save
model_path = pathlib.Path("cnn14_audio.pth")
torch.save(model.state_dict(), model_path)

In [27]:
model.eval()
for i, data in enumerate(val_dl):
    inputs, labels = data[0].to(device), data[1].to(device)
    break


In [19]:
# logmel_extractor = torchaudio.transforms.MelSpectrogram(
#     config["sample_rate"],
#     config["n_fft"],
#     hop_length = config["hop_len"],
#     n_mels = config["n_mels"],
#     normalized = True
# )
# bn0 = nn.BatchNorm2d(64)

In [28]:
output_dict = model(inputs)

In [30]:
loss = (output_dict["clipwise_output"]- labels).sum()

In [32]:
loss.backward()

In [35]:
for param in model.parameters():
    print(param)

Parameter containing:
tensor([0.9885, 1.0034, 1.0126, 1.0108, 0.9099, 0.9433, 0.9824, 0.9244, 0.9922,
        1.0368, 0.9944, 1.0263, 1.0067, 0.9824, 0.9936, 1.0126, 1.0154, 1.0127,
        1.0166, 1.0634, 1.0563, 1.0834, 0.9918, 1.0364, 0.9727, 0.9358, 1.0059,
        1.0342, 1.0105, 1.0129, 1.0325, 1.0432, 0.9517, 0.9811, 1.0207, 1.0835,
        1.0206, 1.0259, 1.0557, 1.0266, 1.0111, 1.0342, 1.0569, 1.0348, 1.0227,
        1.0070, 0.9901, 0.9703, 0.9210, 0.9680, 0.9717, 1.0249, 1.0262, 0.9832,
        0.9867, 0.9538, 1.0152, 0.9223, 0.9607, 0.9931, 1.0020, 0.9661, 0.9546,
        1.0457], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([ 0.0117,  0.0007,  0.0250,  0.0191, -0.0565, -0.0261,  0.0051, -0.0154,
         0.0031,  0.0340,  0.0400,  0.0670,  0.0354,  0.0188,  0.0479,  0.0284,
         0.0682,  0.0476,  0.0412,  0.0600,  0.0498,  0.0740, -0.0051, -0.0065,
        -0.0104, -0.0508, -0.0330,  0.0033,  0.0079,  0.0238, -0.0256,  0.0021,
         0.0254,  0.01

In [26]:
pred = model(batch_data.data)

RuntimeError: stft input and window must be on the same device but got self on cpu and window on cuda:0