In [1]:
import pathlib
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

### Read audio file paths and labels

In [2]:
def read_label_txt(path: pathlib.Path) -> pd.DataFrame:
    """Read one label file into a DataFrame.

    The file is parsed as tab-separated text without a header row; the
    first three fields of each line become the columns ``onset``,
    ``offset`` and ``label``.

    Args:
        path: Location of the ``.txt`` label file.

    Returns:
        A DataFrame with one row per line of the file and the columns
        ``onset``, ``offset`` and ``label``.
    """
    colnames = ["onset", "offset", "label"]
    # index_col=False keeps pandas from using the first column as the index.
    return pd.read_csv(path, sep="\t", names=colnames, index_col=False)

In [3]:
# Directory that is scanned for .wav clips; every clip is assumed to have a
# label file with the same name and a .txt suffix next to it.
data_path = pathlib.Path("../scaper/soundscapes/train")

# iterdir() yields entries in arbitrary order, so sort the paths to keep the
# row order of everything derived from wav_paths identical across runs.
wav_paths = sorted(p for p in data_path.iterdir() if p.suffix == ".wav")
label_paths = [wav.with_suffix(".txt") for wav in wav_paths]

# One list of label strings per clip, in the same order as wav_paths.
labels = [read_label_txt(txt)["label"].to_list() for txt in tqdm(label_paths)]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [4]:
# Encoder that turns each clip's list of label strings into a multi-hot row.
mlb = MultiLabelBinarizer()

In [5]:
# Learn the class vocabulary from all clips and encode them in one step:
# one row per clip, one column per class.
label_array = mlb.fit_transform(labels)

In [6]:
# Number of classes = number of columns in the encoded label matrix.
n_classes = label_array.shape[1]
n_classes

5

In [7]:
# Column order of label_array: column i corresponds to mlb.classes_[i].
print(mlb.classes_)

['Alarm_bell_ringing' 'Cat' 'Dishes' 'Dog' 'Electric_shaver_toothbrush']


In [8]:
# Per-clip metadata table: the clip's file name and its encoded label row.
df = pd.DataFrame(
    {
        "relative_path": [wav.name for wav in wav_paths],
        # Iterating the 2-D label matrix yields one row per clip.
        "class_ids": list(label_array),
    }
)

### Network

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim 
from torch.utils.data import random_split

In [10]:
# Stop here if PyTorch cannot see a CUDA device.
assert torch.cuda.is_available()

In [11]:
from utils import SoundDS

In [12]:
# Build the dataset from the metadata table and the audio directory.
# NOTE(review): SoundDS lives in the local utils module, which is not shown
# here — presumably it loads data_path / relative_path and pairs it with
# class_ids; confirm against utils.py.
myds = SoundDS(df, data_path)

In [13]:
# Train / Val Split

# Fraction of items used for training; the remainder is held out for validation.
TRAIN_FRACTION = 0.8
# Fixed seed so the same items land in each split on every run.
SPLIT_SEED = 0

num_items = len(myds)
num_train = round(num_items * TRAIN_FRACTION)
num_val = num_items - num_train

# Without an explicit generator, random_split draws from the global RNG and the
# split changes between runs.
train_ds, val_ds = random_split(
    myds,
    [num_train, num_val],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [49]:
# NOTE(review): labels_in is not assigned in any cell visible in this part of
# the notebook, and the execution count jumps from 13 to 49 — confirm this cell
# still works after Restart Kernel -> Run All.
labels_in

Unnamed: 0,onset,offset,label
0,3.087113,3.325502,Dog
1,5.196833,9.30278,Dog


AudioClassifier(
  (conv1): Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
  (relu1): ReLU()
  (bn1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu2): ReLU()
  (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu3): ReLU()
  (bn3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu4): ReLU()
  (bn4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (ap): AdaptiveAvgPool2d(output_size=1)
  (lin): Linear(in_features=64, out_features=5, bias=True)
  (conv): Sequential(
    (0): Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (1): ReLU()
    (2): BatchNorm2d(8, eps=1e-05, momen

In [57]:
# NOTE(review): pred is not assigned in any cell visible here — confirm it is
# defined by an earlier cell on a fresh run. The displayed values look like raw
# linear-layer outputs rather than probabilities; verify before thresholding.
pred

tensor([[ 191.4008,   38.6748,  -44.3797,   64.2089, 1500.4611]],
       device='cuda:0', grad_fn=<AddmmBackward>)