In [1]:
import torch
import torchaudio
import pytorch_lightning as pl
import torchmetrics

import torch.nn as nn
from torch.nn import functional as F



In [2]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

In [3]:
torch.cuda.is_available()

True

# 3 big blocks

1. loading data
2. model
3. train

## Step 1: Loading the data

In [4]:
datapath = Path(r'D:\000data\ESC-50-master')

In [5]:
datapath.exists()

True

convert audio to image

In [6]:
csv = pd.read_csv(datapath / Path('meta/esc50.csv'))

In [7]:
csv

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A
...,...,...,...,...,...,...,...
1995,5-263831-B-6.wav,5,6,hen,False,263831,B
1996,5-263902-A-36.wav,5,36,vacuum_cleaner,False,263902,A
1997,5-51149-A-25.wav,5,25,footsteps,False,51149,A
1998,5-61635-A-8.wav,5,8,sheep,False,61635,A


In [8]:
csv[csv['fold'].isin([1,2])]

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A
...,...,...,...,...,...,...,...
795,2-99795-A-32.wav,2,32,keyboard_typing,False,99795,A
796,2-99796-A-32.wav,2,32,keyboard_typing,False,99796,A
797,2-99955-A-7.wav,2,7,insects,False,99955,A
798,2-99955-B-7.wav,2,7,insects,False,99955,B


In [9]:
csv.iloc[0,0]

'1-100032-A-0.wav'

In [10]:
x, sr = torchaudio.load(datapath / 'audio' / csv.iloc[0,0], normalize=True)

In [11]:
plt.plot(x[0, ::5])

[<matplotlib.lines.Line2D at 0x28bab82cd88>]

In [12]:
x.unique().shape

torch.Size([7217])

In [13]:
sr

44100

In [14]:
x.shape

torch.Size([1, 220500])

In [15]:
torchaudio.transforms.Resample(orig_freq=sr, new_freq=8000)(x).shape

torch.Size([1, 40000])

In [16]:
h = torchaudio.transforms.MelSpectrogram(sample_rate=sr)(x)

  "At least one mel filterbank has all zero values. "


In [17]:
h

tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])

In [18]:
h.shape

torch.Size([1, 128, 1103])

In [19]:
h = torchaudio.transforms.AmplitudeToDB()(h)

In [20]:
plt.figure(figsize=(25,40))
plt.imshow(h[0])
# 纵轴是frequency content

<matplotlib.image.AxesImage at 0x28bb58a34c8>

In [21]:
class ESC50Dataset(torch.utils.data.Dataset):
    def __init__(self, path: Path = Path(r'D:\000data\ESC-50-master'),
                sample_rate: int = 8000,
                folds = [0]):
        self.path = path
        csv = pd.read_csv(path / Path('meta/esc50.csv'))
        self.csv = csv[csv['fold'].isin(folds)]
        self.resample = torchaudio.transforms.Resample(orig_freq=44100, new_freq=sample_rate)
        self.mels = x = torchaudio.transforms.MelSpectrogram(sample_rate=44100)
        self.amp = torchaudio.transforms.AmplitudeToDB()
        
        
    def __getitem__(self, index):
        x, _ = torchaudio.load(self.path / 'audio' / self.csv.iloc[index, 0], normalize=True)
        
        x = self.resample(x)
        x = self.mels(x)
        x = self.amp(x)
        
        y = self.csv.iloc[index, 2]
        
        return x, y
        
    def __len__(self):
        return len(self.csv)

In [22]:
dataset = ESC50Dataset(folds=[1])

In [23]:
for x, y in dataset:
    print(x,y)
    break

tensor([[[-100., -100., -100.,  ..., -100., -100., -100.],
         [-100., -100., -100.,  ..., -100., -100., -100.],
         [-100., -100., -100.,  ..., -100., -100., -100.],
         ...,
         [-100., -100., -100.,  ..., -100., -100., -100.],
         [-100., -100., -100.,  ..., -100., -100., -100.],
         [-100., -100., -100.,  ..., -100., -100., -100.]]]) 0


In [24]:
x,y = dataset[0]

In [25]:
plt.imshow(x[0])

<matplotlib.image.AxesImage at 0x28bb5bccb88>

In [26]:
y

0

In [27]:
train_data = ESC50Dataset(folds=[1])
val_data = ESC50Dataset(folds=[2])
test_data = ESC50Dataset(folds=[3])

In [28]:
train_loader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=8)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=8)

In [29]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x28bb58f4908>

## Step 2: Building the model

In [47]:
class AudioNet(pl.LightningModule):
    
    def __init__(self, n_classes = 50, base_filters = 32):
        super().__init__()
        self.conv1 = nn.Conv2d(1, base_filters, 11, padding=5)
        self.bn1 = nn.BatchNorm2d(base_filters)
        self.conv2 = nn.Conv2d(base_filters, base_filters, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(base_filters)
        self.pool1 = nn.MaxPool2d(2)
        self.conv3 = nn.Conv2d(base_filters, base_filters * 2, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(base_filters * 2)
        self.conv4 = nn.Conv2d(base_filters * 2, base_filters * 4, 3, padding=1)
        self.bn4 = nn.BatchNorm2d(base_filters * 4)
        self.pool2 = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(base_filters * 4, n_classes)
        
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool1(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool2(x)
        x = F.adaptive_avg_pool2d(x, (1, 1))
        x = self.fc1(x[:, :, 0, 0])
        return x
    
    def training_step(self, batch, batch_idx):
        # Very simple training loop
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        self.log('train_loss', loss, on_step=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        y_hat = torch.argmax(y_hat, dim=1)
        acc = torchmetrics.functional.accuracy(y_hat, y)
        self.log('val_acc', acc, on_epoch=True, prog_bar=True)
        return acc
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        y_hat = torch.argmax(y_hat, dim=1)
        acc = torchmetrics.functional.accuracy(y_hat, y)
        self.log('test_acc', acc, on_epoch=True, prog_bar=True)
        return acc
        
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [48]:
xb, yb = next(iter(train_loader))

In [49]:
xb.shape

torch.Size([8, 1, 128, 201])

In [50]:
audionet = AudioNet()

In [51]:
audionet(xb).shape

torch.Size([8, 50])

In [52]:
pl.seed_everything(0)

Global seed set to 0


0

In [53]:
trainer = pl.Trainer(gpus=1, max_epochs=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [54]:
trainer.fit(audionet, train_loader, val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name  | Type        | Params
---------------------------------------
0  | conv1 | Conv2d      | 3.9 K 
1  | bn1   | BatchNorm2d | 64    
2  | conv2 | Conv2d      | 9.2 K 
3  | bn2   | BatchNorm2d | 64    
4  | pool1 | MaxPool2d   | 0     
5  | conv3 | Conv2d      | 18.5 K
6  | bn3   | BatchNorm2d | 128   
7  | conv4 | Conv2d      | 73.9 K
8  | bn4   | BatchNorm2d | 256   
9  | pool2 | MaxPool2d   | 0     
10 | fc1   | Linear      | 6.5 K 
---------------------------------------
112 K     Trainable params
0         Non-trainable params
112 K     Total params
0.450     Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 0




  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [55]:
trainer.test(audionet, test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.042500000447034836}
--------------------------------------------------------------------------------



[{'test_acc': 0.042500000447034836}]