### 1. Simplest Deep Neural Network

In [1]:
import torch
import torch.nn as nn

class DNN1(nn.Module):
    """Fully connected classifier with two 512-unit ReLU hidden layers.

    Args:
        input_size: Size of the last dimension of the input (number of features).
        num_labels: Number of output classes; one raw score is produced per class
            (no softmax is applied here).
    """

    def __init__(self, input_size: int, num_labels: int):
        super().__init__()
        hidden_size = 512
        # The layer order fixes the Sequential indices (0..4) and therefore the
        # state_dict keys, so it must stay as is.
        layers = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, num_labels),
        ]
        self.classifier = nn.Sequential(*layers)

    def forward(self, input):
        """Run ``input`` through the classifier and return the class scores."""
        return self.classifier(input)

In [2]:
import numpy as np
import pandas as pd
from Extract_feature import extract_mfcc

# Root folder of the audio set; file names from dev.csv are appended to it.
datapath = "/data/lujd/algorithm2022/audioset/"
dev_df = pd.read_csv(datapath+"dev.csv", sep="\t")   # tab-separated file index

# Development set: one flattened feature row per audio file.
n_mels = 26
feature_option = "mfcc"
dev_files = list(dev_df["filename"])
dev_labels = list(dev_df["scene_label"])

# Each extracted feature matrix is reshaped to [1, frames*n_features] and the
# rows are stacked along axis 0.
dev_features = np.concatenate(
    [
        extract_mfcc(datapath + name, n_mels=n_mels, option=feature_option).reshape(1, -1)
        for name in dev_files
    ],
    axis=0,
)
print(f"dev feature size: {dev_features.shape}")

dev feature size: (379, 6487)


In [3]:
unique_labels = ['airport', 'bus', 'metro', 'metro_station',
                'park', 'public_square', 'shopping_mall',
                'street_pedestrian', 'street_traffic', 'tram']

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# The model is freshly initialised (untrained); this cell only checks that
# batches of dev features flow through it and produce one score per label.
model = DNN1(dev_features.shape[1], len(unique_labels)).to(device)
model.eval()

batch_size = 128
num_samples = len(dev_features)
with torch.no_grad():
    # Step through the array in strides of batch_size; the final batch is
    # clipped to the end of the data and may be smaller.
    for start_pos in range(0, num_samples, batch_size):
        end_pos = min(start_pos + batch_size, num_samples)
        # Named batch_inputs (not `input`) so the builtin input() is not shadowed.
        batch_inputs = torch.as_tensor(
            dev_features[start_pos:end_pos], dtype=torch.float32, device=device
        )
        print(start_pos, end_pos)
        # `output` is overwritten each iteration: after the loop it holds the
        # scores of the LAST batch only.
        output = model(batch_inputs)
        print(f"output size: {output.shape}")

0 128
output size: torch.Size([128, 10])
128 256
output size: torch.Size([128, 10])
256 379
output size: torch.Size([123, 10])


In [4]:
output[:5]

tensor([[ 0.0091, -0.0346,  0.1381, -0.4813,  0.0720,  0.0573,  0.3161, -0.2161,
          0.0751, -0.2733],
        [ 0.0067, -0.0158,  0.1083, -0.2737,  0.0704,  0.0213,  0.1434, -0.1082,
          0.0572, -0.1698],
        [ 0.0231, -0.0075,  0.1276, -0.3178,  0.0343,  0.0272,  0.1563, -0.1500,
          0.0561, -0.1808],
        [ 0.0095,  0.0069,  0.0847, -0.2723,  0.0374,  0.0187,  0.1507, -0.1352,
          0.0151, -0.1853],
        [ 0.0011, -0.0277,  0.1109, -0.2673,  0.0473,  0.0580,  0.1613, -0.1307,
          0.0427, -0.1590]], device='cuda:0', grad_fn=<SliceBackward>)

In [5]:
# Predicted class per row: index of the largest score along dim 1.
pre_labels = torch.argmax(output, dim=1)
pre_labels[:5]

tensor([6, 6, 6, 6, 6], device='cuda:0')

In [None]:
import torch.nn as nn
import torch.optim as optim

# Cross-entropy loss and a plain SGD optimizer over the parameters of `model`.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0005)

In [12]:
# Standalone CrossEntropyLoss example on random data.
# NOTE: this cell rebinds the notebook-level names `input` and `output`.
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)           # [3, 5] random scores
target = torch.empty(3, dtype=torch.long).random_(5)    # [3] random class indices in 0..4
output = loss(input, target)
output

tensor(2.7730, grad_fn=<NllLossBackward>)

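Process the data for training the DNN: wrap feature extraction in a `Dataset` so that a `DataLoader` can serve shuffled mini-batches.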

In [24]:
from Extract_feature import extract_mfcc

datapath = "/data/lujd/algorithm2022/audioset/"

class Audio_Dataset(torch.utils.data.Dataset):
    """In-memory dataset of flattened audio features and integer scene labels.

    Features for every file are extracted eagerly in ``__init__`` and stored as
    one 2-D array with one row per file.

    Args:
        audio_df: DataFrame providing ``filename`` and ``scene_label`` columns.
        label_dict: Mapping from scene-label string to integer class index.
        feature_type: ``"mfcc"`` or ``"fbanks"``; forwarded to ``extract_mfcc``
            as its ``option`` argument.

    Raises:
        ValueError: If ``feature_type`` is not one of the supported options.
    """

    SUPPORTED_FEATURES = ("mfcc", "fbanks")

    def __init__(self, audio_df, label_dict, feature_type):
        super().__init__()
        # Fail fast: previously an unsupported type left `feature` unbound and
        # surfaced as an UnboundLocalError inside the extraction loop.
        if feature_type not in self.SUPPORTED_FEATURES:
            raise ValueError(
                f"unsupported feature_type {feature_type!r}; "
                f"expected one of {self.SUPPORTED_FEATURES}"
            )

        self.file_list = audio_df.filename.to_list()
        self.label_list = audio_df.scene_label.to_list()
        self.label_dict = label_dict

        feature_list = []
        for filename in self.file_list:
            # File names are relative to the notebook-level `datapath`.
            wav_file_path = datapath + filename
            # n_mels is not passed here, so extract_mfcc uses its own default.
            feature = extract_mfcc(wav_file_path, option=feature_type)
            feature_list.append(feature.reshape(1, -1))    # [1, frames*n_features] (flatten)
        self.feature_array = np.concatenate(feature_list, axis=0)
        print(self.feature_array.shape)

    def __getitem__(self, index):
        """Return ``(feature_row, class_index)`` for the sample at ``index``."""
        feature, label = self.feature_array[index], self.label_list[index]
        label = self.label_dict[label]    # scene-label string -> integer class index
        return feature, label

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.label_list)

In [25]:
import pandas as pd
dev_df = pd.read_csv(datapath+"dev.csv", sep="\t")

unique_labels = [
    'airport', 'bus', 'metro', 'metro_station', 'park',
    'public_square', 'shopping_mall', 'street_pedestrian',
    'street_traffic', 'tram',
]
# A label's class index is its position in unique_labels.
label_dict = {label: index for index, label in enumerate(unique_labels)}

feature_option = "mfcc"
dataset = Audio_Dataset(dev_df, label_dict, feature_option)

(379, 6487)


In [26]:
# Mini-batches of 64 samples, reshuffled on every pass over the loader.
# num_workers=0 loads the data in the main process.
dev_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=64,
            shuffle=True,
            num_workers=0
        )

In [28]:
for feature_list, label_list in dev_loader:
    print(feature_list[:5])
    break

tensor([[-4.8444,  1.3241,  0.7179,  ...,  0.2825,  0.1520,  0.3084],
        [-8.3584,  2.8215,  0.3826,  ...,  0.2616,  0.0447,  0.1551],
        [-4.2064,  1.8694,  0.6441,  ...,  0.2537, -0.1173, -0.0734],
        [-3.8532,  1.1509, -0.4405,  ...,  0.0849,  0.0238, -0.0204],
        [-2.3669,  2.2008, -0.3398,  ...,  0.1766, -0.0520, -0.1226]],
       dtype=torch.float64)


In [29]:
for feature_list, label_list in dev_loader:
    print(label_list[:5])
    break

tensor([3, 6, 1, 2, 8])


In [66]:
import numpy as np
from Extract_feature import extract_mfcc
import torch.nn as nn
import torch.optim as optim

n_mels = 26
n_frames = 499
n_features = 13
input_size = n_frames * n_features    # length of one flattened feature row

# Prefer GPU 3 as before, but fall back to the first GPU (or the CPU) instead
# of failing on machines that expose fewer than four CUDA devices.
gpu_index = 3
if torch.cuda.is_available():
    device = torch.device(f"cuda:{gpu_index}" if torch.cuda.device_count() > gpu_index else "cuda:0")
else:
    device = torch.device("cpu")
model = DNN1(input_size, len(unique_labels)).to(device)
model.train()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.005)

# Per-batch bookkeeping for a single pass over dev_loader. Predictions are
# recorded while the weights are still being updated, so the accuracy derived
# from them is a running training accuracy, not an evaluation score.
train_loss_list, true_label_list, pre_label_list = [], [], []
for feature_list, label_list in dev_loader:

    print(f"dev feature size: {feature_list.shape}")

    # forward
    # The loader yields float64 features; the model weights are float32.
    train_inputs = feature_list.to(device=device, dtype=torch.float32)
    train_labels = label_list.to(device=device, dtype=torch.long)
    train_outputs = model(train_inputs)
    train_loss = criterion(train_outputs, train_labels)

    # backward
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    pre_labels = train_outputs.detach().argmax(dim=1)
    train_loss_list.append(train_loss.item())
    true_label_list.append(train_labels.cpu().numpy())
    pre_label_list.append(pre_labels.cpu().numpy())

    print(f"output size: {train_outputs.shape}")

dev feature size: torch.Size([64, 6487])
output size: torch.Size([64, 10])
dev feature size: torch.Size([64, 6487])
output size: torch.Size([64, 10])
dev feature size: torch.Size([64, 6487])
output size: torch.Size([64, 10])
dev feature size: torch.Size([64, 6487])
output size: torch.Size([64, 10])
dev feature size: torch.Size([64, 6487])
output size: torch.Size([64, 10])
dev feature size: torch.Size([59, 6487])
output size: torch.Size([59, 10])


In [67]:
train_loss_list

[2.2866268157958984,
 2.2864463329315186,
 2.2715611457824707,
 2.2785751819610596,
 2.2386128902435303,
 2.297173500061035]

In [72]:
# Merge the per-batch arrays into one flat array each.
# np.atleast_1d keeps this cell re-runnable: once these names already hold flat
# arrays, iterating them yields scalars, which a bare np.concatenate rejects
# ("zero-dimensional arrays cannot be concatenated").
true_label_list = np.concatenate([np.atleast_1d(batch) for batch in true_label_list])
pre_label_list = np.concatenate([np.atleast_1d(batch) for batch in pre_label_list])

In [82]:
true_label_list[:100]

array([4, 6, 8, 8, 1, 9, 5, 0, 6, 6, 3, 7, 9, 3, 3, 5, 4, 0, 5, 6, 2, 1,
       2, 4, 3, 2, 8, 3, 1, 7, 6, 1, 7, 6, 9, 5, 2, 9, 6, 1, 3, 1, 4, 3,
       0, 2, 5, 5, 8, 3, 9, 8, 7, 0, 5, 9, 7, 5, 0, 3, 9, 8, 6, 0, 7, 5,
       3, 0, 6, 8, 7, 2, 9, 2, 7, 8, 2, 9, 8, 3, 6, 5, 8, 5, 8, 1, 0, 4,
       3, 9, 5, 1, 4, 4, 7, 8, 7, 1, 1, 1])

In [81]:
pre_label_list[:100]

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5])

In [85]:
np.sum(true_label_list == pre_label_list)/len(true_label_list)

0.11609498680738786

In [86]:
len(dev_loader)

6

In [2]:
import numpy as np

a = np.ones(5)
np.expand_dims(a, axis=1)

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.]])

In [1]:
import numpy as np
a=np.array([[[1,2],[3,4]],[[5,6],[7,8]]])
a

array([[[1, 2],
        [3, 4]],

       [[5, 6],
        [7, 8]]])

In [2]:
a.reshape(2,-1)

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [1]:
import torch
a=torch.LongTensor([1,2,3,4,5])
a

tensor([1, 2, 3, 4, 5])

In [3]:
b=a.repeat((10,1))
b

tensor([[1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5]])

In [7]:
b.permute(1,0).reshape(-1)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5])

In [11]:
c=a.repeat((10,1)).permute(1,0).reshape(-1).numpy()
c.reshape(-1,10)

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
       [3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
       [4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
       [5, 5, 5, 5, 5, 5, 5, 5, 5, 5]])

---
#### CNN

In [11]:
import torch
from torch import nn

input = torch.randn((10, 1, 49, 40))
input[1], input.shape

(tensor([[[ 0.0830,  1.2401,  1.2569,  ...,  0.5619,  0.5436,  1.2856],
          [-0.8057, -0.7899, -0.4259,  ..., -2.1947,  0.8941, -0.3048],
          [-1.4934,  0.0231, -0.0458,  ...,  1.0432, -1.2269,  0.1303],
          ...,
          [-0.9848,  0.8359, -0.8844,  ..., -0.5607, -0.4978,  0.5743],
          [-0.1280, -0.7402, -0.8203,  ..., -0.2425,  0.7120,  0.6893],
          [-0.3238, -1.3476, -1.2397,  ..., -1.0251, -0.4712, -0.4028]]]),
 torch.Size([10, 1, 49, 40]))

In [12]:
# Batch normalization over the single input channel; the tensor shape is unchanged.
bn = nn.BatchNorm2d(1)
output1 = bn(input)
output1[1], output1.shape

(tensor([[[ 0.0727,  1.2332,  1.2501,  ...,  0.5530,  0.5346,  1.2789],
          [-0.8185, -0.8027, -0.4376,  ..., -2.2116,  0.8862, -0.3162],
          [-1.5082,  0.0127, -0.0565,  ...,  1.0357, -1.2410,  0.1202],
          ...,
          [-0.9982,  0.8278, -0.8975,  ..., -0.5729, -0.5097,  0.5655],
          [-0.1389, -0.7529, -0.8332,  ..., -0.2537,  0.7035,  0.6808],
          [-0.3353, -1.3620, -1.2538,  ..., -1.0386, -0.4831, -0.4145]]],
        grad_fn=<SelectBackward>),
 torch.Size([10, 1, 49, 40]))

In [13]:
# 5x5 convolution, stride 2, padding 1, mapping 1 channel to 96.
# For a 49x40 input the spatial size becomes
# floor((49 + 2 - 5) / 2) + 1 = 24 by floor((40 + 2 - 5) / 2) + 1 = 19.
conv1 = nn.Conv2d(1, 96, kernel_size=5, stride=2, padding=1)
output2 = conv1(output1)
output2[1], output2.shape

(tensor([[[ 2.3429e-01, -3.2614e-01,  2.5552e-01,  ...,  7.8907e-02,
            1.3424e-01,  3.5770e-01],
          [ 6.9168e-01,  4.6772e-01,  2.0887e-01,  ..., -5.0911e-01,
           -6.8798e-01, -9.6675e-02],
          [ 1.4883e-01,  2.2471e-01, -5.7965e-01,  ..., -4.3458e-01,
           -4.5151e-01,  2.0097e-01],
          ...,
          [ 8.3697e-01,  1.0310e+00,  1.0765e+00,  ..., -6.6169e-01,
           -3.5074e-01, -1.3733e-01],
          [ 5.8132e-01, -6.9508e-01, -1.7622e-01,  ..., -2.9722e-01,
            6.9068e-01,  1.2362e-01],
          [-1.5263e-01,  5.0952e-01, -1.4035e-02,  ...,  5.6680e-02,
            4.4436e-01, -1.4924e-01]],
 
         [[ 2.0813e-01, -5.2340e-01,  7.9214e-01,  ..., -1.7131e-01,
            1.3149e-01,  8.5680e-02],
          [ 4.4741e-01,  4.3043e-01, -8.3380e-01,  ..., -1.0920e-01,
            1.0095e+00,  1.0611e-02],
          [ 3.7180e-01,  1.4497e-02,  4.8354e-01,  ...,  2.9547e-01,
            9.2717e-01,  7.7353e-02],
          ...,
    

In [19]:
# 5x5 max pooling with stride 2 and no padding: a 24x19 map shrinks to
# floor((24 - 5) / 2) + 1 = 10 by floor((19 - 5) / 2) + 1 = 8.
max1 = nn.MaxPool2d(kernel_size=5, stride=2)
output3 = max1(output2)
output3[1], output3.shape

(tensor([[[0.6917, 0.9712, 0.9712,  ..., 1.0629, 1.0629, 0.9632],
          [0.9991, 0.9991, 0.8375,  ..., 1.0629, 1.0629, 0.9632],
          [0.9991, 0.9991, 1.5860,  ..., 1.1804, 1.1804, 0.9528],
          ...,
          [1.2292, 1.2292, 0.6592,  ..., 1.3920, 1.3920, 1.0350],
          [0.9072, 0.9072, 0.6592,  ..., 1.2191, 0.9031, 1.3068],
          [1.0765, 1.0765, 1.3608,  ..., 1.0150, 0.7969, 1.3068]],
 
         [[1.2583, 1.1477, 1.1740,  ..., 1.0985, 1.0985, 1.0955],
          [1.4524, 1.4524, 1.2429,  ..., 1.6752, 1.6752, 1.4318],
          [1.4524, 1.4524, 1.2429,  ..., 1.6752, 1.6752, 1.4318],
          ...,
          [1.0261, 1.0576, 1.3149,  ..., 1.3835, 1.4349, 1.4349],
          [1.1192, 1.1192, 1.3069,  ..., 1.3835, 1.4349, 1.4349],
          [1.1192, 1.1192, 1.3069,  ..., 0.9079, 1.4349, 1.4349]],
 
         [[1.0223, 1.0223, 0.6883,  ..., 0.4957, 0.9427, 0.9427],
          [1.0223, 1.0223, 0.8323,  ..., 0.5744, 0.9427, 0.9427],
          [1.0223, 1.7117, 1.7117,  ...,

In [17]:
# Flatten every dimension except the batch dimension:
# 96 channels * 10 * 8 = 7680 features per sample.
flat = nn.Flatten()
output4 = flat(output3)
output4[1], output4.shape

(tensor([0.6917, 0.9712, 0.9712,  ..., 1.3102, 1.3102, 1.2192],
        grad_fn=<SelectBackward>),
 torch.Size([10, 7680]))

In [74]:
class ConvBlock(nn.Module):
    """Optional BatchNorm2d, then a 5x5 stride-2 convolution, then 3x3 max pooling.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by the convolution.
        is_bn: When True (default), the input is batch-normalized before the
            convolution; when False, the normalization step is skipped.
    """

    def __init__(self, in_channels, out_channels, is_bn=True):
        super().__init__()

        self.is_bn = is_bn
        # The BatchNorm layer is created regardless of is_bn, so the module's
        # parameters / state_dict keys are the same in both modes.
        self.bn = nn.BatchNorm2d(in_channels)
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=5, stride=2, padding=2)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=1)

    def forward(self, input):
        """Apply (optional) batch norm, convolution and max pooling to ``input``."""
        normalized = self.bn(input) if self.is_bn else input
        return self.maxpool(self.conv(normalized))

In [78]:
cb1 = ConvBlock(1, 8)
output1 = cb1(input)
input.shape, output1.shape

(torch.Size([10, 1, 49, 40]), torch.Size([10, 8, 23, 18]))

In [79]:
cb2 = ConvBlock(8, 8)
output2 = cb2(output1)
output2.shape

torch.Size([10, 8, 10, 7])

In [80]:
output3 = flat(output2)
output3.shape

torch.Size([10, 560])

In [84]:
class CNN(nn.Module):
    """Two ConvBlocks followed by a BatchNorm1d + two-layer fully connected classifier.

    Args:
        num_labels: Number of output classes.
        is_bn: Forwarded to both ConvBlocks.
        in_channels: Number of channels of the input tensor. Defaults to 10,
            the value that used to be hard-coded.
        input_size: ``(height, width)`` of the input. Defaults to ``(49, 40)``,
            for which the flattened feature size is the previously hard-coded 560.

    The size of the first classifier layer is inferred from ``in_channels`` and
    ``input_size``, so the model can be built for other input shapes without
    editing the class.
    """

    def __init__(self, num_labels=10, is_bn=True, in_channels=10, input_size=(49, 40)):
        super().__init__()

        self.cb1 = ConvBlock(in_channels, 16, is_bn=is_bn)
        self.cb2 = ConvBlock(16, 8, is_bn=is_bn)
        self.flat = nn.Flatten()

        flat_features = self._infer_flat_features(in_channels, input_size)
        self.classifier = nn.Sequential(
                                        nn.BatchNorm1d(flat_features),
                                        nn.Linear(flat_features, 128),
                                        nn.ReLU(True),
                                        nn.Linear(128, num_labels)
        )

    def _infer_flat_features(self, in_channels, input_size):
        """Return the per-sample feature count after cb1 -> cb2 -> flatten."""
        height, width = input_size
        was_training = self.training
        # Probe in eval mode so BatchNorm running statistics are not touched.
        self.eval()
        try:
            with torch.no_grad():
                probe = torch.zeros(1, in_channels, height, width)
                flat_features = self.flat(self.cb2(self.cb1(probe))).shape[1]
        finally:
            self.train(was_training)
        return flat_features

    def forward(self, input):
        """Return the class scores for a batch of inputs."""
        output = self.cb1(input)
        output = self.cb2(output)

        output = self.flat(output)
        output = self.classifier(output)

        return output

In [82]:
final_model = CNN()
output = final_model(input)
output.shape

torch.Size([10, 10])

In [87]:
input = torch.randn((32, 10, 49, 40))
final_model = CNN()
output = final_model(input)
input.shape, output.shape

(torch.Size([32, 10, 49, 40]), torch.Size([32, 10]))

In [89]:
input

tensor([[[[-4.3566e-01, -1.1920e+00,  4.5311e-01,  ..., -5.8033e-02,
           -1.3105e+00,  1.0425e+00],
          [ 1.3800e+00, -8.6652e-01,  7.6536e-01,  ..., -1.7274e-01,
            4.2595e-01, -5.9750e-01],
          [ 1.1824e+00,  5.8274e-01,  1.8138e+00,  ..., -1.7031e+00,
           -1.0004e+00, -5.6267e-02],
          ...,
          [-1.0465e-01, -7.0150e-01, -9.6694e-01,  ...,  8.8712e-01,
           -5.2279e-01,  5.7661e-01],
          [ 2.0852e-01,  9.0612e-01, -1.3937e-02,  ..., -1.8415e+00,
            9.2284e-01, -1.4254e+00],
          [ 9.3439e-02,  2.5655e-01,  3.0068e-02,  ..., -8.5392e-01,
           -6.9665e-01,  6.1748e-01]],

         [[ 1.4478e-01,  9.1604e-01,  6.0250e-01,  ..., -7.0077e-01,
           -1.6857e-01, -1.0752e-01],
          [ 6.5405e-01, -4.1792e-01, -3.8833e-01,  ...,  1.4104e+00,
           -5.6232e-01, -8.9570e-01],
          [-1.4413e+00,  4.2025e-02,  1.4221e-01,  ...,  1.3811e-01,
            2.7101e-01,  1.5342e-01],
          ...,
     

In [91]:
input.reshape(-1, input.shape[-2], input.shape[-1])

tensor([[[-4.3566e-01, -1.1920e+00,  4.5311e-01,  ..., -5.8033e-02,
          -1.3105e+00,  1.0425e+00],
         [ 1.3800e+00, -8.6652e-01,  7.6536e-01,  ..., -1.7274e-01,
           4.2595e-01, -5.9750e-01],
         [ 1.1824e+00,  5.8274e-01,  1.8138e+00,  ..., -1.7031e+00,
          -1.0004e+00, -5.6267e-02],
         ...,
         [-1.0465e-01, -7.0150e-01, -9.6694e-01,  ...,  8.8712e-01,
          -5.2279e-01,  5.7661e-01],
         [ 2.0852e-01,  9.0612e-01, -1.3937e-02,  ..., -1.8415e+00,
           9.2284e-01, -1.4254e+00],
         [ 9.3439e-02,  2.5655e-01,  3.0068e-02,  ..., -8.5392e-01,
          -6.9665e-01,  6.1748e-01]],

        [[ 1.4478e-01,  9.1604e-01,  6.0250e-01,  ..., -7.0077e-01,
          -1.6857e-01, -1.0752e-01],
         [ 6.5405e-01, -4.1792e-01, -3.8833e-01,  ...,  1.4104e+00,
          -5.6232e-01, -8.9570e-01],
         [-1.4413e+00,  4.2025e-02,  1.4221e-01,  ...,  1.3811e-01,
           2.7101e-01,  1.5342e-01],
         ...,
         [ 4.0694e-01, -1