In [1]:
import torch
import torch.nn as nn
import numpy as np

class SoundNet8_pytorch(nn.Module):
    """Eight-layer convolutional network with two prediction heads.

    Every convolution and pooling layer uses a ``(k, 1)`` kernel, so only the
    height axis of the input is filtered; the width axis is left untouched.
    The first convolution takes a single input channel, i.e. the network
    consumes tensors laid out as ``(batch, 1, time, 1)``.

    Layers ``conv1`` .. ``conv7`` form a shared trunk. ``conv8`` (1000 output
    channels) and ``conv8_2`` (401 output channels) are two heads applied to
    the trunk output; ``forward`` returns them as ``object_pred`` and
    ``scene_pred`` respectively.
    """

    def __init__(self):
        super(SoundNet8_pytorch, self).__init__()

        self.define_module()

    @staticmethod
    def _conv_block(in_channels, out_channels, kernel, padding, pool=None):
        """Build Conv2d -> BatchNorm2d -> ReLU (-> MaxPool2d) along the height axis.

        The layer order inside the returned ``nn.Sequential`` is fixed so that
        ``state_dict`` keys (``convN.0.*`` for the convolution, ``convN.1.*``
        for the batch norm) stay stable.

        Args:
            in_channels: Number of input channels of the convolution.
            out_channels: Number of output channels of the convolution.
            kernel: Kernel height; the kernel width is always 1.
            padding: Zero padding applied on the height axis.
            pool: If given, kernel height *and* stride of a trailing max-pool.
        """
        layers = [
            # Every trunk convolution strides by 2 along the height axis.
            nn.Conv2d(in_channels, out_channels, (kernel, 1), (2, 1), (padding, 0), bias=True),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]
        if pool is not None:
            layers.append(nn.MaxPool2d((pool, 1), (pool, 1)))
        return nn.Sequential(*layers)

    def define_module(self):
        """Create the seven trunk blocks and the two output heads."""
        self.conv1 = self._conv_block(1, 16, kernel=64, padding=32, pool=8)
        self.conv2 = self._conv_block(16, 32, kernel=32, padding=16, pool=8)
        self.conv3 = self._conv_block(32, 64, kernel=16, padding=8)
        self.conv4 = self._conv_block(64, 128, kernel=8, padding=4)
        self.conv5 = self._conv_block(128, 256, kernel=4, padding=2, pool=4)
        # difference here (0.24751323, 0.2474); the padding error has been debugged
        self.conv6 = self._conv_block(256, 512, kernel=4, padding=2)
        self.conv7 = self._conv_block(512, 1024, kernel=4, padding=2)
        # The heads are bare convolutions: no normalisation, activation or padding.
        self.conv8 = nn.Sequential(
            nn.Conv2d(1024, 1000, (8, 1), (2, 1), (0, 0), bias=True),
        )
        self.conv8_2 = nn.Sequential(
            nn.Conv2d(1024, 401, (8, 1), (2, 1), (0, 0), bias=True),
        )

    def _trunk(self):
        """Return the shared trunk blocks in execution order."""
        return (self.conv1, self.conv2, self.conv3, self.conv4,
                self.conv5, self.conv6, self.conv7)

    def forward(self, x, verbose=True):
        """Run the trunk and both heads.

        Args:
            x: Input tensor; the first convolution expects one channel.
            verbose: When true (the default, matching the original behaviour),
                print the tensor shape after every trunk block and the shapes
                of both head outputs. Pass ``False`` to silence the trace.

        Returns:
            Tuple ``(object_pred, scene_pred)``: the outputs of ``conv8`` and
            ``conv8_2`` applied to the trunk output.
        """
        for block in self._trunk():
            x = block(x)
            if verbose:
                print('xxxxxx', x.shape)
        object_pred = self.conv8(x)
        scene_pred = self.conv8_2(x)
        if verbose:
            print('------', object_pred.shape, scene_pred.shape)
        return object_pred, scene_pred

    def extract_feat(self, x: torch.Tensor) -> list:
        """Return every intermediate activation as a NumPy array.

        The computation runs under ``torch.no_grad()``: the results are only
        exported as NumPy copies, so recording an autograd graph over all
        activations would just hold extra memory.

        Args:
            x: Input tensor, same layout as for ``forward``.

        Returns:
            A list of nine arrays: the outputs of ``conv1`` .. ``conv7`` in
            order, followed by the ``conv8`` output and the ``conv8_2`` output.
        """
        output_list = []
        with torch.no_grad():
            for block in self._trunk():
                x = block(x)
                output_list.append(x.detach().cpu().numpy())
            # Both heads read the same trunk output.
            for head in (self.conv8, self.conv8_2):
                output_list.append(head(x).detach().cpu().numpy())
        return output_list

In [2]:
# Instantiate the network; no weights are loaded in this cell.
model = SoundNet8_pytorch()

# Load the demo waveform and reshape it to 4-D with a single channel:
# (1, 1, <all samples>, 1).
input_data = np.load('demo.npy')
waveform = torch.from_numpy(input_data)
x = waveform.view(1, 1, -1, 1)

# The forward pass prints the tensor shape after every block.
model(x)

print('original input dim.: ', input_data.shape, 'reshaped input dim.: ', x.shape)

xxxxxx torch.Size([1, 16, 884592, 1])
xxxxxx torch.Size([1, 32, 55287, 1])
xxxxxx torch.Size([1, 64, 27644, 1])
xxxxxx torch.Size([1, 128, 13823, 1])
xxxxxx torch.Size([1, 256, 1728, 1])
xxxxxx torch.Size([1, 512, 865, 1])
xxxxxx torch.Size([1, 1024, 433, 1])
------ torch.Size([1, 1000, 213, 1]) torch.Size([1, 401, 213, 1])
original input dim.:  (14153472,) reshaped input dim.:  torch.Size([1, 1, 14153472, 1])


In [3]:
# Collect the activations returned by extract_feat for further analysis.
sound_feat = model.extract_feat(x)

num_feats = len(sound_feat)
print('There are ', num_feats, 'in the list')

# Shape of the first entry in the list.
first_feat = sound_feat[0]
print(first_feat.shape)

There are  9 in the list
(1, 16, 884592, 1)


In [4]:
# generate a fake sound waveform
# Draw float32 samples from a seeded generator so the cell is reproducible and
# the input can be fed to `model` as-is. Calling model.double() instead would
# convert the shared model in place, so re-running the earlier cells with their
# existing input `x` would then fail with a dtype mismatch.
rng = np.random.default_rng(0)
fake_input = rng.random((1, 1, 1314520, 1), dtype=np.float32)
fake_feat = model(torch.from_numpy(fake_input))

xxxxxx torch.Size([1, 16, 82157, 1])
xxxxxx torch.Size([1, 32, 5134, 1])
xxxxxx torch.Size([1, 64, 2568, 1])
xxxxxx torch.Size([1, 128, 1285, 1])
xxxxxx torch.Size([1, 256, 160, 1])
xxxxxx torch.Size([1, 512, 81, 1])
xxxxxx torch.Size([1, 1024, 41, 1])
------ torch.Size([1, 1000, 17, 1]) torch.Size([1, 401, 17, 1])


In [6]:
import librosa

# NOTE: Load an audio as the same format in soundnet
# 1. Keep original sample rate (which conflicts with their own paper)
# 2. Use first channel in multiple channels
# 3. Keep range in [-256, 256]

def load_audio(audio_path, sr=None):
    """Read an audio file through ``librosa.load`` without down-mixing.

    Args:
        audio_path: Path of the audio file; passed straight to ``librosa.load``.
        sr: Target sampling rate forwarded to ``librosa.load``. The default
            ``None`` keeps the file's native rate; librosa's own default
            (when ``sr`` is not passed) would resample to 22050 Hz.

    Returns:
        The ``(samples, sampling_rate)`` pair produced by ``librosa.load``
        with ``mono=False``, so channels are not averaged together.
    """
    # librosa returns floating-point samples, nominally within (-1., 1.).
    return librosa.load(audio_path, sr=sr, mono=False)

In [12]:
# Load the example file with load_audio's defaults (sr=None, mono=False).
sound_sample, sr = load_audio('audio.wav')

In [13]:
# Sampling rate returned by load_audio for the file.
sr

44100

In [14]:
# Shape of the loaded array; with mono=False a multi-channel file keeps all channels.
sound_sample.shape

(2, 2748077)

In [15]:
# Peek at the raw sample values (NumPy abbreviates the repr of large arrays).
sound_sample

array([[ 0.        ,  0.        ,  0.        , ..., -0.00595093,
        -0.00543213, -0.00506592],
       [ 0.        ,  0.        ,  0.        , ..., -0.00387573,
        -0.00372314, -0.00363159]], dtype=float32)