In [29]:
import pandas as pd
import torch
import torch.nn.functional as F
import torchaudio

from passt.models.passt import get_model
from passt.models.preprocess import AugmentMelSTFT

In [30]:
# Audio sample rates, in Hz.
TARGET_SR = 32_000   # rate the waveform is resampled to before the mel front end
INITIAL_SR = 48_000  # rate every test WAV is asserted to have when it is loaded

In [31]:
# Pre-trained PaSST classifier; every setting below is forwarded to get_model as-is.
model = get_model(
    arch="passt_s_swa_p16_128_ap476",
    pretrained=True,
    n_classes=527,
    in_channels=1,
    fstride=10,
    tstride=10,
    input_fdim=128,
    input_tdim=998,
    u_patchout=0,
    s_patchout_t=40,
    s_patchout_f=4,
)
# Switch the network to evaluation mode. As the last expression of the cell,
# this call also displays the module repr.
model.eval()



 Loading PaSST pre-trained on AudioSet Patch 16 stride 10 structured patchout mAP=476 SWA 


PaSST(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU()
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Bloc

PaSST(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU()
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention

In [32]:
# Both transforms are built once and shared by every preprocess() call.
resampler = torchaudio.transforms.Resample(INITIAL_SR, TARGET_SR)

# freqm=0 / timem=0 are passed to switch masking off. The module is also put in
# eval mode so the front end behaves deterministically at inference, matching
# the model.eval() call made for the classifier.
# NOTE(review): in the PaSST source the mel filterbank edges appear to be
# randomly jittered while the module is in training mode -- confirm against the
# installed version.
wav2spec = AugmentMelSTFT(freqm=0, timem=0).eval()


def preprocess(wav):
    """Convert a raw waveform into a spectrogram batch for the model.

    Args:
        wav: 2-D waveform tensor. Dim 0 is averaged away (assumed to be the
            channel axis, as returned by ``torchaudio.load`` -- confirm).

    Returns:
        The ``wav2spec`` output with the first and last entries of its last
        axis removed and a new leading axis of size 1 added.

    Raises:
        AssertionError: if ``wav`` is not 2-D.
    """
    assert wav.dim() == 2, f'expected a 2-D waveform tensor, got {wav.dim()} dims'
    # Collapse dim 0 to a single row, keeping the tensor 2-D.
    mono = wav.mean(dim=0, keepdim=True)
    # INITIAL_SR -> TARGET_SR.
    resampled = resampler(mono)
    spec = wav2spec(resampled)
    # Trim one entry from each end of the last axis, then prepend a size-1 axis.
    # NOTE(review): the trim presumably brings the frame count to the model's
    # input_tdim for 10 s clips -- confirm.
    return spec[:, :, 1:-1].unsqueeze(0)



In [33]:
# AudioSet class table; later cells look up `display_name` by row position.
labels = pd.read_csv(filepath_or_buffer='data/audioset/class_labels_indices.csv')
# Preview the first five rows.
labels.head(n=5)

Unnamed: 0,index,mid,display_name
0,0,/m/09x0r,Speech
1,1,/m/05zppz,"Male speech, man speaking"
2,2,/m/02zsn,"Female speech, woman speaking"
3,3,/m/0ytgt,"Child speech, kid speaking"
4,4,/m/01h8n0,Conversation


In [34]:
def load_clip(path):
    """Load one WAV file and run it through preprocess().

    Args:
        path: location of the audio file, passed to ``torchaudio.load``.

    Returns:
        Whatever ``preprocess`` returns for the loaded waveform.

    Raises:
        AssertionError: if the file's sample rate is not ``INITIAL_SR``
            (the resampler is built for that input rate only).
    """
    wav, sample_rate = torchaudio.load(path)
    assert sample_rate == INITIAL_SR, (
        f'{path}: expected {INITIAL_SR} Hz audio, got {sample_rate} Hz'
    )
    return preprocess(wav)


# Trailing comments list the labels noted for each clip.
helicopter = load_clip('data/audioset/test_wavs/helicopter.wav')  # helicopter, vehicle, aircraft
music = load_clip('data/audioset/test_wavs/music.wav')  # music, radio
barking = load_clip('data/audioset/test_wavs/barking.wav')  # canidae, dogs, wolves, bark, domestic animals, pets, bow-wow, dog, growling, animal
water = load_clip('data/audioset/test_wavs/water.wav')  # pump(liquid), water

In [35]:
# Print the top-1 class name for each preprocessed test clip.
# Inference only: no_grad stops autograd from recording the forward passes.
with torch.no_grad():
    for clip_name, clip in (
        ('helicopter', helicopter),
        ('music', music),
        ('barking', barking),
        ('water', water),
    ):
        # NOTE(review): [0][0] presumably selects the class-score tensor from
        # the model output and then the single batch item -- confirm against
        # the PaSST forward() return value.
        top_class = model(clip)[0][0].argmax().item()
        # labels is indexed by row position, so row i must describe class i.
        print(f'{clip_name}:', labels.iloc[top_class].display_name)

helicopter: Vehicle
music: Music
barking: Dog
water: Vehicle


In [36]:
# Keep the raw waveform and the spectrogram under separate names.
rspeech_wav, sr = torchaudio.load('data/audioset/test_wavs/russian_speech.wav')
assert sr == INITIAL_SR, f'expected {INITIAL_SR} Hz audio, got {sr} Hz'
rspeech = preprocess(rspeech_wav)

# Inference only: no_grad stops autograd from recording the forward pass.
with torch.no_grad():
    top_class = model(rspeech)[0][0].argmax().item()
print('russian_speech:', labels.iloc[top_class].display_name)

russian_speech: Speech


In [37]:
# Second speech sample; `rspeech` is rebound to this clip's spectrogram.
rspeech_wav, sr = torchaudio.load('data/audioset/test_wavs/russian_speech2.wav')
assert sr == INITIAL_SR, f'expected {INITIAL_SR} Hz audio, got {sr} Hz'
rspeech = preprocess(rspeech_wav)

# Inference only: no_grad stops autograd from recording the forward pass.
with torch.no_grad():
    top_class = model(rspeech)[0][0].argmax().item()
print('russian_speech2:', labels.iloc[top_class].display_name)

russian_speech2: Speech


In [38]:
# Keep the raw waveform and the spectrogram under separate names.
laugh_wav, sr = torchaudio.load('data/audioset/test_wavs/laugh_wspeech.wav')
assert sr == INITIAL_SR, f'expected {INITIAL_SR} Hz audio, got {sr} Hz'
# Right-pad with zeros (F.pad's default fill) so the clip spans 10 s at INITIAL_SR.
# NOTE(review): presumably 10 s is the clip length the model is configured for;
# a clip longer than 10 s would give a negative pad amount -- confirm intent.
laugh_wav = F.pad(laugh_wav, (0, 10 * INITIAL_SR - laugh_wav.shape[-1]))
laugh = preprocess(laugh_wav)

# Inference only: no_grad stops autograd from recording the forward pass.
with torch.no_grad():
    top_class = model(laugh)[0][0].argmax().item()
print('laugh with speech:', labels.iloc[top_class].display_name)

laugh with speech: Speech


In [39]:
# Laughter-only sample; `laugh` is rebound to this clip's spectrogram.
laugh_wav, sr = torchaudio.load('data/audioset/test_wavs/laugh.wav')
assert sr == INITIAL_SR, f'expected {INITIAL_SR} Hz audio, got {sr} Hz'
# Right-pad with zeros (F.pad's default fill) so the clip spans 10 s at INITIAL_SR.
# NOTE(review): presumably 10 s is the clip length the model is configured for;
# a clip longer than 10 s would give a negative pad amount -- confirm intent.
laugh_wav = F.pad(laugh_wav, (0, 10 * INITIAL_SR - laugh_wav.shape[-1]))
laugh = preprocess(laugh_wav)

# Inference only: no_grad stops autograd from recording the forward pass.
with torch.no_grad():
    top_class = model(laugh)[0][0].argmax().item()
print('laugh:', labels.iloc[top_class].display_name)

laugh: Snicker
