In [1]:
import os
import gc
import cv2
import math
import copy
import time
import random
import wget

# For data manipulation
import numpy as np
import pandas as pd

from pathlib import Path
# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Audio 
import torchaudio
from torchaudio.transforms import MelSpectrogram, Resample,AmplitudeToDB

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# For Image Models
import timm
from timm.models.layers import to_2tuple,trunc_normal_

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

  from .autonotebook import tqdm as notebook_tqdm


Testing Configuration

In [2]:
class CONFIG:
    """Static settings shared by every cell of the inference notebook."""

    # --- model / runtime -------------------------------------------------
    num_class = 152
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # One entry per ensemble member; order must match the checkpoint list.
    model_name = ['AST','tf_efficientnet_b0_ns']
    embedding_size = 768

    # --- audio front-end -------------------------------------------------
    sample_rate = 32000
    max_time = 5
    n_mels = 224
    n_fft = 1024
    period = 30

    # --- label space: position in this list is the class index ----------
    target_columns = [
        "afrsil1", "akekee", "akepa1", "akiapo", "akikik", "amewig", "aniani", "apapan",
        "arcter", "barpet", "bcnher", "belkin1", "bkbplo", "bknsti", "bkwpet", "blkfra",
        "blknod", "bongul", "brant", "brnboo", "brnnod", "brnowl", "brtcur", "bubsan",
        "buffle", "bulpet", "burpar", "buwtea", "cacgoo1", "calqua", "cangoo", "canvas",
        "caster1", "categr", "chbsan", "chemun", "chukar", "cintea", "comgal1", "commyn",
        "compea", "comsan", "comwax", "coopet", "crehon", "dunlin", "elepai", "ercfra",
        "eurwig", "fragul", "gadwal", "gamqua", "glwgul", "gnwtea", "golphe", "grbher3",
        "grefri", "gresca", "gryfra", "gwfgoo", "hawama", "hawcoo", "hawcre", "hawgoo",
        "hawhaw", "hawpet1", "hoomer", "houfin", "houspa", "hudgod", "iiwi", "incter1",
        "jabwar", "japqua", "kalphe", "kauama", "laugul", "layalb", "lcspet", "leasan",
        "leater1", "lessca", "lesyel", "lobdow", "lotjae", "madpet", "magpet1", "mallar3",
        "masboo", "mauala", "maupar", "merlin", "mitpar", "moudov", "norcar", "norhar2",
        "normoc", "norpin", "norsho", "nutman", "oahama", "omao", "osprey", "pagplo",
        "palila", "parjae", "pecsan", "peflov", "perfal", "pibgre", "pomjae", "puaioh",
        "reccar", "redava", "redjun", "redpha1", "refboo", "rempar", "rettro", "ribgul",
        "rinduc", "rinphe", "rocpig", "rorpar", "rudtur", "ruff", "saffin", "sander",
        "semplo", "sheowl", "shtsan", "skylar", "snogoo", "sooshe", "sooter1", "sopsku1",
        "sora", "spodov", "sposan", "towsol", "wantat1", "warwhe1", "wesmea", "wessan",
        "wetshe", "whfibi", "whiter", "whttro", "wiltur", "yebcar", "yefcan", "zebdov",
    ]

    # Two-way lookup between species code and class index.
    bird2id = dict(zip(target_columns, range(len(target_columns))))
    id2bird = dict(enumerate(target_columns))

    # Subset of species for which submission rows are emitted.
    scored_birds = [
        "akiapo", "aniani", "apapan", "barpet", "crehon", "elepai", "ercfra",
        "hawama", "hawcre", "hawgoo", "hawhaw", "hawpet1", "houfin", "iiwi",
        "jabwar", "maupar", "omao", "puaioh", "skylar", "warwhe1", "yefcan",
    ]

Normalizer

In [3]:
class NormalizeMelSpec(nn.Module):
    """Standardise each spectrogram, then min-max rescale it into [0, 1].

    The input is treated as a stack of 2-D maps of shape ``(N, H, W)``; every
    statistic is computed independently per leading index over dims 1 and 2.
    Maps whose standardised range is not larger than ``eps`` (i.e. constant
    maps) are returned as all zeros instead of being divided by ~0.

    :param eps: stabiliser added to the standard deviation and threshold
        below which a map is considered flat.

    NOTE(review): earlier revisions converted the "non-flat" mask to an
    integer tensor before indexing, which selects rows 0/1 by *position*
    rather than masking, so only the first two rows could ever be written.
    Checkpoints trained through that behaviour may need re-validation.
    """
    def __init__(self,eps = 1e-12):
        super().__init__()
        self.eps = eps

    def forward(self,x):
        """Return a tensor shaped like ``x`` with each map scaled to [0, 1]."""
        # Per-map standardisation.
        mean = x.mean((1,2),keepdim = True)
        std = x.std((1,2),keepdim = True)
        x_std = (x-mean)/(std+self.eps)

        # Per-map extrema of the standardised values, shape (N,).
        norm_min = x_std.min(-1)[0].min(-1)[0]
        norm_max = x_std.max(-1)[0].max(-1)[0]

        # Boolean mask of the maps that actually carry signal. It must stay
        # boolean: an integer tensor would be interpreted as row indices.
        fix_ind = (norm_max - norm_min) > self.eps

        v = torch.zeros_like(x_std)

        # Only the non-flat maps are rescaled; flat ones keep their zeros.
        if fix_ind.any():
            v_fix = x_std[fix_ind]
            norm_max_fix = norm_max[fix_ind][:, None, None]
            norm_min_fix = norm_min[fix_ind][:, None, None]
            v_fix = torch.max(
                torch.min(v_fix,norm_max_fix),
                norm_min_fix,
            )
            v_fix = (v_fix - norm_min_fix)/(norm_max_fix - norm_min_fix)
            v[fix_ind] = v_fix
        return v

Audio Transformer Model

In [4]:
class PatchEmbed(nn.Module):
    """Split a 2-D input into patches and project each one to ``embed_dim``.

    Unlike a strict image patch embedding, ``forward`` performs no check of
    the input size against ``img_size``; the stored sizes are informational.
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()

        # Accept either an int or an (h, w) pair for both sizes.
        self.img_size = to_2tuple(img_size)
        self.patch_size = to_2tuple(patch_size)

        rows = self.img_size[0] // self.patch_size[0]
        cols = self.img_size[1] // self.patch_size[1]
        self.num_patches = cols * rows

        # Non-overlapping convolution == per-patch linear projection.
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
        )

    def forward(self, x):
        """Map ``(B, C, H, W)`` to a token sequence ``(B, num_tokens, embed_dim)``."""
        projected = self.proj(x)
        tokens = projected.flatten(2)
        return tokens.transpose(1, 2)
        
class ASTModel(nn.Module):
    """
    The AST (Audio Spectrogram Transformer) model.

    Wraps a distilled DeiT vision transformer from timm, replaces its patch
    projection with a single-channel convolution using configurable strides,
    and resizes the positional embedding to the resulting patch grid.

    :param label_dim: the label dimension, i.e., the number of total classes, it is 527 for AudioSet, 50 for ESC-50, and 35 for speechcommands v2-35
    :param fstride: the stride of patch splitting on the frequency dimension, for 16*16 patches, fstride=16 means no overlap, fstride=10 means overlap of 6
    :param tstride: the stride of patch splitting on the time dimension, for 16*16 patches, tstride=16 means no overlap, tstride=10 means overlap of 6
    :param input_fdim: the number of frequency bins of the input spectrogram
    :param input_tdim: the number of time frames of the input spectrogram
    :param imagenet_pretrain: if use ImageNet pretrained model
    :param audioset_pretrain: if use full AudioSet and ImageNet pretrained model
    :param model_size: the model size of AST, should be in [tiny224, small224, base224, base384], base224 and base 384 are same model, but are trained differently during ImageNet pretraining.
    :param verbose: if True, print a short summary (pretraining flags, strides, number of patches) during construction
    """
    def __init__(self, label_dim=527, fstride=10, tstride=10, input_fdim=128, input_tdim=1024, imagenet_pretrain=False,audioset_pretrain=False, model_size='tiny224', verbose=True):

        super(ASTModel, self).__init__()
        #assert timm.__version__ == '0.4.5', 'Please use timm == 0.4.5, the code might not be compatible with newer versions.'

        if verbose == True:
            print('---------------AST Model Summary---------------')
            print('ImageNet pretraining: {:s}, AudioSet pretraining: {:s}'.format(str(imagenet_pretrain),str(audioset_pretrain)))
        # override timm input shape restriction
        # NOTE: this rebinds a module-level attribute of timm, so it affects the whole process, not only this instance.
        timm.models.vision_transformer.PatchEmbed = PatchEmbed

        # if AudioSet pretraining is not used (but ImageNet pretraining may still apply)
        if audioset_pretrain == False:
            if model_size == 'tiny224':
                self.v = timm.create_model('vit_deit_tiny_distilled_patch16_224', pretrained=imagenet_pretrain)
                #self.v = timm.create_model('vit_tiny_patch16_224', pretrained=imagenet_pretrain)
            elif model_size == 'small224':
                self.v = timm.create_model('vit_deit_small_distilled_patch16_224', pretrained=imagenet_pretrain)
                #self.v = timm.create_model('vit_small_patch16_224', pretrained=imagenet_pretrain)
            elif model_size == 'base224':
                self.v = timm.create_model('vit_deit_base_distilled_patch16_224', pretrained=imagenet_pretrain)
                #self.v = timm.create_model('vit_base_patch16_224', pretrained=imagenet_pretrain)
            elif model_size == 'base384':
                self.v = timm.create_model('vit_deit_base_distilled_patch16_384', pretrained=imagenet_pretrain)
                #self.v = timm.create_model('vit_base_patch16_384', pretrained=imagenet_pretrain)
            else:
                raise Exception('Model size must be one of tiny224, small224, base224, base384.')
            # remember the backbone's native (square) patch grid before it is overwritten below
            self.original_num_patches = self.v.patch_embed.num_patches
            self.oringal_hw = int(self.original_num_patches ** 0.5)
            self.original_embedding_dim = self.v.pos_embed.shape[2]
            # classification head applied to the averaged cls/distillation token
            self.mlp_head = nn.Sequential(nn.LayerNorm(self.original_embedding_dim), nn.Linear(self.original_embedding_dim, label_dim))

            # automatically get the intermediate shape
            f_dim, t_dim = self.get_shape(fstride, tstride, input_fdim, input_tdim)
            num_patches = f_dim * t_dim
            self.v.patch_embed.num_patches = num_patches
            if verbose == True:
                print('frequncey stride={:d}, time stride={:d}'.format(fstride, tstride))
                print('number of patches={:d}'.format(num_patches))

            # the linear projection layer
            new_proj = torch.nn.Conv2d(1, self.original_embedding_dim, kernel_size=(16, 16), stride=(fstride, tstride))
            if imagenet_pretrain == True:
                # collapse the pretrained projection's input channels into one by summing over them
                new_proj.weight = torch.nn.Parameter(torch.sum(self.v.patch_embed.proj.weight, dim=1).unsqueeze(1))
                new_proj.bias = self.v.patch_embed.proj.bias
            self.v.patch_embed.proj = new_proj

            # the positional embedding
            if imagenet_pretrain == True:
                # get the positional embedding from deit model, skip the first two tokens (cls token and distillation token), reshape it to original 2D shape (24*24).
                new_pos_embed = self.v.pos_embed[:, 2:, :].detach().reshape(1, self.original_num_patches, self.original_embedding_dim).transpose(1, 2).reshape(1, self.original_embedding_dim, self.oringal_hw, self.oringal_hw)
                # cut (from middle) or interpolate the second dimension of the positional embedding
                if t_dim <= self.oringal_hw:
                    new_pos_embed = new_pos_embed[:, :, :, int(self.oringal_hw / 2) - int(t_dim / 2): int(self.oringal_hw / 2) - int(t_dim / 2) + t_dim]
                else:
                    new_pos_embed = torch.nn.functional.interpolate(new_pos_embed, size=(self.oringal_hw, t_dim), mode='bilinear')
                # cut (from middle) or interpolate the first dimension of the positional embedding
                if f_dim <= self.oringal_hw:
                    new_pos_embed = new_pos_embed[:, :, int(self.oringal_hw / 2) - int(f_dim / 2): int(self.oringal_hw / 2) - int(f_dim / 2) + f_dim, :]
                else:
                    new_pos_embed = torch.nn.functional.interpolate(new_pos_embed, size=(f_dim, t_dim), mode='bilinear')
                # flatten the positional embedding
                new_pos_embed = new_pos_embed.reshape(1, self.original_embedding_dim, num_patches).transpose(1,2)
                # concatenate the above positional embedding with the cls token and distillation token of the deit model.
                self.v.pos_embed = nn.Parameter(torch.cat([self.v.pos_embed[:, :2, :].detach(), new_pos_embed], dim=1))
            else:
                # if not use imagenet pretrained model, just randomly initialize a learnable positional embedding
                # (the +2 accounts for the cls token and the distillation token)
                # TODO can use sinusoidal positional embedding instead
                new_pos_embed = nn.Parameter(torch.zeros(1, self.v.patch_embed.num_patches + 2, self.original_embedding_dim))
                self.v.pos_embed = new_pos_embed
                trunc_normal_(self.v.pos_embed, std=.02)

        # now load a model that is pretrained on both ImageNet and AudioSet
        elif audioset_pretrain == True:
            if audioset_pretrain == True and imagenet_pretrain == False:
                raise ValueError('currently model pretrained on only audioset is not supported, please set imagenet_pretrain = True to use audioset pretrained model.')
            if model_size != 'base384':
                raise ValueError('currently only has base384 AudioSet pretrained model.')
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            # download the checkpoint on first use; the path is relative to the current working directory
            if os.path.exists('../pretrained_models/audioset_10_10_0.4593.pth') == False:
                # this model performs 0.4593 mAP on the audioset eval set
                audioset_mdl_url = 'https://www.dropbox.com/s/cv4knew8mvbrnvq/audioset_0.4593.pth?dl=1'
                wget.download(audioset_mdl_url, out='../pretrained_models/audioset_10_10_0.4593.pth')
            sd = torch.load('../pretrained_models/audioset_10_10_0.4593.pth', map_location=device)
            # rebuild the architecture the checkpoint was saved from (wrapped in DataParallel so the state-dict keys line up), then keep only its backbone
            audio_model = ASTModel(label_dim=527, fstride=10, tstride=10, input_fdim=128, input_tdim=1024, imagenet_pretrain=False, audioset_pretrain=False, model_size='base384', verbose=False)
            audio_model = torch.nn.DataParallel(audio_model)
            audio_model.load_state_dict(sd, strict=False)
            self.v = audio_model.module.v
            self.original_embedding_dim = self.v.pos_embed.shape[2]
            # a fresh head is created for label_dim; the checkpoint's 527-way head is not reused
            self.mlp_head = nn.Sequential(nn.LayerNorm(self.original_embedding_dim), nn.Linear(self.original_embedding_dim, label_dim))

            f_dim, t_dim = self.get_shape(fstride, tstride, input_fdim, input_tdim)
            num_patches = f_dim * t_dim
            self.v.patch_embed.num_patches = num_patches
            if verbose == True:
                print('frequncey stride={:d}, time stride={:d}'.format(fstride, tstride))
                print('number of patches={:d}'.format(num_patches))

            # hard-coded sizes: 768-dim embedding on the 12 x 101 (= 1212) patch grid of the checkpoint built above
            new_pos_embed = self.v.pos_embed[:, 2:, :].detach().reshape(1, 1212, 768).transpose(1, 2).reshape(1, 768, 12, 101)
            # if the input has fewer time patches than the checkpoint's 101, cut the positional embedding (from the middle)
            if t_dim < 101:
                new_pos_embed = new_pos_embed[:, :, :, 50 - int(t_dim/2): 50 - int(t_dim/2) + t_dim]
            # otherwise interpolate along the time axis
            else:
                new_pos_embed = torch.nn.functional.interpolate(new_pos_embed, size=(12, t_dim), mode='bilinear')
            # NOTE(review): only the time axis is adapted in this branch; the reshape below can only succeed when f_dim == 12
            new_pos_embed = new_pos_embed.reshape(1, 768, num_patches).transpose(1, 2)
            self.v.pos_embed = nn.Parameter(torch.cat([self.v.pos_embed[:, :2, :].detach(), new_pos_embed], dim=1))

    def get_shape(self, fstride, tstride, input_fdim=128, input_tdim=1024):
        """
        Compute the patch grid produced by the 16x16 patch convolution.

        Runs a random dummy input of shape (1, 1, input_fdim, input_tdim) through a throw-away
        convolution with the given strides and reads the output size.
        Requires self.original_embedding_dim to be set before it is called.
        :return: (f_dim, t_dim), the number of patches along frequency and time
        """
        test_input = torch.randn(1, 1, input_fdim, input_tdim)
        test_proj = nn.Conv2d(1, self.original_embedding_dim, kernel_size=(16, 16), stride=(fstride, tstride))
        test_out = test_proj(test_input)
        f_dim = test_out.shape[2]
        t_dim = test_out.shape[3]
        return f_dim, t_dim

    #@autocast()
    def forward(self, x):
        """
        :param x: the input spectrogram, expected shape: (batch_size, time_frame_num, frequency_bins), e.g., (12, 1024, 128)
        :return: prediction, the raw output of the classification head (no activation is applied here)
        """
        # expect input x = (batch_size, time_frame_num, frequency_bins), e.g., (12, 1024, 128)
        # add a channel axis, then swap to (batch, 1, frequency, time) for the patch convolution
        x = x.unsqueeze(1)
        x = x.transpose(2, 3)

        B = x.shape[0]
        x = self.v.patch_embed(x)
        # prepend the cls and distillation tokens, broadcast over the batch
        cls_tokens = self.v.cls_token.expand(B, -1, -1)
        dist_token = self.v.dist_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, dist_token, x), dim=1)
        x = x + self.v.pos_embed
        x = self.v.pos_drop(x)
        for blk in self.v.blocks:
            x = blk(x)
        x = self.v.norm(x)
        # average the outputs of the cls token and the distillation token
        x = (x[:, 0] + x[:, 1]) / 2

        x = self.mlp_head(x)
        return x

In [5]:
#model used for prediction

#GeM pooling
class GeM(nn.Module):
    """Generalised-mean pooling over the last two dimensions.

    Computes ``mean(clamp(x, eps) ** p) ** (1 / p)`` with a learnable
    exponent ``p``; the pooled dimensions are kept with size 1.
    """
    def __init__(self, p=3, eps=1e-6):
        super(GeM, self).__init__()
        self.eps = eps
        # Learnable exponent, stored as a one-element parameter.
        self.p = nn.Parameter(p * torch.ones(1))

    def forward(self, x):
        """Pool ``x`` using the module's current exponent and epsilon."""
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Functional form of the pooling, usable with explicit ``p``/``eps``."""
        height, width = x.size(-2), x.size(-1)
        # Clamp first so the power is well defined for non-positive inputs.
        powered = x.clamp(min=eps).pow(p)
        averaged = F.avg_pool2d(powered, (height, width))
        return averaged.pow(1./p)

    def __repr__(self):
        return '{}(p={:.4f}, eps={})'.format(
            self.__class__.__name__,
            self.p.data.tolist()[0],
            str(self.eps),
        )
#model
class BirdCLEFModel(nn.Module):
    """timm backbone + GeM pooling + linear embedding + linear classifier.

    The backbone's own pooling and classifier are replaced by identities so
    that it emits a feature map, which is then pooled by ``GeM``.

    :param model_name: timm model identifier; the model must expose a
        ``classifier`` attribute with ``in_features``.
    :param embedding_size: width of the intermediate embedding layer.
    :param pretrained: forwarded to ``timm.create_model``.
    """
    def __init__(self, model_name, embedding_size, pretrained=True):
        super(BirdCLEFModel, self).__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained)
        # Read the feature width before the classifier is stripped.
        in_features = backbone.classifier.in_features
        backbone.classifier = nn.Identity()
        backbone.global_pool = nn.Identity()

        self.model = backbone
        self.pooling = GeM()
        self.embedding = nn.Linear(in_features, embedding_size)
        self.fc = nn.Linear(embedding_size, CONFIG.num_class)

    def forward(self, images):
        """Return raw class scores of shape ``(batch, CONFIG.num_class)``."""
        feature_map = self.model(images)
        pooled = self.pooling(feature_map).flatten(1)
        return self.fc(self.embedding(pooled))
    

Model wrapping

In [6]:
class CMP_model():
    """Build one ensemble member by name and move it to ``CONFIG.device``.

    The constructed network is exposed as the ``model`` attribute.

    :param model_name: ``'AST'`` or ``'tf_efficientnet_b0_ns'``.
    :raises ValueError: if ``model_name`` is not one of the supported names
        (previously this surfaced as an opaque ``AttributeError``).
    """
    SUPPORTED = ('AST', 'tf_efficientnet_b0_ns')

    def __init__(self,model_name):
        # Validate first so a typo fails fast with a readable message.
        if model_name not in self.SUPPORTED:
            raise ValueError(
                'Unsupported model_name {!r}; expected one of {}'.format(model_name, self.SUPPORTED)
            )

        if model_name == 'AST':
            # input_tdim is the number of spectrogram frames the AST is sized for.
            # NOTE(review): 313 presumably matches a CONFIG.max_time-second clip at the
            # MelSpectrogram settings used by the dataset — confirm if those change.
            model = ASTModel(label_dim = CONFIG.num_class,input_fdim = CONFIG.n_mels,input_tdim = 313)
        else:
            model = BirdCLEFModel(model_name,CONFIG.embedding_size)
        self.model = model.to(CONFIG.device)


In [7]:
#create dataset for test
class test_dataset(Dataset):
    """Serve fixed-length windows of one audio clip as 3-channel mel images.

    Each row of ``df`` names a window by its end time in seconds; the window
    covers the ``CONFIG.max_time`` seconds before that end time.

    :param df: frame with ``row_id`` and ``seconds`` columns, indexed so that
        ``df.loc[idx]`` is valid for ``idx`` in ``range(len(df))``.
    :param clip: waveform tensor; it is averaged over dim 0 before use
        (assumes a ``(channels, samples)`` layout — TODO confirm with caller).
    :param target_sample_rate: sample rate used to convert seconds to sample
        indices and passed to the mel transform. No resampling is done here.
    """

    def __init__(self,df,clip,target_sample_rate = 32000):
        self.df = df
        self.clip = torch.mean(clip,axis = 0)
        self.SR = target_sample_rate
        self.num_samples = CONFIG.max_time*self.SR
        self.normalizer = NormalizeMelSpec()
        # Built once here instead of on every __getitem__ call.
        self.mel_spectrogram = MelSpectrogram(sample_rate=self.SR,
                                              n_mels = CONFIG.n_mels,
                                              n_fft = CONFIG.n_fft)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        """Return ``(image, row_id, end_second)`` for window ``idx``."""
        row = self.df.loc[idx, :]
        row_id = row.row_id

        # The window length follows CONFIG.max_time so it always agrees with
        # self.num_samples (it used to be a separate literal 5).
        end = int(row.seconds)
        start = int(end - CONFIG.max_time)

        start_index = int(self.SR*start)
        end_index = int(self.SR*end)

        audio = self.clip[start_index:end_index]

        # Force the window to exactly num_samples samples.
        if audio.shape[0] > self.num_samples:
            audio = self.crop_audio(audio)
        if audio.shape[0] < self.num_samples:
            audio = self.pad_audio(audio)

        audio = torch.nan_to_num(audio)
        mel = self.mel_spectrogram(audio)
        # Replicate the single mel map into three identical channels.
        image = torch.stack([mel,mel,mel])
        image = self.normalizer(image)
        return image,row_id,end


    def pad_audio(self, audio):
        """Zero-pad ``audio`` on the right up to ``self.num_samples``."""
        pad_length = self.num_samples - audio.shape[0]
        last_dim_padding = (0, pad_length)
        audio = F.pad(audio, last_dim_padding)
        return audio

    def crop_audio(self, audio):
        """Keep only the first ``self.num_samples`` samples."""
        return audio[:self.num_samples]

# 
def prediction_for_clip(test_df,clip,models,threshold = 0.2):
    """Run the model ensemble over every window of one clip.

    For each window the sigmoid outputs of all models are averaged, and one
    boolean is emitted per species in ``CONFIG.scored_birds``.

    :param test_df: frame with ``row_id`` and ``seconds`` columns, one row per window.
    :param clip: waveform tensor passed to ``test_dataset``.
    :param models: sequence of models; ``models[i]`` is fed according to
        ``CONFIG.model_name[i]`` (``'AST'`` gets a single-channel, time-major input).
    :param threshold: averaged probability at or above which a species is
        reported as present. Defaults to the previously hard-coded 0.2.
    :return: dict with parallel lists ``'row_id'`` (``<row_id>_<bird>_<seconds>``)
        and ``'target'`` (bool).
    """
    dataset = test_dataset(df = test_df,clip = clip)
    loader = DataLoader(dataset,batch_size = 1,shuffle = False)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    prediction_dict = {'row_id':[],'target':[]}
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for image, row_id, seconds in tqdm(loader):
            image = image.to(device)
            pred_total = torch.zeros(CONFIG.num_class).to(device)
            for i, model in enumerate(models):
                if CONFIG.model_name[i] == 'AST':
                    # Collapse the 3 identical channels and put time first:
                    # (1, 3, F, T) -> (F, T) -> (T, F) -> (1, T, F).
                    model_input = (torch.mean(image.squeeze(),dim = 0)).permute(1,0).unsqueeze(dim = 0)
                else:
                    model_input = image
                outputs = model(model_input)
                pred_total += torch.sigmoid(outputs)[0]
            # Average over the ensemble. (This used to divide only the last
            # model's prediction, discarding the accumulated sum.)
            pred_total = pred_total/len(models)

            clip_row_id = row_id[0]
            end_second = seconds.item()
            for bird in CONFIG.scored_birds:
                judge = bool(pred_total[int(CONFIG.bird2id[bird])] >= threshold)
                prediction_dict['row_id'].append(clip_row_id + '_' + bird + '_'+str(end_second))
                prediction_dict['target'].append(judge)

    return prediction_dict

In [8]:
#Prediction part
from torch.nn.modules.batchnorm import _BatchNorm

def prepare_model_for_inference(model,path,device = 'cuda:1'):
    """Load a state-dict checkpoint into ``model`` and switch it to eval mode.

    The checkpoint is always deserialised onto the CPU; ``load_state_dict``
    then copies the tensors into the model's existing parameters, wherever
    they live. This works regardless of which GPU the checkpoint was saved
    from (the old ``{'cuda:1': 'cuda:0'}`` remap only covered one case).

    :param model: module whose parameters match the checkpoint's keys.
    :param path: path to a file written by ``torch.save(state_dict, ...)``.
    :param device: unused; kept so existing calls that pass it keep working.
    :return: the same ``model`` object, loaded and in eval mode.

    NOTE: ``torch.load`` unpickles the file — only use trusted checkpoints.
    """
    ckpt = torch.load(path,map_location = 'cpu')
    model.load_state_dict(ckpt)
    model.eval()

    return model

def prediction(test_audios,models,threshold = 0.05, threshold_long = None):
    """Predict every audio file and merge the per-clip results.

    Each file is split into twelve windows ending at 5, 10, ..., 60 seconds.
    ``models`` are assumed to be loaded already.

    :param test_audios: iterable of path objects exposing ``.name``.
    :param models: loaded models, forwarded to ``prediction_for_clip``.
    :param threshold: currently not used by this function.
    :param threshold_long: currently not used by this function.
    :return: dict with the concatenated ``'row_id'`` and ``'target'`` lists.
    """
    merged = {'row_id':[],'target':[]}
    for audio_path in test_audios:
        clip,_ = torchaudio.load(audio_path)

        # Window end times in seconds; one frame row per window.
        window_ends = list(range(5,65,5))
        # File name up to its first dot is used as the row-id prefix.
        clip_id = audio_path.name.split('.')[:-1][0]

        test_df = pd.DataFrame(
            {
                "row_id":[clip_id for _second in window_ends],
                "seconds":window_ends
            }
        )

        clip_result = prediction_for_clip(test_df,clip,models)
        for key in ('row_id', 'target'):
            merged[key].extend(clip_result[key])

    return merged
    


In [9]:
# Collect the soundscape recordings to score.
test_audio_dir = Path('input/birdcleff-2022/test_soundscapes/')
torch.cuda.empty_cache()
all_audios = list(test_audio_dir.glob("*.ogg"))

# Checkpoint i belongs to CONFIG.model_name[i]; keep both lists in the same order.
trained_list = ['AST_F10.4501_epoch182.bin','F10.4293_epoch120.bin']
model_list = []
for i, checkpoint in enumerate(trained_list):
    model = CMP_model(CONFIG.model_name[i]).model
    torch.cuda.empty_cache()
    model = prepare_model_for_inference(model,checkpoint)
    model_list.append(model)

# Run the ensemble over every recording.
pred = prediction(all_audios,model_list)

# (A single-model AST variant that loaded 'F10.4327_epoch166.bin' used to be kept
# here as a disabled snippet; the ensemble above supersedes it.)

# Write the submission file and free cached GPU memory.
result = pd.DataFrame(pred,columns = ['row_id','target'])
print(result.head())
result.to_csv("submission.csv",index = False)
torch.cuda.empty_cache()

---------------AST Model Summary---------------
ImageNet pretraining: False, AudioSet pretraining: False
frequncey stride=10, time stride=10
number of patches=630


100%|██████████| 12/12 [00:03<00:00,  3.40it/s]
100%|██████████| 12/12 [00:01<00:00,  6.49it/s]

                          row_id  target
0  soundscape_453028782_akiapo_5   False
1  soundscape_453028782_aniani_5   False
2  soundscape_453028782_apapan_5   False
3  soundscape_453028782_barpet_5   False
4  soundscape_453028782_crehon_5   False



