In [1]:
import cv2
import audioread
import logging
import os
import sys

sys.path.append('../pytorch-image-models-master')
import random
import time
import warnings
import glob

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as torchdata

from contextlib import contextmanager
from pathlib import Path
from typing import List
from typing import Optional
from sklearn import metrics

from tqdm import tqdm

import albumentations as A
import albumentations.pytorch.transforms as T

import concurrent.futures

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def init_layer(layer):
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, "bias"):
        if layer.bias is not None:
            layer.bias.data.fill_(0.0)


def init_bn(bn):
    bn.bias.data.fill_(0.0)
    bn.weight.data.fill_(1.0)


def init_weights(model):
    classname = model.__class__.__name__
    if classname.find("Conv2d") != -1:
        nn.init.xavier_uniform_(model.weight, gain=np.sqrt(2))
        model.bias.data.fill_(0)
    elif classname.find("BatchNorm") != -1:
        model.weight.data.normal_(1.0, 0.02)
        model.bias.data.fill_(0)
    elif classname.find("GRU") != -1:
        for weight in model.parameters():
            if len(weight.size()) > 1:
                nn.init.orghogonal_(weight.data)
    elif classname.find("Linear") != -1:
        model.weight.data.normal_(0, 0.01)
        model.bias.data.zero_()


def interpolate(x: torch.Tensor, ratio: int):
    """Interpolate data in time domain. This is used to compensate the
    resolution reduction in downsampling of a CNN.
    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, ratio to interpolate
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    (batch_size, time_steps, classes_num) = x.shape
    upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
    upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
    return upsampled


def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
    """Pad framewise_output to the same length as input frames. The pad value
    is the same as the value of the last frame.
    Args:
      framewise_output: (batch_size, frames_num, classes_num)
      frames_num: int, number of frames to pad
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    output = F.interpolate(
        framewise_output.unsqueeze(1), size=(frames_num, framewise_output.size(2)), align_corners=True, mode="bilinear"
    ).squeeze(1)

    return output


class AttBlockV2(nn.Module):
    def __init__(self, in_features: int, out_features: int, activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(in_channels=in_features, out_channels=out_features, kernel_size=1, stride=1, padding=0, bias=True)
        self.cla = nn.Conv1d(in_channels=in_features, out_channels=out_features, kernel_size=1, stride=1, padding=0, bias=True)

        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        if self.activation == 'linear':
            return x
        elif self.activation == 'sigmoid':
            return torch.sigmoid(x)


class TimmSED(nn.Module):
    def __init__(self, base_model_name: str, config=None, pretrained=False, num_classes=24, in_channels=1):
        super().__init__()

        self.config = config

        self.bn0 = nn.BatchNorm2d(self.config.n_mels)

        base_model = timm.create_model(
            base_model_name,
            pretrained=pretrained,
            num_classes=0,
            global_pool="",
            in_chans=in_channels,
        )

        layers = list(base_model.children())[:-2]
        self.encoder = nn.Sequential(*layers)

        in_features = base_model.num_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)
        self.att_block = AttBlockV2(in_features, num_classes, activation="sigmoid")

        self.init_weight()

    def init_weight(self):
        init_bn(self.bn0)
        init_layer(self.fc1)

    def forward(self, input_data):
        if self.config.in_channels == 3:
            x = input_data
        else:
            x = input_data[:, [0], :, :]  # (batch_size, 1, time_steps, mel_bins)

        frames_num = x.shape[2]

        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        x = x.transpose(2, 3)

        x = self.encoder(x)

        # Aggregate in frequency axis
        x = torch.mean(x, dim=2)

        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)

        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)

        output_dict = {
            "clipwise_output": clipwise_output,
        }

        return output_dict

In [4]:
mean = 0.485  # R only for RGB
std = 0.229  # R only for RGB

albu_transforms = {
    'train': A.Compose(
        [
            A.Normalize(mean, std),
        ]
    ),
    'valid': A.Compose(
        [
            A.Normalize(mean, std),
        ]
    ),
}


mean2 = (0.485, 0.456, 0.406)  # RGB
std2 = (0.229, 0.224, 0.225)  # RGB

albu_transforms2 = {
    'train': A.Compose(
        [
            A.Normalize(mean2, std2),
        ]
    ),
    'valid': A.Compose(
        [
            A.Normalize(mean2, std2),
        ]
    ),
}

In [39]:
class CFG_tf_efficientnet_b0:
    batch_size = 4
    num_workers = 4

    n_mels = 256
    fmin = 16
    fmax = 16386
    n_fft = 2048
    hop_length = 512
    sr = 32000

    target_columns = "abethr1 abhori1 abythr1 afbfly1 afdfly1 afecuc1 affeag1 afgfly1 afghor1 afmdov1 afpfly1 afpkin1 afpwag1 afrgos1 afrgrp1 afrjac1 afrthr1 amesun2 augbuz1 bagwea1 barswa bawhor2 bawman1 bcbeat1 beasun2 bkctch1 bkfruw1 blacra1 blacuc1 blakit1 blaplo1 blbpuf2 blcapa2 blfbus1 blhgon1 blhher1 blksaw1 blnmou1 blnwea1 bltapa1 bltbar1 bltori1 blwlap1 brcale1 brcsta1 brctch1 brcwea1 brican1 brobab1 broman1 brosun1 brrwhe3 brtcha1 brubru1 brwwar1 bswdov1 btweye2 bubwar2 butapa1 cabgre1 carcha1 carwoo1 categr ccbeat1 chespa1 chewea1 chibat1 chtapa3 chucis1 cibwar1 cohmar1 colsun2 combul2 combuz1 comsan crefra2 crheag1 crohor1 darbar1 darter3 didcuc1 dotbar1 dutdov1 easmog1 eaywag1 edcsun3 egygoo equaka1 eswdov1 eubeat1 fatrav1 fatwid1 fislov1 fotdro5 gabgos2 gargan gbesta1 gnbcam2 gnhsun1 gobbun1 gobsta5 gobwea1 golher1 grbcam1 grccra1 grecor greegr grewoo2 grwpyt1 gryapa1 grywrw1 gybfis1 gycwar3 gyhbus1 gyhkin1 gyhneg1 gyhspa1 gytbar1 hadibi1 hamerk1 hartur1 helgui hipbab1 hoopoe huncis1 hunsun2 joygre1 kerspa2 klacuc1 kvbsun1 laudov1 lawgol lesmaw1 lessts1 libeat1 litegr litswi1 litwea1 loceag1 lotcor1 lotlap1 luebus1 mabeat1 macshr1 malkin1 marsto1 marsun2 mcptit1 meypar1 moccha1 mouwag1 ndcsun2 nobfly1 norbro1 norcro1 norfis1 norpuf1 nubwoo1 pabspa1 palfly2 palpri1 piecro1 piekin1 pitwhy purgre2 pygbat1 quailf1 ratcis1 raybar1 rbsrob1 rebfir2 rebhor1 reboxp1 reccor reccuc1 reedov1 refbar2 refcro1 reftin1 refwar2 rehblu1 rehwea1 reisee2 rerswa1 rewsta1 rindov rocmar2 rostur1 ruegls1 rufcha2 sacibi2 sccsun2 scrcha1 scthon1 shesta1 sichor1 sincis1 slbgre1 slcbou1 sltnig1 sobfly1 somgre1 somtit4 soucit1 soufis1 spemou2 spepig1 spewea1 spfbar1 spfwea1 spmthr1 spwlap1 squher1 strher strsee1 stusta1 subbus1 supsta1 tacsun1 tafpri1 tamdov1 thrnig1 trobou1 varsun2 vibsta2 vilwea1 vimwea1 walsta1 wbgbir1 wbrcha2 wbswea1 wfbeat1 whbcan1 whbcou1 whbcro2 whbtit5 whbwea1 whbwhe3 whcpri2 whctur2 wheslf1 whhsaw1 whihel1 whrshr1 witswa1 wlwwar wookin1 woosan wtbeat1 yebapa1 yebbar1 yebduc1 yebere1 yebgre1 yebsto1 yeccan1 yefcan yelbis1 yenspu1 yertin1 yesbar1 yespet1 yetgre1 yewgre1".split()

    base_model_name = "tf_efficientnet_b0_ns"
    pretrained = False
    num_classes = 264
    in_channels = 1

    ckpt_path = Path.mkdir(Path.cwd().parent / 'models', exist_ok=True)

In [40]:
config_tf_efficientnet_b0 = CFG_tf_efficientnet_b0()

# config_enesemble = [
#     config_tf_efficientnet_b0,
# ]

In [8]:
def compute_melspec(y, params):
    """
    Computes a mel-spectrogram and puts it at decibel scale
    Arguments:
        y {np array} -- signal
        params {AudioParams} -- Parameters to use for the spectrogram. Expected to have the attributes sr, n_mels, f_min, f_max
    Returns:
        np array -- Mel-spectrogram
    """
    melspec = librosa.feature.melspectrogram(
        y=y,
        sr=params.sr,
        n_mels=params.n_mels,
        n_fft=params.n_fft,
        hop_length=params.hop_length,
        fmin=params.fmin,
        fmax=params.fmax,
    )

    return melspec


def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """
    Converts a one channel array in [0, 255]
    Arguments:
        X {numpy array [H x W]} -- 2D array to convert
    Keyword Arguments:
        eps {float} -- To avoid dividing by 0 (default: {1e-6})
        mean {None or np array} -- Mean for normalization (default: {None})
        std {None or np array} -- Std for normalization (default: {None})
    Returns:
        numpy array [1 x H x W] -- RGB numpy array
    """
    # X = np.stack([X, X, X], axis=-1)
    X = np.expand_dims(X, axis=-1)

    # Standardize
    mean = mean or X.mean()
    std = std or X.std()
    X = (X - mean) / (std + eps)

    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = np.clip(X, _min, _max)
        V = 255 * (V - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)

    return V


def mono_to_color3(X, eps=1e-6, mean=None, std=None):
    """
    Converts a one channel array to a 3 channel one in [0, 255]
    Arguments:
        X {numpy array [H x W]} -- 2D array to convert
    Keyword Arguments:
        eps {float} -- To avoid dividing by 0 (default: {1e-6})
        mean {None or np array} -- Mean for normalization (default: {None})
        std {None or np array} -- Std for normalization (default: {None})
    Returns:
        numpy array [3 x H x W] -- RGB numpy array
    """
    X = np.stack([X, X, X], axis=-1)

    # Standardize
    mean = mean or X.mean()
    std = std or X.std()
    X = (X - mean) / (std + eps)

    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = np.clip(X, _min, _max)
        V = 255 * (V - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)

    return V

In [9]:
class TestDataset(torchdata.Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        clip: np.ndarray,
        config=None,
    ):
        self.df = df
        self.clip = clip
        self.config = config

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        sample = self.df.loc[idx, :]
        row_id = sample.row_id

        end_seconds = int(sample.seconds)
        start_seconds = int(end_seconds - 5)

        y = self.clip[self.config.sr * start_seconds : self.config.sr * end_seconds].astype(np.float32)

        image = compute_melspec(y, self.config)
        image = librosa.power_to_db(image.astype(np.float32), ref=np.max)

        if self.config.in_channels == 3:
            image = mono_to_color3(image)
            image = image.astype(np.uint8)
            image = albu_transforms2['valid'](image=image)['image'].T
        else:
            image = mono_to_color(image)
            image = image.astype(np.uint8)
            image = albu_transforms['valid'](image=image)['image'].T

        return {
            "image": image,
            "row_id": row_id,
        }

In [None]:
class TrainDataset(torchdata.Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        clip: np.ndarray,
        config=None,
    ):
        self.df = df
        self.clip = clip
        self.config = config

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        sample = self.df.loc[idx, :]
        row_id = sample.row_id

        end_seconds = int(sample.seconds)
        start_seconds = int(end_seconds - 5)

        y = self.clip[self.config.sr * start_seconds : self.config.sr * end_seconds].astype(np.float32)

        image = compute_melspec(y, self.config)
        image = librosa.power_to_db(image.astype(np.float32), ref=np.max)

        if self.config.in_channels == 3:
            image = mono_to_color3(image)
            image = image.astype(np.uint8)
            image = albu_transforms2['valid'](image=image)['image'].T
        else:
            image = mono_to_color(image)
            image = image.astype(np.uint8)
            image = albu_transforms['valid'](image=image)['image'].T

        return {
            "image": image,
            "row_id": row_id,
        }

In [14]:
# models_ensemble = []

# for config in config_enesemble:

#     config_models = []

#     for ckpt_path in config.ckpt_path:

#         model = TimmSED(
#             base_model_name=config.base_model_name,
#             config=config,
#             pretrained=config.pretrained,
#             num_classes=config.num_classes,
#             in_channels=config.in_channels
#         )

#         model.load_state_dict(torch.load(ckpt_path, map_location=device))
#         model.eval()

#         config_models.append(model)

#     models_ensemble.append((config, config_models))

# all_audios = list(Path("../input/birdclef-2023/test_soundscapes/").glob("*.ogg"))*15

In [41]:
models_ensemble = []

config = config_tf_efficientnet_b0

model = TimmSED(
    base_model_name=config.base_model_name,
    config=config,
    pretrained=config.pretrained,
    num_classes=config.num_classes,
    in_channels=config.in_channels
)

all_audios = list((Path.cwd().parent / 'data' / "train_audio").glob("**/*.ogg"))*15

  model = create_fn(


In [42]:
all_audios

[PosixPath('/home/broug/Desktop/birds/data/train_audio/fatrav1/XC354694.ogg'),
 PosixPath('/home/broug/Desktop/birds/data/train_audio/fatrav1/XC687329.ogg'),
 PosixPath('/home/broug/Desktop/birds/data/train_audio/fatrav1/XC293554.ogg'),
 PosixPath('/home/broug/Desktop/birds/data/train_audio/fatrav1/XC447027.ogg'),
 PosixPath('/home/broug/Desktop/birds/data/train_audio/fatrav1/XC746540.ogg'),
 PosixPath('/home/broug/Desktop/birds/data/train_audio/fatrav1/XC445355.ogg'),
 PosixPath('/home/broug/Desktop/birds/data/train_audio/fatrav1/XC687331.ogg'),
 PosixPath('/home/broug/Desktop/birds/data/train_audio/fatrav1/XC293487.ogg'),
 PosixPath('/home/broug/Desktop/birds/data/train_audio/fatrav1/XC322388.ogg'),
 PosixPath('/home/broug/Desktop/birds/data/train_audio/fatrav1/XC371225.ogg'),
 PosixPath('/home/broug/Desktop/birds/data/train_audio/fatrav1/XC512852.ogg'),
 PosixPath('/home/broug/Desktop/birds/data/train_audio/fatrav1/XC687332.ogg'),
 PosixPath('/home/broug/Desktop/birds/data/train_aud

In [10]:
seconds = [i for i in range(5, 605, 5)]


def prediction_for_clip(audio_path):
    device = torch.device("cpu")

    # inference
    prediction_dict = {}

    global models_ensemble

    clip, _ = librosa.load(audio_path, sr=32000)
    name_ = "_".join(audio_path.name.split(".")[:-1])
    row_ids = [name_ + f"_{second}" for second in seconds]

    test_df = pd.DataFrame({"row_id": row_ids, "seconds": seconds})

    for config_models in models_ensemble:
        config, models = config_models[0], config_models[1]

        dataset = TestDataset(
            df=test_df,
            clip=clip,
            config=config,
        )

        loader = torchdata.DataLoader(
            dataset, batch_size=config.batch_size, num_workers=config.num_workers, drop_last=False, shuffle=False, pin_memory=True
        )

        for data in loader:
            row_ids = data['row_id']

            for row_id in row_ids:
                if row_id not in prediction_dict:
                    prediction_dict[str(row_id)] = []

            image = data['image']  # .to(device)

            probas = []

            for model in models:
                with torch.no_grad():
                    output = model(image)
                #
                for row_id_idx, row_id in enumerate(row_ids):
                    prediction_dict[str(row_id)].append(output['clipwise_output'][[row_id_idx]].numpy().reshape(-1))

    for row_id in list(prediction_dict.keys()):
        logits = np.array(prediction_dict[row_id]).mean(0)
        prediction_dict[row_id] = {}
        for label in range(len(config.target_columns)):
            prediction_dict[row_id][config.target_columns[label]] = logits[label]

    return prediction_dict

In [26]:
seconds = [i for i in range(5, 605, 5)]


def prediction_for_clip2(audio_path):
    device = torch.device("cpu")

    # inference
    prediction_dict = {}

    global models_ensemble

    clip, _ = librosa.load(audio_path, sr=32000)
    name_ = "_".join(audio_path.name.split(".")[:-1])
    row_ids = [name_ + f"_{second}" for second in seconds]

    train_df = pd.DataFrame({"row_id": row_ids, "seconds": seconds})

    for config_models in models_ensemble:
        config, models = config_models[0], config_models[1]

        dataset = TrainDataset(
            df=train_df,
            clip=clip,
            config=config,
        )

        loader = torchdata.DataLoader(
            dataset, batch_size=config.batch_size, num_workers=config.num_workers, drop_last=False, shuffle=False, pin_memory=True
        )

        for data in loader:
            row_ids = data['row_id']

            for row_id in row_ids:
                if row_id not in prediction_dict:
                    prediction_dict[str(row_id)] = []

            image = data['image']  # .to(device)

            probas = []

            with model.train():
                output = model(image)

            #
            for row_id_idx, row_id in enumerate(row_ids):
                prediction_dict[str(row_id)].append(output['clipwise_output'][[row_id_idx]].numpy().reshape(-1))

        with torch.no_grad():
            output = model(image)

    for row_id in list(prediction_dict.keys()):
        logits = np.array(prediction_dict[row_id]).mean(0)
        prediction_dict[row_id] = {}
        for label in range(len(config.target_columns)):
            prediction_dict[row_id][config.target_columns[label]] = logits[label]

    return prediction_dict

In [11]:
def crop_or_pad(y, length, sr, train=True, probs=None):
    """
    Crops an array to a chosen length
    Arguments:
        y {1D np array} -- Array to crop
        length {int} -- Length of the crop
        sr {int} -- Sampling rate
    Keyword Arguments:
        train {bool} -- Whether we are at train time. If so, crop randomly, else return the beginning of y (default: {True})
        probs {None or numpy array} -- Probabilities to use to chose where to crop (default: {None})
    Returns:
        1D np array -- Cropped array
    """
    if len(y) <= length:
        y = np.concatenate([y, np.zeros(length - len(y))])
    else:
        if not train:
            start = 0
        elif probs is None:
            start = np.random.randint(len(y) - length)
        else:
            start = np.random.choice(np.arange(len(probs)), p=probs) + np.random.random()
            start = int(sr * (start))

        y = y[start : start + length]

    return y.astype(np.float32)

In [27]:
%%time
start = time.time()
dicts = []
for audio_path in all_audios:
    dicts.append(prediction_for_clip2(audio_path))
print(f"Regular for loop costs {time.time()-start} for processing 15 audios")

Regular for loop costs 5.4836273193359375e-05 for processing 15 audios
CPU times: user 207 µs, sys: 50 µs, total: 257 µs
Wall time: 269 µs


In [29]:
start = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    dicts = list(executor.map(prediction_for_clip2, all_audios))
print(f"With concurrent ThreadPoolExecutor, time cost reduced to {time.time()-start} for processing 15 audios")

With concurrent ThreadPoolExecutor, time cost reduced to 0.0005519390106201172 for processing 15 audios


In [None]:
prediction_dicts = {}
for d in dicts:
    prediction_dicts.update(d)

In [None]:
submission = pd.DataFrame.from_dict(prediction_dicts, "index").rename_axis("row_id").reset_index()
submission.to_csv("submission.csv", index=False)
submission.head()