# Summary of this notebook

This notebook shows how to run inference with the nocall detector on train_short_audio.

# Input & Output

[input]

birdclef-2021 (original data). Note: the code cells below read the metadata from `../input/birdclef-2022/train_metadata.csv`.

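7-second clip mel-spectrogram images of train_short_audio. Note: the code cells below load `.npy` files from a directory named `birdclef2022_mels_5s`, so confirm the clip length against the dataset actually attached.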

(generated by kkiller's notebook https://www.kaggle.com/kneroma/birdclef-mels-computer-public)

https://www.kaggle.com/kneroma/kkiller-birdclef-mels-computer-d7-part1

https://www.kaggle.com/kneroma/kkiller-birdclef-mels-computer-d7-part2

https://www.kaggle.com/kneroma/kkiller-birdclef-mels-computer-d7-part3

https://www.kaggle.com/kneroma/kkiller-birdclef-mels-computer-d7-part4

nocall detector models

[output]

The inference results for train_short_audio are written to one CSV file per fold (`nocalldetection_for_shortaudio_fold{fold}.csv`).

In [1]:
import torch

class CFG:
    """Static configuration shared by the cells of this notebook."""

    # --- run control ---
    debug = False        # when True, a later cell subsamples the clip table
    print_freq = 100     # not referenced by the cells shown in this notebook
    num_workers = 1      # DataLoader worker processes
    seed = 42            # handed to seed_torch()

    # --- model ---
    model_name = 'resnext50_32x4d'  # timm architecture name; also part of the checkpoint file name
    pretrained = False              # forwarded to timm.create_model
    target_size = 2                 # output width of the replacement fc layer
    fold = 2                        # choose from [0,1,2,3,4]; the inference loop overwrites this

    # --- data ---
    dim = (128, 281)     # the two arguments given to A.Resize
    batch_size = 1       # inference() reads only element 0 of each batch
    epochs = 10          # not referenced by the cells shown in this notebook

    # Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
!pip install --quiet timm

import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

from albumentations.pytorch.transforms import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

import warnings 
warnings.filterwarnings('ignore')

import glob



In [3]:
# Directory that receives train.log (it is the prefix of init_logger's default
# log file); './' is the notebook's working directory.
OUTPUT_DIR = './'
# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
def get_score(y_true, y_pred):
    """Return the accuracy of ``y_pred`` measured against ``y_true``.

    Thin wrapper that forwards both arguments, unchanged, to
    ``accuracy_score`` with that function's default options.
    """
    score = accuracy_score(y_true, y_pred)
    return score

def get_confusion_matrix(y_true, y_pred):
    """Return the confusion matrix of ``y_pred`` against ``y_true``.

    Thin wrapper that forwards both arguments, unchanged, to
    ``confusion_matrix`` with that function's default options.
    """
    matrix = confusion_matrix(y_true, y_pred)
    return matrix

@contextmanager
def timer(name):
    """Log the start of a ``with`` block and, on normal exit, its duration.

    Both messages go to the module-level ``LOGGER`` and are tagged with
    ``name``. The duration is wall-clock time rounded to whole seconds.
    There is no ``try``/``finally``, so the "done" line is not logged when
    the body raises.
    """
    started_at = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s.')


def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Create (or re-initialise) this module's logger.

    Messages at INFO level and above are written, as the bare message text,
    both to a stream handler and to ``log_file``.

    ``getLogger(__name__)`` hands back the same logger object on every call,
    so re-running this cell used to stack another pair of handlers on it and
    each message was then emitted several times. Handlers left over from an
    earlier call are therefore removed and closed before new ones are added.

    Args:
        log_file: Path of the log file; defaults to ``OUTPUT_DIR + 'train.log'``.

    Returns:
        The configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Drop handlers from a previous call so messages are not duplicated and
    # the old log-file handle is released.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

# Module-level logger used by timer(); created with init_logger's default log file.
LOGGER = init_logger()


def seed_torch(seed=42):
    """Seed every random number generator this notebook touches.

    Sets ``PYTHONHASHSEED`` in ``os.environ``, seeds Python's ``random``,
    NumPy's global RNG and PyTorch (``torch.manual_seed`` and
    ``torch.cuda.manual_seed``), and switches
    ``torch.backends.cudnn.deterministic`` on.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The individual seeders are independent of one another, so they can be
    # applied in a single pass.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

# Seed the Python, NumPy and PyTorch RNGs with the configured seed.
seed_torch(seed=CFG.seed)

In [5]:
# Competition metadata, one row per recording. The code below relies on the
# FIRST column holding the per-label directory name and the LAST column
# holding the audio file path.
short = pd.read_csv('../input/birdclef-2022/train_metadata.csv')

# Cache file for the clip-level table.
# NOTE(review): the variable name says "5s" while the file name says "7s";
# both are left unchanged so an existing cache file is still picked up.
metadata_5s = "./train_metadata_7s.csv"

if not os.path.exists(metadata_5s):
    # Build the table once: same columns as `short`, but with the last column
    # replaced by the path of the matching mel-spectrogram .npy file.
    mel_root = "../input/birdclef2022-mels-generator/birdclef2022_mels_5s/"
    label_col, path_col = short.columns[0], short.columns[-1]

    # One pass over the two relevant columns. The previous version re-created
    # the whole list on every row (`metadata = metadata + [...]`) and sliced
    # each row with iloc, which is quadratic in the number of recordings.
    mel_paths = [
        mel_root + label + "/" + os.path.basename(audio_path) + ".npy"
        for label, audio_path in tqdm(zip(short[label_col], short[path_col]), total=len(short))
    ]

    short_clip = short.copy()
    short_clip[path_col] = mel_paths

    short_clip.to_csv(metadata_5s)
else:
    short_clip = pd.read_csv(metadata_5s, index_col=0)

print(short_clip.head())

  0%|          | 0/14852 [00:00<?, ?it/s]

  primary_label                secondary_labels                     type  \
0       afrsil1                              []  ['call', 'flight call']   
1       afrsil1  ['houspa', 'redava', 'zebdov']                 ['call']   
2       afrsil1                              []         ['call', 'song']   
3       afrsil1                              []   ['alarm call', 'call']   
4       afrsil1                              []          ['flight call']   

   latitude  longitude  scientific_name         common_name          author  \
0   12.3910    -1.4930  Euodice cantans  African Silverbill       Bram Piot   
1   19.8801  -155.7254  Euodice cantans  African Silverbill        Dan Lane   
2   16.2901   -16.0321  Euodice cantans  African Silverbill       Bram Piot   
3   17.0922    54.2958  Euodice cantans  African Silverbill  Oscar Campbell   
4   21.4581  -157.7252  Euodice cantans  African Silverbill   Ross Gallardy   

                                             license  rating   time 

In [6]:
class TestDataset(Dataset):
    """Dataset that yields, per metadata row, every clip stored in one ``.npy`` file.

    Each row's ``filename`` column is used directly as the path passed to
    ``np.load``. The loaded array is iterated along its first axis, one
    "clip" per step (presumably shaped ``(n_clips, height, width)`` — confirm
    against the mel generator). Each clip is replicated into three identical
    channels on a new last axis before the optional transform is applied.

    Args:
        df: DataFrame with a ``filename`` column of ``.npy`` paths.
        transform: Optional callable invoked as ``transform(image=clip)`` and
            expected to return a mapping with an ``'image'`` entry.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.filenames = df['filename'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return all clips of row ``idx`` stacked along axis 0.

        With a transform, the result is the stack of the transformed clips.
        Without one, the 3-channel clips are returned untransformed; this
        used to fail because ``np.stack`` was called on an empty list.
        """
        clips = np.load(self.filenames[idx])
        # Repeat the single channel three times on a new trailing axis so each
        # clip looks like a 3-channel image.
        clips = np.stack((clips,) * 3, -1)
        if self.transform is None:
            return clips
        transformed = [self.transform(image=clip)['image'] for clip in clips]
        return np.stack(transformed, axis=0)

In [7]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

def get_transforms(*, data):
    """Build the albumentations pipeline for one data split.

    Both splits resize to ``CFG.dim``, normalise with the mean/std constants
    below and convert to a tensor with ``ToTensorV2``. The ``'train'`` split
    additionally applies random horizontal flip, vertical flip and JPEG
    compression (each with p=0.5) between the resize and the normalisation.

    Args:
        data: ``'train'`` or ``'valid'`` (keyword-only).

    Returns:
        An ``A.Compose`` pipeline.

    Raises:
        ValueError: If ``data`` is neither ``'train'`` nor ``'valid'``. The
            function used to return ``None`` silently in that case, which only
            failed later inside the dataset.
    """
    if data not in ('train', 'valid'):
        raise ValueError(f"Unknown data split {data!r}; expected 'train' or 'valid'")

    steps = [A.Resize(CFG.dim[0], CFG.dim[1])]

    if data == 'train':
        # NOTE(review): JpegCompression is reported as deprecated in favour of
        # ImageCompression in newer albumentations releases — confirm against
        # the installed version before upgrading.
        steps += [
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.augmentations.transforms.JpegCompression(p=0.5),
        ]

    steps += [
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
        ToTensorV2(),
    ]
    return A.Compose(steps)

In [8]:
import torch.nn as nn
import timm

class CustomResNext(nn.Module):
    """timm backbone whose ``fc`` head is replaced by a ``CFG.target_size``-wide linear layer.

    Args:
        model_name: Architecture name passed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
    """

    def __init__(self, model_name='resnext50_32x4d', pretrained=True):
        super().__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained)
        # Swap the classifier for a fresh linear layer with the same input
        # width and CFG.target_size outputs.
        backbone.fc = nn.Linear(backbone.fc.in_features, CFG.target_size)
        self.model = backbone

    def forward(self, x):
        """Return the wrapped model's output for ``x``."""
        return self.model(x)

In [9]:
def inference(model, states, test_loader, device):
    """Run the model over ``test_loader`` and return per-batch class probabilities.

    For every batch, only element 0 is used (``images[0]``), so the loader is
    expected to run with ``batch_size=1``. That element is fed to the model as
    a batch of its own; presumably it holds all clips of one recording —
    confirm against the dataset. The softmax outputs are averaged over all
    checkpoints in ``states``.

    Args:
        model: Network to evaluate; moved to ``device`` and put in eval mode.
        states: Sequence of checkpoint dicts, each with the weights under the
            ``'model'`` key.
        test_loader: Iterable of batches that also supports ``len()``.
        device: Device on which the forward passes run.

    Returns:
        ``np.ndarray`` with one entry per batch. When all per-batch results
        have the same shape this is a regular stacked array, as before.
        Otherwise it is a 1-D object array whose elements are the individual
        probability arrays.
    """
    model.to(device)
    # load_state_dict does not change train/eval mode, so eval() is needed only once.
    model.eval()

    loaded_state = None
    probs = []
    tk0 = tqdm(enumerate(test_loader, start=1), total=len(test_loader))
    for count, images in tk0:
        images = images[0].to(device)
        per_state_preds = []
        for state in states:
            # Reload weights only when the checkpoint actually changes. With a
            # single checkpoint this loads once instead of once per batch.
            if loaded_state is not state:
                model.load_state_dict(state['model'])
                loaded_state = state
            with torch.no_grad():
                y_preds = model(images)
            per_state_preds.append(y_preds.softmax(1).to('cpu').numpy())
        probs.append(np.mean(per_state_preds, axis=0))
        if count % 1000 == 0:
            print(count)

    # Same-shaped results stack into a regular array exactly as before.
    if len({np.shape(p) for p in probs}) <= 1:
        return np.asarray(probs)

    # Results of differing lengths are ragged. np.asarray on a ragged list
    # raises on NumPy >= 1.24, so the object array is built explicitly.
    ragged = np.empty(len(probs), dtype=object)
    for position, batch_probs in enumerate(probs):
        ragged[position] = batch_probs
    return ragged

In [10]:
# Debug mode: keep a small random subset of rows so the whole pipeline runs
# quickly. min() keeps sample() from raising when fewer than 10 rows exist.
if CFG.debug:
    short_clip = short_clip.sample(n=min(10, len(short_clip)))

In [11]:
MODEL_DIR = '../input/kagglebirdclef2021callvsnoncall/'

# The dataset, loader and network do not depend on the fold, so they are built
# once. Every fold overwrites the network's weights from its own checkpoint
# inside inference().
test_dataset = TestDataset(short_clip, transform=get_transforms(data='valid'))
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False,
                         num_workers=CFG.num_workers,
                         # pinned host memory is only useful when copying to a GPU
                         pin_memory=CFG.device.type == 'cuda')
model = CustomResNext(CFG.model_name, pretrained=CFG.pretrained)

for fold in range(5):
    CFG.fold = fold

    # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
    states = [torch.load(MODEL_DIR+f'{CFG.model_name}_fold{CFG.fold}_best.pth',
                         map_location=CFG.device),]
    predictions = inference(model, states, test_loader, CFG.device)

    # Keep column 1 of each per-recording probability array and serialise it
    # as a space-separated string, one string per recording.
    predictions = [i[:,1] for i in predictions]
    predictions = [' '.join(map(str, j.tolist())) for j in predictions]
    short_clip['nocalldetection'] = predictions
    short_clip.to_csv(f'./nocalldetection_for_shortaudio_fold{CFG.fold}.csv', index=False)

# Show the table with the last fold's predictions. Inside the loop this
# expression was evaluated but never displayed.
short_clip.head()

  0%|          | 0/14852 [00:00<?, ?it/s]

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000


  0%|          | 0/14852 [00:00<?, ?it/s]

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000


  0%|          | 0/14852 [00:00<?, ?it/s]

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000


  0%|          | 0/14852 [00:00<?, ?it/s]

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000


  0%|          | 0/14852 [00:00<?, ?it/s]

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
