In [3]:
import os
import cv2
import csv
import sys
import copy
from tqdm import tqdm
from typing import Union, List, Dict, Any, cast
import random
import librosa
import librosa.display
import numpy as np 
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import label_ranking_average_precision_score, accuracy_score
import torchvision

import audiomentations as audioaa

import matplotlib.pyplot as plt
import IPython.display as ipd 
import skimage.io
from skimage.transform import resize
import albumentations as albu
from albumentations import pytorch as AT
from PIL import Image
from functools import partial


import pretrainedmodels
#from resnest.torch import resnest50

sys.path.append('../')

import src.audio_augs as aa
from src.utils import patch_first_conv
from src.loss import lsep_loss_stable, lsep_loss
from src.batch_mixer import BatchMixer
from src.pann import *

import timm
from timm.models.efficientnet import tf_efficientnet_b0_ns, tf_efficientnet_lite4, mobilenetv2_140
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation


import warnings
warnings.filterwarnings('ignore')

In [4]:
# --- Input locations (relative to the notebook's working directory) ---------
train_folder_path = "../data/train/"
train_np_folder_path = "../data/train_np/"
test_folder_path = "../data/test/"
sample_submission = "../data/sample_submission.csv"
train_tp_path = "../data/train_tp.csv"
train_fp_path = "../data/train_fp.csv"

# --- Fold assignments -------------------------------------------------------
train_tp_folds = pd.read_csv("../data/train_tp_folds_v3.csv")
# Use the keyword form: `drop(label, 1)` passed `axis` positionally, which
# pandas 2.x no longer accepts. This also matches how `_df` is handled below.
train_fp_folds = pd.read_csv("train_fp_folds.csv").drop(columns="Unnamed: 0")

train_files = os.listdir(train_folder_path)
test_files = os.listdir(test_folder_path)

train_tp = pd.read_csv(train_tp_path)
train_fp = pd.read_csv(train_fp_path)

# Extra annotations that are later merged into the training split.
_df = pd.read_csv("missing_3classes_extended.csv")
_df = _df.drop(columns="Unnamed: 0")

# --- Pseudo labels ----------------------------------------------------------
# Keep only detections with mean confidence above 0.99 and reshape them into
# the column layout of the true-positive table. The confidence columns are
# renamed to f_min / f_max purely to fill those slots; they are NOT
# frequencies.
pseudo = pd.read_csv("pseudolabels_raw.csv")
pseudo_clear = (
    pseudo[pseudo.mean_confidence > 0.99]
    .rename(columns={"offset": "t_max", "onset": "t_min", "file_id": "recording_id",
                     "max_confidence": "f_min", "mean_confidence": "f_max"})
    # `assign` builds a new frame, so no values are written into a filtered
    # view of `pseudo` (the original triggered chained-assignment behaviour).
    .assign(songtype_id=1, label=0.9)
)
pseudo_clear = pseudo_clear[['recording_id', 'species_id', 'songtype_id', 't_min', 'f_min', 't_max', "f_max", "label"]]
pseudo_clear = pseudo_clear[~(pseudo_clear.species_id == 12)]

In [5]:
class Config:
    """Static hyper-parameters shared by the cells of this notebook."""

    # Run / data-loader settings
    SEED = 25
    NUM_BIRDS = 24
    BATCH_SIZE = 16
    NUM_WORKERS = 4
    FOLD = 0
    TEST_FOLD = 5
    EPOCHS = 50

    # Optimizer settings
    LR = 0.01
    LR_ADAM = 1e-3
    WEIGHT_DECAY = 0.0001
    MOMENTUM = 0.9
    T_MAX = 8

    # Scheduler settings
    FACTOR = 0.8
    PATIENCE = 4

    # Audio settings: sample rate in Hz and two clip lengths in samples
    # (10 s and 5 s at SR).
    SR = 48000
    LENGTH_1 = SR * 10
    LENGTH_2 = SR * 5
    # TODO: move the augmentation settings into this config as well
    
# Registry of encoder backbones: maps an encoder name to the number of feature
# channels fed into the model's first linear layer ("features") and a zero-arg
# factory that builds the backbone ("init_op").
# NOTE(review): the key says "efficientnet_b0" but the factory builds
# `mobilenetv2_140`; 1792 is presumably that backbone's output width (the
# commented-out 1280 looks like a leftover from another backbone) - confirm.
encoder_params = {
    "efficientnet_b0": {
        #"features": 1280,
        "features": 1792,
        "init_op": partial(mobilenetv2_140, pretrained=True, drop_path_rate=0.2)
        }
    }
    
# Keyword arguments passed verbatim to AudioSEDModel(**model_param).
model_param = {
        'encoder' : 'efficientnet_b0',  # key into encoder_params
        'sample_rate': 48000,
        'window_size' : 2048, # STFT n_fft / win_length
        'hop_size' : 512, # STFT hop length in samples
        'mel_bins' : 224, # number of mel bands
        'fmin' : 300,
        'fmax' : 15000,
        'classes_num' : 24
    }

In [6]:
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for each library-level generator.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    # Deterministic kernels, no autotuning.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs once, before any model or data loader is created.
seed_everything(Config.SEED)


In [15]:
class AudioSEDModel(nn.Module):
    """Sound-event-detection model operating on raw waveforms.

    The forward pass computes a log-mel spectrogram, runs it through the
    encoder selected from ``encoder_params`` and applies an attention block
    that yields clip-level and frame-level predictions.

    Args:
        encoder: key into the module-level ``encoder_params`` dict.
        sample_rate: sample rate passed to the log-mel filter bank.
        window_size: STFT ``n_fft`` and ``win_length``.
        hop_size: STFT hop length.
        mel_bins: number of mel bands (also the channel count of ``bn0``).
        fmin: lower frequency bound of the mel filter bank.
        fmax: upper frequency bound of the mel filter bank.
        classes_num: number of output classes.
    """

    def __init__(self, encoder, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
        super().__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None
        # Factor used to upsample segment-level outputs back to frame level.
        # NOTE(review): presumably the encoder's temporal downsampling factor -
        # confirm that 30 matches the chosen backbone.
        self.interpolate_ratio = 30  # Downsampled ratio
        # Not referenced anywhere else in this class.
        self.mixup_coff = Mixup(1.)

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, 
            win_length=window_size, window=window, center=center, pad_mode=pad_mode, 
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, 
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, 
            freeze_parameters=True)

        # Spec augmenter (constructed, but see the disabled branch in forward)
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, 
            freq_drop_width=8, freq_stripes_num=2)
        
        # Model Encoder
        self.encoder = encoder_params[encoder]["init_op"]()
        self.fc1 = nn.Linear(encoder_params[encoder]["features"], 1024, bias=True)
        self.att_block = AttBlock(1024, classes_num, activation="sigmoid")
        self.bn0 = nn.BatchNorm2d(mel_bins)
        self.init_weight()
    
    def init_weight(self):
        """Initialise ``fc1`` and ``bn0`` with the imported ``init_layer`` / ``init_bn`` helpers."""
        init_layer(self.fc1)
        init_bn(self.bn0)
    
    def forward(self, input, mixup_lambda=None):
        """Run the model on a batch of waveforms.

        Args:
            input: waveform batch, documented by the original author as
                (batch_size, data_length).
            mixup_lambda: optional mixup coefficients; when given and the
                module is in training mode, ``do_mixup`` is applied to the
                spectrogram.

        Returns:
            dict with keys "framewise_output", "segmentwise_output", "logit",
            "framewise_logit" and "clipwise_output".
        """

        x = self.spectrogram_extractor(input)
        # batch_size x 1 x time_steps x freq_bins
        x = self.logmel_extractor(x)
        # batch_size x 1 x time_steps x mel_bins

        # Number of spectrogram frames; frame-level outputs are padded back to it.
        frames_num = x.shape[2]

        # Move the mel axis to the channel position so bn0 normalises per mel bin.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        # `and False` makes this branch unreachable: SpecAugment is disabled.
        if self.training and False:
            x = self.spec_augmenter(x)
        
        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)
        
        # Output shape (batch size, channels, time, frequency)
        # Replicate the single channel three times for the image backbone.
        x = x.expand(x.shape[0], 3, x.shape[2], x.shape[3])
        x = self.encoder.forward_features(x)
        # Average over the last (frequency) axis.
        x = torch.mean(x, dim=3)

        # Smooth along the remaining time axis with width-3 max + average pooling.
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.5, training=self.training)
        # nn.Linear acts on the last axis, so swap features to the end and back.
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)

        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        # Attention-weighted sum of the raw classifier output (no activation).
        logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
        segmentwise_logit = self.att_block.cla(x).transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       self.interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)
        
        framewise_logit = interpolate(segmentwise_logit, self.interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        output_dict = {
            "framewise_output": framewise_output,
            "segmentwise_output": segmentwise_output,
            "logit": logit,
            "framewise_logit": framewise_logit,
            "clipwise_output": clipwise_output
        }


        return output_dict
    
def crop_or_pad(y, is_train=True, length=None):
    """Return a float32 clip of exactly ``length`` samples.

    Shorter inputs are padded on both sides (random split) with the clip's
    minimum value; longer inputs are cropped at a random offset.

    Args:
        y: 1-D array-like waveform.
        is_train: kept for backward compatibility; currently unused.
        length: target number of samples. Defaults to ``Config.LENGTH_2``.

    Returns:
        np.ndarray of dtype float32 and shape ``(length,)``.
    """
    if length is None:
        length = Config.LENGTH_2

    y = np.asarray(y)
    n_samples = len(y)

    if n_samples == 0:
        # "minimum" padding is undefined for an empty signal; return silence.
        return np.zeros(length, dtype=np.float32)

    if n_samples < length:
        pad_width = length - n_samples
        # randint's upper bound is exclusive: +1 lets all padding land on the
        # left as well (the original could never sample that split).
        pad_left = np.random.randint(0, pad_width + 1)
        y = np.pad(y, (pad_left, pad_width - pad_left), "minimum")
    elif n_samples > length:
        # +1 so the last valid window (ending at the final sample) is reachable.
        start = np.random.randint(0, n_samples - length + 1)
        y = y[start:start + length]

    return y.astype(np.float32, copy=False)

In [16]:
class RainforestDataset(Dataset):
    """Dataset yielding a random fixed-length waveform window around an event.

    Each row of ``df`` is read positionally: column 0 is the recording id,
    columns 3 and 5 are the event start/end in seconds and the last column is
    the label value written into the target vector.
    NOTE(review): this positional layout is assumed from the indexing below -
    confirm that every frame passed in uses that column order.

    Args:
        df: annotation table (one event per row).
        audio_transforms: optional callable invoked as
            ``audio_transforms(samples=..., sample_rate=Config.SR)``.
        image_transforms: stored but not used by ``__getitem__``.
    """

    def __init__(self, df, audio_transforms = None, image_transforms = None,):
        self.audio_transforms = audio_transforms
        self.img_transforms = image_transforms
        self.df = df

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _load_wav(recording_id):
        """Load the cached train waveform, falling back to the test .flac file."""
        try:
            return np.load(train_np_folder_path + recording_id + ".npy")
        except OSError:
            # Only a missing/unreadable cache file triggers the fallback (this
            # covers recordings that live in the test folder). Other failures
            # are no longer hidden by a bare `except:`.
            wav, _ = librosa.load('../data/test/' + recording_id + ".flac", sr=None)
            return wav

    def __getitem__(self, idx):
        """Return ``(waveform_tensor, label_array)`` for row ``idx``."""
        sample = copy.deepcopy(self.df.iloc[idx, :].values)
        recording_id = sample[0]
        wav = self._load_wav(recording_id)

        window = Config.LENGTH_2
        tmin = float(sample[3]) * Config.SR
        tmax = float(sample[5]) * Config.SR
        center = int(np.round((tmin + tmax) / 2))

        # Random window start in [center - window/2, center), clamped at 0.
        # `high` is forced above `low` so randint cannot fail when center == 0.
        low = max(center - window // 2, 0)
        high = max(center, low + 1)
        beginning = int(np.random.randint(low, high))
        ending = beginning + window
        if ending > len(wav):
            ending = len(wav)
            # Clamp at 0: a negative start would silently index from the end.
            beginning = max(ending - window, 0)

        wav_slice = wav[beginning:ending]
        if len(wav_slice) < window:
            # Recording shorter than the window: zero-pad so that every item
            # has the same length and batches can be collated.
            wav_slice = np.pad(wav_slice, (0, window - len(wav_slice)))

        # Every annotated event of this recording that overlaps the window
        # contributes to the multi-label target.
        beginning_time = beginning / Config.SR
        ending_time = ending / Config.SR
        df = self.df
        overlaps = (
            (df["recording_id"] == recording_id)
            & (df["t_min"] < ending_time)
            & (df["t_max"] > beginning_time)
        )
        all_tp_events = df[overlaps]

        label_array = np.zeros(24, dtype=np.float32)
        for species_id in all_tp_events["species_id"].unique():
            label_array[int(species_id)] = sample[-1]
            # Species 12 additionally marks class 3 as present.
            if species_id == 12:
                label_array[3] = 1

        if self.audio_transforms:
            wav_slice = self.audio_transforms(samples=wav_slice, sample_rate=Config.SR)

        # Fixed x10 gain; the validation loader applies the same factor.
        wav_slice = wav_slice.astype(np.float32) * 10.

        return torch.tensor(wav_slice), label_array

In [17]:
# Tag rows with their source table: 1 = true positive, 0 = false positive.
train_tp_folds["true"] = 1
train_fp_folds["true"] = 0

# Hold out Config.FOLD for validation; everything else is training data.
in_val_fold = train_tp_folds['fold'] == Config.FOLD
X_val = train_tp_folds[in_val_fold].reset_index(drop=True)
X_train = train_tp_folds[~in_val_fold].reset_index(drop=True)

# Add the extra annotations, restricted to recordings already in the training
# split, after aligning the training frame to their column layout.
_df = _df[_df.recording_id.isin(X_train.recording_id)]
X_train = pd.concat([X_train[_df.columns], _df])
X_train["label"] = 1

# Optionally extend the training set with the filtered pseudo labels.
add_pseudo = False

if add_pseudo:
    X_train = pd.concat([X_train[pseudo_clear.columns], pseudo_clear])

print(f"Training on {len(X_train)} examples")
print(f"Validating on {len(X_val)} examples")
#print('Testing on ' + str(len(X_test)) + ' examples')

Training on 1155 examples
Validating on 218 examples


In [18]:
# Waveform augmentation built from the project-local `src.audio_augs` module.
# It is not the pipeline handed to the training dataset in the cells visible
# here (that one is `audio_transform` below).
audio_transform_train = aa.Compose([
  aa.OneOf([
    aa.GaussianNoiseSNR(min_snr=5.0, max_snr=20.0),
    aa.PinkNoiseSNR( min_snr=5.0, max_snr=20.0,)
  ]),
  aa.PitchShift(max_steps=4, sr=Config.SR, p=0.2),
  #aa.TimeStretch(max_rate=1.2, p=0.1),
  aa.TimeShift(sr=Config.SR),
  aa.VolumeControl(mode="sine", p=0.2 )
])

# Waveform augmentation built from the `audiomentations` package; each step is
# applied independently with probability 0.5.
# NOTE(review): keyword names such as `min_SNR` / `max_SNR` depend on the
# installed audiomentations version - confirm against the pinned release.
audio_transform = audioaa.Compose([
    audioaa.AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    #audioaa.AddShortNoises(min_rate=0.8, max_rate=1.25, p=0.5),
    audioaa.AddGaussianSNR(min_SNR=0.001, max_SNR=1.0, p=0.5),
    audioaa.PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    #|audioaa.Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
    #audioaa.Normalize(),
    #audioaa.PolarityInversion(p=0.5),
    audioaa.Gain(min_gain_in_db=-12, max_gain_in_db=12, p=0.5),
    #audioaa.ClippingDistortion(min_percentile_threshold=0, max_percentile_threshold=40, p=0.5)
])


In [19]:
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss

class PANNsLoss(nn.Module):
    """Binary cross-entropy on the model's clip-level probabilities.

    NaN and infinite predictions are replaced by zero before the loss is
    evaluated.
    """

    def __init__(self):
        super().__init__()

        self.bce = nn.BCELoss()

    def forward(self, input, target):
        """Compute the loss.

        Args:
            input: model output dict; only "clipwise_output" is read.
            target: multi-hot target tensor (cast to float).
        """
        probs = input["clipwise_output"]
        zeros = torch.zeros_like(probs)

        # Zero out NaNs first, then infinities.
        for is_invalid in (torch.isnan, torch.isinf):
            probs = torch.where(is_invalid(probs), zeros, probs)

        return self.bce(probs, target.float())
        
class FocalLoss(nn.Module):
    """Multi-label focal loss computed from logits.

    The per-element binary cross-entropy is scaled by ``(1 - p_t) ** gamma``,
    where ``p_t`` is the predicted probability of the true label. For 2-D
    input the loss is summed over classes and then averaged over the batch.

    Args:
        gamma: focusing exponent.
        use_coeffs: when True, multiply the per-class loss by ``coeffs``.
        coeffs: per-class weights (tensor, numpy array or sequence); required
            when ``use_coeffs`` is True.

    Raises:
        ValueError: if ``use_coeffs`` is True but ``coeffs`` is None.
    """

    def __init__(self, gamma=2, use_coeffs = False, coeffs = None):
        super().__init__()
        if use_coeffs and coeffs is None:
            # Fail at construction instead of with an AttributeError in forward.
            raise ValueError("use_coeffs=True requires `coeffs` to be provided")
        self.gamma = gamma
        self.use_coeffs = use_coeffs
        # Accept numpy arrays / lists as well as tensors.
        self.coeffs = None if coeffs is None else torch.as_tensor(coeffs, dtype=torch.float32)

    def forward(self, logit, target):
        """Return the scalar focal loss for ``logit`` against ``target``."""
        target = target.float()

        # Numerically stable element-wise BCE on logits.
        bce = F.binary_cross_entropy_with_logits(logit, target, reduction="none")

        # log(1 - p_t): target in {0, 1} is mapped to {-1, +1} to pick the sign.
        invprobs = F.logsigmoid(-logit * (target * 2.0 - 1.0))
        loss = (invprobs * self.gamma).exp() * bce

        if self.use_coeffs:
            # Broadcast over the batch axis; follow the loss's device and dtype.
            loss = loss * self.coeffs.to(device=loss.device, dtype=loss.dtype)
        if loss.dim() == 2:
            loss = loss.sum(dim=1)

        return loss.mean()

class ImprovedPANNsLoss(nn.Module):
    """Clip-level BCE plus an auxiliary BCE on the frame-wise maximum.

    Args:
        output_key: model-output key used for the main term. "logit" selects
            ``BCEWithLogitsLoss`` (with ``pos_weights``); any other key
            selects plain ``BCELoss``.
        weights: two factors applied to the main and auxiliary terms.
        pos_weights: optional per-class positive weights for the logit loss.
    """

    def __init__(self, output_key="logit", weights=[1, 1], pos_weights =  None):
        super().__init__()

        self.output_key = output_key
        self.normal_loss = (
            nn.BCEWithLogitsLoss(pos_weight=pos_weights)
            if output_key == "logit"
            else nn.BCELoss()
        )
        self.bce = nn.BCELoss()
        self.weights = weights

    def forward(self, input, target):
        """Return ``weights[0] * main_loss + weights[1] * auxiliary_loss``."""
        target = target.float()
        main_weight, aux_weight = self.weights[0], self.weights[1]

        main_term = self.normal_loss(input[self.output_key], target)

        # Clip-level probability taken as the maximum over dim 1 of the
        # frame-wise output.
        peak_over_frames = input["framewise_output"].max(dim=1)[0]
        aux_term = self.bce(peak_over_frames, target)

        return main_weight * main_term + aux_weight * aux_term
    
class ImprovedFocalLoss(nn.Module):
    """Focal loss on the clip logit plus focal loss on the frame-wise max logit.

    Args:
        weights: two factors applied to the clip-level and auxiliary terms.
        use_coeffs: forwarded to ``FocalLoss``; enables per-class coefficients.
        coeffs: forwarded to ``FocalLoss``; per-class coefficients.
    """

    def __init__(self, weights=[1, 1], use_coeffs = False, coeffs = None):
        super().__init__()

        # `use_coeffs` was previously accepted but never passed on, so the
        # coefficients were silently ignored even when requested.
        self.focal = FocalLoss(use_coeffs=use_coeffs, coeffs=coeffs)
        self.weights = weights

    def forward(self, input, target):
        """Return ``weights[0] * clip_loss + weights[1] * auxiliary_loss``.

        Args:
            input: model output dict; reads "logit" and "framewise_logit".
            target: multi-hot target tensor (cast to float).
        """
        input_ = input["logit"]
        target = target.float()

        # Clip-level logit taken as the maximum over dim 1 of the frame-wise logits.
        framewise_output = input["framewise_logit"]
        clipwise_output_with_max, _ = framewise_output.max(dim=1)

        normal_loss = self.focal(input_, target)
        auxiliary_loss = self.focal(clipwise_output_with_max, target)

        return self.weights[0] * normal_loss + self.weights[1] * auxiliary_loss

In [20]:
# Build the SED model from the hyper-parameters in `model_param`.
model = AudioSEDModel(**model_param)
# Uncomment to resume from a saved checkpoint.
#model.load_state_dict(torch.load('best_model.pt'))

In [21]:
# Single device handle so that every tensor/module below follows the same
# CUDA-if-available rule (pos_weights used to call .cuda() unconditionally,
# which crashes on CPU-only machines).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _seed_worker(worker_id):
    """Give each DataLoader worker its own NumPy / `random` seed.

    Workers otherwise start from an identical copy of the parent's NumPy and
    `random` state, so the random crops drawn in the dataset would repeat
    across workers. Recipe from the PyTorch reproducibility notes.
    """
    worker_seed = torch.initial_seed() % 2 ** 32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


train_dataset = RainforestDataset(X_train, audio_transforms=audio_transform, image_transforms=None)
train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True,
                          num_workers = Config.NUM_WORKERS, drop_last = False,
                          worker_init_fn=_seed_worker)

# Per-class coefficients; not passed to any loss in this cell.
coeffiicients = np.array([0.25, 0.289, 0.238, 0.508, 0.229, 0.221, 0.212, 
                          0.285, 0.228, 0.215, 0.218, 0.263, 0.297, 0.216, 
                          0.247, 0.279, 0.220, 0.218, 0.360, 0.212, 0.215, 
                          0.221,0.219, 0.240])

# Up-weight positives of classes 3 and 8 for the BCE-with-logits criterion.
pos_weights = torch.ones(Config.NUM_BIRDS, device=device)
pos_weights[3] = 5
pos_weights[8] = 3

criterion = ImprovedPANNsLoss(pos_weights=pos_weights)
criterion_focal = ImprovedFocalLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=Config.LR_ADAM, weight_decay = 0.01)
# Alternatives tried earlier: SGD, StepLR, CosineAnnealingWarmRestarts,
# ReduceLROnPlateau.
scheduler =torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6)
mixer = BatchMixer(p=0.5)
mixup_augmenter = Mixup(mixup_alpha=1.)
loss_function = nn.BCEWithLogitsLoss()

model = model.to(device)
loss_function = loss_function.to(device)

In [22]:
def load_val_file(record_id, df):
    """Load one cached training recording as six 10-second windows plus its labels.

    Args:
        record_id: recording id; the waveform is read from
            ``../data/train_np/<record_id>.npy``.
        df: annotation table with ``recording_id`` and ``species_id`` columns.

    Returns:
        Tuple ``(segments, labels)``: the stacked windows (scaled by 10) and a
        multi-hot float vector of length ``Config.NUM_BIRDS``.
    """
    wav = np.load('../data/train_np/' + record_id + ".npy")

    window = 10 * Config.SR
    full_length = 60 * Config.SR

    # Non-overlapping windows covering the first `full_length` samples.
    segments = [
        wav[offset:offset + window].astype(np.float32) * 10.
        for offset in range(0, full_length, window)
    ]

    # Multi-hot target built from every species annotated for this recording.
    species_ids = copy.deepcopy(df[(df.recording_id==record_id)].species_id.unique())
    val_labels_array = np.zeros(Config.NUM_BIRDS, dtype=np.single)
    val_labels_array[species_ids] = 1.
    # Species 12 additionally marks class 3 as present.
    if 12 in species_ids:
        val_labels_array[3] = 1.

    return np.array(segments), val_labels_array

def lwlrap(truth, scores):
    """Label-weighted label-ranking average precision via sklearn's LRAP.

    Samples without any positive label are dropped, because sklearn does not
    weight them correctly; the remaining samples are weighted by their number
    of positive labels.
    """
    positives = truth > 0
    labels_per_sample = np.sum(positives, axis=1)
    keep = np.flatnonzero(labels_per_sample > 0)

    return label_ranking_average_precision_score(
        positives[keep, :],
        scores[keep, :],
        sample_weight=labels_per_sample[keep],
    )


def validate(model, files_ids, df):
    """Evaluate ``model`` on whole validation recordings.

    Each recording is split into windows by ``load_val_file``; the prediction
    for the recording is the maximum frame-wise probability over all frames
    and windows.

    Args:
        model: network returning a dict with a "framewise_output" entry.
        files_ids: sequence of recording ids to evaluate.
        df: annotation table used to build the targets.

    Returns:
        Tuple ``(val_loss, val_corr, valid_epoch_metric)``: per-file BCE
        losses, per-file top-1 hits (0 or 1) and the mean per-file LWLRAP.
    """
    val_loss = []
    val_corr = []
    val_metrics = []
    use_cuda = torch.cuda.is_available()

    model.eval()
    with torch.no_grad():
        for file_idx in tqdm(range(0, len(files_ids))):
            # Use the `df` argument (the original read the global X_val).
            data, target = load_val_file(files_ids[file_idx], df)
            data = torch.tensor(data).float()
            # Add the batch axis on every device; it used to be added only on
            # the CUDA path, which broke CPU runs.
            target = torch.tensor(target).unsqueeze(0)
            if use_cuda:
                data, target = data.cuda(), target.cuda()

            output = model(data)
            framewise_output = output["framewise_output"]
            output, _ = framewise_output.max(dim=1)
            output, _ = torch.max(output, 0)
            output = output.unsqueeze(0)

            # `output` already holds probabilities, so plain BCE is the
            # matching loss; BCEWithLogitsLoss would apply a second sigmoid.
            loss = F.binary_cross_entropy(output.clamp(0.0, 1.0), target)

            val_metric = lwlrap(target.cpu().detach().numpy(), output.cpu().detach().numpy())
            val_metrics.append(val_metric.item())

            _, answers = torch.max(output, 1)
            _, targets = torch.max(target, 1)
            val_corr.append(int((answers == targets).sum().item()))
            val_loss.append(loss.item())

    # Guard against an empty file list.
    valid_epoch_metric = sum(val_metrics) / len(val_loss) if val_loss else 0.0

    return val_loss, val_corr, valid_epoch_metric

In [None]:
# Best validation metric seen so far. Despite the name this stores the value
# of `valid_epoch_metric` returned by validate(), not a count of correct answers.
best_corrects = 0
files_ids = copy.deepcopy(X_val.recording_id.unique())
# Switch for spectrogram mixup (disabled).
mixup=False
# Train loop
print('Starting training loop')
# NOTE(review): the epoch count is hard-coded to 200; Config.EPOCHS is not used here.
for e in range(0, 200):
    # Per-epoch running statistics
    train_loss = []
    train_corr = []
    
    # Single epoch - train
    model.train()
    for batch, (data, target) in tqdm(enumerate(train_loader)):
        data = data.float()
        if mixup:
            # Draw the mixing coefficients and mix the targets with them; the
            # model receives the same coefficients to mix the spectrograms.
            mixup_lambda = torch.tensor(mixup_augmenter.get_lambda(len(data)))
            target = do_mixup(target, mixup_lambda)
        #data, target = mixer(data, target)
        if torch.cuda.is_available():
            data, target = data.cuda(), target.cuda()
            if mixup:
                mixup_lambda =  mixup_lambda.cuda()

        optimizer.zero_grad()
        if mixup:
            output = model(data, mixup_lambda )
        else:
            output = model(data)
        # Alternatives tried earlier: plain BCE-with-logits with label
        # smoothing, and lsep_loss.
        loss = criterion_focal(output, target)
        loss.backward()
        optimizer.step()
        #scheduler.step()

        # Top-1 agreement between the clip-level prediction and the target argmax.
        vals, answers = torch.max(output["clipwise_output"], 1)
        vals, targets = torch.max(target, 1)
        corrects = 0
        for i in range(0, len(answers)):
            if answers[i] == targets[i]:
                corrects = corrects + 1
        train_corr.append(corrects)
        train_loss.append(loss.item())
    
    # Learning rate of the last parameter group, for logging only
    for g in optimizer.param_groups:
        lr = g['lr']
    print('Epoch ' + str(e) + ' training end. LR: ' + str(lr) + ', Loss: ' + str(sum(train_loss) / len(train_loss)) +
          ', Correct answers: ' + str(sum(train_corr)) + '/' + str(train_dataset.__len__()))
    
    with torch.no_grad():
        # Whole-recording validation on the held-out fold
        val_loss, val_corr, valid_epoch_metric = validate(model, files_ids, X_val)
    print('Epoch ' + str(e) + ' validation end. LR: ' + str(lr) + ', Loss: ' + str(sum(val_loss) / len(val_loss)) +
          ', Correct answers: ' + str(sum(val_corr)) + '/' + str(len(files_ids)) + ", Val metric: " + str(valid_epoch_metric))
    
    # Save a checkpoint whenever the validation metric returned by validate()
    # improves. The loss is misaligned with the competition metric, so the
    # metric (not the loss or the accuracy count) drives model selection.
    if valid_epoch_metric > best_corrects:
        print('Saving new best model at epoch ') #+ str(e) + ' ' + str(sum(val_corr)) + '/' + str(len(files_ids)))
        torch.save(model.state_dict(), 'best_model_.pt')
        best_corrects = valid_epoch_metric
        
    # Scheduler is stepped once per epoch
    #scheduler.step(valid_epoch_metric)
    scheduler.step()

# Free memory
#del model

In [15]:
# Re-evaluate the best checkpoint on the validation fold and keep the per-file
# top-1 predictions / targets for the error analysis that follows.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
model.load_state_dict(torch.load("best_model_.pt", map_location=device))
files_ids = X_val.recording_id.unique()
model.to(device)
model.eval()

answers_list = []
targets_list = []
val_loss = []
val_corr = []
val_metrics = []

with torch.no_grad():
    for file_idx in tqdm(range(0, len(files_ids))):
        data, target = load_val_file(files_ids[file_idx], X_val)
        data = torch.tensor(data).float().to(device)
        # The batch axis is added on every device (it used to be CUDA-only).
        target = torch.tensor(target).unsqueeze(0).to(device)

        output = model(data)
        # Recording-level score: max over frames, then over the windows.
        framewise_output = output["framewise_output"]
        output, _ = framewise_output.max(dim=1)
        output, _ = torch.max(output, 0)
        output = output.unsqueeze(0)

        loss = lsep_loss(output, target)
        val_metric = lwlrap(target.cpu().numpy(), output.cpu().numpy())

        _, answers = torch.max(output, 1)
        _, targets = torch.max(target, 1)
        answers_list.append(answers.item())
        targets_list.append(targets.item())
        val_metrics.append(val_metric.item())
        val_corr.append(int((answers == targets).sum().item()))
        val_loss.append(loss.item())

valid_epoch_metric = sum(val_metrics) / len(val_loss)
# Stats
print('Loss: ' + str(sum(val_loss) / len(val_loss)) +
      ', Correct answers: ' + str(sum(val_corr)) + '/' + str(len(files_ids)) + ", Val metric: " + str(valid_epoch_metric))



100%|██████████| 197/197 [00:21<00:00,  9.16it/s]

Loss: 2.690632645853885, Correct answers: 127/197, Val metric: 0.8073301909596327





In [16]:
from collections import Counter

# True class of every validation file whose top-1 prediction was wrong.
errors = [
    truth
    for guess, truth in zip(answers_list, targets_list)
    if guess != truth
]

# (class, count) pairs sorted by class id: misclassified files vs. all files.
error_count = sorted(Counter(errors).items(), key=lambda pair: pair[0])
target_count = sorted(Counter(targets_list).items(), key=lambda pair: pair[0])
print(error_count, target_count, sep = "\n")


[(0, 6), (1, 2), (2, 2), (3, 19), (4, 4), (5, 1), (7, 5), (8, 4), (9, 1), (10, 1), (11, 5), (13, 2), (14, 2), (15, 6), (16, 2), (17, 1), (20, 2), (21, 4), (23, 1)]
[(0, 9), (1, 9), (2, 5), (3, 27), (4, 9), (5, 9), (6, 8), (7, 9), (8, 9), (9, 5), (10, 8), (11, 9), (13, 9), (14, 8), (15, 8), (16, 7), (17, 7), (19, 7), (20, 6), (21, 8), (22, 7), (23, 14)]


In [20]:
# Inference-time constants.
# NOTE(review): these are separate module-level names, not the values stored
# in `model_param` (which uses fmin=300 / fmax=15000). In the cells visible
# here only `length` is read, by load_test_file.
fft = 2048
hop = 512 * 1
# Less rounding errors this way
sr = 48000
# Inference window: 20 seconds expressed in samples.
length = 20 * sr
fmin = 84
fmax = 15056

def load_test_file(f, gain=1.0):
    """Load a test recording and cut it into windows of ``length`` samples.

    The last window is taken from the end of the recording, so it may overlap
    the previous one when the duration is not a multiple of the window.

    Args:
        f: file name inside ``../data/test/``.
        gain: factor applied to every sample. Defaults to 1.0 (unchanged).
            NOTE(review): the training and validation loaders multiply the
            waveform by 10 - confirm whether the checkpoint used for inference
            expects ``gain=10``.

    Returns:
        np.ndarray of float32 windows, one row per window.
    """
    # `_` instead of `sr` so the module-level sample rate is not shadowed.
    wav, _ = librosa.load('../data/test/' + f, sr=None)

    if 0 < len(wav) < length:
        # A recording shorter than one window used to yield a short slice (via
        # a negative start index); zero-pad it to a full window instead.
        wav = np.pad(wav, (0, length - len(wav)))

    # Enough windows to cover the whole recording.
    segments = int(np.ceil(len(wav) / length))

    mel_array = []
    for i in range(0, segments):
        if (i + 1) * length > len(wav):
            # Last window: anchored at the end of the recording.
            wav_slice = wav[len(wav) - length:len(wav)]
        else:
            wav_slice = wav[i * length:(i + 1) * length]
        wav_slice = wav_slice.astype(np.float32)
        if gain != 1.0:
            wav_slice = wav_slice * np.float32(gain)
        mel_array.append(wav_slice)

    return np.array(mel_array)

In [31]:
def _events_from_frames(framewise, file_id, time_offset, threshold, seconds_per_frame):
    """Turn a (frames, classes) probability matrix into a list of event dicts.

    For each class, every run of consecutive frames with probability
    ``>= threshold`` becomes one event.
    """
    events = []
    active = framewise >= threshold
    for target_idx in range(active.shape[1]):
        detected = np.flatnonzero(active[:, target_idx])
        if detected.size == 0:
            continue
        # Split the active frame indices wherever they stop being consecutive.
        run_breaks = np.flatnonzero(np.diff(detected) != 1) + 1
        for run in np.split(detected, run_breaks):
            onset_idx, offset_idx = int(run[0]), int(run[-1])
            # +1 keeps the last frame; without it a single-frame event gave
            # an empty slice and `.max()` raised.
            confidences = framewise[onset_idx:offset_idx + 1, target_idx]
            events.append({
                "file_id": file_id,
                "species_id": target_idx,
                "onset": seconds_per_frame * onset_idx + time_offset,
                "offset": seconds_per_frame * offset_idx + time_offset,
                "max_confidence": confidences.max(),
                "mean_confidence": confidences.mean()
            })
    return events


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AudioSEDModel(**model_param)
model.load_state_dict(torch.load(f"best_model_sed_0.8.pt", map_location=device))

model.to(device)
model.eval()
# Window length in seconds; must match `length` used by load_test_file.
PERIOD = 20
global_time = 0.0
threshold = 0.1
# One frame-wise output per STFT hop, so a frame lasts hop_size / sample_rate
# seconds (512 / 48000 here), not the 0.01 s that was hard-coded.
SECONDS_PER_FRAME = model_param['hop_size'] / model_param['sample_rate']
estimated_event_list = []

# Inference only: no autograd graph is needed.
with torch.no_grad():
    for test_file in test_files:
        global_time = 0.0
        file_id = str.split(test_file, '.')[0]
        # NOTE(review): load_test_file anchors the last window at the end of
        # the recording, so the offset below is exact only when the duration
        # is a multiple of PERIOD - confirm for the test set.
        for part in load_test_file(test_file):
            part = torch.tensor(part).unsqueeze(0).float().to(device)

            output = model(part)
            framewise_outputs = output["framewise_output"].detach().cpu().numpy()[0]

            estimated_event_list.extend(
                _events_from_frames(framewise_outputs, file_id, global_time,
                                    threshold, SECONDS_PER_FRAME)
            )
            global_time += PERIOD

prediction_df = pd.DataFrame(estimated_event_list)

In [32]:
# Number of test files with at least one detected event.
len(prediction_df.file_id.unique())

1992

In [33]:
# Persist the raw event list so the submission can be rebuilt without re-running inference.
prediction_df.to_csv("pseudolabels_raw_sed.csv", index=False)

In [34]:
# Empty submission frame: the recording id followed by one score column per class (s0..s23).
submission = pd.DataFrame(columns=['recording_id','s0','s1','s2','s3','s4','s5','s6','s7','s8','s9','s10','s11',
                               's12','s13','s14','s15','s16','s17','s18','s19','s20','s21','s22','s23'])

In [37]:
# Reload the saved events from disk.
prediction_df = pd.read_csv("pseudolabels_raw_sed.csv")

In [38]:
# Sanity check: the reloaded table should cover the same number of files as before saving.
len(prediction_df.file_id.unique())

1992

In [39]:
# Collapse the event list into one row per recording: a class's score is the
# highest `max_confidence` among its detected events, 0 when none was detected.
# NOTE: recordings with no event at all are absent from `prediction_df` and
# therefore get no row here.
submission_rows = []
for file_id, sub_df in prediction_df.groupby("file_id"):
    label_array = np.zeros(24, dtype=np.float32)
    best_per_species = sub_df.groupby("species_id").max_confidence.max()
    for species_id, pred_proba in best_per_species.items():
        label_array[int(species_id)] = pred_proba
    submission_rows.append([file_id] + label_array.tolist())

# Build the frame once: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every iteration. Reassigning also makes the cell safe to
# re-run (it used to append a second copy of every row).
submission = pd.DataFrame(submission_rows, columns=submission.columns)



In [40]:
# Expect one row per test recording and 25 columns (id + 24 class scores).
submission.shape

(1992, 25)

In [41]:
# Write the final submission file.
submission.to_csv("test_submission_from_frames.csv", index=False )