In [1]:
import gc
import sys
from collections import Counter
from typing import Dict, Optional

import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
import timm
import torch
import torch.nn as nn
from torchlibrosa.stft import LogmelFilterBank, Spectrogram
from torchlibrosa.augmentation import SpecAugmentation

import librosa
import audiomentations as audio
from torch.utils.data import Dataset, DataLoader

sys.path.append("../src")
import layer
import const
from utils import DataHandler
from metrics import row_wise_micro_averaged_f1_score

In [2]:
dh = DataHandler()

In [3]:
DEVICE = "cuda"
cfg = dh.load("../experiments/exp_070/config.yml")

In [4]:
cfg.data.valid.loader.batch_size = 128

In [5]:
const.TRAIN_SOUNDSCAPES_DIR = Path("../data/input/train_soundscapes")

## Dataset

In [6]:
class CustomValidDataset(Dataset):
    def __init__(self, df: pd.DataFrame, cfg):
        super().__init__()
        self.cfg = cfg
        self.filenames = df["filename"].values
        self.seconds = df["second"].values
        self.audio_dict = {}
        self.primary_label = df["primary_label"].values

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        primary_label = self.primary_label[idx]
        filename = self.filenames[idx]
        seconds = self.seconds[idx]
        path_name = str(f"../data/input/train_short_audio/{primary_label}/{filename}")

        if filename not in self.audio_dict:
            y, sr = librosa.load(path_name, sr=const.TARGET_SAMPLE_RATE)
            self.audio_dict[filename] = y
        else:
            y = self.audio_dict[filename]

        start_index = const.TARGET_SAMPLE_RATE * (seconds - 5)
        end_index = const.TARGET_SAMPLE_RATE * seconds
        y = y[start_index:end_index].astype(np.float32)
        
        if len(y) < const.TARGET_SAMPLE_RATE * 5:
            padding = const.TARGET_SAMPLE_RATE * 5 - len(y)
            offset = padding // 2
            y = np.pad(y, (offset, const.TARGET_SAMPLE_RATE * 5 - len(y) - offset), "constant").astype(
                np.float32
            )

        return y

## Model

In [7]:
class CustomModel(nn.Module):
    def __init__(
        self,
        n_classes: int,
        model_name: str,
        args_spec: Dict,
        in_channels: int = 1,
        pooling_name: str = "GeM",
        args_pooling: Optional[Dict] = None,
    ):
        super(CustomModel, self).__init__()

        self.spectrogram_extractor = Spectrogram(
            n_fft=args_spec.n_fft,
            hop_length=args_spec.hop_length,
            win_length=args_spec.n_fft,
            window="hann",
            center=True,
            pad_mode="reflect",
            freeze_parameters=True,
        )

        self.logmel_extractor = LogmelFilterBank(
            sr=args_spec.sampling_rate,
            n_fft=args_spec.n_fft,
            n_mels=args_spec.n_mels,
            fmin=args_spec.fmin,
            fmax=args_spec.fmax,
            ref=1.0,
            amin=1e-10,
            top_db=None,
            freeze_parameters=True,
        )

        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )

        self.bn0 = nn.BatchNorm2d(args_spec.n_mels)

        self.backbone = timm.create_model(
            model_name, pretrained=True, in_chans=in_channels
        )

        final_in_features = list(self.backbone.children())[-1].in_features
        self.backbone = nn.Sequential(*list(self.backbone.children())[:-2])

        self.pooling = getattr(layer, pooling_name)(**args_pooling)

        self.act = nn.ReLU()
        self.drop = nn.Dropout(p=0.5)
        self.fc = nn.Linear(final_in_features, n_classes)

    def forward(self, x, is_train=True):
        x = self.spectrogram_extractor(x)
        x = self.logmel_extractor(x)

        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        # if is_train:
        #     x = self.spec_augmenter(x)

        x = x.contiguous().transpose(2, 3)

        x = self.backbone(x)
        x = self.pooling(x)
        x = x.view(len(x), -1)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc(x)
        return x

## Factory

In [8]:
def get_dataloader(df, cfg):
    dataset = CustomValidDataset(
            df=df,
            cfg=cfg,
        )

    loader = DataLoader(dataset, **cfg.loader)
    return loader

## Main

In [9]:
train_df = dh.load("../data/input/train_metadata.csv")

train_df["target"] = train_df["primary_label"].map(const.BIRD_CODE)

target_array = np.zeros((len(train_df), len(const.BIRD_CODE)))
for idx, pl in enumerate(train_df["primary_label"]):
    target_array[idx, const.BIRD_CODE[pl]] += 1
    
target_df = pd.DataFrame(target_array, columns=list(const.BIRD_CODE.keys()))

In [10]:
wave_length_df = pd.read_csv("../data/processed/train_short_wave_length.csv")

train_df["audio_time"] = wave_length_df["wave_length"] // (const.TARGET_SAMPLE_RATE)

In [11]:
primary_labels = []
filenames = []
seconds = []

for idx in train_df.index:
    primary_label = train_df.loc[idx, "primary_label"]
    filename = train_df.loc[idx, "filename"]
    audio_time = train_df.loc[idx, "audio_time"]
    
    if audio_time % 5 == 0:
        clip_seconds = list(range(5, audio_time + (5 - (audio_time % 5)), 5))
    else:
        clip_seconds = list(range(5, audio_time + (5 - (audio_time % 5)) + 1, 5))
        
    for s in clip_seconds:
        primary_labels.append(primary_label)
        filenames.append(filename)
        seconds.append(s)

In [12]:
valid_df = pd.DataFrame({
    "primary_label": primary_labels,
    "filename": filenames,
    "second": seconds
})

In [13]:
def inference(df, cfg):
    valid_loader = get_dataloader(df, cfg.data.valid)

    model = CustomModel(
        model_name=cfg.model.backbone,
        n_classes=cfg.model.n_classes,
        args_spec=cfg.model.spec_params,
        **cfg.model.params
    ).to(DEVICE)

    model.load_state_dict(torch.load("../logs/exp_070_20210508193023_0.882/weight_best.pt"))
    
    valid_preds = np.zeros((len(valid_loader.dataset), cfg.model.n_classes))

    model.eval()
    with torch.no_grad():
        for i, images in enumerate(valid_loader):
            images = images.to(DEVICE)

            preds = model(images)

            start_batch_idx = i * valid_loader.batch_size
            end_batch_idx = (i + 1) * valid_loader.batch_size

            valid_preds[start_batch_idx:end_batch_idx, :] = (preds.sigmoid().cpu().detach().numpy())
            
    return valid_preds

In [14]:
gp = valid_df.groupby("primary_label")

preds_array = np.zeros((len(valid_df), cfg.model.n_classes))
for pl, df in tqdm(gp):
    idxs = df.index.values
    preds_array[idxs, :] = inference(df, cfg)
    
np.save("./preds_train.npy", preds_array)

  0%|          | 0/397 [00:00<?, ?it/s]

In [15]:
preds_array.shape

(726640, 1)

In [23]:
# valid_df["preds_label"] = [const.INV_BIRD_CODE[bc] for bc in np.argsort(preds_array, axis=1)[:, -1]]
valid_df["preds"] = preds_array

In [26]:
valid_df[valid_df["preds"] < 0.5]

Unnamed: 0,primary_label,filename,second,preds_label,preds
63,acafly,XC130140.ogg,20,acafly,0.376526
89,acafly,XC133047.ogg,10,acafly,0.265085
91,acafly,XC133047.ogg,20,acafly,0.479911
257,acafly,XC137858.ogg,550,acafly,0.057488
258,acafly,XC137858.ogg,555,acafly,0.049507
...,...,...,...,...,...
725690,yetvir,XC154485.ogg,10,acafly,0.131243
725691,yetvir,XC154485.ogg,15,acafly,0.100032
726251,yetvir,XC417449.ogg,40,acafly,0.215310
726411,yetvir,XC501230.ogg,55,acafly,0.193412


In [18]:
valid_df[valid_df["primary_label"] != valid_df["preds_label"]]

Unnamed: 0,primary_label,filename,second,preds_label
1526,acowoo,XC110258.ogg,5,acafly
1527,acowoo,XC110258.ogg,10,acafly
1528,acowoo,XC110258.ogg,15,acafly
1529,acowoo,XC110258.ogg,20,acafly
1530,acowoo,XC110258.ogg,25,acafly
...,...,...,...,...
726635,yetvir,XC615888.ogg,60,acafly
726636,yetvir,XC615888.ogg,65,acafly
726637,yetvir,XC615888.ogg,70,acafly
726638,yetvir,XC615888.ogg,75,acafly


In [19]:
noise_time_df = valid_df[valid_df["primary_label"] != valid_df["preds_label"]].groupby("filename")["second"].unique().reset_index()
noise_time_dict = dict(noise_time_df.values)

for k, v in noise_time_dict.items():
    noise_time_dict[k] = list(map(int, v))

In [20]:
dh.save("../data/processed/train_noise.json", noise_time_dict)

In [21]:
noise_time_dict["XC109605.ogg"]

KeyError: 'XC109605.ogg'

In [None]:
noise_time_dict

In [None]:
g = True
counter = 0

while g:
    counter += 1
    if counter == 10:
        g = False
    print(counter)

In [None]:
audio_time_df

In [None]:
len_y = 2037586
samples = 32_000 * 5

In [None]:
start = np.random.randint(len_y - samples)

In [None]:
round_s = start // (32_000)


In [None]:
num = 44
num - (num % 5) + 5