In [1]:
import os, random
import cv2
import math
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from collections import Counter

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import WeightedRandomSampler
from torchvision.models import efficientnet
from torchvision.transforms import transforms
# from efficientnet_pytorch import EfficientNet

import timm

import scikitplot as skplt
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, KFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder

from glob import glob
from IPython.display import display, Audio

import cupy as cp
from cupyx.scipy import signal as cupy_signal
import yaml

from metric import score

import wandb

import plotly.graph_objects as go
import plotly.express as px

  from .autonotebook import tqdm as notebook_tqdm
  cupy._util.experimental('cupyx.jit.rawkernel')


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [47]:
default_config = {
    "VERSION": "v0.3",
    "DATA_PATH": "inputs",
    "LOAD_SPEC_DATA": True,
    "SEED": 24,
    "SAMPLE_RATE": 32000,
    "N_FFT": 1095,
    "WIN_SIZE": 412,
    "WIN_LAP": 100,
    "MIN_FREQ": 40,
    "MAX_FREQ": 15000,
    "EPOCHS": 10,
    "BACHSIZE": 16
}

try:
    with open('config.yaml', 'r') as f:
        default_config = yaml.load(f, Loader=yaml.SafeLoader)
except:
    pass

default_config

{'VERSION': 'v1.6',
 'DESCRIPTION': 'First 5s with 21-22-23-24 data',
 'DATA_PATH': 'inputs',
 'LOAD_SPEC_DATA': ['data/spec_first_5sec_256_256_24.npy',
  'data/spec_first_5sec_256_256_212223.npy'],
 'SEED': 24,
 'SAMPLE_RATE': 32000,
 'N_FFT': 1024,
 'WIN_SIZE': 412,
 'WIN_LAP': 100,
 'MIN_FREQ': 40,
 'MAX_FREQ': 15000,
 'N_MEL': 128,
 'HOP_LENGTH': 320,
 'EPOCHS': 10,
 'FOLD': 5,
 'BACTHSIZE': 16,
 'LABEL_SMOOTHING': 0.0}

In [52]:
def oog2spec_via_cupy(audio_data):
    
    audio_data = cp.array(audio_data)
    
    # handles NaNs
    mean_signal = cp.nanmean(audio_data)
    audio_data = cp.nan_to_num(audio_data, nan=mean_signal) if cp.isnan(audio_data).mean() < 1 else cp.zeros_like(audio_data)
    
    # to spec.
    frequencies, times, spec_data = cupy_signal.spectrogram(
        audio_data, 
        fs=default_config["SAMPLE_RATE"], 
        nfft=default_config["N_FFT"], 
        nperseg=default_config["WIN_SIZE"], 
        noverlap=default_config["WIN_LAP"], 
        window='hann'
    )
    
    # Filter frequency range
    valid_freq = (frequencies >= default_config["MIN_FREQ"]) & (frequencies <= default_config["MAX_FREQ"])
    spec_data = spec_data[valid_freq, :]
    
    # Log
    spec_data = cp.log10(spec_data + 1e-20)
    
    # min/max normalize
    spec_data = spec_data - spec_data.min()
    spec_data = spec_data / spec_data.max()
    
    return spec_data.get()

In [34]:
meta_data = pd.read_csv(f"inputs/train_metadata.csv")
meta_data.drop_duplicates(inplace=True)

meta_data.reset_index(drop=True, inplace=True)
meta_data = meta_data[["filename","primary_label"]]
meta_data

Unnamed: 0,filename,primary_label
0,asbfly/XC134896.ogg,asbfly
1,asbfly/XC164848.ogg,asbfly
2,asbfly/XC175797.ogg,asbfly
3,asbfly/XC207738.ogg,asbfly
4,asbfly/XC209218.ogg,asbfly
...,...,...
24454,zitcis1/XC845747.ogg,zitcis1
24455,zitcis1/XC845817.ogg,zitcis1
24456,zitcis1/XC856176.ogg,zitcis1
24457,zitcis1/XC856723.ogg,zitcis1


In [12]:
f = open("data/duplicates.txt", 'r').readlines()
dup = [row.split(',')[0] for row in f]
print(f"Get {len(dup)} duplicates file")
for d in dup:
    meta_data = meta_data[meta_data.filename!=d]
meta_data.reset_index(drop=True)
meta_data.drop_duplicates(subset=['filename'], inplace=True)
meta_data.reset_index(drop=True, inplace=True)
meta_data

Get 150 duplicates file


Unnamed: 0,filename,primary_label
0,asbfly/XC134896.ogg,asbfly
1,asbfly/XC164848.ogg,asbfly
2,asbfly/XC175797.ogg,asbfly
3,asbfly/XC207738.ogg,asbfly
4,asbfly/XC209218.ogg,asbfly
...,...,...
24307,zitcis1/XC845747.ogg,zitcis1
24308,zitcis1/XC845817.ogg,zitcis1
24309,zitcis1/XC856176.ogg,zitcis1
24310,zitcis1/XC856723.ogg,zitcis1


In [13]:
all_bird_data = dict()

for i, row_metadata in tqdm(meta_data.iterrows()):
                
        audio_data, _ = librosa.load(f"inputs/train_audio/{row_metadata.filename}", sr=default_config["SAMPLE_RATE"])

        n_copy = math.ceil(5 * default_config["SAMPLE_RATE"] / len(audio_data))
        if n_copy > 1: audio_data = np.concatenate([audio_data]*n_copy)

        # for start_idx in range(0,len(audio_data)-(len(audio_data)%(5*default_config["SAMPLE_RATE"])),5*default_config["SAMPLE_RATE"]):
        # start_idx = int(start_idx)
        # start_idx = int(len(audio_data) / 2 - 2.5 * default_config["SAMPLE_RATE"])
        start_idx = 0
        end_idx = int(start_idx + 5.0 * default_config["SAMPLE_RATE"])
        input_audio = audio_data[start_idx:end_idx]

        # ogg to spec.
        input_spec = oog2spec_via_cupy(input_audio)

        input_spec = cv2.resize(input_spec, (256, 256), interpolation=cv2.INTER_AREA)

        all_bird_data[f"{row_metadata.filename}_{start_idx}"] = input_spec.astype(np.float32)
# save to file
np.save(os.path.join("data", f'spec_first_5sec_256_256_24.npy'), all_bird_data)

24312it [16:41, 24.27it/s]


### Extract No Call class

In [2]:
from pydub import AudioSegment
import os

In [3]:
os.environ["PATH"] += os.pathsep + 'C:/Users/hoang/Downloads/ffmpeg-master-latest-win64-gpl-shared/bin/ffmpeg.exe'
os.environ["PATH"] += os.pathsep + 'C:/Users/hoang/Downloads/ffmpeg-master-latest-win64-gpl-shared/bin/ffplay.exe'
os.environ["PATH"] += os.pathsep + 'C:/Users/hoang/Downloads/ffmpeg-master-latest-win64-gpl-shared/bin/ffprobe.exe'

In [5]:
AudioSegment.converter = 'C:/Users/hoang/Downloads/ffmpeg-master-latest-win64-gpl-shared/bin/ffmpeg.exe'
AudioSegment.ffmpeg = 'C:/Users/hoang/Downloads/ffmpeg-master-latest-win64-gpl-shared/bin/ffmpeg.exe'
AudioSegment.ffprobe =  'C:/Users/hoang/Downloads/ffmpeg-master-latest-win64-gpl-shared/bin/ffprobe.exe'

In [30]:
p = "inputs/previous_dataset/ColumbiaAndCostaRica" # SouthwesternAmazonBasin" # WesternUS"
anno = pd.read_csv(f"{p}/annotations.csv")
anno = anno[anno["Species eBird Code"]=="????"].reset_index(drop=True)
anno

Unnamed: 0,Filename,Start Time (s),End Time (s),Low Freq (Hz),High Freq (Hz),Species eBird Code
0,NES_001_S01_20190914_043000.flac,1980.9,1981.3,7183,10000,????
1,NES_001_S01_20190914_043000.flac,1991.8,1992.3,7042,10845,????
2,NES_001_S01_20190914_043000.flac,1993.7,1994.1,7323,9718,????
3,NES_001_S01_20190914_043000.flac,1996.3,1996.8,7042,10281,????
4,NES_001_S01_20190914_043000.flac,2539.1,2539.4,7464,9859,????
...,...,...,...,...,...,...
381,NES_032_S02_20191009_063001.flac,3587.6,3588.2,4354,9516,????
382,NES_032_S02_20191009_063001.flac,3591.7,3592.2,5000,10161,????
383,NES_033_S02_20191009_160000.flac,1368.5,1370.9,3559,6271,????
384,NES_034_S02_20191009_170000.flac,2507.6,2508.4,0,2372,????


In [31]:
for idx in tqdm(range(len(anno))):
    if (anno["End Time (s)"][idx]- anno["Start Time (s)"][idx]) > 5:

        newAudio = AudioSegment.from_file(f"{p}/soundscape_data/{anno['Filename'][idx]}", 'flac')
        newAudio = newAudio[anno['Start Time (s)'][idx]*1000:anno['End Time (s)'][idx]*1000]
        newAudio.export(f'inputs/train_audio/nocall/ColumbiaAndCostaRica_{idx}.ogg', format="ogg")


100%|██████████| 386/386 [02:57<00:00,  2.17it/s]


In [53]:
noCallFiles = glob("inputs/train_audio/nocall/*.ogg")
meta_data = pd.DataFrame(noCallFiles, columns=["filename"])
meta_data

Unnamed: 0,filename
0,inputs/train_audio/nocall\ColumbiaAndCostaRica...
1,inputs/train_audio/nocall\ColumbiaAndCostaRica...
2,inputs/train_audio/nocall\ColumbiaAndCostaRica...
3,inputs/train_audio/nocall\ColumbiaAndCostaRica...
4,inputs/train_audio/nocall\ColumbiaAndCostaRica...
...,...
296,inputs/train_audio/nocall\SouthwesternAmazonBa...
297,inputs/train_audio/nocall\SouthwesternAmazonBa...
298,inputs/train_audio/nocall\SouthwesternAmazonBa...
299,inputs/train_audio/nocall\SouthwesternAmazonBa...


In [54]:
def clean_filename(row):
    return row.split("/")[-1].replace("\\","/")

meta_data["filename"] = meta_data["filename"].apply(clean_filename)
meta_data

Unnamed: 0,filename
0,nocall/ColumbiaAndCostaRica_102.ogg
1,nocall/ColumbiaAndCostaRica_103.ogg
2,nocall/ColumbiaAndCostaRica_106.ogg
3,nocall/ColumbiaAndCostaRica_11.ogg
4,nocall/ColumbiaAndCostaRica_111.ogg
...,...
296,nocall/SouthwesternAmazonBasin_987.ogg
297,nocall/SouthwesternAmazonBasin_989.ogg
298,nocall/SouthwesternAmazonBasin_993.ogg
299,nocall/SouthwesternAmazonBasin_994.ogg


In [55]:
all_bird_data = dict()

for i, row_metadata in tqdm(meta_data.iterrows()):
                
        audio_data, _ = librosa.load(f"inputs/train_audio/{row_metadata.filename}", sr=default_config["SAMPLE_RATE"])

        n_copy = math.ceil(5 * default_config["SAMPLE_RATE"] / len(audio_data))
        if n_copy > 1: audio_data = np.concatenate([audio_data]*n_copy)

        # for start_idx in range(0,len(audio_data)-(len(audio_data)%(5*default_config["SAMPLE_RATE"])),5*default_config["SAMPLE_RATE"]):
        # start_idx = int(start_idx)
        # start_idx = int(len(audio_data) / 2 - 2.5 * default_config["SAMPLE_RATE"])
        start_idx = 0
        end_idx = int(start_idx + 5.0 * default_config["SAMPLE_RATE"])
        input_audio = audio_data[start_idx:end_idx]

        # ogg to spec.
        input_spec = oog2spec_via_cupy(input_audio)

        input_spec = cv2.resize(input_spec, (256, 256), interpolation=cv2.INTER_AREA)

        all_bird_data[f"{row_metadata.filename}_{start_idx}"] = input_spec.astype(np.float32)
# save to file
np.save(os.path.join("data", f'spec_first_5sec_256_256_nocall.npy'), all_bird_data)

301it [00:08, 35.51it/s]
