In this notebook, we'll use a pre-trained machine learning model to generate a submission to the [BirdClef2023 competition](https://www.kaggle.com/c/birdclef-2023).  The goal of the competition is to identify Eastern African bird species by sound.

## Step 1: Imports

In [2]:
# Install All libraries
!pip install -Uq fastcore fastai fastbook
!pip install image_tabular
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd
import numpy as np
import librosa
import glob

import csv
import io

import os

from IPython.display import Audio

# Fastai
import fastbook
fastbook.setup_book()
from fastbook import *

# Import all the vision library
from fastai.vision.all import *


In [4]:
# Setup kaggle API

# 1. Read the kaggle API token to interact with your kaggle account
from google.colab import files, drive
drive.mount("/content/gdrive", force_remount=True)
# Switch do DeepLearning directory
%cd /content/gdrive/MyDrive/DeepLearning

# Folder containing kaggle.json for kaggle API authorization
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/MyDrive/DeepLearning/"

Mounted at /content/gdrive
/content/gdrive/MyDrive/DeepLearning


In [5]:
!ls
# Pick a path to download the dataset to:
comp = 'birdclef-2023'
path = URLs.path(comp)

kaggle.json  path_spectrogram


In [6]:
#And use the Kaggle API to download the dataset to that path, and extract it:
#hide
Path.BASE_PATH = path

from kaggle import api

if not path.exists():
    path.mkdir(parents=true)
    api.competition_download_cli(comp, path=path)
    shutil.unpack_archive(str(path/f'{comp}.zip'), str(path))

path.ls(file_type='text')

(#3) [Path('train_metadata.csv'),Path('eBird_Taxonomy_v2021.csv'),Path('sample_submission.csv')]

## Step 2: Explore the training data

We'll start by loading a couple of training examples and using the IPython.display.Audio module to play them!

In [7]:
# Load a sample audio files from two different species
audio_abe, sr_abe = librosa.load(path/"train_audio/abethr1/XC128013.ogg")
audio_abh, sr_abh = librosa.load(path/"train_audio/abhori1/XC127317.ogg")

In [8]:
# Play the audio
Audio(data=audio_abe, rate=sr_abe)

In [9]:
# Play the audio
Audio(data=audio_abh, rate=sr_abh)



```
# This is formatted as code
```

## Step 3: Prepare the Data for training

We'll convert the audio files to spectrograms and use a vision learner

In [10]:

audio_extension = '.ogg'
img_extension = '.png'
fnames = get_files(path/'train_audio', extensions=audio_extension)
fnames[:5]

(#5) [Path('train_audio/gnbcam2/XC530130.ogg'),Path('train_audio/gnbcam2/XC366275.ogg'),Path('train_audio/gnbcam2/XC113258.ogg'),Path('train_audio/gnbcam2/XC402305.ogg'),Path('train_audio/gnbcam2/XC748252.ogg')]

In [11]:
path_spectrogram = path/"spectrogram"
!mkdir path_spectrogram

mkdir: cannot create directory ‘path_spectrogram’: File exists


In [12]:
Path(path/"train_audio/").ls()

(#264) [Path('train_audio/gnbcam2'),Path('train_audio/gnhsun1'),Path('train_audio/barswa'),Path('train_audio/slbgre1'),Path('train_audio/somtit4'),Path('train_audio/grbcam1'),Path('train_audio/sacibi2'),Path('train_audio/rewsta1'),Path('train_audio/norcro1'),Path('train_audio/vilwea1')...]

In [13]:
# Extract labels from submission csv
labels_csv_path = path/"sample_submission.csv"

def class_names_from_csv(class_map_csv_text):
    """Returns list of class names corresponding to score vector."""
    with open(class_map_csv_text) as csv_file:

        csv_reader = csv.reader(csv_file, delimiter=',')
        class_names = csv_reader.__next__()
        # Return all columns headers except for the first one which is "rows"
        return class_names[1:]

# Put all bird ids into labels list
labels = class_names_from_csv(labels_csv_path)

In [19]:
import librosa.display
# Create one folder for each category avec create one spectrogram per audio clip
# of that category in the folder
def create_folders_spectrograms(folder):
    spectrogram_path = Path(path_spectrogram)
    audio_path = Path(path/"train_audio/")   
    Path(spectrogram_path/folder).mkdir(parents=True, exist_ok=True)
    # Create spectrograms folder by folder
    # print(list(Path(audio_path/f'fold{folder}').glob(f'*{audio_extension}')))
    for audio_file in list(Path(audio_path/f'{folder}').glob(f'*.ogg')):
        #print(f'Found {audio_file}')
        samples, sample_rate = librosa.load(audio_file)
        fig = plt.figure(figsize=[0.72,0.72])
        ax = fig.add_subplot(111)
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.set_frame_on(False)
        filename  = spectrogram_path/folder/Path(audio_file).name.replace(audio_extension,img_extension)
        S = librosa.feature.melspectrogram(y=samples, sr=sample_rate)
        librosa.display.specshow(librosa.power_to_db(S, ref=np.max))
        plt.savefig(filename, dpi=400, bbox_inches='tight',pad_inches=0)
        print(f'Created {filename}')
        plt.close('all')

In [15]:
!ls

kaggle.json  path_spectrogram


In [20]:
for i in range(len(labels)):
    create_folders_spectrograms(labels[i])

Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC531557.png
Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC585802.png
Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC616997.png
Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC379322.png
Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC128013.png
Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC756300.png
Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC363502.png
Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC363503.png
Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC363501.png
Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC467121.png
Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC606253.png
Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC467122.png
Created /root/.fastai/archive/birdclef-2023/spectrogram/abethr1/XC363504.png

KeyboardInterrupt: ignored

In [None]:
data = DataBlock(blocks = (ImageBlock, CategoryBlock),
                 get_items=get_image_files, 
                 splitter=RandomSplitter(seed=42),
                 get_y=parent_label,
              item_tfms=Resi ze(460),)
dls = data.dataloaders(path_spectrogram)


learn = vision_learner(dls, resnet34, metrics=error_rate)
learn.fine_tune(2)

Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth


  0%|          | 0.00/83.3M [00:00<?, ?B/s]

epoch,train_loss,valid_loss,error_rate,time


epoch,train_loss,valid_loss,error_rate,time
0,4.403503,3.647348,0.816038,16:05


epoch,train_loss,valid_loss,error_rate,time


In [None]:
train_metadata = pd.read_csv(path/"train_metadata.csv")
train_metadata.head()
competition_classes = sorted(train_metadata.primary_label.unique())

forced_defaults = 0
competition_class_map = []
for c in competition_classes:
    try:
        i = classes.index(c)
        competition_class_map.append(i)
    except:
        competition_class_map.append(0)
        forced_defaults += 1
        
## this is the count of classes not supported by our pretrained model
## you could choose to simply not predict these, set a default as above,
## or create your own model using the pretrained model as a base.
forced_defaults

## Step 4: Preprocess the data

The following functions are one way to load the audio provided and break it up into the five-second samples with a sample rate of 32,000 required by the competition.

In [None]:
def frame_audio(
      audio_array: np.ndarray,
      window_size_s: float = 5.0,
      hop_size_s: float = 5.0,
      sample_rate = 32000,
      ) -> np.ndarray:
    
    """Helper function for framing audio for inference."""
    """ using tf.signal """
    if window_size_s is None or window_size_s < 0:
        return audio_array[np.newaxis, :]
    frame_length = int(window_size_s * sample_rate)
    hop_length = int(hop_size_s * sample_rate)
    framed_audio = tf.signal.frame(audio_array, frame_length, hop_length, pad_end=True)
    return framed_audio

def ensure_sample_rate(waveform, original_sample_rate,
                       desired_sample_rate=32000):
    """Resample waveform if required."""
    if original_sample_rate != desired_sample_rate:
        waveform = tfio.audio.resample(waveform, original_sample_rate, desired_sample_rate)
    return desired_sample_rate, waveform

Below we load one training sample - use the Audio function to listen to the samples inside the notebook!

In [None]:
audio, sample_rate = librosa.load("/kaggle/input/birdclef-2023/train_audio/afghor1/XC156639.ogg")
sample_rate, wav_data = ensure_sample_rate(audio, sample_rate)
Audio(wav_data, rate=sample_rate)

## Step 5: Make predictions

Each test sample is cut into 5-second chunks. We use the pretrained model to return probabilities for all 10k birds included in the model, then pull out the classes used in this competition to create a final submission row. Note that we are NOT doing anything special to handle the 3 missing classes; those will need fine-tuning / transfer learning, which will be handled in a separate notebook.

In [None]:
fixed_tm = frame_audio(wav_data)
logits, embeddings = model.infer_tf(fixed_tm[:1])
probabilities = tf.nn.softmax(logits)
argmax = np.argmax(probabilities)
print(f"The audio is from the class {classes[argmax]} (element:{argmax} in the label.csv file), with probability of {probabilities[0][argmax]}")

In [None]:
def predict_for_sample(filename, sample_submission, frame_limit_secs=None):
    file_id = filename.split(".ogg")[0].split("/")[-1]
    
    audio, sample_rate = librosa.load(filename)
    sample_rate, wav_data = ensure_sample_rate(audio, sample_rate)
    
    fixed_tm = frame_audio(wav_data)
    
    frame = 5
    all_logits, all_embeddings = model.infer_tf(fixed_tm[:1])
    for window in fixed_tm[1:]:
        if frame_limit_secs and frame > frame_limit_secs:
            continue
        
        logits, embeddings = model.infer_tf(window[np.newaxis, :])
        all_logits = np.concatenate([all_logits, logits], axis=0)
        frame += 5
    
    frame = 5
    all_probabilities = []
    for frame_logits in all_logits:
        probabilities = tf.nn.softmax(frame_logits).numpy()
        
        ## set the appropriate row in the sample submission
        sample_submission.loc[sample_submission.row_id == file_id + "_" + str(frame), competition_classes] = probabilities[competition_class_map]
        frame += 5

## Step 6: Generate a submission

Now we process all of the test samples as discussed above, creating output rows, and saving them in the provided `sample_submission.csv`. Finally, we save these rows to our final output file: `submission.csv`. This is the file that gets submitted and scored when you submit the notebook.

In [None]:
test_samples = list(glob.glob("/kaggle/input/birdclef-2023/test_soundscapes/*.ogg"))
test_samples

In [None]:
sample_sub = pd.read_csv("/kaggle/input/birdclef-2023/sample_submission.csv")
sample_sub[competition_classes] = sample_sub[competition_classes].astype(np.float32)
sample_sub.head()

In [None]:
frame_limit_secs = 15 if sample_sub.shape[0] == 3 else None
for sample_filename in test_samples:
    predict_for_sample(sample_filename, sample_sub, frame_limit_secs=15)

In [None]:
sample_sub

In [None]:
sample_sub.to_csv("submission.csv", index=False)