In this notebook, we'll use a pre-trained machine learning model to generate a submission to the [BirdClef2023 competition](https://www.kaggle.com/c/birdclef-2023).  The goal of the competition is to identify Eastern African bird species by sound.

## Step 1: Imports

In [8]:
# Install All libraries
!pip install -Uq fastcore fastai fastbook
!pip install image_tabular
!pip install kaggle
! [ -e /content ] && pip install -Uqq pip fastai git+https://github.com/drscotthawley/fastproaudio.git

[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m  Preparing metadata (setup.py) ... [?25l[?25hdone
[0m

In [9]:
from google.colab import files, drive
drive.mount("/content/gdrive")
# Switch do DeepLearning directory
%cd /content/gdrive/MyDrive/DeepLearning

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/DeepLearning


In [3]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
import random
import glob
import matplotlib.pyplot as plt
from typing import List

import csv
import io

import os

from IPython.display import Audio

import torchaudio
import torch
import torchvision.transforms as T


# To save spectrograms as png
import skimage.io

# fastproaudio
from fastproaudio.core import *


# Fastai
import fastbook
fastbook.setup_book()
from fastbook import *
from fastcore.transform import Transform
from fastai.torch_core import TensorBase
from fastai.data.core import TensorImageBase

# Import all the vision library
from fastai.vision.all import *


In [4]:
# Setup kaggle API

# 1. Read the kaggle API token to interact with your kaggle account
# Folder containing kaggle.json for kaggle API authorization
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/MyDrive/DeepLearning/"

In [5]:
!ls
# Pick a path to download the dataset to:
comp = 'birdclef-2023'
path = URLs.path(comp)
path_train = Path(path/'train_audio')

abethr1  blhgon1  bswdov1  gryapa1	meypar1  refcro1  trobou1  yebduc1
afecuc1  bltori1  carcha1  gybfis1	nobfly1  refwar2  vibsta2  yebere1
afmdov1  brctch1  chewea1  gycwar3	palfly2  rerswa1  vilwea1  yesbar1
afrgrp1  brobab1  cohmar1  hamerk1	piecro1  sccsun2  whbcan1
bawman1  brrwhe3  gabgos2  kaggle.json	ratcis1  scrcha1  whctur2
beasun2  brtcha1  gnbcam2  loceag1	rebfir2  soucit1  wookin1
blaplo1  brubru1  gobbun1  lotcor1	reboxp1  strsee1  yebapa1


In [6]:
#And use the Kaggle API to download the dataset to that path, and extract it:
Path.BASE_PATH = path

from kaggle import api

if not path.exists():
    path.mkdir(parents=true)
    api.competition_download_cli(comp, path=path)
    shutil.unpack_archive(str(path/f'{comp}.zip'), str(path))

path.ls(file_type='text')

Downloading birdclef-2023.zip to /root/.fastai/archive/birdclef-2023


100%|██████████| 4.91G/4.91G [00:27<00:00, 190MB/s]





(#3) [Path('train_metadata.csv'),Path('eBird_Taxonomy_v2021.csv'),Path('sample_submission.csv')]

# Pre-generate Spectrograms and save them
Generate spectrograms over 5-seconds splits of resampled audio files


In [7]:
Path(path/'train_audio/gybfis1').ls()

(#10) [Path('train_audio/gybfis1/XC282065.ogg'),Path('train_audio/gybfis1/XC396326.ogg'),Path('train_audio/gybfis1/XC282058.ogg'),Path('train_audio/gybfis1/XC267528.ogg'),Path('train_audio/gybfis1/XC131985.ogg'),Path('train_audio/gybfis1/XC397814.ogg'),Path('train_audio/gybfis1/XC127885.ogg'),Path('train_audio/gybfis1/XC282061.ogg'),Path('train_audio/gybfis1/XC618866.ogg'),Path('train_audio/gybfis1/XC267529.ogg')]

Get the paths to all audio files

In [10]:
def get_audio_files(path):
    return get_files(path, extensions='.ogg', recurse=True)

def get_npy_files(path):
    return get_files(path, extensions='.npy', recurse=True)


In [11]:
def read_signal(audio_file_path: Path,
                  audio_split_length: float=5.,
                  audio_hop_length: float=1.,
                  sample_rate: int=32000):
    # Load audio sample from file path
    sig, rate = librosa.load(audio_file_path)    
    # Resample to match desired sample rate
    resampled_sig = librosa.resample(sig, orig_sr=rate, target_sr=sample_rate)
    return resampled_sig.astype(float, copy=False)

def split_signal(resampled_sig: np.ndarray,
                  audio_split_length: float=5.,
                  audio_hop_length: float=1.,
                  sample_rate: int=32000):
    "Split signal into chunks of desired length. Returns a list a those chunks" 

    len_chunk = int(audio_split_length*sample_rate)
    len_sig = len(resampled_sig)

    # If length of signal inferior to desired length, pad it with zeros
    if (len_sig <= len_chunk):
      res = np.zeros(len_chunk)
      res[:len_sig] = resampled_sig
      return [res]
    else:
      res = []
      # Create chunks. To avoid padding, the last chunk will start audio_split_length seconds before the signal end.
      start = 0
      hop = int(sample_rate*audio_hop_length)

      while(start+len_chunk <= len_sig):
        res.append(resampled_sig[start:start+len_chunk])
        start+=hop

    # Append last chunk
      res.append(resampled_sig[len_sig-len_chunk:])

      return res


def truncate_window(signal: np.ndarray,
                  window_size=10.,
                  sample_rate: int=32000):
    "Truncate signal within a random window of desired length" 

    length_window = int(window_size*sample_rate)
    len_signal = len(signal)
    if (len_signal>=length_window):
        start_window = random.randint(0,len_signal-length_window) 
        window = signal[start_window:start_window+length_window]
        return [window]
    else:
        window = np.zeros(length_window)
        window[0:len_signal] = signal
        return [window]


In [12]:
def scale_minmax(X, min=0.0, max=1.0):
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (max - min) + min
    return X_scaled

def create_spectrogram(path_save, signal:np.ndarray, sample_rate:int=32000):
    spec = librosa.feature.melspectrogram(y=signal, sr=sample_rate)#, n_fft=int(duration*sample_rate))
    # save file
    np.save(path_save, spec)  


In [13]:
def save_chunk_as_file(chunk, audio_file_path: Path, sample_rate:int=32000, extension:str='.npy', idx_chunk=0, split=False):
    if split==True:
      path_save = Path(audio_file_path.parent/(audio_file_path.stem+f"-{idx_chunk}"+extension))
    else:
      path_save = Path(audio_file_path.parent/(audio_file_path.stem+extension))

    if not path_save.exists():
      create_spectrogram(path_save, chunk, sample_rate)
      print(path_save)
    else:
      print(path_save)
      print("LAST ALREADY EXISTED")
    return idx_chunk + 1

def create_spectrograms(audio_file_path: Path,
                        truncate_window_bool=False,
                        window_size=10.,
                        split=False,
                        audio_split_length: float=5.,
                        audio_hop_length: float=1.,
                        sample_rate: int=32000,
                        ax=None,
                        vmin=-70,
                        vmax=0,
                        rate=32000,
                        ctx=None):
    "Create all spectrograms for one audio file and save them in the same folder as audio files"
    # Split signal in to desired length chunks or window
    signal = read_signal(audio_file_path, audio_split_length=audio_split_length, audio_hop_length=audio_hop_length, sample_rate=sample_rate) 

    if truncate_window_bool==True:
      chunks = truncate_window(signal, window_size=window_size,sample_rate=sample_rate)

    elif split==True:
      chunks = split_signal(audio_file_path, audio_split_length=audio_split_length, audio_hop_length=audio_hop_length, sample_rate=sample_rate) 
    else:
      chunks=[signal]

    for i, chunk in enumerate(chunks):
        save_chunk_as_file(chunk, audio_file_path, idx_chunk=i)

## Call the methods and save spectrograms as .npy files

In [14]:
len(audio_files) 

NameError: ignored

In [None]:
for i, audio_file in enumerate(audio_files):
  create_spectrograms(audio_file, truncate_window_bool=True)
  print(f"\nPROCESSED AUDIO FILES : {i} / {len(audio_files)}\n") # Output progress

#### Play sound to alert when long task is over

# TRAIN LEARNER


Extract Labels from csv file

In [15]:
# Extract labels from submission csv
labels_csv_path = path/"sample_submission.csv"

def class_names_from_csv(class_map_csv_text):
    """Returns list of class names corresponding to score vector."""
    with open(class_map_csv_text) as csv_file:

        csv_reader = csv.reader(csv_file, delimiter=',')
        class_names = csv_reader.__next__()
        # Return all columns headers except for the first one which is "rows"
        return class_names[1:]

# Put all bird ids into labels list
labels_vocab = class_names_from_csv(labels_csv_path)

Write function to show spectrograms

In [16]:
def show_spectrogram(audio_data, ax=None, vmin=-70, vmax=0, rate=32000, ctx=None):
    # Get audio tensor (values between 0. and 1.) and convert to spectrogram DB values
    spec_db = array(audio_data[0]*255)

    # Plot Mel-spectrogram plot, from librosa documentation
    if ax is None: fig, ax = plt.subplots()
    img = librosa.display.specshow(spec_db, y_axis='mel', x_axis='time', ax=None, sr=32000, vmin=-70, vmax=0)
    ax.set(title=f'Spectrogram of bird type : {audio_data[1]}')
    plt.colorbar(img, ax=ax, format="%+2.f dB")
    if ax is None: plt.show(block=False)

# Before we can create a Transform, we need a type that knows how to show itself
# (if we want to use the show method). Here we define a TitledImage:
class TitledImage(fastuple):
    def show(self, ctx=None, **kwargs):
      show_spectrogram(self, ctx=ctx, **kwargs)

Write transform to feed .npy spectrogram files to learner

In [17]:
transform_to_tensor = T.Compose([T.ToTensor()])
transform_to_PIL = T.ToPILImage()


In [37]:
# Function to convert .npy files into tensors
def npy_to_tensor(path_file:Path):
    # Load file
    spec = np.load(path_file)
    # Convert to DB
    spec_db = librosa.power_to_db(spec, ref=np.max)
    # Convert spectrogram DB data to PyTorch Tensor
    spec_tensor = tensor(spec_db).float()/255
    return spec_tensor



class NpyTransform(ItemTransform):
    def setups(self, items):
      self.lblr = parent_label
      vals = class_names_from_csv(labels_csv_path)
      self.vocab,self.o2i = uniqueify(vals, sort=True, bidir=True)

    def encodes(self, o): 
      return (npy_to_tensor(o), self.o2i[self.lblr(o)])
    def decodes(self, x): 
      return TitledImage(x[0],self.vocab[x[1]])

def NpyBlock(vocab, o2i, lblr):
    return TransformBlock(type_tfms=NpyTransform(vocab, o2i, lblr))
 

In [27]:
path_spectrograms = "/content/gdrive/MyDrive/DeepLearning/"
items = get_npy_files(path_spectrograms)
audio_items = get_audio_files(path)
len(items), len(audio_items)

(2736, 16942)

In [None]:
# Check for double occurences of name files
#for i, audio_file_path in enumerate(audio_items):
#  print(i)
#  count = 0
#  for audio_file_path2 in audio_items:
##    if audio_file_path.name == audio_file_path2.name:
 ##     count += 1
  #    if count == 2:
  #      print("DOUBLON")
  #      print(audio_file_path.name)
  #      print(audio_file_path2.name)
  #      break

In [None]:
# Save npy files to google drive
#for i, path_file in enumerate(items):
#  path_tmp = Path("/content/gdrive/MyDrive/DeepLearning/"+path_file.name)
#  shutil.copy(path_file, path_tmp) 
#  print(f'Copied Items : {i}/{len(items)}')


In [28]:
audio_items[1]

Path('train_audio/mabeat1/XC294614.ogg')

In [22]:
## Add parent folders to npy files in gdrive
#path_npy_gdrive = Path("/content/gdrive/MyDrive/DeepLearning/")
#path_ogg_fastai = Path('/root/.fastai/archive/birdclef-2023/train_audio/')
#for i, path_npy in enumerate(items):
#  print(f'Processing {i+1}/{len(items)}')
#  for path_ogg in audio_items:
#    if path_npy.stem == path_ogg.stem:
#      dest = Path(path_npy_gdrive/path_ogg.parent.name/path_npy.name)
#      # Check if directory exists
#      if not dest.parent.exists():
#        dest.parent.mkdir(parents=True, exist_ok=True)
#      shutil.move(path_npy, dest)
#      break

In [29]:
!ls /content/gdrive/MyDrive/DeepLearning/soucit1


XC119402.npy  XC428700.npy  XC612385.npy  XC626072.npy	XC703347.npy
XC352785.npy  XC601840.npy  XC612386.npy  XC627235.npy
XC391601.npy  XC601841.npy  XC613507.npy  XC627238.npy
XC396039.npy  XC602932.npy  XC614372.npy  XC627239.npy
XC396388.npy  XC602933.npy  XC615098.npy  XC697678.npy


In [30]:
# Test
path_ogg = audio_items[1]
path_npy_gdrive = Path("/content/gdrive/MyDrive/DeepLearning/")
path_npy = items[1]
Path(path_npy_gdrive/path_ogg.parent.name/path_npy.name)

Path('/content/gdrive/MyDrive/DeepLearning/mabeat1/XC608050.npy')

## Test transform class and functions

In [None]:
labeller = parent_label
vals = list(map(labeller, items))
vocab,o2i = uniqueify(vals, sort=True, bidir=True)
npy_transform = NpyTransform(vocab,o2i,labeller)

In [None]:
x,y = npy_transform(items[0])
x.shape, y


In [None]:
(x,y)

In [None]:
dec = npy_transform.decode([x,y])
dec.show()

## Create Tfmdlists

In [45]:
path_npy_gdrive = Path("/content/gdrive/MyDrive/DeepLearning/")
items = get_npy_files(path_npy_gdrive)
items

(#2736) [Path('/content/gdrive/MyDrive/DeepLearning/sccsun2/XC609125.npy'),Path('/content/gdrive/MyDrive/DeepLearning/sccsun2/XC608050.npy'),Path('/content/gdrive/MyDrive/DeepLearning/sccsun2/XC298959.npy'),Path('/content/gdrive/MyDrive/DeepLearning/sccsun2/XC398625.npy'),Path('/content/gdrive/MyDrive/DeepLearning/sccsun2/XC752954.npy'),Path('/content/gdrive/MyDrive/DeepLearning/sccsun2/XC593225.npy'),Path('/content/gdrive/MyDrive/DeepLearning/sccsun2/XC609472.npy'),Path('/content/gdrive/MyDrive/DeepLearning/sccsun2/XC419932.npy'),Path('/content/gdrive/MyDrive/DeepLearning/sccsun2/XC609128.npy'),Path('/content/gdrive/MyDrive/DeepLearning/sccsun2/XC536795.npy')...]

In [46]:
# Parameters of the TfmdLists
npy_transform = NpyTransform()
npy_transform.setup(items)
splits = RandomSplitter()(items)


In [47]:
vals = class_names_from_csv(labels_csv_path)
vocab, o2i = uniqueify(vals, sort=True, bidir=True)
parent_label(items[0])
o2i[parent_label(items[0])]

191

In [48]:
tls = TfmdLists(items, [npy_transform], splits=splits)


In [59]:
dls = tls.dataloaders(bs=3)


In [None]:
dls.show_batch(max_n=4)

In [53]:
from torchvision.models import resnet34, resnet50
opt_func = partial(Adam, lr=slice(3e-3), wd=0.01, eps=1e-8)

In [50]:
dblock = DataBlock(blocks = (TransformBlock(NpyTransform()), CategoryBlock),
                   get_items = get_npy_files,
                   get_y=parent_label,
                   splitter = RandomSplitter())

In [120]:
!ls /content/gdrive/MyDrive/DeepLearning


abethr1  blhgon1  bswdov1  gryapa1	meypar1  refcro1  trobou1  yebduc1
afecuc1  bltori1  carcha1  gybfis1	nobfly1  refwar2  vibsta2  yebere1
afmdov1  brctch1  chewea1  gycwar3	palfly2  rerswa1  vilwea1  yesbar1
afrgrp1  brobab1  cohmar1  hamerk1	piecro1  sccsun2  whbcan1
bawman1  brrwhe3  gabgos2  kaggle.json	ratcis1  scrcha1  whctur2
beasun2  brtcha1  gnbcam2  loceag1	rebfir2  soucit1  wookin1
blaplo1  brubru1  gobbun1  lotcor1	reboxp1  strsee1  yebapa1


In [None]:
dls = dblock.dataloaders(path_npy_gdrive, bs=64)
dls.show_batch(max_n=2)


In [None]:

# define learner and train model
learn = vision_learner(dls, resnet18, opt_func=opt_func,metrics=error_rate, loss_func=CrossEntropyLossFlat)


In [None]:
LE LEANER NE MARCHE PAS SANS DOUTE CAR FASTAI EST PARAMETRé POUR LES PILImage. Tenter de convertir le tensor en PILimage avant de lancer le training
https://www.tutorialspoint.com/how-to-convert-a-torch-tensor-to-pil-image

In [58]:
learn = vision_learner(dls, resnet18, metrics=error_rate, loss_func=CrossEntropyLossFlat)
learn.fine_tune(4)

epoch,train_loss,valid_loss,error_rate,time


RuntimeError: ignored

In [None]:
 from IPython.lib.display import Audio
import numpy as np

framerate = 4410
play_time_seconds = 3

t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)
audio_data = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*240*t)
Audio(audio_data, rate=framerate, autoplay=True)