In this notebook, we'll use a pre-trained machine learning model to generate a submission to the [BirdClef2023 competition](https://www.kaggle.com/c/birdclef-2023).  The goal of the competition is to identify Eastern African bird species by sound.

## Step 1: Imports

In [1]:
# Install All libraries
!pip install -Uq fastcore fastai fastbook
!pip install image_tabular
!pip install kaggle
! [ -e /content ] && pip install -Uqq pip fastai git+https://github.com/drscotthawley/fastproaudio.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.8/719.8 KB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import files, drive
drive.mount("/content/gdrive")
# Switch do DeepLearning directory
%cd /content/gdrive/MyDrive/DeepLearning

Mounted at /content/gdrive
/content/gdrive/MyDrive/DeepLearning


In [3]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
import glob
import matplotlib.pyplot as plt
from typing import List

import csv
import io

import os

from IPython.display import Audio

import torchaudio
import torch

# To save spectrograms as png
import skimage.io

# fastproaudio
from fastproaudio.core import *


# Fastai
import fastbook
fastbook.setup_book()
from fastbook import *
from fastcore.transform import Transform
from fastai.torch_core import TensorBase
from fastai.data.core import TensorImageBase

# Import all the vision library
from fastai.vision.all import *


In [4]:
# Setup kaggle API

# 1. Read the kaggle API token to interact with your kaggle account
# Folder containing kaggle.json for kaggle API authorization
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/MyDrive/DeepLearning/"

In [5]:
!ls
# Pick a path to download the dataset to:
comp = 'birdclef-2023'
path = URLs.path(comp)
path_train = Path(path/'train_audio')

kaggle.json  path_spectrogram


In [27]:
#And use the Kaggle API to download the dataset to that path, and extract it:
Path.BASE_PATH = path

from kaggle import api

if not path.exists():
    path.mkdir(parents=true)
    api.competition_download_cli(comp, path=path)
    shutil.unpack_archive(str(path/f'{comp}.zip'), str(path))

path.ls(file_type='text')



Downloading birdclef-2023.zip to /root/.fastai/archive/birdclef-2023


100%|██████████| 4.91G/4.91G [00:55<00:00, 95.2MB/s]





(#3) [Path('eBird_Taxonomy_v2021.csv'),Path('sample_submission.csv'),Path('train_metadata.csv')]

# Pre-generate Spectrograms and save them
Generate spectrograms over 5-seconds splits of resampled audio files


In [7]:
Path(path/'train_audio/gybfis1').ls()

(#10) [Path('train_audio/gybfis1/XC282058.ogg'),Path('train_audio/gybfis1/XC267528.ogg'),Path('train_audio/gybfis1/XC131985.ogg'),Path('train_audio/gybfis1/XC618866.ogg'),Path('train_audio/gybfis1/XC282065.ogg'),Path('train_audio/gybfis1/XC267529.ogg'),Path('train_audio/gybfis1/XC127885.ogg'),Path('train_audio/gybfis1/XC397814.ogg'),Path('train_audio/gybfis1/XC282061.ogg'),Path('train_audio/gybfis1/XC396326.ogg')]

Get the paths to all audio files

In [8]:
def get_audio_files(path):
    return get_files(path, extensions='.ogg', recurse=True)

audio_files = get_audio_files(path)
fn = audio_files[0]; fn

Path('train_audio/gybfis1/XC282058.ogg')

In [28]:
def read_signal(audio_file_path: Path,
                  audio_split_length: float=5.,
                  audio_hop_length: float=1.,
                  sample_rate: int=32000):
    # Load audio sample from file path
    sig, rate = librosa.load(audio_file_path)    
    # Resample to match desired sample rate
    resampled_sig = librosa.resample(sig, orig_sr=rate, target_sr=sample_rate)
    return resampled_sig.astype(float, copy=False)

def split_signal(resampled_sig: np.ndarray,
                  audio_split_length: float=5.,
                  audio_hop_length: float=1.,
                  sample_rate: int=32000):
    "Split signal into chunks of desired length. Returns a list a those chunks" 

    len_chunk = int(audio_split_length*sample_rate)
    len_sig = len(resampled_sig)
    
    # If length of signal inferior to desired length, pad it with zeros
    if (len_sig <= len_chunk):
      res = np.zeros(len_chunk)
      res[:len_sig] = resampled_sig
      return [res]
    else:
      res = []
      # Create chunks. To avoid padding, the last chunk will start audio_split_length seconds before the signal end.
      start = 0
      hop = int(sample_rate*audio_hop_length)

      while(start+len_chunk <= len_sig):
        res.append(resampled_sig[start:start+len_chunk])
        start+=hop

    # Append last chunk
      res.append(resampled_sig[len_sig-len_chunk:])

      return res


In [29]:
def scale_minmax(X, min=0.0, max=1.0):
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (max - min) + min
    return X_scaled

def create_spectrogram(path_save, signal:np.ndarray, sample_rate:int=32000):
    spec = librosa.feature.melspectrogram(y=signal, sr=sample_rate)#, n_fft=int(duration*sample_rate))
    # save file
    np.save(path_save, spec)  


In [30]:
def save_chunk_as_file(chunk, audio_file_path: Path, sample_rate:int=32000, extension:str='.npy', idx_chunk=0, split=False):
    if split==True:
      path_save = Path(audio_file_path.parent/(audio_file_path.stem+f"-{idx_chunk}"+extension))
    else:
      path_save = Path(audio_file_path.parent/(audio_file_path.stem+extension))

    if not path_save.exists():
      create_spectrogram(path_save, chunk, sample_rate)
      print(path_save)
    else:
      print(path_save)
      print("LAST ALREADY EXISTED ----------------")
    return idx_chunk + 1

def create_spectrograms(audio_file_path: Path,
                        split=False,
                        audio_split_length: float=5.,
                        audio_hop_length: float=1.,
                        sample_rate: int=32000,
                        ax=None,
                        vmin=-70,
                        vmax=0,
                        rate=32000,
                        ctx=None):
    "Create all spectrograms for one audio file and save them in the same folder as audio files"
    # Split signal in to desired length chunks
    signal = read_signal(audio_file_path, audio_split_length=audio_split_length, audio_hop_length=audio_hop_length, sample_rate=sample_rate) 
    if split==True:
      chunks = split_signal(audio_file_path, audio_split_length=audio_split_length, audio_hop_length=audio_hop_length, sample_rate=sample_rate) 
    else:
      chunks=[signal]

    idx_chunk=0
    for i, chunk in enumerate(chunks):
        save_chunk_as_file(chunk, audio_file_path, idx_chunk=i)

### Check the pipeline for saving spectrograms and reading them. Ensure data is not corrupted

In [12]:
# Create chunks from audio file
fn = audio_files[0]
chunks = split_signal(fn)

path_save = Path(fn.parent/(fn.stem+"-0"+'.npy'))
print(path_save)

# Create spec
spec = librosa.feature.melspectrogram(y=chunks[0], sr=32000)#, n_fft=int(duration*sample_rate))
# save file
print(spec)
print(spec.shape)
print(spec.dtype)

np.save(path_save, spec) 
!ls /root/.fastai/archive/birdclef-2023/train_audio/gybfis1/

# read file
spectro = np.load(path_save)
print(spectro)
print(spectro.shape)

spectro.dtype

/root/.fastai/archive/birdclef-2023/train_audio/gybfis1/XC282058-0.npy
[[5.26338831e-12 9.48653823e-05 1.77240310e-02 ... 1.34713730e-03 1.59315902e-03 1.42051508e-03]
 [5.95330532e-12 5.76004130e-05 2.82235568e-03 ... 3.65783594e-04 4.41506476e-04 7.84810580e-04]
 [7.14226108e-12 2.38947270e-05 4.75837931e-04 ... 3.01775090e-05 1.06597800e-04 2.55165563e-04]
 ...
 [1.25257080e-13 3.10485252e-14 3.80861205e-17 ... 1.07771894e-16 1.53774408e-09 5.18162522e-08]
 [1.18850564e-13 2.94689315e-14 3.26061576e-17 ... 9.07030632e-17 1.53097645e-09 5.16003084e-08]
 [1.15525854e-13 2.86640607e-14 3.47877808e-17 ... 5.28268665e-16 1.52656727e-09 5.14585302e-08]]
(128, 313)
float64
XC127885.ogg  XC267529.ogg    XC282061.ogg  XC397814.ogg
XC131985.ogg  XC282058-0.npy  XC282065.ogg  XC618866.ogg
XC267528.ogg  XC282058.ogg    XC396326.ogg
[[5.26338831e-12 9.48653823e-05 1.77240310e-02 ... 1.34713730e-03 1.59315902e-03 1.42051508e-03]
 [5.95330532e-12 5.76004130e-05 2.82235568e-03 ... 3.65783594e-04 4.

dtype('float64')

## Call the methods and save spectrograms as .npy files

In [31]:
len(audio_files) 

16942

ls: cannot access '/root/.fastai/archive/birdclef-2023/': No such file or directory


In [32]:
for i, audio_file in enumerate(audio_files):
  create_spectrograms(audio_file)
  print(f"\nPROCESSED AUDIO FILES : {i} / {len(audio_files)}\n") # Output progress

/root/.fastai/archive/birdclef-2023/train_audio/gybfis1/XC282058.npy

PROCESSED AUDIO FILES : 0 / 16942

/root/.fastai/archive/birdclef-2023/train_audio/gybfis1/XC267528.npy

PROCESSED AUDIO FILES : 1 / 16942

/root/.fastai/archive/birdclef-2023/train_audio/gybfis1/XC131985.npy

PROCESSED AUDIO FILES : 2 / 16942

/root/.fastai/archive/birdclef-2023/train_audio/gybfis1/XC618866.npy

PROCESSED AUDIO FILES : 3 / 16942

/root/.fastai/archive/birdclef-2023/train_audio/gybfis1/XC282065.npy

PROCESSED AUDIO FILES : 4 / 16942

/root/.fastai/archive/birdclef-2023/train_audio/gybfis1/XC267529.npy

PROCESSED AUDIO FILES : 5 / 16942

/root/.fastai/archive/birdclef-2023/train_audio/gybfis1/XC127885.npy

PROCESSED AUDIO FILES : 6 / 16942

/root/.fastai/archive/birdclef-2023/train_audio/gybfis1/XC397814.npy

PROCESSED AUDIO FILES : 7 / 16942

/root/.fastai/archive/birdclef-2023/train_audio/gybfis1/XC282061.npy

PROCESSED AUDIO FILES : 8 / 16942

/root/.fastai/archive/birdclef-2023/train_audio/gybfis1

KeyboardInterrupt: ignored

In [None]:
from IPython.lib.display import Audio
import numpy as np

framerate = 4410
play_time_seconds = 3

t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)
audio_data = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*240*t)
Audio(audio_data, rate=framerate, autoplay=True)