# Preparation

In [35]:
# Import audio function modules
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio

# Import main modules
import numpy as np
import librosa
import csv
import io

In [36]:
# Load the model from TFHub
model_handle = "https://tfhub.dev/google/bird-vocalization-classifier/1"
model = hub.load(model_handle)

# Loading labels

In [37]:
# Find the trained labels
def class_names_from_csv(class_map_csv_text):
    with open(labels_path) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        class_names = [mid for mid, desc in csv_reader]
        return class_names[:1]

labels_path = hub.resolve(model_handle) + "/assets/label.csv"
classes = class_names_from_csv(labels_path)

# Resampling

In [38]:
# Define sampling rate function parameters
def frame_audio(
    audio_array: np.ndarray,
    window_size_s: float=5.0,
    hop_size_s: float=5.0,
    sample_rate=32000
) -> np.ndarray:
    if window_size_s is None or window_size_s < 0:
        return audio_array[np.newaxis, :]
    frame_length = int(window_size_s * sample_rate)
    hop_length = int(hop_size_s * sample_rate)
    framed_audio = tf.signal.frame(audio_array, frame_length, hop_length, pad_end=True)
    return framed_audio

def ensure_sample_rate(
    waveform,
    original_sample_rate,
    desired_sample_rate=32000
):
    if original_sample_rate != desired_sample_rate:
        waveform = tfio.audio.resample(waveform, original_sample_rate, desired_sample_rate)
    return desired_sample_rate, waveform

# Load the audio file

In [39]:
# Load the audio file from any websites (e.g., Wikipedia)
!curl -O "https://upload.wikimedia.org/wikipedia/commons/7/7c/Turdus_merula_2.ogg"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  306k  100  306k    0     0  1171k      0 --:--:-- --:--:-- --:--:-- 1175k


In [40]:
# Identify the audio sample
turdus_merula = "Turdus_merula_2.ogg"

audio, sample_rate = librosa.load(turdus_merula)

sample_rate, wav_data_turdus = ensure_sample_rate(audio, sample_rate)
audio

array([-1.1987052e-05,  3.2027947e-06,  1.1725351e-06, ...,
        2.0122516e-05, -1.0682907e-05,  2.0385552e-05], dtype=float32)

# Split the audio

In [41]:
# Convert the audio sample
fixed_tm = frame_audio(wav_data_turdus)
fixed_tm.shape

TensorShape([5, 160000])

# Model application

In [42]:
# Apply the model on a single frame
logits, embeddings = model.infer_tf(fixed_tm[:1])

probabilities = tf.nn.softmax(logits)
argmax = np.argmax(probabilities)

logits.shape

TensorShape([1, 10932])

In [43]:
# Apply the model on all frames (Step 1)
all_logits, all_embeddings = model.infer_tf(fixed_tm[:1])
for window in fixed_tm[:1]:
    logits, embeddings = model.infer_tf(window[np.newaxis, :])
    all_logits = np.concatenate([all_logits, logits], axis=0)

all_logits.shape

(2, 10932)

In [44]:
# Apply the model on all frames (Step 2)
frame = 0

for frame_logits in all_logits:
    probabilities = tf.nn.softmax(frame_logits)
    argmax = np.argmax(probabilities)

    frame += 1