In [3]:
import os

from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio

In [43]:
import random
import os

def set_seed(seed: int = 42) -> None:
    """Seed every RNG this notebook touches and export determinism env vars.

    Seeds, in order, Python's ``random``, NumPy's global RNG, TensorFlow's
    global seed and ``tf.experimental.numpy``'s RNG with ``seed``; then sets
    three environment variables and prints a confirmation line.

    Args:
        seed: Integer seed applied to all of the generators above.
    """
    seeders = (
        random.seed,
        np.random.seed,
        tf.random.set_seed,
        tf.experimental.numpy.random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # The first two flags request deterministic kernels (needed on the cuDNN
    # backend); the last pins Python's hash seed.
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here may only influence child processes - confirm.
    os.environ.update({
        'TF_CUDNN_DETERMINISTIC': '1',
        'TF_DETERMINISTIC_OPS': '1',
        "PYTHONHASHSEED": str(seed),
    })

    print(f"Random seed set as {seed}")


In [4]:
# Load the YAMNet model from its TensorFlow Hub handle. Later cells call
# `yamnet_model` on waveforms (it returns scores, embeddings and a spectrogram)
# and read its class map via `class_map_path()`.
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

2023-12-20 21:43:12.930154: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-12-20 21:43:12.931730: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-12-20 21:43:12.933697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda-12.3/lib64:/usr/local/cuda-12.3/targets/x86_64-linux/lib:/usr/local/cuda-12.3/lib64:/usr/local/cuda-12.3/targets/x86_64-linux/lib:/usr/local/cuda-12.3/lib64:/usr/local/cuda-12.3/targets/x86_64-linux/lib
2023-12-20 21:43:12.933802: W tensorflow/compiler/xla

In [5]:
# Utility functions for loading audio files and making sure the sample rate is correct.

@tf.function
def load_wav_16k_mono(filename):
    """Read a WAV file and return it as a single-channel float waveform at 16 kHz."""
    raw_bytes = tf.io.read_file(filename)

    # desired_channels=1 asks the decoder for exactly one channel; the
    # trailing channel axis is then squeezed away.
    samples, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    samples = tf.squeeze(samples, axis=-1)

    # The resampler is given the input rate as int64.
    native_rate = tf.cast(native_rate, dtype=tf.int64)
    return tfio.audio.resample(samples, rate_in=native_rate, rate_out=16000)

In [6]:
# Path of the class-map CSV shipped with the YAMNet model, decoded to str.
class_map_path = yamnet_model.class_map_path().numpy().decode('utf-8')
# Human-readable class names, taken from the CSV's `display_name` column.
class_names = pd.read_csv(class_map_path)['display_name'].tolist()

# Preview the first 20 names, one per line, followed by an ellipsis marker.
for name in class_names[:20]:
    print(name)
print('...')

Speech
Child speech, kid speaking
Conversation
Narration, monologue
Babbling
Speech synthesizer
Shout
Bellow
Whoop
Yell
Children shouting
Screaming
Whispering
Laughter
Baby laughter
Giggle
Snicker
Belly laugh
Chuckle, chortle
Crying, sobbing
...


In [41]:
import json
import csv

def load_wav_for_map(filename, label):
    """Map-style adapter: decode ``filename`` via ``load_wav_16k_mono`` and pass ``label`` through ``int``.

    Returns a ``(waveform, label)`` pair.
    """
    waveform = load_wav_16k_mono(filename)
    return waveform, int(label)

def extract_embedding(wav_data, label):
    """Run YAMNet on ``wav_data`` and pair each returned embedding with ``label``.

    Returns:
        A 2-tuple ``(embeddings, labels)`` where ``labels`` is ``label``
        repeated once per entry along the first axis of ``embeddings``.
    """
    # The model returns (scores, embeddings, spectrogram); only the
    # embeddings are used here.
    _, frame_embeddings, _ = yamnet_model(wav_data)
    frame_count = tf.shape(frame_embeddings)[0]
    repeated_label = tf.repeat(label, frame_count)
    return frame_embeddings, repeated_label

def make_index_dict(label_csv):
    """Build a lookup from the ``mid`` column to the ``index`` column of a label CSV.

    Args:
        label_csv: Path to a CSV file whose header row contains at least the
            columns ``mid`` and ``index``.

    Returns:
        dict mapping each row's ``mid`` value to that row's ``index`` value.
        Values are the raw strings read from the file (callers convert them).
        If a ``mid`` appears more than once, the last row wins. A file with
        only a header row yields an empty dict.

    Raises:
        KeyError: if the header has no ``mid`` or ``index`` column.
    """
    # newline='' lets the csv module handle line endings itself, including
    # ones embedded in quoted fields, as its documentation requires.
    with open(label_csv, 'r', newline='') as f:
        return {row['mid']: row['index'] for row in csv.DictReader(f)}

def load_data(file, index_dict):
    """Turn a JSON manifest into a dataset of ``(embedding, label)`` pairs.

    The JSON file is expected to hold a ``data`` entry from which a DataFrame
    with ``wav`` and ``labels`` columns is built. Each ``labels`` value is
    looked up in ``index_dict`` and converted with ``int``.

    Args:
        file: Path to the JSON manifest.
        index_dict: Mapping from a ``labels`` value to its class index.

    Returns:
        A ``tf.data.Dataset`` produced by decoding each wav, running
        ``extract_embedding`` on it and un-batching the result, so each
        element is one embedding with its label.
    """
    with open(file) as manifest:
        records = json.load(manifest)['data']
    frame = pd.DataFrame.from_dict(records)

    wav_paths = frame['wav']
    class_ids = frame['labels'].map(lambda key: int(index_dict[key]))

    return (
        tf.data.Dataset.from_tensor_slices((wav_paths, class_ids))
        .map(load_wav_for_map)
        .map(extract_embedding)
        .unbatch()
    )

In [48]:
# Train one small classifier head on YAMNet embeddings for every
# (seed, dataset directory) combination and log each run to TensorBoard.
BATCH_SIZE = 32
# Width of the output layer; loop-invariant, so defined once up front.
# NOTE(review): assumes every dataset below has labels in [0, 3) - confirm
# against each class_labels_indices.csv.
NUM_CLASSES = 3
dirs = [
    '/mnt/data/tungtran/AudioMAE/dataset/coral_sound_indo_health_30min_trainBo_few_shot',
    '/mnt/data/tungtran/AudioMAE/dataset/coral_sound_indo_health_30min_few_shot',
    '/mnt/data/tungtran/AudioMAE/dataset/coral_sound_indo_location_30min_few_shot',
    '/mnt/data/tungtran/AudioMAE/dataset/coral_sound_coral_chorus_30min_few_shot',
]
seeds = [0, 1]
for seed in seeds:
    # Iterate with DIR directly: the name stays available to later cells and
    # the builtin `dir` is no longer shadowed.
    for DIR in dirs:
        # Re-seed before building the datasets and the model for this run.
        set_seed(seed)
        index_dict = make_index_dict(f'{DIR}/class_labels_indices.csv')
        train_ds = load_data(f'{DIR}/train.json', index_dict).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
        val_ds = load_data(f'{DIR}/val.json', index_dict).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

        my_model = tf.keras.Sequential([
            # shape must be a tuple: `(1024)` is just the int 1024.
            tf.keras.layers.Input(shape=(1024,), dtype=tf.float32,
                                  name='input_embedding'),
            tf.keras.layers.Dense(512, activation='relu'),
            # No activation: the loss below is configured with from_logits=True.
            tf.keras.layers.Dense(NUM_CLASSES)
        ], name='my_model')

        # summary() prints the table itself and returns None, so it is not
        # wrapped in print() (which would emit a stray "None" line).
        my_model.summary()

        my_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                         optimizer="adam",
                         metrics=['accuracy'])

        # Early stopping watches the *training* loss.
        # NOTE(review): validation data is supplied to fit(); consider whether
        # 'val_loss' was intended - left unchanged to keep results comparable.
        callback = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                                    patience=3,
                                                    restore_best_weights=True)
        # One TensorBoard directory per (dataset, seed) run, named after the
        # last path component of DIR.
        log_dir = "logs/" + DIR.split('/')[-1] + f'seed{seed}_yamnet'
        print(log_dir)
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

        history = my_model.fit(train_ds,
                               epochs=10,
                               validation_data=val_ds,
                               callbacks=[callback, tensorboard_callback])
        print()

Random seed set as 0
Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_35 (Dense)            (None, 512)               524800    
                                                                 
 dense_36 (Dense)            (None, 3)                 1539      
                                                                 
Total params: 526,339
Trainable params: 526,339
Non-trainable params: 0
_________________________________________________________________
None
logs/coral_sound_indo_health_30min_trainBo_few_shotseed0_yamnet
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Random seed set as 0
Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_37 (Dense)            (None, 512)               524800    
         