# Audio Classifier

This notebook is based on and inspired by the TensorFlow audio transfer-learning tutorial: https://www.tensorflow.org/tutorials/audio/transfer_learning_audio

Environment setup (requires Miniconda, cf. https://docs.conda.io/en/latest/miniconda.html):

```bash
conda create -n tf python=3.9.12
conda activate tf
conda install ipykernel
pip install tensorflow==2.8.*
pip install tensorflow_io==0.25.*
pip install tensorflow_hub
conda install -c conda-forge librosa pandas scipy
```

In [4]:
import numpy as np
import tensorflow as tf
import pandas as pd

In [5]:
import tensorflow_io as tfio
from scipy.io import wavfile
import librosa
import io

@tf.function
def load_wav_16k_mono(filename):
    """Read a WAV file from disk and return it as 16 kHz single-channel audio.

    The file is decoded with ``tf.audio.decode_wav`` (requesting one channel)
    and the decoded samples, together with the file's own sample rate, are
    passed to ``convert`` for resampling to 16000 Hz.
    """
    raw_bytes = tf.io.read_file(filename)
    audio, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    return convert(audio, native_rate, 16000)

def convert_raw_16k_mono(raw,sample_rate):
    """Decode in-memory WAV bytes and resample them to 16 kHz mono.

    Args:
        raw: bytes of a complete WAV file (header included).
        sample_rate: ignored; kept only for backward compatibility. The rate
            stored in the WAV header is always used instead.

    Returns:
        The resampled audio as produced by ``convert``.
    """
    file_rate, samples = wavfile.read(io.BytesIO(raw))

    # scipy returns integer PCM as-is (e.g. int16), whereas the file-based
    # loader in this notebook works on float32 samples in [-1, 1] and
    # librosa's audio validation expects floating-point data. Normalise here
    # so both loading paths feed the same kind of signal downstream.
    if np.issubdtype(samples.dtype, np.integer):
        info = np.iinfo(samples.dtype)
        if info.min == 0:
            # Unsigned PCM (8-bit WAV) is centred on the type's midpoint.
            half_range = (info.max + 1) / 2.0
            samples = (samples.astype(np.float32) - half_range) / half_range
        else:
            samples = samples.astype(np.float32) / -float(info.min)
    else:
        samples = samples.astype(np.float32)

    # scipy yields (samples, channels); librosa averages over the leading
    # axis, hence the transpose. Mono input is 1-D and passes through.
    wav = librosa.to_mono(samples.T)
    # convert() squeezes the last axis, so add a trailing channel axis back.
    wav = tf.expand_dims(wav,axis=1)
    return convert(wav,file_rate,16000)

@tf.function
def convert(wav,rate_in,rate_out):
    """Squeeze the trailing axis of ``wav`` and resample it.

    ``rate_in`` is cast to int64 before being handed to
    ``tfio.audio.resample`` together with the target rate ``rate_out``.
    """
    squeezed = tf.squeeze(wav, axis=-1)
    source_rate = tf.cast(rate_in, dtype=tf.int64)
    return tfio.audio.resample(squeezed, source_rate, rate_out)

In [56]:
from pathlib import Path

# Directory that holds the CSV index (and, as used further down, the audio
# files it lists). The path is relative to the notebook's working directory.
base_data_path = Path('../datasets')
data_csv = base_data_path / 'samples.csv'
# Load the sample index into a DataFrame.
pd_data = pd.read_csv(data_csv)
# Display the last rows as a quick look at the loaded table.
pd_data.tail()

Unnamed: 0,filename,target,category,fold
1568,Moss-b25d44a9-d884-484c-be08-8e7314615706.wav,-1,Moss,7
1569,Moss-830c32b6-409a-480e-b9ab-a7fca84c0e87.wav,-1,Moss,7
1570,Moss-0432cf53-5bb2-4533-bbe6-5513bde2f558.wav,-1,Moss,7
1571,Moss-d22a6513-1f87-4ea2-881d-c026e594fe57.wav,-1,Moss,7
1572,Moss-78ffae70-fcfd-4f23-a6f2-539cf4f71255.wav,-1,Moss,7


In [57]:
# Categories to keep, and the integer class id given to each
# (the id is the position in this list).
my_classes = ['Moss','Jen']
map_class_to_id = {name: idx for idx, name in enumerate(my_classes)}
print(map_class_to_id)

# Keep only the rows whose category is one of the selected classes.
filtered_pd = pd_data[pd_data['category'].isin(my_classes)]

# Per-row class id looked up from the category name
# (0 for "Moss", 1 for "Jen").
class_id = filtered_pd['category'].map(map_class_to_id)

# Replace the target column with the class id derived above.
assigned_pd = filtered_pd.assign(target=class_id)

# Turn the bare file names into path strings under base_data_path.
full_path = assigned_pd['filename'].map(lambda name: str(base_data_path / name))
assigned_pd = assigned_pd.assign(filename=full_path)
assigned_pd.head()

{'Moss': 0, 'Jen': 1}


Unnamed: 0,filename,target,category,fold
2,../datasets/Jen-973efb42-af79-44b9-84f4-225306...,1,Jen,1
4,../datasets/Jen-0f693473-f947-4459-8c66-96fbd4...,1,Jen,1
5,../datasets/Jen-cc768a6f-e512-4550-b0da-791219...,1,Jen,1
6,../datasets/Jen-7e5d37d2-4255-4128-9215-73b056...,1,Jen,1
7,../datasets/Jen-43169bf4-dfb4-4e87-8bb8-a11cf5...,1,Jen,1


In [59]:
# How many rows belong to each fold.
print("fold value counts:\n{}".format(assigned_pd['fold'].value_counts()))

# Class balance inside fold 1: count rows per category, then turn the
# textual rendering of the counts into a list of whitespace-split rows.
balance = assigned_pd.loc[assigned_pd['fold'].isin([1]), 'category'].value_counts()
arr = list(map(str.split, balance.to_string().split('\n')))
print('balance (fold 1): {}'.format(arr))

fold value counts:
4    230
2    200
7    200
1    100
5    100
6    100
3     40
Name: fold, dtype: int64
balance (fold 1): [['Jen', '50'], ['Moss', '50']]


In [60]:
# Pull the three columns the pipeline needs out of the DataFrame and build
# a dataset whose elements are (filename, target, fold) triples.
filenames, targets, folds = (
    assigned_pd[column] for column in ('filename', 'target', 'fold')
)
main_ds = tf.data.Dataset.from_tensor_slices((filenames, targets, folds))
print(main_ds.element_spec)

(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))


In [61]:
def load_wav_for_map(filename,label,fold):
    """Replace the file name in a dataset element by its loaded audio.

    The label and fold are passed through unchanged.
    """
    waveform = load_wav_16k_mono(filename)
    return waveform, label, fold

# Elements become (waveform, label, fold).
main_ds = main_ds.map(load_wav_for_map)
main_ds.element_spec

(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [62]:
import tensorflow_hub as hub

# TF Hub handle of the YAMNet model (version 1).
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
# Load the model; it is called later by extract_embedding.
yamnet_model = hub.load(yamnet_model_handle)

In [63]:
# Map function that swaps each waveform for its YAMNet embeddings.
def extract_embedding(wav_data, label, fold):
    """Run YAMNet on ``wav_data`` and return its embeddings.

    The label and fold are repeated once per row of the embeddings so that
    every embedding row keeps the label and fold of its source clip.
    """
    _scores, embeddings, _spectrogram = yamnet_model(wav_data)
    # Executed while the function is traced by Dataset.map, so this prints
    # the symbolic shape tensor rather than concrete numbers.
    print(f'tf.shape(embeddings): {tf.shape(embeddings)}')
    row_count = tf.shape(embeddings)[0]
    repeated_label = tf.repeat(label, row_count)
    repeated_fold = tf.repeat(fold, row_count)
    return embeddings, repeated_label, repeated_fold

# Compute the embeddings, then flatten so each dataset element is a single
# embedding row with its label and fold.
main_ds = main_ds.map(extract_embedding)
main_ds = main_ds.unbatch()
main_ds.element_spec

tf.shape(embeddings): Tensor("Shape:0", shape=(2,), dtype=int32)


(TensorSpec(shape=(1024,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [64]:
# Cache the embedding dataset so the three fold filters below read from the
# cache instead of re-running the upstream pipeline.
cached_ds = main_ds.cache()

# Split by fold: 2-6 -> training, 7 -> validation, 1 -> test.
# The range test uses tf.logical_and rather than Python's `and`: the lambda
# receives tensors, and `and` on tensors only works when AutoGraph manages to
# rewrite the lambda, which is not guaranteed. The explicit op always works.
train_ds = cached_ds.filter(lambda embedding, label, fold: tf.logical_and(fold >= 2, fold <= 6))
val_ds = cached_ds.filter(lambda embedding, label, fold: fold == 7)
test_ds = cached_ds.filter(lambda embedding, label, fold: fold == 1)

# remove the folds column now that it's not needed anymore
remove_fold_column = lambda embedding, label, fold: (embedding, label)

train_ds = train_ds.map(remove_fold_column)
val_ds = val_ds.map(remove_fold_column)
test_ds = test_ds.map(remove_fold_column)

# Batch for training/evaluation; only the training split is shuffled.
train_ds = train_ds.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)

In [70]:
# Small classifier head on top of the 1024-dimensional embeddings:
# two regularised ELU layers with dropout, then one logit per class.
my_model = tf.keras.Sequential(name='my_model')
# shape must be a tuple; `(1024)` without the trailing comma is just the
# integer 1024, so spell the one-element tuple explicitly.
my_model.add(tf.keras.layers.Input(shape=(1024,),dtype=tf.float32,name='input_embedding'))
my_model.add(tf.keras.layers.Dense(512,activation='elu',kernel_regularizer=tf.keras.regularizers.l2(0.0001)))
my_model.add(tf.keras.layers.Dropout(0.5))
my_model.add(tf.keras.layers.Dense(512,activation='elu',kernel_regularizer=tf.keras.regularizers.l2(0.0001)))
my_model.add(tf.keras.layers.Dropout(0.5))
# No activation on the output layer: it emits raw logits.
my_model.add(tf.keras.layers.Dense(len(my_classes)))
my_model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_35 (Dense)            (None, 512)               524800    
                                                                 
 dropout_12 (Dropout)        (None, 512)               0         
                                                                 
 dense_36 (Dense)            (None, 512)               262656    
                                                                 
 dropout_13 (Dropout)        (None, 512)               0         
                                                                 
 dense_37 (Dense)            (None, 2)                 1026      
                                                                 
Total params: 788,482
Trainable params: 788,482
Non-trainable params: 0
_________________________________________________________________


In [71]:
# The model outputs raw logits, hence from_logits=True on the loss.
my_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
# No callbacks are passed, so training always runs for the full 20 epochs.
history = my_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [72]:
# Evaluate on the test split (fold 1). The model was compiled with a single
# 'accuracy' metric, so evaluate() yields the loss followed by the accuracy.
loss, accuracy = my_model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.534130871295929
Accuracy:  0.800000011920929
