In [12]:
import tensorflow as tf
import numpy as np
import os

In [15]:
# Load the labelled clips (one sub-directory per class) and split 80/20.
# output_sequence_length pins every clip to 16000 samples (pad or truncate),
# so every batch has the fixed length that the spectrogram step reshapes to.
# Without it, batches are only padded to the longest clip they contain.
ds_train, ds_val = tf.keras.utils.audio_dataset_from_directory(
    directory='train/audio',
    batch_size=256,
    validation_split=0.2,
    subset='both',
    seed=1337,
    output_sequence_length=16000,
)

Found 64721 files belonging to 30 classes.
Using 51777 files for training.
Using 12944 files for validation.


In [16]:
# Fetch exactly one batch for inspection. next() raises right here if the
# dataset yields nothing, instead of leaving x / y unbound or stale.
x, y = next(iter(ds_train.take(1)))
x.shape, y.shape

(TensorShape([256, 16000, 1]), TensorShape([256]))

In [17]:
def get_spectrograms(waveforms, labels, num_samples=16000,
                     frame_length=255, frame_step=128):
  """Turn a batch of waveforms into magnitude spectrograms.

  Args:
    waveforms: Audio tensor that is reshaped into rows of `num_samples`
      values, e.g. a batch of shape (batch, num_samples, 1).
    labels: Labels for the batch; passed through unchanged.
    num_samples: Number of samples per clip used for the reshape.
    frame_length: STFT window length, in samples.
    frame_step: STFT hop size, in samples.

  Returns:
    A `(spectrogram, labels)` tuple. The spectrogram is the STFT magnitude
    with a trailing axis of size 1 appended. With the defaults, a
    (256, 16000, 1) batch yields a (256, 124, 129, 1) spectrogram.
  """
  # Flatten to (batch, num_samples); stft operates on the last axis.
  waveforms = tf.reshape(waveforms, [-1, num_samples])
  spectrogram = tf.signal.stft(
      waveforms, frame_length=frame_length, frame_step=frame_step)
  # Keep only the magnitude of the complex STFT output.
  spectrogram = tf.abs(spectrogram)
  # Append a trailing size-1 axis so the result is 4-D like an image batch.
  spectrogram = spectrogram[..., tf.newaxis]
  return spectrogram, labels

In [18]:
# Run the conversion eagerly on the sample batch and check the result shapes.
spec, label = get_spectrograms(waveforms=x, labels=y)
spec.shape, label.shape

(TensorShape([256, 124, 129, 1]), TensorShape([256]))

In [20]:
# Convert both splits to spectrograms. The STFT runs in parallel and batches
# are prefetched so preprocessing overlaps with training; element order and
# values are unchanged.
# NOTE: this cell rebinds ds_train / ds_val, so re-run the loading cell
# before running it a second time.
ds_train = ds_train.map(
    get_spectrograms, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)
ds_val = ds_val.map(
    get_spectrograms, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

In [21]:
# Fetch one batch from the mapped dataset. next() raises if nothing is
# yielded, so x / y can never silently keep values from an earlier cell.
x, y = next(iter(ds_train.take(1)))
x.shape, y.shape

(TensorShape([256, 124, 129, 1]), TensorShape([256]))

In [22]:
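# Optional input normalization, currently disabled (the model below does not use it).
# NOTE: `ds_mapped` is not defined in the cells shown here; the spectrogram
# dataset is named `ds_train`, so adapt on that if this is re-enabled.
# norm_layer = tf.keras.layers.Normalization()
# norm_layer.adapt(data=ds_mapped.map(map_func=lambda spec, label: spec))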

In [23]:
# Small CNN over spectrogram "images": shrink the input, apply two
# convolutions, pool, then classify with a dense head.
model = tf.keras.Sequential(layers=[
    tf.keras.layers.Input(shape=(124, 129, 1)),
    # Downsample to 32x32 before the convolutions.
    tf.keras.layers.Resizing(height=32, width=32),
    tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu'),
    tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=128, activation='relu'),
    # One output per class, with no activation (raw logits).
    tf.keras.layers.Dense(units=30),
])

In [24]:
# Integer labels with raw-logit outputs: sparse categorical cross-entropy
# with from_logits=True.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [25]:
# Train for 10 epochs, validating after each one. Keep the returned History
# so per-epoch metrics can be inspected later without relying on `_` / Out[N].
history = model.fit(ds_train, epochs=10, validation_data=ds_val)

Epoch 1/10


I0000 00:00:1711437820.616726 1269653 service.cc:145] XLA service 0x7fbbec0076f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1711437820.626623 1269653 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 2060 with Max-Q Design, Compute Capability 7.5
2024-03-26 08:23:41.460478: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-03-26 08:23:49.771996: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m  3/203[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 34ms/step - accuracy: 0.0469 - loss: 3.4015    

I0000 00:00:1711437846.198004 1269653 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 388ms/step - accuracy: 0.2938 - loss: 2.6112 - val_accuracy: 0.6304 - val_loss: 1.3378
Epoch 2/10
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 214ms/step - accuracy: 0.6812 - loss: 1.1497 - val_accuracy: 0.7559 - val_loss: 0.8976
Epoch 3/10
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 125ms/step - accuracy: 0.7947 - loss: 0.7424 - val_accuracy: 0.7898 - val_loss: 0.7731
Epoch 4/10
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 133ms/step - accuracy: 0.8463 - loss: 0.5478 - val_accuracy: 0.8113 - val_loss: 0.7038
Epoch 5/10
[1m 98/203[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m14s[0m 136ms/step - accuracy: 0.8762 - loss: 0.4299