# Loading YAMNet from weights for classification of spectrograms

Loading YAMNet from weights following the instructions from https://github.com/tensorflow/models/tree/master/research/audioset/yamnet

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import pandas as pd
import random 
import copy
from numpy import savez_compressed, load
import keras.backend as K
from keras import regularizers
from keras.layers import Lambda,InputLayer
from keras.layers.convolutional import Conv2D, MaxPooling2D, UpSampling2D
#from keras.layers.advanced_activations import LeakyReLU
from keras.layers.core import Activation, Dense
from keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply, GaussianNoise, Conv2DTranspose
from keras.models import Sequential, Model
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam
from keras.models import Sequential
from sklearn.model_selection import train_test_split
import librosa
import librosa.display
import gc
import IPython.display as ipd
from tqdm import tqdm

## Helper functions

### Yamnet parameters

In [2]:
"""Hyperparameters for YAMNet."""

from dataclasses import dataclass

# The following hyperparameters (except patch_hop_seconds) were used to train YAMNet,
# so expect some variability in performance if you change these. The patch hop can
# be changed arbitrarily: a smaller hop should give you more patches from the same
# clip and possibly better performance at a larger computational cost.
@dataclass(frozen=True)  # frozen: attributes cannot be reassigned after construction.
class Params:
  """Immutable bundle of YAMNet hyperparameters.

  Field order is significant because it defines the positional order of the
  generated constructor.
  """

  # Audio front end: sampling, STFT, mel filterbank and patch framing.
  sample_rate: float = 16000.0
  stft_window_seconds: float = 0.025
  stft_hop_seconds: float = 0.010
  mel_bands: int = 64
  mel_min_hz: float = 125.0
  mel_max_hz: float = 7500.0
  log_offset: float = 0.001
  patch_window_seconds: float = 0.96
  patch_hop_seconds: float = 0.48

  # Network and classifier settings.
  num_classes: int = 521
  conv_padding: str = 'same'
  batchnorm_center: bool = True
  batchnorm_scale: bool = False
  batchnorm_epsilon: float = 1e-4
  classifier_activation: str = 'sigmoid'

  # Selects the matmul-based STFT implementation in the feature code.
  tflite_compatible: bool = False

  @property
  def patch_frames(self):
    """Number of STFT frames in one patch (patch window / STFT hop, rounded)."""
    frames = self.patch_window_seconds / self.stft_hop_seconds
    return int(round(frames))

  @property
  def patch_bands(self):
    """Number of mel bands in one patch; same as mel_bands."""
    return self.mel_bands

### Feature computation functions

In [3]:
"""Feature computation for YAMNet."""

import numpy as np
import tensorflow as tf


def waveform_to_log_mel_spectrogram_patches(waveform, params):
  """Turn a 1-D waveform into a log mel spectrogram plus framed patches of it.

  Args:
    waveform: tensor of audio samples with shape [<# samples>].
    params: hyperparameter object supplying sample_rate, the STFT and patch
      window/hop durations, the mel band settings, log_offset and
      tflite_compatible.

  Returns:
    A pair (log_mel_spectrogram, features):
      log_mel_spectrogram: shape [<# STFT frames>, params.mel_bands].
      features: shape [<# patches>, <# STFT frames in a patch>,
        params.mel_bands].
  """
  with tf.name_scope('log_mel_features'):
    # STFT geometry in samples; the FFT size is the window length rounded up
    # to the next power of two.
    win_samples = int(round(params.sample_rate * params.stft_window_seconds))
    hop_samples = int(round(params.sample_rate * params.stft_hop_seconds))
    fft_size = 2 ** int(np.ceil(np.log(win_samples) / np.log(2.0)))
    stft_geometry = dict(
        frame_length=win_samples, frame_step=hop_samples, fft_length=fft_size)

    # Magnitude spectrogram, shape [<# STFT frames>, fft_size // 2 + 1].
    # Note that tf.signal.stft() uses a periodic Hann window by default.
    if params.tflite_compatible:
      magnitudes = _tflite_stft_magnitude(signal=waveform, **stft_geometry)
    else:
      magnitudes = tf.abs(tf.signal.stft(signals=waveform, **stft_geometry))

    # Project the linear-frequency bins onto the mel scale, then take the log
    # (log_offset keeps the argument strictly positive).
    mel_projection = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=params.mel_bands,
        num_spectrogram_bins=fft_size // 2 + 1,
        sample_rate=params.sample_rate,
        lower_edge_hertz=params.mel_min_hz,
        upper_edge_hertz=params.mel_max_hz)
    log_mel_spectrogram = tf.math.log(
        tf.matmul(magnitudes, mel_projection) + params.log_offset)

    # Slice the spectrogram along time into fixed-size patches (the input
    # examples). Only complete patches are emitted, so a waveform shorter than
    # params.patch_window_seconds yields nothing (zero-pad beforehand to
    # avoid this).
    spectrogram_rate = params.sample_rate / hop_samples  # STFT frames per second
    frames_per_patch = int(
        round(spectrogram_rate * params.patch_window_seconds))
    frames_per_hop = int(round(spectrogram_rate * params.patch_hop_seconds))
    features = tf.signal.frame(
        signal=log_mel_spectrogram,
        frame_length=frames_per_patch,
        frame_step=frames_per_hop,
        axis=0)

    return log_mel_spectrogram, features


def pad_waveform(waveform, params):
  """Append zeros so the waveform covers a whole number of patches.

  Args:
    waveform: 1-D tensor of audio samples.
    params: hyperparameter object; reads sample_rate, patch_window_seconds,
      patch_hop_seconds, stft_window_seconds and stft_hop_seconds.

  Returns:
    The waveform with trailing zero padding.
  """
  # One patch needs a full patch window of audio plus the extra samples that
  # let the last STFT analysis window complete.
  first_patch_seconds = (
      params.patch_window_seconds +
      params.stft_window_seconds - params.stft_hop_seconds)
  first_patch_samples = tf.cast(
      first_patch_seconds * params.sample_rate, tf.int32)
  n_samples = tf.shape(waveform)[0]
  padding = tf.maximum(0, first_patch_samples - n_samples)

  # Anything beyond the first patch is rounded up to a whole number of patch
  # hops so that the final partial hop still produces a patch.
  n_samples = tf.maximum(n_samples, first_patch_samples)
  surplus = n_samples - first_patch_samples
  hop = tf.cast(params.patch_hop_seconds * params.sample_rate, tf.int32)
  extra_hops = tf.cast(
      tf.math.ceil(tf.cast(surplus, tf.float32) / tf.cast(hop, tf.float32)),
      tf.int32)
  padding = padding + (hop * extra_hops - surplus)

  return tf.pad(waveform, [[0, padding]],
                mode='CONSTANT', constant_values=0.0)


def _tflite_stft_magnitude(signal, frame_length, frame_step, fft_length):
  """TF-Lite-compatible version of tf.abs(tf.signal.stft()).

  The signal is framed, multiplied by a Hann window, zero-padded to
  fft_length inside each frame and transformed with two real matmuls (real
  and imaginary DFT parts) instead of a complex FFT op.

  Args:
    signal: input tensor to frame along its last axis.
    frame_length: samples per analysis frame.
    frame_step: hop between consecutive frames, in samples.
    fft_length: DFT size; frames are zero-padded up to this length.

  Returns:
    Tensor of STFT magnitudes with fft_length // 2 + 1 bins per frame.
  """
  def _window_row():
    # Hann window sampled on [0, 1) in steps of 1/frame_length, shaped
    # [1, frame_length] so it broadcasts over the frame axis.
    phase = 2 * np.pi * np.arange(0, 1.0, 1.0 / frame_length)
    hann = (0.5 - 0.5 * np.cos(phase)).astype(np.float32)
    return tf.reshape(tf.constant(hann, name='hann_window'), [1, frame_length])

  def _full_dft(size):
    """Full complex DFT matrix, computed in NumPy."""
    # See https://en.wikipedia.org/wiki/DFT_matrix
    # No 1/sqrt(N) scaling, because tf.signal.rfft doesn't apply it either.
    omega = (0 + 1j) * 2.0 * np.pi / float(size)
    index = np.arange(size)
    return np.exp(omega * np.outer(index, index))

  def _real_input_dft(frames, size):
    """Real-input DFT of each frame, expressed as two matmuls."""
    # Keep only the first size // 2 + 1 rows (the "positive frequencies") and
    # transpose so the frames can be right-multiplied by the result.
    kept = _full_dft(size)[:(size // 2 + 1), :].transpose()
    real_part = tf.constant(
        np.real(kept).astype(np.float32), name='real_dft_matrix')
    imag_part = tf.constant(
        np.imag(kept).astype(np.float32), name='imaginary_dft_matrix')
    frame_size = tf.shape(frames)[-1]
    pad_before = (size - frame_size) // 2
    pad_after = size - frame_size - pad_before
    # Pad only inside each frame (before and after); the frame axis is left
    # untouched.
    centered = tf.pad(
        frames,
        [[0, 0], [pad_before, pad_after]],
        mode='CONSTANT',
        constant_values=0.0)
    return tf.matmul(centered, real_part), tf.matmul(centered, imag_part)

  frames = tf.signal.frame(signal, frame_length, frame_step)
  windowed = frames * _window_row()
  real, imag = _real_input_dft(windowed, fft_length)
  # Magnitude of the complex spectrum: sqrt(re^2 + im^2).
  return tf.sqrt(tf.add(real * real, imag * imag))

#### Modified `pad_waveform` with the front-end hyperparameters embedded in the function

In [4]:
def pad_waveform(waveform, params):
  """Pads waveform with silence if needed to get an integral number of patches.

  This variant hard-codes the sample rate, STFT window/hop and patch window;
  only the patch hop is taken from `params`.

  Args:
    waveform: 1-D tensor of audio samples.
    params: hyperparameter object; only `patch_hop_seconds` is read.

  Returns:
    The waveform with trailing zero padding.
  """
  # Front-end timing constants embedded in this variant.
  sample_rate = 16000.0
  stft_window_seconds = 0.025
  stft_hop_seconds = 0.010
  patch_window_seconds = 0.96

  # In order to produce one patch of log mel spectrogram input to YAMNet, we
  # need at least one patch window length of waveform plus enough extra samples
  # to complete the final STFT analysis window.
  min_waveform_seconds = (
      patch_window_seconds +
      stft_window_seconds - stft_hop_seconds)
  min_num_samples = tf.cast(min_waveform_seconds * sample_rate, tf.int32)
  num_samples = tf.shape(waveform)[0]
  num_padding_samples = tf.maximum(0, min_num_samples - num_samples)

  # In addition, there might be enough waveform for one or more additional
  # patches formed by hopping forward. If there are more samples than one patch,
  # round up to an integral number of hops.
  num_samples = tf.maximum(num_samples, min_num_samples)
  num_samples_after_first_patch = num_samples - min_num_samples
  hop_samples = tf.cast(params.patch_hop_seconds * sample_rate, tf.int32)
  num_hops_after_first_patch = tf.cast(tf.math.ceil(
          tf.cast(num_samples_after_first_patch, tf.float32) /
          tf.cast(hop_samples, tf.float32)), tf.int32)
  num_padding_samples += (
      hop_samples * num_hops_after_first_patch - num_samples_after_first_patch)

  padded_waveform = tf.pad(waveform, [[0, num_padding_samples]],
                           mode='CONSTANT', constant_values=0.0)
  return padded_waveform

### Yamnet core model

In [5]:
def _batch_norm(name, params):
  """Return a callable that applies a BatchNormalization layer called `name`.

  The center/scale/epsilon settings are read from `params` when the returned
  callable is invoked.
  """
  return lambda layer_input: layers.BatchNormalization(
      name=name,
      center=params.batchnorm_center,
      scale=params.batchnorm_scale,
      epsilon=params.batchnorm_epsilon)(layer_input)


def _conv(name, kernel, stride, filters, params):
  """Return a callable applying Conv2D -> batch norm -> ReLU.

  Layers are named '<name>/conv', '<name>/conv/bn' and '<name>/relu'. The
  convolution has no bias and no activation of its own.
  """
  def _apply(tensor):
    convolution = layers.Conv2D(name=f'{name}/conv',
                                filters=filters,
                                kernel_size=kernel,
                                strides=stride,
                                padding=params.conv_padding,
                                use_bias=False,
                                activation=None)
    tensor = convolution(tensor)
    tensor = _batch_norm(f'{name}/conv/bn', params)(tensor)
    return layers.ReLU(name=f'{name}/relu')(tensor)
  return _apply


def _separable_conv(name, kernel, stride, filters, params):
  """Return a callable applying a depthwise-separable convolution stage.

  The stage is DepthwiseConv2D -> batch norm -> ReLU followed by a 1x1 Conv2D
  -> batch norm -> ReLU. `kernel` and `stride` apply to the depthwise step;
  `filters` sets the output channels of the pointwise step.
  """
  depthwise = f'{name}/depthwise_conv'
  pointwise = f'{name}/pointwise_conv'

  def _apply(tensor):
    # Depthwise step: one spatial filter per input channel.
    tensor = layers.DepthwiseConv2D(name=depthwise,
                                    kernel_size=kernel,
                                    strides=stride,
                                    depth_multiplier=1,
                                    padding=params.conv_padding,
                                    use_bias=False,
                                    activation=None)(tensor)
    tensor = _batch_norm(f'{depthwise}/bn', params)(tensor)
    tensor = layers.ReLU(name=f'{depthwise}/relu')(tensor)

    # Pointwise step: 1x1 convolution mixing channels.
    tensor = layers.Conv2D(name=pointwise,
                           filters=filters,
                           kernel_size=(1, 1),
                           strides=1,
                           padding=params.conv_padding,
                           use_bias=False,
                           activation=None)(tensor)
    tensor = _batch_norm(f'{pointwise}/bn', params)(tensor)
    return layers.ReLU(name=f'{pointwise}/relu')(tensor)
  return _apply


# Backbone layout: one standard convolution followed by 13 depthwise-separable
# stages. Every entry is (layer_function, kernel, stride, num_filters).
_YAMNET_LAYER_DEFS = [(_conv, [3, 3], 2, 32)] + [
    (_separable_conv, [3, 3], stride, filters)
    for stride, filters in (
        (1, 64),
        (2, 128), (1, 128),
        (2, 256), (1, 256),
        (2, 512), (1, 512), (1, 512), (1, 512), (1, 512), (1, 512),
        (2, 1024), (1, 1024),
    )
]

def yamnet(features, params):
  """Define the core YAMNet model in Keras.

  Args:
    features: batch of log mel patches, each of shape
      (params.patch_frames, params.patch_bands).
    params: hyperparameter object (patch size, num_classes,
      classifier_activation and the settings used by the layer builders).

  Returns:
    (predictions, embeddings): class scores after the classifier activation,
    and the globally average-pooled backbone output.
  """
  # Add a trailing channel axis so the patches can go through 2-D convolutions.
  net = layers.Reshape(
      (params.patch_frames, params.patch_bands, 1),
      input_shape=(params.patch_frames, params.patch_bands))(features)
  for (i, (layer_fun, kernel, stride, filters)) in enumerate(_YAMNET_LAYER_DEFS):
    net = layer_fun('layer{}'.format(i + 1), kernel, stride, filters, params)(net)
  embeddings = layers.GlobalAveragePooling2D()(net)
  logits = layers.Dense(units=params.num_classes, use_bias=True)(embeddings)
  predictions = layers.Activation(activation=params.classifier_activation)(logits)
  return predictions, embeddings


# Private handle on the builder above. A later notebook cell rebinds the name
# `yamnet` to the loaded Model instance; going through this handle keeps
# yamnet_frames_model() working when that cell is re-run.
_yamnet_core = yamnet


def yamnet_frames_model(params):
  """Defines the YAMNet waveform-to-class-scores model.
  Args:
    params: An instance of Params containing hyperparameters.
  Returns:
    A model accepting (num_samples,) waveform input and emitting:
    - predictions: (num_patches, num_classes) matrix of class scores per time frame
    - embeddings: (num_patches, embedding size) matrix of embeddings per time frame
    - log_mel_spectrogram: (num_spectrogram_frames, num_mel_bins) spectrogram feature matrix
  """
  waveform = layers.Input(batch_shape=(None,), dtype=tf.float32)
  waveform_padded = pad_waveform(waveform, params)
  log_mel_spectrogram, features = waveform_to_log_mel_spectrogram_patches(
      waveform_padded, params)
  # Call the builder through its private handle, not the rebindable name.
  predictions, embeddings = _yamnet_core(features, params)
  frames_model = Model(
      name='yamnet_frames', inputs=waveform,
      outputs=[predictions, embeddings, log_mel_spectrogram])
  return frames_model

In [6]:
# Colab-only: mount Google Drive at /content/gdrive, the root of the weight and
# data paths used in the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Yamnet loading from code and weights

In [7]:
# Build the waveform-to-scores model with the default hyperparameters, then
# load the weights stored in yamnet.h5 on Drive.
# NOTE: this rebinds the name `yamnet` (until now the core-model builder
# function) to the Model instance.
params = Params()
yamnet = yamnet_frames_model(params)
yamnet.load_weights("/content/gdrive/MyDrive/weights/yamnet.h5")

In [8]:
yamnet.summary()

Model: "yamnet_frames"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 tf.compat.v1.shape (TFOpLambda  (1,)                0           ['input_1[0][0]']                
 )                                                                                                
                                                                                                  
 tf.__operators__.getitem (Slic  ()                  0           ['tf.compat.v1.shape[0][0]']     
 ingOpLambda)                                                                                     
                                                                                      

In [9]:
# Print each layer's position and name to see where preprocessing ends and
# the convolutional backbone begins.
for position, yamnet_layer in enumerate(yamnet.layers):
    print(position, yamnet_layer.name)

0 input_1
1 tf.compat.v1.shape
2 tf.__operators__.getitem
3 tf.math.maximum_1
4 tf.math.subtract_1
5 tf.cast
6 tf.math.truediv
7 tf.math.ceil
8 tf.cast_1
9 tf.math.subtract
10 tf.math.multiply
11 tf.math.maximum
12 tf.math.subtract_2
13 tf.__operators__.add
14 tf.compat.v1.pad
15 tf.signal.stft
16 tf.math.abs
17 tf.linalg.matmul
18 tf.__operators__.add_1
19 tf.math.log
20 tf.signal.frame
21 reshape
22 layer1/conv
23 layer1/conv/bn
24 layer1/relu
25 layer2/depthwise_conv
26 layer2/depthwise_conv/bn
27 layer2/depthwise_conv/relu
28 layer2/pointwise_conv
29 layer2/pointwise_conv/bn
30 layer2/pointwise_conv/relu
31 layer3/depthwise_conv
32 layer3/depthwise_conv/bn
33 layer3/depthwise_conv/relu
34 layer3/pointwise_conv
35 layer3/pointwise_conv/bn
36 layer3/pointwise_conv/relu
37 layer4/depthwise_conv
38 layer4/depthwise_conv/bn
39 layer4/depthwise_conv/relu
40 layer4/pointwise_conv
41 layer4/pointwise_conv/bn
42 layer4/pointwise_conv/relu
43 layer5/depthwise_conv
44 layer5/depthwise_conv/bn

Remove the waveform-preprocessing layers to create a YAMNet sub-model that takes log mel patches as input

In [10]:
# New model that takes (96, 64) log mel patches directly, skipping the
# waveform preprocessing ops at the start of `yamnet`.
mel_yamnet = Sequential()
mel_yamnet.add(Input(shape=(96, 64)))

# Reuse the frozen YAMNet backbone: every layer from the Reshape that opens the
# core network up to and including the global average pooling. The boundaries
# are found by layer type rather than by position, so they stay correct if the
# number of preprocessing ops in front of the backbone changes.
inside_backbone = False
for layer in yamnet.layers:
    if isinstance(layer, layers.Reshape):
        inside_backbone = True
    if not inside_backbone:
        continue
    layer.trainable = False
    mel_yamnet.add(layer)
    if isinstance(layer, layers.GlobalAveragePooling2D):
        break  # YAMNet's own classifier (Dense + activation) is not reused.

# Trainable 10-class head that replaces YAMNet's classifier.
for units in (256, 64, 16):
    mel_yamnet.add(tf.keras.layers.Dense(
        units, activation='relu',
        kernel_regularizer=tf.keras.regularizers.L2(0.002)))
    mel_yamnet.add(tf.keras.layers.Dropout(0.5))
mel_yamnet.add(tf.keras.layers.Dense(10, activation="softmax"))

In [11]:
mel_yamnet.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 96, 64, 1)         0         
                                                                 
 layer1/conv (Conv2D)        (None, 48, 32, 32)        288       
                                                                 
 layer1/conv/bn (BatchNormal  (None, 48, 32, 32)       96        
 ization)                                                        
                                                                 
 layer1/relu (ReLU)          (None, 48, 32, 32)        0         
                                                                 
 layer2/depthwise_conv (Dept  (None, 48, 32, 32)       288       
 hwiseConv2D)                                                    
                                                                 
 layer2/depthwise_conv/bn (B  (None, 48, 32, 32)       9

## Feature generation using YAMNet auxiliary functions

In [12]:
params = Params()

Dataset of 4000 reconstructed spectrograms

In [13]:
# Open the .npz archive of reconstructed spectrograms stored on Drive.
reconstructed = load("/content/gdrive/MyDrive/UrbanSound8K/dati/full_reconstructed_16kHz_500.npz")
# extract the first array ('arr_0' is NumPy's default key for the first
# array saved positionally)
rec = reconstructed['arr_0']

In [14]:
# Read the label table, then keep only the entry keyed '0' (presumably the
# single label column of the CSV — TODO confirm).
labels = pd.read_csv("/content/gdrive/MyDrive/UrbanSound8K/dati/all_labels.csv")
labels = labels['0']

In [15]:
rec.shape

(4000, 64, 434)

Feature framing following "Input: Audio Features" section of https://github.com/tensorflow/models/tree/master/research/audioset/yamnet

In [16]:
x_mel_reconstructed = rec[1]

In [18]:
rec.shape

(4000, 64, 434)

In [19]:
def frame_spectrograms(reconstructed_spectrograms, frame_length=96,
                       hop_length=48, log_offset=0.001):
  """Log-scale mel spectrograms and cut them into overlapping patches.

  Args:
    reconstructed_spectrograms: iterable of 2-D arrays shaped
      (mel_bands, time_frames) holding linear-scale mel values.
    frame_length: time frames per patch.
    hop_length: time frames between the starts of consecutive patches.
    log_offset: constant added before the logarithm.

  Returns:
    np.ndarray built from one (num_patches, frame_length, mel_bands) float64
    array per spectrogram. For equally wide inputs the result has shape
    (num_spectrograms, num_patches, frame_length, mel_bands). Only complete
    patches are emitted, so an input narrower than frame_length gives zero
    patches.
  """
  framed_features = []
  for mel in reconstructed_spectrograms:
    mel = np.asarray(mel)
    # log() of a non-positive argument gives nan/-inf; those values are
    # replaced on the next line, so the floating-point warning is silenced.
    with np.errstate(invalid='ignore', divide='ignore'):
      log_mel = np.log(mel + log_offset)
    log_mel = np.nan_to_num(log_mel)  # replace nan with zeros

    num_bands, num_time_frames = log_mel.shape
    # "+ 1" keeps the last complete patch when the remaining width is an exact
    # multiple of hop_length.
    starts = range(0, num_time_frames - frame_length + 1, hop_length)
    # Size the buffer from the actual patch count so no slot is left
    # uninitialised and no write goes out of bounds.
    patches = np.zeros((len(starts), frame_length, num_bands))
    for patch_index, start in enumerate(starts):
      # Slices are (mel_bands, frame_length); transpose to time-major.
      patches[patch_index] = log_mel[:, start:start + frame_length].T
    framed_features.append(patches)
  return np.array(framed_features)

In [20]:
reconstructed_spectrograms = frame_spectrograms(rec)

  after removing the cwd from sys.path.


In [21]:
reconstructed_spectrograms.shape

(4000, 8, 96, 64)

In [22]:
# Move the patch axis to the end: (N, 8, 96, 64) -> (N, 96, 64, 8).
# np.moveaxis permutes the axes. A plain reshape to the target shape would
# keep the memory order and mix values from different bands and frames into
# the last axis instead of the 8 patches.
# Run this cell once per framing step; re-running it permutes the axes again.
reconstructed_spectrograms = np.moveaxis(reconstructed_spectrograms, 1, -1)

In [23]:
reconstructed_spectrograms.shape

(4000, 96, 64, 8)

Sub-model compiling and fitting 

In [24]:
# The final Dense layer already applies softmax, so the model outputs
# probabilities; the loss must therefore be built with from_logits=False.
mel_yamnet.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                   optimizer="adam",
                   metrics=['accuracy'])

In [25]:
# Patch average: collapse the last axis (size 8) by taking its mean, leaving
# one (96, 64) input per clip.
ex = np.mean(reconstructed_spectrograms, axis=3) # patch average
ex.shape

(4000, 96, 64)

In [26]:
# Train on the averaged patches; the labels are sliced to the first 4000
# entries to match the 4000 inputs.
mel_yamnet.fit(ex, labels[0:4000], batch_size=128, epochs=30)

Epoch 1/30


  return dispatch_target(*args, **kwargs)


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fd24fac2850>

In [None]:
# Drop the reference to the trained model and run a garbage-collection pass.
import gc 
del mel_yamnet
gc.collect() 

2200