In [3]:
import os

from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio

In [4]:
# Pretrained YAMNet from TensorFlow Hub. Later cells call it on a waveform and
# unpack three outputs: (scores, embeddings, spectrogram).
yamnet_model_handle = "https://tfhub.dev/google/yamnet/1"
yamnet_model = hub.load(handle=yamnet_model_handle)

In [14]:
# Instrument labels. The classifier's output index i is looked up as
# my_classes[i] at inference time, so this order must match the order used
# when the saved weights were trained.
my_classes = ['Sound_Guitar', 'Sound_Drum', 'Sound_Violin', 'Sound_Piano']

# Small dense classifier head that consumes per-frame YAMNet embeddings
# (1024 floats each) and emits one raw logit per instrument class.
my_model = tf.keras.Sequential([
    # `shape` must be a tuple: `(1024)` is just the int 1024, which newer
    # Keras versions reject; `(1024,)` is accepted everywhere.
    tf.keras.layers.Input(shape=(1024,), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(350, activation='relu'),
    tf.keras.layers.Dense(300, activation='relu'),
    tf.keras.layers.Dense(200, activation='relu'),
    # No activation: the outputs are logits.
    tf.keras.layers.Dense(len(my_classes))
], name='my_model')

my_model.summary()

# Restore the previously trained weights. The layer stack above has to match
# the architecture the checkpoint was written from.
my_model.load_weights('./music-instruments-sounds-weights2')

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 512)               524800    
                                                                 
 dense_9 (Dense)             (None, 350)               179550    
                                                                 
 dense_10 (Dense)            (None, 300)               105300    
                                                                 
 dense_11 (Dense)            (None, 200)               60200     
                                                                 
 dense_12 (Dense)            (None, 4)                 804       
                                                                 
Total params: 870,654
Trainable params: 870,654
Non-trainable params: 0
_________________________________________________________________


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x22d8051c940>

In [16]:
# A fixed scalar-string signature makes tf.function build a single graph.
# Without it, every distinct Python-string filename triggers a new trace.
@tf.function(input_signature=[tf.TensorSpec(shape=(), dtype=tf.string)])
def load_wav_16k_mono(filename):
    """Load a WAV file as a 16 kHz, single-channel float waveform.

    Args:
        filename: Path of the WAV file, as a Python string or a scalar
            string tensor.

    Returns:
        The decoded audio with the channel axis removed, resampled to
        16000 Hz.
    """
    file_contents = tf.io.read_file(filename)
    # desired_channels=1 requests a single (mono) channel from the decoder.
    wav, sample_rate = tf.audio.decode_wav(
          file_contents,
          desired_channels=1)
    # Drop the trailing channel axis (size 1 after the mono request).
    wav = tf.squeeze(wav, axis=-1)
    # The original casts the rate to int64 before resampling; keep that.
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    return wav



In [17]:
# Configure the classifier for training/evaluation. `from_logits=True` tells
# the loss that the model outputs are raw logits rather than probabilities.
my_model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Early-stopping callback: watches the training loss, waits 3 epochs without
# improvement, then restores the best weights seen.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True,
)

In [18]:
# Load a test clip (a drum loop, going by the file name) and render an
# inline audio player so it can be auditioned.
testing_wav_file_name = './dataset-16bit/Test16/pop-drums-loops-3-11278.wav'
testing_wav_data = load_wav_16k_mono(testing_wav_file_name)

# The waveform was resampled to 16 kHz by the loader, so play it at that rate.
display.Audio(
    testing_wav_data,
    rate=16000,
)





In [21]:
# Switch the test clip (a violin recording, going by the file name).
# NOTE: this rebinds testing_wav_file_name / testing_wav_data, so the cells
# below classify this clip rather than the previous one.
testing_wav_file_name = './dataset-16bit/Test16/Sad-Violin-Fast-E-www.fesliyanstudios.com.wav'
testing_wav_data = load_wav_16k_mono(testing_wav_file_name)

# The waveform was resampled to 16 kHz by the loader, so play it at that rate.
display.Audio(
    testing_wav_data,
    rate=16000,
)





In [23]:
# Run YAMNet on the waveform, then feed its embeddings to the custom head.
scores, embeddings, spectrogram = yamnet_model(testing_wav_data)
result = my_model(embeddings).numpy()

# Average the outputs over axis 0, then take the highest-scoring class index
# and map it back to its label.
inferred_class = my_classes[np.argmax(np.mean(result, axis=0))]
inferred_class

'Sound_Piano'

In [None]:
# YAMNet's own prediction for the clip (its built-in label set, NOT my_classes).
scores, embeddings, spectrogram = yamnet_model(testing_wav_data)
class_scores = tf.reduce_mean(scores, axis=0)
top_class = tf.argmax(class_scores)

# `top_class` is a scalar tensor. Its value is read with `.numpy()`;
# `.value_index` is graph bookkeeping, not the argmax result, and is not
# available on eager tensors.
top_class_index = int(top_class.numpy())
print(top_class_index)

# The index refers to YAMNet's class map, not to `my_classes`, so the name is
# looked up in the CSV shipped with the hub model. A separate variable is used
# so the custom classifier's `inferred_class` is not overwritten.
class_map_path = yamnet_model.class_map_path().numpy().decode('utf-8')
yamnet_class_names = list(pd.read_csv(class_map_path)['display_name'])
yamnet_top_class_name = yamnet_class_names[top_class_index]

print(f'The main sound is: {yamnet_top_class_name}')
print(f'The embeddings shape: {embeddings.shape}')