### Loading libraries


In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import soundfile as sf
import tensorflow_hub as hub
import warnings
# tf.data.AUTOTUNE is the stable spelling; tf.data.experimental.AUTOTUNE is a
# deprecated alias of the same constant.
AUTOTUNE = tf.data.AUTOTUNE
# Hide Python warnings so that cell outputs stay readable.
warnings.filterwarnings('ignore')







### Preparing dataset for training


In [2]:
train_dir='Data/train/'
test_dir='Data/test/'

# One sub-folder per class. Only directories are kept so that stray files in
# the training folder (e.g. Thumbs.db, .DS_Store) never turn into a bogus class
# and inflate len(classes), which sizes the classifier's output layer.
# The position of a name in this list is used later to decode predictions
# (classes[argmax]), so the listing order is deliberately left untouched.
classes=[entry for entry in os.listdir(train_dir)
         if os.path.isdir(os.path.join(train_dir,entry))]
print(classes)

['azaspi1', 'chcant2', 'houspa', 'redcro', 'wbwwre1']


In [3]:
def load_dataset(dataset_dir, class_names=None, desired_samples=44100):
    """Load the WAV clips stored under ``dataset_dir`` as fixed-length tensors.

    ``dataset_dir`` must contain one sub-folder per class. Every ``.wav`` file
    found directly inside a class folder is decoded as mono audio and cropped
    or zero-padded to ``desired_samples`` samples.

    Args:
        dataset_dir: Root folder of the split, e.g. ``'Data/train/'``.
        class_names: Optional ordered list of class folder names. The label of
            a clip is the index of its folder in this list, so passing the same
            list for every split guarantees identical label ids. When omitted,
            the sub-folders of ``dataset_dir`` are used in ``os.listdir`` order.
        desired_samples: Number of samples kept per clip.

    Returns:
        Tuple ``(x, y)`` of tensors: ``x`` stacks the waveforms (one row per
        clip, ``desired_samples`` values each) and ``y`` holds the matching
        integer labels.
    """
    if class_names is None:
        # Only directories can be classes; stray files in the root are ignored.
        class_names = [entry for entry in os.listdir(dataset_dir)
                       if os.path.isdir(os.path.join(dataset_dir, entry))]

    x = []
    y = []
    # The label is tied to the folder's position, so an empty or missing class
    # folder keeps its id reserved instead of shifting every later label.
    for label, class_name in enumerate(class_names):
        class_dir = os.path.join(dataset_dir, class_name)
        if not os.path.isdir(class_dir):
            continue

        # Skip anything that is not a WAV file; decode_wav would fail on it.
        wav_files = sorted(entry for entry in os.listdir(class_dir)
                           if entry.lower().endswith('.wav'))
        print(class_dir, label, len(wav_files))

        for filename in wav_files:
            file_path = os.path.join(class_dir, filename)
            audio = tf.io.read_file(file_path)
            # NOTE(review): the decoded sample rate is discarded and no
            # resampling is done. YAMNet is documented as expecting 16 kHz
            # mono input -- confirm the files' sample rate or resample first.
            audio, _ = tf.audio.decode_wav(audio,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
            # (samples, 1) -> (samples,)
            audio = tf.squeeze(audio, axis=-1)

            x.append(audio)
            y.append(label)
    return tf.convert_to_tensor(x), tf.convert_to_tensor(y)

In [4]:
# Load both splits. load_dataset prints "<folder> <label> <file count>" for each
# class, which makes it easy to check below that train and test assign the
# same label to the same class folder.
x_train,y_train=load_dataset(train_dir)
x_test,y_test=load_dataset(test_dir)

Data/train/azaspi1 0 40
Data/train/chcant2 1 47
Data/train/houspa 2 59
Data/train/redcro 3 34
Data/train/wbwwre1 4 61
Data/test/azaspi1 0 13
Data/test/chcant2 1 10
Data/test/houspa 2 11
Data/test/redcro 3 6
Data/test/wbwwre1 4 13


In [5]:
# Sanity check: (number of training clips, samples per clip).
x_train.shape

TensorShape([241, 44100])

### Loading the pre-trained YAMNet model


In [6]:
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)


def get_features(audio):
    """Run ``yamnet_model`` on one waveform and return its second output.

    The model returns three tensors; only the middle one is kept and used as
    the input features of the classifier trained below.
    """
    _unused_first, embeddings, _unused_last = yamnet_model(audio)
    return embeddings













### Extracting Features from pre-trained model


In [7]:
def _embed_clips(clips):
    """Apply get_features to every clip and pack the results into one tensor."""
    per_clip = [get_features(clip) for clip in clips]
    return tf.convert_to_tensor(per_clip)


x_train_feat = _embed_clips(x_train)
x_test_feat = _embed_clips(x_test)
print(x_train_feat.shape, x_test_feat.shape)

(241, 5, 1024) (53, 5, 1024)


In [8]:
# The stacked embeddings already have the (clips, frames, 1024) layout that the
# classifier expects, so no squeeze is applied here. An axis-less tf.squeeze
# removes *every* size-1 dimension, which would silently drop the batch axis
# whenever a split contains exactly one clip.
x_train_feat.shape,x_test_feat.shape

(TensorShape([241, 5, 1024]), TensorShape([53, 5, 1024]))

### Training the classifier


In [9]:
# Small classifier head on top of the per-clip embeddings: two dense layers
# applied to each of the 5 frames, flattened, then one logit per class.
my_model = tf.keras.Sequential(name='my_model')
my_model.add(tf.keras.layers.Input(shape=(5,1024),
                                   dtype=tf.float32,
                                   name='input_embedding'))
my_model.add(tf.keras.layers.Dense(512, activation='relu'))
my_model.add(tf.keras.layers.Dense(64, activation='relu'))
my_model.add(tf.keras.layers.Flatten())
# No activation on the last layer: the model outputs raw logits.
my_model.add(tf.keras.layers.Dense(len(classes)))

my_model.summary()







Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 5, 512)            524800    
                                                                 
 dense_1 (Dense)             (None, 5, 64)             32832     
                                                                 
 flatten (Flatten)           (None, 320)               0         
                                                                 
 dense_2 (Dense)             (None, 5)                 1605      
                                                                 
Total params: 559237 (2.13 MB)
Trainable params: 559237 (2.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# from_logits=True because the last Dense layer of my_model has no activation.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
my_model.compile(optimizer="adam", loss=loss_fn, metrics=['accuracy'])

# Stop once validation accuracy has not improved for 3 epochs and roll the
# weights back to the best epoch seen.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

In [11]:
# NOTE(review): the test split is used as validation data, so early stopping
# and restore_best_weights are driven by test accuracy -- the reported test
# score is therefore optimistic. Consider a separate validation split.
fit_options = dict(
    epochs=20,
    validation_data=(x_test_feat, y_test),
    verbose=1,
    callbacks=[callback],
)
history = my_model.fit(x_train_feat, y_train, **fit_options)

Epoch 1/20






Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


### Testing classifier


In [13]:
# Decode one clip exactly the way load_dataset does.
wav_bytes = tf.io.read_file('Data/train/chcant2/XC118040.wav')
audio, sr = tf.audio.decode_wav(wav_bytes,
                                desired_channels=1,
                                desired_samples=44100)
audio = tf.squeeze(audio, axis=-1)

# Embed the clip, add a batch axis of size 1 and classify it.
feature = get_features(audio)
embeddings = tf.convert_to_tensor([feature])
result = my_model.predict(embeddings)

# Average over the batch axis, then pick the highest-scoring class.
mean_scores = np.mean(result, axis=0)
inferred_class = classes[np.argmax(mean_scores)]
print(f'The main sound is: {inferred_class}')

The main sound is: chcant2


### Converting to single model


In [14]:
class ReduceMeanLayer(tf.keras.layers.Layer):
  """Keras layer that averages its input along a fixed axis.

  Args:
    axis: Axis passed to ``tf.math.reduce_mean`` (default 0).
    **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
  """

  def __init__(self, axis=0, **kwargs):
    super().__init__(**kwargs)
    self.axis = axis

  def call(self, input):
    return tf.math.reduce_mean(input, axis=self.axis)

  def get_config(self):
    # Include the constructor argument so the layer can be re-created from its
    # config (Keras serialization, cloning) with the same axis.
    config = super().get_config()
    config.update({'axis': self.axis})
    return config

In [15]:
# Build one end-to-end model: raw waveform -> YAMNet -> trained classifier.
input_segment = tf.keras.layers.Input(shape=(), dtype=tf.float32, name='audio')

# The wrappers get their own variable names on purpose. Rebinding
# `yamnet_model` / `my_model` to KerasLayer objects would make this cell
# non-idempotent and break earlier cells on re-run (get_features,
# my_model.predict).
yamnet_layer = hub.KerasLayer(yamnet_model, trainable=False, name='yamnet')
_, embeddings_output, _ = yamnet_layer(input_segment)

classifier_layer = hub.KerasLayer(my_model, trainable=False, name='my_model')
# Wrap the embeddings in a batch axis of size 1, as done when testing the
# classifier on a single clip.
serving_outputs = classifier_layer(tf.convert_to_tensor([embeddings_output]))
# Collapse that batch axis again so the model returns one score per class.
serving_outputs = ReduceMeanLayer(axis=0, name='classifier')(serving_outputs)
serving_model = tf.keras.Model(input_segment, serving_outputs)

In [16]:
# Inspect the combined graph: audio input -> yamnet -> my_model -> classifier.
serving_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 audio (InputLayer)          [(None,)]                 0         
                                                                 
 yamnet (KerasLayer)         [(None, 521),             0         
                              (None, 1024),                      
                              (None, 64)]                        
                                                                 
 tf.convert_to_tensor (TFOp  (1, None, 1024)           0         
 Lambda)                                                         
                                                                 
 my_model (KerasLayer)       (1, 5)                    559237    
                                                                 
 classifier (ReduceMeanLaye  (5,)                      0         
 r)                                                          

### Testing final model


In [17]:
# Feed a raw waveform straight into the combined model; the output is one
# score per class.
wav_bytes = tf.io.read_file('Data/train/houspa/XC112666.wav')
waveform, sr = tf.audio.decode_wav(wav_bytes,
                                   desired_channels=1,
                                   desired_samples=44100)
audio = tf.squeeze(waveform, axis=-1)
serving_model(audio)

<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([-1.2167687, -3.430489 ,  2.9077516, -0.7017541,  1.819451 ],
      dtype=float32)>

### Saving the Final Model


In [18]:
# Export the end-to-end model to the 'model/' directory;
# include_optimizer=False leaves the optimizer state out of the export.
serving_model.save('model/', include_optimizer=False)





INFO:tensorflow:Assets written to: model/assets


INFO:tensorflow:Assets written to: model/assets


### Loading the saved model


In [19]:
# Load the export back with the low-level SavedModel API to check that it works
# without the Keras objects defined in this notebook.
reloaded_model = tf.saved_model.load('model/')

In [20]:
# `audio` is still the clip decoded for the serving-model test above, so the
# scores printed here can be compared with that cell's output.
scores = reloaded_model(audio)
result = np.array(scores)
print(result)
print(f'The main sound is: {classes[result.argmax()]}')

[-1.2167687 -3.430489   2.9077516 -0.7017541  1.819451 ]
The main sound is: houspa


### Converting to tflite model


In [21]:
def get_file_size(file_path):
    """Return the size of ``file_path`` in MiB, rounded to 3 decimals.

    Relies on ``os`` from the notebook's import cell; the duplicate
    mid-notebook import was dropped.
    """
    size_in_bytes = os.path.getsize(file_path)
    return round(size_in_bytes / (1024 * 1024), 3)

In [22]:
tflite_model='model.tflite'
# `reloaded_model` comes from tf.saved_model.load and is not a tf.keras.Model,
# so from_keras_model is the wrong entry point for it. Convert directly from
# the SavedModel directory written by serving_model.save instead.
converter=tf.lite.TFLiteConverter.from_saved_model('model/')
lite_model=converter.convert()

INFO:tensorflow:Assets written to: C:\Users\naman\AppData\Local\Temp\tmpu05yn83y\assets


INFO:tensorflow:Assets written to: C:\Users\naman\AppData\Local\Temp\tmpu05yn83y\assets


In [23]:
# Use a context manager so the file handle is closed (and flushed) before the
# file size is measured; the byte count is still shown as the cell output.
with open(tflite_model,"wb") as model_file:
    bytes_written=model_file.write(lite_model)
bytes_written

15138704

In [24]:
# Size of the unquantized TFLite model, in MiB.
get_file_size(tflite_model)

14.437

### Reducing further


In [25]:
name='model_quant.tflite'
# Convert from the SavedModel directory: `reloaded_model` is not a
# tf.keras.Model, so from_keras_model is not the right entry point for it.
converter=tf.lite.TFLiteConverter.from_saved_model('model/')
# Enable the converter's default optimizations to shrink the model.
converter.optimizations=[tf.lite.Optimize.DEFAULT]
lite_model=converter.convert()
# Context manager closes the file before its size is measured; the byte count
# is still shown as the cell output.
with open(name,"wb") as model_file:
    bytes_written=model_file.write(lite_model)
bytes_written

INFO:tensorflow:Assets written to: C:\Users\naman\AppData\Local\Temp\tmpxf6_pil5\assets


INFO:tensorflow:Assets written to: C:\Users\naman\AppData\Local\Temp\tmpxf6_pil5\assets


3993888

In [26]:
# Size of the quantized TFLite model, in MiB (compare with the cell above).
get_file_size(name)

3.809

### Testing quantized model


In [27]:
tf_model = tf.lite.Interpreter(model_path=name)
# The converted model has a dynamic-length audio input; fix it to the clip
# length used throughout the notebook before allocating tensors.
audio_input_index = tf_model.get_input_details()[0]['index']
tf_model.resize_tensor_input(audio_input_index, [44100])
tf_model.allocate_tensors()

In [28]:
# Keep the tensor metadata around: the inference cell looks up the input and
# output tensor indices from these lists.
input_details = tf_model.get_input_details()
output_details = tf_model.get_output_details()
for details in (input_details, output_details):
    print(details)

[{'name': 'serving_default_audio:0', 'index': 0, 'shape': array([44100]), 'shape_signature': array([-1]), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]
[{'name': 'StatefulPartitionedCall:0', 'index': 225, 'shape': array([5]), 'shape_signature': array([5]), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]


In [38]:
# Decode a clip from the test split for the TFLite interpreter.
wav_bytes = tf.io.read_file('Data/test/wbwwre1/XC519209.wav')
waveform, sr = tf.audio.decode_wav(wav_bytes,
                                   desired_channels=1,
                                   desired_samples=44100)
audio = tf.squeeze(waveform, axis=-1)
audio.shape

TensorShape([44100])

In [41]:
# Run one inference: copy the waveform in, invoke, read the class scores out.
input_index = input_details[0]['index']
output_index = output_details[0]['index']

tf_model.set_tensor(input_index, audio)
tf_model.invoke()
prediction = tf_model.get_tensor(output_index)
print(f'The main sound is: {classes[prediction.argmax()]}')

The main sound is: wbwwre1
