In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import os

In [3]:
# Fixed waveform length (in samples) for every clip. Without it each batch is only
# padded to its own longest clip, so the spectrogram height could differ from the
# (124, 129, 1) input that the models in this notebook declare (124 STFT frames
# correspond to 16000 samples with frame_length=255 and frame_step=128).
OUTPUT_SEQUENCE_LENGTH = 16000

train_ds, validation_ds = tf.keras.utils.audio_dataset_from_directory(
    directory='../reduced_dataset/dataset/audio',
    validation_split=0.4,  # hold out 40% of the data; it is later split into a validation set and a test set
    shuffle=True,
    subset='both',  # required when validation_split is used (it would raise an error otherwise)
    seed=0,  # required when shuffle and validation_split are used together (it would raise an error otherwise)
    output_sequence_length=OUTPUT_SEQUENCE_LENGTH,  # pad/truncate every clip to the same length
)

# Class names in the order used for the integer labels.
label_names = train_ds.class_names

Found 12933 files belonging to 30 classes.
Using 7760 files for training.
Using 5173 files for validation.


In [5]:
def squeeze(audio, labels):
    """Remove the last axis of ``audio`` and pass ``labels`` through untouched.

    Adapted from: https://www.geeksforgeeks.org/audio-recognition-in-tensorflow/
    """
    return tf.squeeze(audio, axis=-1), labels

In [7]:
# Apply squeeze() to every batch of both splits (drops the trailing axis of the audio tensor).
train_ds = train_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)
validation_ds = validation_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)

# Peek at a single training batch to check the tensor shapes.
audio, label = next(iter(train_ds))
print(audio.shape, label.shape, sep="\n")

(32, 16000)
(32,)


In [9]:
def get_spectrogram(waveform):
    """Return the STFT magnitude of ``waveform`` with one extra trailing axis.

    The short-time Fourier transform is computed with ``frame_length=255`` and
    ``frame_step=128``; only the magnitude is kept.

    Adapted from: https://www.geeksforgeeks.org/audio-recognition-in-tensorflow/
    """
    stft = tf.signal.stft(waveform, frame_length=255, frame_step=128)
    # Append a new last axis to the magnitude tensor.
    return tf.abs(stft)[..., tf.newaxis]

In [11]:
def get_spectrogram_dataset(dataset):
    """Map ``get_spectrogram`` over the first element of every ``(x, y)`` pair.

    The second element of each pair is passed through unchanged.

    Adapted from: https://www.geeksforgeeks.org/audio-recognition-in-tensorflow/
    """
    def to_spectrogram(waveform, label):
        return get_spectrogram(waveform), label

    return dataset.map(to_spectrogram, num_parallel_calls=tf.data.AUTOTUNE)

In [13]:
# Convert the waveform batches of both splits into spectrogram batches.
train_ds = get_spectrogram_dataset(train_ds)
validation_ds = get_spectrogram_dataset(validation_ds)

# Split the held-out batches in two: the first half becomes the validation set and
# the rest the test set. The validation half gets its own name (val_ds) so that
# validation_ds is still available for the skip() that follows.
half = validation_ds.cardinality() // 2
val_ds = validation_ds.take(half)
test_ds = validation_ds.skip(half)

In [104]:
class VoxProfunda():
    """Builder for a deep, plain (no skip connections) CNN spectrogram classifier.

    The instance only keeps ``counter``, the number of convolutional stages
    created so far, which ``conv2D_x`` uses to generate default stage names.
    """

    def __init__(self):
        # Number of stages built so far; incremented by every conv2D_x call.
        self.counter = 0

    def conv2D_x(self, filters, kernel_size, conv_n_layers, repeat, strides=1, padding="valid", name=False):
        """Build one convolutional stage as a ``tf.keras.Sequential``.

        The stage is made of ``repeat`` groups; each group is ``conv_n_layers``
        Conv2D layers followed by one BatchNormalization and one ReLU activation.

        Args:
            filters, kernel_size, strides, padding: forwarded to every Conv2D layer.
            conv_n_layers: number of Conv2D layers in each group.
            repeat: number of groups in the stage.
            name: name of the stage. When left at ``False`` a name is generated
                from the stage counter: "conv<k>" for a single group,
                "conv<k>_<repeat>" otherwise.

        Returns:
            The (not yet called) ``tf.keras.Sequential`` stage.
        """
        self.counter += 1

        if name is False:
            # Only a single-group stage goes without the "_<repeat>" suffix. The
            # check is on ``repeat`` itself (not on ``counter == repeat``), so a
            # third stage with repeat=3 is still named "conv3_3".
            if repeat == 1:
                name = f"conv{self.counter}"
            else:
                name = f"conv{self.counter}_{repeat}"

        stage = tf.keras.Sequential(name=name)

        for _ in range(repeat):
            for _ in range(conv_n_layers):
                stage.add(tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding=padding))
            stage.add(tf.keras.layers.BatchNormalization())
            stage.add(tf.keras.layers.Activation("relu"))

        return stage

    def get_model(self):
        """Assemble the full network and return it as an uncompiled ``tf.keras.Model``.

        Layout: (124, 129, 1) input -> resize to 120x120 -> Normalization ->
        conv1 (7x7, 64) -> 3x3 max-pool -> four stages of 3x3 convolutions
        (64, 128, 256, 512 filters) -> global average pooling ->
        Dense(512) -> Dense(512) -> 30-way softmax.
        """
        inputs = tf.keras.Input(shape=(124, 129, 1), name="input")
        resize = tf.keras.layers.Resizing(120, 120, name="resize")(inputs)
        # NOTE(review): this Normalization layer is never adapt()-ed and receives no
        # mean/variance here - confirm that this is intended.
        normalization = tf.keras.layers.Normalization(name="normalization")(resize)

        # Feature-learning block
        # conv1
        conv1 = self.conv2D_x(filters=64, kernel_size=(7,7), conv_n_layers=1, repeat=1, padding="same")(normalization)

        # pool1
        pool1 = tf.keras.layers.MaxPooling2D(pool_size=(3,3), strides=2, padding="same", name="pool1")(conv1)

        # conv2_x * 3
        conv2_3 = self.conv2D_x(filters=64, kernel_size=(3,3), conv_n_layers=2, repeat=3, padding="same")(pool1)

        # conv3_x * 4
        conv3_4 = self.conv2D_x(filters=128, kernel_size=(3,3), conv_n_layers=2, repeat=4, padding="same")(conv2_3)

        # conv4_x * 6
        conv4_6 = self.conv2D_x(filters=256, kernel_size=(3,3), conv_n_layers=2, repeat=6, padding="same")(conv3_4)

        # conv5_x * 3
        conv5_3 = self.conv2D_x(filters=512, kernel_size=(3,3), conv_n_layers=2, repeat=3, padding="same")(conv4_6)

        # global average pooling
        global_average_pooling = tf.keras.layers.GlobalAveragePooling2D(name="global_average_pooling")(conv5_3)

        # fc1, fc2: fully connected head
        fc1 = tf.keras.layers.Dense(512, activation="relu", name="fc1")(global_average_pooling)
        fc2 = tf.keras.layers.Dense(512, activation="relu", name="fc2")(fc1)

        # fc3: 30-way softmax output
        outputs = tf.keras.layers.Dense(30, activation="softmax", name="fc3")(fc2)
        return tf.keras.Model(inputs=inputs, outputs=outputs)


# Build the network directly from a fresh builder, then configure it for training.
model = VoxProfunda().get_model()
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Un esempio semplificato di architettura VGGVox potrebbe essere:

Blocco Convoluzionale 1:
Conv (3x3, 64) + ReLU
Max-Pooling (2x2, stride 2)

Blocco Convoluzionale 2:
Conv (3x3, 128) + ReLU
Max-Pooling (2x2)

Blocco Convoluzionale 3:
Conv (3x3, 256) + ReLU
Conv (3x3, 256) + ReLU
Max-Pooling (2x2)

Fully Connected Layers:
FC (512) + ReLU
FC (4096) + ReLU

Softmax (30)

In [107]:
def get_model():
    """Build a small VGG-style CNN classifier for (124, 129, 1) spectrograms.

    Layout: resize to 120x120 -> Normalization -> three convolution/max-pool
    blocks (64, 128 and 256+256 filters) -> Flatten -> Dense(512) ->
    Dropout(0.5) -> Dense(5994) -> Dropout(0.5) -> 30-way softmax.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    inputs = tf.keras.Input(shape=(124, 129, 1), name="input")
    resize = tf.keras.layers.Resizing(120, 120, name="resize")(inputs)
    # NOTE(review): this Normalization layer is never adapt()-ed and receives no
    # mean/variance here - confirm that this is intended.
    normalization = tf.keras.layers.Normalization(name="normalization")(resize)

    # block 1: one 3x3 convolution + 2x2 max-pooling
    conv1 = tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation="relu", name="conv1")(normalization)
    pool1 = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2, name="pool1")(conv1)

    # block 2: one 3x3 convolution + 2x2 max-pooling
    conv2 = tf.keras.layers.Conv2D(filters=128, kernel_size=3, activation="relu", name="conv2")(pool1)
    pool2 = tf.keras.layers.MaxPooling2D(pool_size=2, name="pool2")(conv2)

    # block 3: two 3x3 convolutions + 2x2 max-pooling
    conv3 = tf.keras.layers.Conv2D(filters=256, kernel_size=3, activation="relu", name="conv3")(pool2)
    conv3 = tf.keras.layers.Conv2D(filters=256, kernel_size=3, activation="relu", name="conv3_2")(conv3)
    pool3 = tf.keras.layers.MaxPooling2D(pool_size=2, name="pool3")(conv3)

    # flatten
    # (tf.keras.layers / tf.keras.Model are used below because the notebook never
    # imports the bare names `layers` and `models`.)
    flatten = tf.keras.layers.Flatten(name="flatten")(pool3)

    # fully connected head with dropout
    fc1 = tf.keras.layers.Dense(512, activation="relu", name="fc1")(flatten)
    dropout = tf.keras.layers.Dropout(0.5, name="dropout1")(fc1)
    fc2 = tf.keras.layers.Dense(5994, activation="relu", name="fc2")(dropout)
    dropout = tf.keras.layers.Dropout(0.5, name="dropout2")(fc2)
    outputs = tf.keras.layers.Dense(30, activation="softmax", name="softmax")(dropout)

    return tf.keras.Model(inputs=inputs, outputs=outputs)

# Instantiate the simplified VGG-style network and configure it for training.
model = get_model()
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

In [72]:
# Save the model to disk only when the monitored validation loss improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="bestmodels/resnet34.keras",
    monitor="val_loss",
    save_best_only=True,
)
callback = [checkpoint]

# Train for up to 100 epochs, evaluating on val_ds after every epoch.
history = model.fit(train_ds, validation_data=val_ds, epochs=100, callbacks=callback)

Epoch 1/100
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m434s[0m 2s/step - accuracy: 0.0795 - loss: 3.3597 - val_accuracy: 0.3187 - val_loss: 2.3259
Epoch 2/100
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m392s[0m 2s/step - accuracy: 0.4039 - loss: 1.9631 - val_accuracy: 0.5421 - val_loss: 1.7147
Epoch 3/100
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m400s[0m 2s/step - accuracy: 0.6322 - loss: 1.2248 - val_accuracy: 0.7025 - val_loss: 1.0428
Epoch 4/100
[1m  2/243[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m7:44[0m 2s/step - accuracy: 0.6875 - loss: 0.9174

KeyboardInterrupt: 

In [121]:
class VoxProfunda():
    """Builder for a deep, plain (no skip connections) CNN spectrogram classifier.

    Compared with a dense-only head, this variant follows the convolutional
    stages with a (9, 1) convolution ("fc1"), global average pooling
    ("pool_time") and a dense classifier.

    The instance only keeps ``counter``, the number of convolutional stages
    created so far, which ``conv2D_x`` uses to generate default stage names.
    """

    def __init__(self):
        # Number of stages built so far; incremented by every conv2D_x call.
        self.counter = 0

    def conv2D_x(self, filters, kernel_size, conv_n_layers, repeat, strides=1, padding="valid", name=False):
        """Build one convolutional stage as a ``tf.keras.Sequential``.

        The stage is made of ``repeat`` groups; each group is ``conv_n_layers``
        Conv2D layers followed by one BatchNormalization and one ReLU activation.

        Args:
            filters, kernel_size, strides, padding: forwarded to every Conv2D layer.
            conv_n_layers: number of Conv2D layers in each group.
            repeat: number of groups in the stage.
            name: name of the stage. When left at ``False`` a name is generated
                from the stage counter: "conv<k>" for a single group,
                "conv<k>_<repeat>" otherwise.

        Returns:
            The (not yet called) ``tf.keras.Sequential`` stage.
        """
        self.counter += 1

        if name is False:
            # Only a single-group stage goes without the "_<repeat>" suffix. The
            # check is on ``repeat`` itself (not on ``counter == repeat``), so a
            # third stage with repeat=3 is still named "conv3_3".
            if repeat == 1:
                name = f"conv{self.counter}"
            else:
                name = f"conv{self.counter}_{repeat}"

        stage = tf.keras.Sequential(name=name)

        for _ in range(repeat):
            for _ in range(conv_n_layers):
                stage.add(tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding=padding))
            stage.add(tf.keras.layers.BatchNormalization())
            stage.add(tf.keras.layers.Activation("relu"))

        return stage

    def get_model(self):
        """Assemble the full network and return it as an uncompiled ``tf.keras.Model``.

        Layout: (124, 129, 1) input -> resize to 120x120 -> Normalization ->
        conv1 (7x7, 64, stride 2) -> 3x3 max-pool -> four stages of 3x3
        convolutions (64, 128, 256, 512 filters) -> (9, 1) convolution "fc1" ->
        global average pooling -> Dense(5994) -> 30-way softmax.
        """
        inputs = tf.keras.Input(shape=(124, 129, 1), name="input")
        resize = tf.keras.layers.Resizing(120, 120, name="resize")(inputs)
        # NOTE(review): this Normalization layer is never adapt()-ed and receives no
        # mean/variance here - confirm that this is intended.
        normalization = tf.keras.layers.Normalization(name="normalization")(resize)

        # Feature-learning block
        # conv1
        conv1 = self.conv2D_x(filters=64, kernel_size=(7,7), conv_n_layers=1, repeat=1, strides=2, padding="same")(normalization)

        # pool1
        pool1 = tf.keras.layers.MaxPooling2D(pool_size=(3,3), strides=2, padding="same", name="pool1")(conv1)

        # conv2_x * 3
        conv2_3 = self.conv2D_x(filters=64, kernel_size=(3,3), conv_n_layers=2, repeat=3, padding="same")(pool1)

        # conv3_x * 4
        conv3_4 = self.conv2D_x(filters=128, kernel_size=(3,3), conv_n_layers=2, repeat=4, padding="same")(conv2_3)

        # conv4_x * 6
        conv4_6 = self.conv2D_x(filters=256, kernel_size=(3,3), conv_n_layers=2, repeat=6, padding="same")(conv3_4)

        # conv5_x * 3
        conv5_3 = self.conv2D_x(filters=512, kernel_size=(3,3), conv_n_layers=2, repeat=3, padding="same")(conv4_6)

        # fc1: implemented as a convolution with a (9, 1) kernel
        fc1 = tf.keras.layers.Conv2D(filters=512, kernel_size=(9, 1), strides=1, activation="relu", name="fc1")(conv5_3)

        # pool_time: global average pooling
        avg_pooling = tf.keras.layers.GlobalAveragePooling2D(name="pool_time")(fc1)

        # fc2
        # (tf.keras.layers / tf.keras.Model are used below because the notebook never
        # imports the bare names `layers` and `models`.)
        fc2 = tf.keras.layers.Dense(5994, activation="relu", name="fc2")(avg_pooling)

        # softmax
        outputs = tf.keras.layers.Dense(30, activation="softmax", name="softmax")(fc2)

        return tf.keras.Model(inputs=inputs, outputs=outputs)



# Create the builder, get the Keras model out of it and configure it for training.
model = VoxProfunda().get_model()
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

In [194]:
class VoxProfunda():
    """Builder for a CNN spectrogram classifier with one skip connection per stage.

    Every convolutional stage is wrapped by ``residual_block``, which adds the
    stage input (projected with a 1x1 convolution when needed) to the stage output.

    The instance only keeps ``counter``, the number of convolutional stages
    created so far, which ``conv2D_x`` uses to generate default stage names.
    """

    def __init__(self):
        # Number of stages built so far; incremented by every conv2D_x call.
        self.counter = 0

    def conv2D_x(self, filters, kernel_size, conv_n_layers, repeat, strides=1, padding="valid", name=False):
        """Build one convolutional stage as a ``tf.keras.Sequential``.

        The stage is made of ``repeat`` groups; each group is ``conv_n_layers``
        Conv2D layers followed by one BatchNormalization and one ReLU activation.

        Args:
            filters, kernel_size, strides, padding: forwarded to every Conv2D layer.
            conv_n_layers: number of Conv2D layers in each group.
            repeat: number of groups in the stage.
            name: name of the stage. When left at ``False`` a name is generated
                from the stage counter: "conv<k>" for a single group,
                "conv<k>_<repeat>" otherwise.

        Returns:
            The (not yet called) ``tf.keras.Sequential`` stage.
        """
        self.counter += 1

        if name is False:
            # Only a single-group stage goes without the "_<repeat>" suffix. The
            # check is on ``repeat`` itself (not on ``counter == repeat``), so a
            # third stage with repeat=3 is still named "conv3_3".
            if repeat == 1:
                name = f"conv{self.counter}"
            else:
                name = f"conv{self.counter}_{repeat}"

        stage = tf.keras.Sequential(name=name)

        for _ in range(repeat):
            for _ in range(conv_n_layers):
                stage.add(tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding=padding))
            stage.add(tf.keras.layers.BatchNormalization())
            stage.add(tf.keras.layers.Activation("relu"))

        return stage

    @staticmethod
    def _shortcut_strides(strides, n_convs, pooling):
        """Return the (height, width) stride the shortcut needs to match the main path.

        The main path applies ``strides`` once per Conv2D layer (``n_convs`` layers
        in total) and, when ``pooling`` is set, a further stride-2 max-pooling.
        """
        per_axis = (strides, strides) if isinstance(strides, int) else tuple(strides)
        pool_factor = 2 if pooling else 1
        return tuple(pool_factor * s ** n_convs for s in per_axis)

    def residual_block(self, x, filters, kernel_size, conv_n_layers, repeat, strides=1, padding="valid", name=False, pooling=False):
        """Apply one ``conv2D_x`` stage to ``x`` and add a skip connection around it.

        A single skip connection spans the whole stage (all ``repeat`` groups).
        The shortcut is projected with a 1x1 convolution whenever the main path
        changes the tensor shape: when ``pooling`` is set, when the stage is
        strided, or when ``filters`` differs from the channels of ``x``.

        The shortcut stride is derived from the main path instead of being fixed
        to 4, so the two branches also line up for other ``strides`` /
        ``conv_n_layers`` / ``repeat`` combinations. This assumes
        ``padding="same"``; with "valid" padding and kernels larger than 1 the
        main path shrinks and the addition cannot match.

        Args:
            x: input tensor of the block.
            filters, kernel_size, conv_n_layers, repeat, strides, padding, name:
                forwarded to ``conv2D_x``.
            pooling: when true, a 3x3 stride-2 max-pooling follows the stage.

        Returns:
            The tensor produced by adding the main path and the shortcut.
        """
        shortcut = x
        x = self.conv2D_x(filters, kernel_size, conv_n_layers, repeat, strides, padding, name)(x)
        if pooling:
            x = tf.keras.layers.MaxPooling2D(pool_size=(3,3), strides=2, padding="same")(x)

        shortcut_strides = self._shortcut_strides(strides, conv_n_layers * repeat, pooling)
        downsampled = any(s != 1 for s in shortcut_strides)
        if pooling or downsampled or filters != shortcut.shape[-1]:
            # 1x1 projection so that the shortcut has the same shape as the main path.
            shortcut = tf.keras.layers.Conv2D(filters, 1, strides=shortcut_strides)(shortcut)

        return tf.keras.layers.add([x, shortcut])

    def get_model(self):
        """Assemble the full network and return it as an uncompiled ``tf.keras.Model``.

        Layout: (124, 129, 1) input -> resize to 120x120 -> Normalization ->
        conv1 (7x7, 64, stride 2, with max-pooling) -> four stages of 3x3
        convolutions (64, 128, 256, 512 filters), each with a skip connection ->
        (9, 1) convolution "fc1" -> global average pooling -> Dense(5994) ->
        30-way softmax.
        """
        inputs = tf.keras.Input(shape=(124, 129, 1), name="input")
        resize = tf.keras.layers.Resizing(120, 120, name="resize")(inputs)
        # NOTE(review): this Normalization layer is never adapt()-ed and receives no
        # mean/variance here - confirm that this is intended.
        normalization = tf.keras.layers.Normalization(name="normalization")(resize)

        # Feature-learning block
        # conv1 (includes the max-pooling step)
        conv1 = self.residual_block(normalization,filters=64, kernel_size=(7,7), conv_n_layers=1, repeat=1, strides=2, padding="same", pooling=True)

        # conv2_x * 3
        conv2_3 = self.residual_block(conv1, filters=64, kernel_size=(3,3), conv_n_layers=2, repeat=3, padding="same")

        # conv3_x * 4
        conv3_4 = self.residual_block(conv2_3, filters=128, kernel_size=(3,3), conv_n_layers=2, repeat=4, padding="same")

        # conv4_x * 6
        conv4_6 = self.residual_block(conv3_4, filters=256, kernel_size=(3,3), conv_n_layers=2, repeat=6, padding="same")

        # conv5_x * 3
        conv5_3 = self.residual_block(conv4_6, filters=512, kernel_size=(3,3), conv_n_layers=2, repeat=3, padding="same")

        # fc1: implemented as a convolution with a (9, 1) kernel
        fc1 = tf.keras.layers.Conv2D(filters=512, kernel_size=(9, 1), strides=1, activation="relu", name="fc1")(conv5_3)

        # pool_time: global average pooling
        avg_pooling = tf.keras.layers.GlobalAveragePooling2D(name="pool_time")(fc1)

        # fc2
        # (tf.keras.layers / tf.keras.Model are used below because the notebook never
        # imports the bare names `layers` and `models`.)
        fc2 = tf.keras.layers.Dense(5994, activation="relu", name="fc2")(avg_pooling)

        # softmax
        outputs = tf.keras.layers.Dense(30, activation="softmax", name="softmax")(fc2)

        return tf.keras.Model(inputs=inputs, outputs=outputs)

# Build the residual variant of the network and configure it for training.
model = VoxProfunda().get_model()
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

In [196]:
# Checkpoint callback: the file is overwritten only when val_loss improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="bestmodels/resnet34.keras",
    monitor="val_loss",
    save_best_only=True,
)
callback = [checkpoint]

# Train for up to 100 epochs, evaluating on val_ds after every epoch.
history = model.fit(train_ds, validation_data=val_ds, epochs=100, callbacks=callback)

Epoch 1/100


KeyboardInterrupt: 