## First
* No dropout
* No softmax

### Datasets results
* Wav --> 12.5%
* STFT --> 0%
* Mel --> 12.5%

In [None]:
def cnn_model(_input_shape, _num_classes):
    """Build, compile and summarize the first (baseline) CNN classifier.

    Three Conv2D + MaxPooling2D blocks (5x5 kernels, 4x4 pooling) are followed
    by a 256-unit dense layer and a linear output layer with one unit per
    class. No dropout and no softmax are used, so the model outputs raw
    logits.

    Args:
        _input_shape: Shape passed to ``layers.InputLayer``.
        _num_classes: Number of units in the output layer.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    # Parameters
    lr = 1e-3
    opt = keras.optimizers.Adam(learning_rate=lr)
    # The last Dense layer has no activation, so it emits logits; the loss
    # must be told so, otherwise it interprets the logits as probabilities.
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = ["accuracy"]
    act_func = "relu"
    k_size = 5
    p_size = (4, 4)  # (Height, width)

    # Model definition
    model = models.Sequential()

    # Convolution layers
    model.add(layers.InputLayer(_input_shape))
    model.add(layers.Conv2D(filters=32, kernel_size=k_size, activation=act_func, padding="same"))
    model.add(layers.MaxPooling2D(p_size))
    model.add(layers.Conv2D(filters=64, kernel_size=k_size, activation=act_func, padding="same"))
    model.add(layers.MaxPooling2D(p_size))
    model.add(layers.Conv2D(filters=64, kernel_size=k_size, activation=act_func, padding="same"))
    model.add(layers.MaxPooling2D(p_size))

    # Dense layers (the output layer is linear: logits, not probabilities)
    model.add(layers.Flatten())
    model.add(layers.Dense(256, activation=act_func))
    model.add(layers.Dense(_num_classes))

    # Compile and summary
    model.compile(
        optimizer=opt,
        loss=loss,
        metrics=metrics
    )

    model.summary()

    return model

## Second
* Kernel: 3
* Max pool: 2
* 512 units in the first dense layer
* Uses softmax
* More convolution blocks to reduce size
* One dropout of 0.3

### Datasets testing results on 10 classes
* Wav --> 20%
* STFT --> 45%
* Mel --> 5%

In [None]:
def cnn_model(_input_shape, _num_classes):
    """Build, compile and summarize the second CNN classifier.

    Five Conv2D + MaxPooling2D blocks (3x3 kernels, 2x2 pooling) feed a
    512-unit dense layer, a dropout of 0.3 and a softmax output layer.

    Args:
        _input_shape: Shape passed to ``layers.InputLayer``.
        _num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    # Hyper-parameters shared by every convolution block
    activation = "relu"
    kernel = 3
    pool = (2, 2)  # (Height, width)
    conv_filters = (32, 64, 128, 128, 256)  # one entry per convolution block

    network = models.Sequential()
    network.add(layers.InputLayer(_input_shape))

    # Convolution blocks: convolution followed by max pooling
    for n_filters in conv_filters:
        network.add(
            layers.Conv2D(
                filters=n_filters,
                kernel_size=kernel,
                activation=activation,
                padding="same",
            )
        )
        network.add(layers.MaxPooling2D(pool))

    # Classification head
    head = (
        layers.Flatten(),
        layers.Dense(512, activation=activation),
        layers.Dropout(0.3),
        layers.Dense(_num_classes, activation="softmax"),
    )
    for head_layer in head:
        network.add(head_layer)

    # Compile and print the summary
    network.compile(
        optimizer="adam",
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"],
    )
    network.summary()

    return network

## Third
* Similar to second
* Lower dropout rate (0.1), but applied more often (after convolution 3 and after the first dense layer)
* Swish instead of ReLU

### Datasets results
* Wav -->
* STFT --> 70%
* Mel -->

In [None]:
def cnn_model(_input_shape, _num_classes):
    """Build, compile and summarize the third CNN classifier.

    Five Conv2D + MaxPooling2D blocks (3x3 kernels, 2x2 pooling, swish
    activation) with a dropout of 0.1 after the third block, followed by a
    512-unit dense layer, another dropout of 0.1 and a softmax output layer.

    Args:
        _input_shape: Shape passed to ``layers.InputLayer``.
        _num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    # Hyper-parameters shared by every convolution block
    activation = "swish"
    kernel = 3
    pool = (2, 2)  # (Height, width)

    # (filters, dropout rate applied after pooling or None) per block
    conv_blocks = (
        (32, None),
        (64, None),
        (128, 0.1),
        (256, None),
        (256, None),
    )

    network = models.Sequential()
    network.add(layers.InputLayer(_input_shape))

    # Convolution blocks: convolution, max pooling, optional dropout
    for n_filters, drop_rate in conv_blocks:
        network.add(
            layers.Conv2D(
                filters=n_filters,
                kernel_size=kernel,
                activation=activation,
                padding="same",
            )
        )
        network.add(layers.MaxPooling2D(pool))
        if drop_rate is not None:
            network.add(layers.Dropout(drop_rate))

    # Classification head
    head = (
        layers.Flatten(),
        layers.Dense(512, activation=activation),
        layers.Dropout(0.1),
        layers.Dense(_num_classes, activation="softmax"),
    )
    for head_layer in head:
        network.add(head_layer)

    # Compile and print the summary
    network.compile(
        optimizer="adam",
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"],
    )
    network.summary()

    return network