## Log-Scaled MFCC 
Log-scaling is not really a normalization procedure, but some articles (e.g. "A Comparison of Audio Signal Preprocessing Methods for Deep Neural Networks on Music Tagging") state that it improves accuracy. It is done simply by converting the amplitude to a decibel (dB) scale.<br/>

https://stackoverflow.com/questions/55513652/which-spectrogram-best-represents-features-of-an-audio-file-for-cnn-based-model

In [1]:
import sys
import os
import IPython
import math
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import random
from datetime import datetime 

from keras import backend as keras_backend, models
from keras.models import Sequential
from keras.layers import Dense, SpatialDropout2D, Activation, Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint 

from sklearn import metrics, preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [4]:
# Notebook-wide configuration: Keras data layout and filesystem locations

# Keras must treat image-like inputs as (rows, columns, channels)
keras_backend.set_image_data_format('channels_last')

# Folders for cached features and trained models
models_path = os.path.abspath('./models')
data_path = os.path.abspath('./data')

# UrbanSound8K root (edit this to match your local copy) and the paths derived from it
us8k_path = os.path.abspath('./UrbanSound8K')
metadata_path = os.path.join(us8k_path, 'metadata/UrbanSound8K.csv')
audio_path = os.path.join(us8k_path, 'audio')

In [5]:
# Read the dataset's metadata CSV into a DataFrame
metadata = pd.read_csv(filepath_or_buffer=metadata_path)

#### Helper functions

In [6]:
# Helper: Generates log-scaled MFCC coefficients with Librosa
def get_mfcc(filename, mfcc_max_padding=0):
    """Load an audio file and return its dB-scaled MFCC matrix.

    Parameters
    ----------
    filename : str
        Path handed to ``librosa.load``.
    mfcc_max_padding : int, optional
        Target number of frames (second axis). When positive and larger than
        the number of extracted frames, the matrix is right-padded with zeros
        up to this width. ``0`` (default) disables padding. Inputs that already
        have at least this many frames are returned unpadded.

    Returns
    -------
    numpy.ndarray or None
        Matrix with 40 rows (``n_mfcc=40``), or ``None`` if loading or feature
        extraction raised; the failure is printed, not re-raised.
    """
    try:
        audio, sample_rate = librosa.load(filename, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        # log-scaling
        # NOTE(review): power_to_db is meant for (non-negative) power values, while
        # MFCCs can be negative -- confirm this is the intended scaling.
        mfccs = librosa.core.power_to_db(mfccs)

        # Should we require padding?
        # `and` is required here: with `&` the expression parsed as
        # `mfcc_max_padding > (0 & frames) < mfcc_max_padding`, which ignored the
        # frame count and let np.pad fail on a negative width.
        pad_width = mfcc_max_padding - mfccs.shape[1]
        if mfcc_max_padding > 0 and pad_width > 0:
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')

    except Exception as e:
        # Best-effort helper: report the failing file (and the cause) and signal with None
        print("Error parsing wavefile: ", filename, "-", e)
        return None
    return mfccs

#### Load metadata

In [None]:
# Read the dataset's metadata CSV into a DataFrame
metadata = pd.read_csv(filepath_or_buffer=metadata_path)

# Preview the first five rows
metadata.head(5)

#### Full dataset MFCC extraction

In [None]:
# Iterate through all audio files and extract per-channel normalized MFCCs.
# Produces: features (list of 2-D arrays), labels (list of class names) and
# frames_max (largest frame count seen, used later as the padding width).
features = []
labels = []
frames_max = 0
counter = 0  # number of files processed so far
total_samples = len(metadata)
status_every = 500  # print a status line every N files

for index, row in metadata.iterrows():
    file_name = os.path.join(os.path.abspath(audio_path), 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    class_label = row["class"]
    mfccs = get_mfcc(file_name)

    # get_mfcc returns None on failure. Stop with a clear message instead of an
    # AttributeError; silently skipping would leave features out of step with metadata.
    if mfccs is None:
        raise RuntimeError("MFCC extraction failed for: {}".format(file_name))

    num_frames = mfccs.shape[1]

    # Per-channel normalization (zero mean / unit variance for each coefficient row)
    mean = np.mean(mfccs, axis=1, keepdims=True)
    std = np.std(mfccs, axis=1, keepdims=True)
    # A constant row has std == 0; divide by 1 instead so it becomes zeros, not NaN/inf
    std[std == 0] = 1.0
    mfccs = (mfccs - mean) / std

    # Add row (feature / label)
    features.append(mfccs)
    labels.append(class_label)

    # Update frames maximum
    frames_max = max(frames_max, num_frames)

    # Notify update every N files
    counter += 1
    if counter % status_every == 0:
        print("Status: {}/{}".format(counter, total_samples))

print("Finished: {}/{}".format(counter, total_samples))

#### Add padding for a consistent shape

In [None]:
# Right-pad every MFCC matrix with zeros so all share the longest clip's frame count.
# Each clip is padded from its own matrix (a zero-width pad simply copies it), so
# full-length clips are never replaced by the previously padded clip.
mfcc_max_padding = frames_max

padded = []
for mfcc in features:
    pad_width = mfcc_max_padding - mfcc.shape[1]
    padded.append(np.pad(mfcc,
                         pad_width=((0, 0), (0, pad_width)),
                         mode='constant',
                         constant_values=(0,)))

#### Save X and y

In [None]:
# Stack the padded features (X) and their labels (y) into NumPy arrays

y = np.array(labels)
X = np.array(padded)

# Write both arrays to disk (np.save appends the .npy extension)
np.save(file="data/X-v5", arr=X)
np.save(file="data/y-v5", arr=y)

In [None]:
# Verify shapes: features, padded features and labels should all have the same length
print("Raw features length: {}".format(len(features)))
print("Padded features length: {}".format(len(padded)))
# Count the labels list itself so a features/labels mismatch is visible
print("Feature labels length: {}".format(len(labels)))
print("X: {}, y: {}".format(X.shape, y.shape))

### Helper functions

In [None]:
# Helper: Generates MFCC coefficients with Librosa
def get_mfcc(filename, mfcc_max_padding=0):
    """Load an audio file and return its MFCC matrix, optionally zero-padded.

    NOTE(review): this definition applies no log-scaling, unlike the MFCC
    extraction used to build the dataset earlier in this notebook -- confirm
    which variant prediction-time features should use.

    Parameters
    ----------
    filename : str
        Path handed to ``librosa.load``.
    mfcc_max_padding : int, optional
        Target number of frames (second axis). When positive and larger than
        the number of extracted frames, the matrix is right-padded with zeros
        up to this width. ``0`` (default) disables padding. Inputs that already
        have at least this many frames are returned unpadded.

    Returns
    -------
    numpy.ndarray or None
        Matrix with 40 rows (``n_mfcc=40``), or ``None`` if loading or feature
        extraction raised; the failure is printed, not re-raised.
    """
    try:
        audio, sample_rate = librosa.load(filename, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)

        # Should we require padding?
        # `and` is required here: with `&` the expression parsed as
        # `mfcc_max_padding > (0 & frames) < mfcc_max_padding`, which ignored the
        # frame count and let np.pad fail on a negative width.
        pad_width = mfcc_max_padding - mfccs.shape[1]
        if mfcc_max_padding > 0 and pad_width > 0:
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')

    except Exception as e:
        # Best-effort helper: report the failing file (and the cause) and signal with None
        print("Error parsing wavefile: ", filename, "-", e)
        return None
    return mfccs

# Given a file returns a list of prediction results
def predict_from_file(file_name, num_rows, num_columns, num_channels):
    """Classify one audio file with the notebook-level ``model`` and ``le``.

    NOTE(review): features come straight from ``get_mfcc``; the per-channel
    normalization applied during dataset extraction is not applied here --
    confirm whether it should be.

    Parameters
    ----------
    file_name : str
        Path of the audio file to classify.
    num_rows, num_columns, num_channels : int
        Network input dimensions; ``num_columns`` is also used as the padding
        width passed to ``get_mfcc``.

    Returns
    -------
    list
        ``[predicted_class, predicted_proba]``: the decoded label array for the
        single sample, and that sample's vector of model outputs.

    Raises
    ------
    ValueError
        If no MFCC features could be extracted from ``file_name`` (or, from
        ``reshape``, if the features do not fit the requested dimensions).
    """
    # Note we use num_columns as our padding value!
    feature = get_mfcc(file_name, mfcc_max_padding=num_columns)
    if feature is None:
        raise ValueError("Could not extract MFCC features from: {}".format(file_name))

    feature = feature.reshape(1, num_rows, num_columns, num_channels)

    # One forward pass gives the probabilities; the class index is their argmax.
    # This avoids predict_classes / predict_proba, which newer Keras releases removed.
    probabilities = model.predict(feature)
    predicted_vector = np.argmax(probabilities, axis=-1)
    predicted_class = le.inverse_transform(predicted_vector)
    predicted_proba = probabilities[0]
    return [
        predicted_class,
        predicted_proba
    ]

# Reads the default directory for extra evaluation files and returns a pandas dataframe
def get_external_samples(base_path):
    """Index extra evaluation files laid out as ``base_path/<category>/<file>``.

    Entries of ``base_path`` that are not directories, and entries inside a
    category that are not regular files, are skipped. Categories and files are
    visited in sorted order so the resulting row order is deterministic.

    Parameters
    ----------
    base_path : str
        Directory whose sub-directories are named after the categories.

    Returns
    -------
    pandas.DataFrame
        Columns ``class`` (category directory name) and ``file`` (file path),
        one row per file.
    """
    files = []
    labels = []

    # Iterate category directories
    for category in sorted(os.listdir(base_path)):
        category_path = os.path.join(base_path, category)
        if not os.path.isdir(category_path):
            # Stray file next to the category folders: os.listdir on it would raise
            continue

        # Iterate files
        for file in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, file)
            if not os.path.isfile(file_path):
                continue
            files.append(file_path)
            labels.append(category)

    # Create dataframe
    return pd.DataFrame({ 'class': labels, 'file': files })
    
    
def plot_confusion_matrix(y_true, 
                          y_pred, 
                          classes, 
                          normalize=False, 
                          title=None, 
                          cmap=plt.cm.Blues,
                          size=(10,10)):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Parameters
    ----------
    y_true, y_pred :
        True and predicted labels, passed to ``metrics.confusion_matrix``.
    classes :
        Tick labels used on both axes.
    normalize : bool
        When True each row is divided by its total (rows sum to 1); rows with
        no samples stay at 0 rather than becoming NaN.
    title : str or None
        Plot title; a default depending on ``normalize`` is used when empty.
    cmap :
        Matplotlib colormap for the heat map.
    size : tuple
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    list
        ``[cm, ax]``: the (possibly normalized) confusion matrix and the Axes.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = metrics.confusion_matrix(y_true, y_pred)
        
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
        # A class absent from y_true has an all-zero row; dividing by 1 keeps it
        # at 0 instead of producing NaN (and a runtime warning)
        row_totals[row_totals == 0] = 1.0
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots(figsize=size)
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)

    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")

    fig.tight_layout()
    return [cm, ax]



# Expects a confusion matrix as a NumPy array, returns accuracy per class
def acc_per_class(np_probs_array):
    """Return the per-class accuracy (in percent) from a confusion matrix.

    Parameters
    ----------
    np_probs_array : array-like, shape (n_classes, n_classes)
        Confusion matrix whose row ``i`` holds the samples of true class ``i``.
        The matrix passed in is the one used; no global is consulted.

    Returns
    -------
    list of float
        ``100 * diagonal / row total`` for every row; ``nan`` for a row whose
        total is zero.
    """
    matrix = np.asarray(np_probs_array)
    accs = []
    for idx in range(matrix.shape[0]):
        correct = float(matrix[idx, idx])
        total = float(matrix[idx].sum())
        # A class without samples has no defined accuracy
        acc = (correct / total) * 100 if total else float('nan')
        accs.append(acc)
    return accs

### Load data (v5)

In [10]:
# Reload the cached arrays: MFCC features (X) and their class labels (y)

X, y = np.load("data/X-v5.npy"), np.load("data/y-v5.npy")

### Data preparation: features + metadata

#### Train / Test split

In [11]:
# One index per metadata row; the same shuffled indexes split metadata, X and y
total = len(metadata)

# X / y must line up row-for-row with the metadata for the shared split to be valid
assert len(X) == total and len(y) == total, "X / y are out of sync with the metadata"

indexes = list(range(0, total))

# Randomize indexes (fixed seed so the split is reproducible after a kernel restart)
split_seed = 42
random.Random(split_seed).shuffle(indexes)

# Divide the indexes into Train and Test
test_split_pct = 20
split_offset = math.floor(test_split_pct * total / 100)

# Split the metadata
test_split_idx = indexes[0:split_offset]
train_split_idx = indexes[split_offset:total]
test_meta = metadata.iloc[test_split_idx]
train_meta = metadata.iloc[train_split_idx]

# Split the features with the same indexes
X_test = np.take(X, test_split_idx, axis=0)
y_test = np.take(y, test_split_idx, axis=0)
X_train = np.take(X, train_split_idx, axis=0)
y_train = np.take(y, train_split_idx, axis=0)

# Print status
print("Test split: {} \t\t Train split: {}".format(len(test_meta), len(train_meta)))
print("X test shape: {} \t X train shape: {}".format(X_test.shape, X_train.shape))
print("y test shape: {} \t\t y train shape: {}".format(y_test.shape, y_train.shape))

Test split: 1746 		 Train split: 6986
X test shape: (1746, 40, 174) 	 X train shape: (6986, 40, 174)
y test shape: (1746,) 		 y train shape: (6986,)


#### One hot encode labels

In [12]:
# Fit the encoder once, on the training labels, so both splits share a single
# class -> index mapping (re-fitting per split can assign different indexes when
# a split is missing a class).
le = LabelEncoder()
le.fit(y_train)

# Fix the one-hot width to the number of known classes for both splits.
# le.transform raises ValueError if the test split holds a label unseen in training.
y_train_encoded = to_categorical(le.transform(y_train), num_classes=len(le.classes_))
y_test_encoded = to_categorical(le.transform(y_test), num_classes=len(le.classes_))

#### Reshape data

In [13]:
# How data should be organized
# Rows (MFCC coefficients) and columns (padded frames) are read from the loaded
# features instead of being hard-coded, so a different padding width still works.
# Axes 1 and 2 are the same before and after the reshape, so the cell can be re-run.
num_rows, num_columns = X_train.shape[1], X_train.shape[2]
num_channels = 1

# Reshape to fit the network input (channel last!)
X_train = X_train.reshape(X_train.shape[0], num_rows, num_columns, num_channels)
X_test = X_test.reshape(X_test.shape[0], num_rows, num_columns, num_channels)

# Total number of labels to predict (equal to the network output nodes)
num_labels = y_train_encoded.shape[1]

### Model definition

In [15]:
# CNN: five Conv2D stages, then global average pooling and a softmax layer
model = Sequential()

# Input shape
input_shape = (num_rows, num_columns, num_channels)

# General Spatial Dropout rate
spatial_dropout_rate = 0.24

# One entry per stage (Conv 1 .. Conv 5): (number of filters, add 2x2 max pooling?)
conv_stages = [(16, False), (32, True), (64, True), (128, True), (256, False)]

for stage_idx, (n_filters, with_pooling) in enumerate(conv_stages):
    # Only the very first layer declares the network input shape
    first_layer_args = {'input_shape': input_shape} if stage_idx == 0 else {}
    model.add(Conv2D(filters=n_filters, kernel_size=(3,3), activation='relu', **first_layer_args))
    model.add(BatchNormalization())
    model.add(SpatialDropout2D(spatial_dropout_rate))
    if with_pooling:
        model.add(MaxPooling2D(pool_size=2))

# Reduces each h×w feature map to a single number by taking the average of all h,w values.
model.add(GlobalAveragePooling2D())

# Softmax output
model.add(Dense(num_labels, activation='softmax'))

W0731 01:19:11.599071 140012423436096 deprecation_wrapper.py:119] From /home/eduugr/miniconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0731 01:19:11.644628 140012423436096 deprecation_wrapper.py:119] From /home/eduugr/miniconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0731 01:19:11.825411 140012423436096 deprecation_wrapper.py:119] From /home/eduugr/miniconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0731 01:19:11.826296 140012423436096 deprecation_wrapper.py:119] From /home/eduugr/miniconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto i

#### Model optimizer and loss 

Using [ADAM](https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/) as a starting point, with a learning rate and first-moment decay (`lr=1.5e-3`, `beta_1=0.99`) that differ slightly from the library defaults.
The loss function is categorical cross-entropy, as we are predicting between 10 classes.

In [16]:
# Adam optimizer with an explicit learning rate and moment decay rates
adam = Adam(lr=1.5e-3, beta_1=0.99, beta_2=0.999)

# Categorical cross-entropy loss, tracking accuracy as the only metric
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

# Display model architecture summary 
model.summary()

# Baseline: score the still-untrained network on the test split
score = model.evaluate(X_test, y_test_encoded, verbose=1)
accuracy = score[1] * 100

print("Pre-training accuracy: %.4f%%" % accuracy) 

W0731 01:19:38.343889 140012423436096 deprecation_wrapper.py:119] From /home/eduugr/miniconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 38, 172, 16)       160       
_________________________________________________________________
batch_normalization_1 (Batch (None, 38, 172, 16)       64        
_________________________________________________________________
spatial_dropout2d_1 (Spatial (None, 38, 172, 16)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 36, 170, 32)       4640      
_________________________________________________________________
batch_normalization_2 (Batch (None, 36, 170, 32)       128       
_________________________________________________________________
spatial_dropout2d_2 (Spatial (None, 36, 170, 32)       0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 18, 85, 32)        0         
__________

#### Training the model

In [17]:
# Training hyper-parameters and checkpoint location
num_epochs = 80
num_batch_size = 256
model_file = 'logscaled-v0.1.hdf5'
model_path = os.path.join(models_path, model_file)

# Checkpoint callback: with save_best_only=True only improving epochs are written
checkpointer = ModelCheckpoint(filepath=model_path, save_best_only=True, verbose=1)

# Train and time the run.
# NOTE(review): the test split doubles as validation data here, so checkpoint
# selection sees the test set -- confirm this is acceptable for the reported scores.
start = datetime.now()
history = model.fit(X_train,
                    y_train_encoded,
                    validation_data=(X_test, y_test_encoded),
                    epochs=num_epochs,
                    batch_size=num_batch_size,
                    callbacks=[checkpointer],
                    verbose=1)
duration = datetime.now() - start

print("Training completed in time: ", duration)

W0731 01:20:48.602335 140012423436096 deprecation.py:323] From /home/eduugr/miniconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 6986 samples, validate on 1746 samples
Epoch 1/80

Epoch 00001: val_loss improved from inf to 6.34069, saving model to /mnt/disks/disk-1/projects/urban-audio-classifier/models/logscaled-v0.1.hdf5
Epoch 2/80

Epoch 00002: val_loss improved from 6.34069 to 3.91948, saving model to /mnt/disks/disk-1/projects/urban-audio-classifier/models/logscaled-v0.1.hdf5
Epoch 3/80

Epoch 00003: val_loss improved from 3.91948 to 2.64719, saving model to /mnt/disks/disk-1/projects/urban-audio-classifier/models/logscaled-v0.1.hdf5
Epoch 4/80

Epoch 00004: val_loss improved from 2.64719 to 2.41331, saving model to /mnt/disks/disk-1/projects/urban-audio-classifier/models/logscaled-v0.1.hdf5
Epoch 5/80

Epoch 00005: val_loss improved from 2.41331 to 2.28475, saving model to /mnt/disks/disk-1/projects/urban-audio-classifier/models/logscaled-v0.1.hdf5
Epoch 6/80

Epoch 00006: val_loss improved from 2.28475 to 2.27052, saving model to /mnt/disks/disk-1/projects/urban-audio-classifier/models/logscaled-


Epoch 00038: val_loss did not improve from 2.25238
Epoch 39/80

Epoch 00039: val_loss did not improve from 2.25238
Epoch 40/80

Epoch 00040: val_loss did not improve from 2.25238
Epoch 41/80

Epoch 00041: val_loss did not improve from 2.25238
Epoch 42/80

Epoch 00042: val_loss did not improve from 2.25238
Epoch 43/80

Epoch 00043: val_loss did not improve from 2.25238
Epoch 44/80

Epoch 00044: val_loss did not improve from 2.25238
Epoch 45/80

Epoch 00045: val_loss did not improve from 2.25238
Epoch 46/80

Epoch 00046: val_loss did not improve from 2.25238
Epoch 47/80

Epoch 00047: val_loss did not improve from 2.25238
Epoch 48/80

Epoch 00048: val_loss did not improve from 2.25238
Epoch 49/80

Epoch 00049: val_loss did not improve from 2.25238
Epoch 50/80

Epoch 00050: val_loss did not improve from 2.25238
Epoch 51/80

Epoch 00051: val_loss did not improve from 2.25238
Epoch 52/80

Epoch 00052: val_loss did not improve from 2.25238
Epoch 53/80

Epoch 00053: val_loss did not improve f

### Optional: load previously saved model data

In [18]:
# Optional step: swap the in-memory network for the one stored on disk
from keras import models

# Restore the full model saved at model_path
model = models.load_model(filepath=model_path)

#### Model evaluation

In [19]:
# Accuracy on the training split (index 1 of evaluate() is the accuracy metric)
train_score = model.evaluate(X_train, y_train_encoded, verbose=0)
print("Training Accuracy: ", train_score[1])

# Accuracy on the held-out test split; `score` ends up holding the test result
score = model.evaluate(X_test, y_test_encoded, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.1145147437753938
Testing Accuracy:  0.11454753723648406
