##  Classifying audio data with convolutional neural networks

<br/>
by Eduardo Garcia Rajo<br/>
<br/>

This notebook is part of the project "Urban sounds classification with Convolutional Neural Networks" on [my Github](https://github.com/GorillaBus/urban-audio-classifier).<br/>
<br/>
Licensed under the GNU LESSER GENERAL PUBLIC LICENSE Version 3, 29 June 2007<br/>
<br/>

# Augmented data pre-processing

MFCC and Log-Mel Spectrogram coefficient extraction from the augmented dataset.<br/>
<br/>

In [1]:
# Required libraries
import sys
import os
import IPython as IP
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import pickle
from include import helpers
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from IPython.display import clear_output, display

Using TensorFlow backend.


#### * Setup
Pay attention to the very simple path variables configured in this section.

In [2]:
# Dataset locations -- point the UrbanSound8K root at your local copy
us8k_path = os.path.abspath('./UrbanSound8K')
audio_path = os.path.join(us8k_path, 'audio')
augmented_path = os.path.join(us8k_path, 'audio', 'augmented')

# Metadata CSV for the augmented dataset
metadata_augmented_path = os.path.abspath('data/augmented-data.csv')

#### * Load metadata

In [3]:
# Read the augmented-dataset metadata (one row per audio file) from the CSV
metadata = pd.read_csv(metadata_augmented_path)

# Report the row count, then show the last rows as the cell's output
print("Metadata length:", metadata.shape[0])
metadata.tail()

Metadata length: 69856


Unnamed: 0,augment,class,class_id,file,fold
69851,pitch_2,car_horn,1,199769-1-0-6.wav,3
69852,pitch_2,car_horn,1,18594-1-1-0.wav,3
69853,pitch_2,car_horn,1,151359-1-0-0.wav,3
69854,pitch_2,car_horn,1,18594-1-0-0.wav,3
69855,pitch_2,car_horn,1,199769-1-0-12.wav,3


#### 1. MFCC extraction

In [5]:
# Iterate through all audio files and extract MFCC
features = []       # per-file MFCC arrays, still unpadded
labels = []         # class name for each entry in `features`, same order
frames_max = 0      # largest frame count seen; the padding cell uses it as target width
counter = 0         # number of files processed so far
total_samples = len(metadata)
n_mfcc = 40

# Resolve the audio root once instead of on every iteration
audio_root = os.path.abspath(audio_path)

for index, row in metadata.iterrows():
    file_path = os.path.join(audio_root, 'fold' + str(row["fold"]), str(row["file"]))
    class_label = row["class"]

    # Extract MFCCs (do not add padding)
    mfccs = helpers.get_mfcc(file_path, 0, n_mfcc)

    # Save current frame count (axis 1 is used as the frame axis)
    num_frames = mfccs.shape[1]

    # Add row (feature / label)
    features.append(mfccs)
    labels.append(class_label)

    # Update frames maximum
    frames_max = max(frames_max, num_frames)

    counter += 1

    # Report progress from the processed-file counter rather than the
    # DataFrame index label, which is only equal to position for a default index
    clear_output(wait=True)
    print("Progress: {}/{}".format(counter, total_samples))
    print("Last file: ", file_path)

# Use the counter here as well: `index` is the last row label (off by one
# for a 0-based index) and is undefined when the metadata is empty
print("Finished: {}/{}".format(counter, total_samples))

Progress: 69856/69856
Last file:  /home/edu/Projects/urban-audio-classifier/UrbanSound8K/audio/fold3/199769-1-0-12.wav
Finished: 69855/69856


#### 2. Add padding

In [6]:
padded = []

# Add padding: right-pad every feature matrix with zeros along the frame
# axis (axis 1) so that all of them end up `mfcc_max_padding` frames wide
mfcc_max_padding = frames_max
for feature in features:
    size = feature.shape[1]
    pad_width = mfcc_max_padding - size

    # Start from the feature itself so that entries already at full width
    # are kept as they are. Previously such entries appended the padded
    # array left over from an earlier iteration (or raised NameError when
    # the very first feature was already full width).
    px = feature
    if pad_width > 0:
        px = np.pad(feature,
                    pad_width=((0, 0), (0, pad_width)),
                    mode='constant',
                    constant_values=(0,))

    padded.append(px)

#### 3. Save X and y

In [7]:
# Build the NumPy arrays: X from the padded features, y from the class labels
X, y = np.array(padded), np.array(labels)

# Write both arrays to disk (np.save appends the ".npy" extension)
np.save("data/X-mfcc-augmented", arr=X)
np.save("data/y-mfcc-augmented", arr=y)

In [8]:
# Verify shapes: raw features, padded features and labels must all have
# the same length, and X / y must share their first dimension
print("Raw features length: {}".format(len(features)))
print("Padded features length: {}".format(len(padded)))
# Report the labels list itself (this line previously printed len(features),
# which could never reveal a feature/label mismatch)
print("Feature labels length: {}".format(len(labels)))
print("X: {}, y: {}".format(X.shape, y.shape))


Raw features length: 69856
Padded features length: 69856
Feature labels length: 69856
X: (69856, 40, 174), y: (69856,)


#### 4. Log-Mel Spectrogram extraction

In [4]:
# Iterate through all audio files and extract Log-Mel Spectrograms
features = []       # per-file Log-Mel arrays, still unpadded
labels = []         # class name for each entry in `features`, same order
frames_max = 0      # largest frame count seen; the padding cell uses it as target width
counter = 0         # number of files processed so far
total_samples = len(metadata)
n_mels = 40

# Resolve the audio root once instead of on every iteration
audio_root = os.path.abspath(audio_path)

for index, row in metadata.iterrows():
    file_path = os.path.join(audio_root, 'fold' + str(row["fold"]), str(row["file"]))
    class_label = row["class"]

    # Extract Log-Mel Spectrograms (do not add padding)
    mels = helpers.get_mel_spectogram(file_path, 0, n_mels=n_mels)

    # Save current frame count (axis 1 is used as the frame axis)
    num_frames = mels.shape[1]

    # Add row (feature / label)
    features.append(mels)
    labels.append(class_label)

    # Update frames maximum
    frames_max = max(frames_max, num_frames)

    counter += 1

    # Report progress from the processed-file counter rather than the
    # DataFrame index label, which is only equal to position for a default index
    clear_output(wait=True)
    print("Progress: {}/{}".format(counter, total_samples))
    print("Last file: ", file_path)

# Use the counter here as well: `index` is the last row label (off by one
# for a 0-based index) and is undefined when the metadata is empty
print("Finished: {}/{}".format(counter, total_samples))

Progress: 69856/69856
Last file:  /home/edu/Projects/urban-audio-classifier/UrbanSound8K/audio/fold3/199769-1-0-12.wav
Finished: 69855/69856


#### 5. Add padding for a consistent shape

In [5]:
padded = []

# Add padding: right-pad every feature matrix with zeros along the frame
# axis (axis 1) so that all of them end up `mels_max_padding` frames wide
mels_max_padding = frames_max
for feature in features:
    size = feature.shape[1]
    pad_width = mels_max_padding - size

    # Start from the feature itself so that entries already at full width
    # are kept as they are. Previously such entries appended the padded
    # array left over from an earlier iteration (or raised NameError when
    # the very first feature was already full width).
    px = feature
    if pad_width > 0:
        px = np.pad(feature,
                    pad_width=((0, 0), (0, pad_width)),
                    mode='constant',
                    constant_values=(0,))

    padded.append(px)

#### 6. Save X and y

In [6]:
# Build the NumPy arrays: X from the padded features, y from the class labels
X, y = np.array(padded), np.array(labels)

# Write both arrays to disk (np.save appends the ".npy" extension)
np.save("data/X-mel_spec-augmented", arr=X)
np.save("data/y-mel_spec-augmented", arr=y)

In [7]:
# Verify shapes: raw features, padded features and labels must all have
# the same length, and X / y must share their first dimension
print("Raw features length: {}".format(len(features)))
print("Padded features length: {}".format(len(padded)))
# Report the labels list itself (this line previously printed len(features),
# which could never reveal a feature/label mismatch)
print("Feature labels length: {}".format(len(labels)))
print("X: {}, y: {}".format(X.shape, y.shape))


Raw features length: 69856
Padded features length: 69856
Feature labels length: 69856
X: (69856, 40, 174), y: (69856,)
