In [1]:
import glob
import os
import random

import librosa
import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing  # a bare `import sklearn` does not load its submodules
import soundfile as sf

This notebook describes the pre-processing steps on the audio files and the feature extraction process:

    1) Split data for training/validation (80%) and test (20%)
    2) Data segmentation, verification, and augmentation
    3) Feature selection to generate feature matrix and target vector

# 1. Data split into a train/validation set and a test set

All the raw audio files for the 10 classes are in the `project_audio_raw` folder, and the description of the audio files is located in `file_description.csv`. File names are randomly split between 2 folders: a training/validation folder (80%) and a test folder (20%). The split is stratified by category, so both sets keep the same class proportions:


In [2]:
from sklearn.model_selection import train_test_split

# Main working directory: every other path in the notebook is built from it.
parent_wd = os.getcwd()

# Source and destination folders. Paths are built with os.path.join so they work
# on any OS; the final '' keeps a trailing path separator, so code that appends a
# file name directly to these strings keeps working.
source = os.path.join(parent_wd, 'project_audio_raw', '')
destination_train = os.path.join(parent_wd, 'train_audio', '')
destination_test = os.path.join(parent_wd, 'test_audio', '')

# File names and categories of the raw audio files.
df_infos = pd.read_csv(os.path.join(parent_wd, 'data', 'file_description.csv'))
filenames = df_infos['filename']
category = df_infos['category']

# 80/20 split on the file names, stratified on the category; random_state is
# fixed so the split is the same on every run.
train_names, test_names = train_test_split(
    filenames, train_size=0.8, test_size=0.2, random_state=0, stratify=category)

# 2. Audio data segmentation, rejection, and augmentation

According to the file name list, each corresponding wave file is read with a sampling frequency of 32,000 Hz.  

*  __Segmentation:__ the 5-second input wave file is cut into 9 segments, each lasting 1 second, with 50% overlap (5 consecutive segments plus 4 segments shifted by half a second).  
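*  __Rejection:__ if the peak-to-peak amplitude of the signal (maximum minus minimum) is smaller than the threshold (0.05 for the clean segments, 0.1 for the noise-augmented segments), the segments are rejected.  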
*  __Augmentation:__ random noise is added to every other segment, i.e. the 4 segments that start on a half second (saved with the `_N` suffix).

In [3]:
# NOTE(review): this whole cell is commented out, so it does not run. It is the code
# that writes the 1-second wav segments into the train folder and saves their names
# to 'files_train.npy', which the feature-extraction cells load.
# Points to check before re-enabling it:
#   - `vol_rms` is computed as y.max() - y.min() on the WHOLE file `y`, not on the
#     current segment (`extract` / `extract_2`), so all segments of a file are kept
#     or rejected together; it is a peak-to-peak range, not an RMS value.
#   - the rejection threshold is 0.05 for the clean segments but 0.1 for the '_N' ones.
#   - `noise` is generated but never added to `extract_2`: the '_N' files are written
#     without any added noise.
#   - slicing past the end of `y` returns a shorter (or empty) array, so files shorter
#     than 5 seconds would produce segments shorter than 1 second.
#   - the cell relies on os.chdir, which changes what os.getcwd() returns afterwards.
##train set: go through the filenames for data segmentation, augmentation and rejection
#train_target_vector = np.empty(0)
## extract the file
#for train_name in train_names:
#    os.chdir(source)
#    y, sr = librosa.load(train_name, sr = 32000)
#    name = train_name.split('\\')[-1].split('.')[0]
#    os.chdir(destination_train)
#    for i in np.arange(5):
#        extract = y[sr*i:sr*(i+1)]
#        vol_rms =  y.max() - y.min()
#        # remove the files if no signal is in
#        if vol_rms > 0.05: 
#            extract_name = name + '_C' + str(i) + '.wav' #C stands for clean
#            sf.write(extract_name, extract, sr)
#            train_target_vector = np.append(train_target_vector, [extract_name], axis=0)
#        else:
#            pass
#    #data augmentation with random noise
#    for j in np.arange(4):
#        extract_2 = y[int(sr/2)+j*sr  : int(sr/2) + sr*(j+1)]
#        vol_rms =  y.max() - y.min()
#        if vol_rms > 0.1:
#            noise = np.random.normal(0,0.01,len(extract_2)) # add noise
#            extract_2_name = name + '_N' + str(j) + '.wav' #N stands for noise
#            sf.write(extract_2_name, extract_2, sr)
#            train_target_vector = np.append(train_target_vector, [extract_2_name], axis=0)
#        else:
#            pass
#
#os.chdir(parent_wd+'\\data')
#np.save('files_train.npy', train_target_vector)

In [4]:
# NOTE(review): this whole cell is commented out, so it does not run. It is the code
# that writes the 1-second wav segments into the test folder and saves their names
# to 'files_test.npy', which the feature-extraction cells load.
# Points to check before re-enabling it:
#   - `vol_rms` is computed as y.max() - y.min() on the WHOLE file `y`, not on the
#     current segment (`extract` / `extract_2`), so all segments of a file are kept
#     or rejected together; it is a peak-to-peak range, not an RMS value.
#   - the rejection threshold is 0.05 for the clean segments but 0.1 for the '_N' ones.
#   - `noise` is generated but never added to `extract_2`: the '_N' files are written
#     without any added noise.
#   - the test set gets the same '_N' (augmentation) segments as the train set --
#     TODO confirm that augmenting the test set is intended.
#   - the cell relies on os.chdir, which changes what os.getcwd() returns afterwards.
##test set: go through the filenames for data segmentation, augmentation and rejection
#test_target_vector = np.empty(0)
## extract the file
#for test_name in test_names:
#    os.chdir(source)
#    y, sr = librosa.load(test_name, sr = 32000)
#    name = test_name.split('\\')[-1].split('.')[0]
#    os.chdir(destination_test)
#    for i in np.arange(5):
#        extract = y[sr*i:sr*(i+1)]
#        vol_rms =  y.max() - y.min()
#        # remove the files if no signal is in
#        if vol_rms > 0.05: 
#            extract_name = name + '_C' + str(i) + '.wav' #C stands for clean
#            sf.write(extract_name, extract, sr)
#            test_target_vector = np.append(test_target_vector, [extract_name], axis=0)
#        else:
#            pass
#    #data augmentation with random noise
#    for j in np.arange(4):
#        extract_2 = y[int(sr/2)+j*sr  : int(sr/2) + sr*(j+1)]
#        vol_rms =  y.max() - y.min()
#        if vol_rms > 0.1:
#            noise = np.random.normal(0,0.01,len(extract_2)) # add noise
#            extract_2_name = name + '_N' + str(j) + '.wav' #N stands for noise
#            sf.write(extract_2_name, extract_2, sr)
#            test_target_vector = np.append(test_target_vector, [extract_2_name], axis=0)
#        else:
#            pass
#
#os.chdir(parent_wd+'\\data')
#np.save('files_test.npy', test_target_vector)

# 3. Features extraction

## 3.1 MFCCs setting 1: 63 x 63

In [5]:
# Settings for the 63 x 63 MFCC features.
sampling_f = 32000  # sampling frequency in Hz
sample_length = 1  # sample duration in seconds
hop_length = 512  # number of samples between 2 analysis points
n_mfcc = 63  # number of coefficients for the mfcc analysis

# Number of frames for each sample. It is derived instead of hard-coded so it
# cannot drift from the settings above: with librosa's default centred framing,
# a signal of N samples yields 1 + N // hop_length frames.
n_frames = 1 + int(sampling_f * sample_length) // hop_length

row_length = n_mfcc * n_frames  # length of one flattened feature row
print('row length: ', row_length)
print('number of frames: ', n_frames)

row length:  3969
number of frames:  63


## Train Set 

In [6]:
# Folders used by this cell. Absolute paths are built from `parent_wd` so the
# working directory never has to change (an os.chdir here would alter what
# os.getcwd() returns if the data-split cell is run again).
train_dir = os.path.join(parent_wd, 'train_audio')
data_dir = os.path.join(parent_wd, 'data')

# Names of the segments saved by the segmentation step.
# NOTE(review): nothing below reads `meta_data`; the wav files are listed from disk.
meta_data = np.load(os.path.join(data_dir, 'files_train.npy'))

# All the wav files of the 'train_audio' folder (bare file names), sorted so the
# row order of the saved arrays is the same on every run and every OS.
filenames = sorted(os.path.basename(path)
                   for path in glob.glob(os.path.join(train_dir, '*.wav')))

# Label code found in the file name -> class index (based on the documentation).
dictionary = {'14':0, '13': 1, '10': 2, '11': 3, '16':4, '43': 5, '46':6, '44':7, '42':8, '45':9}

# Extract features: rows are collected in lists and stacked once at the end
# (np.append inside the loop would copy the whole matrix at every iteration).
feature_rows, labels = [], []
for filename in filenames:
    raw_sound, sample_rate = librosa.load(os.path.join(train_dir, filename), sr=sampling_f)
    # get the MFCCs
    mfccs = librosa.feature.mfcc(y=raw_sound, sr=sample_rate, n_mfcc=n_mfcc, hop_length=hop_length)
    if mfccs.shape != (n_mfcc, n_frames):
        # a segment that is not exactly `sample_length` long cannot fill a row
        raise ValueError('{}: expected MFCCs of shape {} but got {}'.format(
            filename, (n_mfcc, n_frames), mfccs.shape))
    mfccs_flat = mfccs.reshape(1, row_length)
    # scale the flattened features of this file to zero mean / unit variance
    feature_rows.append(sklearn.preprocessing.scale(mfccs_flat, axis=1))
    # the label code is the last '-' separated field before the first '_'
    labels.append(filename.split('_')[0].split('-')[-1])

feature_matrix = np.vstack(feature_rows) if feature_rows else np.empty((0, row_length))
labels = np.array(labels)

# Change the labels according to the dictionary. An unknown code is reported
# instead of being silently kept as a string in the target vector.
unknown_labels = sorted(set(labels.tolist()) - set(dictionary))
if unknown_labels:
    raise ValueError('labels missing from the dictionary: {}'.format(unknown_labels))
y = pd.Series(labels).map(dictionary).astype('int64')

# save the data
np.save(os.path.join(data_dir, 'X_63_train.npy'), feature_matrix)
np.save(os.path.join(data_dir, 'y_63_train.npy'), y)

## Test Set 

In [7]:
# Folders used by this cell. Absolute paths are built from `parent_wd` so the
# working directory never has to change (an os.chdir here would alter what
# os.getcwd() returns if the data-split cell is run again).
test_dir = os.path.join(parent_wd, 'test_audio')
data_dir = os.path.join(parent_wd, 'data')

# Names of the segments saved by the segmentation step.
# NOTE(review): nothing below reads `meta_data`; the wav files are listed from disk.
meta_data = np.load(os.path.join(data_dir, 'files_test.npy'))

# All the wav files of the 'test_audio' folder (bare file names), sorted so the
# row order of the saved arrays is the same on every run and every OS.
filenames = sorted(os.path.basename(path)
                   for path in glob.glob(os.path.join(test_dir, '*.wav')))

# Label code found in the file name -> class index (based on the documentation).
dictionary = {'14':0, '13': 1, '10': 2, '11': 3, '16':4, '43': 5, '46':6, '44':7, '42':8, '45':9}

# Extract features: rows are collected in lists and stacked once at the end
# (np.append inside the loop would copy the whole matrix at every iteration).
feature_rows, labels = [], []
for filename in filenames:
    raw_sound, sample_rate = librosa.load(os.path.join(test_dir, filename), sr=sampling_f)
    # get the MFCCs
    mfccs = librosa.feature.mfcc(y=raw_sound, sr=sample_rate, n_mfcc=n_mfcc, hop_length=hop_length)
    if mfccs.shape != (n_mfcc, n_frames):
        # a segment that is not exactly `sample_length` long cannot fill a row
        raise ValueError('{}: expected MFCCs of shape {} but got {}'.format(
            filename, (n_mfcc, n_frames), mfccs.shape))
    mfccs_flat = mfccs.reshape(1, row_length)
    # scale the flattened features of this file to zero mean / unit variance
    feature_rows.append(sklearn.preprocessing.scale(mfccs_flat, axis=1))
    # the label code is the last '-' separated field before the first '_'
    labels.append(filename.split('_')[0].split('-')[-1])

feature_matrix = np.vstack(feature_rows) if feature_rows else np.empty((0, row_length))
labels = np.array(labels)

# Change the labels according to the dictionary. An unknown code is reported
# instead of being silently kept as a string in the target vector.
unknown_labels = sorted(set(labels.tolist()) - set(dictionary))
if unknown_labels:
    raise ValueError('labels missing from the dictionary: {}'.format(unknown_labels))
y = pd.Series(labels).map(dictionary).astype('int64')

# save the data
np.save(os.path.join(data_dir, 'X_63_test.npy'), feature_matrix)
np.save(os.path.join(data_dir, 'y_63_test.npy'), y)

## 3.2 MFCCs setting 2: 32 x 32

In [8]:
# Settings for the 32 x 32 MFCC features.
sampling_f = 32000  # sampling frequency in Hz
sample_length = 1  # sample duration in seconds
hop_length = 1024  # number of samples between 2 analysis points
n_mfcc = 32  # number of coefficients for the mfcc analysis

# Number of frames for each sample. It is derived instead of hard-coded so it
# cannot drift from the settings above: with librosa's default centred framing,
# a signal of N samples yields 1 + N // hop_length frames.
n_frames = 1 + int(sampling_f * sample_length) // hop_length

row_length = n_mfcc * n_frames  # length of one flattened feature row
print('row length: ', row_length)
print('number of frames: ', n_frames)

row length:  1024
number of frames:  32


## Train set

In [9]:
# Folders used by this cell. Absolute paths are built from `parent_wd` so the
# working directory never has to change (an os.chdir here would alter what
# os.getcwd() returns if the data-split cell is run again).
train_dir = os.path.join(parent_wd, 'train_audio')
data_dir = os.path.join(parent_wd, 'data')

# Names of the segments saved by the segmentation step.
# NOTE(review): nothing below reads `meta_data`; the wav files are listed from disk.
meta_data = np.load(os.path.join(data_dir, 'files_train.npy'))

# All the wav files of the 'train_audio' folder (bare file names), sorted so the
# row order of the saved arrays is the same on every run and every OS.
filenames = sorted(os.path.basename(path)
                   for path in glob.glob(os.path.join(train_dir, '*.wav')))

# Label code found in the file name -> class index (based on the documentation).
dictionary = {'14':0, '13': 1, '10': 2, '11': 3, '16':4, '43': 5, '46':6, '44':7, '42':8, '45':9}

# Extract features: rows are collected in lists and stacked once at the end
# (np.append inside the loop would copy the whole matrix at every iteration).
feature_rows, labels = [], []
for filename in filenames:
    raw_sound, sample_rate = librosa.load(os.path.join(train_dir, filename), sr=sampling_f)
    # get the MFCCs
    mfccs = librosa.feature.mfcc(y=raw_sound, sr=sample_rate, n_mfcc=n_mfcc, hop_length=hop_length)
    if mfccs.shape != (n_mfcc, n_frames):
        # a segment that is not exactly `sample_length` long cannot fill a row
        raise ValueError('{}: expected MFCCs of shape {} but got {}'.format(
            filename, (n_mfcc, n_frames), mfccs.shape))
    mfccs_flat = mfccs.reshape(1, row_length)
    # scale the flattened features of this file to zero mean / unit variance
    feature_rows.append(sklearn.preprocessing.scale(mfccs_flat, axis=1))
    # the label code is the last '-' separated field before the first '_'
    labels.append(filename.split('_')[0].split('-')[-1])

feature_matrix = np.vstack(feature_rows) if feature_rows else np.empty((0, row_length))
labels = np.array(labels)

# Change the labels according to the dictionary. An unknown code is reported
# instead of being silently kept as a string in the target vector.
unknown_labels = sorted(set(labels.tolist()) - set(dictionary))
if unknown_labels:
    raise ValueError('labels missing from the dictionary: {}'.format(unknown_labels))
y = pd.Series(labels).map(dictionary).astype('int64')

# save the data
np.save(os.path.join(data_dir, 'X_32_train.npy'), feature_matrix)
np.save(os.path.join(data_dir, 'y_32_train.npy'), y)

## Test set

In [10]:
# Folders used by this cell. Absolute paths are built from `parent_wd` so the
# working directory never has to change (an os.chdir here would alter what
# os.getcwd() returns if the data-split cell is run again).
test_dir = os.path.join(parent_wd, 'test_audio')
data_dir = os.path.join(parent_wd, 'data')

# Names of the segments saved by the segmentation step.
# NOTE(review): nothing below reads `meta_data`; the wav files are listed from disk.
meta_data = np.load(os.path.join(data_dir, 'files_test.npy'))

# All the wav files of the 'test_audio' folder (bare file names), sorted so the
# row order of the saved arrays is the same on every run and every OS.
filenames = sorted(os.path.basename(path)
                   for path in glob.glob(os.path.join(test_dir, '*.wav')))

# Label code found in the file name -> class index (based on the documentation).
dictionary = {'14':0, '13': 1, '10': 2, '11': 3, '16':4, '43': 5, '46':6, '44':7, '42':8, '45':9}

# Extract features: rows are collected in lists and stacked once at the end
# (np.append inside the loop would copy the whole matrix at every iteration).
feature_rows, labels = [], []
for filename in filenames:
    raw_sound, sample_rate = librosa.load(os.path.join(test_dir, filename), sr=sampling_f)
    # get the MFCCs
    mfccs = librosa.feature.mfcc(y=raw_sound, sr=sample_rate, n_mfcc=n_mfcc, hop_length=hop_length)
    if mfccs.shape != (n_mfcc, n_frames):
        # a segment that is not exactly `sample_length` long cannot fill a row
        raise ValueError('{}: expected MFCCs of shape {} but got {}'.format(
            filename, (n_mfcc, n_frames), mfccs.shape))
    mfccs_flat = mfccs.reshape(1, row_length)
    # scale the flattened features of this file to zero mean / unit variance
    feature_rows.append(sklearn.preprocessing.scale(mfccs_flat, axis=1))
    # the label code is the last '-' separated field before the first '_'
    labels.append(filename.split('_')[0].split('-')[-1])

feature_matrix = np.vstack(feature_rows) if feature_rows else np.empty((0, row_length))
labels = np.array(labels)

# Change the labels according to the dictionary. An unknown code is reported
# instead of being silently kept as a string in the target vector.
unknown_labels = sorted(set(labels.tolist()) - set(dictionary))
if unknown_labels:
    raise ValueError('labels missing from the dictionary: {}'.format(unknown_labels))
y = pd.Series(labels).map(dictionary).astype('int64')

# save the data
np.save(os.path.join(data_dir, 'X_32_test.npy'), feature_matrix)
np.save(os.path.join(data_dir, 'y_32_test.npy'), y)

## 3.3 Three channels: Mel-spectrogram, MFCCs, and delta MFCCs

In [11]:
# Settings for the 3-channel features (mel-spectrogram, MFCCs, delta MFCCs).
sampling_f = 32000  # sampling frequency in Hz
sample_length = 1  # sample duration in seconds
hop_length = 1024  # number of samples between 2 analysis points
n_mfcc = 32  # number of coefficients for the mfcc analysis

# Number of frames for each sample. It is derived instead of hard-coded so it
# cannot drift from the settings above: with librosa's default centred framing,
# a signal of N samples yields 1 + N // hop_length frames.
n_frames = 1 + int(sampling_f * sample_length) // hop_length

row_length = n_mfcc * n_frames  # length of one flattened feature row
print('row length: ', row_length)
print('number of frames: ', n_frames)

row length:  1024
number of frames:  32


## Train set

In [12]:
# Folders used by this cell. Absolute paths are built from `parent_wd` so the
# working directory never has to change (an os.chdir here would alter what
# os.getcwd() returns if the data-split cell is run again).
train_dir = os.path.join(parent_wd, 'train_audio')
data_dir = os.path.join(parent_wd, 'data')

# Names of the segments saved by the segmentation step.
# NOTE(review): nothing below reads `meta_data`; the wav files are listed from disk.
meta_data = np.load(os.path.join(data_dir, 'files_train.npy'))

# All the wav files of the 'train_audio' folder (bare file names), sorted so the
# row order of the saved arrays is the same on every run and every OS.
filenames = sorted(os.path.basename(path)
                   for path in glob.glob(os.path.join(train_dir, '*.wav')))

# Label code found in the file name -> class index (based on the documentation).
dictionary = {'14':0, '13': 1, '10': 2, '11': 3, '16':4, '43': 5, '46':6, '44':7, '42':8, '45':9}

# The three channels share one row length, so the mel-spectrogram needs as many
# bands as there are MFCC coefficients.
n_mels = n_mfcc

# Extract features: one (1, row_length, 3) block per file, concatenated once at
# the end (concatenating inside the loop would copy the whole array every time).
channel_blocks, labels = [], []
for filename in filenames:
    raw_sound, sample_rate = librosa.load(os.path.join(train_dir, filename), sr=sampling_f)

    ## feature 1 MELSPEC
    # `sr` must be passed explicitly: without it librosa builds the mel filter
    # bank for its default 22050 Hz instead of the rate the audio was loaded at.
    melspec = librosa.feature.melspectrogram(
        y=raw_sound, sr=sample_rate, n_mels=n_mels, hop_length=hop_length)

    ## feature 2 MFCCs
    mfccs = librosa.feature.mfcc(y=raw_sound, sr=sample_rate, n_mfcc=n_mfcc, hop_length=hop_length)

    ## feature 3 delta MFCC (first-order difference along the frame axis)
    mfcc_delta = librosa.feature.delta(mfccs, order=1, axis=1)

    # flatten and scale each channel separately
    channels = []
    for channel_name, channel in (('mel-spectrogram', melspec),
                                  ('MFCC', mfccs),
                                  ('delta MFCC', mfcc_delta)):
        if channel.size != row_length:
            # a segment that is not exactly `sample_length` long cannot fill a row
            raise ValueError('{}: {} has shape {}, expected {} values'.format(
                filename, channel_name, channel.shape, row_length))
        channels.append(sklearn.preprocessing.scale(channel.reshape(1, row_length), axis=1))

    # bind in 3d array: three (1, row_length) channels -> (1, row_length, 3)
    channel_blocks.append(np.stack(channels, axis=-1))

    # the label code is the last '-' separated field before the first '_'
    labels.append(filename.split('_')[0].split('-')[-1])

feature_matrix = (np.concatenate(channel_blocks, axis=0) if channel_blocks
                  else np.empty((0, row_length, 3)))
labels = np.array(labels)

# Change the labels according to the dictionary. An unknown code is reported
# instead of being silently kept as a string in the target vector.
unknown_labels = sorted(set(labels.tolist()) - set(dictionary))
if unknown_labels:
    raise ValueError('labels missing from the dictionary: {}'.format(unknown_labels))
y = pd.Series(labels).map(dictionary).astype('int64')

# save the data
np.save(os.path.join(data_dir, 'X_3d_train.npy'), feature_matrix)
np.save(os.path.join(data_dir, 'y_3d_train.npy'), y)

## Test set

In [13]:
# Folders used by this cell. Absolute paths are built from `parent_wd` so the
# working directory never has to change (an os.chdir here would alter what
# os.getcwd() returns if the data-split cell is run again).
test_dir = os.path.join(parent_wd, 'test_audio')
data_dir = os.path.join(parent_wd, 'data')

# Names of the segments saved by the segmentation step.
# NOTE(review): nothing below reads `meta_data`; the wav files are listed from disk.
meta_data = np.load(os.path.join(data_dir, 'files_test.npy'))

# All the wav files of the 'test_audio' folder (bare file names), sorted so the
# row order of the saved arrays is the same on every run and every OS.
filenames = sorted(os.path.basename(path)
                   for path in glob.glob(os.path.join(test_dir, '*.wav')))

# Label code found in the file name -> class index (based on the documentation).
dictionary = {'14':0, '13': 1, '10': 2, '11': 3, '16':4, '43': 5, '46':6, '44':7, '42':8, '45':9}

# The three channels share one row length, so the mel-spectrogram needs as many
# bands as there are MFCC coefficients.
n_mels = n_mfcc

# Extract features: one (1, row_length, 3) block per file, concatenated once at
# the end (concatenating inside the loop would copy the whole array every time).
channel_blocks, labels = [], []
for filename in filenames:
    raw_sound, sample_rate = librosa.load(os.path.join(test_dir, filename), sr=sampling_f)

    ## feature 1 MELSPEC
    # `sr` must be passed explicitly: without it librosa builds the mel filter
    # bank for its default 22050 Hz instead of the rate the audio was loaded at.
    melspec = librosa.feature.melspectrogram(
        y=raw_sound, sr=sample_rate, n_mels=n_mels, hop_length=hop_length)

    ## feature 2 MFCCs
    mfccs = librosa.feature.mfcc(y=raw_sound, sr=sample_rate, n_mfcc=n_mfcc, hop_length=hop_length)

    ## feature 3 delta MFCC (first-order difference along the frame axis)
    mfcc_delta = librosa.feature.delta(mfccs, order=1, axis=1)

    # flatten and scale each channel separately
    channels = []
    for channel_name, channel in (('mel-spectrogram', melspec),
                                  ('MFCC', mfccs),
                                  ('delta MFCC', mfcc_delta)):
        if channel.size != row_length:
            # a segment that is not exactly `sample_length` long cannot fill a row
            raise ValueError('{}: {} has shape {}, expected {} values'.format(
                filename, channel_name, channel.shape, row_length))
        channels.append(sklearn.preprocessing.scale(channel.reshape(1, row_length), axis=1))

    # bind in 3d array: three (1, row_length) channels -> (1, row_length, 3)
    channel_blocks.append(np.stack(channels, axis=-1))

    # the label code is the last '-' separated field before the first '_'
    labels.append(filename.split('_')[0].split('-')[-1])

feature_matrix = (np.concatenate(channel_blocks, axis=0) if channel_blocks
                  else np.empty((0, row_length, 3)))
labels = np.array(labels)

# Change the labels according to the dictionary. An unknown code is reported
# instead of being silently kept as a string in the target vector.
unknown_labels = sorted(set(labels.tolist()) - set(dictionary))
if unknown_labels:
    raise ValueError('labels missing from the dictionary: {}'.format(unknown_labels))
y = pd.Series(labels).map(dictionary).astype('int64')

# save the data
np.save(os.path.join(data_dir, 'X_3d_test.npy'), feature_matrix)
np.save(os.path.join(data_dir, 'y_3d_test.npy'), y)