# Feature Extraction for the Convolutional Neural Network model

Importing the necessary libraries and setting some variables that we will use in all of the feature extraction processes:

In [1]:
import librosa
import numpy as np
import pickle
import pandas as pd
import os

### Procedures to save and load extracted data from and to the disk

Throughout this notebook, we will use the following procedures to save intermediate steps of our processed data to disk, so that we can load them later if we need to.

These functions take advantage of Python's _pickle_ library, which allows us to save the content of our variables in binary files with the _.pkl_ extension, and to load them back with the same structure they were saved with (dictionary, NumPy array, class object, ...).

In [2]:
def save_pkl(data, path):
    """Serialize `data` to `path` as a binary pickle file.

    Args:
        data: Any picklable Python object (DataFrame, dict, NumPy array, ...).
        path: Destination file path; an existing file is overwritten.
    """
    # The `with` statement closes the file on exit, so no explicit close() is needed.
    with open(path, "wb") as pkl_file:
        pickle.dump(data, pkl_file)

def load_pkl(path):
    """Load and return the Python object stored in the pickle file at `path`.

    Args:
        path: Path to a file previously written with `pickle.dump`.

    Returns:
        The unpickled object, with the structure it was saved with.

    Note:
        Unpickling can execute arbitrary code; only load files you created
        yourself or otherwise trust.
    """
    # The `with` statement closes the file on exit, so no explicit close() is needed.
    with open(path, "rb") as pkl_file:
        return pickle.load(pkl_file)

## Data Preparation

### Loading the Data's Classifications

In [54]:
# Read the dataset's metadata CSV into a DataFrame and display it.
data_info = pd.read_csv(
    filepath_or_buffer="./UrbanSound8K/metadata/UrbanSound8K.csv",
)
data_info


Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.000000,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.500000,62.500000,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.500000,64.500000,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.000000,67.000000,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.500000,72.500000,1,5,2,children_playing
...,...,...,...,...,...,...,...,...
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.209850,255.741948,2,7,1,car_horn


### Resampling and Zero Padding

First, we will resample all of our raw audio data so that the resulting data has a sample rate of 44.1 kHz. We will also zero-pad (or truncate) it so that every data point represents exactly 4 seconds of audio.

All data will be saved separately by fold.

We will also save the corresponding .wav file name such that we can correctly obtain its fold and classification in the dataset metadata CSV.

In [4]:
# Shared configuration for every feature-extraction step in this notebook.
FOLDS_PATH = "UrbanSound8K/audio/"
DURATION = 4 # 4 seconds for each audio file
SAMPLE_RATE = 44100 # target sample rate, in Hz
HOP_LENGTH = round(SAMPLE_RATE * 0.0125) # hop between frames: 12.5 ms expressed in samples
WIN_LENGTH = round(SAMPLE_RATE * 0.023) # analysis window: 23 ms expressed in samples
N_FFT = 2**10 # FFT size
# Derived from DURATION (instead of a hard-coded 4) so it stays in sync with the clip length.
# Presumably the number of frames of a centered STFT over a DURATION-second clip
# (1 + n_samples // hop_length) -- confirm against the extracted feature shapes.
TIME_SIZE = DURATION * SAMPLE_RATE // HOP_LENGTH + 1

In [5]:
def zero_pad(audio_file_path):
    """Load an audio file, resample it to SAMPLE_RATE and force it to DURATION seconds.

    Clips shorter than DURATION seconds are padded with trailing zeros; longer
    clips are truncated.

    Args:
        audio_file_path: Path of the audio file to load.

    Returns:
        The resampled signal with exactly DURATION * SAMPLE_RATE samples
        (assumes librosa.load returns a 1-D mono signal, its default).
    """
    target_length = DURATION * SAMPLE_RATE
    # sr=None keeps the file's native sample rate; the resampling is done explicitly below
    signal, sample_rate = librosa.load(audio_file_path, sr=None)
    # resample the sample rate to the target value of SR
    signal = librosa.resample(signal, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
    if len(signal) < target_length:
        # zero padding; np.pad keeps the signal's dtype, whereas concatenating with
        # float64 zeros upcast only the padded clips and left the others untouched
        signal = np.pad(signal, (0, target_length - len(signal)))
    elif len(signal) > target_length:
        signal = signal[:target_length]
    return signal

In [6]:
# Zero-pad every audio file of every fold and save one DataFrame per fold.
# Make sure the output directory exists before the first save.
os.makedirs("features/zero_pad", exist_ok=True)

folds = [fold for fold in os.listdir(FOLDS_PATH) if "fold" in fold]
for fold in folds:
    print(fold)
    audio_files = librosa.util.find_files(os.path.join(FOLDS_PATH, fold))
    # os.path.basename extracts the .wav file name on every OS; splitting on "\\"
    # only worked on Windows and left the full path as the id elsewhere, which
    # breaks the later merge with the metadata on the slice file name.
    df_data = [
        {'id': os.path.basename(audio_file_path), 'zero_padded_data': zero_pad(audio_file_path)}
        for audio_file_path in audio_files
    ]
    df = pd.DataFrame(data=df_data, columns=['id', 'zero_padded_data'])
    save_pkl(df, f"features/zero_pad/{fold}_csv.pkl")
    # memory management
    del df_data
    del df

fold1


fold10
fold2
fold3
fold4
fold5
fold6
fold7
fold8
fold9


## Extraction of the 2D features

For this task, we will take advantage of the Librosa library's capabilities and extract the following 2D features from the .wav files available in the _UrbanSound8K_ dataset:
- Chromagram
- Mel-scaled Spectrogram
- Short-time Fourier transform Tempogram

The following functions allow us to extract, in order, the aforementioned features.

In [7]:
def chromagram(audio_data):
    """Compute the chromagram of `audio_data` with librosa's `chroma_stft`.

    Uses 12 chroma bins and the notebook-wide SAMPLE_RATE, N_FFT, HOP_LENGTH
    and WIN_LENGTH settings.
    """
    N_CHROMA = 12
    stft_settings = dict(n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH)
    return librosa.feature.chroma_stft(
        y=audio_data,
        n_chroma=N_CHROMA,
        sr=SAMPLE_RATE,
        **stft_settings,
    )

In [8]:
def mel_spectogram(audio_data):
    """Compute the mel-scaled spectrogram of `audio_data` with librosa's `melspectrogram`.

    Uses the notebook-wide SAMPLE_RATE, N_FFT, HOP_LENGTH and WIN_LENGTH settings.
    """
    stft_settings = dict(n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH)
    return librosa.feature.melspectrogram(y=audio_data, sr=SAMPLE_RATE, **stft_settings)

In [9]:
def fourier_tempogram(audio_data):
    """Return the magnitude of librosa's Fourier tempogram of `audio_data`.

    The absolute value of the result of `librosa.feature.fourier_tempogram`
    is taken before returning it.
    """
    # NOTE(review): librosa documents this function's win_length in onset-envelope
    # frames rather than audio samples -- confirm WIN_LENGTH is the intended value.
    raw_tempogram = librosa.feature.fourier_tempogram(
        y=audio_data,
        sr=SAMPLE_RATE,
        hop_length=HOP_LENGTH,
        win_length=WIN_LENGTH,
    )
    return np.abs(raw_tempogram)

Now, we extract the features and save the obtained data.

In [10]:
# Extract the 2D features of every zero-padded clip and save one DataFrame per fold.
# Make sure the output directory exists before the first save.
os.makedirs("features/extracted_2d", exist_ok=True)

for fold_number in range(1, 10+1):
    fold = f"fold{fold_number}"
    fold_df = load_pkl(f"features/zero_pad/{fold}_csv.pkl")
    print(fold)
    df_extracted_data = []
    # a distinct name for the row index avoids shadowing the fold counter
    for row in range(fold_df.shape[0]):
        # column 0 holds the file name, column 1 the zero-padded signal
        audio_data = fold_df.iloc[row, 1]
        extracted_data = {
            'id': fold_df.iloc[row, 0],
            'chromagram': chromagram(audio_data),
            'mel_spectogram': mel_spectogram(audio_data),
            'fourier_tempogram': fourier_tempogram(audio_data)
        }
        df_extracted_data.append(extracted_data)
    df = pd.DataFrame(data=df_extracted_data, columns=['id', 'chromagram', 'mel_spectogram', 'fourier_tempogram'])
    save_pkl(df, f"features/extracted_2d/extracted_2d_{fold}_csv.pkl")
    # memory management
    del fold_df
    del df_extracted_data
    del df

fold1


  return pitch_tuning(


fold2


  return pitch_tuning(


fold3


  return pitch_tuning(


fold4


  return pitch_tuning(


fold5


  return pitch_tuning(


fold6


  return pitch_tuning(


fold7


  return pitch_tuning(


fold8


  return pitch_tuning(


fold9




fold10


  return pitch_tuning(


## Extraction of the 1D features

We will take advantage of the Librosa library's capabilities once again to extract some 1D features from the .wav files. The 1D features to be extracted are:
- Spectral Centroid
- Spectral Bandwidth
- Spectral Flatness
- Spectral Rolloff

The following functions allow us to extract, in order, the aforementioned features.

In [11]:
def spectral_centroid(audio_data):
    """Compute the spectral centroid of `audio_data` with librosa.

    Uses the notebook-wide SAMPLE_RATE, N_FFT, HOP_LENGTH and WIN_LENGTH settings.
    """
    stft_settings = dict(n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH)
    return librosa.feature.spectral_centroid(y=audio_data, sr=SAMPLE_RATE, **stft_settings)

In [12]:
def spectral_bandwidth(audio_data):
    """Compute the spectral bandwidth of `audio_data` with librosa.

    Uses the notebook-wide SAMPLE_RATE, N_FFT, HOP_LENGTH and WIN_LENGTH settings.
    """
    stft_settings = dict(n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH)
    return librosa.feature.spectral_bandwidth(y=audio_data, sr=SAMPLE_RATE, **stft_settings)

In [13]:
def spectral_flatness(audio_data):
    """Compute the spectral flatness of `audio_data` with librosa.

    Uses the notebook-wide N_FFT, HOP_LENGTH and WIN_LENGTH settings; no sample
    rate is passed to this librosa call.
    """
    stft_settings = dict(n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH)
    return librosa.feature.spectral_flatness(y=audio_data, **stft_settings)

In [14]:
def spectral_rolloff(audio_data):
    """Compute the spectral roll-off of `audio_data` with librosa.

    Uses the notebook-wide SAMPLE_RATE, N_FFT, HOP_LENGTH and WIN_LENGTH settings.
    """
    stft_settings = dict(n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH)
    return librosa.feature.spectral_rolloff(y=audio_data, sr=SAMPLE_RATE, **stft_settings)

In [16]:
# Extract the 1D features of every zero-padded clip and save one DataFrame per fold.
# Make sure the output directory exists before the first save.
os.makedirs("features/extracted_1d", exist_ok=True)

for fold_number in range(1,10+1):
    fold = f"fold{fold_number}"
    fold_df = load_pkl(f"features/zero_pad/{fold}_csv.pkl")
    print(fold)
    df_extracted_data = []
    # a distinct name for the row index avoids shadowing the fold counter
    for row in range(fold_df.shape[0]):
        # column 0 holds the file name, column 1 the zero-padded signal
        audio_data = fold_df.iloc[row, 1]
        extracted_data = {
            'id': fold_df.iloc[row, 0],
            'spectral_centroid': spectral_centroid(audio_data),
            'spectral_bandwidth': spectral_bandwidth(audio_data),
            'spectral_flatness': spectral_flatness(audio_data),
            'spectral_rolloff': spectral_rolloff(audio_data)
        }
        df_extracted_data.append(extracted_data)
    df = pd.DataFrame(data=df_extracted_data, columns=['id', 'spectral_centroid', 'spectral_bandwidth', 'spectral_flatness', 'spectral_rolloff'])
    save_pkl(df, f"features/extracted_1d/extracted_1d_{fold}_csv.pkl")
    # memory management
    del fold_df
    del df_extracted_data
    del df

fold1
fold2
fold3
fold4
fold5
fold6
fold7
fold8
fold9
fold10


## Normalizing the data

As the last step of the Data Pre-Processing and Feature Extraction phases for our CNN model, we are going to normalize all extracted values, feature by feature, using Min-Max scaling.

In [48]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max normalize every 2D feature, fold by fold, and save the result.
# Make sure the output directory exists before the first save.
os.makedirs("features/normalized_feats", exist_ok=True)

print("2D")
for fold_number in range(1,10+1):
    fold = f"fold{fold_number}"
    fold_df: pd.DataFrame = load_pkl(f"features/extracted_2d/extracted_2d_{fold}_csv.pkl")
    print(fold)
    cols = ['chromagram', 'mel_spectogram', 'fourier_tempogram']
    for col in cols:
        feature_arrays = list(fold_df[col])
        # stack the rows of every sample into one 2D matrix so a single scaler is fitted per feature
        stacked_values = np.vstack(feature_arrays)
        scaler = MinMaxScaler()
        normalized_values = scaler.fit_transform(stacked_values)
        # Each sample owns len(value) consecutive rows of the stacked matrix, so the
        # split points are the cumulative row counts. Slicing with the sample index
        # as the offset (i:i+len(value)) handed every sample after the first rows
        # that belong to other samples.
        row_counts = [len(value) for value in feature_arrays]
        split_points = np.cumsum(row_counts)[:-1]
        normalized_arrays = [
            chunk.reshape(value.shape)
            for chunk, value in zip(np.split(normalized_values, split_points), feature_arrays)
        ]
        # update the dataframe column with the normalized values
        fold_df[col] = normalized_arrays
    fold_df = fold_df.rename(columns={'id':'slice_file_name'})
    save_pkl(fold_df, f"features/normalized_feats/norm_feats_2d_{fold}_csv.pkl")
    # memory management
    del fold_df
    del stacked_values
    del normalized_values

2D
fold1
fold2
fold3
fold4
fold5
fold6
fold7
fold8
fold9
fold10


In [49]:
# Min-Max normalize every 1D feature, fold by fold, and save the result.
# MinMaxScaler is the name imported in the 2D normalization cell above.
# Make sure the output directory exists before the first save.
os.makedirs("features/normalized_feats", exist_ok=True)

print("1D")
for fold_number in range(1,10+1):
    fold = f"fold{fold_number}"
    fold_df = load_pkl(f"features/extracted_1d/extracted_1d_{fold}_csv.pkl")
    print(fold)
    cols = ['spectral_centroid', 'spectral_bandwidth', 'spectral_flatness', 'spectral_rolloff']
    for col in cols:
        feature_arrays = list(fold_df[col])
        # stack the rows of every sample into one 2D matrix so a single scaler is fitted per feature
        stacked_values = np.vstack(feature_arrays)
        scaler = MinMaxScaler()
        normalized_values = scaler.fit_transform(stacked_values)
        # Each sample owns len(value) consecutive rows of the stacked matrix, so the
        # split points are the cumulative row counts. Using the sample index as the
        # row offset is only correct when every sample contributes exactly one row.
        row_counts = [len(value) for value in feature_arrays]
        split_points = np.cumsum(row_counts)[:-1]
        normalized_arrays = [
            chunk.reshape(value.shape)
            for chunk, value in zip(np.split(normalized_values, split_points), feature_arrays)
        ]
        # update the dataframe column with the normalized values
        fold_df[col] = normalized_arrays
    fold_df = fold_df.rename(columns={'id':'slice_file_name'})
    save_pkl(fold_df, f"features/normalized_feats/norm_feats_1d_{fold}_csv.pkl")
    # memory management
    del fold_df
    del stacked_values
    del normalized_values

1D
fold1
fold2
fold3
fold4
fold5
fold6
fold7
fold8
fold9
fold10


## Save the final DataFrames

In this step, we merge all the 2D and 1D features we collected into a single dataframe per fold.

We also One-Hot encode the target class.

In [59]:
# build the folds dataframes for the cnn training

# Make sure the output directory exists before the first save.
os.makedirs("./cnn_folds_dataframes", exist_ok=True)

target_df = pd.read_csv("./UrbanSound8K/metadata/UrbanSound8K.csv")
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original (SettingWithCopyWarning)
target_df = target_df[['slice_file_name','classID']].copy()
# one hot encode the target class id
class_ids = target_df['classID'].to_numpy(dtype=np.int16)
ohe_targets = np.zeros(shape=(class_ids.size, class_ids.max()+1))
ohe_targets[np.arange(class_ids.size), class_ids] = 1
target_df['classID'] = ohe_targets.tolist()
# Series.apply returns a new Series; its result was previously discarded, which
# left the column holding plain lists instead of NumPy arrays
target_df['classID'] = target_df['classID'].apply(np.array)

# build the folds dataframes
for i in range(1, 10+1):
    print(f"fold{i}")
    feat_2d_df = load_pkl(f"./features/normalized_feats/norm_feats_2d_fold{i}_csv.pkl")
    feat_1d_df = load_pkl(f"./features/normalized_feats/norm_feats_1d_fold{i}_csv.pkl")
    # both merges use the default inner join on the slice file name
    fold_df = pd.merge(left=feat_2d_df, right=feat_1d_df, on="slice_file_name")
    fold_df = pd.merge(left=fold_df, right=target_df, on="slice_file_name")
    save_pkl(fold_df, f"./cnn_folds_dataframes/fold{i}_df.pkl")
    # memory management
    del feat_2d_df
    del feat_1d_df
    del fold_df

fold1
fold2
fold3
fold4
fold5
fold6
fold7
fold8
fold9
fold10
