# Extracting features from Voices

In [35]:
import pandas as pd 
import numpy as np 

import librosa

from datetime import datetime

import os
from pathlib import Path
# Setting working directory.
# The project root can be overridden through the VOICE_RECOGNITION_DIR
# environment variable so the notebook is not tied to one machine; when the
# variable is unset, the original location is used.
PROJECT_DIR = Path(os.environ.get(
    'VOICE_RECOGNITION_DIR',
    '/home/adriel_martins/Documents/voice_recognition',
))
os.chdir(PROJECT_DIR)

## Preparing the data

The initial file dataframe is loaded from the CSV that we created with the 'LibriSpeech_Files_Pre_Processing' notebook.

In [23]:
# Read the (speaker id, sound file) index, relative to the working directory.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index column written into the CSV when it was saved - confirm whether it is needed.
index_csv = Path('Data/id_and_soundfiles_LibriSpeech.csv')
df = pd.read_csv(index_csv)
df.head(10)

Unnamed: 0,id,soundfile
0,298,298-126791-0010.flac
1,298,298-126791-0012.flac
2,298,298-126791-0049.flac
3,298,298-126791-0037.flac
4,298,298-126791-0063.flac
5,2514,2514-149482-0071.flac
6,2514,2514-149482-0028.flac
7,2514,2514-149482-0067.flac
8,2514,2514-149482-0032.flac
9,2514,2514-149482-0047.flac


## Feature Extraction

In [48]:
# The choice of features mainly follows Jurgen Arias (2020).

def _mean_over_frames(feature_matrix):
    """Collapse a (n_features, n_frames) matrix to one mean value per feature."""
    return np.mean(feature_matrix.T, axis=0)


def extract_features(row, dataset_root=Path('LibriSpeech/train-clean-100')):
    """Extract time-averaged audio features for one sound file.

    The file is located from its own name: the first two dash-separated
    fields of ``row.soundfile`` are used as nested directories below
    ``dataset_root``. For example ``298-126791-0010.flac`` is read from
    ``<dataset_root>/298/126791/298-126791-0010.flac``.

    Parameters
    ----------
    row : object exposing ``soundfile`` (str) and ``id`` attributes
        Typically a DataFrame row handed over by ``df.apply(..., axis=1)``.
    dataset_root : path-like, optional
        Directory containing the nested audio folders. Defaults to
        ``LibriSpeech/train-clean-100`` (relative to the working directory).

    Returns
    -------
    tuple
        ``(mfccs, chroma, mel, contrast, tonnetz, label)``. The first five
        entries are the per-feature means over all frames; ``label`` is
        ``row.id``.
    """
    # Only the first two name fields map to directories; the remainder of the
    # name identifies the file itself.
    nested_dirs = row.soundfile.split('-')[:2]
    audio_path = Path(dataset_root).joinpath(*nested_dirs, row.soundfile)

    # Loads the audio file as a floating point time series.
    # The sample rate is librosa's default (22050) because none is requested.
    signal, sample_rate = librosa.load(audio_path, res_type='kaiser_fast')

    # Mel-frequency cepstral coefficients (MFCCs), 40 per frame
    mfccs = _mean_over_frames(
        librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40))

    # Magnitude of the short-time Fourier transform, shared by the chroma and
    # spectral-contrast features below
    stft = np.abs(librosa.stft(signal))

    # Chromagram computed from the spectrogram
    chroma = _mean_over_frames(
        librosa.feature.chroma_stft(S=stft, sr=sample_rate))

    # Mel-scaled spectrogram. The signal must be passed as ``y=``: recent
    # librosa releases accept the feature arguments by keyword only, and the
    # keyword form also works on older releases.
    mel = _mean_over_frames(
        librosa.feature.melspectrogram(y=signal, sr=sample_rate))

    # Spectral contrast
    contrast = _mean_over_frames(
        librosa.feature.spectral_contrast(S=stft, sr=sample_rate))

    # Tonal centroid features (tonnetz), computed on the harmonic component
    harmonic_signal = librosa.effects.harmonic(signal)
    tonnetz = _mean_over_frames(
        librosa.feature.tonnetz(y=harmonic_signal, sr=sample_rate))

    # The speaker id of the file is appended as the label
    label = row.id

    return mfccs, chroma, mel, contrast, tonnetz, label

In [49]:
# Record the wall-clock start so the cost of the extraction can be reported
startTime = datetime.now()

# Run extract_features once per dataframe row (axis=1 hands each row to the function)
features_label = df.apply(extract_features, axis=1)

# Report how long the extraction took
elapsed = datetime.now() - startTime
print(elapsed)

0:12:28.837143


In [50]:
# Inspect the result: one (mfccs, chroma, mel, contrast, tonnetz, label) tuple per dataframe row
features_label

0      ([-289.34943, 110.6154, -67.3782, 59.304237, -...
1      ([-320.5409, 105.738525, -56.98817, 52.488617,...
2      ([-311.92624, 104.054054, -59.412125, 57.68102...
3      ([-310.9264, 96.11666, -56.33535, 57.83569, -1...
4      ([-294.8751, 113.68041, -62.691093, 48.88425, ...
                             ...                        
495    ([-392.83832, 103.10235, -21.629827, 35.794468...
496    ([-366.55304, 114.450096, -13.166514, 37.16646...
497    ([-378.58362, 119.243164, -22.32459, 43.95746,...
498    ([-364.83, 118.77677, -26.965506, 41.79388, -7...
499    ([-371.5594, 116.94956, -21.727648, 40.800175,...
Length: 500, dtype: object

In [52]:
# Persist the extracted features next to the input CSV.
# features_label has dtype object (a Series of tuples), so NumPy stores it via
# pickle; reading it back needs np.load(..., allow_pickle=True).
features_file = Path('Data/features_label')
np.save(features_file, features_label, allow_pickle=True)