# Extracting features from Voices

In [1]:
import pandas as pd 
import numpy as np 

import librosa

from datetime import datetime

import os
from pathlib import Path
# Setting working directory.
# The project root defaults to the original location, but can be overridden with the
# VOICE_RECOGNITION_DIR environment variable so the notebook is not tied to one machine.
PROJECT_DIR = Path(os.environ.get('VOICE_RECOGNITION_DIR',
                                  '/home/adriel_martins/Documents/voice_recognition'))
# Every relative path used later in the notebook is resolved against this directory.
os.chdir(PROJECT_DIR)

## Preparing the data

The initial DataFrame is loaded from the CSV file that we created in the 'LibriSpeech_Files_Pre_Processing' notebook.

In [2]:
# Read the table that pairs each speaker id with one of its sound files
csv_path = Path('Data') / 'id_and_soundfiles_LibriSpeech.csv'
df = pd.read_csv(csv_path)

# Preview the first ten rows
df.head(10)

Unnamed: 0,id,soundfile
0,8468,8468-286673-0030.flac
1,1235,1235-135883-0028.flac
2,2911,2911-7601-0013.flac
3,8797,8797-294123-0053.flac
4,3214,3214-167606-0032.flac
5,460,460-172359-0004.flac
6,150,150-126107-0004.flac
7,5750,5750-35690-0019.flac
8,8465,8465-246942-0005.flac
9,3259,3259-158083-0054.flac


## Feature Extraction

In [3]:
# The choice of features follows Jurgen Arias (2020).

def _mean_over_last_axis(feature):
    """Collapse a 2-D feature matrix to 1-D by averaging each row over its columns."""
    # Kept in the original `.T, axis=0` form so the numeric result is unchanged.
    return np.mean(feature.T, axis=0)


def extract_features(row):
    """Load one sound file and summarise it as a set of averaged audio features.

    Parameters
    ----------
    row : object with ``soundfile`` and ``id`` attributes
        One row of the id/soundfile dataframe (as passed by ``df.apply(..., axis=1)``).
        ``soundfile`` is a file name made of dash-separated parts, e.g.
        ``8468-286673-0030.flac``; ``id`` is the speaker id used as the label.

    Returns
    -------
    tuple
        ``(mfccs, chroma, mel, contrast, tonnetz, label)`` where the first five
        entries are 1-D arrays (each feature averaged over its last axis; ``mfccs``
        has 40 coefficients) and ``label`` is ``row.id``.
    """
    # Sets the name to be the path to where the file is in my computer:
    # the first two dash-separated parts of the file name are its parent folders,
    # e.g. LibriSpeech/train-clean-100/8468/286673/8468-286673-0030.flac
    parent_folders = row.soundfile.split('-')[:2]
    path = Path('LibriSpeech/train-clean-100').joinpath(*parent_folders) / row.soundfile

    # Loads the audio file as a floating point time series and assigns the default sample rate
    # Sample rate is set to 22050 by default
    X, sample_rate = librosa.load(path, res_type='kaiser_fast')

    # Generate Mel-frequency cepstral coefficients (MFCCs) from a time series
    mfccs = _mean_over_last_axis(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40))

    # Generates a Short-time Fourier transform (STFT) magnitude, shared by the
    # chroma and spectral-contrast computations below
    stft = np.abs(librosa.stft(X))

    # Computes a chromagram from a waveform or power spectrogram.
    chroma = _mean_over_last_axis(librosa.feature.chroma_stft(S=stft, sr=sample_rate))

    # Computes a mel-scaled spectrogram.
    # The audio must be passed as `y=`: librosa >= 0.10 rejects it as a positional argument.
    mel = _mean_over_last_axis(librosa.feature.melspectrogram(y=X, sr=sample_rate))

    # Computes spectral contrast
    contrast = _mean_over_last_axis(librosa.feature.spectral_contrast(S=stft, sr=sample_rate))

    # Computes the tonal centroid features (tonnetz) from the harmonic part of the signal
    harmonic = librosa.effects.harmonic(X)
    tonnetz = _mean_over_last_axis(librosa.feature.tonnetz(y=harmonic, sr=sample_rate))

    # We add also the speaker_id of each file as a label at the end
    label = row.id

    return mfccs, chroma, mel, contrast, tonnetz, label

In [None]:
# Remember when the extraction started so its duration can be reported
startTime = datetime.now()

# Run extract_features once per dataframe row (axis=1 hands each row to the function)
features_label = df.apply(extract_features, axis=1)

# Report how long the extraction took
elapsed = datetime.now() - startTime
print(elapsed)

In [None]:
# Inspect the extraction result (one entry per dataframe row)
features_label

In [None]:
# Persist the extracted features and labels inside the Data folder
output_path = Path('Data') / 'features_label'
np.save(output_path, features_label)