# FEATURE EXTRACTION

# OpenSMILE

OpenSMILE (Speech and Music Interpretation by Large-space Extraction) is an open-source feature extraction tool that makes it possible to extract large audio feature spaces in real time. It combines features from Music Information Retrieval and Speech Processing. OpenSMILE official documentation: 

https://audeering.github.io/opensmile/index.html

A theoretical description of the implemented algorithms can be found in Florian Eyben’s doctoral thesis entitled “Real-time Speech and Music Classification by Large Audio Feature Space Extraction” (2015). 

## Install and Import OpenSMILE Python implementation

The official documentation can be accessed here: https://audeering.github.io/opensmile-python/index.html

For updates and contributions, refer to the official GitHub repository: https://github.com/audeering/opensmile-python  

In [None]:
# Installing opensmile via pip
#!pip install opensmile

In [None]:
# update required packages if needed
#!pip install --upgrade pyyaml

In [None]:
import opensmile
import os
from pydub import AudioSegment
import glob
from tqdm.notebook import tqdm
import pandas as pd

In [None]:
!jupyter nbextension enable --py widgetsnbextension

In [None]:
# Root folder of the experiment: every input and output path below is built from it.
my_dir = os.getcwd()

# Make sure the folder that receives the converted wav files exists.
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(my_dir + '/wav', exist_ok=True)

# Both paths keep their trailing slash because later cells build file
# paths from them by plain string concatenation.
GEMS = my_dir + '/mp3/'  # folder searched for the source *.mp3 files
data = my_dir + '/wav/'  # folder holding the *.wav files used for extraction

### Conversion from mp3 to wav

In [None]:
# Convert every mp3 file found in GEMS to a wav file with the same base name,
# written into the `data` folder.
for file in tqdm(glob.glob(os.path.join(GEMS, '*.mp3'))):
    # Strip the extension with splitext instead of a fixed-width slice, so the
    # base name does not depend on the extension being exactly four characters.
    file_name = os.path.splitext(os.path.basename(file))[0]
    output = file_name + ".wav"
    sound = AudioSegment.from_mp3(file)
    sound.export(os.path.join(data, output), format="wav")

### Extracting eGeMAPS LLDs from several audio files

In [None]:
# Define feature extractor to get LLDs from the eGeMAPS (v02) feature set
smile_LLDs = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)
# Uncomment to list the names of the extracted features:
#smile_LLDs.feature_names

# Create the folder to save the results. makedirs also creates the missing
# parent folders (RESULTS, RESULTS/LLD) and does nothing if they already exist.
lld_dir = my_dir + '/RESULTS/LLD/eGeMAPS'
os.makedirs(lld_dir, exist_ok=True)

# Extract the features: one csv file per wav file, named after the wav file
for file in tqdm(glob.glob(os.path.join(data, '*.wav'))):
    # splitext removes the extension without assuming its length
    file_name = os.path.splitext(os.path.basename(file))[0]
    LLDs = smile_LLDs.process_file(file)
    # Create csv file
    csv_name = lld_dir + '/' + file_name + '.csv'
    # index=False: the DataFrame index is not written, only the feature columns
    LLDs.to_csv(csv_name, index=False)

### Extracting eGeMAPS functionals from several audio files

In [None]:
# Define feature extractor to get functionals from the eGeMAPS (v02) feature set
smile_func = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)
# Uncomment to list the names of the extracted features:
#smile_func.feature_names

# get list of paths for all the files
all_files = glob.glob(os.path.join(data, '*.wav'))

# extract functionals for all the files in a dataframe
functionals = smile_func.process_files(all_files)

# clean up the index: turn the index levels into columns, reduce the file
# path to its base name, and use that name as the only index
functionals = functionals.reset_index()
functionals['file'] = functionals['file'].apply(os.path.basename)
# errors='ignore' skips a column that is absent, without the bare
# `except: pass` that would also hide unrelated errors
functionals = functionals.drop(columns=['start', 'end'], errors='ignore')
functionals = functionals.set_index('file')

# make sure the output folder exists so this cell can run on its own
os.makedirs(my_dir + '/RESULTS', exist_ok=True)

# save data frame to csv; the index is kept because the file name is needed
# to tell the rows apart when there is more than one file
functionals.to_csv(my_dir + '/RESULTS/functionals_eGeMAPS.csv')

### Extracting emobase LLDs from several audio files

In [None]:
# Define feature extractor to get LLDs from the emobase feature set
smile_LLDs = opensmile.Smile(
    feature_set=opensmile.FeatureSet.emobase,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)
# Uncomment to list the names of the extracted features:
#smile_LLDs.feature_names

# Create the folder to save the results. makedirs also creates the missing
# parent folders, so this cell does not depend on an earlier cell having
# created RESULTS/LLD.
lld_dir = my_dir + '/RESULTS/LLD/emobase'
os.makedirs(lld_dir, exist_ok=True)

# Extract the features: one csv file per wav file, named after the wav file
for file in tqdm(glob.glob(os.path.join(data, '*.wav'))):
    # splitext removes the extension without assuming its length
    file_name = os.path.splitext(os.path.basename(file))[0]
    LLDs = smile_LLDs.process_file(file)
    # Create csv file
    csv_name = lld_dir + '/' + file_name + '.csv'
    # index=False: the DataFrame index is not written, only the feature columns
    LLDs.to_csv(csv_name, index=False)

### Extracting emobase functionals from several audio files

In [None]:
# Define feature extractor to get functionals from the emobase feature set
smile_func = opensmile.Smile(
    feature_set=opensmile.FeatureSet.emobase,
    feature_level=opensmile.FeatureLevel.Functionals,
)
# Uncomment to list the names of the extracted features:
#smile_func.feature_names

# get list of paths for all the files
all_files = glob.glob(os.path.join(data, '*.wav'))

# extract functionals for all the files in a dataframe
functionals = smile_func.process_files(all_files)

# clean up the index: turn the index levels into columns, reduce the file
# path to its base name, and use that name as the only index
functionals = functionals.reset_index()
functionals['file'] = functionals['file'].apply(os.path.basename)
# errors='ignore' skips a column that is absent, without the bare
# `except: pass` that would also hide unrelated errors
functionals = functionals.drop(columns=['start', 'end'], errors='ignore')
functionals = functionals.set_index('file')

# make sure the output folder exists so this cell can run on its own
os.makedirs(my_dir + '/RESULTS', exist_ok=True)

# save data frame to csv; the index is kept because the file name is needed
# to tell the rows apart when there is more than one file
functionals.to_csv(my_dir + '/RESULTS/functionals_emobase.csv')

### Extracting ComParE LLDs from several audio files

In [None]:
# Define feature extractor to get LLDs from the ComParE feature set
smile_LLDs = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)
# Uncomment to list the names of the extracted features:
#smile_LLDs.feature_names

# Create the folder to save the results. makedirs also creates the missing
# parent folders, so this cell does not depend on an earlier cell having
# created RESULTS/LLD.
lld_dir = my_dir + '/RESULTS/LLD/compare'
os.makedirs(lld_dir, exist_ok=True)

# Extract the features: one csv file per wav file, named after the wav file.
# Wrapped in tqdm for a progress bar, like the other LLD extraction loops.
for file in tqdm(glob.glob(os.path.join(data, '*.wav'))):
    # splitext removes the extension without assuming its length
    file_name = os.path.splitext(os.path.basename(file))[0]
    LLDs = smile_LLDs.process_file(file)
    # Create csv file
    csv_name = lld_dir + '/' + file_name + '.csv'
    # index=False: the DataFrame index is not written, only the feature columns
    LLDs.to_csv(csv_name, index=False)

### Extracting ComParE functionals from several audio files

In [None]:
# Define feature extractor to get functionals from the ComParE feature set
smile_func = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.Functionals,
)
# Uncomment to list the names of the extracted features:
#smile_func.feature_names

# get list of paths for all the files
all_files = glob.glob(os.path.join(data, '*.wav'))

# extract functionals for all the files in a dataframe
functionals = smile_func.process_files(all_files)

# clean up the index: turn the index levels into columns, reduce the file
# path to its base name, and use that name as the only index
functionals = functionals.reset_index()
functionals['file'] = functionals['file'].apply(os.path.basename)
# errors='ignore' skips a column that is absent, without the bare
# `except: pass` that would also hide unrelated errors
functionals = functionals.drop(columns=['start', 'end'], errors='ignore')
functionals = functionals.set_index('file')

# make sure the output folder exists so this cell can run on its own
os.makedirs(my_dir + '/RESULTS', exist_ok=True)

# save data frame to csv; the index is kept because the file name is needed
# to tell the rows apart when there is more than one file
functionals.to_csv(my_dir + '/RESULTS/functionals_compare.csv')