In [None]:
import opensmile
import numpy as np
import pandas as pd
import librosa

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from scipy import stats

# Seed NumPy's global random number generator so NumPy-based randomness is repeatable across runs
np.random.seed(42)

In [1]:
def extract_features(df, path_to_audios, path_to_save, sample_rate, features_subset):
    """Extract openSMILE ComParE 2016 features for every audio listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per audio. The value stored at column position 8 is used as the
        file stem and is copied into the ``ID_short`` column of the result.
    path_to_audios : str
        Prefix placed in front of ``<id>.wav``; it must already end with a path
        separator because the file name is built by plain concatenation.
    path_to_save : str
        Destination of the pickled feature table.
    sample_rate : int
        Sampling rate handed to ``librosa.load`` and then to openSMILE.
    features_subset : {"functionals", "lld"}
        ``"functionals"`` selects the functional features, ``"lld"`` the
        low-level descriptors.

    Returns
    -------
    pandas.DataFrame
        ``ID_short`` followed by the extracted features, with a fresh integer
        index and without the numeric columns that are zero in every row.

    Raises
    ------
    ValueError
        If ``features_subset`` is neither ``"functionals"`` nor ``"lld"``.
    """
    # Fail early with a clear message: previously an unknown subset left the
    # extractor undefined and the function crashed later with UnboundLocalError.
    if features_subset not in ("functionals", "lld"):
        raise ValueError(
            'features_subset must be "functionals" or "lld", got {!r}'.format(features_subset)
        )

    # creation of the OpenSMILE extractor (ComParE 2016 in both cases)
    if features_subset == "functionals":
        feature_level = opensmile.FeatureLevel.Functionals
    else:
        feature_level = opensmile.FeatureLevel.LowLevelDescriptors
    smile_func = opensmile.Smile(
        feature_set=opensmile.FeatureSet.ComParE_2016,
        feature_level=feature_level,
    )

    # Collect the per-audio tables and concatenate once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0 and was quadratic in a loop.
    per_audio_tables = []
    for index, row in df.iterrows():
        if index % 10 == 0:
            print(index, end='\r')

        # create the path and load each audio (.iloc keeps the positional lookup
        # explicit instead of relying on the deprecated integer fallback of row[8])
        audio_id = row.iloc[8]
        file = path_to_audios + audio_id + ".wav"
        audio, sr = librosa.load(file, sr=sample_rate)

        # extract the features
        f_func = smile_func.process_signal(signal=audio, sampling_rate=sr)

        # keep only the feature values and column names, dropping openSMILE's own index
        f_flat_func = pd.DataFrame(np.array(f_func), columns=f_func.columns)

        # add the ID alongside the features
        f_flat_func.insert(loc=0, column='ID_short', value=audio_id)
        per_audio_tables.append(f_flat_func)

    if per_audio_tables:
        features_opensmile_func = pd.concat(per_audio_tables, ignore_index=True)
    else:
        features_opensmile_func = pd.DataFrame()

    # Drop every numeric column that is zero throughout all samples. Testing the
    # values themselves (not their sum) keeps columns whose entries merely cancel out.
    numeric_part = features_opensmile_func.select_dtypes(include="number")
    zero_columns = numeric_part.columns[(numeric_part == 0).all(axis=0)]
    features_opensmile_func = features_opensmile_func.drop(columns=zero_columns)
    count = len(zero_columns)

    # Useful data and saving to file
    print("{} fetures have been removed due to zeros-values".format(count))
    print("{} features remaining".format(len(features_opensmile_func.columns)-1))
    print("\nSaving to file...")

    # Protocol 4 must be used in order to be able to open in google colab
    features_opensmile_func.to_pickle(path_to_save, protocol=4)

    print("All done!")

    return features_opensmile_func

# 3gp

In [None]:
# Read the taste-loss dataset and keep only the rows whose "format" is 3gp
data_3gp = (
    pd.read_csv("../data/prediCovid_taste_loss_dataset_B_trim_june.csv")
    .loc[lambda frame: frame["format"] == "3gp"]
)
data_3gp.info()

## Type1 and Type2

In [None]:
# Folder holding the wav files and destination pickle for the 3gp recordings
path_to_audios = "..\\Data\\wav_no_silence_june\\"
path_to_save = "..\\Data\\openSMILE\\3gp\\type1_type2\\3gp_lld_type1_type2_openSMILE.pkl"

# Pass "functionals" as features_subset to extract the training features instead
x = extract_features(
    df=data_3gp,
    path_to_audios=path_to_audios,
    path_to_save=path_to_save,
    sample_rate=8000,
    features_subset="lld",
)

In [None]:
# Reload the feature table written by extract_features
df = pd.read_pickle(path_to_save)

# Attach the class of every row through its audio ID. Inserting the raw
# `lossTaste_daily` Series aligns on the index, and `df` carries a fresh 0..N-1
# index that does not correspond to the filtered rows of `data_3gp`, so the
# labels ended up misaligned or NaN.
id_column = data_3gp.columns[8]  # same positional column extract_features reads the ID from
class_by_id = (
    data_3gp.drop_duplicates(subset=id_column)
    .set_index(id_column)["lossTaste_daily"]
)
df.insert(1, "class", df["ID_short"].map(class_by_id))

#label of each low level descriptor
labels = pd.read_excel("../Data/llds_ComParE.xlsx")

#separate positive from negative
df_pos = df[df["class"]==1].drop(columns = ["ID_short","class"])
df_neg = df[df["class"]==0].drop(columns = ["ID_short","class"])

#array with the names of all features (the original referenced an undefined `label`)
# NOTE(review): "duration" is prepended although no such column is visible in the
# extracted table - confirm the names line up with df_pos.columns.
names = ["duration"] + labels["label"].to_list()


In [None]:
# Grid of per-feature density plots: positive class in red, negative class in blue
fig, axs = plt.subplots(9, 8, figsize=(25, 20))

features = df_pos.columns

# Never index past the available features, titles or grid cells; the previous
# hard-coded limit of 67 raised IndexError whenever fewer were available.
n_plots = min(67, len(features), len(names), axs.size)

#iterate through the graph matrix row by row
for count, ax in enumerate(axs.flat):
    if count >= n_plots:
        # hide the cells that have no feature to show
        ax.set_visible(False)
        continue

    ax.tick_params(left=False,
                   bottom=True,
                   labelleft=False,
                   labelbottom=False)

    ax.set_title(str(names[count]))

    # kdeplot replaces the deprecated distplot(hist=False, kde=True)
    #plot positive density
    sns.kdeplot(df_pos[features[count]], color="r",
                label=str(names[count]), ax=ax)

    #plot negative density
    sns.kdeplot(df_neg[features[count]], color="b",
                label=str(names[count]), ax=ax)

red_patch = mpatches.Patch(color='red', label='Positive')
blue_patch = mpatches.Patch(color='blue', label='Negative')
# A figure-level legend stays visible; plt.legend attached it to the last (empty) subplot
fig.legend(handles=[red_patch, blue_patch])
fig.savefig("../Data/openSMILE/3gp_lld.png")
plt.show()
  


# m4a

In [None]:
# Read the taste-loss dataset and keep only the rows whose "format" is m4a
data_m4a = (
    pd.read_csv("../data/prediCovid_taste_loss_dataset_B_trim_june.csv")
    .loc[lambda frame: frame["format"] == "m4a"]
)
data_m4a.info()

## Type1 and Type2

In [None]:
# Folder holding the wav files and destination pickle for the m4a recordings.
# Backslashes are escaped consistently ("..\D" was an invalid escape sequence);
# the resulting path strings are unchanged.
path_to_audios = "..\\Data\\wav_no_silence_june\\"
path_to_save = "..\\Data\\openSMILE\\m4a\\type1_type2\\m4a_lld_type1_type2_openSMILE.pkl"

#Change last parameter to "functionals" to extract training features
# ("lll" was a typo for "lld": no extractor was created and the call crashed)
x = extract_features(data_m4a, path_to_audios, path_to_save, 8000, "lld")

In [None]:
# Reload the feature table written by extract_features
df = pd.read_pickle(path_to_save)

# Attach the class of every row through its audio ID. Inserting the raw
# `lossTaste_daily` Series aligns on the index, and `df` carries a fresh 0..N-1
# index that does not correspond to the filtered rows of `data_m4a`, so the
# labels ended up misaligned or NaN.
id_column = data_m4a.columns[8]  # same positional column extract_features reads the ID from
class_by_id = (
    data_m4a.drop_duplicates(subset=id_column)
    .set_index(id_column)["lossTaste_daily"]
)
df.insert(1, "class", df["ID_short"].map(class_by_id))

#label of each low level descriptor
labels = pd.read_excel("../Data/llds_ComParE.xlsx")

#separate positive from negative
df_pos = df[df["class"]==1].drop(columns = ["ID_short","class"])
df_neg = df[df["class"]==0].drop(columns = ["ID_short","class"])

#array with the names of all features (the original referenced an undefined `label`)
# NOTE(review): "duration" is prepended although no such column is visible in the
# extracted table - confirm the names line up with df_pos.columns.
names = ["duration"] + labels["label"].to_list()

In [None]:
# Grid of per-feature density plots: positive class in red, negative class in blue
fig, axs = plt.subplots(9, 8, figsize=(25, 20))

features = df_pos.columns

# Never index past the available features, titles or grid cells; the previous
# hard-coded limit of 67 raised IndexError whenever fewer were available.
n_plots = min(67, len(features), len(names), axs.size)

#iterate through the graph matrix row by row
for count, ax in enumerate(axs.flat):
    if count >= n_plots:
        # hide the cells that have no feature to show
        ax.set_visible(False)
        continue

    ax.tick_params(left=False,
                   bottom=True,
                   labelleft=False,
                   labelbottom=False)

    ax.set_title(str(names[count]))

    # kdeplot replaces the deprecated distplot(hist=False, kde=True)
    #plot positive density
    sns.kdeplot(df_pos[features[count]], color="r",
                label=str(names[count]), ax=ax)

    #plot negative density
    sns.kdeplot(df_neg[features[count]], color="b",
                label=str(names[count]), ax=ax)

red_patch = mpatches.Patch(color='red', label='Positive')
blue_patch = mpatches.Patch(color='blue', label='Negative')
# A figure-level legend stays visible; plt.legend attached it to the last (empty) subplot
fig.legend(handles=[red_patch, blue_patch])
fig.savefig("../Data/openSMILE/m4a_lld.png")
plt.show()