# Creation of the dataframe

Per ora i file sono organizzati:
* i file eda sono nella cartella data scaricabile da PMEmo.
* i dati di Valence-Arousal sono nella cartella VA:
    * static_annotations_std.csv -> VA in std dev
    * static_annotations.csv -> VA in mean
    
Viene creato un DataFrame con tante righe quanti i soggetti per ogni brano e tante colonne quante features, da time domain features, statistic features.
I dati EDA "raw" vengono pre-processati nella funzione initialize_signal:
* crea il segnale EvenlySignal dalla libreria pyphysio
* da la possibilità di fare resampling
* da la possibilità di filtrare il segnale per ridurre il rumore
* estrae la parte di phasic (SCR)


#### Da chiedere:
* normalizzazione su ogni colonna nel range [0,1]
* rimuovere features poco utili?
* calcolo di tutte le features sul phasic, ok?
* features dinamiche? si? con window?

In [1]:
import pandas as pd
import os
import numpy as np
from scipy import stats, signal
import matplotlib.pyplot as plt
from tqdm import tqdm
from natsort import natsorted

## Analysis of the EDA signal

In [69]:
import pyphysio as ph
import pyphysio.filters.Filters as flt
import pyphysio.estimators.Estimators as est
import pyphysio.indicators.TimeDomain as td_ind
import pyphysio.indicators.FrequencyDomain as fd_ind

Please cite:
Bizzego et al. (2019) 'pyphysio: A physiological signal processing library for data science approaches in physiology', SoftwareX


In [114]:
def initialize_signal(signal, fs, s_type):
    
    # creazione di un segnale con una fs fissata
    signal = ph.EvenlySignal(values = signal, sampling_freq = fs, signal_type = s_type)
    #signal.plot('r')
    
    # resampling
    signal_resampled = signal.resample(fout=32) # fout: sampling frequency for resampling
    signal = signal_resampled
    #signal.plot('.')
    
    # filtering
    signal_filt = flt.IIRFilter(fp=0.8, fs = 1.1, ftype='ellip')(signal)
    signal = signal_filt
    #signal.plot('g')
    
    # phasic extraction
    driver = est.DriverEstim()(signal)
    phasic, tonic, _ = ph.PhasicEstim(delta=0.02)(driver)
    signal = phasic
    #phasic.plot('b')
    
    # min-max normalization [0,1]
    #signal_normalized = (phasic - np.min(phasic))/np.ptp(phasic)
    #phasic = signal_normalized
     
    return phasic

### Functions for the features

In [71]:
# time domain

def td_mean(signal):
    # arithmetic mean of the signal
    td_mean = td_ind.Mean() # create the indicator
    td_mean_ = td_mean(signal)
    #print('mean: ', td_mean_)
    return td_mean_

def td_min(signal):
    td_min = td_ind.Min()
    td_min_ = td_min(signal)
    #print('min: ', td_min_)
    return td_min_

def td_max(signal):
    td_max = td_ind.Max()
    td_max_ = td_max(signal)
    #print('max: ', td_max_)
    return td_max_

def td_range(signal):
    td_range = td_ind.Range()
    td_range_ = td_range(signal)
    #print('range: ', td_range_)
    return td_range_

def td_median(signal):
    td_median = td_ind.Median()
    td_median_ = td_median(signal)
    #print('median: ', td_median_)
    return td_median_

def td_std_dev(signal):
    td_stdev = td_ind.StDev()
    td_stdev_ = td_stdev(signal)
    #print('standard deviation: ', td_stdev_)
    return td_stdev_

def td_sum(signal):
    # sum of the values in the signal
    td_sum = td_ind.Sum()
    td_sum_ = td_sum(signal)
    #print('sum: ', td_sum_)
    return td_sum_

def td_AUC(signal):
    # AUC: area under the curve of the signal
    td_AUC = td_ind.AUC()
    td_AUC_ = td_AUC(signal)
    #print('AUC: ', td_AUC_)
    return td_AUC_

def td_RMSSD(signal):
    # RMSSD: square root of the mean of the squared 1st order discrete differences
    td_RMSSD = td_ind.RMSSD()
    td_RMSSD_ = td_RMSSD(signal)
    #print('RMSSD: ', td_RMSSD_)
    return td_RMSSD_

def td_SDSD(signal):
    # SDSD: standard deviation of the 1st order discrete differences
    td_SDSD = td_ind.SDSD()
    td_SDSD_ = td_SDSD(signal)
    #print('SDSD: ', td_SDSD_)
    return td_SDSD_


# frequency domain

def fd_powerinband(signal,f_min, f_max):
    # estimation of the power in a given frequency band
    fd_powerinband = fd_ind.PowerInBand(interp_freq=4, freq_max=f_max, freq_min=f_min, method = 'fft') # create the indicator
    fd_powerinband_ = fd_powerinband(signal.resample(4)) # resampling needed to compute PSD
    return fd_powerinband_

def fd_peakinband(signal,f_min, f_max):
    # estimation of the peak frequency in a given frequency band
    fd_peakinband = fd_ind.PeakInBand(interp_freq=4, freq_max=f_max, freq_min=f_min, method = 'fft') # create the indicator
    fd_peakinband_ = fd_peakinband(phasic.resample(4)) # resampling needed to compute PSD
    return fd_peakinband_



# Dataframe creation (static)

In [None]:
# get the directory
path_EDA = '/Users/gioelepozzi/Desktop/data/EDA'
path_VA = '/Users/gioelepozzi/Desktop/data/annotations'

l = [] # lista per unire i risultati in un DataFrame unico
count = 1 # contatore per ciclare tutti i brani nel file con i dati di VA
dictionary = {} # per creare il DataFrame

# ciclo per ogni brano
for csv_file in tqdm(natsorted(os.listdir(path_EDA))):
    if csv_file.endswith(".csv"):
        # prendo dal nome del file il numero del brano (music_ID)
        file_name = os.path.basename(csv_file)
        music_ID = file_name.split('_')[0]
        #print('file name',file_name,music_ID, VA_std.iloc[count][0])

        
        # prendo file con valori di EDA e VA
        my_data = pd.read_csv(path_EDA + '/' + file_name, header = None)
        VA_std = pd.read_csv(path_VA + '/static_annotations_std.csv', header = None)
        VA_mean = pd.read_csv(path_VA + '/static_annotations.csv', header = None)
        
        # ci sono alcuni file EDA che non hanno il corrispettivo valore di VA
        # perchè per validare i valori di VA hanno rifatto ascoltare un brano e se davano valori di VA
        # diversi di più di 0.25 venivano scartati
        if music_ID != VA_std.iloc[count][0]:
            continue

        # prendo valori di VA dai file qua sopra
        v_std = VA_std.iloc[count][2]
        a_std = VA_std.iloc[count][1]
        v_mean = VA_mean.iloc[count][2]
        a_mean = VA_mean.iloc[count][1]
        count = count + 1
        
        # prendo ID della persona dai file EDA
        subject_ID = my_data.loc[0]
        
        # creo un vettore dei soggetti, per ogni brano. Per ogni soggetto calcolo features
        subject_vector = []
        td_mean_vector = []
        td_std_vector = []
        td_kurt_vector = []
        td_skew_vector = []
        psd_vector = []
        td_mean_vector = []
        td_min_vector = []
        td_max_vector = []
        td_range_vector = []
        td_median_vector = []
        td_std_dev_vector = []
        td_sum_vector = []
        td_AUC_vector = []
        td_RMSSD_vector = []
        td_SDSD_vector = []
        
        ZCR_vector = []
        
        fd_mean_vector = []
        fd_std_vector = []
        fd_kurt_vector = []
        fd_skew_vector = []
        fd_min_vector = []
        fd_max_vector = []
        fd_range_vector = []
        
        fd_powerinband_vector1 = []
        fd_powerinband_vector2 = []
        fd_powerinband_vector3 = []
        fd_powerinband_vector4 = []
        fd_powerinband_vector5 = []
        
        fd_peakinband_vector1 = []
        fd_peakinband_vector2 = []
        fd_peakinband_vector3 = []
        fd_peakinband_vector4 = []
        fd_peakinband_vector5 = []

        
        for i in range(1,len(subject_ID)):
            subject_vector.append(int(subject_ID[i]))
            
            # modifica del segnale per fare resampling, filtraggio, e prendere la parte di phasic
            eda_data = [] # rappresenta la colonna con il segnale nei file EDA, uno per ogni soggetto
            s = my_data.iloc[:][i]
            for k in range(1, len(s)):
                eda_data.append(s[k])
                
            fs = 50
            times = np.arange(len(eda_data))/fs
            
            # prendo la parte phasic del segnale
            phasic = initialize_signal(eda_data, fs = 50, s_type = 'eda')
            
            frequency_signal = np.abs(np.fft.fft(phasic))
            frequency = np.fft.fftfreq(phasic.size, d=1/fs)
            
            # funzioni statistiche nel tempo
            td_mean_vector.append(np.mean(phasic))
            td_std_vector.append(np.std(phasic))
            td_kurt_vector.append(stats.kurtosis(phasic))
            td_skew_vector.append(stats.skew(phasic))
            psd_vector.append(signal.periodogram(phasic))
            
            # richiamo delle funzioni che calcolano le features e metto in un vettore
            td_min_vector.append(td_min(phasic))
            td_max_vector.append(td_max(phasic))
            td_range_vector.append(td_range(phasic))
            td_median_vector.append(td_median(phasic))
            td_sum_vector.append(td_sum(phasic))
            td_AUC_vector.append(td_AUC(phasic))
            td_RMSSD_vector.append(td_RMSSD(phasic))
            td_SDSD_vector.append(td_SDSD(phasic))
            
            ZCR_vector.append(((phasic[:-1] * phasic[1:]) < 0).sum())
            
            fd_mean_vector.append(np.mean(frequency_signal))
            fd_std_vector.append(np.std(frequency_signal))
            fd_kurt_vector.append(stats.kurtosis(frequency_signal))
            fd_skew_vector.append(stats.skew(frequency_signal))
            fd_min_vector.append(frequency_signal.min())
            fd_max_vector.append(frequency_signal.max())
            fd_range_vector.append(frequency_signal.max()-frequency_signal.min())
            
            # 0-0.1-0.2-0.3-0.4-0.5 come in PMEmo
            fd_powerinband_vector1.append(fd_powerinband(phasic,0,0.1))
            fd_powerinband_vector2.append(fd_powerinband(phasic,0.1,0.2))
            fd_powerinband_vector3.append(fd_powerinband(phasic,0.2,0.3))
            fd_powerinband_vector4.append(fd_powerinband(phasic,0.3,0.4))
            fd_powerinband_vector5.append(fd_powerinband(phasic,0.4,0.5))
            
            fd_peakinband_vector1.append(fd_peakinband(phasic,0,0.1))
            fd_peakinband_vector2.append(fd_peakinband(phasic,0.1,0.2))
            fd_peakinband_vector3.append(fd_peakinband(phasic,0.2,0.3))
            fd_peakinband_vector4.append(fd_peakinband(phasic,0.3,0.4))
            fd_peakinband_vector5.append(fd_peakinband(phasic,0.4,0.5))
            
            #print('min vector ', td_min_vector, '\n\n')
            
            #np.seterr(divide='ignore', invalid='ignore')
            #td_min_vector_norm = (td_min_vector - np.min(td_min_vector))/np.ptp(td_min_vector)
            
            
        # creo un dizionario
        labels = [
            'music_ID',
            'subject_ID',
            'valence(mean)',
            'arousal(mean)',
            'valence(std)',
            'arousal(std)',
            'td_mean',
            'td_std',
            'td_kurt',
            'td_skew',
            'td_min',
            'td_max',
            'td_range',
            'td_median',
            'td_sum',
            'td_AUC',
            'td_RMSSD',
            'td_SDSD',
            'ZCR',
            'fd_mean',
            'fd_std',
            'fd_kurt',
            'fd_skew',
            'fd_min',
            'fd_max',
            'fd_range',
            'fd_powerinband1',
            'fd_powerinband2',
            'fd_powerinband3',
            'fd_powerinband4',
            'fd_powerinband5',
            'fd_peakinband1',
            'fd_peakinband2',
            'fd_peakinband3',
            'fd_peakinband4',
            'fd_peakinband5'
        ]
        values = [
            music_ID,
            subject_vector,
            v_mean,
            a_mean,
            v_std,
            a_std,
            td_mean_vector,
            td_std_vector,
            td_kurt_vector,
            td_skew_vector,
            td_min_vector,
            td_max_vector,
            td_range_vector,
            td_median_vector,
            td_sum_vector,
            td_AUC_vector,
            td_RMSSD_vector,
            td_SDSD_vector,
            ZCR_vector,
            fd_mean_vector,
            fd_std_vector,
            fd_kurt_vector,
            fd_skew_vector,
            fd_min_vector,
            fd_max_vector,
            fd_range_vector,
            fd_powerinband_vector1,
            fd_powerinband_vector2,
            fd_powerinband_vector3,
            fd_powerinband_vector4,
            fd_powerinband_vector5,
            fd_peakinband_vector1,
            fd_peakinband_vector2,
            fd_peakinband_vector3,
            fd_peakinband_vector4,
            fd_peakinband_vector5
        ]
        
        # popolo il dizionario
        for j in range(len(labels)):
            dictionary[labels[j]] = values[j]
        
        # creo il dataframe
        df = pd.DataFrame(dictionary)
        l.append(df)
        
        continue


results = pd.concat(l, ignore_index=True)


# normalizzazione [0,1] per ogni feature, dalla colonna td_mean (7) in poi
for n in range(6,len(df.columns)) :
    if labels[n] == 'ZCR': # per ZCR, non voglio normalizzare questa colonna
        continue
    a = results.iloc[:,n]
    b = (a - np.min(a))/np.ptp(a)
    results.update(b)

results

 89%|████████▉ | 710/795 [03:23<00:25,  3.30it/s]

# EDA Static features extraction

In [118]:
def extract_EDA_static_features(path_EDA, path_VA):
    
    feature_set = pd.DataFrame()
    count = 1
    
    # for every .mp3 file get a set of features
    for csv_file in tqdm(natsorted(os.listdir(path_EDA))):
        if csv_file.endswith(".csv"):
    
            file_name = os.path.basename(csv_file)
            id = file_name.split('_')[0] # music_ID
            data = pd.read_csv(path_EDA + '/' + file_name, header = None)
            VA_std = pd.read_csv(path_VA + '/static_annotations_std.csv', header = None)
            VA_mean = pd.read_csv(path_VA + '/static_annotations.csv', header = None)
        
            # There are some EDA files with no VA data, so exclude them
            # due to data validation, see PMEmo paper for more details
            if id != VA_std.iloc[count][0]:
                continue

            # save VA values for EDA file
            v_std = VA_std.iloc[count][2]
            a_std = VA_std.iloc[count][1]
            v_mean = VA_mean.iloc[count][2]
            a_mean = VA_mean.iloc[count][1]
            count = count + 1
            
            feature = {}

            # cicle over all the subjects in one EDA file
            for i in range(1,len(data.loc[0])):
                
                subject_ID = data.iloc[0][i]
                
                s = data.iloc[:][i].to_numpy()
                eda_data = np.delete(s,[0])

                #### estrarre features, da np array eda_data
                
                # modifica del segnale per fare resampling, filtraggio, e prendere la parte di phasic
                fs = 50
            
                phasic = initialize_signal(eda_data, fs = fs, s_type = 'eda')
                td_mean = np.mean(phasic)
                
                
                feature['music_ID'] = id
                feature['subject_ID'] = int(subject_ID)
                feature['td_mean'] = td_mean
            
                feature_set = feature_set.append(pd.DataFrame(data=feature, index=[0]))
            
    return feature_set


In [119]:
path_EDA = '/Users/gioelepozzi/Desktop/MasterThesis/code/eda_feature_extraction/data'

path_VA = '/Users/gioelepozzi/Desktop/data/annotations'

static = extract_EDA_static_features(path_EDA, path_VA)

100%|██████████| 3/3 [00:00<00:00,  9.39it/s]


In [120]:
static

Unnamed: 0,music_ID,subject_ID,td_mean
0,1,100179,2.871903
0,1,100184,0.510666
0,1,100180,0.028471
0,1,110448,0.601928
0,1,100178,0.009695
0,1,200373,-0.059312
0,1,100181,1.447602
0,1,100435,0.002033
0,1,100443,0.797665
0,1,100177,1.416402


In [117]:
# export results

static.to_csv('static_features_EDA.csv', index=False)

# Dataframe creation (dynamic)

In [8]:
def window_with_overlap(a, window, stride):
    nrows = ((a.size-window)//stride)+1 # // floor division
    n = a.strides[0]
    # create a view into the array a with the given shape and strides
    return np.lib.stride_tricks.as_strided(a, shape=(nrows,window), strides=(stride*n,n))

In [47]:
def extract_dynamic_features(path_EDA, path_VA, window, stride):
    
    feature_set = pd.DataFrame()
    count = 1
    
    for eda_file in tqdm(natsorted(os.listdir(path_EDA))):
        if eda_file.endswith('.csv'):
            
            file_name = os.path.basename(eda_file)
            id = file_name.split('_')[0]
            data = pd.read_csv(path_EDA + '/' + file_name, header = None)
            VA_std = pd.read_csv(path_VA + '/static_annotations_std.csv', header = None)
            VA_mean = pd.read_csv(path_VA + '/static_annotations.csv', header = None)
            
            subject_vector = []
            
            if id!= VA_std.iloc[count][0]:
                continue
            
            v_std = VA_std.iloc[count][2]
            a_std = VA_std.iloc[count][1]
            v_mean = VA_mean.iloc[count][2]
            a_mean = VA_mean.iloc[count][1]
            count = count + 1
            subject_ID = data.loc[0]
            
            for i in range(1,len(subject_ID)):
                subject_vector.append(int(subject_ID[i]))
                eda_data = [] # rappresenta la colonna con il segnale nei file EDA, uno per ogni soggetto
                s = my_data.iloc[:][i]
                for k in range(1, len(s)):
                    eda_data.append(s[k])
                
                sr = 50
                times = np.arange(len(eda_data))/sr
                phasic = initialize_signal(eda_data, fs = sr, s_type = 'eda')
                
                times = np.arange(len(phasic))/sr
                frames = window_with_overlap(phasic, window, stride)
                time_frames = window_with_overlap(times, window, stride)
                
                for fidx, frame in enumerate(frames):
                    
                    feature = {}
                    frame_time = fidx*0.5+1
                    times = time_frames[fidx]
                    frequency_signal = np.abs(np.fft.fft(frame))
                    frequency = np.fft.fftfreq(frame.size, d=1/sr)
                                        
                    #print(fidx, frame_time, subject_ID)


                    feature['music_ID'] = id
                    feature['subject_ID'] = subject_ID
                    feature['frame_time'] = frame_time
                    
                feature_set = feature_set.append(pd.DataFrame(data=feature, index=[0]))
    
    return feature_set

            

In [48]:
path_EDA = '/Users/gioelepozzi/Desktop/MasterThesis/code/eda_feature_extraction/data'
path_VA = '/Users/gioelepozzi/Desktop/data/annotations'

a = extract_dynamic_features(path_EDA, path_VA, 50, 25)

100%|██████████| 3/3 [00:00<00:00,  3.53it/s]


In [7]:

     
# get the directory
path_EDA = '/Users/gioelepozzi/Desktop/MasterThesis/code/eda_feature_extraction/data'
path_VA = '/Users/gioelepozzi/Desktop/data/annotations'

l = [] # lista per unire i risultati in un DataFrame unico
count = 1 # contatore per ciclare tutti i brani nel file con i dati di VA
dictionary = {} # per creare il DataFrame

# ciclo per ogni brano
for csv_file in tqdm(natsorted(os.listdir(path_EDA))):
    if csv_file.endswith(".csv"):
        # prendo dal nome del file il numero del brano (music_ID)
        file_name = os.path.basename(csv_file)
        music_ID = file_name.split('_')[0]
        #print('file name',file_name,music_ID, VA_std.iloc[count][0])

        
        # prendo file con valori di EDA e VA
        my_data = pd.read_csv(path_EDA + '/' + file_name, header = None)
        VA_std = pd.read_csv(path_VA + '/static_annotations_std.csv', header = None)
        VA_mean = pd.read_csv(path_VA + '/static_annotations.csv', header = None)
        
        # ci sono alcuni file EDA che non hanno il corrispettivo valore di VA
        if music_ID != VA_std.iloc[count][0]:
            continue

        # prendo valori di VA dai file qua sopra
        v_std = VA_std.iloc[count][2]
        a_std = VA_std.iloc[count][1]
        v_mean = VA_mean.iloc[count][2]
        a_mean = VA_mean.iloc[count][1]
        count = count + 1
        
        # prendo ID della persona dai file EDA
        subject_ID = my_data.loc[0]
        
        # creo un vettore dei soggetti, per ogni brano. Per ogni soggetto calcolo features
        subject_vector = []
        
        ZCR_vector = []
        
        for i in range(1,len(subject_ID)):
            subject_vector.append(int(subject_ID[i]))
            
            # modifica del segnale per fare resampling, filtraggio, e prendere la parte di phasic
            eda_data = [] # rappresenta la colonna con il segnale nei file EDA, uno per ogni soggetto
            s = my_data.iloc[:][i]
            for k in range(1, len(s)):
                eda_data.append(s[k])
                
            fs = 50
            times = np.arange(len(eda_data))/fs
            
            # prendo la parte phasic del segnale
            phasic = initialize_signal(eda_data, fs = fs, s_type = 'eda')
            
            window = 50
            stride = 25
            frames = window_with_overlap(phasic, window, stride)
            time_frames = window_with_overlap(times, window, stride)
            
            for fidx, frame in enumerate(frames):
                
                frame_time = fidx*0.5+1
                times = time_frames[fidx]
            
            
            # creo un dizionario
        labels = [
            'music_ID',
            'subject_ID',
            'frame',
            'valence(mean)',
            'arousal(mean)',
            'valence(std)',
            'arousal(std)',
            'ZCR'
        ]
        values = [
            music_ID,
            subject_vector,
            frame,
            v_mean,
            a_mean,
            v_std,
            a_std,
            ZCR_vector
        ]
        
        # popolo il dizionario
        for j in range(len(labels)):
            dictionary[labels[j]] = values[j]
        
        # creo il dataframe
        df = pd.DataFrame(dictionary)
        l.append(df)
        
        continue


results = pd.concat(l, ignore_index=True)


# normalizzazione [0,1] per ogni feature, dalla colonna td_mean (7) in poi
#for n in range(6,26) :
#    if labels[n] == 'ZCR': # per ZCR, non voglio normalizzare questa colonna
#        continue
#    a = results.iloc[:,n]
#    b = (a - np.min(a))/np.ptp(a)
#    results.update(b)

results

  0%|          | 0/3 [00:00<?, ?it/s]


ValueError: arrays must all be same length