# Creation of the dataframe

Per ora i file sono organizzati:
* i file EDA si trovano nella cartella EDA, all'interno della directory del notebook.
* i dati di Valence-Arousal sono nella cartella VA:
    * static_annotations_std.csv -> VA in std dev
    * static_annotations.csv -> VA in mean
    
Viene creato un DataFrame con una riga per ogni coppia (brano, soggetto) e una colonna per ogni feature: features nel dominio del tempo (time domain) e features statistiche.
I dati EDA "raw" vengono pre-processati nella funzione initialize_signal:
* crea il segnale EvenlySignal dalla libreria pyphysio
* dà la possibilità di fare resampling
* dà la possibilità di filtrare il segnale per ridurre il rumore
* estrae la parte di phasic (SCR)
* fa una normalizzazione della parte di phasic in [0, 1] 


#### TODO:
* creare una funzione per fare windowing
* estrarre features in base alla divisione del segnale in bande, come PSD e MFCC
* cambiare normalizzazione?
* rimuovere features poco utili?

In [19]:
import pandas as pd
import os
import numpy as np
from scipy import stats, signal

In [24]:
# Build one feature table with a row per (song, subject) pair.
#
# For every CSV file in EDA/ (one file per song) the script reads the subjects'
# EDA recordings, extracts the normalised phasic component of each recording
# with initialize_signal, computes the time-domain and statistical features,
# and attaches the song's Valence-Arousal annotations.
import re

# Directory the notebook is run from; the EDA/ and VA/ folders are expected inside it.
base_dir = os.getcwd()
eda_dir = os.path.join(base_dir, 'EDA')
va_dir = os.path.join(base_dir, 'VA')

# The Valence-Arousal tables do not depend on the song being processed, so they
# are read once here instead of once per EDA file.
VA_std = pd.read_csv(os.path.join(va_dir, 'static_annotations_std.csv'), header=None)
VA_mean = pd.read_csv(os.path.join(va_dir, 'static_annotations.csv'), header=None)

# (column label, function) pairs applied to the phasic signal of every subject.
# The order of this list is the order of the feature columns in the result.
feature_extractors = [
    # time-domain features
    ('td_mean', td_mean),
    ('td_min', td_min),
    ('td_max', td_max),
    ('td_range', td_range),
    ('td_median', td_median),
    ('td_std_dev', td_std_dev),
    ('td_sum', td_sum),
    ('td_AUC', td_AUC),
    ('td_RMSSD', td_RMSSD),
    ('td_SDSD', td_SDSD),
    # statistical features
    ('mean', np.mean),
    ('std', np.std),
    ('kurt', stats.kurtosis),
    ('skew', stats.skew),
]

frames = []  # one DataFrame per song, concatenated at the end

# Row of the VA tables that belongs to the current song. It starts at 1,
# presumably because row 0 holds the column names (the tables are read with
# header=None) - TODO confirm.
# NOTE(review): songs are paired with VA rows purely by position, following the
# lexicographic order of the file names ('10.csv' sorts before '2.csv').
# Confirm the VA rows follow that same order.
count = 1

# one iteration per song
for csv_file in sorted(os.listdir(eda_dir)):
    if not csv_file.endswith('.csv'):
        continue

    # The song number (music_ID) is the run of digits that opens the file name.
    # All leading digits are used, so '10.csv' yields 10 rather than 1.
    file_name = os.path.basename(csv_file)
    id_match = re.match(r'\d+', file_name)
    if id_match is None:
        raise ValueError(
            'cannot read a music ID from EDA file name {!r}'.format(file_name)
        )
    music_ID = int(id_match.group())

    # EDA recordings of this song
    my_data = pd.read_csv(os.path.join(eda_dir, file_name), header=None)

    # Valence-Arousal values of this song: column 1 is read as arousal and
    # column 2 as valence in both tables.
    v_std = VA_std.iloc[count, 2]
    a_std = VA_std.iloc[count, 1]
    v_mean = VA_mean.iloc[count, 2]
    a_mean = VA_mean.iloc[count, 1]
    count += 1

    # The first row of the EDA file carries the subject IDs; column 0 is skipped.
    subject_ID = my_data.loc[0]

    subject_vector = []
    feature_vectors = {label: [] for label, _ in feature_extractors}
    # NOTE: the periodogram is computed for every subject but is not exported
    # as a column yet.
    psd_vector = []

    for i in range(1, len(subject_ID)):
        subject_vector.append(int(subject_ID[i]))

        # Samples of subject i: every row of column i below the subject-ID row.
        eda_data = my_data.iloc[1:, i].tolist()

        # resampling, filtering and extraction of the normalised phasic part
        phasic = initialize_signal(eda_data, fs=50, s_type='eda')

        for label, extract in feature_extractors:
            feature_vectors[label].append(extract(phasic))
        psd_vector.append(signal.periodogram(phasic))

    # Scalars (song ID and VA values) are broadcast by pandas over the
    # per-subject lists, giving one row per subject.
    dictionary = {
        'music_ID': music_ID,
        'subject_ID': subject_vector,
        'valence(mean)': v_mean,
        'arousal(mean)': a_mean,
        'valence(std)': v_std,
        'arousal(std)': a_std,
    }
    for label, _ in feature_extractors:
        dictionary[label] = feature_vectors[label]

    df = pd.DataFrame(dictionary)
    frames.append(df)

results = pd.concat(frames, ignore_index=True)

results

Unnamed: 0,music_ID,subject_ID,valence(mean),arousal(mean),valence(std),arousal(std),td_mean,td_min,td_max,td_range,td_median,td_std_dev,td_sum,td_AUC,td_RMSSD,td_SDSD,mean,std,kurt,skew
0,1,100179,0.575,0.4,0.160078105936,0.1561249499599999,0.175641,0.0,1.0,1.0,0.08014,0.215012,11.240996,5.620498,0.154575,0.154575,0.1756405691962342,0.215012495980345,4.141992,2.042004
1,1,100184,0.575,0.4,0.160078105936,0.1561249499599999,0.193594,0.0,1.0,1.0,0.10078,0.196564,12.390007,6.195004,0.134647,0.134647,0.1935938611895901,0.1965638305069185,5.447091,2.391835
2,1,100180,0.575,0.4,0.160078105936,0.1561249499599999,0.366166,0.0,1.0,1.0,0.284821,0.180806,23.43464,11.71732,0.151436,0.151436,0.3661662513916053,0.180805556694282,2.72162,1.539666
3,1,110448,0.575,0.4,0.160078105936,0.1561249499599999,0.159363,0.0,1.0,1.0,0.044891,0.21852,10.199226,5.099613,0.16798,0.16798,0.1593628987771807,0.2185198649606048,4.671846,2.184764
4,1,100178,0.575,0.4,0.160078105936,0.1561249499599999,0.151093,0.0,1.0,1.0,0.058018,0.194999,9.669976,4.834988,0.144733,0.144733,0.1510933808500204,0.19499910456637,6.790184,2.549743
5,1,200373,0.575,0.4,0.160078105936,0.1561249499599999,0.383038,0.0,1.0,1.0,0.270897,0.223092,24.514427,12.257214,0.206016,0.206016,0.3830379260821442,0.2230922841672034,0.386135,0.963609
6,1,100181,0.575,0.4,0.160078105936,0.1561249499599999,0.20961,0.0,1.0,1.0,0.080458,0.2367,13.41501,6.707505,0.185273,0.185273,0.2096095279800409,0.2367000033596688,1.631038,1.584972
7,1,100435,0.575,0.4,0.160078105936,0.1561249499599999,0.229654,0.0,1.0,1.0,0.147542,0.210983,14.697824,7.348912,0.183549,0.183549,0.2296535031610192,0.2109829108899656,4.592006,2.308827
8,1,100443,0.575,0.4,0.160078105936,0.1561249499599999,0.190868,0.0,1.0,1.0,0.093382,0.207416,12.215578,6.107789,0.14787,0.14787,0.1908684060422011,0.207415571847268,3.493104,1.921157
9,1,100177,0.575,0.4,0.160078105936,0.1561249499599999,0.198537,0.0,1.0,1.0,0.102511,0.212428,12.706342,6.353171,0.193626,0.193626,0.1985365863649069,0.2124276576285251,1.863114,1.475238


In [245]:
# Write the assembled feature table to a CSV file in the working directory.

results.to_csv(path_or_buf='dataframe_EDA.csv')

## Analysis of the EDA signal

In [4]:
import pyphysio as ph
import pyphysio.filters.Filters as flt
import pyphysio.estimators.Estimators as est
import pyphysio.indicators.TimeDomain as td_ind
import pyphysio.indicators.FrequencyDomain as fd_ind

Please cite:
Bizzego et al. (2019) 'pyphysio: A physiological signal processing library for data science approaches in physiology', SoftwareX


In [5]:
def initialize_signal(signal, fs, s_type):
    """Pre-process a raw recording and return its normalised phasic component.

    The samples are wrapped in a pyphysio ``EvenlySignal``, resampled to 2 Hz,
    passed through an elliptic IIR filter, converted to a driver estimate and
    split into phasic and tonic parts. The phasic part is then rescaled to
    the [0, 1] interval.

    Parameters
    ----------
    signal : sequence of numbers
        Raw samples of the recording.
    fs : number
        Sampling frequency of ``signal``, passed to ``EvenlySignal``.
    s_type : str
        Signal type label passed to ``EvenlySignal`` (the caller uses 'eda').

    Returns
    -------
    The phasic component, min-max rescaled. If the phasic component is
    constant its range is zero and the rescaling divides by zero.
    """
    # Build the signal from the argument itself, not from a module-level
    # variable, so the function works for any caller.
    evenly_signal = ph.EvenlySignal(values=signal, sampling_freq=fs, signal_type=s_type)

    # resampling (fout: sampling frequency after resampling)
    resampled = evenly_signal.resample(fout=2)

    # filtering to reduce noise; here `fs` is the filter's own keyword, not the
    # sampling frequency received by this function.
    # NOTE(review): fs=1.1 looks like a stop frequency above the 1 Hz Nyquist
    # limit of the 2 Hz resampled signal - confirm against the pyphysio docs.
    filtered = flt.IIRFilter(fp=0.8, fs=1.1, ftype='ellip')(resampled)

    # phasic extraction; the tonic part and the third output are not used
    driver = est.DriverEstim()(filtered)
    phasic, tonic, _ = ph.PhasicEstim(delta=0.02)(driver)

    # normalization to [0, 1]
    return (phasic - np.min(phasic)) / np.ptp(phasic)

### Functions for the features

In [14]:
# statistics features

# from scipy library


# time domain

def td_mean(signal):
    """Return the arithmetic mean of ``signal`` using pyphysio's ``Mean`` indicator."""
    return td_ind.Mean()(signal)

def td_min(signal):
    """Return the result of pyphysio's time-domain ``Min`` indicator applied to ``signal``."""
    return td_ind.Min()(signal)

def td_max(signal):
    """Return the result of pyphysio's time-domain ``Max`` indicator applied to ``signal``."""
    return td_ind.Max()(signal)

def td_range(signal):
    """Return the result of pyphysio's time-domain ``Range`` indicator applied to ``signal``."""
    return td_ind.Range()(signal)

def td_median(signal):
    """Return the result of pyphysio's time-domain ``Median`` indicator applied to ``signal``."""
    return td_ind.Median()(signal)

def td_std_dev(signal):
    """Return the result of pyphysio's time-domain ``StDev`` indicator applied to ``signal``."""
    return td_ind.StDev()(signal)

def td_sum(signal):
    """Return the sum of the values in ``signal`` using pyphysio's ``Sum`` indicator."""
    return td_ind.Sum()(signal)

def td_AUC(signal):
    """Return the area under the curve of ``signal`` using pyphysio's ``AUC`` indicator."""
    return td_ind.AUC()(signal)

def td_RMSSD(signal):
    """Return the RMSSD of ``signal`` using pyphysio's ``RMSSD`` indicator.

    RMSSD: square root of the mean of the squared 1st order discrete
    differences.
    """
    return td_ind.RMSSD()(signal)

def td_SDSD(signal):
    """Return the SDSD of ``signal`` using pyphysio's ``SDSD`` indicator.

    SDSD: standard deviation of the 1st order discrete differences.
    """
    return td_ind.SDSD()(signal)


# frequency domain





# prove


