# Creation of the dataframe

Per ora i file sono organizzati così:
* i file eda sono nella directory del notebook nella cartella EDA.
* i dati di Valence-Arousal sono nella cartella VA:
    * static_annotations_std.csv -> VA in std dev
    * static_annotations.csv -> VA in mean
    
Viene creato un DataFrame con una riga per ogni soggetto di ogni brano e una colonna per ogni feature (time-domain features e statistic features).
I dati EDA "raw" vengono pre-processati nella funzione initialize_signal:
* crea il segnale EvenlySignal dalla libreria pyphysio
* dà la possibilità di fare resampling
* dà la possibilità di filtrare il segnale per ridurre il rumore
* estrae la parte di phasic (SCR)


#### TODO:
* creare una funzione per fare windowing
* estrarre features in base alla divisione del segnale in bande, come PSD e MFCC
* cambiare normalizzazione?
* rimuovere features poco utili?

In [3]:
import pandas as pd
import os
import numpy as np
from scipy import stats, signal

In [10]:
# Build the feature table: one row per (song, subject) pair.
#
# Expected layout, relative to the current working directory:
#   EDA/<music_ID>....csv              one file per song; row 0 holds the subject
#                                      IDs, column 0 is skipped, every other
#                                      column is one subject's EDA signal
#   VA/static_annotations.csv          mean valence/arousal annotations
#   VA/static_annotations_std.csv      std-dev valence/arousal annotations


def _leading_int(name):
    """Return the integer formed by the leading decimal digits of *name*.

    Raises ValueError when *name* does not start with a digit.
    """
    digits = name[:len(name) - len(name.lstrip('0123456789'))]
    return int(digits)


base_dir = os.getcwd()
eda_dir = os.path.join(base_dir, 'EDA')
va_dir = os.path.join(base_dir, 'VA')

# The annotation tables do not depend on the song, so they are read only once.
VA_std = pd.read_csv(os.path.join(va_dir, 'static_annotations_std.csv'), header=None)
VA_mean = pd.read_csv(os.path.join(va_dir, 'static_annotations.csv'), header=None)

# Feature name -> callable applied to the phasic signal of one subject.
# The insertion order is the column order of the resulting DataFrame.
feature_functions = {
    # pyphysio time-domain indicators
    'td_mean': td_mean,
    'td_min': td_min,
    'td_max': td_max,
    'td_range': td_range,
    'td_median': td_median,
    'td_std_dev': td_std_dev,
    'td_sum': td_sum,
    'td_AUC': td_AUC,
    'td_RMSSD': td_RMSSD,
    'td_SDSD': td_SDSD,
    # statistical features
    'mean': np.mean,
    'std': np.std,
    'kurt': stats.kurtosis,
    'skew': stats.skew,
}
# PSD-based features are not part of the table yet (see the TODO list),
# so no periodogram is computed here.

# Sort the files by their numeric song ID, not alphabetically, so that
# "10_..." comes after "9_..." and the running row index into the VA tables
# follows the song numbering.
# NOTE(review): this assumes the VA rows are listed in increasing song order,
# with row 0 being the header line - confirm against the VA files.
csv_files = sorted(
    (name for name in os.listdir(eda_dir) if name.endswith(".csv")),
    key=lambda name: (_leading_int(name), name),
)

frames = []  # one DataFrame per song, concatenated at the end

for va_row, file_name in enumerate(csv_files, start=1):
    # The song number is the run of digits at the start of the file name.
    music_ID = _leading_int(file_name)

    my_data = pd.read_csv(os.path.join(eda_dir, file_name), header=None)

    # Annotation values for this song: column 2 is used as valence,
    # column 1 as arousal.
    v_std = VA_std.iloc[va_row, 2]
    a_std = VA_std.iloc[va_row, 1]
    v_mean = VA_mean.iloc[va_row, 2]
    a_mean = VA_mean.iloc[va_row, 1]

    subject_ids = []
    features = {name: [] for name in feature_functions}

    # Column 0 is skipped; every other column belongs to one subject.
    for i in range(1, my_data.shape[1]):
        # Row 0 of the EDA file carries the subject ID of the column.
        subject_ids.append(int(my_data.iloc[0, i]))

        # Signal samples of this subject: column i without the subject-ID row.
        eda_data = list(my_data.iloc[1:, i])

        # Resample, filter and keep only the phasic part of the signal.
        phasic = initialize_signal(eda_data, fs=50, s_type='eda')

        for name, compute in feature_functions.items():
            features[name].append(compute(phasic))

    # Scalars (song ID and annotations) are broadcast by pandas over the
    # per-subject lists, giving one row per subject.
    song = {
        'music_ID': music_ID,
        'subject_ID': subject_ids,
        'valence(mean)': v_mean,
        'arousal(mean)': a_mean,
        'valence(std)': v_std,
        'arousal(std)': a_std,
    }
    song.update(features)
    frames.append(pd.DataFrame(song))


results = pd.concat(frames, ignore_index=True)


# Min-max normalisation to [0, 1] of every feature column, i.e. the columns
# at positions 6..19 (from 'td_mean' onwards).
# DataFrame.update only copies non-NA values, so a column whose scaled values
# come out as NaN is left as it was.
for column_position in range(6, 20):
    feature = results.iloc[:, column_position]
    lowest = np.min(feature)
    spread = np.ptp(feature)
    results.update((feature - lowest) / spread)

results

Unnamed: 0,music_ID,subject_ID,valence(mean),arousal(mean),valence(std),arousal(std),td_mean,td_min,td_max,td_range,td_median,td_std_dev,td_sum,td_AUC,td_RMSSD,td_SDSD,mean,std,kurt,skew
0,1,100179,0.575,0.4,0.160078105936,0.1561249499599999,1.0,0.778218,1.0,1.0,1.0,1.0,0.962261,0.962261,1.0,1.0,1.0,1.0,0.175626,0.315496
1,1,100184,0.575,0.4,0.160078105936,0.1561249499599999,0.224283,0.749634,0.313875,0.336655,0.0306931,0.30804,0.215973,0.215973,0.291738,0.291738,0.224283,0.30804,0.22718,0.399334
2,1,100180,0.575,0.4,0.160078105936,0.1561249499599999,0.004711,0.98317,0.004754,0.006358,0.001526381,0.005575,0.004731,0.004731,0.004821,0.004821,0.00471058,0.00557532,0.119519,0.195111
3,1,110448,0.575,0.4,0.160078105936,0.1561249499599999,0.170708,0.957235,0.194687,0.194304,0.05077942,0.198305,0.164431,0.164431,0.210497,0.210497,0.170708,0.198305,0.196556,0.349709
4,1,100178,0.575,0.4,0.160078105936,0.1561249499599999,0.018987,0.99298,0.027671,0.027385,0.001810125,0.025341,0.018466,0.018466,0.024064,0.024064,0.0189873,0.0253413,0.280234,0.437176
5,1,200373,0.575,0.4,0.160078105936,0.1561249499599999,0.006899,0.988175,0.004069,0.005054,0.004142969,0.006352,0.006836,0.006836,0.00691,0.00691,0.00689874,0.00635162,0.027264,0.057058
6,1,100181,0.575,0.4,0.160078105936,0.1561249499599999,0.504015,0.790569,0.468612,0.481854,0.2249383,0.531352,0.485092,0.485092,0.57788,0.57788,0.504015,0.531352,0.076439,0.205968
7,1,100435,0.575,0.4,0.160078105936,0.1561249499599999,0.001438,1.0,0.001814,0.001353,2.492221e-16,0.002188,0.001583,0.001583,0.001123,0.001123,0.00143815,0.00218754,0.193402,0.379441
8,1,100443,0.575,0.4,0.160078105936,0.1561249499599999,0.36197,0.856369,0.368586,0.376222,0.3187391,0.363373,0.348436,0.348436,0.358861,0.358861,0.36197,0.363373,0.149994,0.286535
9,1,100177,0.575,0.4,0.160078105936,0.1561249499599999,0.450811,0.941397,0.376078,0.372658,0.5906156,0.368721,0.433907,0.433907,0.467231,0.467231,0.450811,0.368721,0.085607,0.17967


In [11]:
# export results

results.to_csv('dataframe_EDA.csv')

## Analysis of the EDA signal

In [5]:
import pyphysio as ph
import pyphysio.filters.Filters as flt
import pyphysio.estimators.Estimators as est
import pyphysio.indicators.TimeDomain as td_ind
import pyphysio.indicators.FrequencyDomain as fd_ind

Please cite:
Bizzego et al. (2019) 'pyphysio: A physiological signal processing library for data science approaches in physiology', SoftwareX


In [6]:
def initialize_signal(signal, fs, s_type):
    """Pre-process a raw signal and return its phasic component.

    Steps, in order: wrap the samples in a pyphysio ``EvenlySignal``,
    resample it, apply an elliptic IIR filter, estimate the driver and
    extract the phasic part from it.

    Parameters
    ----------
    signal : sequence of samples
        Raw signal values (for example one subject's column of an EDA file).
    fs : number
        Sampling frequency of ``signal``.
    s_type : str
        Signal type label passed to ``EvenlySignal`` (for example ``'eda'``).

    Returns
    -------
    The phasic component returned by ``ph.PhasicEstim``.
    """
    # Build the evenly sampled signal from the *argument*; the samples must
    # not be taken from a global variable of the calling notebook cell.
    evenly = ph.EvenlySignal(values=signal, sampling_freq=fs, signal_type=s_type)

    # Resampling; fout is the output sampling frequency.
    resampled = evenly.resample(fout=2)

    # Noise reduction with an elliptic IIR filter.
    # NOTE(review): if fp/fs are frequencies in the same unit as fout, the
    # value fs=1.1 lies above the Nyquist frequency (1.0) of the resampled
    # signal - confirm that these settings are intended.
    filtered = flt.IIRFilter(fp=0.8, fs=1.1, ftype='ellip')(resampled)

    # Phasic extraction: only the phasic part is used, the other two outputs
    # of the estimator are discarded.
    driver = est.DriverEstim()(filtered)
    phasic, _tonic, _ = ph.PhasicEstim(delta=0.02)(driver)

    return phasic

### Functions for the features

In [8]:
# statistics features

# from scipy library


# time domain

def td_mean(signal):
    """Return the arithmetic mean of *signal* (pyphysio ``Mean`` indicator)."""
    return td_ind.Mean()(signal)

def td_min(signal):
    """Return the result of pyphysio's time-domain ``Min`` indicator on *signal*."""
    return td_ind.Min()(signal)

def td_max(signal):
    """Return the result of pyphysio's time-domain ``Max`` indicator on *signal*."""
    return td_ind.Max()(signal)

def td_range(signal):
    """Return the result of pyphysio's time-domain ``Range`` indicator on *signal*."""
    return td_ind.Range()(signal)

def td_median(signal):
    """Return the result of pyphysio's time-domain ``Median`` indicator on *signal*."""
    return td_ind.Median()(signal)

def td_std_dev(signal):
    """Return the result of pyphysio's time-domain ``StDev`` indicator on *signal*."""
    return td_ind.StDev()(signal)

def td_sum(signal):
    """Return the sum of the values in *signal* (pyphysio ``Sum`` indicator)."""
    return td_ind.Sum()(signal)

def td_AUC(signal):
    """Return the area under the curve of *signal* (pyphysio ``AUC`` indicator)."""
    return td_ind.AUC()(signal)

def td_RMSSD(signal):
    """Return the RMSSD of *signal* (pyphysio ``RMSSD`` indicator).

    RMSSD is the square root of the mean of the squared first-order
    discrete differences.
    """
    return td_ind.RMSSD()(signal)

def td_SDSD(signal):
    """Return the SDSD of *signal* (pyphysio ``SDSD`` indicator).

    SDSD is the standard deviation of the first-order discrete differences.
    """
    return td_ind.SDSD()(signal)


# frequency domain

