#!/usr/bin/python
# References: https://github.com/paulvangentcom/heartrate_analysis_python
import sys, argparse
from os.path import basename, join
import glob
import pandas as pd
import numpy as np
from scipy.special import entr
from scipy.stats import skew, kurtosis, pearsonr # , skewtest, kurtosistest
from scipy import interpolate, fftpack
from sklearn.metrics import mutual_info_score # mutual information algorithm
from sklearn.metrics.pairwise import euclidean_distances as dist
from statsmodels.tsa.tsatools import lagmat
import json
import matplotlib.pylab as plt
from cardio import pre_process as pr
def detect_peaks(signal, mov_avg):
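    """
    Detect the peaks of `signal`: within each region where the signal rises
    above its moving average `mov_avg` (assumed to have the same length as
    `signal`), the position of the maximum is taken as a beat.
    Returns the list of peak indices and the list of peak values.
    """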
window = []
peaklist = []
for (i, datapoint), roll in zip(enumerate(signal), mov_avg):
        if (datapoint <= roll) and (len(window) < 1):  # below the moving average and no window open (cf. the heartpy reference above)
continue
elif (datapoint > roll):
window.append(datapoint)
else:
beatposition = i - len(window) + np.argmax(window)
peaklist.append(beatposition)
window = []
return peaklist, [signal[x] for x in peaklist]
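# mutual_info: mutual information of two series, estimated from their 2-D histogram.
# MI: mutual information between a series and its lagged copies for lags 1..tmax-1,
#     used below to choose the optimal embedding delay.
# mad: median absolute deviation around a precomputed median.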
mutual_info = lambda x, y, bins : mutual_info_score(None,
None,
contingency=np.histogram2d(x, y, bins)[0])
MI = lambda x, tmax : [mutual_info(x[i + 1 :], x[: -(i + 1)], 100) for i in range(tmax - 1)]
mad = lambda x, medianX : np.median(np.abs(x-medianX))
def fnn(data, m):
"""
https://github.com/jcvasquezc/Corr_Dim
Compute the embedding dimension of a time series data to build the phase space using the false neighbors criterion
data--> time series
m --> maximmum embeding dimension
"""
    RT = 15.0  # relative-distance threshold for flagging false neighbors
    AT = 2     # amplitude threshold, relative to the spread of the data
sigmay = np.std(data, ddof=1)
nyr = len(data)
EM = lagmat(data, maxlag = m - 1)
EEM = np.asarray([EM[j,:] for j in range(m - 1, EM.shape[0])])
embedm = m
for k in range(AT, EEM.shape[1] + 1):
fnn1 = []
fnn2 = []
Ma = EEM[:,range(k)]
D = dist(Ma)
for i in range(1, EEM.shape[0] - m - k):
d = D[i,:]
pdnz = np.where(d>0)
dnz = d[pdnz]
Rm = np.min(dnz)
l = np.where(d==Rm)
l = l[0]
l = l[len(l) - 1]
if l + m + k - 1 < nyr:
fnn1.append(np.abs(data[i + m + k - 1] - data[l + m + k - 1]) / Rm)
fnn2.append(np.abs(data[i + m + k - 1] - data[l + m + k - 1]) / sigmay)
Ind1=np.where(np.asarray(fnn1) > RT)
Ind2=np.where(np.asarray(fnn2) > AT)
if len(Ind1[0]) / len(fnn1) < .1 and len(Ind2[0])/len(fnn2) < .1:
embedm = k
break
return embedm
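# A minimal usage sketch (this is how create_db calls it below):
#   embedim = fnn(sign, 20)  # scan embedding dimensions up to m = 20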
def create_db(data_dir, info_dir=""):
"""
Create .json database for the PPG data from .txt files for PPG data and information.
The database is saved in the current directory under the name "cardio.json".
Parameters
----
data_dir: string; the directory where the .txt data files must be fetched.
info_dir: string; the direcotry where the .txt info files must be fetched. Default=""
The function does not return anything.
Notes:
The files in info_dir and data_dir must be named as follows,
head_id_type.txt
where:
head is a string head without underscore
id is the id of the file (e.g. a number)
type is either data or info
Please note that each file in data_dir must have its corresponding file in
info_dir and vice versa.
Example
>>>data_dir = "path/to/data"
>>>info_dir = data_dir
>>>create_db(data_dir, info_dir)
Processing head_1_data.txt
Processing head_2_data.txt
[...]
"""
features = {}
datas = sorted(glob.glob(join(data_dir, "*_data.txt")))
infos = sorted(glob.glob(join(info_dir, "*_info.txt")))
    for f, i in zip(datas, infos):
bf, bi = basename(f).split('_'), basename(i).split('_')
        assert bf[1] == bi[1]
        print("Processing file %s" % basename(f))
        # normalize the info table: map yes/no answers, sex labels and rhythm
        # names (Dutch and English) onto canonical codes
        info = pd.read_csv(i, sep=",", skiprows=1, names=[0, 1], index_col=0).T.replace({
            # Smoking
            "NO": 0,
            "YES": 1,
            # Sex
            "Man": "M",
            "Vrouw": "F",
            "Female": "F",
            "Male": "M",
            "(null)": np.nan,
            # Rhythm
            "Onbekend": 0,                     # unknown (Dutch)
            "Unknown": 0,
            "Boezemfibrilleren": 1,            # atrial fibrillation (Dutch)
            "Atrial Fibrillation": 1,
            "Atrial Flutter": 2,
            "AVNRT": 3,                        # atrioventricular nodal re-entrant tachycardia
            "Boezemtachycardie": 4,            # atrial tachycardia (Dutch)
            "Atrial Tachycardia": 5,
            "Extrasystolen in de boezems": 6,  # atrial extrasystoles (Dutch)
            "Extrasystolen in de kamers": 7,   # ventricular extrasystoles (Dutch)
            "Boezemflutter": 8,                # atrial flutter (Dutch)
            "VES": 9,                          # erythrocyte sedimentation rate (inflammation)
        })
data = pd.read_csv(f, sep=',')
time, sign = pr.process_pipe(data, view=False, output='', name=bf[1])
        srate = len(time) / (max(time) - min(time))  # sampling rate (Hz)
        peaks = np.arange(len(sign))  # initial peak-index candidates: every sample
        # peak detection with 4 moving-average windows of different widths (in seconds);
        # only peaks detected with all 4 windows are kept
        for w in np.array([.5, 1., 1.5, 2.]):  # window widths
            mt, mov_avg = pr.m_avg(time, sign, int(srate * w))
            # pad mov_avg with the signal mean so it matches sign in length
            len_filler = np.zeros((len(sign) - len(mov_avg)) // 2) + np.mean(sign)
            mov_avg = np.insert(mov_avg, 0, len_filler)
            mov_avg = np.append(mov_avg, len_filler)
            peaklist, sign_peak = detect_peaks(sign, mov_avg)
            peaks = np.intersect1d(peaks, peaklist)
# peaks' checking: rejecting lower peaks where RR distance is too small
final_peaks = [] # definitive peaks positions container
last_peak = -1 # control parameter to avoid undesired peaks still in final_peaks
for p in peaks:
if p <= last_peak:
continue
            evaluated_peaks = [g for g in peaks if p <= g <= srate * .5 + p]  # peaks evaluated at once; only one of them will be kept in final_peaks
last_peak = max(evaluated_peaks)
final_peaks.append(evaluated_peaks[np.argmax([sign[x] for x in evaluated_peaks])])
final_peaks = np.unique(final_peaks) # to avoid repetition of identical elements
# computation of quality coefficient
grad = np.gradient(sign) # gradient of signal
        checker = np.multiply(grad[:-1], grad[1:])  # element i equals grad_i * grad_{i+1}
        # To count "how many times the signal inverts its growth without having a peak"
        # (an index of bad quality for our signals), we count the sign changes of the
        # gradient (local maxima or minima) and subtract twice the number of peaks;
        # we then multiply by the variance of the peaks' amplitudes to amplify the
        # bad contribution of messy peaks.
        quality = np.var([sign[x] for x in final_peaks]) * (len(checker[checker < 0]) - 2 * len(final_peaks) + 2) / len(checker)  # the "+2" just avoids negative numbers
        # N.B.: quality is a defect index. It goes from 0 to 1, where 0 means a
        # perfect signal and anything above 0.05 a horrible one.
        # Typical values (with some exceptions):
        #   quality ~ 1e-5 or less  PERFECT
        #   quality ~ 1e-4          GOOD
        #   quality ~ 1e-3          GOOD WITH PROBLEMS
        #   quality ~ 2e-2          BAD
        #   quality > 5e-2          HORRIBLE
        # compute some common time-domain measurements
        RR = np.diff(time[final_peaks]) * 1e3  # time between consecutive R peaks (ms)
        RR_diff = np.abs(np.diff(RR))  # variation between consecutive RR intervals
        ibi = np.mean(RR)  # mean inter-beat interval
        bpm = 60000 / ibi  # mean beats per minute
        sdnn = np.std(RR)  # standard deviation of all RR intervals
        sdsd = np.std(RR_diff)  # standard deviation of the differences between subsequent RR intervals
        rmssd = np.sqrt(np.mean(RR_diff**2))  # root mean square of the successive differences
x = np.multiply(RR[1:-1]-RR[:-2], RR[1:-1]-RR[2:]) # we have a turning point when x>0
turning_point_ratio = lambda w : len(w[w>0.]) / len(w) # turning point ratio (randomness index)
tpr = turning_point_ratio(x)
x = np.multiply(RR_diff[1:-1]-RR_diff[:-2], RR_diff[1:-1]-RR_diff[2:]) #same but with RR_diff instead of RR
tpr_RR_diff = turning_point_ratio(x)
        pnn20 = len(RR_diff[RR_diff > 20.]) / len(RR_diff)  # fraction of successive RR differences > 20 ms
        pnn50 = len(RR_diff[RR_diff > 50.]) / len(RR_diff)  # fraction of successive RR differences > 50 ms
medianRR = np.median(RR)
madRR = mad(RR, medianRR)
Rpeakvalues = sign[final_peaks]
AA = np.diff(Rpeakvalues, n=1, axis=0)
medianAA = np.median(AA)
madAA = mad(AA, medianAA)
# compute frequency domain measurements
new_range = np.linspace(0, len(RR), len(RR)*100) # resampling data at higher frequency (x100)
tck = interpolate.splrep(np.arange(0, len(RR)), RR, s=0)
RR_new = interpolate.splev(new_range, tck, der=0)
RR_fft = fftpack.fft(RR_new) # fast fourier transform
freq = fftpack.fftfreq(len(RR_new), d=(1./srate))[:len(RR_new)//2]
RR_fft = RR_fft[:len(RR_new)//2] / len(new_range)
lf = np.trapz(abs(RR_fft[(freq >= .04) & (freq <= .15)])) # Low Frequency is related to short-term blood pressure regulation
hf = np.trapz(abs(RR_fft[(freq > .15) & (freq <= .4 )])) # High Frequency is related to breathing
# other features
        entropyRR = sum(entr(RR / sum(RR)))  # Shannon entropy of the normalized RR series
        tau_max = 400  # keep an eye on this: it probably must be < 439 (file 1380 is the shortest)
mi = MI(sign, tau_max) # mutual information
        mi_t, mi_avg = pr.m_avg(np.arange(0, len(mi)), mi, 100)  # smooth the mutual information curve
        opt_delay = mi_t[np.argmin(mi_avg)]  # optimal embedding delay: minimum of the averaged mutual information
embedim = fnn(sign, 20)
skewRR = skew(RR)
kurtRR = kurtosis(RR)
skewAA = skew(AA)
kurtAA = kurtosis(AA)
meanAA = np.mean(AA)
stddevAA = np.std(AA)
corrRRAA, twotailedPvalue_corrRRAA = pearsonr(RR, AA)
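        # gather the subject's info and every computed feature for this recording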
features[f] = {"sex" : info.Sex.values.item(),
"age" : info.Age.values.item(),
"weight" : info.Weight.values.item(),
"smoke": info.Smoking.values.item(),
"afib": info.Afib.values.item(),
"rhythm" : info.Rhythm.values.item(),
"quality" : quality,
"RR" : RR.tolist(),
"bpm" : bpm,
"ibi" : ibi,
"sdnn" : sdnn,
"sdsd" : sdsd,
"rmssd" : rmssd,
"tpr" : tpr,
"tpr_RR_diff" : tpr_RR_diff,
"pnn20" : pnn20,
"pnn50" : pnn50,
"medianRR" : medianRR,
"madRR" : madRR,
"lf" : lf,
"hf" : hf,
"skewnessRR" : skewRR,
"kurtosisRR" : kurtRR,
"entropyRR" : entropyRR,
"opt_delay" : opt_delay,
"embedim" : embedim,
"time" : time.tolist(),
"signal" : sign.tolist(),
"Rpeakvalues" : Rpeakvalues.tolist(),
"AA" : AA.tolist(),
"medianAA" : medianAA,
"madAA" : madAA,
"skewnessAA" : skewAA,
"kurtosisAA" : kurtAA,
"meanAA" : meanAA,
"std_devAA" : stddevAA,
"corrRRAA" : corrRRAA,
"corrRRAA_Pvalue" : twotailedPvalue_corrRRAA,
"length": info.Length.values.item(),
"device": info.Device.values.item(),
"city": info.City.values.item(),
"country": info.Country.values.item(),
"lifestyle": info.Lifestyle.values.item(),
"class": info.Class.values.item()
}
# saving data on file cardio.json
with open('cardio.json', 'w') as file:
json.dump(features, file)
if __name__ == '__main__':
description = "Create DB data"
parser = argparse.ArgumentParser(description = description)
parser.add_argument("-f", required=True, dest="data_dir", action="store", help="Data directory name", default='')
parser.add_argument("-i", required=False, dest="info_dir", action="store", help="Info directory name", default='')
if len(sys.argv) <= 1:
parser.print_help()
sys.exit(1)
    args = parser.parse_args()
    data_dir = args.data_dir
    info_dir = args.info_dir if args.info_dir else data_dir
create_db(data_dir, info_dir)
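# Example invocation (paths are placeholders):
#   python create_db.py -f path/to/data -i path/to/info
# If -i is omitted, the info files are looked up in the data directory.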