In [116]:
"""loop through files in S1_Dataset directory"""
"""create 2D data matrix by appending each new dataset to the bottom of data matrixx"""
"""add a column indicating the paitent ID"""
import os
from functools import reduce
from math import gcd

import numpy as np
import pandas as pd
S1_PATH = os.path.join('..','..','Datasets_Healthy_Older_People','S1_Dataset')
S2_PATH = os.path.join('..','..','Datasets_Healthy_Older_People','S2_Dataset')

# Names for the nine columns read from each file plus the patient_id column added below.
S1_COLUMN_NAMES = ['time','frontal','vertical','lateral','antenna_id','rssi','phase','frequency','activity','patient_id']

print('Importing Data...')

# os.listdir returns entries in arbitrary order; sorting keeps the
# file -> patient_id assignment identical from one run to the next.
recording_files = sorted(name for name in os.listdir(S1_PATH) if name != 'README.txt')
if not recording_files:
    raise ValueError('No recording files found in ' + S1_PATH)

# Read every file into its own frame and concatenate once at the end.
# Growing a DataFrame with .append inside the loop copies all previous rows
# on every iteration, and DataFrame.append no longer exists in pandas 2.0.
patient_frames = []
for patient_id, filename in enumerate(recording_files):
    data = pd.read_csv(os.path.join(S1_PATH, filename), header=None)
    data[9] = patient_id  # each file is treated as one patient
    patient_frames.append(data)

s1_data = pd.concat(patient_frames, ignore_index=True)
s1_data.columns = S1_COLUMN_NAMES
s1_data = s1_data.drop(columns=['phase','frequency'])

# Same rows, addressed by (patient_id, time) for per-patient lookups.
s1_data_by_patient = s1_data.set_index(['patient_id','time'])
print('Done')

Importing Data...
Done


In [117]:
# Preview the first five rows of the frame indexed by (patient_id, time).
s1_data_by_patient.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,frontal,vertical,lateral,antenna_id,rssi,activity
patient_id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,0.27203,1.0082,-0.082102,1,-63.5,1
0,0.5,0.27203,1.0082,-0.082102,1,-63.0,1
0,1.5,0.44791,0.91636,-0.013684,1,-63.5,1
0,1.75,0.44791,0.91636,-0.013684,1,-63.0,1
0,2.5,0.34238,0.96229,-0.059296,1,-63.5,1


In [118]:
"""analyse data for sequential learning preprocessing"""

too_large_interval_limit = 20  # a gap between readings above this many time units counts as a jump
TIMESTEP_RESOLUTION = 1000     # time steps are compared as whole multiples of 1/1000

# Difference between consecutive readings, taken within each patient so that
# the clock restarting at the top of the next file is never treated as a step.
# The first reading of every patient has no predecessor and yields NaN.
time_differences = s1_data.groupby('patient_id')['time'].diff()

# row label in s1_data -> size of the jump that ends at that row
too_large_intervals = time_differences[time_differences > too_large_interval_limit].to_dict()

# Only strictly positive steps are kept: NaN and repeated timestamps
# (difference 0) both fail the comparison and are dropped.
time_intervals = [round(step, 4) for step in time_differences[time_differences > 0].tolist()]
if not time_intervals:
    raise ValueError('s1_data contains no increasing timestamps to analyse')

# some large jumps of missing data found
print('Maximum time step: ', max(time_intervals))
print('Minimum time step: ', min(time_intervals))
print('Number of jumps larger than %ds: ' %too_large_interval_limit, len(too_large_intervals))

# To inspect the rows around each jump, uncomment the lines below.
# The lower bound is clamped because a negative slice start would wrap
# around to the end of the frame.
# for k, v in too_large_intervals.items():
#     print('\nIndex: ', k, '\tJump: ', v)
#     print(s1_data[max(k - 5, 0):k + 5])

# find the smallest timestep: the greatest common divisor of all observed
# steps, computed on integers because gcd is not defined for floats
steps_in_ticks = (round(step * TIMESTEP_RESOLUTION) for step in time_intervals)
smallest_timestep = reduce(gcd, steps_in_ticks) / TIMESTEP_RESOLUTION

print('Time step to fit everything: ', smallest_timestep)

Maximum time step:  185.15
Minimum time step:  0.02
Number of jumps larger than 20s:  47
Time step to fit everything:  0.005


In [171]:
"""preprocess data for sequential learning"""

"""
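a data point is one flat row holding the features of w = window_size consecutive
readings n, ..., n+w-1, laid out reading by reading:
<frontal_n, vertical_n, lateral_n, antennaID_n, rssi_n>, ..., <frontal_n+w-1, vertical_n+w-1, lateral_n+w-1, antennaID_n+w-1, rssi_n+w-1>

a label is taken from reading n+w+1 and is either the set which contains the features of that reading:
<frontal_n+w+1>, <vertical_n+w+1>, <lateral_n+w+1>, <antennaID_n+w+1>, <rssi_n+w+1>
or just its activity (this is the label the code below builds):
<activity_n+w+1>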
"""

window_size = 10
feature_columns = ['frontal','vertical','lateral','antenna_id','rssi']
label_column = 'activity'

# Two-level column header: (position inside the window, feature name).
top_level_column_indexes = np.arange(window_size)
second_level_column_indexes = np.array(feature_columns)
indexes = pd.MultiIndex.from_product([top_level_column_indexes,second_level_column_indexes])


def _build_patient_windows(readings, window_size):
    """Slide a fixed-length window over the readings of one patient.

    Parameters
    ----------
    readings : 2-D float array with one row per reading; the feature
        columns come first and the label is the last column.
    window_size : number of consecutive readings flattened into one sample.

    Returns
    -------
    (windows, labels) where ``windows`` has shape
    ``(n_windows, window_size * n_features)`` and ``labels`` has shape
    ``(n_windows,)``. Both are empty when there are too few readings.
    """
    features = readings[:, :-1]
    n_windows = max(len(readings) - window_size - 1, 0)

    windows = np.empty((n_windows, window_size * features.shape[1]), dtype=float)
    for start in range(n_windows):
        # Row-major flattening gives reading 0's features, then reading 1's, ...
        # which is the order of the ``indexes`` column header.
        windows[start] = features[start:start + window_size].ravel()

    # NOTE(review): the label is read from row start + window_size + 1, which
    # skips the reading directly after the window. Kept as in the original
    # code; confirm whether start + window_size was intended.
    label_start = window_size + 1
    labels = readings[label_start:label_start + n_windows, -1]
    return windows, labels


patient_ids = s1_data_by_patient.index.get_level_values('patient_id').unique()
number_of_patients = len(patient_ids)

sequence_blocks = []
label_blocks = []
for patient_index in patient_ids:
    patient_data = s1_data_by_patient.xs(patient_index)
    # Select columns by name so the result does not depend on column order;
    # the cast gives one float array for the int and float columns alike.
    readings = patient_data[feature_columns + [label_column]].values.astype(float)
    patient_windows, patient_labels = _build_patient_windows(readings, window_size)
    sequence_blocks.append(patient_windows)
    label_blocks.append(patient_labels)

if sequence_blocks:
    sequence_matrix = np.concatenate(sequence_blocks)
    all_patients_sequence_label = np.concatenate(label_blocks)
else:
    sequence_matrix = np.empty((0, len(indexes)))
    all_patients_sequence_label = np.empty(0)

assert sequence_matrix.shape == (len(all_patients_sequence_label), len(indexes))

print(sequence_matrix.shape)
print(all_patients_sequence_label.shape)

all_patients_sequence_data = pd.DataFrame(sequence_matrix, columns=indexes)
all_patients_sequence_data['label'] = all_patients_sequence_label
all_patients_sequence_data.head(10)


(23400, 50)
(23400,)


Unnamed: 0_level_0,0,0,0,0,0,1,1,1,1,1,...,8,8,8,8,9,9,9,9,9,label
Unnamed: 0_level_1,frontal,vertical,lateral,antenna_id,rssi,frontal,vertical,lateral,antenna_id,rssi,...,vertical,lateral,antenna_id,rssi,frontal,vertical,lateral,antenna_id,rssi,Unnamed: 21_level_1
0,0.27203,1.00820,-0.082102,1.0,-63.5,0.27203,1.00820,-0.082102,1.0,-63.0,...,1.03120,-0.127710,1.0,-64.5,0.23685,1.03120,-0.127710,1.0,-66.0,1.0
1,0.27203,1.00820,-0.082102,1.0,-63.0,0.44791,0.91636,-0.013684,1.0,-63.5,...,1.03120,-0.127710,1.0,-66.0,0.23685,1.03120,-0.127710,1.0,-63.0,1.0
2,0.44791,0.91636,-0.013684,1.0,-63.5,0.44791,0.91636,-0.013684,1.0,-63.0,...,1.03120,-0.127710,1.0,-63.0,0.23685,1.03120,-0.127710,1.0,-65.0,1.0
3,0.44791,0.91636,-0.013684,1.0,-63.0,0.34238,0.96229,-0.059296,1.0,-63.5,...,1.03120,-0.127710,1.0,-65.0,0.31893,0.99674,-0.070699,1.0,-62.0,1.0
4,0.34238,0.96229,-0.059296,1.0,-63.5,0.34238,0.96229,-0.059296,4.0,-56.5,...,0.99674,-0.070699,1.0,-62.0,0.31893,0.99674,-0.070699,4.0,-56.5,1.0
5,0.34238,0.96229,-0.059296,4.0,-56.5,0.30721,0.99674,-0.070699,1.0,-63.5,...,0.99674,-0.070699,4.0,-56.5,0.47136,0.92784,-0.002281,1.0,-62.5,1.0
6,0.30721,0.99674,-0.070699,1.0,-63.5,0.27203,0.99674,-0.093505,1.0,-64.0,...,0.92784,-0.002281,1.0,-62.5,0.40101,0.93932,-0.025087,1.0,-64.0,1.0
7,0.27203,0.99674,-0.093505,1.0,-64.0,0.23685,1.03120,-0.127710,1.0,-64.5,...,0.93932,-0.025087,1.0,-64.0,0.40101,0.93932,-0.025087,1.0,-62.5,1.0
8,0.23685,1.03120,-0.127710,1.0,-64.5,0.23685,1.03120,-0.127710,1.0,-66.0,...,0.93932,-0.025087,1.0,-62.5,0.40101,0.93932,-0.025087,1.0,-64.5,1.0
9,0.23685,1.03120,-0.127710,1.0,-66.0,0.23685,1.03120,-0.127710,1.0,-63.0,...,0.93932,-0.025087,1.0,-64.5,0.40101,0.93932,-0.025087,4.0,-58.0,1.0
