# Extracting Features from Raw Walking Data

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
import synapseclient
# If you have your config file set up you can run:
syn = synapseclient.login()

Welcome, Kaivon Ahmad!



In [3]:
demo_query = syn.tableQuery("SELECT * FROM syn10146552")

# Convert to dataframe
demographics_df = demo_query.asDataFrame()

In [4]:
demographics_df.head()

Unnamed: 0,recordId,healthCode,createdOn,appVersion,phoneInfo,age,are-caretaker,deep-brain-stimulation,diagnosis-year,education,...,packs-per-day,past-participation,phone-usage,professional-diagnosis,race,smartphone,smoked,surgery,video-usage,years-smoking
0_0,bbb003a9-5c7b-4d66-a1d3-bcd1430edbb7,639e8a78-3631-4231-bda1-c911c1b169e5,1425904649000,"version 1.0, build 7",iPhone 6,24.0,False,False,,Some college,...,,False,True,False,"""White or Caucasian""",Very easy,False,False,False,
1_0,05258304-77f7-4761-bc58-52dc743e44cd,52fe366a-2a9f-4260-9fb1-0fbc637a6cf4,1425926191000,"version 1.0, build 7",iPhone 5s (GSM),38.0,False,False,,Some graduate school,...,,False,True,False,"""White or Caucasian""",Very easy,False,,True,
2_0,b331ff56-c6d9-430a-bc6c-b6f52d2d8e9f,67bdd316-26fc-4fc7-8431-bf9f41a649dd,1425926604000,"version 1.0, build 7",iPhone 6 Plus,32.0,False,False,,2-year college degree,...,1.0,False,True,False,"""White or Caucasian""",Very easy,True,,True,8.0
3_0,0bc6fe52-d24d-419c-97be-91a6ba32839b,af441f87-c447-4c3c-8e00-72751aff2360,1425927330000,"version 1.0, build 7",iPhone 6,24.0,False,False,,Some college,...,,False,True,False,"""Latino/Hispanic""",Very easy,True,False,True,0.0
4_0,6ef0da29-24cd-4391-8ca4-9e20ced8dfa0,340260f8-644a-4670-8a39-bc0729310343,1425927378000,"version 1.0, build 7",iPhone 6,35.0,False,False,,Doctoral Degree,...,,False,True,False,"""White or Caucasian""",Very easy,False,False,True,


# Computational Limitations

The size of the entire dataset that can be obtained from the synapse API is around 36,000. Each patient is an instance and each patient contains it's own json file that includes walking device data. I attemped to extract features with the entire dataset but it was taking an indefinite amount of time for my computer to go through each row, open the json file and extract the different features. Due to this inability, I was only able to use 2,000 instances from the dataset. The following notebooks will be mostly for proof of concept with the greater intention being to download the entire dataset and see if the analysis can be further scaled.

In [5]:
healthcode_list = ", ".join( repr(i) for i in demographics_df["healthCode"])

In [6]:
# Query 'walking training table' for walk data recordIDs and healthCodes. 
INPUT_WALKING_ACTIVITY_TABLE_SYNID = "syn10146553"
actv_walking_syntable = syn.tableQuery(('SELECT "recordId", "healthCode", "deviceMotion_walking_outbound.json.items" FROM {0} WHERE healthCode IN ({1}) AND "deviceMotion_walking_outbound.json.items" is not null limit 2000').format(INPUT_WALKING_ACTIVITY_TABLE_SYNID, healthcode_list))
actv_walking = actv_walking_syntable.asDataFrame()
actv_walking['idx'] = actv_walking.index

In [7]:
actv_walking.head()

Unnamed: 0,recordId,healthCode,deviceMotion_walking_outbound.json.items,idx
1401_0,a3e54d84-360e-4e8a-9534-de188d5fa9e1,000240d1-1110-4dd2-a2d0-e344c37efd68,2408193,1401_0
19613_9,27d5ffdd-536e-4f2d-b478-74bbd520e9f7,00081bd9-9abd-4003-b035-de6cc3e8c922,3253717,19613_9
2570_1,cdd76ffc-9607-4106-b806-2fa29ad282f4,00372eda-3796-481b-96f7-f37e8e600904,2518106,2570_1
2573_1,ba735eb0-0639-4392-8d4d-f87b06c7b7e6,00372eda-3796-481b-96f7-f37e8e600904,2578074,2573_1
2574_1,2344a520-81df-4166-9f63-09343c502a21,00372eda-3796-481b-96f7-f37e8e600904,2502847,2574_1


In [8]:
import json

# bulk download walk JSON files containing sensor data
walk_json_files = syn.downloadTableColumns(actv_walking_syntable, "deviceMotion_walking_outbound.json.items")
items = walk_json_files.items()

# create pandas dataframe of JSON filepaths and filehandleIDs
walk_json_files_temp = pd.DataFrame({"deviceMotion_walking_outbound.json.items": [i[0] for i in items], "outbound_walk_json_file": [i[1] for i in items]})

# convert ints to strings for merging
actv_walking["deviceMotion_walking_outbound.json.items"] = actv_walking["deviceMotion_walking_outbound.json.items"].astype(str)

# merge IDs/healthCodes with JSON data
actv_walk_temp = pd.merge(actv_walking, walk_json_files_temp, on="deviceMotion_walking_outbound.json.items")

Downloading 0 files, 2000 cached locally


In [9]:
actv_walk_temp.head()

Unnamed: 0,recordId,healthCode,deviceMotion_walking_outbound.json.items,idx,outbound_walk_json_file
0,a3e54d84-360e-4e8a-9534-de188d5fa9e1,000240d1-1110-4dd2-a2d0-e344c37efd68,2408193,1401_0,/Users/kaivon123/.synapseCache/193/2408193/dev...
1,27d5ffdd-536e-4f2d-b478-74bbd520e9f7,00081bd9-9abd-4003-b035-de6cc3e8c922,3253717,19613_9,/Users/kaivon123/.synapseCache/717/3253717/dev...
2,cdd76ffc-9607-4106-b806-2fa29ad282f4,00372eda-3796-481b-96f7-f37e8e600904,2518106,2570_1,/Users/kaivon123/.synapseCache/106/2518106/dev...
3,ba735eb0-0639-4392-8d4d-f87b06c7b7e6,00372eda-3796-481b-96f7-f37e8e600904,2578074,2573_1,/Users/kaivon123/.synapseCache/74/2578074/devi...
4,2344a520-81df-4166-9f63-09343c502a21,00372eda-3796-481b-96f7-f37e8e600904,2502847,2574_1,/Users/kaivon123/.synapseCache/847/2502847/dev...


# Features Chosen

For patient's acceleration in the X, Y, and Z axis, each axis will have the following:
-  mean of the acceleration
-  standard deviation of the acceleration
-  median of the acceleration
-  skew of the acceleration
-  kurtosis of the acceleration
-  range of the acceleration
-  variation of the acceleration

In [10]:
import scipy.stats.stats as st

# get average(mean) x-coordinate UserAcceleration for each file
x_accel = [] # initialize empty list for storing x-acceleration values
avg_x_accel = [] # initialize empty list for different features
std_x_accel = []
med_x_accel = []
skew_x_accel = []
kurt_x_accel = []
range_x_accel = []
variation_x_accel = []

# function for computing range
def get_range(accel_list):
    min_val = min(accel_list)
    max_val = max(accel_list)
    return max_val - min_val

In [11]:
# loop through each row in dataframe to read in json file
# grab the userAcceleration x-values and calculate the means

def get_features(axis):

    axis_accel = [] # initialize empty list for storing acceleration values of axis
    avg_accel = [] # initialize empty list for different features
    std_accel = []
    med_accel = []
    skew_accel = []
    kurt_accel = []
    range_accel = []
    variation_accel = []

    for row in actv_walk_temp["outbound_walk_json_file"]: 
        with open(row) as json_data:
            data = json.load(json_data)
            for item in data:
                acceleration = item.get("userAcceleration").get(axis)
                axis_accel.append(acceleration)

            avg = np.mean(axis_accel)
            std = np.std(axis_accel)
            med = np.median(axis_accel)
            skew = st.skew(axis_accel)
            kurt = st.kurtosis(axis_accel)
            xrange = get_range(axis_accel)
            variation = st.variation(axis_accel)
            
            avg_accel.append(avg)
            std_accel.append(std)
            med_accel.append(med)
            skew_accel.append(skew)
            kurt_accel.append(kurt)
            range_accel.append(xrange)
            variation_accel.append(variation)
            
    return avg_accel, std_accel, med_accel, skew_accel, kurt_accel, range_accel, variation_accel

In [12]:
avg_x_accel, std_x_accel, med_x_accel, skew_x_accel, kurt_x_accel, range_x_accel, variation_x_accel = get_features('x')

In [13]:
# create new column in dataframe for each feature
actv_walk_temp["meanXaccel"] = avg_x_accel
actv_walk_temp["medianXaccel"] = med_x_accel
actv_walk_temp["stdXaccel"] = std_x_accel
actv_walk_temp["skewXaccel"] = skew_x_accel
actv_walk_temp["kurtXaccel"] = kurt_x_accel
actv_walk_temp["rangeXaccel"] = range_x_accel
actv_walk_temp["variationXaccel"] = variation_x_accel


# Remove unnecessary columns
actv_walk = actv_walk_temp.drop(["deviceMotion_walking_outbound.json.items", "idx", "outbound_walk_json_file"], axis=1)

In [14]:
actv_walk.head()

Unnamed: 0,recordId,healthCode,meanXaccel,medianXaccel,stdXaccel,skewXaccel,kurtXaccel,rangeXaccel,variationXaccel
0,a3e54d84-360e-4e8a-9534-de188d5fa9e1,000240d1-1110-4dd2-a2d0-e344c37efd68,-0.060817,-0.039582,0.251105,-0.43847,0.141505,1.50823,-4.128884
1,27d5ffdd-536e-4f2d-b478-74bbd520e9f7,00081bd9-9abd-4003-b035-de6cc3e8c922,-0.015293,0.01929,0.205124,-0.854834,1.367009,1.521156,-13.413365
2,cdd76ffc-9607-4106-b806-2fa29ad282f4,00372eda-3796-481b-96f7-f37e8e600904,0.013353,0.023553,0.206104,-0.969447,4.550264,2.571121,15.434469
3,ba735eb0-0639-4392-8d4d-f87b06c7b7e6,00372eda-3796-481b-96f7-f37e8e600904,0.016124,0.023198,0.192878,-0.936322,4.949377,2.571121,11.962509
4,2344a520-81df-4166-9f63-09343c502a21,00372eda-3796-481b-96f7-f37e8e600904,0.013815,0.019293,0.19809,-0.664648,4.381479,2.674647,14.338761


In [17]:
# extract y access features

avg_y_accel, std_y_accel, med_y_accel, skew_y_accel, kurt_y_accel, range_y_accel, variation_y_accel = get_features('y')

In [18]:
# add y features as columns

actv_walk["meanYaccel"] = avg_y_accel
actv_walk["medianYaccel"] = med_y_accel
actv_walk["stdYaccel"] = std_y_accel
actv_walk["skewYaccel"] = skew_y_accel
actv_walk["kurtYaccel"] = kurt_y_accel
actv_walk["rangeYaccel"] = range_y_accel
actv_walk["variationYaccel"] = variation_y_accel

In [19]:
actv_walk.head()

Unnamed: 0,recordId,healthCode,meanXaccel,medianXaccel,stdXaccel,skewXaccel,kurtXaccel,rangeXaccel,variationXaccel,meanYaccel,medianYaccel,stdYaccel,skewYaccel,kurtYaccel,rangeYaccel,variationYaccel
0,a3e54d84-360e-4e8a-9534-de188d5fa9e1,000240d1-1110-4dd2-a2d0-e344c37efd68,-0.060817,-0.039582,0.251105,-0.43847,0.141505,1.50823,-4.128884,0.048314,0.021201,0.315285,0.319682,-0.210016,1.895134,6.52571
1,27d5ffdd-536e-4f2d-b478-74bbd520e9f7,00081bd9-9abd-4003-b035-de6cc3e8c922,-0.015293,0.01929,0.205124,-0.854834,1.367009,1.521156,-13.413365,0.0507,0.029125,0.267536,0.547424,1.101485,1.91076,5.276845
2,cdd76ffc-9607-4106-b806-2fa29ad282f4,00372eda-3796-481b-96f7-f37e8e600904,0.013353,0.023553,0.206104,-0.969447,4.550264,2.571121,15.434469,0.047635,0.020974,0.230869,0.571332,2.337877,2.653296,4.846593
3,ba735eb0-0639-4392-8d4d-f87b06c7b7e6,00372eda-3796-481b-96f7-f37e8e600904,0.016124,0.023198,0.192878,-0.936322,4.949377,2.571121,11.962509,0.056733,0.024706,0.226478,0.709988,2.618261,2.653296,3.992016
4,2344a520-81df-4166-9f63-09343c502a21,00372eda-3796-481b-96f7-f37e8e600904,0.013815,0.019293,0.19809,-0.664648,4.381479,2.674647,14.338761,0.048095,0.019794,0.246758,0.681993,3.859799,3.030658,5.13065


In [20]:
# extract z features

avg_z_accel, std_z_accel, med_z_accel, skew_z_accel, kurt_z_accel, range_z_accel, variation_z_accel = get_features('z')

In [21]:
# add z features as columns

actv_walk["meanZaccel"] = avg_z_accel
actv_walk["medianZaccel"] = med_z_accel
actv_walk["stdZaccel"] = std_z_accel
actv_walk["skewZaccel"] = skew_z_accel
actv_walk["kurtZaccel"] = kurt_z_accel
actv_walk["rangeZaccel"] = range_z_accel
actv_walk["variationZaccel"] = variation_z_accel

In [22]:
actv_walk.head()

Unnamed: 0,recordId,healthCode,meanXaccel,medianXaccel,stdXaccel,skewXaccel,kurtXaccel,rangeXaccel,variationXaccel,meanYaccel,...,kurtYaccel,rangeYaccel,variationYaccel,meanZaccel,medianZaccel,stdZaccel,skewZaccel,kurtZaccel,rangeZaccel,variationZaccel
0,a3e54d84-360e-4e8a-9534-de188d5fa9e1,000240d1-1110-4dd2-a2d0-e344c37efd68,-0.060817,-0.039582,0.251105,-0.43847,0.141505,1.50823,-4.128884,0.048314,...,-0.210016,1.895134,6.52571,-0.10439,-0.15215,0.475797,0.726143,0.544279,3.060726,-4.557892
1,27d5ffdd-536e-4f2d-b478-74bbd520e9f7,00081bd9-9abd-4003-b035-de6cc3e8c922,-0.015293,0.01929,0.205124,-0.854834,1.367009,1.521156,-13.413365,0.0507,...,1.101485,1.91076,5.276845,-0.00999,0.011845,0.388474,0.143198,0.936114,3.061306,-38.884676
2,cdd76ffc-9607-4106-b806-2fa29ad282f4,00372eda-3796-481b-96f7-f37e8e600904,0.013353,0.023553,0.206104,-0.969447,4.550264,2.571121,15.434469,0.047635,...,2.337877,2.653296,4.846593,-0.098005,-0.0919,0.420515,-1.091197,7.715927,5.277242,-4.290775
3,ba735eb0-0639-4392-8d4d-f87b06c7b7e6,00372eda-3796-481b-96f7-f37e8e600904,0.016124,0.023198,0.192878,-0.936322,4.949377,2.571121,11.962509,0.056733,...,2.618261,2.653296,3.992016,-0.068101,-0.036225,0.425813,-1.372953,7.632989,5.277242,-6.252671
4,2344a520-81df-4166-9f63-09343c502a21,00372eda-3796-481b-96f7-f37e8e600904,0.013815,0.019293,0.19809,-0.664648,4.381479,2.674647,14.338761,0.048095,...,3.859799,3.030658,5.13065,-0.057569,-0.030008,0.396042,-1.491347,9.065509,5.277242,-6.879401


In [24]:
# merge demographics df with features df

cols = ['healthCode','age', 'gender', 'professional-diagnosis']

In [25]:
actv_walk_final = actv_walk.merge(demographics_df[cols], on='healthCode', how='left')

In [26]:
actv_walk_final.head()

Unnamed: 0,recordId,healthCode,meanXaccel,medianXaccel,stdXaccel,skewXaccel,kurtXaccel,rangeXaccel,variationXaccel,meanYaccel,...,meanZaccel,medianZaccel,stdZaccel,skewZaccel,kurtZaccel,rangeZaccel,variationZaccel,age,gender,professional-diagnosis
0,a3e54d84-360e-4e8a-9534-de188d5fa9e1,000240d1-1110-4dd2-a2d0-e344c37efd68,-0.060817,-0.039582,0.251105,-0.43847,0.141505,1.50823,-4.128884,0.048314,...,-0.10439,-0.15215,0.475797,0.726143,0.544279,3.060726,-4.557892,33.0,Male,False
1,27d5ffdd-536e-4f2d-b478-74bbd520e9f7,00081bd9-9abd-4003-b035-de6cc3e8c922,-0.015293,0.01929,0.205124,-0.854834,1.367009,1.521156,-13.413365,0.0507,...,-0.00999,0.011845,0.388474,0.143198,0.936114,3.061306,-38.884676,70.0,Male,False
2,cdd76ffc-9607-4106-b806-2fa29ad282f4,00372eda-3796-481b-96f7-f37e8e600904,0.013353,0.023553,0.206104,-0.969447,4.550264,2.571121,15.434469,0.047635,...,-0.098005,-0.0919,0.420515,-1.091197,7.715927,5.277242,-4.290775,,Female,False
3,ba735eb0-0639-4392-8d4d-f87b06c7b7e6,00372eda-3796-481b-96f7-f37e8e600904,0.016124,0.023198,0.192878,-0.936322,4.949377,2.571121,11.962509,0.056733,...,-0.068101,-0.036225,0.425813,-1.372953,7.632989,5.277242,-6.252671,,Female,False
4,2344a520-81df-4166-9f63-09343c502a21,00372eda-3796-481b-96f7-f37e8e600904,0.013815,0.019293,0.19809,-0.664648,4.381479,2.674647,14.338761,0.048095,...,-0.057569,-0.030008,0.396042,-1.491347,9.065509,5.277242,-6.879401,,Female,False


In [30]:
# rearrange columns

first_four_cols = ['recordId', 'healthCode', 'age', 'gender']

final_cols = first_four_cols  + [col for col in actv_walk_final if col not in first_four_cols]
final_df = actv_walk_final[final_cols]

In [31]:
final_df.head()

Unnamed: 0,recordId,healthCode,age,gender,meanXaccel,medianXaccel,stdXaccel,skewXaccel,kurtXaccel,rangeXaccel,...,rangeYaccel,variationYaccel,meanZaccel,medianZaccel,stdZaccel,skewZaccel,kurtZaccel,rangeZaccel,variationZaccel,professional-diagnosis
0,a3e54d84-360e-4e8a-9534-de188d5fa9e1,000240d1-1110-4dd2-a2d0-e344c37efd68,33.0,Male,-0.060817,-0.039582,0.251105,-0.43847,0.141505,1.50823,...,1.895134,6.52571,-0.10439,-0.15215,0.475797,0.726143,0.544279,3.060726,-4.557892,False
1,27d5ffdd-536e-4f2d-b478-74bbd520e9f7,00081bd9-9abd-4003-b035-de6cc3e8c922,70.0,Male,-0.015293,0.01929,0.205124,-0.854834,1.367009,1.521156,...,1.91076,5.276845,-0.00999,0.011845,0.388474,0.143198,0.936114,3.061306,-38.884676,False
2,cdd76ffc-9607-4106-b806-2fa29ad282f4,00372eda-3796-481b-96f7-f37e8e600904,,Female,0.013353,0.023553,0.206104,-0.969447,4.550264,2.571121,...,2.653296,4.846593,-0.098005,-0.0919,0.420515,-1.091197,7.715927,5.277242,-4.290775,False
3,ba735eb0-0639-4392-8d4d-f87b06c7b7e6,00372eda-3796-481b-96f7-f37e8e600904,,Female,0.016124,0.023198,0.192878,-0.936322,4.949377,2.571121,...,2.653296,3.992016,-0.068101,-0.036225,0.425813,-1.372953,7.632989,5.277242,-6.252671,False
4,2344a520-81df-4166-9f63-09343c502a21,00372eda-3796-481b-96f7-f37e8e600904,,Female,0.013815,0.019293,0.19809,-0.664648,4.381479,2.674647,...,3.030658,5.13065,-0.057569,-0.030008,0.396042,-1.491347,9.065509,5.277242,-6.879401,False


In [32]:
# save as csv

#final_df.to_csv('final_df.csv')