In [1]:
# Work with exported data from Apple Health in raw format
# Prerequisite before running this notebook: install the XML parser
# pip install xmltodict

In [59]:
# List of dependencies
import pandas as pd
import xmltodict
import json

In [60]:
# Reading in Data
input_path = './Export.xml'
# Open in binary mode and hand the file object straight to the parser:
# the XML parser then decodes the bytes itself instead of depending on the
# platform's default text encoding, and the whole file is not first copied
# into one large Python string.
with open(input_path, 'rb') as xml_file:
    input_data = xmltodict.parse(xml_file)

In [61]:
# Extract the actual health data records from the parsed export
health_export = input_data['HealthData']
records_list = health_export['Record']
df = pd.DataFrame(data=records_list)

In [62]:
# Data points collected: the distinct values of the '@type' column
df['@type'].unique()

array(['HKQuantityTypeIdentifierDietaryWater',
       'HKQuantityTypeIdentifierHeight',
       'HKQuantityTypeIdentifierBodyMass',
       'HKQuantityTypeIdentifierHeartRate',
       'HKQuantityTypeIdentifierStepCount',
       'HKQuantityTypeIdentifierDistanceWalkingRunning',
       'HKQuantityTypeIdentifierBasalEnergyBurned',
       'HKQuantityTypeIdentifierActiveEnergyBurned',
       'HKQuantityTypeIdentifierFlightsClimbed',
       'HKQuantityTypeIdentifierAppleExerciseTime',
       'HKQuantityTypeIdentifierDistanceCycling',
       'HKQuantityTypeIdentifierRestingHeartRate',
       'HKQuantityTypeIdentifierVO2Max',
       'HKQuantityTypeIdentifierWalkingHeartRateAverage',
       'HKQuantityTypeIdentifierHeadphoneAudioExposure',
       'HKCategoryTypeIdentifierSleepAnalysis',
       'HKCategoryTypeIdentifierAppleStandHour',
       'HKCategoryTypeIdentifierMindfulSession',
       'HKCategoryTypeIdentifierHighHeartRateEvent',
       'HKQuantityTypeIdentifierHeartRateVariabilitySDNN'], dtype=object)

In [63]:
# Data sources that were connected to Apple Health
df["@sourceName"].unique()

array(['Mein Wasser', 'Health', 'Kurzbefehle', "Marvin's iPhoneXs",
       'Connect', 'Apple\xa0Watch von Marvin', 'Wahoo', 'ELEMNT',
       "Marvin's iPhone7", 'Strava', 'Nike Training', 'Uhr', 'Calm',
       'Headspace', 'Tide'], dtype=object)

In [137]:
# Format time
# Named DATETIME_FORMAT rather than `format` so the built-in format()
# is not shadowed for every cell that runs after this one.
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
for date_column in ['@creationDate', '@startDate', '@endDate']:
    df[date_column] = pd.to_datetime(df[date_column], format=DATETIME_FORMAT)

# Filter for StepCount and Data Source
is_step_count = df['@type'] == 'HKQuantityTypeIdentifierStepCount'
is_wanted_source = df['@sourceName'].isin(["Apple\xa0Watch von Marvin", "Connect"])
step_counts = df[is_step_count & is_wanted_source]

In [138]:
# Preview the first rows of the filtered step-count records
step_counts.head()

Unnamed: 0,@type,@sourceName,@sourceVersion,@unit,@creationDate,@startDate,@endDate,@value,MetadataEntry,@device,HeartRateVariabilityMetadataList
213165,HKQuantityTypeIdentifierStepCount,Apple Watch von Marvin,3.2.2,count,2017-05-25 07:14:50,2017-05-25 07:12:34,2017-05-25 07:13:35,22,,"<<HKDevice: 0x281f54820>, name:Apple Watch, ma...",
213166,HKQuantityTypeIdentifierStepCount,Apple Watch von Marvin,3.2.2,count,2017-05-25 07:22:58,2017-05-25 07:15:26,2017-05-25 07:16:28,20,,"<<HKDevice: 0x281f56df0>, name:Apple Watch, ma...",
213167,HKQuantityTypeIdentifierStepCount,Apple Watch von Marvin,3.2.2,count,2017-05-25 08:34:58,2017-05-25 08:23:16,2017-05-25 08:32:48,17,,"<<HKDevice: 0x281f55bd0>, name:Apple Watch, ma...",
213168,HKQuantityTypeIdentifierStepCount,Apple Watch von Marvin,3.2.2,count,2017-05-25 08:50:13,2017-05-25 08:39:31,2017-05-25 08:45:36,9,,"<<HKDevice: 0x281f55d60>, name:Apple Watch, ma...",
213169,HKQuantityTypeIdentifierStepCount,Apple Watch von Marvin,3.2.2,count,2017-05-25 08:58:00,2017-05-25 08:51:58,2017-05-25 08:53:00,36,,"<<HKDevice: 0x281f56ad0>, name:Apple Watch, ma...",


In [139]:
# Drop unimportant columns and convert to numeric
columns_to_drop = ["@sourceName", "@sourceVersion", "@startDate", "@endDate", "MetadataEntry", "@device", "HeartRateVariabilityMetadataList"]
# errors='ignore': the frame's columns come from whatever keys the exported
# records carry, so optional ones may be missing; it also makes re-running
# this cell safe once the columns are already gone.
step_counts = step_counts.drop(columns=columns_to_drop, errors='ignore')
# Assign the whole column instead of `.loc[:, '@value'] = ...`: the `.loc`
# form writes into the existing column in place and, on pandas >= 2.0,
# leaves it with object dtype instead of a numeric one.
step_counts['@value'] = pd.to_numeric(step_counts['@value'])

In [140]:
# Preview the first rows of the reduced step-count frame
step_counts.head()

Unnamed: 0,@type,@unit,@creationDate,@value
213165,HKQuantityTypeIdentifierStepCount,count,2017-05-25 07:14:50,22
213166,HKQuantityTypeIdentifierStepCount,count,2017-05-25 07:22:58,20
213167,HKQuantityTypeIdentifierStepCount,count,2017-05-25 08:34:58,17
213168,HKQuantityTypeIdentifierStepCount,count,2017-05-25 08:50:13,9
213169,HKQuantityTypeIdentifierStepCount,count,2017-05-25 08:58:00,36


In [141]:
# Step count per day
# Sum only '@value': the other remaining columns ('@type', '@unit') hold text
# and must not be fed into the aggregation. The double brackets keep
# step_counts_by_creation a DataFrame, as before.
step_counts_by_creation = step_counts.groupby('@creationDate')[['@value']].sum()
# Re-bin the per-timestamp sums into calendar days
by_day = step_counts_by_creation['@value'].resample('D').sum()

In [142]:
# Average of the daily step totals
by_day.mean()

8487.22346368715

In [144]:
def prepare_data_typ(data_type, records=None, source_names=("Apple\xa0Watch von Marvin", "Connect")):
    """Return the daily sum of '@value' for one Apple Health record type.

    Parameters
    ----------
    data_type : str
        Value of the '@type' column to select,
        e.g. 'HKQuantityTypeIdentifierActiveEnergyBurned'.
    records : pandas.DataFrame, optional
        Frame of health records providing the columns '@type', '@sourceName',
        '@creationDate' (already converted to datetimes) and '@value'.
        Defaults to the notebook-level ``df``.
    source_names : iterable of str, optional
        Values of '@sourceName' whose records are kept. Defaults to the two
        sources used throughout this notebook.

    Returns
    -------
    pandas.Series
        Sum of '@value' per calendar day, indexed by '@creationDate'.
    """
    if records is None:
        # Fall back to the notebook-level frame holding all exported records
        records = df
    # Select data
    is_selected = (records['@type'] == data_type) & records['@sourceName'].isin(list(source_names))
    # Clean data: keep only the two columns the aggregation needs, so the
    # function does not depend on which optional columns the export contains.
    selected = records.loc[is_selected, ['@creationDate', '@value']]
    # assign() builds a new numeric column; `.loc[:, '@value'] = ...` would write
    # in place and can leave the column with object dtype on pandas >= 2.0.
    selected = selected.assign(**{'@value': pd.to_numeric(selected['@value'])})
    # Aggregate per day: first per creation timestamp, then per calendar day
    per_timestamp = selected.groupby('@creationDate')['@value'].sum()
    return per_timestamp.resample('D').sum()
    
# Put the three daily series side by side, aligned on the day index
daily_series = [
    by_day,
    prepare_data_typ('HKQuantityTypeIdentifierBasalEnergyBurned'),
    prepare_data_typ('HKQuantityTypeIdentifierActiveEnergyBurned'),
]
combined_data_set = pd.concat(daily_series, axis=1)
combined_data_set.columns = ['StepCount', 'BasalEnergyBurned', 'ActiveEnergyBurned']
combined_data_set.head(5)

Unnamed: 0_level_0,StepCount,BasalEnergyBurned,ActiveEnergyBurned
@creationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-05-24,2643,568.724,170.044
2017-05-25,5102,2101.189,338.028
2017-05-26,3861,1875.262,333.502
2017-05-27,5770,2009.497,344.732
2017-05-28,3538,1819.71,257.26


In [175]:
# Export Data
# Turn every row into a plain dict keyed by its index value (the day)
combined_data_set_indexed = combined_data_set.to_dict('index')
# Convert the datetime keys to strings, since JSON object keys must be strings.
# Missing values (NaN) are replaced by None so the file contains `null`
# instead of the non-standard `NaN` token that strict JSON parsers reject.
result = {
    day.strftime("%m/%d/%Y"): {
        column: (None if pd.isna(value) else value)
        for column, value in daily_values.items()
    }
    for day, daily_values in combined_data_set_indexed.items()
}
# Write JSON-file
with open('result.json', 'w') as fp:
    json.dump(result, fp)