In [1]:
# Work with data exported from Apple Health in its raw XML format
# Prerequisite (run once before starting):
# pip install xmltodict

In [1]:
# List of dependencies
import pandas as pd
import xmltodict

In [3]:
# Read the Apple Health export and parse it into nested dicts.
# The file is opened in binary mode and handed to the parser as a file object:
# the bytes reach the XML parser untouched instead of being decoded with the
# platform's default text encoding, and the whole export is not additionally
# held in memory as one big string.
input_path = './Export.xml'
with open(input_path, 'rb') as xml_file:
    input_data = xmltodict.parse(xml_file)

In [4]:
# Extract the actual health data records from the parsed export and load
# them into a DataFrame (the record attributes show up as "@..." columns).
records_list = input_data['HealthData']['Record']
df = pd.DataFrame(data=records_list)

In [6]:
# Distinct record types present in the export. The bare name on the last
# line makes the cell display the array; an assignment alone shows nothing.
typeOfDataPoints = df['@type'].unique()
typeOfDataPoints

array(['HKQuantityTypeIdentifierDietaryWater',
       'HKQuantityTypeIdentifierHeight',
       'HKQuantityTypeIdentifierBodyMass',
       'HKQuantityTypeIdentifierHeartRate',
       'HKQuantityTypeIdentifierStepCount',
       'HKQuantityTypeIdentifierDistanceWalkingRunning',
       'HKQuantityTypeIdentifierBasalEnergyBurned',
       'HKQuantityTypeIdentifierActiveEnergyBurned',
       'HKQuantityTypeIdentifierFlightsClimbed',
       'HKQuantityTypeIdentifierAppleExerciseTime',
       'HKQuantityTypeIdentifierDistanceCycling',
       'HKQuantityTypeIdentifierRestingHeartRate',
       'HKQuantityTypeIdentifierVO2Max',
       'HKQuantityTypeIdentifierWalkingHeartRateAverage',
       'HKQuantityTypeIdentifierHeadphoneAudioExposure',
       'HKCategoryTypeIdentifierSleepAnalysis',
       'HKCategoryTypeIdentifierAppleStandHour',
       'HKCategoryTypeIdentifierMindfulSession',
       'HKCategoryTypeIdentifierHighHeartRateEvent',
       'HKQuantityTypeIdentifierHeartRateVariabilitySDNN'], dtype=object)

In [31]:
# Distinct apps/devices that contributed records to the export
pd.unique(df["@sourceName"])

array(['Mein Wasser', 'Health', 'Kurzbefehle', "Marvin's iPhoneXs",
       'Connect', 'Apple\xa0Watch von Marvin', 'Wahoo', 'ELEMNT',
       "Marvin's iPhone7", 'Strava', 'Nike Training', 'Uhr', 'Calm',
       'Headspace', 'Tide'], dtype=object)

In [50]:
# Parse the timestamp columns into datetimes.
# The format string gets its own constant name so the built-in `format()` is
# not shadowed for the rest of the kernel session.
# NOTE(review): the format has no UTC-offset directive - confirm it matches
# the timestamp strings in the export on the pandas version in use.
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
for date_column in ('@creationDate', '@startDate', '@endDate'):
    df[date_column] = pd.to_datetime(df[date_column], format=DATE_FORMAT)

# Keep only step-count records that come from the selected data sources.
# The watch's source name contains a non-breaking space (\xa0), exactly as it
# appears in the export, so it must not be replaced by a regular space.
STEP_SOURCES = ["Apple\xa0Watch von Marvin", "Connect"]
is_step_count = df['@type'] == 'HKQuantityTypeIdentifierStepCount'
is_selected_source = df['@sourceName'].isin(STEP_SOURCES)
step_counts = df[is_step_count & is_selected_source]

In [51]:
# Preview the first five filtered step-count records
step_counts.head(n=5)

Unnamed: 0,@type,@sourceName,@sourceVersion,@unit,@creationDate,@startDate,@endDate,@value,MetadataEntry,@device,HeartRateVariabilityMetadataList
213165,HKQuantityTypeIdentifierStepCount,Apple Watch von Marvin,3.2.2,count,2017-05-25 07:14:50,2017-05-25 07:12:34,2017-05-25 07:13:35,22,,"<<HKDevice: 0x281f54820>, name:Apple Watch, ma...",
213166,HKQuantityTypeIdentifierStepCount,Apple Watch von Marvin,3.2.2,count,2017-05-25 07:22:58,2017-05-25 07:15:26,2017-05-25 07:16:28,20,,"<<HKDevice: 0x281f56df0>, name:Apple Watch, ma...",
213167,HKQuantityTypeIdentifierStepCount,Apple Watch von Marvin,3.2.2,count,2017-05-25 08:34:58,2017-05-25 08:23:16,2017-05-25 08:32:48,17,,"<<HKDevice: 0x281f55bd0>, name:Apple Watch, ma...",
213168,HKQuantityTypeIdentifierStepCount,Apple Watch von Marvin,3.2.2,count,2017-05-25 08:50:13,2017-05-25 08:39:31,2017-05-25 08:45:36,9,,"<<HKDevice: 0x281f55d60>, name:Apple Watch, ma...",
213169,HKQuantityTypeIdentifierStepCount,Apple Watch von Marvin,3.2.2,count,2017-05-25 08:58:00,2017-05-25 08:51:58,2017-05-25 08:53:00,36,,"<<HKDevice: 0x281f56ad0>, name:Apple Watch, ma...",


In [52]:
# Drop the columns that are not needed for the step analysis and convert the
# step values from strings to numbers.
columns_to_drop = ["@sourceName", "@sourceVersion", "@startDate", "@endDate", "MetadataEntry", "@device", "HeartRateVariabilityMetadataList"]
step_counts = (
    step_counts
    # errors="ignore" keeps the cell re-runnable: on a second run the columns
    # are already gone, which would otherwise raise a KeyError.
    .drop(columns=columns_to_drop, errors="ignore")
    # assign() replaces the whole column, so the numeric dtype is kept.
    # Writing through .loc[:, '@value'] sets values in place on recent pandas
    # versions and leaves the column with its original object dtype.
    .assign(**{'@value': lambda frame: pd.to_numeric(frame['@value'])})
)

In [53]:
# Preview the first five records after dropping columns and converting values
step_counts.head(n=5)

Unnamed: 0,@type,@unit,@creationDate,@value
213165,HKQuantityTypeIdentifierStepCount,count,2017-05-25 07:14:50,22
213166,HKQuantityTypeIdentifierStepCount,count,2017-05-25 07:22:58,20
213167,HKQuantityTypeIdentifierStepCount,count,2017-05-25 08:34:58,17
213168,HKQuantityTypeIdentifierStepCount,count,2017-05-25 08:50:13,9
213169,HKQuantityTypeIdentifierStepCount,count,2017-05-25 08:58:00,36


In [54]:
# Step count per day
# Only '@value' is aggregated: summing the whole frame would also try to
# "sum" the string columns ('@type', '@unit'), which is slow and meaningless.
# NOTE(review): days are bucketed by '@creationDate' (when the record was
# written), not by when the steps were taken, and records from several
# sources are added together - confirm this does not double count steps.
step_counts_by_creation = step_counts.groupby('@creationDate')[['@value']].sum()
by_day = step_counts_by_creation['@value'].resample('D').sum()

# Ten days with the highest step totals
by_day.sort_values(ascending=False).head(10)

@creationDate
2019-01-27    100948
2018-08-08     62947
2019-03-16     55034
2019-07-10     49111
2019-09-01     48744
2018-06-26     47397
2018-04-29     46286
2019-07-16     46165
2018-09-30     42728
2019-04-14     41899
Name: @value, dtype: int64

In [58]:
# Mean of the daily step totals
by_day.mean()

8487.22346368715