In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import datetime as dt
import numpy as np

import matplotlib as plt

In [2]:
# Path to the XML file exported form Apple Health app
XML_DATA = "C:/Users/LHAGELS/Downloads/Export/apple_health_export/Export.xml"

In [3]:
# Parse XML file exported from Apple Health app
tree = ET.parse(XML_DATA)
root = tree.getroot()

In [4]:
attributes = []
for child in root:
    attributes.append(child.tag)

attributes = list(set(attributes))
print(attributes)

['Record', 'ExportDate', 'Me', 'ActivitySummary', 'Workout']


In [5]:
attributes.remove('ExportDate')
attributes.remove('Me')

print(attributes)

['Record', 'ActivitySummary', 'Workout']


In [6]:
records_list = [x.attrib for x in root.iter('Record')]

### Records

In [7]:
df_records = pd.DataFrame(records_list)

In [8]:
start_date = '2023-01-01'

df_records_2023 = df_records[pd.to_datetime(df_records['startDate']).dt.strftime('%Y-%m-%d') >= start_date]
df_records = df_records_2023

# remove 'sourceName', 'sourceVersion', 'device', 'creationDate', 'endDate' columns
df_records = df_records.drop(['sourceName','sourceVersion', 'device'], axis=1)

In [9]:
df_records['Date'] = pd.to_datetime(df_records['startDate']).dt.strftime('%Y-%m-%d')
df_records['Time'] = pd.to_datetime(df_records['startDate']).dt.strftime('%H:%M:%S')
df_records['Day'] = pd.to_datetime(df_records['startDate']).dt.strftime('%A')
df_records['Month'] = pd.to_datetime(df_records['startDate']).dt.strftime('%B')

In [10]:
df_records[['type', 'startDate', 'Date', 'Time', 'Month', 'Day', 'value', 'unit']].head(5)

Unnamed: 0,type,startDate,Date,Time,Month,Day,value,unit
22,HKQuantityTypeIdentifierBodyMass,2023-07-12 22:07:00 +0200,2023-07-12,22:07:00,July,Wednesday,85,kg
135085,HKQuantityTypeIdentifierHeartRate,2023-01-01 00:00:30 +0200,2023-01-01,00:00:30,January,Sunday,68,count/min
135086,HKQuantityTypeIdentifierHeartRate,2023-01-01 00:09:37 +0200,2023-01-01,00:09:37,January,Sunday,75,count/min
135087,HKQuantityTypeIdentifierHeartRate,2023-01-01 00:10:47 +0200,2023-01-01,00:10:47,January,Sunday,74,count/min
135088,HKQuantityTypeIdentifierHeartRate,2023-01-01 00:17:45 +0200,2023-01-01,00:17:45,January,Sunday,64,count/min


In [11]:
# value is numeric, NaN if fails
df_records['value'] = pd.to_numeric(df_records['value'], errors='coerce')

In [12]:
# shorter observation names
df_records['type'] = df_records['type'].str.replace('HKQuantityTypeIdentifier', '')
df_records['type'] = df_records['type'].str.replace('HKCategoryTypeIdentifier', '')

In [13]:
df_records.type.unique()

array(['BodyMass', 'HeartRate', 'OxygenSaturation', 'RespiratoryRate',
       'StepCount', 'DistanceWalkingRunning', 'BasalEnergyBurned',
       'ActiveEnergyBurned', 'FlightsClimbed', 'AppleExerciseTime',
       'DistanceCycling', 'DistanceSwimming', 'SwimmingStrokeCount',
       'RestingHeartRate', 'VO2Max', 'WalkingHeartRateAverage',
       'DistanceDownhillSnowSports', 'EnvironmentalAudioExposure',
       'HeadphoneAudioExposure', 'WalkingDoubleSupportPercentage',
       'SixMinuteWalkTestDistance', 'AppleStandTime', 'WalkingSpeed',
       'WalkingStepLength', 'WalkingAsymmetryPercentage',
       'StairAscentSpeed', 'StairDescentSpeed', 'AppleWalkingSteadiness',
       'RunningStrideLength', 'RunningVerticalOscillation',
       'RunningGroundContactTime', 'HeartRateRecoveryOneMinute',
       'RunningPower', 'RunningSpeed', 'SleepAnalysis', 'AppleStandHour',
       'LowHeartRateEvent', 'HeartRateVariabilitySDNN'], dtype=object)

In [14]:
record_types = list(df_records.type.unique())

In [15]:
record_types

['BodyMass',
 'HeartRate',
 'OxygenSaturation',
 'RespiratoryRate',
 'StepCount',
 'DistanceWalkingRunning',
 'BasalEnergyBurned',
 'ActiveEnergyBurned',
 'FlightsClimbed',
 'AppleExerciseTime',
 'DistanceCycling',
 'DistanceSwimming',
 'SwimmingStrokeCount',
 'RestingHeartRate',
 'VO2Max',
 'WalkingHeartRateAverage',
 'DistanceDownhillSnowSports',
 'EnvironmentalAudioExposure',
 'HeadphoneAudioExposure',
 'WalkingDoubleSupportPercentage',
 'SixMinuteWalkTestDistance',
 'AppleStandTime',
 'WalkingSpeed',
 'WalkingStepLength',
 'WalkingAsymmetryPercentage',
 'StairAscentSpeed',
 'StairDescentSpeed',
 'AppleWalkingSteadiness',
 'RunningStrideLength',
 'RunningVerticalOscillation',
 'RunningGroundContactTime',
 'HeartRateRecoveryOneMinute',
 'RunningPower',
 'RunningSpeed',
 'SleepAnalysis',
 'AppleStandHour',
 'LowHeartRateEvent',
 'HeartRateVariabilitySDNN']

In [16]:
# dictionary of DataFrames for filtered 'record_data'
record_dict = {}

# create new DataFrame for every interested data
for type in record_types:
   record_dict[type] = df_records.loc[(df_records['type'].str.contains(type))].rename(columns={"value": type}).sort_values(by='Date')

### Workouts

In [17]:
level_1_list = []
level_2_list = []
level_3_list = []

a_list = list(root.iter('Workout'))
for a in range(len(a_list)):
    b_list = list(a_list[a])
    
    for b in range(len(b_list)):
        level_1_list.append(b_list[b].tag)
        c_list = list(b_list[b])

        for c in range(len(c_list)):
            level_2_list.append(c_list[c].tag)
            d_list = list(c_list[c])

            for d in range(len(d_list)):
                level_3_list.append(d_list[d].tag)

level_1_list = list(set(level_1_list))
level_2_list = list(set(level_2_list))
level_3_list = list(set(level_3_list))

In [18]:
### EMPTY LIST: no further levels to investigate
print(level_1_list)
print(level_2_list)
print(level_3_list)

concat_list = list(set(level_1_list + level_2_list + level_3_list))
print(concat_list)

['MetadataEntry', 'WorkoutEvent', 'WorkoutStatistics', 'WorkoutRoute']
['MetadataEntry', 'FileReference']
[]
['WorkoutEvent', 'MetadataEntry', 'FileReference', 'WorkoutStatistics', 'WorkoutRoute']


In [19]:
index = 0
workouts = root.findall('.//Workout')
total_workouts = len(workouts)

for wo in workouts:
    index = index + 1
    wo.attrib['workout_pk'] = str(index)

    for wo_2 in level_1_list:
        workouts_2 = wo.findall(f'.//{wo_2}')
        for x in workouts_2:
            x.attrib['workout_pk'] = str(index)

            for wo_3 in level_2_list:
                workouts_3 = x.findall(f'.//{wo_3}')
                for y in workouts_3:
                    y.attrib['workout_pk'] = str(index)

tree.write('Indexed_XML.xml')

In [20]:
tree = ET.parse('Indexed_XML.xml')
root = tree.getroot()

In [21]:
df_list = []
for elem in concat_list:
    workouts = root.findall(f'.//{elem}')
    globals()[f"{elem}_list"] = []
    df_list.append(str(f"{elem}_list"))
    for wo in workouts:
        globals()[f"{elem}_list"].append(wo.attrib)

In [22]:
df_list

['WorkoutEvent_list',
 'MetadataEntry_list',
 'FileReference_list',
 'WorkoutStatistics_list',
 'WorkoutRoute_list']

In [23]:
for elem in df_list:
    object = globals()[elem]
    df_name = f"df_{elem}"
    print(f"{df_name}:")
    globals()[df_name] = pd.DataFrame(object)
    display(pd.DataFrame(object).head(2))

df_WorkoutEvent_list:


Unnamed: 0,type,date,duration,durationUnit,workout_pk
0,HKWorkoutEventTypeSegment,2022-03-29 16:56:57 +0200,13.28115280667941,min,72
1,HKWorkoutEventTypeSegment,2022-03-29 16:56:57 +0200,25.37857104341189,min,72


df_MetadataEntry_list:


Unnamed: 0,key,value,workout_pk
0,HKWasUserEntered,1,
1,HKMetadataKeyHeartRateMotionContext,0,


df_FileReference_list:


Unnamed: 0,path,workout_pk
0,/workout-routes/route_2022-03-29_5.39pm.gpx,72
1,/workout-routes/route_2022-04-03_11.15am.gpx,81


df_WorkoutStatistics_list:


Unnamed: 0,type,startDate,endDate,sum,unit,workout_pk,average,minimum,maximum
0,HKQuantityTypeIdentifierActiveEnergyBurned,2021-12-02 19:38:33 +0200,2021-12-02 19:58:33 +0200,254.0,kcal,1,,,
1,HKQuantityTypeIdentifierDistanceCycling,2021-12-02 19:38:33 +0200,2021-12-02 19:58:33 +0200,8.09,km,1,,,


df_WorkoutRoute_list:


Unnamed: 0,sourceName,sourceVersion,creationDate,startDate,endDate,workout_pk
0,Apple Watch von Lucas,8.1,2022-03-29 17:39:48 +0200,2022-03-29 16:57:11 +0200,2022-03-29 17:39:47 +0200,72
1,Apple Watch von Lucas,8.1,2022-04-03 11:15:12 +0200,2022-04-03 10:48:45 +0200,2022-04-03 11:15:10 +0200,81


#### Transform dfs

##### WorkoutStatistics

In [24]:
df_workoutstatistics_melt = pd.melt(df_WorkoutStatistics_list, 
                                    id_vars=['workout_pk', 'type','startDate', 'endDate', 'unit'], 
                                    value_vars=['sum', 'average', 'minimum', 'maximum'], 
                                    var_name='metric',
                                    value_name='value').dropna()

# shorter observation names
df_workoutstatistics_melt['type'] = df_workoutstatistics_melt['type'].str.replace('HKQuantityTypeIdentifier', '')
df_workoutstatistics_melt['type'] = df_workoutstatistics_melt['type'].str.replace('HKCategoryTypeIdentifier', '')

In [25]:
df_workoutstatistics_melt[df_workoutstatistics_melt['workout_pk'] == '450']

Unnamed: 0,workout_pk,type,startDate,endDate,unit,metric,value
1453,450,DistanceWalkingRunning,2023-06-08 18:34:09 +0200,2023-06-08 18:50:47 +0200,km,sum,1.56752
1454,450,BasalEnergyBurned,2023-06-08 18:34:09 +0200,2023-06-08 18:50:47 +0200,kcal,sum,29.2329
1456,450,ActiveEnergyBurned,2023-06-08 18:34:09 +0200,2023-06-08 18:50:47 +0200,kcal,sum,99.4092
3280,450,HeartRate,2023-06-08 18:34:09 +0200,2023-06-08 18:50:47 +0200,count/min,average,100.836
5105,450,HeartRate,2023-06-08 18:34:09 +0200,2023-06-08 18:50:47 +0200,count/min,minimum,86.0
6930,450,HeartRate,2023-06-08 18:34:09 +0200,2023-06-08 18:50:47 +0200,count/min,maximum,124.0


##### HeartRate

In [26]:
wo_start_end = df_workoutstatistics_melt[['workout_pk', 'startDate', 'endDate']].drop_duplicates()
wo_start_end['Date'] = pd.to_datetime(wo_start_end['startDate']).dt.strftime('%Y-%m-%d')
wo_start_end['startDate'] = pd.to_datetime(wo_start_end['startDate'])
wo_start_end['endDate'] = pd.to_datetime(wo_start_end['endDate'])

df_hr = record_dict['HeartRate']
df_hr['Date'] = pd.to_datetime(df_hr['startDate']).dt.strftime('%Y-%m-%d')

In [27]:
merged = pd.merge(df_hr, wo_start_end, on = 'Date', how='left', suffixes=('_hr', '_wo'))
merged['isWO'] = np.where((merged['startDate_hr'] >= merged['startDate_wo']) & (merged['startDate_hr'] <= merged['endDate_wo']), 1, 0)

In [28]:
wo_heartrates = merged[merged['isWO']==1]

##### Investigate Workout Metrics

In [29]:
record_df = pd.DataFrame()

for type in record_types:
    frames = [record_df, record_dict[type]]
    record_df = pd.concat(frames)

In [30]:
record_df['Date'] = pd.to_datetime(record_df['startDate']).dt.strftime('%Y-%m-%d')
record_df['startDate'] = pd.to_datetime(record_df['startDate'])

In [31]:
merged = pd.merge(record_df, wo_start_end, on = 'Date', how='left', suffixes=('_records', '_wo'))
merged['isWO'] = np.where((merged['startDate_records'] >= merged['startDate_wo']) & (merged['startDate_records'] <= merged['endDate_wo']), 1, 0)

In [32]:
wo_records = merged[merged['isWO']==1]

In [67]:
wo_records_types = list(wo_records['type'].unique())
wo_records_types.remove('AppleStandTime')
wo_records_types.remove('AppleExerciseTime')
wo_records_types.remove('AppleStandHour')


In [69]:
wo_records_melt = pd.melt(wo_records,
                     id_vars=[
                         'unit', 'creationDate', 
                         'startDate_records', 'endDate_records', 
                         'Date', 'Time', 'Day', 'Month', 
                         'workout_pk', 'startDate_wo', 'endDate_wo', 
                         'isWO'], 
                     value_vars=wo_records_types, 
                     var_name='type',
                     value_name='value').dropna()

In [73]:
wo_records_melt.to_parquet('./output_files/workout_records.parquet.gzip', compression='gzip') 