In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display

In [None]:
# Parse the Apple Health export file into an ElementTree.
tree = ET.parse(source="apple_health_export/export.xml")

In [None]:
# Collect the attribute dict of every <Record> element in the export.
root = tree.getroot()
record_list = [record.attrib for record in root.iter("Record")]

# Preview only the first few records: echoing the whole list would write one
# dict per health record into the cell output.
record_list[:5]

In [None]:
# create a DataFrame from record_list (one row per record, one column per attribute)
record_data = pd.DataFrame(record_list)

# summarise the columns of record_data; DataFrame.info() prints its report
# itself and returns None, so wrapping it in print() would add a stray "None"
record_data.info()

# show the record_data DataFrame
display(record_data)

In [None]:
# List the distinct values found in the 'type' column of record_data.
record_data["type"].unique()

In [None]:
# Show every record whose 'type' mentions "ActiveEnergyBurned".
record_data[record_data["type"].str.contains("ActiveEnergyBurned")]

In [None]:
# Build `record_data_cleaned` from `record_data`: calendar fields derived from
# 'startDate', numeric 'value', and shortened 'type' names.

# parse the start timestamps once and derive every calendar field from them
start_dates = pd.to_datetime(record_data["startDate"])

# transform 'startDate' into date format ('YYYY-MM-DD' string) on the raw frame
record_data["startDate"] = start_dates.dt.strftime("%Y-%m-%d")

record_data_cleaned = (
    record_data
    # remove 'sourceName', 'sourceVersion', 'device', 'creationDate', 'endDate' columns
    .drop(columns=["sourceName", "sourceVersion", "device", "creationDate", "endDate"])
    .assign(
        # weekday name of the start date
        Day=start_dates.dt.strftime("%A"),
        # 'YYYY-MM-DD'; the monthly aggregation strips the trailing '-DD', so
        # both hyphens are required here
        Date=start_dates.dt.strftime("%Y-%m-%d"),
        # month name of the start date
        Month=start_dates.dt.strftime("%B"),
        # value is numeric, NaN if conversion fails
        value=pd.to_numeric(record_data["value"], errors="coerce"),
        # shorter observation names: strip the HealthKit identifier prefixes
        type=(
            record_data["type"]
            .str.replace("HKQuantityTypeIdentifier", "", regex=False)
            .str.replace("HKCategoryTypeIdentifier", "", regex=False)
        ),
    )
)

# reorder 'record_data_cleaned' columns
record_data_cleaned = record_data_cleaned[
    ["type", "Date", "Day", "Month", "value", "unit"]
]

In [None]:
# dictionary of DataFrames, one per record type of interest, built from
# 'record_data_cleaned'
record_data_df_dict = {}

# record types to extract from the 'type' column of 'record_data_cleaned'
record_types = [
    "BodyMass",
    "ActiveEnergyBurned",
    "BasalEnergyBurned",
    "DistanceWalkingRunning",
    "StepCount",
    "AppleStandTime",
    "WalkingSpeed",
    "RunningSpeed",
    "HeartRateVariabilitySDNN",
    "RestingHeartRate",
    "WalkingHeartRate",
    "WalkingHeartRateAverage",
    "VO2Max",
    "HeartRateRecoveryOneMinute",
]

type_names = record_data_cleaned["type"]

# create a new DataFrame for every type of interest; the 'value' column is
# renamed to the type name and rows are ordered by 'Date'
for record_type in record_types:
    # Match the type name exactly: a substring test would also pull in any
    # longer name that merely contains it (e.g. "BodyMass" inside
    # "BodyMassIndex"), mixing unrelated measurements into one frame.
    is_match = type_names == record_type
    if not is_match.any():
        # No record carries exactly this name; fall back to the substring
        # match so such entries keep selecting the rows they did before.
        is_match = type_names.str.contains(record_type, regex=False, na=False)
    record_data_df_dict[record_type] = (
        record_data_cleaned.loc[is_match]
        .rename(columns={"value": record_type})
        .sort_values(by="Date")
    )

In [11]:
# record types whose values are added up per day
key_get_sum = [
    "BasalEnergyBurned",
    "ActiveEnergyBurned",
    "DistanceWalkingRunning",
    "StepCount",
    "AppleStandTime",
]

# one daily-total DataFrame per summed record type
record_data_df_dict_daily = {}
for key in key_get_sum:
    records = record_data_df_dict[key]
    # per 'Date': total of the measurement, plus the most frequent 'Day' label
    daily_totals = records.groupby("Date").agg(
        {key: "sum", "Day": lambda days: days.mode().iat[0]}
    )
    record_data_df_dict_daily[key] = daily_totals.reset_index()

In [12]:
# one monthly-total DataFrame per summed record type
record_data_df_dict_monthly = {}
for key in key_get_sum:
    records = record_data_df_dict[key]
    # group on 'Date' with its last three characters dropped
    month_key = records["Date"].str[:-3]
    # per group: total of the measurement, plus the most frequent 'Month' label
    monthly_totals = records.groupby(month_key).agg(
        {key: "sum", "Month": lambda months: months.mode().iat[0]}
    )
    record_data_df_dict_monthly[key] = monthly_totals.reset_index()

In [None]:
# Record types to total per day (this cell rebuilds the same daily totals
# that the earlier daily-aggregation cell in this notebook produces).
key_get_sum = [
    "BasalEnergyBurned",
    "ActiveEnergyBurned",
    "DistanceWalkingRunning",
    "StepCount",
    "AppleStandTime",
]

record_data_df_dict_daily = {}
for key in key_get_sum:
    per_type = record_data_df_dict[key]
    grouped_by_date = per_type.groupby(per_type["Date"])
    # sum the measurement and keep the most common weekday label of each date
    record_data_df_dict_daily[key] = grouped_by_date.agg(
        {key: "sum", "Day": lambda labels: labels.mode().iat[0]}
    ).reset_index()

In [None]:
# Monthly totals per summed record type (this cell rebuilds the same monthly
# totals that the earlier monthly-aggregation cell in this notebook produces).
record_data_df_dict_monthly = {}
for key in key_get_sum:
    record_data_df_dict_monthly[key] = (
        record_data_df_dict[key]
        # grouping key: 'Date' without its last three characters
        .groupby(record_data_df_dict[key]["Date"].str.slice(stop=-3))
        .agg({key: "sum", "Month": lambda labels: labels.mode().iat[0]})
        .reset_index()
    )

In [None]:
# Before vs After Workout
# The workout routine starts on 1 September 2022, so records dated on or after
# that day count as "after workout".
WORKOUT_START_DATE = "2022-09-01"
# The monthly frames are keyed by 'Date' with its last three characters cut
# off (expected to be 'YYYY-MM'), so they are split on the start month.
WORKOUT_START_MONTH = WORKOUT_START_DATE[:7]


def _split_at_workout_start(monthly_df):
    """Split a monthly summary frame at the workout start month.

    Parameters
    ----------
    monthly_df : pandas.DataFrame
        Frame with a string 'Date' column holding the month key.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Rows whose 'Date' sorts before ``WORKOUT_START_MONTH`` and rows whose
        'Date' sorts at or after it, in that order.
    """
    before = monthly_df.loc[monthly_df["Date"] < WORKOUT_START_MONTH]
    after = monthly_df.loc[monthly_df["Date"] >= WORKOUT_START_MONTH]
    return before, after


# Body mass progress since the workout routine started
record_data_df_BodyMass_start_Sep22 = record_data_df_dict["BodyMass"].loc[
    record_data_df_dict["BodyMass"]["Date"] >= WORKOUT_START_DATE
]

# Active Energy Burned before and after workout routine
(
    record_data_df_ActiveEnergyBurned_before_workout,
    record_data_df_ActiveEnergyBurned_after_workout,
) = _split_at_workout_start(record_data_df_dict_monthly["ActiveEnergyBurned"])

# Basal Energy Burned before and after workout routine
(
    record_data_df_BasalEnergyBurned_before_workout,
    record_data_df_BasalEnergyBurned_after_workout,
) = _split_at_workout_start(record_data_df_dict_monthly["BasalEnergyBurned"])

# Distance Walking-Running before and after workout routine
(
    record_data_df_Distance_before_workout,
    record_data_df_Distance_after_workout,
) = _split_at_workout_start(record_data_df_dict_monthly["DistanceWalkingRunning"])

# Step count before and after workout routine
(
    record_data_df_StepCount_before_workout,
    record_data_df_StepCount_after_workout,
) = _split_at_workout_start(record_data_df_dict_monthly["StepCount"])