In [None]:
import datetime
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Parse the Apple Health export file into an ElementTree object.
tree = ET.parse(source='apple_health_export/export.xml')

In [None]:
# Walk every <Record> element under the root and keep its attribute
# dictionary: each dictionary is one row, its keys are the columns.
root = tree.getroot()
record_list = []
for record in root.iter('Record'):
    record_list.append(record.attrib)

In [None]:
# Turn the list of attribute dictionaries into a DataFrame
# (one row per dictionary, one column per key).
data = pd.DataFrame(data=record_list)

In [None]:
# Convert the three timestamp columns from text to datetime values.
# NOTE(review): if the export mixes UTC offsets (e.g. across a daylight-saving
# change), pd.to_datetime may need utc=True to yield a datetime dtype — confirm.
date_columns = ('creationDate', 'startDate', 'endDate')
for date_column in date_columns:
    data[date_column] = pd.to_datetime(data[date_column])

In [None]:
# Make the 'value' column numeric; anything that cannot be parsed as a
# number is coerced to NaN instead of raising.
raw_values = data['value']
data['value'] = pd.to_numeric(raw_values, errors='coerce')

In [None]:
# Some records do not measure anything and only mark an occurrence.
# Replacing their missing value with 1.0 (= one time) makes them easy to
# aggregate alongside real measurements.
data['value'] = data['value'].fillna(value=1.0)

In [None]:
# Shorten the observation names by stripping each HealthKit prefix in turn
# with the vectorized string replace.
for prefix in ('HKQuantityTypeIdentifier', 'HKCategoryTypeIdentifier', 'HKDataType'):
    data['type'] = data['type'].str.replace(prefix, '')

In [None]:
# Quick overview of the parsed data.

# (rows, columns)
print("Shape:", data.shape)
# every column name
print("Columns:", data.columns)
# distinct record types, followed by the ones whose name contains 'Mind'
types = data['type'].unique()
print("Types:", len(types), '\n', types)
print([type_name for type_name in types if 'Mind' in type_name])
# distinct data sources
sources = data['sourceName'].unique()
print("Sources:", len(sources), '\n', sources)

In [None]:
# Pivot to a wide table: one row per endDate timestamp, one column per record
# type. Values sharing a timestamp and type are averaged (pivot_table's
# default aggregation, spelled out here). Resampling happens in later cells.
pivot_df = data.pivot_table(
    values='value',
    index='endDate',
    columns='type',
    aggfunc='mean',
)

In [None]:
# Resample the pivoted table to daily bins and take the mean heart rate per day.
df = pivot_df.resample('D').agg({'HeartRate' : "mean"})
print(df)

# Only days that have a value get an x tick; the tick text is the date part
# of the timestamp (everything before the first space).
days_with_data = df.dropna().index
labels = [str(day).split(' ')[0] for day in days_with_data]

# Scatter the daily means against the day.
x, y = df.index, df['HeartRate']
plt.scatter(x, y)
plt.xticks(days_with_data, labels, rotation=45)

# Axis labels and title.
plt.title('Mean HeartRate as a function of time')
plt.xlabel('Day')
plt.ylabel('Mean HeartRate')
plt.show()
# To save the figure instead, comment out plt.show() above and enable:
#plt.savefig("Mean HeartRate")

In [None]:
# --- Sleep analysis (Apple Watch records only) ---

# Source name exactly as it appears in the export (curly apostrophe and a
# non-breaking space); change this to analyse a different device.
WATCH_SOURCE_NAME = 'Mihir’s Apple\xa0Watch'

# Keep only the sleep-analysis rows written by the watch. Taking an explicit
# copy means the column added below goes into an independent frame instead of
# a slice of `data`, which avoids pandas' SettingWithCopyWarning.
is_watch_sleep = (data['type'] == "SleepAnalysis") & (data['sourceName'] == WATCH_SOURCE_NAME)
sleep_records = data.loc[is_watch_sleep].copy()

# Duration covered by each individual record.
sleep_records['time_asleep'] = sleep_records['endDate'] - sleep_records['startDate']

# Records are grouped by creation date, so that is used as the key to build
# one summary row per group:
#   total_time_asleep - sum of the record durations
#   bed_time          - earliest startDate in the group
#   awake_time        - latest endDate in the group
#   sleep_counts      - number of records in the group (read by the original
#                       author as how often the watch detected movement)
#   rem_cycles        - whole 90-minute spans contained in each record, summed
# NOTE(review): every SleepAnalysis record from the watch is summed, whatever
# its sleep state; 'value' was coerced to numeric and filled with 1.0 earlier,
# so the state can no longer be read from it here — confirm this is intended.
sleep_data = sleep_records.groupby('creationDate').agg(
    total_time_asleep=('time_asleep', 'sum'),
    bed_time=('startDate', 'min'),
    awake_time=('endDate', 'max'),
    sleep_counts=('creationDate', 'count'),
    rem_cycles=pd.NamedAgg(
        column='time_asleep',
        aggfunc=lambda durations: (durations // datetime.timedelta(minutes=90)).sum(),
    ),
)

# Time in Bed will be different to Apple's reported figure -
# as Apple uses the time you place your iPhone down as an additional
# datapoint, which of course, is incorrect if you try to maintain
# some device separation in the evenings.
# For now - we will just use Apple Watch data here
sleep_data['time_in_bed'] = sleep_data['awake_time'] - sleep_data['bed_time']
sleep_data['restless_time'] = sleep_data['time_in_bed'] - sleep_data['total_time_asleep']

# Convert the durations to hours for easier plotting and comparison.
# (Author's note: time in bed and total time asleep seem to be exactly the
# same in this data set.)
sleep_data['time_in_bed'] = (sleep_data['time_in_bed'].dt.total_seconds()/60/60)
sleep_data['total_time_asleep'] = (sleep_data['total_time_asleep'].dt.total_seconds()/60/60)

plt.plot(sleep_data.index, sleep_data['total_time_asleep'])
# Annotate every point with its value in hours (two decimals).
for created_at, hours_asleep in sleep_data['total_time_asleep'].items():
    plt.text(created_at, hours_asleep, f"{hours_asleep:.2f}", ha='center', va='bottom')

# Reference line for the recommended amount of sleep.
plt.axhline(y=8, color='red', linestyle='--', label='Recommended Sleep (8 hours)')

# Style the graph; every date in the index becomes an x tick.
plt.xticks(sleep_data.index, rotation=45)
plt.xlabel('Day')
plt.ylabel('Total Sleep (hours)')
plt.title('Sleep')
plt.legend()
plt.show()

In [None]:
# Line plot of the environmental audio exposure values against the time each
# record was created.
noise_data = data.loc[data['type'].eq('EnvironmentalAudioExposure')]
plt.plot(noise_data['creationDate'], noise_data['value'])
plt.xlabel('Date')
plt.ylabel('Noise (dB)')
plt.title('Environmental Audio Exposure Over Time')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Correlation between health metrics.
# Pivot to one column per record type, keyed by creationDate (pivot_table's
# default aggregation combines values that share a key), then draw the
# pairwise column correlations as an annotated heatmap.
# seaborn (`sns`) is imported in the notebook's top import cell.
correlation_data = data.pivot_table(index='creationDate', columns='type', values='value')
sns.heatmap(correlation_data.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Between Health Metrics')
plt.show()