In [1]:
# default_exp datapipe
from nbdev import *

# Database - functions for data back-end / manipulations

This is using an alternate approach:
  - I have exported all of my Apple healthfit data from the Health app to export.zip 
  and then converted this to a SQLite database using `healthfit-to-sqlite`
  - I am then creating a "published" version of this SQLite database using
  Datasette.io
  - I have a local version of this database running at http://localhost:8081/healthkit and
  similarly I have an externally deployed version at https://my-healthkit-data.fly.dev
  - I will then run queries against this database to build the cache file (or possibly a smaller custom sqlite file)

## TODO
* This is still a work in progress
* Need to write the queries to marshall the data for each of the workouts within each group of walks
* Then cache this data - maybe try another (small) sqlite db for the caching (instead of feather)
* **NOTE: It looks like the queries are being truncated at 1000 values - need to fix**


In [2]:
#export
import os
import pandas as pd
import activityio as aio
from dateutil.parser import parse
import datetime as dt
import sqlite3 as sql
from pathlib import Path
import tomli


In [3]:
db_file = Path('emmaus_walking.sqlite.db')

In [4]:
LOCAL_DB_URL = 'http://localhost:8081/'
HOSTED_DB_URL = 'https://my-healthkit-data.fly.dev/'

In [5]:
url_CSV = 'http://localhost:8081/healthkit.csv?sql=select%0D%0A++id%2C%0D%0A++workoutActivityType%2C%0D%0A++duration%2C%0D%0A++durationUnit%2C%0D%0A++totalDistance%2C%0D%0A++totalDistanceUnit%2C%0D%0A++totalEnergyBurned%2C%0D%0A++totalEnergyBurnedUnit%2C%0D%0A++sourceName%2C%0D%0A++sourceVersion%2C%0D%0A++creationDate%2C%0D%0A++startDate%2C%0D%0A++endDate%2C%0D%0A++metadata_HKTimeZone%2C%0D%0A++metadata_HKWeatherTemperature%2C%0D%0A++metadata_HKWeatherHumidity%2C%0D%0A++device%2C%0D%0A++metadata_HKElevationAscended%2C%0D%0A++metadata_HKAverageMETs%0D%0Afrom%0D%0A++workouts%0D%0Aorder+by%0D%0A++id%0D%0Alimit%0D%0A++101'

In [6]:
workouts_df = pd.read_csv(url_CSV)

#print((LOCAL_DB_URL + 'workout.json'))
#workout_df = pd.read_json(LOCAL_DB_URL + 'workouts.json')

In [7]:
workouts_df.nunique()

id                               101
workoutActivityType                7
duration                         100
durationUnit                       1
totalDistance                     85
totalDistanceUnit                  1
totalEnergyBurned                100
totalEnergyBurnedUnit              1
sourceName                         2
sourceVersion                     27
creationDate                     101
startDate                        101
endDate                          101
metadata_HKTimeZone                5
metadata_HKWeatherTemperature     37
metadata_HKWeatherHumidity        51
device                            20
metadata_HKElevationAscended      48
metadata_HKAverageMETs            40
dtype: int64

In [8]:
workouts_df.head()

Unnamed: 0,id,workoutActivityType,duration,durationUnit,totalDistance,totalDistanceUnit,totalEnergyBurned,totalEnergyBurnedUnit,sourceName,sourceVersion,creationDate,startDate,endDate,metadata_HKTimeZone,metadata_HKWeatherTemperature,metadata_HKWeatherHumidity,device,metadata_HKElevationAscended,metadata_HKAverageMETs
0,002a7fbcc124ce02c76554f8d505a2e0555f906e,HKWorkoutActivityTypeWalking,122.173776,min,3.6746,km,1729.920665,kJ,Michael and Ai Leen’s Apple Watch,6.2.6,2020-06-27 08:34:30 +1000,2020-06-27 06:32:18 +1000,2020-06-27 08:34:28 +1000,Australia/Sydney,49 degF,8500 %,"<<HKDevice: 0x282f6c280>, name:Apple Watch, ma...",4630 cm,2.74876 kcal/hr·kg
1,00a2dae192724c8c1940c393a8bfef79d37cbdbe,HKWorkoutActivityTypeWalking,117.242568,min,1.803008,km,1289.166333,kJ,Michael and Ai Leen’s Apple Watch,6.2.1,2020-06-13 15:13:35 +1000,2020-06-13 13:16:17 +1000,2020-06-13 15:13:32 +1000,Australia/Sydney,67 degF,7000 %,"<<HKDevice: 0x282f600a0>, name:Apple Watch, ma...",2803 cm,2.25918 kcal/hr·kg
2,0121372783b05b67ef82219fde21347b87636b5b,HKWorkoutActivityTypeCrossTraining,7.883268,min,0.0,km,513.001475,kJ,Seven,420,2016-07-20 08:42:54 +1000,2016-07-20 08:35:00 +1000,2016-07-20 08:42:54 +1000,Australia/Sydney,,,,,
3,0123895b3103842c96484bd5baeeca3e57481ce5,HKWorkoutActivityTypeWalking,103.204779,min,3.650011,km,1544.586192,kJ,Michael and Ai Leen’s Apple Watch,7.1,2020-12-14 09:33:44 +1000,2020-12-14 07:50:29 +1000,2020-12-14 09:33:41 +1000,Australia/Sydney,69.8 degF,8200 %,"<<HKDevice: 0x282f745a0>, name:Apple Watch, ma...",4271 cm,2.88149 kcal/hr·kg
4,013404c2aaf1f1e14314e756a0edb5ae54d57397,HKWorkoutActivityTypeHiking,57.604634,min,1.218353,km,469.549308,kJ,Michael and Ai Leen’s Apple Watch,5.1.3,2019-04-12 10:18:27 +1000,2019-04-12 09:20:30 +1000,2019-04-12 10:18:06 +1000,Australia/Sydney,65 degF,7500 %,"<<HKDevice: 0x282fdf110>, name:Apple Watch, ma...",1607 cm,


In [9]:
workouts_fly_df = pd.read_csv(url_CSV.replace(LOCAL_DB_URL, HOSTED_DB_URL))

In [10]:
workouts_fly_df.head()

Unnamed: 0,id,workoutActivityType,duration,durationUnit,totalDistance,totalDistanceUnit,totalEnergyBurned,totalEnergyBurnedUnit,sourceName,sourceVersion,creationDate,startDate,endDate,metadata_HKTimeZone,metadata_HKWeatherTemperature,metadata_HKWeatherHumidity,device,metadata_HKElevationAscended,metadata_HKAverageMETs
0,002a7fbcc124ce02c76554f8d505a2e0555f906e,HKWorkoutActivityTypeWalking,122.173776,min,3.6746,km,1729.920665,kJ,Michael and Ai Leen’s Apple Watch,6.2.6,2020-06-27 08:34:30 +1000,2020-06-27 06:32:18 +1000,2020-06-27 08:34:28 +1000,Australia/Sydney,49 degF,8500 %,"<<HKDevice: 0x282f6c280>, name:Apple Watch, ma...",4630 cm,2.74876 kcal/hr·kg
1,00a2dae192724c8c1940c393a8bfef79d37cbdbe,HKWorkoutActivityTypeWalking,117.242568,min,1.803008,km,1289.166333,kJ,Michael and Ai Leen’s Apple Watch,6.2.1,2020-06-13 15:13:35 +1000,2020-06-13 13:16:17 +1000,2020-06-13 15:13:32 +1000,Australia/Sydney,67 degF,7000 %,"<<HKDevice: 0x282f600a0>, name:Apple Watch, ma...",2803 cm,2.25918 kcal/hr·kg
2,0121372783b05b67ef82219fde21347b87636b5b,HKWorkoutActivityTypeCrossTraining,7.883268,min,0.0,km,513.001475,kJ,Seven,420,2016-07-20 08:42:54 +1000,2016-07-20 08:35:00 +1000,2016-07-20 08:42:54 +1000,Australia/Sydney,,,,,
3,0123895b3103842c96484bd5baeeca3e57481ce5,HKWorkoutActivityTypeWalking,103.204779,min,3.650011,km,1544.586192,kJ,Michael and Ai Leen’s Apple Watch,7.1,2020-12-14 09:33:44 +1000,2020-12-14 07:50:29 +1000,2020-12-14 09:33:41 +1000,Australia/Sydney,69.8 degF,8200 %,"<<HKDevice: 0x282f745a0>, name:Apple Watch, ma...",4271 cm,2.88149 kcal/hr·kg
4,013404c2aaf1f1e14314e756a0edb5ae54d57397,HKWorkoutActivityTypeHiking,57.604634,min,1.218353,km,469.549308,kJ,Michael and Ai Leen’s Apple Watch,5.1.3,2019-04-12 10:18:27 +1000,2019-04-12 09:20:30 +1000,2019-04-12 10:18:06 +1000,Australia/Sydney,65 degF,7500 %,"<<HKDevice: 0x282fdf110>, name:Apple Watch, ma...",1607 cm,


In [11]:
workout_points_SQL = 'http://localhost:8081/healthkit.csv?sql=select%0D%0A++rowid%2C%0D%0A++date%2C%0D%0A++latitude%2C%0D%0A++longitude%2C%0D%0A++altitude%2C%0D%0A++speed%0D%0Afrom%0D%0A++workout_points%0D%0Awhere%0D%0A++workout_id+%3D+%22'
workout_id = 'a34036ff616122952fa67c9bc11a493f8642dd7c' + '%22'

workout_points_df = pd.read_csv(workout_points_SQL + workout_id, parse_dates=True)

In [12]:
workout_points_df.head()

Unnamed: 0,rowid,date,latitude,longitude,altitude,speed
0,1,2017-07-31T00:51:19Z,-33.857455,151.160325,-4.943426,2.993684
1,2,2017-07-31T00:52:08Z,-33.857726,151.160033,20.688181,1.673957
2,3,2017-07-31T00:52:09Z,-33.857726,151.160052,20.902155,1.67737
3,4,2017-07-31T00:52:10Z,-33.857727,151.160071,21.125973,1.682297
4,5,2017-07-31T00:52:11Z,-33.857727,151.160087,21.315756,1.687898


In [13]:
workout_points_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   rowid      1000 non-null   int64  
 1   date       1000 non-null   object 
 2   latitude   1000 non-null   float64
 3   longitude  1000 non-null   float64
 4   altitude   1000 non-null   float64
 5   speed      1000 non-null   float64
dtypes: float64(4), int64(1), object(1)
memory usage: 47.0+ KB


In [14]:
WALK_DETAILS_FILE = 'walk_details.toml'
walk_details = Path('../' + WALK_DETAILS_FILE)


In [15]:
walk_details

Path('../walk_details.toml')

In [16]:
with open(walk_details, encoding="utf-8") as f:
    walk_details_dict = tomli.load(f)

In [17]:
walk_details_dict

{'walks': [{'short_name': 'B2M', 'name': 'Bondi to Manly'},
  {'short_name': 'B2W', 'name': 'Bondi to Wollongong'},
  {'short_name': 'D2C', 'name': 'Drummoyne to Cockatoo'},
  {'short_name': 'GNW', 'name': 'Great North Walk'},
  {'short_name': 'GTL', 'name': 'Gladesville Loop'},
  {'short_name': 'GNW', 'name': 'Great North Walk'},
  {'short_name': 'GWW', 'name': 'Great West Walk', 'status': 'incomplete'},
  {'short_name': 'OLD', 'name': 'Old Bar'},
  {'short_name': 'STM', 'name': "St Michael's Golf Course"},
  {'short_name': 'SNM', 'name': 'Snowy Mountains (Thredo)'},
  {'short_name': 'WNG',
   'name': 'Newcastle to Sydney',
   'status': 'incomplete'}]}

In [18]:
pd.DataFrame(walk_details_dict, )

Unnamed: 0,walks
0,"{'short_name': 'B2M', 'name': 'Bondi to Manly'}"
1,"{'short_name': 'B2W', 'name': 'Bondi to Wollon..."
2,"{'short_name': 'D2C', 'name': 'Drummoyne to Co..."
3,"{'short_name': 'GNW', 'name': 'Great North Walk'}"
4,"{'short_name': 'GTL', 'name': 'Gladesville Loop'}"
5,"{'short_name': 'GNW', 'name': 'Great North Walk'}"
6,"{'short_name': 'GWW', 'name': 'Great West Walk..."
7,"{'short_name': 'OLD', 'name': 'Old Bar'}"
8,"{'short_name': 'STM', 'name': 'St Michael's Go..."
9,"{'short_name': 'SNM', 'name': 'Snowy Mountains..."


In [19]:
#export
def calc_walk_stats(walk_data):
    total_time = dt.timedelta(0)
    total_distance = 0

    for iHike, hike in enumerate(walk_data):
        total_time += hike.index.max()
        # print(iHike+1, walk_date[iHike], hike.index.max(), hike['dist'].max() / 1e3)
        total_distance += hike['dist'].max()
    total_distance /= 1e3

    start_coord = walk_data[0][['lat', 'lon']].iloc[0].tolist()
    end_coord = walk_data[-1][['lat', 'lon']].iloc[-1].tolist()
    return total_time, total_distance, start_coord, end_coord


# TODO: use st.cache() and also look to pre-load and cache/feather data (or similar) - NB: use of @st.cache() below didn't work
def load_and_cache_raw_walk_data(walk_name, sample_freq, conn):
    RAW_FIT_FILE_PATH = 'icloud/Data/HealthFit/FIT' 
    fit_dir = Path.home()/RAW_FIT_FILE_PATH
    data_dir = fit_dir/walk_name[0:3]
    print(data_dir.ls())
    data_files = [file for file in os.listdir(data_dir) if file.endswith('.fit')]
    walk_files = sorted(data_files)
    print(walk_files)

    walk_data = []
    walk_date = []

    for iFile, file in enumerate(walk_files):
        walk_df = pd.DataFrame(aio.read(data_dir + file))
        walk_data.append(walk_df)
        walk_date.append(parse(file[0:17]))
        walk_df['WalkName'] = walk_name
        walk_df['WalkNumber'] = iFile
        walk_df[['alt', 'dist', 'lat', 'lon', 'speed', 'WalkName', 'WalkNumber']].to_sql('walks', conn, if_exists='append', index=False)
               
    total_time, total_distance, start_coord, end_coord = calc_walk_stats(walk_data)
    walk_stats = [total_time, total_distance, start_coord, end_coord]
    #print(start_coord)
    walk_merged = pd.concat(walk_data)
    points = walk_merged[['lat', 'lon']].values.tolist()
    points = [tuple(point) for ipoint, point in enumerate(points) if ipoint % sample_freq == 0]
    return walk_data, walk_date, walk_files, points, walk_stats

In [20]:
def create_walk_cached_data_for_app(db_file, n_rows_used=5):
    # read in all of the walks data and sample at an appropriate frequency and cache for faster use in the app
    db_conn = sql.connect(db_file)
    walk_df = pd.read_sql_query('SELECT * FROM walks', db_conn)

    UNUSED_COLUMNS = ['dist', 'speed']

    walk_df.drop(UNUSED_COLUMNS, axis=1, inplace=True)
    walk_df.dropna(inplace=True)      # TODO: Check why there are a few NaNs
    walk_df = walk_df.iloc[::n_rows_used].reset_index()    # downsample

    walk_df.to_feather(Path(db_file.as_posix().replace('.db', '.cache.feather')))
    
    return walk_df

In [21]:
# Not working yet -- this is the alternate approach to using the individual .FIT files
# walk_df = create_walk_cached_data_for_app(db_file, 10)

In [22]:
# walk_df[walk_df['lat'].isna()]

In [23]:
Path(db_file.as_posix().replace('.db', '.cache.feather'))

Path('emmaus_walking.sqlite.cache.feather')

In [24]:
# walk_df = pd.read_feather(Path(db_file.as_posix().replace('.db', '.cache.feather')))

In [25]:
# walk_df.info()

In [26]:
# walk_df['WalkName'].unique()