In [1]:
# default_exp datapipe
from nbdev import *

# Database - functions for data back-end / manipulations

This is using an alternate approach:
  - I have exported all of my Apple HealthKit data from the Health app to export.zip
  and then converted this to a SQLite database using `healthkit-to-sqlite`
  - I am then creating a "published" version of this SQLite database using
  Datasette.io
  - I have a local version of this database running at http://localhost:8081/healthkit and
  similarly I have an externally deployed version at https://my-healthkit-data.fly.dev
  - I will then run queries against this database to build the cache file (or possibly a smaller custom sqlite file)

## TODO
* This is still a work in progress
* Need to write the queries to marshal the data for each of the workouts within each group of walks
* Then cache this data - maybe try another (small) sqlite db for the caching (instead of feather)
* **NOTE: It looks like the queries are being truncated at 1000 values - need to fix**


In [2]:
#export
import os
import pandas as pd
import activityio as aio
from dateutil.parser import parse
import datetime as dt
import sqlite3 as sql
from pathlib import Path
import tomli
import klib
import subprocess
import pendulum
from sqlite_utils import Database


In [3]:
# Try the package-qualified import first and fall back to the bare module name.
# Only ImportError triggers the fallback; a bare `except:` would also hide
# unrelated failures (e.g. a syntax error inside the module or Ctrl-C).
try:
    from emmaus_walking.core import in_notebook, get_project_root, get_project_root_alternate
except ImportError:
    from core import in_notebook, get_project_root, get_project_root_alternate

In Jupyter notebook
Project root directory: /Users/mjboothaus/code/github/mjboothaus/emmaus_walking


[https://databooth.slite.com/api/s/note/MEQPRcLKFqUE6ko2vVp3Yr/Datasette-io-approach](https://databooth.slite.com/api/s/note/MEQPRcLKFqUE6ko2vVp3Yr/Datasette-io-approach) (Slite page)

e.g. start_datasette = subprocess.Popen('datasette ' + db_name + ' -m aus-covid-datasette-meta.json', stdout=subprocess.PIPE, shell=True)

In [4]:
# Location of the Health app export archive read by convert_healthkit_export_to_sqlite().
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
EXPORT_ZIP_PATH = Path("/Users/mjboothaus/data/healthkit/export.zip")

In [11]:
def convert_healthkit_export_to_sqlite():
    """Convert the export archive at EXPORT_ZIP_PATH into a date-stamped SQLite database.

    Runs ``pipx run healthkit-to-sqlite <zip> <db>`` with the database written next to
    the archive as ``healthkit.db``. On success both the archive and the database are
    renamed so their names carry a ``_YYYY_MM_DD`` suffix taken from the archive's
    ``st_ctime``.

    Returns:
        tuple: ``(db_file_with_date, sp_output)`` - the POSIX path string of the renamed
        database and the converter's captured stdout as bytes.

    Raises:
        subprocess.CalledProcessError: the converter exited with a non-zero status.
            Nothing is renamed in that case, so the export archive stays in place.
        FileNotFoundError: ``pipx`` (or the export archive) could not be found.
    """
    zip_path = EXPORT_ZIP_PATH
    date_suffix = dt.datetime.fromtimestamp(zip_path.stat().st_ctime).strftime("%Y_%m_%d")

    db_path = zip_path.with_name("healthkit.db")
    # exists() must be *called*; comparing the bound method to True is always False,
    # which meant a stale database from an earlier run was never removed.
    if db_path.exists():
        db_path.unlink()

    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed through untouched.
    sp_cmd = ["pipx", "run", "healthkit-to-sqlite", zip_path.as_posix(), db_path.as_posix()]
    print(" ".join(sp_cmd))
    print('---------------------------------------------------------------------------------------------')
    print('Please wait: converting healthkit export.zip to sqlite database (takes just over a minute)...')

    # run() blocks until the converter finishes; check=True stops here on failure
    # so the renames below only happen after a successful conversion.
    completed = subprocess.run(sp_cmd, stdout=subprocess.PIPE, check=True)

    db_path_with_date = db_path.with_name(f"{db_path.stem}_{date_suffix}.db")
    zip_path.rename(zip_path.with_name(f"{zip_path.stem}_{date_suffix}{zip_path.suffix}"))
    db_path.rename(db_path_with_date)

    return db_path_with_date.as_posix(), completed.stdout

In [12]:
db_file, output = convert_healthkit_export_to_sqlite()

pipx run healthkit-to-sqlite /Users/mjboothaus/data/healthkit/export.zip /Users/mjboothaus/data/healthkit/healthkit.db
---------------------------------------------------------------------------------------------
Please wait: converting healthkit export.zip to sqlite database (takes just over a minute)...


In [13]:
output

b'Importing from HealthKit\n'

## Look at the HealthKit database via Datasette.io (and try to reduce it to just the needed data)

In [20]:
print("datasette " + db_file + " &")

datasette /Users/mjboothaus/data/healthkit/healthkit_2021_07_31.db &


In [32]:
def create_df_from_sql_query_in_file(filename_dot_sql, conn):
    """Run the SQL stored in ``<project root>/sql/<filename_dot_sql>`` and return a DataFrame.

    The query text is echoed to stdout before it is executed.

    Args:
        filename_dot_sql: name of the ``.sql`` file inside the project's ``sql`` directory.
        conn: open database connection handed to ``pd.read_sql_query``.

    Returns:
        pandas.DataFrame holding the query result.
    """
    sql_path = get_project_root_alternate() / "sql" / filename_dot_sql

    with open(sql_path, 'r') as handle:
        statement = handle.read()
        print(statement)
        result = pd.read_sql_query(statement, conn)
    return result

In [33]:
db = Database(db_file)

In [34]:
df = create_df_from_sql_query_in_file("list_all_tables.sql", db.conn)

SELECT name FROM sqlite_master WHERE type = "table"


In [35]:
df

Unnamed: 0,name
0,rHeight
1,rBodyMass
2,rHeartRate
3,rBloodPressureSystolic
4,rBloodPressureDiastolic
5,rStepCount
6,rDistanceWalkingRunning
7,rBasalEnergyBurned
8,rActiveEnergyBurned
9,rFlightsClimbed


### Looking at sqlite version of the cached data derived from individual walk files in walk groups

In [None]:
db_file = Path('emmaus_walking.db')

In [None]:
!echo "datasette" $db_file "&"

In [None]:
# Base URLs of the two Datasette instances serving the HealthKit database.
# The query URLs below are written against the local one; swap the prefix for
# HOSTED_DB_URL to run the same query against the deployed copy.
LOCAL_DB_URL = 'http://localhost:8081/'
HOSTED_DB_URL = 'https://my-healthkit-data.fly.dev/'

In [None]:
url_CSV = 'http://localhost:8081/healthkit.csv?sql=select%0D%0A++id%2C%0D%0A++workoutActivityType%2C%0D%0A++duration%2C%0D%0A++durationUnit%2C%0D%0A++totalDistance%2C%0D%0A++totalDistanceUnit%2C%0D%0A++totalEnergyBurned%2C%0D%0A++totalEnergyBurnedUnit%2C%0D%0A++sourceName%2C%0D%0A++sourceVersion%2C%0D%0A++creationDate%2C%0D%0A++startDate%2C%0D%0A++endDate%2C%0D%0A++metadata_HKTimeZone%2C%0D%0A++metadata_HKWeatherTemperature%2C%0D%0A++metadata_HKWeatherHumidity%2C%0D%0A++device%2C%0D%0A++metadata_HKElevationAscended%2C%0D%0A++metadata_HKAverageMETs%0D%0Afrom%0D%0A++workouts%0D%0Aorder+by%0D%0A++id%0D%0Alimit%0D%0A++101'

In [None]:
url_CSV2 = 'http://localhost:8081/healthkit.csv?sql=select%0D%0A++id%2C%0D%0A++workoutActivityType%2C%0D%0A++duration%2C%0D%0A++durationUnit%2C%0D%0A++totalDistance%2C%0D%0A++totalDistanceUnit%2C%0D%0A++totalEnergyBurned%2C%0D%0A++totalEnergyBurnedUnit%2C%0D%0A++sourceName%2C%0D%0A++sourceVersion%2C%0D%0A++creationDate%2C%0D%0A++startDate%2C%0D%0A++endDate%2C%0D%0A++metadata_HKTimeZone%2C%0D%0A++workout_events%2C%0D%0A++metadata_HKWeatherTemperature%2C%0D%0A++metadata_HKWeatherHumidity%2C%0D%0A++device%2C%0D%0A++metadata_HKElevationAscended%2C%0D%0A++metadata_HKAverageMETs%2C%0D%0A++metadata_HKMaximumSpeed%2C%0D%0A++metadata_HKAverageSpeed%0D%0Afrom%0D%0A++workouts%0D%0Awhere%0D%0A++workoutActivityType+in+%28%3Ap0%2C+%3Ap1%29%0D%0Aorder+by%0D%0A++creationDate%0D%0Alimit%0D%0A++101&p0=HKWorkoutActivityTypeWalking&p1=HKWorkoutActivityTypeHiking&_size=max'

In [None]:
workouts_df = pd.read_csv(url_CSV)

#print((LOCAL_DB_URL + 'workout.json'))
#workout_df = pd.read_json(LOCAL_DB_URL + 'workouts.json')

In [None]:
workouts_df.nunique()

In [None]:
workouts_df.info()

In [None]:
workouts_clean_df = klib.data_cleaning(workouts_df)

In [None]:
workouts_clean_df.info()

In [None]:
workouts_df.head()

In [None]:
workouts_fly_df = pd.read_csv(url_CSV.replace(LOCAL_DB_URL, HOSTED_DB_URL))

In [None]:
workouts_fly_df.head()

In [None]:
# Datasette CSV endpoint returning the track points of a single workout. The
# URL-encoded SQL ends with an opening double quote (%22), so the workout id plus
# a closing %22 is appended to complete the WHERE clause.
workout_points_SQL = 'http://localhost:8081/healthkit.csv?sql=select%0D%0A++rowid%2C%0D%0A++date%2C%0D%0A++latitude%2C%0D%0A++longitude%2C%0D%0A++altitude%2C%0D%0A++speed%0D%0Afrom%0D%0A++workout_points%0D%0Awhere%0D%0A++workout_id+%3D+%22'
workout_id = 'a34036ff616122952fa67c9bc11a493f8642dd7c' + '%22'

# parse_dates=True only attempts to parse the index, which left the `date` column
# as plain strings; name the column explicitly so it is parsed as datetimes.
workout_points_df = pd.read_csv(workout_points_SQL + workout_id, parse_dates=['date'])

In [None]:
workout_points_df.head()

In [None]:
workout_points_df.info()

In [None]:
WALK_DETAILS_FILE = 'walk_details.toml'
walk_details = Path('../' + WALK_DETAILS_FILE)


In [None]:
walk_details

In [None]:
# tomli.load() on a text-mode file handle is rejected by tomli 2.x (it requires a
# binary handle), so decode the file ourselves and parse the string instead;
# tomli.loads() accepts a str in every tomli release.
with open(walk_details, encoding="utf-8") as f:
    walk_details_dict = tomli.loads(f.read())

In [None]:
walk_details_dict

In [None]:
pd.DataFrame(walk_details_dict, )

In [None]:
#export
def calc_walk_stats(walk_data):
    """Aggregate summary statistics over a sequence of per-walk DataFrames.

    Args:
        walk_data: indexable sequence of DataFrames, each with ``dist``, ``lat`` and
            ``lon`` columns and an index whose maximum can be added to a timedelta
            (presumably elapsed time since the start of the walk - confirm).

    Returns:
        tuple ``(total_time, total_distance, start_coord, end_coord)``:
        the sum of each frame's maximum index value, the sum of each frame's maximum
        ``dist`` divided by 1e3, and the ``[lat, lon]`` of the first row of the first
        frame and of the last row of the last frame.
    """
    elapsed = sum((leg.index.max() for leg in walk_data), dt.timedelta(0))
    distance = sum((leg['dist'].max() for leg in walk_data), 0) / 1e3

    first_leg = walk_data[0]
    last_leg = walk_data[-1]
    start_coord = first_leg[['lat', 'lon']].iloc[0].tolist()
    end_coord = last_leg[['lat', 'lon']].iloc[-1].tolist()

    return elapsed, distance, start_coord, end_coord


# TODO: use st.cache() and also look to pre-load and cache/feather data (or similar) - NB: use of @st.cache() below didn't work
def load_and_cache_raw_walk_data(walk_name, sample_freq, conn):
    """Read every ``.fit`` file of one walk group, append it to the ``walks`` table and summarise it.

    The files are looked up in ``~/icloud/Data/HealthFit/FIT/<first three characters of
    walk_name>`` and processed in sorted filename order.

    Args:
        walk_name: name of the walk group; its first three characters select the data
            directory and the full value is stored in the ``WalkName`` column.
        sample_freq: keep every ``sample_freq``-th point in the returned ``points`` list.
        conn: database connection passed to ``DataFrame.to_sql``.

    Returns:
        tuple ``(walk_data, walk_date, walk_files, points, walk_stats)``:
        the per-file DataFrames, the datetimes parsed from the first 17 characters of
        each filename, the sorted filenames, the down-sampled ``(lat, lon)`` tuples and
        ``[total_time, total_distance, start_coord, end_coord]`` from ``calc_walk_stats``.

    Raises:
        FileNotFoundError: the data directory contains no ``.fit`` files.
    """
    RAW_FIT_FILE_PATH = 'icloud/Data/HealthFit/FIT'
    data_dir = Path.home() / RAW_FIT_FILE_PATH / walk_name[0:3]

    # pathlib has no Path.ls(); it only exists when fastcore has patched Path.
    dir_entries = sorted(data_dir.iterdir())
    print(dir_entries)
    walk_files = sorted(entry.name for entry in dir_entries if entry.name.endswith('.fit'))
    print(walk_files)

    if not walk_files:
        # Fail with a clear message rather than an IndexError from calc_walk_stats.
        raise FileNotFoundError(f"No .fit files found in {data_dir}")

    walk_data = []
    walk_date = []

    for iFile, file in enumerate(walk_files):
        # Join with "/" - `data_dir + file` is Path + str, which raises TypeError.
        walk_df = pd.DataFrame(aio.read(data_dir / file))
        walk_df['WalkName'] = walk_name
        walk_df['WalkNumber'] = iFile
        walk_data.append(walk_df)
        walk_date.append(parse(file[0:17]))
        walk_df[['alt', 'dist', 'lat', 'lon', 'speed', 'WalkName', 'WalkNumber']].to_sql(
            'walks', conn, if_exists='append', index=False
        )

    total_time, total_distance, start_coord, end_coord = calc_walk_stats(walk_data)
    walk_stats = [total_time, total_distance, start_coord, end_coord]

    walk_merged = pd.concat(walk_data)
    points = walk_merged[['lat', 'lon']].values.tolist()
    points = [tuple(point) for ipoint, point in enumerate(points) if ipoint % sample_freq == 0]
    return walk_data, walk_date, walk_files, points, walk_stats

In [None]:
def create_walk_cached_data_for_app(db_file, n_rows_used=5):
    """Down-sample the ``walks`` table and cache it as a feather file for the app.

    Reads every row of ``walks`` from the SQLite database, drops the ``dist`` and
    ``speed`` columns and any row containing NaN, keeps every ``n_rows_used``-th
    remaining row, and writes the result next to the database with the suffix
    replaced by ``.cache.feather`` (``walks.db`` -> ``walks.cache.feather``).

    Args:
        db_file: path of the SQLite database, as ``str`` or ``pathlib.Path``.
        n_rows_used: down-sampling step; every ``n_rows_used``-th row is kept.

    Returns:
        pandas.DataFrame: the down-sampled frame that was written. The pre-sampling
        row labels are kept in an ``index`` column.
    """
    UNUSED_COLUMNS = ['dist', 'speed']

    db_path = Path(db_file)    # accept plain strings as well as Path objects

    db_conn = sql.connect(db_path.as_posix())
    try:
        raw_df = pd.read_sql_query('SELECT * FROM walks', db_conn)
    finally:
        # The connection was previously left open for the life of the kernel.
        db_conn.close()

    walk_df = (
        raw_df
        .drop(columns=UNUSED_COLUMNS)
        .dropna()                      # TODO: Check why there are a few NaNs
        .iloc[::n_rows_used]           # downsample
        .reset_index()                 # feather needs a default RangeIndex
    )

    # with_suffix only touches the file name, unlike str.replace('.db', ...), which
    # would also rewrite a '.db' occurring in a directory name.
    walk_df.to_feather(db_path.with_suffix('.cache.feather'))

    return walk_df

In [None]:
# Not working yet -- this is the alternate approach to using the individual .FIT files
# walk_df = create_walk_cached_data_for_app(db_file, 10)

In [None]:
# walk_df[walk_df['lat'].isna()]

In [None]:
Path(db_file.as_posix().replace('.db', '.cache.feather'))

In [None]:
# walk_df = pd.read_feather(Path(db_file.as_posix().replace('.db', '.cache.feather')))

In [None]:
# walk_df.info()

In [None]:
# walk_df['WalkName'].unique()