# Database - functions for data back-end / manipulations

This is using an alternate approach:
  - Export all of my Apple HealthFit data from the Health app to export.zip 
  - Converted this to a SQLite database using `healthfit-to-sqlite`
 
  Queries can then against this database to build the cache file (or possibly a smaller custom SQLite file.

## TODO
* This is still a work in progress
* Need to write the queries to marshall the data for each of the workouts within each group of walks
* Then cache this data - maybe try another (small) sqlite db for the caching (instead of feather)
* **NOTE: It looks like the queries are being truncated at 1000 values - need to fix**


In [1]:

import pandas as pd

from dateutil.parser import parse
import datetime as dt
import sqlite3 as sql
from pathlib import Path
import tomli

import subprocess
import pendulum
from sqlite_utils import Database
import reverse_geocoder as rg
import healthkit_to_sqlite

### Datasette.io approach

#### Exporting HealthKit data / creating SQLite DB

First export HealthKit data using the Health app - select your profile icon from the top-right of the main screen and then select **Export All Health Data** (this can take some time to create the `export.zip` file).

The archive can be converted to a SQLite database using the following command:

`healthkit-to-sqlite export.zip healthkit_db.sqlite`

In [2]:
HEALTHKIT_DATA_PATH = "/Users/mjboothaus/icloud/Data/apple_health_export"
export_zip = Path(HEALTHKIT_DATA_PATH) / "export.zip"

In [3]:
# !mv /Users/mjboothaus/icloud/Data/apple_health_export/export_2022_05_11.zip /Users/mjboothaus/icloud/Data/apple_health_export/export.zip

In [4]:
!ls $HEALTHKIT_DATA_PATH

export.zip                     export_3Oct2021.zip
export_2022_04_28.zip          export_6Feb2022.zip
export_28Nov2021.zip           healthkit_db_2022_04_28.sqlite
export_31Jul2021.zip           healthkit_db_2022_05_11.sqlite


In [5]:
def convert_healthkit_export_to_sqlite(export_zip):
    zip_file = export_zip.as_posix()
    if export_zip.exists() is False:
        print(zip_file, ": not found")
        return None, f"{zip_file}: not found"
    zip_file_date = pendulum.instance(dt.datetime.fromtimestamp(export_zip.stat().st_ctime))

    db_file = zip_file.replace("export.zip", "healthkit_db.sqlite")
    if Path(db_file).exists() is True:
        Path(db_file).unlink()
    sp_cmd = f"healthkit-to-sqlite {zip_file} {db_file}"
    print(sp_cmd)
    print('---------------------------------------------------------------------------------------------')
    print('Please wait: converting healthkit export.zip to sqlite database (takes just over a minute)...')

    sp = subprocess.Popen(sp_cmd, stdout=subprocess.PIPE, shell=True)
    (sp_output, sp_err) = sp.communicate()  

    # This makes the wait possible
    sp_status = sp.wait()

    db_file_with_date = db_file.replace(".sqlite", "_" + zip_file_date.to_date_string().replace("-", "_") + ".sqlite")

    export_zip.rename(zip_file.replace(".zip", "_" + zip_file_date.to_date_string().replace("-", "_") + ".zip"))
    Path(db_file).rename(db_file_with_date)

    return db_file_with_date, sp_output

In [6]:
db_file, output = convert_healthkit_export_to_sqlite(export_zip)

healthkit-to-sqlite /Users/mjboothaus/icloud/Data/apple_health_export/export.zip /Users/mjboothaus/icloud/Data/apple_health_export/healthkit_db.sqlite
---------------------------------------------------------------------------------------------
Please wait: converting healthkit export.zip to sqlite database (takes just over a minute)...


In [7]:
db_file

'/Users/mjboothaus/icloud/Data/apple_health_export/healthkit_db_2022_05_11.sqlite'

In [8]:
def create_df_from_sql_query_in_file(filename_dot_sql, conn, parse_dates):

    query_file = Path.cwd().parent / "sql" / filename_dot_sql

    with open(query_file, 'r') as query:
        sql_text = query.read()
        print(sql_text)
        df = pd.read_sql_query(sql_text, conn, parse_dates=parse_dates)
    return df

In [9]:
db = Database(db_file)

In [10]:
workouts_df = create_df_from_sql_query_in_file("select_star_walking_workouts.sql", db.conn, ['startDate', 'endDate'])

select
    id as workout_id,
    duration as duration_minutes,
    totaldistance as totaldistance_km,
    totalenergyburned as totalenergyburned_kJ,
    sourcename,
    sourceversion,
    startdate,
    enddate,
    metadata_hkweathertemperature,
    metadata_hkweatherhumidity,
    metadata_hkelevationascended,
    metadata_hkaveragemets
from
    workouts
where workoutactivitytype = "HKWorkoutActivityTypeWalking" order by id


 /* Excluded fields:
    workoutactivitytype,   # just walking
    durationunit,          # fixed - min
    totaldistanceunit,     # fixed - km
    totalenergyburnedunit, # fixed - kJ
    device,
    creationdate,          # not really of interest (start date instead)
    workout_events,        # think this is redundant info (need to check - JSON?)
    metadata_hkgroupfitness,
    metadata_hkworkoutbrandname,
    metadata_hktimezone,
    metadata_hkcoachedworkout,
    metadata_hkwasuserentered,
    metadata_hkindoorworkout,
    metadata_hkelevationascended,
    m

In [11]:
workouts_df.head()

Unnamed: 0,workout_id,duration_minutes,totaldistance_km,totalenergyburned_kJ,sourceName,sourceVersion,startDate,endDate,metadata_HKWeatherTemperature,metadata_HKWeatherHumidity,metadata_HKElevationAscended,metadata_HKAverageMETs
0,00340675e4ce8dbe420be499fec61064f1fe6181,39.56012303233147,1.655609837202355,805.8203208498522,Michael and Ai Leen’s Apple Watch,5.1.1,2018-12-12 12:02:43+10:00,2018-12-12 12:42:17+10:00,73 degF,6600 %,6742 cm,
1,0194ee8d2c587a5f60bf52bc29ad48732f6577c1,377.0199567496776,21.23662400489231,11479.84601564049,Michael and Ai Leen’s Apple Watch,8.0,2021-10-15 07:29:58+10:00,2021-10-15 13:46:59+10:00,57.2 degF,7200 %,84570 cm,5.09913 kcal/hr·kg
2,01a008d304689e9c4f33d39a834448dc87de1f3d,73.05824839870135,7.151453514377587,2437.193206995518,Michael and Ai Leen’s Apple Watch,5.2.1,2019-07-22 13:27:06+10:00,2019-07-22 14:40:10+10:00,72 degF,3400 %,3100 cm,
3,02f4ac544c1e69e2d1bd28449d7d53e414dabf6f,37.22489701608817,2.167894573042402,562.8986240000003,Michael and Ai Leen’s Apple Watch,4.2,2017-12-28 16:39:12+10:00,2017-12-28 17:16:25+10:00,80 degF,6500 %,,
4,03feeff6f530f7ada4dc9f3fbbbaeef4c357eef0,45.25093293984731,2.445003094782327,1081.791941778821,Michael’s Apple Watch,8.0.1,2021-11-05 16:17:31+10:00,2021-11-05 17:02:46+10:00,71.6 degF,6700 %,3165 cm,3.91946 kcal/hr·kg


In [12]:
# Trying to find the start point in each walk workout -- as date is not a date field in db not clear if sort by date and limit 1 will work
# might need to import table to pandas convert types and then export to db before doing query. Else use sqlite_utils to change column types.

start_point_df = create_df_from_sql_query_in_file("select_start_point_workout.sql", db.conn, ['date'])

-- Get starting point for each workout
select
  start_datetime,
  start_latitude,
  start_longitude,
  start_altitude,
  start_speed,
  workout_id
from
  (
    select
      date as start_datetime,
      latitude as start_latitude,
      longitude as start_longitude,
      altitude as start_altitude,
      speed as start_speed,
      workout_id,
      row_number() over (
        partition by workout_id
        order by
          date asc
      ) as date_rank
    from
      workout_points
  )
where
  date_rank = 1


In [13]:
start_point_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772 entries, 0 to 771
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   start_datetime   772 non-null    object 
 1   start_latitude   772 non-null    float64
 2   start_longitude  772 non-null    float64
 3   start_altitude   772 non-null    float64
 4   start_speed      772 non-null    float64
 5   workout_id       772 non-null    object 
dtypes: float64(4), object(2)
memory usage: 36.3+ KB


In [14]:
start_point_df.head()

Unnamed: 0,start_datetime,start_latitude,start_longitude,start_altitude,start_speed,workout_id
0,2018-12-12T02:02:49Z,-36.828834,174.797829,22.545229,1.140843,00340675e4ce8dbe420be499fec61064f1fe6181
1,2021-10-17T02:10:37Z,-33.839599,151.169671,-7.832549,0.025052,01803c1598592000aa9a56d529426db9b0e2900b
2,2021-10-14T21:29:57Z,-33.768678,150.619695,169.71344,0.159099,0194ee8d2c587a5f60bf52bc29ad48732f6577c1
3,2019-07-22T03:27:12Z,-33.856203,151.158288,23.493387,1.249567,01a008d304689e9c4f33d39a834448dc87de1f3d
4,2017-08-12T23:15:38Z,-33.847686,151.153258,-16.533785,1.918036,024aa33aaf0f6023cd9b54e1a2ffa7e9fe27d935


In [15]:
finish_point_df = create_df_from_sql_query_in_file("select_finish_point_workout.sql", db.conn, ['date'])

-- Get finishing point for each workout
select
  finish_datetime,
  finish_latitude,
  finish_longitude,
  finish_altitude,
  finish_speed,
  workout_id
from
  (
    select
      date as finish_datetime,
      latitude as finish_latitude,
      longitude as finish_longitude,
      altitude as finish_altitude,
      speed as finish_speed,
      workout_id,
      row_number() over (
        partition by workout_id
        order by
          date desc
      ) as date_rank
    from
      workout_points
  )
where
  date_rank = 1


In [16]:
finish_point_df.head()

Unnamed: 0,finish_datetime,finish_latitude,finish_longitude,finish_altitude,finish_speed,workout_id
0,2018-12-12T02:40:16Z,-36.831425,174.797488,11.966413,0.646744,00340675e4ce8dbe420be499fec61064f1fe6181
1,2021-10-17T02:24:46Z,-33.839389,151.169714,1.235013,0.529916,01803c1598592000aa9a56d529426db9b0e2900b
2,2021-10-15T02:38:33Z,-33.76298,150.512678,504.802399,1.227253,0194ee8d2c587a5f60bf52bc29ad48732f6577c1
3,2019-07-22T04:40:09Z,-33.856201,151.158183,9.002841,1.459583,01a008d304689e9c4f33d39a834448dc87de1f3d
4,2017-08-12T23:17:12Z,-33.849632,151.154144,-22.33036,2.512069,024aa33aaf0f6023cd9b54e1a2ffa7e9fe27d935


In [17]:
walk_info_df = start_point_df.merge(finish_point_df, how='inner', on='workout_id')

In [18]:
def get_location(latitude, longitude):
    location = rg.search((latitude, longitude))
    return [location[0]['name'], location[0]['admin1'], location[0]['cc']]

In [19]:
walk_info_df['start_location'] = walk_info_df.apply(lambda row: get_location(float(row['start_latitude']), float(row['start_longitude'])), axis=1)

Loading formatted geocoded file...


In [20]:
walk_info_df['finish_location'] = walk_info_df.apply(lambda row: get_location(float(row['finish_latitude']), float(row['finish_longitude'])), axis=1)

In [21]:
walk_info_df.head(10)

Unnamed: 0,start_datetime,start_latitude,start_longitude,start_altitude,start_speed,workout_id,finish_datetime,finish_latitude,finish_longitude,finish_altitude,finish_speed,start_location,finish_location
0,2018-12-12T02:02:49Z,-36.828834,174.797829,22.545229,1.140843,00340675e4ce8dbe420be499fec61064f1fe6181,2018-12-12T02:40:16Z,-36.831425,174.797488,11.966413,0.646744,"[Auckland, Auckland, NZ]","[Auckland, Auckland, NZ]"
1,2021-10-17T02:10:37Z,-33.839599,151.169671,-7.832549,0.025052,01803c1598592000aa9a56d529426db9b0e2900b,2021-10-17T02:24:46Z,-33.839389,151.169714,1.235013,0.529916,"[Longueville, New South Wales, AU]","[Longueville, New South Wales, AU]"
2,2021-10-14T21:29:57Z,-33.768678,150.619695,169.71344,0.159099,0194ee8d2c587a5f60bf52bc29ad48732f6577c1,2021-10-15T02:38:33Z,-33.76298,150.512678,504.802399,1.227253,"[Glenbrook, New South Wales, AU]","[Hazelbrook, New South Wales, AU]"
3,2019-07-22T03:27:12Z,-33.856203,151.158288,23.493387,1.249567,01a008d304689e9c4f33d39a834448dc87de1f3d,2019-07-22T04:40:09Z,-33.856201,151.158183,9.002841,1.459583,"[Drummoyne, New South Wales, AU]","[Drummoyne, New South Wales, AU]"
4,2017-08-12T23:15:38Z,-33.847686,151.153258,-16.533785,1.918036,024aa33aaf0f6023cd9b54e1a2ffa7e9fe27d935,2017-08-12T23:17:12Z,-33.849632,151.154144,-22.33036,2.512069,"[Canada Bay, New South Wales, AU]","[Drummoyne, New South Wales, AU]"
5,2017-12-28T06:39:19Z,-33.019214,151.687465,15.312214,0.910099,02f4ac544c1e69e2d1bd28449d7d53e414dabf6f,2017-12-28T07:16:26Z,-33.008561,151.706111,22.346874,0.365437,"[Jewells, New South Wales, AU]","[Redhead, New South Wales, AU]"
6,2021-11-05T06:27:42Z,-33.866904,151.207558,56.25695,0.611529,03feeff6f530f7ada4dc9f3fbbbaeef4c357eef0,2021-11-05T07:02:46Z,-33.87393,151.200567,36.06461,15.52635,"[Sydney, New South Wales, AU]","[Haymarket, New South Wales, AU]"
7,2020-08-16T21:52:52Z,-33.852121,151.154849,27.93795,1.825035,042dbfd0cf540c6de30fe9c511e614907b748550,2020-08-16T23:02:31Z,-33.847096,151.152861,21.91992,0.0,"[Drummoyne, New South Wales, AU]","[Canada Bay, New South Wales, AU]"
8,2018-10-26T23:03:00Z,-33.792994,151.156094,19.478712,0.219718,04c2e8bf2e2685f2745bd250dca9e798bbc7080e,2018-10-27T02:32:19Z,-33.773364,151.139517,55.003124,0.226333,"[Chatswood West, New South Wales, AU]","[Killara, New South Wales, AU]"
9,2021-10-31T18:38:21Z,-32.959781,151.198376,165.21376,0.264564,0551ef0c3a8736ee14a81dd9eded8f0606090701,2021-10-31T21:20:43Z,-32.937847,151.142218,111.341309,0.000538,"[Cessnock, New South Wales, AU]","[Cessnock, New South Wales, AU]"


In [22]:
def calculate_elapsed_time_minutes(finish_datetime, start_datetime):
    dt = pendulum.parse(finish_datetime) - pendulum.parse(start_datetime)
    return float(dt.in_seconds() / 60 / 60)

In [23]:
walk_info_df['elapsed_time_hours'] = walk_info_df.apply(lambda row: calculate_elapsed_time_minutes(row['finish_datetime'], row['start_datetime']), axis=1)

In [24]:
walk_info_df['start_datetime'] = walk_info_df['start_datetime'].apply(lambda dt: pendulum.parse(dt, tz="Australia/Sydney").to_datetime_string())    # TODO: Need to convert from UTC to Sydney local time?

In [25]:
walk_info_df.head()

Unnamed: 0,start_datetime,start_latitude,start_longitude,start_altitude,start_speed,workout_id,finish_datetime,finish_latitude,finish_longitude,finish_altitude,finish_speed,start_location,finish_location,elapsed_time_hours
0,2018-12-12 02:02:49,-36.828834,174.797829,22.545229,1.140843,00340675e4ce8dbe420be499fec61064f1fe6181,2018-12-12T02:40:16Z,-36.831425,174.797488,11.966413,0.646744,"[Auckland, Auckland, NZ]","[Auckland, Auckland, NZ]",0.624167
1,2021-10-17 02:10:37,-33.839599,151.169671,-7.832549,0.025052,01803c1598592000aa9a56d529426db9b0e2900b,2021-10-17T02:24:46Z,-33.839389,151.169714,1.235013,0.529916,"[Longueville, New South Wales, AU]","[Longueville, New South Wales, AU]",0.235833
2,2021-10-14 21:29:57,-33.768678,150.619695,169.71344,0.159099,0194ee8d2c587a5f60bf52bc29ad48732f6577c1,2021-10-15T02:38:33Z,-33.76298,150.512678,504.802399,1.227253,"[Glenbrook, New South Wales, AU]","[Hazelbrook, New South Wales, AU]",5.143333
3,2019-07-22 03:27:12,-33.856203,151.158288,23.493387,1.249567,01a008d304689e9c4f33d39a834448dc87de1f3d,2019-07-22T04:40:09Z,-33.856201,151.158183,9.002841,1.459583,"[Drummoyne, New South Wales, AU]","[Drummoyne, New South Wales, AU]",1.215833
4,2017-08-12 23:15:38,-33.847686,151.153258,-16.533785,1.918036,024aa33aaf0f6023cd9b54e1a2ffa7e9fe27d935,2017-08-12T23:17:12Z,-33.849632,151.154144,-22.33036,2.512069,"[Canada Bay, New South Wales, AU]","[Drummoyne, New South Wales, AU]",0.026111


In [26]:
walk_info_df = walk_info_df.merge(workouts_df, how="inner", on="workout_id")

In [27]:
walk_info_df['startDate'] = walk_info_df['startDate'].apply(lambda dt: pendulum.instance(dt).to_datetime_string()) 
walk_info_df['endDate'] = walk_info_df['endDate'].apply(lambda dt: pendulum.instance(dt).to_datetime_string())

In [28]:
walk_info_df.to_excel('walk_info_df.xlsx', index=False)

#TODO: Need to put this in the data folder

In [29]:
start_point_df['workout_id'].nunique() / len(start_point_df)

1.0

In [30]:
workouts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype                                
---  ------                         --------------  -----                                
 0   workout_id                     682 non-null    object                               
 1   duration_minutes               682 non-null    object                               
 2   totaldistance_km               682 non-null    object                               
 3   totalenergyburned_kJ           682 non-null    object                               
 4   sourceName                     682 non-null    object                               
 5   sourceVersion                  682 non-null    object                               
 6   startDate                      682 non-null    datetime64[ns, pytz.FixedOffset(600)]
 7   endDate                        682 non-null    datetime64[ns, pytz.FixedOffset(6

In [31]:
workouts_df_cleaned = klib.data_cleaning(workouts_df)

NameError: name 'klib' is not defined

In [None]:
workouts_df_cleaned.info()

In [None]:
workouts_df_cleaned.head()

In [None]:
workouts_csv = HEALTHKIT_DATA_PATH / "workouts.csv"

In [None]:
workouts_df_cleaned.to_csv(workouts_csv, index=False)

In [None]:
workouts_df_cleaned.columns

In [None]:
workouts_df_cleaned.nunique()

In [None]:
[col for col in workouts_df.columns.values if col not in workouts_df_cleaned.columns.values]

In [None]:
tables_df = create_df_from_sql_query_in_file("list_all_tables.sql", db.conn)

In [None]:
tables_df

In [None]:
WALK_DETAILS_FILE = 'walk_details.toml'
walk_details = Path('../' + WALK_DETAILS_FILE)


In [None]:
walk_details

In [None]:
with open(walk_details, encoding="utf-8") as f:
    walk_details_dict = tomli.load(f)

In [None]:
walk_details_dict

In [None]:
pd.DataFrame(walk_details_dict, )

In [None]:

def calc_walk_stats(walk_data):
    total_time = dt.timedelta(0)
    total_distance = 0

    for hike in walk_data:
        total_time += hike.index.max()
        # print(iHike+1, walk_date[iHike], hike.index.max(), hike['dist'].max() / 1e3)
        total_distance += hike['dist'].max()
    total_distance /= 1e3

    start_coord = walk_data[0][['lat', 'lon']].iloc[0].tolist()
    end_coord = walk_data[-1][['lat', 'lon']].iloc[-1].tolist()
    return total_time, total_distance, start_coord, end_coord

In [None]:
def create_walk_cached_data_for_app(db_file, n_rows_used=5):
    # read in all of the walks data and sample at an appropriate frequency and cache for faster use in the app
    db_conn = sql.connect(db_file)
    walk_df = pd.read_sql_query('SELECT * FROM walks', db_conn)

    UNUSED_COLUMNS = ['dist', 'speed']

    walk_df.drop(UNUSED_COLUMNS, axis=1, inplace=True)
    walk_df.dropna(inplace=True)      # TODO: Check why there are a few NaNs
    walk_df = walk_df.iloc[::n_rows_used].reset_index()    # downsample

    walk_df.to_feather(Path(db_file.as_posix().replace('.db', '.cache.feather')))
    
    return walk_df

In [None]:
# Not working yet -- this is the alternate approach to using the individual .FIT files
# walk_df = create_walk_cached_data_for_app(db_file, 10)

In [None]:
# walk_df[walk_df['lat'].isna()]

In [None]:
Path(db_file.as_posix().replace('.db', '.cache.feather'))

In [None]:
# walk_df = pd.read_feather(Path(db_file.as_posix().replace('.db', '.cache.feather')))

In [None]:
# walk_df.info()

In [None]:
# walk_df['WalkName'].unique()