In [1]:
# default_exp datapipe
from nbdev import *

# Database - functions for data back-end / manipulations

This is using an alternate approach:
  - I have exported all of my Apple healthfit data from the Health app to export.zip 
  and then converted this to a SQLite database using `healthfit-to-sqlite`
  - I am then creating a "published" version of this SQLite database using
  Datasette.io
  - I have a local version of this database running at http://localhost:8081/healthkit and
  similarly I have an externally deployed version at https://my-healthkit-data.fly.dev
  - I will then run queries against this database to build the cache file (or possibly a smaller custom sqlite file)

## TODO
* This is still a work in progress
* Need to write the queries to marshall the data for each of the workouts within each group of walks
* Then cache this data - maybe try another (small) sqlite db for the caching (instead of feather)
* **NOTE: It looks like the queries are being truncated at 1000 values - need to fix**


In [2]:
#export
import os
import pandas as pd
import activityio as aio
from dateutil.parser import parse
import datetime as dt
import sqlite3 as sql
from pathlib import Path
import tomli
import klib
import subprocess
import pendulum
from sqlite_utils import Database
import reverse_geocoder as rg



In [3]:
try:
    from emmaus_walking.core import in_notebook, get_project_root, get_project_root_alternate
except:
    from core import in_notebook, get_project_root, get_project_root_alternate

In Jupyter notebook
Project root directory: /Users/mjboothaus/code/github/mjboothaus/emmaus_walking


[https://databooth.slite.com/api/s/note/MEQPRcLKFqUE6ko2vVp3Yr/Datasette-io-approach](https://databooth.slite.com/api/s/note/MEQPRcLKFqUE6ko2vVp3Yr/Datasette-io-approach) (Slite page)

e.g. start_datasette = subprocess.Popen('datasette ' + db_name + ' -m aus-covid-datasette-meta.json', stdout=subprocess.PIPE, shell=True)

In [4]:
HEALTHKIT_DATA_PATH = Path("/Users/mjboothaus/data/healthkit")
export_zip = HEALTHKIT_DATA_PATH / "export.zip"

In [5]:
def convert_healthkit_export_to_sqlite(export_zip):
    zip_file = export_zip.as_posix()
    if export_zip.exists() == False:
        print(zip_file, ": not found")
        return None, zip_file + ": not found"
    zip_file_date = pendulum.instance(dt.datetime.fromtimestamp(export_zip.stat().st_ctime))

    db_file = zip_file.replace("export.zip", "healthkit.db")
    if Path(db_file).exists() == True:
        Path(db_file).unlink()
    sp_cmd = "pipx run healthkit-to-sqlite " + zip_file + " " + db_file
    print(sp_cmd)
    print('---------------------------------------------------------------------------------------------')
    print('Please wait: converting healthkit export.zip to sqlite database (takes just over a minute)...')

    sp = subprocess.Popen(sp_cmd, stdout=subprocess.PIPE, shell=True)
    (sp_output, sp_err) = sp.communicate()  

    #This makes the wait possible
    sp_status = sp.wait()

    db_file_with_date = db_file.replace(".db", "_" + zip_file_date.to_date_string().replace("-", "_") + ".db")
    
    export_zip.rename(zip_file.replace(".zip", "_" + zip_file_date.to_date_string().replace("-", "_") + ".zip"))
    Path(db_file).rename(db_file_with_date)

    return db_file_with_date, sp_output

In [6]:
db_file, output = convert_healthkit_export_to_sqlite(export_zip)

pipx run healthkit-to-sqlite /Users/mjboothaus/data/healthkit/export.zip /Users/mjboothaus/data/healthkit/healthkit.db
---------------------------------------------------------------------------------------------
Please wait: converting healthkit export.zip to sqlite database (takes just over a minute)...


In [7]:
output

b'Importing from HealthKit\n'

## Look at the HealthKit database via Datasette.io (and try an reduce to just needed data)

In [8]:
print("datasette " + db_file + " --setting sql_time_limit_ms 5000  --setting max_returned_rows 10000 &")

datasette /Users/mjboothaus/data/healthkit/healthkit_2021_08_02.db --setting sql_time_limit_ms 5000  --setting max_returned_rows 10000 &


In [9]:
def create_df_from_sql_query_in_file(filename_dot_sql, conn, parse_dates):
# Read the sql file
    query_file = get_project_root_alternate() / "sql" / filename_dot_sql

    with open(query_file, 'r') as query:
        # connection == the connection to your database
        sql_text = query.read()
        print(sql_text)
        df = pd.read_sql_query(sql_text, conn, parse_dates=parse_dates)
    return df

In [41]:
db = Database(db_file)

In [42]:
workouts_df = create_df_from_sql_query_in_file("select_star_walking_workouts.sql", db.conn, ['startDate', 'endDate'])

select
    id as workout_id,
    duration as duration_minutes,
    totaldistance as totaldistance_km,
    totalenergyburned as totalenergyburned_kJ,
    sourcename,
    sourceversion,
    startdate,
    enddate,
    metadata_hkweathertemperature,
    metadata_hkweatherhumidity,
    metadata_hkelevationascended,
    metadata_hkaveragemets
from
    workouts
where workoutactivitytype = "HKWorkoutActivityTypeWalking" order by id


 /* Excluded fields:
    workoutactivitytype,   # just walking
    durationunit,          # fixed - min
    totaldistanceunit,     # fixed - km
    totalenergyburnedunit, # fixed - kJ
    device,
    creationdate,          # not really of interest (start date instead)
    workout_events,        # think this is redundant info (need to check - JSON?)
    metadata_hkgroupfitness,
    metadata_hkworkoutbrandname,
    metadata_hktimezone,
    metadata_hkcoachedworkout,
    metadata_hkwasuserentered,
    metadata_hkindoorworkout,
    metadata_hkelevationascended,
    m

In [43]:
workouts_df.head()

Unnamed: 0,workout_id,duration_minutes,totaldistance_km,totalenergyburned_kJ,sourceName,sourceVersion,startDate,endDate,metadata_HKWeatherTemperature,metadata_HKWeatherHumidity,metadata_HKElevationAscended,metadata_HKAverageMETs
0,000d7757f079ab0f75a4db50621f052894aa33c2,30.05313618381818,1.177883369302785,639.2089883072423,Michael and Ai Leen’s Apple Watch,7.1,2021-01-06 13:53:19+10:00,2021-01-06 14:23:22+10:00,73.4 degF,6900 %,3899 cm,3.82314 kcal/hr·kg
1,0018d73a8cf8c051befbc9bfb427320431b54628,97.57144786715509,5.969473837612362,1843.191264816394,Michael and Ai Leen’s Apple Watch,6.1.1,2020-03-23 09:15:24+10:00,2020-03-23 10:52:59+10:00,68 degF,6400 %,9430 cm,3.30812 kcal/hr·kg
2,00645c031954e799f2bad6ca9f26529a676bc660,91.34290488362312,5.020199223674742,2235.856794424977,Michael and Ai Leen’s Apple Watch,5.1.3,2019-04-08 11:41:03+10:00,2019-04-08 13:12:24+10:00,84 degF,4300 %,13916 cm,
3,00e4a61ddc04be5ec7fca6130b97fb3694d07c7a,75.63339018424352,4.172883323542221,1718.733819123089,Michael and Ai Leen’s Apple Watch,6.2.1,2020-05-06 16:23:43+10:00,2020-05-06 17:39:21+10:00,71 degF,4500 %,7325 cm,3.94665 kcal/hr·kg
4,02b32c6480081df19693e7f5661f5f3c7ac65474,105.3766344666481,6.767214865892154,2543.764525288699,Michael and Ai Leen’s Apple Watch,6.2.6,2020-06-24 15:10:00+10:00,2020-06-24 17:58:14+10:00,65 degF,4900 %,8025 cm,


In [44]:
# Trying to find the start point in each walk workout -- as date is not a date field in db not clear if sort by date and limit 1 will work
# might need to import table to pandas convert types and then export to db before doing query. Else use sqlite_utils to change column types.

start_point_df = create_df_from_sql_query_in_file("select_start_point_workout.sql", db.conn, ['date'])

-- Get starting point for each workout
select
  start_datetime,
  start_latitude,
  start_longitude,
  start_altitude,
  start_speed,
  workout_id
from
  (
    select
      date as start_datetime,
      latitude as start_latitude,
      longitude as start_longitude,
      altitude as start_altitude,
      speed as start_speed,
      workout_id,
      row_number() over (
        partition by workout_id
        order by
          date asc
      ) as date_rank
    from
      workout_points
  )
where
  date_rank = 1


In [45]:
start_point_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 562 entries, 0 to 561
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   start_datetime   562 non-null    object 
 1   start_latitude   562 non-null    float64
 2   start_longitude  562 non-null    float64
 3   start_altitude   562 non-null    float64
 4   start_speed      562 non-null    float64
 5   workout_id       562 non-null    object 
dtypes: float64(4), object(2)
memory usage: 26.5+ KB


In [46]:
start_point_df.head()

Unnamed: 0,start_datetime,start_latitude,start_longitude,start_altitude,start_speed,workout_id
0,2019-12-28T22:34:09Z,-33.014218,151.720312,1.280728,0.909875,00097a235ab0fa481f3a17838634066fad80f180
1,2021-01-06T03:53:18Z,-33.766433,151.170484,99.246925,0.039098,000d7757f079ab0f75a4db50621f052894aa33c2
2,2020-03-22T23:15:32Z,-33.850294,151.140811,17.576904,1.871135,0018d73a8cf8c051befbc9bfb427320431b54628
3,2019-04-08T01:41:07Z,-33.843765,151.284606,17.783922,2.052635,00645c031954e799f2bad6ca9f26529a676bc660
4,2020-05-06T06:24:03Z,-33.847065,151.152711,27.148079,0.873648,00e4a61ddc04be5ec7fca6130b97fb3694d07c7a


In [47]:
finish_point_df = create_df_from_sql_query_in_file("select_finish_point_workout.sql", db.conn, ['date'])

-- Get finishing point for each workout
select
  finish_datetime,
  finish_latitude,
  finish_longitude,
  finish_altitude,
  finish_speed,
  workout_id
from
  (
    select
      date as finish_datetime,
      latitude as finish_latitude,
      longitude as finish_longitude,
      altitude as finish_altitude,
      speed as finish_speed,
      workout_id,
      row_number() over (
        partition by workout_id
        order by
          date desc
      ) as date_rank
    from
      workout_points
  )
where
  date_rank = 1


In [48]:
finish_point_df.head()

Unnamed: 0,finish_datetime,finish_latitude,finish_longitude,finish_altitude,finish_speed,workout_id
0,2019-12-28T22:35:50Z,-33.014006,151.719901,1.494658,0.061736,00097a235ab0fa481f3a17838634066fad80f180
1,2021-01-06T04:23:21Z,-33.766486,151.170506,99.62204,0.808002,000d7757f079ab0f75a4db50621f052894aa33c2
2,2020-03-23T00:52:58Z,-33.85085,151.156934,14.75663,0.0,0018d73a8cf8c051befbc9bfb427320431b54628
3,2019-04-08T03:12:22Z,-33.844078,151.284577,23.045195,1.279834,00645c031954e799f2bad6ca9f26529a676bc660
4,2020-05-06T07:39:21Z,-33.847094,151.152851,21.988541,0.623013,00e4a61ddc04be5ec7fca6130b97fb3694d07c7a


In [49]:
walk_info_df = start_point_df.merge(finish_point_df, how='inner', on='workout_id')

In [50]:
def get_location(latitude, longitude):
    location = rg.search((latitude, longitude))
    return [location[0]['name'], location[0]['admin1'], location[0]['cc']]

In [51]:
walk_info_df['start_location'] = walk_info_df.apply(lambda row: get_location(float(row['start_latitude']), float(row['start_longitude'])), axis=1)

In [52]:
walk_info_df['finish_location'] = walk_info_df.apply(lambda row: get_location(float(row['finish_latitude']), float(row['finish_longitude'])), axis=1)

In [53]:
walk_info_df.head(10)

Unnamed: 0,start_datetime,start_latitude,start_longitude,start_altitude,start_speed,workout_id,finish_datetime,finish_latitude,finish_longitude,finish_altitude,finish_speed,start_location,finish_location
0,2019-12-28T22:34:09Z,-33.014218,151.720312,1.280728,0.909875,00097a235ab0fa481f3a17838634066fad80f180,2019-12-28T22:35:50Z,-33.014006,151.719901,1.494658,0.061736,"[Redhead, New South Wales, AU]","[Redhead, New South Wales, AU]"
1,2021-01-06T03:53:18Z,-33.766433,151.170484,99.246925,0.039098,000d7757f079ab0f75a4db50621f052894aa33c2,2021-01-06T04:23:21Z,-33.766486,151.170506,99.62204,0.808002,"[Killara, New South Wales, AU]","[Killara, New South Wales, AU]"
2,2020-03-22T23:15:32Z,-33.850294,151.140811,17.576904,1.871135,0018d73a8cf8c051befbc9bfb427320431b54628,2020-03-23T00:52:58Z,-33.85085,151.156934,14.75663,0.0,"[Canada Bay, New South Wales, AU]","[Drummoyne, New South Wales, AU]"
3,2019-04-08T01:41:07Z,-33.843765,151.284606,17.783922,2.052635,00645c031954e799f2bad6ca9f26529a676bc660,2019-04-08T03:12:22Z,-33.844078,151.284577,23.045195,1.279834,"[Dover Heights, New South Wales, AU]","[Dover Heights, New South Wales, AU]"
4,2020-05-06T06:24:03Z,-33.847065,151.152711,27.148079,0.873648,00e4a61ddc04be5ec7fca6130b97fb3694d07c7a,2020-05-06T07:39:21Z,-33.847094,151.152851,21.988541,0.623013,"[Canada Bay, New South Wales, AU]","[Canada Bay, New South Wales, AU]"
5,2017-08-12T23:15:38Z,-33.847686,151.153258,-16.533785,1.918036,024aa33aaf0f6023cd9b54e1a2ffa7e9fe27d935,2017-08-12T23:17:12Z,-33.849632,151.154144,-22.33036,2.512069,"[Canada Bay, New South Wales, AU]","[Drummoyne, New South Wales, AU]"
6,2020-06-24T05:10:00Z,-33.847272,151.152805,17.519846,0.60129,02b32c6480081df19693e7f5661f5f3c7ac65474,2020-06-24T06:55:23Z,-33.846985,151.15307,16.557472,0.079072,"[Canada Bay, New South Wales, AU]","[Canada Bay, New South Wales, AU]"
7,2017-12-28T06:39:19Z,-33.019214,151.687465,15.312214,0.910099,02f4ac544c1e69e2d1bd28449d7d53e414dabf6f,2017-12-28T07:16:26Z,-33.008561,151.706111,22.346874,0.365437,"[Jewells, New South Wales, AU]","[Redhead, New South Wales, AU]"
8,2019-12-15T23:51:54Z,-33.859273,151.154808,7.459394,1.334558,03a1d16e41823ee845034d70dbfc3f562061d4c9,2019-12-16T00:47:54Z,-33.858716,151.154779,12.775998,0.139416,"[Drummoyne, New South Wales, AU]","[Drummoyne, New South Wales, AU]"
9,2018-11-17T04:08:35Z,-33.77336,151.139593,54.950699,0.784665,03d91b8de4f3f3e1a7e437024d0338dfd0b55e08,2018-11-17T06:43:56Z,-33.732058,151.078416,173.085007,1.161704,"[Killara, New South Wales, AU]","[Pennant Hills, New South Wales, AU]"


In [54]:
def calculate_elapsed_time_minutes(finish_datetime, start_datetime):
    dt = pendulum.parse(finish_datetime) - pendulum.parse(start_datetime)
    return dt.in_seconds() / 60 / 60

In [55]:
walk_info_df['elapsed_time_hours'] = walk_info_df.apply(lambda row: calculate_elapsed_time_minutes(row['finish_datetime'], row['start_datetime']), axis=1)

In [56]:
walk_info_df['start_datetime'] = walk_info_df['start_datetime'].apply(lambda dt: pendulum.parse(dt, tz="Australia/Sydney").to_datetime_string())    # TODO: Need to convert from UTC to Sydney local time?

In [57]:
walk_info_df.head()

Unnamed: 0,start_datetime,start_latitude,start_longitude,start_altitude,start_speed,workout_id,finish_datetime,finish_latitude,finish_longitude,finish_altitude,finish_speed,start_location,finish_location,elapsed_time_hours
0,2019-12-28 22:34:09,-33.014218,151.720312,1.280728,0.909875,00097a235ab0fa481f3a17838634066fad80f180,2019-12-28T22:35:50Z,-33.014006,151.719901,1.494658,0.061736,"[Redhead, New South Wales, AU]","[Redhead, New South Wales, AU]",0.028056
1,2021-01-06 03:53:18,-33.766433,151.170484,99.246925,0.039098,000d7757f079ab0f75a4db50621f052894aa33c2,2021-01-06T04:23:21Z,-33.766486,151.170506,99.62204,0.808002,"[Killara, New South Wales, AU]","[Killara, New South Wales, AU]",0.500833
2,2020-03-22 23:15:32,-33.850294,151.140811,17.576904,1.871135,0018d73a8cf8c051befbc9bfb427320431b54628,2020-03-23T00:52:58Z,-33.85085,151.156934,14.75663,0.0,"[Canada Bay, New South Wales, AU]","[Drummoyne, New South Wales, AU]",1.623889
3,2019-04-08 01:41:07,-33.843765,151.284606,17.783922,2.052635,00645c031954e799f2bad6ca9f26529a676bc660,2019-04-08T03:12:22Z,-33.844078,151.284577,23.045195,1.279834,"[Dover Heights, New South Wales, AU]","[Dover Heights, New South Wales, AU]",1.520833
4,2020-05-06 06:24:03,-33.847065,151.152711,27.148079,0.873648,00e4a61ddc04be5ec7fca6130b97fb3694d07c7a,2020-05-06T07:39:21Z,-33.847094,151.152851,21.988541,0.623013,"[Canada Bay, New South Wales, AU]","[Canada Bay, New South Wales, AU]",1.255


In [58]:
walk_info_df = walk_info_df.merge(workouts_df, how="inner", on="workout_id")

In [59]:
walk_info_df['startDate'] = walk_info_df['startDate'].apply(lambda dt: pendulum.instance(dt).to_datetime_string()) 
walk_info_df['endDate'] = walk_info_df['endDate'].apply(lambda dt: pendulum.instance(dt).to_datetime_string())

In [60]:
walk_info_df.to_excel('walk_info_df.xlsx', index=False)

In [None]:
start_point_df['workout_id'].nunique() / len(start_point_df)

In [None]:
workouts_df.info()

In [None]:
workouts_df_cleaned = klib.data_cleaning(workouts_df)

In [None]:
workouts_df_cleaned.info()

In [None]:
workouts_df_cleaned.head()

In [None]:
db_file

In [None]:
workouts_csv = HEALTHKIT_DATA_PATH / "workouts.csv"

In [None]:
workouts_df_cleaned.to_csv(workouts_csv, index=False)

In [None]:
workouts_df_cleaned.columns

In [None]:
workouts_df_cleaned.nunique()

In [None]:
[col for col in workouts_df.columns.values if col not in workouts_df_cleaned.columns.values]

In [None]:
tables_df = create_df_from_sql_query_in_file("list_all_tables.sql", db.conn)

In [None]:
tables_df

### Looking at sqlite version of the cached data derived from individual walk files in walk groups

In [None]:
db_file = Path('emmaus_walking.db')

In [None]:
!echo "datasette" $db_file "&"

In [None]:
LOCAL_DB_URL = 'http://localhost:8081/'
HOSTED_DB_URL = 'https://my-healthkit-data.fly.dev/'

In [None]:
url_CSV = 'http://localhost:8081/healthkit.csv?sql=select%0D%0A++id%2C%0D%0A++workoutActivityType%2C%0D%0A++duration%2C%0D%0A++durationUnit%2C%0D%0A++totalDistance%2C%0D%0A++totalDistanceUnit%2C%0D%0A++totalEnergyBurned%2C%0D%0A++totalEnergyBurnedUnit%2C%0D%0A++sourceName%2C%0D%0A++sourceVersion%2C%0D%0A++creationDate%2C%0D%0A++startDate%2C%0D%0A++endDate%2C%0D%0A++metadata_HKTimeZone%2C%0D%0A++metadata_HKWeatherTemperature%2C%0D%0A++metadata_HKWeatherHumidity%2C%0D%0A++device%2C%0D%0A++metadata_HKElevationAscended%2C%0D%0A++metadata_HKAverageMETs%0D%0Afrom%0D%0A++workouts%0D%0Aorder+by%0D%0A++id%0D%0Alimit%0D%0A++101'

In [None]:
url_CSV2 = 'http://localhost:8081/healthkit.csv?sql=select%0D%0A++id%2C%0D%0A++workoutActivityType%2C%0D%0A++duration%2C%0D%0A++durationUnit%2C%0D%0A++totalDistance%2C%0D%0A++totalDistanceUnit%2C%0D%0A++totalEnergyBurned%2C%0D%0A++totalEnergyBurnedUnit%2C%0D%0A++sourceName%2C%0D%0A++sourceVersion%2C%0D%0A++creationDate%2C%0D%0A++startDate%2C%0D%0A++endDate%2C%0D%0A++metadata_HKTimeZone%2C%0D%0A++workout_events%2C%0D%0A++metadata_HKWeatherTemperature%2C%0D%0A++metadata_HKWeatherHumidity%2C%0D%0A++device%2C%0D%0A++metadata_HKElevationAscended%2C%0D%0A++metadata_HKAverageMETs%2C%0D%0A++metadata_HKMaximumSpeed%2C%0D%0A++metadata_HKAverageSpeed%0D%0Afrom%0D%0A++workouts%0D%0Awhere%0D%0A++workoutActivityType+in+%28%3Ap0%2C+%3Ap1%29%0D%0Aorder+by%0D%0A++creationDate%0D%0Alimit%0D%0A++101&p0=HKWorkoutActivityTypeWalking&p1=HKWorkoutActivityTypeHiking&_size=max'

In [None]:
workouts_df = pd.read_csv(url_CSV)

#print((LOCAL_DB_URL + 'workout.json'))
#workout_df = pd.read_json(LOCAL_DB_URL + 'workouts.json')

In [None]:
workouts_df.nunique()

In [None]:
workouts_df.info()

In [None]:
workouts_clean_df = klib.data_cleaning(workouts_df)

In [None]:
workouts_clean_df.info()

In [None]:
workouts_df.head()

In [None]:
workouts_fly_df = pd.read_csv(url_CSV.replace(LOCAL_DB_URL, HOSTED_DB_URL))

In [None]:
workouts_fly_df.head()

In [None]:
workout_points_SQL = 'http://localhost:8081/healthkit.csv?sql=select%0D%0A++rowid%2C%0D%0A++date%2C%0D%0A++latitude%2C%0D%0A++longitude%2C%0D%0A++altitude%2C%0D%0A++speed%0D%0Afrom%0D%0A++workout_points%0D%0Awhere%0D%0A++workout_id+%3D+%22'
workout_id = 'a34036ff616122952fa67c9bc11a493f8642dd7c' + '%22'

workout_points_df = pd.read_csv(workout_points_SQL + workout_id, parse_dates=True)

In [None]:
workout_points_df.head()

In [None]:
workout_points_df.info()

In [None]:
WALK_DETAILS_FILE = 'walk_details.toml'
walk_details = Path('../' + WALK_DETAILS_FILE)


In [None]:
walk_details

In [None]:
with open(walk_details, encoding="utf-8") as f:
    walk_details_dict = tomli.load(f)

In [None]:
walk_details_dict

In [None]:
pd.DataFrame(walk_details_dict, )

In [None]:
#export
def calc_walk_stats(walk_data):
    total_time = dt.timedelta(0)
    total_distance = 0

    for iHike, hike in enumerate(walk_data):
        total_time += hike.index.max()
        # print(iHike+1, walk_date[iHike], hike.index.max(), hike['dist'].max() / 1e3)
        total_distance += hike['dist'].max()
    total_distance /= 1e3

    start_coord = walk_data[0][['lat', 'lon']].iloc[0].tolist()
    end_coord = walk_data[-1][['lat', 'lon']].iloc[-1].tolist()
    return total_time, total_distance, start_coord, end_coord


# TODO: use st.cache() and also look to pre-load and cache/feather data (or similar) - NB: use of @st.cache() below didn't work
def load_and_cache_raw_walk_data(walk_name, sample_freq, conn):
    RAW_FIT_FILE_PATH = 'icloud/Data/HealthFit/FIT' 
    fit_dir = Path.home()/RAW_FIT_FILE_PATH
    data_dir = fit_dir/walk_name[0:3]
    print(data_dir.ls())
    data_files = [file for file in os.listdir(data_dir) if file.endswith('.fit')]
    walk_files = sorted(data_files)
    print(walk_files)

    walk_data = []
    walk_date = []

    for iFile, file in enumerate(walk_files):
        walk_df = pd.DataFrame(aio.read(data_dir + file))
        walk_data.append(walk_df)
        walk_date.append(parse(file[0:17]))
        walk_df['WalkName'] = walk_name
        walk_df['WalkNumber'] = iFile
        walk_df[['alt', 'dist', 'lat', 'lon', 'speed', 'WalkName', 'WalkNumber']].to_sql('walks', conn, if_exists='append', index=False)
               
    total_time, total_distance, start_coord, end_coord = calc_walk_stats(walk_data)
    walk_stats = [total_time, total_distance, start_coord, end_coord]
    #print(start_coord)
    walk_merged = pd.concat(walk_data)
    points = walk_merged[['lat', 'lon']].values.tolist()
    points = [tuple(point) for ipoint, point in enumerate(points) if ipoint % sample_freq == 0]
    return walk_data, walk_date, walk_files, points, walk_stats

In [None]:
def create_walk_cached_data_for_app(db_file, n_rows_used=5):
    # read in all of the walks data and sample at an appropriate frequency and cache for faster use in the app
    db_conn = sql.connect(db_file)
    walk_df = pd.read_sql_query('SELECT * FROM walks', db_conn)

    UNUSED_COLUMNS = ['dist', 'speed']

    walk_df.drop(UNUSED_COLUMNS, axis=1, inplace=True)
    walk_df.dropna(inplace=True)      # TODO: Check why there are a few NaNs
    walk_df = walk_df.iloc[::n_rows_used].reset_index()    # downsample

    walk_df.to_feather(Path(db_file.as_posix().replace('.db', '.cache.feather')))
    
    return walk_df

In [None]:
# Not working yet -- this is the alternate approach to using the individual .FIT files
# walk_df = create_walk_cached_data_for_app(db_file, 10)

In [None]:
# walk_df[walk_df['lat'].isna()]

In [None]:
Path(db_file.as_posix().replace('.db', '.cache.feather'))

In [None]:
# walk_df = pd.read_feather(Path(db_file.as_posix().replace('.db', '.cache.feather')))

In [None]:
# walk_df.info()

In [None]:
# walk_df['WalkName'].unique()