In [1]:
# default_exp datapipe
from nbdev import *

# Datapipe - functions for data back-end / manipulations

* This is the module for creating the data pipeline.
* It should also be used to perform the data pre-processing and caching.

* **NOTE: This is currently broken because the iCloud Drive has not downloaded the files as yet.**

## Pre-process (prep) the data - do this ONCE and only ONCE - they put in some re-useable form

0. Think about capturing walk metadata / do on a per-overall-walk basis 
1. Need to "extract" the data from the .fit files
2. Clean/fix the data (e.g. allow for breaks in walk, change in order, not turning off walk at end)
3. Concatenate into a single data-structure per overall walk 
4. Store in "database" e.g. sqlite, postgres?, files, or is Quilt sufficient?

In [2]:
#export
import os
import pandas as pd
import activityio as aio
from dateutil.parser import parse
import datetime as dt
import sqlite3 as sql
from pathlib import Path
import tomli


In [3]:
RAW_FIT_FILE_PATH = 'icloud/Data/HealthFit/FIT' 
WALK_DATABASE_NAME = 'emmaus_walking.db'

In [4]:
fit_dir = Path.home()/RAW_FIT_FILE_PATH

In [5]:
db_file = Path(WALK_DATABASE_NAME)

In [6]:
walks = []
for path in sorted(fit_dir.iterdir()):
        if path.is_dir():
            walks.append([path.parts[-1], 'Name to be defined'])
walks.append(['ALL', 'All Walks'])
walks_df = pd.DataFrame(walks, columns=['walk_shortname', 'walk_name'])

In [7]:
walks

[['B2M', 'Name to be defined'],
 ['B2W', 'Name to be defined'],
 ['D2C', 'Name to be defined'],
 ['GNW', 'Name to be defined'],
 ['GTL', 'Name to be defined'],
 ['GWW', 'Name to be defined'],
 ['OLD', 'Name to be defined'],
 ['SNM', 'Name to be defined'],
 ['STM', 'Name to be defined'],
 ['WNG', 'Name to be defined'],
 ['ALL', 'All Walks']]

In [8]:
walks_df.to_json('walks_TBD.json', orient='table', index=False)

In [9]:
[shortname + ': ' + name for shortname, name in pd.read_json('walks_TBD.json', orient='table').values.tolist()]

['B2M: Name to be defined',
 'B2W: Name to be defined',
 'D2C: Name to be defined',
 'GNW: Name to be defined',
 'GTL: Name to be defined',
 'GWW: Name to be defined',
 'OLD: Name to be defined',
 'SNM: Name to be defined',
 'STM: Name to be defined',
 'WNG: Name to be defined',
 'ALL: All Walks']

In [10]:
WALK_DETAILS_FILE = 'walk_details.toml'
walk_details = Path('../' + WALK_DETAILS_FILE)


In [11]:
walk_details

Path('../walk_details.toml')

In [12]:
with open(walk_details, encoding="utf-8") as f:
    walk_details_dict = tomli.load(f)

In [13]:
walk_details_dict

{'walks': [{'short_name': 'B2M', 'name': 'Bondi to Manly'},
  {'short_name': 'B2W', 'name': 'Bondi to Wollongong'},
  {'short_name': 'D2C', 'name': 'Drummoyne to Cockatoo'},
  {'short_name': 'GNW', 'name': 'Great North Walk'},
  {'short_name': 'GTL', 'name': 'Gladesville Loop'},
  {'short_name': 'GNW', 'name': 'Great North Walk'},
  {'short_name': 'GWW', 'name': 'Great West Walk', 'status': 'incomplete'},
  {'short_name': 'OLD', 'name': 'Old Bar'},
  {'short_name': 'STM', 'name': "St Michael's Golf Course"},
  {'short_name': 'SNM', 'name': 'Snowy Mountains (Thredo)'},
  {'short_name': 'WNG',
   'name': 'Newcastle to Sydney',
   'status': 'incomplete'}]}

In [14]:
pd.DataFrame(walk_details_dict, )

Unnamed: 0,walks
0,"{'short_name': 'B2M', 'name': 'Bondi to Manly'}"
1,"{'short_name': 'B2W', 'name': 'Bondi to Wollon..."
2,"{'short_name': 'D2C', 'name': 'Drummoyne to Co..."
3,"{'short_name': 'GNW', 'name': 'Great North Walk'}"
4,"{'short_name': 'GTL', 'name': 'Gladesville Loop'}"
5,"{'short_name': 'GNW', 'name': 'Great North Walk'}"
6,"{'short_name': 'GWW', 'name': 'Great West Walk..."
7,"{'short_name': 'OLD', 'name': 'Old Bar'}"
8,"{'short_name': 'STM', 'name': 'St Michael's Go..."
9,"{'short_name': 'SNM', 'name': 'Snowy Mountains..."


In [15]:
def create_database_from_walk_files():

    # Get proper paths for files/db sorted & keep backup of previous .db

    fit_dir = Path.home()/RAW_FIT_FILE_PATH
    db_file = Path('../' + WALK_DATABASE_NAME)
    
    if db_file.is_file():
        print('Deleting existing database')
        db_file.unlink()

    db_conn = sql.connect(db_file)
    print('Created: ' + db_file.resolve().as_posix())
    
    for path in fit_dir.iterdir():
        if path.is_dir():
            walk_name = path.parts[-1]
            print(walk_name)
            walk_data, walk_date, walk_files, points, walk_stats = load_and_cache_raw_walk_data(walk_name, 1, db_conn)

            # create table of walk meta-data

            walk_meta = pd.DataFrame([walk_name, walk_date, walk_stats])
            try:
                walk_meta.to_sql('walk_meta', db_conn, if_exists='append', index=False)
            except:
                pass
    
    db_conn.close()
    return db_file, walk_meta

In [16]:
#export
def calc_walk_stats(walk_data):
    total_time = dt.timedelta(0)
    total_distance = 0

    for iHike, hike in enumerate(walk_data):
        total_time += hike.index.max()
        # print(iHike+1, walk_date[iHike], hike.index.max(), hike['dist'].max() / 1e3)
        total_distance += hike['dist'].max()
    total_distance /= 1e3

    start_coord = walk_data[0][['lat', 'lon']].iloc[0].tolist()
    end_coord = walk_data[-1][['lat', 'lon']].iloc[-1].tolist()
    return total_time, total_distance, start_coord, end_coord


# TODO: use st.cache() and also look to pre-load and cache/feather data (or similar) - NB: use of @st.cache() below didn't work
def load_and_cache_raw_walk_data(walk_name, sample_freq, conn):
    RAW_FIT_FILE_PATH = 'icloud/Data/HealthFit/FIT' 
    fit_dir = Path.home()/RAW_FIT_FILE_PATH
    data_dir = fit_dir/walk_name[0:3]
    print(data_dir.ls())
    data_files = [file for file in os.listdir(data_dir) if file.endswith('.fit')]
    walk_files = sorted(data_files)
    print(walk_files)

    walk_data = []
    walk_date = []

    for iFile, file in enumerate(walk_files):
        print(file)
        if Path(file).suffix == '.icloud':
            print('Undownloaded files in iCloud Drive - STOP')
            return False
        walk_df = pd.DataFrame(aio.read(data_dir + file))
        if len(walk_df) > 1:
            walk_data.append(walk_df)
            walk_date.append(parse(file[0:17]))
            walk_df['WalkName'] = walk_name
            walk_df['WalkNumber'] = iFile
            walk_df[['alt', 'dist', 'lat', 'lon', 'speed', 'WalkName', 'WalkNumber']].to_sql('walks', conn, if_exists='append', index=False)
               
    total_time, total_distance, start_coord, end_coord = calc_walk_stats(walk_data)
    walk_stats = [total_time, total_distance, start_coord, end_coord]
    #print(start_coord)
    walk_merged = pd.concat(walk_data)
    points = walk_merged[['lat', 'lon']].values.tolist()
    points = [tuple(point) for ipoint, point in enumerate(points) if ipoint % sample_freq == 0]
    return walk_data, walk_date, walk_files, points, walk_stats

In [17]:
try:
    db_file, walk_meta = create_database_from_walk_files()

SyntaxError: unexpected EOF while parsing (2939222685.py, line 2)

In [6]:
db_file

Path('emmaus_walking.db')

In [7]:
def create_walk_datafile_for_app(db_file, n_rows_used=5):
    # read in all of the walks data and sample at an appropriate frequency and cache for faster use in the app
    db_conn = sql.connect(db_file)
    walk_df = pd.read_sql_query('SELECT * FROM walks', db_conn)

    UNUSED_COLUMNS = ['dist', 'speed']

    walk_df.drop(UNUSED_COLUMNS, axis=1, inplace=True)
    walk_df.dropna(inplace=True)      # TODO: Check why there are a few NaNs
    walk_df = walk_df.iloc[::n_rows_used].reset_index()    # downsample

    #walk_df.to_feather(Path(db_file.as_posix().replace('.db', '.cache.feather')))

    #walk_df.to_pickle(Path(db_file.as_posix().replace('.db', '.cache.pickle')))
    

    # TODO: Consider using .parquet as the format - else just a SQLite database
    return walk_df

In [9]:
walk_df = create_walk_datafile_for_app(db_file, 10)

DatabaseError: Execution failed on sql 'SELECT * FROM walks': no such table: walks

In [24]:
walk_df[walk_df['lat'].isna()]

NameError: name 'walk_df' is not defined

In [25]:
Path(db_file.as_posix().replace('.db', '.cache.feather'))

Path('emmaus_walking.cache.feather')

In [18]:
try:
    walk_df = pd.read_feather(Path(db_file.as_posix().replace('.db', '.cache.feather')))

SyntaxError: unexpected EOF while parsing (2088894046.py, line 2)

In [35]:
walk_df.info()

NameError: name 'walk_df' is not defined

In [36]:
walk_df['WalkName'].unique()

NameError: name 'walk_df' is not defined