# Database - functions for data back-end / manipulations

This is using an alternate approach:
  - Export all of my Apple Health (HealthKit) data from the Health app to `export.zip`
  - Convert this to a SQLite database using `healthkit-to-sqlite`
 
  Queries can then be run against this database to build the cache file (or possibly a smaller custom SQLite file).

### TODO
* This is still a work in progress
* Then cache this data - maybe try another (small) sqlite db for the caching (instead of Parquet/Feather)


In [1]:
import pandas as pd
import datetime as dt
from pathlib import Path
import subprocess
import pendulum
from sqlite_utils import Database
import reverse_geocoder as rg


#### Exporting HealthKit data / creating SQLite DB

First export HealthKit data using the Health app - select your profile icon from the top-right of the main screen and then select **Export All Health Data** (this can take some time to create the `export.zip` file).

The archive can be converted to a SQLite database using the following command:

`healthkit-to-sqlite export.zip healthkit_db.sqlite`

which requires `healthkit-to-sqlite` to be installed (note it is one of the requirements).

In [2]:
# Folder that holds the archive produced by the Health app export.
HEALTHKIT_DATA_PATH = "/Users/mjboothaus/icloud/Data/apple_health_export"
# Full path of the export archive inside that folder.
export_zip = Path(HEALTHKIT_DATA_PATH, "export.zip")


In [3]:
# !ls $HEALTHKIT_DATA_PATH


In [4]:
def convert_healthkit_export_to_sqlite(export_zip):
    """Convert a Health app export archive into a date-stamped SQLite database.

    Runs the ``healthkit-to-sqlite`` command-line tool on ``export_zip``,
    writing ``healthkit_db.sqlite`` next to the archive. After a successful
    conversion both the archive and the database are renamed with a
    ``_YYYY_MM_DD`` suffix taken from the archive's ``st_ctime``.

    Args:
        export_zip: ``pathlib.Path`` of the export archive.

    Returns:
        ``(db_file_with_date, sp_output)`` where ``db_file_with_date`` is the
        POSIX path string of the renamed database and ``sp_output`` is the
        captured standard output (bytes) of the conversion tool. When the
        archive does not exist, returns ``(None, "<path>: not found")``.

    Raises:
        FileNotFoundError: if the ``healthkit-to-sqlite`` executable cannot be
            found, or the tool reported success but produced no database.
        subprocess.CalledProcessError: if the tool exits with a non-zero status.
    """
    zip_file = export_zip.as_posix()
    if not export_zip.exists():
        print(zip_file, ": not found")
        return None, f"{zip_file}: not found"

    # NOTE: st_ctime is the metadata-change time on Unix (creation time on
    # Windows); it is used here only to build the date suffix.
    date_tag = dt.datetime.fromtimestamp(export_zip.stat().st_ctime).strftime(
        "%Y_%m_%d"
    )

    # Build the output path from the archive's directory rather than by
    # replacing text in the whole path, so an archive with a different name can
    # never resolve to itself and be deleted below.
    db_path = export_zip.with_name("healthkit_db.sqlite")
    if db_path.exists():
        db_path.unlink()
    db_file = db_path.as_posix()

    # Argument list (no shell) so paths containing spaces or shell
    # metacharacters are passed through unchanged.
    sp_cmd = ["healthkit-to-sqlite", zip_file, db_file]
    print(" ".join(sp_cmd))
    print(
        "---------------------------------------------------------------------------------------------"
    )
    print(
        "Please wait: converting healthkit export.zip to sqlite database (takes just over a minute)..."
    )

    # check=True: stop before any renaming if the conversion failed.
    completed = subprocess.run(sp_cmd, stdout=subprocess.PIPE, check=True)
    if not db_path.exists():
        raise FileNotFoundError(f"{db_file}: not created by healthkit-to-sqlite")

    dated_db_path = db_path.with_name(f"{db_path.stem}_{date_tag}{db_path.suffix}")
    dated_zip_path = export_zip.with_name(
        f"{export_zip.stem}_{date_tag}{export_zip.suffix}"
    )

    export_zip.rename(dated_zip_path)
    db_path.rename(dated_db_path)

    return dated_db_path.as_posix(), completed.stdout


In [5]:
def create_df_from_sql_query_in_file(filename_dot_sql, conn, parse_dates, echo_query=False):
    """Execute the SQL held in a file and return the result as a DataFrame.

    The file is looked up as ``<parent of current working dir>/sql/<filename_dot_sql>``.

    Args:
        filename_dot_sql: name of the query file inside the ``sql`` folder.
        conn: database connection handed to ``pandas.read_sql_query``.
        parse_dates: forwarded to ``pandas.read_sql_query`` as ``parse_dates``.
        echo_query: when exactly ``True``, print the SQL text before running it.

    Returns:
        The ``pandas.DataFrame`` produced by the query.
    """
    statement = (Path.cwd().parent / "sql" / filename_dot_sql).read_text()
    if echo_query is True:
        print(statement)
    return pd.read_sql_query(statement, conn, parse_dates=parse_dates)


In [6]:
# Convert the export archive, then open the resulting SQLite database.
db_file, output = convert_healthkit_export_to_sqlite(export_zip)
if db_file is None:
    # The converter signals a missing archive with (None, "<path>: not found").
    # Fail here with that message instead of leaving `db` undefined for the
    # cells below.
    raise FileNotFoundError(output)
db = Database(db_file)


healthkit-to-sqlite /Users/mjboothaus/icloud/Data/apple_health_export/export.zip /Users/mjboothaus/icloud/Data/apple_health_export/healthkit_db.sqlite
---------------------------------------------------------------------------------------------
Please wait: converting healthkit export.zip to sqlite database (takes just over a minute)...


In [7]:
# Run the walking-workouts query; startDate/endDate are parsed as datetimes.
workouts_df = create_df_from_sql_query_in_file(
    filename_dot_sql="select_star_walking_workouts.sql",
    conn=db.conn,
    parse_dates=["startDate", "endDate"],
)


In [8]:
# Goal: find the first recorded point of each walk workout.
# Caveat: `date` is not stored as a date-typed column in the database, so it is
# unclear whether "ORDER BY date LIMIT 1" behaves as intended. Possible fixes:
# load the table into pandas, convert the types and write it back before
# querying, or change the column types with sqlite_utils.

start_point_df = create_df_from_sql_query_in_file(
    filename_dot_sql="select_start_point_workout.sql",
    conn=db.conn,
    parse_dates=["date"],
)

In [9]:
# Counterpart of the start-point query: run the finish-point query file.
finish_point_df = create_df_from_sql_query_in_file(
    filename_dot_sql="select_finish_point_workout.sql",
    conn=db.conn,
    parse_dates=["date"],
)


In [10]:
# Combine start and finish rows that share a workout_id (inner join).
workouts_summary_df = pd.merge(
    start_point_df,
    finish_point_df,
    how="inner",
    on="workout_id",
)


In [11]:
def get_location(latitude, longitude):
    """Reverse-geocode a coordinate pair.

    Looks the point up with ``reverse_geocoder.search`` and returns the
    ``name``, ``admin1`` and ``cc`` fields of the first result as a list,
    in that order.
    """
    first_match = rg.search((latitude, longitude))[0]
    return [first_match[field] for field in ("name", "admin1", "cc")]


In [12]:
# Reverse-geocode the start coordinates of every workout, one row at a time.
workouts_summary_df["start_location"] = workouts_summary_df.apply(
    lambda workout: get_location(
        latitude=float(workout["start_latitude"]),
        longitude=float(workout["start_longitude"]),
    ),
    axis="columns",
)


Loading formatted geocoded file...


In [13]:
# Reverse-geocode the finish coordinates of every workout, one row at a time.
workouts_summary_df["finish_location"] = workouts_summary_df.apply(
    lambda workout: get_location(
        latitude=float(workout["finish_latitude"]),
        longitude=float(workout["finish_longitude"]),
    ),
    axis="columns",
)


In [14]:
def calculate_elapsed_time_minutes(finish_datetime, start_datetime):
    """Return the time between two timestamp strings, expressed in hours.

    NOTE: despite the function name, the result is whole elapsed seconds
    divided by 60 twice, i.e. hours, not minutes.

    Args:
        finish_datetime: end timestamp, as a string ``pendulum.parse`` accepts.
        start_datetime: start timestamp, as a string ``pendulum.parse`` accepts.

    Returns:
        Elapsed time as a ``float``.
    """
    finished_at = pendulum.parse(finish_datetime)
    started_at = pendulum.parse(start_datetime)
    elapsed = finished_at - started_at
    whole_seconds = elapsed.in_seconds()
    return float(whole_seconds / 60 / 60)


In [15]:
# Duration of each workout, computed from its start/finish timestamp strings.
workouts_summary_df["elapsed_time_hours"] = workouts_summary_df.apply(
    lambda workout: calculate_elapsed_time_minutes(
        finish_datetime=workout["finish_datetime"],
        start_datetime=workout["start_datetime"],
    ),
    axis="columns",
)


In [16]:
# Re-render each start timestamp as a plain datetime string.
workouts_summary_df["start_datetime"] = workouts_summary_df["start_datetime"].map(
    lambda stamp: pendulum.parse(stamp, tz="Australia/Sydney").to_datetime_string()
)  # TODO: Need to convert from UTC to Sydney local time?


In [17]:
# Attach the workout columns to the summary (inner join on workout_id).
workouts_summary_df = pd.merge(
    workouts_summary_df,
    workouts_df,
    how="inner",
    on="workout_id",
)


In [18]:
# Turn the parsed startDate / endDate values into plain datetime strings.
workouts_summary_df["startDate"] = workouts_summary_df["startDate"].map(
    lambda moment: pendulum.instance(moment).to_datetime_string()
)
workouts_summary_df["endDate"] = workouts_summary_df["endDate"].map(
    lambda moment: pendulum.instance(moment).to_datetime_string()
)


In [19]:
# Write the summary table to a spreadsheet, leaving out the DataFrame index.
workouts_summary_df.to_excel(
    excel_writer="../data/workouts_summary.xlsx",
    index=False,
)


In [20]:
# Sanity check: each workout_id appears at most once in start_point_df.
# `is_unique` avoids the ZeroDivisionError the ratio form raises on an empty frame.
assert start_point_df["workout_id"].is_unique, (
    "start_point_df contains duplicate workout_id values"
)


In [21]:
def calc_walk_stats(walk_data):
    """Summarise a sequence of walk DataFrames.

    Args:
        walk_data: non-empty sequence of DataFrames, each with ``dist``,
            ``lat`` and ``lon`` columns and an index whose maximum can be
            added to a ``timedelta``.

    Returns:
        A tuple ``(total_time, total_distance, start_coord, end_coord)``:
        the sum of each frame's maximum index value, the sum of each frame's
        maximum ``dist`` divided by 1e3 (presumably metres to kilometres;
        confirm against the data), the ``[lat, lon]`` of the first row of the
        first frame, and the ``[lat, lon]`` of the last row of the last frame.
    """
    total_time = sum((walk.index.max() for walk in walk_data), dt.timedelta(0))
    total_distance = sum(walk["dist"].max() for walk in walk_data) / 1e3

    first_walk, last_walk = walk_data[0], walk_data[-1]
    start_coord = first_walk.loc[:, ["lat", "lon"]].iloc[0].tolist()
    end_coord = last_walk.loc[:, ["lat", "lon"]].iloc[-1].tolist()
    return total_time, total_distance, start_coord, end_coord


In [22]:
def create_walk_cached_data_for_app(db_file, n_rows_used=5):
    """Build a down-sampled Feather cache of the ``walks`` table.

    Reads every row of ``walks`` from the SQLite database, drops the ``dist``
    and ``speed`` columns and any rows containing NaN, keeps every
    ``n_rows_used``-th remaining row, and writes the result next to the
    database with the suffix ``.cache.feather``.

    Args:
        db_file: path of the SQLite database, as a ``str`` or ``pathlib.Path``.
        n_rows_used: sampling step; every ``n_rows_used``-th row is kept.

    Returns:
        The down-sampled ``pandas.DataFrame`` that was written to the cache.
    """
    # Accept both str and Path: the converter in this notebook returns a str.
    db_path = Path(db_file)
    db = Database(db_path)
    walk_df = pd.read_sql_query("SELECT * FROM walks", db.conn)

    UNUSED_COLUMNS = ["dist", "speed"]

    walk_df = walk_df.drop(columns=UNUSED_COLUMNS)
    walk_df = walk_df.dropna()  # TODO: Check why there are a few NaNs
    walk_df = walk_df.iloc[::n_rows_used].reset_index()  # downsample

    # Swap the real suffix instead of replacing the text ".db": for a
    # ".sqlite" file the text replacement changed nothing, so the cache would
    # have been written over the database itself.
    cache_path = db_path.with_suffix(".cache.feather")
    walk_df.to_feather(cache_path)

    return walk_df


In [23]:
# Not working yet -- this is the alternate approach to using the individual .FIT files
# walk_df = create_walk_cached_data_for_app(db_file, 10)
