### Predicting sleep pattern

In [292]:
import pandas as pd
import numpy as np
from plotnine import *

# Widen the notebook's DataFrame display limits.
# Use the documented option keys: the dotted spellings only resolved because
# pandas matches option names as regular expressions ('.' matched the '_').
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

### Reading in and cleaning the data

#### Parsing the XML data to csv

In [99]:
# -*- coding: utf-8 -*-
"""
applehealthdata.py: Extract data from Apple Health App's export.xml.
Copyright (c) 2016 Nicholas J. Radcliffe
Licence: MIT
"""
import os
import re
import sys

from xml.etree import ElementTree
from collections import Counter, OrderedDict

__version__ = '1.3'

# Column schemas for the CSV files, as ordered (attribute name, datatype) pairs.
# Datatype codes are the ones understood by format_value:
#   's' string (quoted/escaped), 'n' number, 'd' datetime.

# Attributes exported for every <Record> node.
RECORD_FIELDS = OrderedDict([
    ('sourceName', 's'),
    ('sourceVersion', 's'),
    ('device', 's'),
    ('type', 's'),
    ('unit', 's'),
    ('creationDate', 'd'),
    ('startDate', 'd'),
    ('endDate', 'd'),
    ('value', 'n'),
])

# Attributes exported for every <ActivitySummary> node.
# NOTE(review): the two appleExerciseTime fields are typed 's' while the
# sibling energy / stand-hour fields are 'n' -- confirm this is intended.
ACTIVITY_SUMMARY_FIELDS = OrderedDict([
    ('dateComponents', 'd'),
    ('activeEnergyBurned', 'n'),
    ('activeEnergyBurnedGoal', 'n'),
    ('activeEnergyBurnedUnit', 's'),
    ('appleExerciseTime', 's'),
    ('appleExerciseTimeGoal', 's'),
    ('appleStandHours', 'n'),
    ('appleStandHoursGoal', 'n'),
])

# Attributes exported for every <Workout> node.
WORKOUT_FIELDS = OrderedDict([
    ('sourceName', 's'),
    ('sourceVersion', 's'),
    ('device', 's'),
    ('creationDate', 'd'),
    ('startDate', 'd'),
    ('endDate', 'd'),
    ('workoutActivityType', 's'),
    ('duration', 'n'),
    ('durationUnit', 's'),
    ('totalDistance', 'n'),
    ('totalDistanceUnit', 's'),
    ('totalEnergyBurned', 'n'),
    ('totalEnergyBurnedUnit', 's'),
])

# Node tag -> schema used when writing that kind of node.
FIELDS = dict(
    Record=RECORD_FIELDS,
    ActivitySummary=ACTIVITY_SUMMARY_FIELDS,
    Workout=WORKOUT_FIELDS,
)


# Captures whatever follows "HK...TypeIdentifier" in a type name.
PREFIX_RE = re.compile('^HK.*TypeIdentifier(.+)$')
# Module-wide defaults for abbreviation and progress reporting.
ABBREVIATE = True
VERBOSE = True

def format_freqs(counter):
    """
    Render a counter as text, one "tag: count" line per key.
    Keys are listed in sorted order; an empty counter gives an empty string.
    """
    lines = []
    for tag in sorted(counter.keys()):
        lines.append('%s: %d' % (tag, counter[tag]))
    return '\n'.join(lines)


def format_value(value, datatype):
    """
    Turn one attribute value into its CSV cell text.
    A value of None always becomes the empty string.
    datatype selects the formatting:
        's' string: backslashes and double quotes are backslash-escaped
            and the result is wrapped in double quotes
        'n' number: returned unchanged
        'd' datetime: returned unchanged
    Any other datatype raises KeyError.
    """
    # None is checked first, so it maps to '' even for an unknown datatype.
    if value is None:
        return ''
    if datatype == 's':
        escaped = value.replace('\\', '\\\\').replace('"', '\\"')
        return '"%s"' % escaped
    if datatype in ('n', 'd'):
        return value
    raise KeyError('Unexpected format value: %s' % datatype)


def abbreviate(s, enabled=ABBREVIATE):
    """
    Shorten a verbose type name using PREFIX_RE.
    Returns the pattern's captured group when `enabled` is true and the
    pattern matches; otherwise returns `s` unchanged.
    """
    match = PREFIX_RE.match(s)
    if enabled and match:
        return match.group(1)
    return s


class HealthDataExtractor(object):
    """
    Extract health data from Apple Health App's XML export, export.xml.
    Inputs:
        path:      Relative or absolute path to export.xml
        verbose:   Set to False for less verbose output
    Outputs:
        Writes a CSV file for each record type found, in the same
        directory as the input export.xml. Reports each file written
        unless verbose has been set to False.
    """
    def __init__(self, path, verbose=VERBOSE):
        """
        Parse the export at `path`, abbreviate the Record types in place
        and collect the tag / field / record-type counts.
        """
        self.in_path = path
        self.verbose = verbose
        self.directory = os.path.abspath(os.path.split(path)[0])
        # Binary mode: the XML parser then decodes the document itself
        # instead of relying on the platform's default text encoding.
        with open(path, 'rb') as f:
            self.report('Reading data from %s . . . ' % path, end='')
            self.data = ElementTree.parse(f)
            self.report('done')
        # Public accessor rather than the private ElementTree._root.
        self.root = self.data.getroot()
        self.nodes = list(self.root)
        self.n_nodes = len(self.nodes)
        self.abbreviate_types()
        self.collect_stats()

    def report(self, msg, end='\n'):
        """
        Print `msg` (flushing stdout) when verbose output is enabled.
        """
        if self.verbose:
            print(msg, end=end)
            sys.stdout.flush()

    def count_tags_and_fields(self):
        """
        Count top-level node tags in self.tags and attribute names in
        self.fields.
        """
        self.tags = Counter()
        self.fields = Counter()
        for record in self.nodes:
            self.tags[record.tag] += 1
            for k in record.keys():
                self.fields[k] += 1

    def count_record_types(self):
        """
        Counts occurrences of each type of (conceptual) "record" in the data.
        In the case of nodes of type 'Record', this counts the number of
        occurrences of each 'type' or record in self.record_types.
        In the case of nodes of type 'ActivitySummary' and 'Workout',
        it just counts those in self.other_types.
        The slightly different handling reflects the fact that 'Record'
        nodes come in a variety of different subtypes that we want to write
        to different data files, whereas (for now) we are going to write
        all Workout entries to a single file, and all ActivitySummary
        entries to another single file.
        Metadata nodes ('Export', 'ExportDate', 'Me') are skipped silently;
        any other tag is reported as unexpected.
        """
        self.record_types = Counter()
        self.other_types = Counter()
        for record in self.nodes:
            if record.tag == 'Record':
                self.record_types[record.attrib['type']] += 1
            elif record.tag in ('ActivitySummary', 'Workout'):
                self.other_types[record.tag] += 1
            elif record.tag in ('Export', 'ExportDate', 'Me'):
                # Export metadata, not data to extract.
                pass
            else:
                self.report('Unexpected node of type %s.' % record.tag)

    def collect_stats(self):
        """
        Populate record_types, other_types, tags and fields.
        """
        self.count_record_types()
        self.count_tags_and_fields()

    def open_for_writing(self):
        """
        Open one CSV file per record type (plus one each for the other
        node kinds counted) in self.directory and write its header row.
        Handles are stored in self.handles keyed by kind and the file
        paths are recorded in self.paths.
        """
        self.handles = {}
        self.paths = []
        for kind in (list(self.record_types) + list(self.other_types)):
            path = os.path.join(self.directory, '%s.csv' % abbreviate(kind))
            # Explicit encoding so output does not depend on the locale.
            f = open(path, 'w', encoding='utf-8')
            # Register the handle first so it gets closed even if writing
            # the header fails.
            self.handles[kind] = f
            self.paths.append(path)
            headerType = (kind if kind in ('Workout', 'ActivitySummary')
                               else 'Record')
            f.write(','.join(FIELDS[headerType].keys()) + '\n')
            self.report('Opening %s for writing' % path)

    def abbreviate_types(self):
        """
        Shorten types by removing common boilerplate text.
        """
        for node in self.nodes:
            if node.tag == 'Record':
                if 'type' in node.attrib:
                    node.attrib['type'] = abbreviate(node.attrib['type'])

    def write_records(self):
        """
        Write one CSV line per Record / ActivitySummary / Workout node to
        the handle for its kind, using the schema in FIELDS for its tag.
        """
        kinds = FIELDS.keys()
        for node in self.nodes:
            if node.tag in kinds:
                attributes = node.attrib
                kind = attributes['type'] if node.tag == 'Record' else node.tag
                values = [format_value(attributes.get(field), datatype)
                          for (field, datatype) in FIELDS[node.tag].items()]
                line = ','.join(values) + '\n'
                self.handles[kind].write(line)

    def close_files(self):
        """
        Close every open output handle and report each kind as written.
        """
        for (kind, f) in self.handles.items():
            f.close()
            self.report('Written %s data.' % abbreviate(kind))

    def extract(self):
        """
        Write all CSV files: open the outputs, write the records, close.
        If opening or writing fails, the handles opened so far are closed
        (without the 'Written ...' report) before the error propagates.
        """
        try:
            self.open_for_writing()
            self.write_records()
        except BaseException:
            for f in self.handles.values():
                f.close()
            raise
        self.close_files()

    def report_stats(self):
        """
        Print the tag, field and record-type counts (regardless of verbose).
        """
        print('\nTags:\n%s\n' % format_freqs(self.tags))
        print('Fields:\n%s\n' % format_freqs(self.fields))
        print('Record types:\n%s\n' % format_freqs(self.record_types))

In [None]:
# Parse the raw Apple Health export, print the tag / field / record-type
# counts, then write the per-type CSV files into the input file's directory.
# NOTE(review): later cells read these CSVs from "XML/..." while the extractor
# writes next to 'raw_health_export.xml' -- confirm the files are moved there.
data = HealthDataExtractor(path='raw_health_export.xml')
data.report_stats()
data.extract()


Reading data from raw_health_export.xml . . . done
Unexpected node of type ExportDate.

Tags:
ActivitySummary: 900
ExportDate: 1
Me: 1
Record: 2795078
Workout: 1387

Fields:
HKCharacteristicTypeIdentifierBiologicalSex: 1
HKCharacteristicTypeIdentifierBloodType: 1
HKCharacteristicTypeIdentifierCardioFitnessMedicationsUse: 1
HKCharacteristicTypeIdentifierDateOfBirth: 1
HKCharacteristicTypeIdentifierFitzpatrickSkinType: 1
activeEnergyBurned: 900
activeEnergyBurnedGoal: 900
activeEnergyBurnedUnit: 900
appleExerciseTime: 900
appleExerciseTimeGoal: 900
appleMoveTime: 900
appleMoveTimeGoal: 900
appleStandHours: 900
appleStandHoursGoal: 900
creationDate: 2796465
dateComponents: 900
device: 2583875
duration: 1387
durationUnit: 1387
endDate: 2796465
sourceName: 2796465
sourceVersion: 2696886
startDate: 2796465
type: 2795078
unit: 2770121
value: 2794900
workoutActivityType: 1387

Record types:
ActiveEnergyBurned: 853486
AppleExerciseTime: 73715
AppleStandHour: 24515
AppleStandTime: 45524
AppleWal

#### Reading the CSV files

In [184]:
# Load the detailed training records and make the three timestamp columns
# timezone-naive (the local wall-clock time is kept).
train_detailed = pd.read_csv(
    "train_detailed.csv",
    parse_dates=['startDate', 'endDate', 'creationDate'],
).assign(**{
    column: (lambda frame, column=column: frame[column].dt.tz_localize(None))
    for column in ('startDate', 'endDate', 'creationDate')
})

In [185]:
# Train and test tables, each with its `date` column parsed as datetimes.
train, test = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in ("train.csv", "test.csv")
)

The XML export produced a large number of CSV files, all of which need to be read in. The aim is to aggregate each one to daily level, left-join everything onto the train dataset, and then proceed with modelling.

1. Read in every file, inspect what I want to do with the data (how to aggregate to daily)
2. Aggregate to daily
3. Left-join

In [186]:
# Daily total of ActiveEnergyBurned: sum `value` over all samples that start
# on the same calendar day.
active_energy = (pd.read_csv("XML/ActiveEnergyBurned.csv",
                             parse_dates=['startDate', 'endDate', 'creationDate'])
                 # Drop the timezone; the local wall-clock time is kept.
                 .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                         endDate=lambda x: x.endDate.dt.tz_localize(None),
                         creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                 .assign(date=lambda x: x['startDate'].dt.date)
                 .groupby('date', as_index=False)
                 # 'sum' instead of np.sum: same result, and avoids the
                 # pandas >= 2.1 FutureWarning for numpy callables in agg.
                 .agg(active_energy=('value', 'sum'))
                 .assign(date=lambda x: pd.to_datetime(x['date']))
                 )

active_energy.head(5)



Unnamed: 0,date,active_energy
0,2015-02-19,487.0
1,2015-02-20,1789.0
2,2015-02-21,1185.0
3,2015-02-22,2173.0
4,2015-02-23,903.0


In [187]:
# Daily total of AppleExerciseTime: sum `value` over all samples that start
# on the same calendar day.
exercise_time = (pd.read_csv("XML/AppleExerciseTime.csv",
                             parse_dates=['startDate', 'endDate', 'creationDate'])
                 # Drop the timezone; the local wall-clock time is kept.
                 .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                         endDate=lambda x: x.endDate.dt.tz_localize(None),
                         creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                 .assign(date=lambda x: x['startDate'].dt.date)
                 .groupby('date', as_index=False)
                 # 'sum' instead of np.sum: same result, and avoids the
                 # pandas >= 2.1 FutureWarning for numpy callables in agg.
                 .agg(exercise_minutes=('value', 'sum'))
                 .assign(date=lambda x: pd.to_datetime(x['date']))
                 )

exercise_time.head(5)

Unnamed: 0,date,exercise_minutes
0,2020-09-25,1
1,2020-09-26,105
2,2020-09-27,59
3,2020-09-28,118
4,2020-09-29,25


In [188]:
# Daily number of AppleStandHour records per category `value`:
# one row per day, one column per distinct value (NaN where a value
# did not occur on that day).
stand_hour = (
    pd.read_csv("XML/AppleStandHour.csv",
                parse_dates=['startDate', 'endDate', 'creationDate'])
    # Drop the timezone; the local wall-clock time is kept.
    .assign(**{
        column: (lambda frame, column=column: frame[column].dt.tz_localize(None))
        for column in ('startDate', 'endDate', 'creationDate')
    })
    .assign(date=lambda frame: frame['startDate'].dt.date)
    # Count rows per (day, value) pair and spread the values into columns.
    .groupby(['date', 'value'])
    .size()
    .unstack('value')
    .reset_index()
    .rename_axis(None, axis=1)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

stand_hour.head(5)

Unnamed: 0,date,HKCategoryValueAppleStandHourIdle,HKCategoryValueAppleStandHourStood
0,2020-09-25,,4.0
1,2020-09-26,6.0,18.0
2,2020-09-27,11.0,13.0
3,2020-09-28,7.0,24.0
4,2020-09-29,11.0,12.0


In [189]:
# Daily total of AppleStandTime: sum `value` over all samples that start
# on the same calendar day.
stand_minutes = (pd.read_csv("XML/AppleStandTime.csv",
                             parse_dates=['startDate', 'endDate', 'creationDate'])
                 # Drop the timezone; the local wall-clock time is kept.
                 .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                         endDate=lambda x: x.endDate.dt.tz_localize(None),
                         creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                 .assign(date=lambda x: x['startDate'].dt.date)
                 .groupby('date', as_index=False)
                 # 'sum' instead of np.sum: same result, and avoids the
                 # pandas >= 2.1 FutureWarning for numpy callables in agg.
                 .agg(minutes_stood=('value', 'sum'))
                 .assign(date=lambda x: pd.to_datetime(x['date']))
                 )

stand_minutes.head(5)

Unnamed: 0,date,minutes_stood
0,2020-09-25,26
1,2020-09-26,241
2,2020-09-27,119
3,2020-09-28,164
4,2020-09-29,56


In [190]:
# Daily number of AudioExposureEvent records per category `value`:
# one row per day, one column per distinct value.
audio_exposure_event = (
    pd.read_csv("XML/AudioExposureEvent.csv",
                parse_dates=['startDate', 'endDate', 'creationDate'])
    # Drop the timezone; the local wall-clock time is kept.
    .assign(**{
        column: (lambda frame, column=column: frame[column].dt.tz_localize(None))
        for column in ('startDate', 'endDate', 'creationDate')
    })
    .assign(date=lambda frame: frame['startDate'].dt.date)
    # Count rows per (day, value) pair and spread the values into columns.
    .groupby(['date', 'value'])
    .size()
    .unstack('value')
    .reset_index()
    .rename_axis(None, axis=1)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

audio_exposure_event.head(5)


Unnamed: 0,date,HKCategoryValueEnvironmentalAudioExposureEventMomentaryLimit
0,2020-09-26,1
1,2020-10-30,3
2,2020-11-26,1
3,2021-02-17,1
4,2021-02-18,2


In [191]:
# Daily total of BasalEnergyBurned: sum `value` over all samples that start
# on the same calendar day.
basal_energy = (pd.read_csv("XML/BasalEnergyBurned.csv",
                            parse_dates=['startDate', 'endDate', 'creationDate'])
                # Drop the timezone; the local wall-clock time is kept.
                .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                        endDate=lambda x: x.endDate.dt.tz_localize(None),
                        creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                .assign(date=lambda x: x['startDate'].dt.date)
                .groupby('date', as_index=False)
                # 'sum' instead of np.sum: same result, and avoids the
                # pandas >= 2.1 FutureWarning for numpy callables in agg.
                .agg(basal_energy=('value', 'sum'))
                .assign(date=lambda x: pd.to_datetime(x['date']))
                )

basal_energy.head(5)



Unnamed: 0,date,basal_energy
0,2012-04-20,1913.0
1,2012-04-21,1913.0
2,2012-04-22,1913.0
3,2012-04-23,1913.0
4,2012-04-24,1913.0


In [192]:
# Daily BodyMass: mean of `value` over all samples that start on the same
# calendar day.
body_mass = (pd.read_csv("XML/BodyMass.csv",
                         parse_dates=['startDate', 'endDate', 'creationDate'])
             # Drop the timezone; the local wall-clock time is kept.
             .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                     endDate=lambda x: x.endDate.dt.tz_localize(None),
                     creationDate=lambda x: x.creationDate.dt.tz_localize(None))
             .assign(date=lambda x: x['startDate'].dt.date)
             .groupby('date', as_index=False)
             # 'mean' instead of np.mean: same result, and avoids the
             # pandas >= 2.1 FutureWarning for numpy callables in agg.
             .agg(body_mass=('value', 'mean'))
             .assign(date=lambda x: pd.to_datetime(x['date']))
             )

body_mass.head(5)

Unnamed: 0,date,body_mass
0,2012-04-20,188.799
1,2012-04-21,188.799
2,2012-04-22,188.799
3,2012-04-23,188.799
4,2012-04-24,188.799


In [193]:
# Daily BodyMassIndex: mean of `value` over all samples that start on the
# same calendar day.
body_mass_index = (pd.read_csv("XML/BodyMassIndex.csv",
                               parse_dates=['startDate', 'endDate', 'creationDate'])
                   # Drop the timezone; the local wall-clock time is kept.
                   .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                           endDate=lambda x: x.endDate.dt.tz_localize(None),
                           creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                   .assign(date=lambda x: x['startDate'].dt.date)
                   .groupby('date', as_index=False)
                   # 'mean' instead of np.mean: same result, and avoids the
                   # pandas >= 2.1 FutureWarning for numpy callables in agg.
                   .agg(body_mass_index=('value', 'mean'))
                   .assign(date=lambda x: pd.to_datetime(x['date']))
                   )

body_mass_index.head(5)

Unnamed: 0,date,body_mass_index
0,2012-04-20,23.5981
1,2012-04-21,23.5981
2,2012-04-22,23.5981
3,2012-04-23,23.5981
4,2012-04-24,23.5981


In [194]:
# Daily total of DistanceWalkingRunning: sum `value` over all samples that
# start on the same calendar day.
distance_walking_running = (pd.read_csv("XML/DistanceWalkingRunning.csv",
                                        parse_dates=['startDate', 'endDate', 'creationDate'])
                            # Drop the timezone; the local wall-clock time is kept.
                            .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                                    endDate=lambda x: x.endDate.dt.tz_localize(None),
                                    creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                            .assign(date=lambda x: x['startDate'].dt.date)
                            .groupby('date', as_index=False)
                            # 'sum' instead of np.sum: same result, and avoids the
                            # pandas >= 2.1 FutureWarning for numpy callables in agg.
                            .agg(distance_walking_running=('value', 'sum'))
                            .assign(date=lambda x: pd.to_datetime(x['date']))
                            )

distance_walking_running.head(5)

Unnamed: 0,date,distance_walking_running
0,2015-02-18,0.019188
1,2015-02-19,1.31187
2,2015-02-20,7.16791
3,2015-02-21,3.82391
4,2015-02-22,8.19599


In [200]:
# Daily EnvironmentalAudioExposure summary: min / mean / median / max of
# `value` over all samples that start on the same calendar day.
env_audio_exposure = (pd.read_csv("XML/EnvironmentalAudioExposure.csv",
                                  parse_dates=['startDate', 'endDate', 'creationDate'])
                      # Drop the timezone; the local wall-clock time is kept.
                      .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                              endDate=lambda x: x.endDate.dt.tz_localize(None),
                              creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                      .assign(date=lambda x: x['startDate'].dt.date)
                      .groupby('date', as_index=False)
                      # String aliases instead of np.min/np.mean/np.median/np.max:
                      # same results, no pandas >= 2.1 FutureWarning.
                      .agg(env_audio_exposure_min=('value', 'min'),
                           env_audio_exposure_mean=('value', 'mean'),
                           env_audio_exposure_median=('value', 'median'),
                           env_audio_exposure_max=('value', 'max'))
                      .assign(date=lambda x: pd.to_datetime(x['date']))
                      )

env_audio_exposure.head(5)

Unnamed: 0,date,env_audio_exposure_min,env_audio_exposure_mean,env_audio_exposure_median,env_audio_exposure_max
0,2020-09-26,40.5313,71.283803,71.5018,91.9333
1,2020-09-27,44.8399,61.7274,63.58785,81.5414
2,2020-09-28,44.2695,59.369372,59.741,78.0796
3,2020-09-29,48.3998,61.768209,62.67415,77.3387
4,2020-09-30,50.3467,62.081132,62.5881,85.8944


In [204]:
# Daily total of FlightsClimbed: sum `value` over all samples that start on
# the same calendar day.
flights_climbed = (pd.read_csv("XML/FlightsClimbed.csv",
                               parse_dates=['startDate', 'endDate', 'creationDate'])
                   # Drop the timezone; the local wall-clock time is kept.
                   .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                           endDate=lambda x: x.endDate.dt.tz_localize(None),
                           creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                   .assign(date=lambda x: x['startDate'].dt.date)
                   .groupby('date', as_index=False)
                   # 'sum' instead of np.sum: same result, and avoids the
                   # pandas >= 2.1 FutureWarning for numpy callables in agg.
                   .agg(flights_climbed=('value', 'sum'))
                   .assign(date=lambda x: pd.to_datetime(x['date']))
                   )

flights_climbed.head(5)

Unnamed: 0,date,flights_climbed
0,2015-04-19,8
1,2015-04-20,9
2,2015-04-21,5
3,2015-04-22,15
4,2015-04-23,23


In [212]:
# Daily HeadphoneAudioExposure summary: min / mean / median / max of `value`
# over all samples that start on the same calendar day.
headphone_exposure = (pd.read_csv("XML/HeadphoneAudioExposure.csv",
                                  parse_dates=['startDate', 'endDate', 'creationDate'])
                      # Drop the timezone; the local wall-clock time is kept.
                      .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                              endDate=lambda x: x.endDate.dt.tz_localize(None),
                              creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                      .assign(date=lambda x: x['startDate'].dt.date)
                      .groupby('date', as_index=False)
                      # String aliases instead of np.min/np.mean/np.median/np.max:
                      # same results, no pandas >= 2.1 FutureWarning.
                      .agg(headphone_audio_exposure_min=('value', 'min'),
                           headphone_audio_exposure_mean=('value', 'mean'),
                           headphone_audio_exposure_median=('value', 'median'),
                           headphone_audio_exposure_max=('value', 'max'))
                      .assign(date=lambda x: pd.to_datetime(x['date']))
                      )

headphone_exposure.head(5)


Unnamed: 0,date,headphone_audio_exposure_min,headphone_audio_exposure_mean,headphone_audio_exposure_median,headphone_audio_exposure_max
0,2020-06-10,3.29836,44.117424,46.4057,71.4634
1,2020-06-11,22.3296,56.697617,63.4311,69.2495
2,2020-06-15,29.5991,77.771241,84.2212,86.5065
3,2020-06-22,44.6483,59.629152,57.0473,80.0172
4,2020-06-23,25.586,47.8225,51.1487,72.8219


In [216]:
# Daily HeartRate summary: min / mean / median / max of `value` over all
# samples that start on the same calendar day.
heartrate = (pd.read_csv("XML/HeartRate.csv",
                         parse_dates=['startDate', 'endDate', 'creationDate'])
             # Drop the timezone; the local wall-clock time is kept.
             .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                     endDate=lambda x: x.endDate.dt.tz_localize(None),
                     creationDate=lambda x: x.creationDate.dt.tz_localize(None))
             .assign(date=lambda x: x['startDate'].dt.date)
             .groupby('date', as_index=False)
             # String aliases instead of np.min/np.mean/np.median/np.max:
             # same results, no pandas >= 2.1 FutureWarning.
             .agg(heartrate_min=('value', 'min'),
                  heartrate_mean=('value', 'mean'),
                  heartrate_median=('value', 'median'),
                  heartrate_max=('value', 'max'))
             .assign(date=lambda x: pd.to_datetime(x['date']))
             )

heartrate.head(5)




Unnamed: 0,date,heartrate_min,heartrate_mean,heartrate_median,heartrate_max
0,2015-04-20,73.0,115.602243,114.0,202.0
1,2015-04-21,22.0,130.74151,135.0,201.0
2,2015-04-22,61.0,122.71836,123.0,158.0
3,2015-04-23,77.0,155.503007,159.0,181.0
4,2015-04-24,73.0,125.893857,124.0,166.0


In [233]:
# Daily HeartRateVariabilitySDNN summary: min / mean / median / max of
# `value` over all samples that start on the same calendar day.
heartrate_variability = (pd.read_csv("XML/HeartRateVariabilitySDNN.csv",
                                     parse_dates=['startDate', 'endDate', 'creationDate'])
                         # Drop the timezone; the local wall-clock time is kept.
                         .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                                 endDate=lambda x: x.endDate.dt.tz_localize(None),
                                 creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                         .assign(date=lambda x: x['startDate'].dt.date)
                         .groupby('date', as_index=False)
                         # String aliases instead of np.min/np.mean/np.median/np.max:
                         # same results, no pandas >= 2.1 FutureWarning.
                         .agg(heartrate_var_min=('value', 'min'),
                              heartrate_var_mean=('value', 'mean'),
                              heartrate_var_median=('value', 'median'),
                              heartrate_var_max=('value', 'max'))
                         .assign(date=lambda x: pd.to_datetime(x['date']))
                         )

heartrate_variability.head(5)

Unnamed: 0,date,heartrate_var_min,heartrate_var_mean,heartrate_var_median,heartrate_var_max
0,2020-09-25,59.7688,78.67445,78.67445,97.5801
1,2020-09-26,45.8612,56.330725,53.94875,71.5642
2,2020-09-27,40.6202,61.245689,52.7636,100.46
3,2020-09-28,44.448,72.40994,46.9947,144.175
4,2020-09-29,37.7384,50.9306,53.67525,63.6494


In [242]:
# Daily OxygenSaturation summary: min / mean / median / max of `value` over
# all samples that start on the same calendar day.
oxygen_saturation = (pd.read_csv("XML/OxygenSaturation.csv",
                                 parse_dates=['startDate', 'endDate', 'creationDate'])
                     # Drop the timezone; the local wall-clock time is kept.
                     .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                             endDate=lambda x: x.endDate.dt.tz_localize(None),
                             creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                     .assign(date=lambda x: x['startDate'].dt.date)
                     .groupby('date', as_index=False)
                     # String aliases instead of np.min/np.mean/np.median/np.max:
                     # same results, no pandas >= 2.1 FutureWarning.
                     .agg(oxygen_saturation_min=('value', 'min'),
                          oxygen_saturation_mean=('value', 'mean'),
                          oxygen_saturation_median=('value', 'median'),
                          oxygen_saturation_max=('value', 'max'))
                     .assign(date=lambda x: pd.to_datetime(x['date']))
                     )

oxygen_saturation.head(5)

Unnamed: 0,date,oxygen_saturation_min,oxygen_saturation_mean,oxygen_saturation_median,oxygen_saturation_max
0,2020-09-25,0.99,0.99,0.99,0.99
1,2020-09-26,0.94,0.970667,0.96,1.0
2,2020-09-27,0.94,0.974737,0.97,1.0
3,2020-09-28,0.95,0.97,0.97,1.0
4,2020-09-29,0.95,0.969375,0.97,0.99


In [244]:
# Daily RespiratoryRate summary: min / mean / median / max of `value` over
# all samples that start on the same calendar day.
respiratory_rate = (pd.read_csv("XML/RespiratoryRate.csv",
                                parse_dates=['startDate', 'endDate', 'creationDate'])
                    # Drop the timezone; the local wall-clock time is kept.
                    .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                            endDate=lambda x: x.endDate.dt.tz_localize(None),
                            creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                    .assign(date=lambda x: x['startDate'].dt.date)
                    .groupby('date', as_index=False)
                    # String aliases instead of np.min/np.mean/np.median/np.max:
                    # same results, no pandas >= 2.1 FutureWarning.
                    .agg(respiratory_rate_min=('value', 'min'),
                         respiratory_rate_mean=('value', 'mean'),
                         respiratory_rate_median=('value', 'median'),
                         respiratory_rate_max=('value', 'max'))
                    .assign(date=lambda x: pd.to_datetime(x['date']))
                    )

respiratory_rate.head(5)

Unnamed: 0,date,respiratory_rate_min,respiratory_rate_mean,respiratory_rate_median,respiratory_rate_max
0,2021-10-25,17.0,20.75,20.75,24.5
1,2021-10-26,14.0,16.855769,17.0,22.5
2,2021-10-27,13.5,16.981818,16.5,25.0
3,2021-10-28,15.0,17.209302,16.5,24.5
4,2021-10-29,13.5,16.385965,16.0,23.0


In [248]:
# Daily RestingHeartRate: mean of `value` over all samples that start on the
# same calendar day.
resting_heartrate = (pd.read_csv("XML/RestingHeartRate.csv",
                                 parse_dates=['startDate', 'endDate', 'creationDate'])
                     # Drop the timezone; the local wall-clock time is kept.
                     .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                             endDate=lambda x: x.endDate.dt.tz_localize(None),
                             creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                     .assign(date=lambda x: x['startDate'].dt.date)
                     .groupby('date', as_index=False)
                     # 'mean' instead of np.mean: same result, and avoids the
                     # pandas >= 2.1 FutureWarning for numpy callables in agg.
                     .agg(resting_heartrate=('value', 'mean'))
                     .assign(date=lambda x: pd.to_datetime(x['date']))
                     )

resting_heartrate.head(5)

Unnamed: 0,date,resting_heartrate
0,2020-09-25,56.0
1,2020-09-26,56.0
2,2020-09-27,53.0
3,2020-09-28,54.0
4,2020-09-29,52.0


In [252]:
# Daily StairAscentSpeed summary: min / mean / median / max of `value` over
# all samples that start on the same calendar day.
stair_ascent_speed = (pd.read_csv("XML/StairAscentSpeed.csv",
                                  parse_dates=['startDate', 'endDate', 'creationDate'])
                      # Drop the timezone; the local wall-clock time is kept.
                      .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                              endDate=lambda x: x.endDate.dt.tz_localize(None),
                              creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                      .assign(date=lambda x: x['startDate'].dt.date)
                      .groupby('date', as_index=False)
                      # String aliases instead of np.min/np.mean/np.median/np.max:
                      # same results, no pandas >= 2.1 FutureWarning.
                      .agg(stair_ascent_speed_min=('value', 'min'),
                           stair_ascent_speed_mean=('value', 'mean'),
                           stair_ascent_speed_median=('value', 'median'),
                           stair_ascent_speed_max=('value', 'max'))
                      .assign(date=lambda x: pd.to_datetime(x['date']))
                      )

stair_ascent_speed.head(5)

Unnamed: 0,date,stair_ascent_speed_min,stair_ascent_speed_mean,stair_ascent_speed_median,stair_ascent_speed_max
0,2020-09-25,1.17239,1.37114,1.37114,1.56989
1,2020-09-26,0.754373,1.236808,1.09045,2.17955
2,2020-09-27,1.00399,1.42055,1.42055,1.83711
3,2020-09-28,0.708712,0.968418,0.905707,1.99061
4,2020-09-29,1.05519,1.380167,1.52411,1.5612


In [254]:
# Daily StairDescentSpeed summary: min / mean / median / max of `value` over
# all samples that start on the same calendar day.
stair_descent_speed = (pd.read_csv("XML/StairDescentSpeed.csv",
                                   parse_dates=['startDate', 'endDate', 'creationDate'])
                       # Drop the timezone; the local wall-clock time is kept.
                       .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                               endDate=lambda x: x.endDate.dt.tz_localize(None),
                               creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                       .assign(date=lambda x: x['startDate'].dt.date)
                       .groupby('date', as_index=False)
                       # String aliases instead of np.min/np.mean/np.median/np.max:
                       # same results, no pandas >= 2.1 FutureWarning.
                       .agg(stair_descent_speed_min=('value', 'min'),
                            stair_descent_speed_mean=('value', 'mean'),
                            stair_descent_speed_median=('value', 'median'),
                            stair_descent_speed_max=('value', 'max'))
                       .assign(date=lambda x: pd.to_datetime(x['date']))
                       )

stair_descent_speed.head(5)

Unnamed: 0,date,stair_descent_speed_min,stair_descent_speed_mean,stair_descent_speed_median,stair_descent_speed_max
0,2020-09-26,1.45482,1.767678,1.721255,2.17338
1,2020-09-27,0.689655,1.099932,0.832439,2.05139
2,2020-09-28,0.670295,1.149277,1.05817,2.24115
3,2020-09-29,1.61614,1.945814,1.86357,2.52648
4,2020-09-30,1.85761,1.85761,1.85761,1.85761


In [256]:
# Daily total of StepCount: sum `value` over all samples that start on the
# same calendar day.
step_count = (pd.read_csv("XML/StepCount.csv",
                          parse_dates=['startDate', 'endDate', 'creationDate'])
              # Drop the timezone; the local wall-clock time is kept.
              .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                      endDate=lambda x: x.endDate.dt.tz_localize(None),
                      creationDate=lambda x: x.creationDate.dt.tz_localize(None))
              .assign(date=lambda x: x['startDate'].dt.date)
              .groupby('date', as_index=False)
              # 'sum' instead of np.sum: same result, and avoids the
              # pandas >= 2.1 FutureWarning for numpy callables in agg.
              .agg(step_count=('value', 'sum'))
              .assign(date=lambda x: pd.to_datetime(x['date']))
              )

step_count.head(5)

Unnamed: 0,date,step_count
0,2015-02-18,44
1,2015-02-19,2669
2,2015-02-20,14552
3,2015-02-21,7769
4,2015-02-22,16675


In [260]:
# Daily VO2Max: mean of `value` over all samples that start on the same
# calendar day.
vo2_max = (pd.read_csv("XML/VO2Max.csv",
                       parse_dates=['startDate', 'endDate', 'creationDate'])
           # Drop the timezone; the local wall-clock time is kept.
           .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                   endDate=lambda x: x.endDate.dt.tz_localize(None),
                   creationDate=lambda x: x.creationDate.dt.tz_localize(None))
           .assign(date=lambda x: x['startDate'].dt.date)
           .groupby('date', as_index=False)
           # 'mean' instead of np.mean: same result, and avoids the
           # pandas >= 2.1 FutureWarning for numpy callables in agg.
           .agg(vo2_max=('value', 'mean'))
           .assign(date=lambda x: pd.to_datetime(x['date']))
           )

vo2_max.head(5)

Unnamed: 0,date,vo2_max
0,2020-09-30,46.64
1,2020-10-01,42.94
2,2020-10-05,42.94
3,2020-10-06,42.99
4,2020-10-09,43.72


In [263]:
# Daily WalkingAsymmetryPercentage summary: min / mean / median / max of
# `value` over all samples that start on the same calendar day.
walking_asymmetry = (pd.read_csv("XML/WalkingAsymmetryPercentage.csv",
                                 parse_dates=['startDate', 'endDate', 'creationDate'])
                     # Drop the timezone; the local wall-clock time is kept.
                     .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                             endDate=lambda x: x.endDate.dt.tz_localize(None),
                             creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                     .assign(date=lambda x: x['startDate'].dt.date)
                     .groupby('date', as_index=False)
                     # String aliases instead of np.min/np.mean/np.median/np.max:
                     # same results, no pandas >= 2.1 FutureWarning.
                     .agg(walking_asymmetry_min=('value', 'min'),
                          walking_asymmetry_mean=('value', 'mean'),
                          walking_asymmetry_median=('value', 'median'),
                          walking_asymmetry_max=('value', 'max'))
                     .assign(date=lambda x: pd.to_datetime(x['date']))
                     )

walking_asymmetry.head(5)

Unnamed: 0,date,walking_asymmetry_min,walking_asymmetry_mean,walking_asymmetry_median,walking_asymmetry_max
0,2020-01-10,0.0,0.0,0.0,0.0
1,2020-01-11,0.0,0.003043,0.0,0.07
2,2020-01-12,0.0,0.01,0.0,0.07
3,2020-02-10,0.0,0.025,0.0,0.1
4,2020-02-11,0.0,0.0,0.0,0.0


In [265]:
# Daily WalkingDoubleSupportPercentage summary: min / mean / median / max of
# `value` over all samples that start on the same calendar day.
walking_doublesupport = (pd.read_csv("XML/WalkingDoubleSupportPercentage.csv",
                                     parse_dates=['startDate', 'endDate', 'creationDate'])
                         # Drop the timezone; the local wall-clock time is kept.
                         .assign(startDate=lambda x: x.startDate.dt.tz_localize(None),
                                 endDate=lambda x: x.endDate.dt.tz_localize(None),
                                 creationDate=lambda x: x.creationDate.dt.tz_localize(None))
                         .assign(date=lambda x: x['startDate'].dt.date)
                         .groupby('date', as_index=False)
                         # String aliases instead of np.min/np.mean/np.median/np.max:
                         # same results, no pandas >= 2.1 FutureWarning.
                         .agg(walking_support_min=('value', 'min'),
                              walking_support_mean=('value', 'mean'),
                              walking_support_median=('value', 'median'),
                              walking_support_max=('value', 'max'))
                         .assign(date=lambda x: pd.to_datetime(x['date']))
                         )

walking_doublesupport.head(5)

Unnamed: 0,date,walking_support_min,walking_support_mean,walking_support_median,walking_support_max
0,2020-09-25,0.293,0.293,0.293,0.293
1,2020-09-26,0.245,0.294344,0.294,0.338
2,2020-09-27,0.261,0.292441,0.291,0.359
3,2020-09-28,0.18,0.287185,0.287,0.344
4,2020-09-29,0.278,0.300857,0.303,0.325


In [269]:
# Daily walking heart-rate feature: one row per calendar day holding the mean
# of that day's `value` readings.
walking_heartrate = (
    # Only startDate is needed to build the daily key, so it is the only
    # column parsed as a date.
    pd.read_csv("XML/WalkingHeartRateAverage.csv", parse_dates=['startDate'])
    # Strip any timezone info first so the calendar date comes from the
    # recorded wall-clock time.
    .assign(date=lambda x: x['startDate'].dt.tz_localize(None).dt.date)
    .groupby('date', as_index=False)
    # String alias rather than np.mean: passing NumPy callables to agg is
    # deprecated in recent pandas.
    .agg(walking_heartrate_mean=('value', 'mean'))
    # Convert the date objects back to datetime64 for the later join on `date`.
    .assign(date=lambda x: pd.to_datetime(x['date']))
)

walking_heartrate.head(5)

Unnamed: 0,date,walking_heartrate_mean
0,2020-09-26,125.0
1,2020-09-27,100.0
2,2020-09-28,93.0
3,2020-09-29,100.5
4,2020-09-30,107.0


In [270]:
# Daily walking-speed features: one row per calendar day holding the
# min / mean / median / max of that day's `value` readings.
walking_speed = (
    # Only startDate is needed to build the daily key, so it is the only
    # column parsed as a date.
    pd.read_csv("XML/WalkingSpeed.csv", parse_dates=['startDate'])
    # Strip any timezone info first so the calendar date comes from the
    # recorded wall-clock time.
    .assign(date=lambda x: x['startDate'].dt.tz_localize(None).dt.date)
    .groupby('date', as_index=False)
    # String aliases rather than np.min / np.mean / ...: passing NumPy
    # callables to agg is deprecated in recent pandas, and the aliases pin
    # the groupby implementations explicitly.
    .agg(walking_speed_min=('value', 'min'),
         walking_speed_mean=('value', 'mean'),
         walking_speed_median=('value', 'median'),
         walking_speed_max=('value', 'max'))
    # Convert the date objects back to datetime64 for the later join on `date`.
    .assign(date=lambda x: pd.to_datetime(x['date']))
)

walking_speed.head(5)

Unnamed: 0,date,walking_speed_min,walking_speed_mean,walking_speed_median,walking_speed_max
0,2020-09-25,2.34878,2.34878,2.34878,2.34878
1,2020-09-26,1.6777,2.772081,2.7738,3.98175
2,2020-09-27,1.83429,2.867007,2.86328,3.53436
3,2020-09-28,1.78955,2.97134,2.88565,5.01074
4,2020-09-29,2.25931,2.614021,2.55011,2.93039


In [271]:
# Daily walking step-length features: one row per calendar day holding the
# min / mean / median / max of that day's `value` readings.
walking_steplength = (
    # Only startDate is needed to build the daily key, so it is the only
    # column parsed as a date.
    pd.read_csv("XML/WalkingStepLength.csv", parse_dates=['startDate'])
    # Strip any timezone info first so the calendar date comes from the
    # recorded wall-clock time.
    .assign(date=lambda x: x['startDate'].dt.tz_localize(None).dt.date)
    .groupby('date', as_index=False)
    # String aliases rather than np.min / np.mean / ...: passing NumPy
    # callables to agg is deprecated in recent pandas, and the aliases pin
    # the groupby implementations explicitly.
    .agg(walking_steplength_min=('value', 'min'),
         walking_steplength_mean=('value', 'mean'),
         walking_steplength_median=('value', 'median'),
         walking_steplength_max=('value', 'max'))
    # Convert the date objects back to datetime64 for the later join on `date`.
    .assign(date=lambda x: pd.to_datetime(x['date']))
)

walking_steplength.head(5)

Unnamed: 0,date,walking_steplength_min,walking_steplength_mean,walking_steplength_median,walking_steplength_max
0,2020-09-25,23.2283,23.2283,23.2283,23.2283
1,2020-09-26,20.0787,28.073909,29.5276,35.8268
2,2020-09-27,18.8976,29.291362,29.72445,35.4331
3,2020-09-28,18.8976,29.636615,28.7402,53.1496
4,2020-09-29,25.9843,28.177771,27.5591,30.7087


Concatenate train and test, tagging each row with an `id` column, so the feature tables only need to be left-joined once:

In [277]:
# Preview: stack both splits, tagging every row with the split it came from.
pd.concat(
    objs=[
        train.assign(id='train'),
        test.assign(id='test'),
    ]
)

Unnamed: 0,date,sleep_hours,id
0,2015-02-19,6.400000,train
1,2015-02-20,7.583333,train
2,2015-02-21,6.350000,train
3,2015-02-22,6.500000,train
4,2015-02-23,8.916667,train
...,...,...,...
414,2023-03-12,1.000000,test
415,2023-03-13,1.000000,test
416,2023-03-14,1.000000,test
417,2023-03-15,1.000000,test


In [279]:
# Daily feature tables to attach, listed in the order their columns should
# appear in the combined frame.
# NOTE(review): assumes each table has at most one row per `date`; otherwise
# the left join below would duplicate train/test rows. Confirm for the tables
# built in earlier cells.
feature_frames = [
    active_energy,
    exercise_time,
    stand_hour,
    audio_exposure_event,
    stand_minutes,
    basal_energy,
    body_mass,
    body_mass_index,
    distance_walking_running,
    env_audio_exposure,
    flights_climbed,
    headphone_exposure,
    heartrate,
    heartrate_variability,
    oxygen_saturation,
    respiratory_rate,
    resting_heartrate,
    stair_ascent_speed,
    stair_descent_speed,
    step_count,
    vo2_max,
    walking_asymmetry,
    walking_doublesupport,
    walking_heartrate,
    walking_speed,
    walking_steplength,
]

# Stack both splits, tagged with `id` so they can be separated again afterwards.
data_prepped = pd.concat([train.assign(id='train'), test.assign(id='test')])

# Left joins keep every train/test row; dates a feature table does not cover
# end up as NaN in that table's columns. Each merge returns a new frame, so no
# explicit copy is needed at the end.
for feature_frame in feature_frames:
    data_prepped = data_prepped.merge(feature_frame, how='left', on='date')

data_prepped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2773 entries, 0 to 2772
Data columns (total 66 columns):
 #   Column                                                        Non-Null Count  Dtype         
---  ------                                                        --------------  -----         
 0   date                                                          2773 non-null   datetime64[ns]
 1   sleep_hours                                                   2773 non-null   float64       
 2   id                                                            2773 non-null   object        
 3   active_energy                                                 1472 non-null   float64       
 4   exercise_minutes                                              871 non-null    float64       
 5   HKCategoryValueAppleStandHourIdle                             870 non-null    float64       
 6   HKCategoryValueAppleStandHourStood                            871 non-null    float64       
 7   HKCate

Separating back into train and test:

In [287]:
# Recover the two splits from the `id` tag, then drop the tag and renumber the
# rows from 0.
# NOTE(review): this rebinds `train` and `test`, which are also the inputs of
# the cell that builds `data_prepped`; re-running that cell afterwards would
# start from these already-joined frames.
train = (
    data_prepped[data_prepped['id'] == 'train']
    .drop(columns='id')
    .reset_index(drop=True)
)

test = (
    data_prepped[data_prepped['id'] == 'test']
    .drop(columns='id')
    .reset_index(drop=True)
)

In [288]:
# Display the rebuilt test split to eyeball the joined feature columns.
test

Unnamed: 0,date,sleep_hours,active_energy,exercise_minutes,HKCategoryValueAppleStandHourIdle,HKCategoryValueAppleStandHourStood,HKCategoryValueEnvironmentalAudioExposureEventMomentaryLimit,minutes_stood,basal_energy,body_mass,body_mass_index,distance_walking_running,env_audio_exposure_min,env_audio_exposure_mean,env_audio_exposure_median,env_audio_exposure_max,flights_climbed,headphone_audio_exposure_min,headphone_audio_exposure_mean,headphone_audio_exposure_median,headphone_audio_exposure_max,heartrate_min,heartrate_mean,heartrate_median,heartrate_max,heartrate_var_min,heartrate_var_mean,heartrate_var_median,heartrate_var_max,oxygen_saturation_min,oxygen_saturation_mean,oxygen_saturation_median,oxygen_saturation_max,respiratory_rate_min,respiratory_rate_mean,respiratory_rate_median,respiratory_rate_max,resting_heartrate,stair_ascent_speed_min,stair_ascent_speed_mean,stair_ascent_speed_median,stair_ascent_speed_max,stair_descent_speed_min,stair_descent_speed_mean,stair_descent_speed_median,stair_descent_speed_max,step_count,vo2_max,walking_asymmetry_min,walking_asymmetry_mean,walking_asymmetry_median,walking_asymmetry_max,walking_support_min,walking_support_mean,walking_support_median,walking_support_max,walking_heartrate_mean,walking_speed_min,walking_speed_mean,walking_speed_median,walking_speed_max,walking_steplength_min,walking_steplength_mean,walking_steplength_median,walking_steplength_max
0,2022-01-01,1.0,1802.449,104.0,6.0,33.0,,128.0,2089.326,,,16.686231,46.0920,65.053287,67.01795,78.8768,43.0,0.0000,56.030966,67.87115,88.6707,42.0,75.768965,67.0,169.000,49.9237,62.938750,63.02290,76.2496,0.90,0.967500,0.980,1.00,16.0,18.220930,18.00,25.0,53.0,0.710106,1.083547,1.186330,1.28830,0.675453,1.078890,0.995578,2.21127,20193,48.62,0.0,0.000000,0.0,0.00,0.236,0.268172,0.2640,0.310,95.0,2.10272,3.257710,3.489620,3.93701,19.6850,30.539961,31.88980,35.8268
1,2022-01-02,1.0,2982.172,177.0,11.0,24.0,,132.0,2025.210,192.8,,27.026786,39.5644,62.348590,65.35100,80.7179,71.0,0.0000,70.113413,78.77610,93.0875,47.0,75.117756,65.0,167.903,27.7781,39.378067,39.59940,50.8589,0.94,0.969167,0.965,1.00,16.0,18.216216,17.50,27.5,53.0,0.767510,1.158289,1.088730,1.71273,0.718094,1.344225,1.089250,3.73039,25412,48.74,0.0,0.000000,0.0,0.00,0.237,0.264552,0.2600,0.323,97.0,1.74481,3.392009,3.467250,4.31729,18.5039,31.145340,31.88980,36.2205
2,2022-01-03,1.0,1941.497,115.0,9.0,30.0,,123.0,2070.001,193.4,,17.661256,42.4967,60.688543,61.20610,84.3081,35.0,47.2848,72.966550,78.24450,89.8971,46.0,67.099667,60.0,175.177,36.0798,45.128400,43.79265,56.8485,0.94,0.972143,0.975,1.00,15.5,18.420455,17.75,25.0,55.0,0.812983,1.243011,1.295900,1.65904,1.112130,1.385570,1.366755,1.69664,19616,48.72,0.0,0.000000,0.0,0.00,0.256,0.275357,0.2715,0.326,81.0,1.81192,3.012570,3.176450,3.93701,20.8661,27.918541,29.13390,33.8583
3,2022-01-04,1.0,1566.851,96.0,8.0,19.0,,89.0,1950.298,191.0,,14.542260,41.4022,61.143140,61.96725,84.1344,45.0,0.0000,35.978019,19.87010,82.2624,41.0,66.317184,58.0,173.000,50.4108,82.669000,51.59520,146.0010,0.96,0.978571,0.980,1.00,15.5,17.304878,17.00,22.5,52.0,0.826994,1.113190,0.999835,1.91101,0.662541,1.372438,1.508720,2.18718,16027,48.65,0.0,0.018000,0.0,0.05,0.239,0.267333,0.2620,0.309,102.0,2.03561,3.289740,3.511990,4.18307,18.1102,29.705384,30.70870,36.6142
4,2022-01-05,1.0,2579.864,156.0,14.0,22.0,,123.0,2030.691,188.2,,23.285167,47.1578,61.075746,60.99650,76.0110,60.0,75.8013,81.267738,80.24400,88.1498,48.0,71.112040,61.0,176.093,26.4528,63.201460,61.39370,129.8100,0.92,0.966000,0.970,0.99,15.0,17.285714,17.00,23.0,52.0,0.968357,1.230317,1.187360,1.85300,0.664894,1.212054,1.248305,1.75090,23944,48.67,0.0,0.000909,0.0,0.01,0.220,0.251339,0.2470,0.319,119.0,1.61059,3.446407,3.556730,4.11596,16.1417,30.910016,31.49610,36.6142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414,2023-03-12,1.0,485.440,20.0,9.0,14.0,,101.0,2171.837,,,3.199846,41.4948,59.897020,56.87430,83.5157,32.0,66.3341,72.787475,69.50640,85.8030,45.0,62.919270,59.0,125.000,37.1588,57.368264,51.18800,103.0970,0.96,0.982353,0.980,1.00,12.0,18.030000,17.25,24.0,51.0,0.720210,1.041977,0.991744,1.65313,0.830004,1.279828,1.236100,1.95940,6764,,,,,,0.256,0.289000,0.2950,0.313,92.0,2.01324,2.806113,2.796170,3.62384,20.4724,28.237122,28.93705,36.2205
415,2023-03-13,1.0,1669.080,85.0,16.0,12.0,,80.0,2208.939,201.8,,13.882898,46.3354,57.339388,54.16800,72.1421,41.0,55.8954,71.060029,76.62900,79.0951,45.0,108.161167,92.0,183.000,30.4413,53.282227,55.09480,70.1224,0.92,0.962500,0.960,0.99,15.0,17.091837,17.00,24.5,49.0,0.820322,1.101963,1.124190,1.39816,0.680858,0.938817,0.862358,1.86641,13894,46.74,0.0,0.000000,0.0,0.00,0.249,0.270615,0.2630,0.321,122.0,1.76718,3.170059,3.433695,3.78042,16.5354,29.799421,31.88980,33.4646
416,2023-03-14,1.0,2269.436,133.0,12.0,18.0,,108.0,2183.173,,,20.237776,44.9641,58.649826,59.58350,71.8246,71.0,0.0000,60.994986,70.25410,84.4533,49.0,140.239636,170.5,181.000,38.2061,58.561370,54.50440,112.0390,0.94,0.973333,0.970,1.00,14.5,17.466667,17.00,22.0,53.0,0.658285,1.123164,1.046940,2.38534,0.665730,1.341530,1.139880,3.29447,21455,46.74,0.0,0.005000,0.0,0.10,0.257,0.272431,0.2705,0.307,115.0,2.52774,3.328022,3.411325,3.82516,25.5906,30.593305,30.51185,33.8583
417,2023-03-15,1.0,1648.959,88.0,10.0,23.0,,77.0,2137.165,,,14.429490,43.5476,64.606065,64.06290,85.4633,45.0,31.7297,71.229290,78.93205,85.5273,45.0,129.186419,165.0,177.000,24.7969,51.061036,46.10700,86.1832,0.95,0.973846,0.970,1.00,14.0,17.444444,16.50,26.5,60.0,0.779132,1.089656,0.962743,1.54806,0.686101,1.084077,0.965200,2.09338,14798,46.65,0.0,0.000000,0.0,0.00,0.243,0.270194,0.2670,0.323,99.5,1.52112,3.290716,3.511990,3.98175,16.1417,30.144741,31.10240,34.6457


All the data is now together. The next steps are EDA and data cleaning, followed by building the pipelines.

In [297]:
# Persist both prepared splits (without the row index) so a quick random
# forest test can be run in R.
for split_frame, csv_path in ((train, 'train_prepped.csv'),
                              (test, 'test_prepped.csv')):
    split_frame.to_csv(csv_path, index=False)