### Create Flatfile for Running workouts

In [184]:
import pandas as pd
import numpy as np

In [185]:
### Load Files
# DimWorkouts
# FactWorkoutRecords --> Aggregate
# DimRoutes --> Aggregate
# FactBasisRecords --> Aggregate

In [186]:
# Read the four frontend tables that feed the running flatfile.
DimWorkouts = pd.read_parquet(
    path='./output_files/frontend_files/DimWorkouts.parquet.gzip',
    engine='pyarrow',
)
FactWorkoutRecords = pd.read_parquet(
    path='./output_files/frontend_files/FactWorkoutRecords.parquet.gzip',
    engine='pyarrow',
)
DimRoutes = pd.read_parquet(
    path='./output_files/frontend_files/DimRoutes.parquet.gzip',
    engine='pyarrow',
)
FactBasisRecords = pd.read_parquet(
    path='./output_files/frontend_files/FactBasisRecords.parquet.gzip',
    engine='pyarrow',
)

### Filter for Running Workouts

In [187]:
# Workouts whose activity type is exactly 'Running'.
runningWorkouts = DimWorkouts.loc[DimWorkouts['workoutActivityType'].eq('Running')]
# Keep only the workout records whose workout_pk belongs to a running workout.
runningFacts = pd.merge(
    FactWorkoutRecords,
    runningWorkouts['workout_pk'],
    how='inner',
    on='workout_pk',
)

### Create Flatfile

#### Aggregate Values for runningFacts

In [188]:
# Distinct record types among running facts that carry a workout_pk.
# `.notna()` replaces the `.isna() == False` comparison; the selected rows are the same.
type_list = list(runningFacts.loc[runningFacts['workout_pk'].notna(), 'type'].unique())
type_list

['DistanceWalkingRunning',
 'BasalEnergyBurned',
 'ActiveEnergyBurned',
 'RunningStrideLength',
 'RunningVerticalOscillation',
 'RunningGroundContactTime',
 'RunningPower',
 'RunningSpeed']

In [189]:
# Record types that are summed per workout.
type_list_sum = [
    'DistanceWalkingRunning',
    'BasalEnergyBurned',
    'ActiveEnergyBurned',
]
# Record types that are averaged per workout.
type_list_avg = [
    'RunningStrideLength',
    'RunningVerticalOscillation',
    'RunningGroundContactTime',
    'RunningPower',
    'RunningSpeed',
]
# The standard deviation is taken over the same types as the average;
# a separate list object is kept so the two can diverge independently.
type_list_sd = list(type_list_avg)

In [190]:
# Remove the timestamp and unit columns before aggregating.
runningFacts = runningFacts.drop(
    columns=['startDate_records', 'endDate_records', 'startDate_wo', 'endDate_wo', 'unit'],
)

In [191]:
# Grouping keys for the per-workout aggregation of running facts.
columns = [
    'workout_pk',
    'Date',
    'type',
]

In [192]:
def _aggregate_and_pivot(facts, record_types, group_columns, how, prefix):
    """Aggregate ``value`` per group and spread the record types into columns.

    Rows of ``facts`` whose ``type`` is in ``record_types`` are grouped by
    ``group_columns`` and ``value`` is reduced with the aggregation named by
    ``how`` (e.g. ``'sum'``, ``'mean'``, ``'std'``). Each type label is then
    prefixed with ``prefix`` and pivoted into its own column, leaving one row
    per combination of the remaining group columns.

    Only ``value`` is aggregated because it is the only column that is
    pivoted afterwards.
    """
    selected = facts[facts['type'].isin(record_types)]
    aggregated = selected.groupby(group_columns)['value'].agg(how).reset_index()
    aggregated['type'] = prefix + aggregated['type']
    index_columns = [name for name in group_columns if name != 'type']
    return aggregated.pivot(index=index_columns, columns='type', values='value').reset_index()


# One wide frame per statistic: SUM_*, AVG_* and SD_* columns.
runningFacts_sum = _aggregate_and_pivot(runningFacts, type_list_sum, columns, 'sum', 'SUM_')
runningFacts_avg = _aggregate_and_pivot(runningFacts, type_list_avg, columns, 'mean', 'AVG_')
runningFacts_sd = _aggregate_and_pivot(runningFacts, type_list_sd, columns, 'std', 'SD_')

# Build the join keys as a new list. Aliasing `columns` and calling
# `.remove('type')` would mutate `columns` itself and break a re-run of this cell.
merge_columns = [name for name in columns if name != 'type']

# Outer joins keep workouts that have only some of the statistics.
runningFactsFlat = (
    runningFacts_sum
    .merge(runningFacts_avg, on=merge_columns, how='outer')
    .merge(runningFacts_sd, on=merge_columns, how='outer')
)

#### Aggregate Values for DimRoutes

In [193]:
# The track-point and time columns are not aggregated; drop them.
DimRoutes = DimRoutes.drop(columns=['trkpt', 'time'])

In [194]:
# Route measurement columns that get aggregated per workout.
agg_columns = [
    'ele',
    'course',
    'speed',
    'hAcc',
    'vAcc',
]

In [195]:
# Convert each route measurement to float, going through str first as before.
for _measure in ('ele', 'course', 'speed', 'hAcc', 'vAcc'):
    DimRoutes[_measure] = DimRoutes[_measure].astype(str).astype(float)
DimRoutes.dtypes

ele           float64
course        float64
speed         float64
hAcc          float64
vAcc          float64
route_name     object
workout_pk      int64
dtype: object

In [196]:
# Percentile aggregators p10 ... p90, built from one factory instead of
# nine copy-pasted definitions.
def _make_percentile(percent):
    """Return a function that computes the ``percent``-th percentile.

    The returned function calls ``x.quantile(percent / 100)`` on its argument.
    Its ``__name__`` is set to ``p<percent>`` so every aggregator has a
    distinct name, matching the names of the hand-written functions it replaces.
    """
    def percentile(x):
        return x.quantile(percent / 100)

    percentile.__name__ = f'p{percent}'
    return percentile


p10 = _make_percentile(10)
p20 = _make_percentile(20)
p30 = _make_percentile(30)
p40 = _make_percentile(40)
p50 = _make_percentile(50)
p60 = _make_percentile(60)
p70 = _make_percentile(70)
p80 = _make_percentile(80)
p90 = _make_percentile(90)

# Statistics computed for every route measurement: mean, standard deviation
# and the 10th to 90th percentiles.
aggfuncs = ['mean', 'std', p10, p20, p30, p40, p50, p60, p70, p80, p90]
runningRoutesFacts = (
    DimRoutes
    .groupby(['workout_pk'])
    .agg({measure: aggfuncs for measure in ['ele', 'course', 'speed', 'hAcc', 'vAcc']})
    .reset_index()
)

# Flatten the two-level column labels to '<measure>_<stat>'; stripping '_'
# turns the key column's ('workout_pk', '') label back into 'workout_pk'.
runningRoutesFacts.columns = ['_'.join(levels).strip('_') for levels in runningRoutesFacts.columns]

### Aggregate Basis Records

In [197]:
# Basis records restricted to running workouts.
runningHealthFacts = FactBasisRecords.merge(runningWorkouts['workout_pk'], on='workout_pk', how='inner')

# Distinct record types among those records that carry a workout_pk.
# `.notna()` replaces the `.isna() == False` comparison; the selected rows are the same.
type_list = list(runningHealthFacts.loc[runningHealthFacts['workout_pk'].notna(), 'type'].unique())
type_list

['HeartRate',
 'StepCount',
 'FlightsClimbed',
 'AppleExerciseTime',
 'VO2Max',
 'HeadphoneAudioExposure',
 'AppleStandTime',
 'StairAscentSpeed',
 'StairDescentSpeed',
 'WalkingDoubleSupportPercentage',
 'WalkingSpeed',
 'WalkingStepLength',
 'WalkingAsymmetryPercentage',
 'EnvironmentalAudioExposure']

In [198]:
# Keep only heart-rate records and name the measurement column accordingly.
runningHealthFacts = (
    runningHealthFacts[runningHealthFacts['type'] == 'HeartRate']
    .rename(columns={"value": "HeartRate"}, errors="raise")
)
# Slimmed-down copy used for the per-workout heart-rate statistics.
runningHealthFactsHeartrate = runningHealthFacts.drop(
    columns=['startDate_records', 'endDate_records', 'unit', 'type'],
)
columns = ['workout_pk', 'Date', 'type']

In [199]:
# Heart-rate statistics per workout: mean, standard deviation, min, max and
# the 10th to 90th percentiles.
aggfuncs = ['mean', 'std', 'min', p10, p20, p30, p40, p50, p60, p70, p80, p90, 'max']
runningHealthFactsHeartrate = (
    runningHealthFactsHeartrate
    .groupby(['workout_pk'])
    .agg({'HeartRate': aggfuncs})
    .reset_index()
)

# Flatten the two-level column labels to 'HeartRate_<stat>'; stripping '_'
# turns the key column's ('workout_pk', '') label back into 'workout_pk'.
runningHealthFactsHeartrate.columns = [
    '_'.join(levels).strip('_') for levels in runningHealthFactsHeartrate.columns
]

### Investigate Zones

In [200]:
# Order the heart-rate records chronologically within each workout.
runningHealthFacts = runningHealthFacts.sort_values(['workout_pk', 'startDate_records'])

# For every record take the end timestamp of the next record in the same
# workout. The last record of a workout has no successor, so it falls back to
# its own end timestamp. `fillna` gives the same values as the `np.where`
# form but keeps the column's pandas dtype instead of going through NumPy.
runningHealthFacts['endDate_records_shift'] = (
    runningHealthFacts
    .groupby(['workout_pk'])['endDate_records']
    .shift(-1)
    .fillna(runningHealthFacts['endDate_records'])
)

In [201]:
# The original end timestamp is superseded by 'endDate_records_shift'.
runningHealthFacts = runningHealthFacts.drop(columns=['endDate_records'])

In [202]:
# Seconds from a record's start to its shifted end timestamp.
runningHealthFacts['duration_sec'] = (
    runningHealthFacts['endDate_records_shift']
    .sub(runningHealthFacts['startDate_records'])
    .dt.total_seconds()
)

In [203]:
# Heart-rate zones as [lower, upper] pairs. A record belongs to a zone when
# lower < HeartRate <= upper, so values of 90 or below and values above 250
# fall into no zone.
heart_ranges = [[90, 100],
                [100, 110],
                [110, 120],
                [120, 125],
                [125, 130],
                [130, 135],
                [135, 140],
                [140, 145],
                [145, 150],
                [150, 155],
                [155, 160],
                [160, 165],
                [165, 170],
                [170, 175],
                [175, 180],
                [180, 185],
                [185, 190],
                [190, 195],
                [195, 200],
                [200, 250]]

# Names of the zone columns created below, in zone order.
list_hr_columns = []

# One column per zone holding the record's duration when its heart rate lies
# in the zone and 0 otherwise. The loop unpacks into `lower`/`upper` rather
# than a variable called `range`, which would shadow the builtin for the
# rest of the session.
for lower, upper in heart_ranges:
    zone_column = f'HR_{lower}-{upper}'
    in_zone = (runningHealthFacts['HeartRate'] > lower) & (runningHealthFacts['HeartRate'] <= upper)
    runningHealthFacts[zone_column] = np.where(in_zone, runningHealthFacts['duration_sec'], 0)
    list_hr_columns.append(zone_column)

In [204]:
# Add the grouping key and the total duration to the selected columns.
list_hr_columns.extend(['workout_pk', 'duration_sec'])

In [205]:
# Seconds spent in each heart-rate zone (plus total duration) per workout;
# the result is indexed by workout_pk.
runningHealthFactsHeartrateZones = (
    runningHealthFacts.loc[:, list_hr_columns]
    .groupby(by='workout_pk')
    .sum()
)

#### Join to Flatfile

In [206]:
# Attach the route statistics to the aggregated running facts.
runningFactsFlat = pd.merge(runningFactsFlat, runningRoutesFacts, on='workout_pk', how='left')
# Prepend the workout attributes; the right join keeps every flatfile row.
runningFactsFlat = pd.merge(
    runningWorkouts[['workout_pk', 'sourceName', 'workoutActivityType', 'duration']],
    runningFactsFlat,
    on='workout_pk',
    how='right',
)
# Attach the heart-rate statistics and the time spent per heart-rate zone.
runningFactsFlat = pd.merge(runningFactsFlat, runningHealthFactsHeartrate, on='workout_pk', how='left')
runningFactsFlat = pd.merge(runningFactsFlat, runningHealthFactsHeartrateZones, on='workout_pk', how='left')

### Save Flatfile

In [209]:
# Write the finished flatfile as a gzip-compressed parquet file.
runningFactsFlat.to_parquet(
    path='./output_files/flatfiles/runningFactsFlat.parquet.gzip',
    compression='gzip',
)