In [190]:
from fitparse import FitFile
import folium
from functools import partial
import gzip
import gpxpy
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [283]:
def semicir_to_degs(semicirc):
    return semicirc * (180 / 2**31)

def parse_fitgz(filename):
    try:
        fitfile = FitFile(gzip.open(filename))
        df = pd.DataFrame([{d['name']: d['value'] for d in r.as_dict()['fields']} 
                                   for r in fitfile.get_messages('record')])
        df['position_lat'] = df['position_lat'].map(semicir_to_degs)
        df['position_long'] = df['position_long'].map(semicir_to_degs)
        return df
    except Exception as e:
        print(f'Issue reading fit file {filename}.')

def parse_gpx(filename):
    gpx = gpxpy.parse(filename)
    track_coords = []
    for track in gpx.tracks:
        for segment in track.segments:
            for point in segment.points:
                track_coords.append([point.time, point.latitude, point.longitude, point.elevation])
    return pd.DataFrame(track_coords, columns=['position_lat', 'position_long', 'altitude'])

def parse_file(filename):
    if filename.endswith('.fit.gz'):
        return parse_fitgz(filename)
    elif filename.endswith('.gpx'):
        return parse_gpx(filename)
    elif filename.endswith('.gpx.gz'):
        return parse_gpx(gzip.open(filename))
    else:
        print(f'Add parser for {filename} to parse_file function.')

# Error tracking

Fit files are failing when they have no lat / long data in activity records.

In [299]:
def parse_fitgz_no_lat_long(filename):
    try:
        fitfile = FitFile(gzip.open(filename))
        df = pd.DataFrame([{d['name']: d['value'] for d in r.as_dict()['fields']} 
                                   for r in fitfile.get_messages('record')])
        return df
    except Exception as e:
        print(f'Issue reading fit file {filename}.')
        
def see_why_failing_fit(folder, file):
    filename = f'../data/{folder}/activities/{file}'
    parse_fitgz(filename)
    df = parse_fitgz_no_lat_long(filename)
    print(f"Proportion of missing lat/longs: {df[['position_lat', 'position_long']].isnull().sum()/len(df)}")
    return df


In [300]:
see_why_failing_fit('MB_Strava', '4690904571.fit.gz').head(2)

Issue reading fit file ../data/MB_Strava/activities/4690904571.fit.gz.
Proportion of missing lat/longs: position_lat     1.0
position_long    1.0
dtype: float64


Unnamed: 0,timestamp,position_lat,position_long,distance,enhanced_speed,speed,unknown_88,heart_rate,cadence
0,2020-11-25 13:54:06,,,1.06,1.086,1.086,,,47
1,2020-11-25 13:54:06,,,1.12,1.086,1.086,,,47


In [20]:
def full_path(directory, x):
    try:
        return os.path.join(f'../data/{directory}/', x)
    except Exception as e:
        print(f'Full_path error directory: {directory}, {x}')

def check_file_exists(x):
    try:
        return os.path.exists(x)
    except Exception as e:
        print(f'File {x} does not exist')
        
def get_activities(directory):
    df = pd.read_csv(f'../data/{directory}/activities.csv')
    df = df[['Activity ID', 'Activity Date', 'Activity Name', 'Activity Type', 
                             'Elapsed Time', 'Distance', 'Filename', 'Moving Time',
                             'Elevation Gain', 'Elevation Loss', 'Average Speed', 'Average Grade']]
    df['Activity Date'] = pd.to_datetime(df['Activity Date'])
    df.columns = [ x.lower().replace(" ", "_") for x in df.columns]
    df['filename'] = df['filename'].map(lambda x: full_path(directory, x))
    df['exists'] = df['filename'].map(check_file_exists)
    return df.sort_values('activity_date')

In [22]:
def combine_folders(folder_list):
    dfs = []
    for folder in folder_list:
        df = get_activities(folder)
        df['person'] = folder
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

In [25]:
people = ['MB_Strava', 'BL_Strava', 'KM_Strava', 'LB_Strava']
everyone = combine_folders(people)

Full_path error directory: BL_Strava, nan
Full_path error directory: BL_Strava, nan
Full_path error directory: BL_Strava, nan
Full_path error directory: BL_Strava, nan
Full_path error directory: BL_Strava, nan
Full_path error directory: BL_Strava, nan
Full_path error directory: BL_Strava, nan
Full_path error directory: BL_Strava, nan
Full_path error directory: BL_Strava, nan
Full_path error directory: BL_Strava, nan
Full_path error directory: BL_Strava, nan
Full_path error directory: BL_Strava, nan
Full_path error directory: BL_Strava, nan
File None does not exist
File None does not exist
File None does not exist
File None does not exist
File None does not exist
File None does not exist
File None does not exist
File None does not exist
File None does not exist
File None does not exist
File None does not exist
File None does not exist
File None does not exist
Full_path error directory: KM_Strava, nan
Full_path error directory: KM_Strava, nan
Full_path error directory: KM_Strava, nan
Ful

In [26]:
everyone.tail(3)

Unnamed: 0,activity_id,activity_date,activity_name,activity_type,elapsed_time,distance,filename,moving_time,elevation_gain,elevation_loss,average_speed,average_grade,exists,person
4908,4847862092,2021-02-04 14:15:18,Morning Activity,Nordic Ski,3252,5.68,../data/LB_Strava/activities/5171408676.fit.gz,3174.0,98.0,98.0,1.792029,-0.014065,True,LB_Strava
4909,4847864782,2021-02-06 17:57:03,Morning Activity,Nordic Ski,5793,7.61,../data/LB_Strava/activities/5171411428.fit.gz,5433.0,75.0,66.0,1.401988,0.123408,True,LB_Strava
4910,4847866278,2021-02-22 21:44:45,Afternoon Activity,Nordic Ski,2948,5.41,../data/LB_Strava/activities/5171412935.fit.gz,2781.0,112.0,79.0,1.946926,0.616874,True,LB_Strava


In [30]:
for person in people:
    print(f'Shape of {person}: {everyone[everyone.person == person].shape}')

Shape of MB_Strava: (383, 14)
Shape of BL_Strava: (2406, 14)
Shape of KM_Strava: (1566, 14)
Shape of LB_Strava: (556, 14)


In [36]:
current = everyone[everyone.person == person]
person, current.shape

('LB_Strava', (556, 14))

In [40]:
current.to_dict(orient='records')[:2]

[{'activity_id': 455865774,
  'activity_date': Timestamp('2015-12-12 18:41:42'),
  'activity_name': 'Lunch Run',
  'activity_type': 'Run',
  'elapsed_time': 14065,
  'distance': 25.45,
  'filename': '../data/LB_Strava/activities/506850239.fit.gz',
  'moving_time': 13253.0,
  'elevation_gain': 830.0,
  'elevation_loss': nan,
  'average_speed': nan,
  'average_grade': -0.0337859988212585,
  'exists': True,
  'person': 'LB_Strava'},
 {'activity_id': 455865772,
  'activity_date': Timestamp('2015-12-13 17:55:24'),
  'activity_name': 'Morning Run',
  'activity_type': 'Run',
  'elapsed_time': 11960,
  'distance': 21.99,
  'filename': '../data/LB_Strava/activities/506850238.fit.gz',
  'moving_time': 11503.0,
  'elevation_gain': 529.0,
  'elevation_loss': nan,
  'average_speed': nan,
  'average_grade': 0.090041697025299,
  'exists': True,
  'person': 'LB_Strava'}]

In [41]:
# dfs = []
# for i, d in enumerate(current.to_dict(orient='records')):
#     if i%20==0:
#         print(i, len(dfs))
#     try:
#         df = parse_file(d['filename'])
#         df['activity_id'] = d['activity_id']
#         df['person'] = d['person']
#         dfs.append(df)
#     except Exception as e:
#         pass

0 0
20 18
40 36
Issue reading fit file ../data/LB_Strava/activities/877660707.fit.gz.
Issue reading fit file ../data/LB_Strava/activities/1057621325.fit.gz.
60 54
Issue reading fit file ../data/LB_Strava/activities/1093136985.fit.gz.
80 73
100 92
Add parser for ../data/LB_Strava/activities/1217477450.tcx.gz to parse_file function.
Add parser for ../data/LB_Strava/activities/1217442924.tcx.gz to parse_file function.
120 108
Issue reading fit file ../data/LB_Strava/activities/1325790963.fit.gz.
140 126
160 145
180 165
200 183
220 202
240 222
Issue reading fit file ../data/LB_Strava/activities/2551741241.fit.gz.
260 241
280 261
300 281
320 301
Issue reading fit file ../data/LB_Strava/activities/3174278624.fit.gz.
Issue reading fit file ../data/LB_Strava/activities/3177023014.fit.gz.
340 319
Issue reading fit file ../data/LB_Strava/activities/3185356169.fit.gz.
Issue reading fit file ../data/LB_Strava/activities/3192244831.fit.gz.
Issue reading fit file ../data/LB_Strava/activities/3194741

In [175]:
# person_df = pd.concat(dfs)
# person_df['timestamp'] = pd.to_datetime(person_df['timestamp'])
# person_df.to_pickle(f'../data/{person}/df.pkl')

In [176]:
mb_df = pd.read_pickle(f'../data/MB_Strava/df.pkl')
mb_df.head()

Unnamed: 0,timestamp,position_lat,position_long,distance,enhanced_altitude,altitude,enhanced_speed,speed,unknown_61,unknown_66,unknown_87,cadence,temperature,fractional_cadence,activity_id,person,unknown_88,heart_rate
0,2016-03-20 18:31:38,40.016399,-105.344539,0.0,1986.6,1986.6,0.0,0.0,12433.0,852.0,0.0,0,28.0,0.0,522427928,MB_Strava,,
1,2016-03-20 18:31:43,40.01646,-105.344504,7.33,1987.8,1987.8,0.028,0.028,12439.0,852.0,0.0,0,28.0,0.0,522427928,MB_Strava,,
2,2016-03-20 18:31:44,40.016479,-105.344482,10.1,1987.2,1987.2,1.586,1.586,12436.0,852.0,0.0,0,28.0,0.0,522427928,MB_Strava,,
3,2016-03-20 18:31:46,40.016492,-105.344425,15.15,1986.8,1986.8,2.473,2.473,12434.0,862.0,0.0,81,28.0,0.5,522427928,MB_Strava,,
4,2016-03-20 18:31:53,40.016477,-105.344164,37.59,1986.4,1986.4,3.154,3.154,12432.0,853.0,0.0,84,28.0,0.0,522427928,MB_Strava,,


In [177]:
person = "MB_Strava"
person_df = pd.read_pickle(f"../data/{person}/df.pkl")
person_df.shape

(1405293, 18)

In [178]:
person_df['altitude'].isnull().sum()/person_df['altitude'].shape[0]

0.9978851385440616

In [179]:
tmp = person_df.dropna(subset=['altitude'])
tmp.shape

(2972, 18)

In [180]:
tmp.head(2)

Unnamed: 0,timestamp,position_lat,position_long,distance,enhanced_altitude,altitude,enhanced_speed,speed,unknown_61,unknown_66,unknown_87,cadence,temperature,fractional_cadence,activity_id,person,unknown_88,heart_rate
0,2016-03-20 18:31:38,40.016399,-105.344539,0.0,1986.6,1986.6,0.0,0.0,12433.0,852.0,0.0,0,28.0,0.0,522427928,MB_Strava,,
1,2016-03-20 18:31:43,40.01646,-105.344504,7.33,1987.8,1987.8,0.028,0.028,12439.0,852.0,0.0,0,28.0,0.0,522427928,MB_Strava,,


In [182]:
def max_altitude(grp):
    index = grp['altitude'].idxmax()
    diff = grp['altitude'].max() - grp['altitude'].min()
    highest_pt = grp.loc[index]
    highest_pt['gain'] = diff
    return highest_pt

peaks = tmp.groupby('activity_id').apply(max_altitude)

In [183]:
print(f'Shape: {peaks.shape}')
peaks.head(2)

Shape: (3, 19)


Unnamed: 0_level_0,timestamp,position_lat,position_long,distance,enhanced_altitude,altitude,enhanced_speed,speed,unknown_61,unknown_66,unknown_87,cadence,temperature,fractional_cadence,activity_id,person,unknown_88,heart_rate,gain
activity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
522427928,2016-03-20 19:56:04,40.018006,-105.346612,10929.82,2013.4,2013.4,2.295,2.295,12567.0,855.0,0.0,78,23.0,0.5,522427928,MB_Strava,,,138.2
523229186,2016-03-21 20:29:59,39.754415,-105.201471,3895.28,1908.0,1908.0,2.948,2.948,,,0.0,85,28.0,0.0,523229186,MB_Strava,,,129.6


In [184]:
peaks = peaks.dropna(subset=['position_lat', 'position_long'])
print(f'Shape: {peaks.shape}')

Shape: (3, 19)


In [185]:
cental_lat, central_long = peaks[['position_lat', 'position_long']].mean(axis=0)

m = folium.Map(location=[cental_lat, central_long], tiles="Stamen Terrain",zoom_start=10)

for item in peaks.to_dict(orient='records'):
    
    folium.CircleMarker(
        location=[ item['position_lat'], item['position_long'] ],
        radius=5).add_to(m)
m

# Peaks

In [226]:
person = "BL_Strava"
person_df = pd.read_pickle(f"../data/{person}/df.pkl")
print(person_df.shape)
print(f"Percentage where altitude NOT missing: \
      {100 - 100*(person_df['altitude'].isnull().sum()/person_df['altitude'].shape[0]):.2f}%")



(6336861, 24)
Percentage where altitude NOT missing:       37.62%


In [227]:
person_alt_df = person_df.dropna(subset=['altitude'])
print(person_alt_df.shape)
peaks = person_alt_df.groupby('activity_id').apply(max_altitude)
print(f'There are {len(peaks)} instances in peaks.')
gain_threshold = 400
peaks = peaks[peaks['gain'] > gain_threshold]
print(f'There are {len(peaks)} instances in peaks.')

(2383825, 24)
There are 666 instances in peaks.
There are 159 instances in peaks.


In [228]:
peaks.head(5)

Unnamed: 0_level_0,timestamp,position_lat,position_long,distance,time_from_course,enhanced_altitude,altitude,enhanced_speed,speed,power,...,activity_id,person,unknown_88,unknown_87,fractional_cadence,gps_accuracy,activity_type,step_length,accumulated_power,gain
activity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1624048,2011-09-14 01:13:11,39.631434,-105.23178,6001.01,,2265.4,2265.4,5.377,5.377,,...,1624048,BL_Strava,,,,,,,,500.0
1694245,2011-09-06 19:24:14,39.720587,-105.251404,10873.28,,2247.0,2247.0,0.0,0.0,,...,1694245,BL_Strava,,,,,,,,533.2
1726347,2011-09-22 19:41:07,40.089599,-105.33612,25293.77,,2046.0,2046.0,4.659,4.659,,...,1726347,BL_Strava,,,,,,,,444.6
1845672,2011-10-02 15:03:38,39.916688,-105.391186,0.0,,2555.4,2555.4,3.694,3.694,,...,1845672,BL_Strava,,,,,,,,810.4
1989612,2011-10-04 23:25:45,39.73205,-105.248726,6596.16,,2291.0,2291.0,2.961,2.961,,...,1989612,BL_Strava,,,,,,,,553.6


In [229]:
print(f"Nulls: {peaks[['position_lat', 'position_long']].isnull().sum()}")

Nulls: position_lat     22
position_long    22
dtype: int64


In [197]:
peaks = peaks.dropna(subset=['position_lat', 'position_long'])

cental_lat, central_long = peaks[['position_lat', 'position_long']].mean(axis=0)

m = folium.Map(location=[cental_lat, central_long], tiles="Stamen Terrain",zoom_start=10)

for item in peaks.to_dict(orient='records'):
    
    folium.CircleMarker(
        location=[item['position_lat'], item['position_long']],
        radius=5).add_to(m)
m

In [233]:
peaks = peaks.dropna(subset=['position_lat', 'position_long'])
tmp = peaks[['position_lat', 'position_long']].copy()
epsilon = 0.005
clustering = DBSCAN(eps=epsilon, metric='haversine', min_samples=2).fit(tmp)
clustering.labels_

array([-1,  0,  1,  2,  3,  0,  2,  3, -1,  0,  4,  5,  0,  6,  7,  6,  1,
       -1,  6,  1, -1, -1, -1,  8,  8, -1,  9, 10, 10, 11, -1, 12, -1,  4,
       11, -1, 10, 12, 10, 12, -1, 10, -1, 13, 13, 14, 13, 13, 13, 13, 14,
       -1, 13,  0,  1,  1, 15,  1, 15, 15,  0,  4, -1, -1, -1, -1, -1, -1,
       -1, 16,  7, -1, -1, 15, 17, 13, -1, -1, -1,  9, -1,  1,  4, 17,  7,
       10, -1, 10, -1, 10, 13, -1, -1, 10,  4,  4, 13, -1,  7,  4,  4,  4,
       18, -1,  4,  4, 18,  7, 12,  4,  1, 13, 10, -1, -1,  7, 18, 12, 10,
       12, 16, 16, 16,  5, 18, 12,  6, 13, -1, 12, -1, -1, 19, 19, -1, 19,
       -1])

In [234]:
peaks['cluster'] = clustering.labels_
tmp2=peaks['cluster'].value_counts().sort_values(ascending=False)
most_freq = tmp2.head(1).index[0]
cluster_series = tmp2.iloc[1:]
cluster_series

4     12
13    12
10    11
12     8
1      8
7      6
0      6
18     4
16     4
15     4
6      4
19     3
8      2
9      2
11     2
5      2
2      2
14     2
17     2
3      2
Name: cluster, dtype: int64

In [235]:
cental_lat, central_long = peaks[['position_lat', 'position_long']].mean(axis=0)
m = folium.Map(location=[cental_lat, central_long], tiles="Stamen Terrain",zoom_start=12)

for i in range(len(cluster_series)):
    cluster = cluster_series.index[i]
    p = peaks.query(f"cluster=={cluster}")
    for point in p.to_dict(orient='records'):
        folium.CircleMarker(
            location=[point['position_lat'], point['position_long']],
            radius=5).add_to(m)
 
m