In [1]:
from fitparse import FitFile
import folium
from functools import partial
import gzip
import gpxpy
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
def max_altitude(act):
    index = act['altitude'].idxmax()
    diff = act['altitude'].max() - act['altitude'].min()
    highest_pt = act.loc[index]
    highest_pt['gain'] = diff
    return highest_pt

In [44]:
person = "MB_Strava"
person_df = pd.read_pickle(f"../data/{person}/df.pkl")
print(person_df['altitude'].isnull().sum()/person_df['altitude'].shape[0])
tmp = person_df.dropna(subset=['altitude'])

0.9978851385440616


In [45]:
peaks = tmp.groupby('activity_id').apply(max_altitude)
peaks

Unnamed: 0_level_0,timestamp,position_lat,position_long,distance,enhanced_altitude,altitude,enhanced_speed,speed,unknown_61,unknown_66,unknown_87,cadence,temperature,fractional_cadence,activity_id,person,unknown_88,heart_rate,gain
activity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
522427928,2016-03-20 19:56:04,40.018006,-105.346612,10929.82,2013.4,2013.4,2.295,2.295,12567.0,855.0,0.0,78,23.0,0.5,522427928,MB_Strava,,,138.2
523229186,2016-03-21 20:29:59,39.754415,-105.201471,3895.28,1908.0,1908.0,2.948,2.948,,,0.0,85,28.0,0.0,523229186,MB_Strava,,,129.6
524284549,2016-03-23 00:55:16,39.967898,-105.281714,9396.1,1943.4,1943.4,2.407,2.407,,,0.0,80,28.0,0.5,524284549,MB_Strava,,,322.8


In [46]:
cental_lat, central_long = peaks[['position_lat', 'position_long']].mean(axis=0)

m = folium.Map(location=[cental_lat, central_long], tiles="Stamen Terrain",zoom_start=10)

for item in peaks.to_dict(orient='records'):
    
    folium.CircleMarker(
        location=[ item['position_lat'], item['position_long'] ],
        radius=5).add_to(m)
m

# Initial Peak Investigation 

In [47]:
person = "BL_Strava"
person_df = pd.read_pickle(f"../data/{person}/df.pkl")
#person_df = pd.read_parquet(f"../data/{person}/df.parquet")
print(person_df.shape)
print(f"Percentage where altitude NOT missing: \
      {100 - 100*(person_df['altitude'].isnull().sum()/person_df['altitude'].shape[0]):.2f}%")


(6336861, 24)
Percentage where altitude NOT missing:       37.62%


In [12]:
person_alt_df = person_df.dropna(subset=['altitude'])
print(person_alt_df.shape)
peaks = person_alt_df.groupby('activity_id').apply(max_altitude)
print(f'There are {len(peaks)} instances in peaks.')
gain_threshold = 400
peaks = peaks[peaks['gain'] > gain_threshold]
print(f'There are {len(peaks)} instances of peaks after thresholding.')


(2383825, 24)
There are 666 instances in peaks.
There are 159 instances of peaks after thresholding.


In [13]:
peaks.head(5)

Unnamed: 0_level_0,timestamp,position_lat,position_long,distance,time_from_course,enhanced_altitude,altitude,enhanced_speed,speed,power,...,activity_id,person,unknown_88,unknown_87,fractional_cadence,gps_accuracy,activity_type,step_length,accumulated_power,gain
activity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1624048,2011-09-14 01:13:11,39.631434,-105.23178,6001.01,,2265.4,2265.4,5.377,5.377,,...,1624048,BL_Strava,,,,,,,,500.0
1694245,2011-09-06 19:24:14,39.720587,-105.251404,10873.28,,2247.0,2247.0,0.0,0.0,,...,1694245,BL_Strava,,,,,,,,533.2
1726347,2011-09-22 19:41:07,40.089599,-105.33612,25293.77,,2046.0,2046.0,4.659,4.659,,...,1726347,BL_Strava,,,,,,,,444.6
1845672,2011-10-02 15:03:38,39.916688,-105.391186,0.0,,2555.4,2555.4,3.694,3.694,,...,1845672,BL_Strava,,,,,,,,810.4
1989612,2011-10-04 23:25:45,39.73205,-105.248726,6596.16,,2291.0,2291.0,2.961,2.961,,...,1989612,BL_Strava,,,,,,,,553.6


In [14]:
print(f"Nulls: {peaks[['position_lat', 'position_long']].isnull().sum()}")

Nulls: position_lat     22
position_long    22
dtype: int64


In [15]:
peaks = peaks.dropna(subset=['position_lat', 'position_long'])

cental_lat, central_long = peaks[['position_lat', 'position_long']].mean(axis=0)

m = folium.Map(location=[cental_lat, central_long], tiles="Stamen Terrain",zoom_start=10)

for item in peaks.to_dict(orient='records'):
    
    folium.CircleMarker(
        location=[item['position_lat'], item['position_long']],
        radius=5).add_to(m)
m

In [16]:
peaks = peaks.dropna(subset=['position_lat', 'position_long'])
tmp = peaks[['position_lat', 'position_long']].copy()
epsilon = 0.005
clustering = DBSCAN(eps=epsilon, metric='haversine', min_samples=2).fit(tmp)
clustering.labels_

array([-1,  0,  1,  2,  3,  0,  2,  3, -1,  0,  4,  5,  0,  6,  7,  6,  1,
       -1,  6,  1, -1, -1, -1,  8,  8, -1,  9, 10, 10, 11, -1, 12, -1,  4,
       11, -1, 10, 12, 10, 12, -1, 10, -1, 13, 13, 14, 13, 13, 13, 13, 14,
       -1, 13,  0,  1,  1, 15,  1, 15, 15,  0,  4, -1, -1, -1, -1, -1, -1,
       -1, 16,  7, -1, -1, 15, 17, 13, -1, -1, -1,  9, -1,  1,  4, 17,  7,
       10, -1, 10, -1, 10, 13, -1, -1, 10,  4,  4, 13, -1,  7,  4,  4,  4,
       18, -1,  4,  4, 18,  7, 12,  4,  1, 13, 10, -1, -1,  7, 18, 12, 10,
       12, 16, 16, 16,  5, 18, 12,  6, 13, -1, 12, -1, -1, 19, 19, -1, 19,
       -1])

In [17]:
peaks['cluster'] = clustering.labels_
most_freq=peaks['cluster'].value_counts().sort_values(ascending=False)
cluster_series = most_freq.iloc[1:]
cluster_series

4     12
13    12
10    11
12     8
1      8
7      6
0      6
18     4
16     4
15     4
6      4
19     3
8      2
9      2
11     2
5      2
2      2
14     2
17     2
3      2
Name: cluster, dtype: int64

In [18]:
cental_lat, central_long = peaks[['position_lat', 'position_long']].mean(axis=0)
m = folium.Map(location=[cental_lat, central_long], tiles="Stamen Terrain",zoom_start=12)

for i in range(len(cluster_series)):
    cluster = cluster_series.index[i]
    p = peaks.query(f"cluster=={cluster}")
    for point in p.to_dict(orient='records'):
        folium.CircleMarker(
            location=[point['position_lat'], point['position_long']],
            radius=5).add_to(m)
 
m

# Focus on Summit

In [19]:
def peak_detector_with_info(person, gain_threshold):
    #person_df = pd.read_pickle(f"../data/{person}/df.pkl")
    person_df = pd.read_parquet(f"../data/{person}/df.parquet")
    person_df.rename(columns={'alt': 'altitude', 'lat': 'position_lat', 'lon': 'position_long'}, inplace=True) #For parquet
    print(person_df.shape)
    print(f"Percentage where altitude data AVAILABLE: \
          {100 - 100*(person_df['altitude'].isnull().sum()/person_df['altitude'].shape[0]):.2f}%")
    person_alt_df = person_df.dropna(subset=['altitude'])
    print(person_alt_df.shape)
    peaks = person_alt_df.groupby('activity_id').apply(max_altitude)
    print(f'There are {len(peaks)} instances in peaks.')
    peaks = peaks[peaks['gain'] > gain_threshold]
    print(f'There are {len(peaks)} instances in peaks.')
    return peaks

person = "KM_Strava"
peaks = peak_detector_with_info(person, 400)


(7146031, 6)
Percentage where altitude data AVAILABLE:           96.32%
(6882748, 6)
There are 1396 instances in peaks.
There are 692 instances in peaks.


In [33]:
peaks = peaks.dropna(subset=['position_lat', 'position_long'])
peaks_locations = peaks[['position_lat', 'position_long']].copy()
epsilon = 0.0003 #0.005
clustering = DBSCAN(eps=epsilon, metric='haversine', min_samples=2).fit(peaks_locations)
peaks_locations['cluster'] = clustering.labels_
most_freq=peaks_locations['cluster'].value_counts().sort_values(ascending=False)
cluster_series = most_freq[most_freq.index!=-1]

print(cluster_series.head())
peaks.head(3)

0     46
31    23
22    20
24    15
1     13
Name: cluster, dtype: int64


Unnamed: 0_level_0,time,position_lat,position_long,altitude,activity_id,person,gain
activity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
13480462,2012-07-12 20:56:28,39.737918,-106.182983,2998.4,13480462,KM_Strava,464.4
13794905,2012-07-19 02:24:41,39.567978,-106.110296,3198.5,13794905,KM_Strava,420.0
33738955,2012-12-23 21:40:16,39.511052,-105.904173,3673.4,33738955,KM_Strava,725.1


In [3]:
def peak_detector(df, gain_threshold):
    df.rename(columns={'alt': 'altitude', 'lat': 'position_lat', 'lon': 'position_long'}, inplace=True) 
    df.dropna(subset=['altitude'], inplace=True)
    peaks = df.groupby('activity_id').apply(max_altitude)
    peaks = peaks[peaks['gain'] > gain_threshold]
    return peaks

def plot_one_cluster(m, df, activities_ids, steps, color):
    for activity_id in activities_ids:
        locations = df.query(f"activity_id=={activity_id}")[['position_lat', 'position_long']].iloc[::steps]
        locations.dropna(inplace=True)
        points = locations.values.tolist()
        folium.PolyLine(points, color=color, weight=2.5, opacity=0.5).add_to(m)

def plot_multiple_clusters(df, peaks, central_lat, central_long, colormap, cluster_series, steps):
    m = folium.Map(location=[central_lat,central_long],tiles="Stamen Terrain",zoom_start=12)
    n = len(list(cluster_series.index))
    peak_positions = peaks.groupby('cluster')[['position_lat', 'position_long']].mean()
    for i, val in enumerate(list(cluster_series.index)):
        color = next(colormap)
        activities_ids = list(peaks.query(f"cluster=={val}").index)
        plot_one_cluster(m, df, activities_ids, steps, color)
        peak = peak_positions.loc[val]
        folium.CircleMarker(location=[peak['position_lat'], peak['position_long']],radius=10, color=color).add_to(m)
    return m



In [4]:
person = "KM_Strava"
person_df = pd.read_parquet(f"../data/{person}/df.parquet")
peaks = peak_detector(person_df, 400)
print(peaks.shape[0])
peaks = peaks.dropna(subset=['position_lat', 'position_long']) #WORK ON THIS
epsilon = 0.0005 #0.005
clustering = DBSCAN(eps=epsilon, metric='haversine', min_samples=2).fit(peaks[['position_lat', 'position_long']])
peaks['cluster'] = clustering.labels_
most_freq = peaks['cluster'].value_counts().sort_values(ascending=False)
cluster_series = most_freq[most_freq.index != -1]
print(len(cluster_series))


692
40


In [5]:
peaks.head(2)

Unnamed: 0_level_0,time,position_lat,position_long,altitude,activity_id,person,gain,cluster
activity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
13480462,2012-07-12 20:56:28,39.737918,-106.182983,2998.4,13480462,KM_Strava,464.4,-1
13794905,2012-07-19 02:24:41,39.567978,-106.110296,3198.5,13794905,KM_Strava,420.0,0


In [7]:
cluster_series.head(4)

0     46
23    44
18    17
27    15
Name: cluster, dtype: int64

In [9]:
colors = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f',
          '#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']
colormap = itertools.cycle(colors)
central_lat, central_long = 39.5744, -106.0975

m = folium.Map(location=[central_lat,central_long],tiles="Stamen Terrain",zoom_start=12)

plot_multiple_clusters(person_df, peaks, central_lat, central_long, colormap, cluster_series, 10)


# Graph BEFORE adding peak markers

In [263]:
# def plot_one_cluster(m, df, activities_ids, steps, color):
#     for activity_id in activities_ids:
#         locations = person_df.query(f"activity_id=={activity_id}")[['lat', 'lon']].iloc[::steps]
#         locations.dropna(inplace=True)
#         points = locations.values.tolist()
#         folium.PolyLine(points, color=color, weight=2.5, opacity=0.5).add_to(m)


# def plot_multiple_clusters(df, central_lat, central_long, colormap, cluster_series, steps):
#     m = folium.Map(location=[central_lat,central_long],tiles="Stamen Terrain",zoom_start=12)
#     n = len(list(cluster_series.index))
#     for i, val in enumerate(list(cluster_series.index)):
#         activities_ids = list(peaks_locations.query(f"cluster=={val}").index)
#         color = next(colormap)
#         plot_one_cluster(m, df, activities_ids, steps, color)
#     return m

# colors = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f',
#           '#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']
# colormap = itertools.cycle(colors)
# central_lat, central_long = 39.5744, -106.0975

# plot_multiple_clusters(person_df, central_lat, central_long, colormap, cluster_series, 10)

In [229]:
# colors = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f',
#           '#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']

# colormap = itertools.cycle(colors)

# central_lat, central_long = 39.5744, -106.0975
# m = folium.Map(location=[central_lat,central_long],tiles="Stamen Terrain",zoom_start=12)

# n = len(list(cluster_series.index))

# for i, val in enumerate(list(cluster_series.index)):
#     activities_ids = list(peaks_locations.query(f"cluster=={val}").index)
#     color = next(colormap)
#     for activity_id in activities_ids:
#         locations = person_df.query(f"activity_id=={activity_id}")[['lat', 'lon']]
#         locations.dropna(inplace=True)
#         locations=locations.iloc[::10]
#         points = locations.values.tolist()
#         folium.PolyLine(points, color=color, weight=2.5, opacity=0.5).add_to(m)

# m

In [230]:
# cluster_num = 0
# activities_ids = list(peaks_locations.query(f"cluster=={cluster_num}").index)
# person = "KM_Strava"
# person_df = pd.read_parquet(f"../data/{person}/df.parquet")
# print(activities_ids)
# print(len(activities_ids))
# person_df.head() 

In [231]:
# central_lat, central_long = 39.5744, -106.0975
# m = folium.Map(location=[central_lat,central_long],tiles="Stamen Terrain",zoom_start=12)

# for activity_id in activities_ids:
#     locations = person_df.query(f"activity_id=={activity_id}")[['lat', 'lon']].iloc[::10]
#     points = locations.values.tolist()
#     folium.PolyLine(points, color="red", weight=2.5, opacity=0.5).add_to(m)
    
# m

In [232]:
# colors = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f',
#           '#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']

# colormap = itertools.cycle(colors)

# central_lat, central_long = 39.5744, -106.0975
# m = folium.Map(location=[central_lat,central_long],tiles="Stamen Terrain",zoom_start=12)

# n = len(list(cluster_series.index))

# for i, val in enumerate(list(cluster_series.index)):
#     activities_ids = list(peaks_locations.query(f"cluster=={val}").index)
#     color = next(colormap)
#     for activity_id in activities_ids:
#         locations = person_df.query(f"activity_id=={activity_id}")[['lat', 'lon']]
#         locations.dropna(inplace=True)
#         locations=locations.iloc[::10]
#         points = locations.values.tolist()
#         folium.PolyLine(points, color=color, weight=2.5, opacity=0.5).add_to(m)

# m

In [233]:
# def plot_routes(activity_lst, folder_name, central_lat, central_long):
#     m = folium.Map(location=[central_lat,central_long],tiles="Stamen Terrain",zoom_start=12)
#     for activity in activity_lst:
#         locations = read_fitfile(f'../data/{folder_name}/activities/{activity}')[['position_lat', 'position_long']].dropna()
#         points = locations.values.tolist()
#         folium.PolyLine(points, color="red", weight=2.5, opacity=0.5).add_to(m)
#     return m