# Calculate Athlete Performance based on Strava History
This notebook tests different approaches to using an athlete's history to predict their performance.

## Set Strava API user token
Import the Client from the [stravalib library](https://github.com/hozn/stravalib/) and load your Access Token.

You can find your personal token at https://www.strava.com/settings/api. Copy your token to a file named `STRAVA_TOKEN` in the notebook's working directory (the token is read from the relative path `STRAVA_TOKEN`).

In [None]:
from stravalib import unithelper
from stravalib.client import Client as StravaClient

# Read the personal access token from the STRAVA_TOKEN file (relative path,
# i.e. the current working directory). Reading it in Python yields a plain,
# whitespace-stripped string; the previous `!cat` capture produced a list of
# lines, which is not what `access_token` is meant to receive.
with open('STRAVA_TOKEN') as token_file:
    strava_token = token_file.read().strip()

strava_client = StravaClient(access_token=strava_token)

## Import Activities


In [None]:
import datetime
import sys
from termcolor import colored

weeks = datetime.timedelta(weeks=4)

# Optional filters forwarded to get_activities(); uncomment an entry to narrow the import.
args = {
    # 'after': datetime.datetime.now() - weeks, # start date is after specified value (UTC). datetime.datetime or str or None
    # 'before': datetime.datetime(year=2017, month=12, day=31), # start date is before specified value (UTC). datetime.datetime or str or None
    # 'limit': 50,  # Maximum activities retrieved
}

# Fetch once and materialise the result. The result of get_activities() used to
# be iterated twice (once per list below); depending on how the returned
# iterator handles exhaustion that either downloads everything twice or leaves
# the second list empty.
all_activities = list(strava_client.get_activities(**args))

# Manually entered activities are kept apart from the ones used for the analysis.
manual_activities = [activity for activity in all_activities if activity.manual]
activities = [activity for activity in all_activities if not activity.manual]

message = "%d Activities imported\n" % len(activities)
sys.stdout.write(colored(message, attrs=['bold']))
print("%d Manual activities excluded" % len(manual_activities))


## Group Activities by Activity Type
We create a dictionary containing lists of activities by [activity type](https://strava.github.io/api/v3/activities/#activity-types-a-idtypesnbspa).


### Basic Activity Types:
- `Ride`
- `Run`
- `Swim`
- `Hike`
- `Walk`

### More Exotic ones:
`AlpineSki`, `BackcountrySki`, `Canoeing`, `Crossfit`, `EBikeRide`, `Elliptical`, `IceSkate`, `InlineSkate`, `Kayaking`, `Kitesurf`, `NordicSki`, `RockClimbing`, `RollerSki`, `Rowing`, `Snowboard`, `Snowshoe`, `StairStepper`, `StandUpPaddling`, `Surfing`, `VirtualRide`, `WeightTraining`, `Windsurf`, `Workout`, `Yoga`

In [None]:
# Bucket the imported activities under their activity type.
activities_by_type = {}
for activity in activities:
    activities_by_type.setdefault(activity.type, []).append(activity)

# One summary line per activity type: "- <type>: <number of activities>".
for activity_type in activities_by_type:
    activity_list = activities_by_type[activity_type]
    sys.stdout.write("- %s: %d \n" % (activity_type, len(activity_list)))
    

## Group Activities by Gear
We group all imported activities by the gear used during the activity. Activities without gear are grouped under `None`.

In [None]:
# First pass: bucket the imported activities under the id of the gear they
# were recorded with (the key is None when an activity has no gear id).
activities_by_gear_id = {}
for activity in activities:
    activities_by_gear_id.setdefault(activity.gear_id, []).append(activity)

gear_ids = list(activities_by_gear_id)

# Second pass: re-key the buckets by gear name, with one get_gear() call per
# distinct gear id. Buckets are merged with extend() rather than assigned, so
# two gear items that share the same name no longer overwrite each other.
activities_by_gear = {}
for gear_id in gear_ids:
    if gear_id is None:
        gear_name = None
    else:
        gear_name = strava_client.get_gear(gear_id).name
    activities_by_gear.setdefault(gear_name, []).extend(activities_by_gear_id[gear_id])

# One summary line per gear: "- <gear name>: <number of activities>".
for activity_gear, activity_list in activities_by_gear.items():
    sys.stdout.write("- %s: %d \n" % (activity_gear, len(activity_list)))

## Group Activities by Workout Type
We group the activities by workout type:
- `default run`,
- `race run`,
- `long run`,
- `workout run`,
- `default ride`,
- `race ride`,
- `workout ride`

In [None]:
# Human-readable names for the workout type codes.
workout_types = {
    '0': 'default run',
    '1': 'race run',
    '2': 'long run',
    '3': 'workout run',
    '10': 'default ride',
    '11': 'race ride',
    '12': 'workout ride',
}

# Bucket the imported activities under the readable workout type name.
# The code is looked up through str() so that both '1' and 1 resolve to the
# same name (NOTE(review): whether stravalib exposes workout_type as str or
# int may depend on its version - confirm). Codes without a known name,
# including None, are kept as-is.
activities_by_workout_type = {}

for activity in activities:
    raw_type = activity.workout_type
    label = workout_types.get(str(raw_type), raw_type)
    activities_by_workout_type.setdefault(label, []).append(activity)

# The loop variable is deliberately NOT called `activities`: doing so rebinds
# the notebook-wide list of imported activities to the last bucket, which
# corrupts every cell that is (re-)run afterwards.
for workout_type, activity_list in activities_by_workout_type.items():
    sys.stdout.write("- %s: %d \n" % (workout_type, len(activity_list)))



## Filter activities
Always filter activity by type: `Run`, `Ride`, `Swim`, etc..

Optionally, filter further by additional criteria, e.g.:
- Gear
- Workout type
- Distance
- Altitude gain
- Date

In [None]:
# Activity Type (Run, Ride, Swim, etc.)
# .get() falls back to an empty selection when no activity of that type was
# imported, instead of aborting the cell with a KeyError.
filtered_activities = activities_by_type.get('Run', [])

# Workout Type (race, workout, long run, etc...)
# filtered_activities = activities_by_workout_type['default run']

# Gear
# filtered_activities = activities_by_gear['Dynafit Alpine Pro']

# Filter more by type
# filtered_activities = [activity for activity in filtered_activities if activity.type == 'Run']

# Filter more by elevation gain
# filtered_activities = [activity for activity in filtered_activities if 1000 < unithelper.meters(activity.total_elevation_gain).num < 3000]

# Filter more by distance
# filtered_activities = [activity for activity in filtered_activities if unithelper.kilometers(activity.distance).num > 5]

# Filter more by date
# filtered_activities = [activity for activity in filtered_activities if activity.start_date_local > datetime.datetime(year=2018, month=1, day=1)]

# Summary: a bold header followed by one line per selected activity.
message = "{} activities selected:\n".format(len(filtered_activities))
sys.stdout.write(colored(message, attrs=['bold']))
for activity in filtered_activities:
    date = activity.start_date.strftime('%d.%m.%y')
    sys.stdout.write("- {} {}: {} {}, +{}\n".format(
        activity.type,
        date,
        activity.name,
        unithelper.kilometers(activity.distance),
        unithelper.meters(activity.total_elevation_gain),
    ))


## Import Raw Streams for activities

In [None]:
count = len(filtered_activities)
observations = []
stream_types = ['time', 'altitude', 'distance']

for index, activity in enumerate(filtered_activities):
    # get raw streams at low resolution for optimal sampling
    raw_streams = strava_client.get_activity_streams(activity.id, types=stream_types, resolution='low')

    # Make sure we have all the data for the regression. The truthiness check
    # comes first so that a missing result (None or empty) is skipped instead
    # of raising a TypeError on the `in` test.
    has_all_streams = bool(raw_streams) and all(
        stream_type in raw_streams for stream_type in stream_types
    )
    if has_all_streams and raw_streams['time'].original_size:
        observations.append({'activity': activity, 'streams': raw_streams})
        status = 'added'
    else:
        status = 'skipped'

    date = activity.start_date.strftime('%d.%m.%y')

    # display progress (elevation goes through the same unit helper as the
    # other summaries in this notebook)
    message = "{}/{} {}: {} {}: {} - {}, +{}... \n".format(
        index + 1, count,
        status,
        date,
        activity.type,
        activity.name,
        unithelper.kilometers(activity.distance),
        unithelper.meters(activity.total_elevation_gain))

    sys.stdout.write(message)

# Only activities with complete streams were appended above, so no further
# filtering of `observations` is needed here.

# display results
message = "\nData streams imported for {} activities\n".format(len(observations))
sys.stdout.write(colored(message, attrs=['bold']))


## Prepare DataFrame for Polynomial Regression

The basic assumption is that there is a polynomial relationship between the **pace** of the athlete and the **slope** of travelled terrain, plus a penalty linked to the **total elevation gain**:

$$y = ax^2 + bx + ez + c $$


Where:
$$slope = x = \frac{elevation\:in\:meters}{distance\:in\:meters}$$


and:
$$pace = y = \frac{seconds}{meter}$$

and:
$$total\:elevation\:gain = z = {meters}$$


We try to fit the data with a polynomial regression.

In [None]:
import math
import numpy as np
import pandas as pd

count = len(observations)

for index, observation in enumerate(observations):
    # One column per raw stream ('time', 'altitude', 'distance', ...).
    streams = pd.DataFrame(
        {key: stream.data for key, stream in observation['streams'].items()}
    )

    # Sample-to-sample differences; the first row of each is NaN.
    altitude_delta = streams['altitude'].diff()
    distance_delta = streams['distance'].diff()
    time_delta = streams['time'].diff()

    data = pd.DataFrame({
        # slope of the segment: altitude change per unit of distance
        'gradient': altitude_delta / distance_delta,
        # pace of the segment: elapsed time per unit of distance
        'pace': time_delta / distance_delta,
        # cumulative elevation gain so far: descents count as zero, so the
        # running total simply carries over on downhill segments
        'totalup': altitude_delta.clip(lower=0).fillna(0).cumsum(),
    })

    # Cleanup: a zero distance delta (no movement between two samples) yields
    # +/-inf or NaN. Infinite values would survive a plain null check and then
    # break the mean/std used for outlier removal and the regression itself,
    # so they are converted to NaN and dropped together with the other gaps.
    data = data.replace([np.inf, -np.inf], np.nan).dropna(subset=['gradient', 'pace'])

    # sort by gradient
    data = data.sort_values(['gradient']).reset_index(drop=True)

    # append to observations
    observation['data'] = data

    # display progress
    message = "{}/{} \r".format(index + 1, count)
    sys.stdout.write(message)
    sys.stdout.flush()


## Remove outliers
**TODO**: Refactor with sklearn

In [None]:
for index, observation in enumerate(observations):
    data = observation['data']

    # Keep only the rows whose pace is within 2.5 standard deviations of the
    # mean pace.
    pace_deviation = (data.pace - data.pace.mean()).abs()
    pace_limit = 2.5 * data.pace.std()
    data = data[pace_deviation <= pace_limit]

    # Then keep only the rows whose gradient is within 6 standard deviations
    # of the mean gradient (both computed on the pace-filtered rows).
    gradient_deviation = (data.gradient - data.gradient.mean()).abs()
    gradient_limit = 6 * data.gradient.std()
    data = data[gradient_deviation <= gradient_limit]

    observation['data'] = data

    # display progress
    sys.stdout.write("{}/{} \r".format(index + 1, count))
    sys.stdout.flush()
    

## Linear regression with sklearn 

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
%matplotlib inline

for index, observation in enumerate(observations):
    # assign() builds a new frame, so the squared term is never written into a
    # filtered slice (no SettingWithCopyWarning) and re-running the cell is safe.
    data = observation['data'].assign(gradient_squared=lambda frame: frame.gradient ** 2)

    # Regressors and target of the model y = a*x^2 + b*x + e*z + c.
    X = data[['gradient_squared', 'gradient', 'totalup']]
    y = data['pace']

    model = LinearRegression()
    model.fit(X, y)
    # R-squared on the fitted data, stored on the model because the
    # aggregation cell reads observation['model'].score_.
    model.score_ = model.score(X, y)

    observation['data'] = data
    observation['model'] = model

    activity_message = "{}/{}: {}\n".format(index + 1, count, observation['activity'].name)
    model_message = "R-squared: {} \nIntercept: {}".format(model.score_, model.intercept_)

    # Pace is modelled in seconds per meter; * 1000 / 60 converts it to
    # minutes per kilometer for plotting.
    pace = data['pace'] * 1000 / 60
    gradient = data['gradient']
    predictions = model.predict(X) * 1000 / 60

    fig = plt.figure(figsize=(40, 10))
    ax = fig.add_subplot(122)
    ax.plot(gradient, pace, 'b.', label='observed')
    ax.plot(gradient, predictions, 'ro', label='predicted')
    ax.set_xlabel('gradient (m/m)')
    ax.set_ylabel('pace (min/km)')
    ax.set_title(activity_message + model_message)
    ax.legend()

    # Render the figure before releasing it: closing it without showing it
    # first discards the plot, so nothing was ever displayed.
    plt.show()
    plt.close(fig)


## Filter activities by model score and aggregate data

In [None]:
# Minimum R-squared an activity's own regression must exceed for its data to
# be included in the aggregate.
MIN_R_SQUARED = 0.75

# filter activities by R-squared of the linear regression
good_observations = [
    observation for observation in observations
    if observation['model'].score_ > MIN_R_SQUARED
]
count = len(good_observations)

if not good_observations:
    # Fail with an explicit message instead of an opaque KeyError when sorting
    # an empty, column-less frame.
    raise ValueError(
        "No activity has an R-squared above {}; nothing to aggregate.".format(MIN_R_SQUARED)
    )

# Aggregate good activity data in one table. A single concat replaces the
# row-wise DataFrame.append loop (removed in pandas 2.0, and quadratic).
aggregated_data = (
    pd.concat([observation['data'] for observation in good_observations])
    .sort_values(['gradient'])
    .reset_index(drop=True)
)

# display results of the aggregation
message = "{} activities aggregated.\n".format(count)
sys.stdout.write(colored(message, attrs=['bold']))
for observation in good_observations:
    activity = observation['activity']
    date = activity.start_date.strftime('%d.%m.%y')
    sys.stdout.write("- {} {}: {} {}, +{}\n".format(
        activity.type,
        date,
        activity.name,
        unithelper.kilometers(activity.distance),
        unithelper.meters(activity.total_elevation_gain),
    ))


## Linear regression on aggregated data

In [None]:
%matplotlib inline

# Regressors and target of the model y = a*x^2 + b*x + e*z + c, fitted on the
# data aggregated from all selected activities.
variables = aggregated_data[['gradient_squared', 'gradient', 'totalup']]
target = aggregated_data['pace']

X = variables
y = target

model = LinearRegression()
model.fit(X, y)
# R-squared on the fitted data.
model.score_ = model.score(X, y)

print('R-squared: %s' % model.score_)
print('Intercept: %s' % model.intercept_)
print('Coef: %s' % model.coef_)
print('Based on %d entries.' % variables.shape[0])

# Pace is modelled in seconds per meter; * 1000 / 60 converts it to minutes
# per kilometer for plotting.
pace = aggregated_data['pace'] * 1000 / 60
gradient = aggregated_data['gradient']
predictions = model.predict(X) * 1000 / 60

fig = plt.figure(figsize=(40, 10))
ax = fig.add_subplot(122)
ax.plot(gradient, pace, 'b,', label='observed')
ax.plot(gradient, predictions, 'r.', label='predicted')
# Label the figure so it can be read on its own.
ax.set_xlabel('gradient (m/m)')
ax.set_ylabel('pace (min/km)')
ax.set_title('Regression on aggregated data')
ax.legend();
