In [139]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [167]:
# Creating dataframe 
df = pd.read_excel(r'C:\Users\Manuel Elizaldi\Desktop\Learning-Testing\PyStrava\Outputs\workout_data.xls',sheet_name = 'All_Workouts_Table')

# cleaning data frame
df = df[['activity_id','sport_type','distance','workout_time_min','calories','total_elevation_gain','average_speed','max_speed','average_heartrate','max_heartrate','avg_time_per_lap','lap_count']]

In [168]:
df.shape

(667, 12)

In [169]:
df['sport_type'].unique()

array(['Run', 'Functional-Cardio Workout', 'Yoga', 'WeightTraining',
       'MountainBikeRide', 'Hike', 'Walk', 'Ride', 'TrailRun', 'Rowing',
       'Swim', 'Kayaking', 'AlpineSki'], dtype=object)

In [170]:
# Creating additional dataframes for specific activities:
# running workouts 
running_activities = df.loc[df['sport_type'].isin(['Run','TrailRun'])]

# Biking type workouts
biking_activities = df.loc[df['sport_type'].isin(['Ride','MountainBikeRide'])]

# Functional type workouts
functional_activities = df.loc[df['sport_type'].isin(['Functional-Cardio Workout'])]

In [171]:
# create a list of our conditions
# points are marked with comments
distance_conditions = [
    (df['distance']==0), # 1
    (df['sport_type'].isin(['Run', 'TrailRun'])) & (df['distance'] >= 0) & (df['distance'] < 5), # 5
    (df['sport_type'].isin(['Run', 'TrailRun'])) & (df['distance'] >= 5) & (df['distance'] < 10), # 10 
    (df['sport_type'].isin(['Run', 'TrailRun'])) & (df['distance'] >= 10) & (df['distance'] < 13), # 25 
    (df['sport_type'].isin(['Run', 'TrailRun'])) & (df['distance'] >= 13), # 30
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 1) & (df['distance'] < 5), # 5
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 5) & (df['distance'] < 8.5), # 10
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 8.5) & (df['distance'] < 12), # 15
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 12) & (df['distance'] < 15), # 20
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 15), # 25
    (df['sport_type'].isin(['AlpineSki']) & (df['distance'] >= 10) & (df['distance'] < 15)), # 10
    (df['sport_type'].isin(['AlpineSki']) & (df['distance'] >= 15) & (df['distance'] < 20)), # 15
    (df['sport_type'].isin(['AlpineSki']) & (df['distance'] >= 20)), # 20
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.10) & (df['distance'] < 0.20), # 10
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.20) & (df['distance'] < 0.30), # 15
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.30) & (df['distance'] < 0.35), # 20
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.35) & (df['distance'] < 0.40), # 25
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.40), # 30
    (df['distance']> 0.5) & (df['distance'] < 1), # 1
    (df['distance']> 1) & (df['distance'] < 2), # 2
    (df['distance']> 2) & (df['distance'] < 3), # 3
    (df['distance']> 3) & (df['distance'] < 4), # 4
    (df['distance']> 4) # 5
]

distance_conditions_values = [1, 5, 10, 25, 30, # running
                              5, 10, 15, 20, 25, # biking
                              10, 15, 20, # skiing
                              10, 15, 20, 25, 30, # swimming
                              1, 2, 3, 4, 5 # special activitie 
                              ]

# applying conditions and values
df['distance_score'] = np.select(distance_conditions, distance_conditions_values)

In [172]:
# building workout time score conditions and values
workout_time_condition = [
    (df['workout_time_min'] >= 2) & (df['workout_time_min'] < 10),
    (df['workout_time_min'] >= 10) & (df['workout_time_min'] < 15),
    (df['workout_time_min'] >= 15) & (df['workout_time_min'] < 20),
    (df['workout_time_min'] >= 20) & (df['workout_time_min'] < 25),
    (df['workout_time_min'] >= 25) & (df['workout_time_min'] < 30),
    (df['workout_time_min'] >= 20) & (df['workout_time_min'] < 35),
    (df['workout_time_min'] >= 35) & (df['workout_time_min'] < 40),
    (df['workout_time_min'] >= 40) & (df['workout_time_min'] < 45),
    (df['workout_time_min'] >= 45) & (df['workout_time_min'] < 50),
    (df['workout_time_min'] >= 55) & (df['workout_time_min'] < 60),
    (df['workout_time_min'] >= 60)
]

workout_time_values = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

# applying the conditions and values to the dataframe
df['workout_time_score'] = np.select(workout_time_condition, workout_time_values)


In [173]:
# building calories conditions and values
calories_conditions = [(df['calories'] >= 0) & (df['calories'] < 100), # 5
                       (df['calories'] >= 100) & (df['calories'] < 200), # 15
                       (df['calories'] >= 200) & (df['calories'] < 300), # 25
                       (df['calories'] >= 300) & (df['calories'] < 400), # 35
                       (df['calories'] >= 400) & (df['calories'] < 500), # 40
                       (df['calories'] >= 500) & (df['calories'] < 600), # 50
                       (df['calories'] >= 600) & (df['calories'] < 700), # 60
                       (df['calories'] >= 700) & (df['calories'] < 800), # 70
                       (df['calories'] >= 800) & (df['calories'] < 900), # 80
                       (df['calories'] >= 900) & (df['calories'] < 1000), # 90
                       (df['calories'] >= 1000) # 100
                       ] 

calories_values = [5, 15, 25, 35, 40, 50, 60, 70, 80, 90, 100]

df['calorie_score'] = np.select(calories_conditions, calories_values)


In [174]:
# Building elevation conditions and values
elevation_conditions = [(df['total_elevation_gain'] == 0),
                        (df['total_elevation_gain'] >= 1) & (df['total_elevation_gain'] < 100),
                        (df['total_elevation_gain'] >=100) & (df['total_elevation_gain'] < 200),
                        (df['total_elevation_gain'] >= 200) & (df['total_elevation_gain'] < 300),
                        (df['total_elevation_gain'] >= 300) & (df['total_elevation_gain'] < 400),
                        (df['total_elevation_gain'] >= 400) & (df['total_elevation_gain'] < 500),
                        (df['total_elevation_gain']> 500)]

elevation_values = [1,5,10,15,20,25,30]


df['total_elevation_gain_score'] = np.select(elevation_conditions, elevation_values)

In [175]:
# replacing some of the Nans with 1 
df['average_heartrate'] = df['average_heartrate'].fillna(1)
# building average heartrate conditions and values
avg_heartrate_conditions = [
                            (df['average_heartrate'] > 0) & (df['average_heartrate'] < 100),
                            (df['average_heartrate'] >= 100) & (df['average_heartrate'] < 130),
                            (df['average_heartrate'] >= 130) & (df['average_heartrate'] < 145),
                            (df['average_heartrate'] >= 145) & (df['average_heartrate'] < 155),
                            (df['average_heartrate'] >= 155) & (df['average_heartrate'] < 165),
                            (df['average_heartrate'] >= 165) & (df['average_heartrate'] < 170),
                            (df['average_heartrate'] >= 170)
                            ]

avg_heartrate_values = [5, 10, 15, 20, 30, 35, 40]

df['average_heartrate_score'] = np.select(avg_heartrate_conditions, avg_heartrate_values)

In [176]:
# replacing some of the Nans with 1 
df['max_heartrate'] = df['max_heartrate'].fillna(1)

# building max heartrate conditions and values
max_heartrate_conditions = [
    (df['max_heartrate'] >= 0) & (df['max_heartrate'] < 80),
    (df['max_heartrate'] >= 80) & (df['max_heartrate'] < 130),
    (df['max_heartrate'] >= 130) & (df['max_heartrate'] < 165),
    (df['max_heartrate'] >= 165) & (df['max_heartrate'] < 175),
    (df['max_heartrate'] >= 175) & (df['max_heartrate'] < 185),
    (df['max_heartrate'] >= 180)
]

max_heartrate_values = [5, 10, 15, 25, 30, 35]

df['max_heartrate_score'] = np.select(max_heartrate_conditions, max_heartrate_values)

In [177]:
# avg time per lap conditions and values
avg_time_per_lap_conditions = [
                               (df['avg_time_per_lap'] >= 0) & (df['avg_time_per_lap'] < 5),
                               (df['avg_time_per_lap'] >= 5) & (df['avg_time_per_lap'] < 10),
                               (df['avg_time_per_lap'] >= 10) & (df['avg_time_per_lap'] < 20),
                               (df['avg_time_per_lap'] >= 20) & (df['avg_time_per_lap'] < 30),
                               (df['avg_time_per_lap'] >= 30)
                               ]


avg_time_per_lap_values = [1, 5, 10, 15, 20]


df['avg_time_per_lap_score'] = np.select(avg_time_per_lap_conditions, avg_time_per_lap_values)


In [180]:
# lap count conditions and values
lap_count_conditions = [(df['lap_count'] >= 0) & (df['lap_count'] < 3),
                        (df['lap_count'] >= 3) & (df['lap_count'] < 4),
                        (df['lap_count'] >= 4) & (df['lap_count'] < 5),
                        (df['lap_count'] >= 5) & (df['lap_count'] < 6),
                        (df['lap_count'] >= 6)
                        ]

lap_count_values = [5, 10, 20, 25, 30]

df['lap_count_score'] = np.select(lap_count_conditions, lap_count_values)

In [182]:
# average speed conditions and values
avg_speed_conditions = [(df['average_speed'] > 0) & (df['average_speed'] < 1),
                        (df['average_speed'] >= 1.5) & (df['average_speed'] < 2),
                        (df['average_speed'] >= 2.5) & (df['average_speed'] < 3),
                        (df['average_speed'] >= 3) & (df['average_speed'] < 3.5),
                        (df['average_speed'] >= 3.5) & (df['average_speed'] < 4),
                        (df['average_speed'] >= 4) & (df['average_speed'] < 5),
                        (df['average_speed'] >= 5)]

avg_speed_values = []

In [181]:
df

Unnamed: 0,activity_id,sport_type,distance,workout_time_min,calories,total_elevation_gain,average_speed,max_speed,average_heartrate,max_heartrate,avg_time_per_lap,lap_count,distance_score,workout_time_score,calorie_score,total_elevation_gain_score,average_heartrate_score,max_heartrate_score,avg_time_per_lap_score,lap_count_score
0,8885873542,Run,5.50,41.83,449.0,48.0,2.429,4.048,151.4,164.0,6.966667,6,10,35,40,5,20,15,5,30
1,8879705626,Functional-Cardio Workout,0.00,31.97,344.0,0.0,0.000,0.000,146.7,178.0,5.318333,6,1,25,35,1,20,30,5,30
2,8875392009,Functional-Cardio Workout,0.00,16.20,194.0,0.0,0.000,0.000,150.0,179.0,5.393333,3,1,10,15,1,20,30,5,10
3,8869251524,Functional-Cardio Workout,0.00,20.45,246.0,0.0,0.000,0.000,147.2,168.0,4.076000,5,1,15,25,1,20,25,1,25
4,8862861001,Yoga,0.00,19.78,106.0,0.0,0.000,0.000,100.6,140.0,19.780000,1,1,10,15,1,10,15,10,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,3517497833,Run,3.57,28.92,352.5,4.2,2.061,4.900,1.0,1.0,28.920000,1,5,20,35,5,5,5,15,5
663,3516568449,Ride,10.64,38.50,0.0,13.9,4.695,10.800,1.0,1.0,38.500000,1,15,30,5,5,5,5,20,5
664,3509004014,Run,4.43,36.25,443.1,15.6,2.065,4.700,1.0,1.0,36.270000,1,5,30,40,5,5,5,20,5
665,3496090180,Run,3.45,32.63,340.5,3.9,1.832,4.800,1.0,1.0,32.630000,1,5,25,35,5,5,5,20,5


In [179]:
df.describe()

Unnamed: 0,activity_id,distance,workout_time_min,calories,total_elevation_gain,average_speed,max_speed,average_heartrate,max_heartrate,avg_time_per_lap,lap_count,distance_score,workout_time_score,calorie_score,total_elevation_gain_score,average_heartrate_score,max_heartrate_score,avg_time_per_lap_score
count,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0
mean,6632538000.0,1.183238,41.041289,379.525187,11.093103,0.395214,0.928993,132.789055,162.908546,21.091069,3.976012,3.074963,27.548726,35.547226,1.838081,15.637181,23.965517,10.749625
std,1393143000.0,3.430321,24.3351,190.732169,42.147437,0.998098,2.468626,27.882629,32.888779,20.83046,6.255042,5.731774,13.922089,15.843418,2.523913,6.084268,7.477314,6.897489
min,3483200000.0,0.0,2.58,0.0,0.0,0.0,0.0,1.0,1.0,0.054216,1.0,1.0,0.0,5.0,1.0,5.0,5.0,1.0
25%,5459833000.0,0.0,29.525,283.0,0.0,0.0,0.0,129.4,163.0,6.6675,1.0,1.0,20.0,25.0,1.0,10.0,15.0,5.0
50%,6724528000.0,0.0,37.45,371.0,0.0,0.0,0.0,139.8,172.0,11.08,4.0,1.0,30.0,35.0,1.0,15.0,25.0,10.0
75%,7879658000.0,0.0,47.0,453.5,0.0,0.0,0.0,146.4,178.0,32.05,5.0,1.0,35.0,40.0,1.0,20.0,30.0,20.0
max,8885874000.0,34.47,376.47,1769.0,452.0,6.949,18.6,170.5,195.0,186.55,102.0,30.0,50.0,100.0,25.0,40.0,35.0,20.0
