In [66]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [64]:
# Creating dataframe 
df = pd.read_excel(r'C:\Users\Manuel Elizaldi\Desktop\Learning-Testing\PyStrava\Outputs\workout_data.xls',sheet_name = 'All_Workouts_Table')

# cleaning data frame
df = df[['activity_id','sport_type','distance','workout_time_min','calories','total_elevation_gain','average_speed','max_speed','max_speed','average_heartrate','max_heartrate','avg_time_per_lap','lap_count']]

In [121]:
df.describe()

Unnamed: 0,activity_id,distance,workout_time_min,calories,total_elevation_gain,average_speed,max_speed,max_speed.1,average_heartrate,max_heartrate,avg_time_per_lap,lap_count,distance_score
count,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0,648.0,648.0,667.0,667.0,667.0
mean,6632538000.0,1.183238,41.041289,379.525187,11.093103,0.395214,0.928993,0.928993,136.653241,167.655864,21.091069,3.976012,2.488756
std,1393143000.0,3.430321,24.3351,190.732169,42.147437,0.998098,2.468626,2.468626,16.591326,17.917735,20.83046,6.255042,4.005002
min,3483200000.0,0.0,2.58,0.0,0.0,0.0,0.0,0.0,68.8,81.0,0.054216,1.0,1.0
25%,5459833000.0,0.0,29.525,283.0,0.0,0.0,0.0,0.0,130.975,164.0,6.6675,1.0,1.0
50%,6724528000.0,0.0,37.45,371.0,0.0,0.0,0.0,0.0,140.3,173.0,11.08,4.0,1.0
75%,7879658000.0,0.0,47.0,453.5,0.0,0.0,0.0,0.0,146.525,178.0,32.05,5.0,1.0
max,8885874000.0,34.47,376.47,1769.0,452.0,6.949,18.6,18.6,170.5,195.0,186.55,102.0,25.0


In [55]:
df.shape

(667, 13)

In [28]:
df['sport_type'].unique()

array(['Run', 'Functional-Cardio Workout', 'Yoga', 'WeightTraining',
       'MountainBikeRide', 'Hike', 'Walk', 'Ride', 'TrailRun', 'Rowing',
       'Swim', 'Kayaking', 'AlpineSki'], dtype=object)

In [11]:
# Creating additional dataframes for specific activities:
running_activities = df.loc[df['sport_type'].isin(['Run','TrailRun'])]

# Biking type workouts
biking_activities = df.loc[df['sport_type'].isin(['Ride','MountainBikeRide'])]

# Functional type workouts
functional_activities = df.loc[df['sport_type'].isin(['Functional-Cardio Workout'])]

In [119]:
# create a list of our conditions
distance_conditions = [
    (df['distance']==0), # 1
    (df['sport_type'].isin(['Run', 'TrailRun'])) & (df['distance'] >= 0) & (df['distance'] < 5), # 5
    (df['sport_type'].isin(['Run', 'TrailRun'])) & (df['distance'] >= 5) & (df['distance'] < 10), # 10 
    (df['sport_type'].isin(['Run', 'TrailRun'])) & (df['distance'] >= 10) & (df['distance'] < 13), # 20 
    (df['sport_type'].isin(['Run', 'TrailRun'])) & (df['distance'] >= 13), # 30
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 1) & (df['distance'] < 5), # 1
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 5) & (df['distance'] < 8.5), # 3
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 8.5) & (df['distance'] < 12), # 5
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 12) & (df['distance'] < 15), # 10
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 15), # 15
    (df['sport_type'].isin(['AlpineSki']) & (df['distance'] >= 10) & (df['distance'] < 15)), # 10
    (df['sport_type'].isin(['AlpineSki']) & (df['distance'] >= 15) & (df['distance'] < 20)), # 15
    (df['sport_type'].isin(['AlpineSki']) & (df['distance'] >= 20)), # 20
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.10) & (df['distance'] < 0.20), # 5
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.20) & (df['distance'] < 0.30), # 10
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.30) & (df['distance'] < 0.35), # 15
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.35) & (df['distance'] < 0.40), # 20
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.40), # 25
    (df['distance']> 0.5) & (df['distance'] < 1), # 1
    (df['distance']> 1) & (df['distance'] < 2), # 2
    (df['distance']> 2) & (df['distance'] < 3), # 3
    (df['distance']> 3) & (df['distance'] < 4), # 4
    (df['distance']> 4) # 5
    
]
distance_conditions_values = [1, 5, 10, 20, 30, 
                              1, 3, 5, 10, 15, 
                              10, 15, 20, 
                              5, 10, 15, 20, 25, 
                              1, 2, 3, 4, 5]


df['distance_score'] = np.select(distance_conditions, distance_conditions_values)

In [131]:
workout_time_condition = [
    (df['workout_time_min'] >= 2) & (df['workout_time_min'] < 10),
    (df['workout_time_min'] >= 10) & (df['workout_time_min'] < 15),
    (df['workout_time_min'] >= 15) & (df['workout_time_min'] < 20),
    (df['workout_time_min'] >= 20) & (df['workout_time_min'] < 25),
    (df['workout_time_min'] >= 25) & (df['workout_time_min'] < 30),
    (df['workout_time_min'] >= 20) & (df['workout_time_min'] < 35),
    (df['workout_time_min'] >= 35) & (df['workout_time_min'] < 40),
    (df['workout_time_min'] >= 40) & (df['workout_time_min'] < 45),
    (df['workout_time_min'] >= 45) & (df['workout_time_min'] < 50),
    (df['workout_time_min'] >= 55) & (df['workout_time_min'] < 60),
    (df['workout_time_min'] >= 60)
]


workout_time_values = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

df['workout_time_score'] = np.select(workout_time_condition, workout_time_values)


In [132]:
df

Unnamed: 0,activity_id,sport_type,distance,workout_time_min,calories,total_elevation_gain,average_speed,max_speed,max_speed.1,average_heartrate,max_heartrate,avg_time_per_lap,lap_count,distance_score,workout_time_score
0,8885873542,Run,5.50,41.83,449.0,48.0,2.429,4.048,4.048,151.4,164.0,6.966667,6,10,35
1,8879705626,Functional-Cardio Workout,0.00,31.97,344.0,0.0,0.000,0.000,0.000,146.7,178.0,5.318333,6,1,25
2,8875392009,Functional-Cardio Workout,0.00,16.20,194.0,0.0,0.000,0.000,0.000,150.0,179.0,5.393333,3,1,10
3,8869251524,Functional-Cardio Workout,0.00,20.45,246.0,0.0,0.000,0.000,0.000,147.2,168.0,4.076000,5,1,15
4,8862861001,Yoga,0.00,19.78,106.0,0.0,0.000,0.000,0.000,100.6,140.0,19.780000,1,1,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,3517497833,Run,3.57,28.92,352.5,4.2,2.061,4.900,4.900,,,28.920000,1,5,20
663,3516568449,Ride,10.64,38.50,0.0,13.9,4.695,10.800,10.800,,,38.500000,1,5,30
664,3509004014,Run,4.43,36.25,443.1,15.6,2.065,4.700,4.700,,,36.270000,1,5,30
665,3496090180,Run,3.45,32.63,340.5,3.9,1.832,4.800,4.800,,,32.630000,1,5,25


In [124]:
df.describe()

Unnamed: 0,activity_id,distance,workout_time_min,calories,total_elevation_gain,average_speed,max_speed,max_speed.1,average_heartrate,max_heartrate,avg_time_per_lap,lap_count,distance_score
count,667.0,667.0,667.0,667.0,667.0,667.0,667.0,667.0,648.0,648.0,667.0,667.0,667.0
mean,6632538000.0,1.183238,41.041289,379.525187,11.093103,0.395214,0.928993,0.928993,136.653241,167.655864,21.091069,3.976012,2.488756
std,1393143000.0,3.430321,24.3351,190.732169,42.147437,0.998098,2.468626,2.468626,16.591326,17.917735,20.83046,6.255042,4.005002
min,3483200000.0,0.0,2.58,0.0,0.0,0.0,0.0,0.0,68.8,81.0,0.054216,1.0,1.0
25%,5459833000.0,0.0,29.525,283.0,0.0,0.0,0.0,0.0,130.975,164.0,6.6675,1.0,1.0
50%,6724528000.0,0.0,37.45,371.0,0.0,0.0,0.0,0.0,140.3,173.0,11.08,4.0,1.0
75%,7879658000.0,0.0,47.0,453.5,0.0,0.0,0.0,0.0,146.525,178.0,32.05,5.0,1.0
max,8885874000.0,34.47,376.47,1769.0,452.0,6.949,18.6,18.6,170.5,195.0,186.55,102.0,25.0
