In [95]:
import warnings 
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report
from matplotlib import pyplot

In [3]:
# Creating dataframe 
df = pd.read_excel(r'C:\Users\Manuel Elizaldi\Desktop\Learning-Testing\PyStrava\Outputs\workout_data.xls',sheet_name = 'All_Workouts_Table')

# cleaning data frame
df = df[['activity_id','sport_type','distance','workout_time_min','calories','total_elevation_gain','average_speed','max_speed','average_heartrate','max_heartrate','avg_time_per_lap','lap_count']]

In [4]:
df.shape

(667, 12)

In [5]:
df['sport_type'].unique()

array(['Run', 'Functional-Cardio Workout', 'Yoga', 'WeightTraining',
       'MountainBikeRide', 'Hike', 'Walk', 'Ride', 'TrailRun', 'Rowing',
       'Swim', 'Kayaking', 'AlpineSki'], dtype=object)

In [6]:
# Creating additional dataframes for specific activities:
# running workouts 
running_activities = df.loc[df['sport_type'].isin(['Run','TrailRun'])]

# Biking type workouts
biking_activities = df.loc[df['sport_type'].isin(['Ride','MountainBikeRide'])]

# Functional type workouts
functional_activities = df.loc[df['sport_type'].isin(['Functional-Cardio Workout'])]

In [7]:
# create a list of our conditions
# points are marked with comments
distance_conditions = [
    (df['distance']==0), # 1
    (df['sport_type'].isin(['Run', 'TrailRun'])) & (df['distance'] >= 0) & (df['distance'] < 5), # 5
    (df['sport_type'].isin(['Run', 'TrailRun'])) & (df['distance'] >= 5) & (df['distance'] < 10), # 10 
    (df['sport_type'].isin(['Run', 'TrailRun'])) & (df['distance'] >= 10) & (df['distance'] < 13), # 25 
    (df['sport_type'].isin(['Run', 'TrailRun'])) & (df['distance'] >= 13), # 30
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 1) & (df['distance'] < 5), # 5
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 5) & (df['distance'] < 8.5), # 10
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 8.5) & (df['distance'] < 12), # 15
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 12) & (df['distance'] < 15), # 20
    (df['sport_type'].isin(['Ride', 'MountainBikeRide'])) & (df['distance'] >= 15), # 25
    (df['sport_type'].isin(['AlpineSki']) & (df['distance'] >= 10) & (df['distance'] < 15)), # 10
    (df['sport_type'].isin(['AlpineSki']) & (df['distance'] >= 15) & (df['distance'] < 20)), # 15
    (df['sport_type'].isin(['AlpineSki']) & (df['distance'] >= 20)), # 20
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.10) & (df['distance'] < 0.20), # 10
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.20) & (df['distance'] < 0.30), # 15
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.30) & (df['distance'] < 0.35), # 20
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.35) & (df['distance'] < 0.40), # 25
    (df['sport_type'].isin(['Swim'])) & (df['distance'] >= 0.40), # 30
    (df['distance']> 0.5) & (df['distance'] < 1), # 1
    (df['distance']> 1) & (df['distance'] < 2), # 2
    (df['distance']> 2) & (df['distance'] < 3), # 3
    (df['distance']> 3) & (df['distance'] < 4), # 4
    (df['distance']> 4) # 5
]

distance_conditions_values = [1, 5, 10, 25, 30, # running
                              5, 10, 15, 20, 25, # biking
                              10, 15, 20, # skiing
                              10, 15, 20, 25, 30, # swimming
                              1, 2, 3, 4, 5 # special activitie 
                              ]

# applying conditions and values
df['distance_score'] = np.select(distance_conditions, distance_conditions_values)

In [8]:
# building workout time score conditions and values
workout_time_condition = [
    (df['workout_time_min'] >= 2) & (df['workout_time_min'] < 10),
    (df['workout_time_min'] >= 10) & (df['workout_time_min'] < 15),
    (df['workout_time_min'] >= 15) & (df['workout_time_min'] < 20),
    (df['workout_time_min'] >= 20) & (df['workout_time_min'] < 25),
    (df['workout_time_min'] >= 25) & (df['workout_time_min'] < 30),
    (df['workout_time_min'] >= 20) & (df['workout_time_min'] < 35),
    (df['workout_time_min'] >= 35) & (df['workout_time_min'] < 40),
    (df['workout_time_min'] >= 40) & (df['workout_time_min'] < 45),
    (df['workout_time_min'] >= 45) & (df['workout_time_min'] < 50),
    (df['workout_time_min'] >= 55) & (df['workout_time_min'] < 60),
    (df['workout_time_min'] >= 60)
]

workout_time_values = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

# applying the conditions and values to the dataframe
df['workout_time_score'] = np.select(workout_time_condition, workout_time_values)


In [9]:
# building calories conditions and values
calories_conditions = [(df['calories'] >= 0) & (df['calories'] < 100), # 5
                       (df['calories'] >= 100) & (df['calories'] < 200), # 15
                       (df['calories'] >= 200) & (df['calories'] < 300), # 25
                       (df['calories'] >= 300) & (df['calories'] < 400), # 35
                       (df['calories'] >= 400) & (df['calories'] < 500), # 40
                       (df['calories'] >= 500) & (df['calories'] < 600), # 50
                       (df['calories'] >= 600) & (df['calories'] < 700), # 60
                       (df['calories'] >= 700) & (df['calories'] < 800), # 70
                       (df['calories'] >= 800) & (df['calories'] < 900), # 80
                       (df['calories'] >= 900) & (df['calories'] < 1000), # 90
                       (df['calories'] >= 1000) # 100
                       ] 

calories_values = [5, 15, 25, 35, 40, 50, 60, 70, 80, 90, 100]

df['calorie_score'] = np.select(calories_conditions, calories_values)


In [10]:
# Building elevation conditions and values
elevation_conditions = [(df['total_elevation_gain'] == 0),
                        (df['total_elevation_gain'] >= 1) & (df['total_elevation_gain'] < 100),
                        (df['total_elevation_gain'] >=100) & (df['total_elevation_gain'] < 200),
                        (df['total_elevation_gain'] >= 200) & (df['total_elevation_gain'] < 300),
                        (df['total_elevation_gain'] >= 300) & (df['total_elevation_gain'] < 400),
                        (df['total_elevation_gain'] >= 400) & (df['total_elevation_gain'] < 500),
                        (df['total_elevation_gain']> 500)]

elevation_values = [1,5,10,15,20,25,30]


df['total_elevation_gain_score'] = np.select(elevation_conditions, elevation_values)

In [11]:
# replacing some of the Nans with 1 
df['average_heartrate'] = df['average_heartrate'].fillna(1)
# building average heartrate conditions and values
avg_heartrate_conditions = [
                            (df['average_heartrate'] > 0) & (df['average_heartrate'] < 100),
                            (df['average_heartrate'] >= 100) & (df['average_heartrate'] < 130),
                            (df['average_heartrate'] >= 130) & (df['average_heartrate'] < 145),
                            (df['average_heartrate'] >= 145) & (df['average_heartrate'] < 155),
                            (df['average_heartrate'] >= 155) & (df['average_heartrate'] < 165),
                            (df['average_heartrate'] >= 165) & (df['average_heartrate'] < 170),
                            (df['average_heartrate'] >= 170)
                            ]

avg_heartrate_values = [5, 10, 15, 20, 30, 35, 40]

df['average_heartrate_score'] = np.select(avg_heartrate_conditions, avg_heartrate_values)

In [12]:
# replacing some of the Nans with 1 
df['max_heartrate'] = df['max_heartrate'].fillna(1)

# building max heartrate conditions and values
max_heartrate_conditions = [
    (df['max_heartrate'] >= 0) & (df['max_heartrate'] < 80),
    (df['max_heartrate'] >= 80) & (df['max_heartrate'] < 130),
    (df['max_heartrate'] >= 130) & (df['max_heartrate'] < 165),
    (df['max_heartrate'] >= 165) & (df['max_heartrate'] < 175),
    (df['max_heartrate'] >= 175) & (df['max_heartrate'] < 185),
    (df['max_heartrate'] >= 180)
]

max_heartrate_values = [5, 10, 15, 25, 30, 35]

df['max_heartrate_score'] = np.select(max_heartrate_conditions, max_heartrate_values)

In [13]:
# avg time per lap conditions and values
avg_time_per_lap_conditions = [
                               (df['avg_time_per_lap'] >= 0) & (df['avg_time_per_lap'] < 5),
                               (df['avg_time_per_lap'] >= 5) & (df['avg_time_per_lap'] < 10),
                               (df['avg_time_per_lap'] >= 10) & (df['avg_time_per_lap'] < 20),
                               (df['avg_time_per_lap'] >= 20) & (df['avg_time_per_lap'] < 30),
                               (df['avg_time_per_lap'] >= 30)
                               ]


avg_time_per_lap_values = [1, 5, 10, 15, 20]


df['avg_time_per_lap_score'] = np.select(avg_time_per_lap_conditions, avg_time_per_lap_values)


In [14]:
# lap count conditions and values
lap_count_conditions = [(df['lap_count'] >= 0) & (df['lap_count'] < 3),
                        (df['lap_count'] >= 3) & (df['lap_count'] < 4),
                        (df['lap_count'] >= 4) & (df['lap_count'] < 5),
                        (df['lap_count'] >= 5) & (df['lap_count'] < 6),
                        (df['lap_count'] >= 6)
                        ]

lap_count_values = [5, 10, 20, 25, 30]

df['lap_count_score'] = np.select(lap_count_conditions, lap_count_values)

In [54]:
# average speed conditions and values
avg_speed_conditions = [(df['average_speed'] == 0), # 1
                        (df['average_speed'] >= 0.1) & (df['average_speed'] < 1), # 5
                        (df['average_speed'] >= 1) & (df['average_speed'] < 2), # 15
                        (df['average_speed'] >= 2) & (df['average_speed'] < 3), # 20
                        (df['average_speed'] >= 3) & (df['average_speed'] < 3.5), # 25 
                        (df['average_speed'] >= 3.5) & (df['average_speed'] < 4), # 30
                        (df['average_speed'] >= 4) & (df['average_speed'] < 5), # 35
                        (df['average_speed'] >= 5)] # 40

avg_speed_values = [1, 5, 15, 20, 25, 30, 35, 40]


df['avg_speed_score'] = np.select(avg_speed_conditions, avg_speed_values)

In [55]:
# max speed conditions and values
max_speed_conditions = [(df['max_speed'] >= 0) & (df['max_speed'] < 0.5),
                        (df['max_speed'] >= 0.5) & (df['max_speed'] < 3),
                        (df['max_speed'] >= 3) & (df['max_speed'] < 4.5),
                        (df['max_speed'] >= 4.5) & (df['max_speed'] < 6.5),
                        (df['max_speed'] >= 6.5) & (df['max_speed'] < 10),
                        (df['max_speed'] >= 10)
                        ]

max_speed_values = [5, 10, 15, 20, 25, 30]


df['max_speed_score'] = np.select(max_speed_conditions, max_speed_values)

In [56]:
# Creating effort score dataframe
score_df = df[[
    'activity_id',
    'sport_type',
    'distance_score',
    'workout_time_score',
    'calorie_score',
    'total_elevation_gain_score',
    'average_heartrate_score',
    'max_heartrate_score',
    'avg_time_per_lap_score',
    'lap_count_score',
    'avg_speed_score',
    'max_speed_score']]


# Getting effort score for each workout
score_df['effort_score'] = score_df['distance_score'] + score_df['workout_time_score'] + score_df['calorie_score'] + score_df['total_elevation_gain_score'] + score_df['average_heartrate_score'] + score_df['max_heartrate_score'] + score_df['avg_time_per_lap_score'] +score_df['lap_count_score'] + score_df['avg_speed_score'] +score_df['max_speed_score']


score_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['effort_score'] = score_df['distance_score'] + score_df['workout_time_score'] + score_df['calorie_score'] + score_df['total_elevation_gain_score'] + score_df['average_heartrate_score'] + score_df['max_heartrate_score'] + score_df['avg_time_per_lap_score'] +score_df['lap_count_score'] + score_df['avg_speed_score'] +score_df['max_speed_score']


Unnamed: 0,activity_id,sport_type,distance_score,workout_time_score,calorie_score,total_elevation_gain_score,average_heartrate_score,max_heartrate_score,avg_time_per_lap_score,lap_count_score,avg_speed_score,max_speed_score,effort_score
0,8885873542,Run,10,35,40,5,20,15,5,30,0,15,175
1,8879705626,Functional-Cardio Workout,1,25,35,1,20,30,5,30,1,5,153
2,8875392009,Functional-Cardio Workout,1,10,15,1,20,30,5,10,1,5,98
3,8869251524,Functional-Cardio Workout,1,15,25,1,20,25,1,25,1,5,119
4,8862861001,Yoga,1,10,15,1,10,15,10,5,1,5,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,3517497833,Run,5,20,35,5,5,5,15,5,0,20,115
663,3516568449,Ride,15,30,5,5,5,5,20,5,30,30,150
664,3509004014,Run,5,30,40,5,5,5,20,5,0,20,135
665,3496090180,Run,5,25,35,5,5,5,20,5,10,20,135


In [57]:
# creating low, medium and high effort scores column
score_conditions = [(score_df['effort_score'] >= 0) & (score_df['effort_score'] < 120),
                    (score_df['effort_score'] >= 120) & (score_df['effort_score'] < 150),
                    (score_df['effort_score'] >= 150)]

score_values = [1,2,3]


score_df['effort_score_label'] = np.select(score_conditions, score_values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['effort_score_label'] = np.select(score_conditions, score_values)


In [58]:
score_df['effort_score_label'].value_counts()

3    258
2    245
1    164
Name: effort_score_label, dtype: int64

In [59]:
list(score_df.columns)

['activity_id',
 'sport_type',
 'distance_score',
 'workout_time_score',
 'calorie_score',
 'total_elevation_gain_score',
 'average_heartrate_score',
 'max_heartrate_score',
 'avg_time_per_lap_score',
 'lap_count_score',
 'avg_speed_score',
 'max_speed_score',
 'effort_score',
 'effort_score_label']

In [63]:
df.to_csv(r'C:\Users\Manuel Elizaldi\Desktop\Learning-Testing\PyStrava\Outputs\score_test.csv')

In [74]:
# Creating features df
feature_df = score_df[[
 'distance_score',
 'workout_time_score',
 'calorie_score',
 'total_elevation_gain_score',
 'average_heartrate_score',
 'max_heartrate_score',
 'avg_time_per_lap_score',
 'lap_count_score',
 'avg_speed_score',
 'max_speed_score',
 'effort_score',
 'effort_score_label']]

feature_df

Unnamed: 0,distance_score,workout_time_score,calorie_score,total_elevation_gain_score,average_heartrate_score,max_heartrate_score,avg_time_per_lap_score,lap_count_score,avg_speed_score,max_speed_score,effort_score,effort_score_label
0,10,35,40,5,20,15,5,30,0,15,175,3
1,1,25,35,1,20,30,5,30,1,5,153,3
2,1,10,15,1,20,30,5,10,1,5,98,1
3,1,15,25,1,20,25,1,25,1,5,119,1
4,1,10,15,1,10,15,10,5,1,5,73,1
...,...,...,...,...,...,...,...,...,...,...,...,...
662,5,20,35,5,5,5,15,5,0,20,115,1
663,15,30,5,5,5,5,20,5,30,30,150,3
664,5,30,40,5,5,5,20,5,0,20,135,2
665,5,25,35,5,5,5,20,5,10,20,135,2


In [77]:
# X -> Features, y -> labels
X = feature_df[[ 'distance_score',
 'workout_time_score',
 'calorie_score',
 'total_elevation_gain_score',
 'average_heartrate_score',
 'max_heartrate_score',
 'avg_time_per_lap_score',
 'lap_count_score',
 'avg_speed_score',
 'max_speed_score',
 'effort_score']]

y = feature_df[['effort_score_label']]

In [90]:
# dividing X, y into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

# training a KNN classifier
knn = KNeighborsClassifier(n_neighbors = 3).fit(X_train, y_train)

# accuracy on X_test
accuracy = knn.score(X_test, y_test)
accuracy

  knn = KNeighborsClassifier(n_neighbors = 3).fit(X_train, y_train)


0.9640718562874252

In [91]:
# creating a confusion matrix
knn_predictions = knn.predict(X_test) 
cm = confusion_matrix(y_test, knn_predictions)

cm

array([[43,  4,  0],
       [ 0, 65,  0],
       [ 0,  2, 53]], dtype=int64)

ValueError: If using all scalar values, you must pass an index