In [2]:
import os
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np

In [3]:
dataframes = {}
data_folder = 'data_cleaned/'
for filename in os.listdir(data_folder):
    if filename.endswith('.csv'):
        file_path = os.path.join(data_folder, filename)
        df = pd.read_csv(file_path, low_memory=False)
        dataframes[filename[:-4]] = df
exercises = dataframes['processed_exercises_v2']
exercises.head(2)


Unnamed: 0,CloudId,Gender,Age,PhysicalActivityMacroTypeName,ExerciseName,EquipmentName,DoneOnUTC,Duration_sec,Calories,MetsMin,ExerciseBodyPartName,ExerciseMusclesName,METs,EquipmentCategory,Week,Year,Date
0,ed958cce9812f1b8607f8c8ab5836c1f450f858b,M,13,Cardio,Custom exercise in time,Bike,2022-01-23 10:00:31.425000+00:00,215,30.0,5.6,LowerBody,,,Cardiovascular,3,2022,2022-01-23
1,ed958cce9812f1b8607f8c8ab5836c1f450f858b,M,13,Cardio,GOAL exercise in time,Run,2022-01-23 10:09:01.153000+00:00,215,29.0,5.3,TotalBody,,,Cardiovascular,3,2022,2022-01-23


In [4]:
exercises.isnull().sum()

CloudId                                0
Gender                                 0
Age                                    0
PhysicalActivityMacroTypeName          0
ExerciseName                           0
EquipmentName                          3
DoneOnUTC                              0
Duration_sec                           0
Calories                               3
MetsMin                          2456592
ExerciseBodyPartName                   0
ExerciseMusclesName               863014
METs                              870238
EquipmentCategory                      3
Week                                   0
Year                                   0
Date                                   0
dtype: int64

### Extract aggregated features for exercises

##### Extract year, week and date

In [5]:
df = exercises
df['DoneOnUTC'] = pd.to_datetime(df['DoneOnUTC'], format='mixed')
df['Week'] = df['DoneOnUTC'].dt.isocalendar().week
df['Year'] = df['DoneOnUTC'].dt.year
df['Date'] = df['DoneOnUTC'].dt.date

In [6]:
df_backup = df.copy()
df.drop(columns=['ExerciseName', 'EquipmentName', 'MetsMin', 'METs', 'ExerciseMusclesName'], inplace=True)
df.sample(3)

Unnamed: 0,CloudId,Gender,Age,PhysicalActivityMacroTypeName,DoneOnUTC,Duration_sec,Calories,ExerciseBodyPartName,EquipmentCategory,Week,Year,Date
449842,576a5558c83ee4d4b38a591b120009e6224024f6,M,35,Isotonic,2022-02-08 17:20:37.462000+00:00,90,10.0,LowerBody,Strength Training,6,2022,2022-02-08
2118803,cc3a5545bfa0840ffa4211945832d9600e7ce7a9,M,36,Isotonic,2022-06-21 09:51:11.692000+00:00,219,18.0,UpperBody,Free Weights,25,2022,2022-06-21
320668,ec1bc1956f044b14410e367586b62db7964148dc,F,49,Isotonic,2022-02-12 09:30:18.310000+00:00,359,28.0,UpperBody,Strength Training,6,2022,2022-02-12


In [7]:
# Define a function to fill missing calories
def fill_missing_calories(row):
    if pd.isnull(row['Calories']):
        # Filter the dataframe to find similar rows
        similar_rows = df[
            (df['EquipmentCategory'] == row['EquipmentCategory']) &
            (df['ExerciseBodyPartName'] == row['ExerciseBodyPartName']) &
            (df['Duration_sec'].between(row['Duration_sec'] - 300, row['Duration_sec'] + 300))
        ]
        # Calculate the mean calories of the similar rows
        mean_calories = similar_rows['Calories'].mean()
        return mean_calories
    else:
        return row['Calories']
df['Calories'] = df.apply(fill_missing_calories, axis=1)

### Just here !!!

In [8]:
df['EquipmentCategory'].fillna('Cardiovascular', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['EquipmentCategory'].fillna('Cardiovascular', inplace=True)


In [9]:
df.isnull().sum()

CloudId                          0
Gender                           0
Age                              0
PhysicalActivityMacroTypeName    0
DoneOnUTC                        0
Duration_sec                     0
Calories                         0
ExerciseBodyPartName             0
EquipmentCategory                0
Week                             0
Year                             0
Date                             0
dtype: int64

In [10]:
# One-Hot Encode Categorical Columns
encoded_columns = pd.get_dummies(df[['PhysicalActivityMacroTypeName', 'EquipmentCategory', 'ExerciseBodyPartName']], drop_first=False)
# Concatenate the Encoded Columns with the Original DataFrame
df = pd.concat([df, encoded_columns], axis=1)
# Drop the original categorical columns as they are now encoded
df = df.drop(columns=['PhysicalActivityMacroTypeName', 'EquipmentCategory', 'ExerciseBodyPartName'])
df.head()

Unnamed: 0,CloudId,Gender,Age,DoneOnUTC,Duration_sec,Calories,Week,Year,Date,PhysicalActivityMacroTypeName_Cardio,PhysicalActivityMacroTypeName_Isotonic,EquipmentCategory_Cardiovascular,EquipmentCategory_Flexibility & Mobility,EquipmentCategory_Free Weights,EquipmentCategory_Functional Training,EquipmentCategory_Strength Training,ExerciseBodyPartName_Core,ExerciseBodyPartName_LowerBody,ExerciseBodyPartName_TotalBody,ExerciseBodyPartName_UpperBody
0,ed958cce9812f1b8607f8c8ab5836c1f450f858b,M,13,2022-01-23 10:00:31.425000+00:00,215,30.0,3,2022,2022-01-23,True,False,True,False,False,False,False,False,True,False,False
1,ed958cce9812f1b8607f8c8ab5836c1f450f858b,M,13,2022-01-23 10:09:01.153000+00:00,215,29.0,3,2022,2022-01-23,True,False,True,False,False,False,False,False,False,True,False
2,ed958cce9812f1b8607f8c8ab5836c1f450f858b,M,13,2022-01-23 10:28:49.724000+00:00,85,9.0,3,2022,2022-01-23,False,True,False,False,False,False,True,False,True,False,False
3,56a55ec8f237732fff2239735b469e346c8a5f6b,M,17,2022-01-23 08:52:44.240000+00:00,600,90.0,3,2022,2022-01-23,True,False,True,False,False,False,False,False,False,True,False
4,926bd1a7bbf9ccd40101befc6d409e2aa5979fd2,F,17,2022-01-23 17:13:38.840000+00:00,660,126.0,3,2022,2022-01-23,True,False,True,False,False,False,False,False,False,True,False


### TODO

Check whether it is usual to do multiple workouts a day

### TODO

Keep the part of the day that the workout has been performed (morning, evening, afternoon)

### Create workout sessions

We define a workout session as follows: All the exercises that one user has performed during 1 day

For a daily workoout we calculate:
- Total workout duration
- Total Calories burnt
- Total cardio calories
- Total cardio duration
- Total isotonic calories
- Total isotonic duration
- Duration for each body part category exercise (5 durations/workout)
- Duration for each equipment category exercise (4 durations/workout)

Also keep the gender and the age

In [12]:
df.columns

Index(['CloudId', 'Gender', 'Age', 'DoneOnUTC', 'Duration_sec', 'Calories',
       'Week', 'Year', 'Date', 'PhysicalActivityMacroTypeName_Cardio',
       'PhysicalActivityMacroTypeName_Isotonic',
       'EquipmentCategory_Cardiovascular',
       'EquipmentCategory_Flexibility & Mobility',
       'EquipmentCategory_Free Weights',
       'EquipmentCategory_Functional Training',
       'EquipmentCategory_Strength Training', 'ExerciseBodyPartName_Core',
       'ExerciseBodyPartName_LowerBody', 'ExerciseBodyPartName_TotalBody',
       'ExerciseBodyPartName_UpperBody'],
      dtype='object')

In [None]:
df['WorkoutId'] = df['CloudId'] + df['Date'].astype(str)

# Group by WorkoutId to calculate metrics per workout
workout_metrics = df.groupby(['CloudId', 'Date', 'WorkoutId']).agg(
    total_workout_duration=('Duration_sec', 'sum'),
    total_calories_burnt=('Calories', 'sum'),
    total_cardio_calories=('Calories', lambda x: x[df['PhysicalActivityMacroTypeName_Cardio']].sum()),
    total_cardio_duration=('Duration_sec', lambda x: x[df['PhysicalActivityMacroTypeName_Cardio']].sum()),
    total_isotonic_calories=('Calories', lambda x: x[df['PhysicalActivityMacroTypeName_Isotonic']].sum()),
    total_isotonic_duration=('Duration_sec', lambda x: x[df['PhysicalActivityMacroTypeName_Isotonic']].sum()),
    duration_core=('Duration_sec', lambda x: x[df['ExerciseBodyPartName_Core']].sum()),
    duration_lower_body=('Duration_sec', lambda x: x[df['ExerciseBodyPartName_LowerBody']].sum()),
    duration_total_body=('Duration_sec', lambda x: x[df['ExerciseBodyPartName_TotalBody']].sum()),
    duration_upper_body=('Duration_sec', lambda x: x[df['ExerciseBodyPartName_UpperBody']].sum()),
    duration_cardiovascular=('Duration_sec', lambda x: x[df['EquipmentCategory_Cardiovascular']].sum()),
    duration_flexibility_mobility=('Duration_sec', lambda x: x[df['EquipmentCategory_Flexibility & Mobility']].sum()),
    duration_free_weights=('Duration_sec', lambda x: x[df['EquipmentCategory_Free Weights']].sum()),
    duration_functional_training=('Duration_sec', lambda x: x[df['EquipmentCategory_Functional Training']].sum()),
    duration_strength_training=('Duration_sec', lambda x: x[df['EquipmentCategory_Strength Training']].sum())
).reset_index()

workout_metrics['Intensity'] = workout_metrics['total_calories_burnt'] / workout_metrics['total_workout_duration']

# store workout_metrics to a csv file
workout_metrics.to_csv('data_cleaned/workout_metrics.csv', index=False)


### Process workouts

In [17]:

file = 'data_cleaned/workout_metrics.csv'
workout_metrics = pd.read_csv(file)
workout_metrics


Unnamed: 0,CloudId,Date,WorkoutId,total_workout_duration,total_calories_burnt,total_cardio_calories,total_cardio_duration,total_isotonic_calories,total_isotonic_duration,duration_core,duration_lower_body,duration_total_body,duration_upper_body,duration_cardiovascular,duration_flexibility_mobility,duration_free_weights,duration_functional_training,duration_strength_training,Intensity
0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022-01-15,0015d65e3205deb6bb6a8f0d57cc48547918f0f22022-0...,3437,485.0,180.0,740,305.0,2697,306,608,740,1783,740,0,1314,0,1383,0.141111
1,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022-01-31,0015d65e3205deb6bb6a8f0d57cc48547918f0f22022-0...,4202,562.0,280.0,1680,282.0,2522,306,1456,900,1540,1680,0,1314,0,1208,0.133746
2,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022-02-02,0015d65e3205deb6bb6a8f0d57cc48547918f0f22022-0...,3986,537.0,255.0,1500,282.0,2486,306,1235,900,1545,1500,0,1314,0,1172,0.134722
3,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022-02-05,0015d65e3205deb6bb6a8f0d57cc48547918f0f22022-0...,3863,497.0,215.0,1325,282.0,2538,306,1281,725,1551,1325,0,1314,0,1224,0.128656
4,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022-02-07,0015d65e3205deb6bb6a8f0d57cc48547918f0f22022-0...,4374,569.0,287.0,1440,282.0,2934,306,1008,1440,1620,1440,0,1314,0,1620,0.130087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438045,fff39b07939109e856b411ff8dd9b22da1f1ff2e,2022-12-21,fff39b07939109e856b411ff8dd9b22da1f1ff2e2022-1...,848,137.0,0.0,0,137.0,848,133,179,0,536,0,0,0,0,848,0.161557
438046,fff39b07939109e856b411ff8dd9b22da1f1ff2e,2022-12-22,fff39b07939109e856b411ff8dd9b22da1f1ff2e2022-1...,2205,288.0,129.0,1200,159.0,1005,177,1392,0,636,1200,0,0,0,1005,0.130612
438047,fff39b07939109e856b411ff8dd9b22da1f1ff2e,2022-12-23,fff39b07939109e856b411ff8dd9b22da1f1ff2e2022-1...,1625,213.0,92.0,900,121.0,725,0,1090,0,535,900,0,0,0,725,0.131077
438048,fff39b07939109e856b411ff8dd9b22da1f1ff2e,2022-12-27,fff39b07939109e856b411ff8dd9b22da1f1ff2e2022-1...,2409,309.0,156.0,1500,153.0,909,193,1697,0,519,1500,0,0,0,909,0.128269


In [18]:
exercises = dataframes['processed_exercises_v2']

In [19]:

# Define the chunk size
chunk_size = 100000

# Initialize an empty list to store the merged chunks
merged_chunks = []

# Load the workout_metrics dataframe in chunks
for chunk in pd.read_csv('data_cleaned/workout_metrics.csv', chunksize=chunk_size):
    # Merge the chunk with the exercises dataframe to get age and gender
    merged_chunk = pd.merge(chunk, exercises[['CloudId', 'Gender', 'Age']].drop_duplicates(), on='CloudId', how='left')
    # Append the merged chunk to the list
    merged_chunks.append(merged_chunk)

# Concatenate all the merged chunks
merged_workout_metrics = pd.concat(merged_chunks, ignore_index=True)

# Display the merged dataframe
merged_workout_metrics.head()

Unnamed: 0,CloudId,Date,WorkoutId,total_workout_duration,total_calories_burnt,total_cardio_calories,total_cardio_duration,total_isotonic_calories,total_isotonic_duration,duration_core,...,duration_total_body,duration_upper_body,duration_cardiovascular,duration_flexibility_mobility,duration_free_weights,duration_functional_training,duration_strength_training,Intensity,Gender,Age
0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022-01-15,0015d65e3205deb6bb6a8f0d57cc48547918f0f22022-0...,3437,485.0,180.0,740,305.0,2697,306,...,740,1783,740,0,1314,0,1383,0.141111,M,56
1,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022-01-31,0015d65e3205deb6bb6a8f0d57cc48547918f0f22022-0...,4202,562.0,280.0,1680,282.0,2522,306,...,900,1540,1680,0,1314,0,1208,0.133746,M,56
2,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022-02-02,0015d65e3205deb6bb6a8f0d57cc48547918f0f22022-0...,3986,537.0,255.0,1500,282.0,2486,306,...,900,1545,1500,0,1314,0,1172,0.134722,M,56
3,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022-02-05,0015d65e3205deb6bb6a8f0d57cc48547918f0f22022-0...,3863,497.0,215.0,1325,282.0,2538,306,...,725,1551,1325,0,1314,0,1224,0.128656,M,56
4,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022-02-07,0015d65e3205deb6bb6a8f0d57cc48547918f0f22022-0...,4374,569.0,287.0,1440,282.0,2934,306,...,1440,1620,1440,0,1314,0,1620,0.130087,M,56


In [27]:
merged_workout_metrics.to_csv('data_cleaned/merged_workout_metrics.csv', index=False)

The final dataframe has the following metrics per user and per week:

- Total calories
- Total exercise minutes
- Total cardio calories
- Total cardio exercise minutes

- Total isotonic calories
- Total isotonic exercise minutes,

- Total upper body exercise minutes
- Total upper body calories

- Total lower body exercise minutes
- Total lower body calories

- Total core body exercise minutes
- Total core body calories

- Total total body exercise minutes
- Total total body calories

- avg duration per workout
- avg calories per workout
- Average mets min
- avg isotonic workouts
- avg cardio workouts
- avg upper workouts
- avg lower workouts
- avg core workouts
- avg total body workouts

#### Questions

- To get reliable workout plans should I consider only users with consistent workouts? Eg train for some consecutive months?
- How to do the clustering --> there are some similar exercises - exercises that train similar body parts (lowerbody, upperbody etc). Group them so that workout plans can be extracted. Decide a time range for a specific exercise. From this grouping end up with distinct categories of exercises that are more manageable in number, lower, upper body could be helpful. Put the time that they dedicated to each category in bins so that we can group them. Eg lowerbody -> 10-20min, upperbody -> 30min, cardio -> 10min. Then try to do clustering using those characteristics to get bigger clusters
- Extract broader categories for both muscles and define propertirs of equipment (free weights, body weight etc)
- Extract aggregated features along with previous features (avg isotonic per week)
- Cluster users based on all the new and the old features in order to create workout categories (eg a user who goes to the gym 4 times a week, does 20 minutes of cardio every day)