In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Load the merged daily-activity export. The relative path is resolved against
# the notebook's current working directory.
df_daily_activity_merged = pd.read_csv(filepath_or_buffer="dailyActivity_merged.csv")

In [None]:
# NOTE(review): this cell has no execution count and repeats the load done in
# the preceding cell. It looks like a leftover and is a candidate for deletion.
df_daily_activity_merged = pd.read_csv('dailyActivity_merged.csv')

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
df_daily_activity_merged.head(n=5)

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,4/12/2016,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,36,10,221,773,1863


In [4]:
# Print each column's name, non-null count and dtype for the loaded frame.
df_daily_activity_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        940 non-null    int64  
 1   ActivityDate              940 non-null    object 
 2   TotalSteps                940 non-null    int64  
 3   TotalDistance             940 non-null    float64
 4   TrackerDistance           940 non-null    float64
 5   LoggedActivitiesDistance  940 non-null    float64
 6   VeryActiveDistance        940 non-null    float64
 7   ModeratelyActiveDistance  940 non-null    float64
 8   LightActiveDistance       940 non-null    float64
 9   SedentaryActiveDistance   940 non-null    float64
 10  VeryActiveMinutes         940 non-null    int64  
 11  FairlyActiveMinutes       940 non-null    int64  
 12  LightlyActiveMinutes      940 non-null    int64  
 13  SedentaryMinutes          940 non-null    int64  
 14  Calories  

In [5]:
# Columns used as clustering features: daily steps, very-active distance and
# calories.
feature_columns = ["TotalSteps", "VeryActiveDistance", "Calories"]
subset = df_daily_activity_merged[feature_columns]

In [6]:
# Also imported in the first cell; kept so this cell binds the name itself.
from sklearn.preprocessing import StandardScaler

# Standardise every feature column so that columns measured on large scales
# (steps, calories) do not dominate the distances used by the clustering step.
# The fitted scaler is kept in `scaler` for later reuse.
scaler = StandardScaler().fit(subset)
scaled_data = scaler.transform(subset)

In [7]:
# Also imported in the first cell; kept so this cell binds the name itself.
from sklearn.cluster import KMeans

# Partition the standardised observations into three groups.
num_clusters = 3

# `n_init` is pinned explicitly: scikit-learn changed its default from 10 to
# "auto" in version 1.4, so leaving it implicit makes the resulting cluster ids
# (and everything derived from them) depend on the installed version.
# `random_state` fixes the centroid initialisation for reproducibility.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

# One integer cluster id per row of `scaled_data`.
cluster_labels = kmeans.fit_predict(scaled_data)



In [8]:
# Attach the raw integer cluster id of every row to the activity frame.
# NOTE(review): this mutates the loaded frame in place, and a later cell
# overwrites the same column with descriptive names.
df_daily_activity_merged["ActivityLevel"] = cluster_labels

In [9]:
import plotly.express as px

# 3-D scatter of the three clustering features, coloured by the value stored in
# the ActivityLevel column.
fig = px.scatter_3d(
    data_frame=df_daily_activity_merged,
    x="TotalSteps",
    y="VeryActiveDistance",
    z="Calories",
    color="ActivityLevel",
    title="Clustering Visualization",
    template="plotly_dark",
)

# Small, semi-transparent markers with a dark outline.
marker_style = dict(
    size=3,
    opacity=0.6,
    line=dict(width=2, color="DarkSlateGrey"),
)
fig.update_traces(marker=marker_style)

# Label the axes and draw the scene as a cube so no axis is visually stretched.
scene_settings = dict(
    xaxis_title="TotalSteps",
    yaxis_title="VeryActiveDistance",
    zaxis_title="Calories",
    aspectmode="cube",
)
fig.update_layout(scene=scene_settings)

fig.show()

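As the plot shows, the clusters are still identified only by integer ids, so descriptive labels have to be assigned manually. K-Means numbers the clusters according to how they happened to form, so the ids are arbitrary and do not necessarily line up with an intuitive ordering such as "low", "medium" and "high" activity.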

In [21]:
# Average every numeric column within each ActivityLevel group.
# NOTE(review): in a top-to-bottom run ActivityLevel still holds the raw integer
# cluster ids here, so this table is what justifies the id -> name mapping
# applied in the following cell. The recorded output shows names, which suggests
# the cell was last executed out of order - confirm with Restart & Run All.
summary_columns = ['TotalSteps', 'TotalDistance', 'VeryActiveMinutes']
grouped_by_level = df_daily_activity_merged.groupby('ActivityLevel')
cluster_activity_means = grouped_by_level.mean(numeric_only=True)

# Show only the columns that characterise how active each group is.
print(cluster_activity_means[summary_columns])

                     TotalSteps  TotalDistance  VeryActiveMinutes
ActivityLevel                                                    
Highly Active      17534.225806      14.149677         104.919355
Moderately Active  10316.951163       7.354651          27.674419
Not Very Active     3696.930804       2.501205           3.325893


In [11]:
# K-Means numbers its clusters arbitrarily, so a fixed {id: name} table would
# silently mislabel the groups whenever the ids come out in a different order
# (different data, seed or scikit-learn defaults). Instead, rank the clusters by
# their mean daily step count and name them from least to most active.
# Ranking on TotalSteps alone is a heuristic; the clusters themselves were
# formed on three features.
level_names = ["Not Very Active", "Moderately Active", "Highly Active"]
mean_steps_by_cluster = (
    df_daily_activity_merged["TotalSteps"]
    .groupby(cluster_labels)
    .mean()
    .sort_values()
)

# Fail loudly if the number of clusters no longer matches the number of names.
assert len(mean_steps_by_cluster) == len(level_names), (
    "expected exactly one activity-level name per cluster"
)

# Mapping from integer cluster id to activity-level name.
labels = {
    int(cluster_id): level_name
    for cluster_id, level_name in zip(mean_steps_by_cluster.index, level_names)
}

# Map cluster labels to activity levels (one name per row, in row order).
activity_levels = [labels[int(label)] for label in cluster_labels]

# Replace the 'ActivityLevel' column of the original DataFrame with the names.
df_daily_activity_merged['ActivityLevel'] = activity_levels

In [12]:
# Average every numeric column within each named activity level.
# `numeric_only=True` is required: the frame also holds the object-dtype
# ActivityDate column, which cannot be averaged. Omitting the argument emits a
# deprecation warning on pandas 1.5 and raises on pandas 2.x.
cluster_activity_means = df_daily_activity_merged.groupby('ActivityLevel').mean(numeric_only=True)

# Print the mean values for the columns that characterise each level
print(cluster_activity_means[['TotalSteps', 'TotalDistance', 'VeryActiveMinutes']])

                     TotalSteps  TotalDistance  VeryActiveMinutes
ActivityLevel                                                    
Highly Active      17534.225806      14.149677         104.919355
Moderately Active  10316.951163       7.354651          27.674419
Not Very Active     3696.930804       2.501205           3.325893



The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [13]:
import plotly.express as px

# 3-D scatter of the clustering features, coloured by activity level.
# This assumes ActivityLevel holds the descriptive names assigned by the mapping
# cell, so Plotly Express treats it as categorical: one trace per level, taking
# its colour from `color_discrete_sequence`, with a legend instead of a colour
# bar. There is therefore no colour axis to configure.
fig = px.scatter_3d(
    df_daily_activity_merged,
    x="TotalSteps",
    y="VeryActiveDistance",
    z="Calories",
    color="ActivityLevel",
    title="Clustering Visualization",
    template="plotly_dark",
    color_discrete_sequence=["blue", "red", "green"],
)

# Small, semi-transparent markers with a dark outline.
fig.update_traces(
    marker=dict(size=3, opacity=0.6, line=dict(width=2, color="DarkSlateGrey"))
)

fig.update_layout(
    # Label the axes and render the scene as a cube.
    scene=dict(
        xaxis_title="TotalSteps",
        yaxis_title="VeryActiveDistance",
        zaxis_title="Calories",
        aspectmode="cube",
    ),
    # Title the legend, list its entries in reverse trace order and keep the
    # legend symbols at a constant size regardless of the marker size.
    legend=dict(traceorder="reversed", title="Activity Level", itemsizing="constant"),
)

fig.show()


### Preliminary Fitness Clustering Analysis

In our initial fitness clustering analysis, we utilized the features "TotalSteps," "VeryActiveDistance," and "Calories" to evaluate Fitbit users' general fitness and activity levels. While these features provide valuable insights into physical activity and calorie expenditure, it's important to note that the "Calories" feature may introduce bias into our clustering results.

Calorie expenditure is influenced by factors other than activity, such as metabolic rate, body size, and gender. As a result, users with similar activity levels but different physical characteristics may be assigned to separate clusters. To mitigate this potential bias and achieve more accurate fitness clustering, we plan to incorporate additional data, including user-specific information like age, weight, and gender. These variables will allow us to normalize calorie data based on individual characteristics, providing a fairer assessment of fitness levels.

Our analysis is ongoing, and we are committed to refining our methodology. For now, we have used the "Calories" feature as a starting point, recognizing its limitations but appreciating its significance in capturing overall physical activity.

In [20]:
import plotly.express as px

# Create the 3D scatter plot of the clustering features, coloured by the
# ActivityLevel column.
fig = px.scatter_3d(
    df_daily_activity_merged,
    x="TotalSteps",
    y="VeryActiveDistance",
    z="Calories",
    color="ActivityLevel",
    template="plotly_dark",
    color_discrete_sequence=["blue", "red", "green"],
)

# Small, semi-transparent markers; a zero-width outline removes the rings
# around the data points.
fig.update_traces(marker=dict(size=3, opacity=0.6, line=dict(width=0)))

# All layout settings in a single call. The figure size is fixed at 1200x900
# (autosize off) so that a downloaded PNG comes out at a higher resolution.
fig.update_layout(
    scene=dict(
        xaxis_title="TotalSteps",
        yaxis_title="VeryActiveDistance",
        zaxis_title="Calories",
        aspectmode="cube",
    ),
    title="Clustering Visualization",
    legend=dict(
        title="Activity Level",
        itemsizing="constant",
    ),
    autosize=False,
    width=1200,
    height=900,
    margin=dict(l=0, r=0, b=0, t=30),
)

# Show the plot
fig.show()
