In [28]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load the daily-activity CSV into a DataFrame and preview its first rows.
csv_path = "dailyActivity_merged.csv"
data = pd.read_csv(csv_path)

first_rows = data.head()
print(first_rows)

           Id ActivityDate  TotalSteps  TotalDistance  TrackerDistance  \
0  1503960366    3/25/2016       11004           7.11             7.11   
1  1503960366    3/26/2016       17609          11.55            11.55   
2  1503960366    3/27/2016       12736           8.53             8.53   
3  1503960366    3/28/2016       13231           8.93             8.93   
4  1503960366    3/29/2016       12041           7.85             7.85   

   LoggedActivitiesDistance  VeryActiveDistance  ModeratelyActiveDistance  \
0                       0.0                2.57                      0.46   
1                       0.0                6.92                      0.73   
2                       0.0                4.66                      0.16   
3                       0.0                3.19                      0.79   
4                       0.0                2.16                      1.09   

   LightActiveDistance  SedentaryActiveDistance  VeryActiveMinutes  \
0                 4.07

In [29]:
# Count the missing values in every column to check whether the dataset has any nulls.
missing_per_column = data.isna().sum()
print(missing_per_column)

Id                          0
ActivityDate                0
TotalSteps                  0
TotalDistance               0
TrackerDistance             0
LoggedActivitiesDistance    0
VeryActiveDistance          0
ModeratelyActiveDistance    0
LightActiveDistance         0
SedentaryActiveDistance     0
VeryActiveMinutes           0
FairlyActiveMinutes         0
LightlyActiveMinutes        0
SedentaryMinutes            0
Calories                    0
dtype: int64


In [30]:
# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457 entries, 0 to 456
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        457 non-null    int64  
 1   ActivityDate              457 non-null    object 
 2   TotalSteps                457 non-null    int64  
 3   TotalDistance             457 non-null    float64
 4   TrackerDistance           457 non-null    float64
 5   LoggedActivitiesDistance  457 non-null    float64
 6   VeryActiveDistance        457 non-null    float64
 7   ModeratelyActiveDistance  457 non-null    float64
 8   LightActiveDistance       457 non-null    float64
 9   SedentaryActiveDistance   457 non-null    float64
 10  VeryActiveMinutes         457 non-null    int64  
 11  FairlyActiveMinutes       457 non-null    int64  
 12  LightlyActiveMinutes      457 non-null    int64  
 13  SedentaryMinutes          457 non-null    int64  
 14  Calories  

In [31]:
# Convert ActivityDate from text (month/day/year, e.g. "3/25/2016") to datetime
# so that the .dt accessor can be used on it later.
data["ActivityDate"] = pd.to_datetime(data["ActivityDate"],
                                      format="%m/%d/%Y")

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457 entries, 0 to 456
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Id                        457 non-null    int64         
 1   ActivityDate              457 non-null    datetime64[ns]
 2   TotalSteps                457 non-null    int64         
 3   TotalDistance             457 non-null    float64       
 4   TrackerDistance           457 non-null    float64       
 5   LoggedActivitiesDistance  457 non-null    float64       
 6   VeryActiveDistance        457 non-null    float64       
 7   ModeratelyActiveDistance  457 non-null    float64       
 8   LightActiveDistance       457 non-null    float64       
 9   SedentaryActiveDistance   457 non-null    float64       
 10  VeryActiveMinutes         457 non-null    int64         
 11  FairlyActiveMinutes       457 non-null    int64         
 12  LightlyActiveMinutes  

In [43]:
# Total tracked minutes per record: the sum of the four activity-level columns.
data["TotalMinutes"] = (
    data["VeryActiveMinutes"]
    + data["FairlyActiveMinutes"]
    + data["LightlyActiveMinutes"]
    + data["SedentaryMinutes"]
)
# A fixed random_state keeps the sampled rows identical on every re-run,
# so the displayed output is reproducible.
print(data["TotalMinutes"].sample(5, random_state=42))

num_individuals = data["Id"].nunique()  # Number of unique Ids in the data
print(f"Number of unique individuals: {num_individuals}")


290    1440
220    1440
239    1418
278    1015
224    1440
Name: TotalMinutes, dtype: int64
Number of unique individuals: 35


In [33]:
# Descriptive statistics (count, mean, min, quartiles, max, std) of the dataset.
summary_stats = data.describe()
print(summary_stats)

                 Id                   ActivityDate    TotalSteps  \
count  4.570000e+02                            457    457.000000   
mean   4.628595e+09  2016-04-04 04:40:26.258205696   6546.562363   
min    1.503960e+09            2016-03-12 00:00:00      0.000000   
25%    2.347168e+09            2016-04-02 00:00:00   1988.000000   
50%    4.057193e+09            2016-04-05 00:00:00   5986.000000   
75%    6.391747e+09            2016-04-08 00:00:00  10198.000000   
max    8.877689e+09            2016-04-12 00:00:00  28497.000000   
std    2.293781e+09                            NaN   5398.493064   

       TotalDistance  TrackerDistance  LoggedActivitiesDistance  \
count     457.000000       457.000000                457.000000   
mean        4.663523         4.609847                  0.179427   
min         0.000000         0.000000                  0.000000   
25%         1.410000         1.280000                  0.000000   
50%         4.090000         4.090000               

In [34]:
# How do the calories burned relate to the total steps walked in a day?
# Marker size and colour both encode VeryActiveMinutes, and an OLS trendline
# is overlaid on the points.
scatter_options = dict(
    x="Calories",
    y="TotalSteps",
    size="VeryActiveMinutes",
    color="VeryActiveMinutes",
    trendline="ols",
    title="Calories Burned vs. Total Steps",
    labels={"Calories": "Calories Burned", "TotalSteps": "Total Steps"},
    template="plotly_dark",
)
figure = px.scatter(data, **scatter_options)

# Larger title and visible grid lines on both axes.
figure.update_layout(
    title_font_size=20,
    xaxis_showgrid=True,
    yaxis_showgrid=True,
)

figure.show()

**There is a linear relationship between the total number of steps and the number of calories burned in a day**

In [35]:
# Average number of minutes per day spent at each activity level, as a donut chart.

minute_columns = ["VeryActiveMinutes", "FairlyActiveMinutes",
                  "LightlyActiveMinutes", "SedentaryMinutes"]
labels = ["Very Active Minutes", "Fairly Active Minutes",
          "Lightly Active Minutes", "Inactive Minutes"]
values = data[minute_columns].mean()
colors = ['gold', 'lightgreen', 'pink', 'royalblue']

# All trace-level settings are given up front on the Pie trace.
activity_pie = go.Pie(
    labels=labels,
    values=values,
    hole=0.3,                 # donut rather than a full pie
    pull=[0.1, 0.05, 0, 0],   # nudge the two smallest ("very"/"fairly" active) slices outwards
    marker=dict(colors=colors, line=dict(color='black', width=2)),
    hoverinfo="label+percent+value",
    textinfo="percent",
    textfont_size=18,
)
fig = go.Figure(data=[activity_pie])

# Title, theme, and a horizontal legend centred below the chart.
fig.update_layout(
    title=dict(text="Distribution of Activity Minutes", font=dict(size=20)),
    template="plotly_dark",
    legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="center", x=0.5),
)

fig.show()

On average, 83.3% of the tracked minutes in a day are inactive (sedentary).
14.2% of the minutes are lightly active.
Only 1.39% (about 16.62 minutes) are very active,
and 1.09% (about 13 minutes) are fairly active.

In [50]:
# Derive the weekday name of every record from ActivityDate and store it in a new "Day" column.
weekday_names = data["ActivityDate"].dt.day_name()
data["Day"] = weekday_names
print(data["Day"].head())

# How many records fall on each weekday?
day_counts = data["Day"].value_counts()
print(day_counts)
# There is an uneven distribution of days in the dataset.

0      Friday
1    Saturday
2      Sunday
3      Monday
4     Tuesday
Name: Day, dtype: object
Day
Saturday     75
Friday       73
Tuesday      73
Sunday       72
Monday       68
Wednesday    48
Thursday     48
Name: count, dtype: int64


In [52]:
# Activity minutes by day of the week.
#
# `data` holds one row per record, so passing its columns straight to go.Bar
# hands Plotly dozens of bars for every weekday category instead of a single
# summary value. Aggregate per weekday first. The mean is used rather than the
# sum because the weekdays are unevenly represented in the dataset, which
# would skew any total.
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday",
                 "Friday", "Saturday", "Sunday"]

# (source column, legend name, bar colour) for each activity level
activity_levels = [
    ("VeryActiveMinutes", "Very Active", "indigo"),
    ("FairlyActiveMinutes", "Fairly Active", "mediumseagreen"),
    ("LightlyActiveMinutes", "Lightly Active", "hotpink"),
]

avg_minutes_by_day = (
    data.groupby("Day")[[column for column, _, _ in activity_levels]]
    .mean()
    .reindex(weekday_order)  # calendar order instead of groupby's alphabetical order
)

fig = go.Figure()

# One bar trace per activity level, one bar per weekday in each trace
for column, trace_name, bar_color in activity_levels:
    fig.add_trace(go.Bar(
        x=avg_minutes_by_day.index,
        y=avg_minutes_by_day[column],
        name=trace_name,
        marker_color=bar_color
    ))

# Update layout
fig.update_layout(
    title_text="Daily Activity Minutes Breakdown",
    title_font_size=20,
    xaxis_title="Day of the Week",
    yaxis_title="Average Minutes",
    template="plotly_dark",
    barmode='group',
    xaxis_tickangle=-45,  # Tilt x-axis labels
    legend=dict(orientation="h", yanchor="bottom", y=-0.4, xanchor="center", x=0.5),
    bargap=0.2,  # Space between bars
    bargroupgap=0.1  # Space between grouped bars
)

fig.show()


In [53]:
# Average inactive (sedentary) minutes for each day of the week, as a pie chart.
# NOTE: this rebinds `day_counts`; from here on it holds the per-weekday mean
# of SedentaryMinutes (rounded to 2 decimals).
day_counts = (
    data.groupby("Day")["SedentaryMinutes"]
    .mean()
    .round(2)
)
labels = day_counts.index
values = day_counts.to_numpy()
colors = ['gold', 'lightgreen', 'pink', 'blue', 'skyblue', 'cyan', 'orange']

# All trace-level settings are given up front on the Pie trace.
sedentary_pie = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='label+value',
    textfont_size=20,
    marker=dict(colors=colors, line=dict(color='black', width=2)),
    pull=[0.05 for _ in labels],  # pull every slice out slightly to separate them
)
fig = go.Figure(data=[sedentary_pie])

# Dark theme with a centred title.
fig.update_layout(
    template="plotly_dark",
    title=dict(text='Inactive Minutes Daily', x=0.5),
)

fig.show()


On average, Tuesday is the most inactive day across all the individuals in the dataset.

*Note: The distribution of days in the dataset is uneven, so we used `.mean()` to compare the average inactive minutes for each day.*

In [54]:
# Average calories burned on each day of the week, as a pie chart.

# Mean (not sum) of Calories per weekday, rounded to 2 decimals.
calories_per_day = (
    data.groupby("Day")["Calories"]
    .mean()
    .round(2)
)
labels = calories_per_day.index
values = calories_per_day.to_numpy()
colors = ['gold', 'lightgreen', 'pink', 'blue', 'skyblue', 'cyan', 'orange']

# All trace-level settings are given up front on the Pie trace.
calories_pie = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='label+value',
    textfont_size=20,
    marker=dict(colors=colors, line=dict(color='black', width=2)),
    pull=[0.05 for _ in labels],  # pull every slice out slightly to separate them
)
fig = go.Figure(data=[calories_pie])

# Dark theme, centred title, and a larger base font.
fig.update_layout(
    template="plotly_dark",
    title=dict(text='Calories Burned Daily', x=0.5),
    font=dict(size=16),
)

fig.show()


Wednesday is one of the most active days for the individuals in the dataset: on average, the highest number of calories was burned on Wednesday.