In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# FitBit Tracker Insights: Unveiling Trends in Wellness Technology Consumer Behavior

![](https://th.bing.com/th/id/R.a3e06837f29dd2f6ac5fffeb4dd76323?rik=TA3NHdXufVkT%2fA&pid=ImgRaw&r=0)

The Bellabeat case study presents a compelling opportunity to delve into the world of wellness technology and consumer behavior analysis. As a junior data analyst at Bellabeat, I'll be tasked with unraveling trends in smart device usage, particularly focusing on how consumers interact with non-Bellabeat smart devices. Through this analysis, I aim to unearth insights that can inform Bellabeat's marketing strategy, helping the company capitalize on growth opportunities in the global smart device market.

Employing a structured approach encompassing the data analysis process stages of ask, prepare, process, analyze, share, and act, I will navigate through the dataset, extract meaningful patterns, and translate them into actionable recommendations. With a meticulous eye for detail and a knack for storytelling through data, I will craft a comprehensive report containing clear summaries, detailed documentation, insightful visualizations, and high-level content recommendations.

By leveraging public data sources such as the FitBit Fitness Tracker Data and employing rigorous data cleaning and analysis techniques, I aim to provide Bellabeat with invaluable insights that can steer their marketing strategies towards success. As I embark on this capstone project, I look forward to honing my analytical skills, gaining practical experience, and delivering impactful results that contribute to Bellabeat's continued growth and success in the wellness technology industry.

In [1]:
# Load necessary libraries
library(tidyverse)
library(lubridate)  # For date manipulation
library(knitr)      # For better table presentation
library(ggplot2)    # For data visualization

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Folder under the Kaggle input mount that holds the Fitabase CSV export.
base_path <- "/kaggle/input/fitbit/mturkfitbit_export_4.12.16-5.12.16/Fitabase Data 4.12.16-5.12.16"

# Read the three tables used in this notebook. Each file name is resolved
# against base_path and parsed with readr's default column-type guessing.
daily_activity <- base_path %>%
  file.path("dailyActivity_merged.csv") %>%
  read_csv()

daily_sleep <- base_path %>%
  file.path("sleepDay_merged.csv") %>%
  read_csv()

weight_info <- base_path %>%
  file.path("weightLogInfo_merged.csv") %>%
  read_csv()


[1mRows: [22m[34m940[39m [1mColumns: [22m[34m15[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): ActivityDate
[32mdbl[39m (14): Id, TotalSteps, TotalDistance, TrackerDistance, LoggedActivitiesDi...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m413[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (1): SleepDay
[32mdbl[39m (4): Id, TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m67[39m [1mCol

In [3]:
# Install janitor only when it is missing, so re-running the notebook does not
# reinstall the package every time; then attach it for clean_names().
if (!requireNamespace("janitor", quietly = TRUE)) {
  install.packages("janitor")
}
library(janitor)

# clean_names() already rewrites every column name to lower snake_case
# (e.g. TotalSteps -> total_steps), so no extra tolower() pass is needed.

# Activity table
daily_activity <- clean_names(daily_activity)
head(daily_activity)

# Sleep table
daily_sleep <- clean_names(daily_sleep)
head(daily_sleep)

# Weight log table
weight_info <- clean_names(weight_info)
head(weight_info)


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)


Attaching package: ‘janitor’


The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test




id,activity_date,total_steps,total_distance,tracker_distance,logged_activities_distance,very_active_distance,moderately_active_distance,light_active_distance,sedentary_active_distance,very_active_minutes,fairly_active_minutes,lightly_active_minutes,sedentary_minutes,calories
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1503960366,4/12/2016,13162,8.5,8.5,0,1.88,0.55,6.06,0,25,13,328,728,1985
1503960366,4/13/2016,10735,6.97,6.97,0,1.57,0.69,4.71,0,21,19,217,776,1797
1503960366,4/14/2016,10460,6.74,6.74,0,2.44,0.4,3.91,0,30,11,181,1218,1776
1503960366,4/15/2016,9762,6.28,6.28,0,2.14,1.26,2.83,0,29,34,209,726,1745
1503960366,4/16/2016,12669,8.16,8.16,0,2.71,0.41,5.04,0,36,10,221,773,1863
1503960366,4/17/2016,9705,6.48,6.48,0,3.19,0.78,2.51,0,38,20,164,539,1728


id,sleep_day,total_sleep_records,total_minutes_asleep,total_time_in_bed
<dbl>,<chr>,<dbl>,<dbl>,<dbl>
1503960366,4/12/2016 12:00:00 AM,1,327,346
1503960366,4/13/2016 12:00:00 AM,2,384,407
1503960366,4/15/2016 12:00:00 AM,1,412,442
1503960366,4/16/2016 12:00:00 AM,2,340,367
1503960366,4/17/2016 12:00:00 AM,1,700,712
1503960366,4/19/2016 12:00:00 AM,1,304,320


id,date,weight_kg,weight_pounds,fat,bmi,is_manual_report,log_id
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<dbl>
1503960366,5/2/2016 11:59:59 PM,52.6,115.9631,22.0,22.65,True,1462234000000.0
1503960366,5/3/2016 11:59:59 PM,52.6,115.9631,,22.65,True,1462320000000.0
1927972279,4/13/2016 1:08:52 AM,133.5,294.3171,,47.54,False,1460510000000.0
2873212765,4/21/2016 11:59:59 PM,56.7,125.0021,,21.45,True,1461283000000.0
2873212765,5/12/2016 11:59:59 PM,57.3,126.3249,,21.69,True,1463098000000.0
4319703577,4/17/2016 11:59:59 PM,72.4,159.6147,25.0,27.45,True,1460938000000.0


In [4]:
# Turn the raw date/time strings into typed columns.

# Activity: "m/d/Y" text -> Date, stored in a new `date` column.
daily_activity <- daily_activity %>%
  mutate(date = lubridate::date(parse_date_time(activity_date, "%m/%d/%Y")))
head(daily_activity)


# Sleep: "m/d/Y h:m:s AM/PM" text -> date-time (`date_time`) plus its calendar day (`date`).
daily_sleep <- daily_sleep %>%
  mutate(
    date_time = parse_date_time(sleep_day, "%m/%d/%Y %I:%M:%S %p"),
    date = lubridate::date(date_time)
  )
head(daily_sleep)

# Weight: same text format; the original `date` string column is replaced in
# place by the parsed calendar day, and the full timestamp is kept in `date_time`.
weight_info <- weight_info %>%
  mutate(
    date_time = parse_date_time(date, "%m/%d/%Y %I:%M:%S %p"),
    date = lubridate::date(date_time)
  )
head(weight_info)


id,activity_date,total_steps,total_distance,tracker_distance,logged_activities_distance,very_active_distance,moderately_active_distance,light_active_distance,sedentary_active_distance,very_active_minutes,fairly_active_minutes,lightly_active_minutes,sedentary_minutes,calories,date
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<date>
1503960366,4/12/2016,13162,8.5,8.5,0,1.88,0.55,6.06,0,25,13,328,728,1985,2016-04-12
1503960366,4/13/2016,10735,6.97,6.97,0,1.57,0.69,4.71,0,21,19,217,776,1797,2016-04-13
1503960366,4/14/2016,10460,6.74,6.74,0,2.44,0.4,3.91,0,30,11,181,1218,1776,2016-04-14
1503960366,4/15/2016,9762,6.28,6.28,0,2.14,1.26,2.83,0,29,34,209,726,1745,2016-04-15
1503960366,4/16/2016,12669,8.16,8.16,0,2.71,0.41,5.04,0,36,10,221,773,1863,2016-04-16
1503960366,4/17/2016,9705,6.48,6.48,0,3.19,0.78,2.51,0,38,20,164,539,1728,2016-04-17


id,sleep_day,total_sleep_records,total_minutes_asleep,total_time_in_bed,date_time,date
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dttm>,<date>
1503960366,4/12/2016 12:00:00 AM,1,327,346,2016-04-12,2016-04-12
1503960366,4/13/2016 12:00:00 AM,2,384,407,2016-04-13,2016-04-13
1503960366,4/15/2016 12:00:00 AM,1,412,442,2016-04-15,2016-04-15
1503960366,4/16/2016 12:00:00 AM,2,340,367,2016-04-16,2016-04-16
1503960366,4/17/2016 12:00:00 AM,1,700,712,2016-04-17,2016-04-17
1503960366,4/19/2016 12:00:00 AM,1,304,320,2016-04-19,2016-04-19


id,date,weight_kg,weight_pounds,fat,bmi,is_manual_report,log_id,date_time
<dbl>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<dbl>,<dttm>
1503960366,2016-05-02,52.6,115.9631,22.0,22.65,True,1462234000000.0,2016-05-02 23:59:59
1503960366,2016-05-03,52.6,115.9631,,22.65,True,1462320000000.0,2016-05-03 23:59:59
1927972279,2016-04-13,133.5,294.3171,,47.54,False,1460510000000.0,2016-04-13 01:08:52
2873212765,2016-04-21,56.7,125.0021,,21.45,True,1461283000000.0,2016-04-21 23:59:59
2873212765,2016-05-12,57.3,126.3249,,21.69,True,1463098000000.0,2016-05-12 23:59:59
4319703577,2016-04-17,72.4,159.6147,25.0,27.45,True,1460938000000.0,2016-04-17 23:59:59


In [5]:
library(dplyr)

# Drop the raw/helper date columns now that every table has a parsed `date`.
# any_of() silently skips names that are no longer present, so re-running this
# cell does not fail with a "column doesn't exist" error.

# Drop the 'date_time' column from 'weight_info'
weight_info <- select(weight_info, -any_of("date_time"))

# Drop the 'activity_date' column from 'daily_activity'
daily_activity <- select(daily_activity, -any_of("activity_date"))

# Drop the 'sleep_day' and 'date_time' columns from 'daily_sleep'
daily_sleep <- select(daily_sleep, -any_of(c("sleep_day", "date_time")))


In [8]:
# Full outer join of the three tables on user id and calendar date:
# activity + sleep first, then the weight logs. all = TRUE keeps rows that
# appear in only one input; their unmatched columns are filled with NA.
# NOTE(review): if daily_sleep or weight_info hold more than one row for the
# same id/date pair, the join repeats the matching activity row -- confirm.
merged_data <- daily_activity %>%
  merge(daily_sleep, by = c("id", "date"), all = TRUE) %>%
  merge(weight_info, by = c("id", "date"), all = TRUE)
head(merged_data)

Unnamed: 0_level_0,id,date,total_steps,total_distance,tracker_distance,logged_activities_distance,very_active_distance,moderately_active_distance,light_active_distance,sedentary_active_distance,⋯,calories,total_sleep_records,total_minutes_asleep,total_time_in_bed,weight_kg,weight_pounds,fat,bmi,is_manual_report,log_id
Unnamed: 0_level_1,<dbl>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<dbl>
1,1503960366,2016-04-12,13162,8.5,8.5,0,1.88,0.55,6.06,0,⋯,1985,1.0,327.0,346.0,,,,,,
2,1503960366,2016-04-13,10735,6.97,6.97,0,1.57,0.69,4.71,0,⋯,1797,2.0,384.0,407.0,,,,,,
3,1503960366,2016-04-14,10460,6.74,6.74,0,2.44,0.4,3.91,0,⋯,1776,,,,,,,,,
4,1503960366,2016-04-15,9762,6.28,6.28,0,2.14,1.26,2.83,0,⋯,1745,1.0,412.0,442.0,,,,,,
5,1503960366,2016-04-16,12669,8.16,8.16,0,2.71,0.41,5.04,0,⋯,1863,2.0,340.0,367.0,,,,,,
6,1503960366,2016-04-17,9705,6.48,6.48,0,3.19,0.78,2.51,0,⋯,1728,1.0,700.0,712.0,,,,,,


In [None]:
# Exploratory analysis of the merged activity / sleep / weight table.
# All columns are referenced by the snake_case names that clean_names()
# produced earlier (total_steps, total_minutes_asleep, weight_kg, ...).

# Remove columns the analysis below does not use.
# any_of() skips names that are already gone, so this cell can be re-run.
merged_data <- merged_data %>%
  select(-any_of(c("tracker_distance", "logged_activities_distance", "total_sleep_records",
                   "weight_pounds", "fat", "bmi", "is_manual_report")))

# Visualize the relationship between steps and calories (one point per row).
# na.rm = TRUE drops rows without activity data without emitting a warning.
correlation_plot <- ggplot(merged_data, aes(x = total_steps, y = calories)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, na.rm = TRUE) +
  labs(title = "Correlation between Steps and Calories Burned",
       x = "Total Steps",
       y = "Calories Burned")

# Explore users' activity by day of the week
activity_by_day <- merged_data %>%
  mutate(Weekday = wday(date, label = TRUE)) %>%
  group_by(Weekday) %>%
  summarize(Avg_Steps = mean(total_steps, na.rm = TRUE))

# Visualize users' activity by day of the week
activity_by_day_plot <- ggplot(activity_by_day, aes(x = Weekday, y = Avg_Steps)) +
  geom_col(fill = "skyblue") +
  labs(title = "Average Steps by Day of the Week",
       x = "Day of the Week",
       y = "Average Steps")

# Analyze sleep patterns.
# Rows without a sleep record are excluded first; otherwise they would be
# counted as an extra NA bucket. right = FALSE makes the bins [0, 420),
# [420, 540) and [540, Inf), so a night of exactly 7h is not "Less than 7h".
sleep_distribution <- merged_data %>%
  filter(!is.na(total_minutes_asleep)) %>%
  mutate(TotalMinutesAsleep_Range = cut(total_minutes_asleep,
                                        breaks = c(0, 420, 540, Inf),
                                        labels = c("Less than 7h", "7h to 9h", "More than 9h"),
                                        right = FALSE)) %>%
  group_by(TotalMinutesAsleep_Range) %>%
  summarize(Count = n())

# Visualize sleep distribution
sleep_distribution_plot <- ggplot(sleep_distribution, aes(x = TotalMinutesAsleep_Range, y = Count)) +
  geom_col(fill = "lightgreen") +
  labs(title = "Distribution of Sleep Duration",
       x = "Sleep Duration Range",
       y = "Count")

# Explore the relationship between weight and activity (one row per user).
# Users with no weight log get Avg_Weight = NaN and are omitted from the plot.
weight_vs_activity <- merged_data %>%
  group_by(id) %>%
  summarize(Avg_Weight = mean(weight_kg, na.rm = TRUE),
            Avg_Distance = mean(total_distance, na.rm = TRUE))

# Visualize weight vs activity
# NOTE(review): the axis label says miles; confirm the unit of total_distance
# against the dataset's data dictionary.
weight_vs_activity_plot <- ggplot(weight_vs_activity, aes(x = Avg_Weight, y = Avg_Distance)) +
  geom_point(na.rm = TRUE) +
  labs(title = "Average Weight vs Average Distance Covered",
       x = "Average Weight (kg)",
       y = "Average Distance Covered (miles)")

# Print correlations and plots
print(correlation_plot)
print(activity_by_day_plot)
print(sleep_distribution_plot)
print(weight_vs_activity_plot)


In [None]:
# Extended analysis: rebuilds the merged table from the raw CSVs and adds
# hourly, goal, segmentation and per-user summaries.
# Relies on `base_path` from the data-loading cell and on janitor being
# installed by the column-cleaning cell.

# Load necessary libraries
library(tidyverse)
library(lubridate)  # For date manipulation
library(knitr)      # For better table presentation
library(ggplot2)    # For data visualization
library(janitor)    # For clean_names()

# Daily step goal / benchmark used by several analyses below.
step_goal <- 10000

# Load and preprocess the data.
# Files are read from base_path (relative names do not resolve on Kaggle).
# Column names are normalised to snake_case, exact duplicate rows are removed
# so they cannot multiply rows in the join, and the date strings are parsed
# with the m/d/Y formats actually used in the export (ymd() does not match them).
daily_activity <- read_csv(file.path(base_path, "dailyActivity_merged.csv"), show_col_types = FALSE) %>%
  clean_names() %>%
  distinct() %>%
  mutate(date = as_date(parse_date_time(activity_date, "%m/%d/%Y"))) %>%
  select(-activity_date)

daily_sleep <- read_csv(file.path(base_path, "sleepDay_merged.csv"), show_col_types = FALSE) %>%
  clean_names() %>%
  distinct() %>%
  mutate(date = as_date(parse_date_time(sleep_day, "%m/%d/%Y %I:%M:%S %p"))) %>%
  select(-sleep_day)

weight_info <- read_csv(file.path(base_path, "weightLogInfo_merged.csv"), show_col_types = FALSE) %>%
  clean_names() %>%
  distinct() %>%
  mutate(date = as_date(parse_date_time(date, "%m/%d/%Y %I:%M:%S %p")))

# Merge datasets (full outer join on user id and calendar date)
merged_data <- daily_activity %>%
  merge(daily_sleep, by = c("id", "date"), all = TRUE) %>%
  merge(weight_info, by = c("id", "date"), all = TRUE)

# Remove unnecessary columns (any_of() tolerates names that are absent)
merged_data <- merged_data %>%
  select(-any_of(c("tracker_distance", "logged_activities_distance", "total_sleep_records",
                   "weight_pounds", "fat", "bmi", "is_manual_report")))

# Visualize correlations between variables
correlation_plot <- ggplot(merged_data, aes(x = total_steps, y = calories)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, na.rm = TRUE) +
  labs(title = "Correlation between Steps and Calories Burned",
       x = "Total Steps",
       y = "Calories Burned")

# Explore users' activity by day of the week
activity_by_day <- merged_data %>%
  mutate(Weekday = wday(date, label = TRUE)) %>%
  group_by(Weekday) %>%
  summarize(Avg_Steps = mean(total_steps, na.rm = TRUE))

# Visualize users' activity by day of the week
activity_by_day_plot <- ggplot(activity_by_day, aes(x = Weekday, y = Avg_Steps)) +
  geom_col(fill = "skyblue") +
  labs(title = "Average Steps by Day of the Week",
       x = "Day of the Week",
       y = "Average Steps")

# Analyze sleep patterns.
# Rows without a sleep record are dropped so they are not counted as an NA
# bucket; right = FALSE gives bins [0, 420), [420, 540), [540, Inf), matching
# the thresholds used for Sleep_Duration in the segmentation below.
sleep_distribution <- merged_data %>%
  filter(!is.na(total_minutes_asleep)) %>%
  mutate(TotalMinutesAsleep_Range = cut(total_minutes_asleep,
                                        breaks = c(0, 420, 540, Inf),
                                        labels = c("Less than 7h", "7h to 9h", "More than 9h"),
                                        right = FALSE)) %>%
  group_by(TotalMinutesAsleep_Range) %>%
  summarize(Count = n())

# Visualize sleep distribution
sleep_distribution_plot <- ggplot(sleep_distribution, aes(x = TotalMinutesAsleep_Range, y = Count)) +
  geom_col(fill = "lightgreen") +
  labs(title = "Distribution of Sleep Duration",
       x = "Sleep Duration Range",
       y = "Count")

# Explore the relationship between weight and activity (one row per user).
# Users with no weight log get Avg_Weight = NaN and are omitted from the plot.
weight_vs_activity <- merged_data %>%
  group_by(id) %>%
  summarize(Avg_Weight = mean(weight_kg, na.rm = TRUE),
            Avg_Distance = mean(total_distance, na.rm = TRUE))

# Visualize weight vs activity
# NOTE(review): the axis label says miles; confirm the unit of total_distance
# against the dataset's data dictionary.
weight_vs_activity_plot <- ggplot(weight_vs_activity, aes(x = Avg_Weight, y = Avg_Distance)) +
  geom_point(na.rm = TRUE) +
  labs(title = "Average Weight vs Average Distance Covered",
       x = "Average Weight (kg)",
       y = "Average Distance Covered (miles)")

# Hourly Activity Analysis.
# merged_data is one row per user per day and has no time-of-day column, so
# hourly averages cannot be derived from it. They are computed from the
# hourly export instead.
# NOTE(review): assumes the export contains hourlySteps_merged.csv with
# ActivityHour and StepTotal columns -- verify. If the file or columns are
# missing, the hourly analysis is skipped rather than failing the cell.
hourly_activity <- NULL
hourly_activity_plot <- NULL
hourly_steps_file <- file.path(base_path, "hourlySteps_merged.csv")
if (file.exists(hourly_steps_file)) {
  hourly_steps <- read_csv(hourly_steps_file, show_col_types = FALSE) %>%
    clean_names()
  if (all(c("activity_hour", "step_total") %in% names(hourly_steps))) {
    hourly_activity <- hourly_steps %>%
      mutate(Hour = hour(parse_date_time(activity_hour, "%m/%d/%Y %I:%M:%S %p"))) %>%
      group_by(Hour) %>%
      summarize(Avg_Steps = mean(step_total, na.rm = TRUE))

    # Visualize hourly activity
    hourly_activity_plot <- ggplot(hourly_activity, aes(x = Hour, y = Avg_Steps)) +
      geom_line() +
      labs(title = "Average Steps by Hour of the Day",
           x = "Hour of the Day",
           y = "Average Steps")
  }
}
if (is.null(hourly_activity_plot)) {
  message("Hourly step data not available; skipping the hourly activity analysis.")
}

# Goal Achievement Analysis: share of daily records (user-days) that reach the goal.
goal_achievement <- merged_data %>%
  mutate(Goal_Achieved = ifelse(total_steps >= step_goal, "Yes", "No")) %>%
  summarize(Percentage_Achieved = mean(Goal_Achieved == "Yes", na.rm = TRUE) * 100)

# Comparison with Benchmarks: share of users whose average daily steps reach
# the benchmark. Computed per user so that it matches the "Percentage of Users"
# message printed below and is not a copy of the user-day figure above.
benchmark_comparison <- merged_data %>%
  group_by(id) %>%
  summarize(Avg_Daily_Steps = mean(total_steps, na.rm = TRUE)) %>%
  mutate(Above_Benchmark = ifelse(Avg_Daily_Steps >= step_goal, "Yes", "No")) %>%
  summarize(Percentage_Above_Benchmark = mean(Above_Benchmark == "Yes", na.rm = TRUE) * 100)

# User Segmentation (one label set per row of merged_data; NA inputs give NA labels).
# NOTE(review): Weight_Category is derived from raw body weight, not BMI, so
# the Underweight/Normal/Overweight labels are only rough -- confirm intent.
user_segmentation <- merged_data %>%
  mutate(Activity_Level = case_when(
    total_steps < 5000 ~ "Low",
    total_steps >= 5000 & total_steps < step_goal ~ "Moderate",
    total_steps >= step_goal ~ "High"
  ),
  Sleep_Duration = case_when(
    total_minutes_asleep < 420 ~ "Low",
    total_minutes_asleep >= 420 & total_minutes_asleep < 540 ~ "Moderate",
    total_minutes_asleep >= 540 ~ "High"
  ),
  Weight_Category = case_when(
    weight_kg < 60 ~ "Underweight",
    weight_kg >= 60 & weight_kg < 80 ~ "Normal",
    weight_kg >= 80 ~ "Overweight"
  ))

# Longitudinal Analysis: per-user observation window and totals.
longitudinal_analysis <- merged_data %>%
  group_by(id) %>%
  summarize(Start_Date = min(date),
            End_Date = max(date),
            Total_Steps = sum(total_steps, na.rm = TRUE),
            Total_Minutes_Asleep = sum(total_minutes_asleep, na.rm = TRUE),
            Avg_Weight = mean(weight_kg, na.rm = TRUE))

# Print correlations and plots
print(correlation_plot)
print(activity_by_day_plot)
print(sleep_distribution_plot)
print(weight_vs_activity_plot)
if (!is.null(hourly_activity_plot)) {
  print(hourly_activity_plot)
}

# Print additional analyses
print(paste0("Percentage of User-Days Achieving Daily Activity Goal: ", round(goal_achievement$Percentage_Achieved, 1), "%"))
print(paste0("Percentage of Users Above 10,000 Steps Benchmark: ", round(benchmark_comparison$Percentage_Above_Benchmark, 1), "%"))
print("User Segmentation based on Activity Level, Sleep Duration, and Weight:")
# Show segment sizes instead of dumping every row of the data frame.
print(count(user_segmentation, Activity_Level, Sleep_Duration, Weight_Category))
print("Longitudinal Analysis:")
print(longitudinal_analysis)
