# Bellabeat - Small Device Usage Data 
# Analysis for New Growth Opportunities

* What are some trends in small device usage?
* How could these trends apply to Bellabeat customers?
* How could these trends help influence Bellabeat marketing strategy?

# Data Cleaning and Manipulation
## Libraries

In [None]:
library(readr) # import data and read csv
library(tinytex) # LaTeX distribution based on TeX Live
library(janitor) # clean and format data
library(dplyr) # manipulate data
library(skimr) # compact summary
library(tibble) # data frame manipulation
library(ggplot2) # data visualization

## Import CSV files

In [None]:
# Load the three Fitabase CSV exports from the Kaggle input directory into
# base data frames: daily activity, daily sleep, and the weight log.
activity_log <- read.csv(
  file = "/kaggle/input/fitbit/mturkfitbit_export_4.12.16-5.12.16/Fitabase Data 4.12.16-5.12.16/dailyActivity_merged.csv"
)
sleep_log <- read.csv(
  file = "/kaggle/input/fitbit/mturkfitbit_export_4.12.16-5.12.16/Fitabase Data 4.12.16-5.12.16/sleepDay_merged.csv"
)
weight_log <- read.csv(
  file = "/kaggle/input/fitbit/mturkfitbit_export_4.12.16-5.12.16/Fitabase Data 4.12.16-5.12.16/weightLogInfo_merged.csv"
)

## Review columns and atomic vector types

In [None]:
# Side-by-side listing of every column and its class in the three raw tables.
janitor::compare_df_cols(activity_log, sleep_log, weight_log)

## Rename date columns to standardize across files

In [None]:
# Give the activity and sleep tables the same date column name ("Date").
activity_log <- rename(activity_log, Date = ActivityDate)
sleep_log <- rename(sleep_log, Date = SleepDay)

# verify column title changes
names(activity_log)
names(sleep_log)

## Format dates

In [None]:
# Convert the Date column of each table to class Date, reading the values as
# month/day/four-digit-year.
activity_log[["Date"]] <- as.Date(activity_log[["Date"]], format = "%m/%d/%Y")
weight_log[["Date"]] <- as.Date(weight_log[["Date"]], format = "%m/%d/%Y")
sleep_log[["Date"]] <- as.Date(sleep_log[["Date"]], format = "%m/%d/%Y")

# Store the manual-report flag as a logical vector.
weight_log[["IsManualReport"]] <- as.logical(weight_log[["IsManualReport"]])

# verify formatting changes
janitor::compare_df_cols(activity_log, sleep_log, weight_log)

## Convert minutes to hours

In [None]:
# Add hour-based versions of the two minute-based sleep columns.
sleep_log$total_hours_asleep <- sleep_log$TotalMinutesAsleep / 60
sleep_log$total_hours_in_bed <- sleep_log$TotalTimeInBed / 60

# Preview the first rows with the new columns.
head(sleep_log)

## Verify that total distance adds up across activity intensity

In [None]:
# Helper: TRUE only when `x` is exactly the single logical value FALSE.
# Not used by the check below; kept so any later code that calls it still works.
is.FALSE <- function(x) {
  identical(x, FALSE)
}

# Intensity columns whose sum is compared against TotalDistance.
# The sedentary column is named SedentaryActiveDistance. Asking a data frame
# for a column that does not exist via `$` returns NULL, which silently turns
# the whole comparison into a zero-length vector and a mismatch count of 0.
intensity_distance_cols <- c(
  "VeryActiveDistance",
  "ModeratelyActiveDistance",
  "LightActiveDistance",
  "SedentaryActiveDistance"
)

# Fail loudly if any required column is missing instead of reporting 0.
stopifnot(
  all(c(intensity_distance_cols, "TotalDistance") %in% names(activity_log))
)

# Check if the sum of the four intensities equals TotalDistance.
# Distances are doubles, so compare within a tolerance instead of using `==`.
# NOTE(review): loosen the tolerance if the exported values are rounded.
distance_tolerance <- sqrt(.Machine$double.eps)
intensity_distance_sum <- rowSums(activity_log[intensity_distance_cols])
is_equal <- abs(intensity_distance_sum - activity_log$TotalDistance) <=
  distance_tolerance

# Count the number of mismatches; rows that cannot be compared because of a
# missing value are counted as mismatches rather than making the total NA.
num_mismatches <- sum(is.na(is_equal) | !is_equal)

# Print the number of mismatches
print(num_mismatches)

## Exclude incomplete and unnecessary columns

In [None]:
# Drop the columns that are not carried forward into the analysis.
activity_log <- select(
  activity_log,
  -c(LoggedActivitiesDistance, SedentaryActiveDistance, TrackerDistance)
)
weight_log <- select(weight_log, -c(LogId, Fat))
# The hour-based total_hours_in_bed column added earlier is kept.
sleep_log <- select(sleep_log, -c(TotalTimeInBed))

# verify column exclusion
janitor::compare_df_cols(activity_log, weight_log, sleep_log)

## Clean column titles

In [None]:
# Create cleaned copies of each table with janitor-normalized column names.
activity_01 <- activity_log %>% clean_names()
weight_01 <- weight_log %>% clean_names()
sleep_01 <- sleep_log %>% clean_names()

# view new column names
names(activity_01)
names(weight_01)
names(sleep_01)

## Check for null values

In [None]:
# Total number of NA cells in each cleaned table.
activity_01 %>% is.na() %>% sum()
weight_01 %>% is.na() %>% sum()
sleep_01 %>% is.na() %>% sum()

## Merge activity and sleep tables

In [None]:
# Join activity and sleep records on the (date, id) pair. merge() keeps only
# the rows that have a match in both tables by default.
activity_and_sleep <- merge(activity_01, sleep_01, by = c("date", "id"))

# view new table
head(activity_and_sleep)

## New column for new merged table: time spent not sleeping in bed

In [None]:
# Time in bed that was not spent asleep, in hours and in minutes.
activity_and_sleep$total_hours_not_asleep <-
  activity_and_sleep$total_hours_in_bed - activity_and_sleep$total_hours_asleep
activity_and_sleep$total_minutes_not_asleep <-
  activity_and_sleep$total_hours_not_asleep * 60

head(activity_and_sleep)

## Merge activity and weight tables; Add total activity column

In [None]:
# Join activity and weight records on the (date, id) pair (matching rows only).
activity_and_weight <- merge(activity_01, weight_01, by = c("date", "id"))

# Total activity minutes = lightly + fairly + very active minutes, added to
# both merged tables.
activity_and_weight$total_activity_minutes <- with(
  activity_and_weight,
  lightly_active_minutes + fairly_active_minutes + very_active_minutes
)
activity_and_sleep$total_activity_minutes <- with(
  activity_and_sleep,
  lightly_active_minutes + fairly_active_minutes + very_active_minutes
)


# view new table
head(activity_and_weight)

# Summary

## Activity and Sleep Summarized

In [None]:
# Per-column summary statistics for the merged activity/sleep table.
activity_and_sleep %>% summary()

## Weight in Pounds

In [None]:
# Summary statistics of the weight_pounds column.
summary(activity_and_weight[["weight_pounds"]])

## Body Mass Index (BMI)

In [None]:
# Summary statistics of the bmi column.
summary(activity_and_weight[["bmi"]])

BMI Categories:
* Underweight = <18.5
* Normal weight = 18.5–24.9
* Overweight = 25–29.9
* Obesity = BMI of 30 or greater

## Manual Reports

In [None]:
# Summary of the logical is_manual_report flag.
summary(activity_and_weight[["is_manual_report"]])

## Skim Activity-Sleep Table

In [None]:
# Compact skimr overview of the merged activity/sleep table.
# Called directly (not piped) so the table name is shown in the output.
skimr::skim(activity_and_sleep)

## Skim Activity-Weight Table

In [None]:
# Compact skimr overview of the merged activity/weight table.
# Called directly (not piped) so the table name is shown in the output.
skimr::skim(activity_and_weight)

# Visualization I
## Own Analysis

## Weight vs Time
### *Users with more than two data points appear to have a downward trend in weight*

In [None]:
# Weight over time, one panel per user, restricted to users with more than
# two rows in the merged activity/weight table. Each panel gets a linear fit.
activity_and_weight %>%
  group_by(id) %>%
  filter(n() > 2) %>%
  ggplot(mapping = aes(x = date, y = weight_pounds)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(vars(id)) +
  ggtitle("Weight Change over Time")

### *Upon closer inspection, there are either 1) very few data points, or 2) a wide dispersion of data points for the very few consistently reporting users*

In [None]:
# Weight over time with a linear fit, drawn separately for three users.

# User 4558609924
activity_and_weight %>%
  filter(id == 4558609924) %>%
  ggplot(mapping = aes(x = date, y = weight_pounds)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Weight Change over Time", subtitle = "ID 4558609924")

# User 6962181067
activity_and_weight %>%
  filter(id == 6962181067) %>%
  ggplot(mapping = aes(x = date, y = weight_pounds)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Weight Change over Time", subtitle = "ID 6962181067")

# User 8877689391
activity_and_weight %>%
  filter(id == 8877689391) %>%
  ggplot(mapping = aes(x = date, y = weight_pounds)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Weight Change over Time", subtitle = "ID 8877689391")

## Body Mass Index vs Time

### *BMI shows very similar behavior to the above weight analysis, because BMI is derived from weight*

BMI = weight (lb) / [height (in)]^2 x 703

In [None]:
# BMI over time for all rows of the merged activity/weight table, with a
# single linear fit.
activity_and_weight %>%
  ggplot(mapping = aes(x = date, y = bmi)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("BMI Change over Time")

# Same plot, one panel per user, keeping users with more than two rows.
activity_and_weight %>%
  group_by(id) %>%
  filter(n() > 2) %>%
  ggplot(mapping = aes(x = date, y = bmi)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(vars(id)) +
  ggtitle("BMI Change over Time")

# Individual plots for three users.

# User 4558609924
activity_and_weight %>%
  filter(id == 4558609924) %>%
  ggplot(mapping = aes(x = date, y = bmi)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("BMI Change over Time", subtitle = "ID 4558609924")

# User 6962181067
activity_and_weight %>%
  filter(id == 6962181067) %>%
  ggplot(mapping = aes(x = date, y = bmi)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("BMI Change over Time", subtitle = "ID 6962181067")

# User 8877689391
activity_and_weight %>%
  filter(id == 8877689391) %>%
  ggplot(mapping = aes(x = date, y = bmi)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("BMI Change over Time", subtitle = "ID 8877689391")

## Total Sleep vs Time
### *Due to a relatively small dataset, exploring individuals' trends over time in search of improvement is inefficient*
-> Weight, BMI, Total Sleep, Time Spent in Bed Not Sleeping

In [None]:
# Total sleep over time for all users.
# `id` is a numeric identifier, so mapping it to colour for the whole plot
# gives a continuous gradient, and the smoother cannot carry a colour that
# varies within its single group. Colour only the points, treating the user
# id as a discrete factor, and keep one overall linear trend line.
ggplot(activity_and_sleep, aes(x = date, y = total_hours_asleep)) +
  geom_point(aes(color = factor(id))) +
  geom_smooth(method = "lm") +
  labs(title = "Total Sleep over Time", color = "id")

# One panel per user, keeping users with more than 30 rows in the merged
# activity/sleep table.
activity_and_sleep %>%
  group_by(id) %>%
  filter(n() > 30) %>%
  ggplot(aes(x = date, y = total_hours_asleep)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~id) +
  labs(title = "Total Sleep over Time")

# Individual plots for three users.
activity_and_sleep %>%
  filter(id == 5553957443) %>%
  ggplot(aes(x = date, y = total_hours_asleep)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Total Sleep over Time", subtitle = "ID 5553957443")

activity_and_sleep %>%
  filter(id == 6962181067) %>%
  ggplot(aes(x = date, y = total_hours_asleep)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Total Sleep over Time", subtitle = "ID 6962181067")

activity_and_sleep %>%
  filter(id == 8378563200) %>%
  ggplot(aes(x = date, y = total_hours_asleep)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Total Sleep over Time", subtitle = "ID 8378563200")

## Time Spent Not Sleeping vs Time

See above note about time vs x-variable

In [None]:
# Hours in bed but not asleep over time for all users.
# The numeric `id` is mapped to colour on the points only, as a discrete
# factor, so each user gets a distinct colour and a single overall linear
# trend line is drawn. The plot titles were malformed ("TTime ... Time") and
# are corrected here.
ggplot(activity_and_sleep, aes(x = date, y = total_hours_not_asleep)) +
  geom_point(aes(color = factor(id))) +
  geom_smooth(method = "lm") +
  labs(title = "Time Spent Not Sleeping over Time", color = "id")

# One panel per user, keeping users with more than 30 rows in the merged
# activity/sleep table.
activity_and_sleep %>%
  group_by(id) %>%
  filter(n() > 30) %>%
  ggplot(aes(x = date, y = total_hours_not_asleep)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~id) +
  labs(title = "Time Spent Not Sleeping over Time")

## Total Steps vs Total Distance
### According to this [link](https://support.google.com/fitbit/answer/14237111?hl=en&ref_topic=14236502&sjid=4159926566167969126-NC#zippy=%2Chow-does-my-fitbit-device-count-steps-taken%2Chow-does-my-fitbit-device-calculate-distance-traveled), steps are measured using a 3-axis accelerometer while distance is calculated using the measured steps (unless GPS tracking was used) 
### *Given this calculation, along with slight variations due to stride length depending on sex, total steps and total distance share a strong direct relationship*

In [None]:
# Scatter of total distance against total steps with a linear fit.
activity_and_weight %>%
  ggplot(mapping = aes(x = total_steps, y = total_distance)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Total Steps versus Total Distance")

## Calories vs Total Activity Minutes

### Providing users a visualization of this direct relationship, including the accomplished activity level and calories burned as well as the calories they *could* burn with longer activity time, can be a strong motivator for the user's commitment to Bellabeat's subscription health regimen.

In [None]:
# Total activity minutes plotted against calories, with a linear fit.
activity_and_weight %>%
  ggplot(mapping = aes(x = calories, y = total_activity_minutes)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Calories versus Total Activity Time")

# Same relationship, one panel per user, for users with more than two rows.
activity_and_weight %>%
  group_by(id) %>%
  filter(n() > 2) %>%
  ggplot(mapping = aes(x = calories, y = total_activity_minutes)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Calories versus Total Activity Time") +
  facet_wrap(vars(id))

## Calories vs Total Steps/Distance

Because 
* steps are derived from a 3-axis accelerometer,
* distance is derived from steps, and
* calories are derived from above activity data and basal metabolic rate (BMR),

their visualizations also show a strong relationship 

### *To encourage users to subscribe to membership, provide pop-up tips or 'fun facts' at different related milestones after providing visualization of activity against calories burned*

BMR is based on height, weight, sex, and age (or pulse rate with certain devices)


In [None]:
# Total steps plotted against calories, with a linear fit.
activity_and_weight %>%
  ggplot(mapping = aes(x = calories, y = total_steps)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Calories versus Total Steps")

# Total distance plotted against calories, with a linear fit.
activity_and_weight %>%
  ggplot(mapping = aes(x = calories, y = total_distance)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Calories versus Total Distance")

## Sleep vs Total Activity Minutes

### *With this amount of data, there is not a clear relationship between total activity time and total sleep time*

In [None]:
# Hours asleep plotted against total activity minutes, with a linear fit.
activity_and_sleep %>%
  ggplot(mapping = aes(x = total_activity_minutes, y = total_hours_asleep)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Total Activity versus Total Sleep Time")

# Same relationship, one panel per user, for users with more than 30 rows.
activity_and_sleep %>%
  group_by(id) %>%
  filter(n() > 30) %>%
  ggplot(mapping = aes(x = total_activity_minutes, y = total_hours_asleep)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Total Activity versus Total Sleep Time") +
  facet_wrap(vars(id))

## Total Time Spent in Bed Not Sleeping vs Total Activity Minutes

### *Due to the variance in correlation between activity level and sleep quality from user to user, individuals could be encouraged to track their own personal patterns by using a wrist-based device during sleep. Their device can then make personalized recommendations based on their personal sleep cycle data*

In [None]:
# Minutes in bed but not asleep plotted against total activity minutes, with
# a linear fit.
activity_and_sleep %>%
  ggplot(
    mapping = aes(x = total_activity_minutes, y = total_minutes_not_asleep)
  ) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Total Activity Minutes versus Total Time Spent in Bed Not Sleeping")

# Same relationship, one panel per user, for users with more than 30 rows.
activity_and_sleep %>%
  group_by(id) %>%
  filter(n() > 30) %>%
  ggplot(
    mapping = aes(x = total_activity_minutes, y = total_minutes_not_asleep)
  ) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Total Activity Minutes versus Total Time Spent in Bed Not Sleeping") +
  facet_wrap(vars(id))

## Calories vs Different Intensity Distance

### *More information is needed from the data source to understand the "light", "moderate", and "very active"  intensity categories. At first glance, light and high activity share a direct relationship with number of calories burned, while moderate activity shares no clear relationship*

In [None]:
# Calories plotted against distance at each intensity level, each with a
# linear fit.

# Light intensity
activity_and_weight %>%
  ggplot(mapping = aes(x = light_active_distance, y = calories)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Calories versus Light Intensity Distance")

# Moderate intensity
activity_and_weight %>%
  ggplot(mapping = aes(x = moderately_active_distance, y = calories)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Calories versus Moderate Intensity Distance")

# High intensity
activity_and_weight %>%
  ggplot(mapping = aes(x = very_active_distance, y = calories)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Calories versus High Intensity Distance")

In [None]:
# for reference: column names available in the two merged tables

names(activity_and_sleep)
names(activity_and_weight)

# Visualization II
## Refer to Nancy Chauhan's Bellabeat notebook

## Time Spent in Bed vs Time Spent Asleep

### *Subset of 32 users shows them not sleeping as much as other individuals spending the same amount of total time in bed*

In [None]:
# Scatter of hours asleep against hours in bed.
activity_and_sleep %>%
  ggplot(mapping = aes(x = total_hours_in_bed, y = total_hours_asleep)) +
  geom_point() +
  ggtitle("Time Spent in Bed vs Time Spent Asleep")

## Time Spent Asleep vs Time Spent Inactive
### *An increase in time spent sleeping seems to reflect a negative correlation to sedentary time. More information is needed from data source on what is counted as sedentary time*

In [None]:
# Compare minutes asleep with sedentary minutes: jittered points plus
# ggplot2's default smoother (no method specified).
activity_and_sleep %>%
  ggplot(mapping = aes(x = total_minutes_asleep, y = sedentary_minutes)) +
  geom_jitter() +
  geom_smooth() +
  ggtitle("Time Spent Asleep vs Time Spent Inactive")

## Total Steps vs Body Mass Index
### *The number of steps does not seem to be a reliable indicator of a user's body mass index*

In [None]:
# BMI against total steps for rows with BMI below 40, coloured by calories;
# jittered points plus ggplot2's default smoother.
activity_and_weight %>%
  filter(bmi < 40) %>%
  ggplot(mapping = aes(x = total_steps, y = bmi, color = calories)) +
  geom_jitter() +
  geom_smooth() +
  ggtitle("Total Steps vs Body Mass Index")

## Calories Burned vs Total Steps
### *There is a positive correlation between calories burned and total steps taken*
*This may be due to the built-in formula estimating calories burned*

In [None]:
# Total steps against calories with ggplot2's default smoother.
activity_and_weight %>%
  ggplot(mapping = aes(x = calories, y = total_steps)) +
  geom_point() +
  geom_smooth() +
  ggtitle("Calories Burned v Total Steps")

## Calories vs Inactive Time
### *Calories burned do not seem to be correlated in either direction with a user's time of inactivity*

In [None]:
# Sedentary minutes against calories, coloured by weight in pounds, with a
# linear fit.
activity_and_weight %>%
  ggplot(
    mapping = aes(x = calories, y = sedentary_minutes, color = weight_pounds)
  ) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Calories vs Inactive Time")

## Body Mass Index vs Total, Light, Fair, and High Activity
### *Body Mass Index is not a reliable indicator of a user's activity intensity*

In [None]:
# All four plots below use rows with BMI below 40 and colour points by
# calories.

# Body Mass Index vs Total Activity (points only)
activity_and_weight %>%
  filter(bmi < 40) %>%
  ggplot(mapping = aes(x = bmi, y = total_activity_minutes, color = calories)) +
  geom_point() +
  ggtitle("Body Mass Index vs Total Activity")

# Body Mass Index vs Light Activity (points only)
activity_and_weight %>%
  filter(bmi < 40) %>%
  ggplot(mapping = aes(x = bmi, y = lightly_active_minutes, color = calories)) +
  geom_point() +
  ggtitle("Body Mass Index vs Light Activity")

# Body Mass Index vs Fair Activity (points plus linear fit)
activity_and_weight %>%
  filter(bmi < 40) %>%
  ggplot(mapping = aes(x = bmi, y = fairly_active_minutes, color = calories)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Body Mass Index vs Fair Activity")

# Body Mass Index vs High Activity (points plus linear fit)
activity_and_weight %>%
  filter(bmi < 40) %>%
  ggplot(mapping = aes(x = bmi, y = very_active_minutes, color = calories)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Body Mass Index vs High Activity")

# Conclusion
## Recommendations

### **1. Encourage Users to Subscribe to Membership**
- **Personalized Milestones:** Tailor milestone reminders based on individual user goals and progress (e.g., "You're only 10 minutes away from hitting your weekly activity goal!").
- **Highlight Membership Benefits:** When users reach significant milestones, include a pop-up that showcases how the Bellabeat membership can enhance their progress (e.g., "Great job reaching your activity goal! With a Bellabeat membership, you could receive personalized nutrition tips to maximize your results.").
- **Interactive Previews:** Offer a limited-time preview of membership features, such as personalized guidance or advanced sleep tracking, after users hit certain milestones, encouraging them to subscribe.

### **2. Provide Visualization of Activity and Calories**
- **Dynamic Activity Tracker:** Introduce an interactive activity tracker that not only shows calories burned but also provides insights into how various activities impact different health metrics (e.g., "Walking for 30 minutes burns X calories and improves your heart rate by Y%").
- **Predictive Analytics:** Integrate predictive analytics that suggest how additional activities could improve health outcomes, motivating users to extend their workouts (e.g., "An extra 15 minutes of yoga could help you burn an additional 50 calories and improve your sleep quality by 10%").
- **Social Sharing:** Encourage users to share their achievements on social media, showcasing the calories burned and progress made, while subtly promoting Bellabeat membership benefits.

### **3. Encourage Sleep Tracking and Offer Personalized Recommendations**
- **Sleep Quality Index:** Introduce a personalized sleep quality index that summarizes sleep data and provides actionable insights based on the user's sleep patterns. Use this index to emphasize how Bellabeat membership can offer even deeper analysis and custom sleep improvement plans.
- **Smart Sleep Features:** Develop smart sleep features like adaptive wake-up alarms and bedtime reminders that adjust based on the user's sleep data, with exclusive advanced settings available to members.
- **Marketing Strategy:** Use marketing campaigns to highlight the connection between activity levels and sleep quality, emphasizing how Bellabeat's personalized guidance (available through membership) can optimize both.

### **4. Encourage Regular Weight Tracking**
- **Weight Trend Visualization:** Offer users a visual representation of their weight trends over time, correlating these trends with their activity, nutrition, and sleep data. 
- **Goal-Setting Features:** Introduce goal-setting features that allow users to set weight targets, with reminders and tips provided through Bellabeat membership to help them stay on track.
- **Progress Reports:** Create detailed progress reports that summarize the user's health journey, emphasizing how Bellabeat membership can offer deeper insights and personalized recommendations.

### **5. Integrate Mindfulness and Wellness Features**
- **Mindfulness Integration:** Expand the mindfulness features by offering guided meditation sessions, stress management tips, and wellness challenges, with premium content available to members.
- **Holistic Health Focus:** Promote a holistic approach to health by linking physical activity, nutrition, sleep, and mindfulness practices. Use marketing campaigns to emphasize how Bellabeat membership offers comprehensive guidance in all these areas, tailored to individual lifestyles and goals.