# LIBRARIES

In [1]:
library(tidyverse)
library(repr)
library(broom)
library(leaps)
library(moderndive)
library(MASS)
library(car)
library(rsample)
print("LIBRARIES LOADED")

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: ‘MASS’


The following object is masked from ‘package:dplyr’:

    select


Loading required package: carData


Attaching package: ‘car’


The follow

[1] "LIBRARIES LOADED"


# DATA AND TIDYING

In [None]:
# Fix the RNG seed first so every later random step in the notebook starts
# from a known state.
set.seed(114514) # SEED, DO NOT CHANGE

# Download the raw CSV from its Google Drive export link and preview the
# first rows.
file_url <- "https://drive.google.com/uc?export=download&id=1ZjZvLl5dUzHEF8ouimlTg8t0MorhjzVA"
sleep_data <- read.csv(file = file_url)
head(sleep_data)

In [None]:
# Drop the identifier and the variables not used in this analysis.
# any_of() silently skips names that are absent from the data, so the column
# count is verified below before the positional rename.
sleep_data_reduced <- sleep_data |>
  dplyr::select(
    -any_of(c(
      "Person.ID", "Gender", "Blood.Pressure", "Heart.Rate", "Daily.Steps"
    ))
  )

# Replacement names, applied by position to the remaining columns.
new_names <- c(
  "Age", "Occupation", "Sleep_Duration",
  "Quality_of_Sleep", "Physical_Activity_Level", "Stress_Level",
  "BMI_Category",
  "Sleep_Disorder"
)

# The rename is positional: fail loudly if the number of remaining columns is
# not the expected one instead of silently mislabelling variables.
stopifnot(
  "Unexpected number of columns after dropping unused variables" =
    ncol(sleep_data_reduced) == length(new_names)
)
names(sleep_data_reduced) <- new_names

# Encode the categorical predictors as factors and collapse Sleep_Disorder to
# a two-level factor: "False" when the recorded value is "None", else "True".
# NOTE(review): rows whose Sleep_Disorder is NA stay NA here -- confirm the CSV
# stores the no-disorder case as the literal string "None".
sleep_data_reduced <- sleep_data_reduced |>
  mutate(
    BMI_Category = as.factor(BMI_Category),
    Occupation = as.factor(Occupation),
    Sleep_Disorder = as.factor(
      ifelse(Sleep_Disorder == "None", "False", "True")
    )
  )

head(sleep_data_reduced)
nrow(sleep_data_reduced)

# IMPLEMENTATION
QUESTION: ASSOCIATION BETWEEN SLEEP DURATION (RESPONSE) AND OTHER VARIABLES.

HERE WE SPLIT THE DATA INTO TRAINING AND TESTING SETS (70/30) BEFORE DOING THE FIRST VIF CHECK, SO THAT THE TEST SET PLAYS NO PART IN MODEL DIAGNOSTICS OR SELECTION.

In [None]:
# SPLITTING
# SPLIT DATA BEFORE FIRST VIF CHECK

# Re-seed immediately before the random split so that re-running this cell on
# its own reproduces the same partition. The value is the same one set in the
# data-loading cell, so a top-to-bottom run gives an unchanged split.
set.seed(114514) # SEED, DO NOT CHANGE

# 70/30 train/test split, stratified on the response variable.
data_split <- sleep_data_reduced |>
  initial_split(prop = 0.7, strata = Sleep_Duration)
sleep_train <- training(data_split)
sleep_test <- testing(data_split)

# Report the size of each partition.
print(paste("TRAINING N-ROWS", nrow(sleep_train)))
print(paste("TESTING N-ROWS", nrow(sleep_test)))

In [None]:
# FIRST VIF CHECK
# Fit the full linear model (Sleep_Duration regressed on every other column of
# the training set), then inspect its variance inflation factors.
sleep_full <- lm(formula = Sleep_Duration ~ ., data = sleep_train)
car::vif(sleep_full)

Variables with high scaled-GVIF:

Quality_of_Sleep, Stress_Level

In [None]:
# STEP AIC
# List the Occupation levels. From their alphabetical order, "Accountant" is
# expected to be the first level and therefore to be absorbed into the
# reference level row (Intercept) of the model summary.
levels(sleep_train$Occupation)

# Backward elimination starting from the full model. The penalty per parameter
# is k = log(n) (the BIC-style penalty) rather than stepAIC's default k = 2.
n_train <- nrow(sleep_train)
aic_model <- MASS::stepAIC(
  sleep_full,
  direction = "backward",
  k = log(n_train)
)
summary(aic_model)

INTERPRETATION:

From the alphabetical order of the levels of the occupation variable we expect "Accountant" to be absorbed into the reference level (Intercept) row.

REMOVED AFTER BACKWARD STEPWISE SELECTION (stepAIC WITH k = log(n), I.E. THE BIC PENALTY):

Age, BMI_Category, Sleep_Disorder.

In [None]:
# 2ND VIF WITH AIC MODEL
# Recompute the variance inflation factors on the reduced model chosen by the
# stepwise search, for comparison with the full-model values.
car::vif(aic_model)

INTERPRETATION:

A slight improvement over the first VIF check on the full model.

# VALIDATION

In [None]:
# Evaluate the selected model on the held-out test set.
prediction_test <- predict(aic_model, newdata = sleep_test)
actual <- sleep_test$Sleep_Duration
residual <- actual - prediction_test

# Absolute error metrics, in the units of Sleep_Duration.
RMSE <- sqrt(mean(residual^2))
MAE <- mean(abs(residual))
# Out-of-sample R^2: 1 - SSE / SST, with SST taken about the test-set mean.
R2 <- 1 - sum(residual^2) / sum((actual - mean(actual))^2)

print(c(RMSE = RMSE, MAE = MAE, R2 = R2))

# Express both errors as a fraction of the mean observed sleep duration.
mean_sleep <- mean(actual)
relative_RMSE <- RMSE / mean_sleep
relative_MAE <- MAE / mean_sleep

# paste0() is used so no separator is inserted between the pieces; paste()
# would print a doubled space after the colon and a space before the comma.
print(paste0(
  "RMSE RELATIVE TO MEAN SLEEP DURATION: ", round(relative_RMSE, 4), ", good."
))
print(paste0(
  "MAE RELATIVE TO MEAN SLEEP DURATION: ", round(relative_MAE, 4), ", good."
))

Summary of test-set validation:

- RMSE (root-mean-square error): about 0.284 hours of sleep duration, roughly 17 minutes.
- MAE (mean absolute error): about 0.215 hours, roughly 13 minutes.
- R^2 on the test set: 0.872, good.