# LIBRARIES AND IMPORT CSV

In [1]:
install.packages("tidyverse")
library("tidyverse")
install.packages("rlang")
library("rlang")
install.packages("Hmisc")
library("Hmisc")
install.packages("tidymodels")
library("tidymodels")

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.2     [32m✔[39m [34mtibble   [39m 3.3.0
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.1.0     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)


At

In [2]:
system('dx download "project-Gzb5YJQJgQzb41qJj9G5G1GV:/Mattia/csv/extracted_replaced_environmental_factors.csv"')

In [3]:
dat <- read.csv("extracted_replaced_environmental_factors.csv")

## Brief check of dataset structure

In [None]:
head(dat |> select(-X, -eid))

# Dataset preproc
Rename variable as: https://github.com/miargentieri/exposome-aging-ukb/blob/main/code/01-UKB-data-import.Rmd
Recode variables as: https://github.com/miargentieri/exposome-aging-ukb/blob/main/code/02-UKB_data_recoding.Rmd
General rules:
    1. All "Prefer not to answer", "Do not know", "Not applicable" are set as NA (exception: )

### Function to create dummy variables according to data structure

In [4]:
create_string_dummies <- function(data, col_original) {
  
  # 1) Extract all unique activity names from non-NA rows
  all_values <- na.omit(data[[col_original]])
  
  # Split on the quoted separator:  ', '  (i.e., "'\\s*,\\s*'")
  # First remove the outer ["..."] wrapper, but keep inner quotes intact.
  stripped <- gsub("^\\[\\s*'", "", all_values)
  stripped <- gsub("'\\s*\\]$", "", stripped)
  split_values <- unlist(strsplit(stripped, "'\\s*,\\s*'"))
  
  split_values <- trimws(split_values)
  split_values <- split_values[split_values != ""]
  all_activities <- sort(unique(split_values))
  
  # 2) Create dummy columns
  for (activity in all_activities) {
    colname <- gsub("[^A-Za-z0-9]+", "_", activity)
    colname <- paste0(col_original, ".", colname)
    
    # Exact token match by searching for 'activity' (keeps commas inside labels intact)
    dummy <- ifelse(
      is.na(data[[col_original]]),
      NA,
      as.integer(grepl(paste0("'", activity, "'"), data[[col_original]], fixed = TRUE))
      # If you prefer your original behavior, swap the line above for:
      # as.integer(grepl(activity, data[[col_original]], fixed = TRUE))
    )
    
    data[[colname]] <- factor(dummy, levels = c(0, 1), labels = c("No", "Yes"))
  }
  
  return(data)
}

## Preproc of clusters of variables

### Other variable for preprocess but not for XWAS

In [57]:
dat <- dat |>
#other <- dat |>
#   select(p1767_i0, p6164_i0, p31, p1160_i0, p1309_i0, p20117_i0, p1319_i0, p1001_i0, p3637_i0, p806_i0, p991_i0, p767_i0, p816_i0, p2634_i0, p2624_i0, p1021_i0, p1011_i0, p1438_i0, p3647_i0, p981_i0, p971_i0, p1289_i0, p1299_i0, p1448_i0, p1468_i0, p1458_i0, p1070_i0, p1080_i0, p1090_i0, p1369_i0, p1379_i0, p1389_i0
#         ) |> 
    create_string_dummies("p6164_i0") |>
    mutate(
        across(
      c(
        starts_with("p6164_i0."),
        -any_of(c("p6164_i0.None_of_the_above", "p6164_i0.Prefer_not_to_answer"))
      ),
          ~ case_when(
              p6164_i0.None_of_the_above == "Yes" |
              p6164_i0.Prefer_not_to_answer == "Yes" ~ NA,
              TRUE ~ .x
              )
           )
   ) |> 
    rename(
        light_DIY_4wks             = p6164_i0.Light_DIY_eg_pruning_watering_the_lawn_,
        pleasure_walks_4wks        = p6164_i0.Walking_for_pleasure_not_as_a_means_of_transport_,
        other_exercise_4wks        = p6164_i0.Other_exercises_eg_swimming_cycling_keep_fit_bowling_,
        heavy_DIY_4wks             = p6164_i0.Heavy_DIY_eg_weeding_lawn_mowing_carpentry_digging_,
        strenuous_sports_4wks      = p6164_i0.Strenuous_sports
  ) |> 
  select(-p6164_i0, -p6164_i0.None_of_the_above, -p6164_i0.Prefer_not_to_answer) |>
  mutate(
      light_DIY_4wks               = recode_factor(light_DIY_4wks, "0" = "No", "1" = "Yes"),
      pleasure_walks_4wks          = recode_factor(pleasure_walks_4wks, "0" = "No", "1" = "Yes"),
      other_exercise_4wks          = recode_factor(other_exercise_4wks, "0" = "No", "1" = "Yes"),
      heavy_DIY_4wks               = recode_factor(heavy_DIY_4wks, "0" = "No", "1" = "Yes"),
      strenuous_sports_4wks        = recode_factor(strenuous_sports_4wks, "0" = "No", "1" = "Yes")
  ) |>
    rename(
       adopted         =  p1767_i0,
       sex             =  p31,
       sleep_hours     =  p1160_i0,
       fresh_fruit     =  p1309_i0,
       dried_fruit     =  p1319_i0,
       cooked_veg      =  p1289_i0,
       salad_or_raw_veg=  p1299_i0,
       bread_type      =  p1448_i0,
       cereal_type     =  p1468_i0,
       cereal          =  p1458_i0,
       bread           =  p1438_i0, 
       TV_time         =  p1070_i0,
       computer_time   =  p1080_i0,
       driving_time    =  p1090_i0,
       beef            =  p1369_i0,
       lamb            =  p1379_i0,
       pork            =  p1389_i0, 
       pleasure_walks_duration= p981_i0,
       pleasure_walks_freq_4wks=p971_i0,
       other_exercise_duration=p3647_i0,
       other_exercise_freq_4wks=p3637_i0,
       strenuous_sports_duration=p1001_i0,
       strenuous_sports_freq_4wks=p991_i0,
       heavy_DIY_duration= p2634_i0,
       heavy_DIY_freq_4wks=p2624_i0,
       light_DIY_duration=p1021_i0,
       light_DIY_freq_4wks=p1011_i0,
       employment_hours=p767_i0,
       employment_heavy_manual=p816_i0,
       employment_standing=p806_i0,
       alcohol_status  = p20117_i0 
        
    ) |>
    mutate(
       adopted         = ifelse(adopted   %in% c("Do not know", "Prefer not to answer"), NA, adopted),
       sleep_hours     = ifelse(sleep_hours   %in% c("Do not know", "Prefer not to answer"), NA, sleep_hours),
       fresh_fruit     = ifelse(fresh_fruit   %in% c("Do not know", "Prefer not to answer"), NA, fresh_fruit),
       fresh_fruit     = ifelse(fresh_fruit   %in% c("Less than one"), 0.5, fresh_fruit),
       dried_fruit     = ifelse(dried_fruit   %in% c("Do not know", "Prefer not to answer"), NA, dried_fruit),
       dried_fruit     = ifelse(dried_fruit   %in% c("Less than one"), 0.5, dried_fruit),
       cooked_veg      = ifelse(cooked_veg   %in% c("Do not know", "Prefer not to answer"), NA, cooked_veg),
       cooked_veg      = ifelse(cooked_veg   %in% c("Less than one"), 0.5, cooked_veg),
       salad_or_raw_veg= ifelse(salad_or_raw_veg   %in% c("Do not know", "Prefer not to answer"), NA, salad_or_raw_veg),
       salad_or_raw_veg= ifelse(salad_or_raw_veg   %in% c("Less than one"), 0.5, salad_or_raw_veg),
       bread_type      = ifelse(bread_type   %in% c("Do not know", "Prefer not to answer"), NA, bread_type),
       cereal_type     = ifelse(cereal_type   %in% c("Do not know", "Prefer not to answer"), NA, cereal_type),
       TV_time         = ifelse(TV_time   %in% c("Do not know", "Prefer not to answer"), NA, TV_time),
       TV_time         = ifelse(TV_time   %in% c("Less than one"), 0.5, TV_time),
       computer_time   = ifelse(computer_time   %in% c("Do not know", "Prefer not to answer"), NA, computer_time),
       computer_time   = ifelse(computer_time   %in% c("Less than one"), 0.5, computer_time),
       driving_time    = ifelse(driving_time   %in% c("Do not know", "Prefer not to answer"), NA, driving_time),
       driving_time    = ifelse(driving_time   %in% c("Less than one"), 0.5, driving_time),
       beef            = ifelse(beef   %in% c("Do not know", "Prefer not to answer"), NA, beef), 
       lamb            = ifelse(lamb   %in% c("Do not know", "Prefer not to answer"), NA, lamb),  
       pork            = ifelse(pork   %in% c("Do not know", "Prefer not to answer"), NA, pork), 
       cereal          = ifelse(cereal   %in% c("Do not know", "Prefer not to answer"), NA, cereal),
       cereal          = ifelse(cereal   %in% c("Less than one"), 0.5, cereal),
       pleasure_walks_duration = ifelse(pleasure_walks_duration   %in% c("Do not know", "Prefer not to answer"), NA, pleasure_walks_duration),
       pleasure_walks_freq_4wks = ifelse(pleasure_walks_freq_4wks   %in% c("Do not know", "Prefer not to answer"), NA, pleasure_walks_freq_4wks), 
       bread           = ifelse(bread   %in% c("Do not know", "Prefer not to answer"), NA, bread),
       bread           = ifelse(bread   %in% c("Less than one"), 0.5, bread), 
       other_exercise_duration = ifelse(other_exercise_duration   %in% c("Do not know", "Prefer not to answer"), NA, other_exercise_duration), 
       other_exercise_freq_4wks = ifelse(other_exercise_freq_4wks   %in% c("Do not know", "Prefer not to answer"), NA, other_exercise_freq_4wks),  
       strenuous_sports_duration = ifelse(strenuous_sports_duration   %in% c("Do not know", "Prefer not to answer"), NA, strenuous_sports_duration),
       strenuous_sports_freq_4wks= ifelse(strenuous_sports_freq_4wks   %in% c("Do not know", "Prefer not to answer"), NA, strenuous_sports_freq_4wks), 
       heavy_DIY_duration        = ifelse(heavy_DIY_duration   %in% c("Do not know", "Prefer not to answer"), NA, heavy_DIY_duration),
       heavy_DIY_freq_4wks= ifelse(heavy_DIY_freq_4wks   %in% c("Do not know", "Prefer not to answer"), NA, heavy_DIY_freq_4wks),
       light_DIY_duration        = ifelse(light_DIY_duration   %in% c("Do not know", "Prefer not to answer"), NA, light_DIY_duration),
       light_DIY_freq_4wks= ifelse(light_DIY_freq_4wks   %in% c("Do not know", "Prefer not to answer"), NA, light_DIY_freq_4wks),
       employment_hours   = ifelse(employment_hours   %in% c("Do not know", "Prefer not to answer"), NA, employment_hours),
       employment_heavy_manual= ifelse(employment_heavy_manual   %in% c("Do not know", "Prefer not to answer"), NA, employment_heavy_manual),
       employment_standing= ifelse(employment_standing   %in% c("Do not know", "Prefer not to answer"), NA, employment_standing),  
       alcohol_status     = ifelse(alcohol_status   %in% c("Prefer not to answer"), NA, alcohol_status)   
        
    ) |>
    mutate(
       adopted                   = relevel(factor(adopted,       ordered = FALSE), ref = "No"),
       sex                       = relevel(factor(sex,   ordered = FALSE), ref = "Female"),
       sleep_hours               = as.numeric(sleep_hours),
       fresh_fruit               = as.numeric(fresh_fruit), 
       dried_fruit               = as.numeric(dried_fruit),
       cooked_veg                = as.numeric(cooked_veg),
       salad_or_raw_veg          = as.numeric(salad_or_raw_veg),
       bread_type                = relevel(factor(bread_type,   ordered = FALSE), ref = "Wholemeal or wholegrain"),
       cereal_type               = relevel(factor(cereal_type,   ordered = FALSE), ref = "Other (e.g. Cornflakes, Frosties)"), 
       TV_time                   = as.numeric(TV_time), 
       computer_time             = as.numeric(computer_time),
       driving_time              = as.numeric(driving_time),
       beef                      = relevel(factor(beef,       ordered = FALSE), ref = "Never"), 
       lamb                      = relevel(factor(lamb,       ordered = FALSE), ref = "Never"), 
       pork                      = relevel(factor(pork,       ordered = FALSE), ref = "Never"),
       cereal                    = as.numeric(cereal), 
       bread                     = as.numeric(bread),  
       pleasure_walks_duration   = relevel(factor(pleasure_walks_duration,   ordered = FALSE), ref = "Less than 15 minutes"),
       pleasure_walks_freq_4wks  = relevel(factor(pleasure_walks_freq_4wks,   ordered = FALSE), ref = "Once in the last 4 weeks"), 
       other_exercise_duration   = relevel(factor(other_exercise_duration,   ordered = FALSE), ref = "Less than 15 minutes"), 
       other_exercise_freq_4wks  = relevel(factor(other_exercise_freq_4wks,   ordered = FALSE), ref = "Once in the last 4 weeks"), 
       strenuous_sports_duration = relevel(factor(strenuous_sports_duration,   ordered = FALSE), ref = "Less than 15 minutes"),
       strenuous_sports_freq_4wks= relevel(factor(strenuous_sports_freq_4wks,   ordered = FALSE), ref = "Once in the last 4 weeks"),  
       heavy_DIY_duration       = relevel(factor(heavy_DIY_duration,   ordered = FALSE), ref = "Less than 15 minutes"), 
       heavy_DIY_freq_4wks      = relevel(factor(heavy_DIY_freq_4wks,   ordered = FALSE), ref = "Once in the last 4 weeks"),   
       light_DIY_duration       = relevel(factor(light_DIY_duration,   ordered = FALSE), ref = "Less than 15 minutes"),  
       light_DIY_freq_4wks      = relevel(factor(light_DIY_freq_4wks,   ordered = FALSE), ref = "Once in the last 4 weeks"),    
       employment_hours         = as.numeric(employment_hours),
       employment_heavy_manual  = relevel(factor(employment_heavy_manual,   ordered = FALSE), ref = "Never/rarely"),
       employment_standing      = relevel(factor(employment_standing,   ordered = FALSE), ref = "Never/rarely"),
       alcohol_status           = relevel(factor(alcohol_status,   ordered = FALSE), ref = "Never")  
        
    )

others <- c(
  "adopted", "sex", "sleep_hours", "fresh_fruit", "dried_fruit", "cooked_veg", "salad_or_raw_veg", "bread_type", "cereal_type", "cereal", "bread", 
  "TV_time", "computer_time", "driving_time", "beef", "lamb", "pork", "pleasure_walks_duration", "pleasure_walks_freq_4wks", 
  "light_DIY_4wks", "pleasure_walks_4wks", "other_exercise_4wks", "heavy_DIY_4wks", "strenuous_sports_4wks", "heavy_DIY_duration", "heavy_DIY_freq_4wks", 
  "light_DIY_duration", "light_DIY_freq_4wks", "other_exercise_duration", "other_exercise_freq_4wks", "strenuous_sports_duration", "strenuous_sports_freq_4wks", 
  "employment_hours", "employment_heavy_manual", "employment_standing", "alcohol_status"
)

[1m[22m[36mℹ[39m In argument: `TV_time = as.numeric(TV_time)`.
[33m![39m NAs introduced by coercion


### Early life factors

In [58]:
dat <- dat |>
#elf1 <- dat |>
  #select(p1647_i0, p1677_i0, p1687_i0, p1697_i0, p1707_i0, p1777_i0, p1787_i0, p20022_i0) |>
  rename(
    country_birth       = p1647_i0,
    breastfed           = p1677_i0,
    body_size_10yrs_old = p1687_i0,
    height_10yrs_old    = p1697_i0,
    handedness          = p1707_i0,
    part_multiple_birth = p1777_i0,
    maternal_smoking    = p1787_i0,
    birth_weight        = p20022_i0
  ) |>
  mutate(
    country_birth       = ifelse(country_birth       %in% c("Prefer not to answer", "Do not know"), NA, country_birth),
    breastfed           = ifelse(breastfed           %in% c("Prefer not to answer", "Do not know"), NA, breastfed),
    body_size_10yrs_old = ifelse(body_size_10yrs_old %in% c("Prefer not to answer", "Do not know"), NA, body_size_10yrs_old),
    height_10yrs_old    = ifelse(height_10yrs_old    %in% c("Prefer not to answer", "Do not know"), NA, height_10yrs_old),
    handedness          = ifelse(handedness          %in% c("Prefer not to answer", "Do not know"), NA, handedness),
    part_multiple_birth = ifelse(part_multiple_birth %in% c("Prefer not to answer", "Do not know"), NA, part_multiple_birth),
    maternal_smoking    = ifelse(maternal_smoking    %in% c("Prefer not to answer", "Do not know"), NA, maternal_smoking)
  ) |>
  mutate(
    country_birth       = relevel(factor(country_birth,       ordered = FALSE), ref = "England"),
    breastfed           = relevel(factor(breastfed,           ordered = FALSE), ref = "No"),
    body_size_10yrs_old = relevel(factor(body_size_10yrs_old, ordered = FALSE), ref = "About average"),
    height_10yrs_old    = relevel(factor(height_10yrs_old,    ordered = FALSE), ref = "About average"),
    handedness          = relevel(factor(handedness,          ordered = FALSE), ref = "Right-handed"),
    part_multiple_birth = relevel(factor(part_multiple_birth, ordered = FALSE), ref = "No"),
    maternal_smoking    = relevel(factor(maternal_smoking,    ordered = FALSE), ref = "No")
  )


elf_vars <- c(
  "country_birth","breastfed","body_size_10yrs_old","height_10yrs_old",
  "handedness","part_multiple_birth","maternal_smoking","birth_weight"
)

### Stressful life events

In [59]:
dat <- dat |>
#sle <- dat |>
    #select(p6145_i0) |>
    create_string_dummies("p6145_i0") |>
    mutate(
        across(
      c(
        starts_with("p6145_i0."),
        -any_of(c("p6145_i0.None_of_the_above", "p6145_i0.Prefer_not_to_answer"))
      ),
          ~ case_when(
              #p6145_i0.None_of_the_above == "Yes" |
              p6145_i0.Prefer_not_to_answer == "Yes" ~ NA,
              TRUE ~ .x
             )
            )
    ) |> 
    rename(
        self_illness_injury_assault       = p6145_i0.Serious_illness_injury_or_assault_to_yourself,
        relative_illness_injury_assault   = p6145_i0.Serious_illness_injury_or_assault_of_a_close_relative,
        death_relative                    = p6145_i0.Death_of_a_close_relative,
        death_partner                     = p6145_i0.Death_of_a_spouse_or_partner,
        divorce                           = p6145_i0.Marital_separation_divorce,
        financial_difficulty              = p6145_i0.Financial_difficulties,
  ) |> 
  select(-p6145_i0, -p6145_i0.None_of_the_above, -p6145_i0.Prefer_not_to_answer) |>
  mutate(
      self_illness_injury_assault            = recode_factor(self_illness_injury_assault, "0" = "No", "1" = "Yes"),
      relative_illness_injury_assault        = recode_factor(relative_illness_injury_assault, "0" = "No", "1" = "Yes"),
      death_relative                         = recode_factor(death_relative, "0" = "No", "1" = "Yes"),
      death_partner                          = recode_factor(death_partner, "0" = "No", "1" = "Yes"),
      divorce                                = recode_factor(divorce, "0" = "No", "1" = "Yes"),
      financial_difficulty                   = recode_factor(financial_difficulty, "0" = "No", "1" = "Yes")
  )
    

sle_vars <- c(
    "death_relative", "death_partner", "financial_difficulty", "relative_illness_injury_assault", "self_illness_injury_assault", "divorce"
)


### Social factors

In [60]:
dat <- dat |>
#social <- dat |>
#    select(p1031_i0, p6160_i0, p2110_i0, p2020_i0) |>
    create_string_dummies("p6160_i0") |>
    mutate(
        across(
      c(
        starts_with("p6160_i0."),
        -any_of(c("p6160_i0.None_of_the_above", "p6160_i0.Prefer_not_to_answer"))
      ),
          ~ case_when(
              #p6160_i0.None_of_the_above == "Yes" |
              p6160_i0.Prefer_not_to_answer == "Yes" ~ NA,
              TRUE ~ .x
             )
            )
    ) |> 
    rename(
        gym                    = p6160_i0.Sports_club_or_gym,
        pub                    = p6160_i0.Pub_or_social_club,
        religious_group        = p6160_i0.Religious_group,
        adult_education        = p6160_i0.Adult_education_class,
        other_group_activity   = p6160_i0.Other_group_activity,
        family_visit_freq      = p1031_i0,
        confide_freq           = p2110_i0,
        loneliness             = p2020_i0
    ) |> 
    select(-p6160_i0, -p6160_i0.None_of_the_above, -p6160_i0.Prefer_not_to_answer) |> 
    mutate(
        family_visit_freq       = ifelse(family_visit_freq   %in% c("Do not know", "Prefer not to answer"), NA, family_visit_freq),
        confide_freq            = ifelse(confide_freq        %in% c("Do not know", "Prefer not to answer"), NA, confide_freq),
        loneliness              = ifelse(loneliness          %in% c("Do not know", "Prefer not to answer"), NA, loneliness)
    ) |>
    mutate(
      gym                      = recode_factor(gym, "0" = "No", "1" = "Yes"),
      pub                      = recode_factor(pub, "0" = "No", "1" = "Yes"),
      religious_group          = recode_factor(religious_group, "0" = "No", "1" = "Yes"),
      adult_education          = recode_factor(adult_education, "0" = "No", "1" = "Yes"),
      other_group_activity     = recode_factor(other_group_activity, "0" = "No", "1" = "Yes"),
      family_visit_freq        = relevel(factor(family_visit_freq, ordered = FALSE), ref = "No friends/family outside household"),
      confide_freq             = relevel(factor(confide_freq, ordered = FALSE), ref = "Never or almost never"),
      loneliness               = relevel(factor(loneliness, ordered = FALSE), ref = "No")
    )

social_vars <- c(
  "family_visit_freq","confide_freq", "loneliness", "adult_education", 
    "other_group_activity", "pub", "religious_group", "gym"
)

### Sociodemographics

In [61]:
dat <- dat |>
#socio_dem <- dat |>
    #select(p670_i0, p6141_i0, p680_i0, p699_i0, p709_i0, p738_i0, p4674_i0, p6139_i0, p6140_i0, p6142_i0, p6155_i0) |>
    create_string_dummies("p6141_i0") |> 
    mutate(
        across(
          c(
            starts_with("p6141_i0."),
            -any_of(c("p6141_i0.Prefer_not_to_answer"))
          ),
          ~ case_when(
              p6141_i0.Prefer_not_to_answer == "Yes" ~ NA,
              TRUE ~ .x
             )
        )
    ) |>
    create_string_dummies("p6139_i0") |>
    mutate(
        across(
          c(
            starts_with("p6139_i0."),
            -any_of(c("p6139_i0.Prefer_not_to_answer", "p6139_i0.Do_not_know"))
          ),
          ~ case_when(
              p6139_i0.Prefer_not_to_answer == "Yes" |
              p6139_i0.Do_not_know == "Yes" ~ NA,
              TRUE ~ .x
             )
        )
    ) |>
    create_string_dummies("p6140_i0") |>
    mutate(
        across(
          c(
            starts_with("p6140_i0."),
            -any_of(c("p6140_i0.Prefer_not_to_answer", "p6140_i0.Do_not_know"))
          ),
          ~ case_when(
              p6140_i0.Prefer_not_to_answer == "Yes" |
              p6140_i0.Do_not_know == "Yes" ~ NA,
              TRUE ~ .x
             )
        )
    ) |>
    create_string_dummies("p6142_i0") |>
    mutate(
        across(
          c(
            starts_with("p6142_i0."),
            -any_of(c("p6142_i0.Prefer_not_to_answer"))
          ),
          ~ case_when(
              p6142_i0.Prefer_not_to_answer == "Yes" ~ NA,
              TRUE ~ .x
             )
        )
    ) |>
    rename(
       hshld_partner = p6141_i0.Husband_wife_or_partner, 
       hshld_children = p6141_i0.Son_and_or_daughter_include_step_children_,
       hshld_siblings = p6141_i0.Brother_and_or_sister,
       hshld_parents = p6141_i0.Mother_and_or_father,
       hshld_grandparent = p6141_i0.Grandparent,
       hshld_grandchild = p6141_i0.Grandchild,
       hshld_other_related = p6141_i0.Other_related,
       hshld_other_unrelated = p6141_i0.Other_unrelated,
       accommodation_type =  p670_i0,
       own_or_rent =  p680_i0,
       years_at_address =  p699_i0,
       hshld_number =  p709_i0,
       hshld_income =  p738_i0,
       private_healthcare =  p4674_i0,
       gas_hob_heat = p6139_i0.A_gas_hob_or_gas_cooker,
       gas_fire_heat = p6139_i0.A_gas_fire_that_you_use_regularly_in_winter_time,
       open_fire_heat = p6139_i0.An_open_solid_fuel_fire_that_you_use_regularly_in_winter_time,
       gas_central_heat = p6140_i0.Gas_central_heating,
       electric_storage_heat = p6140_i0.Electric_storage_heaters,
       oil_central_heat = p6140_i0.Oil_kerosene_central_heating,
       employed = p6142_i0.In_paid_employment_or_self_employed,
       retired = p6142_i0.Retired,
       home_maker = p6142_i0.Looking_after_home_and_or_family,
       unemployed = p6142_i0.Unemployed,
       volunteer = p6142_i0.Doing_unpaid_or_voluntary_work,
       student  = p6142_i0.Full_or_part_time_student
       
    ) |> 
    select(-p6141_i0, -p6141_i0.Prefer_not_to_answer, 
           -p6139_i0, -p6139_i0.Prefer_not_to_answer, -p6139_i0.Do_not_know, -p6139_i0.None_of_the_above,
           -p6140_i0, -p6140_i0.Prefer_not_to_answer, -p6140_i0.Do_not_know, -p6140_i0.None_of_the_above, -p6140_i0.Open_fire_without_central_heating, -p6140_i0.Portable_gas_or_paraffin_heaters, -p6140_i0.Solid_fuel_central_heating,
           -p6142_i0, -p6142_i0.Prefer_not_to_answer, -p6142_i0.None_of_the_above, -p6142_i0.Unable_to_work_because_of_sickness_or_disability
           
    ) |>  
    mutate(
       accommodation_type       = ifelse(accommodation_type   %in% c("None of the above", "Prefer not to answer"), NA, accommodation_type),
       own_or_rent       = ifelse(own_or_rent   %in% c("None of the above", "Prefer not to answer"), NA, own_or_rent),
       years_at_address       = ifelse(years_at_address   %in% c("Do not know", "Prefer not to answer"), NA, years_at_address),
       years_at_address       = ifelse(years_at_address   %in% c("Less than a year"), 0.5, years_at_address),
       years_at_address = as.numeric(years_at_address),
       hshld_number       = ifelse(hshld_number   %in% c("Do not know", "Prefer not to answer"), NA, hshld_number),
       hshld_number = as.numeric(hshld_number),
       hshld_income       = ifelse(hshld_income   %in% c("Do not know", "Prefer not to answer"), NA, hshld_income),
       private_healthcare       = ifelse(private_healthcare   %in% c("Do not know", "Prefer not to answer"), NA, private_healthcare),  
    ) |>
    mutate(
       accommodation_type       = relevel(factor(accommodation_type,       ordered = FALSE), ref = "A house or bungalow"),
       own_or_rent       = relevel(factor(own_or_rent,       ordered = FALSE), ref = "Own outright (by you or someone in your household)"),
       hshld_income       = relevel(factor(hshld_income,       ordered = FALSE), ref = "Less than 18,000"),
       private_healthcare       = relevel(factor(private_healthcare,       ordered = FALSE), ref = "No, never"),
       gas_hob_heat  = recode_factor(gas_hob_heat, "0" = "No", "1" = "Yes"), 
       gas_fire_heat  = recode_factor(gas_fire_heat, "0" = "No", "1" = "Yes"), 
       open_fire_heat  = recode_factor(open_fire_heat, "0" = "No", "1" = "Yes"),
       gas_central_heat  = recode_factor(gas_central_heat, "0" = "No", "1" = "Yes"), 
       electric_storage_heat  = recode_factor(electric_storage_heat, "0" = "No", "1" = "Yes"), 
       oil_central_heat  = recode_factor(oil_central_heat, "0" = "No", "1" = "Yes"), 
       employed  = recode_factor(employed, "0" = "No", "1" = "Yes"), 
       retired  = recode_factor(retired, "0" = "No", "1" = "Yes"), 
       home_maker  = recode_factor(home_maker, "0" = "No", "1" = "Yes"), 
       unemployed  = recode_factor(unemployed, "0" = "No", "1" = "Yes"), 
       volunteer  = recode_factor(volunteer, "0" = "No", "1" = "Yes"), 
       student  = recode_factor(student, "0" = "No", "1" = "Yes")
    ) 

socio_dem_vars <- c("hshld_partner", "hshld_children", "hshld_siblings", "hshld_parents", "hshld_grandparent", "hshld_grandchild", 
                    "hshld_other_related", "hshld_other_unrelated", "accommodation_type", "own_or_rent", "years_at_address", 
                    "hshld_number", "hshld_income", "private_healthcare", "gas_hob_heat", "gas_fire_heat", "open_fire_heat", 
                    "gas_central_heat", "electric_storage_heat", "oil_central_heat", "employed", "retired", "home_maker", 
                    "unemployed", "volunteer", "student"
)


### Supplements

In [62]:
dat <- dat |>
#supplem <- dat |>
#    select(p6179_i0, p6155_i0) |>
    create_string_dummies("p6179_i0") |>
    mutate(
        across(
          c(
            starts_with("p6179_i0."),
            -any_of(c("p6179_i0.Prefer_not_to_answer"))
          ),
          ~ case_when(
              p6179_i0.Prefer_not_to_answer == "Yes" ~ NA,
              TRUE ~ .x
             )
        )
    ) |>
    create_string_dummies("p6155_i0") |>
    mutate(
        across(
          c(
            starts_with("p6155_i0."),
            -any_of(c("p6155_i0.Prefer_not_to_answer"))
          ),
          ~ case_when(
              p6155_i0.Prefer_not_to_answer == "Yes" ~ NA,
              TRUE ~ .x
             )
        )
    ) |>
    select(-p6179_i0, -p6179_i0.Prefer_not_to_answer, -p6179_i0.None_of_the_above,
           -p6155_i0, -p6155_i0.Prefer_not_to_answer, -p6155_i0.None_of_the_above
    ) |>
    rename(
        fish_oil_supplements = p6179_i0.Fish_oil_including_cod_liver_oil_,
        calcium_supplements = p6179_i0.Calcium,
        selenium_supplements = p6179_i0.Selenium,
        glucosamine_supplements = p6179_i0.Glucosamine,
        iron_supplements = p6179_i0.Iron,
        zinc_supplements  = p6179_i0.Zinc,
        multivitamin_supplements = p6155_i0.Multivitamins_minerals,
        vitamin_A_supplements = p6155_i0.Vitamin_A,
        vitamin_B_supplements = p6155_i0.Vitamin_B,
        vitamin_C_supplements = p6155_i0.Vitamin_C,
        vitamin_D_supplements = p6155_i0.Vitamin_D,
        vitamin_E_supplements  = p6155_i0.Vitamin_E,
        folate_supplements  = p6155_i0.Folic_acid_or_Folate_Vit_B9_
        
    ) |>
    mutate(
        fish_oil_supplements  = recode_factor(fish_oil_supplements, "0" = "No", "1" = "Yes"), 
        calcium_supplements  = recode_factor(calcium_supplements, "0" = "No", "1" = "Yes"), 
        selenium_supplements  = recode_factor(selenium_supplements, "0" = "No", "1" = "Yes"), 
        glucosamine_supplements  = recode_factor(glucosamine_supplements, "0" = "No", "1" = "Yes"), 
        iron_supplements  = recode_factor(iron_supplements, "0" = "No", "1" = "Yes"), 
        zinc_supplements  = recode_factor(zinc_supplements, "0" = "No", "1" = "Yes"),
        multivitamin_supplements  = recode_factor(multivitamin_supplements, "0" = "No", "1" = "Yes"), 
        vitamin_A_supplements  = recode_factor(vitamin_A_supplements, "0" = "No", "1" = "Yes"), 
        vitamin_B_supplements  = recode_factor(vitamin_B_supplements, "0" = "No", "1" = "Yes"), 
        vitamin_C_supplements  = recode_factor(vitamin_C_supplements, "0" = "No", "1" = "Yes"),
        vitamin_D_supplements  = recode_factor(vitamin_C_supplements, "0" = "No", "1" = "Yes"),
        vitamin_E_supplements  = recode_factor(vitamin_E_supplements, "0" = "No", "1" = "Yes"), 
        folate_supplements  = recode_factor(folate_supplements, "0" = "No", "1" = "Yes") 
        
    )

supplem_vars <- c("fish_oil_supplements", "calcium_supplements", "selenium_supplements", "glucosamine_supplements", 
                  "iron_supplements", "zinc_supplements", "multivitamin_supplements", "vitamin_A_supplements", "vitamin_B_supplements", 
                  "vitamin_C_supplements", "vitamin_D_supplements", "vitamin_E_supplements", "folate_supplements"
                 )



### Male reproduction

In [63]:
dat <- dat |>
#male_repro <- dat |>
#    select(p2375_i0, p2385_i0, p2395_i0, p2405_i0) |>
    rename(
       age_facial_hair =  p2375_i0,
       age_voice       =  p2385_i0,
       balding         =  p2395_i0,
       number_children         =  p2405_i0 
        
        
    ) |>  
    mutate(
        age_facial_hair       = ifelse(age_facial_hair   %in% c("Do not know", "Prefer not to answer"), NA, age_facial_hair),
        age_voice             = ifelse(age_voice   %in% c("Do not know", "Prefer not to answer"), NA, age_voice),
        balding               = ifelse(balding   %in% c("Do not know", "Prefer not to answer"), NA, balding),
        number_children       = ifelse(number_children   %in% c("Do not know", "Prefer not to answer"), NA, number_children)
    ) |>  
    mutate(
        age_facial_hair       = relevel(factor(age_facial_hair,       ordered = FALSE), ref = "About average age"),
        age_voice             = relevel(factor(age_voice,       ordered = FALSE), ref = "About average age"),
        balding               = relevel(factor(balding,       ordered = FALSE), ref = "Pattern 1"),
        number_children       = as.numeric(number_children)
    ) 

male_repro_vars <- c("age_facial_hair", "age_voice", "balding", "number_children")


### Female reproduction

In [64]:
dat <- dat |>
#female_repro <- dat |>
#    select(p2674_i0, p2694_i0, p2714_i0, p2724_i0, p2734_i0, p3872_i0, p2754_i0, p2764_i0, p2774_i0, p2784_i0, p2814_i0, p3591_i0, p2834_i0, p3581_i0) |>
    rename(
       mammogram           =  p2674_i0,
       cervical_smear      =  p2694_i0,
       menarche_age        =  p2714_i0,
       menopause           =  p2724_i0,
       menopause_age       =  p3581_i0, 
       number_births       =  p2734_i0, 
       birth_age           =  p3872_i0,  
       first_birth_age     =  p2754_i0,
       last_birth_age      =  p2764_i0,
       stillbirth          =  p2774_i0, 
       oral_contraceptive  =  p2784_i0,
       HRT                 =  p2814_i0,
       hysterectomy        =  p3591_i0,
       bilateral_oophorectomy=  p2834_i0
        
    ) |>  
    mutate(
        mammogram             = ifelse(mammogram   %in% c("Do not know", "Prefer not to answer"), NA, mammogram),
        cervical_smear        = ifelse(cervical_smear   %in% c("Do not know", "Prefer not to answer"), NA, cervical_smear),
        menarche_age          = ifelse(menarche_age   %in% c("Do not know", "Prefer not to answer"), NA, menarche_age),
        menopause_age         = ifelse(menopause_age   %in% c("Do not know", "Prefer not to answer"), NA, menopause_age),
        menopause             = ifelse(menopause   %in% c("Not sure - had a hysterectomy", "Prefer not to answer", "Not sure - other reason"), NA, menopause),
        number_births         = ifelse(number_births   %in% c("Prefer not to answer"), NA, number_births),
        birth_age             = ifelse(birth_age   %in% c("Do not remember", "Prefer not to answer"), NA, birth_age),
        first_birth_age       = ifelse(first_birth_age   %in% c("Do not remember", "Prefer not to answer"), NA, first_birth_age),
        last_birth_age        = ifelse(last_birth_age   %in% c("Do not remember", "Prefer not to answer"), NA, last_birth_age),
        stillbirth            = ifelse(stillbirth   %in% c("Do not know", "Prefer not to answer"), NA, stillbirth),
        oral_contraceptive    = ifelse(oral_contraceptive   %in% c("Do not know", "Prefer not to answer"), NA, oral_contraceptive),
        HRT                   = ifelse(HRT   %in% c("Do not know", "Prefer not to answer"), NA, HRT),
        hysterectomy          = ifelse(hysterectomy   %in% c("Not sure", "Prefer not to answer"), NA, hysterectomy),
        bilateral_oophorectomy          = ifelse(bilateral_oophorectomy   %in% c("Not sure", "Prefer not to answer"), NA, bilateral_oophorectomy)
        
    ) |>  
    mutate(
        mammogram             = relevel(factor(mammogram,       ordered = FALSE), ref =  "No"),
        cervical_smear        = relevel(factor(cervical_smear,       ordered = FALSE), ref = "No"),
        menarche_age          = as.numeric(menarche_age),
        menopause             = relevel(factor(menopause,       ordered = FALSE), ref = "No"),
        number_births         = as.numeric(number_births),
        birth_age             = as.numeric(birth_age),
        first_birth_age       = as.numeric(first_birth_age),
        last_birth_age        = as.numeric(last_birth_age),
        stillbirth            = relevel(factor(stillbirth,       ordered = FALSE), ref = "No"),
        oral_contraceptive    = relevel(factor(oral_contraceptive,       ordered = FALSE), ref = "No"),
        HRT                   = relevel(factor(HRT,       ordered = FALSE), ref = "No"),
        hysterectomy          = relevel(factor(hysterectomy,       ordered = FALSE), ref = "No"),
        bilateral_oophorectomy= relevel(factor(bilateral_oophorectomy,       ordered = FALSE), ref = "No")
        
    ) |>
    # recode some nested female reproduction variables 
    mutate(
        menopause_age         =  ifelse(!is.na(menopause_age), 
                                        menopause_age,
                                        ifelse(sex == "Male", 777,
                                               ifelse(dat$menopause == "No", 
                                                # code nested missing as 777
                                                777,
                                                # response to menopause is 1, but menopause age is truly missing
                                                NA))
                                       ),
        menopause_age         = as.numeric(menopause_age),
        birth_age             = ifelse(!is.na(birth_age),
                                       birth_age,
                                     ifelse(sex == "Male",
                                        777,
                                        ifelse(number_births != 1,
                                           777,
                                           NA))
                                ),
        birth_age             = as.numeric(birth_age),
        first_birth_age       = ifelse(!is.na(first_birth_age),
                                       first_birth_age,
                                    ifelse(sex == "Male", 
                                           777,
                                        ifelse(number_births <= 1,
                                               777,
                                               NA))
                                    ),
        first_birth_age       = as.numeric(first_birth_age),
        last_birth_age        = ifelse(!is.na(last_birth_age),
                                       last_birth_age,
                                    ifelse(sex == "Male",
                                           777,
                                        ifelse(number_births <= 1,
                                               777,
                                               NA))
                                ),
        last_birth_age        = as.numeric(last_birth_age)
        
)

female_repro_vars <- c("mammogram", "cervical_smear", "menarche_age", "menopause", "menopause_age", "number_births", "birth_age", "first_birth_age", "last_birth_age", "stillbirth", "oral_contraceptive", "HRT", "hysterectomy", "bilateral_oophorectomy")



### Mental health

In [65]:
dat <- dat  |>
#mental_health <- dat  |>
#    select(p20127_i0, p1920_i0, p1930_i0, p1940_i0, p1950_i0, p1960_i0, p1970_i0, p1980_i0, p1990_i0, 
#           p2000_i0, p2010_i0, p2030_i0, p2040_i0, p2050_i0, p2060_i0, p2070_i0, p2080_i0, p4598_i0,
#           p4631_i0, p4642_i0, p4653_i0, p4526_i0, p4537_i0, p4559_i0 , p4570_i0, p4581_i0) |>
    rename(
       neuroticism                   =  p20127_i0,
       mood_swings                   =  p1920_i0,
       miserableness                 =  p1930_i0,
       irritability                  =  p1940_i0, 
       sensitivity                   =  p1950_i0,
       fed_up_feelings               =  p1960_i0,
       nervous                       =  p1970_i0,
       worrier                       =  p1980_i0,
       tense                         =  p1990_i0,
       worry_embarassment            =  p2000_i0,
       nerves                        =  p2010_i0,
       guilty_feelings               =  p2030_i0,
       risk_taking                   =  p2040_i0,
       depressed_mood_freq           =  p2050_i0,
       unenthusiasm_freq             =  p2060_i0,
       tenseness_freq                =  p2070_i0,
       tiredness_freq                =  p2080_i0, 
       depressed_whole_week          =  p4598_i0,
       unenthusiastic_whole_week     =  p4631_i0,
       manic                         =  p4642_i0,
       irritable                     =  p4653_i0,
       happiness                     =  p4526_i0, 
       job_satisfaction              =  p4537_i0,
       family_satisfaction           =  p4559_i0,
       friends_satisfaction          =  p4570_i0,
       financial_satisfaction        =  p4581_i0 
        
    ) |>
    mutate(
        neuroticism                  = as.numeric(neuroticism),
        mood_swings                  = ifelse(mood_swings   %in% c("Do not know", "Prefer not to answer"), NA, mood_swings),
        miserableness                = ifelse(miserableness   %in% c("Do not know", "Prefer not to answer"), NA, miserableness),
        irritability                 = ifelse(irritability   %in% c("Do not know", "Prefer not to answer"), NA, irritability),
        sensitivity                  = ifelse(sensitivity   %in% c("Do not know", "Prefer not to answer"), NA, sensitivity),
        fed_up_feelings              = ifelse(fed_up_feelings   %in% c("Do not know", "Prefer not to answer"), NA, fed_up_feelings),
        nervous                      = ifelse(nervous   %in% c("Do not know", "Prefer not to answer"), NA, nervous),
        worrier                      = ifelse(worrier   %in% c("Do not know", "Prefer not to answer"), NA, worrier),
        tense                        = ifelse(tense   %in% c("Do not know", "Prefer not to answer"), NA, tense),
        worry_embarassment           = ifelse(worry_embarassment   %in% c("Do not know", "Prefer not to answer"), NA, worry_embarassment),
        nerves                       = ifelse(nerves   %in% c("Do not know", "Prefer not to answer"), NA, nerves),
        guilty_feelings              = ifelse(guilty_feelings   %in% c("Do not know", "Prefer not to answer"), NA, guilty_feelings),
        risk_taking                  = ifelse(risk_taking   %in% c("Do not know", "Prefer not to answer"), NA, risk_taking),
        depressed_mood_freq          = ifelse(depressed_mood_freq   %in% c("Do not know", "Prefer not to answer"), NA, depressed_mood_freq),
        unenthusiasm_freq            = ifelse(unenthusiasm_freq   %in% c("Do not know", "Prefer not to answer"), NA, unenthusiasm_freq),
        tenseness_freq               = ifelse(tenseness_freq   %in% c("Do not know", "Prefer not to answer"), NA, tenseness_freq),
        tiredness_freq               = ifelse(tiredness_freq   %in% c("Do not know", "Prefer not to answer"), NA, tiredness_freq),
        depressed_whole_week         = ifelse(depressed_whole_week   %in% c("Do not know", "Prefer not to answer"), NA, depressed_whole_week),
        unenthusiastic_whole_week    = ifelse(unenthusiastic_whole_week   %in% c("Do not know", "Prefer not to answer"), NA, unenthusiastic_whole_week),
        manic                        = ifelse(manic   %in% c("Do not know", "Prefer not to answer"), NA, manic),
        irritable                    = ifelse(irritable   %in% c("Do not know", "Prefer not to answer"), NA, irritable),
        happiness                    = ifelse(happiness   %in% c("Do not know", "Prefer not to answer"), NA, happiness),
        job_satisfaction             = ifelse(job_satisfaction   %in% c("Do not know", "Prefer not to answer"), NA, job_satisfaction),
        family_satisfaction          = ifelse(family_satisfaction   %in% c("Do not know", "Prefer not to answer"), NA, family_satisfaction),
        friends_satisfaction         = ifelse(friends_satisfaction   %in% c("Do not know", "Prefer not to answer"), NA, friends_satisfaction),
        financial_satisfaction       = ifelse(financial_satisfaction %in% c("Do not know", "Prefer not to answer"), NA, financial_satisfaction),
        
    ) |>
    mutate(
        mood_swings                  = relevel(factor(mood_swings,       ordered = FALSE), ref = "No"),
        miserableness                = relevel(factor(miserableness,       ordered = FALSE), ref = "No"),
        irritability                 = relevel(factor(irritability,       ordered = FALSE), ref = "No"),
        sensitivity                  = relevel(factor(sensitivity,       ordered = FALSE), ref = "No"),
        fed_up_feelings              = relevel(factor(fed_up_feelings,       ordered = FALSE), ref = "No"),
        nervous                      = relevel(factor(nervous,       ordered = FALSE), ref = "No"),
        worrier                      = relevel(factor(worrier,       ordered = FALSE), ref = "No"),
        tense                        = relevel(factor(tense,       ordered = FALSE), ref = "No"),
        worry_embarassment           = relevel(factor(worry_embarassment,       ordered = FALSE), ref = "No"),
        nerves                       = relevel(factor(nerves,       ordered = FALSE), ref = "No"),
        guilty_feelings              = relevel(factor(guilty_feelings,       ordered = FALSE), ref = "No"),
        risk_taking                  = relevel(factor(risk_taking,       ordered = FALSE), ref = "No"),
        depressed_mood_freq          = relevel(factor(depressed_mood_freq,       ordered = FALSE), ref = "Not at all"),
        unenthusiasm_freq            = relevel(factor(unenthusiasm_freq,       ordered = FALSE), ref = "Not at all"),
        tenseness_freq               = relevel(factor(tenseness_freq,       ordered = FALSE), ref = "Not at all"),
        tiredness_freq               = relevel(factor(tiredness_freq,       ordered = FALSE), ref = "Not at all"),
        depressed_whole_week         = relevel(factor(depressed_whole_week,       ordered = FALSE), ref = "No"),
        unenthusiastic_whole_week    = relevel(factor(unenthusiastic_whole_week,       ordered = FALSE), ref = "No"),
        manic                        = relevel(factor(manic,       ordered = FALSE), ref = "No"),
        irritable                    = relevel(factor(irritable,       ordered = FALSE), ref = "No"),
        happiness                    = factor(happiness, 
                                              levels = c("Extremely unhappy",
                                                         "Very unhappy",
                                                         "Moderately unhappy",
                                                         "Moderately happy",
                                                         "Very happy",
                                                         "Extremely happy"), ordered = FALSE),
        happiness                     = relevel(happiness,                 ref = "Extremely unhappy"),
        job_satisfaction                    = factor(job_satisfaction, 
                                              levels = c("Extremely unhappy",
                                                         "Very unhappy",
                                                         "Moderately unhappy",
                                                         "Moderately happy",
                                                         "Very happy",
                                                         "Extremely happy"), ordered = FALSE),
        job_satisfaction                     = relevel(job_satisfaction,                 ref = "Extremely unhappy"),
        family_satisfaction                    = factor(family_satisfaction, 
                                              levels = c("Extremely unhappy",
                                                         "Very unhappy",
                                                         "Moderately unhappy",
                                                         "Moderately happy",
                                                         "Very happy",
                                                         "Extremely happy"), ordered = FALSE),
        family_satisfaction                     = relevel(family_satisfaction,                 ref = "Extremely unhappy"),
        friends_satisfaction                    = factor(friends_satisfaction, 
                                              levels = c("Extremely unhappy",
                                                         "Very unhappy",
                                                         "Moderately unhappy",
                                                         "Moderately happy",
                                                         "Very happy",
                                                         "Extremely happy"), ordered = FALSE),
        friends_satisfaction                     = relevel(friends_satisfaction,                 ref = "Extremely unhappy"),
        financial_satisfaction                    = factor(financial_satisfaction, 
                                              levels = c("Extremely unhappy",
                                                         "Very unhappy",
                                                         "Moderately unhappy",
                                                         "Moderately happy",
                                                         "Very happy",
                                                         "Extremely happy"), ordered = FALSE),
        financial_satisfaction                     = relevel(financial_satisfaction,                 ref = "Extremely unhappy")
  )

mental_health_vars <- c("neuroticism", "mood_swings", "miserableness", "irritability", "sensitivity", "fed_up_feelings", "nervous", "worrier", "tense", "worry_embarassment", "nerves", "guilty_feelings", "risk_taking", "depressed_mood_freq", "unenthusiasm_freq", "tenseness_freq", "tiredness_freq", "depressed_whole_week", "unenthusiastic_whole_week", "manic", "irritable", "happiness", "job_satisfaction", "family_satisfaction", "friends_satisfaction", "financial_satisfaction")


### Electronics use

In [66]:
dat <- dat |>
#electronics <- dat |>
#    select(p1120_i0, p1130_i0, p2237_i0) |>
    rename(
       mobile_phone_weekly_usage =  p1120_i0,
       speakerphone              =  p1130_i0,
       computer_games              =  p2237_i0 
       
    ) |>  
    mutate(
       mobile_phone_weekly_usage                  = ifelse(mobile_phone_weekly_usage   %in% c("Do not know", "Prefer not to answer"), NA, mobile_phone_weekly_usage),
       speakerphone                               = ifelse(speakerphone   %in% c("Do not know", "Prefer not to answer"), NA, speakerphone),
       computer_games                             = ifelse(computer_games   %in% c("Do not know", "Prefer not to answer"), NA, computer_games) 
    ) |>
    mutate(
       mobile_phone_weekly_usage       = relevel(factor(mobile_phone_weekly_usage,       ordered = FALSE), ref = "Less than 5mins"),
       speakerphone       = relevel(factor(speakerphone,       ordered = FALSE), ref = "Never or almost never"),
       computer_games       = relevel(factor(computer_games,       ordered = FALSE), ref = "Never/rarely") 
    )

electronics_vars <- c("mobile_phone_weekly_usage", "speakerphone", "computer_games")

### Sleep (NB: to modify after imputation)

In [67]:
dat <- dat |>
#sleep <- dat |>
#    select(p1160_i0, p1170_i0, p1180_i0, p1190_i0, p1200_i0, p1210_i0) |>
    rename(
       #sleep_hours             =  p1160_i0,
       easy_wake               =  p1170_i0,
       chronotype              =  p1180_i0,
       nap                     =  p1190_i0,
       sleep_difficulty        =  p1200_i0,
       snoring                 =  p1210_i0
       
        
    ) |>  
    mutate(
       #sleep_hours                   = ifelse(snoring   %in% c("Do not know", "Prefer not to answer"), NA, snoring),
       easy_wake                     = ifelse(easy_wake   %in% c("Do not know", "Prefer not to answer"), NA, easy_wake),
       chronotype                    = ifelse(chronotype   %in% c("Do not know", "Prefer not to answer"), NA, chronotype),
       nap                           = ifelse(nap   %in% c("Prefer not to answer"), NA, nap),
       sleep_difficulty              = ifelse(sleep_difficulty   %in% c("Prefer not to answer"), NA, sleep_difficulty),
       snoring                       = ifelse(snoring   %in% c("Prefer not to answer", "Do not know"), NA, snoring)
        
        
    ) |>
   mutate(
       #sleep_hours                   = as.numeric(snoring),
       easy_wake                     = relevel(factor(easy_wake,       ordered = FALSE), ref = "Not at all easy"),
       chronotype                    = relevel(factor(chronotype,       ordered = FALSE), ref = "More a 'morning' than 'evening' person"),
       nap                           = relevel(factor(nap,       ordered = FALSE), ref = "Never/rarely"),
       sleep_difficulty              = relevel(factor(sleep_difficulty,       ordered = FALSE), ref = "Never/rarely"),
       snoring                       = relevel(factor(snoring,       ordered = FALSE), ref = "No")  
       
   )

sleep_vars <- c("sleep_hours", "easy_wake", "chronotype", "nap", "sleep_difficulty", "snoring")

### Smoking

In [68]:
dat <- dat |>
#smoking <- dat |>
 #   select(p20160_i0, p20162_i0, p20161_i0, p20116_i0, p1239_i0) |>
    rename(
       ever_smoked       =  p20160_i0,
       pack_years_prop   = p20162_i0,
       pack_years        = p20161_i0,
       smoking_status    = p20116_i0,
       tobacco           = p1239_i0
        
    ) |>  
    mutate(
       pack_years_prop = as.numeric(pack_years_prop),
       pack_years = as.numeric(pack_years),
       ever_smoked      = ifelse(ever_smoked   %in% c("Do not know", "Prefer not to answer"), NA, ever_smoked),
       smoking_status      = ifelse(smoking_status   %in% c("Prefer not to answer"), NA, smoking_status),
       tobacco      = ifelse(tobacco   %in% c("Prefer not to answer"), NA, tobacco)
       
    ) |>
    mutate(
       ever_smoked       = relevel(factor(ever_smoked,       ordered = FALSE), ref = "No"),
       smoking_status    = relevel(factor(smoking_status,       ordered = FALSE), ref =  "Never"),
       tobacco           = factor(tobacco, levels = c("No",
                                                      "Only occasionally",
                                                      "Yes, on most or all days"), ordered = FALSE),
        tobacco          = relevel(tobacco, ref = "No")
        
    ) |>
    #Recoding pack_years and pack_years_prop because nested
    mutate(
      pack_years         = ifelse(!is.na(pack_years),
                                  pack_years,
                                  ifelse(ever_smoked == "No", 0,  NA)
                                 ),
      pack_years_prop    = ifelse(!is.na(pack_years_prop),
                                  pack_years_prop,
                                  ifelse(ever_smoked == "No", 0, NA)
                                 )
    )

smoking_vars <- c("ever_smoked", "pack_years_prop", "pack_years", "smoking_status", "tobacco")


### Alcohol

In [69]:
dat <- dat |>
#alcohol <- dat |>
#    select(p1558_i0) |>
    rename(
       alcohol_freq =  p1558_i0
    ) |>  
    mutate(
        alcohol_freq       = ifelse(alcohol_freq   %in% c("Prefer not to answer"), NA, alcohol_freq)
    ) |>
    mutate(
        alcohol_freq       = factor(alcohol_freq, levels = c("Never",
                                                        "Special occasions only",
                                                        "One to three times a month",
                                                        "Once or twice a week",
                                                        "Three or four times a week",
                                                        "Daily or almost daily"), ordered = FALSE),
        alcohol_freq       = relevel(alcohol_freq, ref = "Never")
    )

alcohol_vars <- c("alcohol_freq")


### Diet

In [70]:
dat <- dat |>
#diet <- dat |>
#    select(p1329_i0, p1339_i0, p1349_i0, p1359_i0, p1408_i0, p1478_i0, p1488_i0, p1498_i0, p1528_i0, p1538_i0, p1548_i0) |>
    rename(
       oily_fish             =  p1329_i0,
       non_oily_fish         =  p1339_i0, 
       processed_meat        =  p1349_i0,
       poultry               =  p1359_i0,
       cheese                =  p1408_i0, 
       salt                  =  p1478_i0,
       tea                   =  p1488_i0, 
       coffee                 =  p1498_i0, 
       water                 =  p1528_i0, 
       diet_change_5yrs      =  p1538_i0, 
       diet_variation        =  p1548_i0 
       
    ) |>  
    mutate(
        oily_fish            = ifelse(oily_fish   %in% c("Prefer not to answer", "Do not know"), NA, oily_fish),
        non_oily_fish        = ifelse(non_oily_fish   %in% c("Prefer not to answer", "Do not know"), NA, non_oily_fish),
        processed_meat       = ifelse(processed_meat   %in% c("Prefer not to answer", "Do not know"), NA, processed_meat),
        poultry              = ifelse(poultry   %in% c("Prefer not to answer", "Do not know"), NA, poultry),
        cheese               = ifelse(cheese   %in% c("Prefer not to answer", "Do not know"), NA, cheese),
        salt                 = ifelse(salt   %in% c("Prefer not to answer", "Do not know"), NA, salt),
        tea                  = ifelse(tea   %in% c("Prefer not to answer", "Do not know"), NA, tea),
        tea                  = ifelse(tea   %in% c("Less than one"), 0.5, tea),
        coffee                = ifelse(coffee   %in% c("Prefer not to answer", "Do not know"), NA, coffee),
        coffee                = ifelse(coffee   %in% c("Less than one"), 0.5, coffee),
        water                = ifelse(water   %in% c("Prefer not to answer", "Do not know"), NA, water),
        water                = ifelse(water   %in% c("Less than one"), 0.5, water),
        diet_change_5yrs     = ifelse(diet_change_5yrs   %in% c("Prefer not to answer"), NA, diet_change_5yrs),
        diet_variation       = ifelse(diet_variation   %in% c("Prefer not to answer", "Do not know"), NA, diet_variation)
        
        
    ) |>
    mutate(
        oily_fish            = relevel(factor(oily_fish,       ordered = FALSE), ref = "Never"),
        non_oily_fish        = relevel(factor(non_oily_fish,       ordered = FALSE), ref = "Never"),
        processed_meat       = relevel(factor(processed_meat,       ordered = FALSE), ref = "Never"),
        poultry              = relevel(factor(poultry,       ordered = FALSE), ref = "Never"),
        cheese               = relevel(factor(cheese,       ordered = FALSE), ref = "Never"),
        salt                 = relevel(factor(salt,       ordered = FALSE), ref = "Never/rarely"),
        tea                  = as.numeric(tea),
        coffee                = as.numeric(coffee),
        water                = as.numeric(water),
        diet_change_5yrs     = relevel(factor(diet_change_5yrs,       ordered = FALSE), ref = "No"),
        diet_variation       = relevel(factor(diet_variation,       ordered = FALSE), ref = "Never/rarely"),
    )

diet_vars <- c("oily_fish", "non_oily_fish", "processed_meat", "poultry", "cheese", "salt", "tea", "coffee", "water", "diet_change_5yrs", "diet_variation")

### Sun exposure

In [71]:
dat <- dat |>
#sun <- dat |>
#    select(p1050_i0, p1060_i0, p1727_i0, p1737_i0, p2267_i0, p2277_i0) |>
    rename(
       summer_outdoors_time        =  p1050_i0,
       winter_outdoors_time        =  p1060_i0,
       skin_tan_ease               =  p1727_i0, 
       childhood_sunburn_number    =  p1737_i0,
       sun_protection_use          =  p2267_i0,
       solarium_freq               =  p2277_i0  
        
    ) |>  
    mutate(
        summer_outdoors_time       = ifelse(summer_outdoors_time   %in% c("Prefer not to answer", "Do not know"), NA, summer_outdoors_time),
        summer_outdoors_time       = ifelse(summer_outdoors_time   %in% c("Less than one"), 0.5, summer_outdoors_time),
        winter_outdoors_time       = ifelse(winter_outdoors_time   %in% c("Prefer not to answer", "Do not know"), NA, winter_outdoors_time),
        winter_outdoors_time       = ifelse(winter_outdoors_time   %in% c("Less than one"), 0.5, winter_outdoors_time),
        skin_tan_ease              = ifelse(skin_tan_ease   %in% c("Prefer not to answer", "Do not know"), NA, skin_tan_ease ),
        childhood_sunburn_number   = ifelse(childhood_sunburn_number   %in% c("Prefer not to answer", "Do not know"), NA, childhood_sunburn_number),
        sun_protection_use         = ifelse(sun_protection_use   %in% c("Prefer not to answer", "Do not know"), NA, sun_protection_use),
        solarium_freq              = ifelse(solarium_freq   %in% c("Prefer not to answer", "Do not know"), NA, solarium_freq),
        solarium_freq              = ifelse(solarium_freq   %in% c("Less than once a year"), 0.5, solarium_freq)
        
        
    ) |>
    mutate(
        summer_outdoors_time       = as.numeric(summer_outdoors_time),
        winter_outdoors_time       = as.numeric(winter_outdoors_time),
        skin_tan_ease              = relevel(factor(skin_tan_ease,       ordered = FALSE), ref = "Get moderately tanned"),
        childhood_sunburn_number   = as.numeric(childhood_sunburn_number),
        sun_protection_use         = relevel(factor(sun_protection_use,       ordered = FALSE), ref = "Never/rarely"),
        solarium_freq              = as.numeric(solarium_freq),
        
    )

sun_vars <- c("summer_outdoors_time", "winter_outdoors_time", "skin_tan_ease", "childhood_sunburn_number", "sun_protection_use", "solarium_freq")


[1m[22m[36mℹ[39m In argument: `summer_outdoors_time = as.numeric(summer_outdoors_time)`.
[33m![39m NAs introduced by coercion


### Sex history

In [72]:
dat <- dat |>
#sex <- dat |>
#    select(p2159_i0) |>
    rename(
        same_sex_intercourse =  p2159_i0
    ) |>  
    mutate(
        same_sex_intercourse       = ifelse(same_sex_intercourse   %in% c("Prefer not to answer"), NA, same_sex_intercourse)
    ) |>
    mutate(
        same_sex_intercourse       = relevel(factor(same_sex_intercourse,       ordered = FALSE), ref = "No")
    )

sex_vars <- c("same_sex_intercourse")

### Physical activity

In [73]:
dat <- dat |>
#phys_activity <- dat |>
#select(p22032_i0) |>
    rename(
       IPAQ_activity_group =  p22032_i0
    ) |>
    mutate(
       IPAQ_activity_group       = relevel(factor(IPAQ_activity_group,       ordered = FALSE), ref = "low")
    )

phys_activity_vars <- c("IPAQ_activity_group")


### Material deprivation

In [74]:
dat <- dat |>
#imd <- dat |>
#    select(p22189) |>
    rename(
       townsend_deprivation_index =  p22189
    ) |>
    mutate(
       townsend_deprivation_index       = as.numeric(townsend_deprivation_index)
    )

imd_vars <- c("townsend_deprivation_index")


### Physical environment

In [75]:
dat <- dat |>
#enviro <- dat |>
#    select(p24023, p24024, p24020, p24021, p24022, p24014, p20118_i0, p24508_i0, p24501_i0, p24504_i0, p24500_i0, 
#           p24503_i0, p24012, p24010, p24506_i0, p24507_i0, p24016, p24017, p24018, p24003, p24004, p24019, p24005,
#           p24007, p24006, p24008, p24015, p24011, p24013, p24009, p24502_i0, p24505_i0            
#          ) |>
    rename(
        sound_average_16hr           =  p24023,
        sound_average_24hr           =  p24024,
        daytime_sound_average        =  p24020,
        evening_sound_average        =  p24021,
        night_sound_average          =  p24022,
        close_to_major_road          =  p24014,
        population_density           =  p20118_i0,
        distance_to_coast            =  p24508_i0,
        domestic_garden_buffer_1000m =  p24501_i0,
        domestic_garden_buffer_300m  =  p24504_i0,
        greenspace_buffer_1000m      =  p24500_i0,
        greenspace_buffer_300m       =  p24503_i0,
        inverse_distance_nearest_major_road       =  p24012,
        inverse_distance_nearest_road             =  p24010,
        natural_environment_buffer_1000m          =  p24506_i0,
        natural_environment_buffer_300m           =  p24507_i0,
        NO2_2005                     =  p24016,
        NO2_2006                     =  p24017,
        NO2_2007                     =  p24018,
        NO2_2010                     =  p24003,
        NO_2010                      =  p24004,
        PM10_2007                    =  p24019,
        PM10_2010                    =  p24005,
        PM2.5_absorbance_2010        =  p24007,
        PM2.5_2010                   =  p24006,
        PM2.5_to_10_2010             =  p24008,
        road_length_sum_100m         =  p24015,
        traffic_load_major_roads     =  p24013,
        traffic_intensity_nearest_major_road     =  p24011,
        traffic_intensity_nearest_road           =  p24009,
        water_buffer_1000m                       =  p24502_i0,
        water_buffer_300m                        =  p24505_i0
        
        
    ) |>  
    mutate(
        sound_average_16hr                   = as.numeric(sound_average_16hr),
        sound_average_24hr                   = as.numeric(sound_average_24hr),
        daytime_sound_average                = as.numeric(daytime_sound_average),
        evening_sound_average                = as.numeric(evening_sound_average),
        distance_to_coast                    = as.numeric(distance_to_coast),
        domestic_garden_buffer_1000m         = as.numeric(domestic_garden_buffer_1000m),
        domestic_garden_buffer_300m          = as.numeric(domestic_garden_buffer_300m),
        greenspace_buffer_1000m              = as.numeric(greenspace_buffer_1000m),
        greenspace_buffer_300m               = as.numeric(greenspace_buffer_300m),
        inverse_distance_nearest_major_road  = as.numeric(inverse_distance_nearest_major_road),
        inverse_distance_nearest_road        = as.numeric(inverse_distance_nearest_road),
        natural_environment_buffer_1000m     = as.numeric(natural_environment_buffer_1000m),
        natural_environment_buffer_300m      = as.numeric(natural_environment_buffer_300m),
        NO2_2005                             = as.numeric(NO2_2005),
        NO2_2006                             = as.numeric(NO2_2006),
        NO2_2007                             = as.numeric(NO2_2007),
        NO2_2010                             = as.numeric(NO2_2010),
        NO_2010                              = as.numeric(NO_2010),
        PM10_2007                            = as.numeric(PM10_2007),
        PM10_2010                            = as.numeric(PM10_2010),
        PM2.5_absorbance_2010                = as.numeric(PM2.5_absorbance_2010),
        PM2.5_2010                           = as.numeric(PM2.5_2010),
        PM2.5_to_10_2010                     = as.numeric(PM2.5_to_10_2010),
        road_length_sum_100m                 = as.numeric(road_length_sum_100m),
        traffic_load_major_roads             = as.numeric(traffic_load_major_roads),
        traffic_intensity_nearest_major_road = as.numeric(traffic_intensity_nearest_major_road),
        traffic_intensity_nearest_road       = as.numeric(traffic_intensity_nearest_road),
        water_buffer_1000m                   = as.numeric(water_buffer_1000m),
        water_buffer_300m                    = as.numeric(water_buffer_300m),
        
        
    ) |>
    mutate(
        close_to_major_road       = relevel(factor(close_to_major_road,       ordered = FALSE), ref = "No"),
        population_density        = relevel(factor(population_density,   ordered = FALSE), ref = "England/Wales - Urban - less sparse")
    ) |>
    # recode home population density to urban/rural
    mutate(
        population_density        = ifelse(population_density == "Postcode not linkable",
                                            "Postcode not linkable",
                                        ifelse(population_density %in% c("England/Wales - Urban - sparse" ,
                                                                              "England/Wales - Urban - less sparse",
                                                                              "Scotland - Large Urban Area",
                                                                              "Scotland - Other Urban Area"),
                                               "Urban", "Rural")
                                           ),
        population_density        = factor(population_density, ordered = FALSE, levels = c("Urban", "Rural", "Postcode not linkable")),
        population_density        = relevel(population_density, ref = "Urban")                                  
        
    )

enviro_vars <- c("sound_average_16hr", "sound_average_24hr", "daytime_sound_average", "evening_sound_average", "night_sound_average", "close_to_major_road", "population_density", "distance_to_coast", "domestic_garden_buffer_1000m", "domestic_garden_buffer_300m", "greenspace_buffer_1000m", "greenspace_buffer_300m", "inverse_distance_nearest_major_road", "inverse_distance_nearest_road", "natural_environment_buffer_1000m", "natural_environment_buffer_300m", "NO2_2005", "NO2_2006", "NO2_2007", "NO2_2010", "NO_2010", "PM10_2007", "PM10_2010", "PM2.5_absorbance_2010", "PM2.5_2010", "PM2.5_to_10_2010", "road_length_sum_100m", "traffic_load_major_roads", "traffic_intensity_nearest_major_road", "traffic_intensity_nearest_road", "water_buffer_1000m", "water_buffer_300m")

### Check levels and summary of of variables

In [76]:

# Identify categorical and numeric variables
is_cat <- function(x) is.factor(x) || is.character(x)

# 1) Categorical variables: list of levels
categorical_list <- dat %>%
  select(where(is_cat)) %>%
  map(~ sort(unique(na.omit(as.character(.)))))  # keep NA out

# 2) Numeric variables: list of summaries
numeric_list <- dat %>%
  select(where(is.numeric)) %>%
  map(~ {
    x <- .
    c(
      n      = sum(!is.na(x)),
      mean   = mean(x, na.rm = TRUE),
      sd     = sd(x, na.rm = TRUE),
      min    = min(x, na.rm = TRUE),
      q1     = unname(quantile(x, 0.25, na.rm = TRUE)),
      median = median(x, na.rm = TRUE),
      q3     = unname(quantile(x, 0.75, na.rm = TRUE)),
      max    = max(x, na.rm = TRUE),
      n_na   = sum(is.na(x))
    )
  })



## Other preprocessing before imputation

### Excluding people adopted

In [78]:
# row count before
rows1 <- nrow(dat)
# subset dataset to only those that weren't adopted
dat <- dat[which(dat$adopted == "No"), ]
# row count after
rows2 <- nrow(dat)
# check sample sizes before/after removing adopted participants 
paste("n removed excluding adopted:", (rows1 - rows2))

In [79]:
saveRDS(dat, file = "environmental_factors_preproc_noimputation.rds")
system('dx upload environmental_factors_preproc_noimputation.rds --path /Mattia/csv/')

# Second dataset to re-map after imputation

In [5]:
ref <- dat |>
    create_string_dummies("p6164_i0") |>
    mutate(
        across(
      c(
        starts_with("p6164_i0."),
        -any_of(c("p6164_i0.None_of_the_above", "p6164_i0.Prefer_not_to_answer"))
      ),
          ~ case_when(
              p6164_i0.None_of_the_above == "Yes" ~ NA,
              p6164_i0.Prefer_not_to_answer == "Yes" ~ "Prefer not to answer",
              TRUE ~ .x
            )
           )
   ) |> 
    rename(
        light_DIY_4wks             = p6164_i0.Light_DIY_eg_pruning_watering_the_lawn_,
        pleasure_walks_4wks        = p6164_i0.Walking_for_pleasure_not_as_a_means_of_transport_,
        other_exercise_4wks        = p6164_i0.Other_exercises_eg_swimming_cycling_keep_fit_bowling_,
        heavy_DIY_4wks             = p6164_i0.Heavy_DIY_eg_weeding_lawn_mowing_carpentry_digging_,
        strenuous_sports_4wks      = p6164_i0.Strenuous_sports
  ) |> 
  select(-p6164_i0, -p6164_i0.None_of_the_above, -p6164_i0.Prefer_not_to_answer) |>
  mutate(
      light_DIY_4wks               = recode_factor(light_DIY_4wks, "0" = "No", "1" = "Yes"),
      pleasure_walks_4wks          = recode_factor(pleasure_walks_4wks, "0" = "No", "1" = "Yes"),
      other_exercise_4wks          = recode_factor(other_exercise_4wks, "0" = "No", "1" = "Yes"),
      heavy_DIY_4wks               = recode_factor(heavy_DIY_4wks, "0" = "No", "1" = "Yes"),
      strenuous_sports_4wks        = recode_factor(strenuous_sports_4wks, "0" = "No", "1" = "Yes")
  ) |>
    rename(
       adopted         =  p1767_i0,
       sex             =  p31,
       sleep_hours     =  p1160_i0,
       fresh_fruit     =  p1309_i0,
       dried_fruit     =  p1319_i0,
       cooked_veg      =  p1289_i0,
       salad_or_raw_veg=  p1299_i0,
       bread_type      =  p1448_i0,
       cereal_type     =  p1468_i0,
       cereal          =  p1458_i0,
       bread           =  p1438_i0, 
       TV_time         =  p1070_i0,
       computer_time   =  p1080_i0,
       driving_time    =  p1090_i0,
       beef            =  p1369_i0,
       lamb            =  p1379_i0,
       pork            =  p1389_i0, 
       pleasure_walks_duration= p981_i0,
       pleasure_walks_freq_4wks=p971_i0,
       other_exercise_duration=p3647_i0,
       other_exercise_freq_4wks=p3637_i0,
       strenuous_sports_duration=p1001_i0,
       strenuous_sports_freq_4wks=p991_i0,
       heavy_DIY_duration= p2634_i0,
       heavy_DIY_freq_4wks=p2624_i0,
       light_DIY_duration=p1021_i0,
       light_DIY_freq_4wks=p1011_i0,
       employment_hours=p767_i0,
       employment_heavy_manual=p816_i0,
       employment_standing=p806_i0,
       alcohol_status  = p20117_i0 
        
    ) |>
    rename(
    country_birth       = p1647_i0,
    breastfed           = p1677_i0,
    body_size_10yrs_old = p1687_i0,
    height_10yrs_old    = p1697_i0,
    handedness          = p1707_i0,
    part_multiple_birth = p1777_i0,
    maternal_smoking    = p1787_i0,
    birth_weight        = p20022_i0
    ) |>
    create_string_dummies("p6145_i0") |>
    mutate(
        across(
      c(
        starts_with("p6145_i0."),
        -any_of(c("p6145_i0.None_of_the_above", "p6145_i0.Prefer_not_to_answer"))
      ),
          ~ case_when(
              #p6145_i0.None_of_the_above == "Yes" |
              p6145_i0.Prefer_not_to_answer == "Yes" ~ "Prefer not to answer",
              TRUE ~ .x
             )
            )
    ) |> 
    rename(
        self_illness_injury_assault       = p6145_i0.Serious_illness_injury_or_assault_to_yourself,
        relative_illness_injury_assault   = p6145_i0.Serious_illness_injury_or_assault_of_a_close_relative,
        death_relative                    = p6145_i0.Death_of_a_close_relative,
        death_partner                     = p6145_i0.Death_of_a_spouse_or_partner,
        divorce                           = p6145_i0.Marital_separation_divorce,
        financial_difficulty              = p6145_i0.Financial_difficulties,
  ) |> 
  select(-p6145_i0, -p6145_i0.None_of_the_above, -p6145_i0.Prefer_not_to_answer) |>
  create_string_dummies("p6160_i0") |>
    mutate(
        across(
      c(
        starts_with("p6160_i0."),
        -any_of(c("p6160_i0.None_of_the_above", "p6160_i0.Prefer_not_to_answer"))
      ),
          ~ case_when(
              #p6160_i0.None_of_the_above == "Yes" |
              p6160_i0.Prefer_not_to_answer == "Yes" ~ "Prefer not to answer",
              TRUE ~ .x
             )
            )
    ) |> 
    rename(
        gym                    = p6160_i0.Sports_club_or_gym,
        pub                    = p6160_i0.Pub_or_social_club,
        religious_group        = p6160_i0.Religious_group,
        adult_education        = p6160_i0.Adult_education_class,
        other_group_activity   = p6160_i0.Other_group_activity,
        family_visit_freq      = p1031_i0,
        confide_freq           = p2110_i0,
        loneliness             = p2020_i0
    ) |> 
    select(-p6160_i0, -p6160_i0.None_of_the_above, -p6160_i0.Prefer_not_to_answer) |> 
    create_string_dummies("p6141_i0") |> 
    mutate(
        across(
          c(
            starts_with("p6141_i0."),
            -any_of(c("p6141_i0.Prefer_not_to_answer"))
          ),
          ~ case_when(
              p6141_i0.Prefer_not_to_answer == "Yes" ~ "Prefer not to answer",
              TRUE ~ .x
             )
        )
    ) |>
    create_string_dummies("p6139_i0") |>
    mutate(
        across(
          c(
            starts_with("p6139_i0."),
            -any_of(c("p6139_i0.Prefer_not_to_answer", "p6139_i0.Do_not_know"))
          ),
          ~ case_when(
              p6139_i0.Prefer_not_to_answer == "Yes" |
              p6139_i0.Do_not_know == "Yes" ~ "Prefer not to answer",
              TRUE ~ .x
             )
        )
    ) |>
    create_string_dummies("p6140_i0") |>
    mutate(
        across(
          c(
            starts_with("p6140_i0."),
            -any_of(c("p6140_i0.Prefer_not_to_answer", "p6140_i0.Do_not_know"))
          ),
          ~ case_when(
              p6140_i0.Prefer_not_to_answer == "Yes" |
              p6140_i0.Do_not_know == "Yes" ~ "Prefer not to answer",
              TRUE ~ .x
             )
        )
    ) |>
    create_string_dummies("p6142_i0") |>
    mutate(
        across(
          c(
            starts_with("p6142_i0."),
            -any_of(c("p6142_i0.Prefer_not_to_answer"))
          ),
          ~ case_when(
              p6142_i0.Prefer_not_to_answer == "Yes" ~ "Prefer not to answer",
              TRUE ~ .x
             )
        )
    ) |>
    rename(
       hshld_partner = p6141_i0.Husband_wife_or_partner, 
       hshld_children = p6141_i0.Son_and_or_daughter_include_step_children_,
       hshld_siblings = p6141_i0.Brother_and_or_sister,
       hshld_parents = p6141_i0.Mother_and_or_father,
       hshld_grandparent = p6141_i0.Grandparent,
       hshld_grandchild = p6141_i0.Grandchild,
       hshld_other_related = p6141_i0.Other_related,
       hshld_other_unrelated = p6141_i0.Other_unrelated,
       accommodation_type =  p670_i0,
       own_or_rent =  p680_i0,
       years_at_address =  p699_i0,
       hshld_number =  p709_i0,
       hshld_income =  p738_i0,
       private_healthcare =  p4674_i0,
       gas_hob_heat = p6139_i0.A_gas_hob_or_gas_cooker,
       gas_fire_heat = p6139_i0.A_gas_fire_that_you_use_regularly_in_winter_time,
       open_fire_heat = p6139_i0.An_open_solid_fuel_fire_that_you_use_regularly_in_winter_time,
       gas_central_heat = p6140_i0.Gas_central_heating,
       electric_storage_heat = p6140_i0.Electric_storage_heaters,
       oil_central_heat = p6140_i0.Oil_kerosene_central_heating,
       employed = p6142_i0.In_paid_employment_or_self_employed,
       retired = p6142_i0.Retired,
       home_maker = p6142_i0.Looking_after_home_and_or_family,
       unemployed = p6142_i0.Unemployed,
       volunteer = p6142_i0.Doing_unpaid_or_voluntary_work,
       student  = p6142_i0.Full_or_part_time_student
       
    ) |> 
    select(-p6141_i0, -p6141_i0.Prefer_not_to_answer, 
           -p6139_i0, -p6139_i0.Prefer_not_to_answer, -p6139_i0.Do_not_know, -p6139_i0.None_of_the_above,
           -p6140_i0, -p6140_i0.Prefer_not_to_answer, -p6140_i0.Do_not_know, -p6140_i0.None_of_the_above, -p6140_i0.Open_fire_without_central_heating, -p6140_i0.Portable_gas_or_paraffin_heaters, -p6140_i0.Solid_fuel_central_heating,
           -p6142_i0, -p6142_i0.Prefer_not_to_answer, -p6142_i0.None_of_the_above, -p6142_i0.Unable_to_work_because_of_sickness_or_disability
           
    ) |>  
    create_string_dummies("p6179_i0") |>
    mutate(
        across(
          c(
            starts_with("p6179_i0."),
            -any_of(c("p6179_i0.Prefer_not_to_answer"))
          ),
          ~ case_when(
              p6179_i0.Prefer_not_to_answer == "Yes" ~ "Prefer not to answer",
              TRUE ~ .x
             )
        )
    ) |>
    create_string_dummies("p6155_i0") |>
    mutate(
        across(
          c(
            starts_with("p6155_i0."),
            -any_of(c("p6155_i0.Prefer_not_to_answer"))
          ),
          ~ case_when(
              p6155_i0.Prefer_not_to_answer == "Yes" ~ "Prefer not to answer",
              TRUE ~ .x
             )
        )
    ) |>
    select(-p6179_i0, -p6179_i0.Prefer_not_to_answer, -p6179_i0.None_of_the_above,
           -p6155_i0, -p6155_i0.Prefer_not_to_answer, -p6155_i0.None_of_the_above
    ) |>
    rename(
        fish_oil_supplements = p6179_i0.Fish_oil_including_cod_liver_oil_,
        calcium_supplements = p6179_i0.Calcium,
        selenium_supplements = p6179_i0.Selenium,
        glucosamine_supplements = p6179_i0.Glucosamine,
        iron_supplements = p6179_i0.Iron,
        zinc_supplements  = p6179_i0.Zinc,
        multivitamin_supplements = p6155_i0.Multivitamins_minerals,
        vitamin_A_supplements = p6155_i0.Vitamin_A,
        vitamin_B_supplements = p6155_i0.Vitamin_B,
        vitamin_C_supplements = p6155_i0.Vitamin_C,
        vitamin_D_supplements = p6155_i0.Vitamin_D,
        vitamin_E_supplements  = p6155_i0.Vitamin_E,
        folate_supplements  = p6155_i0.Folic_acid_or_Folate_Vit_B9_
    ) |>
    rename(
       age_facial_hair =  p2375_i0,
       age_voice       =  p2385_i0,
       balding         =  p2395_i0,
       number_children         =  p2405_i0 
    ) |>
    rename(
       mammogram           =  p2674_i0,
       cervical_smear      =  p2694_i0,
       menarche_age        =  p2714_i0,
       menopause           =  p2724_i0,
       menopause_age       =  p3581_i0, 
       number_births       =  p2734_i0, 
       birth_age           =  p3872_i0,  
       first_birth_age     =  p2754_i0,
       last_birth_age      =  p2764_i0,
       stillbirth          =  p2774_i0, 
       oral_contraceptive  =  p2784_i0,
       HRT                 =  p2814_i0,
       hysterectomy        =  p3591_i0,
       bilateral_oophorectomy=  p2834_i0
    ) |>
     rename(
       neuroticism                   =  p20127_i0,
       mood_swings                   =  p1920_i0,
       miserableness                 =  p1930_i0,
       irritability                  =  p1940_i0, 
       sensitivity                   =  p1950_i0,
       fed_up_feelings               =  p1960_i0,
       nervous                       =  p1970_i0,
       worrier                       =  p1980_i0,
       tense                         =  p1990_i0,
       worry_embarassment            =  p2000_i0,
       nerves                        =  p2010_i0,
       guilty_feelings               =  p2030_i0,
       risk_taking                   =  p2040_i0,
       depressed_mood_freq           =  p2050_i0,
       unenthusiasm_freq             =  p2060_i0,
       tenseness_freq                =  p2070_i0,
       tiredness_freq                =  p2080_i0, 
       depressed_whole_week          =  p4598_i0,
       unenthusiastic_whole_week     =  p4631_i0,
       manic                         =  p4642_i0,
       irritable                     =  p4653_i0,
       happiness                     =  p4526_i0, 
       job_satisfaction              =  p4537_i0,
       family_satisfaction           =  p4559_i0,
       friends_satisfaction          =  p4570_i0,
       financial_satisfaction        =  p4581_i0 
    ) |>
    rename(
       mobile_phone_weekly_usage =  p1120_i0,
       speakerphone              =  p1130_i0,
       computer_games              =  p2237_i0 
       
    ) |>
    rename(
       #sleep_hours             =  p1160_i0,
       easy_wake               =  p1170_i0,
       chronotype              =  p1180_i0,
       nap                     =  p1190_i0,
       sleep_difficulty        =  p1200_i0,
       snoring                 =  p1210_i0
    ) |>
    rename(
       ever_smoked       =  p20160_i0,
       pack_years_prop   = p20162_i0,
       pack_years        = p20161_i0,
       smoking_status    = p20116_i0,
       tobacco           = p1239_i0
    ) |>
    rename(
       alcohol_freq =  p1558_i0
    ) |>
    rename(
       oily_fish             =  p1329_i0,
       non_oily_fish         =  p1339_i0, 
       processed_meat        =  p1349_i0,
       poultry               =  p1359_i0,
       cheese                =  p1408_i0, 
       salt                  =  p1478_i0,
       tea                   =  p1488_i0, 
       coffee                 =  p1498_i0, 
       water                 =  p1528_i0, 
       diet_change_5yrs      =  p1538_i0, 
       diet_variation        =  p1548_i0 
    ) |>
     rename(
       summer_outdoors_time        =  p1050_i0,
       winter_outdoors_time        =  p1060_i0,
       skin_tan_ease               =  p1727_i0, 
       childhood_sunburn_number    =  p1737_i0,
       sun_protection_use          =  p2267_i0,
       solarium_freq               =  p2277_i0  
    ) |>
    rename(
        same_sex_intercourse =  p2159_i0
    ) |>
    rename(
       IPAQ_activity_group =  p22032_i0
    ) |>
    rename(
       townsend_deprivation_index =  p22189
    ) |>
    rename(
        sound_average_16hr           =  p24023,
        sound_average_24hr           =  p24024,
        daytime_sound_average        =  p24020,
        evening_sound_average        =  p24021,
        night_sound_average          =  p24022,
        close_to_major_road          =  p24014,
        population_density           =  p20118_i0,
        distance_to_coast            =  p24508_i0,
        domestic_garden_buffer_1000m =  p24501_i0,
        domestic_garden_buffer_300m  =  p24504_i0,
        greenspace_buffer_1000m      =  p24500_i0,
        greenspace_buffer_300m       =  p24503_i0,
        inverse_distance_nearest_major_road       =  p24012,
        inverse_distance_nearest_road             =  p24010,
        natural_environment_buffer_1000m          =  p24506_i0,
        natural_environment_buffer_300m           =  p24507_i0,
        NO2_2005                     =  p24016,
        NO2_2006                     =  p24017,
        NO2_2007                     =  p24018,
        NO2_2010                     =  p24003,
        NO_2010                      =  p24004,
        PM10_2007                    =  p24019,
        PM10_2010                    =  p24005,
        PM2.5_absorbance_2010        =  p24007,
        PM2.5_2010                   =  p24006,
        PM2.5_to_10_2010             =  p24008,
        road_length_sum_100m         =  p24015,
        traffic_load_major_roads     =  p24013,
        traffic_intensity_nearest_major_road     =  p24011,
        traffic_intensity_nearest_road           =  p24009,
        water_buffer_1000m                       =  p24502_i0,
        water_buffer_300m                        =  p24505_i0
    ) 

In [8]:
saveRDS(ref, file = "reference_dataset.rds")
system('dx upload reference_dataset.rds --path ####YOUR PATH')