# Imputation of exposure variables
The same pipeline proposed by Argentieri et al. is followed.

In [1]:
install.packages("tidyverse")
library("tidyverse")
install.packages("Hmisc")
library("Hmisc")
install.packages("missRanger")
library(missRanger)
install.packages("plotly")
library(plotly)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.1     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.2     [32m✔[39m [34mtibble   [39m 3.3.0
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.1.0     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)


At

In [2]:
# Download the pre-imputation exposure dataset with the `dx` command-line tool.
# Replace the "###YOUR PATH" placeholder with the project folder before running.
system(
  command = 'dx download ###YOUR PATH/environmental_factors_preproc_noimputation.rds'
)

In [3]:
# Read the downloaded dataset and discard its `X` column.
dat <- select(
  readRDS("environmental_factors_preproc_noimputation.rds"),
  -X
)

## Identify variables with less than 45% missing (these are retained)

In [6]:
# Flag the columns to keep: TRUE when less than 45% of a column is missing.
# (Despite its name, `dat_missing` marks the columns that are retained.)
dat_missing <- sapply(
  dat,
  function(column) {
    sum(is.na(column)) / length(column) < 0.45
  }
)

In [8]:
# Keep only the columns flagged TRUE in `dat_missing`.
dat_nomiss <- dat[, dat_missing, drop = FALSE]

## Re-classing ordinal categorical variables as ordinal

In [None]:
# Flag which columns are factors
factors <- sapply(dat_nomiss, function(column) is.factor(column))

# Restrict the dataset to the factor columns
data <- dat_nomiss[, factors, drop = FALSE]

# Count the levels of every factor
factor_list <- sapply(data, function(column) length(levels(column)))

# Flag the factors with more than two levels
non_binary <- factor_list > 2

# Restrict to the non-binary factors
data_cat <- data[, non_binary, drop = FALSE]

# Print their names so each one can be checked for being ordinal
names(data_cat)

In [14]:
# Re-class the ordinal exposure variables as ordered factors.
#
# x: data frame holding the exposure variables.
#
# Returns `x` with every column named in `ordinal_levels` converted to an
# ordered factor whose levels follow the order listed here. Values that are
# not among the listed levels become NA (standard `factor()` behaviour).
# Columns that are absent from `x` (e.g. dropped by the missingness filter)
# are skipped with a warning rather than aborting with a replacement-length
# error.
make_ordered <- function(x) {

  # Level sets shared by several variables
  contact_freq <- c("Never or almost never",
                    "Once every few months",
                    "About once a month",
                    "About once a week",
                    "2-4 times a week",
                    "Almost daily")

  two_week_freq <- c("Not at all",
                     "Several days",
                     "More than half the days",
                     "Nearly every day")

  rarely_to_usually <- c("Never/rarely",
                         "Sometimes",
                         "Usually")

  rarely_to_often <- c("Never/rarely",
                       "Sometimes",
                       "Often")

  food_freq <- c("Never",
                 "Less than once a week",
                 "Once a week",
                 "2-4 times a week",
                 "5-6 times a week",
                 "Once or more daily")

  # Satisfaction scale, only used by the disabled entries below
  # satisfaction <- c("Extremely unhappy",
  #                   "Very unhappy",
  #                   "Moderately unhappy",
  #                   "Moderately happy",
  #                   "Very happy",
  #                   "Extremely happy")

  # One entry per ordinal variable: column name -> levels in order
  ordinal_levels <- list(
    family_visit_freq = c("No friends/family outside household", contact_freq),
    confide_freq = contact_freq,
    hshld_income = c("Less than 18,000",
                     "18,000 to 30,999",
                     "31,000 to 51,999",
                     "52,000 to 100,000",
                     "Greater than 100,000"),
    # private_healthcare = c("No, never",
    #                        "Yes, sometimes",
    #                        "Yes, most of the time",
    #                        "Yes, all of the time"),
    unenthusiasm_freq = two_week_freq,
    tenseness_freq = two_week_freq,
    tiredness_freq = two_week_freq,
    mobile_phone_weekly_usage = c("Less than 5mins",
                                  "5-29 mins",
                                  "30-59 mins",
                                  "1-3 hours",
                                  "4-6 hours",
                                  "More than 6 hours"),
    speakerphone = c("Never or almost never",
                     "Less than half the time",
                     "About half the time",
                     "More than half the time",
                     "Always or almost always"),
    computer_games = rarely_to_often,
    easy_wake = c("Not at all easy",
                  "Not very easy",
                  "Fairly easy",
                  "Very easy"),
    nap = rarely_to_usually,
    sleep_difficulty = rarely_to_usually,
    tobacco = c("No",
                "Only occasionally",
                "Yes, on most or all days"),
    oily_fish = food_freq,
    non_oily_fish = food_freq,
    processed_meat = food_freq,
    poultry = food_freq,
    beef = food_freq,
    lamb = food_freq,
    pork = food_freq,
    cheese = food_freq,
    salt = c(rarely_to_usually, "Always"),
    diet_variation = rarely_to_often,
    skin_tan_ease = c("Never tan, only burn",
                      "Get mildly or occasionally tanned",
                      "Get moderately tanned",
                      "Get very tanned"),
    sun_protection_use = c("Do not go out in sunshine",
                           "Never/rarely",
                           "Sometimes",
                           "Most of the time",
                           "Always"),
    IPAQ_activity_group = c("low",
                            "moderate",
                            "high")
    # Disabled entries, kept for reference:
    # happiness = satisfaction,
    # job_satisfaction = c("I am not employed", satisfaction),
    # family_satisfaction = satisfaction,
    # friends_satisfaction = satisfaction,
    # financial_satisfaction = satisfaction
  )

  # Report listed variables that are not in the data instead of failing
  absent <- setdiff(names(ordinal_levels), colnames(x))
  if (length(absent) > 0) {
    warning(
      "make_ordered(): skipping columns not found in the data: ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  for (variable in intersect(names(ordinal_levels), colnames(x))) {
    x[[variable]] <- factor(x[[variable]],
                            ordered = TRUE,
                            levels = ordinal_levels[[variable]])
  }

  return(x)
}

In [15]:
# Convert the ordinal variables of the filtered dataset to ordered factors.
dat_nomiss <- dat_nomiss |> make_ordered()

In [16]:
rows1 <- nrow(dat_nomiss)

# Report the analytic sample sizes (rows with a missing `sex` count for neither
# sex but are included in the total)
paste("final men analytic sample:", sum(dat_nomiss$sex == "Male", na.rm = TRUE))
paste("final women analytic sample:", sum(dat_nomiss$sex == "Female", na.rm = TRUE))
paste("final total analytic sample:", rows1)

In [17]:
# Number of columns retained after the missingness filter
dim(dat_nomiss)[2L]

In [18]:
# One-column data frame listing every variable name except the identifier.
# The argument expression is kept as-is because as.data.frame() derives the
# column name from it.
total_cols <- colnames(dat_nomiss)
total_cols <- as.data.frame(total_cols[which(total_cols != "eid")])

## Imputation
See the missRanger vignette for more details: https://cran.r-project.org/web/packages/missRanger/vignettes/vignette_missRanger.html

In [19]:
# Proportion of missing values in each retained column
missings <- colSums(is.na(dat_nomiss)) / nrow(dat_nomiss)

# Spread and average of the per-column missingness
range(missings)
mean(missings)

In [139]:
# Percentage of missing values in each variable
col_missing <- 100 * colMeans(is.na(dat_nomiss))

# Tidy table, one row per variable, sorted from most to least missing
missing_table <- data.frame(
  variable = names(col_missing),
  pct_missing = round(col_missing, digits = 2)
)
missing_table <- missing_table[order(-missing_table[["pct_missing"]]), , drop = FALSE]

# Let the notebook display the whole table
options(
  repr.matrix.max.rows = nrow(missing_table),
  repr.matrix.max.cols = ncol(missing_table)
)

# Look up a single variable
filter(missing_table, variable == "menopause")

Unnamed: 0_level_0,variable,pct_missing
Unnamed: 0_level_1,<chr>,<dbl>
menopause,menopause,54.17


### Exclude sex-specific variables and eid

In [13]:
# Variables subtracted from the right-hand side of the imputation formula
exclude_vars <- c(
  "eid",
  "menopause_age",
  "birth_age",
  "first_birth_age",
  "last_birth_age"
)

# Build ". ~ . - eid - menopause_age - ..." for missRanger
exclude_vars <- paste(exclude_vars, collapse = " - ")
formula <- as.formula(paste(". ~ .", exclude_vars, sep = " - "))

In [None]:
# Fix the RNG state; the same seed is also handed to missRanger below
set.seed(2025)

# Case weights: number of observed (non-missing) values in each row
non_miss <- rowSums(!is.na(dat_nomiss))

# Run the imputation. data_only = FALSE makes missRanger return its result
# object (used below as `dat2_imputed$data`) rather than only the data.
dat2_imputed <- missRanger(
  dat_nomiss,
  formula = formula,
  # forest settings
  num.trees = 200,
  num.threads = 16,
  case.weights = non_miss,
  # imputation settings
  pmm.k = 3,
  maxiter = 10,
  seed = 2025,
  # reporting / output
  verbose = 1,
  returnOOB = TRUE,
  data_only = FALSE
)

In [19]:
# Store the complete imputation result and upload it with `dx`.
# Replace the "###YOUR PATH" placeholder with the destination folder.
saveRDS(object = dat2_imputed, file = "Second_imputation.rds")
system(command = 'dx upload Second_imputation.rds --path ###YOUR PATH')

In [38]:
# Store only the imputed dataset and upload it with `dx`.
# Replace the "###YOUR PATH" placeholder with the destination folder.
saveRDS(object = dat2_imputed$data, file = "Exposome_imputed.rds")
system(command = 'dx upload Exposome_imputed.rds --path ###YOUR PATH')

In [None]:
## Retrieve OOB error rates to compare
# The previous version of this cell did not parse (unbalanced parenthesis),
# read an undefined object (`dat2`) and expected a list of five imputations
# carrying an "oob" attribute. The run above produces a single missRanger
# result object (data_only = FALSE), so the errors are read from that object.
# NOTE(review): per the missRanger documentation, `pred_errors` holds the
# per-iteration OOB prediction errors (one row per iteration, one column per
# imputed variable) and `best_iter` the iteration that was kept -- confirm
# against the installed missRanger version.
stopifnot(inherits(dat2_imputed, "missRanger"))

oob_errors <- dat2_imputed$pred_errors[dat2_imputed$best_iter, ]

# One row per imputed exposure; the column naming of the multi-imputation
# layout is kept so that a further run can be added as "Imputation 2", ...
error_200t <- data.frame(
  Exposure = names(oob_errors),
  `Imputation 1` = unname(oob_errors),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

cols <- c("Exposure",
          "Imputation 1")

error_200t <- error_200t[cols]

# Average and range of the OOB error across exposures
avg_oob_err_200t <- mean(error_200t[, 2])
range_oob_err_200t <- range(error_200t[, 2])

# With a single imputation these equal the values above; kept so that code
# relying on these names keeps working
total_avg_error <- mean(avg_oob_err_200t)
total_avg_range_high <- mean(range_oob_err_200t[2])

## Post-imputation variables recoding

In [22]:
# Derive post-imputation variables.
#
# x: data frame with `sleep_hours` and `population_density`.
#
# Adds `sleep_hours_categorical`, an unordered factor with reference level
# '7-9 hours' ('<7 hours' for values below 7, '7-9 hours' for values from 7 up
# to but excluding 9, '>9 hours' for values of 9 or more, NA otherwise), and
# re-codes `population_density` as an unordered factor with levels "Urban"
# (reference) and "Rural"; any other value becomes NA.
make_derived <- function(x) {

    hours <- x$sleep_hours

    # start from all-missing and fill in each band
    band <- rep(NA_character_, length(hours))
    band[which(hours < 7)] <- '<7 hours'
    band[which(hours >= 7 & hours < 9)] <- '7-9 hours'
    band[which(hours >= 9)] <- '>9 hours'

    band <- factor(band,
                   ordered = FALSE,
                   levels = c('<7 hours', '7-9 hours', '>9 hours'))
    x$sleep_hours_categorical <- relevel(band, ref = '7-9 hours')

    # factor() maps every value outside the listed levels
    # (including "Postcode not linkable") to NA
    density <- factor(x$population_density,
                      ordered = FALSE,
                      levels = c("Urban", "Rural"))
    x$population_density <- relevel(density, ref = "Urban")

    return(x)

}

In [23]:
# Re-code the household composition variables.
#
# x: data frame with `accommodation_type`, `hshld_number` and the
#    `hshld_partner`, `hshld_grandparent`, `hshld_grandchild` and
#    `hshld_siblings` columns.
#
# Rows whose accommodation type is NA or "Sheltered accommodation" get
# `hshld_number` set to 0. The four household-member variables are set to "No"
# for those rows and for rows where `hshld_number` is 1.
household_recode <- function(x) {

    # rows with NA or sheltered accommodation
    not_asked <- is.na(x$accommodation_type) |
        x$accommodation_type %in% "Sheltered accommodation"

    x$hshld_number[not_asked] <- 0

    # rows with a household of one (self), plus the rows flagged above
    no_cohabitants <- not_asked | x$hshld_number %in% 1

    member_vars <- c("hshld_partner",
                     "hshld_grandparent",
                     "hshld_grandchild",
                     "hshld_siblings")

    for (member in member_vars) {
        x[[member]][no_cohabitants] <- "No"
    }

    return(x)
}

In [24]:
# Re-code responses that should not be analysed as regular categories.
#
# x: data frame with the factor columns `sun_protection_use`, `own_or_rent`,
#    `accommodation_type`, `population_density` and `diet_change_5yrs`.
#
# For each variable the listed responses are set to NA and removed from the
# factor levels; unordered factors are then given an explicit reference
# level. Recodes that are currently disabled are kept as comments.
crosstab_recode <- function(x) {

  # Set the given responses to NA and rebuild the factor without those levels
  drop_responses <- function(values, responses, ordered) {
    values[which(values %in% responses)] <- NA
    kept <- levels(values)
    factor(values,
           ordered = ordered,
           levels = kept[!(kept %in% responses)])
  }

  # usual_walking_pace
  #x$usual_walking_pace[which(x$usual_walking_pace == "None of the above")] <- NA
  #old_levels <- levels(x$usual_walking_pace)
  #x$usual_walking_pace <- factor(x$usual_walking_pace,
  #                               ordered = TRUE,
  #                               levels = c("Brisk pace",
  #                                          "Steady average pace",
  #                                          "Slow pace"))

  # job_satisfaction
  #x$job_satisfaction[which(x$job_satisfaction == "I am not employed")] <- NA
  #old_levels <- levels(x$job_satisfaction)
  #x$job_satisfaction <-
  #  factor(x$job_satisfaction,
  #         ordered = TRUE,
  #         levels = old_levels[which(old_levels != "I am not employed")])

  # fast_driving
  #x$fast_driving[which(x$fast_driving == "Do not drive on the motorway")] <- NA
  #old_levels <- levels(x$fast_driving)
  #x$fast_driving <-
  #  factor(x$fast_driving,
  #         ordered = TRUE,
  #         levels = old_levels[which(old_levels != "Do not drive on the motorway")])

  # sun_protection_use (stays ordered)
  x$sun_protection_use <-
    drop_responses(x$sun_protection_use,
                   "Do not go out in sunshine",
                   ordered = TRUE)

  # narcolepsy
  #x$narcolepsy[which(x$narcolepsy == "All of the time")] <- "Often"
  #old_levels <- levels(x$narcolepsy)
  #x$narcolepsy <-
  #  factor(x$narcolepsy,
  #         ordered = TRUE,
  #         levels = old_levels[which(old_levels != "All of the time")])

  # bipolar
  #x$bipolar <- NA
  #x$bipolar[which(x$bipolar_MDD == "No Bipolar or Depression")] <- "No"
  #x$bipolar[which(x$bipolar_MDD == "Single Probable major depression episode")] <- "No"
  #x$bipolar[which(x$bipolar_MDD == "Probable Recurrent major depression (moderate)")] <- "No"
  #x$bipolar[which(x$bipolar_MDD == "Probable Recurrent major depression (severe)")] <- "No"
  #x$bipolar[which(x$bipolar_MDD == "Bipolar I Disorder")] <- "Yes"
  #x$bipolar[which(x$bipolar_MDD == "Bipolar II Disorder")] <- "Yes"
  #x$bipolar <- factor(x$bipolar, ordered = FALSE)
  #x$bipolar <- relevel(x$bipolar, ref = "No")

  # MDD
  #x$MDD <- NA
  #x$MDD[which(x$bipolar_MDD == "No Bipolar or Depression")] <- "No depression"
  #x$MDD[which(x$bipolar_MDD == "Bipolar I Disorder")] <- "No depression"
  #x$MDD[which(x$bipolar_MDD == "Bipolar II Disorder")] <- "No depression"
  #x$MDD[which(x$bipolar_MDD == "Single Probable major depression episode")] <- "Single episode"
  #x$MDD[which(x$bipolar_MDD == "Probable Recurrent major depression (moderate)")] <- "Moderate"
  #x$MDD[which(x$bipolar_MDD == "Probable Recurrent major depression (severe)")] <- "Severe"
  #x$MDD <- factor(x$MDD,
  #                ordered = TRUE,
  #                levels = c("No depression",
  #                           "Single episode",
  #                           "Moderate",
  #                           "Severe"))

  # remove old bipolar_MDD var from dataset
  #x <- subset(x, select = -c(bipolar_MDD))

  # hearing_difficulty
  #x$hearing_difficulty[which(x$hearing_difficulty == "I am completely deaf")] <- "Yes"
  #old_levels <- levels(x$hearing_difficulty)
  #x$hearing_difficulty <-
  #  factor(x$hearing_difficulty,
  #         ordered = FALSE,
  #         levels = old_levels[which(old_levels != "I am completely deaf")])
  #x$hearing_difficulty <- relevel(x$hearing_difficulty, ref = "No")

  # own_or_rent
  tenure <- drop_responses(x$own_or_rent,
                           "None of the above",
                           ordered = FALSE)
  x$own_or_rent <-
    relevel(tenure,
            ref = "Own outright (by you or someone in your household)")

  # accommodation_type
  housing <- drop_responses(x$accommodation_type,
                            c("None of the above",
                              "Care home",
                              "Sheltered accommodation"),
                            ordered = FALSE)
  x$accommodation_type <- relevel(housing, ref = "A house or bungalow")

  # hair_color
  #x$hair_color[which(x$hair_color == "Other")] <- NA
  #old_levels <- levels(x$hair_color)
  #x$hair_color <- factor(x$hair_color,
  #                       ordered = FALSE,
  #                       levels = old_levels[which(old_levels != "Other")])
  #x$hair_color <- relevel(x$hair_color, ref = "Light brown")

  # home population density: only "Urban" and "Rural" are kept as levels, so
  # "Postcode not linkable" (and any other value) becomes NA
  density <- factor(x$population_density,
                    ordered = FALSE,
                    levels = c("Urban", "Rural"))
  x$population_density <- relevel(density, ref = "Urban")

  # diet_change_5yrs
  diet_change <- drop_responses(x$diet_change_5yrs,
                                "Yes, because of illness",
                                ordered = FALSE)
  x$diet_change_5yrs <- relevel(diet_change, ref = "No")

  return(x)

}

### Diet recode

In [49]:
## Recode the diet type variables to account for nesting
# Check which response is the most common for each type variable
proportions(table(test$bread_type))
proportions(table(test$cereal_type))
proportions(table(test$coffee_type))
# The most common responses are "Wholemeal or wholegrain" (bread) and
# "Oat cereal (e.g. Ready Brek, porridge)" (cereal)


Wholemeal or wholegrain                   Brown     Other type of bread 
             0.56723710              0.12586780              0.04275609 
                  White 
             0.26413901 


      Other (e.g. Cornflakes, Frosties)          Biscuit cereal (e.g. Weetabix) 
                              0.1958539                               0.1767656 
Bran cereal (e.g. All Bran, Branflakes)                                  Muesli 
                              0.1684720                               0.2017081 
 Oat cereal (e.g. Ready Brek, porridge) 
                              0.2572004 

In [25]:
# Add a "never eat" category to the nested diet type variables.
#
# x: data frame with `bread`, `bread_type`, `cereal` and `cereal_type`.
#
# Returns `x` with `bread_type` / `cereal_type` as unordered factors that
# include a "Never eat ..." level, assigned wherever the matching intake
# (`bread` / `cereal`) is 0, and with the most common response as the
# reference level.
#
# The type factors are re-levelled BY LABEL. The previous implementation used
# `levels(f) <- c(...)`, which renames the existing levels by position; with
# the level order of the data (Wholemeal, Brown, Other, White for bread) this
# silently turned e.g. "Brown" into "White" and scrambled the cereal types.
diet_type_recode <- function(x) {

    # Rebuild `type` with the wanted level order (matching existing values by
    # label), flag zero intake as `never_label` and set the reference level
    add_never_level <- function(type, intake, type_levels, never_label, ref) {
        observed <- as.character(type)
        # labels not listed are appended rather than turned into NA
        all_levels <- union(c(type_levels, never_label),
                            observed[!is.na(observed)])
        type <- factor(observed, levels = all_levels)
        type[which(intake == 0)] <- never_label
        relevel(type, ref = ref)
    }

    ## bread type (reference: most common response)
    x$bread_type <-
        add_never_level(x$bread_type,
                        x$bread,
                        type_levels = c("Wholemeal or wholegrain",
                                        "White",
                                        "Brown",
                                        "Other type of bread"),
                        never_label = "Never eat bread",
                        ref = "Wholemeal or wholegrain")

    ## cereal type (reference: most common response)
    # The reference used to be "Other (e.g. Cornflakes, Frosties)", which was
    # only the largest group because the positional renaming had relabelled
    # the oat cereal responses as "Other".
    x$cereal_type <-
        add_never_level(x$cereal_type,
                        x$cereal,
                        type_levels = c("Oat cereal (e.g. Ready Brek, porridge)",
                                        "Bran cereal (e.g. All Bran, Branflakes)",
                                        "Biscuit cereal (e.g. Weetabix)",
                                        "Muesli",
                                        "Other (e.g. Cornflakes, Frosties)"),
                        never_label = "Never eat cereal",
                        ref = "Oat cereal (e.g. Ready Brek, porridge)")

    # coffee type (disabled)
    #x$coffee_type <-
    #    add_never_level(x$coffee_type,
    #                    x$coffee,
    #                    type_levels = c("Instant coffee",
    #                                    "Decaffeinated coffee (any type)",
    #                                    "Ground coffee (include espresso, filter etc)",
    #                                    "Other type of coffee"),
    #                    never_label = "Never drink coffee",
    #                    ref = "Instant coffee")

    return(x)

}



In [26]:
# Compute a partial fibre score from fruit, vegetable, bread and cereal intake.
#
# x: data frame with `fresh_fruit`, `dried_fruit`, `cooked_veg`,
#    `salad_or_raw_veg`, `bread`, `bread_type`, `cereal` and `cereal_type`.
#
# Adds the per-food fibre columns, their row sum `partial_fiber_score_raw`
# and its quintiles `partial_fiber_score` (ordered factor). `bread_fiber` and
# `cereal_fiber` are finally replaced by their own quintiles.
#
# Bread and cereal intakes are divided by 7 exactly once. The previous version
# divided the bread contribution by 7 a second time, which under-weighted
# bread sevenfold in the raw score relative to cereal.
#
# NOTE(review): cut() stops with "'breaks' are not unique" when two quintile
# boundaries coincide (e.g. many zero values); unchanged from before.
partial_fiber <- function(x) {

  quintile_labels <-
    c("quintile 1", "quintile 2", "quintile 3", "quintile 4", "quintile 5")

  # fibre coefficient applied per portion of each bread / cereal type
  bread_grams <- c(
    "Wholemeal or wholegrain" = 1.80,
    "Brown" = 1.26,
    "White" = 0.68,
    "Other type of bread" = 1.25
  )
  cereal_grams <- c(
    "Bran cereal (e.g. All Bran, Branflakes)" = 7.16,
    "Muesli" = 4.18,
    "Biscuit cereal (e.g. Weetabix)" = 2.92,
    "Oat cereal (e.g. Ready Brek, porridge)" = 1.92,
    "Other (e.g. Cornflakes, Frosties)" = 0.54
  )

  # (intake / 7) x coefficient of the reported type; types without a
  # coefficient (e.g. "Never eat bread") contribute 0, a missing type gives NA
  daily_fiber <- function(type, weekly_intake, grams_per_portion) {
    grams <- unname(grams_per_portion[as.character(type)])
    fiber <- (weekly_intake / 7) * grams
    fiber[!is.na(type) & is.na(grams)] <- 0
    fiber
  }

  # Cut a numeric vector at its own quintiles into an ordered factor
  as_quintiles <- function(values) {
    binned <- cut(values,
                  breaks = quantile(values,
                                    probs = seq(0, 1, 1/5),
                                    na.rm = TRUE),
                  labels = quintile_labels,
                  include.lowest = TRUE)
    factor(binned, ordered = TRUE, levels = quintile_labels)
  }

  # 2 grams fiber per fresh fruit serving
  x$fresh_fruit_fiber <- x$fresh_fruit * 2
  # 0.5 grams fiber per dried fruit serving
  x$dried_fruit_fiber <- x$dried_fruit * 0.5
  # 1 gram fiber per cooked vegetable serving
  x$cooked_veg_fiber <- x$cooked_veg * 1
  # 1 gram fiber per salad / raw vegetable serving
  x$salad_or_raw_veg_fiber <- x$salad_or_raw_veg * 1

  x$bread_fiber <- daily_fiber(x$bread_type, x$bread, bread_grams)
  x$cereal_fiber <- daily_fiber(x$cereal_type, x$cereal, cereal_grams)

  fiber_vars <- c(
    "fresh_fruit_fiber",
    "dried_fruit_fiber",
    "cooked_veg_fiber",
    "salad_or_raw_veg_fiber",
    "bread_fiber",
    "cereal_fiber"
  )

  # sum fiber intake across all vars
  x$partial_fiber_score_raw <- rowSums(x[fiber_vars])

  # quintiles of the total score and of the bread / cereal contributions
  x$partial_fiber_score <- as_quintiles(x$partial_fiber_score_raw)
  x$bread_fiber <- as_quintiles(x$bread_fiber)
  x$cereal_fiber <- as_quintiles(x$cereal_fiber)

  return(x)
}



In [27]:
# Categorise total daily fruit servings.
#
# x: data frame with `fresh_fruit` and `dried_fruit`.
#
# Adds `total_fruit`, an ordered factor with levels "<2 servings",
# "2-2.9 servings", "3-3.9 servings" and ">=4 servings", computed from
# fresh_fruit + dried_fruit / 2 (NA when that sum is NA).
total_fruit <- function(x) {

    serving_bands <- c("<2 servings",
                       "2-2.9 servings",
                       "3-3.9 servings",
                       ">=4 servings")

    # total number of servings (dried fruit is 2 pieces per serving)
    servings <- x$fresh_fruit + (x$dried_fruit / 2)

    # findInterval() counts how many of the cut points 2, 3, 4 are <= servings,
    # i.e. 0 for < 2, 1 for [2, 3), 2 for [3, 4) and 3 for >= 4
    band <- serving_bands[findInterval(servings, c(2, 3, 4)) + 1]

    x$total_fruit <- factor(band, ordered = TRUE, levels = serving_bands)

    return(x)
}





In [28]:
# Categorise total daily vegetable servings.
#
# x: data frame with `cooked_veg` and `salad_or_raw_veg`.
#
# Adds `total_veg`, an ordered factor with levels "<2 servings",
# "2-2.9 servings", "3-3.9 servings" and ">=4 servings", computed from
# cooked_veg / 2 + salad_or_raw_veg / 2 (NA when that sum is NA).
total_vegetables <- function(x) {

    serving_bands <- c("<2 servings",
                       "2-2.9 servings",
                       "3-3.9 servings",
                       ">=4 servings")

    # total number of servings (2 heaped tablespoons per serving)
    servings <- (x$cooked_veg / 2) + (x$salad_or_raw_veg / 2)

    # findInterval() counts how many of the cut points 2, 3, 4 are <= servings,
    # i.e. 0 for < 2, 1 for [2, 3), 2 for [3, 4) and 3 for >= 4
    band <- serving_bands[findInterval(servings, c(2, 3, 4)) + 1]

    x$total_veg <- factor(band, ordered = TRUE, levels = serving_bands)

    return(x)
}


In [30]:
# Derive weekly red-meat intake from the beef, lamb and pork frequency items.
#
# x: data frame with columns `beef`, `lamb` and `pork` holding the
#    questionnaire responses (factor or character).
# Returns `x` with these columns added:
#   beef_freq, lamb_freq, pork_freq  numeric times per week
#   red_meat_raw                     their row-wise sum (NA if any is NA)
#   red_meat                         ordered factor with four intake bands
red_meat <- function(x) {

    # times per week assigned to each response; any other response
    # (or a missing one) becomes NA
    weekly_freq <- c(
        "Never" = 0,
        "Less than once a week" = 0.5,
        "Once a week" = 1,
        "2-4 times a week" = 3,
        "5-6 times a week" = 5.5,
        "Once or more daily" = 7
    )

    meats <- c("beef", "lamb", "pork")
    meat_vars <- paste0(meats, "_freq")

    # convert each item to frequency per week with the shared lookup
    for (i in seq_along(meats)) {
        response <- as.character(x[[meats[i]]])
        x[[meat_vars[i]]] <- unname(weekly_freq[response])
    }

    # sum red meat intake across all vars (no na.rm: one missing item
    # makes the total missing)
    x$red_meat_raw <- rowSums(x[meat_vars])

    # cut into left-closed categories: [-Inf, 1), [1, 2), [2, 3), [3, Inf]
    x$red_meat <- cut(
        x$red_meat_raw,
        breaks = c(-Inf, 1, 2, 3, Inf),
        labels = c(
            "<1 time per week",
            "1-1.9 times per week",
            "2-2.9 times per week",
            ">=3 times per week"
        ),
        right = FALSE,
        include.lowest = TRUE,
        ordered_result = TRUE
    )

    return(x)
}

## consumption recode


In [31]:
# Recode food and drink intake items into ordered categories.
#
# x: data frame with the frequency items `oily_fish`, `non_oily_fish`,
#    `processed_meat`, `poultry` and `cheese` (factor or character) and the
#    numeric items `coffee` and `tea`.
# Returns `x` with each of those columns replaced by an ordered factor.
# Responses outside the final level set (and missing values) become NA.
intake_vars <- function(x) {

    # Collapse a frequency item to four ordered levels: everything from
    # "2-4 times a week" upwards is merged into ">=2 times per week".
    collapse_frequent <- function(v) {
        v <- as.character(v)
        v[v %in% c("2-4 times a week",
                   "5-6 times a week",
                   "Once or more daily")] <- ">=2 times per week"
        factor(v,
               ordered = TRUE,
               levels = c("Never",
                          "Less than once a week",
                          "Once a week",
                          ">=2 times per week"))
    }

    # oily fish, non-oily fish, processed meat and poultry share one recode
    for (food in c("oily_fish", "non_oily_fish", "processed_meat", "poultry")) {
        x[[food]] <- collapse_frequent(x[[food]])
    }

    # cheese: merge the two lowest and the two highest response categories
    cheese <- as.character(x$cheese)
    cheese[cheese %in% c("Never", "Less than once a week")] <- "<1 time per week"
    cheese[cheese %in% c("5-6 times a week", "Once or more daily")] <- ">=5 times per week"
    x$cheese <- factor(cheese,
                       ordered = TRUE,
                       levels = c("<1 time per week",
                                  "Once a week",
                                  "2-4 times a week",
                                  ">=5 times per week"))

    # coffee: exactly 0 is its own category; negative values fall in no
    # category and become NA
    cups <- x$coffee
    coffee_cat <- rep(NA_character_, length(cups))
    coffee_cat[which(cups == 0)] <- "0 cups/day"
    coffee_cat[which(cups > 0 & cups < 2)] <- "0.5-1.9 cups/day"
    coffee_cat[which(cups >= 2 & cups < 3)] <- "2-2.9 cups/day"
    coffee_cat[which(cups >= 3)] <- ">=3 cups/day"
    x$coffee <- factor(coffee_cat,
                       ordered = TRUE,
                       levels = c("0 cups/day",
                                  "0.5-1.9 cups/day",
                                  "2-2.9 cups/day",
                                  ">=3 cups/day"))

    # tea: left-closed bins [-Inf, 2), [2, 4), [4, 6), [6, Inf]
    x$tea <- cut(
        x$tea,
        breaks = c(-Inf, 2, 4, 6, Inf),
        labels = c("<2 cups/day",
                   "2-3.9 cups/day",
                   "4-5.9 cups/day",
                   ">=6 cups/day"),
        right = FALSE,
        include.lowest = TRUE,
        ordered_result = TRUE
    )

    return(x)
}

In [32]:
# Convert the numeric `cereal`, `bread` and `water` columns of `x` into
# ordered factors with four intake bands each, and return `x`.
bread_cereal <- function(x) {

    # Bin `values` at the three `cutpoints` into four left-closed intervals
    # (a value equal to a cutpoint goes to the higher band); NA stays NA.
    to_bands <- function(values, cutpoints, band_labels) {
        cut(
            values,
            breaks = c(-Inf, cutpoints, Inf),
            labels = band_labels,
            right = FALSE,
            include.lowest = TRUE,
            ordered_result = TRUE
        )
    }

    # cereal
    x$cereal <- to_bands(
        x$cereal,
        cutpoints = c(2, 5, 7),
        band_labels = c(
            "<2 bowls/week",
            "2-4.9 bowls/week",
            "5-6.9 bowls/week",
            ">=7 bowls/week"
        )
    )

    # bread
    x$bread <- to_bands(
        x$bread,
        cutpoints = c(8, 14, 20),
        band_labels = c(
            "<8 slices/week",
            "8-13.9 slices/week",
            "14-19.9 slices/week",
            ">=20 slices/week"
        )
    )

    # water
    x$water <- to_bands(
        x$water,
        cutpoints = c(1, 2, 3),
        band_labels = c(
            "<1 glass/day",
            "1-1.9 glass/day",
            "2-2.9 glass/day",
            ">=3 glass/day"
        )
    )

    return(x)
}

In [33]:
# Recode alcohol frequency and status.
#
# x: data frame with `alcohol_freq` (factor whose levels include "Never" and
#    "One to three times a month") and `alcohol_status` (values "Never",
#    "Previous", "Current").
# Returns `x` with:
#   alcohol_freq_old    nominal frequency, "Never" as first level, previous
#                       drinkers set to NA
#   alcohol_freq        nominal frequency for current drinkers only; levels
#                       "Never" and "Special occasions only" are dropped
#                       (those rows become NA) and "One to three times a
#                       month" is the reference level
#   alcohol_status_old  copy of the incoming status
#   alcohol_status      dichotomous factor Never/Current, previous drinkers NA
alcohol_recode <- function(x) {

    # recode to nominal (relevel() does not accept ordered factors)
    x$alcohol_freq <- factor(x$alcohol_freq, ordered = FALSE)
    x$alcohol_freq <- relevel(x$alcohol_freq, ref = "Never")
    # remove previous drinkers from reference category
    x$alcohol_freq[which(x$alcohol_status == "Previous")] <- NA

    # keep the all-respondent version before restricting to current drinkers
    x$alcohol_freq_old <- x$alcohol_freq

    # recode to only current drinkers: values in a dropped level become NA
    # when the factor is rebuilt
    current_levels <- setdiff(
        levels(x$alcohol_freq),
        c("Never", "Special occasions only")
    )
    x$alcohol_freq <-
        factor(x$alcohol_freq,
               ordered = FALSE,
               levels = current_levels)
    x$alcohol_freq <- relevel(x$alcohol_freq, ref = "One to three times a month")

    # blank out anyone whose status is never or previous drinker
    x$alcohol_freq[which(x$alcohol_status %in% c("Never", "Previous"))] <- NA

    # make new dichotomous alcohol status
    x$alcohol_status_old <- x$alcohol_status
    # previous drinkers belong to neither category
    x$alcohol_status[which(x$alcohol_status == "Previous")] <- NA
    # "Never" is listed first, so it is already the reference level
    x$alcohol_status <- factor(x$alcohol_status, ordered = FALSE, levels = c("Never", "Current"))

    return(x)
}

In [34]:
# Drop the intermediate fibre and meat-frequency columns from `x` and return
# the remaining columns. Errors if any of the listed columns is absent.
remove_vars <- function(x) {

    # columns to discard (bread_fiber and cereal_fiber are not removed)
    discard <- c(
        "fresh_fruit_fiber",
        "dried_fruit_fiber",
        "cooked_veg_fiber",
        "salad_or_raw_veg_fiber",
        "partial_fiber_score_raw",
        "beef_freq",
        "lamb_freq",
        "pork_freq"
    )

    # fail rather than silently skip a column that is not there
    stopifnot(all(discard %in% colnames(x)))

    # every row is kept; only the column set changes
    keep_rows <- rep_len(TRUE, nrow(x))
    keep_cols <- !(colnames(x) %in% discard)
    x <- x[keep_rows, keep_cols, drop = FALSE]

    return(x)
}

In [36]:
# Derive occupational physical activity (OPA) from employment items.
#
# x: data frame with `employment_hours` (numeric, multiplied by 60 to get
#    minutes per week), `employment_heavy_manual` and `employment_standing`
#    (responses "Always", "Usually", "Sometimes", "Never/rarely") and
#    `employed` ("No" marks people who are not employed).
# Returns `x` with these columns added:
#   employment_mins            employment minutes per week
#   hman_mins                  heavy manual minutes x 4.5 METs
#   employment_standing_mins   standing minutes x 2.25 METs
#   OPA_raw                    sum of the two MET-minute columns
#   OPA                        ordered factor low < moderate < high
OPA_recode <- function(x) {

    # minutes per week of employment
    x$employment_mins <- (x$employment_hours * 60)

    # share of working time per response - 2/3 and 1/3 are the values obtained
    # by rescaling the responses to 0-1, which assumes that never to always
    # are ordinal. Any other response (or NA) gives NA.
    time_share <- c(
        "Always" = 1,
        "Usually" = 2 / 3,
        "Sometimes" = 1 / 3,
        "Never/rarely" = 0
    )

    # MET-minutes per week for one exposure item
    met_minutes <- function(response, work_mins, employed, met) {
        share <- unname(time_share[as.character(response)])
        mins <- work_mins * share
        # "Never/rarely" is 0 minutes even when working time is missing
        mins[which(share == 0)] <- 0
        # set all those who are not employed to 0
        mins[which(employed == "No")] <- 0
        mins * met
    }

    # heavy manual work: mins/week x 4.5 METs
    x$hman_mins <- met_minutes(
        x$employment_heavy_manual, x$employment_mins, x$employed, met = 4.5
    )

    # employment standing: mins/week x 2.25 METs
    x$employment_standing_mins <- met_minutes(
        x$employment_standing, x$employment_mins, x$employed, met = 2.25
    )

    # make OPA summary
    opa_vars <- c("hman_mins", "employment_standing_mins")
    x$OPA_raw <-
        rowSums(
            x[opa_vars],
            # na.rm will set all people with NA in both cols to 0
            na.rm = TRUE
        )

    # make categorical: [0, 600) low, [600, 3000) moderate, >= 3000 high;
    # negative totals fall in no category and become NA
    x$OPA <- cut(
        x$OPA_raw,
        breaks = c(0, 600, 3000, Inf),
        labels = c("low", "moderate", "high"),
        right = FALSE,
        include.lowest = TRUE,
        ordered_result = TRUE
    )

    return(x)
}

In [37]:
# Names of the columns that sedentary_recode() adds up to obtain total
# sedentary time. A plain character vector, so no helper package is needed.
sed_vars <- c("TV_time", "computer_time", "driving_time")


# Add total sedentary time to `x`.
#
# Sums the columns of `x` named in the global `sed_vars`, stores the total in
# `sedentary_raw` (totals above 24 are set to NA) and bins it into the ordered
# factor `sedentary`. Returns `x` with both columns added.
sedentary_recode <- function(x) {

    # row-wise total; a missing component makes the total missing
    total <- rowSums(x[colnames(x) %in% sed_vars])

    # totals above 24 are discarded (they are not capped)
    total[which(total > 24)] <- NA
    x$sedentary_raw <- total

    # left-closed bins: [-Inf, 4), [4, 6), [6, Inf]
    x$sedentary <- cut(
        x$sedentary_raw,
        breaks = c(-Inf, 4, 6, Inf),
        labels = c("<4 hours", "4-5.9 hours", ">=6 hours"),
        right = FALSE,
        include.lowest = TRUE,
        ordered_result = TRUE
    )

    return(x)
}


In [None]:
# Run every recoding step on `dat2` in order, then drop the columns that are
# not carried forward.
recode_df <- dat2 |>
    make_derived() |>
    crosstab_recode() |>
    diet_type_recode() |>
    partial_fiber() |>
    total_fruit() |>
    total_vegetables() |>
    red_meat() |>
    intake_vars() |>
    bread_cereal() |>
    alcohol_recode() |>
    #LTPA_recode() |>
    OPA_recode() |>
    sedentary_recode() |>
    select(
        -c(
            sleep_hours,
            fresh_fruit,
            dried_fruit,
            cooked_veg,
            salad_or_raw_veg,
            bread_type,
            cereal_type,
            TV_time,
            computer_time,
            driving_time,
            beef,
            lamb,
            pork,
            cereal,
            bread,
            pleasure_walks_duration,
            employment_hours,
            employment_heavy_manual,
            employment_standing,
            alcohol_status,
            relative_illness_injury_assault,
            self_illness_injury_assault,
            other_group_activity,
            hshld_siblings,
            hshld_grandparent,
            hshld_parents,
            hshld_other_related,
            hshld_other_unrelated,
            partial_fiber_score,
            red_meat_raw,
            alcohol_freq_old,
            alcohol_status_old,
            employment_mins,
            hman_mins,
            employment_standing_mins,
            OPA_raw,
            sedentary_raw,
            partial_fiber_score_raw,
            pleasure_walks_4wks,
            fresh_fruit_fiber,
            dried_fruit_fiber,
            cooked_veg_fiber,
            salad_or_raw_veg_fiber,
            beef_freq,
            lamb_freq,
            pork_freq
        )
    ) |>
    # remove sex related factors
    select(
        -c(
            birth_age,
            first_birth_age,
            last_birth_age
        )
    )


In [218]:
# Write the recoded data to a local RDS file, then hand that file to the `dx`
# command-line tool for upload. Replace the ###YOUR PATH placeholder with the
# destination before running.
saveRDS(object = recode_df, file = "Exposome_imputed_recoded.rds")
system(command = 'dx upload Exposome_imputed_recoded.rds --path ###YOUR PATH')

## Re-map "prefer not to answer" (PNA) responses

In [220]:
# download the reference dataset with the `dx` command-line tool and read it,
# dropping the column `X`
# NOTE(review): the path downloaded ends in "reference_dataset.rds", but the
# file read below is "environmental_factors_preproc_reference_dataset.rds" -
# confirm both refer to the same file once ###YOUR PATH is filled in.
system('dx download "project-Gzb5YJQJgQzb41qJj9G5G1GV:/###YOUR PATH/reference_dataset.rds"')
ref <- readRDS("environmental_factors_preproc_reference_dataset.rds") |> select(-X)

In [223]:
# map PNA variables in the reference dataset
index_cols <- colnames(ref)#[which(colnames(ref) != "eid")]

# for every column of `ref`, the row positions holding a "prefer not to
# answer" response (coded as the text itself or as "-3"); lapply() always
# returns a list, so columns with equal match counts are never collapsed
# into a vector or matrix
index <- lapply(ref[index_cols], function(x) which(x == "Prefer not to answer" | x == "-3"))

# same list, with row positions translated to participant ids (`ref$eid`);
# element names are the column names of `ref`
index_eid <- lapply(index, function(rows) ref$eid[rows])

In [228]:
# Set values to NA for participants who answered "prefer not to answer" (PNA).
#
# x: data frame with an `eid` column plus the recoded variables.
# Reads the global list `index_eid`, whose elements hold the eids that gave a
# PNA response for the variable of the same name.
# Returns `x` with the affected cells replaced by NA.
#
# Two passes are made:
#   1. explicit assignments for derived/composite columns, which are blanked
#      when any of their listed source variables was PNA;
#   2. a generic loop that blanks every column of `x` whose own name is an
#      element of `index_eid`.
#
# NOTE(review): lookups use `index_eid$name`. On a list, `$` returns NULL when
# no element matches, and `%in% NULL` is FALSE everywhere, so such a line
# changes nothing. Several lookups below use names ending in ".0.0"
# (`employment.0.0`, `social_activity.0.0`, `sle.0.0`) while their neighbours
# use plain names - confirm these elements exist in `index_eid`.
# NOTE(review): `$` on a list also partial-matches element names; confirm no
# lookup relies on that.
PNA_remove <- function(x) {
    
    # recode newly derived or composite variables
    x$population_density[which(x$eid %in% index_eid$population_density)] <- NA
    x$alcohol_freq[which(x$eid %in% index_eid$alcohol_freq)] <- NA
    #x$education_years[which(x$eid %in% index_eid$education_years.0.0)] <- NA
    #x$dairy_milk[which(x$eid %in% index_eid$milk_type | 
    #                       x$eid %in% index_eid$cereal | 
    #                       x$eid %in% index_eid$tea | 
    #                       x$eid %in% index_eid$coffee)] <- NA
    #x$partial_fiber_score[which(x$eid %in% index_eid$dried_fruit |
    #                                x$eid %in% index_eid$fresh_fruit | 
    #                                x$eid %in% index_eid$cooked_veg |
    #                                x$eid %in% index_eid$salad_or_raw_veg | 
    #                                x$eid %in% index_eid$bread_type |
    #                                x$eid %in% index_eid$bread |
    #                                x$eid %in% index_eid$cereal |
    #                                x$eid %in% index_eid$cereal_type)] <- NA
    # diet composites: blank when any contributing item was PNA
    x$bread_fiber[which(x$eid %in% index_eid$bread_type |
                            x$eid %in% index_eid$bread)] <- NA
    x$cereal_fiber[which(x$eid %in% index_eid$cereal_type |
                             x$eid %in% index_eid$cereal)] <- NA
    x$red_meat[which(x$eid %in% index_eid$beef |
                         x$eid %in% index_eid$pork | 
                         x$eid %in% index_eid$lamb)] <- NA
    x$total_fruit[which(x$eid %in% index_eid$dried_fruit |
                            x$eid %in% index_eid$fresh_fruit)] <- NA
    x$total_veg[which(x$eid %in% index_eid$salad_or_raw_veg |
                          x$eid %in% index_eid$cooked_veg)] <- NA
    #x$LTPA[which(x$eid %in% index_eid$pleasure_walks_duration | 
    #                 x$eid %in% index_eid$pleasure_walks_freq_4wks | 
    #                 x$eid %in% index_eid$strenuous_sports_duration |
    #                 x$eid %in% index_eid$strenuous_sports_freq_4wks | 
    #                 x$eid %in% index_eid$other_exercise_duration | 
    #                 x$eid %in% index_eid$other_exercise_freq_4wks |
    #                 x$eid %in% index_eid$heavy_DIY_duration |
    #                 x$eid %in% index_eid$heavy_DIY_freq_4wks | 
    #                 x$eid %in% index_eid$light_DIY_duration | 
    #                 x$eid %in% index_eid$light_DIY_freq_4wks)] <- NA
    # occupational physical activity: blank when any employment item was PNA
    x$OPA[which(x$eid %in% index_eid$employment.0.0 |
                    x$eid %in% index_eid$employment_heavy_manual | 
                    x$eid %in% index_eid$employment_standing | 
                    x$eid %in% index_eid$employment_hours)] <- NA

    x$sleep_hours_categorical[which(x$eid %in% index_eid$sleep_hours)] <- NA
    
    # household composition
    #x$hshld_parents[which(x$eid %in% index_eid$hshld_relations.0.0)] <- NA
    x$hshld_partner[which(x$eid %in% index_eid$hshld_partner)] <- NA
    #x$hshld_siblings[which(x$eid %in% index_eid$hshld_relations.0.0)] <- NA
    x$hshld_children[which(x$eid %in% index_eid$hshld_children)] <- NA
    #x$hshld_grandparent[which(x$eid %in% index_eid$hshld_relations.0.0)] <- NA
    x$hshld_grandchild[which(x$eid %in% index_eid$hshld_grandchild)] <- NA
   
    # heating
    x$gas_hob_heat[which(x$eid %in% index_eid$gas_hob_heat)] <- NA
    x$gas_fire_heat[which(x$eid %in% index_eid$gas_fire_heat)] <- NA
    x$open_fire_heat[which(x$eid %in% index_eid$open_fire_heat)] <- NA
    
    #x$gas_central_heat[which(x$eid %in% index_eid$gas_central_heat)] <- NA
    #x$electric_storage_heat[which(x$eid %in% index_eid$electric_storage_heat)] <- NA
    #x$oil_central_heat[which(x$eid %in% index_eid$oil_central_heat)] <- NA
    #x$portable_gas_heat[which(x$eid %in% index_eid$heating_type.0.0)] <- NA
    #x$solid_fuel_heat[which(x$eid %in% index_eid$heating_type.0.0)] <- NA
    #x$open_fire_no_central_heat[which(x$eid %in% index_eid$heating_type.0.0)] <- NA
    
    # employment status
    x$employed[which(x$eid %in% index_eid$employed)] <- NA
    x$retired[which(x$eid %in% index_eid$retired)] <- NA
    x$home_maker[which(x$eid %in% index_eid$home_maker)] <- NA
    #x$unemployed_disability[which(x$eid %in% index_eid$employment.0.0)] <- NA
    x$unemployed[which(x$eid %in% index_eid$unemployed)] <- NA
    x$volunteer[which(x$eid %in% index_eid$volunteer)] <- NA
    x$student[which(x$eid %in% index_eid$student)] <- NA
    
    #x$attendance_allowance[which(x$eid %in% index_eid$public_assistance.0.)] <- NA
    #x$disability_allowance[which(x$eid %in% index_eid$public_assistance.0.)] <- NA
    #x$blue_badge[which(x$eid %in% index_eid$public_assistance.0.)] <- NA
   
    # social activities
    # NOTE(review): `pub` and `adult_education` look up `social_activity.0.0`,
    # unlike `gym` and `religious_group`, which use their own names - confirm.
    x$gym[which(x$eid %in% index_eid$gym)] <- NA
    x$pub[which(x$eid %in% index_eid$social_activity.0.0)] <- NA
    x$religious_group[which(x$eid %in% index_eid$religious_group)] <- NA
    x$adult_education[which(x$eid %in% index_eid$social_activity.0.0)] <- NA

    #x$heart_attack_diagnosis[which(x$eid %in% index_eid$heart_diagnosis.0.0)] <- NA
    #x$stroke_diagnosis[which(x$eid %in% index_eid$heart_diagnosis.0.0)] <- NA
    #x$angina_diagnosis[which(x$eid %in% index_eid$heart_diagnosis.0.0)] <- NA
    #x$high_blood_pressure_diagnosis[which(x$eid %in% index_eid$heart_diagnosis.0.0)] <- NA

    #x$asthma_diagnosis[which(x$eid %in% index_eid$other_diagnosis.0.0)] <- NA
    #x$DVT_diagnosis[which(x$eid %in% index_eid$other_diagnosis.0.0)] <- NA
    #x$hayfever_diagnosis[which(x$eid %in% index_eid$other_diagnosis.0.0)] <- NA
    #x$bronchitis_emphysema_diagnosis[which(x$eid %in% index_eid$other_diagnosis.0.0)] <- NA
    #x$blood_clot_lung_diagnosis[which(x$eid %in% index_eid$other_diagnosis.0.0)] <- NA

    #x$cholesterol_meds[which(x$eid %in% index_eid$meds.0.0 |
    #                               x$eid %in% index_eid$meds_hormones.0.0)] <- NA
    #x$blood_pressure_meds[which(x$eid %in% index_eid$meds.0.0 |
    #                               x$eid %in% index_eid$meds_hormones.0.0)] <- NA
    #x$insulin[which(x$eid %in% index_eid$meds.0.0 |
    #                               x$eid %in% index_eid$meds_hormones.0.0)] <- NA
    
    # vitamin supplements
    x$multivitamin_supplements[which(x$eid %in% index_eid$multivitamin_supplements)] <- NA
    x$vitamin_A_supplements[which(x$eid %in% index_eid$vitamin_A_supplements)] <- NA
    x$vitamin_B_supplements[which(x$eid %in% index_eid$vitamin_B_supplements)] <- NA
    x$vitamin_C_supplements[which(x$eid %in% index_eid$vitamin_C_supplements)] <- NA
    x$vitamin_D_supplements[which(x$eid %in% index_eid$vitamin_D_supplements)] <- NA
    x$vitamin_E_supplements[which(x$eid %in% index_eid$vitamin_E_supplements)] <- NA
    x$folate_supplements[which(x$eid %in% index_eid$folate_supplements)] <- NA

    # other supplements
    x$fish_oil_supplements[which(x$eid %in% index_eid$fish_oil_supplements)] <- NA
    x$calcium_supplements[which(x$eid %in% index_eid$calcium_supplements)] <- NA
    x$selenium_supplements[which(x$eid %in% index_eid$selenium_supplements)] <- NA
    x$glucosamine_supplements[which(x$eid %in% index_eid$glucosamine_supplements)] <- NA
    x$iron_supplements[which(x$eid %in% index_eid$iron_supplements)] <- NA
    x$zinc_supplements[which(x$eid %in% index_eid$zinc_supplements)] <- NA

    #x$never_eat_sugar[which(x$eid %in% index_eid$never_eat0.0)] <- NA
    #x$never_eat_eggs[which(x$eid %in% index_eid$never_eat0.0)] <- NA
    #x$never_eat_wheat[which(x$eid %in% index_eid$never_eat0.0)] <- NA
    #x$never_eat_dairy[which(x$eid %in% index_eid$never_eat0.0)] <- NA

    # life events
    # NOTE(review): `divorce` looks up `sle.0.0`, unlike its neighbours - confirm.
    #x$self_illness_injury_assault[which(x$eid %in% index_eid$sle.0.0)] <- NA
    #x$relative_illness_injury_assault[which(x$eid %in% index_eid$sle.0.0)] <- NA
    x$death_relative[which(x$eid %in% index_eid$death_relative)] <- NA
    x$death_partner[which(x$eid %in% index_eid$death_partner)] <- NA
    x$divorce[which(x$eid %in% index_eid$sle.0.0)] <- NA
    x$financial_difficulty[which(x$eid %in% index_eid$financial_difficulty)] <- NA
    
    # recode for all other variables: any column of `x` whose name is also an
    # element of `index_eid` is blanked for the eids listed under that name
    # (this also covers the same-named columns already handled above)
    for(k in seq_along(x)) {
        if (colnames(x[k]) %in% names(index_eid)) {
            x[[k]][which(x[["eid"]] %in% index_eid[[colnames(x[k])]])] <- NA
        }
    }
    
    return(x)
}


In [None]:
# blank out "prefer not to answer" responses in the recoded data
recode_pna_df <- recode_df |> PNA_remove()

In [233]:
# Write the PNA-cleaned data to a local RDS file, then hand that file to the
# `dx` command-line tool for upload. Replace the ###YOUR PATH placeholder with
# the destination before running.
saveRDS(object = recode_pna_df, file = "Exposome_imputed_recoded_pna.rds")
system(command = 'dx upload Exposome_imputed_recoded_pna.rds --path ###YOUR PATH')