# Data Preparation

In [188]:
#knitr::purl(here("Rmd/childcare_article_analyses.Rmd"), here("R/childcare_article_analyses.R"))

In [23]:
library(tidyverse)
library(here)
library(readxl)
library(foreign)
library(tidyverse)
library(kableExtra)
library(vegan)

In [11]:
# Load helper functions from the remote helper scripts.
# NOTE(review): the scripts are fetched from the `master` branch at run time,
# so their content may change between runs -- consider pinning a commit.
invisible(lapply(
  c(
    "https://raw.githubusercontent.com/HenrikEckermann/in_use/master/bayesian_helper.R",
    "https://raw.githubusercontent.com/HenrikEckermann/in_use/master/mb_helper.R",
    "https://raw.githubusercontent.com/HenrikEckermann/in_use/master/reporting.R"
  ),
  source
))

In [12]:
# Restore the objects stored in data/data_transfer.RData into the workspace
# (presumably including `data_transfer`, which is used right below -- TODO confirm).
load(here("data/data_transfer.RData"))
# Run the project's reading script; a later comment in this notebook states
# that it results in the object "genus".
source(here("R/read.R"))

In [13]:
# Reuse the meta variables created in other documents: keep the first nine
# columns of `data_transfer`.
meta_new <- data_transfer[, seq_len(9)]

In [14]:
# Show the first six rows of the meta data
head(meta_new, n = 6L)

subject_id,sample_id,time,cc,age_d,age_d_s,bf_count,bf_count_s,bf_ratio
207,sa_10000,post,no,98.0,0.0320946,6.0,0.43948544,1
388,sa_10038,post,yes,98.0,0.0320946,0.0,-1.65710371,0
381,sa_10118,post,yes,108.0,0.5275689,6.59,0.64565004,1
283,sa_10211,post,yes,107.03,0.4795079,5.0,0.09005391,1
216,sa_10230,post,no,100.0,0.1311895,7.0,0.78891696,1
277,sa_10247,pre,yes,79.0,-0.9093066,6.1,0.47442859,1


In [15]:
# Create categories for breastfeeding and childcare, plus an indicator that
# separates the CCpost group from all other groups:
#   groups: noCCpre / CCpre / noCCpost / CCpost (time point x childcare);
#           rows matching none of the first three conditions become "CCpost"
#   bf:     lowBF (bf_ratio <= 0.25), mediumBF (bf_ratio < 0.75), else highBF
#   ccpost: 1 if groups is "CCpost", otherwise 0
# All three columns are stored as factors.
meta_new <- meta_new %>%
  mutate(
    groups = factor(
      ifelse(
        time == "pre" & cc == "no",
        "noCCpre",
        ifelse(
          time == "pre" & cc == "yes",
          "CCpre",
          ifelse(time == "post" & cc == "no", "noCCpost", "CCpost")
        )
      )
    ),
    bf = factor(
      ifelse(
        bf_ratio <= 0.25,
        "lowBF",
        ifelse(bf_ratio < 0.75, "mediumBF", "highBF")
      )
    ),
    ccpost = factor(as.integer(groups == "CCpost"))
  )

In [16]:
# Add confounding variables (sibling status and child sex) from the SPSS file
# and join them to the meta data by subject.
#
# `sibling` is recoded to 1/0: "firstborn" becomes 0, every other non-missing
# label becomes 1, and missing values stay NA.
# NOTE(review): the label "at leat 1 sibling" presumably mirrors the spelling
# of the value label in the .sav file -- verify before "fixing" it. Also note
# that any unexpected label (e.g. an undeclared missing code) is coded as 1 by
# the final `else` branch -- confirm this is intended.
confounders <- foreign::read.spss(
  here("data/meta_data/bibo_confounders.sav"),
  to.data.frame = TRUE
) %>%
  select(ID, SIBLINGS, childsex) %>%
  rename(sibling = SIBLINGS, subject_id = ID) %>%
  mutate(
    sibling = ifelse(
      sibling == "at leat 1 sibling",
      1,
      ifelse(sibling == "firstborn", 0, 1)
    )
  ) %>%
  # keep only subjects that are part of this analysis
  filter(subject_id %in% meta_new$subject_id)
meta_new <- meta_new %>% left_join(confounders, by = "subject_id")

“Undeclared level(s) 6, 999 added in variable: DELIVERYmode”

In [17]:
# Create a new phyloseq object restricted to the samples listed in `meta_new`
# (read.R results in the object "genus" Leo created).
otu <- genus %>%
  otu_to_df(transpose = FALSE) %>%
  select(species, meta_new$sample_id) %>%
  df_to_otu()
pseq <- phyloseq(otu, df_to_sd(meta_new), tax_table(genus))

# Add diversity indices to the sample data: keep the "diversities" columns of
# the global indices and strip their "diversities_" prefix.
diversities <- pseq %>%
  global(index = "all") %>%
  select(contains("diversities")) %>%
  rownames_to_column("sample_id")
colnames(diversities) <- gsub("diversities_", "", colnames(diversities))

sample_data(pseq) <- pseq %>%
  sd_to_df() %>%
  left_join(diversities, by = "sample_id") %>%
  df_to_sd()
meta <- sd_to_df(pseq)

# CLR and relative-abundance transformations to deal with the compositionality
# of microbiota data
pseq.clr <- microbiome::transform(pseq, transform = "clr")
pseq.rel <- microbiome::transform(pseq, transform = "compositional")

Richness
Observed (richness 0)
Diversity
Evenness
Dominance
Rarity
“Setting class(x) to multiple strings ("tbl_df", "tbl", ...); result will no longer be an S4 object”

# PCA

A PCA on CLR-transformed abundances is based on the Euclidean distance between the CLR values, which is the Aitchison distance between samples.

In [18]:
# CLR-transformed abundances as a data frame; the first column is renamed to
# "sample_id" below, the remaining columns are the taxa.
otus.clr <- otu_to_df(pseq.clr)
# NOTE(review): this compares with `==` against a regex-escaped string (the
# literal contains backslashes). Unless the column name really contains
# backslashes this line is a no-op -- confirm against the actual column names.
colnames(otus.clr)[which(colnames(otus.clr) == "Clostridium \\(sensu stricto\\)")] <- "Clostridium_sensu_stricto"
# Strip underscores, dots and spaces from the taxon names
colnames(otus.clr) <- c("sample_id", gsub("_", "", colnames(otus.clr)[-1]))
colnames(otus.clr) <- gsub("\\.", "", colnames(otus.clr))
colnames(otus.clr) <- gsub(" ", "", colnames(otus.clr))
# NOTE: from here on `genus` is the character vector of cleaned taxon names
# and no longer the object created by read.R.
genus <- colnames(otus.clr)[-1]
data <- sd_to_df(pseq.clr) %>%
  left_join(otus.clr, by = "sample_id")
data$sibling <- as.factor(data$sibling)
data$childsex <- as.factor(data$childsex)


# PCA with CLR values (euclidean distance of clr transformed values = Aitchison distance)
pcx <- prcomp(otus.clr %>% column_to_rownames("sample_id"))
# Extract the loadings, scaled by a factor of 10, with the taxon name attached.
# as_tibble() replaces the deprecated as.tibble().
pcx_rot <-
  as_tibble(pcx$rotation * 10) %>%
  add_column(genus = rownames(pcx$rotation))

# Add the first five PCs to the data
princomps <- pcx$x %>%
  as.data.frame() %>%
  rownames_to_column("sample_id") %>%
  select(PC1, PC2, PC3, PC4, PC5, sample_id)
data <- data %>% left_join(princomps, by = "sample_id")

# How much variance do the PCs explain? (proportion, rounded to 2 decimals)
pc_var_explained <- round(pcx$sdev^2 / sum(pcx$sdev^2), 2)
pc1 <- pc_var_explained[1]
pc2 <- pc_var_explained[2]
pc3 <- pc_var_explained[3]
pc4 <- pc_var_explained[4]
pc5 <- pc_var_explained[5]

“Setting class(x) to multiple strings ("tbl_df", "tbl", ...); result will no longer be an S4 object”

In [48]:
# PCA on the CLR-transformed abundances, with sample ids as row names
prc <- otus.clr %>%
  column_to_rownames("sample_id") %>%
  prcomp()

In [51]:
# Principal component scores per sample
prc[["x"]]

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,⋯,PC121,PC122,PC123,PC124,PC125,PC126,PC127,PC128,PC129,PC130
sa_10000,0.6522481,-5.032932992,3.70614217,2.1031814,-1.327344857,2.71787131,-2.52470924,0.09053046,0.929992288,-0.388964615,⋯,-5.916126e-05,-0.0006769111,0.0056233695,-0.0021670125,0.0001076303,6.644995e-04,-0.0024848058,-0.0026899426,-9.358979e-05,-2.355754e-15
sa_10038,2.1772224,2.222425742,-1.14427369,-0.3484218,-0.113418987,-0.15860721,0.92335146,0.34826156,0.076911835,-0.061019323,⋯,1.787367e-03,-0.0004919804,0.0031533514,0.0046752640,0.0009389580,-2.939330e-03,-0.0032033244,-0.0010820887,-2.534319e-03,-1.040834e-16
sa_10118,-3.8047943,-3.362412891,0.04934593,5.5918241,-0.162220961,0.87947705,0.80783878,0.87546765,0.969592494,3.793341899,⋯,-7.878270e-04,0.0077840753,-0.0002975014,-0.0020403479,-0.0010708314,3.676301e-03,-0.0042117397,0.0010477876,-6.458181e-04,-2.144118e-15
sa_10211,0.5556765,-1.347495660,-2.29104743,-0.4959183,0.507623469,-1.67744968,-0.79543416,-0.50492866,-0.925542214,-0.463881386,⋯,5.545458e-04,-0.0046014409,-0.0009323639,0.0009103065,0.0023966780,4.529182e-03,-0.0003206956,0.0011654511,-1.273640e-03,6.245005e-16
sa_10230,0.5538283,-2.336835637,-0.05733496,-0.4504195,-0.049012978,-1.31517321,0.12022208,-0.21673896,-0.324764926,-0.244069244,⋯,2.785502e-03,-0.0009503741,0.0023659031,-0.0039445980,0.0019459324,7.132849e-03,0.0019064290,-0.0032579156,2.564181e-03,6.470519e-16
sa_10247,0.2194781,-3.135661431,0.50348276,-0.4307538,0.462025321,-1.98629623,-0.18896649,0.90461160,-0.608432008,0.157108835,⋯,2.190896e-03,0.0015356911,-0.0087510230,0.0010964071,0.0192418126,1.280739e-02,0.0058159519,0.0079337583,-1.236319e-03,2.121133e-15
sa_1025,1.6423085,0.004714348,1.39475862,-0.1227162,0.860289085,-1.27976517,0.46522884,0.67977595,0.112115826,-0.045160541,⋯,4.919798e-03,-0.0002991970,-0.0004347729,0.0046276248,0.0020296435,2.599899e-03,-0.0036210301,0.0046138401,-8.694880e-04,1.856154e-15
sa_10268,-7.5755279,7.674934350,1.32553886,-3.2160754,-1.906815116,0.39763043,-1.51431454,1.91576535,-0.219509802,-1.003730610,⋯,8.811790e-03,0.0034185811,0.0030439688,0.0031656177,-0.0055908444,1.132407e-03,0.0066058766,0.0044309341,-9.798468e-04,1.443290e-15
sa_1028,-3.3656079,1.922456354,2.95647757,-1.6068315,1.563702107,0.66010238,0.71086296,-1.38436969,0.944012138,-0.710850745,⋯,2.723476e-03,0.0030428077,-0.0034725475,-0.0015423015,0.0068162302,2.283882e-03,-0.0028905305,0.0013222780,3.625404e-03,9.714451e-17
sa_1030,2.0631718,1.431108137,3.02043790,0.7779362,0.230190755,-2.15876549,0.24626583,0.19585877,-0.398675991,0.340265624,⋯,2.297467e-03,0.0012337326,0.0104419800,-0.0122881470,-0.0116942681,2.774128e-05,0.0009331639,-0.0060690969,2.627017e-03,-4.562323e-16


In [52]:
# I recode the contrasts so that I have the comparisons I want:
# the intercept will reflect our group we want to compare to others (cc post)
# the cc coefficient then compares to no cc post, the time coefficient
# to cc pre and the interaction to no cc pre
# contrasts(data$cc)[1, 1] <- 1
# contrasts(data$cc)[2, 1] <- 0
# contrasts(data$time)[1, 1] <- 1
# contrasts(data$time)[2, 1] <- 0

library(mice)
# Next I impute data using predictive mean matching (PMM). PMM is
# less difficult to specify. I use the PCs to impute instead of all
# genus abundances since these are correlated.
# Result: a list of 10 completed data sets (m = 10, fixed seed).
data_imp <- data %>%
  select(
    subject_id,
    age_d_s,
    time,
    cc,
    bf_count_s,
    sibling,
    PC1,
    PC2,
    PC3,
    PC4,
    PC5
  ) %>%
  # `printFlag` is spelled out instead of relying on partial matching of `print`
  mice(m = 10, method = "pmm", printFlag = FALSE, seed = 412) %>%
  mice::complete("all")
# Join the remaining columns (e.g. the genus abundances) back onto every
# imputed data set: drop the columns that went into the imputation, but keep
# the join keys subject_id and time.
deselect_col <- colnames(data_imp[[1]])
# all_of() makes explicit that `deselect_col` is an external character vector
data_lj <- data %>% select(-all_of(deselect_col), subject_id, time)
data_imp <- map(data_imp, ~ .x %>% left_join(data_lj, by = c("subject_id", "time")))


Attaching package: ‘mice’

The following objects are masked from ‘package:BiocGenerics’:

    cbind, rbind

The following object is masked from ‘package:tidyr’:

    complete

The following objects are masked from ‘package:base’:

    cbind, rbind



In [54]:
# clr/euclidean
#meta.clr <- sd_to_df(pseq.clr) %>% select(-sibling) %>% na.omit()
#otus.clr <- otu_to_df(pseq.clr, transpose = F) %>% 
#    column_to_rownames("species") %>%
#    select(meta.clr$sample_id) %>%
#    t() 

# Homogeneity of dispersion assumption (for sibling I use one of the imputed
# sets): ANOVA on the multivariate dispersions for each grouping variable.
# NOTE(review): this relies on the rows of `otus.clr` and `data_imp[[1]]`
# being in the same sample order -- confirm.
dist <- vegdist(otus.clr[, -1], method = "euclidean")
test_dispersion <- function(grouping) {
  anova(betadisper(dist, grouping))
}
hg_cc <- test_dispersion(data_imp[[1]]$cc)
hg_time <- test_dispersion(data_imp[[1]]$time)
hg_groups <- test_dispersion(data_imp[[1]]$groups)
hg_childsex <- test_dispersion(data_imp[[1]]$childsex)
hg_sibling <- test_dispersion(data_imp[[1]]$sibling)
hg_groups


Unnamed: 0,Df,Sum Sq,Mean Sq,F value,Pr(>F)
Groups,3,20.81327,6.937758,1.194218,0.3132094
Residuals,192,1115.41545,5.809455,,


In [57]:
library(future)
library(furrr)
# Fit a PERMANOVA (vegan::adonis, euclidean distance) for one imputed data set.
#
# data_imp: a single completed data set containing `sample_id`, the taxon
#           columns named in the global character vector `genus`, and the
#           covariates used in the model.
# Returns a list with
#   aov:   the adonis ANOVA table (`aov.tab`)
#   coefs: the model coefficients per taxon
#
# NOTE(review): subject_id enters the formula as an ordinary covariate --
# confirm this is the intended way to account for repeated measures.
# NOTE(review): adonis() is deprecated in recent vegan releases in favour of
# adonis2(); it is kept here because the coefficients are extracted.
fit_pm <- function(data_imp) {
  # all_of() makes explicit that `genus` is an external character vector of
  # column names rather than a column of `data_imp`
  otus <- data_imp %>%
    select(all_of(genus), sample_id) %>%
    column_to_rownames("sample_id")
  meta <- data_imp %>%
    select(
      subject_id, sample_id, time, cc, age_d_s, bf_count_s, sibling, childsex
    ) %>%
    column_to_rownames("sample_id")
  pm <- adonis(
    otus ~ time * cc + age_d_s + bf_count_s + subject_id,
    method = "euclidean",
    data = meta
  )
  list(aov = pm$aov.tab, coefs = coefficients(pm))
}
# Fit the PERMANOVA on every imputed data set in parallel.
# The `multiprocess` strategy is deprecated in the future package;
# `multisession` (background R sessions) is the portable replacement.
# NOTE(review): adonis() is permutation based and no parallel-safe seed is set
# here, so p-values can differ slightly between runs.
plan(multisession)
pms <- future_map(data_imp, fit_pm)

In [60]:
# Read the phylum-level (l1) HITChip table and print it
phylum <- here("data/hitchip/l1-rpa.tab") %>%
  read.table()
phylum

Unnamed: 0,Df,SumsOfSqs,MeanSqs,F.Model,R2,Pr(>F)
time,1,58.44385,58.44385,1.4852084,0.007321523,0.122
cc,1,52.63603,52.63603,1.3376166,0.006593951,0.207
age_d_s,1,79.55309,79.55309,2.0216483,0.009965972,0.04
bf_count_s,1,197.05993,197.05993,5.0077991,0.024686581,0.001
subject_id,1,121.43274,121.43274,3.0859179,0.015212423,0.002
time:cc,1,36.08153,36.08153,0.9169244,0.004520095,0.467
Residuals,189,7437.26455,39.35061,,0.931699456,
Total,195,7982.47171,,,1.0,

Unnamed: 0,Actinomycetaceae,Aerococcus,Aeromonas,Akkermansia,Alcaligenesfaecalisetrel,Allistipesetrel,Anaerobiospirillum,Anaerofustis,Anaerostipescaccaeetrel,Anaerotruncuscolihominisetrel,⋯,UnculturedClostridialesI,UnculturedClostridialesII,UnculturedMollicutes,UnculturedSelenomonadaceae,Veillonella,Vibrio,Weissellaetrel,Wissellaetrel,Xanthomonadaceae,Yersiniaetrel
(Intercept),-0.729720335,-1.3554563477,-1.7589096679,-0.549432727,-0.4318565509,1.1864383958,-1.776853423,-1.57024233,0.5052723584,0.695809892,⋯,1.646802119,1.5670874337,0.8445443301,-1.829629122,-0.91298272,-0.1154861803,-0.5297476188,-1.451475369,-0.6883089127,0.0221839504
time1,0.035263146,0.0025045516,-0.021833437,-0.004231351,-0.0285757979,-0.038298987,-0.0284880525,-0.0102623146,0.0082566776,-0.018369303,⋯,-0.0062566717,-0.005266335,-0.0214716917,-0.0202445006,0.17219554,-0.0109806916,0.0292314259,-0.037348235,-0.0119697508,0.0179443356
cc1,-0.028702914,-0.0176229217,-0.019244343,0.057884947,-0.024828028,0.0257596567,-0.0067716393,-0.021415059,0.0287447823,-0.019559925,⋯,-0.0075900671,-0.0248855375,-0.0172899647,-0.0170915577,-0.10193465,-0.0162021493,-0.0073414834,-0.048043498,-0.0013170536,0.0449082048
age_d_s,0.019254713,0.0362188884,-0.017601467,0.048471099,-0.0200822907,-0.0424705917,-0.0203805172,0.0024999337,0.0234265383,-0.014800665,⋯,-0.024300686,-0.0127500474,-0.013537784,-0.0088542527,0.19536885,-0.0705921288,0.0416341928,-0.085233631,-0.0190704434,-0.0614641719
bf_count_s,-0.026034372,-0.0688265857,0.0046898108,-0.127760066,-0.0066860891,0.0614007582,0.0215459138,-0.0264229425,-0.0393847904,0.001099655,⋯,-0.0224797224,0.0006676864,0.0030756138,0.0003605775,0.09299464,-0.0504958255,0.0082869202,-0.103965231,0.0340180762,0.0549103576
subject_id,-0.000776023,-0.0005217578,-0.0004221951,-0.001126804,-0.0002310832,-0.0004160164,-0.0003160697,-0.0002246076,0.0002860727,-0.000362027,⋯,-0.0003740092,-0.0004898408,-0.0003271709,-0.0003109239,0.00308454,-0.0007422122,-0.0001624665,-0.001104046,-0.0007076874,-0.0007567036
time1:cc1,0.014629231,-0.0364820719,0.0126676479,0.061461782,0.023340299,-0.007398607,0.0083875522,-0.0267992458,-0.0096961776,0.007851589,⋯,0.022515099,0.0023846398,0.0092627939,0.0074136608,-0.08331689,0.0109646983,-0.0023400855,0.028621471,0.0200216517,0.0248983834
