In [None]:
# Global knitr chunk options: merge source and output, prefix printed output
# with "#>", render figures at full width, and hide messages and warnings.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  collapse = TRUE,
  comment = "#>",
  out.width = "100%"
)

In [None]:
library(tidyverse)
library(lme4)
library(emmeans)
library(ISwR)
library(ggridges)
library(MatchIt)
library(dplyr)
library(jtools)
library(broom)
library(forestmangr)

# Introduction

This .rmd uses the outputs generated by the first stage of the method. That first stage produces a predicted age that has been corrected for the 'regression to the mean' bias associated with predicting biological age from chronological age.  

Inputs are healthy validation set predicted ages and remaining approx. 34K subjects' predicted ages in the UK Biobank.

In [None]:
#Load healthy predictions:
# healthyvalidationset <- #path to csv containing healthy validation set predicted ages. 
# sex <- #path to csv containing information on sex for participants. 
# 
# healthyvalidationset2 <- healthyvalidationset %>% select("eid_40616", "age_at_MRI", "lasso_delta_with_t1")
# healthyvalidationset2 <- left_join(healthyvalidationset, sex) %>% select("eid_40616", "age_at_MRI", "sex", "lasso_delta_with_t1")

#Load non-healthy outputs: 
# Predicted ages for the non-healthy participants.
non.healthy.df <- read_csv("#path to csv containing non-healthy validation set predicted ages.")

# Sex lookup table. The path placeholder must be a quoted string handed to
# read_csv(): written as a bare `#...` it comments out the rest of the line and
# leaves `sex <-` dangling onto the following statement.
sex <- read_csv("#path to csv containing information on sex for participants") %>%
  select("eid_40616", "sex")

# Attach sex to the predictions and keep only the columns used downstream.
non.healthy.df <- left_join(non.healthy.df, sex) %>%
  select("eid_40616", "age_at_MRI", "sex", "catb_delta_with_t1_bc_cole")

# Disease 1: Diabetes 

## (i) propensity match samples

Healthy sample is matched to disease sample by age and sex.

First, examine if there are any differences in covariates:

In [None]:
# Flag every participant on the diabetes list with diabetes = "1", attach the
# flag to the full prediction table, and set everyone else to "0".
diablist <- read_csv("#path to csv containing diabetes set predicted ages") %>%
  select("eid_40616") %>%
  mutate(diabetes = "1")
diabetespredictions <- non.healthy.df %>%
  left_join(diablist) %>%
  mutate(diabetes = replace(diabetes, is.na(diabetes), 0))

In [None]:
# Covariates used for matching and for the balance checks in every section.
fullset_cov <- c("age_at_MRI", "sex")

# Unmatched covariate means per diabetes group. The summary function is passed
# directly (funs() is deprecated) and TRUE is spelled out instead of T.
diabetespredictions %>%
  group_by(diabetes) %>%
  select(one_of(fullset_cov)) %>%
  summarise_all(mean, na.rm = TRUE)

diabetespredictions$diabetes <- as.factor(diabetespredictions$diabetes)

# Two-sample t-test of each covariate between the groups (one call per covariate).
t.test(age_at_MRI ~ diabetes, data = diabetespredictions)
t.test(sex ~ diabetes, data = diabetespredictions)

In [None]:
# Propensity model: logistic regression of diabetes status on age and sex.
m_ps <- glm(
  diabetes ~ age_at_MRI + sex,
  data = diabetespredictions,
  family = binomial()
)
summary(m_ps)

# Fitted probability for each modelled row next to its observed status.
prs_df <- data.frame(
  pr_score = predict(m_ps, type = "response"),
  disease = m_ps$model$diabetes
)
# head(prs_df)

In [None]:
# Nearest-neighbour matching of diabetes status on age and sex; all other
# matchit() settings are left at the package defaults.
mod_match <- matchit(
  diabetes ~ age_at_MRI + sex,
  data = diabetespredictions,
  method = "nearest"
)
# summary(mod_match)
plot(mod_match)

Dimensions of matched data (equal numbers in both groups once matched, number represents total of healthy + disease):

In [None]:
# Extract the matched rows from the matchit object and report rows x columns.
dta_m <- match.data(mod_match)
dim(dta_m)

Now compare distributions of matched data:

In [None]:
# Covariate means per group after matching. mean is passed directly because
# funs() is deprecated.
dta_m %>%
  group_by(diabetes) %>%
  select(one_of(fullset_cov)) %>%
  summarise_all(mean)

## (ii) analysis

In [None]:
# Mean age delta and group size per diabetes status in the matched data; the
# mean is stored as a string formatted to three decimals for display.
summarydta_m <- dta_m %>%
  group_by(diabetes) %>%
  summarise(
    meancatb_delta_with_t1_bc_cole = mean(catb_delta_with_t1_bc_cole),
    n = n()
  ) %>%
  mutate(
    meancatb_delta_with_t1_bc_cole =
      sprintf("%.3f", as.numeric(meancatb_delta_with_t1_bc_cole))
  )

### Summary:

In [None]:
# Print the per-group mean delta and group sizes.
summarydta_m

### Plot:

In [None]:
# Group sizes are taken from the matched data so the axis labels cannot go
# stale when the input changes.
n_dm_controls <- sum(dta_m$diabetes == "0")
n_dm_cases <- sum(dta_m$diabetes == "1")

# Group means with 1.96 * SE error bars. `fun` replaces the deprecated `fun.y`.
df_DM_plot <- ggplot(dta_m, aes(x = factor(diabetes), y = catb_delta_with_t1_bc_cole)) +
  geom_point(stat = "summary", fun = "mean") +
  geom_errorbar(stat = "summary", width = 0.05, fun.data = "mean_se", fun.args = list(mult = 1.96)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1)) +
  ggtitle("Diabetes") +
  xlab("Diabetes status") +
  scale_x_discrete(labels = c(
    "0" = paste0("Non-DM (n=", n_dm_controls, ")"),
    "1" = paste0("Age, sex matched DM (n=", n_dm_cases, ")")
  ))
df_DM_plot

### Regression (adjusted for age, age^2, sex):

In [None]:
# Overall model: age delta on diabetes status, adjusted for a quadratic
# polynomial in age and for sex.
fitdm <- lm(
  catb_delta_with_t1_bc_cole ~ diabetes + poly(age_at_MRI, 2) + sex,
  data = dta_m
)
summary(fitdm)
confint(fitdm, level = 0.95)

# female
# With the sex * diabetes interaction, the diabetes main effect is the effect
# at sex == 0 (the female coefficient, per the original author's note).
fitdm_female <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + sex * diabetes,
  data = dta_m
)
summary(fitdm_female)
confint(fitdm_female, level = 0.95)

# male
# Flipping the indicator with I(1 - sex) makes the diabetes main effect the
# effect at sex == 1 (the male coefficient, per the original author's note).
fitdm_male <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + I(1 - sex) * diabetes,
  data = dta_m
)
summary(fitdm_male)
confint(fitdm_male, level = 0.95)

# Disease 2: Hypertension 

In [None]:
# Flag every participant on the hypertension list with htn = "1", attach the
# flag to the full prediction table, and set everyone else to "0".
fullsethtn <- read_csv("#path to csv containing hypertension set predicted ages") %>%
  select("eid_40616") %>%
  mutate(htn = "1")
fullsethtn <- non.healthy.df %>%
  left_join(fullsethtn) %>%
  mutate(htn = replace(htn, is.na(htn), 0))

## (i) propensity match samples

Healthy sample is matched to disease sample by age and sex.

First, examine if there are any differences in covariates.

In [None]:
# Unmatched covariate means per hypertension group. The summary function is
# passed directly (funs() is deprecated) and TRUE is spelled out instead of T.
fullsethtn %>%
  group_by(htn) %>%
  select(one_of(fullset_cov)) %>%
  summarise_all(mean, na.rm = TRUE)

fullsethtn$htn <- as.factor(fullsethtn$htn)

# Two-sample t-test of each covariate between the groups (one call per covariate).
t.test(age_at_MRI ~ htn, data = fullsethtn)
t.test(sex ~ htn, data = fullsethtn)

In [None]:
# Propensity model: logistic regression of hypertension status on age and sex.
m_ps_htn <- glm(
  htn ~ age_at_MRI + sex,
  data = fullsethtn,
  family = binomial()
)
summary(m_ps_htn)

# Fitted probability for each modelled row next to its observed status.
prs_df_htn <- data.frame(
  pr_score = predict(m_ps_htn, type = "response"),
  htn = m_ps_htn$model$htn
)
# head(prs_df)

In [None]:
# Nearest-neighbour matching of hypertension status on age and sex; all other
# matchit() settings are left at the package defaults.
mod_match_htn <- matchit(
  htn ~ age_at_MRI + sex,
  data = fullsethtn,
  method = "nearest"
)
# summary(mod_match_htn)
plot(mod_match_htn)

Dimensions of matched data (equal numbers in both groups once matched, number represents total of healthy + htn)

In [None]:
# Extract the matched rows from the matchit object and report rows x columns.
dta_m_htn <- match.data(mod_match_htn)
dim(dta_m_htn)

Now compare distributions of matched data

In [None]:
# Covariate means per group after matching. mean is passed directly because
# funs() is deprecated.
dta_m_htn %>%
  group_by(htn) %>%
  select(one_of(fullset_cov)) %>%
  summarise_all(mean)

## (ii) analysis

In [None]:
# Mean age delta and group size per hypertension status in the matched data;
# the mean is stored as a string formatted to three decimals for display.
summarydta_m_htn <- dta_m_htn %>%
  group_by(htn) %>%
  summarise(
    mean_catb_delta_with_t1_bc_cole = mean(catb_delta_with_t1_bc_cole),
    n = n()
  ) %>%
  mutate(
    mean_catb_delta_with_t1_bc_cole =
      sprintf("%.3f", as.numeric(mean_catb_delta_with_t1_bc_cole))
  )

### Summary:

In [None]:
# Print the per-group mean delta and group sizes.
summarydta_m_htn

### Plot:

In [None]:
# Group sizes are taken from the matched data (formatted with a thousands
# separator, as in the original labels) so the axis labels cannot go stale.
n_htn_controls <- format(sum(dta_m_htn$htn == "0"), big.mark = ",")
n_htn_cases <- format(sum(dta_m_htn$htn == "1"), big.mark = ",")

# Group means with 1.96 * SE error bars. `fun` replaces the deprecated `fun.y`.
df_htn_plot <- ggplot(dta_m_htn, aes(x = factor(htn), y = catb_delta_with_t1_bc_cole)) +
  geom_point(stat = "summary", fun = "mean") +
  geom_errorbar(stat = "summary", width = 0.05, fun.data = "mean_se", fun.args = list(mult = 1.96)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1)) +
  ggtitle("HTN") +
  xlab("HTN status") +
  scale_x_discrete(labels = c(
    "0" = paste0("Non-HTN (n=", n_htn_controls, ")"),
    "1" = paste0("Age, sex matched HTN (n=", n_htn_cases, ")")
  ))
df_htn_plot

### Regression (adjusted for age, age^2, sex):

In [None]:
# Overall model: age delta on hypertension status, adjusted for a quadratic
# polynomial in age and for sex. The separate linear age_at_MRI term was
# dropped: it is collinear with poly(age_at_MRI, 2) and only produced an
# aliased (NA) coefficient. The htn estimate is unaffected.
fithtn <- lm(
  catb_delta_with_t1_bc_cole ~ htn + poly(age_at_MRI, 2) + sex,
  data = dta_m_htn
)
summary(fithtn)
confint(fithtn, level = 0.95)

# female
# With the sex * htn interaction, the htn main effect is the effect at
# sex == 0 (the female coefficient, per the original author's note).
fithtn_female <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + sex * htn,
  data = dta_m_htn
)
summary(fithtn_female)
confint(fithtn_female, level = 0.95)

# male
# I(1 - sex) flips the indicator, so the htn main effect is the effect at
# sex == 1 (the male coefficient, per the original author's note).
fithtn_male <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + I(1 - sex) * htn,
  data = dta_m_htn
)
# Report the male model here; the female model was printed a second time before.
summary(fithtn_male)
confint(fithtn_male, level = 0.95)

# Disease 3: Obesity 

In [None]:
# Flag every participant on the obesity list with obese = "1", attach the
# flag to the full prediction table, and set everyone else to "0".
fullsetobese <- read_csv("#path to csv containing obese set predicted ages") %>%
  select("eid_40616") %>%
  mutate(obese = "1")
fullsetobese <- non.healthy.df %>%
  left_join(fullsetobese) %>%
  mutate(obese = replace(obese, is.na(obese), 0))

## (i) propensity match samples

Healthy sample is matched to disease sample by age and sex.

First, examine if there are any differences in covariates.

In [None]:
# Unmatched covariate means per obesity group. The summary function is passed
# directly (funs() is deprecated) and TRUE is spelled out instead of T.
fullsetobese %>%
  group_by(obese) %>%
  select(one_of(fullset_cov)) %>%
  summarise_all(mean, na.rm = TRUE)

fullsetobese$obese <- as.factor(fullsetobese$obese)
# with(fullsetobese, t.test(age_at_MRI ~ obese))  #(repeat for each covariate)
# with(fullsetobese, t.test(sex ~ obese))  #(repeat for each covariate)

In [None]:
# Propensity model: logistic regression of obesity status on age and sex.
m_ps_obese <- glm(
  obese ~ age_at_MRI + sex,
  data = fullsetobese,
  family = binomial()
)
summary(m_ps_obese)

# Fitted probability for each modelled row next to its observed status.
prs_df_obese <- data.frame(
  pr_score = predict(m_ps_obese, type = "response"),
  obese = m_ps_obese$model$obese
)
# head(prs_df)

In [None]:
# Nearest-neighbour matching of obesity status on age and sex; all other
# matchit() settings are left at the package defaults.
mod_match_obese <- matchit(
  obese ~ age_at_MRI + sex,
  data = fullsetobese,
  method = "nearest"
)
# summary(mod_match_obese)
# plot(mod_match_obese)

Dimensions of matched data (equal numbers in both groups once matched, number represents total of healthy + obese)

In [None]:
# Extract the matched rows from the matchit object and report rows x columns.
dta_m_obese <- match.data(mod_match_obese)
dim(dta_m_obese)

Now compare distributions of matched data

In [None]:
# Covariate means per group after matching. mean is passed directly because
# funs() is deprecated.
dta_m_obese %>%
  group_by(obese) %>%
  select(one_of(fullset_cov)) %>%
  summarise_all(mean)

## (ii) analysis

In [None]:
# Mean age delta and group size per obesity status in the matched data; the
# mean is stored as a string formatted to three decimals for display.
summarydta_m_obese <- dta_m_obese %>%
  group_by(obese) %>%
  summarise(
    mean_catb_delta_with_t1_bc_cole = mean(catb_delta_with_t1_bc_cole),
    n = n()
  ) %>%
  mutate(
    mean_catb_delta_with_t1_bc_cole =
      sprintf("%.3f", as.numeric(mean_catb_delta_with_t1_bc_cole))
  )

### Summary:

In [None]:
# Print the per-group mean delta and group sizes.
summarydta_m_obese

### Plot:

In [None]:
# Group sizes are taken from the matched data so the axis labels cannot go
# stale when the input changes.
n_obese_controls <- sum(dta_m_obese$obese == "0")
n_obese_cases <- sum(dta_m_obese$obese == "1")

# Group means with 1.96 * SE error bars. `fun` replaces the deprecated `fun.y`.
df_obese_plot <- ggplot(dta_m_obese, aes(x = factor(obese), y = catb_delta_with_t1_bc_cole)) +
  geom_point(stat = "summary", fun = "mean") +
  geom_errorbar(stat = "summary", width = 0.05, fun.data = "mean_se", fun.args = list(mult = 1.96)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1)) +
  ggtitle("obese") +
  xlab("obese status") +
  scale_x_discrete(labels = c(
    "0" = paste0("Non-obese (n=", n_obese_controls, ")"),
    "1" = paste0("Age, sex matched obese (n=", n_obese_cases, ")")
  ))
df_obese_plot

### Regression (adjusted for age, age^2, sex):

In [None]:
# Overall model: age delta on obesity status, adjusted for a quadratic
# polynomial in age and for sex. The separate linear age_at_MRI term was
# dropped: it is collinear with poly(age_at_MRI, 2) and only produced an
# aliased (NA) coefficient. The obese estimate is unaffected.
fitob <- lm(
  catb_delta_with_t1_bc_cole ~ obese + poly(age_at_MRI, 2) + sex,
  data = dta_m_obese
)
summary(fitob)
confint(fitob, level = 0.95)

# female
# With the sex * obese interaction, the obese main effect is the effect at
# sex == 0 (the female coefficient, per the original author's note).
fitob_female <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + sex * obese,
  data = dta_m_obese
)
summary(fitob_female)
confint(fitob_female, level = 0.95)

# male
# I(1 - sex) flips the indicator, so the obese main effect is the effect at
# sex == 1 (the male coefficient, per the original author's note).
fitob_male <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + I(1 - sex) * obese,
  data = dta_m_obese
)
summary(fitob_male)
confint(fitob_male, level = 0.95)

# Disease 4: Coronary artery disease 


In [None]:
# Flag every participant on the coronary artery disease list with cad = "1",
# attach the flag to the full prediction table, and set everyone else to "0".
fullsetcad <- read_csv("#path to csv containing cad set predicted ages") %>%
  select("eid_40616") %>%
  mutate(cad = "1")
fullsetcad <- non.healthy.df %>%
  left_join(fullsetcad) %>%
  mutate(cad = replace(cad, is.na(cad), 0))

## (i) propensity match samples

Healthy sample is matched to cad sample by age and sex.

First, examine if there are any differences in covariates.

In [None]:
# Unmatched covariate means per CAD group. The summary function is passed
# directly (funs() is deprecated) and TRUE is spelled out instead of T.
fullsetcad %>%
  group_by(cad) %>%
  select(one_of(fullset_cov)) %>%
  summarise_all(mean, na.rm = TRUE)

fullsetcad$cad <- as.factor(fullsetcad$cad)
# with(fullsetcad, t.test(age_at_MRI ~ cad))  #(repeat for each covariate)
# with(fullsetcad, t.test(sex ~ cad))  #(repeat for each covariate)

In [None]:
# Propensity model: logistic regression of CAD status on age and sex.
m_ps_cad <- glm(
  cad ~ age_at_MRI + sex,
  data = fullsetcad,
  family = binomial()
)
summary(m_ps_cad)

# Fitted probability for each modelled row next to its observed status.
prs_df_cad <- data.frame(
  pr_score = predict(m_ps_cad, type = "response"),
  cad = m_ps_cad$model$cad
)
# head(prs_df)

In [None]:
#matching process
mod_match_cad <- matchit(
  cad ~ age_at_MRI + sex,
  data = fullsetcad,
  method = "nearest"
)
# summary(mod_match_cad)
# plot(mod_match_cad)

Dimensions of matched data (equal numbers in both groups once matched, number represents total of healthy + cad)

In [None]:
# Extract the matched rows from the matchit object and report rows x columns.
dta_m_cad <- match.data(mod_match_cad)
dim(dta_m_cad)

Now compare distributions of matched data

In [None]:
# Covariate means per group after matching. mean is passed directly because
# funs() is deprecated.
dta_m_cad %>%
  group_by(cad) %>%
  select(one_of(fullset_cov)) %>%
  summarise_all(mean)

## (ii) analysis

In [None]:
# Mean age delta and group size per CAD status in the matched data; the mean
# is stored as a string formatted to three decimals for display.
summarydta_m_cad <- dta_m_cad %>%
  group_by(cad) %>%
  summarise(
    mean_catb_delta_with_t1_bc_cole = mean(catb_delta_with_t1_bc_cole),
    n = n()
  ) %>%
  mutate(
    mean_catb_delta_with_t1_bc_cole =
      sprintf("%.3f", as.numeric(mean_catb_delta_with_t1_bc_cole))
  )

### Summary:

In [None]:
# Print the per-group mean delta and group sizes.
summarydta_m_cad

### Plot:

In [None]:
# Group sizes are taken from the matched data so the axis labels cannot go
# stale when the input changes.
n_cad_controls <- sum(dta_m_cad$cad == "0")
n_cad_cases <- sum(dta_m_cad$cad == "1")

# Group means with 1.96 * SE error bars. `fun` replaces the deprecated `fun.y`.
df_cad_plot <- ggplot(dta_m_cad, aes(x = factor(cad), y = catb_delta_with_t1_bc_cole)) +
  geom_point(stat = "summary", fun = "mean") +
  geom_errorbar(stat = "summary", width = 0.05, fun.data = "mean_se", fun.args = list(mult = 1.96)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1)) +
  ggtitle("cad") +
  xlab("cad status") +
  scale_x_discrete(labels = c(
    "0" = paste0("Non-cad (n=", n_cad_controls, ")"),
    "1" = paste0("Age, sex matched cad (n=", n_cad_cases, ")")
  ))
df_cad_plot

### Regression (adjusted for age, age^2, sex):

In [None]:
# Overall model: age delta on CAD status, adjusted for a quadratic polynomial
# in age and for sex. The separate linear age_at_MRI term was dropped: it is
# collinear with poly(age_at_MRI, 2) and only produced an aliased (NA)
# coefficient. The cad estimate is unaffected.
fitcad <- lm(
  catb_delta_with_t1_bc_cole ~ cad + poly(age_at_MRI, 2) + sex,
  data = dta_m_cad
)
summary(fitcad)
confint(fitcad, level = 0.95)

# female
# With the sex * cad interaction, the cad main effect is the effect at
# sex == 0 (the female coefficient, per the original author's note).
fitcad_female <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + sex * cad,
  data = dta_m_cad
)
summary(fitcad_female)
confint(fitcad_female, level = 0.95)

# male
# I(1 - sex) flips the indicator, so the cad main effect is the effect at
# sex == 1 (the male coefficient, per the original author's note).
fitcad_male <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + I(1 - sex) * cad,
  data = dta_m_cad
)
summary(fitcad_male)
confint(fitcad_male, level = 0.95)

# Disease 5: Hypercholesterolaemia

In [None]:
# Flag every participant on the hypercholesterolaemia list with chol = "1",
# attach the flag to the full prediction table, and set everyone else to "0".
fullsetchol <- read_csv("#path to csv containing cholesterol set predicted ages") %>%
  select("eid_40616") %>%
  mutate(chol = "1")
fullsetchol <- non.healthy.df %>%
  left_join(fullsetchol) %>%
  mutate(chol = replace(chol, is.na(chol), 0))

## (i) propensity match samples

Healthy sample is matched to chol sample by age and sex.

First, examine if there are any differences in covariates.

In [None]:
# Unmatched covariate means per cholesterol group. The summary function is
# passed directly (funs() is deprecated) and TRUE is spelled out instead of T.
fullsetchol %>%
  group_by(chol) %>%
  select(one_of(fullset_cov)) %>%
  summarise_all(mean, na.rm = TRUE)

fullsetchol$chol <- as.factor(fullsetchol$chol)
# with(fullsetchol, t.test(age_at_MRI ~ chol))  #(repeat for each covariate)
# with(fullsetchol, t.test(sex ~ chol))  #(repeat for each covariate)

In [None]:
# Propensity model: logistic regression of cholesterol status on age and sex.
m_ps_chol <- glm(
  chol ~ age_at_MRI + sex,
  data = fullsetchol,
  family = binomial()
)
summary(m_ps_chol)

# Fitted probability for each modelled row next to its observed status.
prs_df_chol <- data.frame(
  pr_score = predict(m_ps_chol, type = "response"),
  chol = m_ps_chol$model$chol
)
# head(prs_df)

In [None]:
# Nearest-neighbour matching of cholesterol status on age and sex; all other
# matchit() settings are left at the package defaults.
mod_match_chol <- matchit(
  chol ~ age_at_MRI + sex,
  data = fullsetchol,
  method = "nearest"
)
# summary(mod_match_chol)
# plot(mod_match_chol)

Dimensions of matched data (equal numbers in both groups once matched, number represents total of healthy + chol)

In [None]:
# Extract the matched rows from the matchit object and report rows x columns.
dta_m_chol <- match.data(mod_match_chol)
dim(dta_m_chol)

Now compare distributions of matched data

In [None]:
# Covariate means per group after matching. mean is passed directly because
# funs() is deprecated.
dta_m_chol %>%
  group_by(chol) %>%
  select(one_of(fullset_cov)) %>%
  summarise_all(mean)

## (ii) analysis

In [None]:
# Mean age delta and group size per cholesterol status in the matched data;
# the mean is stored as a string formatted to three decimals for display.
summarydta_m_chol <- dta_m_chol %>%
  group_by(chol) %>%
  summarise(
    mean_catb_delta_with_t1_bc_cole = mean(catb_delta_with_t1_bc_cole),
    n = n()
  ) %>%
  mutate(
    mean_catb_delta_with_t1_bc_cole =
      sprintf("%.3f", as.numeric(mean_catb_delta_with_t1_bc_cole))
  )

### Summary:

In [None]:
# Print the per-group mean delta and group sizes.
summarydta_m_chol

### Plot:

In [None]:
# Group sizes are taken from the matched data so the axis labels cannot go
# stale when the input changes.
n_chol_controls <- sum(dta_m_chol$chol == "0")
n_chol_cases <- sum(dta_m_chol$chol == "1")

# Group means with 1.96 * SE error bars. `fun` replaces the deprecated `fun.y`.
df_chol_plot <- ggplot(dta_m_chol, aes(x = factor(chol), y = catb_delta_with_t1_bc_cole)) +
  geom_point(stat = "summary", fun = "mean") +
  geom_errorbar(stat = "summary", width = 0.05, fun.data = "mean_se", fun.args = list(mult = 1.96)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1)) +
  ggtitle("chol") +
  xlab("chol status") +
  scale_x_discrete(labels = c(
    "0" = paste0("Non-chol (n=", n_chol_controls, ")"),
    "1" = paste0("Age, sex matched chol (n=", n_chol_cases, ")")
  ))
df_chol_plot

### Regression (adjusted for age, age^2, sex):

In [None]:
# Overall model: age delta on cholesterol status, adjusted for a quadratic
# polynomial in age and for sex. The separate linear age_at_MRI term was
# dropped: it is collinear with poly(age_at_MRI, 2) and only produced an
# aliased (NA) coefficient. The chol estimate is unaffected.
fitchol <- lm(
  catb_delta_with_t1_bc_cole ~ chol + poly(age_at_MRI, 2) + sex,
  data = dta_m_chol
)
summary(fitchol)
confint(fitchol, level = 0.95)

# female
# With the sex * chol interaction, the chol main effect is the effect at
# sex == 0 (the female coefficient, per the original author's note).
fitchol_female <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + sex * chol,
  data = dta_m_chol
)
summary(fitchol_female)
confint(fitchol_female, level = 0.95)

# male
# I(1 - sex) flips the indicator, so the chol main effect is the effect at
# sex == 1 (the male coefficient, per the original author's note).
fitchol_male <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + I(1 - sex) * chol,
  data = dta_m_chol
)
summary(fitchol_male)
confint(fitchol_male, level = 0.95)

# Risk factor 6: Smoking

In [None]:
# Smoking status per participant joined to the prediction table; the status
# column (second column) is then renamed to "smoke".
fullsetsmoke <- read_csv("#path to csv containing smoker set predicted ages") %>%
  select("eid_40616", "smoking_status_f20116_2_0") %>%
  left_join(non.healthy.df)
names(fullsetsmoke)[2] <- "smoke"

## (i) propensity match samples

Healthy (non-smoker) sample is matched to the current-smoker sample by age and sex.

First, examine if there are any differences in covariates.

In [None]:
# Unmatched covariate means per smoking status. The summary function is passed
# directly (funs() is deprecated) and TRUE is spelled out instead of T.
fullsetsmoke %>%
  group_by(smoke) %>%
  select(one_of(fullset_cov)) %>%
  summarise_all(mean, na.rm = TRUE)

fullsetsmoke$smoke <- as.factor(fullsetsmoke$smoke)

fullsetsmoke <- fullsetsmoke[complete.cases(fullsetsmoke), ]

# Keep only the two groups compared below: status "0" and status "2" (labelled
# non-smoker and current smoker in the plot). Selecting them explicitly, rather
# than excluding "1", also removes any other status code that may be present
# (NOTE(review): confirm whether the input contains codes besides 0/1/2).
# droplevels() then discards the unused factor levels so the treatment is
# strictly two-level for glm() and matchit().
fullsetsmoke0_2 <- fullsetsmoke %>%
  filter(smoke %in% c("0", "2")) %>%
  droplevels()
# with(fullsetsmoke0_2, t.test(age_at_MRI ~ smoke))  #(repeat for each covariate)
# with(fullsetsmoke0_2, t.test(sex ~ smoke))  #(repeat for each covariate)

In [None]:
# Propensity model: logistic regression of smoking status on age and sex,
# fitted on the subset that excludes status 1.
m_ps_smoke <- glm(
  smoke ~ age_at_MRI + sex,
  data = fullsetsmoke0_2,
  family = binomial()
)
summary(m_ps_smoke)

# Fitted probability for each modelled row next to its observed status.
prs_df_smoke <- data.frame(
  pr_score = predict(m_ps_smoke, type = "response"),
  smoke = m_ps_smoke$model$smoke
)
# head(prs_df)

In [None]:
# Nearest-neighbour matching of smoking status on age and sex; all other
# matchit() settings are left at the package defaults.
mod_match_smoke <- matchit(
  smoke ~ age_at_MRI + sex,
  data = fullsetsmoke0_2,
  method = "nearest"
)
# summary(mod_match_smoke)
# plot(mod_match_smoke)

Dimensions of matched data (equal numbers in both groups once matched, number represents total of healthy + smoke)

In [None]:
# Extract the matched rows from the matchit object and report rows x columns.
dta_m_smoke <- match.data(mod_match_smoke)
dim(dta_m_smoke)

Now compare distributions of matched data

In [None]:
# Covariate means per group after matching. mean is passed directly because
# funs() is deprecated.
dta_m_smoke %>%
  group_by(smoke) %>%
  select(one_of(fullset_cov)) %>%
  summarise_all(mean)

## (ii) analysis

In [None]:
# Mean age delta and group size per smoking status in the matched data; the
# mean is stored as a string formatted to three decimals for display.
summarydta_m_smoke <- dta_m_smoke %>%
  group_by(smoke) %>%
  summarise(
    mean_catb_delta_with_t1_bc_cole = mean(catb_delta_with_t1_bc_cole),
    n = n()
  ) %>%
  mutate(
    mean_catb_delta_with_t1_bc_cole =
      sprintf("%.3f", as.numeric(mean_catb_delta_with_t1_bc_cole))
  )

### Summary:

In [None]:
# Print the per-group mean delta and group sizes.
summarydta_m_smoke

### Plot:

In [None]:
# Group sizes are taken from the matched data so the axis labels cannot go
# stale when the input changes.
n_smoke_never <- sum(dta_m_smoke$smoke == "0")
n_smoke_current <- sum(dta_m_smoke$smoke == "2")

# Group means with 1.96 * SE error bars. `fun` replaces the deprecated `fun.y`.
df_smoke_plot <- ggplot(dta_m_smoke, aes(x = factor(smoke), y = catb_delta_with_t1_bc_cole)) +
  geom_point(stat = "summary", fun = "mean") +
  geom_errorbar(stat = "summary", width = 0.05, fun.data = "mean_se", fun.args = list(mult = 1.96)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1)) +
  ggtitle("smoke") +
  xlab("smoke status") +
  scale_x_discrete(labels = c(
    "0" = paste0("Non-smoker (n = ", n_smoke_never, ")"),
    "2" = paste0("Current Smoker (n = ", n_smoke_current, ")")
  ))
df_smoke_plot

### Regression (adjusted for age, age^2, sex):

In [None]:
# Age delta on smoking status, adjusted for a quadratic polynomial in age and
# for sex. The separate linear age_at_MRI term was dropped: it is collinear
# with poly(age_at_MRI, 2) and only produced an aliased (NA) coefficient. The
# smoke estimate is unaffected.
fitsmoke <- lm(
  catb_delta_with_t1_bc_cole ~ smoke + poly(age_at_MRI, 2) + sex,
  data = dta_m_smoke
)
summary(fitsmoke)
#confint(fitsmoke, level = .95)

### Smoking - pack years analysis

In [None]:
# Pack-years per participant joined to the prediction table, restricted to
# rows with no missing value in any column.
packyrs <- read_csv("#path to csv containing smoking set predicted ages") %>%
  select(eid_40616, packyrs_instance2) %>%
  left_join(non.healthy.df)
packyrs <- packyrs[complete.cases(packyrs), ]
# summary(packyrs)

In [None]:
# Age delta against pack years: 2D density contours, a linear fit, and the raw
# points. x/y are declared once through aes(); passed as bare arguments to
# ggplot() they are not part of the plot mapping, which is why each layer used
# to repeat its own.
ggplot(packyrs, aes(x = packyrs_instance2, y = catb_delta_with_t1_bc_cole)) +
  geom_density_2d(color = "orange", size = 0.75, alpha = 1) +
  geom_smooth(method = "lm", col = "gray40", size = 1) +
  geom_point(color = "dodgerblue1", size = 1, alpha = 0.1) +
  #geom_density_2d() +   
  xlab("Pack years") +
  ylab("catb_delta_with_t1_bc_cole") +
  ggtitle("smoking") +
  theme_minimal()

Both regular (unstandardised) and normalised (standardised) coefficients are reported.

In [None]:
# Standardised copy of the analysis columns (every column is centred and scaled).
packyrs_std <- packyrs %>% select(-eid_40616)
packyrs_std <- data.frame(scale(packyrs_std))

# Raw-unit and standardised models of age delta on pack years, adjusted for a
# quadratic polynomial in age and for sex. The separate linear age_at_MRI term
# was dropped: it is collinear with poly(age_at_MRI, 2) and only produced an
# aliased (NA) coefficient. The pack-years estimate is unaffected.
mod_sm <- lm(
  catb_delta_with_t1_bc_cole ~ packyrs_instance2 + poly(age_at_MRI, 2) + sex,
  data = packyrs
)
mod_std_sm <- lm(
  catb_delta_with_t1_bc_cole ~ packyrs_instance2 + poly(age_at_MRI, 2) + sex,
  data = packyrs_std
)

In [None]:
# jtools summary of the pack-years model fitted on the raw data.
summ(mod_sm)

Normalised coefficient:

In [None]:
# jtools summary of the pack-years model fitted on the scaled data.
summ(mod_std_sm)

# Risk factor 7: Alcohol

In [None]:
# Alcohol intake per participant joined to the prediction table, restricted to
# rows with no missing value in any column.
alcohol <- read_csv("#path to csv containing alcohol set predicted ages") %>%
  left_join(non.healthy.df)
alcohol <- alcohol[complete.cases(alcohol), ]

In [None]:
# Age delta against alcohol intake: 2D density contours, a linear fit, and the
# raw points. x/y are declared once through aes(); passed as bare arguments to
# ggplot() they are not part of the plot mapping, which is why each layer used
# to repeat its own.
ggplot(alcohol, aes(x = alcoholintakegpd, y = catb_delta_with_t1_bc_cole)) +
  geom_density_2d(color = "orange", size = 0.75, alpha = 1) +
  geom_smooth(method = "lm", col = "gray40", size = 1) +
  geom_point(color = "aquamarine4", size = 1, alpha = 0.05) +
  #geom_density_2d() +   
  xlab("Alcohol intake / gpd") +
  ylab("catb_delta_with_t1_bc_cole") +
  ggtitle("Alcohol") +
  theme_minimal()

Both regular (unstandardised) and normalised (standardised) coefficients are reported.

In [None]:
# standardising alcohol consumption ####

# Standardise every analysis column except sex. Sex must stay on its original
# 0/1 coding: in the interaction models below the exposure main effect is the
# effect at sex == 0 (or at 1 - sex == 0), which only corresponds to one sex
# when the indicator has not been centred and scaled.
# NOTE(review): assumes sex is coded 0 = female, 1 = male, as the female/male
# labels on these models imply - confirm against the input file.
alcohol_std <- as.data.frame(alcohol %>% select(-eid_40616))
alcohol_scale_cols <- setdiff(names(alcohol_std), "sex")
alcohol_std[alcohol_scale_cols] <- scale(alcohol_std[alcohol_scale_cols])

# Overall standardised model. The separate linear age_at_MRI term was dropped:
# it is collinear with poly(age_at_MRI, 2) and only produced an aliased (NA)
# coefficient. The alcohol estimate is unaffected.
#mod_a = lm(catb_delta_with_t1_bc_cole ~ alcoholintakegpd + age_at_MRI+ poly(age_at_MRI, 2) + sex  , alcohol)
mod_std_a <- lm(
  catb_delta_with_t1_bc_cole ~ alcoholintakegpd + poly(age_at_MRI, 2) + sex,
  data = alcohol_std
)

#female##
# Main effect of alcoholintakegpd = effect at sex == 0 (female coefficient).
fitalc_female <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + sex * alcoholintakegpd,
  data = alcohol_std
)
summary(fitalc_female)
confint(fitalc_female, level = 0.95)

#male##
# Main effect of alcoholintakegpd = effect at sex == 1 (male coefficient).
fitalc_male <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + I(1 - sex) * alcoholintakegpd,
  data = alcohol_std
)
summary(fitalc_male)
confint(fitalc_male, level = 0.95)

# non-standard alcohol consumption ####
# Same overall model on the raw (unscaled) columns.
alcohol_nonstd <- alcohol %>% select(-eid_40616)
mod_nonstd_a <- lm(
  catb_delta_with_t1_bc_cole ~ alcoholintakegpd + poly(age_at_MRI, 2) + sex,
  data = alcohol_nonstd
)
mod_nonstd_a
summary(mod_nonstd_a)
confint(mod_nonstd_a, level = 0.95)


In [None]:
# jtools summary of the overall alcohol model fitted on alcohol_std.
summ(mod_std_a)

# Risk factor 8: Telomere Length

In [None]:
# Telomere measure (z_adj_0) per participant joined to the prediction table,
# restricted to rows with no missing value in any column.
telomere <- read_csv("#path to csv containing telomere set predicted ages") %>%
  select(eid_40616, z_adj_0) %>%
  left_join(non.healthy.df)
telomere <- telomere[complete.cases(telomere), ]

In [None]:
# Age delta against z_adj_0: 2D density contours, a linear fit, and the raw
# points. x/y are declared once through aes(); passed as bare arguments to
# ggplot() they are not part of the plot mapping, which is why each layer used
# to repeat its own.
ggplot(telomere, aes(x = z_adj_0, y = catb_delta_with_t1_bc_cole)) +
  geom_density_2d(color = "orange", size = 0.75, alpha = 1) +
  geom_smooth(method = "lm", col = "gray40", size = 1) +
  geom_point(color = "aquamarine4", size = 1, alpha = 0.05) +
  #geom_density_2d() +   
  xlab("z_adj_0") +
  ylab("catb_delta_with_t1_bc_cole") +
  ggtitle("Telomere length (z_adj_0)") +
  theme_minimal()

Both regular (unstandardised) and normalised (standardised) coefficients are reported.

In [None]:
# Standardise every analysis column except sex. Sex must stay on its original
# 0/1 coding: in the interaction models below the exposure main effect is the
# effect at sex == 0 (or at 1 - sex == 0), which only corresponds to one sex
# when the indicator has not been centred and scaled.
# NOTE(review): assumes sex is coded 0 = female, 1 = male, as the female/male
# labels on these models imply - confirm against the input file.
telomere_std <- as.data.frame(telomere %>% select(-eid_40616))
telomere_scale_cols <- setdiff(names(telomere_std), "sex")
telomere_std[telomere_scale_cols] <- scale(telomere_std[telomere_scale_cols])

# Overall standardised model. The separate linear age_at_MRI term was dropped:
# it is collinear with poly(age_at_MRI, 2) and only produced an aliased (NA)
# coefficient. The z_adj_0 estimate is unaffected.
#mod_telo = lm(catb_delta_with_t1_bc_cole ~ z_adj_0 + age_at_MRI+ poly(age_at_MRI, 2) + sex  , telomere)
mod_std_telo <- lm(
  catb_delta_with_t1_bc_cole ~ z_adj_0 + poly(age_at_MRI, 2) + sex,
  data = telomere_std
)

#female##
# Main effect of z_adj_0 = effect at sex == 0 (female coefficient).
fittelo_female <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + sex * z_adj_0,
  data = telomere_std
)
summary(fittelo_female)
confint(fittelo_female, level = 0.95)

#male##
# Main effect of z_adj_0 = effect at sex == 1 (male coefficient).
fittelo_male <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + I(1 - sex) * z_adj_0,
  data = telomere_std
)
summary(fittelo_male)
confint(fittelo_male, level = 0.95)

In [None]:
# jtools summary of the overall telomere model fitted on telomere_std.
summ(mod_std_telo)

# Risk factor 9: Smoking (pack years)

In [None]:
# Pack-years per participant joined to the prediction table, restricted to
# rows with no missing value in any column.
smokepack <- read_csv("#path to csv containing smoking set predicted ages") %>%
  select(eid_40616, packyrs_instance2) %>%
  left_join(non.healthy.df)
smokepack <- smokepack[complete.cases(smokepack), ]

Both regular (unstandardised) and normalised (standardised) coefficients are reported.

In [None]:
# Standardise every analysis column except sex. Sex must stay on its original
# 0/1 coding: in the interaction models below the exposure main effect is the
# effect at sex == 0 (or at 1 - sex == 0), which only corresponds to one sex
# when the indicator has not been centred and scaled.
# NOTE(review): assumes sex is coded 0 = female, 1 = male, as the female/male
# labels on these models imply - confirm against the input file.
smokepack_std <- as.data.frame(smokepack %>% select(-eid_40616))
smokepack_scale_cols <- setdiff(names(smokepack_std), "sex")
smokepack_std[smokepack_scale_cols] <- scale(smokepack_std[smokepack_scale_cols])

smokepack_nonstd <- smokepack %>% select(-eid_40616)

# Overall standardised model. The separate linear age_at_MRI term was dropped:
# it is collinear with poly(age_at_MRI, 2) and only produced an aliased (NA)
# coefficient. The pack-years estimate is unaffected.
#mod_smokepack = lm(catb_delta_with_t1_bc_cole ~ packyrs_instance2 + age_at_MRI+ poly(age_at_MRI, 2) + sex  , smokepack)
mod_std_smokepack <- lm(
  catb_delta_with_t1_bc_cole ~ packyrs_instance2 + poly(age_at_MRI, 2) + sex,
  data = smokepack_std
)

#female##
# Main effect of packyrs_instance2 = effect at sex == 0 (female coefficient).
fitsmoke_female <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + sex * packyrs_instance2,
  data = smokepack_std
)
summary(fitsmoke_female)
confint(fitsmoke_female, level = 0.95)

#male##
# Main effect of packyrs_instance2 = effect at sex == 1 (male coefficient).
fitsmoke_male <- lm(
  catb_delta_with_t1_bc_cole ~ poly(age_at_MRI, 2) + I(1 - sex) * packyrs_instance2,
  data = smokepack_std
)
summary(fitsmoke_male)
confint(fitsmoke_male, level = 0.95)

# non-standardised smoking consumption ####
# Stored under its own name so that the standardised model held in
# mod_std_smokepack is no longer overwritten by the raw-unit fit.
mod_nonstd_smokepack <- lm(
  catb_delta_with_t1_bc_cole ~ packyrs_instance2 + poly(age_at_MRI, 2) + sex,
  data = smokepack_nonstd
)
mod_nonstd_smokepack
summary(mod_nonstd_smokepack)
confint(mod_nonstd_smokepack, level = 0.95)

In [None]:
# jtools summary of the model currently bound to mod_std_smokepack.
summ(mod_std_smokepack)

# Forest plot: Categorical variables

In [None]:
# tidy outputs:
# Coefficient table with confidence limits for a fitted model.
tidy_with_ci <- function(fit) {
  tidy(fit, conf.int = TRUE)
}

# 1 = diabetes
out_conf1 <- tidy_with_ci(fitdm)
out_conf1_female <- tidy_with_ci(fitdm_female)
out_conf1_male <- tidy_with_ci(fitdm_male)

# 2 = hypertension
out_conf2 <- tidy_with_ci(fithtn)
out_conf2_female <- tidy_with_ci(fithtn_female)
out_conf2_male <- tidy_with_ci(fithtn_male)

# 3 = obesity
out_conf3 <- tidy_with_ci(fitob)
out_conf3_female <- tidy_with_ci(fitob_female)
out_conf3_male <- tidy_with_ci(fitob_male)

# 4 = coronary artery disease
out_conf4 <- tidy_with_ci(fitcad)
out_conf4_female <- tidy_with_ci(fitcad_female)
out_conf4_male <- tidy_with_ci(fitcad_male)

# 5 = hypercholesterolaemia
out_conf5 <- tidy_with_ci(fitchol)
out_conf5_female <- tidy_with_ci(fitchol_female)
out_conf5_male <- tidy_with_ci(fitchol_male)

###
# Round a tidied coefficient table to 10 digits and keep a single row of it.
# Row 2 is used for the overall models and row 5 for the sex-interaction
# models; these positions are expected to hold the disease term
# (NOTE(review): this depends on the term order of each model formula).
pick_coef_row <- function(tidied, row) {
  round_df(tidied, digits = 10)[row, ]
}

lm_model_out1 <- pick_coef_row(out_conf1, 2)
lm_model_out_conf1_female <- pick_coef_row(out_conf1_female, 5)
lm_model_out_conf1_male <- pick_coef_row(out_conf1_male, 5)

lm_model_out2 <- pick_coef_row(out_conf2, 2)
lm_model_out_conf2_female <- pick_coef_row(out_conf2_female, 5)
lm_model_out_conf2_male <- pick_coef_row(out_conf2_male, 5)

lm_model_out3 <- pick_coef_row(out_conf3, 2)
# The full rounded female table is kept under its own name, as before.
lm_model_conf3_female <- round_df(out_conf3_female, digits = 10)
lm_model_out_conf3_female <- lm_model_conf3_female[5, ]
lm_model_out_conf3_male <- pick_coef_row(out_conf3_male, 5)

lm_model_out4 <- pick_coef_row(out_conf4, 2)
lm_model_out_conf4_female <- pick_coef_row(out_conf4_female, 5)
lm_model_out_conf4_male <- pick_coef_row(out_conf4_male, 5)

lm_model_out5 <- pick_coef_row(out_conf5, 2)
lm_model_conf5_female <- pick_coef_row(out_conf5_female, 5)
lm_model_out_conf5_male <- pick_coef_row(out_conf5_male, 5)

##############################################
# Stack the overall / female / male rows of each disease and label them.
forest_groups <- c("Overall", "Female", "Male")

lm_intermediate_1 <- rbind(lm_model_out1, lm_model_out_conf1_female, lm_model_out_conf1_male)
lm_intermediate_1$group <- forest_groups

lm_intermediate_2 <- rbind(lm_model_out2, lm_model_out_conf2_female, lm_model_out_conf2_male)
lm_intermediate_2$group <- forest_groups

lm_intermediate_3 <- rbind(lm_model_out3, lm_model_out_conf3_female, lm_model_out_conf3_male)
lm_intermediate_3$group <- forest_groups

lm_intermediate_4 <- rbind(lm_model_out4, lm_model_out_conf4_female, lm_model_out_conf4_male)
lm_intermediate_4$group <- forest_groups

lm_intermediate_5 <- rbind(lm_model_out5, lm_model_conf5_female, lm_model_out_conf5_male)
lm_intermediate_5$group <- forest_groups

# One table for all five diseases; digits are stripped from the term names.
lm_intermediate_overall <- rbind(
  lm_intermediate_1,
  lm_intermediate_2,
  lm_intermediate_3,
  lm_intermediate_4,
  lm_intermediate_5
)
lm_intermediate_overall$term <- str_replace_all(
  lm_intermediate_overall$term, "[:digit:]", ""
)

#combine into one dataframe and clean up titles:
# lm_overall <- rbind(lm_model_out1, lm_model_out2,lm_model_out3,lm_model_out4, lm_model_out5) #, lm_model_out6 )
# lm_overall$term <- str_replace_all(lm_overall$term, "[:digit:]", "")

# Now plot the
# forestlm <- ggplot(lm_overall, aes(x=reorder(term, estimate), y=estimate)) +
#          geom_errorbar(aes(ymin=conf.low, ymax=conf.high), 
#                        width = 0.1,size  = 0.5,
#                        position = "dodge", color="gray19") +
#   geom_hline(yintercept = 0, color = "gray19", size = 0.5, linetype=2) +
#   geom_point() + coord_flip() +
#   ylab("beta-coefficient with 95% confidence intervals") +
#   xlab("disease")

#factoring here
# Fix the order in which the diseases appear on the plot axis. `levels` is
# spelled out in full instead of relying on partial matching of `level`.
term_order <- c("diabetes", "htn", "obese", "cad", "chol")
lm_intermediate_overall$term <- factor(lm_intermediate_overall$term, levels = term_order)

#write.csv(lm_intermediate_overall, "~/cardiac/Ageing/cardiacage/code/ageingpaper_steps7to9/diseases_by_sex.csv")


# Colours for the point fill and the error bars. The vector is unnamed and is
# handed to the manual scales together with `breaks`.
# NOTE(review): recent ggplot2 pairs unnamed values with `breaks` in order
# (Overall, Male, Female) -- confirm against the rendered legend.
dotCOLS <- c("#5c1230",  "#ffa600", "#c14937")

# Forest plot of the disease coefficients: one dodged point and interval per
# stratum (Overall / Male / Female) for every disease term.
# NOTE(review): the `limits` on scale_y_continuous() censor values outside
# [-1.5, 2.5] by default, so an interval reaching beyond that range is dropped
# rather than clipped; chunk warnings are disabled in the setup chunk, so this
# would be silent. coord_flip(ylim = ...) would zoom without dropping data.
forestlm_bysex <- ggplot(
  lm_intermediate_overall,
  aes(
    x = term, y = estimate, ymin = conf.low, ymax = conf.high,
    fill = group, col = group
  )
) +
  # width = 0 draws plain interval lines without end caps
  geom_errorbar(
    size = 1, position = position_dodge(width = 0.5), width = 0,
    show.legend = FALSE
  ) +
  # shape 21 is a filled circle; the white outline overrides the mapped colour
  geom_point(
    size = 4, shape = 21, colour = "white", stroke = 0.5,
    position = position_dodge(width = 0.5)
  ) +
  scale_fill_manual(
    values = dotCOLS,
    breaks = c("Overall", "Male", "Female")
  ) +
  scale_color_manual(
    values = dotCOLS,
    breaks = c("Overall", "Male", "Female")
  ) +
  scale_x_discrete(name = " ") +
  scale_y_continuous(name = "Beta coefficient", limits = c(-1.5, 2.5)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  # flip so that diseases run down the vertical axis
  coord_flip() +
  theme(
    legend.position = c(0.9, 0.9),
    panel.background = element_blank(),
    axis.text.x = element_text(size = 16, color = 'black'),
    axis.text.y = element_text(size = 16, color = 'black'),
    axis.line.x = element_line(size = 0.8, linetype = "solid"),
    axis.line.y = element_blank(),
    axis.title.x = element_text(size = 18, vjust = 0.3, face = "plain", color = 'black'),
    axis.title.y = element_text(size = 18, face = "plain", vjust = 0.9, angle = 90, colour = 'black')
  )

In [None]:
# Display the sex-stratified disease forest plot built above.
forestlm_bysex

# Forest plot: Continuous variables

In [None]:
# Tidy the fitted models for the continuous exposures into coefficient tables,
# requesting confidence intervals. Each exposure has an overall model and one
# model per sex.

# alcohol
out_conf_a <- mod_std_a %>% tidy(conf.int = TRUE)
out_conf_a_female <- fitalc_female %>% tidy(conf.int = TRUE)
out_conf_a_male <- fitalc_male %>% tidy(conf.int = TRUE)

# telomere length
out_conf_telo <- mod_std_telo %>% tidy(conf.int = TRUE)
out_conf_telo_female <- fittelo_female %>% tidy(conf.int = TRUE)
out_conf_telo_male <- fittelo_male %>% tidy(conf.int = TRUE)

# bmi (disabled)
# out_conf_bmi <- tidy(mod_std_bmi, conf.int = TRUE)
# out_conf_bmi_female <- tidy(fitbmi_female, conf.int = TRUE)
# out_conf_bmi_male <- tidy(fitbmi_male, conf.int = TRUE)

# smoking
out_conf_sm <- mod_std_sm %>% tidy(conf.int = TRUE)
out_conf_sm_female <- fitsmoke_female %>% tidy(conf.int = TRUE)
out_conf_sm_male <- fitsmoke_male %>% tidy(conf.int = TRUE)

# Round each coefficient table to 10 digits and keep a single row from it:
# row 2 of the overall models and row 5 of the sex-specific models.
# NOTE(review): the rows are picked by position -- presumably the exposure of
# interest sits at row 2 / row 5 of the respective model; confirm against the
# model formulas, since a change in covariates would silently select a
# different term.

# alcohol
lm_model_out_a <- round_df(out_conf_a, digits = 10)[2, ]
lm_model_out_conf_a_female <- round_df(out_conf_a_female, digits = 10)[5, ]
lm_model_out_conf_a_male <- round_df(out_conf_a_male, digits = 10)[5, ]

# telomere length
lm_model_out_telo <- round_df(out_conf_telo, digits = 10)[2, ]
lm_model_out_conf_telo_female <- round_df(out_conf_telo_female, digits = 10)[5, ]
lm_model_out_conf_telo_male <- round_df(out_conf_telo_male, digits = 10)[5, ]

# bmi (disabled)
# lm_model_out_bmi <- round_df(out_conf_bmi, digits=10)
# lm_model_out_bmi <- lm_model_out_bmi[2,] #remove the intercept 
# lm_model_out_conf_bmi_female <- round_df(out_conf_bmi_female, digits=10)
# lm_model_out_conf_bmi_female <- lm_model_out_conf_bmi_female[2,] #remove the intercept 
# lm_model_out_conf_bmi_male <- round_df(out_conf_bmi_male, digits=10)
# lm_model_out_conf_bmi_male <- lm_model_out_conf_bmi_male[2,] #remove the intercept 

# smoking (the female table is named without the `_conf` infix)
lm_model_out_sm <- round_df(out_conf_sm, digits = 10)[2, ]
lm_model_out_sm_female <- round_df(out_conf_sm_female, digits = 10)[5, ]
lm_model_out_conf_sm_male <- round_df(out_conf_sm_male, digits = 10)[5, ]



###
# For each continuous exposure, stack the overall, female and male rows and
# tag every row with its stratum. Labels are positional, so each rbind() must
# keep the order overall -> female -> male.

# alcohol
lm_intermediatecont_1 <- rbind(
  lm_model_out_a,
  lm_model_out_conf_a_female,
  lm_model_out_conf_a_male
) %>%
  mutate(group = c("Overall", "Female", "Male"))

# telomere length
lm_intermediatecont_2 <- rbind(
  lm_model_out_telo,
  lm_model_out_conf_telo_female,
  lm_model_out_conf_telo_male
) %>%
  mutate(group = c("Overall", "Female", "Male"))

# smoking
lm_intermediatecont_3 <- rbind(
  lm_model_out_sm,
  lm_model_out_sm_female,
  lm_model_out_conf_sm_male
) %>%
  mutate(group = c("Overall", "Female", "Male"))

# One long table for plotting; every digit is stripped from the term labels.
lm_intermediatecombined <- rbind(
  lm_intermediatecont_1,
  lm_intermediatecont_2,
  lm_intermediatecont_3
) %>% #, lm_model_out6 )
  mutate(term = str_replace_all(term, "[:digit:]", ""))

# Earlier overall-only version (no sex strata), kept for reference:
#combine into one dataframe and clean up titles:
# lm_overall_cat <- rbind(lm_model_out_sm, lm_model_out_a, lm_model_out_bmi, lm_model_out_telo) #, lm_model_out6 )
# lm_overall_cat$term <- str_replace_all(lm_overall_cat$term, "[:digit:]", "")

# Fix the order in which the exposures appear on the plot axis by making
# `term` a factor with explicit levels. The level names are the term labels
# after digits have been stripped; any term not listed becomes NA.
# `levels` is spelled out in full rather than relying on partial argument
# matching of `level`.
term_order <- c('alcoholintakegpd', 'z_adj_', 'packyrs_instance')
lm_intermediatecombined$term <- factor(lm_intermediatecombined$term, levels = term_order)


# Forest plot of the standardised coefficients for the continuous exposures:
# one dodged point and interval per stratum (Overall / Male / Female) for
# every term. Uses the `dotCOLS` palette defined for the disease forest plot.
# NOTE(review): the `limits` on scale_y_continuous() censor values outside
# [-0.05, 0.15] by default, so an interval reaching beyond that range is
# dropped rather than clipped; chunk warnings are disabled in the setup chunk,
# so this would be silent. coord_flip(ylim = ...) would zoom without dropping
# data.
forestlmCONT_bysex <- ggplot(
  lm_intermediatecombined,
  aes(
    x = term, y = estimate, ymin = conf.low, ymax = conf.high,
    fill = group, col = group
  )
) +
  # width = 0 draws plain interval lines without end caps
  geom_errorbar(
    size = 1, position = position_dodge(width = 0.5), width = 0,
    show.legend = FALSE
  ) +
  # shape 21 is a filled circle; the white outline overrides the mapped colour
  geom_point(
    size = 4, shape = 21, colour = "white", stroke = 0.5,
    position = position_dodge(width = 0.5)
  ) +
  scale_fill_manual(
    values = dotCOLS,
    breaks = c("Overall", "Male", "Female")
  ) +
  scale_color_manual(
    values = dotCOLS,
    breaks = c("Overall", "Male", "Female")
  ) +
  scale_x_discrete(name = " ") +
  scale_y_continuous(name = "Standardised beta coefficient", limits = c(-0.05, 0.15)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  # flip so that exposures run down the vertical axis
  coord_flip() +
  theme(
    legend.position = c(0.9, 0.9),
    panel.background = element_blank(),
    axis.text.x = element_text(size = 16, color = 'black'),
    axis.text.y = element_text(size = 16, color = 'black'),
    axis.line.x = element_line(size = 0.8, linetype = "solid"),
    axis.line.y = element_blank(),
    axis.title.x = element_text(size = 18, vjust = 0.3, face = "plain", color = 'black'),
    axis.title.y = element_text(size = 18, face = "plain", vjust = 0.9, angle = 90, colour = 'black')
  )


In [None]:
# Display the sex-stratified forest plot for the continuous exposures.
forestlmCONT_bysex

# Summary stats for paper

In [None]:
# Group sizes for the paper.
# Each `dta_m*` table contributes half of its row count.
# NOTE(review): presumably these are 1:1 matched sets (cases plus an equal
# number of controls), so half the rows is the number of cases -- confirm.
dimensions_dm <- nrow(dta_m) / 2
dimensions_htn <- nrow(dta_m_htn) / 2
dimensions_ob <- nrow(dta_m_obese) / 2
dimensions_cad <- nrow(dta_m_cad) / 2
dimensions_chol <- nrow(dta_m_chol) / 2
dimensions_smoke <- nrow(dta_m_smoke) / 2

# The continuous-exposure tables are counted in full.
dimensions_alc <- nrow(alcohol)
dimensions_telom <- nrow(telomere)
dimensions_smokepack <- nrow(smokepack)

# Summary table for the matched disease groups. The data frame's column names
# are taken from the variable names passed to data.frame().
diseasegroupnames <- c(
  "Diabetes", "htn", "obese", "cad", "hyperchol", "smoking"
)
numberingroup <- c(
  dimensions_dm, dimensions_htn, dimensions_ob,
  dimensions_cad, dimensions_chol, dimensions_smoke
)
diseasegroupsummary <- data.frame(diseasegroupnames, numberingroup)

# Summary table for the continuous exposures.
# NOTE(review): `continouousgroups` is misspelled, but it is also the output
# column name, so it is left unchanged here.
continouousgroups <- c("alcohol", "telomere", "smoking")
numberincontgroup <- c(dimensions_alc, dimensions_telom, dimensions_smokepack)
contgroupsummary <- data.frame(continouousgroups, numberincontgroup)
