## Environmental determinants of incident clonal hematopoiesis: 
### logistic regression

In [1]:
library(data.table) # version 1.14.6
library(dplyr)
# set working directory
setwd("/medpop/esp2/mesbah/projects/ch_progression/aric/epi/")

In [2]:
# Load data
## 0/1 CH status
aric_baseline_n_v05 <- fread("../pheno/aric_baseline_n_v05_N4189.pheno_ch_status_trajectory.23Mar2023.csv", header=T)
aric_baseline_n_v05$dAge <- aric_baseline_n_v05$Age - aric_baseline_n_v05$age_base
summary(aric_baseline_n_v05$dAge)
# CH variants
com.expansion.CH_v_b_v5_all <- fread("../pheno/combined.expansion.CH_v_b_v5_all.growth_rate.23Mar2023.csv", header=T)


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5.00   20.00   21.00   20.31   21.00   27.00 

In [4]:
# # Unadjusted: cont. variable
# chd_is_base== CHD or IS
aric_baseline_n_v05$chd_is_base <- ifelse(aric_baseline_n_v05$chd_base==1 | aric_baseline_n_v05$is_base==1, 1,
                                          ifelse(aric_baseline_n_v05$chd_base==0 | aric_baseline_n_v05$is_base==0,0,NA))
table(aric_baseline_n_v05$chd_is_base, exclude = NULL)

 # ASCVD = c("chd", "is", "hf")
aric_baseline_n_v05$ascvd_base <- ifelse(aric_baseline_n_v05$chd_base==1 | 
                                           aric_baseline_n_v05$is_base==1 | 
                                           aric_baseline_n_v05$hf_base==1, 1,
                                          ifelse(aric_baseline_n_v05$chd_base==0 | 
                                                   aric_baseline_n_v05$is_base==0 | 
                                                   aric_baseline_n_v05$hf_base==0 | 
                                                 is.na( aric_baseline_n_v05$is_base) | 
                                                 is.na( aric_baseline_n_v05$hf_base),0,NA))
table(aric_baseline_n_v05$ascvd_base, exclude = NULL)
aric_baseline_n_v05$ascvd_base[is.na(aric_baseline_n_v05$ascvd_base)] <- 0
table(aric_baseline_n_v05$ascvd_base, exclude = NULL)




   0    1 <NA> 
3929  166   94 


   0    1 <NA> 
3781  271  137 


   0    1 
3918  271 

## Un-adjusted model: GLM

In [29]:
cat(gsub(pattern = ", ", replacement = ",", x = toString(
  c("Dataset","Outcome", "Exposure","Beta", "SE", "t-stat", "P"))),
  file = "glm.adj_vs_unadj.incident_chip.2023May03.csv", append = F, fill = T)

In [15]:
cat(gsub(pattern =", ", replacement = ",", x= toString(c("Unadjusted", summary(aric_baseline_n_v05 %>% filter(!is.na(age_base) & !is.na(incident_CH)) %>%
                        glm(incident_CH ~  age_base, data = ., family = "binomial"))$coefficients[2,1:4]))))

Unadjusted,0.0335706423326323,0.00739252459672974,4.54116072166782,5.59453607500203e-06

In [30]:
# exclude: too many missing value (637) gluc_base_std"
exposures <- c("age_base",  "dAge", "bmi_base_std",   
               "hdl_base_std", "ldl_base_std",
               "tg_base_std","nonHDL_base_std", "Sex", "race_BW", 
               "ever_smoke", 
               "dm_126_base", "htn_5_base", "ascvd_base")

ch_phenotype <- c("incident_CH",
                  "incident_DTA", "incident_SF","incident_DDR",
                  "incident_DNMT3A","incident_TET2",
                  "incident_ASXL1")
for(i in exposures){
  
  for (j in ch_phenotype){
    
    # remove NA
    model1 <- summary(aric_baseline_n_v05 %>% filter(!is.na(get(i)) & !is.na(get(j))) %>%
                        glm(get(j) ~  get(i), data = ., family = "binomial"))
    cat( gsub(pattern = ", ", replacement = ",", x = toString(
      c("Unadjusted", paste0(j), paste0(i), 
        model1$coefficients[2,1:4]) ) ), 
      file = "glm.adj_vs_unadj.incident_chip.2023May03.csv", append = T, fill = T)
    
  }
}

"glm.fit: fitted probabilities numerically 0 or 1 occurred"
"glm.fit: fitted probabilities numerically 0 or 1 occurred"
"glm.fit: fitted probabilities numerically 0 or 1 occurred"


## Adjusted model:  
### Age: 
adjusted for Sex, Race, Smoking, batch(visit,center), bmi, statin, non-hdl, hdl, t2d, htn, ascvd

In [31]:
ch_phenotype <- c("incident_CH",
                  "incident_DTA", "incident_SF","incident_DDR",
                  "incident_DNMT3A","incident_TET2",
                  "incident_ASXL1")
#
for (j in ch_phenotype){
  model2 <- summary( aric_baseline_n_v05 %>% 
                       filter(!is.na(age_base) & !is.na(get(j))) %>% 
                       glm(get(j) ~  age_base + 
                                Sex + race_BW + ever_smoke + 
                                v2_vs_other + Center + bmi_base_std + 
                                chol_med_base + nonHDL_base_std + hdl_base_std + dm_126_base + 
                                htn_5_base + ascvd_base, data = ., family = "binomial") )
  
  cat( gsub(pattern = ", ", replacement = ",", x = toString(
    c("Adjusted", paste0(j), "age_base", 
      model2$coefficients[2,1:4]) ) ), 
    file = "glm.adj_vs_unadj.incident_chip.2023May03.csv", 
    append = T, fill = T)
}


## Adjusted model:  
### all exposures (non lipid): 
adjusted for age, age2, dAge, Sex, Race, Smoking, batch(visit,center), bmi, statin, non-hdl, hdl, t2d, htn, ascvd

In [32]:
rm(i,j,k, ch_phenotype, exposures, model1, model2)

# Outcomes
ch_phenotype <- c("incident_CH",
                  "incident_DTA", "incident_SF","incident_DDR",
                  "incident_DNMT3A","incident_TET2",
                  "incident_ASXL1")

# Exposures
test_exposures <- c("Sex", "race_BW", "ever_smoke", 
                    "bmi_base_std", "nonHDL_base_std", "hdl_base_std", 
                    "dm_126_base", "htn_5_base", 
                    "ascvd_base", "age_base", "age_base_sqr","dAge")

for (j in ch_phenotype){
  for (k in 1:length(test_exposures)) {
    cat("outcome:",j," exposure:", test_exposures[k],"\n")
    model3 <- summary(aric_baseline_n_v05 %>% 
                        filter(!is.na(get(j))) %>% 
                        glm(get(j) ~ 
                            Sex + race_BW + 
                            ever_smoke + bmi_base_std + 
                            nonHDL_base_std + hdl_base_std + 
                            dm_126_base + htn_5_base + 
                            ascvd_base + age_base + 
                            age_base_sqr + dAge + 
                            chol_med_base + Center + 
                            v2_vs_other, 
                            data = ., family="binomial"))
    cat( gsub(pattern = ", ", replacement = ",", x = toString(
      c("adjusted_hdl_non_hdl_dAge", paste0(j), paste0(test_exposures[k]),
        model3$coefficients[k+1,1:4]) ) ),
      file = "glm.adj_vs_unadj.incident_chip.2023May03.csv", 
      append = T, fill = T)
  }
}

## 
test_exposures <- c("Sex", "race_BW", "ever_smoke", 
                    "bmi_base_std", "nonHDL_base_std", "hdl_base_std", 
                    "dm_126_base", "htn_5_base", 
                    "ascvd_base", "age_base", "age_base_sqr")
for (j in ch_phenotype){
  for (k in 1:length(test_exposures)) {
    cat("outcome:",j," exposure:", test_exposures[k],"\n")
    model3.1 <- summary(aric_baseline_n_v05 %>% 
                        filter(!is.na(get(j))) %>% 
                        glm(get(j) ~ 
                            Sex + race_BW + 
                            ever_smoke + bmi_base_std + 
                            nonHDL_base_std + hdl_base_std + 
                            dm_126_base + htn_5_base + 
                            ascvd_base + age_base + 
                            age_base_sqr +  
                            chol_med_base + Center + 
                            v2_vs_other, 
                            data = ., family="binomial"))
    cat( gsub(pattern = ", ", replacement = ",", x = toString(
      c("adjusted_hdl_non_hdl", paste0(j), paste0(test_exposures[k]),
        model3.1$coefficients[k+1,1:4]) ) ),
      file = "glm.adj_vs_unadj.incident_chip.2023May03.csv", 
      append = T, fill = T)
  }
}

outcome: incident_CH  exposure: Sex 
outcome: incident_CH  exposure: race_BW 
outcome: incident_CH  exposure: ever_smoke 
outcome: incident_CH  exposure: bmi_base_std 
outcome: incident_CH  exposure: nonHDL_base_std 
outcome: incident_CH  exposure: hdl_base_std 
outcome: incident_CH  exposure: dm_126_base 
outcome: incident_CH  exposure: htn_5_base 
outcome: incident_CH  exposure: ascvd_base 
outcome: incident_CH  exposure: age_base 
outcome: incident_CH  exposure: age_base_sqr 
outcome: incident_CH  exposure: dAge 
outcome: incident_DTA  exposure: Sex 
outcome: incident_DTA  exposure: race_BW 
outcome: incident_DTA  exposure: ever_smoke 
outcome: incident_DTA  exposure: bmi_base_std 
outcome: incident_DTA  exposure: nonHDL_base_std 
outcome: incident_DTA  exposure: hdl_base_std 
outcome: incident_DTA  exposure: dm_126_base 
outcome: incident_DTA  exposure: htn_5_base 
outcome: incident_DTA  exposure: ascvd_base 
outcome: incident_DTA  exposure: age_base 
outcome: incident_DTA  exposur

#### LDL, HDL, nonHDL, TG

In [34]:
# rm(i,j,k, ch_phenotype, exposures, model1, model2,model3)

ch_phenotype <- c("incident_CH",
                  "incident_DTA", "incident_SF","incident_DDR",
                  "incident_DNMT3A","incident_TET2",
                  "incident_ASXL1")

# Lipids 
test_exposures <- c("ldl_base_std","hdl_base_std", 
                    "tg_base_std","nonHDL_base_std")

for (j in ch_phenotype){
  for (k in test_exposures) {
    cat("outcome:",j," exposure:", k,"\n")
    model3.2 <- summary(aric_baseline_n_v05 %>% 
                        filter(!is.na(get(k)) & 
                                 !is.na(get(j))) %>% 
                        glm( get(j) ~  get(k) +
                            Sex + race_BW + ever_smoke + 
                            bmi_base_std + dm_126_base + 
                            htn_5_base + ascvd_base + 
                            age_base + age_base_sqr + dAge +
                            chol_med_base + Center + 
                            v2_vs_other, 
                            data = ., family="binomial"))
      
    cat( gsub(pattern = ", ", replacement = ",", x = toString(
      c("adjusted_1_lipid_dAge", paste0(j), paste0(k),
        model3.2$coefficients[1+1,1:4] ) ) ),
      file = "glm.adj_vs_unadj.incident_chip.2023May03.csv", 
      append = T, fill = T)
  }
}

## no dAge adjustment
for (j in ch_phenotype){
  for (k in test_exposures) {
    cat("outcome:",j," exposure:", k,"\n")
    model3.3 <- summary(aric_baseline_n_v05 %>% 
                        filter(!is.na(get(k)) & 
                                 !is.na(get(j))) %>% 
                        glm( get(j) ~  get(k) +
                            Sex + race_BW + ever_smoke + 
                            bmi_base_std + dm_126_base + 
                            htn_5_base + ascvd_base + 
                            age_base + age_base_sqr +
                            chol_med_base + Center + 
                            v2_vs_other, 
                            data = ., family="binomial"))
      
    cat( gsub(pattern = ", ", replacement = ",", x = toString(
      c("adjusted_1_lipid", paste0(j), paste0(k),
        model3.3$coefficients[1+1,1:4] ) ) ),
      file = "glm.adj_vs_unadj.incident_chip.2023May03.csv", 
      append = T, fill = T)
  }
}



outcome: incident_CH  exposure: ldl_base_std 
outcome: incident_CH  exposure: hdl_base_std 
outcome: incident_CH  exposure: tg_base_std 
outcome: incident_CH  exposure: nonHDL_base_std 
outcome: incident_DTA  exposure: ldl_base_std 
outcome: incident_DTA  exposure: hdl_base_std 
outcome: incident_DTA  exposure: tg_base_std 
outcome: incident_DTA  exposure: nonHDL_base_std 
outcome: incident_SF  exposure: ldl_base_std 
outcome: incident_SF  exposure: hdl_base_std 
outcome: incident_SF  exposure: tg_base_std 
outcome: incident_SF  exposure: nonHDL_base_std 
outcome: incident_DDR  exposure: ldl_base_std 
outcome: incident_DDR  exposure: hdl_base_std 
outcome: incident_DDR  exposure: tg_base_std 
outcome: incident_DDR  exposure: nonHDL_base_std 
outcome: incident_DNMT3A  exposure: ldl_base_std 
outcome: incident_DNMT3A  exposure: hdl_base_std 
outcome: incident_DNMT3A  exposure: tg_base_std 


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


outcome: incident_DNMT3A  exposure: nonHDL_base_std 
outcome: incident_TET2  exposure: ldl_base_std 
outcome: incident_TET2  exposure: hdl_base_std 
outcome: incident_TET2  exposure: tg_base_std 
outcome: incident_TET2  exposure: nonHDL_base_std 
outcome: incident_ASXL1  exposure: ldl_base_std 
outcome: incident_ASXL1  exposure: hdl_base_std 
outcome: incident_ASXL1  exposure: tg_base_std 


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


outcome: incident_ASXL1  exposure: nonHDL_base_std 


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


outcome: incident_CH  exposure: ldl_base_std 
outcome: incident_CH  exposure: hdl_base_std 
outcome: incident_CH  exposure: tg_base_std 
outcome: incident_CH  exposure: nonHDL_base_std 
outcome: incident_DTA  exposure: ldl_base_std 
outcome: incident_DTA  exposure: hdl_base_std 
outcome: incident_DTA  exposure: tg_base_std 
outcome: incident_DTA  exposure: nonHDL_base_std 
outcome: incident_SF  exposure: ldl_base_std 
outcome: incident_SF  exposure: hdl_base_std 
outcome: incident_SF  exposure: tg_base_std 
outcome: incident_SF  exposure: nonHDL_base_std 
outcome: incident_DDR  exposure: ldl_base_std 
outcome: incident_DDR  exposure: hdl_base_std 
outcome: incident_DDR  exposure: tg_base_std 
outcome: incident_DDR  exposure: nonHDL_base_std 
outcome: incident_DNMT3A  exposure: ldl_base_std 
outcome: incident_DNMT3A  exposure: hdl_base_std 
outcome: incident_DNMT3A  exposure: tg_base_std 


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


outcome: incident_DNMT3A  exposure: nonHDL_base_std 
outcome: incident_TET2  exposure: ldl_base_std 
outcome: incident_TET2  exposure: hdl_base_std 
outcome: incident_TET2  exposure: tg_base_std 
outcome: incident_TET2  exposure: nonHDL_base_std 
outcome: incident_ASXL1  exposure: ldl_base_std 
outcome: incident_ASXL1  exposure: hdl_base_std 
outcome: incident_ASXL1  exposure: tg_base_std 


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


outcome: incident_ASXL1  exposure: nonHDL_base_std 


"glm.fit: fitted probabilities numerically 0 or 1 occurred"


### Forest plot

In [35]:
##############################
#### Forest plot
##############################
library(data.table) # version 1.14.6
library(meta) # version 6.2-1
library(grid) # version 4.2.2
library(scales) # version 1.2.1
##############################

Loading 'meta' package (version 4.18-2).
Type 'help(meta)' for a brief overview.



In [36]:
# set working directory
setwd("/medpop/esp2/mesbah/projects/ch_progression/aric/epi/")
    # load cohph summary
glm_dat <- fread("glm.adj_vs_unadj.incident_chip.2023May03.csv", header = T)
table(glm_dat$Dataset)
table(glm_dat$Exposure)
table(glm_dat$Outcome)

< table of extent 0 >


            Sex        age_base    age_base_sqr      ascvd_base    bmi_base_std 
             21              28              14              21              21 
           dAge     dm_126_base      ever_smoke    hdl_base_std      htn_5_base 
             14              21              21              49              21 
   ldl_base_std nonHDL_base_std         race_BW     tg_base_std 
             35              49              21              35 


 incident_ASXL1     incident_CH    incident_DDR incident_DNMT3A    incident_DTA 
             53              53              53              53              53 
    incident_SF   incident_TET2 
             53              53 

In [37]:
## Exposures
glm_dat$Exposure[glm_dat$Exposure=="age_base"] <- "Age"
glm_dat$Exposure[glm_dat$Exposure=="bmi_base_std"] <- "BMI"
glm_dat$Exposure[glm_dat$Exposure=="ascvd_base"] <- "ASCVD"
glm_dat$Exposure[glm_dat$Exposure=="chol_base_std"] <- "Cholesterol"
glm_dat$Exposure[glm_dat$Exposure=="dm_126_base"] <- "T2D"
glm_dat$Exposure[glm_dat$Exposure=="ever_smoke"] <- "Ever Smoker"
glm_dat$Exposure[glm_dat$Exposure=="hdl_base_std"] <- "HDL-C"
glm_dat$Exposure[glm_dat$Exposure=="htn_5_base"] <- "Hypertension"
glm_dat$Exposure[glm_dat$Exposure=="ldl_base_std"] <- "LDL-C"
glm_dat$Exposure[glm_dat$Exposure=="nonHDL_base_std"] <- "Non-HDL-C"
glm_dat$Exposure[glm_dat$Exposure=="race_BW"] <- "European"
glm_dat$Exposure[glm_dat$Exposure=="Sex"] <- "Male Sex"
glm_dat$Exposure[glm_dat$Exposure=="tg_base_std"] <- "Triglyceride"
    ## Outcome
# glm_dat$Outcome[glm_dat$Outcome=="incident_CH_or_growingClones"] <- "Incident or growing clones"
glm_dat$Outcome[glm_dat$Outcome=="incident_CH"] <- "Overall CH"
glm_dat$Outcome[glm_dat$Outcome=="incident_DNMT3A"] <- "DNMT3A"
glm_dat$Outcome[glm_dat$Outcome=="incident_TET2"] <- "TET2"
glm_dat$Outcome[glm_dat$Outcome=="incident_ASXL1"] <- "ASXL1"
glm_dat$Outcome[glm_dat$Outcome=="incident_DTA"] <- "DTA"
glm_dat$Outcome[glm_dat$Outcome=="incident_SF"] <- "SF"
glm_dat$Outcome[glm_dat$Outcome=="incident_DDR"] <- "DDR"

In [39]:
## 20 independent test at 5%; P< 0.05/20 = 0.0025
cat("P threshold< 0.0025")
# 0.05/20 = 0.0025 = "2.5E-03"
glm_dat$sig <- ifelse(glm_dat$P<0.0025, "***","")
table(glm_dat$sig)

P threshold< 0.0025


    *** 
354  17 

In [41]:
## Sort outcome
glm_dat$Outcome <- ordered(glm_dat$Outcome, 
                           levels = c("Overall CH", "DTA", 
                                      "SF", "DDR", 
                                      "DNMT3A", "TET2", "ASXL1")) 

In [46]:
# format 
glm_dat$P_val <- formatC(x = glm_dat$P, digits = 1,format = "E")

    # OR
glm_dat$OR <- formatC(round(exp(glm_dat$Beta),2), digits = 2, format = "f")

glm_dat$lSE <- ( glm_dat$Beta - 1.96 * glm_dat$SE)
glm_dat$uSE <- ( glm_dat$Beta + 1.96 * glm_dat$SE)

    # 95% CI
glm_dat$CI95 <- paste0("[",formatC(round(exp( glm_dat$Beta - 1.96 * glm_dat$SE),2), digits = 2, format = "f"),
                       ", ",
                       formatC(round(exp( glm_dat$Beta + 1.96 * glm_dat$SE),2), digits = 2, format = "f"), 
                       "]")
head(glm_dat)

Dataset,Outcome,Exposure,Beta,SE,t-stat,P,sig,P_val,OR,CI95,lSE,uSE
<chr>,<ord>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
Unadjusted,Overall CH,Age,0.033570642,0.007392525,4.5411607,5.594536e-06,***,5.6e-06,1.03,"[1.02, 1.05]",0.019081294,0.04805999
Unadjusted,DTA,Age,0.026250415,0.008482191,3.0947683,0.001969667,***,0.002,1.03,"[1.01, 1.04]",0.009625321,0.04287551
Unadjusted,SF,Age,0.090521117,0.020381694,4.4412951,8.941905e-06,***,8.9e-06,1.09,"[1.05, 1.14]",0.050572997,0.13046924
Unadjusted,DDR,Age,0.05167499,0.024274307,2.1287936,0.03327134,,0.033,1.05,"[1.00, 1.10]",0.004097349,0.09925263
Unadjusted,DNMT3A,Age,0.002795145,0.010835593,0.2579596,0.7964381,,0.8,1.0,"[0.98, 1.02]",-0.018442617,0.02403291
Unadjusted,TET2,Age,0.049037466,0.013626113,3.5987862,0.0003197059,***,0.00032,1.05,"[1.02, 1.08]",0.022330286,0.07574465


In [43]:
glm_dat_v1 <- subset(glm_dat, glm_dat$Outcome %in% c("Overall CH", "DNMT3A", "TET2"))

glm_dat_v2 <- subset(glm_dat, glm_dat$Outcome %in% c("ASXL1", "DTA","SF", "DDR"))


In [45]:
## cox_dat_v1: CH, DNMT3A, TET2
## All adjusted
df_plot_adj_v1 <- subset(glm_dat_v1, glm_dat_v1$Dataset=="Adjusted")
## Format metagen
b_adj_v1 <- metagen(TE = Beta,
              se = SE,
              studlab = Outcome,
              subgroup=Exposure,
              data=df_plot_adj_v1,
              sm="OR")
### all adjusted
pdf("glm.Forest_incidentCH.v1_adj.May03.pdf", width = 12, height= 30)
forest(x = b_adj_v1, 
       common=F, 
       random=F, 
       hetstat=F, 
       subgroup=k.w>=1, 
       weight.study="same",  
       level=0.95, 
       xlim=c(0.5, 3), 
       smlab="Effect of Exposures\non Incident CH\n", 
       smlab.pos=0, 
       colgap=unit(7, "mm"),
       xlab="Odds Ratio", 
       squaresize=0.6, 
       col.subgroup="black", 
       colgap.left=unit(0.1,"cm"),
       colgap.forest.left="3mm", 
       colgap.forest.right="2mm", 
       leftcols=c("studlab"), 
       leftlabs = c("                     "),
       rightcols=c("OR","CI95","P_val", "sig"),
       rightlabs=c("OR","95% CI","P", ""),
       #rightcols=NULL, 
       #rightlabs=NULL,
       col.inside="black", 
       plotwidth=unit(6.5, "cm"), 
       print.subgroup.name=F)
dev.off()


ERROR: Error in metagen(TE = Beta, se = SE, studlab = Outcome, subgroup = Exposure, : unused argument (subgroup = Exposure)
