In [None]:
source("diabetes_analysis_v06.R")

# Append a `score` column combining significance, effect size and detection
# ratio: -log(p_val_adj) * avg_log2FC * (pct.1 / pct.2).
# The tiny offsets keep log() and the division finite when p_val_adj or pct.2
# is exactly 0. Presumably applied to a Seurat FindMarkers-style table, judging
# by the column names (confirm against callers).
rank_score_func <- function(df) {
  mutate(
    df,
    score = -1 * log(p_val_adj + (10^-310)) * avg_log2FC *
      (pct.1 / (pct.2 + 10^-300))
  )
}



# Load populations

## CD4

In [None]:
### Table for quantification and Bayes

In [None]:
# Load the filtered CD4 object (its @meta.data and @misc slots are used below).
cd4_l1_full_filt <- readRDS("../data/processed/L1/cd4_l1_full_filt.rds")

In [None]:
# One row per distinct combination of the sample / patient descriptors.
cd4_patient_meta <- unique(
  dplyr::select(
    cd4_l1_full_filt@meta.data,
    Sample_ID, Condition, Condition2, Disease, Sex, Age, Age_group,
    Patient_ID, Time, Experiment_ID
  )
)

In [None]:
colnames(cd4_l1_full_filt@meta.data)

In [None]:
# Cells per sample and L3 annotation, one column per annotation
# (combinations that never occur are filled with 0).
df3 <- cd4_l1_full_filt@meta.data %>%
  group_by(Sample_ID, annotations_l3) %>%
  summarise(n = n(), .groups = "drop") %>%
  pivot_wider(names_from = "annotations_l3", values_from = "n", values_fill = 0)

# Start from every Sample_ID listed in all_md so that samples without any cell
# in this object still get a row; their counts arrive as NA and are set to 0.
df4 <- cd4_l1_full_filt@misc$all_md %>%
  dplyr::select(Sample_ID) %>%
  unique() %>%
  left_join(df3, by = "Sample_ID")
df4[is.na(df4)] <- 0

# Back to long format: one row per sample and annotation.
df4 <- pivot_longer(df4, !Sample_ID, values_to = "n", names_to = "annotations")

# The sample-level metadata columns were dropped above; join them back
# (natural join on whatever columns df4 shares with all_md).
md_to_join <- unique(cd4_l1_full_filt@misc$all_md)

df4 <- left_join(df4, md_to_join)

In [None]:
df4$Level <- "L3"

In [None]:
df_l3 <- df4

In [None]:
df_l3

In [None]:
# Cells per sample and L2 annotation, one column per annotation
# (combinations that never occur are filled with 0).
df3 <- cd4_l1_full_filt@meta.data %>%
  group_by(Sample_ID, annotations_l2) %>%
  summarise(n = n(), .groups = "drop") %>%
  pivot_wider(names_from = "annotations_l2", values_from = "n", values_fill = 0)

# Keep every Sample_ID listed in all_md; samples absent from this object get 0.
df4 <- cd4_l1_full_filt@misc$all_md %>%
  dplyr::select(Sample_ID) %>%
  unique() %>%
  left_join(df3, by = "Sample_ID")
df4[is.na(df4)] <- 0

# Back to long format: one row per sample and annotation.
df4 <- pivot_longer(df4, !Sample_ID, values_to = "n", names_to = "annotations")

# Re-attach the sample-level metadata (natural join on shared columns).
md_to_join <- unique(cd4_l1_full_filt@misc$all_md)

df4 <- left_join(df4, md_to_join)
df4$Level <- "L2"

In [None]:
df_l2 <- df4

In [None]:
df4

In [None]:
# Cells per sample and L1 annotation, one column per annotation
# (combinations that never occur are filled with 0).
df3 <- cd4_l1_full_filt@meta.data %>%
  group_by(Sample_ID, annotations_l1) %>%
  summarise(n = n(), .groups = "drop") %>%
  pivot_wider(names_from = "annotations_l1", values_from = "n", values_fill = 0)

# Keep every Sample_ID listed in all_md; samples absent from this object get 0.
df4 <- cd4_l1_full_filt@misc$all_md %>%
  dplyr::select(Sample_ID) %>%
  unique() %>%
  left_join(df3, by = "Sample_ID")
df4[is.na(df4)] <- 0

# Back to long format: one row per sample and annotation.
df4 <- pivot_longer(df4, !Sample_ID, values_to = "n", names_to = "annotations")

# Re-attach the sample-level metadata (natural join on shared columns).
md_to_join <- unique(cd4_l1_full_filt@misc$all_md)

df4 <- left_join(df4, md_to_join)
df4$Level <- "L1"

df_l1 <- df4

In [None]:
df_l3

In [None]:
# Stack the three annotation levels into one long table.
df_all_levels <- rbind(df_l1, df_l2, df_l3)

In [None]:
all_counts <- df_all_levels

In [None]:
# Fraction of each population among all cells of the same sample and level.
# The result stays grouped by Sample_ID and Level.
df3 <- mutate(
  group_by(all_counts, Sample_ID, Level),
  freq_from_total = n / sum(n)
)

In [None]:
df3

In [None]:
## Set parent population

In [None]:
# Split the "---"-delimited annotation into its first two parts (a third part
# is discarded) and derive the parent population of every row:
#   L1 -> the lineage itself, L2 -> its L1 part, L3 -> its "L1---L2" prefix.
# `remove = FALSE` keeps the original annotations column; it is spelled out
# because the shorthand `F` is an ordinary variable that can be reassigned.
df3 <- df3 %>%
  separate(
    annotations,
    into = c("annot_l1", "annot_l2", NA),
    sep = "---",
    remove = FALSE
  ) %>%
  mutate(Parent_annotation = case_when(
    Level == "L1" ~ "CD4",
    Level == "L2" ~ annot_l1,
    Level == "L3" ~ paste0(annot_l1, "---", annot_l2)
  ))

In [None]:
df3

In [None]:
## For each level and each patient calculate the total count per parent population

In [None]:
# Distinct annotation names present at level 1 and level 2.
levels_l1 <- unique(pull(filter(all_counts, Level == "L1"), annotations))
levels_l2 <- unique(pull(filter(all_counts, Level == "L2"), annotations))

In [None]:

# Per-sample total cell count of every parent population:
#   Level "L2" rows: all L2 populations summed under their L1 parent,
#   Level "L3" rows: all L3 populations summed under their L2 parent.
# Children are matched to a parent by exact equality on Parent_annotation (the
# key shared with df3 for the later join) instead of a regex search inside the
# annotation string, so a parent whose name is a substring of another name or
# contains regex metacharacters can no longer pick up foreign rows.
# A single grouped summarise also replaces the two loops that grew the table
# with rbind() and iterated over 1:length(), which misbehaves on empty input.
df_sum_of_parent <- df3 %>%
  ungroup() %>%
  filter(
    (Level == "L2" & Parent_annotation %in% levels_l1) |
      (Level == "L3" & Parent_annotation %in% levels_l2)
  ) %>%
  group_by(Sample_ID, Level, Parent_annotation) %>%
  summarise(total_per_patient = sum(n), .groups = "drop") %>%
  # Same column order as the table built by the former loops.
  dplyr::select(Sample_ID, total_per_patient, Level, Parent_annotation)

In [None]:
df_sum_of_parent

In [None]:
dim(df3)

In [None]:
# Attach the parent total to every row. The keys are spelled out so the join
# cannot silently pick up an additional shared column.
df4 <- left_join(
  df3, df_sum_of_parent,
  by = c("Sample_ID", "Level", "Parent_annotation")
)

In [None]:
dim(df4)

In [None]:
df4

In [None]:
# Share of each population within its parent population.
df4$freq_from_parent <- df4$n / df4$total_per_patient

In [None]:
df4

In [None]:
df4$pct_from_total <- df4$freq_from_total * 100
df4$pct_from_parent <- df4$freq_from_parent * 100


In [None]:
# Below L1, a missing value (no parent total joined, or 0 / 0 for an empty
# parent) is reported as 0. L1 rows have no parent total and stay NA.
df4$freq_from_parent <- ifelse(
  is.na(df4$freq_from_parent) & df4$Level != "L1",
  0,
  df4$freq_from_parent
)
df4$pct_from_parent <- ifelse(
  is.na(df4$pct_from_parent) & df4$Level != "L1",
  0,
  df4$pct_from_parent
)

In [None]:
df4

In [None]:
freq_cd4 <- df4

## CD8

In [None]:
# Load the filtered CD8 object (its @meta.data and @misc slots are used below).
cd8_l1_full_filt <- readRDS("../data/processed/L1/cd8_l1_full_filt.rds")

In [None]:
# One row per distinct combination of the sample / patient descriptors.
cd8_patient_meta <- unique(
  dplyr::select(
    cd8_l1_full_filt@meta.data,
    Sample_ID, Condition, Condition2, Disease, Sex, Age, Age_group,
    Patient_ID, Time, Experiment_ID
  )
)

In [None]:
colnames(cd8_l1_full_filt@meta.data)

In [None]:
# Cells per sample and L3 annotation, one column per annotation
# (combinations that never occur are filled with 0).
df3 <- cd8_l1_full_filt@meta.data %>%
  group_by(Sample_ID, annotations_l3) %>%
  summarise(n = n(), .groups = "drop") %>%
  pivot_wider(names_from = "annotations_l3", values_from = "n", values_fill = 0)

# Start from every Sample_ID listed in all_md so that samples without any cell
# in this object still get a row; their counts arrive as NA and are set to 0.
df4 <- cd8_l1_full_filt@misc$all_md %>%
  dplyr::select(Sample_ID) %>%
  unique() %>%
  left_join(df3, by = "Sample_ID")
df4[is.na(df4)] <- 0

# Back to long format: one row per sample and annotation.
df4 <- pivot_longer(df4, !Sample_ID, values_to = "n", names_to = "annotations")

# The sample-level metadata columns were dropped above; join them back
# (natural join on whatever columns df4 shares with all_md).
md_to_join <- unique(cd8_l1_full_filt@misc$all_md)

df4 <- left_join(df4, md_to_join)

In [None]:
df4$Level <- "L3"

In [None]:
df_l3 <- df4

In [None]:
# Cells per sample and L2 annotation, one column per annotation
# (combinations that never occur are filled with 0).
df3 <- cd8_l1_full_filt@meta.data %>%
  group_by(Sample_ID, annotations_l2) %>%
  summarise(n = n(), .groups = "drop") %>%
  pivot_wider(names_from = "annotations_l2", values_from = "n", values_fill = 0)

# Keep every Sample_ID listed in all_md; samples absent from this object get 0.
df4 <- cd8_l1_full_filt@misc$all_md %>%
  dplyr::select(Sample_ID) %>%
  unique() %>%
  left_join(df3, by = "Sample_ID")
df4[is.na(df4)] <- 0

# Back to long format: one row per sample and annotation.
df4 <- pivot_longer(df4, !Sample_ID, values_to = "n", names_to = "annotations")

# Re-attach the sample-level metadata (natural join on shared columns).
md_to_join <- unique(cd8_l1_full_filt@misc$all_md)

df4 <- left_join(df4, md_to_join)
df4$Level <- "L2"

In [None]:
df_l2 <- df4

In [None]:
df4

In [None]:
# Cells per sample and L1 annotation, one column per annotation
# (combinations that never occur are filled with 0).
df3 <- cd8_l1_full_filt@meta.data %>%
  group_by(Sample_ID, annotations_l1) %>%
  summarise(n = n(), .groups = "drop") %>%
  pivot_wider(names_from = "annotations_l1", values_from = "n", values_fill = 0)

# Keep every Sample_ID listed in all_md; samples absent from this object get 0.
df4 <- cd8_l1_full_filt@misc$all_md %>%
  dplyr::select(Sample_ID) %>%
  unique() %>%
  left_join(df3, by = "Sample_ID")
df4[is.na(df4)] <- 0

# Back to long format: one row per sample and annotation.
df4 <- pivot_longer(df4, !Sample_ID, values_to = "n", names_to = "annotations")

# Re-attach the sample-level metadata (natural join on shared columns).
md_to_join <- unique(cd8_l1_full_filt@misc$all_md)

df4 <- left_join(df4, md_to_join)
df4$Level <- "L1"

df_l1 <- df4

In [None]:
df_l3

In [None]:
# Stack the three annotation levels into one long table.
df_all_levels <- rbind(df_l1, df_l2, df_l3)

In [None]:
df_all_levels

In [None]:
all_counts <- df_all_levels

In [None]:
# Fraction of each population among all cells of the same sample and level.
# The result stays grouped by Sample_ID and Level.
df3 <- mutate(
  group_by(all_counts, Sample_ID, Level),
  freq_from_total = n / sum(n)
)

In [None]:
df3

In [None]:
## Set parent population

In [None]:
# Split the "---"-delimited annotation into its first two parts (a third part
# is discarded) and derive the parent population of every row:
#   L1 -> the lineage itself, L2 -> its L1 part, L3 -> its "L1---L2" prefix.
# `remove = FALSE` keeps the original annotations column; it is spelled out
# because the shorthand `F` is an ordinary variable that can be reassigned.
df3 <- df3 %>%
  separate(
    annotations,
    into = c("annot_l1", "annot_l2", NA),
    sep = "---",
    remove = FALSE
  ) %>%
  mutate(Parent_annotation = case_when(
    Level == "L1" ~ "CD8",
    Level == "L2" ~ annot_l1,
    Level == "L3" ~ paste0(annot_l1, "---", annot_l2)
  ))

In [None]:
df3

In [None]:
## For each level and each patient calculate the total count per parent population

In [None]:
# Distinct annotation names present at level 1 and level 2.
levels_l1 <- unique(pull(filter(all_counts, Level == "L1"), annotations))
levels_l2 <- unique(pull(filter(all_counts, Level == "L2"), annotations))

In [None]:

# Per-sample total cell count of every parent population:
#   Level "L2" rows: all L2 populations summed under their L1 parent,
#   Level "L3" rows: all L3 populations summed under their L2 parent.
# Children are matched to a parent by exact equality on Parent_annotation (the
# key shared with df3 for the later join) instead of a regex search inside the
# annotation string, so a parent whose name is a substring of another name or
# contains regex metacharacters can no longer pick up foreign rows.
# A single grouped summarise also replaces the two loops that grew the table
# with rbind() and iterated over 1:length(), which misbehaves on empty input.
df_sum_of_parent <- df3 %>%
  ungroup() %>%
  filter(
    (Level == "L2" & Parent_annotation %in% levels_l1) |
      (Level == "L3" & Parent_annotation %in% levels_l2)
  ) %>%
  group_by(Sample_ID, Level, Parent_annotation) %>%
  summarise(total_per_patient = sum(n), .groups = "drop") %>%
  # Same column order as the table built by the former loops.
  dplyr::select(Sample_ID, total_per_patient, Level, Parent_annotation)

In [None]:
df_sum_of_parent

In [None]:
dim(df3)

In [None]:
# Attach the parent total to every row. The keys are spelled out so the join
# cannot silently pick up an additional shared column.
df4 <- left_join(
  df3, df_sum_of_parent,
  by = c("Sample_ID", "Level", "Parent_annotation")
)

In [None]:
dim(df4)

In [None]:
df4

In [None]:
# Share of each population within its parent population.
df4$freq_from_parent <- df4$n / df4$total_per_patient

In [None]:
df4

In [None]:
df4$pct_from_total <- df4$freq_from_total * 100
df4$pct_from_parent <- df4$freq_from_parent * 100


In [None]:
# Below L1, a missing value (no parent total joined, or 0 / 0 for an empty
# parent) is reported as 0. L1 rows have no parent total and stay NA.
df4$freq_from_parent <- ifelse(
  is.na(df4$freq_from_parent) & df4$Level != "L1",
  0,
  df4$freq_from_parent
)
df4$pct_from_parent <- ifelse(
  is.na(df4$pct_from_parent) & df4$Level != "L1",
  0,
  df4$pct_from_parent
)

In [None]:
df4

In [None]:
freq_cd8 <- df4

In [None]:
# Tag each frequency table with its lineage before stacking them.
freq_cd4[["Main"]] <- "CD4"
freq_cd8[["Main"]] <- "CD8"


In [None]:
populations_2 <- rbind(freq_cd4, freq_cd8)

In [None]:
# Number of rows contributed by each experiment.
populations_2$Experiment_ID %>% table

In [None]:
write.csv(populations_2, "../tables/populations_2.csv")

In [None]:
populations_2

# Population QC

## Correlation of populations in preliminary and final

### Freq from total

In [None]:
populations_2 <- read_csv("../tables/populations_2.csv")
# Drop the `...1` column (presumably the row names stored by write.csv()).
populations_2[["...1"]] <- NULL

In [None]:
# Experiments Exp08 / Exp10 / Exp11 are labelled "Prelim", all others "Final".
populations_2$prelim_final <- ifelse(
  populations_2$Experiment_ID %in% c("Exp08", "Exp10", "Exp11"),
  "Prelim",
  "Final"
)

In [None]:
# Patients that have at least one "Prelim" row.
IDs <- unique(
  pull(dplyr::filter(populations_2, prelim_final == "Prelim"), Patient_ID)
)

In [None]:
IDs

In [None]:
# T0 frequencies of the patients in `IDs`, spread into one "Prelim" and one
# "Final" column per patient / annotation / level.
popul_final_freq_from_total <- populations_2 %>%
  ungroup() %>%
  mutate(
    prelim_final = ifelse(
      Experiment_ID %in% c("Exp08", "Exp10", "Exp11"), "Prelim", "Final"
    )
  ) %>%
  dplyr::filter(Patient_ID %in% IDs, Time == "T0") %>%
  dplyr::select(
    Patient_ID, Time, annotations, prelim_final, freq_from_total, Level
  ) %>%
  pivot_wider(names_from = "prelim_final", values_from = freq_from_total)

In [None]:
popul_final_freq_from_total

In [None]:
options(repr.plot.width = 20, repr.plot.height = 15)
# L2 populations: Prelim vs Final frequency with a linear fit and the
# correlation annotation from ggpubr.
popul_final_freq_from_total %>%
  dplyr::filter(Level == "L2") %>%
  ggplot(aes(x = Final, y = Prelim)) +
  geom_point(
    aes(color = as.factor(annotations)),
    shape = 16, alpha = 1, size = 2
  ) +
  geom_smooth(method = lm) +
  theme(legend.position = "bottom") +
  ggpubr::stat_cor()

In [None]:
# Mean Final / Prelim frequency per L2 annotation (the formula interface of
# aggregate() drops rows with a missing value by default).
l2_only <- dplyr::filter(popul_final_freq_from_total, Level == "L2")
centroids <- aggregate(
  cbind(Final, Prelim) ~ annotations,
  data = l2_only,
  FUN = mean
)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 12)
# L2 populations: per-sample points (transparent) plus per-annotation
# centroids (black outline) for Prelim vs Final frequencies.
popul_final_freq_from_total %>%
  dplyr::filter(Level == "L2") %>%
  ggplot(aes(x = Final, y = Prelim)) +
  geom_smooth(method = lm, alpha = 0.2) +
  theme(legend.position = "bottom") +
  geom_point(
    aes(
      color = as.factor(annotations),
      fill = as.factor(annotations),
      shape = as.factor(annotations)
    ),
    alpha = 0.3, size = 3
  ) +
  ggpubr::stat_cor() +
  geom_point(
    data = centroids,
    aes(fill = as.factor(annotations), shape = as.factor(annotations)),
    size = 5, color = "black"
  ) +
  # Declared once: a second identical shape scale only replaced the first one
  # and triggered a "scale already present" message. Covers up to 20 levels.
  scale_shape_manual(values = rep(21:25, times = 4)) +
  # NOTE(review): theme_classic() is a complete theme, so it resets the
  # theme() tweaks added before it (legend position, axis text angle) unless
  # ggtheme() restores them — confirm the intended order.
  theme(axis.text.x = element_text(angle = 90)) +
  theme_classic() +
  ggtheme()


In [None]:
options(repr.plot.width = 20, repr.plot.height = 12)
# L2 populations: per-sample points (transparent) plus per-annotation
# centroids (black outline) for Prelim vs Final frequencies; saved below.
popul_final_freq_from_total %>%
  dplyr::filter(Level == "L2") %>%
  ggplot(aes(x = Final, y = Prelim)) +
  geom_smooth(method = lm, alpha = 0.2) +
  theme(legend.position = "bottom") +
  geom_point(
    aes(
      color = as.factor(annotations),
      fill = as.factor(annotations),
      shape = as.factor(annotations)
    ),
    alpha = 0.3, size = 3
  ) +
  ggpubr::stat_cor(size = 5) +
  geom_point(
    data = centroids,
    aes(fill = as.factor(annotations), shape = as.factor(annotations)),
    size = 5, color = "black"
  ) +
  # Declared once: a second identical shape scale only replaced the first one
  # and triggered a "scale already present" message. Covers up to 20 levels.
  scale_shape_manual(values = rep(21:25, times = 4)) +
  # NOTE(review): theme_classic() is a complete theme, so it resets the
  # theme() tweaks added before it (legend position, axis text angle) unless
  # ggtheme() restores them — confirm the intended order.
  theme(axis.text.x = element_text(angle = 90)) +
  theme_classic() +
  ggtheme()

# ggsave() writes the last plot; create.dir = TRUE (as used for the later
# correlation figures) creates ../figures/QC if it does not exist yet.
ggsave("../figures/QC/final_vs_preliminary.svg", width = 44, height = 25, units = "cm", create.dir = TRUE)
ggsave("../figures/QC/final_vs_preliminary.png", width = 44, height = 25, units = "cm", create.dir = TRUE)

In [None]:
options(repr.plot.width = 15, repr.plot.height = 15)
# Same comparison, one free-scaled panel per L2 annotation.
popul_final_freq_from_total %>%
  dplyr::filter(Level == "L2") %>%
  ggplot(aes(x = Final, y = Prelim)) +
  facet_wrap(~annotations, ncol = 4, scales = "free") +
  geom_smooth(method = lm, alpha = 0.2) +
  theme(legend.position = "bottom") +
  ggpubr::stat_cor(size = 5) +
  theme(axis.text.x = element_text(angle = 90)) +
  geom_point(
    aes(fill = as.factor(annotations), shape = as.factor(annotations)),
    size = 3
  ) +
  scale_shape_manual(values = rep(21:25, times = 4)) +
  theme_classic() +
  ggtheme() +
  NoLegend()

## Correlation of populations in T0 vs T1

### Freq from total

In [None]:
# "Dia" patients of the final experiments with a T0 sample ...
IDs_1 <- unique(pull(
  dplyr::filter(
    populations_2,
    prelim_final == "Final" & Time == "T0" & Disease == "Dia"
  ),
  Patient_ID
))

In [None]:
# ... and with a T1 sample.
IDs_2 <- unique(pull(
  dplyr::filter(
    populations_2,
    prelim_final == "Final" & Time == "T1" & Disease == "Dia"
  ),
  Patient_ID
))

In [None]:
# Patients sampled at both time points.
IDs <- intersect(IDs_1, IDs_2)

In [None]:
IDs

In [None]:
# Frequencies of the paired patients in the final experiments, spread into one
# column per time point.
popul_final_freq_from_total <- populations_2 %>%
  ungroup() %>%
  mutate(
    prelim_final = ifelse(
      Experiment_ID %in% c("Exp08", "Exp10", "Exp11"), "Prelim", "Final"
    )
  ) %>%
  dplyr::filter(Patient_ID %in% IDs, prelim_final == "Final") %>%
  dplyr::select(Patient_ID, Time, annotations, freq_from_total, Level) %>%
  pivot_wider(names_from = "Time", values_from = freq_from_total)

In [None]:
dplyr::filter(popul_final_freq_from_total, !is.na(T1))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 15)
# L2 populations: T0 vs T1 frequency with a linear fit and the correlation
# annotation from ggpubr.
popul_final_freq_from_total %>%
  dplyr::filter(Level == "L2") %>%
  ggplot(aes(x = T0, y = T1)) +
  geom_point(
    aes(color = as.factor(annotations)),
    shape = 16, alpha = 1, size = 2
  ) +
  geom_smooth(method = lm) +
  theme(legend.position = "bottom") +
  ggpubr::stat_cor()

In [None]:
# Mean T1 / T0 frequency per L2 annotation (the formula interface of
# aggregate() drops rows with a missing value by default).
l2_only <- dplyr::filter(popul_final_freq_from_total, Level == "L2")
centroids <- aggregate(
  cbind(T1, T0) ~ annotations,
  data = l2_only,
  FUN = mean
)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 12)
# L2 populations: per-sample points (transparent) plus per-annotation
# centroids (black outline) for T0 vs T1 frequencies; saved below.
popul_final_freq_from_total %>%
  dplyr::filter(Level == "L2") %>%
  ggplot(aes(x = T0, y = T1)) +
  geom_smooth(method = lm, alpha = 0.2) +
  theme(legend.position = "bottom") +
  geom_point(
    aes(
      color = as.factor(annotations),
      fill = as.factor(annotations),
      shape = as.factor(annotations)
    ),
    alpha = 0.3, size = 3
  ) +
  ggpubr::stat_cor(size = 5) +
  geom_point(
    data = centroids,
    aes(fill = as.factor(annotations), shape = as.factor(annotations)),
    size = 5, color = "black"
  ) +
  # Declared once: a second identical shape scale only replaced the first one
  # and triggered a "scale already present" message. Covers up to 20 levels.
  scale_shape_manual(values = rep(21:25, times = 4)) +
  # NOTE(review): theme_classic() is a complete theme, so it resets the
  # theme() tweaks added before it (legend position, axis text angle) unless
  # ggtheme() restores them — confirm the intended order.
  theme(axis.text.x = element_text(angle = 90)) +
  theme_classic() +
  ggtheme()

# ggsave() writes the last plot; create.dir = TRUE (as used for the later
# correlation figures) creates ../figures/QC if it does not exist yet.
ggsave("../figures/QC/t1_vs_t0.svg", width = 44, height = 25, units = "cm", create.dir = TRUE)
ggsave("../figures/QC/t1_vs_t0.png", width = 44, height = 25, units = "cm", create.dir = TRUE)

In [None]:
options(repr.plot.width = 15, repr.plot.height = 15)
# Same comparison, one free-scaled panel per L2 annotation.
popul_final_freq_from_total %>%
  dplyr::filter(Level == "L2") %>%
  ggplot(aes(x = T0, y = T1)) +
  facet_wrap(~annotations, ncol = 4, scales = "free") +
  geom_smooth(method = lm, alpha = 0.2) +
  theme(legend.position = "bottom") +
  ggpubr::stat_cor(size = 5) +
  theme(axis.text.x = element_text(angle = 90)) +
  geom_point(
    aes(fill = as.factor(annotations), shape = as.factor(annotations)),
    size = 3
  ) +
  scale_shape_manual(values = rep(21:25, times = 4)) +
  theme_classic() +
  ggtheme() +
  NoLegend()

# Correlation with clinical data

## Freq from total

In [None]:
colnames(populations_2)

In [None]:
ncol(populations_2)

In [None]:
# Wide table of freq_from_total for the final experiments: one column per
# "<Level> <annotation>".
# NOTE(review): the seven columns are picked by position (after dropping
# Main); this silently breaks if the column order of populations_2 changes —
# verify the positions against the colnames() output above.
md3 <- populations_2 %>%
  ungroup() %>%
  dplyr::select(-Main) %>%
  mutate(annotations = paste(Level, annotations)) %>%
  dplyr::select(6, 7, 10, 12, 2, 22, 16) %>%
  dplyr::filter(prelim_final == "Final") %>%
  unique() %>%
  pivot_wider(names_from = "annotations", values_from = "freq_from_total")

In [None]:
md3

In [None]:
md_cpept_orig <- read_xlsx("../data/metadata_v06.xlsx")

In [None]:
colnames(md_cpept_orig)

In [None]:
# C-peptide readouts keyed by "<patient> <time_taken>". Built from the sheet
# loaded above instead of reading the same workbook a second time.
md_cpept <- md_cpept_orig %>%
  mutate(Patient_Time = paste(patient, time_taken)) %>%
  dplyr::select(
    Patient_Time, fasting_cpept_T1, fasting_cpept_1, c_peptide_chang
  ) %>%
  unique()

In [None]:
# Keep the rows whose Patient_Time starts with "1". `group` is a character
# column, so the comparison is written against the string.
md_cpept <- md_cpept %>%
  mutate(group = substr(Patient_Time, 1, 1)) %>%
  dplyr::filter(group == "1")

In [None]:
md_cpept

In [None]:
# Split Condition at the space into Disease and Time, keeping Condition itself
# (`FALSE` is spelled out because the shorthand `F` can be reassigned).
md3 <- md3 %>%
  separate(Condition, into = c("Disease", "Time"), remove = FALSE, sep = " ")

In [None]:
# Same "<patient> <time>" key format as md_cpept$Patient_Time.
md3$Patient_Time <- paste(md3$Patient_ID, md3$Time)

In [None]:
# Clinical readouts joined with the population frequencies (natural join on
# the shared columns), restricted to rows with a fasting_cpept_T1 value.
md4 <- md_cpept %>%
  left_join(md3) %>%
  dplyr::filter(!is.na(fasting_cpept_T1))

In [None]:
md4 

In [None]:
# Positions of the population columns, i.e. every column whose name starts
# with "L".
# NOTE(review): any non-population column starting with "L" would be picked
# up as well — check the names.
population_colnames <- which(startsWith(colnames(md4), "L"))

In [None]:
population_colnames  %>% length

In [None]:
which(colnames(md4) == "fasting_cpept_1")

In [None]:
md4

In [None]:
# Correlate one population-frequency column of the global `md4` with the three
# C-peptide readouts.
#
# Args:
#   i:       Column position in `md4` of the population to test.
#   n_tests: Multiplier for the Bonferroni-style `padj` (capped at 1).
#            Defaults to 102, the factor that used to be hard-coded.
#
# Returns a data.frame with one row per readout and the columns
# population, cor, pval, padj, test.
calc_correlation <- function(i, n_tests = 102) {
  # Clinical column of md4 -> label written to the `test` column. The columns
  # are addressed by name rather than by the positions 2, 3 and 4.
  clinical_vars <- c(
    fasting_cpept_T1 = "fasting_cpept_T1",
    fasting_cpept_1 = "fasting_cpept",
    c_peptide_chang = "c_peptide_change"
  )
  population <- colnames(md4)[i]
  value <- md4[[i]]

  correlate_one <- function(clinical_col) {
    clinical <- md4[[clinical_col]]
    # Drop incomplete pairs explicitly. The former `filter(!is.na(2))` tested
    # the constant 2 and therefore never removed a row.
    keep <- !is.na(value) & !is.na(clinical)
    ct <- cor.test(value[keep], clinical[keep])
    data.frame(
      population = population,
      cor = ct$estimate,
      pval = ct$p.value,
      padj = min(1, ct$p.value * n_tests),
      test = clinical_vars[[clinical_col]]
    )
  }

  do.call(rbind, lapply(names(clinical_vars), correlate_one))
}

In [None]:
# One result table per population column (furrr runs calc_correlation on each
# column position).
populations_corr <- future_map(population_colnames, calc_correlation)

In [None]:
# Stack the per-population tables into one data frame.
populations_corr <- bind_rows(populations_corr)

In [None]:
arrange(populations_corr, pval)

## L2 CD4

In [None]:
# Positions of the md4 columns that start with "L2" and mention "CD4".
population_colnames_l2 <- which(
  startsWith(colnames(md4), "L2") & grepl("CD4", colnames(md4))
)

In [None]:
population_colnames_l2

In [None]:
md4

In [None]:
# Correlate one population-frequency column of the global `md4` with the three
# C-peptide readouts.
#
# Args:
#   i:       Column position in `md4` of the population to test.
#   n_tests: Multiplier for the Bonferroni-style `padj` (capped at 1).
#            Defaults to 10, the factor that used to be hard-coded.
#
# Returns a data.frame with one row per readout and the columns
# population, cor, pval, padj, test.
calc_correlation <- function(i, n_tests = 10) {
  # Clinical column of md4 -> label written to the `test` column. The columns
  # are addressed by name rather than by the positions 2, 3 and 4.
  clinical_vars <- c(
    fasting_cpept_T1 = "fasting_cpept_T1",
    fasting_cpept_1 = "fasting_cpept",
    c_peptide_chang = "c_peptide_change"
  )
  population <- colnames(md4)[i]
  value <- md4[[i]]

  correlate_one <- function(clinical_col) {
    clinical <- md4[[clinical_col]]
    # Drop incomplete pairs explicitly. The former `filter(!is.na(2))` tested
    # the constant 2 and therefore never removed a row.
    keep <- !is.na(value) & !is.na(clinical)
    ct <- cor.test(value[keep], clinical[keep])
    data.frame(
      population = population,
      cor = ct$estimate,
      pval = ct$p.value,
      padj = min(1, ct$p.value * n_tests),
      test = clinical_vars[[clinical_col]]
    )
  }

  do.call(rbind, lapply(names(clinical_vars), correlate_one))
}

In [None]:
# One result table per L2 CD4 population column.
populations_corr_cd4 <- future_map(population_colnames_l2, calc_correlation)

In [None]:
populations_corr <- bind_rows(populations_corr_cd4)

arrange(populations_corr, pval)

In [None]:
# Strip the level / lineage prefixes so only the L2 population name remains.
populations_corr <- mutate(
  populations_corr,
  population = gsub(
    "L2 CD4 Unconventional T cells---", "",
    gsub("L2 CD4 T cells---", "", population)
  )
)

In [None]:
options(repr.plot.width = 36, repr.plot.height = 7)
# Three dot plots, one per clinical readout, combined with `+` (this relies on
# a package such as patchwork defining `+` for two ggplots — TODO confirm).
# Point size encodes -log(pval), colour the correlation coefficient;
# populations are ordered by their correlation.
(
  populations_corr %>%
    dplyr::filter(test == "c_peptide_change") %>%
    arrange(cor) %>%
    ggplot(aes(x = test, y = reorder(population, cor))) +
    geom_point(aes(size = -log(pval), color = cor)) +
    facet_wrap(~test) +
    ylab("") +
    xlab("") +
    scale_color_gradient2(low = "blue", mid = "white", high = "red") +
    theme_classic() +
    ggtheme() +
    theme(axis.text.x = element_text(angle = 90))
) + (
  populations_corr %>%
    dplyr::filter(test == "fasting_cpept_T1") %>%
    arrange(cor) %>%
    ggplot(aes(x = test, y = reorder(population, cor))) +
    geom_point(aes(size = -log(pval), color = cor)) +
    facet_wrap(~test) +
    ylab("") +
    xlab("") +
    scale_color_gradient2(low = "blue", mid = "white", high = "red") +
    theme_classic() +
    ggtheme() +
    theme(axis.text.x = element_text(angle = 90))
) + (
  populations_corr %>%
    dplyr::filter(test == "fasting_cpept") %>%
    arrange(cor) %>%
    ggplot(aes(x = test, y = reorder(population, cor))) +
    geom_point(aes(size = -log(pval), color = cor)) +
    facet_wrap(~test) +
    ylab("") +
    xlab("") +
    scale_color_gradient2(low = "blue", mid = "white", high = "red") +
    theme_classic() +
    ggtheme() +
    theme(axis.text.x = element_text(angle = 90))
)

In [None]:
# Both calls write the last plot; the first one creates the folder if needed.
ggsave("../figures/populations_correlations/population_correlations_L2_cd4.png", width = 72, height = 15, units = "cm", create.dir = TRUE)
ggsave("../figures/populations_correlations/population_correlations_L2_cd4.svg", width = 72, height = 15, units = "cm")

## L2 CD8

In [None]:
# Positions of the md4 columns that start with "L2" and mention "CD8".
population_colnames_l2 <- which(
  startsWith(colnames(md4), "L2") & grepl("CD8", colnames(md4))
)

In [None]:
population_colnames_l2

In [None]:
md4

In [None]:
# Correlate one population-frequency column of the global `md4` with the three
# C-peptide readouts.
#
# Args:
#   i:       Column position in `md4` of the population to test.
#   n_tests: Multiplier for the Bonferroni-style `padj` (capped at 1).
#            Defaults to 10, the factor that used to be hard-coded.
#
# Returns a data.frame with one row per readout and the columns
# population, cor, pval, padj, test.
calc_correlation <- function(i, n_tests = 10) {
  # Clinical column of md4 -> label written to the `test` column. The columns
  # are addressed by name rather than by the positions 2, 3 and 4.
  clinical_vars <- c(
    fasting_cpept_T1 = "fasting_cpept_T1",
    fasting_cpept_1 = "fasting_cpept",
    c_peptide_chang = "c_peptide_change"
  )
  population <- colnames(md4)[i]
  value <- md4[[i]]

  correlate_one <- function(clinical_col) {
    clinical <- md4[[clinical_col]]
    # Drop incomplete pairs explicitly. The former `filter(!is.na(2))` tested
    # the constant 2 and therefore never removed a row.
    keep <- !is.na(value) & !is.na(clinical)
    ct <- cor.test(value[keep], clinical[keep])
    data.frame(
      population = population,
      cor = ct$estimate,
      pval = ct$p.value,
      padj = min(1, ct$p.value * n_tests),
      test = clinical_vars[[clinical_col]]
    )
  }

  do.call(rbind, lapply(names(clinical_vars), correlate_one))
}

In [None]:
# One result table per L2 CD8 population column.
populations_corr_cd8 <- future_map(population_colnames_l2, calc_correlation)

In [None]:
populations_corr <- bind_rows(populations_corr_cd8)

arrange(populations_corr, pval)

In [None]:
# Strip the level / lineage prefixes so only the L2 population name remains.
populations_corr <- mutate(
  populations_corr,
  population = gsub(
    "L2 CD8 Unconventional T cells---", "",
    gsub("L2 CD8 T cells---", "", population)
  )
)

In [None]:
options(repr.plot.width = 36, repr.plot.height = 7)
# Three dot plots, one per clinical readout, combined with `+` (this relies on
# a package such as patchwork defining `+` for two ggplots — TODO confirm).
# Point size encodes -log(pval), colour the correlation coefficient;
# populations are ordered by their correlation.
(
  populations_corr %>%
    dplyr::filter(test == "c_peptide_change") %>%
    arrange(cor) %>%
    ggplot(aes(x = test, y = reorder(population, cor))) +
    geom_point(aes(size = -log(pval), color = cor)) +
    facet_wrap(~test) +
    ylab("") +
    xlab("") +
    scale_color_gradient2(low = "blue", mid = "white", high = "red") +
    theme_classic() +
    ggtheme() +
    theme(axis.text.x = element_text(angle = 90))
) + (
  populations_corr %>%
    dplyr::filter(test == "fasting_cpept_T1") %>%
    arrange(cor) %>%
    ggplot(aes(x = test, y = reorder(population, cor))) +
    geom_point(aes(size = -log(pval), color = cor)) +
    facet_wrap(~test) +
    ylab("") +
    xlab("") +
    scale_color_gradient2(low = "blue", mid = "white", high = "red") +
    theme_classic() +
    ggtheme() +
    theme(axis.text.x = element_text(angle = 90))
) + (
  populations_corr %>%
    dplyr::filter(test == "fasting_cpept") %>%
    arrange(cor) %>%
    ggplot(aes(x = test, y = reorder(population, cor))) +
    geom_point(aes(size = -log(pval), color = cor)) +
    facet_wrap(~test) +
    ylab("") +
    xlab("") +
    scale_color_gradient2(low = "blue", mid = "white", high = "red") +
    theme_classic() +
    ggtheme() +
    theme(axis.text.x = element_text(angle = 90))
)

In [None]:
# Both calls write the last plot; the first one creates the folder if needed.
ggsave("../figures/populations_correlations/population_correlations_L2_cd8.png", width = 72, height = 15, units = "cm", create.dir = TRUE)
ggsave("../figures/populations_correlations/population_correlations_L2_cd8.svg", width = 72, height = 15, units = "cm")

# Treg vs cpept

In [None]:
# Positions of the md4 columns whose name starts with "L3".
population_colnames_l3 <- which(startsWith(colnames(md4), "L3"))

In [None]:
population_colnames_l3

In [None]:
# Correlate one population-frequency column of the global `md4` with the three
# C-peptide readouts.
#
# Args:
#   i:       Column position in `md4` of the population to test.
#   n_tests: Multiplier for the Bonferroni-style `padj` (capped at 1).
#            Defaults to 10, the factor that used to be hard-coded.
#            NOTE(review): 10 matches the L2 versions of this function; pass
#            the number of L3 populations actually tested if a full
#            Bonferroni correction is intended.
#
# Returns a data.frame with one row per readout and the columns
# population, cor, pval, padj, test.
calc_correlation <- function(i, n_tests = 10) {
  # Clinical column of md4 -> label written to the `test` column. The columns
  # are addressed by name rather than by the positions 2, 3 and 4.
  clinical_vars <- c(
    fasting_cpept_T1 = "fasting_cpept_T1",
    fasting_cpept_1 = "fasting_cpept",
    c_peptide_chang = "c_peptide_change"
  )
  population <- colnames(md4)[i]
  value <- md4[[i]]

  correlate_one <- function(clinical_col) {
    clinical <- md4[[clinical_col]]
    # Drop incomplete pairs explicitly. The former `filter(!is.na(2))` tested
    # the constant 2 and therefore never removed a row.
    keep <- !is.na(value) & !is.na(clinical)
    ct <- cor.test(value[keep], clinical[keep])
    data.frame(
      population = population,
      cor = ct$estimate,
      pval = ct$p.value,
      padj = min(1, ct$p.value * n_tests),
      test = clinical_vars[[clinical_col]]
    )
  }

  do.call(rbind, lapply(names(clinical_vars), correlate_one))
}

In [None]:
# One result table per L3 population column.
populations_corr_l3 <- future_map(population_colnames_l3, calc_correlation)

In [None]:
populations_corr <- bind_rows(populations_corr_l3)

In [None]:
# Rows whose population name contains "Treg".
filter(populations_corr, grepl(populations_corr$population, pattern = "Treg"))

In [None]:
md4

In [None]:
ls()

In [None]:
options(repr.plot.width = 12, repr.plot.height = 5)
library(svglite)
# Create the output folder once, quietly, instead of on every iteration
# (a repeated dir.create() warns that the directory already exists).
dir.create("../figures/correlation_populations/", showWarnings = FALSE, recursive = TRUE)

# Distinct Treg populations among the first 12 Treg result rows. head() stops
# at the available rows, whereas `[1:12]` padded the vector with NA when fewer
# than 12 rows matched.
treg_populations <- populations_corr %>%
  filter(grepl(population, pattern = "Treg")) %>%
  pull(population) %>%
  head(12) %>%
  unique()

j <- 1
for (i in treg_populations) {
  # Population frequency plus the three C-peptide readouts, selected by name.
  df2 <- md4 %>%
    dplyr::select(
      value = all_of(i),
      fasting_cpept_T1,
      fasting_cpept_1,
      c_peptide_change = c_peptide_chang
    )

  p1 <- df2 %>%
    ggplot(aes(x = value, y = fasting_cpept_T1)) +
    geom_point(shape = 16, size = 2) +
    geom_smooth(method = lm, alpha = 0.2) +
    ggtitle(gsub(i, pattern = "L3 CD4 T cells---Treg---", replacement = " ")) +
    stat_cor(size = 7) + theme_classic() + ggtheme()

  p2 <- df2 %>%
    ggplot(aes(x = value, y = fasting_cpept_1)) +
    geom_point(shape = 16, size = 2) +
    geom_smooth(method = lm, alpha = 0.2) +
    ggtitle(" ") +
    stat_cor(size = 7) + theme_classic() + ggtheme()

  p3 <- df2 %>%
    ggplot(aes(x = value, y = c_peptide_change)) +
    geom_point(shape = 16, size = 2) +
    geom_smooth(method = lm, alpha = 0.2) +
    ggtitle(" ") +
    stat_cor(size = 7) + theme_classic() + ggtheme()

  p_combined <- p1 + p2 + p3
  print(p_combined)
  j <- j + 1
  # Save the combined figure explicitly rather than relying on the last plot.
  ggsave(
    filename = paste0("../figures/correlation_populations/", i, ".svg"),
    plot = p_combined, width = 12, height = 4
  )
}

### Both patient samples, T1 and T0

In [None]:
# Same join as for md4, but restricted to rows with a fasting_cpept_1 value.
md5 <- dplyr::filter(left_join(md_cpept, md3), !is.na(fasting_cpept_1))

In [None]:
# Positions of the md5 columns whose name starts with "L3".
population_colnames_l3 <- which(startsWith(colnames(md5), "L3"))

In [None]:
population_colnames_l3

In [None]:
# Correlate one population-frequency column of the global `md5` with the three
# C-peptide readouts.
#
# Args:
#   i:       Column position in `md5` of the population to test.
#   n_tests: Multiplier for the Bonferroni-style `padj` (capped at 1).
#            Defaults to 10, the factor that used to be hard-coded.
#
# Returns a data.frame with one row per readout and the columns
# population, cor, pval, padj, test.
calc_correlation <- function(i, n_tests = 10) {
  # Clinical column of md5 -> label written to the `test` column. The columns
  # are addressed by name rather than by the positions 2, 3 and 4.
  clinical_vars <- c(
    fasting_cpept_T1 = "fasting_cpept_T1",
    fasting_cpept_1 = "fasting_cpept",
    c_peptide_chang = "c_peptide_change"
  )
  # The population label comes from md5, the table the values are read from
  # (it was previously looked up in md4).
  population <- colnames(md5)[i]
  value <- md5[[i]]

  correlate_one <- function(clinical_col) {
    clinical <- md5[[clinical_col]]
    # Drop incomplete pairs explicitly. The former `filter(!is.na(2))` tested
    # the constant 2 and therefore never removed a row.
    keep <- !is.na(value) & !is.na(clinical)
    ct <- cor.test(value[keep], clinical[keep])
    data.frame(
      population = population,
      cor = ct$estimate,
      pval = ct$p.value,
      padj = min(1, ct$p.value * n_tests),
      test = clinical_vars[[clinical_col]]
    )
  }

  do.call(rbind, lapply(names(clinical_vars), correlate_one))
}

In [None]:
# One result table per L3 population column.
populations_corr_l3 <- future_map(population_colnames_l3, calc_correlation)

In [None]:
populations_corr <- bind_rows(populations_corr_l3)

In [None]:
# Rows whose population name contains "Treg".
filter(populations_corr, grepl(populations_corr$population, pattern = "Treg"))

In [None]:
options(repr.plot.width = 12, repr.plot.height = 5)

# Distinct Treg populations among the 12 Treg result rows with the smallest
# p-values. Every population has one row per readout, so without unique() the
# same three-panel figure was drawn once per qualifying row; head() also avoids
# the NA entries that `[1:12]` produced when fewer than 12 rows matched.
treg_populations <- populations_corr %>%
  filter(grepl(population, pattern = "Treg")) %>%
  arrange(pval) %>%
  pull(population) %>%
  head(12) %>%
  unique()

j <- 1
for (i in treg_populations) {
  # Population frequency plus the three C-peptide readouts, selected by name.
  df2 <- md5 %>%
    dplyr::select(
      value = all_of(i),
      fasting_cpept_T1,
      fasting_cpept_1,
      c_peptide_change = c_peptide_chang
    )

  p1 <- df2 %>%
    ggplot(aes(x = value, y = fasting_cpept_T1)) +
    geom_point(shape = 16, size = 2) +
    geom_smooth(method = lm, alpha = 0.2) +
    ggtitle(gsub(i, pattern = "L3 CD4 T cells---Treg---", replacement = " ")) +
    stat_cor(size = 7) + theme_classic() + ggtheme()

  p2 <- df2 %>%
    ggplot(aes(x = value, y = fasting_cpept_1)) +
    geom_point(shape = 16, size = 2) +
    geom_smooth(method = lm, alpha = 0.2) +
    ggtitle(" ") +
    stat_cor(size = 7) + theme_classic() + ggtheme()

  p3 <- df2 %>%
    ggplot(aes(x = value, y = c_peptide_change)) +
    geom_point(shape = 16, size = 2) +
    geom_smooth(method = lm, alpha = 0.2) +
    ggtitle(" ") +
    stat_cor(size = 7) + theme_classic() + ggtheme()

  print(p1 + p2 + p3)
  j <- j + 1
#library(svglite)
#dir.create("../figures/correlation_populations/")
#ggsave(filename = paste0("../figures/correlation_populations/",i,".svg"), width = 5, height = 4.5)
}

### Age

In [None]:
md3

In [None]:
population_colnames  <- which(substr(colnames(md4),start = 1,stop = 1) == "L")

In [None]:
population_colnames  %>% length

In [None]:
which(colnames(md4) == "Age")

In [None]:
# Correlation test between one population column of the global `md4` and an
# outcome column of `md4`.
#
# Args:
#   i:           column index of the population in `md4`.
#   outcome_col: column index (or name) of the outcome in `md4`. Defaults to 5,
#                the position that was hard-coded here (the preceding cell
#                looks up the position of "Age" - presumably this column).
#   n_tests:     multiplier for the Bonferroni-style adjustment (default 102,
#                the constant used previously).
# Returns: a one-row data.frame with population, cor, pval and padj (capped at 1).
calc_correlation <- function(i, outcome_col = 5, n_tests = 102) {
    # The old `filter(!is.na(5))` tested the literal 5 and never dropped a row;
    # test the outcome column itself.
    keep <- !is.na(md4[[outcome_col]])
    value <- md4[[i]][keep]
    outcome <- md4[[outcome_col]][keep]
    cor <- cor.test(value, outcome)
    data.frame(population = colnames(md4)[i],
               cor = cor$estimate,
               pval = cor$p.value,
               padj = min(1, cor$p.value * n_tests))
}

In [None]:
populations_corr  <- future_map(population_colnames, calc_correlation)

In [None]:
populations_corr_age  <- bind_rows(populations_corr)

In [None]:
populations_corr_age  %>% arrange(pval)

In [None]:
options(repr.plot.width = 4, repr.plot.height = 4)
j = 1
# Scatter plot (linear fit + correlation label) of column 5 of md4, labelled
# Age, against each of the ten populations with the smallest p-values.
# head() replaces `[1:10]`, which pads with NA when fewer than ten populations
# exist and then breaks the column lookup below.
for (i in head(populations_corr_age %>% arrange(pval) %>% pull(population), 10)) {
    df2 <- md4 %>% dplyr::select(which(colnames(md4) == i), Age = 5)
    colnames(df2) <- c("value", "Age")
    p <- df2 %>%
        ggplot(aes(x = value, y = Age)) +
        geom_point(shape = 16, size = 2) +
        geom_smooth(method = lm, alpha = 0.2) +
        ggtitle(gsub(str_to_upper(i), pattern = "_", replacement = " ")) +
        stat_cor(size = 7) + theme_classic() + ggtheme()
    print(p)
    j = j + 1
#library(svglite)
#dir.create("../figures/correlation_populations/")
#ggsave(filename = paste0("../figures/correlation_populations/",i,".svg"), width = 5, height = 4.5)
    }

## All correlations plot

In [None]:
populations_corr_age$test  <- "Age" 
populations_corr_cpept_change$test  <- "cpept_change"
populations_corr_fasting_cpept$test  <- "fasting_cpept"
populations_corr_idaa1c$test  <- "idaa1c"
populations_corr_pH$test  <- "pH"


In [None]:
all_correlations  <- rbind(populations_corr_age, populations_corr_cpept_change,
                       populations_corr_fasting_cpept, populations_corr_idaa1c,
                      populations_corr_pH)

In [None]:
options(repr.plot.width = 14, repr.plot.height = 30)

all_correlations  %>% ggplot(aes(x = test, y = population)) +
geom_point(aes(size = -log(pval), color = cor)) + 
scale_color_gradient2(low = "blue", mid = "white", high = "red") + 
theme_classic() + ggtheme() + theme(axis.text.x = element_text(angle = 90))

In [None]:
matrix  <- all_correlations  %>% dplyr::select(cor, population, test)  %>% 
pivot_wider(names_from = "population", values_from = "cor")  %>% 
column_to_rownames("test")  %>% as.matrix  %>% t()

In [None]:
library(corrplot)

In [None]:
# Cluster the population-by-test correlation matrix and write the heatmap to a
# PDF. The returned object is kept in `hm`; its row dendrogram order
# (hm$tree_row$order) is what the ordered dot plots read.
# NOTE(review): pheatmap documents `width`/`height` in inches and lists no
# `units` argument, so `units = "cm"` probably does not make this 7 x 30 cm -
# confirm the intended page size.
hm  <- pheatmap(matrix, filename = "../figures/population_correlations_clust.pdf", width = 7, height = 30, units = "cm")

In [None]:
options(repr.plot.width = 14, repr.plot.height = 30)
# Dot plot of every population/test correlation: colour = correlation
# estimate, size = -log(p-value), rows ordered by the heatmap row clustering.
# rownames(matrix)[order] is used instead of rownames(matrix[order, ]) so the
# lookup also works if the matrix has a single column (no dimension drop).
all_correlations %>%
  ggplot(aes(x = test,
             y = factor(population,
                        levels = rownames(matrix)[hm$tree_row[["order"]]]))) +
  geom_point(aes(size = -log(pval), color = cor)) +
  scale_size(range = c(4, 10)) +
  scale_color_gradient2(low = "blue", mid = "white", high = "red") +
  theme_classic() + ggtheme() +
  theme(axis.text.x = element_text(angle = 90))

# Make sure the target folder exists before saving; no-op if it already does.
dir.create("../figures/populations_correlations/", showWarnings = FALSE, recursive = TRUE)
ggsave("../figures/populations_correlations/population_correlations_L3.png", width = 40, height = 75, units = "cm")
ggsave("../figures/populations_correlations/population_correlations_L3.svg", width = 40, height = 75, units = "cm")

In [None]:
options(repr.plot.width = 10, repr.plot.height = 8)

# Same dot plot as for all populations, restricted to rows whose population
# name contains "L2 C"; rows keep the heatmap clustering order.
# rownames(matrix)[order] avoids the dimension drop of matrix[order, ] when
# the matrix has a single column.
all_correlations %>%
  dplyr::filter(grepl(population, pattern = "L2 C")) %>%
  ggplot(aes(x = test,
             y = factor(population,
                        levels = rownames(matrix)[hm$tree_row[["order"]]]))) +
  geom_point(aes(size = -log(pval), color = cor)) +
  scale_size(range = c(4, 10)) +
  scale_color_gradient2(low = "blue", mid = "white", high = "red") +
  theme_classic() + ggtheme() +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("") + ylab("")

# Make sure the target folder exists before saving; no-op if it already does.
dir.create("../figures/populations_correlations/", showWarnings = FALSE, recursive = TRUE)
ggsave("../figures/populations_correlations/population_correlations_L2.png", width = 24, height = 20, units = "cm")
ggsave("../figures/populations_correlations/population_correlations_L2.svg", width = 24, height = 20, units = "cm")

In [None]:
# Export the per-outcome correlation tables as CSV.
# showWarnings = FALSE keeps reruns quiet when the folder already exists;
# recursive = TRUE also creates a missing parent folder.
dir.create("../tables/population_correlations/", showWarnings = FALSE, recursive = TRUE)
write.csv(populations_corr_age, "../tables/population_correlations/populations_corr_age.csv")
write.csv(populations_corr_cpept_change, "../tables/population_correlations/populations_corr_cpept_change.csv")
write.csv(populations_corr_fasting_cpept, "../tables/population_correlations/populations_corr_fasting_cpept.csv")
write.csv(populations_corr_idaa1c, "../tables/population_correlations/populations_corr_idaa1c.csv")
write.csv(populations_corr_pH, "../tables/population_correlations/populations_corr_pH.csv")

In [None]:
IDs

## Freq from parent

### Correlation of populations with c_peptide change

#### C-peptide change (percent)

In [None]:
populations_2  %>% colnames

In [None]:
populations_2  %>% ncol

In [None]:
# Reshape populations_2 into a wide table: after keeping only rows with
# prelim_final == "Final", every distinct `annotations` value becomes its own
# column holding freq_from_parent.
md3  <- populations_2  %>% 
ungroup  %>% 
dplyr::select(-Main)  %>% 
# Prefix the annotation with its Level; the prefixed names become the wide
# column names (later cells find them by their leading "L").
mutate(annotations = paste(Level, annotations))  %>% 
# NOTE(review): columns are picked by position (after dropping Main), so this
# silently selects different columns if populations_2 changes layout - compare
# against the colnames() output printed above.
dplyr::select(6,7,10,12,2,22,19)  %>% 
dplyr::filter(prelim_final == "Final")  %>% 
unique  %>% 
pivot_wider(names_from = "annotations", values_from = "freq_from_parent")

In [None]:
md3

In [None]:
md_cpept  <- read_xlsx("../data/metadata_v05.xlsx") 

In [None]:
md_cpept  <- read_xlsx("../data/metadata_v05.xlsx")  %>% 
mutate(Patient_Time = paste(patient, time_taken))  %>% 
       dplyr::select(Patient_Time, c_peptide_change_perc)  %>% unique

In [None]:
md_cpept

In [None]:
md3  <- md3  %>% separate(Condition, into = c("Disease", "Time"), remove = F, sep = " ")

In [None]:
md3$Patient_Time  <- paste(md3$Patient_ID, md3$Time)

In [None]:
md4  <- md3  %>% left_join(md_cpept)  %>% dplyr::filter(!is.na(c_peptide_change_perc))

In [None]:
md4 

In [None]:
population_colnames  <- which(substr(colnames(md3),start = 1,stop = 1) == "L")

In [None]:
population_colnames  %>% length

In [None]:
which(colnames(md4) == "c_peptide_change_perc")

In [None]:
# Correlation test between one population column of the global `md4` and an
# outcome column of `md4`.
#
# Args:
#   i:           column index of the population in `md4`.
#   outcome_col: column index (or name) of the outcome in `md4`. Defaults to
#                111, the position that was hard-coded here (the preceding cell
#                looks up "c_peptide_change_perc" - presumably this column).
#   n_tests:     multiplier for the Bonferroni-style adjustment (default 102,
#                the constant used previously).
# Returns: a one-row data.frame with population, cor, pval and padj (capped at 1).
calc_correlation <- function(i, outcome_col = 111, n_tests = 102) {
    # The old `filter(!is.na(111))` tested the literal 111 and never dropped a
    # row; test the outcome column itself.
    keep <- !is.na(md4[[outcome_col]])
    value <- md4[[i]][keep]
    outcome <- md4[[outcome_col]][keep]
    cor <- cor.test(value, outcome)
    data.frame(population = colnames(md4)[i],
               cor = cor$estimate,
               pval = cor$p.value,
               padj = min(1, cor$p.value * n_tests))
}

In [None]:
populations_corr  <- future_map(population_colnames[c(3:47,50,54:length(population_colnames))], calc_correlation)

In [None]:
populations_corr  <- bind_rows(populations_corr)

In [None]:
populations_corr  %>% arrange(pval)

In [None]:
options(repr.plot.width = 4, repr.plot.height = 4)
j = 1
# Scatter plot (linear fit + correlation label) of column 111 of md4, labelled
# c_peptide_change_perc, against each of the ten populations with the smallest
# p-values. head() replaces `[1:10]`, which pads with NA when fewer than ten
# populations exist and then breaks the column lookup below.
for (i in head(populations_corr %>% arrange(pval) %>% pull(population), 10)) {
    df2 <- md4 %>% dplyr::select(which(colnames(md4) == i), c_peptide_change_perc = 111)
    colnames(df2) <- c("value", "c_peptide_change_perc")
    p <- df2 %>%
        ggplot(aes(x = value, y = c_peptide_change_perc)) +
        geom_point(shape = 16, size = 2) +
        geom_smooth(method = lm, alpha = 0.2) +
        ggtitle(gsub(str_to_upper(i), pattern = "_", replacement = " ")) +
        stat_cor(size = 7) + theme_classic() + ggtheme()
    print(p)
    j = j + 1
#library(svglite)
#dir.create("../figures/correlation_populations/")
#ggsave(filename = paste0("../figures/correlation_populations/",i,".svg"), width = 5, height = 4.5)
    }

In [None]:
dir.create("../tables/population_dynamics_pct_from_parent/")

In [None]:
population_corr  <- populations_corr  %>% arrange(pval)
write.csv(population_corr, "../tables/population_dynamics_pct_from_parent/corr_cpept_change_perc.csv", row.names = F)

#### C-peptide change

In [None]:
md3

In [None]:
md_cpept  <- read_xlsx("../data/metadata_v05.xlsx")  %>% 
mutate(Patient_Time = paste(patient, time_taken))  %>% 
       dplyr::select(Patient_Time, c_peptide_change)  %>% unique

In [None]:
md_cpept

In [None]:
md4  <- md3  %>% left_join(md_cpept)  %>% dplyr::filter(!is.na(c_peptide_change))

In [None]:
md4 

In [None]:
population_colnames  <- which(substr(colnames(md4),start = 1,stop = 1) == "L")

In [None]:
population_colnames  %>% length

In [None]:
which(colnames(md4) == "c_peptide_change")

In [None]:
# Correlation test between one population column of the global `md4` and an
# outcome column of `md4`.
#
# Args:
#   i:           column index of the population in `md4`.
#   outcome_col: column index (or name) of the outcome in `md4`. Defaults to
#                111, the position that was hard-coded here (the preceding cell
#                looks up "c_peptide_change" - presumably this column).
#   n_tests:     multiplier for the Bonferroni-style adjustment (default 102,
#                the constant used previously).
# Returns: a one-row data.frame with population, cor, pval and padj (capped at 1).
calc_correlation <- function(i, outcome_col = 111, n_tests = 102) {
    # The old `filter(!is.na(111))` tested the literal 111 and never dropped a
    # row; test the outcome column itself.
    keep <- !is.na(md4[[outcome_col]])
    value <- md4[[i]][keep]
    c_peptide_change <- md4[[outcome_col]][keep]
    cor <- cor.test(value, c_peptide_change)
    data.frame(population = colnames(md4)[i],
               cor = cor$estimate,
               pval = cor$p.value,
               padj = min(1, cor$p.value * n_tests))
}

In [None]:
populations_corr  <- future_map(population_colnames[c(3:47,54:length(population_colnames))], calc_correlation)

In [None]:
populations_corr_cpept_change  <- bind_rows(populations_corr)

In [None]:
populations_corr_cpept_change  %>% arrange(pval)

In [None]:
options(repr.plot.width = 4, repr.plot.height = 4)
j = 1
# Scatter plot (linear fit + correlation label) of column 111 of md4 against
# each of the ten populations with the smallest p-values.
# In this section md4 is joined with c_peptide_change (not the percentage), so
# the outcome is labelled c_peptide_change; the old c_peptide_change_perc
# label mis-titled the y axis.
# head() replaces `[1:10]`, which pads with NA when fewer than ten populations
# exist and then breaks the column lookup below.
for (i in head(populations_corr_cpept_change %>% arrange(pval) %>% pull(population), 10)) {
    df2 <- md4 %>% dplyr::select(which(colnames(md4) == i), c_peptide_change = 111)
    colnames(df2) <- c("value", "c_peptide_change")
    p <- df2 %>%
        ggplot(aes(x = value, y = c_peptide_change)) +
        geom_point(shape = 16, size = 2) +
        geom_smooth(method = lm, alpha = 0.2) +
        ggtitle(gsub(str_to_upper(i), pattern = "_", replacement = " ")) +
        stat_cor(size = 7) + theme_classic() + ggtheme()
    print(p)
    j = j + 1
#library(svglite)
#dir.create("../figures/correlation_populations/")
#ggsave(filename = paste0("../figures/correlation_populations/",i,".svg"), width = 5, height = 4.5)
    }

In [None]:
population_corr  <- populations_corr_cpept_change  %>% arrange(pval)
write.csv(population_corr, "../tables/population_dynamics_pct_from_parent/corr_cpept_change.csv", row.names = F)

### IDAA1c

In [None]:
md3

In [None]:
md_idaa1c  <- read_xlsx("../data/metadata_v05.xlsx")  %>% 
mutate(Patient_Time = paste(patient, time_taken))  %>% 
       dplyr::select(Patient_Time, idaa1c)  %>% unique

In [None]:
md_idaa1c

In [None]:
md_idaa1c  <- md_idaa1c  %>% dplyr::filter(!is.na(idaa1c))  %>% 
mutate(Patient_Time = gsub(Patient_Time, pattern = "T1", replacement = "T0"))

In [None]:
md_idaa1c  %>% nrow

In [None]:
md4  <- md3  %>% left_join(md_idaa1c)  %>% dplyr::filter(!is.na(idaa1c))

In [None]:
md4 

In [None]:
population_colnames  <- which(substr(colnames(md4),start = 1,stop = 1) == "L")

In [None]:
population_colnames  %>% length

In [None]:
which(colnames(md4) == "idaa1c")

In [None]:
# Correlation test between one population column of the global `md4` and an
# outcome column of `md4`.
#
# Args:
#   i:           column index of the population in `md4`.
#   outcome_col: column index (or name) of the outcome in `md4`. Defaults to
#                111, the position that was hard-coded here (the preceding cell
#                looks up "idaa1c" - presumably this column).
#   n_tests:     multiplier for the Bonferroni-style adjustment (default 102,
#                the constant used previously).
# Returns: a one-row data.frame with population, cor, pval and padj (capped at 1).
calc_correlation <- function(i, outcome_col = 111, n_tests = 102) {
    # The old `filter(!is.na(111))` tested the literal 111 and never dropped a
    # row; test the outcome column itself.
    keep <- !is.na(md4[[outcome_col]])
    value <- md4[[i]][keep]
    idaa1c <- md4[[outcome_col]][keep]
    cor <- cor.test(value, idaa1c)
    data.frame(population = colnames(md4)[i],
               cor = cor$estimate,
               pval = cor$p.value,
               padj = min(1, cor$p.value * n_tests))
}

In [None]:
populations_corr  <- future_map(population_colnames[c(3:47,54:length(population_colnames))], calc_correlation)

In [None]:
populations_corr_idaa1c  <- bind_rows(populations_corr)

In [None]:
populations_corr_idaa1c  %>% arrange(pval)

In [None]:
options(repr.plot.width = 4, repr.plot.height = 4)
j = 1
# Scatter plot (linear fit + correlation label) of column 111 of md4, labelled
# idaa1c, against each of the ten populations with the smallest p-values.
# head() replaces `[1:10]`, which pads with NA when fewer than ten populations
# exist and then breaks the column lookup below.
for (i in head(populations_corr_idaa1c %>% arrange(pval) %>% pull(population), 10)) {
    df2 <- md4 %>% dplyr::select(which(colnames(md4) == i), idaa1c = 111)
    colnames(df2) <- c("value", "idaa1c")
    p <- df2 %>%
        ggplot(aes(x = value, y = idaa1c)) +
        geom_point(shape = 16, size = 2) +
        geom_smooth(method = lm, alpha = 0.2) +
        ggtitle(gsub(str_to_upper(i), pattern = "_", replacement = " ")) +
        stat_cor(size = 7) + theme_classic() + ggtheme()
    print(p)
    j = j + 1
#library(svglite)
#dir.create("../figures/correlation_populations/")
#ggsave(filename = paste0("../figures/correlation_populations/",i,".svg"), width = 5, height = 4.5)
    }

In [None]:
population_corr  <- populations_corr_idaa1c  %>% arrange(pval)
write.csv(population_corr, "../tables/population_dynamics_pct_from_parent/corr_idaa1c.csv", row.names = F)

### Fasting C-peptide

In [None]:
md3

In [None]:
md_fasting  <- read_xlsx("../data/metadata_v05.xlsx")  %>% 
mutate(Patient_Time = paste(patient, time_taken))  %>% 
       dplyr::select(Patient_Time, fasting_cpept_1)  %>% unique

In [None]:
md_fasting

In [None]:
md4  <- md3  %>% left_join(md_fasting)  %>% dplyr::filter(!is.na(fasting_cpept_1))

In [None]:
md4 

In [None]:
population_colnames  <- which(substr(colnames(md4),start = 1,stop = 1) == "L")

In [None]:
population_colnames  %>% length

In [None]:
which(colnames(md4) == "fasting_cpept_1")

In [None]:
# Correlation test between one population column of the global `md4` and an
# outcome column of `md4`.
#
# Args:
#   i:           column index of the population in `md4`.
#   outcome_col: column index (or name) of the outcome in `md4`. Defaults to
#                111, the position that was hard-coded here (the preceding cell
#                looks up "fasting_cpept_1" - presumably this column).
#   n_tests:     multiplier for the Bonferroni-style adjustment (default 102,
#                the constant used previously).
# Returns: a one-row data.frame with population, cor, pval and padj (capped at 1).
calc_correlation <- function(i, outcome_col = 111, n_tests = 102) {
    # The old `filter(!is.na(111))` tested the literal 111 and never dropped a
    # row; test the outcome column itself.
    keep <- !is.na(md4[[outcome_col]])
    value <- md4[[i]][keep]
    fasting_cpept_1 <- md4[[outcome_col]][keep]
    cor <- cor.test(value, fasting_cpept_1)
    data.frame(population = colnames(md4)[i],
               cor = cor$estimate,
               pval = cor$p.value,
               padj = min(1, cor$p.value * n_tests))
}

In [None]:
populations_corr  <- future_map(population_colnames[c(3:47,54:length(population_colnames))], calc_correlation)

In [None]:
populations_corr_fasting_cpept  <- bind_rows(populations_corr)

In [None]:
populations_corr_fasting_cpept  %>% arrange(pval)

In [None]:
options(repr.plot.width = 4, repr.plot.height = 4)
j = 1
# Scatter plot (linear fit + correlation label) of column 111 of md4, labelled
# fasting_cpept_1, against each of the ten populations with the smallest
# p-values. The column is named fasting_cpept_1 directly in select(); the old
# temporary name `idaa1c` was a copy-paste leftover.
# head() replaces `[1:10]`, which pads with NA when fewer than ten populations
# exist and then breaks the column lookup below.
for (i in head(populations_corr_fasting_cpept %>% arrange(pval) %>% pull(population), 10)) {
    df2 <- md4 %>% dplyr::select(which(colnames(md4) == i), fasting_cpept_1 = 111)
    colnames(df2) <- c("value", "fasting_cpept_1")
    p <- df2 %>%
        ggplot(aes(x = value, y = fasting_cpept_1)) +
        geom_point(shape = 16, size = 2) +
        geom_smooth(method = lm, alpha = 0.2) +
        ggtitle(gsub(str_to_upper(i), pattern = "_", replacement = " ")) +
        stat_cor(size = 7) + theme_classic() + ggtheme()
    print(p)
    j = j + 1
#library(svglite)
#dir.create("../figures/correlation_populations/")
#ggsave(filename = paste0("../figures/correlation_populations/",i,".svg"), width = 5, height = 4.5)
    }

In [None]:
# Sort the fasting C-peptide correlations by p-value and export them.
# The sorted table must be stored in `population_corr` (the object that is
# written); it used to be assigned to `populations_corr`, so the CSV received
# whatever an earlier cell had left in `population_corr`.
population_corr <- populations_corr_fasting_cpept %>% arrange(pval)
write.csv(population_corr, "../tables/population_dynamics_pct_from_parent/corr_fasting_cpept.csv", row.names = F)

### Blood pH at time 0

In [None]:
md_ph  <- read_xlsx("../data/metadata_v05.xlsx")  %>% 
mutate(Patient_Time = paste(patient, time_taken))  %>% 
       dplyr::select(Patient_Time, ph_man)  %>% unique

In [None]:
md4  <- md3  %>% left_join(md_ph)  %>% dplyr::filter(!is.na(ph_man))

md4 

In [None]:
population_colnames  <- which(substr(colnames(md4),start = 1,stop = 1) == "L")

population_colnames  %>% length

In [None]:
which(colnames(md4) == "ph_man")

In [None]:
ncol(md4)

In [None]:
# Correlation test between one population column of the global `md4` and an
# outcome column of `md4`.
#
# Args:
#   i:           column index of the population in `md4`.
#   outcome_col: column index (or name) of the outcome in `md4`. Defaults to
#                111, the position that was hard-coded here (the preceding cell
#                looks up "ph_man" - presumably this column).
#   n_tests:     multiplier for the Bonferroni-style adjustment (default 102,
#                the constant used previously).
# Returns: a one-row data.frame with population, cor, pval and padj (capped at 1).
calc_correlation <- function(i, outcome_col = 111, n_tests = 102) {
    # The old `filter(!is.na(111))` tested the literal 111 and never dropped a
    # row; test the outcome column itself.
    keep <- !is.na(md4[[outcome_col]])
    value <- md4[[i]][keep]
    ph_man <- md4[[outcome_col]][keep]
    cor <- cor.test(value, ph_man)
    data.frame(population = colnames(md4)[i],
               cor = cor$estimate,
               pval = cor$p.value,
               padj = min(1, cor$p.value * n_tests))
}

populations_corr  <- future_map(population_colnames[c(3:47,54:length(population_colnames))], calc_correlation)
populations_corr_pH  <- bind_rows(populations_corr)

populations_corr_pH  %>% arrange(pval)

In [None]:
options(repr.plot.width = 4, repr.plot.height = 4)
j = 1
# Scatter plot (linear fit + correlation label) of column 111 of md4, labelled
# ph_man, against each of the ten populations with the smallest p-values.
# The column is named ph_man directly in select(); the old temporary name
# `idaa1c` was a copy-paste leftover.
# head() replaces `[1:10]`, which pads with NA when fewer than ten populations
# exist and then breaks the column lookup below.
for (i in head(populations_corr_pH %>% arrange(pval) %>% pull(population), 10)) {
    df2 <- md4 %>% dplyr::select(which(colnames(md4) == i), ph_man = 111)
    colnames(df2) <- c("value", "ph_man")
    p <- df2 %>%
        ggplot(aes(x = value, y = ph_man)) +
        geom_point(shape = 16, size = 2) +
        geom_smooth(method = lm, alpha = 0.2) +
        ggtitle(gsub(str_to_upper(i), pattern = "_", replacement = " ")) +
        stat_cor(size = 7) + theme_classic() + ggtheme()
    print(p)
    j = j + 1
#library(svglite)
#dir.create("../figures/correlation_populations/")
#ggsave(filename = paste0("../figures/correlation_populations/",i,".svg"), width = 5, height = 4.5)
    }

In [None]:
# Sort the pH correlations by p-value and export them.
# The sorted table must be stored in `population_corr` (the object that is
# written); it used to be assigned to `populations_corr`, so the CSV received
# whatever an earlier cell had left in `population_corr`.
population_corr <- populations_corr_pH %>% arrange(pval)
write.csv(population_corr, "../tables/population_dynamics_pct_from_parent/corr_ph.csv", row.names = F)

### Age

In [None]:
md3

In [None]:
population_colnames  <- which(substr(colnames(md4),start = 1,stop = 1) == "L")

In [None]:
population_colnames  %>% length

In [None]:
which(colnames(md4) == "Age")

In [None]:
# Correlation test between one population column of the global `md4` and an
# outcome column of `md4`.
#
# Args:
#   i:           column index of the population in `md4`.
#   outcome_col: column index (or name) of the outcome in `md4`. Defaults to 5,
#                the position that was hard-coded here (the preceding cell
#                looks up the position of "Age" - presumably this column).
#   n_tests:     multiplier for the Bonferroni-style adjustment (default 102,
#                the constant used previously).
# Returns: a one-row data.frame with population, cor, pval and padj (capped at 1).
# NOTE(review): `md4` at this point is whatever the previous section left
# behind - confirm that is the intended sample set for the Age correlation.
calc_correlation <- function(i, outcome_col = 5, n_tests = 102) {
    # The old `filter(!is.na(5))` tested the literal 5 and never dropped a row;
    # test the outcome column itself.
    keep <- !is.na(md4[[outcome_col]])
    value <- md4[[i]][keep]
    outcome <- md4[[outcome_col]][keep]
    cor <- cor.test(value, outcome)
    data.frame(population = colnames(md4)[i],
               cor = cor$estimate,
               pval = cor$p.value,
               padj = min(1, cor$p.value * n_tests))
}

In [None]:
populations_corr  <- future_map(population_colnames[c(3:47,54:length(population_colnames))], calc_correlation)


In [None]:
populations_corr_age  <- bind_rows(populations_corr)

In [None]:
populations_corr_age  %>% arrange(pval)

In [None]:
options(repr.plot.width = 4, repr.plot.height = 4)
j = 1
# Scatter plot (linear fit + correlation label) of column 5 of md4, labelled
# Age, against each of the ten populations with the smallest p-values.
# head() replaces `[1:10]`, which pads with NA when fewer than ten populations
# exist and then breaks the column lookup below.
for (i in head(populations_corr_age %>% arrange(pval) %>% pull(population), 10)) {
    df2 <- md4 %>% dplyr::select(which(colnames(md4) == i), Age = 5)
    colnames(df2) <- c("value", "Age")
    p <- df2 %>%
        ggplot(aes(x = value, y = Age)) +
        geom_point(shape = 16, size = 2) +
        geom_smooth(method = lm, alpha = 0.2) +
        ggtitle(gsub(str_to_upper(i), pattern = "_", replacement = " ")) +
        stat_cor(size = 7) + theme_classic() + ggtheme()
    print(p)
    j = j + 1
#library(svglite)
#dir.create("../figures/correlation_populations/")
#ggsave(filename = paste0("../figures/correlation_populations/",i,".svg"), width = 5, height = 4.5)
    }

In [None]:
# Sort the Age correlations by p-value and export them.
# The sorted table must be stored in `population_corr` (the object that is
# written); it used to be assigned to `populations_corr`, so the CSV received
# whatever an earlier cell had left in `population_corr`.
population_corr <- populations_corr_age %>% arrange(pval)
write.csv(population_corr, "../tables/population_dynamics_pct_from_parent/corr_age.csv", row.names = F)

## All correlations plot

In [None]:
populations_corr_age$test  <- "Age" 
populations_corr_cpept_change$test  <- "cpept_change"
populations_corr_fasting_cpept$test  <- "fasting_cpept"
populations_corr_idaa1c$test  <- "idaa1c"
populations_corr_pH$test  <- "pH"


In [None]:
all_correlations  <- rbind(populations_corr_age, populations_corr_cpept_change,
                       populations_corr_fasting_cpept, populations_corr_idaa1c,
                      populations_corr_pH)

In [None]:
options(repr.plot.width = 14, repr.plot.height = 30)

all_correlations  %>% ggplot(aes(x = test, y = population)) +
geom_point(aes(size = -log(pval), color = cor)) + 
scale_color_gradient2(low = "blue", mid = "white", high = "red") + 
theme_classic() + ggtheme() + theme(axis.text.x = element_text(angle = 90))

In [None]:
matrix  <- all_correlations  %>% dplyr::select(cor, population, test)  %>% 
pivot_wider(names_from = "population", values_from = "cor")  %>% 
column_to_rownames("test")  %>% as.matrix  %>% t()

In [None]:
library(corrplot)

In [None]:
# Cluster the population-by-test correlation matrix and write the heatmap to a
# PDF. The returned object is kept in `hm`; its row dendrogram order
# (hm$tree_row$order) is what the ordered dot plots read.
# NOTE(review): this writes to the same file name as the earlier heatmap cell,
# so that output is overwritten - confirm this is intended.
# NOTE(review): pheatmap documents `width`/`height` in inches and lists no
# `units` argument, so `units = "cm"` probably does not make this 7 x 30 cm -
# confirm the intended page size.
hm  <- pheatmap(matrix, filename = "../figures/population_correlations_clust.pdf", width = 7, height = 30, units = "cm")

In [None]:
options(repr.plot.width = 14, repr.plot.height = 30)
# Dot plot of every population/test correlation: colour = correlation
# estimate, size = -log(p-value), rows ordered by the heatmap row clustering.
# rownames(matrix)[order] is used instead of rownames(matrix[order, ]) so the
# lookup also works if the matrix has a single column (no dimension drop).
all_correlations %>%
  ggplot(aes(x = test,
             y = factor(population,
                        levels = rownames(matrix)[hm$tree_row[["order"]]]))) +
  geom_point(aes(size = -log(pval), color = cor)) +
  scale_size(range = c(4, 10)) +
  scale_color_gradient2(low = "blue", mid = "white", high = "red") +
  theme_classic() + ggtheme() +
  theme(axis.text.x = element_text(angle = 90))

# Make sure the target folder exists before saving; no-op if it already does.
dir.create("../figures/populations_correlations/", showWarnings = FALSE, recursive = TRUE)
ggsave("../figures/populations_correlations/population_correlations_L3.png", width = 40, height = 75, units = "cm")
ggsave("../figures/populations_correlations/population_correlations_L3.svg", width = 40, height = 75, units = "cm")

In [None]:
options(repr.plot.width = 10, repr.plot.height = 8)

# Same dot plot as for all populations, restricted to rows whose population
# name contains "L2 C"; rows keep the heatmap clustering order.
# rownames(matrix)[order] avoids the dimension drop of matrix[order, ] when
# the matrix has a single column.
all_correlations %>%
  dplyr::filter(grepl(population, pattern = "L2 C")) %>%
  ggplot(aes(x = test,
             y = factor(population,
                        levels = rownames(matrix)[hm$tree_row[["order"]]]))) +
  geom_point(aes(size = -log(pval), color = cor)) +
  scale_size(range = c(4, 10)) +
  scale_color_gradient2(low = "blue", mid = "white", high = "red") +
  theme_classic() + ggtheme() +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("") + ylab("")

# Make sure the target folder exists before saving; no-op if it already does.
dir.create("../figures/populations_correlations/", showWarnings = FALSE, recursive = TRUE)
ggsave("../figures/populations_correlations/population_correlations_L2.png", width = 24, height = 20, units = "cm")
ggsave("../figures/populations_correlations/population_correlations_L2.svg", width = 24, height = 20, units = "cm")

In [None]:
# Export the per-outcome correlation tables as CSV.
# showWarnings = FALSE keeps reruns quiet when the folder already exists;
# recursive = TRUE also creates a missing parent folder.
# NOTE(review): these are the same file names the earlier export cell writes,
# so those files are overwritten - confirm this is intended.
dir.create("../tables/population_correlations/", showWarnings = FALSE, recursive = TRUE)
write.csv(populations_corr_age, "../tables/population_correlations/populations_corr_age.csv")
write.csv(populations_corr_cpept_change, "../tables/population_correlations/populations_corr_cpept_change.csv")
write.csv(populations_corr_fasting_cpept, "../tables/population_correlations/populations_corr_fasting_cpept.csv")
write.csv(populations_corr_idaa1c, "../tables/population_correlations/populations_corr_idaa1c.csv")
write.csv(populations_corr_pH, "../tables/population_correlations/populations_corr_pH.csv")

# Correlation heatmaps in Dia

In [None]:
library(mice)

In [None]:
md5  <- read_xlsx("../data/metadata_v05.xlsx") 

In [None]:
colnames(md5)

In [None]:
# Derive HLA genotype summary columns on md5 from the per-allele columns:
#   dq2, dq8   "DQ2"/"DQ8" when one of the two DQA1 alleles AND one of the two
#              DQB1 alleles equal the listed allele strings, else "Other".
#   dq2_8      combination of the two: "DQ2_8", "DQ2", "DQ8" or "Other".
#   risk_hla   dq2_8 recoded with recode_factor() to 0 (Other), 1 (DQ8),
#              2 (DQ2), 3 (DQ2_8). The result is a factor, so as.numeric() on
#              it yields level codes, not these values.
#   c7_01,     homozygous / heterozygous / "Other" status for C*07:01:01 and
#   b8_01      B*08:01:01 (both alleles equal -> "_hom", either -> "_het").
#   c7_b8      combined C7/B8 status; homozygous states are tested before
#              heterozygous ones.
#   dr3, dr4   "DR3"/"DR4" when either DRB1 allele matches the pattern
#              "DRB1\\*03" / "DRB1\\*04"; dr3_dr4 combines them.
# NOTE(review): the `==` comparisons inside ifelse() give NA for missing
# alleles, so the dq*/c7*/b8* columns may be NA rather than "Other" for
# untyped samples - confirm this is intended.
md5  <- md5  %>% 
mutate(dq2 = ifelse((HLA_DQA11 == "DQA1*05:01:01" | HLA_DQA12 == "DQA1*05:01:01") &
                                   (HLA_DQB11 == "DQB1*02:01:01" | HLA_DQB12 == "DQB1*02:01:01"),"DQ2",
                                   "Other"),
                      dq8 = ifelse((HLA_DQA11 == "DQA1*03:01:01" | HLA_DQA12 == "DQA1*03:01:01") &
                                   (HLA_DQB11 == "DQB1*03:02:01" | HLA_DQB12 == "DQB1*03:02:01"),"DQ8",
                                   "Other"))  %>% mutate(
                      dq2_8 = ifelse(dq2 == "DQ2" & dq8 == "DQ8","DQ2_8",
                                     ifelse(dq2 == "DQ2","DQ2", ifelse(dq8 == "DQ8","DQ8","Other"))))  %>% 
mutate(risk_hla = recode_factor(dq2_8, "Other" = 0, "DQ8" = 1, "DQ2" = 2,"DQ2_8" = 3)) %>% 
mutate(c7_01 = ifelse((HLA_C1 == "C*07:01:01" & HLA_C2 == "C*07:01:01"), "c7_01_hom",
                                   ifelse(HLA_C1 == "C*07:01:01" | HLA_C2 == "C*07:01:01","c7_01_het",
                                   "Other")),
                      b8_01 = ifelse((HLA_B1 == "B*08:01:01" & HLA_B2 == "B*08:01:01"), "b8_01_hom",
                                   ifelse(HLA_B1 == "B*08:01:01" | HLA_B2 == "B*08:01:01","b8_01_het",
                                   "Other")))   %>% mutate(
                      c7_b8 = ifelse(b8_01 == "b8_01_hom" & c7_01 == "c7_01_hom",
                                     "c7_b8_hom", ifelse(c7_01 == "c7_01_hom", "c7_01_hom",
                              ifelse(b8_01 == "b8_01_hom", "b8_01_hom", 
                              ifelse(b8_01 == "b8_01_het" & c7_01 == "c7_01_het",
                                     "c7_b8_het", 
                              ifelse(b8_01 == "b8_01_het", "b8_01_het", 
                              ifelse(c7_01 == "c7_01_het", "c7_01_het", "Other" )))))))  %>% 
mutate(dr3 = ifelse((grepl(HLA_DRB11, pattern = "DRB1\\*03")) | (grepl(HLA_DRB12, pattern = "DRB1\\*03")),
                   "DR3", "Other"),
       dr4 = ifelse((grepl(HLA_DRB11, pattern = "DRB1\\*04")) | (grepl(HLA_DRB12, pattern = "DRB1\\*04")),
                   "DR4", "Other"),
       dr3_dr4 = ifelse(dr3 == "DR3" & dr4 == "DR4","DR3_DR4",
       ifelse(dr3 == "DR3","DR3", ifelse(dr4 == "DR4","DR4","Other"))))
       
       

In [None]:
# Print the md5 column names labelled with their position, as a lookup aid
# for position-based column selection.
# seq_along() instead of 1:length() so an empty name vector gives no labels
# rather than c(1, 0).
test <- colnames(md5)
names(test) <- seq_along(test)
test

In [None]:
# Keep a subset of the metadata columns for the imputation / correlation table.
# NOTE(review): columns are chosen by position; the numbers refer to the
# numbered column listing printed by the previous cell and will silently
# select other columns if the metadata sheet or the derived HLA columns
# change order.
md5_select  <- md5  %>% dplyr::select(6,11:14,19,21:24,26:37,39,40,42,43,44,74:91)

In [None]:
md5_select

In [None]:
md5_select  <- md5_select  %>% dplyr::filter(!(Patient_Time %in% c("206 T0","207 T0", "116 T1")))

In [None]:
populations_2_select  <- dplyr::select(populations_2, Patient_ID, Time, annotations, Condition, Sex, Age, Level, Experiment_ID, freq_from_total)

In [None]:
populations_3  <- populations_2_select  %>% 
mutate(Patient_Time = paste(Patient_ID, Time, Experiment_ID))  %>% 
mutate(annotations = paste(Level, annotations))  %>% 
dplyr::select(-Patient_ID, -Time, -Level)  


In [None]:
populations_3  %>% nrow

In [None]:
populations_3$Patient_Time %in% md5_select$Patient_Time  %>% table

In [None]:
populations_3

In [None]:
populations_4  <- populations_3 %>% 
pivot_wider(names_from = "annotations", values_from = "freq_from_total")  %>% 
dplyr::filter(Experiment_ID %in% c("Exp16","Exp18","Exp19","Exp20"))

In [None]:
md2  <- left_join(populations_4, md5_select)

In [None]:
## Check missing data
md.pattern(md2)

In [None]:
colnames(md2)

In [None]:
md2$Condition  %>% table

In [None]:
## Clean column names, otherwise error
colnames(md2) <- janitor::make_clean_names(colnames(md2))

In [None]:
colnames(md2) 

In [None]:
# Prepare md2 for imputation: sex becomes a factor, risk_hla becomes its
# numeric score, and condition is dropped.
md2 <- md2 %>%
    mutate(sex = as.factor(sex)) %>%
    # risk_hla is a factor (built with recode_factor, labels "0".."3").
    # as.numeric() on a factor returns the internal level codes (1..4), so
    # convert via as.character() to recover the intended 0..3 scores.
    mutate(risk_hla = as.numeric(as.character(risk_hla))) %>%
    # condition was previously converted to a factor and then dropped; the
    # conversion had no effect and is omitted.
    dplyr::select(-condition)
    

In [None]:
md3  <- md2

In [None]:
md.pattern(md3)

In [None]:
colnames(md.pattern(md3, plot = F)[,107:151])


In [None]:
str(md3[,c(1:10,100:121)])

In [None]:
init = mice(md3, maxit=0) 
meth = init$method
predM = init$predictorMatrix

In [None]:
predM

In [None]:
predM[colnames(predM),]=0

In [None]:
predM

In [None]:
predM[c("disease", "sex", "age","risk_hla"),]=1

In [None]:
predM[,c("disease", "sex", "age","risk_hla")]=1

In [None]:
predM

In [None]:
# A variable must not be used to predict itself: clear the whole diagonal of
# the predictor matrix. diag<- covers every variable regardless of how many
# columns md3 has, unlike the former hard-coded 1:121 loop.
diag(predM) <- 0

In [None]:
predM

In [None]:
colnames(md.pattern(md3, plot = F)[,c(105:113,117:120)])

In [None]:
colnames(md.pattern(md3, plot = F)[,c(114,115,116,121)])

In [None]:
colnames(md.pattern(md3, plot = F)[,105:122])

In [None]:
# Override the imputation method for selected variables: "norm" for the
# variables at md.pattern() column positions 105:113 and 117:120, "rf" for
# those at positions 114, 115, 116 and 121.
# NOTE(review): the positions index the columns of the md.pattern() output,
# which presumably are ordered by amount of missing data rather than as in
# md3 - verify the names against the listings printed in the cells above.
meth[colnames(md.pattern(md3, plot = F)[,c(105:113,117:120)])]="norm" 
meth[colnames(md.pattern(md3, plot = F)[,c(114,115,116,121)])]="rf" 

In [None]:
## Impute missing data with MICE package
# Uses the customised per-variable methods (`meth`) and predictor matrix
# (`predM`) and requests m = 5 imputed data sets.
# NOTE(review): no `seed` is passed, so imputed values presumably differ
# between runs - confirm whether the saved table needs to be reproducible.
imputed = mice(md3, method=meth, predictorMatrix=predM, m=5)

In [None]:
imputed <- complete(imputed)

In [None]:
md.pattern(imputed)

In [None]:
write.csv(imputed, "../tables/240214_09_Data_MICE_imputed.csv")

In [None]:
imputed  <- read_csv("../tables/240214_09_Data_MICE_imputed.csv")

In [None]:
imputed$`...1`  <- NULL

## Feature correlation matrix

In [None]:
library(corrplot)
Corr <- cor(select_if(imputed, is.numeric), use="complete.obs")

In [None]:
pdf(file = "../figures/correlation_matrix.pdf", width = 30, height = 30)

corrplot(Corr, 
title = "Correlation of features in patient samples", 
mar = c(0,0,1,0), number.cex = 0.5, number.digits = 2,
number.font = 2)

dev.off()

In [None]:
pdf(file = "../figures/correlation_matrix_clust.pdf", width = 30, height = 30)

corrplot(Corr, 
title = "Correlation of features in patient samples", 
mar = c(0,0,1,0), number.cex = 0.5, number.digits = 2,
number.font = 2, order = 'hclust', addrect = 20, 
        tl.col = "black")

dev.off()

## Feature correlation matrix with binary

In [None]:
library(ggcorrplot)

In [None]:
p.mat <- model.matrix(~0+., data=imputed) %>% 
   cor_pmat(use="pairwise.complete.obs")

In [None]:
p.mat

In [None]:
model.matrix(~0+., data=imputed) %>% 
  cor(use="pairwise.complete.obs") %>% 
  ggcorrplot(show.diag=FALSE, method = "circle", lab=T, 
             lab_size=2, hc.order = T, p.mat = p.mat, 
  type = "full", insig = "blank")

In [None]:
options(repr.plot.width= 40, repr.plot.height = 40)
model.matrix(~0+., data=imputed) %>% 
  cor(use="pairwise.complete.obs") %>% 
  ggcorrplot(show.diag=FALSE, method = "circle", type="full", lab=F, 
             lab_size=2, hc.order = T)

## Populations together

In [None]:
library(ggcorrplot)

In [None]:
imputed_populations  <- imputed  %>% dplyr::select(which(substr(colnames(imputed),start = 1,stop = 3) %in% c("cd4","cd8")))

In [None]:
imputed_populations

In [None]:
p.mat <- model.matrix(~0+., data=imputed_populations) %>% 
   cor_pmat(use="pairwise.complete.obs")

p  <- model.matrix(~0+., data=imputed_populations) %>% 
  cor(use="pairwise.complete.obs") %>% 
ggcorrplot(show.diag=FALSE, method = "circle", type="full", lab=F, 
             lab_size=2, hc.order = T, p.mat = p.mat, insig = "blank") + scale_size_continuous(range = c(0,7))

print(p)

In [None]:
levels(p$data$Var1)

### Populations and metadata in population order

In [None]:
colnames(imputed)

In [None]:
c(levels(p$data$Var1), colnames(imputed)[which(!(colnames(imputed) %in% levels(p$data$Var1)))])

In [None]:
levels(p$data$Var1)

In [None]:
colnames(imputed)[which(!(colnames(imputed) %in% levels(p$data$Var1)))[c(1,2,3,9,4,5,8,14,21,6,7,10:13,15:20,26)]]

In [None]:
colnames_order  <- colnames(imputed)[which(!(colnames(imputed) %in% levels(p$data$Var1)))[c(1,2,3,9,4,5,8,14,21,6,7,10:13,15:20,26)]]

In [None]:
p.mat <- model.matrix(~0+.,
                     data=imputed[,c(colnames_order,
                                 levels(p$data$Var1))])  %>% 
   cor_pmat(use="pairwise.complete.obs")


In [None]:
p2  <- model.matrix(~0+.,
                     data=imputed[,c(colnames_order,
                                 levels(p$data$Var1))])%>% 
  cor(use="pairwise.complete.obs") %>% 
ggcorrplot(show.diag=FALSE, method = "circle", type="full", lab=F, 
             lab_size=2, hc.order = F, p.mat = p.mat, insig = "blank") + scale_size_continuous(range = c(0,6)) + 
theme(axis.text.x = element_text(angle = 90))

print(p2)

In [None]:
library(svglite)
ggsave(filename = "../figures/big_correlation_all.png", width = 20, height = 20)
ggsave(filename = "../figures/big_correlation_all.svg", width = 20, height = 20)

In [None]:
str_to_upper(gsub(levels(p2$data$Var1), pattern = "_", replacement = " "))

## Populations clustering

# Model 

### Dia vs Ctrl T0

In [None]:
populations_2

In [None]:
imputed

In [None]:
colnames(imputed)

In [None]:
# Multiply `x` by 100 and return the result.
times100 <- function(x) {
    x * 100
}

In [None]:
model_table_dia_t0_ctrl_t0  <- imputed  %>% 
dplyr::filter(time == "T0" & disease !="Pre-Dia")   %>% 
dplyr::select(-time)

In [None]:
model_table_dia_t0_ctrl_t0

In [None]:
colnames(model_table_dia_t0_ctrl_t0)

In [None]:
mtx4  <- model_table_dia_t0_ctrl_t0

In [None]:
population_colnames  <- which(substr(colnames(model_table_dia_t0_ctrl_t0),start = 1,stop = 3) %in% c("cd4","cd8"))

In [None]:
population_colnames

In [None]:
# Compare one population column between the `disease` groups of `data`.
# Returns a one-row data.frame: Wilcoxon p-value and confidence limits, the
# group means, the sum of the two group SDs, and the ratio of group means.
# The group summaries are read by position: the first group in sort order is
# reported as ctrl, the second as dia.
compare_population <- function(col_idx, data) {
    df <- data %>%
        dplyr::select(disease, value = all_of(col_idx)) %>%
        mutate(value = as.numeric(value))
    wcx <- wilcox.test(df$value ~ df$disease, conf.int = TRUE)
    group_stats <- df %>%
        group_by(disease) %>%
        summarise(mean = mean(value), sd = sd(value))
    data.frame(name = colnames(data)[col_idx],
               pval = wcx$p.value,
               mean_dia = group_stats$mean[2],
               mean_ctrl = group_stats$mean[1],
               sd = group_stats$sd[1] + group_stats$sd[2],
               upper = wcx$conf.int[2],
               lower = wcx$conf.int[1]) %>%
        mutate(ratio = (mean_dia) / (mean_ctrl))
}

# One row per population column. Mapping over all columns replaces the
# duplicated first iteration plus `2:length(...)` loop (which misbehaves when
# there is a single population column) and avoids growing df_all with rbind
# inside a loop.
df_all <- do.call(rbind, lapply(population_colnames, compare_population, data = mtx4))

In [None]:
df_all  %>% arrange(pval)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 15)

# y-axis order: populations sorted by their Dia/Ctrl ratio.
levels <- df_all %>% arrange(ratio) %>% pull(name)

# Forest-style plot of the ratio of group means per population.
# NOTE(review): `upper`/`lower` from the Wilcoxon test are overwritten here with
# ratio +/- 1.96 * sd, where `sd` is the sum of the two group SDs. That is not a
# standard confidence interval for a ratio - confirm this is intended before
# labelling it "95% CI".
plot <- df_all %>%
  arrange(pval) %>%
  mutate(
    upper = ratio + 1.96 * sd,
    lower = ratio - 1.96 * sd
  ) %>%
  ggplot(aes(ratio, factor(name, levels = levels),
    color = ifelse(upper < 1, "1", ifelse(lower > 1, "2", "3"))
  )) +
  geom_vline(xintercept = 1, color = "gray75") +
  geom_linerange(aes(xmin = lower, xmax = upper), size = 1.5, alpha = 0.5) +
  geom_point(size = 4) +
  theme_minimal(base_size = 16) +
  # Named values: with an unnamed vector the colours are assigned by position,
  # so they shift whenever one of the three categories is absent from the data.
  scale_color_manual(
    values = c("1" = "green4", "2" = "red3", "3" = "grey"),
    guide = "none"
  ) +
  labs(
    title = "Change in populations", y = NULL,
    x = "Ratio estimate \n (95% CI)"
  ) +
  theme(axis.text.y = element_text(hjust = 0, size = 18), panel.grid = element_blank())

plot

In [None]:
# Create the output directory (and parents) if needed; stay quiet when it
# already exists so the cell can be re-run.
dir.create("../tables/population_dynamics/", showWarnings = FALSE, recursive = TRUE)

In [None]:
write.csv(df_all, "../tables/population_dynamics/freq_ctrl_t0_dia_t0_global.csv")

### Dia T1 vs Ctrl T0

In [None]:
imputed

In [None]:
# Diabetic samples at T1 together with control samples at T0; `time` is dropped
# afterwards.
model_table_dia_t1_ctrl_t0 <- imputed %>%
  dplyr::filter(
    (time == "T1" & disease == "Dia") | (time == "T0" & disease == "Ctrl")
  ) %>%
  dplyr::select(-time)

In [None]:
model_table_dia_t1_ctrl_t0

In [None]:
colnames(model_table_dia_t1_ctrl_t0)

In [None]:
mtx4  <- model_table_dia_t1_ctrl_t0

In [None]:
# Column positions of the population variables (names starting with cd4 / cd8).
population_colnames <- grep("^(cd4|cd8)", colnames(model_table_dia_t1_ctrl_t0))

In [None]:
population_colnames

In [None]:
# Dia T1 vs Ctrl T0: test every population column for a difference between the
# two disease groups and collect one summary row per population in `df_all`.
#
# Per population: Wilcoxon rank-sum test (p-value plus the confidence interval
# of the location shift), the two group means, the sum of the two group SDs and
# the ratio of the group means.
#
# All columns go through the same code path (previously the first column was
# handled by a copy-pasted block that, unlike the loop, coerced to numeric, and
# `2:length(x)` mis-indexed when only one population column was present).
#
# NOTE(review): group_by() orders the groups, so rows 1 / 2 of the summary are
# taken as control / diabetic. This assumes "Ctrl" sorts before "Dia" in
# `disease` - confirm if the factor levels are ever changed.
df_all <- lapply(population_colnames, function(i) {
  df <- mtx4 %>%
    dplyr::select(disease, value = all_of(i)) %>%
    mutate(value = as.numeric(value))
  wcx <- wilcox.test(value ~ disease, data = df, conf.int = TRUE)
  df2 <- df %>%
    group_by(disease) %>%
    summarise(mean = mean(value), sd = sd(value))
  data.frame(
    name = colnames(mtx4)[i],
    pval = wcx$p.value,
    mean_dia = df2$mean[2],
    mean_ctrl = df2$mean[1],
    sd = df2$sd[1] + df2$sd[2],
    upper = wcx$conf.int[2],
    lower = wcx$conf.int[1]
  ) %>%
    mutate(ratio = mean_dia / mean_ctrl)
}) %>%
  bind_rows()

In [None]:
df_all  %>% arrange(desc(ratio))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 15)

# y-axis order: populations sorted by their ratio of group means.
levels <- df_all %>% arrange(ratio) %>% pull(name)

# Forest-style plot of the ratio of group means per population.
# NOTE(review): `upper`/`lower` from the Wilcoxon test are overwritten here with
# ratio +/- 1.96 * sd, where `sd` is the sum of the two group SDs. That is not a
# standard confidence interval for a ratio - confirm this is intended before
# labelling it "95% CI".
plot <- df_all %>%
  arrange(pval) %>%
  mutate(
    upper = ratio + 1.96 * sd,
    lower = ratio - 1.96 * sd
  ) %>%
  ggplot(aes(ratio, factor(name, levels = levels),
    color = ifelse(upper < 1, "1", ifelse(lower > 1, "2", "3"))
  )) +
  geom_vline(xintercept = 1, color = "gray75") +
  geom_linerange(aes(xmin = lower, xmax = upper), size = 1.5, alpha = 0.5) +
  geom_point(size = 4) +
  theme_minimal(base_size = 16) +
  # Named values: with an unnamed vector the colours are assigned by position,
  # so they shift whenever one of the three categories is absent from the data.
  scale_color_manual(
    values = c("1" = "green4", "2" = "red3", "3" = "grey"),
    guide = "none"
  ) +
  labs(
    title = "Change in populations", y = NULL,
    x = "Ratio estimate \n (95% CI)"
  ) +
  theme(axis.text.y = element_text(hjust = 0, size = 18), panel.grid = element_blank())

plot

In [None]:
write.csv(df_all, "../tables/population_dynamics/freq_dia_t1_ctrl_t0_global.csv")

### Dia T1 vs Dia T0

In [None]:
imputed

In [None]:
# Diabetic samples at both time points (T0 and T1). The helper column
# `time_disease` and the now-constant `disease` column are dropped; `time` is
# kept.
# NOTE(review): the last filter removes T0 rows of male subjects with age == 3.
# Presumably this is a sample without a matching T1 measurement (the paired
# test below needs complete pairs) - confirm, and consider filtering by
# patient ID instead of by sex/age.
model_table_dia_t1_dia_t0  <- imputed  %>% 
mutate(time_disease = paste(time, disease))  %>% 
dplyr::filter(time_disease %in% c("T1 Dia", "T0 Dia"))   %>% 
dplyr::select(-time_disease, -disease) %>% 
dplyr::filter(!(sex == "M" & age == 3 & time == "T0"))

In [None]:
model_table_dia_t1_dia_t0

In [None]:
# Use the paired Dia T0 / Dia T1 table built above for this section (the
# previous assignment reused the Dia T1 vs Ctrl T0 table from the section
# before, so the "Dia T1 vs Dia T0" results were computed on the wrong samples).
mtx4 <- model_table_dia_t1_dia_t0

In [None]:
# Column positions of the population variables (names starting with cd4 / cd8).
# Derived from `mtx4`, the table that is actually indexed below, so the
# positions cannot drift from the data being analysed.
population_colnames <- which(substr(colnames(mtx4), start = 1, stop = 3) %in% c("cd4", "cd8"))

In [None]:
population_colnames

In [None]:
# Paired comparison for the first population column; the following cell appends
# the remaining columns to `df_all`.
i <- population_colnames[1]
df <- mtx4 %>%
  dplyr::select(value = all_of(i)) %>%
  mutate(value = as.numeric(value)) %>%
  pull(value)
# Reshape the values into two columns, filling row by row: consecutive rows of
# `mtx4` become one (V1, V2) pair.
# NOTE(review): this assumes the rows of `mtx4` alternate between the two
# measurements of the same subject, in the same order for every subject, and
# that the number of rows is even - nothing here checks either; verify the row
# order of `mtx4`.
mtx <- matrix(df, ncol = 2, byrow = TRUE) %>% as.data.frame()
wcx <- wilcox.test(mtx$V1, mtx$V2, paired = TRUE, conf.int = TRUE)
# Column names mirror the unpaired tables so the CSVs share one layout: here
# `mean_dia` is the mean of V1, `mean_ctrl` the mean of V2 and `sd` the SD of V1.
df_all <- data.frame(
  name = colnames(mtx4)[i],
  pval = wcx$p.value,
  mean_dia = mean(mtx$V1),
  mean_ctrl = mean(mtx$V2),
  sd = sd(mtx$V1),
  upper = wcx$conf.int[2],
  lower = wcx$conf.int[1]
) %>%
  mutate(ratio = mean_dia / mean_ctrl)
    

In [None]:
# Paired comparison for the remaining population columns, appended to the
# `df_all` row created for the first column.
# `population_colnames[-1]` is empty when there is only one column, whereas
# `2:length(x)` would then yield c(2, 1) and index a non-existent element.
# NOTE(review): consecutive rows of `mtx4` are treated as one (V1, V2) pair;
# this assumes the rows alternate between the two measurements of the same
# subject - verify the row order of `mtx4`.
df_all <- bind_rows(
  df_all,
  lapply(population_colnames[-1], function(i) {
    values <- mtx4 %>%
      dplyr::select(value = all_of(i)) %>%
      mutate(value = as.numeric(value)) %>%
      pull(value)
    pairs <- matrix(values, ncol = 2, byrow = TRUE) %>% as.data.frame()
    wcx <- wilcox.test(pairs$V1, pairs$V2, paired = TRUE, conf.int = TRUE)
    data.frame(
      name = colnames(mtx4)[i],
      pval = wcx$p.value,
      mean_dia = mean(pairs$V1),
      mean_ctrl = mean(pairs$V2),
      sd = sd(pairs$V1),
      upper = wcx$conf.int[2],
      lower = wcx$conf.int[1]
    ) %>%
      mutate(ratio = mean_dia / mean_ctrl)
  })
)

In [None]:
df_all  %>% arrange(pval)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 15)

# y-axis order: populations sorted by their ratio of means.
levels <- df_all %>% arrange(ratio) %>% pull(name)

# Forest-style plot of the ratio of means per population.
# NOTE(review): `upper`/`lower` from the Wilcoxon test are overwritten here with
# ratio +/- 1.96 * sd, where `sd` comes from the summary table. That is not a
# standard confidence interval for a ratio - confirm this is intended before
# labelling it "95% CI".
plot <- df_all %>%
  arrange(pval) %>%
  mutate(
    upper = ratio + 1.96 * sd,
    lower = ratio - 1.96 * sd
  ) %>%
  ggplot(aes(ratio, factor(name, levels = levels),
    color = ifelse(upper < 1, "1", ifelse(lower > 1, "2", "3"))
  )) +
  geom_vline(xintercept = 1, color = "gray75") +
  geom_linerange(aes(xmin = lower, xmax = upper), size = 1.5, alpha = 0.5) +
  geom_point(size = 4) +
  theme_minimal(base_size = 16) +
  # Named values: with an unnamed vector the colours are assigned by position,
  # so they shift whenever one of the three categories is absent from the data.
  scale_color_manual(
    values = c("1" = "green4", "2" = "red3", "3" = "grey"),
    guide = "none"
  ) +
  labs(
    title = "Change in populations", y = NULL,
    x = "Ratio estimate \n (95% CI)"
  ) +
  theme(axis.text.y = element_text(hjust = 0, size = 18), panel.grid = element_blank())

plot

In [None]:
write.csv(df_all, "../tables/population_dynamics/freq_dia_t1_dia_t0_global.csv")

### Populations - correlations with C-peptide

In [None]:
md3

In [None]:
# Column positions of the population variables (names starting with cd4 / cd8).
population_colnames <- grep("^(cd4|cd8)", colnames(md3))

In [None]:
population_colnames

In [None]:
which(colnames(md3) == "fasting_cpept_t1")

In [None]:
# Correlate one population column of `md3` with fasting C-peptide.
#
# i          Column position of the population in `md3`.
# target_col Column (position or name) of the variable to correlate against.
#            NOTE(review): the default 116 is presumably the position of
#            `fasting_cpept_t1` printed by the preceding cell - confirm;
#            passing the column name would be more robust.
# n_tests    Bonferroni factor for `padj` (default 40, as before); should equal
#            the number of populations tested.
#
# Returns a one-row data.frame: population, cor (Pearson estimate), pval, padj.
#
# Rows with a missing target value are dropped. The earlier
# `filter(!is.na(116))` tested the constant 116 and never removed anything;
# results were still correct only because cor.test() ignores incomplete pairs.
calc_correlation <- function(i, target_col = 116, n_tests = 40) {
  df <- md3 %>% dplyr::select(all_of(c(i, target_col)))
  colnames(df) <- c("value", "fasting_cpept_t1")
  df <- df %>% dplyr::filter(!is.na(fasting_cpept_t1))
  ct <- cor.test(df$value, df$fasting_cpept_t1)
  data.frame(
    population = colnames(md3)[i],
    cor = ct$estimate,
    pval = ct$p.value,
    padj = min(1, ct$p.value * n_tests)
  )
}

In [None]:
test  <- future_map(population_colnames, calc_correlation)

In [None]:
test2  <- bind_rows(test)

In [None]:
test2  %>% arrange(pval)

In [None]:
maximums  <- c(0.4,0.5,0.25,0.25,0.25,0.7,0.5,0.7,0.25,0.7)

In [None]:
options(repr.plot.width = 4, repr.plot.height = 4)

# Scatter plot (with linear fit and correlation label) of fasting C-peptide
# against each of the 10 populations with the smallest p-value; every plot is
# shown and saved as SVG.
out_dir <- "../figures/correlation_populations/"
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

# head() instead of [1:10]: no NA entries when fewer than 10 populations exist.
top_populations <- test2 %>% arrange(pval) %>% pull(population) %>% head(10)

for (j in seq_along(top_populations)) {
  i <- top_populations[j]
  # NOTE(review): 116 is presumably the position of `fasting_cpept_t1` in md3.
  df2 <- md3 %>% dplyr::select(value = all_of(i), fasting_cpept_T1 = 116)
  p <- df2 %>%
    ggplot(aes(x = value, y = fasting_cpept_T1)) +
    geom_point(shape = 16, size = 2) +
    geom_smooth(method = lm, alpha = 0.2) +
    ggtitle(gsub(str_to_upper(i), pattern = "_", replacement = " ")) +
    stat_cor(size = 7) +
    theme_classic() +
    ggtheme() +
    # `maximums` holds one hand-picked x-axis limit per rank (1..10).
    xlim(0, maximums[j])
  print(p)
  # Save the plot object explicitly rather than relying on last_plot().
  ggsave(filename = paste0(out_dir, i, ".svg"), plot = p, width = 5, height = 4.5)
}

In [None]:
# Plain scatter plots (linear fit, no styling) for the 10 populations with the
# smallest p-value. The loop was left unterminated (no closing brace), which
# made the cell a parse error; it now prints each plot and closes properly.
for (i in (test2 %>% arrange(pval) %>% pull(population) %>% head(10))) {
  # NOTE(review): 116 is presumably the position of `fasting_cpept_t1` in md3.
  df2 <- md3 %>% dplyr::select(value = all_of(i), fasting_cpept_T1 = 116)
  p <- df2 %>%
    ggplot(aes(x = value, y = fasting_cpept_T1)) +
    geom_point(shape = 16, size = 2) +
    geom_smooth(method = lm) +
    ggtitle(i)
  print(p)
}

In [None]:
write.csv(test2, "../tables/population_dynamics/corr_cpept.csv", row.names = F)

### Populations - correlations with imputed C-peptide

In [None]:
# Column positions of the population variables (names starting with cd4 / cd8).
population_colnames <- grep("^(cd4|cd8)", colnames(imputed))

In [None]:
population_colnames

In [None]:
which(colnames(imputed) == "fasting_cpept_t1")

In [None]:
imputed$fasting_cpept_t1  

In [None]:
# Correlate one population column of `imputed` with fasting C-peptide.
#
# i          Column position of the population in `imputed`.
# target_col Column (position or name) of the variable to correlate against.
#            NOTE(review): the default 116 was carried over from the `md3`
#            version; check it against the position of `fasting_cpept_t1` in
#            `imputed` printed by the preceding cell, or pass the column name.
# n_tests    Bonferroni factor for `padj` (default 40, as before); should equal
#            the number of populations tested.
#
# Returns a one-row data.frame: population, cor (Pearson estimate), pval, padj.
#
# Rows with a missing target value are dropped. The earlier
# `filter(!is.na(116))` tested the constant 116 and never removed anything;
# results were still correct only because cor.test() ignores incomplete pairs.
calc_correlation <- function(i, target_col = 116, n_tests = 40) {
  df <- imputed %>% dplyr::select(all_of(c(i, target_col)))
  colnames(df) <- c("value", "fasting_cpept_t1")
  df <- df %>% dplyr::filter(!is.na(fasting_cpept_t1))
  ct <- cor.test(df$value, df$fasting_cpept_t1)
  data.frame(
    population = colnames(imputed)[i],
    cor = ct$estimate,
    pval = ct$p.value,
    padj = min(1, ct$p.value * n_tests)
  )
}

In [None]:
test  <- future_map(population_colnames, calc_correlation)

In [None]:
test2  <- bind_rows(test)

In [None]:
test2  %>% arrange(pval)

### Populations - correlations with age

In [None]:
md3

In [None]:
# Column positions of the population variables (names starting with cd4 / cd8).
population_colnames <- grep("^(cd4|cd8)", colnames(md3))

In [None]:
population_colnames

In [None]:
which(colnames(md3) == "age")

In [None]:
# Correlate one population column of `md3` with age.
#
# i          Column position of the population in `md3`.
# target_col Column (position or name) of the variable to correlate against.
#            NOTE(review): the default 3 is presumably the position of `age`
#            printed by the preceding cell - confirm; passing "age" would be
#            more robust.
# n_tests    Bonferroni factor for `padj` (default 40, as before); should equal
#            the number of populations tested.
#
# Returns a one-row data.frame: population, cor (Pearson estimate), pval, padj.
#
# Rows with a missing target value are dropped. The earlier
# `filter(!is.na(3))` tested the constant 3 and never removed anything;
# results were still correct only because cor.test() ignores incomplete pairs.
calc_correlation <- function(i, target_col = 3, n_tests = 40) {
  df <- md3 %>% dplyr::select(all_of(c(i, target_col)))
  colnames(df) <- c("value", "age")
  df <- df %>% dplyr::filter(!is.na(age))
  ct <- cor.test(df$value, df$age)
  data.frame(
    population = colnames(md3)[i],
    cor = ct$estimate,
    pval = ct$p.value,
    padj = min(1, ct$p.value * n_tests)
  )
}

In [None]:
test  <- future_map(population_colnames, calc_correlation)

In [None]:
test2  <- bind_rows(test)

In [None]:
# Age-correlation results restricted to a hand-picked set of populations.
test3 <- dplyr::filter(
  test2,
  population %in% c(
    "cd8_l3_naive3_sox4_stmn1_lrrn3",
    "cd4_l3_treg3_hla_dr_pi16_ccr10",
    "cd8_l2_proliferating",
    "cd4_l3_th1_4_gzmk_cst7_cd27",
    "cd4_l2_th1",
    "cd4_l3_prolif3_foxp3_il2ra_tnfrsf1b"
  )
)

In [None]:
test3

In [None]:
options(repr.plot.width = 4, repr.plot.height = 4)

# Scatter plot of age against each selected population, ordered by p-value.
# head() instead of [1:6]: if fewer than six populations matched the filter,
# [1:6] yields NA names and the column selection below fails.
for (i in (test3 %>% arrange(pval) %>% pull(population) %>% head(6))) {
  # NOTE(review): 3 is presumably the position of `age` in md3.
  df2 <- md3 %>% dplyr::select(value = all_of(i), age = 3)
  p <- df2 %>%
    ggplot(aes(x = value, y = age)) +
    geom_point(shape = 16, size = 2) +
    geom_smooth(method = lm) +
    ggtitle(i) +
    stat_cor()
  print(p)
}

In [None]:
write.csv(test2, "../tables/population_dynamics/corr_age.csv", row.names = F)

## Heatmap population dynamics

In [None]:
list.files("../tables/population_dynamics/")

In [None]:
library(data.table)

In [None]:
# Read every CSV in the population-dynamics folder into a list.
# Only *.csv files are picked up, and each element is named after its file, so
# the printed list shows which table sits at which position. The cells below
# index this list by position, i.e. they rely on the alphabetical file order.
population_dynamics_files <- list.files(
  "../tables/population_dynamics/",
  pattern = "\\.csv$",
  full.names = TRUE
)
tables <- setNames(
  map(population_dynamics_files, read_csv),
  basename(population_dynamics_files)
)

In [None]:
tables

In [None]:
list.files("../tables/population_dynamics/")

In [None]:
tables[[5]]

In [None]:
# One two-column table (population + one statistic) per result file.
# The list positions follow the alphabetical order of the files in
# ../tables/population_dynamics/.
age <- dplyr::transmute(tables[[1]], population, corr_age = cor)
cpept <- dplyr::transmute(tables[[2]], population, corr_cpept = cor)
ctrl_t0_dia_t0 <- dplyr::transmute(tables[[3]], population = name, ratio_dia_t0_ctrl_t0 = ratio)
ctrl_t0_dia_t1 <- dplyr::transmute(tables[[4]], population = name, ratio_dia_t1_ctrl_t0 = ratio)
dia_t1_dia_t0 <- dplyr::transmute(tables[[5]], population = name, ratio_dia_t1_dia_t0 = ratio)


In [None]:
# Successive left joins (on the shared `population` column) of all statistics
# onto the age-correlation table.
all_pop_tables <- Reduce(
  left_join,
  list(age, cpept, ctrl_t0_dia_t0, ctrl_t0_dia_t1, dia_t1_dia_t0)
)


In [None]:
all_pop_tables

In [None]:
mtx  <- as.matrix(all_pop_tables[,2:6])

In [None]:
rownames(mtx)  <- all_pop_tables$population

In [None]:
options(repr.plot.width = 20, repr.plot.height = 20)

# Column-scaled, clustered heatmap of all statistics, leaving out one
# population row.
pheatmap::pheatmap(
  mtx[rownames(mtx) != "cd8_l3_tcm7_klrb1_lst1_blk", ],
  scale = "column",
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  color = colorRampPalette(c(
    "dodgerblue", "lightblue", "lightblue", "white",
    "indianred2", "indianred2", "indianred4"
  ))(50)
)

## Corr matrix

### Feature Matrix

In [None]:
# One two-column table (population + one statistic) per result file.
# The list positions follow the alphabetical order of the files in
# ../tables/population_dynamics/.
age <- dplyr::transmute(tables[[1]], population, corr_age = cor)
cpept <- dplyr::transmute(tables[[2]], population, corr_cpept = cor)
ctrl_t0_dia_t0 <- dplyr::transmute(tables[[3]], population = name, ratio_dia_t0_ctrl_t0 = ratio)
ctrl_t0_dia_t1 <- dplyr::transmute(tables[[4]], population = name, ratio_dia_t1_ctrl_t0 = ratio)
dia_t1_dia_t0 <- dplyr::transmute(tables[[5]], population = name, ratio_dia_t1_dia_t0 = ratio)


In [None]:
# Successive left joins (on the shared `population` column) of all statistics
# onto the age-correlation table.
all_pop_tables <- Reduce(
  left_join,
  list(age, cpept, ctrl_t0_dia_t0, ctrl_t0_dia_t1, dia_t1_dia_t0)
)


In [None]:
all_pop_tables

In [None]:
mtx  <- as.matrix(all_pop_tables[,2:6])

In [None]:
rownames(mtx)  <- all_pop_tables$population

In [None]:
options(repr.plot.width = 20, repr.plot.height = 20)

# Column-scaled, clustered heatmap of all statistics, leaving out one
# population row.
pheatmap::pheatmap(
  mtx[rownames(mtx) != "cd8_l3_tcm7_klrb1_lst1_blk", ],
  scale = "column",
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  color = colorRampPalette(c(
    "dodgerblue", "lightblue", "lightblue", "white",
    "indianred2", "indianred2", "indianred4"
  ))(50)
)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 20)

# Column-scaled, clustered heatmap of all statistics for every population.
pheatmap::pheatmap(
  mtx,
  scale = "column",
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  color = colorRampPalette(c(
    "dodgerblue", "lightblue", "lightblue", "white",
    "indianred2", "indianred2", "indianred4"
  ))(50)
)

In [None]:
mtx

### P-val matrix

In [None]:
tables[[3]]

In [None]:
### P-value matrix
# Same layout as the feature matrix `mtx` (one row per population, one column
# per comparison), but filled with the p-values of each analysis instead of the
# correlation / ratio. The columns deliberately keep the feature-matrix names
# (corr_*, ratio_*) although they hold p-values here.
# The list positions follow the alphabetical order of the files in
# ../tables/population_dynamics/.

age  <- tables[[1]]  %>% dplyr::select(population, corr_age = pval)
cpept  <- tables[[2]]  %>% dplyr::select(population, corr_cpept = pval)
ctrl_t0_dia_t0  <- tables[[3]]  %>% dplyr::select(population = name, ratio_dia_t0_ctrl_t0 = pval)
ctrl_t0_dia_t1  <- tables[[4]]  %>% dplyr::select(population = name, ratio_dia_t1_ctrl_t0 = pval)
dia_t1_dia_t0  <- tables[[5]]  %>% dplyr::select(population = name, ratio_dia_t1_dia_t0 = pval)


# Successive left joins onto the age table (joined on `population`).
all_pop_tables_pval  <- left_join(age, cpept)
all_pop_tables_pval  <- left_join(all_pop_tables_pval, ctrl_t0_dia_t0)
all_pop_tables_pval  <- left_join(all_pop_tables_pval, ctrl_t0_dia_t1)
all_pop_tables_pval  <- left_join(all_pop_tables_pval, dia_t1_dia_t0)

# Columns 2:6 are the five p-value columns; column 1 (population) becomes the
# row names.
mtx_pval  <- as.matrix(all_pop_tables_pval[,2:6])

rownames(mtx_pval)  <- all_pop_tables_pval$population

In [None]:
mtx_pval

### Feature matrix by corrplot

In [None]:
mtx_scaled  <- scale(x = mtx)

In [None]:
# Draw the column-scaled feature matrix (populations x statistics) with
# corrplot, marking cells by the matching entry of the p-value matrix.
# NOTE(review): `mtx_scaled` is not a correlation matrix and is not square;
# `order = 'hclust'` and `addrect` are documented for correlation matrices -
# confirm they behave as intended here.
# NOTE(review): the title says "patient samples", but the rows of `mtx` are
# populations - confirm the wording.
corrplot(mtx_scaled, is.corr = FALSE,
title = "Correlation of features in patient samples", 
mar = c(0,0,1,0), number.cex = 0.5, number.digits = 2,
p.mat =   mtx_pval, insig = 'label_sig', #insig = "blank", sig.level = 0.05,    
number.font = 2, order = 'hclust', addrect = 20, 
        tl.col = "black")

# PCA populations

In [None]:
imputed

In [None]:
# Column positions of the population variables (names starting with cd4 / cd8).
population_colnames <- grep("^(cd4|cd8)", colnames(imputed))

In [None]:
population_colnames

In [None]:
# Population columns of `imputed`, transposed (populations in rows, samples in
# columns). all_of() marks `population_colnames` as an external vector of
# column positions; passing it bare to select() is deprecated in tidyselect.
pca_populations <- imputed %>%
  dplyr::select(all_of(population_colnames)) %>%
  t()

In [None]:
pca_populations

In [None]:
library("factoextra")
library("FactoMineR")

In [None]:
res.pca <- PCA(t(pca_populations),  graph = FALSE, ncp = 20)

In [None]:
fviz_screeplot(res.pca, addlabels = TRUE, ylim = c(0, 50), ncp = 20)

In [None]:
res.pca$ind$coord

In [None]:
var <- get_pca_var(res.pca)
head(var$contrib)

In [None]:
fviz_pca_var(res.pca, col.var="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE # Avoid text overlapping
             )

In [None]:
# Sample metadata for colouring the PCA: the first four columns of `imputed`
# plus a combined "<disease> <time>" label.
metadata_pca <- imputed[, 1:4] %>%
  mutate(dia_time = paste(disease, time))

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)
fviz_pca_ind(res.pca, col.ind = metadata_pca$disease)

In [None]:
fviz_pca_ind(res.pca, col.ind = metadata_pca$age)

In [None]:
# Individuals on PC1/PC2 coloured by disease group. The column is `disease`
# (lower case), as used by the other PCA cells; `metadata_pca$Disease` does not
# exist and evaluates to NULL.
fviz_pca_ind(res.pca, col.ind = metadata_pca$disease, axes = c(1, 2))

In [None]:
fviz_pca_ind(res.pca, col.ind = metadata_pca$age, axes = c(3,4))

In [None]:
fviz_pca_ind(res.pca, col.ind = metadata_pca$disease, axes = c(3,4))

In [None]:
fviz_pca_ind(res.pca, col.ind = metadata_pca$disease, axes = c(4,5))

In [None]:
df_pca  <- res.pca$ind$coord

In [None]:
df_pca  <- cbind(df_pca, metadata_pca)

In [None]:
df_pca

In [None]:
df_pca   <- df_pca %>% pivot_longer(starts_with("Dim"), names_to = "Dim", values_to = "value")

In [None]:
df_pca

In [None]:
options(repr.plot.width = 26, repr.plot.height = 25)

# Per-dimension box plots of the PCA coordinates of all samples, split by
# disease group, with the group-comparison p-value printed in each facet.
# Changes: the zero-size geom_dotplot layer (drew nothing) is removed,
# geom_jitter no longer receives the dotplot-only arguments `binaxis` /
# `stackdir` (they were ignored with a warning), and the y axis is labelled as
# PC coordinate rather than "Frequency", since `value` holds
# res.pca$ind$coord.
df_pca %>%
  ggplot(aes(x = disease, y = value)) +
  geom_boxplot(
    outlier.shape = NA,
    alpha = 0.5, width = 0.9, aes(fill = disease)
  ) +
  geom_jitter(
    position = position_jitter(width = 0.1, height = 0),
    size = 3, aes(fill = disease, shape = disease), color = "black"
  ) +
  facet_wrap(~Dim, scales = "free", ncol = 4) +
  scale_shape_manual(values = c(21, 22, 22)) +
  scale_fill_manual(values = c("lightsteelblue1", "rosybrown1", "purple")) +
  ylab("PC coordinate") +
  xlab("Condition") +
  theme_classic() +
  ggtheme() +
  ggpubr::stat_compare_means(label.x = 1.1, label.y.npc = 1, size = 7.025, vjust = 0.3, label = "p.format") +
  theme(strip.background = element_blank(), panel.grid = element_blank()) +
  theme(
    axis.text = element_text(color = "black"),
    axis.line = element_line(color = "black"),
    axis.text.x = element_text(angle = 90)
  )

In [None]:
options(repr.plot.width = 26, repr.plot.height = 25)

# Per-dimension box plots of the PCA coordinates of the T0 samples, split by
# disease group, with the group-comparison p-value printed in each facet.
# Changes: the zero-size geom_dotplot layer (drew nothing) is removed,
# geom_jitter no longer receives the dotplot-only arguments `binaxis` /
# `stackdir` (they were ignored with a warning), and the y axis is labelled as
# PC coordinate rather than "Frequency", since `value` holds
# res.pca$ind$coord.
df_pca %>%
  filter(time == "T0") %>%
  ggplot(aes(x = disease, y = value)) +
  geom_boxplot(
    outlier.shape = NA,
    alpha = 0.5, width = 0.9, aes(fill = disease)
  ) +
  geom_jitter(
    position = position_jitter(width = 0.1, height = 0),
    size = 3, aes(fill = disease, shape = disease), color = "black"
  ) +
  facet_wrap(~Dim, scales = "free", ncol = 4) +
  scale_shape_manual(values = c(21, 22, 22)) +
  scale_fill_manual(values = c("lightsteelblue1", "rosybrown1", "purple")) +
  ylab("PC coordinate") +
  xlab("Condition") +
  theme_classic() +
  ggtheme() +
  ggpubr::stat_compare_means(label.x = 1.1, label.y.npc = 1, size = 7.025, vjust = 0.3, label = "p.format") +
  theme(strip.background = element_blank(), panel.grid = element_blank()) +
  theme(
    axis.text = element_text(color = "black"),
    axis.line = element_line(color = "black"),
    axis.text.x = element_text(angle = 90)
  )

In [None]:
options(repr.plot.width = 26, repr.plot.height = 25)

# Per-dimension box plots of the PCA coordinates of the "Dia" and "Ctrl"
# samples (all time points), with the group-comparison p-value in each facet.
# Changes: the zero-size geom_dotplot layer (drew nothing) is removed,
# geom_jitter no longer receives the dotplot-only arguments `binaxis` /
# `stackdir` (they were ignored with a warning), and the y axis is labelled as
# PC coordinate rather than "Frequency", since `value` holds
# res.pca$ind$coord.
df_pca %>%
  filter(disease %in% c("Dia", "Ctrl")) %>%
  ggplot(aes(x = disease, y = value)) +
  geom_boxplot(
    outlier.shape = NA,
    alpha = 0.5, width = 0.9, aes(fill = disease)
  ) +
  geom_jitter(
    position = position_jitter(width = 0.1, height = 0),
    size = 3, aes(fill = disease, shape = disease), color = "black"
  ) +
  facet_wrap(~Dim, scales = "free", ncol = 4) +
  scale_shape_manual(values = c(21, 22, 22)) +
  scale_fill_manual(values = c("lightsteelblue1", "rosybrown1", "purple")) +
  ylab("PC coordinate") +
  xlab("Condition") +
  theme_classic() +
  ggtheme() +
  ggpubr::stat_compare_means(label.x = 1.1, label.y.npc = 1, size = 7.025, vjust = 0.3, label = "p.format") +
  theme(strip.background = element_blank(), panel.grid = element_blank()) +
  theme(
    axis.text = element_text(color = "black"),
    axis.line = element_line(color = "black"),
    axis.text.x = element_text(angle = 90)
  )

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)
fviz_pca_ind(res.pca, col.ind = metadata_pca$disease, axes = c(4,6))

In [None]:
options(repr.plot.width = 16, repr.plot.height = 16)
fviz_pca_var(res.pca, col.var="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             axes = c(4,6),
             repel = TRUE # Avoid text overlapping
             )

In [None]:
options(repr.plot.width = 26, repr.plot.height = 25)

# Per-dimension box plots of the PCA coordinates for the "Dia T0" samples,
# split on the x axis by `Condition2`.
# NOTE(review): df_pca is built from metadata_pca (columns 1:4 of `imputed`
# plus `dia_time`), and the other PCA cells only use lower-case columns
# (disease, time, age). `Condition` / `Condition2` may therefore not exist in
# df_pca; `dia_time` ("<disease> <time>") looks like the intended filter
# column, and the intended x variable needs to be joined in - confirm.
# NOTE(review): `binaxis` / `stackdir` are geom_dotplot arguments and are
# ignored by geom_jitter; the geom_dotplot layer has dotsize = 0; the y label
# says "Frequency" although `value` holds PCA coordinates.
df_pca  %>% 
filter(Condition %in% c("Dia T0"))  %>% 
ggplot(aes(x = Condition2, y = value)) + 
geom_boxplot(outlier.shape = NA, 
           alpha = 0.5, width = 0.9, aes(fill = disease)) +
   geom_dotplot(binaxis='y', stackdir='center', dotsize=0) + 
   geom_jitter(binaxis='y', position=position_jitter(width = 0.1, height = 0), 
  size = 3, stackdir='center', aes(fill = disease, shape = disease), color = "black") + 
  facet_wrap(~Dim, scales = "free", ncol = 4) +
scale_shape_manual(values = c(21,22,22))+
scale_fill_manual(values = c("lightsteelblue1","rosybrown1", "purple"))+
  ylab("Frequency") +
  xlab("Condition") +
  theme_classic() +
ggtheme() +
ggpubr::stat_compare_means(label.x= 1.1, label.y.npc = 1, size = 7.025, vjust = 0.3, label = "p.format")+
  theme(strip.background = element_blank(), panel.grid = element_blank()) + 
  theme(axis.text = element_text(color = "black"),
        axis.line = element_line(color = "black"),
        axis.text.x = element_text(angle = 90))

# Population Freq of Total

In [None]:
cd4_l1_full  <- readRDS("../data/processed/L1/cd4_l1_full_filt.rds")

In [None]:
cd8_l1_full  <- readRDS("../data/processed/L1/cd8_l1_full_filt.rds")

In [None]:
options(repr.plot.width = 14, repr.plot.height = 4)
DimPlot(cd4_l1_full, group.by = "annotations_l3", raster = T)

In [None]:
options(repr.plot.height = 4, repr.plot.width = 20)

DimPlot(cd8_l1_full, group.by = "annotations_l3", raster = T)

In [None]:
full  <- merge(cd4_l1_full, cd8_l1_full)

In [None]:
# Derive two label columns, Level1 and Level2, from the level-3 annotation:
#   1. copy annotations_l3 into annotations_l2 with " T cells" removed;
#   2. keep only the part before the first ":" (the remainder is discarded);
#   3. rewrite the two "Unconventional_*" spellings to "CD8_Unc" / "CD4_Unc_";
#   4. split on "_" and keep the first two pieces as Level1 / Level2 (pieces 3
#      and 4 are discarded, and annotations_l2 itself is consumed by the split).
# The steps depend on each other's output, so their order matters.
# NOTE(review): separate() warns when a label has more pieces than listed in
# `into`, or fewer - check the warnings for unexpected label formats.
full@meta.data  <- full@meta.data  %>% 
mutate(annotations_l2  = gsub(pattern = " T cells", replacement = "", x = annotations_l3))  %>% 
separate(annotations_l2, into = c("annotations_l2", NA), sep = ":")  %>% 
mutate(annotations_l2  = gsub(pattern = "Unconventional_CD8 Unc", replacement = "CD8_Unc", x = annotations_l2))  %>% 
mutate(annotations_l2  = gsub(pattern = "Unconventional_Unc", replacement = "CD4_Unc_", x = annotations_l2))   %>% 
separate(annotations_l2, into = c("Level1", "Level2",NA,NA), sep = "_")  


In [None]:
full$annotations_l2   <- paste(full$Level1, full$Level2)

In [None]:
full$annotations_l2  %>% table

In [None]:
full$Sample_char  %>% table

In [None]:
full@meta.data  %>% colnames

In [None]:
seurat_meta_data <- full@meta.data

# Per-sample frequency of every level-2 annotation: count cells per
# (sample, annotation), divide by the sample total, then spread the
# annotations into columns (absent combinations become 0).
df3 <- seurat_meta_data %>%
  dplyr::count(Sample_char, annotations_l2, name = "n") %>%
  group_by(Sample_char) %>%
  mutate(freq = n / sum(n)) %>%
  ungroup() %>%
  dplyr::select(-n) %>%
  pivot_wider(names_from = "annotations_l2", values_from = "freq", values_fill = 0)

# One row per sample present in the metadata, with its frequencies attached;
# anything still missing after the join is set to 0.
df4 <- full@meta.data %>%
  dplyr::select(Sample_char) %>%
  unique() %>%
  left_join(df3)
df4[is.na(df4)] <- 0



In [None]:
df4 

In [None]:
# Split the space-separated sample label into its six fields (keeping the
# original column), then keep T0 samples from the listed experiments, without
# the "Pre-Dia" group.
model_table_dia_t0_ctrl_t0 <- df4 %>%
  separate(
    Sample_char,
    into = c("Patient_ID", "Disease", "Time", "Age_group", "Sex", "Exp"),
    sep = " ",
    remove = FALSE
  ) %>%
  dplyr::filter(
    Time == "T0",
    Exp %in% c("Exp16", "Exp18", "Exp19", "Exp20"),
    Disease != "Pre-Dia"
  )

In [None]:
colnames(model_table_dia_t0_ctrl_t0)

In [None]:
colnames(model_table_dia_t0_ctrl_t0)  <- janitor::make_clean_names(colnames(model_table_dia_t0_ctrl_t0))

In [None]:
colnames(model_table_dia_t0_ctrl_t0)

In [None]:
# Columns 8 to 24 (the frequency columns) as numeric.
model_table_dia_t0_ctrl_t0 <- model_table_dia_t0_ctrl_t0 %>%
  mutate(across(8:24, as.numeric))

In [None]:
# Columns 3, 5 and 6 as factors.
model_table_dia_t0_ctrl_t0 <- model_table_dia_t0_ctrl_t0 %>%
  mutate(across(c(3, 5, 6), as.factor))

In [None]:
# Binomial GLM of disease status on sex and the per-sample population
# frequencies. `-1` removes the intercept, so each level of `sex` gets its own
# coefficient. The order of the terms fixes the order of the coefficient table
# used below.
binom.glm <- glm(disease ~ sex  + cd4_proliferating + cd4_temra + cd4_tfh + 
cd4_th1 + cd4_th17 + cd4_th2 + cd4_treg + cd4_unc + cd8_naive + cd8_proliferating + 
                 cd8_tcm + cd8_tem + cd8_temra + cd8_unc + nk_cells_nk -1,
                 data = model_table_dia_t0_ctrl_t0, family = binomial)

In [None]:
binom.glm

In [None]:
summary(binom.glm)

In [None]:
# Share of the null deviance explained by the model, read from the fitted
# object instead of numbers copied from a previous summary() printout (which go
# stale as soon as the data or the formula change).
Explained_variance <- (binom.glm$null.deviance - binom.glm$deviance) / binom.glm$null.deviance
Explained_variance

In [None]:
# Chi-square upper-tail probability of the residual deviance on the residual
# degrees of freedom, taken from the fitted model rather than hard-coded.
# NOTE(review): the previous literals (54.524 on 49 df) were presumably the
# residual deviance and df from summary(binom.glm) - confirm.
pchisq(deviance(binom.glm), df.residual(binom.glm), lower.tail = FALSE)

In [None]:
# Coefficient table of the model as a data frame with short column names and
# the term names as an explicit column.
res <- as.data.frame(coef(summary(binom.glm)))
names(res) <- c("estimate", "std.error", "z", "p")
res$term <- rownames(res)

In [None]:
res

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)

# Forest plot of the exponentiated model coefficients with Wald 95% intervals.
res %>%
  # Drop the covariate terms. Matching is case-insensitive because the terms of
  # this model are lower-case (e.g. "sexM"), so the previous capitalised list
  # ("SexM", "SexF", ...) excluded nothing.
  dplyr::filter(!(tolower(term) %in% c("age", "sexm", "sexf", "riskhla"))) %>%
  mutate(
    upper = estimate + 1.96 * std.error,
    lower = estimate - 1.96 * std.error
  ) %>%
  # Back-transform from the log-odds scale.
  mutate(across(all_of(c(
    "estimate", "lower", "upper"
  )), exp)) %>%
  # Cap very wide intervals so the axis stays readable.
  mutate(upper = ifelse(upper > 5, 4.9999, upper)) %>%
  arrange(estimate) %>%
  mutate(term = factor(term, unique(term))) %>%
  ggplot(aes(estimate, term, color = estimate > 1)) +
  geom_vline(xintercept = 1, color = "gray75") +
  geom_linerange(aes(xmin = lower, xmax = upper),
    size = 1.5,
    alpha = 0.5
  ) +
  geom_point(size = 4) +
  theme_minimal(base_size = 16) +
  # Named values so the colours do not swap when all estimates fall on one
  # side of 1.
  scale_color_manual(
    values = c("FALSE" = "green4", "TRUE" = "red3"),
    guide = "none"
  ) +
  # exp(coefficient) of a binomial (logit) GLM is an odds ratio, not a
  # probability.
  labs(
    title = "", y = NULL,
    x = "Odds ratio \n(95% Confidence Intervals)"
  ) +
  theme(
    axis.text.y = element_text(hjust = 0, size = 18),
    panel.grid = element_blank()
  )

### Explaining Treg

In [None]:
md3

In [None]:
md_cpept

In [None]:
# Load the metadata sheet and derive HLA carrier flags from the allele columns.
md5  <- read_xlsx("../data/metadata_v05.xlsx")

# dq2 / dq8: "DQ2" / "DQ8" when the listed DQA1 allele is present on either
#            DQA1 column AND the listed DQB1 allele on either DQB1 column,
#            otherwise "Other".
# dq2_8:     combined label - "DQ2_8" when both flags are set, else the single
#            flag that is set, else "Other".
# c7_01 / b8_01: "*_hom" when both allele columns carry the allele, "*_het"
#            when only one does, otherwise "Other".
# c7_b8:     combined label, checked in this order: both homozygous, C*07:01
#            homozygous, B*08:01 homozygous, both heterozygous, B*08:01
#            heterozygous, C*07:01 heterozygous, otherwise "Other".
# NOTE(review): comparisons with a missing allele yield NA, and ifelse()
# propagates it, so a sample with missing typing can end up NA rather than
# "Other" - confirm that is the desired handling.
md5  <- md5  %>% mutate(dq2 = ifelse((HLA_DQA11 == "DQA1*05:01:01" | HLA_DQA12 == "DQA1*05:01:01") &
                                   (HLA_DQB11 == "DQB1*02:01:01" | HLA_DQB12 == "DQB1*02:01:01"),"DQ2",
                                   "Other"),
                      dq8 = ifelse((HLA_DQA11 == "DQA1*03:01:01" | HLA_DQA12 == "DQA1*03:01:01") &
                                   (HLA_DQB11 == "DQB1*03:02:01" | HLA_DQB12 == "DQB1*03:02:01"),"DQ8",
                                   "Other"))  %>% mutate(
                      dq2_8 = ifelse(dq2 == "DQ2" & dq8 == "DQ8","DQ2_8",
                                     ifelse(dq2 == "DQ2","DQ2", ifelse(dq8 == "DQ8","DQ8","Other"))))  %>% 
mutate(c7_01 = ifelse((HLA_C1 == "C*07:01:01" & HLA_C2 == "C*07:01:01"), "c7_01_hom",
                                   ifelse(HLA_C1 == "C*07:01:01" | HLA_C2 == "C*07:01:01","c7_01_het",
                                   "Other")),
                      b8_01 = ifelse((HLA_B1 == "B*08:01:01" & HLA_B2 == "B*08:01:01"), "b8_01_hom",
                                   ifelse(HLA_B1 == "B*08:01:01" | HLA_B2 == "B*08:01:01","b8_01_het",
                                   "Other")))   %>% mutate(
                      c7_b8 = ifelse(b8_01 == "b8_01_hom" & c7_01 == "c7_01_hom",
                                     "c7_b8_hom", ifelse(c7_01 == "c7_01_hom", "c7_01_hom",
                              ifelse(b8_01 == "b8_01_hom", "b8_01_hom", 
                              ifelse(b8_01 == "b8_01_het" & c7_01 == "c7_01_het",
                                     "c7_b8_het", 
                                     
                              ifelse(b8_01 == "b8_01_het", "b8_01_het", 
                              ifelse(c7_01 == "c7_01_het", "c7_01_het", "Other" )))))))

In [None]:
md4  <- left_join(md3, md5)

In [None]:
colnames(md4)

In [None]:
grep(colnames(md4), pattern = "Treg")

In [None]:
grep(colnames(md4), pattern = "part_remission_y_n")

In [None]:
grep(colnames(md4), pattern = "Treg", value = T)

In [None]:
# Assign each subject to an age band.
# case_when() takes the first matching branch, so cumulative upper bounds give
# contiguous bands. The previous conditions (< 7.9, > 8 & < 13.9, > 14) left
# gaps: ages 7.9-8 and 13.9-14, including exactly 8 and 14, fell through to NA.
# A missing Age still yields NA.
md4 <- md4 %>% mutate(age_group = case_when(
  Age < 8 ~ "2-7y",
  Age < 14 ~ "8-13y",
  Age >= 14 ~ "14-18y"
))

In [None]:
md4

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)

# For each selected grouping column of md4, box plot of the value in column 38
# across the groups, points coloured by column 1, with the group-comparison
# p-value.
# NOTE(review): the column positions (grouping columns 2, 116, 117, 193:196,
# 151; value = 38; Condition = 1) are hard-coded - presumably looked up with
# the grep() cells above; they break silently if md4 gains or loses columns.
for (i in c(2, 116, 117, 193:196, 151)) {
  df_filt <- md4 %>% dplyr::select(all_of(i), value = 38, Condition = 1)
  colnames(df_filt) <- c("Condition2", "value", "Condition")
  print(df_filt)
  p <- df_filt %>%
    ggplot(aes(x = Condition2, y = value)) +
    geom_boxplot(outlier.shape = NA) +
    # `binaxis` / `stackdir` are geom_dotplot arguments and were ignored here;
    # the zero-size geom_dotplot layer drew nothing and is removed.
    geom_jitter(
      position = position_jitter(width = 0.1, height = 0.01),
      size = 3, aes(color = Condition)
    ) +
    theme_classic() +
    xlab("") +
    ylab("Value") +
    # The lower limit of 0 used to be a separate `ylim(0, NA)` statement that
    # was never added to the plot (missing `+`); it is folded into the y scale
    # so it takes effect without replacing the expansion settings.
    scale_y_continuous(limits = c(0, NA), expand = expansion(mult = c(0, 0.1))) +
    ggpubr::stat_compare_means(label.x = 1.2, label.y.npc = "top", size = 5, vjust = 0.3, label = "p.format") +
    theme(
      plot.title = element_text(hjust = 0.5, size = 22),
      axis.line = element_line(colour = "black"),
      axis.ticks = element_line(colour = "black")
    ) +
    ggtitle(colnames(md4)[i])

  print(p)
}

# Recalculating percentages

In [None]:
library(brms)

In [None]:
# Load the population table and drop the row-index column added on export.
populations_2 <- read_csv("../tables/populations_2.csv")
populations_2$`...1` <- NULL

# Tag each row by experiment batch: three experiments are "Prelim", all others
# "Final".
populations_2 <- populations_2 %>%
  mutate(prelim_final = ifelse(
    Experiment_ID %in% c("Exp08", "Exp10", "Exp11"), "Prelim", "Final"
  ))

# Patients with at least one row in the preliminary experiments.
IDs <- populations_2 %>%
  dplyr::filter(prelim_final == "Prelim") %>%
  pull(Patient_ID) %>%
  unique()

IDs

# For those patients at T0: frequency-from-total per annotation, with the
# Prelim and Final values side by side in separate columns.
popul_final_freq_from_total <- populations_2 %>%
  ungroup() %>%
  mutate(prelim_final = ifelse(
    Experiment_ID %in% c("Exp08", "Exp10", "Exp11"), "Prelim", "Final"
  )) %>%
  dplyr::filter(Patient_ID %in% IDs & Time == "T0") %>%
  dplyr::select(Patient_ID, Time, annotations, prelim_final, freq_from_total, Level) %>%
  pivot_wider(names_from = "prelim_final", values_from = freq_from_total)

popul_final_freq_from_total

In [None]:
# Prelim / Final ratio per row; the small constant added to both terms avoids
# division by zero.
popul_final_freq_from_total <- popul_final_freq_from_total %>%
  mutate(Coeff = (Prelim + 0.0000000001) / (Final + 0.0000000001))

In [None]:
popul_final_freq_from_total

In [None]:
populations_2

In [None]:
# Rows from the final experiments, reduced to the model columns.
populations_to_bayes <- populations_2 %>%
  dplyr::filter(prelim_final == "Final") %>%
  dplyr::select(all_of(c(
    "annotations", "Patient_ID", "Sex", "Age", "Time", "pct_from_total"
  )))

In [None]:
# Attach the preliminary frequency and the Prelim/Final coefficient to the
# final-experiment rows (joined on the shared columns).
prelim_lookup <- popul_final_freq_from_total %>%
  dplyr::select(annotations, Patient_ID, Time, Prelim, Coeff)

populations_to_bayes2 <- left_join(populations_to_bayes, prelim_lookup)

In [None]:
populations_to_bayes2

In [None]:
write.csv(populations_to_bayes2, "../bayes_model_data.csv")

In [None]:
priors <- c(
    set_prior("normal(0, 5)", class = "b") # Generic prior for coefficients
)

In [None]:
populations_to_bayes2

In [None]:
# Bayesian linear model (brms) of the preliminary value on the interaction
# pct_from_total:Coeff, without an intercept (`0 +`), using the prior defined
# above. Prior draws are also sampled (sample_prior = "yes"); 4 chains of 2000
# iterations with a raised adapt_delta.
# NOTE(review): `Prelim` comes from freq_from_total while the predictor uses
# pct_from_total - presumably the two are on different scales (fraction vs
# percent); confirm this is intended.
# NOTE(review): `Prelim` / `Coeff` are only joined in for patients with
# preliminary data at T0; confirm how rows with missing values are handled.
bayesian_model <- brm(
  bf(Prelim ~ 0 + pct_from_total:Coeff, nl = F),
  data = populations_to_bayes2,
  family = gaussian(),  # Assuming a Gaussian family; adjust as needed for your data
  prior = priors,
  sample_prior = "yes",
  chains = 4,
  iter = 2000,
  control = list(adapt_delta = 0.95)
)

In [None]:
summary(bayesian_model)