# Part 21: Validation of findings in other T1D datasets


In this notebook, we will describe how we obtained and processed transcriptomics data from publicly available sources. 

Please note that we deposited on Zenodo:
* a full matrix of counts from each of those datasets
* a merged count matrix from all samples across datasets
* rds files where applicable 
Downloading those instead of running the code will save you time and effort. Please note that other datasets are described in previous parts 16-20.  

In [None]:
# Shared ggplot2 theme tweaks: 20 pt for all text elements and a 10 pt legend key.
# Returns a `theme()` object that is added to a plot with `+`.
ggtheme <- function() {
  text_size <- 20
  theme(
    axis.text = element_text(size = text_size),
    axis.title = element_text(size = text_size),
    text = element_text(size = text_size, colour = "black"),
    legend.text = element_text(size = text_size),
    legend.key.size = unit(10, units = "points")
  )
}


In [None]:
library(GEOquery)
source("diabetes_analysis_v07.R")

# GSE237218 Newman

The GSE237218 dataset is from the publication by [Newman et al, 2023](https://www.nature.com/articles/s42003-023-05327-7).

We will obtain the dataset and metadata from GEO. 

In [None]:
# Download the GSE237218 series from GEO (series matrix plus platform annotation).
gset <- getGEO("GSE237218", GSEMatrix =TRUE, getGPL=TRUE, AnnotGPL=TRUE)

In [None]:
# Inspect the structure of the downloaded series-matrix object.
str(gset$GSE237218_series_matrix.txt.gz)

In [None]:
# Per-sample cell type annotation.
gset$GSE237218_series_matrix.txt.gz$`cell type:ch1`

In [None]:
# Per-sample disease status annotation.
gset$GSE237218_series_matrix.txt.gz$`disease status:ch1`

In [None]:
# Gather the per-sample annotation fields used downstream into one table.
# A local alias avoids repeating the long series-matrix accessor.
metadata <- local({
  es <- gset$GSE237218_series_matrix.txt.gz
  data.frame(
    geo_id = es$geo_accession,
    disease = es$`disease status:ch1`,
    cell_type = es$`cell type:ch1`,
    subject_id = es$`subject id:ch1`,
    sample_id = es$description,
    Age = es$`age:ch1`,
    Sex = es$`Sex:ch1`
  )
})



In [None]:
# Load the processed expression table for GSE237218 (the file name indicates log TPM values).
mtx  <- read_delim("../../240617_VN_Diabetes_V06/data/published_data/Newman_2023/data/GSE237218_processedCounts_log_TPM.tsv.gz")

In [None]:
# Shift every element of `x` up by one and return the shifted values.
# NOTE(review): despite the "minus_one" name this ADDS 1 - confirm the intended direction.
fct_minus_one <- function(x) {
  x + 1
}

In [None]:
# Drop the transcript identifier, apply fct_minus_one() to every sample column
# and discard rows that have no gene identifier.
# `across()` replaces the superseded `mutate_at()`.
mtx2 <- mtx %>%
  dplyr::select(-transcript_id) %>%
  mutate(across(-gene_id, fct_minus_one)) %>%
  dplyr::filter(!is.na(gene_id))

In [None]:
# Collapse transcripts to genes: sum every sample column within each gene_id.
# `across(everything(), sum)` replaces the superseded `summarise_all()`;
# the grouping column is excluded automatically.
mtx3 <- mtx2 %>%
  group_by(gene_id) %>%
  summarise(across(everything(), sum), .groups = "drop")

In [None]:
# Every column except `gene_id` is a sample; select by name instead of the
# hard-coded range 2:443 so the code does not depend on the sample count.
mtx4 <- as.matrix(mtx3[, colnames(mtx3) != "gene_id"])

In [None]:
rownames(mtx4) <- mtx3$gene_id

In [None]:
# Are matrix columns already in metadata order? (summary instead of a long logical vector)
table(colnames(mtx4) == metadata$sample_id)

In [None]:
# Position of each metadata sample among the current matrix columns.
match(metadata$sample_id, colnames(mtx4))

In [None]:
# Reorder columns to follow the metadata. The match is computed against the
# CURRENT column names of mtx4, so re-running this cell is a no-op rather than
# a second permutation. Stop if a metadata sample has no column, because
# indexing with NA would silently create all-NA columns.
sample_order <- match(metadata$sample_id, colnames(mtx4))
stopifnot(!anyNA(sample_order))
mtx4 <- mtx4[, sample_order, drop = FALSE]

In [None]:
# Must be TRUE: columns and metadata rows are now aligned.
all(colnames(mtx4) == metadata$sample_id)

Testing our genes.

In [None]:
# Plotting table: start from the sample metadata.
df <- metadata

In [None]:
# Append the expression of each gene of interest (matrix rows are genes,
# columns follow the metadata order).
df$NFKBIA <- mtx4[rownames(mtx4) == "NFKBIA", ]

In [None]:
df$GZMB <- mtx4[rownames(mtx4) == "GZMB", ]

In [None]:
df$CXCR4 <- mtx4[rownames(mtx4) == "CXCR4", ]

In [None]:
# Box-and-jitter plot of one gene by disease status, faceted by cell type,
# with a per-facet p-value label from ggpubr::stat_compare_means().
#   data          data frame with columns `disease`, `cell_type` and the gene column
#   gene          name of the gene column, as a string
#   positive_only if TRUE, keep only samples whose value for `gene` is > 0
# Returns the ggplot object.
plot_newman_gene <- function(data, gene, positive_only = FALSE) {
  if (positive_only) {
    data <- dplyr::filter(data, .data[[gene]] > 0)
  }
  ggplot(data, aes(x = disease, y = .data[[gene]])) +
    geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0) +
    geom_boxplot(outlier.shape = NA) +
    # binaxis/stackdir belong to geom_dotplot(); geom_jitter() does not accept
    # them, so they are no longer passed here.
    geom_jitter(position = position_jitter(width = 0.1, height = 0.01),
                size = 2, aes(color = disease)) +
    facet_wrap(~cell_type, ncol = 4) +
    theme_classic() +
    xlab("") + ylab("Value") +
    scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
    scale_color_manual(values = c("dodgerblue", "indianred3")) +
    ggpubr::stat_compare_means(label.x = 1.2, label.y.npc = "top", size = 3.5,
                               vjust = 0.3, label = "p.format") +
    theme(plot.title = element_text(hjust = 0.5, size = 18),
          axis.line = element_line(colour = "black"),
          axis.text.x = element_text(angle = 90),
          axis.ticks = element_line(colour = "black"))
}

plot_newman_gene(df, "NFKBIA", positive_only = TRUE)

In [None]:
plot_newman_gene(df, "GZMB")

In [None]:
plot_newman_gene(df, "CXCR4", positive_only = TRUE)

In [None]:
# Keep dataset-specific copies; the generic names `mtx` and `metadata` are reused by later sections.
mtx_newman  <- mtx4

In [None]:
md_newman  <- metadata

In [None]:
# Sanity check: metadata rows and matrix columns are in the same sample order.
md_newman$sample_id == colnames(mtx_newman)

# GSE123658 Transimmunom

In [None]:
# Sample metadata for GSE123658 (Excel sheet).
md_transimmunome  <- read_xlsx("../../240617_VN_Diabetes_V06/data/published_data/Transimmunome_2018/transimmunome_metadata.xlsx")

In [None]:
# Gene-level read counts for GSE123658.
# NOTE(review): this path uses a different root (../data/...) than the metadata path above - confirm both resolve.
mtx_transimmunom  <- read_delim("../data/published_data/Transimmunome_2018/GSE123658_read_counts.gene_level.txt")

In [None]:
# Display the raw table.
mtx_transimmunom

In [None]:
library("AnnotationDbi")
library("org.Hs.eg.db")

In [None]:
# Translate the Ensembl identifiers stored in the `Samples` column into gene
# symbols (first symbol when several map), as a two-column table Samples / Gene.
gene_annotations <- org.Hs.eg.db %>%
  mapIds(keys = mtx_transimmunom$Samples,
         column = "SYMBOL",
         keytype = "ENSEMBL",
         multiVals = "first") %>%
  as.data.frame() %>%
  rownames_to_column("Samples")
names(gene_annotations)[2] <- "Gene"

In [None]:
head(gene_annotations)

In [None]:
# Attach the gene symbol to every row; the join key is stated explicitly
# instead of relying on the implicit "all shared columns" rule.
mtx_transimmunom2 <- mtx_transimmunom %>%
  left_join(gene_annotations, by = "Samples")

In [None]:
# Keep rows with a symbol and sum all sample columns per gene symbol.
# `across(everything(), sum)` replaces the superseded `summarise_all()`.
mtx_transimmunom2 <- mtx_transimmunom2 %>%
  dplyr::filter(!is.na(Gene)) %>%
  dplyr::select(-Samples) %>%
  group_by(Gene) %>%
  summarise(across(everything(), sum), .groups = "drop")

In [None]:
mtx_transimmunom2 %>% dplyr::filter(Gene == "BTN3A2")

In [None]:
# Cache the gene-symbol-level table on disk.
write.csv(mtx_transimmunom2, "../data/published_data/Transimmunome_2018/GSE123658_processed.csv")

In [None]:
# Reload the cached table.
# NOTE(review): this reads from a different directory than the write above - confirm it is the same file.
mtx_transimmunom2  <- read.csv("../../240617_VN_Diabetes_V06/data/published_data/Transimmunome_2018/GSE123658_processed.csv")

In [None]:
# Drop the row-name column that write.csv() stored and read.csv() returned as `X`.
mtx_transimmunom2$X  <- NULL

In [None]:
mtx_transimmunom2  %>% as.data.frame

In [None]:
# From here on `mtx_transimmunom` refers to the gene-symbol-level table.
mtx_transimmunom  <- mtx_transimmunom2

Testing our genes. 

In [None]:
# Long format: one row per gene x patient.
df4 <- mtx_transimmunom2 %>%
  as.data.frame() %>%
  pivot_longer(!Gene, names_to = "Patient_ID", values_to = "expression")

In [None]:
# Strip the "ID:" prefix from the metadata identifiers.
md_transimmunome$Patient_ID <- gsub(md_transimmunome$Patient_ID, pattern = "ID\\:", replacement = "")

In [None]:
# Remove only a LEADING "X" (presumably added by read.csv() to column names
# that are not syntactically valid - confirm). The previous unanchored pattern
# removed every "X" anywhere in an identifier.
df4$Patient_ID <- sub("^X", "", df4$Patient_ID)

In [None]:
# Attach sample metadata; the join key is explicit.
df5 <- left_join(df4, md_transimmunome, by = "Patient_ID")

In [None]:
# Box-and-jitter plot of one gene by disease group from a long table.
#   data       long table with columns `Gene`, `Disease` and `expression`
#   gene_name  gene symbol to plot, as a string
# Returns the ggplot object.
plot_transimmunome_gene <- function(data, gene_name) {
  data %>%
    dplyr::filter(.data$Gene == !!gene_name) %>%
    ggplot(aes(x = Disease, y = expression)) +
    geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0) +
    geom_boxplot(outlier.shape = NA) +
    # binaxis/stackdir belong to geom_dotplot(); geom_jitter() does not accept
    # them, so they are no longer passed here.
    geom_jitter(position = position_jitter(width = 0.1, height = 0.01),
                size = 2, aes(color = Disease)) +
    theme_classic() +
    xlab("") + ylab("Value") +
    scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
    scale_color_manual(values = c("dodgerblue", "indianred3")) +
    ggpubr::stat_compare_means(label.x = 1.2, label.y.npc = "top", size = 3.5,
                               vjust = 0.3, label = "p.format") +
    theme(plot.title = element_text(hjust = 0.5, size = 18),
          axis.line = element_line(colour = "black"),
          axis.text.x = element_text(angle = 90),
          axis.ticks = element_line(colour = "black"))
}

plot_transimmunome_gene(df5, "NFKBIA")

In [None]:
plot_transimmunome_gene(df5, "CXCR4")

In [None]:
plot_transimmunome_gene(df5, "GZMB")

In [None]:
# Back to wide format: one row per patient, one column per gene (missing -> 0).
df6 <- pivot_wider(df5, names_from = "Gene", values_from = "expression", values_fill = 0)

In [None]:
# Gene x patient matrix: patients become row names, the metadata columns are
# removed, then the table is transposed.
mtx_6 <- df6 %>%
  column_to_rownames("Patient_ID") %>%
  dplyr::select(-geo_id, -Disease) %>%
  as.matrix() %>%
  t()

In [None]:
# Design table holding only the disease label, in the same patient order.
df6_md <- dplyr::select(df6, Disease)

Test the differential expression by DESeq2. 

In [None]:
library(DESeq2)


# Create a DESeq dataset: genes x patients counts, disease as the only design term
dds <- DESeqDataSetFromMatrix(countData = mtx_6,
                              colData = df6_md,
                              design= ~ Disease) 

# Size factors are needed first so the filter below can use normalized counts.
dds <- estimateSizeFactors(dds)
# Keep genes with a normalized count of at least 5 in at least 3 samples.
idx <- rowSums(counts(dds, normalized=TRUE) >= 5 ) >= 3


# Apply the expression filter, then run the DESeq2 algorithm,
# which will give us the differentially expressed genes
dds <- dds[idx,]
dds <- DESeq(dds)

# Access and evaluate the results
res <- results(dds)
res

## Order results based on the adjusted p.value and show the first 30 genes
resOrdered.pval <- res[order(res$padj),]
rownames(resOrdered.pval)[1:30]

In [None]:
# Re-estimate size factors on the filtered dataset.
dds <- estimateSizeFactors(dds)

In [None]:
# Size-factor-normalized counts; reshaped below for the per-gene plots.
mtx_transimmunome  <- counts(dds, normalized=TRUE)

In [None]:
# Quick look at a single gene split by disease group.
plotCounts(dds, gene="GZMB", intgroup="Disease") 

In [None]:
# Genes of interest to look up in this dataset.
genes  <- c('BTN3A2','CENPK','JUN','CRIP1','FOS','FOSB','HLA-DQB1','IFITM2','IFITM3','CCL5','GSTM1',
'PPP1R15A','GBP5','GZMA','HLA-DRB1','IER2','GADD45B','DUSP2','EPSTI1','GZMB','NFKBIA','CD69',
'TSC22D3','ID3','IER5','JUNB','MYC','PASK','TNF','TUBB4B','AOAH','BTG2','C11orf98','CCNL1',
'CST7','CX3CR1','FGFBP2','GIMAP4','GIMAP5','GIMAP7','GZMH','HLA-C','ID2','MTRNR2L8','MYADM',
'NEU1','PRF1','RGCC','SERTAD1','SLC2A3','SRSF7','TNFSF10','ZFP36','ZNF683','RBM3','CXCR4',
'CDK11A','DDIT4','MYOM2','TSPO','TYROBP','ANXA2','ERAP2','KLRF1','MTRNR2L12','RCC1','RNASET2','ZFP36L2')

In [None]:
# One-column matrix of log2 fold changes for the genes of interest that are
# present in the DESeq2 results.
select_genes <- res %>%
  as.data.frame() %>%
  rownames_to_column("Gene") %>%
  filter(Gene %in% genes) %>%
  dplyr::select(Gene, log2FoldChange) %>%
  column_to_rownames("Gene") %>%
  as.matrix

In [None]:
options(repr.plot.width = 20, repr.plot.height = 20)
# TRUE/FALSE instead of T/F: T and F are ordinary variables that can be reassigned.
pheatmap(select_genes, cluster_rows = TRUE, show_rownames = TRUE,
         cluster_cols = FALSE, scale = 'column', cellwidth = 10, cellheight = 10,
         width = 5, height = 7, color = colorRampPalette(c("navy", "white", "red"))(50))

In [None]:
# Long table of normalized counts (gene x patient) with sample metadata attached.
ti <- as.data.frame(mtx_transimmunome) %>%
  rownames_to_column("gene") %>%
  pivot_longer(!gene, names_to = "Patient_ID", values_to = "value") %>%
  left_join(md_transimmunome)

In [None]:
# The 100 top-ranked genes by adjusted p-value.
rownames(resOrdered.pval)[1:100]

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)

# Box-and-jitter plot of the normalized counts of one gene by disease group.
# Reads the long table `ti` (columns `gene`, `Disease`, `value`).
#   gene_name  gene symbol to plot; also used as the plot title
# Returns the ggplot object.
plot_ti_gene <- function(gene_name) {
  ti %>%
    dplyr::filter(.data$gene == !!gene_name) %>%
    ggplot(aes(x = Disease, y = value)) +
    geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0) +
    geom_boxplot(outlier.shape = NA) +
    # binaxis/stackdir belong to geom_dotplot(); geom_jitter() does not accept
    # them, so they are no longer passed here.
    geom_jitter(position = position_jitter(width = 0.1, height = 0.01),
                size = 2, aes(color = Disease)) +
    theme_classic() +
    xlab("") + ylab("Value") +
    ggtitle(gene_name) +
    scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
    scale_color_manual(values = c("dodgerblue", "indianred3")) +
    ggpubr::stat_compare_means(label.x = 1.2, label.y.npc = "top", size = 3.5,
                               vjust = 0.3, label = "p.format") +
    theme(plot.title = element_text(hjust = 0.5, size = 18),
          axis.line = element_line(colour = "black"),
          axis.text.x = element_text(angle = 90),
          axis.ticks = element_line(colour = "black"))
}

# head() instead of [1:100]: with fewer than 100 result rows, [1:100] would
# yield NA gene names.
for (i in head(rownames(resOrdered.pval), 100)) {
  print(plot_ti_gene(i))
}

In [None]:

# Genes removed by the expression filter are absent from `ti` and would give
# empty plots; report them and plot only the genes that are present.
genes_missing <- setdiff(genes, ti$gene)
if (length(genes_missing) > 0) {
  message("Not in the normalized count matrix, skipped: ",
          paste(genes_missing, collapse = ", "))
}
for (i in intersect(genes, ti$gene)) {
  print(plot_ti_gene(i))
}

Let's check the expression of BTN3A2. 

In [None]:
## BTN3A2

In [None]:
# Per-patient BTN3A2 values as a two-column table (Patient_ID, BTN3A2).
# The earlier t()-based version transposed a data frame that also held the
# character `Gene` column, which turned every expression value into a string;
# pivot_longer() keeps the values numeric.
# NOTE(review): Patient_ID is taken verbatim from the column names of
# mtx_transimmunom2; if those carry a leading "X" (stripped for df4 earlier in
# this notebook) they will not match the metadata identifiers - confirm.
expression_matrix <- mtx_transimmunom2 %>%
  as.data.frame() %>%
  dplyr::filter(Gene == "BTN3A2") %>%
  dplyr::select(-Gene) %>%
  pivot_longer(everything(), names_to = "Patient_ID", values_to = "BTN3A2") %>%
  as.data.frame()

In [None]:
# Strip the "ID:" prefix from the metadata identifiers (no effect if already stripped).
md_transimmunome$Patient_ID  <- gsub(md_transimmunome$Patient_ID , pattern = "ID\\:", replacement = "")

In [None]:
# Element-wise check that metadata and expression table list patients in the same order.
# NOTE(review): expression_matrix$Patient_ID comes from column names and may still
# carry a leading "X" - confirm these compare as TRUE.
md_transimmunome$Patient_ID == expression_matrix$Patient_ID

In [None]:
# Attach BTN3A2 expression to the sample metadata (joined on the shared columns).
expression_matrix2  <- left_join(md_transimmunome, expression_matrix)

In [None]:
# SRA run table: maps sequencing runs to GEO samples and carries age / sex.
sra_metadata <- read_delim("../data/published_data/Transimmunome_2018/SraRunTable_metadata.txt")

In [None]:
# Keep only the needed fields; `Sample Name` is renamed to geo_id.
sra_metadata2 <- dplyr::select(sra_metadata, Run, Age, sex, geo_id = `Sample Name`)

In [None]:
expression_matrix3 <- expression_matrix2 %>% left_join(sra_metadata2)

In [None]:
# Genotype table with one row per subject entry.
hla_transimmunome <- read_delim("../data/published_data/Transimmunome_2018/genotypes.tsv")

In [None]:
# `subject` has the form <Run>_<Read>; split it into its two parts.
hla_transimmunome <- separate(hla_transimmunome, subject, into = c("Run", "Read"), sep = "_")

In [None]:
# Keep the rows of read 1 and attach expression plus metadata.
expression_matrix_4 <- left_join(
  dplyr::filter(hla_transimmunome, Read == 1),
  expression_matrix3
)

In [None]:
write.csv(expression_matrix_4, "../data/published_data/Transimmunome_2018/BTN3A2_expression_by_HLA_transimmunome.csv")

In [None]:
# Total counts per patient: column sums over everything except the first (Gene) column.
transimmunome_all_counts <- colSums(mtx_transimmunom2[-1])

In [None]:
write.csv(transimmunome_all_counts, "../data/published_data/Transimmunome_2018/transimmunome_allcounts_for_normalization.csv")

In [None]:
# Reload from disk; this replaces the named vector with the table read by read.csv().
transimmunome_all_counts <- read.csv("../data/published_data/Transimmunome_2018/transimmunome_allcounts_for_normalization.csv")

# GSE10586 Jailwala

In [None]:
# NOTE(review): `file_paths` and `process_dataset` are not defined in the visible
# part of this notebook - this cell looks like a leftover; confirm it is needed.
# seq_along() instead of 1:length(): the latter iterates over c(1, 0) when
# `file_paths` is empty.
seurats <- map(seq_along(file_paths), .f = process_dataset)

In [None]:
library(GEOquery)

In [None]:
# Download the GSE10586 series from GEO (series matrix plus platform annotation).
# This overwrites the `gset` object of the previous dataset.
gset <- getGEO("GSE10586", GSEMatrix =TRUE, getGPL=TRUE, AnnotGPL=TRUE)

In [None]:
# List the available per-sample annotation fields.
varLabels(gset$GSE10586_series_matrix.txt.gz)

In [None]:
# Explore individual annotation fields to find the disease / clinical information.
gset$GSE10586_series_matrix.txt.gz$`characteristics_ch1.4`

In [None]:
gset$GSE10586_series_matrix.txt.gz$`characteristics_ch1.5`

In [None]:
gset$GSE10586_series_matrix.txt.gz$title

In [None]:
# Tabulate samples by the first character of their title.
substr(gset$GSE10586_series_matrix.txt.gz$title,1,1)  %>% table

In [None]:
gset$GSE10586_series_matrix.txt.gz$supplementary_file

In [None]:
# Expression values (rows = features, columns = samples).
exprs(gset$GSE10586_series_matrix.txt.gz)

In [None]:
# Check whether any row name contains "FOXP3".
grep("FOXP3", rownames(exprs(gset$GSE10586_series_matrix.txt.gz)))

In [None]:
library("hgu133a.db")

In [None]:
# Map every probe ID (row name of the expression matrix) to symbol, Entrez ID
# and gene name. The call is namespace-qualified because a bare `select()`
# resolves to whichever of dplyr / AnnotationDbi was attached last, and the
# key type is stated explicitly.
gene_annotations <- AnnotationDbi::select(
  hgu133a.db,
  keys = rownames(exprs(gset$GSE10586_series_matrix.txt.gz)),
  columns = c("SYMBOL", "ENTREZID", "GENENAME"),
  keytype = "PROBEID"
)

In [None]:
# How many probe rows have no gene symbol?
table(is.na(gene_annotations$SYMBOL))

In [None]:
# List the available per-sample annotation fields again.
varLabels(gset$GSE10586_series_matrix.txt.gz)

In [None]:
gset$GSE10586_series_matrix.txt.gz$`subject id:ch1`

In [None]:
# Stores the length of the `$su` field in `subject_id`.
# NOTE(review): `$su` is presumably an abbreviated field name relying on partial
# matching; several fields may start with "su" - confirm which one is meant.
# `subject_id` is not used again in the visible part of the notebook.
subject_id = gset$GSE10586_series_matrix.txt.gz$su  %>% length

In [None]:
# Per-sample annotation table for GSE10586 (replaces the previous `metadata`).
# A local alias avoids repeating the long series-matrix accessor.
metadata <- local({
  es <- gset$GSE10586_series_matrix.txt.gz
  data.frame(
    geo_id = es$geo_accession,
    disease = es$title,
    hla_risk = es$`HLA risk:ch1`,
    glucose = es$`Glucose:ch1`,
    hba1c = es$`HbA1c:ch1`
  )
})


In [None]:
# Samples whose title contains "Healthy" are controls; everything else is "Dia".
metadata$Disease <- ifelse(grepl("Healthy", metadata$disease), "Ctrl", "Dia")

In [None]:
metadata

In [None]:
gene_annotations %>% filter(SYMBOL == "BTN3A2")

In [None]:
# Probe-level expression table with gene annotation attached. The join key is
# explicit. NOTE(review): a probe annotated with several symbols appears once
# per symbol after this join - confirm that is intended.
jail_df <- as.data.frame(exprs(gset$GSE10586_series_matrix.txt.gz)) %>%
  rownames_to_column("PROBEID") %>%
  left_join(gene_annotations, by = "PROBEID")

In [None]:
jail_df %>% dplyr::filter(!is.na(SYMBOL))

In [None]:
# Collapse probes to genes by summing every sample column per symbol.
# `across(everything(), sum)` replaces the superseded `summarise_all()`.
jail_df_sum <- jail_df %>%
  dplyr::filter(!is.na(SYMBOL)) %>%
  dplyr::select(-ENTREZID, -GENENAME, -PROBEID) %>%
  group_by(SYMBOL) %>%
  summarise(across(everything(), sum), .groups = "drop")

In [None]:
jail_df_sum

In [None]:
# Plotting table: start from the sample metadata.
df <- metadata

In [None]:
# Gene x sample table with symbols as row names. It is built BEFORE the
# alignment check below, which previously referenced it before it existed.
jail_df_sum2 <- jail_df_sum %>% column_to_rownames("SYMBOL")

In [None]:
# Sanity check: metadata rows and expression columns are in the same sample order.
df$geo_id == colnames(jail_df_sum2)

In [None]:
which((jail_df_sum$SYMBOL) == "NFKBIA")


In [None]:
jail_df_sum2[which((jail_df_sum$SYMBOL) == "NFKBIA"), ]

In [None]:
# Append the per-sample value of each gene of interest to the plotting table.
# Rows are selected by their symbol row name; as.numeric() flattens the
# one-row data frame into a plain vector.
df$NFKBIA <- as.numeric(jail_df_sum2[rownames(jail_df_sum2) == "NFKBIA", ])

In [None]:
df$GZMB <- as.numeric(jail_df_sum2[rownames(jail_df_sum2) == "GZMB", ])

In [None]:
df$CXCR4 <- as.numeric(jail_df_sum2[rownames(jail_df_sum2) == "CXCR4", ])

In [None]:
df$BTN3A2 <- as.numeric(jail_df_sum2[rownames(jail_df_sum2) == "BTN3A2", ])

In [None]:
# Box-and-jitter plot of one gene (a column of `data`) by disease group,
# with a p-value label from ggpubr::stat_compare_means().
#   data          data frame with a `Disease` column and the gene column
#   gene          name of the gene column, as a string
#   positive_only if TRUE, keep only samples whose value for `gene` is > 0
# Returns the ggplot object.
plot_jailwala_gene <- function(data, gene, positive_only = FALSE) {
  if (positive_only) {
    data <- dplyr::filter(data, .data[[gene]] > 0)
  }
  ggplot(data, aes(x = Disease, y = .data[[gene]])) +
    geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0) +
    geom_boxplot(outlier.shape = NA) +
    # binaxis/stackdir belong to geom_dotplot(); geom_jitter() does not accept
    # them, so they are no longer passed here.
    geom_jitter(position = position_jitter(width = 0.1, height = 0.01),
                size = 2, aes(color = Disease)) +
    theme_classic() +
    xlab("") + ylab("Value") +
    scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
    scale_color_manual(values = c("dodgerblue", "indianred3")) +
    ggpubr::stat_compare_means(label.x = 1.2, label.y.npc = "top", size = 3.5,
                               vjust = 0.3, label = "p.format") +
    theme(plot.title = element_text(hjust = 0.5, size = 18),
          axis.line = element_line(colour = "black"),
          axis.text.x = element_text(angle = 90),
          axis.ticks = element_line(colour = "black"))
}

plot_jailwala_gene(df, "NFKBIA", positive_only = TRUE)

In [None]:
plot_jailwala_gene(df, "GZMB")

In [None]:
plot_jailwala_gene(df, "CXCR4")

In [None]:
plot_jailwala_gene(df, "BTN3A2")

In [None]:
# Dataset-specific copies: gene x sample matrix (symbols as row names) and the
# metadata table including the gene columns added above.
mtx_jailwala  <- jail_df_sum  %>% column_to_rownames("SYMBOL")  %>% as.matrix()
md_jailwala  <- df

In [None]:
mtx_jailwala

# Kallionpaa bulk

In [None]:
# Aggregated bulk count matrix; the first (unnamed) column is read as `...1`
# and is used as the gene identifier below.
kallion_matrix  <- read_csv("../../../DATA_scRNAseq/Analysis of previously published data/068_Kallionpaa_diabimmune/bulk_count_matrix/DIA_bulk_agg.csv")

In [None]:
# NOTE(review): read_csv() returns a tibble, and setting row names on a tibble
# is deprecated; the identifiers are re-attached after subsetting further
# below - confirm this line is still needed.
rownames(kallion_matrix)  <- kallion_matrix$`...1`

In [None]:
# Remove rows whose gene identifier matches any of the exclusion patterns.
# NOTE(review): several patterns are broad - "^MT", "^AL" and "^AC" match every
# symbol starting with those letters (a symbol such as ACTB would match "^AC"),
# and "\\-[A1234567890]" matches any symbol with "-A" or "-<digit>" in it.
# Confirm that this is the intended scope.
mtx <- local({
  gene_names <- kallion_matrix$`...1`
  exclude_patterns <- c("^MIR", "^MT", "^RP[LS]", "^AL", "^AC",
                        "\\.[1234567890]", "\\-[A1234567890]", "LINC", "^RNU")
  # One logical vector per pattern, OR-ed together.
  excluded <- Reduce(`|`, lapply(exclude_patterns, grepl, x = gene_names))
  kallion_matrix[!excluded, ]
})

In [None]:
# Count matrix with gene identifiers as row names. The table is converted to a
# matrix first and the row names are set on the matrix, because setting row
# names on a tibble is deprecated and they are dropped on subsetting.
mtx2 <- as.matrix(mtx[, -1])
rownames(mtx2) <- mtx$`...1`

In [None]:
# Sample sheet parsed from the column names. "10_1_" / "10_2_" are rewritten to
# "10.1_" / "10.2_" first so that splitting on "_" yields the same number of
# fields for every sample; the first field is discarded.
md <- data.frame(sample = colnames(mtx2)) %>%
  mutate(sample = gsub("10_1_", "10.1_", sample)) %>%
  mutate(sample = gsub("10_2_", "10.2_", sample)) %>%
  separate(
    sample,
    into = c(NA, "Sample_ID", "Patient_ID", "TimePoint", "CellType", "Index"),
    sep = "_",
    remove = FALSE
  ) %>%
  mutate(Disease = ifelse(grepl("Control", Patient_ID), "Ctrl", "Dia"))

Run DESeq2 workflow. 

In [None]:
library(DESeq2)

In [None]:
# Create a DESeq dataset
dds <- DESeqDataSetFromMatrix(countData = mtx2,
                              colData = md,
                              design= ~ CellType + TimePoint + Disease) 

In [None]:
keep <- rowSums(counts(dds) >= 10) >= 3
dds <- dds[keep,]

In [None]:
# Run the DESeq2 algorithm, which will give us the differentially expressed genes
dds <- DESeq(dds)
dds

In [None]:
res <- results(dds)

In [None]:
res <- results(dds, contrast=c("Disease", "Dia","Ctrl"))

In [None]:
counts_all <- counts(dds, normalized=TRUE)
counts_all <- counts_all %>% as.data.frame 

In [None]:
# Revert "10.1_" / "10.2_" to the original "10_1_" / "10_2_" so that md$sample
# matches the matrix column names again. fixed = TRUE matches the literal
# string "10."; as a regular expression the "." would match ANY character
# following "10".
md$sample <- gsub("10.", "10_", md$sample, fixed = TRUE)

In [None]:
# Dataset-specific copies of the normalized counts and the sample sheet.
mtx_kallionpaa  <- counts_all

In [None]:
md_kallionpaa  <- md

In [None]:
# Sanity check: sample sheet rows and count columns are in the same order.
md$sample == colnames(counts_all)

In [None]:
# Long table of normalized counts with the sample sheet attached.
# At this point md$sample holds the ORIGINAL column names (the dotted form was
# reverted above and checked against colnames(counts_all)), so the join uses
# the unmodified names. Re-dotting "10_1_" / "10_2_" here left those samples
# without a metadata match.
counts_all2 <- counts_all %>%
  rownames_to_column("gene") %>%
  pivot_longer(!gene, names_to = "sample") %>%
  left_join(md, by = "sample")
counts_all2

In [None]:
# HLA genotype calls, one row per subject entry.
hla <- read_delim("../data/published_data/Kallionpaa_2019/genotypes.tsv")

In [None]:
# Derive a `sample` key from `subject` ("10_1_" / "10_2_" rewritten with a dot
# so the "_" split yields five fields) and split it into its components.
hla2 <- hla %>%
  mutate(sample = gsub("10_2_", "10.2_", gsub("10_1_", "10.1_", subject))) %>%
  separate(
    sample,
    into = c("Sample_ID", "Patient_ID", "TimePoint", "CellType", "Index"),
    sep = "_",
    remove = FALSE
  )

In [None]:
# One genotype summary row per patient, taken from the first PBMC entry:
#   dq2_8  "DQ2_8", "DQ2", "DQ8" or "Other", from the DQA1 / DQB1 allele pairs
#   c7_b8  combined C*07:01 / B*08:01 carrier status (hom / het / Other)
# ungroup() is called right after slice_head() so the result is not left
# grouped by Patient_ID; later steps modify Patient_ID, which dplyr refuses
# for a grouping column.
hla3 <- hla2 %>%
  dplyr::filter(CellType == "PBMC") %>%
  group_by(Patient_ID) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  mutate(
    dq2 = ifelse((DQA11 == "DQA1*05:01:01" | DQA12 == "DQA1*05:01:01") &
                   (DQB11 == "DQB1*02:01:01" | DQB12 == "DQB1*02:01:01"),
                 "DQ2", "Other"),
    dq8 = ifelse((DQA11 == "DQA1*03:01:01" | DQA12 == "DQA1*03:01:01") &
                   (DQB11 == "DQB1*03:02:01" | DQB12 == "DQB1*03:02:01"),
                 "DQ8", "Other")
  ) %>%
  mutate(
    dq2_8 = ifelse(dq2 == "DQ2" & dq8 == "DQ8", "DQ2_8",
                   ifelse(dq2 == "DQ2", "DQ2",
                          ifelse(dq8 == "DQ8", "DQ8", "Other")))
  ) %>%
  mutate(
    c7_01 = ifelse(C1 == "C*07:01:01" & C2 == "C*07:01:01", "c7_01_hom",
                   ifelse(C1 == "C*07:01:01" | C2 == "C*07:01:01", "c7_01_het",
                          "Other")),
    b8_01 = ifelse(B1 == "B*08:01:01" & B2 == "B*08:01:01", "b8_01_hom",
                   ifelse(B1 == "B*08:01:01" | B2 == "B*08:01:01", "b8_01_het",
                          "Other"))
  ) %>%
  mutate(
    # Priority: both homozygous > single homozygous > both heterozygous >
    # single heterozygous > Other.
    c7_b8 = ifelse(b8_01 == "b8_01_hom" & c7_01 == "c7_01_hom", "c7_b8_hom",
            ifelse(c7_01 == "c7_01_hom", "c7_01_hom",
            ifelse(b8_01 == "b8_01_hom", "b8_01_hom",
            ifelse(b8_01 == "b8_01_het" & c7_01 == "c7_01_het", "c7_b8_het",
            ifelse(b8_01 == "b8_01_het", "b8_01_het",
            ifelse(c7_01 == "c7_01_het", "c7_01_het", "Other"))))))
  ) %>%
  dplyr::select(Patient_ID, dq2_8, c7_b8)

In [None]:
# Harmonise control identifiers ("Ctrll" / "Ctrl" -> "Control") so they match
# the sample sheet. ungroup() comes first: if hla3 is still grouped by
# Patient_ID, dplyr refuses to modify the grouping column.
hla3 <- hla3 %>%
  ungroup() %>%
  mutate(Patient_ID = gsub(Patient_ID, pattern = "Ctrll", replacement = "Control")) %>%
  mutate(Patient_ID = gsub(Patient_ID, pattern = "Ctrl", replacement = "Control")) %>%
  unique()

In [None]:
# Attach the per-patient HLA summary to the long count table.
counts_all3 <- counts_all2 %>% left_join(hla3, by = "Patient_ID")

In [None]:
#write.csv(counts_all3, "table_shiny_41BB_2.csv")

options(repr.plot.width = 5, repr.plot.height = 4)

# Box-and-jitter plot of one gene across cell type x disease combinations.
# Reads the long table `counts_all3`.
#   gene2  gene identifier to plot
# Returns the ggplot object.
plot_bulk <- function(gene2) {
  counts_all3 %>%
    dplyr::filter(gene == gene2) %>%
    mutate(CellType_Disease = paste(CellType, Disease)) %>%
    ggplot(aes(x = CellType_Disease, y = value)) +
    geom_boxplot(outlier.shape = NA, aes(color = Disease)) +
    geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0) +
    geom_jitter(shape = 16, position = position_jitter(0.05), aes(colour = Disease))
}

In [None]:

# Plot one gene by DQ2/DQ8 status, one facet per cell type; points are
# coloured by disease. Reads the long table `counts_all3` and sets the
# notebook plot size as a side effect.
#   gene2  gene identifier to plot
# Returns the ggplot object.
plot_bulk2 <- function(gene2) {
  options(repr.plot.width = 7, repr.plot.height = 4)
  counts_all3 %>%
    dplyr::filter(gene == gene2) %>%
    mutate(CellType_Disease = paste(CellType, Disease)) %>%
    ggplot(aes(x = dq2_8, y = value)) +
    geom_boxplot(outlier.shape = NA, aes(color = dq2_8)) +
    geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0) +
    geom_jitter(shape = 16, position = position_jitter(0.05), aes(colour = Disease)) +
    facet_wrap(~CellType)
}

In [None]:
plot_bulk2("BTN3A2")

In [None]:
# Plot one gene by combined C*07:01 / B*08:01 status, one facet per cell type;
# points are coloured by disease. Reads the long table `counts_all3` and sets
# the notebook plot size as a side effect.
#   gene2  gene identifier to plot
# Returns the ggplot object.
plot_bulk3 <- function(gene2) {
  options(repr.plot.width = 7, repr.plot.height = 4)
  counts_all3 %>%
    dplyr::filter(gene == gene2) %>%
    mutate(CellType_Disease = paste(CellType, Disease)) %>%
    ggplot(aes(x = c7_b8, y = value)) +
    geom_boxplot(outlier.shape = NA) +
    ggnewscale::new_scale_colour() +
    geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0) +
    geom_jitter(shape = 16, position = position_jitter(0.1), size = 2, aes(colour = Disease)) +
    facet_wrap(~CellType)
}

In [None]:
# Cache the annotated long count table on disk.
write.csv(counts_all3, "../data/published_data/Kallionpaa_2019/bulk_seq_counts_with_md.csv")

In [None]:
# Reload the cached table (write.csv() also stored the row names as a first column).
counts_all3  <- read_csv( "../data/published_data/Kallionpaa_2019/bulk_seq_counts_with_md.csv")

In [None]:
plot_bulk3("BTN3A2")

In [None]:
plot_bulk("SOCS3")

In [None]:
# Ad-hoc version of the plots above for a single gene.
# `filter(gene == gene)` compared the `gene` column with itself, which is TRUE
# for every row, so no gene was selected. The gene is now named explicitly.
# TODO(review): confirm which gene was intended here; BTN3A2 is assumed.
gene_to_plot <- "BTN3A2"
df <- counts_all3 %>% dplyr::filter(.data$gene == !!gene_to_plot)
df %>%
  mutate(CellType_Disease = paste(CellType, Disease)) %>%
  ggplot(aes(x = Disease, y = value)) +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0) +
  geom_jitter(shape = 16, position = position_jitter(0.05), aes(colour = CellType))

In [None]:
options(repr.plot.width = 5, repr.plot.height = 4)
df %>%
  mutate(CellType_Disease = paste(CellType, Disease)) %>%
  ggplot(aes(x = CellType_Disease, y = value)) +
  geom_boxplot(outlier.shape = NA, aes(color = Disease)) +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0) +
  geom_jitter(shape = 16, position = position_jitter(0.05), aes(colour = Disease))

In [None]:
# Plot one gene by disease group, points coloured by cell type.
# Reads the long table `counts_all3`.
#   gene  gene identifier to plot
# Returns the ggplot object.
# NOTE(review): this definition replaces the earlier plot_bulk2() (the DQ2/DQ8
# plot) for the rest of the session - confirm that is intended.
# Fixes over the previous body, which could not run:
#   * the filter compared the `gene` column with itself; the argument is now
#     unquoted with `!!` so the column is compared with the requested gene
#   * the unfinished `group_by(Patient_ID) %>% summarise(sum = )` step, which
#     was also not piped into ggplot(), has been removed.
#     TODO(review): add the per-patient aggregation back once it is defined.
plot_bulk2 <- function(gene) {
  counts_all3 %>%
    dplyr::filter(.data$gene == !!gene) %>%
    mutate(CellType_Disease = paste(CellType, Disease)) %>%
    ggplot(aes(x = Disease, y = value)) +
    geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0) +
    geom_jitter(shape = 16, position = position_jitter(0.05), aes(colour = CellType))
}

In [None]:
options(repr.plot.width = 5, repr.plot.height = 4)

plot_bulk2("BTN3A2")

# HPAP

Data from HPAP were preprocessed earlier in part 20 of this manual. 

In [None]:
# Load the preprocessed HPAP Seurat object.
hpap <- readRDS("../../240617_VN_Diabetes_V06/data/published_data/HPAP/hpap_5p_t_nk_filt.rds")

In [None]:
# TRUE instead of T: T is an ordinary variable that can be reassigned.
DimPlot(hpap, raster = TRUE)

In [None]:
# Re-cluster at resolution 1.
hpap <- FindClusters(hpap, resolution = 1)

In [None]:
DimPlot(hpap, raster = TRUE, label = TRUE)

Add patient IDs. 

In [None]:
# The patient identifier is the first seven characters of the `source` label.
hpap$Patient_ID  <- substr(hpap$source, 1,7)

In [None]:
# Number of cells per patient.
hpap$Patient_ID  %>% table

Read and merge metadata. 

In [None]:
# Donor-level metadata table.
patient_metadata <- read_delim("../../../DATA_scRNAseq/Analysis_of_previously_published_data//069_HPANCDB_Immune_TCRseq/hpap_medatata.csv")

In [None]:
# Keep the needed fields, derive Patient_ID by removing "-" from the donor ID,
# and label donors whose diagnosis contains "control" as "Ctrl", all others "Dia".
patient_metadata2 <- patient_metadata %>%
  dplyr::select(donor_ID, gender, age_years, clinical_diagnosis) %>%
  mutate(
    Patient_ID = gsub("-", "", donor_ID),
    Disease = ifelse(grepl("control", clinical_diagnosis), "Ctrl", "Dia")
  )

In [None]:
# Attach donor metadata to the per-cell metadata (joined on the shared columns).
md  <- hpap@meta.data  %>% left_join(patient_metadata2)

In [None]:
# Restore the cell names as row names before the table is written back.
rownames(md)  <- colnames(hpap)

In [None]:
# Replace the object's metadata with the joined table.
hpap@meta.data  <- md
#rownames(hpap@meta.data)  <- colnames(hpap)

In [None]:
DimPlot(hpap, raster = T, label = T, group.by = "Disease")

In [None]:
DimPlot(hpap, raster = T, label = T, group.by = "source")

In [None]:
hpap@meta.data  <- hpap@meta.data %>% 
separate(source, into = c(NA, "Tissue", NA, NA, "Run", NA, NA), remove = F, sep = "_")

In [None]:
hpap$Tissue2  <- ifelse(grepl(hpap$Tissue, pattern = "LN"),"LN","SPL")

Create an aggregated matrix.

In [None]:
# Average expression per patient for a fixed panel of genes.
avgexp <- AverageExpression(
  hpap,
  assay = "RNA",
  features = c(
    "BTN3A2", "IL32", "CXCR4", "NFKBIA", "GADD45B",
    "IER2", "GZMB", "GIMAP5", "GIMAP7", "RBM3"
  ),
  group.by = "Patient_ID",
  return.seurat = FALSE
)

# Transpose so that patients are rows and genes are columns, then attach the
# donor metadata.
expression_matrix <- as.data.frame(t(avgexp$RNA)) %>%
  rownames_to_column("Patient_ID") %>%
  left_join(patient_metadata2)

In [None]:
hpap_ln  <- subset(hpap, Tissue2 == "LN")
hpap_spl  <- subset(hpap, Tissue2 == "SPL")

In [None]:
Idents(hpap_ln)  <- hpap_ln$Disease
Idents(hpap_spl)  <- hpap_spl$Disease


Calculate log fold changes. 

In [None]:
fc_hpap_ln  <- FoldChange(hpap_ln,  `ident.1` = "Dia", `ident.2` = "Ctrl",
                      slot = "data")
    fc_hpap_ln$Study_CellType = 'HPAP LN'

In [None]:
    # Rank-bin the log2 fold changes and keep the gene symbol as a column.
    # NOTE(review): the column is called `decile` but ntile() is asked for 20
    # bins here, whereas bulk_to_seurat() uses 10 bins for the same column
    # name -- confirm which one is intended.
    fc_hpap_ln$decile = ntile(fc_hpap_ln$avg_log2FC, 20)
    fc_hpap_ln$gene  <- rownames(fc_hpap_ln)

In [None]:
fc_hpap_spl  <- FoldChange(hpap_spl,  `ident.1` = "Dia", `ident.2` = "Ctrl",
                      slot = "data")
    fc_hpap_spl$Study_CellType = 'HPAP Spleen'

In [None]:
    fc_hpap_spl$decile = ntile(fc_hpap_spl$avg_log2FC, 20)
    fc_hpap_spl$gene  <- rownames(fc_hpap_spl)

In [None]:
colnames(hpap@meta.data)

hpap$Patient_ID  <- substr(hpap$source, 1,7)

patient_metadata  <- read_delim("../../../DATA_scRNAseq/Analysis_of_previously_published_data/069_HPANCDB_Immune_TCRseq/hpap_medatata.csv")

patient_metadata2  <- patient_metadata  %>% dplyr::select(donor_ID, gender, age_years, clinical_diagnosis,
                                                         disease_duration)  %>% 
mutate(Patient_ID = gsub(donor_ID, pattern = "-", replacement = "")) %>% 
mutate(Disease = ifelse(grepl(clinical_diagnosis, pattern = "control"),"Ctrl","Dia"))

md  <- hpap@meta.data  %>% left_join(patient_metadata2)

colnames(md)

md$Patient_ID  %>% table

In [None]:
md$source  %>% table

In [None]:
md  <- md %>% 
separate(source, into = c(NA, "Tissue", NA, NA, "Run", NA, NA), remove = F, sep = "_")

In [None]:
md$Study  <- "HPAP"

In [None]:
# Reduce the cell metadata to a fixed set of columns, renaming the
# HPAP-specific ones (Run, source, gender, age_years, disease_duration).
hpap@meta.data <- dplyr::select(
  md,
  orig.ident,
  nCount_RNA,
  nFeature_RNA,
  percent.mt,
  Batch = Run,
  Tissue,
  Sample_ID = source,
  Patient_ID,
  Sex = gender,
  Age = age_years,
  Disease,
  Study,
  Disease_Duration = disease_duration
)

# Use the cell names of the object as row names of the new metadata table.
rownames(hpap@meta.data) <- colnames(hpap)

In [None]:
hpap$Tissue  %>% table

In [None]:
hpap$Patient_ID_Tissue  <- paste(hpap$Patient_ID, hpap$Tissue)

In [None]:
avgexp = AggregateExpression(hpap, return.seurat = T, group.by = "Patient_ID_Tissue", 
                          assay = "RNA")

In [None]:
avgexp  <- NormalizeData(avgexp)
avgexp  <- ScaleData(avgexp)


In [None]:
mtx_seurat  <- avgexp@assays$RNA@layers$scale.data

In [None]:
rownames(mtx_seurat)  <- rownames(avgexp@assays$RNA)
colnames(mtx_seurat)  <- colnames(avgexp)

In [None]:
expression_matrix  <- t(mtx_seurat)  %>% 
as.data.frame()  %>% 
rownames_to_column("Patient_ID")  

In [None]:
# Drop every column whose name matches one of the unwanted gene-name patterns:
# "MT-", a literal dot, a leading "MIR", "-AS", "LINC", "ORF", a leading
# "TRA"/"TRB", or a leading "RPL"/"RPS".
expression_matrix2 <- expression_matrix[
  ,
  !grepl(
    "MT-|\\.|^MIR|-AS|LINC|ORF|^TR[AB]|^RP[LS]",
    colnames(expression_matrix)
  )
]

In [None]:
ncol(expression_matrix2)

In [None]:
expression_matrix2

In [None]:
expression_matrix3  <- expression_matrix2  %>% column_to_rownames("Patient_ID")  %>% 
t()  %>% as.matrix()

In [None]:
expression_matrix3

In [None]:
mtx_hpap  <- expression_matrix3

In [None]:
md_hpap  <- data.frame(Patient_ID_Tissue = colnames(expression_matrix3))

In [None]:
md_hpap  <- md_hpap  %>% separate(Patient_ID_Tissue, 
                                  into = c("Patient_ID", "Tissue"), 
                                 sep = " ", remove = F)

In [None]:
md_hpap

In [None]:
md  <- hpap@meta.data %>% 
dplyr::select(
Patient_ID,
Sex,
Age,
Disease,
Study,
Disease_Duration,
)  %>% unique


In [None]:
md

In [None]:
rownames(md)  <- NULL

In [None]:
rownames(md)

In [None]:
md_hpap  <- md_hpap  %>% left_join(md)

In [None]:
md_hpap$Patient_ID_Tissue == colnames(mtx_hpap)

In [None]:
md_hpap

# Lab48

Load datasets from the current study for comparison with other datasets. 

## CD4 T cells

In [None]:
cd4  <- readRDS("../data/processed/L1/cd4_l1_full_filt.rds")

In [None]:
avgexp = AggregateExpression(subset(cd4, Experiment_ID %in% c("Exp16","Exp18","Exp19","Exp20")), 
                             return.seurat = T, group.by = "Patient_ID", 
                          assay = "RNA")

In [None]:
avgexp  <- NormalizeData(avgexp)
avgexp  <- ScaleData(avgexp)


In [None]:
expression_matrix  <- t(avgexp@assays$RNA@scale.data)  %>% 
as.data.frame()  %>% 
rownames_to_column("Patient_ID")  

In [None]:
expression_matrix

In [None]:
expression_matrix3  <- expression_matrix  %>% column_to_rownames("Patient_ID")  %>% 
t()  %>% as.matrix()

In [None]:
expression_matrix3

In [None]:
mtx_lab48_cd4  <- expression_matrix3

In [None]:
md_lab48_cd4  <- data.frame(Patient_ID = colnames(expression_matrix3))

In [None]:
md  <- cd4@meta.data %>% 
dplyr::select(
Patient_ID,
Sex,
Age,
Disease
)  %>% unique


In [None]:
md$Study  <- "Lab48"
md$cell_type  <- "CD4"

In [None]:
md_lab48_cd4  <- left_join(md_lab48_cd4, md)

In [None]:
colnames(mtx_lab48_cd4)  <- paste("CD4", colnames(mtx_lab48_cd4))

In [None]:
md_lab48_cd4$Sample_ID  <- paste("CD4", md_lab48_cd4$Patient_ID)

## CD8 T cells

In [None]:
cd8  <- readRDS("../data/processed/L2/cd8_l2_subcluster.rds")

In [None]:
avgexp = AggregateExpression(subset(cd8, Experiment_ID %in% c("Exp16","Exp18","Exp19","Exp20")), 
                             return.seurat = T, group.by = "Patient_ID", 
                          assay = "RNA")

In [None]:
avgexp  <- NormalizeData(avgexp)
avgexp  <- ScaleData(avgexp)


In [None]:
expression_matrix  <- t(avgexp@assays$RNA@scale.data)  %>% 
as.data.frame()  %>% 
rownames_to_column("Patient_ID")  

In [None]:
expression_matrix3  <- expression_matrix  %>% column_to_rownames("Patient_ID")  %>% 
t()  %>% as.matrix()

In [None]:
mtx_lab48_cd8  <- expression_matrix3

In [None]:
md_lab48_cd8  <- data.frame(Patient_ID = colnames(expression_matrix3))

In [None]:
md  <- cd8@meta.data %>% 
dplyr::select(
Patient_ID,
Sex,
Age,
Disease
)  %>% unique


In [None]:
md$Study  <- "Lab48"
md$cell_type  <- "CD8"

In [None]:
md_lab48_cd8  <- left_join(md_lab48_cd8, md)

# Honardoost

In [None]:
honar  <- readRDS("../../240617_VN_Diabetes_V06/data/published_data/Honardoost_2024/honar2_stacas_filt.rds")

In [None]:
DimPlot(honar, group.by = "Patient_ID")

In [None]:
DimPlot(honar, group.by = "Disease")

In [None]:
Idents(honar)  <- honar$Disease

In [None]:
# Log fold change of T1D ("T1D") versus healthy ("H") cells on the "data" slot.
fc_honar <- FoldChange(
  honar,
  ident.1 = "T1D",
  ident.2 = "H",
  slot = "data"
)
fc_honar$Study_CellType <- "Honardoost"
# ntile(..., 20) assigns each gene to one of 20 rank bins of avg_log2FC.
fc_honar$decile <- ntile(fc_honar$avg_log2FC, 20)
fc_honar$gene <- rownames(fc_honar)

In [None]:
honar@meta.data  %>% 
group_by(Patient_ID, Disease)  %>% 
tally  %>% 
dplyr::select(-n)  %>% 
group_by(Disease)  %>% 
tally

In [None]:
DefaultAssay(honar)  <- "RNA"

In [None]:
avgexp = AverageExpression(honar, return.seurat = T, group.by = "Sample_ID", 
                          assay = "RNA")


Idents(honar)  <- honar$Disease

In [None]:
avgexp

In [None]:
# Extract the per-sample matrix and start the sample metadata table from its
# column names.
# NOTE(review): `avgexp` is created above with `return.seurat = T`, i.e. it is
# a Seurat object, while `avgexp$RNA` is the access pattern used elsewhere in
# this notebook for the list returned with `return.seurat = F` -- confirm that
# `avgexp$RNA` yields the expression matrix expected here.
mtx_honar <- avgexp$RNA
md_honar  <- data.frame(Sample_ID = colnames(avgexp$RNA))

In [None]:
honar$Disease  <- ifelse(honar$Disease == "T1D", "Dia","Ctrl")

In [None]:
honar$Disease  %>% table

In [None]:
md  <- honar@meta.data %>% 
dplyr::select(
Sample_ID,
Sex = Gender,
Age = Age_at_profiling,
Disease
)  %>% unique

In [None]:
md$Study  <- "Honardoost"
md$cell_type  <- "T cells"

md_honar  <- left_join(md_honar, md)

In [None]:
md_honar  %>% arrange(Sample_ID)

In [None]:
mtx_honar  %>% nrow

In [None]:
write.csv(mtx_honar, "../data/published_data/Honardoost_2024/mtx_honar.csv")

In [None]:
write.csv(md_honar, "../data/published_data/Honardoost_2024/md_honar.csv")

# Comparison of all datasets

In [None]:
library(data.table)

In [None]:
dt_mtx_kallionpaa  <- as.data.table(mtx_kallionpaa, keep.rownames = T)
dt_mtx_transimmunome  <- as.data.table(mtx_transimmunome, keep.rownames = T)
dt_mtx_hpap  <- as.data.table(mtx_hpap, keep.rownames = T)
dt_mtx_jailwala <- as.data.table(mtx_jailwala, keep.rownames = T)
dt_mtx_newman <- as.data.table(mtx_newman, keep.rownames = T)

In [None]:
# Prefix the sample columns with the cell type so that CD4 and CD8 samples of
# the same patient get distinct names in the merged matrix.
# NOTE(review): the column names of `mtx_lab48_cd4` were already prefixed with
# "CD4" in the "CD4 T cells" section above, so running both cells produces
# "CD4 CD4 <id>" for CD4 while CD8 gets a single prefix -- verify against the
# Sample_ID values built later.
colnames(mtx_lab48_cd4) = paste("CD4", colnames(mtx_lab48_cd4))
colnames(mtx_lab48_cd8) = paste("CD8", colnames(mtx_lab48_cd8))


In [None]:
dt_mtx_48_cd4  <- as.data.table(mtx_lab48_cd4, keep.rownames = T)
dt_mtx_48_cd8  <- as.data.table(mtx_lab48_cd8, keep.rownames = T)

In [None]:
merged <- merge(dt_mtx_kallionpaa, dt_mtx_transimmunome, by = "rn", all = TRUE)

In [None]:
merged <- merge(merged, dt_mtx_hpap, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_newman, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_jailwala, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_48_cd4, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_48_cd8, by = "rn", all = TRUE)

## Correct and unify metadata

In [None]:
md_lab48_cd4$Patient_ID  <- paste("CD4", md_lab48_cd4$Patient_ID)
md_lab48_cd8$Patient_ID  <- paste("CD8", md_lab48_cd8$Patient_ID)

In [None]:
md_kallionpaa  <- md_kallionpaa  %>% transmute(
Sample_ID = sample, Patient_ID = Patient_ID, 
Age = TimePoint, CellType = CellType, Disease = Disease)

In [None]:
md_transimmunome  <- md_transimmunome  %>% 
transmute(
Sample_ID = geo_id, Patient_ID = Patient_ID, 
Disease = Disease)

In [None]:
md_jailwala2  <- data.frame(geo_id =gset$GSE10586_series_matrix.txt.gz$geo_accession ,
                           Age = gset$GSE10586_series_matrix.txt.gz$`characteristics_ch1.4`,
Sex = gset$GSE10586_series_matrix.txt.gz$`characteristics_ch1.5`)

In [None]:
md_jailwala2

In [None]:
# Attach the GEO age/sex annotation and reduce to the unified metadata columns.
md_jailwala3 <- md_jailwala %>%
  left_join(md_jailwala2) %>%
  transmute(
    Sample_ID = geo_id,
    # Remove " years" first, then "Age: ", leaving the rest of the string.
    Age = gsub("Age: ", "", gsub(" years", "", Age)),
    # Any value containing "F" is coded "F", everything else "M".
    Sex = ifelse(grepl("F", Sex), "F", "M"),
    Disease = Disease
  )

In [None]:
md_jailwala3$CellType  <- "CD4+ CD25+"

In [None]:
md_transimmunome$CellType  <- "Whole blood"

In [None]:
gset <- getGEO("GSE123658", GSEMatrix =TRUE, getGPL=TRUE, AnnotGPL=TRUE)

In [None]:
test  <-  gset$`GSE123658-GPL20301_series_matrix.txt.gz`

In [None]:
test2  <- gset$`GSE123658-GPL18573_series_matrix.txt.gz`

In [None]:
md1  <- data.frame(Sample_ID = test$geo_accession,
                   Sex = test$`characteristics_ch1.2`,
                  Age = substr(test$`characteristics_ch1.3`,6,7))  %>% 
mutate(Sex = ifelse(grepl(Sex,pattern = "F"),"F","M"))

In [None]:
md2  <- data.frame(Sample_ID = test2$geo_accession,
                   Sex = test2$`characteristics_ch1.2`,
                  Age = substr(test2$`characteristics_ch1.3`,6,7))  %>% 
mutate(Sex = ifelse(grepl(Sex,pattern = "F"),"F","M"))

In [None]:
md3  <- rbind(md1,md2)

In [None]:
md_transimmunome2  <- left_join(md_transimmunome, md3)

In [None]:
md_transimmunome2$Sample_ID = md_transimmunome2$Patient_ID

In [None]:
colnames(mtx_transimmunome) == md_transimmunome2$Patient_ID

In [None]:
test$geo_accession == md_transimmunome$Sample_ID

Newman

In [None]:
md_newman  <- metadata

In [None]:
md_newman3  <- md_newman  %>% transmute(
Sample_ID = sample_id, 
   CellType = cell_type,
Disease = ifelse(disease == "CTL", "Ctrl", "Dia"),
    Age = Age,
    Sex = Sex
)

In [None]:
md_newman3$Sample_ID == colnames(mtx_newman)

HPAP

In [None]:
md_hpap3  <- md_hpap  %>% transmute(
Sample_ID = Patient_ID_Tissue, 
    Patient_ID = Patient_ID,
   CellType = paste(Tissue, "T cells"),
Disease = Disease, Sex = ifelse(grepl(Sex, pattern = "F"), "F", "M"), 
    Age)

In [None]:
colnames(mtx_hpap) == md_hpap3$Sample_ID

In [None]:
colnames(mtx_kallionpaa) == md_kallionpaa$Sample_ID

In [None]:
colnames(mtx_jailwala) == md_jailwala3$Sample_ID

Current study. 

In [None]:
colnames(mtx_lab48_cd4) == md_lab48_cd4$Patient_ID

In [None]:
# Collapse to one row per patient, averaging Age over duplicated rows.
# NOTE(review): `CellType` is only assigned to `md_lab48_cd4` in a later cell
# (the earlier cells create `cell_type`), so this cell presumably relies on
# having been run after that one -- verify the execution order.
md_lab48_cd4  <- md_lab48_cd4  %>% group_by(Patient_ID, Sex, Disease, CellType, Study)  %>% 
summarise(Age = mean(as.numeric(Age)))

In [None]:
md_lab48_cd4$Sample_ID  <- paste("CD4", md_lab48_cd4$Patient_ID)

In [None]:
colnames(mtx_lab48_cd4)  == md_lab48_cd4$Sample_ID

In [None]:
# Collapse a doubled "CD8 CD8" prefix in the sample IDs to a single "CD8".
# NOTE(review): no cell above this one assigns `md_lab48_cd8$Sample_ID`, so
# this line presumably depends on a cell that was run out of order -- confirm.
md_lab48_cd8$Sample_ID  <- gsub(md_lab48_cd8$Sample_ID, pattern = "CD8 CD8", replacement = "CD8")

In [None]:
md_lab48_cd8  <- md_lab48_cd8  %>% group_by(Patient_ID, Sex, Disease, CellType, Study)  %>% 
summarise(Age = mean(as.numeric(Age)))

md_lab48_cd8$Sample_ID  <- paste("CD8", md_lab48_cd8$Patient_ID)

In [None]:
colnames(mtx_lab48_cd8)  == md_lab48_cd8$Sample_ID

Add Study and Cell type metadata. 

In [None]:
md_lab48_cd8$Study  <- "Lab48 CD8"
md_lab48_cd4$Study  <- "Lab48 CD4"

In [None]:
md_lab48_cd4$CellType  <- "CD4"
md_lab48_cd8$CellType  <- "CD8"
md_lab48_cd4$cell_type  <- NULL
md_lab48_cd8$cell_type  <- NULL
md_lab48_cd4$Study  <- NULL
md_lab48_cd8$Study  <- NULL


In [None]:
merged <- merge(dt_mtx_kallionpaa, dt_mtx_transimmunome, by = "rn", all = TRUE)

merged <- merge(merged, dt_mtx_hpap, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_newman, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_jailwala, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_48_cd4, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_48_cd8, by = "rn", all = TRUE)

In [None]:
md_kallionpaa$Study  <- "Kallionpaa"
md_transimmunome$Study  <- "Transimmunome"
md_hpap3$Study  <- "HPAP"
md_newman$Study  <- "Newman"
md_jailwala3$Study  <- "Jailwala"
md_lab48_cd8$Study  <- "Lab48 CD8"
md_lab48_cd4$Study  <- "Lab48 CD4"

In [None]:
md_merged  <- bind_rows(md_kallionpaa, md_transimmunome)

In [None]:
md_kallionpaa$Study  <- "Kallionpaa"
md_transimmunome$Study  <- "Transimmunome"
md_hpap3$Study  <- "HPAP"
md_newman$Study  <- "Newman"
md_jailwala3$Study  <- "Jailwala"
md_lab48_cd8$Study  <- "Lab48 CD8"
md_lab48_cd4$Study  <- "Lab48 CD4"

In [None]:
md_hpap3$Age  <- as.character(md_hpap3$Age)

In [None]:
md_merged  <- bind_rows(md_merged, md_hpap3)

In [None]:
md_newman$Age  <- NA_character_

In [None]:
md_jailwala3$Age  <- as.character(md_jailwala3$Age)
md_lab48_cd4$Age  <- as.character(md_lab48_cd4$Age)
md_lab48_cd8$Age  <- as.character(md_lab48_cd8$Age)

In [None]:
md_merged  <- bind_rows(md_merged, md_newman)
md_merged  <- bind_rows(md_merged, md_jailwala3)
md_merged  <- bind_rows(md_merged, md_lab48_cd4)
md_merged  <- bind_rows(md_merged, md_lab48_cd8)

In [None]:
# Element-wise check of the merged metadata sample IDs against column names.
# NOTE(review): this compares against `mtx`, not against the `merged` matrix
# that `md_merged` describes; the sample columns of `merged` are presumably
# what was meant -- confirm.
md_merged$Sample_ID == colnames(mtx)

In [None]:
table(colnames(merged) %in% md_merged$Sample_ID)

In [None]:
colnames(mtx_hpap) == md_hpap3$Sample_ID

In [None]:
colnames(mtx_newman) == md_newman3$Sample_ID

In [None]:
colnames(mtx_hpap) == md_hpap3$Sample_ID

In [None]:
colnames(mtx_jailwala) == md_jailwala3$Sample_ID

In [None]:
colnames(mtx_newman) == md_newman3$Sample_ID

In [None]:
colnames(mtx_kallionpaa) == md_kallionpaa$Sample_ID

In [None]:
colnames(mtx_transimmunome) == md_transimmunome2$Sample_ID

In [None]:
colnames(mtx_lab48_cd4) == md_lab48_cd4$Sample_ID

In [None]:
colnames(mtx_lab48_cd8) == md_lab48_cd8$Sample_ID

In [None]:
dt_mtx_kallionpaa  <- as.data.table(mtx_kallionpaa, keep.rownames = T)
dt_mtx_transimmunome  <- as.data.table(mtx_transimmunome, keep.rownames = T)
dt_mtx_hpap  <- as.data.table(mtx_hpap, keep.rownames = T)
dt_mtx_jailwala <- as.data.table(mtx_jailwala, keep.rownames = T)
dt_mtx_newman <- as.data.table(mtx_newman, keep.rownames = T)
dt_mtx_48_cd4  <- as.data.table(mtx_lab48_cd4, keep.rownames = T)
dt_mtx_48_cd8  <- as.data.table(mtx_lab48_cd8, keep.rownames = T)

In [None]:
merged <- merge(dt_mtx_kallionpaa, dt_mtx_transimmunome, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_hpap, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_newman, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_jailwala, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_48_cd4, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_48_cd8, by = "rn", all = TRUE)

In [None]:
merged

In [None]:
colnames(merged)

Add age metadata. 

In [None]:
# Convert the time-point labels to age in years; any label that is not listed
# becomes NA.
md_kallionpaa2 <- md_kallionpaa %>%
  mutate(
    Age = case_when(
      Age == "3months" ~ 0.25,
      Age == "6months" ~ 0.5,
      Age == "12months" ~ 1,
      Age == "18months" ~ 1.5,
      Age == "24months" ~ 2,
      Age == "36months" ~ 3
    )
  )

In [None]:
md_kallionpaa2$Age  <- as.numeric(md_kallionpaa2$Age)
md_transimmunome2$Age  <- as.numeric(md_transimmunome2$Age)
md_newman3$Age  <- as.numeric(md_newman3$Age)
md_jailwala3$Age  <- as.numeric(md_jailwala3$Age)

In [None]:
md_lab48_cd4$Age  <- as.numeric(md_lab48_cd4$Age)
md_lab48_cd8$Age  <- as.numeric(md_lab48_cd8$Age)

In [None]:
md_hpap3$Age  <- as.numeric(md_hpap3$Age)

In [None]:
md_merged  <- bind_rows(md_kallionpaa2, md_transimmunome2)
md_merged  <- bind_rows(md_merged, md_hpap3)
md_merged  <- bind_rows(md_merged, md_newman3)
md_merged  <- bind_rows(md_merged, md_jailwala3)
md_merged  <- bind_rows(md_merged, md_lab48_cd4)
md_merged  <- bind_rows(md_merged, md_lab48_cd8)

In [None]:
merged <- merge(dt_mtx_kallionpaa, dt_mtx_transimmunome, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_hpap, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_newman, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_jailwala, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_48_cd4, by = "rn", all = TRUE)
merged <- merge(merged, dt_mtx_48_cd8, by = "rn", all = TRUE)

In [None]:
colnames(merged)[2:ncol(merged)] == md_merged$Sample_ID

Save the final matrix. 

In [None]:
write.csv(merged, "../data/matrix_allstudies.csv")

In [None]:
write.csv(md_merged, "../data/md_allstudies.csv")

In [None]:
merged  <- read_csv("../data/matrix_allstudies.csv")
md_merged  <- read_csv("../data/md_allstudies.csv")

In [None]:
merged$`...1`  <- NULL
md_merged$`...1`  <- NULL


## Add Honardoost

As the study by Honardoost et al. was added later, we add it to the merged data separately.

In [None]:
library(data.table)

In [None]:
dt_mtx_honar  <- as.data.table(mtx_honar, keep.rownames = T)

In [None]:
merged  <- left_join(merged, dt_mtx_honar)

In [None]:
md_honar$Patient_ID  <- md_honar$Sample_ID

In [None]:
md_honar$CellType  <- "T cells"
md_honar$cell_type  <- NULL

In [None]:
md_honar$Study_CellType  <- "Honardoost T cells"

In [None]:
md_honar$Sex  <- ifelse(md_honar$Sex == "female","F","M")

In [None]:
md_merged2  <- bind_rows(md_merged,md_honar)

In [None]:
write.csv(merged, "../data/matrix_allstudies_with_Honardoost.csv")
write.csv(md_merged2, "../data/md_allstudies_with_Honardoost.csv")

## Gene directions in different datasets

In [None]:
# Build a combined "Study CellType" label; every label containing "HPAP LN"
# is collapsed to exactly "HPAP LN".
md_merged <- mutate(
  md_merged,
  Study_CellType = paste(Study, CellType),
  Study_CellType = ifelse(grepl("HPAP LN", Study_CellType), "HPAP LN", Study_CellType)
)

In [None]:
   # Prototype of the per-gene plot that plot_one_gene() wraps further down.
   # NOTE(review): the values are taken from `dt_mtx_transimmunome` only, yet
   # they are assigned to every row of `md_merged`, and this adds a `gene`
   # column to the global `md_merged` -- confirm this cell is still needed.
   df  <- dt_mtx_transimmunome  %>% dplyr::filter(rn == "TRGV9")  %>% t
    colnames(df)  <- "gene"
    md_merged$gene  <- df[2:nrow(df)]
    options(repr.plot.width = 12, repr.plot.height = 7)
    plot1  <- md_merged  %>% 
       ggplot(aes(x = Disease,
             y = as.numeric(gene))) +
   geom_dotplot(binaxis='y', stackdir='center', dotsize = 0) + 
   geom_boxplot(outlier.shape = NA) +
    geom_jitter(binaxis='y', position=position_jitter(width = 0.1, height = 0.01), 
                size = 2, stackdir='center', aes(color = Disease)) + 
    facet_wrap(~Study_CellType, scales = "free", ncol = 6) +
  theme_classic() + xlab("") +  
 xlab("") + ylab("Value") + ggtitle(gene) +
   scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
    scale_color_manual(values = c("dodgerblue", "indianred3")) +
  ggpubr::stat_compare_means(label.x = 1.2, label.y.npc = "top", size = 3.5, vjust = 0.3, label = "p.format") + 
      theme(plot.title = element_text(hjust = 0.5, size = 18),
          axis.line = element_line(colour = "black"), 
            axis.text.x = element_text(angle = 90),
        axis.ticks = element_line(colour = "black"))

## Function to plot one gene

In [None]:
#' Plot one gene across all studies, split by disease status.
#'
#' Looks up the row of the global `merged` matrix whose `rn` equals `gene`,
#' attaches its values positionally to the global `md_merged` metadata and
#' draws one box plot panel per `Study_CellType`.
#'
#' @param gene Gene symbol; must occur exactly once in `merged$rn`.
#' @return A ggplot object.
plot_one_gene <- function(gene) {
  gene_row <- merged %>% dplyr::filter(rn == gene)
  if (nrow(gene_row) != 1) {
    stop(
      "Expected exactly one row for gene '", gene, "' in `merged`, found ",
      nrow(gene_row), ".",
      call. = FALSE
    )
  }

  # Transposing gives a one-column character matrix; its first entry is skipped
  # (this assumes `rn` is the first column of `merged`).
  df <- t(gene_row)
  values <- df[2:nrow(df)]

  # The values are matched to the metadata by position, so the lengths must
  # agree; otherwise the assignment would fail or silently recycle.
  if (length(values) != nrow(md_merged)) {
    stop(
      "`merged` has ", length(values), " sample columns but `md_merged` has ",
      nrow(md_merged), " rows.",
      call. = FALSE
    )
  }
  # Only the function-local copy of `md_merged` is modified.
  md_merged$gene <- values

  # Kept from the original: sets the notebook plot size as a global option.
  options(repr.plot.width = 12, repr.plot.height = 7)

  plot1 <- md_merged %>%
    ggplot(aes(x = Disease, y = as.numeric(gene))) +
    geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0) +
    geom_boxplot(outlier.shape = NA) +
    # `binaxis`/`stackdir` are geom_dotplot() arguments and were dropped here.
    geom_jitter(
      position = position_jitter(width = 0.1, height = 0.01),
      size = 2,
      aes(color = Disease)
    ) +
    facet_wrap(~Study_CellType, scales = "free", ncol = 6) +
    theme_classic() +
    xlab("") +
    ylab("Value") +
    ggtitle(gene) +
    scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
    scale_color_manual(values = c("dodgerblue", "indianred3")) +
    ggpubr::stat_compare_means(
      label.x = 1.2, label.y.npc = "top", size = 3.5, vjust = 0.3,
      label = "p.format"
    ) +
    theme(
      plot.title = element_text(hjust = 0.5, size = 18),
      axis.line = element_line(colour = "black"),
      axis.text.x = element_text(angle = 90),
      axis.ticks = element_line(colour = "black")
    )
  plot1
}

In [None]:
plot_one_gene("GZMB")

In [None]:
genes_to_test = c("ISG15","MX1","IFI44L",
                   "IFNG","PRF1","CX3CR1","BTN3A2","HLA-DQB1",
                   "DUSP1","CD69","NFKBIA",
                   "GIMAP4","GIMAP5","GIMAP7",
                   "TRGV9","ZBTB16","IFNGR1")

In [None]:
# Select the studies to test by their position among the sorted
# Study_CellType levels.
# NOTE(review): the positions (1-7, 11, 12) change whenever a study is added
# or renamed; selecting by name would be safer -- confirm the intended studies.
studies_to_test = c(levels(factor(md_merged$Study_CellType))[c(1,2,3,4,5,6,7,11,12)])

In [None]:
studies_to_test

In [None]:
#' Check whether a Wilcoxon test of `gene` by `Disease` can be computed.
#'
#' @param df_filt Data frame with a `gene` column (coercible to numeric) and a
#'   `Disease` grouping column.
#' @return "y" if `wilcox.test()` runs without error, "n" otherwise.
go_wilcox <- function(df_filt) {
  outcome <- tryCatch(
    {
      # Only the success of the call matters; the result itself is discarded.
      wilcox.test(as.numeric(df_filt$gene) ~ df_filt$Disease, conf.int = TRUE)
      "y"
    },
    error = function(cond) "n"
  )
  outcome
}

In [None]:
# Wilcoxon rank-sum test of `gene ~ Disease` for every gene x study
# combination. Each result goes into a pre-allocated list that is bound once at
# the end, instead of growing `df_res_all` with rbind() on every iteration.
res_list <- vector("list", length(genes_to_test) * length(studies_to_test))
k <- 0
for (i in genes_to_test) {
  # The row lookup does not depend on the study, so do it once per gene.
  gene_row <- merged %>% dplyr::filter(rn == i)
  # Named character vector with one entry per column of `merged`; it is empty
  # when the gene is absent, which yields an all-NA row below instead of an
  # error.
  gene_values <- if (nrow(gene_row) > 0) t(gene_row)[, 1] else character(0)

  for (j in studies_to_test) {
    df_filt <- md_merged %>% dplyr::filter(Study_CellType == j)
    # Align the values to the metadata by sample name rather than by position.
    df_filt$gene <- unname(gene_values[match(df_filt$Sample_ID, names(gene_values))])

    # Run the test once; NULL marks combinations where it cannot be computed.
    test <- tryCatch(
      wilcox.test(as.numeric(df_filt$gene) ~ df_filt$Disease, conf.int = TRUE),
      error = function(cond) NULL
    )

    k <- k + 1
    res_list[[k]] <- if (is.null(test)) {
      data.frame(
        gene = i,
        study = j,
        pval = NA_real_,
        estimate = NA_real_,
        ci_upper = NA_real_,
        ci_lower = NA_real_
      )
    } else {
      data.frame(
        gene = i,
        study = j,
        pval = test$p.value,
        estimate = unname(test$estimate),
        ci_upper = test$conf.int[2],
        ci_lower = test$conf.int[1]
      )
    }
  }
}
df_res_all <- dplyr::bind_rows(res_list)


In [None]:
df_res_all  %>% arrange(pval)

In [None]:

# Read the CD4 and CD8 marker tables and keep the unique gene symbols found in
# either of them.
markers_cd4 <- read_csv("../tables/de_genes/240306_cd4_all_markers_without_sex.csv")
markers_cd8 <- read_csv("../tables/de_genes/240306_cd8_all_markers_without_sex.csv")

all_markers <- unique(rbind(markers_cd4, markers_cd8)$gene)


In [None]:
all_markers

In [None]:
genes_to_test = all_markers

In [None]:
studies_to_test = c(levels(factor(md_merged$Study_CellType))[c(1,2,3,4,5,6,7,11,12)])

In [None]:
 # Wilcoxon rank-sum test of `gene ~ Disease` plus group means for every
 # gene x study combination. Results are collected in a pre-allocated list and
 # bound once at the end. Previously `df_res_all` was only (re)initialised when
 # the very first gene/study pair was testable, so a missing first gene made
 # the loop append to whatever `df_res_all` already existed.
 res_list <- vector("list", length(genes_to_test) * length(studies_to_test))
 k <- 0
 for (i in genes_to_test) {
   # The row lookup does not depend on the study, so do it once per gene.
   gene_row <- merged %>% dplyr::filter(rn == i)
   gene_found <- nrow(gene_row) > 0
   # Named character vector with one entry per column of `merged`.
   gene_values <- if (gene_found) t(gene_row)[, 1] else character(0)

   for (j in studies_to_test) {
     print(i)
     print(j)

     # Genes absent from `merged` contribute no row, as before.
     if (!gene_found) {
       next
     }

     df_filt <- md_merged %>% dplyr::filter(Study_CellType == j)
     # Align the values to the metadata by sample name rather than by position.
     df_filt$gene <- unname(gene_values[match(df_filt$Sample_ID, names(gene_values))])
     gene_num <- as.numeric(df_filt$gene)

     # Run the test once; NULL marks combinations where it cannot be computed.
     test <- tryCatch(
       wilcox.test(gene_num ~ df_filt$Disease, conf.int = TRUE),
       error = function(cond) NULL
     )

     k <- k + 1
     res_list[[k]] <- if (is.null(test)) {
       data.frame(
         gene = i,
         study = j,
         pval = NA_real_,
         estimate = NA_real_,
         ci_upper = NA_real_,
         ci_lower = NA_real_,
         mean_dia = NA_real_,
         mean_ctrl = NA_real_,
         fc = NA_real_
       )
     } else {
       mean_dia <- mean(gene_num[which(df_filt$Disease == "Dia")], na.rm = TRUE)
       mean_ctrl <- mean(gene_num[which(df_filt$Disease == "Ctrl")], na.rm = TRUE)
       data.frame(
         gene = i,
         study = j,
         pval = test$p.value,
         estimate = unname(test$estimate),
         ci_upper = test$conf.int[2],
         ci_lower = test$conf.int[1],
         mean_dia = mean_dia,
         mean_ctrl = mean_ctrl,
         # NOTE(review): kept as Ctrl / Dia, which is the inverse of the
         # Dia-vs-Ctrl direction used by the FoldChange() calls -- confirm.
         fc = mean_ctrl / mean_dia
       )
     }
   }
 }
 # NULL slots (skipped genes) are ignored by bind_rows().
 df_res_all <- dplyr::bind_rows(res_list)


In [None]:
df_res_all

In [None]:
# Genes ranked by the number of studies in which they reach p < 0.1. The chain
# used to end in `%>% gene`, which is not a function; pull() extracts the
# column as a vector.
genes <- df_res_all %>%
  dplyr::filter(pval < 0.1) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n)) %>%
  pull(gene)

In [None]:
for(i in genes[21:50]){
    print(plot_one_gene(i))
}

In [None]:
# Gene x study matrix of the Wilcoxon location estimates for the genes listed
# in `genes3`.
# NOTE(review): `genes3` is only defined in a later cell of this notebook, so
# that cell has to be run first. This also overwrites the `mtx` object used
# earlier in the notebook.
mtx  <- df_res_all  %>% dplyr::filter(gene %in% genes3)  %>% 
dplyr::select(gene, study, estimate)  %>% 
pivot_wider(names_from = study, values_from = estimate)  %>% column_to_rownames("gene")  %>% as.matrix

In [None]:
plot_one_gene("ZBTB10")

In [None]:
pheatmap(mtx, main = "", 
         scale = "column", cluster_cols = T, cluster_rows = T,
        color=colorRampPalette(c("dodgerblue", "grey95", "indianred2"))(50), 
         border_color = "white",
                  fontsize = 9)

In [None]:
genes3  <- c('CCL5',
'HLA-DQB1',
'HLA-DRB1',
'GZMA',
'IER2',
'EPSTI1',
'GZMB',
'NFKBIA',
'CD69',
'ID3',
'IER5',
'TNF',
'AOAH',
'CX3CR1',
'GZMH',
'MTRNR2L8',
'NEU1')

In [None]:
# Overwrite a block of column names of `merged` with the matching metadata
# sample IDs (the column index is shifted by one relative to the metadata row
# index).
# NOTE(review): the ranges 233:275 / 232:274 are hard-coded and will be wrong
# as soon as the number or order of samples changes -- presumably these are the
# samples whose names did not match; verify before re-running.
colnames(merged)[233:275]  <- md_merged$Sample_ID[232:274]

In [None]:
md_merged$Sample_ID[232:274]

In [None]:
for(i in genes3){
    print(plot_one_gene(i))
}

In [None]:
genes4  <- c("ISG15","MX1","IFI44L","XAF1","EPSTI1",
                   "IFNG","BTN3A2","HLA-DQB1","HLA-DRB1",
                   "DUSP1","NFKBIA","RBM3","TNF",
                   "CCR7","SELL","LEF1","IL7R",
                   "TNFAIP3",  "TSC22D3","IER2","BTG1",
                   "FOXP1","PIK3IP1")

In [None]:
for(i in genes[21:50]){
    print(plot_one_gene(i))
}

# Processing count matrices with Seurat

We will process all bulk RNAseq matrices using Seurat. 

Fix metadata in Kallionpaa study. 

In [None]:
md_kallionpaa  <- md_merged  %>% dplyr::filter(Study == "Kallionpaa")

In [None]:
mtx_kallion  <- merged[,which(colnames(merged) %in% md_kallionpaa$Sample_ID)]

In [None]:
mtx_kallion[is.na(mtx_kallion)]  <- 0

In [None]:
rownames(mtx_kallion)  <- merged$rn

In [None]:
seu_mtx_kallionpaa  <- CreateSeuratObject(mtx_kallion, min.cells = 1, min.features = 2)

In [None]:
md_kallionpaa  <- md_merged  %>% dplyr::filter(Study == "Kallionpaa")
mtx_kallionpaa  <- merged[,which(colnames(merged) %in% md_kallionpaa$Sample_ID)]
mtx_kallionpaa[is.na(mtx_kallionpaa)]  <- 0
rownames(mtx_kallionpaa)  <- merged$rn
seu_mtx_kallionpaa  <- CreateSeuratObject(mtx_kallionpaa, min.cells = 1, min.features = 0)
seu_mtx_kallionpaa  <- NormalizeData(seu_mtx_kallionpaa)
seu_mtx_kallionpaa  <- ScaleData(seu_mtx_kallionpaa)
seu_mtx_kallionpaa$Sample_ID  <- colnames(seu_mtx_kallionpaa)
seu_mtx_kallionpaa@meta.data  <- left_join(seu_mtx_kallionpaa@meta.data, md_kallionpaa)

In [None]:
md_merged$Study_CellType  %>% table 

In [None]:
which(colnames(merged)[2:length(colnames(merged))] != md_merged$Sample_ID)

In [None]:
colnames(merged)[233]

In [None]:
colnames(merged)[2:length(colnames(merged))] == md_merged$Sample_ID


Next, we will create a function that helps us convert a bulk matrix to a Seurat object. 

In [None]:
#' Build a Seurat object with fold changes for one study / cell type.
#'
#' Uses the globals `md_merged` (sample metadata) and `merged` (gene x sample
#' table with gene names in `rn`). The samples of the i-th `Study_CellType`
#' level are extracted, missing values are set to 0, and the matrix is
#' normalised and scaled. The Dia-vs-Ctrl fold changes are stored in `@misc$fc`.
#'
#' @param i Position of the study among `levels(factor(md_merged$Study_CellType))`.
#' @return A Seurat object with the sample metadata attached and the fold
#'   change table in `@misc$fc`.
bulk_to_seurat <- function(i) {
  study_levels <- levels(factor(md_merged$Study_CellType))
  stopifnot(length(i) == 1, i >= 1, i <= length(study_levels))
  study <- study_levels[i]

  md_study <- md_merged %>% dplyr::filter(Study_CellType == study)

  # as.data.frame() makes the logical column selection behave the same whether
  # `merged` is a tibble or a data.table.
  keep <- colnames(merged) %in% md_study$Sample_ID
  mtx_study <- as.data.frame(merged)[, keep, drop = FALSE]
  mtx_study[is.na(mtx_study)] <- 0
  rownames(mtx_study) <- merged$rn

  seu <- CreateSeuratObject(mtx_study, min.cells = 1, min.features = 0)
  seu <- NormalizeData(seu)
  seu <- ScaleData(seu)

  # Order the metadata like the matrix columns before binding, so the two are
  # matched by sample name and not by row position. The metadata already
  # carries `Sample_ID`, so it is not added a second time.
  md_aligned <- as.data.frame(md_study)[
    match(colnames(mtx_study), md_study$Sample_ID), ,
    drop = FALSE
  ]
  seu@meta.data <- cbind(seu@meta.data, md_aligned)
  rownames(seu@meta.data) <- colnames(seu)

  Idents(seu) <- seu$Disease
  fc <- FoldChange(
    seu,
    ident.1 = "Dia",
    ident.2 = "Ctrl",
    slot = "data",
    pseudocount.use = 0.001
  )
  fc$Study_CellType <- study
  fc$decile <- ntile(fc$avg_log2FC, 10)
  fc$gene <- rownames(fc)
  seu@misc$fc <- fc
  seu
}

We will map the function to all datasets / studies that we have. 

In [None]:
# One Seurat object per Study_CellType level. The index range is derived from
# the metadata instead of being hard-coded to 12, so it stays in step with the
# loops below that iterate over the same levels.
seurats <- map(
  .x = seq_along(levels(factor(md_merged$Study_CellType))),
  .f = bulk_to_seurat
)

We will collect the log fold changes stored in each Seurat object into one table. 

In [None]:
# Stack the fold change tables of all Seurat objects. Iterating over `seurats`
# itself avoids the `2:length(...)` index, which counts backwards when there is
# only one element.
fc_merged <- Reduce(rbind, lapply(seurats, function(s) s@misc$fc))

In [None]:
for(i in 1:length(levels(factor(md_merged$Study_CellType)))){
 print(i)
    print(seurats[[i]]$Study_CellType[1])
}

In [None]:
list.files("../tables/fold_change", full.names = T)

For some studies, we will instead use the log fold changes from the original Seurat calculation. 

In [None]:
# Fold-change table for CD4 cells read from disk. Only rows of the "Dia"
# cluster are kept, then avg_log2FC is binned into 10 groups.
cd4_fc <- read_csv("../tables/fold_change/cd4_l1_full_filt.csv") %>%
  dplyr::filter(cluster == "Dia") %>%
  mutate(
    decile = ntile(avg_log2FC, 10),
    Study_CellType = "Lab48 CD4"
  )

In [None]:
# NOTE(review): position 6 is hard-coded; presumably it is the "Lab48 CD4"
# entry printed by the index/label loop — confirm before re-running.
seurats[[6]]@misc$fc <- cd4_fc

In [None]:
seurats[[6]]@misc$fc

In [None]:
# Same treatment for the CD8 fold-change table: keep the "Dia" cluster rows,
# bin avg_log2FC into 10 groups and label the study.
cd8_fc <- read_csv("../tables/fold_change/cd8_l1_full_filt.csv") %>%
  dplyr::filter(cluster == "Dia") %>%
  mutate(
    decile = ntile(avg_log2FC, 10),
    Study_CellType = "Lab48 CD8"
  )

In [None]:
# NOTE(review): position 7 is hard-coded; presumably the "Lab48 CD8" entry —
# confirm against the index/label printout.
seurats[[7]]@misc$fc <- cd8_fc

In [None]:
fc_hpap_ln

In [None]:
# Replace the fold-change tables at positions 1 and 2 with the HPAP tables
# (`fc_hpap_ln`, `fc_hpap_spl` are created elsewhere in the notebook).
seurats[[1]]@misc$fc <- fc_hpap_ln

In [None]:
seurats[[2]]@misc$fc <- fc_hpap_spl

In [None]:
# Re-build the combined table in a single bind_rows() call; this avoids
# growing the data frame in a loop and the 2:length() pitfall for one study.
fc_merged <- bind_rows(
  lapply(
    seq_along(levels(factor(md_merged$Study_CellType))),
    function(i) seurats[[i]]@misc$fc
  )
)

# ParseBio

We will add data from ParseBio. 

In [None]:
# Load the two ParseBio objects saved as RDS (CD4 subset and CD8/NK subset).
# They are passed to AggregateExpression() in the sections below.
pb_cd4  <- readRDS("../../../DATA_scRNAseq//Analysis of previously published data/081_ParseBio_1M_Diabetes/241020_ParseBio_200k_CD4.rds")
pb_cd8_nk  <- readRDS("../../../DATA_scRNAseq//Analysis of previously published data/081_ParseBio_1M_Diabetes/241020_ParseBio_200k_CD8_and_NK.rds")


## ParseBio CD4

In [None]:
# Aggregate the CD4 object into one profile per sample/Disease combination.
avgexp <- AggregateExpression(
  pb_cd4,
  assay = "RNA",
  group.by = c("sample", "Disease"),
  return.seurat = TRUE
)

In [None]:
avgexp <- ScaleData(NormalizeData(avgexp))

In [None]:
# Scaled values with samples in rows and the sample name in `Patient_ID`.
expression_matrix <- avgexp@assays$RNA$scale.data %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("Patient_ID")

In [None]:
# Back to a genes x samples matrix.
expression_matrix3 <- as.matrix(
  t(column_to_rownames(expression_matrix, "Patient_ID"))
)
mtx_pb_cd4 <- expression_matrix3

In [None]:
# Sample metadata: names starting with "H" are labelled "Ctrl", all others
# "Dia".
md_pb_cd4 <- data.frame(
  Patient_ID = colnames(expression_matrix3),
  Disease = ifelse(
    substr(colnames(expression_matrix3), 1, 1) == "H", "Ctrl", "Dia"
  )
)
md_pb_cd4$Study <- "ParseBio"
md_pb_cd4$cell_type <- "CD4"

In [None]:
# Prefix matrix columns and metadata IDs with the cell type so they match.
colnames(mtx_pb_cd4) <- paste("CD4", colnames(mtx_pb_cd4))
md_pb_cd4$Sample_ID <- paste("CD4", md_pb_cd4$Patient_ID)

## ParseBio CD8

In [None]:
# Aggregate the CD8/NK object into one profile per sample/Disease combination.
avgexp <- AggregateExpression(
  pb_cd8_nk,
  assay = "RNA",
  group.by = c("sample", "Disease"),
  return.seurat = TRUE
)

In [None]:
avgexp <- ScaleData(NormalizeData(avgexp))

In [None]:
# Scaled values with samples in rows and the sample name in `Patient_ID`.
expression_matrix <- avgexp@assays$RNA$scale.data %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("Patient_ID")

In [None]:
# Back to a genes x samples matrix.
expression_matrix3 <- as.matrix(
  t(column_to_rownames(expression_matrix, "Patient_ID"))
)
mtx_pb_cd8_nk <- expression_matrix3

In [None]:
# Sample metadata: names starting with "H" are labelled "Ctrl", all others
# "Dia".
md_pb_cd8_nk <- data.frame(
  Patient_ID = colnames(expression_matrix3),
  Disease = ifelse(
    substr(colnames(expression_matrix3), 1, 1) == "H", "Ctrl", "Dia"
  )
)
md_pb_cd8_nk$Study <- "ParseBio"
md_pb_cd8_nk$cell_type <- "CD8_NK"

In [None]:
# Prefix matrix columns and metadata IDs with the cell type so they match.
colnames(mtx_pb_cd8_nk) <- paste("CD8_NK", colnames(mtx_pb_cd8_nk))
md_pb_cd8_nk$Sample_ID <- paste("CD8_NK", md_pb_cd8_nk$Patient_ID)

# Add data from GSE221297

Add data from the following study: [Zhong et al., 2024](https://www.nature.com/articles/s41467-024-53264-8)

In [None]:
# Load the CD8/NK and CD4 objects for this dataset from RDS files.
nc_cd8_nk  <- readRDS("../../../DATA_scRNAseq//Analysis of previously published data/080_Zhong_NatCom_Diabetes/241019_NatComm_merged_filt_cd8_and_nk.rds")
nc_cd4  <- readRDS("../../../DATA_scRNAseq//Analysis of previously published data/080_Zhong_NatCom_Diabetes/241019_NatComm_merged_filt_cd4_noPR.rds")

In [None]:
# Cells per `source` before any sample is removed.
nc_cd8_nk$source  %>% table

In [None]:
# Remove source GSM6857965_P6_YSQ from both objects.
# NOTE(review): the reason for excluding this sample is not stated here —
# confirm and document it.
nc_cd4  <- subset(nc_cd4, source != "GSM6857965_P6_YSQ")
nc_cd8_nk  <- subset(nc_cd8_nk, source != "GSM6857965_P6_YSQ" )

In [None]:
# Cells per `source` after the exclusion.
nc_cd8_nk$source  %>% table

In [None]:
nc_cd4$source  %>% table

## GSE221297 CD4+ T cells

In [None]:
# Aggregate the CD4 object into one profile per source/Disease combination.
avgexp <- AggregateExpression(
  nc_cd4,
  assay = "RNA",
  group.by = c("source", "Disease"),
  return.seurat = TRUE
)

In [None]:
avgexp <- ScaleData(NormalizeData(avgexp))


In [None]:
# Scaled values with samples in rows and the sample name in `Patient_ID`.
expression_matrix <- avgexp@assays$RNA$scale.data %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("Patient_ID")

In [None]:
# Back to a genes x samples matrix.
expression_matrix3 <- as.matrix(
  t(column_to_rownames(expression_matrix, "Patient_ID"))
)
mtx_nc_cd4 <- expression_matrix3

In [None]:
# Metadata from the aggregated object: `source` becomes Patient_ID and
# Disease "N" is recoded to "Ctrl", anything else to "Dia".
md_nc_cd4 <- avgexp@meta.data %>%
  dplyr::select(-orig.ident, Patient_ID = source) %>%
  mutate(Disease = ifelse(Disease == "N", "Ctrl", "Dia"))
md_nc_cd4$Study <- "NatComm"
md_nc_cd4$cell_type <- "CD4"

In [None]:
# Prefix matrix columns and metadata IDs with the cell type.
colnames(mtx_nc_cd4) <- paste("CD4", colnames(mtx_nc_cd4))
md_nc_cd4$Sample_ID <- paste("CD4", md_nc_cd4$Patient_ID)

## GSE221297 CD8+ T and NK cells

In [None]:
# Aggregate the CD8/NK object into one profile per source/Disease combination.
avgexp <- AggregateExpression(
  nc_cd8_nk,
  assay = "RNA",
  group.by = c("source", "Disease"),
  return.seurat = TRUE
)

In [None]:
avgexp <- ScaleData(NormalizeData(avgexp))


In [None]:
# Scaled values with samples in rows and the sample name in `Patient_ID`.
expression_matrix <- avgexp@assays$RNA$scale.data %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("Patient_ID")

In [None]:
# Back to a genes x samples matrix.
expression_matrix3 <- as.matrix(
  t(column_to_rownames(expression_matrix, "Patient_ID"))
)
mtx_nc_cd8_nk <- expression_matrix3

In [None]:
# Metadata from the aggregated object: `source` becomes Patient_ID and
# Disease "N" is recoded to "Ctrl", anything else to "Dia".
md_nc_cd8_nk <- avgexp@meta.data %>%
  dplyr::select(-orig.ident, Patient_ID = source) %>%
  mutate(Disease = ifelse(Disease == "N", "Ctrl", "Dia"))
md_nc_cd8_nk$Study <- "NatComm"
md_nc_cd8_nk$cell_type <- "CD8_NK"

In [None]:
# Prefix matrix columns and metadata IDs with the cell type.
colnames(mtx_nc_cd8_nk) <- paste("CD8_NK", colnames(mtx_nc_cd8_nk))
md_nc_cd8_nk$Sample_ID <- paste("CD8_NK", md_nc_cd8_nk$Patient_ID)

# Heatmaps figure

We will load the prepared merged matrices and metadata for all studies. 

In [None]:
# Merged expression table and per-sample metadata for all studies.
merged <- read_csv("../data/published_studies/matrix_allstudies.csv")
md_merged <- read_csv("../data/published_studies/md_allstudies.csv")

In [None]:
# Remove the `...1` column from both tables.
merged[["...1"]] <- NULL
md_merged[["...1"]] <- NULL

In [None]:
# Label each sample by study and cell type; every label containing "HPAP LN"
# is collapsed to exactly "HPAP LN".
md_merged <- md_merged %>%
  mutate(
    Study_CellType = paste(Study, CellType),
    Study_CellType = ifelse(
      grepl("HPAP LN", Study_CellType), "HPAP LN", Study_CellType
    )
  )

Let's fix the metadata for Kallionpaa study. 

In [None]:
# Metadata rows of the Kallionpaa study.
md_kallionpaa <- dplyr::filter(md_merged, Study == "Kallionpaa")

In [None]:
# Expression columns of those samples as a plain data frame, with NA set to
# 0 and gene names (column `rn` of `merged`) as row names.
mtx_kallion <- as.data.frame(
  merged[, which(colnames(merged) %in% md_kallionpaa$Sample_ID)]
)
mtx_kallion[is.na(mtx_kallion)] <- 0
rownames(mtx_kallion) <- merged$rn

In [None]:
# NOTE(review): this object is rebuilt with min.features = 0 in the next
# cell, which overwrites it.
seu_mtx_kallionpaa <- CreateSeuratObject(
  counts = mtx_kallion,
  min.cells = 1,
  min.features = 2
)

In [None]:
# Build a normalised and scaled Seurat object for the Kallionpaa samples and
# attach the study metadata to it.
md_kallionpaa <- md_merged %>% dplyr::filter(Study == "Kallionpaa")
mtx_kallionpaa <- merged[, which(colnames(merged) %in% md_kallionpaa$Sample_ID)]
mtx_kallionpaa[is.na(mtx_kallionpaa)] <- 0
rownames(mtx_kallionpaa) <- merged$rn
seu_mtx_kallionpaa <- CreateSeuratObject(mtx_kallionpaa, min.cells = 1, min.features = 0)
seu_mtx_kallionpaa <- NormalizeData(seu_mtx_kallionpaa)
seu_mtx_kallionpaa <- ScaleData(seu_mtx_kallionpaa)
seu_mtx_kallionpaa$Sample_ID <- colnames(seu_mtx_kallionpaa)
seu_mtx_kallionpaa@meta.data <- left_join(seu_mtx_kallionpaa@meta.data, md_kallionpaa)
# left_join() returns the rows without the original row names, but the
# meta.data row names must be the cell names. Restoring them also fails
# loudly if the join changed the number of rows.
rownames(seu_mtx_kallionpaa@meta.data) <- colnames(seu_mtx_kallionpaa)

Let's check that the samples in the expression matrix are in the same order as in the metadata. 

In [None]:
# Number of samples per study / cell-type label.
md_merged$Study_CellType %>% table()

In [None]:
# Positions (counted from the second column of `merged`) whose column name
# differs from the metadata Sample_ID at the same position.
which(colnames(merged)[2:ncol(merged)] != md_merged$Sample_ID)

In [None]:
# Inspect one column name by position.
colnames(merged)[233]

In [None]:
# Element-wise comparison of matrix column names and metadata Sample_IDs.
colnames(merged)[2:ncol(merged)] == md_merged$Sample_ID


Convert bulk to seurat. 

In [None]:
#' Build a Seurat object and a Dia-vs-Ctrl fold-change table for one study.
#'
#' Reads the notebook globals `merged` (expression table with gene names in
#' column `rn` and one column per sample) and `md_merged` (one row per sample).
#'
#' @param i Index into `levels(factor(md_merged$Study_CellType))`.
#' @param n_quantiles Number of bins passed to `ntile()` when ranking
#'   `avg_log2FC`; defaults to 10.
#' @return A Seurat object. Its `@misc$fc` holds the `FoldChange()` result with
#'   the added columns `Study_CellType`, `decile` and `gene`.
bulk_to_seurat <- function(i, n_quantiles = 10) {
  study_level <- levels(factor(md_merged$Study_CellType))[i]
  # An out-of-range index would otherwise silently select zero samples.
  stopifnot(length(study_level) == 1, !is.na(study_level))

  md_study <- md_merged %>% dplyr::filter(Study_CellType == study_level)
  mtx_study <- merged[, colnames(merged) %in% md_study$Sample_ID]
  # NA entries (presumably genes not measured in this study) are set to 0.
  mtx_study[is.na(mtx_study)] <- 0
  rownames(mtx_study) <- merged$rn

  seu <- CreateSeuratObject(mtx_study, min.cells = 1, min.features = 0)
  seu <- NormalizeData(seu)
  seu <- ScaleData(seu)
  seu$Sample_ID <- colnames(seu)

  # cbind() pairs rows by position, so put the metadata rows in the same
  # order as the selected matrix columns before binding.
  md_study <- md_study[match(colnames(mtx_study), md_study$Sample_ID), ]
  seu@meta.data <- cbind(seu@meta.data, md_study)
  rownames(seu@meta.data) <- colnames(seu)

  Idents(seu) <- seu$Disease
  fc <- FoldChange(seu, ident.1 = "Dia", ident.2 = "Ctrl",
                   slot = "data", pseudocount.use = 0.001)
  fc$Study_CellType <- study_level
  fc$decile <- ntile(fc$avg_log2FC, n_quantiles)
  fc$gene <- rownames(fc)
  seu@misc$fc <- fc
  seu
}

In [None]:
# One Seurat object per Study_CellType level; the count is derived from the
# metadata instead of being hard-coded.
seurats <- map(
  .x = seq_along(levels(factor(md_merged$Study_CellType))),
  .f = bulk_to_seurat
)

In [None]:
# Stack the per-study fold-change tables. Reduce() keeps the pairwise rbind()
# of the original loop but also works when there is only one study.
fc_merged <- Reduce(rbind, lapply(seurats, function(s) s@misc$fc))

In [None]:
# Print the list position next to the study label it holds.
for (i in seq_along(levels(factor(md_merged$Study_CellType)))) {
  print(i)
  print(seurats[[i]]$Study_CellType[1])
}

In [None]:
list.files("../tables/fold_change", full.names = TRUE)

In [None]:
# Fold-change table for CD4 cells read from disk. Only rows of the "Dia"
# cluster are kept, then avg_log2FC is binned into 10 groups.
cd4_fc <- read_csv("../tables/fold_change/cd4_l1_full_filt.csv") %>%
  dplyr::filter(cluster == "Dia") %>%
  mutate(
    decile = ntile(avg_log2FC, 10),
    Study_CellType = "Lab48 CD4"
  )

In [None]:
# NOTE(review): position 6 is hard-coded; presumably it is the "Lab48 CD4"
# entry printed by the index/label loop — confirm before re-running.
seurats[[6]]@misc$fc <- cd4_fc

In [None]:
seurats[[6]]@misc$fc

In [None]:
# Same treatment for the CD8 fold-change table: keep the "Dia" cluster rows,
# bin avg_log2FC into 10 groups and label the study.
cd8_fc <- read_csv("../tables/fold_change/cd8_l1_full_filt.csv") %>%
  dplyr::filter(cluster == "Dia") %>%
  mutate(
    decile = ntile(avg_log2FC, 10),
    Study_CellType = "Lab48 CD8"
  )

In [None]:
# NOTE(review): position 7 is hard-coded; presumably the "Lab48 CD8" entry —
# confirm against the index/label printout.
seurats[[7]]@misc$fc <- cd8_fc

In [None]:
# Replace the fold-change tables at positions 1 and 2 with the HPAP tables
# (`fc_hpap_ln`, `fc_hpap_spl` are created elsewhere in the notebook).
seurats[[1]]@misc$fc <- fc_hpap_ln

In [None]:
seurats[[2]]@misc$fc <- fc_hpap_spl

In [None]:
# Re-build the combined table in a single bind_rows() call; this avoids
# growing the data frame in a loop and the 2:length() pitfall for one study.
fc_merged <- bind_rows(
  lapply(
    seq_along(levels(factor(md_merged$Study_CellType))),
    function(i) seurats[[i]]@misc$fc
  )
)

### Only CD4 and CD8 studies


We will exclude studies which don't contain CD4/CD8 T cells. 

In [None]:
# Per gene: number of retained studies in which it falls into decile 1.
fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    Study_CellType != "Newman Memory CD4+",
    decile %in% c(1)
  ) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n))

In [None]:
# Per gene: number of retained studies in which it falls into decile 10.
fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    Study_CellType != "Newman Memory CD4+",
    decile %in% c(10)
  ) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n))

In [None]:
# Genes found in decile 1 in more than 6 studies.
genes1 <- fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    Study_CellType != "Newman Memory CD4+",
    decile %in% c(1)
  ) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n)) %>%
  dplyr::filter(n > 6) %>%
  pull(gene)

In [None]:
# Genes found in decile 10 in more than 6 studies.
genes2 <- fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    Study_CellType != "Newman Memory CD4+",
    decile %in% c(10)
  ) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n)) %>%
  dplyr::filter(n > 6) %>%
  pull(gene)

In [None]:
# Gene x study matrix of deciles for the selected genes.
mtx_all_studies_deciles <- fc_merged %>%
  dplyr::filter(
    !is.na(Study_CellType),
    Study_CellType != "Newman CD19+/CD27- B cells",
    Study_CellType != "Newman Memory CD4+",
    gene %in% c(genes1, genes2)
  ) %>%
  dplyr::select(gene, Study_CellType, decile) %>%
  pivot_wider(names_from = Study_CellType, values_from = decile) %>%
  column_to_rownames("gene") %>%
  as.matrix()

In [None]:
options(repr.plot.width = 18, repr.plot.height = 6)
pheatmap(
  t(mtx_all_studies_deciles),
  scale = "row",
  color = colorRampPalette(c("dodgerblue", "grey95", "indianred2"))(50),
  border_color = "white",
  fontsize = 12
)

In [None]:
# Same selection with a looser threshold: more than 5 studies.
genes1 <- fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    Study_CellType != "Newman Memory CD4+",
    decile %in% c(1)
  ) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n)) %>%
  dplyr::filter(n > 5) %>%
  pull(gene)

genes2 <- fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    Study_CellType != "Newman Memory CD4+",
    decile %in% c(10)
  ) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n)) %>%
  dplyr::filter(n > 5) %>%
  pull(gene)

# Gene x study matrix of deciles for the selected genes.
mtx_all_studies_deciles <- fc_merged %>%
  dplyr::filter(
    !is.na(Study_CellType),
    Study_CellType != "Newman CD19+/CD27- B cells",
    Study_CellType != "Newman Memory CD4+",
    gene %in% c(genes1, genes2)
  ) %>%
  dplyr::select(gene, Study_CellType, decile) %>%
  pivot_wider(names_from = Study_CellType, values_from = decile) %>%
  column_to_rownames("gene") %>%
  as.matrix()

options(repr.plot.width = 18, repr.plot.height = 6)
pheatmap(
  t(mtx_all_studies_deciles),
  scale = "row",
  color = colorRampPalette(c("dodgerblue", "grey95", "indianred2"))(50),
  border_color = "white",
  fontsize = 12
)

In [None]:
# All studies, two outermost deciles on each side, more than 7 studies.
genes1 <- fc_merged %>%
  dplyr::filter(decile %in% c(1, 2)) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n)) %>%
  dplyr::filter(n > 7) %>%
  pull(gene)

genes2 <- fc_merged %>%
  dplyr::filter(decile %in% c(10, 9)) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n)) %>%
  dplyr::filter(n > 7) %>%
  pull(gene)

# Gene x study matrix of deciles for the selected genes.
mtx_all_studies_deciles <- fc_merged %>%
  dplyr::filter(
    !is.na(Study_CellType),
    gene %in% c(genes1, genes2)
  ) %>%
  dplyr::select(gene, Study_CellType, decile) %>%
  pivot_wider(names_from = Study_CellType, values_from = decile) %>%
  column_to_rownames("gene") %>%
  as.matrix()

options(repr.plot.width = 18, repr.plot.height = 6)
pheatmap(
  t(mtx_all_studies_deciles),
  scale = "row",
  color = colorRampPalette(c("dodgerblue", "grey95", "indianred2"))(50),
  border_color = "white",
  fontsize = 12
)

# Quantile normalisation with 20 bins

To fix different distributions of data in different datasets, we will encode the log fold changes in quantiles with n = 20. The highest quantile will represent genes which are the most upregulated in T1D and the lowest quantile will represent genes which are most upregulated in healthy. 

In [None]:
#' Build a Seurat object and a Dia-vs-Ctrl fold-change table for one study.
#'
#' Reads the notebook globals `merged` (expression table with gene names in
#' column `rn` and one column per sample) and `md_merged` (one row per sample).
#'
#' @param i Index into `levels(factor(md_merged$Study_CellType))`.
#' @param n_quantiles Number of bins passed to `ntile()` when ranking
#'   `avg_log2FC`; defaults to 20. The result is still stored in a column
#'   named `decile`.
#' @return A Seurat object. Its `@misc$fc` holds the `FoldChange()` result with
#'   the added columns `Study_CellType`, `decile` and `gene`.
bulk_to_seurat <- function(i, n_quantiles = 20) {
  study_level <- levels(factor(md_merged$Study_CellType))[i]
  # An out-of-range index would otherwise silently select zero samples.
  stopifnot(length(study_level) == 1, !is.na(study_level))

  md_study <- md_merged %>% dplyr::filter(Study_CellType == study_level)
  mtx_study <- merged[, colnames(merged) %in% md_study$Sample_ID]
  # NA entries (presumably genes not measured in this study) are set to 0.
  mtx_study[is.na(mtx_study)] <- 0
  rownames(mtx_study) <- merged$rn

  seu <- CreateSeuratObject(mtx_study, min.cells = 1, min.features = 0)
  seu <- NormalizeData(seu)
  seu <- ScaleData(seu)
  seu$Sample_ID <- colnames(seu)

  # cbind() pairs rows by position, so put the metadata rows in the same
  # order as the selected matrix columns before binding.
  md_study <- md_study[match(colnames(mtx_study), md_study$Sample_ID), ]
  seu@meta.data <- cbind(seu@meta.data, md_study)
  rownames(seu@meta.data) <- colnames(seu)

  Idents(seu) <- seu$Disease
  fc <- FoldChange(seu, ident.1 = "Dia", ident.2 = "Ctrl",
                   slot = "data", pseudocount.use = 0.001)
  fc$Study_CellType <- study_level
  fc$decile <- ntile(fc$avg_log2FC, n_quantiles)
  fc$gene <- rownames(fc)
  seu@misc$fc <- fc
  seu
}

In [None]:
# One Seurat object per Study_CellType level; the count is derived from the
# metadata instead of being hard-coded.
seurats <- map(
  .x = seq_along(levels(factor(md_merged$Study_CellType))),
  .f = bulk_to_seurat
)

In [None]:
# Stack the per-study fold-change tables. Reduce() keeps the pairwise rbind()
# of the original loop but also works when there is only one study.
fc_merged <- Reduce(rbind, lapply(seurats, function(s) s@misc$fc))

In [None]:
# Print the list position next to the study label it holds.
for (i in seq_along(levels(factor(md_merged$Study_CellType)))) {
  print(i)
  print(seurats[[i]]$Study_CellType[1])
}

Add fold changes from original calculation in case of our study. 

In [None]:
list.files("../tables/fold_change", full.names = TRUE)

In [None]:
# Fold-change table for CD4 cells read from disk. Only rows of the "Dia"
# cluster are kept, then avg_log2FC is binned into 20 groups.
cd4_fc <- read_csv("../tables/fold_change/cd4_l1_full_filt.csv") %>%
  dplyr::filter(cluster == "Dia") %>%
  mutate(
    decile = ntile(avg_log2FC, 20),
    Study_CellType = "Lab48 CD4"
  )

In [None]:
# NOTE(review): position 6 is hard-coded; presumably it is the "Lab48 CD4"
# entry printed by the index/label loop — confirm before re-running.
seurats[[6]]@misc$fc <- cd4_fc

In [None]:
seurats[[6]]@misc$fc

In [None]:
# Same treatment for the CD8 fold-change table: keep the "Dia" cluster rows,
# bin avg_log2FC into 20 groups and label the study.
cd8_fc <- read_csv("../tables/fold_change/cd8_l1_full_filt.csv") %>%
  dplyr::filter(cluster == "Dia") %>%
  mutate(
    decile = ntile(avg_log2FC, 20),
    Study_CellType = "Lab48 CD8"
  )

In [None]:
# NOTE(review): position 7 is hard-coded; presumably the "Lab48 CD8" entry —
# confirm against the index/label printout.
seurats[[7]]@misc$fc <- cd8_fc

For HPAP, use cell-level logFC as well. 

In [None]:
fc_hpap_ln

In [None]:
# Replace the fold-change tables at positions 1 and 2 with the HPAP tables
# (`fc_hpap_ln`, `fc_hpap_spl` are created elsewhere in the notebook).
# NOTE(review): their `decile` column was computed elsewhere — confirm it
# uses 20 bins like the other studies in this section.
seurats[[1]]@misc$fc <- fc_hpap_ln

In [None]:
seurats[[2]]@misc$fc <- fc_hpap_spl

In [None]:
# Re-build the combined table in a single bind_rows() call; this avoids
# growing the data frame in a loop and the 2:length() pitfall for one study.
fc_merged <- bind_rows(
  lapply(
    seq_along(levels(factor(md_merged$Study_CellType))),
    function(i) seurats[[i]]@misc$fc
  )
)

In [None]:
# Rows contributed by each study.
fc_merged$Study_CellType %>% table()

In [None]:
# Genes in bins 1-4 in more than 7 studies (B-cell study excluded).
genes1 <- fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    decile %in% 1:4
  ) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n)) %>%
  dplyr::filter(n > 7) %>%
  pull(gene)

In [None]:
# Full ranking behind genes1.
fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    decile %in% 1:4
  ) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n))

In [None]:
# Ranking for bins 17-20 (B-cell study excluded).
fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    decile %in% 17:20
  ) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n))

In [None]:
# Genes in bins 17-20 in more than 6 studies.
# NOTE(review): unlike genes1, this selection keeps the B-cell study and
# uses n > 6 instead of n > 7 — confirm this asymmetry is intended.
genes2 <- fc_merged %>%
  dplyr::filter(decile %in% 17:20) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n)) %>%
  dplyr::filter(n > 6) %>%
  pull(gene)

In [None]:
# Gene x study matrix of bin numbers for the selected genes.
mtx_all_studies_deciles <- fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    !is.na(Study_CellType),
    gene %in% c(genes1, genes2)
  ) %>%
  dplyr::select(gene, Study_CellType, decile) %>%
  pivot_wider(names_from = Study_CellType, values_from = decile) %>%
  column_to_rownames("gene") %>%
  as.matrix()

In [None]:
# Write the heatmap (studies in rows, genes in columns) to a PDF.
options(repr.plot.width = 14, repr.plot.height = 4)
pheatmap(
  t(mtx_all_studies_deciles),
  scale = "row",
  color = colorRampPalette(c("dodgerblue", "grey95", "indianred2"))(50),
  border_color = "white",
  width = 14,
  height = 4,
  fontsize = 12,
  filename = "../figures/heatmaps/published_data.pdf"
)

In [None]:
# Same heatmap restricted to a fixed list of eight genes.
mtx_all_studies_deciles <- fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    !is.na(Study_CellType),
    gene %in% c("RASA2","IRS2","NELL2","BACH2","CCL5","NKG7","CST7","TBX21")
  ) %>%
  dplyr::select(gene, Study_CellType, decile) %>%
  pivot_wider(names_from = Study_CellType, values_from = decile) %>%
  column_to_rownames("gene") %>%
  as.matrix()
options(repr.plot.width = 14, repr.plot.height = 4)
pheatmap(
  t(mtx_all_studies_deciles),
  scale = "row",
  color = colorRampPalette(c("dodgerblue", "grey95", "indianred2"))(50),
  border_color = "white",
  width = 7,
  height = 6,
  fontsize = 12,
  filename = "../figures/heatmaps/published_data2.pdf"
)

## With Honardoost

We will now recapitulate the analysis with Honardoost et al. study included. 

In [None]:
# Merged expression table and metadata, version including Honardoost.
merged <- read_csv("../../240617_VN_Diabetes_V06/data/published_studies/matrix_allstudies_with_Honardoost.csv")
md_merged <- read_csv("../../240617_VN_Diabetes_V06/data/published_studies/md_allstudies_with_Honardoost.csv")

In [None]:
# Remove the `...1` column from both tables.
merged[["...1"]] <- NULL
md_merged[["...1"]] <- NULL

In [None]:
# Label each sample by study and cell type; every label containing "HPAP LN"
# is collapsed to exactly "HPAP LN".
md_merged <- md_merged %>%
  mutate(
    Study_CellType = paste(Study, CellType),
    Study_CellType = ifelse(
      grepl("HPAP LN", Study_CellType), "HPAP LN", Study_CellType
    )
  )

Fix metadata for Kallionpaa study. 

In [None]:
# Metadata rows of the Kallionpaa study.
md_kallionpaa <- dplyr::filter(md_merged, Study == "Kallionpaa")

In [None]:
# Expression columns of those samples as a plain data frame, with NA set to
# 0 and gene names (column `rn` of `merged`) as row names.
mtx_kallion <- as.data.frame(
  merged[, which(colnames(merged) %in% md_kallionpaa$Sample_ID)]
)
mtx_kallion[is.na(mtx_kallion)] <- 0
rownames(mtx_kallion) <- merged$rn

In [None]:
# NOTE(review): this object is rebuilt with min.features = 0 in the next
# cell, which overwrites it.
seu_mtx_kallionpaa <- CreateSeuratObject(
  counts = mtx_kallion,
  min.cells = 1,
  min.features = 2
)

In [None]:
# Build a normalised and scaled Seurat object for the Kallionpaa samples and
# attach the study metadata to it.
md_kallionpaa <- md_merged %>% dplyr::filter(Study == "Kallionpaa")
mtx_kallionpaa <- merged[, which(colnames(merged) %in% md_kallionpaa$Sample_ID)]
mtx_kallionpaa[is.na(mtx_kallionpaa)] <- 0
rownames(mtx_kallionpaa) <- merged$rn
seu_mtx_kallionpaa <- CreateSeuratObject(mtx_kallionpaa, min.cells = 1, min.features = 0)
seu_mtx_kallionpaa <- NormalizeData(seu_mtx_kallionpaa)
seu_mtx_kallionpaa <- ScaleData(seu_mtx_kallionpaa)
seu_mtx_kallionpaa$Sample_ID <- colnames(seu_mtx_kallionpaa)
seu_mtx_kallionpaa@meta.data <- left_join(seu_mtx_kallionpaa@meta.data, md_kallionpaa)
# left_join() returns the rows without the original row names, but the
# meta.data row names must be the cell names. Restoring them also fails
# loudly if the join changed the number of rows.
rownames(seu_mtx_kallionpaa@meta.data) <- colnames(seu_mtx_kallionpaa)

Check the order of Study Celltype. 

In [None]:
# Number of samples per study / cell-type label.
md_merged$Study_CellType %>% table()

In [None]:
# Positions (counted from the second column of `merged`) whose column name
# differs from the metadata Sample_ID at the same position.
which(colnames(merged)[2:ncol(merged)] != md_merged$Sample_ID)

In [None]:
# Inspect one column name by position.
colnames(merged)[233]

In [None]:
# Element-wise comparison of matrix column names and metadata Sample_IDs.
colnames(merged)[2:ncol(merged)] == md_merged$Sample_ID


Convert Bulk to Seurat. 

In [None]:
#' Build a Seurat object and a Dia-vs-Ctrl fold-change table for one study.
#'
#' Reads the notebook globals `merged` (expression table with gene names in
#' column `rn` and one column per sample) and `md_merged` (one row per sample).
#'
#' @param i Index into `levels(factor(md_merged$Study_CellType))`.
#' @param n_quantiles Number of bins passed to `ntile()` when ranking
#'   `avg_log2FC`; defaults to 20. The result is still stored in a column
#'   named `decile`.
#' @return A Seurat object. Its `@misc$fc` holds the `FoldChange()` result with
#'   the added columns `Study_CellType`, `decile` and `gene`.
bulk_to_seurat <- function(i, n_quantiles = 20) {
  study_level <- levels(factor(md_merged$Study_CellType))[i]
  # An out-of-range index would otherwise silently select zero samples.
  stopifnot(length(study_level) == 1, !is.na(study_level))

  md_study <- md_merged %>% dplyr::filter(Study_CellType == study_level)
  mtx_study <- merged[, colnames(merged) %in% md_study$Sample_ID]
  # NA entries (presumably genes not measured in this study) are set to 0.
  mtx_study[is.na(mtx_study)] <- 0
  rownames(mtx_study) <- merged$rn

  seu <- CreateSeuratObject(mtx_study, min.cells = 1, min.features = 0)
  seu <- NormalizeData(seu)
  seu <- ScaleData(seu)
  seu$Sample_ID <- colnames(seu)

  # cbind() pairs rows by position, so put the metadata rows in the same
  # order as the selected matrix columns before binding.
  md_study <- md_study[match(colnames(mtx_study), md_study$Sample_ID), ]
  seu@meta.data <- cbind(seu@meta.data, md_study)
  rownames(seu@meta.data) <- colnames(seu)

  Idents(seu) <- seu$Disease
  fc <- FoldChange(seu, ident.1 = "Dia", ident.2 = "Ctrl",
                   slot = "data", pseudocount.use = 0.001)
  fc$Study_CellType <- study_level
  fc$decile <- ntile(fc$avg_log2FC, n_quantiles)
  fc$gene <- rownames(fc)
  seu@misc$fc <- fc
  seu
}

In [None]:
# One Seurat object per Study_CellType level; the count is derived from the
# metadata instead of being hard-coded.
seurats <- map(
  .x = seq_along(levels(factor(md_merged$Study_CellType))),
  .f = bulk_to_seurat
)

In [None]:
# Stack the per-study fold-change tables. Reduce() keeps the pairwise rbind()
# of the original loop but also works when there is only one study.
fc_merged <- Reduce(rbind, lapply(seurats, function(s) s@misc$fc))

In [None]:
# Print the list position next to the study label it holds.
for (i in seq_along(levels(factor(md_merged$Study_CellType)))) {
  print(i)
  print(seurats[[i]]$Study_CellType[1])
}

In [None]:
list.files("../tables/fold_change", full.names = TRUE)

Add fold changes from the current study - CD4 T cells. 

In [None]:
# CD4 fold-change table of the current study.
cd4_fc <- read_csv("../../240617_VN_Diabetes_V06/tables/fold_change/cd4_l1_full_filt.csv")

In [None]:
cd4_fc[["...1"]] <- NULL

In [None]:
# Keep the "Dia" cluster rows, bin avg_log2FC into 20 groups and label the
# study.
cd4_fc <- cd4_fc %>%
  dplyr::filter(cluster == "Dia") %>%
  mutate(
    decile = ntile(avg_log2FC, 20),
    Study_CellType = "Lab48 CD4"
  )

In [None]:
# NOTE(review): position 7 is hard-coded; presumably the "Lab48 CD4" entry —
# confirm against the index/label printout.
seurats[[7]]@misc$fc <- cd4_fc

In [None]:
seurats[[7]]@misc$fc

Add fold changes from the current study - CD8 T cells. 

In [None]:
# CD8 fold-change table of the current study.
cd8_fc <- read_csv("../../240617_VN_Diabetes_V06/tables/fold_change/cd8_l1_full_filt.csv")

In [None]:
cd8_fc[["...1"]] <- NULL

In [None]:
# Keep the "Dia" cluster rows, bin avg_log2FC into 20 groups and label the
# study.
cd8_fc <- cd8_fc %>%
  dplyr::filter(cluster == "Dia") %>%
  mutate(
    decile = ntile(avg_log2FC, 20),
    Study_CellType = "Lab48 CD8"
  )

In [None]:
# NOTE(review): position 8 is hard-coded; presumably the "Lab48 CD8" entry —
# confirm against the index/label printout.
seurats[[8]]@misc$fc <- cd8_fc

Add HPAP FC LN. 

In [None]:
# Overwrite the fold-change table at position 2 with `fc_hpap_ln`.
# NOTE(review): the position is hard-coded — confirm it is the HPAP LN entry.
seurats[[2]]@misc$fc  <- fc_hpap_ln

Add HPAP FC SPL. 

In [None]:
# Overwrite the fold-change table at position 3 with `fc_hpap_spl`.
# NOTE(review): the position is hard-coded — confirm it is the HPAP spleen
# entry.
seurats[[3]]@misc$fc  <- fc_hpap_spl

Add Honardoost FC. 

In [None]:
# Overwrite the fold-change table at position 1 with `fc_honar`.
# NOTE(review): the position is hard-coded — confirm it is the Honardoost
# entry.
seurats[[1]]@misc$fc  <- fc_honar

In [None]:
# Per study: list position, label and the largest bin number in its table.
for (i in seq_along(levels(factor(md_merged$Study_CellType)))) {
  print(i)
  print(seurats[[i]]$Study_CellType[1])
  print(max(seurats[[i]]@misc$fc$decile))
}

In [None]:
# Same check, additionally printing the column names of each table.
for (i in seq_along(levels(factor(md_merged$Study_CellType)))) {
  print(i)
  print(seurats[[i]]$Study_CellType[1])
  print(max(seurats[[i]]@misc$fc$decile))
  print(colnames(seurats[[i]]@misc$fc))
}

In [None]:
# Re-build the combined table in a single bind_rows() call; this avoids
# growing the data frame in a loop and the 2:length() pitfall for one study.
fc_merged <- bind_rows(
  lapply(
    seq_along(levels(factor(md_merged$Study_CellType))),
    function(i) seurats[[i]]@misc$fc
  )
)

Check which genes fall in the lowest quantiles (1–2), i.e. those most upregulated in healthy donors:

In [None]:
# Genes in bins 1-2 in more than 4 studies (B-cell study excluded).
genes1 <- fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    decile %in% c(1, 2)
  ) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n)) %>%
  dplyr::filter(n > 4) %>%
  pull(gene)

In [None]:
genes1

In [None]:
# Full ranking behind genes1.
fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    decile %in% c(1, 2)
  ) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n))

Check which genes fall in the highest quantiles (19–20), i.e. those most upregulated in T1D:

In [None]:
# Ranking for bins 19-20 (B-cell study excluded).
fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    decile %in% c(19, 20)
  ) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n))

In [None]:
# Genes in bins 19-20 in more than 4 studies.
# NOTE(review): unlike genes1, the B-cell study is not excluded here —
# confirm this is intended.
genes2 <- fc_merged %>%
  dplyr::filter(decile %in% c(19, 20)) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n)) %>%
  dplyr::filter(n > 4) %>%
  pull(gene)

In [None]:
genes2

Prepare heatmap with the genes from top and bottom decile. 

In [None]:
# Gene x study matrix of bin numbers for the selected genes.
mtx_all_studies_deciles <- fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    !is.na(Study_CellType),
    gene %in% c(genes1, genes2)
  ) %>%
  dplyr::select(gene, Study_CellType, decile) %>%
  pivot_wider(names_from = Study_CellType, values_from = decile) %>%
  column_to_rownames("gene") %>%
  as.matrix()

# Row-scaled heatmap, studies in rows.
options(repr.plot.width = 14, repr.plot.height = 4)
pheatmap(
  t(mtx_all_studies_deciles),
  scale = "row",
  color = colorRampPalette(c("dodgerblue", "grey95", "indianred2"))(50),
  border_color = "white",
  width = 14,
  height = 4,
  fontsize = 12
)

In [None]:
# NOTE(review): this cell repeats the previous one unchanged.
mtx_all_studies_deciles <- fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    !is.na(Study_CellType),
    gene %in% c(genes1, genes2)
  ) %>%
  dplyr::select(gene, Study_CellType, decile) %>%
  pivot_wider(names_from = Study_CellType, values_from = decile) %>%
  column_to_rownames("gene") %>%
  as.matrix()

options(repr.plot.width = 14, repr.plot.height = 4)
pheatmap(
  t(mtx_all_studies_deciles),
  scale = "row",
  color = colorRampPalette(c("dodgerblue", "grey95", "indianred2"))(50),
  border_color = "white",
  width = 14,
  height = 4,
  fontsize = 12
)

In [None]:
mtx_all_studies_deciles <- fc_merged %>%
  dplyr::filter(
    Study_CellType != "Newman CD19+/CD27- B cells",
    !is.na(Study_CellType),
    gene %in% c(genes1, genes2)
  ) %>%
  dplyr::select(gene, Study_CellType, decile) %>%
  pivot_wider(names_from = Study_CellType, values_from = decile) %>%
  column_to_rownames("gene") %>%
  as.matrix()

In [None]:
# Unscaled bin numbers; in this palette low bins map to indianred2 and high
# bins to steelblue2.
options(repr.plot.width = 14, repr.plot.height = 4)
pheatmap(
  t(mtx_all_studies_deciles),
  scale = "none",
  color = colorRampPalette(c("indianred2", "grey95", "steelblue2"))(50),
  border_color = "white",
  width = 14,
  height = 4,
  fontsize = 12
)

In [None]:
# Same plot written to a PDF.
options(repr.plot.width = 14, repr.plot.height = 4)
pheatmap(
  t(mtx_all_studies_deciles),
  scale = "none",
  color = colorRampPalette(c("indianred2", "grey95", "steelblue2"))(50),
  border_color = "white",
  width = 14,
  height = 4,
  fontsize = 12,
  filename = "../figures/heatmap_other_studies2.pdf"
)

# With NatComm and ParseBio

We will need to add the two single cell datasets to the heatmap. 

In [None]:
# Reload the merged table and metadata (version including Honardoost) and
# remove the `...1` column from both.
merged <- read_csv("../../240617_VN_Diabetes_V06/data/published_studies/matrix_allstudies_with_Honardoost.csv")
md_merged <- read_csv("../../240617_VN_Diabetes_V06/data/published_studies/md_allstudies_with_Honardoost.csv")

merged$`...1` <- NULL
md_merged$`...1` <- NULL


In [None]:
# Compare every column name after the first one with the metadata
# Sample_IDs, without hard-coding the number of columns.
table(colnames(merged)[-1] == md_merged$Sample_ID)

In [None]:
library(data.table)

In [None]:
# Drop rows whose gene name matches any exclusion pattern. The comma after
# the row index is essential: without it the logical vector indexes matrix
# elements and returns a bare vector. drop = FALSE keeps a matrix even when
# one row is left.
# NOTE(review): "^MT" also matches nuclear genes starting with MT — confirm
# whether only the "MT-" prefix was meant.
mtx_nc_cd4 <- mtx_nc_cd4[
  !grepl("^MT|\\.|LINC|^MIR|HNRNP|^RP[LS]|\\-|orf", rownames(mtx_nc_cd4)), ,
  drop = FALSE
]

In [None]:
# Drop rows whose gene name matches any exclusion pattern. The comma after
# the row index is essential: without it the logical vector indexes matrix
# elements and returns a bare vector. drop = FALSE keeps a matrix even when
# one row is left.
# NOTE(review): "^MT" also matches nuclear genes starting with MT — confirm
# whether only the "MT-" prefix was meant.
mtx_nc_cd8_nk <- mtx_nc_cd8_nk[
  !grepl("^MT|\\.|LINC|^MIR|HNRNP|^RP[LS]|\\-|orf", rownames(mtx_nc_cd8_nk)), ,
  drop = FALSE
]

In [None]:
# Drop rows whose gene name matches any exclusion pattern. The comma after
# the row index is essential: without it the logical vector indexes matrix
# elements and returns a bare vector. drop = FALSE keeps a matrix even when
# one row is left.
# NOTE(review): "^MT" also matches nuclear genes starting with MT — confirm
# whether only the "MT-" prefix was meant.
mtx_pb_cd4 <- mtx_pb_cd4[
  !grepl("^MT|\\.|LINC|^MIR|HNRNP|^RP[LS]|\\-|orf", rownames(mtx_pb_cd4)), ,
  drop = FALSE
]

In [None]:
# Drop rows whose gene name matches any exclusion pattern. The comma after
# the row index is essential: without it the logical vector indexes matrix
# elements and returns a bare vector. drop = FALSE keeps a matrix even when
# one row is left.
# NOTE(review): "^MT" also matches nuclear genes starting with MT — confirm
# whether only the "MT-" prefix was meant.
mtx_pb_cd8_nk <- mtx_pb_cd8_nk[
  !grepl("^MT|\\.|LINC|^MIR|HNRNP|^RP[LS]|\\-|orf", rownames(mtx_pb_cd8_nk)), ,
  drop = FALSE
]

In [None]:
# Convert each filtered matrix to a data.table. keep.rownames = TRUE stores the
# row names in a leading "rn" column, which is the merge key used below.
# (TRUE is spelled out: T is an ordinary variable that can be reassigned.)
dt_mtx_nc_cd4 <- as.data.table(mtx_nc_cd4, keep.rownames = TRUE)
dt_mtx_nc_cd8 <- as.data.table(mtx_nc_cd8_nk, keep.rownames = TRUE)
dt_mtx_pb_cd4 <- as.data.table(mtx_pb_cd4, keep.rownames = TRUE)
dt_mtx_pb_cd8 <- as.data.table(mtx_pb_cd8_nk, keep.rownames = TRUE)

# Left joins onto the existing matrix: all.x = TRUE keeps every row of `merged`
# and fills NA where the added table has no matching "rn".
merged <- merge(merged, dt_mtx_nc_cd4, by = "rn", all.x = TRUE)
merged <- merge(merged, dt_mtx_nc_cd8, by = "rn", all.x = TRUE)
merged <- merge(merged, dt_mtx_pb_cd4, by = "rn", all.x = TRUE)
merged <- merge(merged, dt_mtx_pb_cd8, by = "rn", all.x = TRUE)


In [None]:
md_merged_new  <- rbind(md_nc_cd4, md_nc_cd8_nk, md_pb_cd4, md_pb_cd8_nk)

In [None]:
# Rebuild the new-study metadata with the column layout used for the merged
# metadata table. Age, Sex and gene are set to NA; cell_type is renamed to
# CellType.
md_merged_new2 <- data.frame(
  Sample_ID = md_merged_new$Sample_ID,
  Patient_ID = md_merged_new$Patient_ID,
  Age = NA_character_,
  CellType = md_merged_new$cell_type,
  Disease = md_merged_new$Disease,
  Study = md_merged_new$Study,
  Sex = NA_character_,
  gene = NA_character_
)
# Combined "<Study> <CellType>" label, appended as the last column.
md_merged_new2$Study_CellType <- paste(
  md_merged_new2$Study,
  md_merged_new2$CellType
)

In [None]:
md_merged_old_and_new  <- rbind(md_merged, md_merged_new2)

In [None]:
# Remove every "_N" and then every "_P" substring from the names of columns
# 908-923 (positions are hard-coded).
# NOTE(review): gsub() removes the pattern wherever it occurs, not only at the
# end of a name, so a name containing "CD8_NK" is turned into "CD8K" here.
colnames(merged)[908:923]  <- gsub(colnames(merged)[908:923], pattern = "_N", replacement = "")
colnames(merged)[908:923]  <- gsub(colnames(merged)[908:923], pattern = "_P", replacement = "")

In [None]:
# Rewrite "CD8K" to "CD8_NK" in the names of columns 908-923.
# NOTE(review): presumably this restores names in which a preceding "_N"
# removal shortened "CD8_NK" to "CD8K" - confirm.
colnames(merged)[908:923]  <- gsub(colnames(merged)[908:923], pattern = "CD8K", replacement = "CD8_NK")

In [None]:
md_merged_new2$Sample_ID[1:16]

In [None]:
colnames(merged)[908:923]

In [None]:
md_merged_new2$Sample_ID[1:16] == colnames(merged)[908:923]

In [None]:
colnames(merged)[924:971] 

In [None]:
md_merged_new2$Sample_ID[17:64]

In [None]:
colnames(merged)[924:971] == md_merged_new2$Sample_ID[17:64]

In [None]:
# Tabulate, position by position, whether the sample columns of `merged`
# (every column except the first) equal the metadata Sample_ID. Dropping the
# first column with [-1] avoids hard-coding the current column count.
table(colnames(merged)[-1] == md_merged_old_and_new$Sample_ID)

In [None]:
md_merged  <- md_merged_old_and_new

In [None]:
# Save the extended count matrix and its metadata as CSV files.
# write.csv() is called with its default row.names setting, so each file starts
# with an unnamed row-name column.
write.csv(merged, "../../240617_VN_Diabetes_V06/data/published_studies/matrix_allstudies_with_Honardoost_NC_PB.csv")
write.csv(md_merged, "../../240617_VN_Diabetes_V06/data/published_studies/md_allstudies_with_Honardoost_NC_PB.csv")

In [None]:
merged  <- read_csv("../../240617_VN_Diabetes_V06/data/published_studies/matrix_allstudies_with_Honardoost_NC_PB.csv")
md_merged  <- read_csv("../../240617_VN_Diabetes_V06/data/published_studies/md_allstudies_with_Honardoost_NC_PB.csv")

In [None]:
merged$`...1`  <- NULL
md_merged$`...1`  <- NULL

In [None]:
newman_samples  <- md_merged  %>% dplyr::filter(grepl(Study_CellType, pattern = "Newman"))  %>% pull(Sample_ID)

In [None]:
which(!(colnames(merged) %in% newman_samples))

In [None]:
md_merged_without_newman <- md_merged  %>% dplyr::filter(!grepl(Study_CellType, pattern = "Newman"))

In [None]:
merged_without_newman  <- merged[,which(!(colnames(merged) %in% newman_samples))]

In [None]:
# Confirm the remaining metadata rows and the remaining sample columns (all
# columns except the first) are in the same order. [-1] replaces a hard-coded
# 2:529 range so the check does not depend on the current sample count.
all.equal(
  md_merged_without_newman$Sample_ID,
  colnames(merged_without_newman)[-1]
)

In [None]:
library(data.table)

In [None]:
dt_mtx_newman  <- as.data.table(mtx_newman)

In [None]:
dt_mtx_newman$rn  <- rownames(mtx_newman)

In [None]:
merged <- merge(merged_without_newman, dt_mtx_newman, by = "rn", all = TRUE)

In [None]:
# Bring the Newman metadata to the shared column layout: constant Patient_ID,
# Study and gene values, a "<Study> <CellType>" label, and a fixed column order.
md_newman4 <- md_newman3 %>%
  mutate(
    Patient_ID = "",
    Study = "Newman",
    gene = 0,
    Study_CellType = paste(Study, CellType)
  ) %>%
  dplyr::select(
    Sample_ID, Patient_ID, Age, CellType, Disease,
    Study, Sex, gene, Study_CellType
  )

In [None]:
md_merged4  <- rbind(md_merged_without_newman, md_newman4)

Check that all sample IDs are equal in the matrix and metadata. 

In [None]:
# Sample columns of `merged` (everything except the first column) must equal
# the metadata Sample_ID in the same order. [-1] is used instead of
# 2:length(...), which would count backwards (2, 1) on a one-column table.
all.equal(colnames(merged)[-1], md_merged4$Sample_ID)

# Heatmap with Honardoost, NatCom and ParseBio

In [None]:
md_merged  <- md_merged4

In [None]:
md_merged  %>% group_by(Study, CellType, Disease)  %>% tally  %>% mutate(n = ifelse(Disease == "Dia",n*-1,n))

In [None]:
options(repr.plot.width = 12, repr.plot.height = 6)

# Stacked bars of sample counts per study/cell-type label, split by Disease.
# The label is "<Study> <CellType>" with the substrings "-mes", "-sma", "-pct"
# and "-pch" removed, in that order.
md_merged %>%
  mutate(
    Study_CellType = paste(Study, CellType),
    Study_CellType = gsub("-mes", "", Study_CellType),
    Study_CellType = gsub("-sma", "", Study_CellType),
    Study_CellType = gsub("-pct", "", Study_CellType),
    Study_CellType = gsub("-pch", "", Study_CellType)
  ) %>%
  group_by(Study_CellType, Disease) %>%
  tally() %>%
  ggplot(aes(x = n, y = reorder(Study_CellType, n), fill = Disease)) +
  geom_col(position = "stack") +
  ggtheme() +
  labs(x = "", y = "")

In [None]:
# Mirrored bar chart of sample counts: "Dia" counts are negated so they extend
# to the left of zero, all other groups extend to the right.
md_merged %>%
  group_by(Study, CellType, Disease) %>%
  tally() %>%
  mutate(
    n = ifelse(Disease == "Dia", -n, n),
    Study_CellType = paste(Study, CellType)
  ) %>%
  ggplot(aes(x = n, y = Study_CellType, fill = Disease)) +
  geom_col(position = "identity")

### Without Y genes

In [None]:
library(biomaRt)
mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")

In [None]:
results <- getBM(attributes = c("chromosome_name", "hgnc_symbol"),
           filters = "chromosome_name", values = "Y", mart = mart)

In [None]:
results

In [None]:
merged

In [None]:
#' Build a Seurat object and Dia-vs-Ctrl fold changes for one study/cell type.
#'
#' Reads three objects from the calling environment: `md_merged` (sample
#' metadata), `merged` (gene-by-sample table with gene names in column `rn`)
#' and `results` (the biomaRt chromosome-Y query; its `hgnc_symbol` values are
#' excluded).
#'
#' @param i Position of the study/cell-type label within
#'   `levels(factor(md_merged$Study_CellType))`.
#' @return A Seurat object for the selected samples. The fold-change table is
#'   stored in `@misc$fc` with added columns `Study_CellType`, `decile` and
#'   `gene`. Despite its name, `decile` holds `ntile(avg_log2FC, 20)`, i.e. 20
#'   bins.
bulk_to_seurat <- function(i) {
  study_label <- levels(factor(md_merged$Study_CellType))[i]
  md_study <- md_merged %>% dplyr::filter(Study_CellType == study_label)

  # Sample columns of this study; missing values are replaced by zero.
  mtx_study <- merged[, colnames(merged) %in% md_study$Sample_ID]
  mtx_study[is.na(mtx_study)] <- 0
  rownames(mtx_study) <- merged$rn

  # Drop chromosome-Y symbols and names starting with "MT", "MIR", "RPL"/"RPS"
  # or containing ".", "-", "LINC", "HNRNP" or "orf".
  drop_gene <- rownames(mtx_study) %in% results$hgnc_symbol |
    grepl("^MT|\\.|LINC|^MIR|HNRNP|^RP[LS]|\\-|orf", rownames(mtx_study))
  mtx_study <- mtx_study[!drop_gene, ]

  # The metadata is attached column-wise below, so its rows are first put in
  # the same order as the matrix columns instead of relying on the two tables
  # already being sorted identically.
  md_study <- md_study[match(colnames(mtx_study), md_study$Sample_ID), ]

  seu <- CreateSeuratObject(mtx_study, min.cells = 0, min.features = 0)
  seu <- NormalizeData(seu)
  seu <- ScaleData(seu)
  seu$Sample_ID <- colnames(seu)
  seu@meta.data <- cbind(seu@meta.data, md_study)
  rownames(seu@meta.data) <- colnames(seu)
  Idents(seu) <- seu$Disease

  fc <- FoldChange(
    seu,
    ident.1 = "Dia", ident.2 = "Ctrl",
    slot = "data", pseudocount.use = 0.001
  )
  fc$Study_CellType <- study_label
  fc$decile <- ntile(fc$avg_log2FC, 20)
  fc$gene <- rownames(fc)
  seu@misc$fc <- fc
  seu
}

In [None]:
levels(factor(md_merged$Study_CellType))  

In [None]:
# One Seurat object per study/cell-type label. The index range is derived from
# the labels instead of a hard-coded 1:17, so it follows the metadata.
seurats <- map(
  .x = seq_along(levels(factor(md_merged$Study_CellType))),
  .f = bulk_to_seurat
)

In [None]:
seurats

### Add CD4 Lab48 FC

In [None]:
cd4_fc  <- read_csv("../../240617_VN_Diabetes_V06/tables/fold_change/cd4_l1_full_filt.csv")

In [None]:
cd4_fc$`...1`  <- NULL

In [None]:
cd4_fc  <- cd4_fc  %>% dplyr::filter(cluster == "Dia")  

In [None]:
cd4_fc$decile = ntile(cd4_fc$avg_log2FC, 20)

In [None]:
cd4_fc  <- cd4_fc  %>% dplyr::filter(cluster == "Dia")  

In [None]:
cd4_fc$Study_CellType  <- "Lab48 CD4"

In [None]:
# Keep only rows whose gene is not excluded: chromosome-Y symbols from
# `results`, names starting with "MT", "MIR" or "RPL"/"RPS", and names
# containing ".", "-", "LINC", "HNRNP" or "orf".
cd4_fc <- cd4_fc %>%
  dplyr::filter(
    !(gene %in% results$hgnc_symbol |
      grepl("^MT|\\.|LINC|^MIR|HNRNP|^RP[LS]|\\-|orf", gene))
  )

# The bins assigned earlier were computed on the unfiltered gene list. They are
# recomputed here so that, as for the other datasets (e.g. cd8_fc), the 20 bins
# refer to the genes that remain after filtering.
cd4_fc$decile <- ntile(cd4_fc$avg_log2FC, 20)

In [None]:
# Replace the fold-change table stored in the 7th Seurat object with cd4_fc.
# NOTE(review): the position 7 is hard-coded; presumably it is the "Lab48 CD4"
# entry of levels(factor(md_merged$Study_CellType)) - confirm.
seurats[[7]]@misc$fc  <- cd4_fc

In [None]:
seurats[[7]]@misc$fc

### Add CD8 Lab48 FC

In [None]:
cd8_fc  <- read_csv("../../240617_VN_Diabetes_V06/tables/fold_change/cd8_l1_full_filt.csv")

In [None]:
cd8_fc$`...1`  <- NULL

In [None]:
cd8_fc  <- cd8_fc  %>% dplyr::filter(cluster == "Dia")  

In [None]:
# Keep only rows whose gene is not excluded: chromosome-Y symbols from
# `results`, names starting with "MT", "MIR" or "RPL"/"RPS", and names
# containing ".", "-", "LINC", "HNRNP" or "orf".
cd8_fc <- dplyr::filter(
  cd8_fc,
  !(gene %in% results$hgnc_symbol |
    grepl("^MT|\\.|LINC|^MIR|HNRNP|^RP[LS]|\\-|orf", gene))
)

In [None]:
cd8_fc$decile = ntile(cd8_fc$avg_log2FC, 20)

In [None]:
cd8_fc$Study_CellType  <- "Lab48 CD8"

In [None]:
seurats[[8]]@misc$fc  <- cd8_fc

### Add HPAP FC LN

In [None]:
fc_hpap_ln

In [None]:
# Keep only rows whose gene is not excluded: chromosome-Y symbols from
# `results`, names starting with "MT", "MIR", "OR4F" or "RPL"/"RPS", and names
# containing ".", "-", "LINC", "HNRNP" or "orf".
fc_hpap_ln <- dplyr::filter(
  fc_hpap_ln,
  !(gene %in% results$hgnc_symbol |
    grepl("^MT|\\.|LINC|^MIR|^OR4F|HNRNP|^RP[LS]|\\-|orf", gene))
)

In [None]:
fc_hpap_ln$decile = ntile(fc_hpap_ln$avg_log2FC, 20)

In [None]:
seurats[[2]]@misc$fc  <- fc_hpap_ln

### Add HPAP FC SPL

In [None]:
# Keep only rows whose gene is not excluded: chromosome-Y symbols from
# `results`, names starting with "MT", "MIR", "OR4F" or "RPL"/"RPS", and names
# containing ".", "-", "LINC", "HNRNP" or "orf".
fc_hpap_spl <- dplyr::filter(
  fc_hpap_spl,
  !(gene %in% results$hgnc_symbol |
    grepl("^MT|\\.|LINC|^MIR|^OR4F|HNRNP|^RP[LS]|\\-|orf", gene))
)

In [None]:
fc_hpap_spl$decile = ntile(fc_hpap_spl$avg_log2FC, 20)

In [None]:
seurats[[3]]@misc$fc  <- fc_hpap_spl

### Add Honardoost FC

In [None]:
# Keep only rows whose gene is not excluded: chromosome-Y symbols from
# `results`, names starting with "MT", "MIR", "OR4F" or "RPL"/"RPS", and names
# containing ".", "-", "LINC", "HNRNP" or "orf".
fc_honar <- dplyr::filter(
  fc_honar,
  !(gene %in% results$hgnc_symbol |
    grepl("^MT|\\.|LINC|^MIR|^OR4F|HNRNP|^RP[LS]|\\-|orf", gene))
)

In [None]:
fc_honar$decile = ntile(fc_honar$avg_log2FC, 20)

In [None]:
seurats[[1]]@misc$fc  <- fc_honar

### Add ParseBio FC 

In [None]:
pb_cd4$Disease  <- ifelse(substr(pb_cd4$sample,1,1)=="H","Ctrl","Dia")
pb_cd8_nk$Disease  <- ifelse(substr(pb_cd8_nk$sample,1,1)=="H","Ctrl","Dia")

In [None]:
Idents(pb_cd4)  <- pb_cd4$Disease
Idents(pb_cd8_nk)  <- pb_cd8_nk$Disease

In [None]:
fc_pb_cd8_nk  <- FoldChange(pb_cd8_nk,  `ident.1` = "Dia", `ident.2` = "Ctrl",
                      slot = "data") 
    fc_pb_cd8_nk$gene  <- rownames(fc_pb_cd8_nk)

In [None]:
# Keep only rows whose gene is not excluded: chromosome-Y symbols from
# `results`, names starting with "MT", "MIR", "OR4F" or "RPL"/"RPS", and names
# containing ".", "-", "LINC", "HNRNP" or "orf".
fc_pb_cd8_nk <- dplyr::filter(
  fc_pb_cd8_nk,
  !(gene %in% results$hgnc_symbol |
    grepl("^MT|\\.|LINC|^MIR|^OR4F|HNRNP|^RP[LS]|\\-|orf", gene))
)

In [None]:
    fc_pb_cd8_nk$Study_CellType = 'ParseBio CD8_NK'
    fc_pb_cd8_nk$decile = ntile(fc_pb_cd8_nk$avg_log2FC, 20)

In [None]:
# Dia-vs-Ctrl fold changes for the ParseBio CD4 object, with the gene name
# copied from the row names into a regular column.
fc_pb_cd4 <- FoldChange(
  pb_cd4,
  ident.1 = "Dia", ident.2 = "Ctrl",
  slot = "data"
)
fc_pb_cd4$gene <- rownames(fc_pb_cd4)

# Keep only rows whose gene is not excluded: chromosome-Y symbols from
# `results`, names starting with "MT", "MIR", "OR4F" or "RPL"/"RPS", and names
# containing ".", "-", "LINC", "HNRNP" or "orf".
fc_pb_cd4 <- dplyr::filter(
  fc_pb_cd4,
  !(gene %in% results$hgnc_symbol |
    grepl("^MT|\\.|LINC|^MIR|^OR4F|HNRNP|^RP[LS]|\\-|orf", gene))
)

# Label the table and split the remaining genes into 20 bins by avg_log2FC.
fc_pb_cd4$Study_CellType <- "ParseBio CD4"
fc_pb_cd4$decile <- ntile(fc_pb_cd4$avg_log2FC, 20)

In [None]:
seurats[[15]]@misc$fc  <- fc_pb_cd4
seurats[[16]]@misc$fc  <- fc_pb_cd8_nk

### Add NatCom FC

In [None]:
nc_cd4$Disease  <- ifelse(nc_cd4$Disease=="N","Ctrl","Dia")
nc_cd8_nk$Disease  <- ifelse(nc_cd8_nk$Disease=="N","Ctrl","Dia")

In [None]:
nc_cd4$Disease   %>% table

In [None]:
Idents(nc_cd4)  <- nc_cd4$Disease
Idents(nc_cd8_nk)  <- nc_cd8_nk$Disease

In [None]:
fc_nc_cd4  <- FoldChange(nc_cd4,  `ident.1` = "Dia", `ident.2` = "Ctrl",
                      slot = "data")

In [None]:
grep(rownames(fc_nc_cd4), pattern = "CX3CR1", value = T)

In [None]:
    fc_nc_cd4$gene  <- rownames(fc_nc_cd4)

In [None]:
# Keep rows whose gene is not excluded (chromosome-Y symbols from `results`,
# names starting with "MT", "MIR", "OR4F" or "RPL"/"RPS", names containing ".",
# "-", "LINC", "HNRNP" or "orf") and that are detected in at least one group
# (pct.1 > 0 or pct.2 > 0).
fc_nc_cd4 <- fc_nc_cd4 %>%
  dplyr::filter(
    !(gene %in% results$hgnc_symbol |
      grepl("^MT|\\.|LINC|^MIR|^OR4F|HNRNP|^RP[LS]|\\-|orf", gene)) &
      (pct.1 > 0 | pct.2 > 0)
  )

# Label the table and split the remaining genes into 20 bins by avg_log2FC.
fc_nc_cd4$Study_CellType <- "NatComm CD4"
fc_nc_cd4$decile <- ntile(fc_nc_cd4$avg_log2FC, 20)
# `gene` is not re-derived from rownames() here: the column already exists (the
# filter above uses it), and overwriting it after filtering would replace the
# symbols with row numbers whenever the filter step does not carry row names
# through.

In [None]:
fc_nc_cd4  %>% dplyr::filter(gene == "CX3CR1")

In [None]:
# Dia-vs-Ctrl fold changes for the NatCom CD8/NK object, with the gene name
# copied from the row names into a regular column before any filtering.
fc_nc_cd8_nk <- FoldChange(
  nc_cd8_nk,
  ident.1 = "Dia", ident.2 = "Ctrl",
  slot = "data"
)
fc_nc_cd8_nk$gene <- rownames(fc_nc_cd8_nk)

# Keep rows whose gene is not excluded (chromosome-Y symbols from `results`,
# names starting with "MT", "MIR", "OR4F" or "RPL"/"RPS", names containing ".",
# "-", "LINC", "HNRNP" or "orf") and that are detected in at least one group
# (pct.1 > 0 or pct.2 > 0).
fc_nc_cd8_nk <- fc_nc_cd8_nk %>%
  dplyr::filter(
    !(gene %in% results$hgnc_symbol |
      grepl("^MT|\\.|LINC|^MIR|^OR4F|HNRNP|^RP[LS]|\\-|orf", gene)) &
      (pct.1 > 0 | pct.2 > 0)
  )

# Label the table and split the remaining genes into 20 bins by avg_log2FC.
fc_nc_cd8_nk$Study_CellType <- "NatComm CD8_NK"
fc_nc_cd8_nk$decile <- ntile(fc_nc_cd8_nk$avg_log2FC, 20)
# `gene` is not re-derived from rownames() after filtering: it was set above,
# and overwriting it here would replace the symbols with row numbers whenever
# the filter step does not carry row names through.

In [None]:
seurats[[9]]@misc$fc  <- fc_nc_cd4
seurats[[10]]@misc$fc  <- fc_nc_cd8_nk

In [None]:
# For every study/cell-type label print its index, the label stored in the
# corresponding Seurat object, and the highest bin number in its fold-change
# table. seq_along() is used so an empty label set yields no iterations.
for (i in seq_along(levels(factor(md_merged$Study_CellType)))) {
  print(i)
  print(seurats[[i]]$Study_CellType[1])
  print(max(seurats[[i]]@misc$fc$decile))
}

In [None]:
# Stack the fold-change tables of all study/cell-type labels into one table.
# The tables are collected first and bound once, instead of growing the result
# inside a loop; seq_along() also avoids the backwards 2:1 range that
# 2:length(...) produces when there is a single label.
fc_merged <- bind_rows(
  lapply(
    seq_along(levels(factor(md_merged$Study_CellType))),
    function(i) seurats[[i]]@misc$fc
  )
)

# Final version - all studies

In [None]:
md_merged  <- read_csv("../tables/other_data_matrix_metadata.csv")
merged  <- read_csv("../tables/other_data_matrix.csv")

merged$`...1`  <- NULL

md_merged$`...1`  <- NULL

### Honardoost 

In [None]:
honar  <- readRDS("../../240617_VN_Diabetes_V06/data/published_data/Honardoost_2024/honar2_stacas_filt.rds")

In [None]:
aggexp = AggregateExpression(honar, return.seurat = F, group.by = "Sample_ID", 
                          assay = "RNA")


In [None]:
colnames(aggexp$RNA) %in% md_merged$Sample_ID  %>% table

In [None]:
mtx_honar  <- aggexp$RNA

### Lab48

In [None]:
cd4_full_filt  <- readRDS("../../240617_VN_Diabetes_V06/data/processed/L1/cd4_l1_full_filt.rds")
cd8_full_filt  <- readRDS("../../240617_VN_Diabetes_V06/data/processed/L1/cd8_l1_full_filt.rds")

In [None]:
cd4_full_filt$Celltype_Patient  <- paste("CD4", cd4_full_filt$Patient_ID)

In [None]:
aggexp = AggregateExpression(cd4_full_filt, return.seurat = F, group.by = "Celltype_Patient", 
                          assay = "RNA")


In [None]:
aggexp

In [None]:
colnames(aggexp$RNA) %in% md_merged$Sample_ID  %>% table

In [None]:
mtx_cd4_lab48  <- aggexp$RNA

In [None]:
cd8_full_filt$Celltype_Patient  <- paste("CD8", cd8_full_filt$Patient_ID)

In [None]:
aggexp = AggregateExpression(cd8_full_filt, return.seurat = F, group.by = "Celltype_Patient", 
                          assay = "RNA")

In [None]:
colnames(aggexp$RNA) %in% md_merged$Sample_ID  %>% table

In [None]:
mtx_cd8_lab48  <- aggexp$RNA

### NatCom

In [None]:
nc_cd8_nk  <- readRDS("../../../DATA_scRNAseq/Analysis_of_previously_published_data/080_Zhong_NatCom_Diabetes/241019_NatComm_merged_filt_cd8_and_nk.rds")
nc_cd4  <- readRDS("../../../DATA_scRNAseq/Analysis_of_previously_published_data/080_Zhong_NatCom_Diabetes/241019_NatComm_merged_filt_cd4_noPR.rds")

In [None]:
aggexp = AggregateExpression(nc_cd4, return.seurat = F, group.by = "source", 
                          assay = "RNA")


In [None]:
colnames(aggexp$RNA)  <- paste("CD4", colnames(aggexp$RNA))

In [None]:
colnames(aggexp$RNA) %in% md_merged$Sample_ID  %>% table

In [None]:
(colnames(aggexp$RNA) %in% md_merged$Sample_ID )

In [None]:
colnames(aggexp$RNA) 

In [None]:
mtx_nc_cd4  <- aggexp$RNA[,which(colnames(aggexp$RNA) %in% md_merged$Sample_ID )]

In [None]:
aggexp = AggregateExpression(nc_cd8_nk, return.seurat = F, group.by = "source", 
                          assay = "RNA")


In [None]:
colnames(aggexp$RNA)  <- paste("CD8_NK", colnames(aggexp$RNA))

In [None]:
colnames(aggexp$RNA) %in% md_merged$Sample_ID  %>% table

In [None]:
(colnames(aggexp$RNA) %in% md_merged$Sample_ID )

In [None]:
colnames(aggexp$RNA) 

In [None]:
mtx_nc_cd8  <- aggexp$RNA[,which(colnames(aggexp$RNA) %in% md_merged$Sample_ID )]

### ParseBio

In [None]:
pb_cd4  <- readRDS("../../../DATA_scRNAseq/Analysis_of_previously_published_data/081_ParseBio_1M_Diabetes/241020_ParseBio_200k_CD4.rds")
pb_cd8_nk  <- readRDS("../../../DATA_scRNAseq/Analysis_of_previously_published_data/081_ParseBio_1M_Diabetes/241020_ParseBio_200k_CD8_and_NK.rds")


In [None]:
aggexp = AggregateExpression(pb_cd4, return.seurat = F, group.by = "sample", 
                          assay = "RNA")


In [None]:
colnames(aggexp$RNA)  <- paste("CD4", colnames(aggexp$RNA))

In [None]:
colnames(aggexp$RNA) %in% md_merged$Sample_ID  %>% table

In [None]:
colnames(aggexp$RNA) 

In [None]:
mtx_pb_cd4  <- aggexp$RNA

In [None]:
aggexp = AggregateExpression(pb_cd8_nk, return.seurat = F, group.by = "sample", 
                          assay = "RNA")


In [None]:
colnames(aggexp$RNA)  <- paste("CD8_NK", colnames(aggexp$RNA))

In [None]:
md_merged  %>% dplyr::filter(Study == "ParseBio")

In [None]:
# Append "_D" to the first 12 column names and "_H" to the next 12.
# NOTE(review): the suffix is assigned purely by column position; this assumes
# the 24 columns are ordered with the 12 "_D" samples first - confirm.
test <- paste0(colnames(aggexp$RNA), rep(c("_D", "_H"), each = 12))

In [None]:
test %in% md_merged$Sample_ID  %>% table

In [None]:
colnames(aggexp$RNA)  <- test

In [None]:
mtx_pb_cd8  <- aggexp$RNA

### HPAP

In [None]:
hpap_ln$Tissue  %>% table

In [None]:
hpap_ln$Sample_ID  <- paste(hpap_ln$Patient_ID, hpap_ln$Tissue)

In [None]:
aggexp = AggregateExpression(hpap_ln, return.seurat = F, group.by = "Sample_ID", 
                          assay = "RNA")

In [None]:
mtx_hpap_ln  <- aggexp$RNA

In [None]:
hpap_spl$Sample_ID  <- paste(hpap_spl$Patient_ID, hpap_spl$Tissue)

In [None]:
aggexp = AggregateExpression(hpap_spl, return.seurat = F, group.by = "Sample_ID", 
                          assay = "RNA")

In [None]:
mtx_hpap_spl  <- aggexp$RNA

### Matrix of sc studies 

In [None]:
library(data.table)

In [None]:
dt_hpap_spl  <- as.data.table(mtx_hpap_spl, keep.rownames = TRUE)
dt_hpap_ln <- as.data.table(mtx_hpap_ln, keep.rownames = TRUE)
dt_nc_cd8 <- as.data.table(mtx_nc_cd8, keep.rownames = TRUE)
dt_nc_cd4 <- as.data.table(mtx_nc_cd4, keep.rownames = TRUE)
dt_pc_cd8 <- as.data.table(mtx_pb_cd8, keep.rownames = TRUE)
dt_pb_cd4 <- as.data.table(mtx_pb_cd4, keep.rownames = TRUE)
dt_honar <- as.data.table(mtx_honar, keep.rownames = TRUE)
dt_48_cd4 <- as.data.table(mtx_cd4_lab48, keep.rownames = TRUE)
dt_48_cd8 <- as.data.table(mtx_cd8_lab48, keep.rownames = TRUE)

In [None]:
merged <- merge(dt_hpap_spl, dt_hpap_ln, by = "rn", all = TRUE)

In [None]:
# Full outer joins on the gene key "rn": the remaining per-study tables are
# folded onto `merged` one after another, in the listed order.
merged <- Reduce(
  function(acc, study_dt) merge(acc, study_dt, by = "rn", all = TRUE),
  list(
    dt_nc_cd8, dt_nc_cd4, dt_pc_cd8, dt_pb_cd4,
    dt_honar, dt_48_cd4, dt_48_cd8
  ),
  merged
)

In [None]:
md_merged_sc  <- md_merged  %>% dplyr::filter(Sample_ID %in% colnames(merged))

In [None]:
test  <- colnames(merged)[c(1,match(md_merged_sc$Sample_ID, colnames(merged)))]

In [None]:
(test[2:length(test)] == md_merged_sc$Sample_ID)  %>% table

In [None]:
merged_sc  <- as.data.frame(merged)[,c(1,match(md_merged_sc$Sample_ID, colnames(merged)))]

In [None]:
# Sample columns of merged_sc (everything except the first column) must equal
# the metadata Sample_ID in the same order. [-1] replaces a hard-coded 2:271
# range so the check does not depend on the current sample count.
all.equal(colnames(merged_sc)[-1], md_merged_sc$Sample_ID)

### Matrix of bulk studies

In [None]:
md_merged  <- read_csv("../tables/other_data_matrix_metadata.csv")
merged  <- read_csv("../tables/other_data_matrix.csv")

merged$`...1`  <- NULL

md_merged$`...1`  <- NULL

In [None]:
md_without_sc  <- md_merged  %>% dplyr::filter(Study_CellType %in% c("Jailwala CD4+ CD25+", "Kallionpaa CD4",
                                                                    "Kallionpaa CD8", "Newman CD19+/CD27- B cells",
                                                                    "Newman Memory CD4+", "Newman Memory CD4+/CD25- T cells",
                                                                    "Newman Memory CD4+/CD25+ regulatory T cells",
                                                                    "Transimmunome Whole blood"))

In [None]:
md_without_sc$Study_CellType  %>% table

In [None]:
colnames(merged) %in% md_without_sc$Sample_ID  %>% table

In [None]:
merged_without_sc  <- as.data.frame(merged)[,c(1,which(colnames(merged) %in% md_without_sc$Sample_ID))]

### Matrix bulk and sc

In [None]:
merged_all  <- full_join(merged_without_sc, merged_sc, by = "rn")

In [None]:
merged_all

In [None]:
md_all  <- rbind(md_without_sc, md_merged_sc)

In [None]:
# Metadata rows must line up with the sample columns of merged_all (everything
# except the first column). [-1] replaces a hard-coded 2:971 range so the check
# does not depend on the current sample count.
all.equal(md_all$Sample_ID, colnames(merged_all)[-1])

### Filter genes

In [None]:
library(biomaRt)
mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")

all_genes <- getBM(attributes = c("chromosome_name", "hgnc_symbol"), mart = mart)

all_genes  %>% group_by(chromosome_name)  %>% tally

all_genes_without_Y  <-  all_genes %>% dplyr::filter(chromosome_name %in% c(1:22,"X")) 

In [None]:
all_genes_without_Y

In [None]:
# Keep rows whose `rn` is an HGNC symbol in all_genes_without_Y and does not
# start with "MT", "MIR", "OR4F" or "RPL"/"RPS", nor contain ".", "-", "LINC",
# "HNRNP" or "orf".
merged_all_filt <- dplyr::filter(
  merged_all,
  rn %in% all_genes_without_Y$hgnc_symbol,
  !grepl("^MT|\\.|LINC|^MIR|^OR4F|HNRNP|^RP[LS]|\\-|orf", rn)
)

In [None]:
merged_all_filt2   <- merged_all_filt  %>% column_to_rownames("rn")

In [None]:
md_all

### Mtx fc Seurat no gene filtering

In [None]:
#' Build a Seurat object and Dia-vs-Ctrl fold changes for one study/cell type
#' from the combined, gene-filtered matrix.
#'
#' Reads two objects from the calling environment: `md_all` (sample metadata)
#' and `merged_all_filt2` (gene-by-sample table with genes as row names).
#'
#' @param i Position of the study/cell-type label within
#'   `levels(factor(md_all$Study_CellType))`.
#' @return A Seurat object for the selected samples (features present in fewer
#'   than 3 samples are dropped via `min.cells = 3`). The fold-change table is
#'   stored in `@misc$fc` with added columns `Study_CellType`, `decile` and
#'   `gene`. Despite its name, `decile` holds `ntile(avg_log2FC, 20)`, i.e. 20
#'   bins.
bulk_to_seurat_wilcox_FC <- function(i) {
  study_label <- levels(factor(md_all$Study_CellType))[i]
  md_study <- md_all %>% dplyr::filter(Study_CellType == study_label)

  # Sample columns of this study; missing values are replaced by zero.
  mtx_study <- merged_all_filt2[
    , colnames(merged_all_filt2) %in% md_study$Sample_ID
  ]
  mtx_study[is.na(mtx_study)] <- 0

  # The metadata is attached column-wise below, so its rows are first put in
  # the same order as the matrix columns instead of relying on the two tables
  # already being sorted identically.
  md_study <- md_study[match(colnames(mtx_study), md_study$Sample_ID), ]

  seu <- CreateSeuratObject(mtx_study, min.cells = 3, min.features = 0)
  seu <- NormalizeData(seu)
  seu <- ScaleData(seu)
  seu$Sample_ID <- colnames(seu)
  seu@meta.data <- cbind(seu@meta.data, md_study)
  rownames(seu@meta.data) <- colnames(seu)
  Idents(seu) <- seu$Disease

  fc <- FoldChange(
    seu,
    ident.1 = "Dia", ident.2 = "Ctrl",
    pseudocount.use = 0.1
  )
  fc$Study_CellType <- study_label
  fc$decile <- ntile(fc$avg_log2FC, 20)
  fc$gene <- rownames(fc)
  seu@misc$fc <- fc
  seu
}

In [None]:
# One Seurat object per study/cell-type label. The index range is derived from
# the labels instead of a hard-coded 1:17, so it follows the metadata.
seurats_wilcox_FC <- map(
  seq_along(levels(factor(md_all$Study_CellType))),
  .f = bulk_to_seurat_wilcox_FC
)

In [None]:
# Stack the fold-change tables of all study/cell-type labels into one table.
# The tables are collected first and bound once, instead of growing the result
# inside a loop; seq_along() also avoids the backwards 2:1 range that
# 2:length(...) produces when there is a single label.
fc_merged <- bind_rows(
  lapply(
    seq_along(levels(factor(md_all$Study_CellType))),
    function(i) seurats_wilcox_FC[[i]]@misc$fc
  )
)

In [None]:
fc_merged  %>% dplyr::filter(Study_CellType != "Newman CD19+/CD27- B cells") %>% 
group_by(gene, Study_CellType, decile)  %>% dplyr::filter(decile %in% c(1,2))   %>% 
group_by(gene)  %>% tally()  %>% arrange(desc(n)) 

In [None]:
fc_merged  %>% dplyr::filter(Study_CellType != "Newman CD19+/CD27- B cells") %>% 
group_by(gene, Study_CellType, decile)  %>% dplyr::filter(decile %in% c(19,20))   %>% 
group_by(gene)  %>% tally()  %>% arrange(desc(n))  

In [None]:
genes1  <- fc_merged  %>% dplyr::filter(Study_CellType != "Newman CD19+/CD27- B cells")  %>% 
group_by(gene, Study_CellType, decile)  %>% dplyr::filter(decile %in% c(1,2))   %>% 
group_by(gene)  %>% tally()  %>% arrange(desc(n))   %>% dplyr::filter(n>6)  %>% pull(gene)

In [None]:
genes1

In [None]:
# Genes that fall into the two highest bins (19, 20) in more than 6
# study/cell-type tables. The "Newman CD19+/CD27- B cells" table is excluded
# from the count, as it is for genes1 and for the heatmap matrix built from
# these gene lists.
genes2 <- fc_merged %>%
  dplyr::filter(Study_CellType != "Newman CD19+/CD27- B cells") %>%
  dplyr::filter(decile %in% c(19, 20)) %>%
  group_by(gene) %>%
  tally() %>%
  arrange(desc(n)) %>%
  dplyr::filter(n > 6) %>%
  pull(gene)

In [None]:
genes2

In [None]:
# Gene-by-study matrix of bin numbers for the selected genes (genes1, genes2),
# without the "Newman CD19+/CD27- B cells" table and without rows lacking a
# study label. Genes absent from a study are NA in that column.
# filter() is namespace-qualified throughout, as in the rest of the notebook,
# so the call cannot resolve to a masking filter() from another package.
mtx_all_studies_deciles <- fc_merged %>%
  dplyr::filter(Study_CellType != "Newman CD19+/CD27- B cells") %>%
  dplyr::filter(!is.na(Study_CellType)) %>%
  dplyr::filter(gene %in% c(genes1, genes2)) %>%
  dplyr::select(gene, Study_CellType, decile) %>%
  pivot_wider(names_from = Study_CellType, values_from = decile) %>%
  column_to_rownames("gene") %>%
  as.matrix()

In [None]:
library(pheatmap)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)
pheatmap(t(mtx_all_studies_deciles), scale = "row",
        color=colorRampPalette(c("dodgerblue", "white", "indianred2"))(50), 
         border_color = "white", width = 14, height = 4, 
                  fontsize = 12)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 5)
pheatmap(t(mtx_all_studies_deciles), scale = "none",
        color=colorRampPalette(c("dodgerblue", "white", "indianred2"))(50), 
         border_color = "white", width = 14, height = 4, 
                  fontsize = 12)

In [None]:
genes  <- c(genes1, genes2)

In [None]:
genes1

In [None]:
genes2

In [None]:
pheatmap(t(mtx_all_studies_deciles[match(genes, rownames(mtx_all_studies_deciles)),]), scale = "none", 
         cluster_cols = F,
        color=colorRampPalette(c("dodgerblue", "white", "indianred2"))(50), 
         border_color = "white", width = 14, height = 4, 
                  fontsize = 12, na_col = "grey50")

In [None]:
pheatmap(t(mtx_all_studies_deciles[match(genes2, rownames(mtx_all_studies_deciles)),]), scale = "none", 
         cluster_cols = T, cluster_rows = T,
        color=colorRampPalette(c("dodgerblue", "white", "indianred2"))(50), 
         border_color = "white", width = 14, height = 4, 
                  fontsize = 12, na_col = "grey50")

In [None]:
colnames(mtx_all_studies_deciles)

In [None]:
pheatmap(t(mtx_all_studies_deciles[match(genes2, rownames(mtx_all_studies_deciles)),]), scale = "none", 
         cluster_cols = T, cluster_rows = T,
        color=colorRampPalette(c("dodgerblue", "white", "indianred2"))(50), 
         border_color = "white", width = 14, height = 4, 
                  fontsize = 12, na_col = "grey50", filename = "../figures/heatmaps/all_datasets_up.pdf")

In [None]:
# Heatmap of the genes1 rows for an explicitly ordered subset of the
# study / cell type columns. After transposition the studies are the heatmap
# rows and stay in the order listed here (cluster_rows = FALSE), while the
# genes (heatmap columns) are hierarchically clustered. Missing values are
# drawn in grey.
pheatmap(
  t(mtx_all_studies_deciles[
    match(genes1, rownames(mtx_all_studies_deciles)),
    match(
      c(
        'ParseBio CD4', 'ParseBio CD8_NK',
        'HPAP LN', 'HPAP Spleen T cells',
        'Honardoost T cells', 'Kallionpaa CD4', 'Kallionpaa CD8',
        'NatComm CD4', 'NatComm CD8_NK',
        'Lab48 CD4 CD4', 'Lab48 CD8 CD8',
        'Transimmunome Whole blood',
        'Jailwala CD4+ CD25+', 'Newman Memory CD4+',
        'Newman Memory CD4+/CD25- T cells',
        'Newman Memory CD4+/CD25+ regulatory T cells'
      ),
      colnames(mtx_all_studies_deciles)
    )
  ]),
  scale = "none",
  cluster_cols = TRUE,
  cluster_rows = FALSE,
  color = colorRampPalette(c("dodgerblue", "white", "indianred2"))(50),
  border_color = "white",
  width = 14,
  height = 4,
  fontsize = 12,
  na_col = "grey50"
)

In [None]:
# Heatmap of the genes1 rows for an explicitly ordered subset of the
# study / cell type columns, written to a PDF file. Studies (heatmap rows)
# keep the order listed here; genes (heatmap columns) are clustered.
# The output directory is created first so that a fresh checkout does not
# fail when the file is opened for writing.
dir.create("../figures/heatmaps", recursive = TRUE, showWarnings = FALSE)

pheatmap(
  t(mtx_all_studies_deciles[
    match(genes1, rownames(mtx_all_studies_deciles)),
    match(
      c(
        'ParseBio CD4', 'ParseBio CD8_NK',
        'HPAP LN', 'HPAP Spleen T cells',
        'Honardoost T cells', 'Kallionpaa CD4', 'Kallionpaa CD8',
        'NatComm CD4', 'NatComm CD8_NK',
        'Lab48 CD4 CD4', 'Lab48 CD8 CD8',
        'Transimmunome Whole blood',
        'Jailwala CD4+ CD25+', 'Newman Memory CD4+',
        'Newman Memory CD4+/CD25- T cells',
        'Newman Memory CD4+/CD25+ regulatory T cells'
      ),
      colnames(mtx_all_studies_deciles)
    )
  ]),
  scale = "none",
  cluster_cols = TRUE,
  cluster_rows = FALSE,
  color = colorRampPalette(c("dodgerblue", "white", "indianred2"))(50),
  border_color = "white",
  width = 14,
  height = 4,
  fontsize = 12,
  na_col = "grey50",
  filename = "../figures/heatmaps/all_datasets_down.pdf"
)

Save data for upload to Zenodo:

## PCA of genes in datasets - revisions

In [None]:
# Build a wide table with one row per gene and one column per Study_CellType
# holding the `decile` value, keeping only genes that have a value in every
# column. The "Newman CD19+/CD27- B cells" group is excluded before reshaping.
# All verbs are namespace-qualified so the result does not depend on which
# attached package currently masks filter()/select().
fc_merged_without_na <- fc_merged %>%
  dplyr::filter(Study_CellType != "Newman CD19+/CD27- B cells") %>%
  dplyr::select(Study_CellType, decile, gene) %>%
  tidyr::pivot_wider(names_from = Study_CellType, values_from = decile) %>%
  # Drop every row that contains an NA in any column.
  tidyr::drop_na()


In [None]:
# Convert the complete-case table to a matrix: the `gene` column becomes the
# row names and the remaining columns become the matrix columns.
fc_merged_without_na_mtx <- as.matrix(
  column_to_rownames(fc_merged_without_na, var = "gene")
)

In [None]:
# Display the matrix that is passed to prcomp() in the next cell.
fc_merged_without_na_mtx

In [None]:
# Perform PCA on the complete-case matrix. Rows (genes) are the observations
# and columns (Study_CellType) are the variables; scale. = TRUE rescales each
# variable to unit variance before the decomposition.
pca_res <- prcomp(fc_merged_without_na_mtx, scale. = TRUE)

# Check summary (importance of each component)
summary(pca_res)

# View loadings (rotation): one row per variable, one column per component
pca_res$rotation

# View principal component scores: one row per gene
pca_res$x

In [None]:
# factoextra provides the fviz_* plotting helpers used for the PCA figures.
library(factoextra)

# Set the repr plot size options to a square 10 x 10 area.
options(repr.plot.width = 10, repr.plot.height = 10)

# Default variable plot of the PCA result.
fviz_pca_var(X = pca_res)

In [None]:
# PCA variable plot with non-overlapping black labels, styled with the
# notebook-wide ggtheme(). The plot object is kept in a variable so that it
# can be handed to ggsave() explicitly instead of relying on last_plot().
pca_var_plot <- fviz_pca_var(
  pca_res,
  repel = TRUE,
  col.var = "black",
  labelsize = 4
) +
  ggtheme()

# Show the figure in the notebook.
pca_var_plot

# Make sure the target directory exists, then write the figure as SVG.
dir.create("../figures", recursive = TRUE, showWarnings = FALSE)
ggsave(
  "../figures/pca_genes.svg",
  plot = pca_var_plot,
  width = 20,
  height = 20,
  units = "cm"
)

In [None]:

# Genes to call out on the PC1/PC2 scatter
genes_to_highlight <- c("CX3CR1", "GNLY", "NKG7","PLEK","AZU1")

# Label vector aligned with the matrix rows: the gene name for highlighted
# genes, an empty string for everything else
gene_ids <- rownames(fc_merged_without_na_mtx)
gene_labels <- ifelse(gene_ids %in% genes_to_highlight, gene_ids, "")

# Per-gene PCA scores plus the label column
coords <- as.data.frame(pca_res$x)
coords$label <- gene_labels

# Rows that carry a non-empty label (the highlighted genes)
highlighted <- coords[coords$label != "", , drop = FALSE]

# All genes in grey, highlighted genes over-plotted and labelled in red
ggplot(coords, aes(x = PC1, y = PC2)) +
  geom_hline(yintercept = 0, color = "black") +
  geom_vline(xintercept = 0, color = "black") +
  geom_point(color = "grey", size = 1.5) +
  geom_point(data = highlighted, color = "red", size = 2) +
  geom_text(
    data = highlighted,
    aes(label = label),
    vjust = -1,
    color = "red",
    size = 6
  ) +
  labs(title = "PCA analysis of gene changes in T1D vs healthy") +
  theme_minimal() +
  ggtheme()
