#### Summary:
In this notebook I prepare the inputs for the association analyses in both modalities (RNA and ATAC): the donor metadata tables and the per-cell-type pseudobulk count matrices. Later notebooks perform the associations and the necessary meta-analyses.

Required inputs:
- Final Seurat object with metadata columns for donor ID (library here) and cell type (major_celltypes_fin here)

In [1]:
suppressMessages(library(hdf5r)) 
suppressMessages(library(Seurat))
suppressMessages(library(DESeq2))

suppressMessages(library(dplyr)) 
suppressMessages(library(ggplot2))
suppressMessages(library(ggpubr)) 

suppressMessages(library(Matrix)) 
suppressMessages(library(data.table))
suppressMessages(library(future)) 
suppressMessages(library(stringr))
suppressMessages(library(stringi))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(readr))

suppressMessages(library(enrichR))
suppressMessages(library(fgsea))
suppressMessages(library(ggrepel))
suppressMessages(library(RColorBrewer))
suppressMessages(library(shadowtext))
suppressMessages(library(forcats))

In [2]:
suppressMessages(library(ggplot2))
suppressMessages(library(tidyverse))

# Basic Inputs

In [3]:
# Alberta donor IDs: 'R' followed by the record number
alberta_samples <- paste0('R', c(207, 217, 218, 221, 223,
                                 226, 228, 234, 237, 238,
                                 246, 247, 275, 284,
                                 290, 292, 316, 317, 319,
                                 325, 326, 327, 332,
                                 353, 354, 362, 363, 364))

# HPAP donors with RNA data: 'HPAP-' followed by the zero-padded three-digit donor number
hpap_samples_rna <- sprintf('HPAP-%03d', c(22, 26, 34, 35, 36,
                                           37, 39, 40, 42, 44,
                                           47, 52, 53, 54, 56,
                                           59, 63, 74, 75, 77,
                                           80, 82, 99, 101, 103,
                                           104, 105))
length(hpap_samples_rna)

# HPAP donors with ATAC data (same ID format)
hpap_samples_atac <- sprintf('HPAP-%03d', c(35, 36, 39, 40, 44,
                                            45, 47, 49, 50, 52,
                                            53, 54, 56, 59, 63,
                                            67, 69, 72, 75, 77,
                                            80, 92, 99, 101, 103,
                                            104, 105))
length(hpap_samples_atac)

In [4]:
# Every HPAP donor that has RNA and/or ATAC data, in sorted order
hpap_samples_all <- sort(union(hpap_samples_rna, hpap_samples_atac))
length(hpap_samples_all)

In [29]:
# Root directory under which this notebook writes its metadata tables and count matrices
outdir <- '/overall/dir/to/save/phenotype/association/outputs/to'

In [30]:
# Cell types used throughout: fixes the column order of the per-cell-type tables
# and is the set looped over when writing pseudobulk matrices
joint_celltypes <- c('beta', 'alpha', 'delta', 'gamma', 'acinar','ductal', 'endothelial', 'immune','stellate')

# 1. Read in Seurat objects (Alberta and HPAP)

## Alberta

In [6]:
# Load the final Alberta multiome Seurat object and print its summary
rds_fp <- '/path/to/Alberta/multiome/final_object.rds'
adata1 <- readRDS(rds_fp)
adata1

Loading required package: Signac



An object of class Seurat 
571775 features across 174819 samples within 4 assays 
Active assay: ATAC (210485 features, 210485 variable features)
 3 other assays present: RNA, SCT, ATAC_CTpeaks
 7 dimensional reductions calculated: pca, harmony.rna, umap.rna, lsi, harmony.atac, umap.atac, umap.wnn

In [8]:
# Number of cells per final major cell type across the whole Alberta object
table(adata1$major_celltypes_fin)


     acinar       alpha        beta       delta      ductal endothelial 
      27387       44500       81100        8911        5452         342 
      gamma      immune    stellate 
       5504         633         990 

### HPAP RNA

In [42]:
#read in hpap object
# (HPAP RNA Seurat object; evaluating the name prints its assay/reduction summary)
rds_fp2 <- '/path/to/HPAP/RNA/final_object.rds'
adata2 <- readRDS(rds_fp2)
adata2

An object of class Seurat 
36601 features across 192203 samples within 1 assay 
Active assay: RNA (36601 features, 2000 variable features)
 3 dimensional reductions calculated: pca, harmony, umap

In [43]:
# subset down to the ND samples
# keeps only cells whose Diabetes_Status_w_AAB metadata value is exactly 'ND'
adata2_nd <- subset(adata2, subset=Diabetes_Status_w_AAB=='ND')
adata2_nd

An object of class Seurat 
36601 features across 71871 samples within 1 assay 
Active assay: RNA (36601 features, 2000 variable features)
 3 dimensional reductions calculated: pca, harmony, umap

In [44]:
#rename celltypes to match the alberta ones -- combine mast and macrophage into immune
# NOTE(review): the new labels are paired with the original ones purely by position,
# i.e. new_celltypes must follow the order returned by unique(adata2_nd$cell_type).
# Re-check this pairing whenever the input object or the subset changes.
new_celltypes <- c('alpha','alpha+beta','stellate','stellate','endothelial','delta',
                   'beta','gamma','immune', 'ductal','acinar','alpha','ductal','immune')
# values = original labels, names = harmonised labels
celltype_map <- setNames(unique(adata2_nd$cell_type), new_celltypes)
adata2_nd$matched_celltype <- plyr::mapvalues(adata2_nd$cell_type,
                                              from=celltype_map,
                                              to=names(celltype_map))

In [45]:
# Sanity check: every library present in the ND RNA subset should be one of hpap_samples_rna
table(unique(adata2_nd$library) %in% hpap_samples_rna)


TRUE 
  27 

### HPAP ATAC

In [46]:
#read in hpap object
# (HPAP ATAC Seurat object; evaluating the name prints its assay/reduction summary)
rds_fp3 <- '/path/to/HPAP/ATAC/final_object.rds'
adata3 <- readRDS(rds_fp3)
adata3

An object of class Seurat 
1303997 features across 97837 samples within 4 assays 
Active assay: ATAC_peaks (481311 features, 481311 variable features)
 3 other assays present: RNA, Final_Peaks, Unified_Peaks
 3 dimensional reductions calculated: lsi, harmony.atac, umap.atac

In [47]:
# subset down to the ND samples
# in this object the status lives in the `condition` column and the kept value is 'Control'
adata3_nd <- subset(adata3, subset=condition=='Control')
adata3_nd

An object of class Seurat 
1303997 features across 66872 samples within 4 assays 
Active assay: ATAC_peaks (481311 features, 481311 variable features)
 3 other assays present: RNA, Final_Peaks, Unified_Peaks
 3 dimensional reductions calculated: lsi, harmony.atac, umap.atac

In [48]:
#rename celltypes to match the alberta ones -- combine mast and macrophage into immune
# NOTE(review): the new labels are paired with the original ones purely by position,
# i.e. new_celltypes must follow the order returned by unique(adata3_nd$Cell.Type).
# Re-check this pairing whenever the input object or the subset changes.
new_celltypes <- c('acinar','ductal','alpha','beta','delta','immune','gamma',
                   'stellate','stellate','stellate','endothelial','ductal')
# values = original labels, names = harmonised labels
celltype_map <- setNames(unique(adata3_nd$Cell.Type), new_celltypes)
adata3_nd$matched_celltype <- plyr::mapvalues(adata3_nd$Cell.Type,
                                              from=celltype_map,
                                              to=names(celltype_map))

In [50]:
# Sanity check: every library present in the control ATAC subset should be one of hpap_samples_atac
table(unique(adata3_nd$library) %in% hpap_samples_atac)


TRUE 
  27 

# 2. Metadata Preparation for each dataset

## Alberta samples

### Read in phenotype metadata

In [97]:
#read in extended Alberta donor metadata
meta_dir <- '/path/to/alberta/sample/metadata'
fp <- file.path(meta_dir, 'donor_metadata_combined.tsv')
meta <- as.data.frame(fread(fp, sep='\t', header=TRUE))

#change colnames to be more R friendly
# 1) gsub drops any parenthesised text (plus whitespace before it) from each name
# 2) stri_replace_all_regex then swaps the listed characters for their replacements
#    (presumably applied one pattern after another because of vectorize=FALSE -- confirm against stringi docs)
# 3) tolower lower-cases the result
init_names <- colnames(meta)
colnames(meta) <- tolower(stri_replace_all_regex(gsub(r"{\s*\([^\)]+\)}","",colnames(meta)),
                                                 pattern=c(' ', '%', ',', ':','-','/'), 
                                                 replacement=c('_','pct','','','_','_div_'), vectorize=FALSE))
# lookup from cleaned column name (names) back to the original column name (values)
meta_names <- init_names
names(meta_names) <- colnames(meta)

#remove rows for samples we removed
# i.e. keep only donors listed in alberta_samples
meta2 <- meta[meta$record_id %in% alberta_samples,]

#remove unnecessary columns from meta
rm_cols <- c('rrid','known_medical_condition', 'was_the_tissue_purified?', 
             'excellent_pct','good_pct','fair_pct','poor_pct', 
             'ieq_per_cryopreservation_tube','number_of_cryopreserved_tubes_remaining',
             'ieq_per_cryopreservation_tube.1','number_of_snap_frozen_tubes_remaining',
             'is_an_embedded_biopsy_available_for_this_donor?', 'are_embedded_islets_available_for_this_donor?')
meta_cut <- meta2[,!colnames(meta2) %in% rm_cols]

# drop the first column and use record_id as row names / `library`
# NOTE(review): this assumes record_id is the first column of meta_cut -- confirm
meta1 <- meta_cut[,-1]
row.names(meta1) <- meta_cut$record_id
meta1$library <- row.names(meta1)

#shorten names for bmi and hba1c
# NOTE(review): columns are renamed by position (6 and 7); verify these are the BMI and
# HbA1c columns if the metadata file or rm_cols ever changes
colnames(meta1)[6] <- 'BMI'
colnames(meta1)[7] <- 'HbA1c'

#cut down to measurements of interest
meta1_cut <- select(meta1, age, sex, BMI, HbA1c, culture_time)
head(meta1_cut)

Unnamed: 0_level_0,age,sex,BMI,HbA1c,culture_time
Unnamed: 0_level_1,<int>,<chr>,<dbl>,<dbl>,<dbl>
R207,50,Female,22.2,,70
R217,71,Female,35.5,6.3,15
R218,73,Female,28.4,5.9,70
R221,44,Male,30.5,5.3,136
R223,54,Male,27.0,5.8,64
R226,30,Female,32.3,4.9,16


### Calculate additional sample level covariates for Alberta samples
- For both
    - Proportion of beta cells 
    - Total cells per donor
    - Number of cells per cell type
- RNA
    - Mean genes per cell type
- ATAC
    - Average TSSe

In [98]:
#calculate proportion of beta cells
# cell type x library count table, reshaped so that each library is one 'Freq.<library>' column
raw_cts <- reshape(as.data.frame(table(adata1@meta.data$major_celltypes_fin,
                                       adata1@meta.data$library)),
                   idvar='Var1', timevar='Var2', direction='wide')
row.names(raw_cts) <- raw_cts$Var1
raw_cts <- raw_cts[,-1]
# divide every library column by its total so each column holds cell-type proportions
prop_cts <- sweep(raw_cts, 2, colSums(raw_cts), `/`)
colnames(prop_cts) <- gsub('Freq.','',colnames(prop_cts))

#now convert this to a long format
long_prop_cts <- tidyr::pivot_longer(tibble::rownames_to_column(prop_cts, var='ct'),
                                     -ct, names_to='library', values_to='prop')
long_prop_cts$pc <- 100 * long_prop_cts$prop

#arrange into a dataframe with other info
# the same values are stored under both the _rna and _atac names
beta_prop <- unlist(prop_cts['beta',])
num_cells <- as.vector(table(adata1$library))
prop_data1 <- data.frame(beta_prop_rna=beta_prop,
                         beta_prop_atac=beta_prop,
                         num_cells_rna=num_cells,
                         num_cells_atac=num_cells)
head(prop_data1)

Unnamed: 0_level_0,beta_prop_rna,beta_prop_atac,num_cells_rna,num_cells_atac
Unnamed: 0_level_1,<dbl>,<dbl>,<int>,<int>
R207,0.5168054,0.5168054,6248,6248
R217,0.2546716,0.2546716,5619,5619
R218,0.6887661,0.6887661,2172,2172
R221,0.2156263,0.2156263,4902,4902
R223,0.5443796,0.5443796,6343,6343
R226,0.5523939,0.5523939,5869,5869


In [99]:
#add in # cells per cell type info to metadata
# library x cell type count table, reshaped to one row per library
results <- reshape(as.data.frame(table(adata1$library, adata1$major_celltypes_fin)),
                   idvar='Var1', timevar='Var2', direction='wide')
row.names(results) <- results$Var1
results <- select(results, -Var1)
colnames(results) <- paste0(gsub('Freq.','',colnames(results)),'_cells')
# keep only the joint cell types, in the joint_celltypes order
col_order <- paste0(joint_celltypes, '_cells')
results <- select(results, all_of(col_order))
head(results)

# the same counts are stored twice, once with an _rna and once with an _atac suffix
results1 <- setNames(results, paste0(colnames(results),'_rna'))

results2 <- setNames(results, paste0(colnames(results),'_atac'))

Unnamed: 0_level_0,beta_cells,alpha_cells,delta_cells,gamma_cells,acinar_cells,ductal_cells,endothelial_cells,immune_cells,stellate_cells
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
R207,3229,2025,217,99,580,45,8,33,12
R217,1431,348,150,569,2489,614,2,10,6
R218,1496,227,133,83,165,52,2,4,10
R221,1057,905,183,44,2389,294,8,5,17
R223,3453,1616,225,101,711,203,7,8,19
R226,3242,1492,193,79,815,34,2,4,8


In [100]:
### Function to calculate mean_genes per sample for a cell type in an adata object
### Basically just average of nFeature_RNA for a celltype and sample pair
###
### Args:
###   adata:        object whose @meta.data slot is a data frame containing the columns
###                 `library`, `nFeature_RNA` and the column named by `celltype_col`
###   celltypes:    character vector of cell type labels to summarise
###   samples:      character vector of library IDs to summarise
###   celltype_col: name of the metadata column holding the cell type labels
### Returns:
###   data frame with one row per sample (row names = samples) and one column per
###   cell type named '<celltype>_mean_genes'; a sample/cell type pair without any
###   cells gets NaN (the mean of an empty vector)
calc_mean_genes <- function(adata, celltypes, samples, celltype_col){
    # Pull the metadata once instead of re-extracting it for every pair
    cell_meta <- adata@meta.data
    cell_labels <- cell_meta[, celltype_col]

    # Collect mean of nFeature_RNA across samples and cell types
    mean_genes <- lapply(celltypes, function(celltype){
        vapply(samples, function(sample_id){
            # which() drops NA comparisons, so cells with a missing label or library
            # are ignored instead of turning the whole mean into NA
            keep <- which(cell_labels == celltype & cell_meta$library == sample_id)
            mean(cell_meta$nFeature_RNA[keep])
        }, numeric(1), USE.NAMES = FALSE)
    })
    names(mean_genes) <- celltypes

    # Format results as a df
    mean_genes_df <- as.data.frame(do.call(cbind, mean_genes))
    row.names(mean_genes_df) <- samples
    colnames(mean_genes_df) <- paste0(celltypes, '_mean_genes')
    return(mean_genes_df)
}

In [101]:
#calculate mean genes for alberta samples
# rows follow the order of alberta_samples, columns the order of joint_celltypes
alberta_mean_genes <- calc_mean_genes(adata1, joint_celltypes, alberta_samples, 'major_celltypes_fin')
head(alberta_mean_genes)

Unnamed: 0_level_0,beta_mean_genes,alpha_mean_genes,delta_mean_genes,gamma_mean_genes,acinar_mean_genes,ductal_mean_genes,endothelial_mean_genes,immune_mean_genes,stellate_mean_genes
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
R207,2461.714,2194.071,2378.373,2086.808,1462.876,1842.333,1304.0,1118.606,1314.583
R217,2545.581,2296.241,2301.96,2353.497,1800.699,1868.313,1578.5,1458.0,1771.0
R218,1960.642,1809.938,1901.744,1472.627,1199.855,1384.827,1503.5,910.25,1505.2
R221,1951.037,1973.824,2066.421,1581.727,1315.754,1625.554,1376.625,1460.6,1243.706
R223,2690.103,2312.497,2464.196,2330.564,1562.639,1854.044,1592.143,1568.125,1826.053
R226,2923.349,2535.351,2881.332,2677.658,1516.972,2238.235,1328.0,2198.75,1739.25


In [102]:
### Function to calculate the average of a metadata col per sample (all cell types!)
###
### Args:
###   adata:       object whose @meta.data slot is a data frame containing a `library`
###                column and the column named by `feature_col`
###   samples:     character vector of library IDs to summarise
###   feature_col: name of the numeric metadata column to average
### Returns:
###   numeric vector named by `samples`; a sample without any cells gets NaN
calc_sample_mean <- function(adata, samples, feature_col){
    # Pull the metadata and the feature column once, outside the per-sample loop
    cell_meta <- adata@meta.data
    feature_values <- cell_meta[, feature_col]

    # Collect mean of relevant column across samples and cell types
    means <- vapply(samples, function(sample_id){
        # which() ignores cells whose library is NA instead of letting them
        # turn the sample mean into NA
        mean(feature_values[which(cell_meta$library == sample_id)])
    }, numeric(1), USE.NAMES = FALSE)

    #Return the vector
    names(means) <- samples
    return(means)
}

In [103]:
#calculate sample level means for TSSe
# named vector (names = alberta_samples) of the mean TSS.enrichment over all cells of each donor
tsse1 <- calc_sample_mean(adata1, alberta_samples, 'TSS.enrichment')
head(tsse1)

In [104]:
#combine info into one df
meta1_cut$dataset <- 'Alberta'

# The tables being combined are ordered differently (metadata file order, sorted table()
# order, alberta_samples order) and cbind() joins purely by position, so align every
# table to the donor IDs of meta1_cut first.
donor_ids <- row.names(meta1_cut)
stopifnot(all(donor_ids %in% row.names(prop_data1)),
          all(donor_ids %in% row.names(results1)),
          all(donor_ids %in% row.names(results2)),
          all(donor_ids %in% row.names(alberta_mean_genes)),
          all(donor_ids %in% names(tsse1)))

meta1_fin <- cbind(meta1_cut,
                   prop_data1[donor_ids, , drop = FALSE],
                   results1[donor_ids, , drop = FALSE],
                   results2[donor_ids, , drop = FALSE],
                   alberta_mean_genes[donor_ids, , drop = FALSE]) %>% relocate(dataset, .after=HbA1c)
meta1_fin$tsse <- unname(tsse1[donor_ids])
head(meta1_fin)

#save out df!
out_fp1 <- file.path(outdir, 'Alberta_metadata_all.tsv')
write.table(meta1_fin, out_fp1, sep='\t', quote=FALSE)

Unnamed: 0_level_0,age,sex,BMI,HbA1c,dataset,culture_time,beta_prop_rna,beta_prop_atac,num_cells_rna,num_cells_atac,⋯,beta_mean_genes,alpha_mean_genes,delta_mean_genes,gamma_mean_genes,acinar_mean_genes,ductal_mean_genes,endothelial_mean_genes,immune_mean_genes,stellate_mean_genes,tsse
Unnamed: 0_level_1,<int>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<int>,<int>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
R207,50,Female,22.2,,Alberta,70,0.5168054,0.5168054,6248,6248,⋯,2461.714,2194.071,2378.373,2086.808,1462.876,1842.333,1304.0,1118.606,1314.583,3.450389
R217,71,Female,35.5,6.3,Alberta,15,0.2546716,0.2546716,5619,5619,⋯,2545.581,2296.241,2301.96,2353.497,1800.699,1868.313,1578.5,1458.0,1771.0,3.530508
R218,73,Female,28.4,5.9,Alberta,70,0.6887661,0.6887661,2172,2172,⋯,1960.642,1809.938,1901.744,1472.627,1199.855,1384.827,1503.5,910.25,1505.2,3.641667
R221,44,Male,30.5,5.3,Alberta,136,0.2156263,0.2156263,4902,4902,⋯,1951.037,1973.824,2066.421,1581.727,1315.754,1625.554,1376.625,1460.6,1243.706,3.792649
R223,54,Male,27.0,5.8,Alberta,64,0.5443796,0.5443796,6343,6343,⋯,2690.103,2312.497,2464.196,2330.564,1562.639,1854.044,1592.143,1568.125,1826.053,2.99232
R226,30,Female,32.3,4.9,Alberta,16,0.5523939,0.5523939,5869,5869,⋯,2923.349,2535.351,2881.332,2677.658,1516.972,2238.235,1328.0,2198.75,1739.25,3.864731


## HPAP samples

### Read in phenotype metadata

In [170]:
# Read in Ruth's HPAP metadata table
meta2 <- as.data.frame(fread('/path/to/HPAP/donor/metadata.csv'))
dim(meta2)
#head(meta2)

# Cut down to comparable measurements and rename/reorder columns to match
row.names(meta2) <- meta2$donor_ID
# Indexing by a row name that does not exist silently yields an all-NA row, so fail loudly instead
stopifnot(all(hpap_samples_all %in% row.names(meta2)))
meta2_cut <- meta2[hpap_samples_all,c('age_years','gender','bmi','hba1c')]
colnames(meta2_cut) <- c('age','sex','BMI','HbA1c')

# Add in dataset and culture time
meta2_cut$dataset <- 'HPAP'
culture_time_days <- c(3,2,3,3,2,4,4,4,5,5,5,2,5,4,4,2,6,2,5,3,NA,NA,3,4,2,5,2,NA,2,NA,4,6,6,2) #manually collect culture time from dates (ex: 12/7/17-12/4/17 = 3days)
# The values are listed in the order of hpap_samples_all; check the length (a shorter
# vector could otherwise be recycled silently) and attach the donor IDs so the
# assignment is by name rather than by position.
stopifnot(length(culture_time_days) == length(hpap_samples_all))
names(culture_time_days) <- hpap_samples_all
# days -> hours
meta2_cut$culture_time <- unname(24 * culture_time_days[row.names(meta2_cut)])
meta2_cut <- meta2_cut %>% rownames_to_column(var='library')

dim(meta2_cut)
head(meta2_cut)

Unnamed: 0_level_0,library,age,sex,BMI,HbA1c,dataset,culture_time
Unnamed: 0_level_1,<chr>,<int>,<chr>,<dbl>,<dbl>,<chr>,<dbl>
1,HPAP-022,39,Female,34.7,4.7,HPAP,72
2,HPAP-026,24,Male,20.8,4.9,HPAP,48
3,HPAP-034,13,Male,18.6,5.2,HPAP,72
4,HPAP-035,35,Male,26.91,5.2,HPAP,72
5,HPAP-036,23,Female,16.0,5.2,HPAP,48
6,HPAP-037,35,Female,21.9,5.3,HPAP,96


In [171]:
#calculate proportion of beta cells -- RNA
# matched cell type x library count table, reshaped so that each library is one 'Freq.<library>' column
raw_cts <- reshape(as.data.frame(table(adata2_nd@meta.data$matched_celltype,
                                       adata2_nd@meta.data$library)),
                   idvar='Var1', timevar='Var2', direction='wide')
row.names(raw_cts) <- raw_cts$Var1
raw_cts <- raw_cts[,-1]
# divide every library column by its total so each column holds cell-type proportions
prop_cts <- sweep(raw_cts, 2, colSums(raw_cts), `/`)
colnames(prop_cts) <- gsub('Freq.','',colnames(prop_cts))

#now convert this to a long format
long_prop_cts <- tidyr::pivot_longer(tibble::rownames_to_column(prop_cts, var='ct'),
                                     -ct, names_to='library', values_to='prop')
long_prop_cts$pc <- 100 * long_prop_cts$prop

#arrange into a dataframe with other info
beta_prop <- unlist(prop_cts['beta',])
num_cells <- as.vector(table(adata2_nd$library))
prop_data1 <- data.frame(beta_prop_rna=beta_prop,
                         num_cells_rna=num_cells)
#head(prop_data1)

In [172]:
#calculate proportion of beta cells -- ATAC
# matched cell type x library count table, reshaped so that each library is one 'Freq.<library>' column
raw_cts <- reshape(as.data.frame(table(adata3_nd@meta.data$matched_celltype,
                                       adata3_nd@meta.data$library)),
                   idvar='Var1', timevar='Var2', direction='wide')
row.names(raw_cts) <- raw_cts$Var1
raw_cts <- raw_cts[,-1]
# divide every library column by its total so each column holds cell-type proportions
prop_cts <- sweep(raw_cts, 2, colSums(raw_cts), `/`)
colnames(prop_cts) <- gsub('Freq.','',colnames(prop_cts))

#now convert this to a long format
long_prop_cts <- tidyr::pivot_longer(tibble::rownames_to_column(prop_cts, var='ct'),
                                     -ct, names_to='library', values_to='prop')
long_prop_cts$pc <- 100 * long_prop_cts$prop

#arrange into a dataframe with other info
beta_prop <- unlist(prop_cts['beta',])
num_cells <- as.vector(table(adata3_nd$library))
prop_data2 <- data.frame(beta_prop_atac=beta_prop,
                         num_cells_atac=num_cells)
#head(prop_data2)

In [173]:
#combine prop_data into one df
# move the donor IDs out of the row names so the two tables can be joined on them
prop_data1 <- rownames_to_column(prop_data1, var='library')
prop_data2 <- rownames_to_column(prop_data2, var='library')

# full join keeps donors that have only one modality (the other modality's columns become NA)
prop_data <- full_join(prop_data1, prop_data2, by='library')
# order columns so the two proportions come before the two cell counts
prop_data <- prop_data[,c('library',
                          'beta_prop_rna','beta_prop_atac',
                          'num_cells_rna','num_cells_atac')]
dim(prop_data)
head(prop_data)

Unnamed: 0_level_0,library,beta_prop_rna,beta_prop_atac,num_cells_rna,num_cells_atac
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<int>,<int>
1,HPAP-022,0.17037251,,3463,
2,HPAP-026,0.22222222,,414,
3,HPAP-034,0.08055556,,360,
4,HPAP-035,0.22299465,0.2517033,1870,2642.0
5,HPAP-036,0.18008107,0.2229767,1727,5449.0
6,HPAP-037,0.11766504,,3272,


In [174]:
#add in # cells per cell type info to metadata -- RNA
# library x matched cell type count table, reshaped to one row per library
results1 <- reshape(as.data.frame(table(adata2_nd$library, adata2_nd$matched_celltype)),
                    idvar='Var1', timevar='Var2', direction='wide')
row.names(results1) <- results1$Var1
results1 <- select(results1, -Var1)
colnames(results1) <- paste0(gsub('Freq.','',colnames(results1)),'_cells_rna')
# keep only the joint cell types (any other label's column is dropped), in joint_celltypes order
col_order <- paste0(joint_celltypes, '_cells_rna')
results1 <- select(results1, all_of(col_order))
#head(results1)

In [175]:
#add in # cells per cell type info to metadata -- ATAC
# library x matched cell type count table, reshaped to one row per library
results2 <- reshape(as.data.frame(table(adata3_nd$library, adata3_nd$matched_celltype)),
                    idvar='Var1', timevar='Var2', direction='wide')
row.names(results2) <- results2$Var1
results2 <- select(results2, -Var1)
colnames(results2) <- paste0(gsub('Freq.','',colnames(results2)),'_cells_atac')
# keep only the joint cell types (any other label's column is dropped), in joint_celltypes order
col_order <- paste0(joint_celltypes, '_cells_atac')
results2 <- select(results2, all_of(col_order))
#head(results2)

In [176]:
#combine num_cells into one df
# move the donor IDs out of the row names so the two tables can be joined on them
results1 <- rownames_to_column(results1, var='library')
results2 <- rownames_to_column(results2, var='library')

# full join keeps donors with only one modality; the other modality's counts become NA
results <- full_join(results1, results2, by='library')
dim(results)
head(results)

Unnamed: 0_level_0,library,beta_cells_rna,alpha_cells_rna,delta_cells_rna,gamma_cells_rna,acinar_cells_rna,ductal_cells_rna,endothelial_cells_rna,immune_cells_rna,stellate_cells_rna,beta_cells_atac,alpha_cells_atac,delta_cells_atac,gamma_cells_atac,acinar_cells_atac,ductal_cells_atac,endothelial_cells_atac,immune_cells_atac,stellate_cells_atac
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,HPAP-022,590,1699,133,48,53,75,127,73,334,,,,,,,,,
2,HPAP-026,92,87,12,11,152,35,8,2,8,,,,,,,,,
3,HPAP-034,29,52,4,2,171,74,3,3,13,,,,,,,,,
4,HPAP-035,417,664,35,15,354,174,66,19,78,665.0,878.0,140.0,30.0,556.0,316.0,21.0,9.0,27.0
5,HPAP-036,311,524,34,28,595,126,25,21,29,1215.0,1930.0,671.0,89.0,942.0,492.0,25.0,37.0,48.0
6,HPAP-037,385,424,20,2,1953,304,35,26,102,,,,,,,,,


In [177]:
# Mean genes per cell type from the HPAP RNA object, computed for every donor in hpap_samples_all
hpap_mean_genes <- calc_mean_genes(adata2_nd, joint_celltypes, hpap_samples_all, 'matched_celltype')
# is.na() is also TRUE for NaN, so this rewrites NaN entries as plain NA
hpap_mean_genes[is.na(hpap_mean_genes)] <- NA
hpap_mean_genes <- rownames_to_column(hpap_mean_genes, var='library')
head(hpap_mean_genes)

Unnamed: 0_level_0,library,beta_mean_genes,alpha_mean_genes,delta_mean_genes,gamma_mean_genes,acinar_mean_genes,ductal_mean_genes,endothelial_mean_genes,immune_mean_genes,stellate_mean_genes
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,HPAP-022,3669.414,3373.147,2571.677,4404.521,1817.113,3678.88,2117.402,1812.945,3147.311
2,HPAP-026,3420.293,2101.138,1611.75,2526.909,1073.039,2694.914,1186.625,1298.5,1753.125
3,HPAP-034,4498.172,2616.404,1379.0,1951.0,2150.789,3323.135,1436.0,1102.333,2342.769
4,HPAP-035,3138.559,2171.274,1805.971,2827.067,1659.472,2779.764,1366.455,1295.737,1923.038
5,HPAP-036,3753.09,1953.011,1999.382,2925.25,1838.161,2880.111,1977.48,1480.143,2347.103
6,HPAP-037,3441.042,1910.719,1755.5,2269.0,1616.355,2726.039,1626.714,1346.077,1856.314


In [181]:
#calculate sample level means for TSSe
# computed on the ATAC object for the ATAC donors only
tsse2 <- calc_sample_mean(adata3_nd, hpap_samples_atac, 'TSS.enrichment')
# data frame form with a `library` column so it can be joined with the other HPAP tables
tsse2_df <- data.frame(library=names(tsse2), tsse=tsse2)
head(tsse2_df)

Unnamed: 0_level_0,library,tsse
Unnamed: 0_level_1,<chr>,<dbl>
HPAP-035,HPAP-035,4.301869
HPAP-036,HPAP-036,4.297375
HPAP-039,HPAP-039,4.375953
HPAP-040,HPAP-040,4.340516
HPAP-044,HPAP-044,4.375415
HPAP-045,HPAP-045,4.39839


In [184]:
#combine info into one df
# successive full joins on the donor ID, in the same left-to-right order as the tables are listed
meta2_fin <- Reduce(function(left, right) full_join(left, right, by='library'),
                    list(meta2_cut, prop_data, results, hpap_mean_genes, tsse2_df))
# donor IDs become the row names and the key column is dropped
row.names(meta2_fin) <- meta2_fin$library
meta2_fin <- select(meta2_fin, -library)
head(meta2_fin)

#save out df!
out_fp2 <- file.path(outdir, 'HPAP_metadata_all.tsv')
write.table(meta2_fin, out_fp2, sep='\t', quote=FALSE)

Unnamed: 0_level_0,age,sex,BMI,HbA1c,dataset,culture_time,beta_prop_rna,beta_prop_atac,num_cells_rna,num_cells_atac,⋯,beta_mean_genes,alpha_mean_genes,delta_mean_genes,gamma_mean_genes,acinar_mean_genes,ductal_mean_genes,endothelial_mean_genes,immune_mean_genes,stellate_mean_genes,tsse
Unnamed: 0_level_1,<int>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<int>,<int>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
HPAP-022,39,Female,34.7,4.7,HPAP,72,0.17037251,,3463,,⋯,3669.414,3373.147,2571.677,4404.521,1817.113,3678.88,2117.402,1812.945,3147.311,
HPAP-026,24,Male,20.8,4.9,HPAP,48,0.22222222,,414,,⋯,3420.293,2101.138,1611.75,2526.909,1073.039,2694.914,1186.625,1298.5,1753.125,
HPAP-034,13,Male,18.6,5.2,HPAP,72,0.08055556,,360,,⋯,4498.172,2616.404,1379.0,1951.0,2150.789,3323.135,1436.0,1102.333,2342.769,
HPAP-035,35,Male,26.91,5.2,HPAP,72,0.22299465,0.2517033,1870,2642.0,⋯,3138.559,2171.274,1805.971,2827.067,1659.472,2779.764,1366.455,1295.737,1923.038,4.301869
HPAP-036,23,Female,16.0,5.2,HPAP,48,0.18008107,0.2229767,1727,5449.0,⋯,3753.09,1953.011,1999.382,2925.25,1838.161,2880.111,1977.48,1480.143,2347.103,4.297375
HPAP-037,35,Female,21.9,5.3,HPAP,96,0.11766504,,3272,,⋯,3441.042,1910.719,1755.5,2269.0,1616.355,2726.039,1626.714,1346.077,1856.314,


# 3. Combine metadata from both datasets into one table

In [31]:
#read back in both dfs
# Alberta table
out_fp1 <- file.path(outdir, 'Alberta_metadata_all.tsv')
meta1 <- read.table(out_fp1, sep='\t', header=TRUE)
head(meta1)

# HPAP table
out_fp2 <- file.path(outdir, 'HPAP_metadata_all.tsv')
meta2 <- read.table(out_fp2, sep='\t', header=TRUE)
head(meta2)

Unnamed: 0_level_0,age,sex,BMI,HbA1c,dataset,culture_time,beta_prop_rna,beta_prop_atac,num_cells_rna,num_cells_atac,⋯,beta_mean_genes,alpha_mean_genes,delta_mean_genes,gamma_mean_genes,acinar_mean_genes,ductal_mean_genes,endothelial_mean_genes,immune_mean_genes,stellate_mean_genes,tsse
Unnamed: 0_level_1,<int>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<int>,<int>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
R207,50,Female,22.2,,Alberta,70,0.5168054,0.5168054,6248,6248,⋯,2461.714,2194.071,2378.373,2086.808,1462.876,1842.333,1304.0,1118.606,1314.583,3.450389
R217,71,Female,35.5,6.3,Alberta,15,0.2546716,0.2546716,5619,5619,⋯,2545.581,2296.241,2301.96,2353.497,1800.699,1868.313,1578.5,1458.0,1771.0,3.530508
R218,73,Female,28.4,5.9,Alberta,70,0.6887661,0.6887661,2172,2172,⋯,1960.642,1809.938,1901.744,1472.627,1199.855,1384.827,1503.5,910.25,1505.2,3.641667
R221,44,Male,30.5,5.3,Alberta,136,0.2156263,0.2156263,4902,4902,⋯,1951.037,1973.824,2066.421,1581.727,1315.754,1625.554,1376.625,1460.6,1243.706,3.792649
R223,54,Male,27.0,5.8,Alberta,64,0.5443796,0.5443796,6343,6343,⋯,2690.103,2312.497,2464.196,2330.564,1562.639,1854.044,1592.143,1568.125,1826.053,2.99232
R226,30,Female,32.3,4.9,Alberta,16,0.5523939,0.5523939,5869,5869,⋯,2923.349,2535.351,2881.332,2677.658,1516.972,2238.235,1328.0,2198.75,1739.25,3.864731


Unnamed: 0_level_0,age,sex,BMI,HbA1c,dataset,culture_time,beta_prop_rna,beta_prop_atac,num_cells_rna,num_cells_atac,⋯,beta_mean_genes,alpha_mean_genes,delta_mean_genes,gamma_mean_genes,acinar_mean_genes,ductal_mean_genes,endothelial_mean_genes,immune_mean_genes,stellate_mean_genes,tsse
Unnamed: 0_level_1,<int>,<chr>,<dbl>,<dbl>,<chr>,<int>,<dbl>,<dbl>,<int>,<int>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
HPAP-022,39,Female,34.7,4.7,HPAP,72,0.17037251,,3463,,⋯,3669.414,3373.147,2571.677,4404.521,1817.113,3678.88,2117.402,1812.945,3147.311,
HPAP-026,24,Male,20.8,4.9,HPAP,48,0.22222222,,414,,⋯,3420.293,2101.138,1611.75,2526.909,1073.039,2694.914,1186.625,1298.5,1753.125,
HPAP-034,13,Male,18.6,5.2,HPAP,72,0.08055556,,360,,⋯,4498.172,2616.404,1379.0,1951.0,2150.789,3323.135,1436.0,1102.333,2342.769,
HPAP-035,35,Male,26.91,5.2,HPAP,72,0.22299465,0.2517033,1870,2642.0,⋯,3138.559,2171.274,1805.971,2827.067,1659.472,2779.764,1366.455,1295.737,1923.038,4.301869
HPAP-036,23,Female,16.0,5.2,HPAP,48,0.18008107,0.2229767,1727,5449.0,⋯,3753.09,1953.011,1999.382,2925.25,1838.161,2880.111,1977.48,1480.143,2347.103,4.297375
HPAP-037,35,Female,21.9,5.3,HPAP,96,0.11766504,,3272,,⋯,3441.042,1910.719,1755.5,2269.0,1616.355,2726.039,1626.714,1346.077,1856.314,


In [32]:
#check that colnames all match and then merge
# (the table should show only TRUE)
#colnames(meta1)
table(colnames(meta1) == colnames(meta2))

#combine dfs (the combined table is written out in a later cell, after scaling)
meta <- rbind(meta1, meta2)
dim(meta)


TRUE 
  38 

In [33]:
#scale all continuous variables 1,3,4,6-10, 29-38
# NOTE(review): the columns are chosen by position, so this depends on the exact
# column layout of the combined metadata table
cont_vars <- colnames(meta)[c(1, 3, 4, 6:10, 29:38)]
cont_vars

# for every continuous column add a 'scaled_<name>' column (scaled over all donors of
# both datasets together) and place it directly after the original column
for (col_name in cont_vars){
    new_name <- paste0('scaled_', col_name)
    meta[[new_name]] <- as.double(scale(meta[[col_name]]))
    meta <- relocate(meta, all_of(new_name), .after=all_of(col_name))
}
head(meta)

Unnamed: 0_level_0,age,scaled_age,sex,BMI,scaled_BMI,HbA1c,scaled_HbA1c,dataset,culture_time,scaled_culture_time,⋯,ductal_mean_genes,scaled_ductal_mean_genes,endothelial_mean_genes,scaled_endothelial_mean_genes,immune_mean_genes,scaled_immune_mean_genes,stellate_mean_genes,scaled_stellate_mean_genes,tsse,scaled_tsse
Unnamed: 0_level_1,<int>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
R207,50,0.4825992,Female,22.2,-0.93547849,,,Alberta,70,0.07402343,⋯,1842.333,-0.7227589,1304.0,-0.69291246,1118.606,-0.869054467,1314.583,-0.938693,3.450389,-1.2548308
R217,71,1.5811029,Female,35.5,1.15230857,6.3,1.6527657,Alberta,15,-1.34420328,⋯,1868.313,-0.6980603,1578.5,-0.11854077,1458.0,0.009333767,1771.0,-0.3702339,3.530508,-1.1219687
R218,73,1.6857223,Female,28.4,0.03777563,5.9,0.8536262,Alberta,70,0.07402343,⋯,1384.827,-1.1577107,1503.5,-0.27547292,910.25,-1.408302494,1505.2,-0.7012832,3.641667,-0.9376335
R221,44,0.168741,Male,30.5,0.36742622,5.3,-0.3450829,Alberta,136,1.77589549,⋯,1625.554,-0.9288509,1376.625,-0.54094982,1460.6,0.016062849,1243.706,-1.0269696,3.792649,-0.6872598
R223,54,0.691838,Male,27.0,-0.18199143,5.8,0.6538414,Alberta,64,-0.08069221,⋯,1854.044,-0.7116253,1592.143,-0.08999406,1568.125,0.294349182,1826.053,-0.3016668,2.99232,-2.0144478
R226,30,-0.5635948,Female,32.3,0.64998386,4.9,-1.1442224,Alberta,16,-1.31841734,⋯,2238.235,-0.3463746,1328.0,-0.64269417,2198.75,1.926474909,1739.25,-0.409778,3.864731,-0.5677271


In [34]:
#saving out this combined metadata df in case
# tab-separated, unquoted, donor IDs written as row names
out_fp <- file.path(outdir, 'Alberta_HPAP_combined_metadata_all.tsv')
write.table(meta, out_fp, sep='\t', quote=FALSE)

# 4. Prepare RNA combined pseudobulk counts tables for DESeq

In [12]:
### Function to sum up all counts for a cell type by sample
###
### Args:
###   adata:      object for which Idents(adata) returns a vector of cell type labels
###               named by barcode
###   gex.counts: feature x barcode count matrix with row and column names
###   sample_bcs: named list mapping each sample to a character vector of its barcodes
###   cell_type:  label selecting which barcodes of `adata` to use
###   samples:    character vector of samples; one output column per sample, in this order
###   outdir:     existing directory in which '<cell_type>_sample_gex_total_counts.txt' is written
### Side effects:
###   prints the dimensions of the cell type's count matrix and writes the
###   tab-separated feature x sample table of summed (rounded) counts
### Returns:
###   the written data frame, invisibly
get_per_sample_gex_SUMS <- function(adata, gex.counts, sample_bcs, cell_type, samples, outdir){
    #pull out columns of gex.counts where BC Ident matches cell.type
    # drop = FALSE keeps a matrix even when only a single barcode matches
    bcs <- names(Idents(adata)[Idents(adata) == cell_type])
    counts <- gex.counts[, colnames(gex.counts) %in% bcs, drop = FALSE]
    print(dim(counts))

    #go through samples and calculate sum of gex values
    # Always subsetting with drop = FALSE means one code path covers many, one, or zero
    # barcodes (rowSums of a zero-column matrix is 0 for every feature) and works for
    # both sparse and dense count matrices.
    sample_sums <- lapply(samples, function(sample_id){
        sample_cols <- colnames(counts) %in% sample_bcs[[sample_id]]
        round(rowSums(counts[, sample_cols, drop = FALSE]))
    })
    fin.counts.df <- as.data.frame(do.call(cbind, sample_sums))
    row.names(fin.counts.df) <- row.names(gex.counts)
    colnames(fin.counts.df) <- samples

    #export df
    mtx.fp <- file.path(outdir,sprintf('%s_sample_gex_total_counts.txt',cell_type))
    write.table(fin.counts.df,mtx.fp,sep='\t',quote=FALSE)
    invisible(fin.counts.df)
}

## Alberta

In [104]:
# Set cluster identities to cell types
# (get_per_sample_gex_SUMS selects barcodes by comparing Idents(adata) with the cell type)
Idents(adata1) <- adata1@meta.data$major_celltypes_fin

In [105]:
# Get counts data (RNA)
# switch the default assay to RNA, then pull that assay's `counts` slot
DefaultAssay(adata1) <- 'RNA'
gex.counts1 <- GetAssayData(adata1, slot='counts')
dim(gex.counts1)

In [106]:
# Get sample barcodes
# named list: one entry per Alberta donor holding the barcodes (metadata row names) of its cells
sample_bcs1 <- lapply(setNames(alberta_samples, alberta_samples), function(sample){
    row.names(adata1[[]][adata1[[]]$library == sample,])
})
length(sample_bcs1)

In [109]:
#Create the output directory for the Alberta per-sample matrices
matrix_dir1 <- file.path(outdir, 'RNA', 'alberta_sample_matrices')
# recursive = TRUE also creates the intermediate 'RNA' directory; without it the call
# fails when that directory is missing, and showWarnings = FALSE hides the failure
dir.create(matrix_dir1, showWarnings = FALSE, recursive = TRUE)

In [110]:
# Write one per-sample summed-count matrix per cell type into matrix_dir1,
# printing the cell type and a timestamp before each one
for (celltype in joint_celltypes){
    print(paste(celltype, Sys.time()))
    get_per_sample_gex_SUMS(adata1, gex.counts1, sample_bcs1, celltype, alberta_samples, matrix_dir1)
}

[1] "beta 2023-10-05 15:12:31"
[1] 36601 81100
[1] "alpha 2023-10-05 15:12:53"
[1] 36601 44500
[1] "delta 2023-10-05 15:13:04"
[1] 36601  8911
[1] "gamma 2023-10-05 15:13:07"
[1] 36601  5504
[1] "acinar 2023-10-05 15:13:10"
[1] 36601 27387
[1] "ductal 2023-10-05 15:13:16"
[1] 36601  5452
[1] "endothelial 2023-10-05 15:13:18"
[1] 36601   342
[1] "immune 2023-10-05 15:13:19"
[1] 36601   633
[1] "stellate 2023-10-05 15:13:21"
[1] 36601   990


## HPAP

In [13]:
# Set cluster identities to cell types
# uses the harmonised labels so they can be compared with joint_celltypes
Idents(adata2_nd) <- adata2_nd$matched_celltype

In [14]:
adata2_nd

An object of class Seurat 
36601 features across 71871 samples within 1 assay 
Active assay: RNA (36601 features, 2000 variable features)
 3 dimensional reductions calculated: pca, harmony, umap

In [None]:
# Get counts data (RNA)
# taken from the ND subset, so only barcodes of ND donors are present as columns
DefaultAssay(adata2_nd) <- 'RNA'
gex.counts2 <- GetAssayData(adata2_nd, slot='counts')

In [19]:
# Get sample barcodes
# Uses hpap_samples_rna (the RNA donor list defined at the top of the notebook) and the
# ND subset, matching the object the counts are taken from.
# The metadata is extracted once rather than twice per donor.
hpap_rna_cell_meta <- adata2_nd[[]]
sample_bcs2 <- list()
for (sample in hpap_samples_rna){
    sample_bcs2[[sample]] <- row.names(hpap_rna_cell_meta[hpap_rna_cell_meta$library == sample,])
}
length(sample_bcs2)

In [20]:
# Output directory for the HPAP per-sample matrices
matrix_dir2 <- file.path(outdir, 'RNA', 'hpap_sample_matrices')
# recursive = TRUE also creates the intermediate 'RNA' directory; without it the call
# fails when that directory is missing, and showWarnings = FALSE hides the failure
dir.create(matrix_dir2, showWarnings = FALSE, recursive = TRUE)

In [23]:
# Run get_per_sample_gex_SUMS() (defined earlier in the notebook) once per
# cell type on the HPAP RNA counts, printing a timestamped progress line
# before each call
for (celltype in joint_celltypes) {
    progress_line <- paste(celltype, Sys.time())
    print(progress_line)
    get_per_sample_gex_SUMS(adata2_nd, gex.counts2, sample_bcs2, celltype,
                            hpap_samples, matrix_dir2)
}

[1] "beta 2023-10-06 11:26:03"
[1] 36601 17671
[1] "alpha 2023-10-06 11:26:11"
[1] 36601 21476
[1] "delta 2023-10-06 11:26:16"
[1] 36601  1993
[1] "gamma 2023-10-06 11:26:18"
[1] 36601   815
[1] "acinar 2023-10-06 11:26:19"
[1] 36601 14419
[1] "ductal 2023-10-06 11:26:23"
[1] 36601  5987
[1] "endothelial 2023-10-06 11:26:26"
[1] 36601  2695
[1] "immune 2023-10-06 11:26:27"
[1] 36601   951
[1] "stellate 2023-10-06 11:26:28"
[1] 36601  4045


## Merge tables into one

In [24]:
# Output directory for the merged (Alberta + HPAP) RNA matrices.
# recursive = TRUE also creates the parent 'RNA' directory when it is missing;
# without it dir.create() fails and showWarnings = FALSE hides that failure.
matrix_dir_fin <- file.path(outdir, 'RNA', 'merged_sample_matrices')
dir.create(matrix_dir_fin, showWarnings = FALSE, recursive = TRUE)

In [25]:
# Merge the Alberta and HPAP per-sample RNA count tables of each cell type
# into one gene x sample table.
# NOTE(review): read.table() defaults to check.names = TRUE, so donor IDs
# containing '-' (e.g. HPAP-022) are written out here as 'HPAP.022'; the ATAC
# merge converts '.' back to '-'. Confirm which form downstream RNA code expects.
for (celltype in joint_celltypes){
    # Read in the separate matrices from the directories they were written to
    # above (outdir/RNA/alberta_sample_matrices and outdir/RNA/hpap_sample_matrices)
    mtx_name <- sprintf('%s_sample_gex_total_counts.txt', celltype)
    fp1 <- file.path(outdir, 'RNA', 'alberta_sample_matrices', mtx_name)
    fp2 <- file.path(outdir, 'RNA', 'hpap_sample_matrices', mtx_name)
    mat1 <- read.table(fp1, sep='\t')
    mat2 <- read.table(fp2, sep='\t')

    # cbind() pairs rows purely by position, so the genes must be identical and
    # in the same order. Keep the printed summary, but refuse to merge otherwise.
    print(table(row.names(mat1) == row.names(mat2)))
    if (!identical(row.names(mat1), row.names(mat2))) {
        stop(sprintf('Gene rows differ between the Alberta and HPAP matrices for %s',
                     celltype), call. = FALSE)
    }

    # Merge them into one matrix with cbind
    mat_fin <- cbind(mat1, mat2)
    fp_fin <- file.path(matrix_dir_fin, mtx_name)
    write.table(mat_fin, fp_fin, sep='\t', quote=FALSE)
}


 TRUE 
36601 

 TRUE 
36601 

 TRUE 
36601 

 TRUE 
36601 

 TRUE 
36601 

 TRUE 
36601 

 TRUE 
36601 

 TRUE 
36601 

 TRUE 
36601 


# 5. Prepare ATAC combined pseudobulk counts tables for DESeq

## Alberta

In [18]:
# Set cluster identities to cell types: the active identities of the Alberta
# object (adata1) are taken from its `major_celltypes_fin` metadata column
Idents(adata1) <- adata1@meta.data$major_celltypes_fin

In [37]:
# Get counts data (ATAC): make 'ATAC_CTpeaks' the default assay of the
# Alberta object and pull that assay's `counts` slot
DefaultAssay(adata1) <- 'ATAC_CTpeaks'
atac.counts1 <- GetAssayData(adata1, slot='counts')
# Show the matrix dimensions as a quick sanity check
dim(atac.counts1)

In [38]:
# Collect, for every Alberta donor, the barcodes (metadata row names) of the
# cells whose `library` value equals that donor ID
cell_meta1 <- adata1[[]]
sample_bcs1 <- list()
for (sample in alberta_samples) {
    from_sample <- cell_meta1$library == sample
    sample_bcs1[[sample]] <- row.names(cell_meta1[from_sample, ])
}
# Number of donors that received an entry
length(sample_bcs1)

In [39]:
# Output directory for the Alberta ATAC per-sample matrices.
# recursive = TRUE also creates the parent 'ATAC' directory when it is missing;
# without it dir.create() fails and showWarnings = FALSE hides that failure,
# so the first write into the directory would be the one to error.
matrix_dir1 <- file.path(outdir, 'ATAC', 'alberta_sample_matrices')
dir.create(matrix_dir1, showWarnings = FALSE, recursive = TRUE)

In [40]:
# Run get_per_sample_gex_SUMS() (defined earlier in the notebook) once per
# cell type, this time on the Alberta ATAC peak counts, printing a
# timestamped progress line before each call
for (celltype in joint_celltypes) {
    progress_line <- paste(celltype, Sys.time())
    print(progress_line)
    get_per_sample_gex_SUMS(adata1, atac.counts1, sample_bcs1, celltype,
                            alberta_samples, matrix_dir1)
}

[1] "beta 2024-03-13 14:56:41"
[1] 291821  81100
[1] "alpha 2024-03-13 14:58:38"
[1] 291821  44500
[1] "delta 2024-03-13 14:59:50"
[1] 291821   8911
[1] "gamma 2024-03-13 15:00:18"
[1] 291821   5504
[1] "acinar 2024-03-13 15:00:38"
[1] 291821  27387
[1] "ductal 2024-03-13 15:02:23"
[1] 291821   5452
[1] "endothelial 2024-03-13 15:02:42"
[1] 291821    342
[1] "immune 2024-03-13 15:02:54"
[1] 291821    633
[1] "stellate 2024-03-13 15:03:06"
[1] 291821    990


## HPAP: Use featureCounts and sinto to get counts of Alberta peaks

In [13]:
# Generate, for each HPAP sample, a headerless two-column TSV: the barcodes of
# the cells belonging to the cell types of interest, and a second column
# holding the cell type prefixed with the sample name ('<sample>_<celltype>')
bc_outdir <- file.path(outdir, 'ATAC', 'hpap_sample_barcodes')
# write.table() cannot create a missing directory, so make sure it exists
dir.create(bc_outdir, showWarnings = FALSE, recursive = TRUE)

# Extract the metadata and the cell barcodes once instead of on every iteration
so_meta <- so[[]]
all_bcs <- Cells(so)

for (sample in samples){
    print(sample)
    # Pull out all barcodes for the sample
    in_sample <- so_meta$library == sample
    sample_bcs <- all_bcs[in_sample]

    # Make df with cell type info (paste() turns the labels into plain strings)
    bc_df <- data.frame(bc=sample_bcs,
                        celltype=paste(so_meta$matched_celltype[in_sample]))
    print(dim(bc_df))

    # Cut down to cell types of interest, then add sample prefix to cell type name
    bc_df_cut <- subset(bc_df, celltype %in% joint_celltypes)
    bc_df_cut$celltype <- paste(sample, bc_df_cut$celltype, sep='_')
    print(dim(bc_df_cut))

    # Reformat bc names: keep characters 10-27 of each cell name.
    # NOTE(review): presumably this strips a 9-character sample prefix such as
    # 'HPAP-035_' and keeps an 18-character barcode; confirm against Cells(so).
    bc_df_cut$bc <- substr(bc_df_cut$bc, 10, 27)

    # Write to file
    sample_bc_fp <- file.path(bc_outdir, sprintf('%s.filtered_barcodes_wCTs_ofInterest.txt',sample))
    write.table(bc_df_cut, sample_bc_fp, sep='\t',
                row.names=FALSE, col.names=FALSE, quote=FALSE)
}

[1] "HPAP-035"
[1] 2642    2
[1] 2642    2
[1] "HPAP-036"
[1] 5449    2
[1] 5449    2
[1] "HPAP-039"
[1] 2827    2
[1] 2827    2
[1] "HPAP-045"
[1] 1958    2
[1] 1958    2
[1] "HPAP-049"
[1] 2862    2
[1] 2862    2
[1] "HPAP-050"
[1] 1111    2
[1] 1111    2
[1] "HPAP-052"
[1] 1153    2
[1] 1153    2
[1] "HPAP-053"
[1] 1385    2
[1] 1385    2
[1] "HPAP-054"
[1] 2855    2
[1] 2855    2
[1] "HPAP-056"
[1] 2107    2
[1] 2107    2
[1] "HPAP-059"
[1] 1429    2
[1] 1429    2
[1] "HPAP-067"
[1] 1617    2
[1] 1617    2
[1] "HPAP-069"
[1] 2115    2
[1] 2115    2
[1] "HPAP-072"
[1] 2088    2
[1] 2088    2
[1] "HPAP-075"
[1] 1735    2
[1] 1735    2
[1] "HPAP-077"
[1] 2160    2
[1] 2160    2
[1] "HPAP-080"
[1] 2732    2
[1] 2732    2
[1] "HPAP-092"
[1] 3720    2
[1] 3720    2
[1] "HPAP-099"
[1] 4950    2
[1] 4950    2
[1] "HPAP-101"
[1] 2759    2
[1] 2759    2
[1] "HPAP-103"
[1] 3992    2
[1] 3992    2
[1] "HPAP-104"
[1] 3572    2
[1] 3572    2
[1] "HPAP-105"
[1] 3711    2
[1] 3711    2


### See directory `2b_quantify_counts_in_peaks` for instructions on running sinto and featureCounts to get per-peak counts for HPAP ATAC data

### Read in featureCounts outputs and reorganize
Goal: peak x sample matrix of total counts (rownames = chr-start-end, colnames = HPAP-###)

In [3]:
# Heavily penalise scientific notation so that numbers converted to text
# (e.g. peak coordinates pasted into names) are printed in fixed notation
options(scipen=999)

In [4]:
### Function to process a featureCounts output matrix into a simplified format
### (really just removing some columns and updating labels)
#
# Arguments:
#   fc_fp    - path to a tab-separated featureCounts output table; the first
#              line is skipped and the next line is used as the header
#   celltype - cell type label; the '_<celltype>.bam' suffix is removed from
#              every count column name
#   out_fp   - path the simplified peak x sample table is written to
# Value: whatever write.table() returns; the function is called for its side
#        effect of writing `out_fp`.
convert_featureCounts_mtx <- function(fc_fp, celltype, out_fp){
    # Read in output df (column-type messages silenced) and make peaks row names
    df <- as.data.frame(vroom::vroom(fc_fp, delim='\t', skip=1,
                                     show_col_types=FALSE))
    # Coordinates are converted to integers so the pasted names can never be
    # rendered in scientific notation (e.g. '1e+05'), whatever the session's
    # `scipen` option is.
    # NOTE(review): Start - 1 presumably converts the 1-based featureCounts
    # start back to a 0-based BED start - confirm against the peak file.
    row.names(df) <- paste(df$Chr, as.integer(df$Start) - 1L,
                           as.integer(df$End), sep='-')

    # Remove the six leading annotation columns; drop = FALSE keeps a data
    # frame even when only a single count column remains
    fin_df <- df[, -seq_len(6), drop=FALSE]

    # Simplify colnames: keep the file name of each path, then strip the
    # '_<celltype>.bam' suffix (matched literally, not as a regex)
    bam_names <- basename(colnames(fin_df))
    colnames(fin_df) <- gsub(sprintf('_%s.bam', celltype), '', bam_names,
                             fixed=TRUE)
    write.table(fin_df, out_fp, sep='\t', quote=FALSE)
}

In [10]:
# Directory holding the featureCounts outputs (placeholder path)
fc_dir <- 'path/to/feature/counts/outputs'
# Directory the simplified HPAP ATAC matrices are written to; create it
# (including the parent 'ATAC' directory) so the writes below cannot fail on
# a missing directory
mtx_dir <- file.path(outdir, 'ATAC', 'hpap_sample_matrices')
dir.create(mtx_dir, showWarnings = FALSE, recursive = TRUE)

In [11]:
# Convert the featureCounts table of every cell type into the simplified
# format, writing one output table per cell type into mtx_dir
for (celltype in joint_celltypes) {
    fc_name <- sprintf('%s_featureCounts_mtx.txt', celltype)
    out_name <- sprintf('%s_sample_peak_total_counts.txt', celltype)
    fc_fp <- file.path(fc_dir, fc_name)
    out_fp <- file.path(mtx_dir, out_name)
    convert_featureCounts_mtx(fc_fp, celltype, out_fp)
}

[1mRows: [22m[34m291821[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): Geneid, Chr, Strand
[32mdbl[39m (7): Start, End, Length, /nfs/lab/projects/multiomic_islet/outputs/multi...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m291821[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (3): Geneid, Chr, Strand
[32mdbl[39m (7): Start, End, Length, /nfs/lab/projects/multiomic_islet/outputs/multi...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the colu

## Merge tables into one

In [78]:
# Output directory for the merged (Alberta + HPAP) ATAC matrices.
# recursive = TRUE also creates the parent 'ATAC' directory when it is missing;
# without it dir.create() fails and showWarnings = FALSE hides that failure.
matrix_dir_fin <- file.path(outdir, 'ATAC', 'merged_sample_matrices')
dir.create(matrix_dir_fin, showWarnings = FALSE, recursive = TRUE)

In [79]:
# Load the sorted union peak set; its row order is used to order the merged
# matrices (placeholder path)
peaks_fp <- '/path/to/final_peaks.bed'
peaks_df <- read.table(peaks_fp, sep='\t')
head(peaks_df)
# Build 'chr-start-end' peak IDs from the first three columns
peaks <- with(peaks_df, paste(V1, V2, V3, sep='-'))
head(peaks, n=20)

Unnamed: 0_level_0,V1,V2,V3,V4
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>
1,chr1,9956,10256,acinar_peak_1
2,chr1,29219,29482,acinar_peak_2
3,chr1,99568,99868,acinar_peak_3
4,chr1,102793,103093,acinar_peak_4
5,chr1,127658,127913,ductal_peak_3
6,chr1,180714,181014,immune_peak_2


In [81]:
# Load the combined Alberta + HPAP metadata table; its row names give the
# sample order used for the merged matrices
out_fp <- file.path(outdir, 'Alberta_HPAP_combined_metadata.tsv')
meta <- read.table(out_fp, header=TRUE, sep='\t')
ordered_samples <- rownames(meta)
ordered_samples

In [None]:
# Merge the Alberta and HPAP per-sample ATAC count tables of each cell type
# into one peak x sample table, with rows ordered like `peaks` and columns
# ordered like the metadata table.
# NOTE(review): the Alberta tables are produced by get_per_sample_gex_SUMS(),
# whose RNA outputs are read above as '*_sample_gex_total_counts.txt'; confirm
# the ATAC outputs really carry the '*_sample_peak_total_counts.txt' name.
for (celltype in joint_celltypes){
    # Read in separate matrices
    mtx_name <- sprintf('%s_sample_peak_total_counts.txt', celltype)
    fp1 <- file.path(outdir, 'ATAC', 'alberta_sample_matrices', mtx_name)
    fp2 <- file.path(outdir, 'ATAC', 'hpap_sample_matrices', mtx_name)
    mat1 <- read.table(fp1, sep='\t')
    mat2 <- read.table(fp2, sep='\t')

    # Indexing a data frame by a row name it does not contain silently yields
    # an all-NA row, so verify that every reference peak is present first
    missing1 <- setdiff(peaks, row.names(mat1))
    missing2 <- setdiff(peaks, row.names(mat2))
    if (length(missing1) > 0 || length(missing2) > 0) {
        stop(sprintf('%s: %d peak(s) missing from the Alberta matrix and %d from the HPAP matrix',
                     celltype, length(missing1), length(missing2)),
             call. = FALSE)
    }

    # Make sure peaks are in same order
    mat1_sort <- mat1[peaks, , drop = FALSE]
    mat2_sort <- mat2[peaks, , drop = FALSE]

    # Merge them into one matrix with cbind and make sure sample names and
    # order match the metadata df. read.table() turned '-' in the sample names
    # into '.', so convert them back before matching.
    mat_fin <- cbind(mat1_sort, mat2_sort)
    colnames(mat_fin) <- gsub('\\.','-',colnames(mat_fin))
    fin_samples <- ordered_samples[ordered_samples %in% colnames(mat_fin)]
    mat_fin <- mat_fin[, fin_samples, drop = FALSE]
    fp_fin <- file.path(matrix_dir_fin, sprintf('%s_sample_union_peaks_total_counts.txt',celltype))
    write.table(mat_fin, fp_fin, sep='\t', quote=FALSE)
}

In [3]:
# Record the R version, platform and attached packages for reproducibility
sessionInfo()

R version 4.1.1 (2021-08-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.2 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] purrr_0.3.5                 tidyr_1.2.1                
 [3] tibble_3.1.8                tidyverse_1.3.2            
 [5] forcats_0.5.2               shadowtext_0.1.2           
 [7] RColorBrewer_1.1-3          ggrepel_0.9.4              
