# Integrate studies for Supplementary Figure 1/2B

#Note: integrated_objects_102023.qs is the same as 2_RNA_integrated_studies.rds with the metadata filtered

In [1]:
library(Seurat)

Attaching SeuratObject

‘SeuratObject’ was built with package ‘Matrix’ 1.6.4 but the current
version is 1.6.5; it is recomended that you reinstall ‘SeuratObject’ as
the ABI for ‘Matrix’ may have changed

Seurat v4 was just loaded with SeuratObject v5; disabling v5 assays and
validation routines, and ensuring assays work in strict v3/v4
compatibility mode



# 0. Integrate studies one by one by either RNA or TPM - ran in slurm

https://satijalab.org/seurat/articles/integration_introduction.html

In [None]:
#### integrate studies

# Loads the four external studies plus our own object, selects variable
# features for each of them and integrates all five objects with Seurat's
# anchor-based workflow. The integrated object and the anchors are saved.
library(Seurat)

# Read a Seurat object from `path`, make 'RNA' its default assay and select
# the 3000 most variable features with the "vst" method.
#   path          - .rds file holding the Seurat object
#   log_normalize - when TRUE, additionally run NormalizeData() before and
#                   ScaleData() after the feature selection
# Returns the prepared Seurat object.
prepare_study <- function(path, log_normalize = FALSE) {
    study <- readRDS(path)
    DefaultAssay(study) <- 'RNA'
    if (log_normalize) {
        study <- NormalizeData(study)
    }
    study <- FindVariableFeatures(study, selection.method = "vst", nfeatures = 3000)
    if (log_normalize) {
        study <- ScaleData(study)
    }
    study
}

# NOTE(review): studies 1-3 and our own object are not passed through
# NormalizeData() here; presumably their data slot already holds normalized
# values -- confirm before re-running.

#STUDY 1#
study1 <- prepare_study('/path_to_file/SuppF1_Study1_obj_TPM_counts.rds')

#STUDY 2#
study2 <- prepare_study('/path_to_file/SuppF1_Study2_obj_TPM_counts.rds')

#STUDY 3#
study3 <- prepare_study('/path_to_file/SuppF1_Study3_obj_TPM_counts.rds')

#STUDY 4 (the only study that is normalized and scaled in this cell)
study4 <- prepare_study('/path_to_file/SuppF1_Study4_merged_obj_raw_counts.rds',
                        log_normalize = TRUE)

#OUR OBJECT#
obj <- prepare_study('/path_to_file/1_RNA_all.rds')

#INTEGRATE#
#nfeatures_to_integrate <- min(c(length(VariableFeatures(study1)), length(VariableFeatures(study2)), length(VariableFeatures(study3)), length(VariableFeatures(study4))))

# Our own object comes first in the list, followed by the external studies.
all_objects <- list(obj, study1, study2, study3, study4)
features <- SelectIntegrationFeatures(object.list = all_objects) #default of 2000 features

#find integration anchors
anchors <- FindIntegrationAnchors(object.list = all_objects, anchor.features = features,
                                  normalization.method = 'LogNormalize')

objects.integrated <- IntegrateData(anchorset = anchors) #by default, will use the features used in anchor finding.

#save integrated objects and anchors
saveRDS(objects.integrated, '/path_to_file/integrated_objects_101623.rds')

# The anchors are written next to the integrated object; the previous path
# ('/path_to_filehome/...') was malformed and pointed at a directory that
# does not exist.
saveRDS(anchors, '/path_to_file/integration_anchors_101623.rds')

# 1. Predict cell types - ran in slurm
use RNA assay

In [None]:
library(Seurat)

# Transfer cell-type labels from our annotated object (reference) onto the
# cells of the other studies (query) and save the prediction table.

# Read the integrated object from the location the integration step saved it
# to ('/path_to_file/'), so the hand-off between the steps stays consistent.
objects.integrated <- readRDS('/path_to_file/integrated_objects_101623.rds')
DefaultAssay(objects.integrated) <- 'RNA'

#change study from "NA" to 0 for the reference 
#objects.integrated@meta.data[is.na(objects.integrated@meta.data$study),]$study = 0

# Query = every cell that is not labelled study 0.
# NOTE(review): the recoding above is commented out; if the reference cells
# still carry study == NA in this object they are not matched by `study == 0`
# and may end up in the query -- verify the `study` column before re-running.
obj.query <- subset(objects.integrated, subset = study == 0, invert = TRUE)

obj.reference <- readRDS('/path_to_file/1_RNA_all.rds')
DefaultAssay(obj.reference) <- 'RNA'
obj.reference <- NormalizeData(obj.reference)
obj.reference <- FindVariableFeatures(obj.reference, selection.method = 'vst', nfeatures = 3000)

# NOTE(review): reference.reduction = "pca" needs a "pca" reduction inside
# obj.reference. No RunPCA() is called in this cell, so it is presumably
# stored in the .rds already -- confirm.
anchors <- FindTransferAnchors(reference = obj.reference, query = obj.query,
    dims = 1:30, reference.reduction = "pca")

# Predict a cell type for every query cell from the reference `cell_type` labels.
predictions <- TransferData(anchorset = anchors, refdata = obj.reference$cell_type,
    dims = 1:30)

#save predictions of cell types
# Written to '/path_to_file/' because the next step reads the table from there.
saveRDS(predictions, "/path_to_file/predicted_celltypes_101923.rds")


## 2. get the celltype predictions and add them to object - done here

In [37]:
# Load the predicted cell types and the integrated object, then copy the
# predicted label of every cell into the `cell_type` metadata column.
predicted_celltypes <- readRDS('/path_to_file/predicted_celltypes_101923.rds')
objects.integrated <- readRDS('/path_to_file/integrated_objects_101623.rds')

#objects.integrated.test <- objects.integrated.harmony
# Pair the rows by cell name instead of by position: an `%in%` mask only gives
# the right result when both tables list the cells in the same order, and it
# fails (or silently recycles) when a predicted cell is absent from the object.
prediction_rows <- match(rownames(predicted_celltypes), rownames(objects.integrated@meta.data))
has_match <- !is.na(prediction_rows)
objects.integrated@meta.data[prediction_rows[has_match], "cell_type"] <-
    as.character(predicted_celltypes$predicted.id[has_match])

In [None]:
# save as qs object for faster loading
library(qs)

# Use qsave() so the file really is in qs format: later cells load it with
# qread(), which expects that format rather than the RDS stream written by
# saveRDS().
qsave(objects.integrated, '/path_to_file/integrated_objects_102023.qs')


## 3. Perform scaling/PCA/UMAP on object (independent of the predicted cell types)

In [8]:
# Sanity check of the integration: scale the integrated assay, then compute
# PCA and UMAP embeddings and display the object summary.
DefaultAssay(objects.integrated) <- 'integrated'

# feature names of the now-active 'integrated' assay
all.genes <- rownames(objects.integrated)
objects.integrated <- ScaleData(
    objects.integrated,
    features = all.genes,
    assay = 'integrated'
)

# PCA on the variable features, UMAP on the first 30 dimensions
pca.features <- VariableFeatures(object = objects.integrated)
objects.integrated <- RunPCA(objects.integrated, features = pca.features)
objects.integrated <- RunUMAP(objects.integrated, dims = 1:30)

objects.integrated

Centering and scaling data matrix

PC_ 1 
Positive:  IFITM3, VIM, SPARC, LGALS1, CALD1, CRIP2, RRAS, HTRA1, SERPINH1, PRRX1 
	   IGFBP7, PRSS23, HSPB1, PPIC, EFEMP2, IGFBP4, FOS, CAV1, TSC22D1, NNMT 
	   BGN, NRP1, CD9, SERPING1, EMILIN1, FSTL1, COL6A2, PCOLCE, MT2A, HSPE1 
Negative:  CXCR4, CD69, CCL5, NKG7, GZMA, GZMK, IL7R, CST7, CD8A, TIGIT 
	   KLRB1, GZMH, CTSW, KLRD1, ICOS, PRF1, IFNG, CD8B, GZMB, TNFRSF9 
	   CCR7, CTLA4, GNLY, BATF, S1PR4, CREM, CD79A, CRTAM, MS4A1, GPR18 
PC_ 2 
Positive:  MZT2A, PLP1, GSTP1, HSPE1, GAPDH, STMN1, CDK4, SERPINE2, CKS1B, MLANA 
	   ERBB3, NSG1, H2AFZ, TUBA1B, PFN2, CKS2, HSPB1, SNF8, UBE2T, COL9A3 
	   PMEL, GPNMB, RHOBTB3, S100B, MDK, SLC39A4, GMPR, UPP1, TYMS, BAMBI 
Negative:  IGFBP4, IGFBP7, PDGFRB, THY1, SERPING1, COL6A2, C1R, COL6A3, GGT5, COL4A2 
	   CDH11, MXRA8, COL3A1, C1S, COL6A1, COL1A1, FBN1, CFH, COL18A1, HSPG2 
	   COL5A1, PXDN, ADAMTS2, FBLN2, PPIC, FILIP1L, EPAS1, FSTL1, PRRX2, ESAM 
PC_ 3 
Positive:  C1S, C1R, COL6A2, COL6A3, 

An object of class Seurat 
68629 features across 352966 samples within 3 assays 
Active assay: integrated (2000 features, 2000 variable features)
 2 other assays present: RNA, SCT
 2 dimensional reductions calculated: pca, umap

In [None]:
# Save the object (now including PCA/UMAP) in qs format. qs::qsave() replaces
# saveRDS() because the file is read back with qread() in later cells; the
# namespace prefix keeps this cell independent of a prior library(qs) call.
qs::qsave(objects.integrated, '/path_to_file/integrated_objects_102023.qs')

In [19]:
# Label every cell that still has no orig.ident as 'Post_P6_T_enriched'.
# Assigning through [rows, "column"] also works when no cell is NA; the
# previous `df[mask, ]$col <- value` form stops with "replacement has 1 row,
# data has 0" in that case (e.g. when the cell is re-run).
# NOTE(review): the execution counts (In [19] here, In [7] for the sample_ID
# fill in the next cell) suggest this cell ran AFTER the sample_ID fill. Run
# top to bottom, it fills every NA first and the next cell has nothing left to
# fill -- confirm the intended order.
missing_ident <- is.na(objects.integrated$orig.ident)
objects.integrated@meta.data[missing_ident, "orig.ident"] <- 'Post_P6_T_enriched'

In [7]:
# Cells without an orig.ident take over the value of their sample_ID column.
no_ident <- is.na(objects.integrated$orig.ident)
objects.integrated@meta.data[no_ident, ]$orig.ident <-
    objects.integrated@meta.data[no_ident, ]$sample_ID

# 4. Predict subtypes by subsetting the integrated object in clusters - done in slurm

In [2]:
library(qs)

qs 0.25.5



In [3]:
# Load the integrated object from the qs file in the current working directory.
# NOTE(review): other cells read '/path_to_file/integrated_objects_102023.qs';
# confirm the working directory points at that same file.
objects.integrated <- qread("./integrated_objects_102023.qs")

In [5]:
# Display the object summary to check what was loaded.
objects.integrated

An object of class Seurat 
68629 features across 352966 samples within 3 assays 
Active assay: integrated (2000 features, 2000 variable features)
 2 other assays present: RNA, SCT
 2 dimensional reductions calculated: pca, umap

In [24]:
library(Seurat)
library(qs)

# For every cell type, transfer the reference subtype labels onto the query
# cells of that type and save one prediction table per cell type.

objects.integrated <- qread('/path_to_file/integrated_objects_102023.qs')

# Query: every cell that is not labelled study 0, with RNA as default assay.
obj.query <- subset(objects.integrated, subset = study == 0, invert = TRUE)
DefaultAssay(obj.query) <- 'RNA'

obj.reference <- readRDS('/path_to_file/1_RNA_all.rds')
DefaultAssay(obj.reference) <- 'RNA'
#subset(objects.integrated, subset = study == 0, invert = TRUE)

# Loop invariants are computed once. NA cell types are dropped because they
# cannot be selected by the `== ct` subsets inside the loop.
reference_subtypes <- unique(obj.reference$cell_subtype)
cell_types <- unique(objects.integrated$cell_type)
cell_types <- cell_types[!is.na(cell_types)]

# NOTE(review): the loop iterates over `cell_type` but subsets both objects on
# `reannotated_cell_type`; confirm that both columns exist and hold the same
# labels in these objects.
for (ct in cell_types) {
    # Skip cell types whose name already appears among the reference subtype
    # labels (presumably types that have no finer subtypes).
    if (ct %in% reference_subtypes) {
        next
    }

    #subset both objects
    print(paste0('celltype: ', ct))
    obj.query.subset <- subset(obj.query, subset = reannotated_cell_type == ct)
    obj.ref.subset <- subset(obj.reference, subset = reannotated_cell_type == ct)

    #perform scaling and PCA reduction again on the reference subset
    obj.ref.subset <- NormalizeData(obj.ref.subset) #don't know if this is absolutely necessary..
    obj.ref.subset <- FindVariableFeatures(obj.ref.subset, selection.method = 'vst', nfeatures = 3000)
    obj.ref.subset <- ScaleData(obj.ref.subset, features = VariableFeatures(obj.ref.subset))

    #PCA
    obj.ref.subset <- RunPCA(obj.ref.subset, features = VariableFeatures(object = obj.ref.subset))

    # The PCA computed above is the reference reduction used for anchoring.
    anchors <- FindTransferAnchors(reference = obj.ref.subset, query = obj.query.subset,
                dims = 1:30, reference.reduction = "pca")

    predictions <- TransferData(anchorset = anchors, refdata = obj.ref.subset$cell_subtype,
            dims = 1:30)

    # Spaces and slashes in the cell-type name become underscores so the name
    # can be used inside a file name.
    ct_file <- gsub('[ /]', '_', ct)

    saveRDS(predictions, paste0("/path_to_file/predicted_subtypes_", ct_file, ".rds"))
}

ERROR: Error in parse(text = x, srcfile = src): <text>:15:13: unexpected input
14: for(ct in unique(objects.integrated$reannotated_cell_type)){
15:     if(!(ct %in unique(objects.reference$subtyped_cell_type))) {
                ^


- files of predicted subtypes: predicted_subtypes_B.rds, predicted_subtypes_CD4_T.rds, predicted_subtypes_CD8_T.rds, predicted_subtypes_cDC.rds, predicted_subtypes_Fibroblast.rds, predicted_subtypes_Monocyte_Macrophage.rds, predicted_subtypes_Other_T.rds

## 5. Integrate subtypes in the integrated object 

In [6]:
# Load the subtype prediction table of each cell type (one .rds per type).
# NOTE(review): the files are read from the current working directory, while
# the subtype prediction loop writes them to '/path_to_file/'; confirm the
# working directory is that folder.
B_predicted_subtypes <- readRDS("./predicted_subtypes_B.rds")
cDC_predicted_subtypes <- readRDS("./predicted_subtypes_cDC.rds")
CD4_predicted_subtypes <- readRDS("./predicted_subtypes_CD4_T.rds")
CD8_predicted_subtypes <- readRDS("./predicted_subtypes_CD8_T.rds")
fibro_predicted_subtypes <- readRDS("./predicted_subtypes_Fibroblast.rds")
mon_macro_predicted_subtypes <- readRDS("./predicted_subtypes_Monocyte_Macrophage.rds")
Other_T_predicted_subtypes <- readRDS("./predicted_subtypes_Other_T.rds")

In [7]:
# Copy the predicted subtype of every cell into `subtyped_cell_type`.
# Each prediction table is paired with the object by cell name (match) rather
# than by position, so the labels stay correct even when a table lists its
# cells in a different order than the object, and cells missing from the
# object are skipped instead of breaking the assignment.
# NOTE(review): later cells write to `cell_subtype`; confirm which of the two
# columns is the intended target.
subtype_predictions <- list(
    B_predicted_subtypes,          #B cells
    cDC_predicted_subtypes,        #cDC cells
    CD4_predicted_subtypes,        #CD4
    CD8_predicted_subtypes,        #CD8
    fibro_predicted_subtypes,      #Fibroblast
    mon_macro_predicted_subtypes,  #Monocyte/Macrophage
    Other_T_predicted_subtypes     #Other T
)

integrated_cells <- rownames(objects.integrated@meta.data)
for (predicted in subtype_predictions) {
    target_rows <- match(rownames(predicted), integrated_cells)
    has_match <- !is.na(target_rows)
    objects.integrated@meta.data[target_rows[has_match], "subtyped_cell_type"] <-
        as.character(predicted$predicted.id[has_match])
}


### Integrate tumor MPs from original object 
Tumor metaprograms were found on the subset of predicted tumor cells from all studies in the same way they were found for our RNA-seq data.

In [68]:
# Load the object holding the tumor metaprograms (MPs) for the integrated
# study; later cells subset it by `study` and read its `mp_annotations` column.
tumor_mps_integrated <- readRDS("/path_to_file/integrated_mps.RDS")

In [11]:
# List the subtype labels currently assigned to the cells annotated as 'Tumor'.
is_tumor <- objects.integrated$reannotated_cell_type == 'Tumor'
unique(objects.integrated@meta.data[is_tumor, ]$subtyped_cell_type)
#NA values are from the other studies

In [9]:
# Keep a copy of the integrated object before its metadata is modified below.
objects.integrated.copy <- objects.integrated

In [10]:
# Keep only the cells whose `study` is not '0', then display the summary.
other_studies_cells <- subset(
    tumor_mps_integrated,
    subset = study != '0'
)
other_studies_cells

An object of class Seurat 
92507 features across 137020 samples within 4 assays 
Active assay: integrated (2000 features, 2000 variable features)
 3 other assays present: RNA, SCT, TPM
 2 dimensional reductions calculated: pca, umap

In [11]:
# Keep a copy before the mp_annotations column is rewritten in the next cell.
other_studies_cells_copy <- other_studies_cells

In [12]:
# Replace the leading space-delimited token of every MP annotation with
# "Tumor -". The anchored pattern touches only that first token; the previous
# gsub() used the token itself as a regular expression, so it also rewrote any
# later occurrence of it and misbehaved on tokens containing regex
# metacharacters. sub() is vectorised, so no lapply()/unlist() is needed.
other_studies_cells$mp_annotations <- sub("^[^ ]*", "Tumor -", other_studies_cells$mp_annotations)

In [15]:
# Cells of the integrated object that are annotated as 'Tumor' and do not
# belong to study 0.
integrated_obj_other_study_tumor_cells <- subset(
    objects.integrated,
    subset = cell_type == 'Tumor' & study != 0
)

In [21]:
# Cell names (metadata row names) of the tumor cells from the other studies.
other_study_tumor_meta <- integrated_obj_other_study_tumor_cells@meta.data
other_study_tumor_cellnames <- row.names(other_study_tumor_meta)

In [24]:
# Refresh the backup copy before the tumor annotations are written.
objects.integrated.copy <- objects.integrated

In [24]:
#transfer tumor annotations to integrated object for the OTHER studies
# The annotations are paired with the integrated object by cell name (match)
# rather than by position: two `%in%` masks only line up when both objects
# list the cells in the same order and contain exactly the same tumor cells.
tumor_annotation_meta <- other_studies_cells@meta.data
tumor_annotation_meta <- tumor_annotation_meta[
    rownames(tumor_annotation_meta) %in% other_study_tumor_cellnames, , drop = FALSE]

target_rows <- match(rownames(tumor_annotation_meta), rownames(objects.integrated@meta.data))
has_match <- !is.na(target_rows)
objects.integrated@meta.data[target_rows[has_match], "subtyped_cell_type"] <-
    as.character(tumor_annotation_meta$mp_annotations[has_match])

## add tumor annotations from original study 

In [26]:
# Load the tumor cells of the original study and prefix each of their
# metaprogram labels (mp_trim2) with "Tumor - ".
study0_tumor_cells <- readRDS("/net/bmc-lab5/data/kellis/group/scCancer2023/tumor_mps.RDS")
study0_tumor_cells$mp_trim2 <- paste0("Tumor - ", study0_tumor_cells$mp_trim2)

In [27]:
# Show the distinct MP annotations of the other-study tumor cells.
# `mp_annotations` is a metadata column, not a variable, so it has to be read
# from the object; the bare name is not defined anywhere in this notebook.
# NOTE(review): other_studies_cells is assumed to be the intended object --
# confirm.
unique(other_studies_cells$mp_annotations)

In [28]:
# Prefix the study-0 metaprogram labels with "Tumor - ".
# The same prefixing statement already runs in the cell that loads
# study0_tumor_cells, so labels that carry the prefix are left untouched here;
# otherwise running the notebook top to bottom yields "Tumor - Tumor - ...".
mp_labels <- as.character(study0_tumor_cells$mp_trim2)
needs_prefix <- is.na(mp_labels) | !startsWith(mp_labels, "Tumor - ")
mp_labels[needs_prefix] <- paste0("Tumor - ", mp_labels[needs_prefix])
study0_tumor_cells$mp_trim2 <- mp_labels

In [29]:
# Row names of the original study carry an "_1" suffix in the integrated
# object, so append it to the study-0 cell names.
modified_rownames <- paste0(rownames(study0_tumor_cells@meta.data), '_1')

In [32]:
# Write the study-0 tumor labels (mp_trim2) into `subtyped_cell_type`.
# modified_rownames is in the same order as the cells of study0_tumor_cells,
# so it is used to pair every label with its row in the integrated object by
# name; the previous `%in%` mask assigned the labels in the row order of the
# integrated object, which is only correct if both orders are identical.
target_rows <- match(modified_rownames, rownames(objects.integrated@meta.data))
has_match <- !is.na(target_rows)
objects.integrated@meta.data[target_rows[has_match], "subtyped_cell_type"] <-
    as.character(study0_tumor_cells$mp_trim2)[has_match]

In [33]:
# Refresh the backup copy after the tumor annotations have been added.
objects.integrated.copy <- objects.integrated

#### Use the cell type as the subtype for cell types that have no finer subtypes

In [34]:
#non subtyped objects
unique(objects.integrated$cell_type)

# The cell types listed here get their cell type as `cell_subtype`.
# %in% never yields NA and the single vectorised assignment also works when a
# listed type has no cells; the previous per-type `df[cell_type == ct, ]$col = ct`
# fails on NA cell types (NA subscripts are not allowed in data-frame
# assignments) and on types without any cell (zero-row replacement).
unsubtyped_cell_types <- c('Endothelial', 'pDC', 'NK', 'Cycling T', 'Keratinocyte', 'Mast')
is_unsubtyped <- objects.integrated$cell_type %in% unsubtyped_cell_types
objects.integrated@meta.data[is_unsubtyped, "cell_subtype"] <-
    as.character(objects.integrated$cell_type)[is_unsubtyped]

In [189]:
# Cells that still have no cell_subtype fall back to their cell_type.
no_subtype <- is.na(objects.integrated$cell_subtype)
objects.integrated@meta.data[no_subtype, ]$cell_subtype <-
    objects.integrated@meta.data[no_subtype, ]$cell_type

In [2]:
# Record the R version and the attached/loaded package versions.
sessionInfo()

R version 4.2.3 (2023-03-15)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /net/bmc-lab5/data/kellis/users/cbw3/conda/envs/r-kernel/lib/libopenblasp-r0.3.21.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] SeuratObject_5.0.1 Seurat_4.3.0.1    

loaded via a namespace (and not attached):
  [1] Rtsne_0.17             colorspace_2.1-0       deldir_2.0-4          
  [4] ggridges_0.5.6         IRdisplay_1.1          base64enc_0.1-3       
  [7] spatstat.data_3.0-4    leiden_0.4.3.1 