# Part 19: Analysis of scRNAseq data from ParseBio

In [None]:
source("diabetes_analysis_v07.R")
library(pheatmap)
library(STACAS)

This is a reanalysis of sample data generated by [ParseBio](https://resources.parsebiosciences.com/dataset-wt-mega-one-million-pbmc-type-1-diabetes). Raw data are available upon registration. 

# Loading and preprocessing data

This part uses the raw data, which can be downloaded from the [ParseBio webpage](https://resources.parsebiosciences.com/dataset-wt-mega-one-million-pbmc-type-1-diabetes). To reproduce the analysis, download the data and save them to the folder `../data/published_data/ParseBio/Raw`.

Libraries to process ParseBio data:

In [None]:
library(BPCells)
library(ggrepel)
# Large datasets need a higher limit (in bytes) for globals passed to `future`.
options(future.globals.maxSize = 3e9)

# Loading and preprocessing of the data

Load data and create Seurat object. 

In [None]:
# Count matrix from the ParseBio output folder.
mat <- ReadParseBio(data.dir = "../data/published_data/ParseBio/Raw/")

In [None]:
# Per-cell metadata; the first CSV column supplies the row names.
cell_metadata <- read.csv(
  file = "../data/published_data/ParseBio/Raw/cell_metadata.csv",
  row.names = 1
)

In [None]:
# Build the Seurat object, keeping genes found in >= 100 cells and cells with
# >= 100 detected genes.
pbmc <- CreateSeuratObject(
  counts = mat,
  min.cells = 100,
  min.features = 100,
  names.field = 0,
  meta.data = cell_metadata
)

In [None]:
# The inputs are no longer needed once the object exists; free the memory.
rm(mat, cell_metadata)
gc()

Downsample dataset. 

In [None]:
# Store the barcode as a metadata column; it is used later in this notebook to
# match cells between the reclustered subsets and the main object.
pbmc$barcode <- colnames(pbmc)

# Fix the RNG seed so the same cells are drawn on every run. All hard-coded
# cluster IDs further down depend on which cells are sampled here.
set.seed(42)

# Never request more cells than are available (sample() would error otherwise).
n_keep <- min(200000, ncol(pbmc))
keep_cells <- sample(colnames(pbmc), size = n_keep)
pbmc <- subset(pbmc, cells = keep_cells)

Process using standard pipeline. 

In [None]:
# Standard Seurat workflow on the downsampled dataset:
# normalise -> select variable features -> scale -> PCA -> UMAP -> cluster.
pbmc <- NormalizeData(pbmc, verbose = FALSE)

# Select variable features BEFORE scaling. ScaleData() scales the variable
# features by default and falls back to all genes when none are defined,
# which builds a dense genes x cells matrix for the whole dataset.
pbmc <- FindVariableFeatures(pbmc, nfeatures = 1000, verbose = FALSE)
pbmc <- ScaleData(pbmc, verbose = FALSE)

# RunPCA() has no `dims` argument (the number of components is set by `npcs`),
# so the previously passed `dims = 1:12` had no effect and is dropped here.
pbmc <- RunPCA(pbmc)
pbmc <- RunUMAP(pbmc, reduction = "pca", dims = 1:12)

pbmc <- FindNeighbors(pbmc, dims = 1:12)
pbmc <- FindClusters(pbmc, resolution = 0.7)
print("CLustering done!")

# Checkpoint: processed object for the full downsampled dataset.
saveRDS(pbmc, "../data/published_data/ParseBio/241020_ParseBio_full_200k.rds")

Visualize major cell types. 

In [None]:
# UMAP coloured by the current cluster identity.
DimPlot(object = pbmc)

In [None]:
# Marker panel 1.
FeaturePlot(
  object = pbmc,
  features = c("CD14", "MS4A1", "TRAC", "LCK"),
  ncol = 4
)

In [None]:
# Marker panel 2.
FeaturePlot(
  object = pbmc,
  features = c("CD3D", "CD8A", "CD4", "NCR1"),
  ncol = 4
)

In [None]:
# Marker panel 3.
FeaturePlot(
  object = pbmc,
  features = c("MKI67", "NCR1", "IL7R", "CD4"),
  ncol = 4
)

# Subsetting T cells

Subset T cells, recluster and save the dataset. 

In [None]:
# Keep only the clusters selected as T cells from the plots above.
pbmc <- subset(pbmc, seurat_clusters %in% c(0, 2, 3, 4, 6, 9, 12, 18))

# At this point the embedding and clusters are still those of the full dataset.
DimPlot(pbmc, label = TRUE, label.size = 10)

# Re-run the workflow on the subset. Variable features must be re-selected
# BEFORE scaling: ScaleData() scales the variable features currently stored in
# the object, which would otherwise be the ones chosen on the parent dataset,
# and PCA would then only see the newly selected features that happen to
# overlap with them.
# NOTE: this can change cluster numbering relative to runs with the old order;
# re-check the hard-coded cluster IDs used downstream.
pbmc <- NormalizeData(pbmc, verbose = FALSE)
pbmc <- FindVariableFeatures(pbmc, nfeatures = 1000, verbose = FALSE)
pbmc <- ScaleData(pbmc, verbose = FALSE)

# `dims` is not a RunPCA() argument, so it is omitted here.
pbmc <- RunPCA(pbmc)
pbmc <- RunUMAP(pbmc, reduction = "pca", dims = 1:12)

pbmc <- FindNeighbors(pbmc, dims = 1:12)
pbmc <- FindClusters(pbmc, resolution = 0.7)
print("CLustering done!")

In [None]:
# Checkpoint: T-cell subset after reclustering.
saveRDS(pbmc, "../data/published_data/ParseBio/241020_ParseBio_200k_filtT.rds")

In [None]:
# Use if starting the reanalysis from here (same path as the saveRDS() call above)
# pbmc  <- readRDS("../data/published_data/ParseBio/241020_ParseBio_200k_filtT.rds")

In [None]:
# UMAP of the T-cell subset coloured by cluster.
DimPlot(object = pbmc)

In [None]:
# Wide figure for a single row of four panels.
options(repr.plot.width = 16, repr.plot.height = 4)
FeaturePlot(
  object = pbmc,
  features = c("FOXP3", "CD8A", "CD4", "NCR1"),
  max.cutoff = rep(2, 4),
  ncol = 4
)

In [None]:
FeaturePlot(
  object = pbmc,
  features = c("MKI67", "NCR1", "IL7R", "CD4"),
  ncol = 4
)

# "RORG" is not a gene symbol; the gene is RORC (as used in the CD4 marker
# panel later in this notebook).
FeaturePlot(pbmc, features = c("FOXP3", "GATA3", "NCAM1", "RORC"), ncol = 4)

Composition of the dataset by sample and disease status. 

In [None]:
# UMAP coloured by sample of origin.
DimPlot(object = pbmc, group.by = "sample")

In [None]:
# The disease status is taken from the first character of the sample name.
pbmc$Disease <- substr(pbmc$sample, start = 1, stop = 1)

In [None]:
# UMAP coloured by disease status.
DimPlot(object = pbmc, group.by = "Disease")

# Separating CD4 and CD8 T cells

In [None]:
# Sanity check: row index of CD4 in the RNA assay (empty if the gene is absent).
which(rownames(pbmc@assays$RNA) == "CD4")

In [None]:
# Sanity check: row index of CD8A in the RNA assay (empty if the gene is absent).
which(rownames(pbmc@assays$RNA) == "CD8A")

In [None]:
# Classify each cell by whether raw counts for CD4 and/or CD8A were detected.
# Rows are selected by gene name rather than by hard-coded row index, so the
# result does not depend on the gene order of the assay.
cd4_detected <- pbmc@assays$RNA$counts["CD4", ] > 0
cd8_detected <- pbmc@assays$RNA$counts["CD8A", ] > 0

pbmc$cd4_or_cd8 <- ifelse(
  cd4_detected & cd8_detected, "Both",
  ifelse(
    cd4_detected, "CD4",
    ifelse(cd8_detected, "CD8", "Unknown")
  )
)

In [None]:
# Number of cells per count-based call.
pbmc$cd4_or_cd8 %>%
  table

In [None]:
# UMAP coloured by the count-based call.
DimPlot(object = pbmc, group.by = "cd4_or_cd8")

In [None]:
# Raise the limit (in bytes) for globals passed to `future`.
options(future.globals.maxSize = 5e9)

In [None]:
# Cluster again at a lower resolution and show the labelled clusters.
pbmc <- FindNeighbors(object = pbmc, dims = 1:12)
pbmc <- FindClusters(object = pbmc, resolution = 0.5)
options(repr.plot.width = 7, repr.plot.height = 5)
DimPlot(object = pbmc, label = TRUE, label.size = 10, repel = TRUE)

In [None]:
options(repr.plot.width = 24, repr.plot.height = 12)

# CD8A expression, one panel per cluster.
FeaturePlot(
  object = pbmc,
  features = "CD8A",
  split.by = "seurat_clusters",
  max.cutoff = 2
) + plot_layout(ncol = 5, nrow = 3)

In [None]:
options(repr.plot.width = 24, repr.plot.height = 12)

# CD4 expression, one panel per cluster.
FeaturePlot(
  object = pbmc,
  features = "CD4",
  split.by = "seurat_clusters",
  max.cutoff = 2
) + plot_layout(ncol = 5, nrow = 3)


Clusters 6, 10 and 12 contain both CD4 and CD8 T cells. We separate the CD4 from the CD8 T cells in these clusters by reclustering each cluster on its own.

### Reclustering cluster 10

In [None]:
# Cells of cluster 10 only.
sub <- subset(pbmc, seurat_clusters == 10)

In [None]:
# Re-run the workflow on the subset. Variable features are re-selected BEFORE
# scaling so that ScaleData() scales the features chosen on this subset and
# not the ones inherited from the parent object.
# NOTE: this can change subcluster numbering relative to runs with the old
# order; re-check the hard-coded subcluster IDs used below.
sub <- NormalizeData(sub, verbose = FALSE)
sub <- FindVariableFeatures(sub, nfeatures = 1000, verbose = FALSE)
sub <- ScaleData(sub, verbose = FALSE)

# `dims` is not a RunPCA() argument, so it is omitted here.
sub <- RunPCA(sub)
sub <- RunUMAP(sub, reduction = "pca", dims = 1:12)

sub <- FindNeighbors(sub, dims = 1:12)

In [None]:
cl10 <- FindClusters(sub, resolution = 0.7)

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)
# Labelled subclusters of cluster 10.
DimPlot(object = cl10, label = TRUE, label.size = 10)

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)

FeaturePlot(object = cl10, features = "CD8A", max.cutoff = 2)

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)

FeaturePlot(object = cl10, features = "CD4", max.cutoff = 2)

In [None]:
options(repr.plot.width = 24, repr.plot.height = 9)

# CD8A expression, one panel per subcluster.
FeaturePlot(
  object = cl10,
  features = "CD8A",
  split.by = "seurat_clusters",
  max.cutoff = 2
) + plot_layout(ncol = 5, nrow = 2)


In [None]:
options(repr.plot.width = 24, repr.plot.height = 9)

# CD4 expression, one panel per subcluster.
FeaturePlot(
  object = cl10,
  features = "CD4",
  split.by = "seurat_clusters",
  max.cutoff = 2
) + plot_layout(ncol = 5, nrow = 2)


In [None]:
# Assign the subclusters of cluster 10 to CD4, CD8 and NK groups.
cl10_cd4 <- subset(x = cl10, subset = seurat_clusters %in% c(0, 1, 5))
cl10_cd8 <- subset(x = cl10, subset = seurat_clusters %in% c(2, 4, 7))
cl10_nk <- subset(x = cl10, subset = seurat_clusters %in% c(3, 6))

### Reclustering cluster 12

In [None]:
# Cells of cluster 12 only.
sub <- subset(pbmc, seurat_clusters == 12)

In [None]:
# Re-run the workflow on the subset. Variable features are re-selected BEFORE
# scaling so that ScaleData() scales the features chosen on this subset and
# not the ones inherited from the parent object.
# NOTE: this can change subcluster numbering relative to runs with the old
# order; re-check the hard-coded subcluster IDs used below.
sub <- NormalizeData(sub, verbose = FALSE)
sub <- FindVariableFeatures(sub, nfeatures = 1000, verbose = FALSE)
sub <- ScaleData(sub, verbose = FALSE)

# `dims` is not a RunPCA() argument, so it is omitted here.
sub <- RunPCA(sub)
sub <- RunUMAP(sub, reduction = "pca", dims = 1:12)

sub <- FindNeighbors(sub, dims = 1:12)
cl12 <- FindClusters(sub, resolution = 0.7)

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)
# Label the subclusters, as done for clusters 10 and 6: the subcluster numbers
# are needed to pick the CD4/CD8 groups below. This replaces two identical
# unlabelled DimPlot cells.
DimPlot(cl12, label = TRUE, label.size = 10)

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)

FeaturePlot(object = cl12, features = "CD8A", max.cutoff = 2)

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)

FeaturePlot(object = cl12, features = "CD4", max.cutoff = 2)

In [None]:
options(repr.plot.width = 24, repr.plot.height = 5)

# CD8A expression, one panel per subcluster.
FeaturePlot(
  object = cl12,
  features = "CD8A",
  split.by = "seurat_clusters",
  max.cutoff = 2
) + plot_layout(ncol = 5, nrow = 1)


In [None]:
options(repr.plot.width = 24, repr.plot.height = 5)

# CD4 expression, one panel per subcluster.
FeaturePlot(
  object = cl12,
  features = "CD4",
  split.by = "seurat_clusters",
  max.cutoff = 2
) + plot_layout(ncol = 5, nrow = 1)


In [None]:
# Assign the subclusters of cluster 12 to CD4 and CD8 groups.
cl12_cd4 <- subset(x = cl12, subset = seurat_clusters %in% c(0, 2, 3, 4))
cl12_cd8 <- subset(x = cl12, subset = seurat_clusters %in% 1)

### Reclustering cluster 6

In [None]:
# Cells of cluster 6 only.
sub <- subset(pbmc, seurat_clusters == 6)

In [None]:
# Re-run the workflow on the subset. Variable features are re-selected BEFORE
# scaling so that ScaleData() scales the features chosen on this subset and
# not the ones inherited from the parent object.
# NOTE: this can change subcluster numbering relative to runs with the old
# order; re-check the hard-coded subcluster IDs used below.
sub <- NormalizeData(sub, verbose = FALSE)
sub <- FindVariableFeatures(sub, nfeatures = 1000, verbose = FALSE)
sub <- ScaleData(sub, verbose = FALSE)

# `dims` is not a RunPCA() argument, so it is omitted here.
sub <- RunPCA(sub)
sub <- RunUMAP(sub, reduction = "pca", dims = 1:12)

sub <- FindNeighbors(sub, dims = 1:12)

In [None]:
cl6 <- FindClusters(sub, resolution = 0.8)

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)
# Labelled subclusters of cluster 6.
DimPlot(object = cl6, label = TRUE, label.size = 10)

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)

FeaturePlot(object = cl6, features = "CD8A", max.cutoff = 2)

In [None]:
options(repr.plot.width = 6, repr.plot.height = 5)

FeaturePlot(object = cl6, features = "CD4", max.cutoff = 2)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 8)

# CD8A expression, one panel per subcluster.
FeaturePlot(
  object = cl6,
  features = "CD8A",
  split.by = "seurat_clusters",
  max.cutoff = 2
) + plot_layout(ncol = 6, nrow = 2)


In [None]:
options(repr.plot.width = 20, repr.plot.height = 8)

# CD4 expression, one panel per subcluster.
FeaturePlot(
  object = cl6,
  features = "CD4",
  split.by = "seurat_clusters",
  max.cutoff = 2
) + plot_layout(ncol = 6, nrow = 2)


In [None]:
# Assign the subclusters of cluster 6 to CD4 and CD8 groups.
cl6_cd4 <- subset(x = cl6, subset = seurat_clusters %in% c(0, 1, 3, 5, 6, 7, 9))
cl6_cd8 <- subset(x = cl6, subset = seurat_clusters %in% c(2, 4, 8, 10))


Add a metadata column with the CD4, CD8, or NK/DN identity of each cell.

In [None]:
# Work on a copy of the cell metadata.
md_pbmc <- pbmc@meta.data

In [None]:
# Final identity per cell. Rules are tried top to bottom and the first match
# wins:
#   1. cells with an unambiguous count-based call keep it;
#   2. remaining cells ("Unknown"/"Both") take the identity of their cluster;
#   3. cells from the mixed clusters take the identity assigned by reclustering.
# Cells that match no rule are left as NA.
md_pbmc <- md_pbmc %>%
  mutate(
    cd4_or_cd8_2 = case_when(
      !(cd4_or_cd8 %in% c("Unknown", "Both")) ~ cd4_or_cd8,
      seurat_clusters %in% c(0, 5, 7) ~ "CD8",
      seurat_clusters %in% c(1, 2, 3, 8, 9) ~ "CD4",
      seurat_clusters %in% c(4, 11) ~ "NK_DN",
      barcode %in% c(
        cl6_cd4$barcode,
        cl10_cd4$barcode,
        cl12_cd4$barcode
      ) ~ "CD4",
      barcode %in% c(
        cl6_cd8$barcode,
        cl10_cd8$barcode,
        cl12_cd8$barcode
      ) ~ "CD8",
      barcode %in% c(cl10_nk$barcode) ~ "NK_DN"
    )
  )

In [None]:
# Cell numbers per count-based call.
md_pbmc$cd4_or_cd8 %>%
  table

In [None]:
# Cell numbers per final identity.
md_pbmc$cd4_or_cd8_2 %>%
  table

In [None]:
# Copy the new column back to the Seurat object (rows are in the same order).
pbmc$cd4_or_cd8_2 <- md_pbmc$cd4_or_cd8_2

In [None]:
options(repr.plot.width = 8, repr.plot.height = 6)

# UMAP coloured by the final CD4/CD8/NK_DN identity.
DimPlot(object = pbmc, group.by = "cd4_or_cd8_2")

In [None]:
# Overwrite the T-cell checkpoint with the annotated object.
saveRDS(
  object = pbmc,
  file = "../data/published_data/ParseBio/241020_ParseBio_200k_filtT.rds"
)

# Analysis of NK DN cells

In [None]:
# Cells annotated as NK_DN.
sub <- subset(pbmc, cd4_or_cd8_2 == "NK_DN")

# Re-run the workflow on the subset. Variable features are re-selected BEFORE
# scaling so that ScaleData() scales the features chosen on this subset and
# not the ones inherited from the parent object.
sub <- NormalizeData(sub, verbose = FALSE)
sub <- FindVariableFeatures(sub, nfeatures = 1000, verbose = FALSE)
sub <- ScaleData(sub, verbose = FALSE)

# `dims` is not a RunPCA() argument, so it is omitted here.
sub <- RunPCA(sub)
sub <- RunUMAP(sub, reduction = "pca", dims = 1:12)

sub <- FindNeighbors(sub, dims = 1:12)
sub <- FindClusters(sub, resolution = 0.7)
print("CLustering done!")
nk_dn <- sub

In [None]:
DimPlot(nk_dn)

# Analysis of CD4 cells

In [None]:
# Raise the limit (in bytes) for globals passed to `future`.
options(future.globals.maxSize = 1e10)

In [None]:
# Evaluate futures sequentially in the current R session.
plan("sequential")

In [None]:
# Cells annotated as CD4.
sub <- subset(pbmc, cd4_or_cd8_2 == "CD4")

# Variable features are re-selected BEFORE scaling so that ScaleData() scales
# the features chosen on this subset and not the ones inherited from the
# parent object.
sub <- NormalizeData(sub, verbose = FALSE)
sub <- FindVariableFeatures(sub, nfeatures = 1000, verbose = FALSE)
sub <- ScaleData(sub, verbose = FALSE)

In [None]:
# `dims` is not a RunPCA() argument, so it is omitted here.
sub <- RunPCA(sub)
sub <- RunUMAP(sub, reduction = "pca", dims = 1:10)

sub <- FindNeighbors(sub, dims = 1:10)
sub <- FindClusters(sub, resolution = 0.5)
print("Clustering done!")
cd4 <- sub

In [None]:
DimPlot(cd4)

In [None]:
# Square figure for a 4 x 4 grid of panels.
options(repr.plot.width = 16, repr.plot.height = 16)
FeaturePlot(
  object = cd4,
  features = c(
    "CD3D", "CD8A", "RORC", "LGALS3",
    "GATA3", "MKI67", "ISG15", "NCAM1",
    "TRGC1", "TRDC", "FOXP3", "CTLA4",
    "IL4", "IL5", "NFKBIA", "CD4"
  ),
  ncol = 4
)


# Analysis of CD8 cells

In [None]:
# Cells annotated as CD8.
sub <- subset(pbmc, cd4_or_cd8_2 == "CD8")

# Variable features are re-selected BEFORE scaling so that ScaleData() scales
# the features chosen on this subset and not the ones inherited from the
# parent object.
sub <- NormalizeData(sub, verbose = FALSE)
sub <- FindVariableFeatures(sub, nfeatures = 1000, verbose = FALSE)
sub <- ScaleData(sub, verbose = FALSE)

In [None]:
# `dims` is not a RunPCA() argument, so it is omitted here.
sub <- RunPCA(sub)
sub <- RunUMAP(sub, reduction = "pca", dims = 1:10)

sub <- FindNeighbors(sub, dims = 1:10)
sub <- FindClusters(sub, resolution = 0.5)
print("CLustering done!")
cd8 <- sub

In [None]:
DimPlot(cd8)

In [None]:
# Square figure for a 4 x 4 grid of panels.
options(repr.plot.width = 16, repr.plot.height = 16)
FeaturePlot(
  object = cd8,
  features = c(
    "SELL", "EOMES", "TBX21", "LGALS3",
    "IFNG", "MKI67", "ISG15", "NCAM1",
    "TRGC1", "TRDC", "IKZF2", "CTLA4",
    "IL4", "IL5", "NFKBIA", "CD4"
  ),
  ncol = 4
)

In [None]:
# Save the three annotated subsets next to the other ParseBio objects. The CD4
# object was previously written to the working directory instead of the data
# folder used for CD8 and NK_DN.
saveRDS(cd4, "../data/published_data/ParseBio/241020_ParseBio_200k_CD4.rds")
saveRDS(cd8, "../data/published_data/ParseBio/241020_ParseBio_200k_CD8.rds")
saveRDS(nk_dn, "../data/published_data/ParseBio/241020_ParseBio_200k_NK_DN.rds")