In [1]:
# Record the wall-clock start time; the last cell uses it to report the
# total run time of the notebook.
time.start <- Sys.time()
print(paste("start time:", time.start))

[1] "start time: 2022-02-09 13:27:16"


In [2]:
library(MLmetrics)
library(Seurat)
library(SeuratDisk)
library(caret)


Attaching package: ‘MLmetrics’


The following object is masked from ‘package:base’:

    Recall


Attaching SeuratObject

Registered S3 method overwritten by 'cli':
  method     from         
  print.boxx spatstat.geom

Registered S3 method overwritten by 'SeuratDisk':
  method            from  
  as.sparse.H5Group Seurat

Loading required package: ggplot2

Loading required package: lattice


Attaching package: ‘caret’


The following objects are masked from ‘package:MLmetrics’:

    MAE, RMSE




In [3]:
# library(SeuratDisk)
# Convert ( "../scClass_data/pbmcmm_pp_subsample.h5ad" , dest  =  "h5seurat" , overwrite  =  FALSE )
# Convert ("../scClass_data/pbmc68k_v2.h5ad", dest="h5seurat", overwrite=TRUE) # overwrite the output file if it already exists

In [4]:
# Path of the reference dataset (h5Seurat file); loaded below into `ref`.
ref_file <- "../scClass_data/pbmcmm_pp_subsample.h5seurat"
# Path of the query dataset (h5Seurat file); loaded below into `test`.
test.file <- "../scClass_data/pbmc68k(csr).h5seurat"

## Reference Data Preprocessing

In [5]:
# Read the reference dataset from the h5Seurat file at `ref_file`.
ref <- LoadH5Seurat(ref_file)

Validating h5Seurat file

Initializing RNA with data

Adding counts for RNA

Adding feature-level metadata for RNA

Adding command information

Adding cell-level metadata

Adding miscellaneous information

Adding tool-specific results



In [6]:
# Normalize the reference, then keep its 2000 most variable features
# (selected with the "vst" method).
ref <- NormalizeData(object = ref, verbose = FALSE)
ref <- FindVariableFeatures(
    object = ref,
    selection.method = "vst",
    nfeatures = 2000,
    verbose = FALSE
)

In [7]:
# Scale the reference, compute a 30-component PCA, and build a UMAP
# embedding from those 30 components.
ref <- ScaleData(object = ref, verbose = FALSE)
ref <- RunPCA(object = ref, npcs = 30, verbose = FALSE)
ref <- RunUMAP(
    object = ref,
    reduction = "pca",
    dims = 1:30,
    verbose = FALSE
)

“The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session”


## Cell type classification

In [8]:
# Read the query dataset (the cells to be annotated) from `test.file`.
test <- LoadH5Seurat(test.file)

Validating h5Seurat file

Initializing RNA with data

Adding counts for RNA

Adding feature-level metadata for RNA

Adding command information

Adding cell-level metadata

Adding miscellaneous information

Adding tool-specific results



In [9]:
# Find transfer anchors between the reference and the query, using the
# first 30 dimensions of the reference PCA.
ref.anchors <- FindTransferAnchors(
    reference = ref,
    query = test,
    reference.reduction = "pca",
    dims = 1:30
)

Projecting cell embeddings

Finding neighborhoods

Finding anchors

	Found 3101 anchors

Filtering anchors

	Retained 210 anchors



In [10]:
# The reference labels are the integer class ids in `transfer_id`; store
# them as character strings so they can be transferred as labels.
ref$transfer_cell_type <- as.character(ref$transfer_id)

# Transfer the reference labels onto the query cells through the anchors
# and attach the resulting prediction columns to the query metadata.
predictions <- TransferData(
    anchorset = ref.anchors,
    refdata = ref$transfer_cell_type,
    dims = 1:30
)
test <- AddMetaData(object = test, metadata = predictions)

Finding integration vectors

Finding integration vector weights

Predicting cell labels



## Build the confusion matrix and evaluate accuracy

In [11]:
# Cell-type names ordered by class id: class id k (0-based) is label[k + 1].
label <- c(
    "T-helper cell",                  # 0
    "cytotoxic T cell",               # 1
    "memory B cell",                  # 2
    "naive B cell",                   # 3
    "plasma cell",                    # 4
    "natural killer cell",            # 5
    "erythrocyte",                    # 6
    "megakaryocyte",                  # 7
    "monocyte",                       # 8
    "dendritic cell",                 # 9
    "bone marrow hematopoietic cell"  # 10
)

In [12]:
# Ground truth: the query's own annotation, restricted to the known labels
# (any other value becomes NA).
y.true <- factor(test$transfer_cell_type, levels = label)

# Prediction: map the transferred 0-based class id to its cell-type name,
# keeping only the cells that have a usable ground-truth label.
y.pred <- factor(
    label[as.integer(test$predicted.id) + 1],
    levels = label
)[!is.na(y.true)]

# Drop the cells without a usable ground-truth label from the truth as well.
y.true <- y.true[!is.na(y.true)]

In [13]:
# Confusion matrix of transferred labels versus ground truth.
# caret::confusionMatrix() takes the predictions as `data` and the ground
# truth as `reference`. Passing them positionally in the opposite order swaps
# every per-class statistic (sensitivity <-> positive predictive value, ...),
# so name the arguments explicitly.
mat.table <- confusionMatrix(data = y.pred, reference = y.true)

# caret lays the table out as prediction (rows) x reference (columns).
# Transpose it so that each row of `mat.df` is a true class and each column a
# predicted class, which is the layout the row-wise percentages rely on.
mat.df <- as.data.frame.matrix(t(mat.table$table))

# Label the predicted-class columns with their 0-based class ids.
colnames(mat.df) <- seq_len(ncol(mat.df)) - 1L
mat.df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
T-helper cell,24376,17620,0,0,9,9,0,0,131,19,15
cytotoxic T cell,1032,21126,0,0,1,3,0,0,0,0,0
memory B cell,0,0,0,0,0,0,0,0,0,0,0
naive B cell,8122,121,0,1126,1,0,0,0,2,713,0
plasma cell,0,0,0,0,0,0,0,0,0,0,0
natural killer cell,15,7571,0,0,0,798,0,0,1,0,0
erythrocyte,0,0,0,0,0,0,0,0,0,0,0
megakaryocyte,0,0,0,0,0,0,0,0,0,0,0
monocyte,48,2,0,0,1,0,0,0,2121,295,0
dendritic cell,3,0,0,0,0,0,0,0,0,96,0


In [14]:
# Row-wise percentages: for each true class, the share of its cells assigned
# to each predicted class.
# Rows without any cell have a total of 0; dividing by max(total, 1) keeps
# them at 0 instead of NaN without distorting the non-empty rows (adding a
# small constant to every total would, e.g., show 99.9 for a one-cell row
# that is fully correct).
mat.df.percent <- round(mat.df / pmax(rowSums(mat.df), 1) * 100, digits = 1)
mat.df.percent

# Overall accuracy over the cells that have a ground-truth label. A missing
# prediction (NA) counts as a wrong call rather than turning the result
# into NA.
print(paste(
    "acc:",
    sum(y.true == y.pred, na.rm = TRUE) / length(y.true) * 100,
    "%"
))

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
T-helper cell,57.8,41.8,0,0.0,0.0,0.0,0,0,0.3,0.0,0.0
cytotoxic T cell,4.7,95.3,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0
memory B cell,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0
naive B cell,80.5,1.2,0,11.2,0.0,0.0,0,0,0.0,7.1,0.0
plasma cell,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0
natural killer cell,0.2,90.3,0,0.0,0.0,9.5,0,0,0.0,0.0,0.0
erythrocyte,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0
megakaryocyte,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0
monocyte,1.9,0.1,0,0.0,0.0,0.0,0,0,86.0,12.0,0.0
dendritic cell,3.0,0.0,0,0.0,0.0,0.0,0,0,0.0,97.0,0.0


[1] "acc: 57.9394045038076 %"


In [15]:
# Print a one-vs-rest metric for every class and its unweighted average.
#
# Args:
#   performance: metric function, called as
#                performance(y_true, y_pred, positive = <class name>) and
#                expected to return a single number.
#   y_true:      ground-truth labels; defaults to the global `y.true`.
#   y_pred:      predicted labels; defaults to the global `y.pred`.
#   labels:      class names to evaluate; defaults to the global `label`.
#
# Classes for which the metric is undefined (NaN or NA, e.g. a class that
# never occurs) are left out of both the printed values and the average, so
# the average runs over the defined classes only.
#
# Returns (invisibly, via print) the printed average line.
evaluation_metrics <- function(performance, y_true = y.true, y_pred = y.pred,
                               labels = label) {
    # deparse() keeps arguments such as `pkg::fun` as a single string.
    performance.name <- paste(deparse(substitute(performance)), collapse = " ")

    scores <- vapply(
        labels,
        function(positive) {
            as.numeric(performance(y_true, y_pred, positive = positive))
        },
        numeric(1),
        USE.NAMES = FALSE
    )
    # is.na() is TRUE for both NaN and NA; keeping an NA score would turn the
    # average into NA.
    scores <- scores[!is.na(scores)]

    print(c(paste(performance.name, ":"), scores))
    print("-----------------------------------------------------------------")
    print(paste(performance.name, "(average):", mean(scores)))
}

In [16]:
# Per-class precision and its average over the classes where it is defined.
evaluation_metrics(performance = Precision)

[1] "Precision :"        "0.698092674265422"  "0.451873716632444" 
[4] "0.999112688553682"  "0"                  "0.979141104294479" 
[7] "0.638855421686747"  "0.0747081712062257" "0.995779403489026" 
[1] "-----------------------------------------------------------------"
[1] "Precision (average): 0.604695397516003"


In [17]:
# Per-class recall and its average over the classes where it is defined.
# `Recall` here is the MLmetrics function, which masks base::Recall.
evaluation_metrics(performance = Recall)

[1] "Recall :"          "0.577917921240428" "0.953253316487682"
[4] "0.11165096678235"  "0.095169946332737" "0.859748682610458"
[7] "0.96969696969697"  "0.551933873986276"
[1] "-----------------------------------------------------------------"
[1] "Recall (average): 0.588481668162414"


In [18]:
# Per-class F1 score and its average over the classes where it is defined.
evaluation_metrics(performance = F1_Score)

[1] "F1_Score :"        "0.632346265094621" "0.613111994660011"
[4] "0.200856225472708" "0.173478260869565" "0.733022291342665"
[7] "0.138728323699422" "0.71021473008228" 
[1] "-----------------------------------------------------------------"
[1] "F1_Score (average): 0.45739401303161"


In [19]:
# run time record
time.end <- Sys.time()
print(paste('start time:',time.start))
print(paste('end   time:',time.end))
time.end - time.start 

[1] "start time: 2022-02-09 13:27:16"
[1] "end   time: 2022-02-09 13:32:32"


Time difference of 5.256886 mins