In [None]:
# Install whichever dependencies are missing.
# Each package is checked on its own: testing only for Seurat would skip the
# whole installation on a machine that has Seurat but lacks one of the others,
# and the later library() calls would then fail.
for (pkg in c("MLmetrics", "Seurat", "caret")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
        install.packages(pkg)
    }
}
# SeuratDisk is installed from GitHub, which needs the remotes package.
if (!requireNamespace("SeuratDisk", quietly = TRUE)) {
    if (!requireNamespace("remotes", quietly = TRUE)) {
        install.packages("remotes")
    }
    remotes::install_github("mojaveazure/seurat-disk")
}

In [1]:
# Remember when the run began; the final cell uses this value to report the
# total run time.
time.start <- Sys.time()
print(paste('start time:', time.start, sep = " "))

[1] "start time: 2022-03-11 09:24:01"


In [2]:
library(MLmetrics)
library(Seurat)
library(SeuratDisk)
library(caret)


Attaching package: ‘MLmetrics’


The following object is masked from ‘package:base’:

    Recall


Attaching SeuratObject

Registered S3 method overwritten by 'cli':
  method     from         
  print.boxx spatstat.geom

Registered S3 method overwritten by 'SeuratDisk':
  method            from  
  as.sparse.H5Group Seurat

Loading required package: ggplot2

Loading required package: lattice


Attaching package: ‘caret’


The following objects are masked from ‘package:MLmetrics’:

    MAE, RMSE




In [3]:
## Convert the .h5ad inputs to .h5seurat
# A conversion runs only when its .h5seurat file is missing; overwrite = FALSE
# additionally tells Convert not to replace an existing file.
h5seurat.by.h5ad <- c(
    "../dataset/PBMC_CITE_modelC.h5ad" = "../dataset/PBMC_CITE_modelC.h5seurat",
    "../dataset/CordBlood_modelC.h5ad" = "../dataset/CordBlood_modelC.h5seurat"
)
for (h5ad.path in names(h5seurat.by.h5ad)) {
    if (!file.exists(h5seurat.by.h5ad[[h5ad.path]])) {
        Convert(h5ad.path, dest = "h5seurat", overwrite = FALSE)
    }
}

In [4]:
# Paths of the converted h5Seurat files: the reference dataset and the
# query ("test") dataset that will be annotated from it.
ref.file <- "../dataset/PBMC_CITE_modelC.h5seurat"
test.file <- "../dataset/CordBlood_modelC.h5seurat"

In [5]:
# Load the reference dataset from its h5Seurat file.
ref <- LoadH5Seurat(ref.file)

Validating h5Seurat file

Initializing RNA with data

Adding counts for RNA

Adding feature-level metadata for RNA

Adding command information

Adding cell-level metadata

“Invalid name supplied, making object name syntactically valid. New object name is Phasecelltype.l1celltype.l2celltype.l3donorlanenCount_ADTnCount_RNAnCount_SCTnFeature_ADTnFeature_RNAnFeature_SCTorig.identtimemodelA.idmodelC.idtransfer.cell.typeorigin.cell.type; see ?make.names for more details on syntax validity”
Adding miscellaneous information

Adding tool-specific results



In [6]:
# Normalize the reference, then select 2000 variable features with the
# "vst" selection method.
ref <- NormalizeData(ref, verbose = FALSE)
ref <- FindVariableFeatures(ref, selection.method = "vst", nfeatures = 2000,verbose = FALSE)

In [7]:
# Scale the reference and compute a 30-component PCA; this "pca" reduction is
# the one passed as reference.reduction when the transfer anchors are found.
ref <- ScaleData(ref, verbose = FALSE)
ref <- RunPCA(ref, npcs = 30, verbose = FALSE)
# UMAP embedding built on all 30 principal components.
ref <- RunUMAP(ref, reduction = "pca", dims = 1:30, verbose = FALSE)

“The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session”


## Cell type classification

In [8]:
# Load the query dataset whose cells will be labelled from the reference.
test <- LoadH5Seurat(test.file)

Validating h5Seurat file

Initializing RNA with data

Adding counts for RNA

Adding feature-level metadata for RNA

Adding command information

Adding cell-level metadata

“Invalid name supplied, making object name syntactically valid. New object name is antisense_readscell_barcode_fraction_bases_above_30_meancell_barcode_fraction_bases_above_30_variancecell_namesduplicate_readsemptydrops_FDRemptydrops_IsCellemptydrops_Limitedemptydrops_LogProbemptydrops_PValueemptydrops_Totalfragments_per_moleculefragments_with_single_read_evidencegenes_detected_multiple_observationsgenomic_read_quality_meangenomic_read_quality_variancegenomic_reads_fraction_bases_quality_above_30_meangenomic_reads_fraction_bases_quality_above_30_varianceinput_idmolecule_barcode_fraction_bases_above_30_meanmolecule_barcode_fraction_bases_above_30_variancemolecules_with_single_read_evidencen_fragmentsn_genesn_mitochondrial_genesn_mitochondrial_moleculesn_moleculesn_readsnoise_readspct_mitochondrial_moleculesperfect_cel

In [9]:
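# NOTE(review): normalization of the query is disabled here, although the
# reference is normalized before anchors are searched. Presumably the query file
# already stores normalized expression values -- confirm before relying on the
# transfer results.
# test <- NormalizeData(test, verbose = FALSE)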

In [10]:
# Find transfer anchors between the reference and the query, using the first
# 30 dimensions and the reference's existing "pca" reduction.
ref.anchors <- FindTransferAnchors(reference = ref, query =test, dims = 1:30, reference.reduction = "pca")

Projecting cell embeddings

Finding neighborhoods

Finding anchors

	Found 18155 anchors

Filtering anchors

	Retained 325 anchors



In [11]:
# Use the reference's modelC.id as the label to transfer, converted to
# character first.
ref$transfer_cell_type <- as.character(ref$modelC.id) # cell type = transfer id (int to char)
# Transfer the reference labels to the query cells through the anchors.
predictions <- TransferData(anchorset = ref.anchors, refdata = ref$transfer_cell_type,dims = 1:30)
# Attach the predictions to the query metadata; the evaluation below reads
# test$predicted.id.
test <- AddMetaData(test, metadata = predictions)

Finding integration vectors

Finding integration vector weights

Predicting cell labels



## Make Confusion Matrix and evaluate accuracy

In [12]:
# Cell-type names ordered by class id: the evaluation code indexes this vector
# with (id + 1), so element k + 1 names class id k.
label <- c(
    'helper T cell',        # 0
    'cytotoxic T cell',     # 1
    'memory B cell',        # 2
    'naive B cell',         # 3
    'plasma cell',          # 4
    'natural killer cell',  # 5
    'erythrocyte',          # 6
    'megakaryocyte',        # 7
    'monocyte',             # 8
    'dendritic cell',       # 9
    'HSPC'                  # 10
)

In [13]:
# predicted.id holds 0-based class ids stored as text; shift by one to look up
# the matching name in `label`.
pred.label <- label[as.integer(test$predicted.id) + 1]
y.pred <- factor(pred.label, levels = label)
# Ground truth from the query metadata; values that are not in `label` become NA.
y.true <- factor(test$transfer.cell.type, levels = label)

In [14]:
# Keep only the cells that have a ground-truth label, applying the same mask to
# both vectors so they stay aligned.
has.truth <- !is.na(y.true)
y.pred <- y.pred[has.truth]
y.true <- y.true[has.truth]

In [15]:
# Cross-tabulate predictions against the ground truth.
# caret::confusionMatrix() expects the predictions as `data` and the truth as
# `reference`; naming both keeps the per-class statistics stored in mat.table
# (sensitivity, precision, ...) attached to the right roles.
mat.table <- confusionMatrix(data = y.pred, reference = y.true)
# caret puts the predictions in rows; transpose so that rows are the true cell
# types and columns the predicted ones, as the row-percentage table expects.
mat.df <- as.data.frame.matrix(t(mat.table$table))
# Name the predicted-class columns by their 0-based class id, derived from
# `label` instead of a hard-coded range.
colnames(mat.df) <- seq_along(label) - 1L
mat.df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
helper T cell,0,0,0,0,0,0,0,0,0,0,0
cytotoxic T cell,0,33559,0,0,0,30,0,0,4,0,0
memory B cell,0,572,0,366,0,4,0,0,747,3,0
naive B cell,0,7675,29,9896,0,6,131,0,18684,20,0
plasma cell,0,0,0,0,0,0,0,0,0,0,0
natural killer cell,0,5295,0,17,0,9293,7,0,46,0,0
erythrocyte,0,683,0,15,0,853,1324,7,325,1,1638
megakaryocyte,0,1341,0,33,0,82,90,738,229,1,2
monocyte,0,154,0,0,0,1,1,10,28687,1,0
dendritic cell,0,3,0,0,0,1,0,0,1386,275,0


In [16]:
# Row-wise percentages of the confusion matrix. The small constant added to
# each row total avoids dividing by zero for cell types with no cells.
row.totals <- rowSums(mat.df) + 0.001
mat.df.percent <- round(mat.df / row.totals * 100, digits = 1)
mat.df.percent
# Overall accuracy: the share of cells whose predicted label equals the truth.
n.correct <- sum(y.true == y.pred)
print(paste('acc:', n.correct / length(y.true) * 100, '%'))

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
helper T cell,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
cytotoxic T cell,0,99.9,0.0,0.0,0,0.1,0.0,0.0,0.0,0.0,0.0
memory B cell,0,33.8,0.0,21.6,0,0.2,0.0,0.0,44.1,0.2,0.0
naive B cell,0,21.1,0.1,27.2,0,0.0,0.4,0.0,51.3,0.1,0.0
plasma cell,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
natural killer cell,0,36.1,0.0,0.1,0,63.4,0.0,0.0,0.3,0.0,0.0
erythrocyte,0,14.1,0.0,0.3,0,17.6,27.3,0.1,6.7,0.0,33.8
megakaryocyte,0,53.3,0.0,1.3,0,3.3,3.6,29.3,9.1,0.0,0.1
monocyte,0,0.5,0.0,0.0,0,0.0,0.0,0.0,99.4,0.0,0.0
dendritic cell,0,0.2,0.0,0.0,0,0.1,0.0,0.0,83.2,16.5,0.0


[1] "acc: 67.4139942864041 %"


In [17]:
# Print a per-class metric and its unweighted average.
#
# Arguments:
#   performance  metric function called as performance(y_true, y_pred, positive = <class>)
#                and returning a single number (e.g. Precision, Recall, F1_Score).
#   y_true       ground-truth labels; defaults to the notebook's `y.true`.
#   y_pred       predicted labels; defaults to the notebook's `y.pred`.
#   classes      class names to evaluate, in order; defaults to the notebook's `label`.
#
# Classes for which the metric is NaN (e.g. 0/0 because the class never occurs)
# are left out of both the listing and the average.
#
# Returns (invisibly, via print) the "<metric> (average): <value>" string.
evaluation_metrics <- function(performance, y_true = y.true, y_pred = y.pred,
                               classes = label) {
    # deparse() yields a single name even for calls such as pkg::fn, where
    # as.character(substitute(...)) would return several elements.
    performance.name <- paste(deparse(substitute(performance)), collapse = " ")

    # One score per class, computed without growing a vector inside a loop.
    scores <- vapply(
        classes,
        function(cls) as.numeric(performance(y_true, y_pred, positive = cls)),
        numeric(1),
        USE.NAMES = FALSE
    )
    scores <- scores[!is.nan(scores)]

    print(paste(c(paste(performance.name, ':'), scores)))
    print('-----------------------------------------------------------------')
    print(paste(performance.name, '(average):', mean(scores)))
}

In [18]:
# Per-class precision and its unweighted average; classes whose score is NaN
# are skipped.
evaluation_metrics(Precision)

 [1] "Precision :"       "0.680958564993304" "0"                
 [4] "0.958264742906943" "0.904868549172347" "0.852543464262717"
 [7] "0.977483443708609" "0.572503392671829" "0.913621262458472"
[10] "0"                
[1] "-----------------------------------------------------------------"
[1] "Precision (average): 0.651138157797136"


In [19]:
# Per-class recall and its unweighted average; classes whose score is NaN
# are skipped.
evaluation_metrics(Recall)

[1] "Recall :"          "0.998987884380675" "0"                
[4] "0.271562251310337" "0.633988265793423" "0.273215022699133"
[7] "0.293322734499205" "0.994212240937132" "0.165165165165165"
[1] "-----------------------------------------------------------------"
[1] "Recall (average): 0.453806695598134"


In [20]:
# Per-class F1 score and its unweighted average; classes whose score is NaN
# are skipped.
evaluation_metrics(F1_Score)

[1] "F1_Score :"        "0.809870286576169" "0.42319534724598" 
[4] "0.74558729139923"  "0.413814658540397" "0.451238153469887"
[7] "0.72660266963856"  "0.279755849440488"
[1] "-----------------------------------------------------------------"
[1] "F1_Score (average): 0.550009179472959"


In [21]:
# Report when the run started and finished, then the elapsed time.
time.end <- Sys.time()
print(paste('start time:', time.start, sep = " "))
print(paste('end   time:', time.end, sep = " "))
difftime(time.end, time.start)

[1] "start time: 2022-03-11 09:24:01"
[1] "end   time: 2022-03-11 10:52:13"


Time difference of 1.470056 hours

In [22]:
# !rm ../dataset/PBMC_CITE_modelC.h5ad
# !rm ../dataset/CordBlood_modelC.h5ad