In [1]:
library(ggplot2)
library(dplyr)
#setwd("/fast/AG_Haghverdi/Carla_Moelbert/Cell_annotation/Scripts/")
get_combinations <- function(files, size, seed){
    set.seed(seed)
    return(sample(files, size = size))
}

translate <- function(vector){
    vector <- stringr::str_replace_all(vector, "\\.\\.", "+ ")
    vector <- stringr::str_replace_all(vector, "\\.", " ")
    return(vector)
}

readSeurat <- function(file){
    seurat <- read.csv(file, sep="\t")
    seurat$prediction.score.max <- NULL
    seurat <- seurat[, stringr::str_detect(colnames(seurat), "prediction.score.")]
    colnames(seurat) <- stringr::str_replace(colnames(seurat), "prediction.score.", "")
    colnames(seurat) <- translate(colnames(seurat))
    seurat[is.na(seurat)] <- 0
    return(seurat)
}

readSCN <- function(file){
    scn <- read.csv(file, sep="\t")
    rownames(scn) <- scn$id 
    scn <- scn[, !colnames(scn) %in% c("id", "predicted", "nGene", "nUMI", "percent.mito",
                                       "Cluster", "class_", "Experiment", "Method",
                                       "prediction.match")]
    colnames(scn) <- translate(colnames(scn))
    scn[is.na(scn)] <- 0
   
    return(as.data.frame(scn))
}

readSingleR <- function(file){
    singler <- read.csv(file, sep="\t")
    rownames(singler) <- singler$id 
    
    singler <- singler[, stringr::str_detect(colnames(singler), "scores.")]
    singler <- singler[, !stringr::str_detect(colnames(singler), "tuning")]
    colnames(singler) <- stringr::str_replace(colnames(singler), "scores.", "")
    colnames(singler) <- translate(colnames(singler))
    singler[is.na(singler)] <- 0
    return(singler)
}

readItclust <- function(file){

    itclust <- read.csv(paste(sep="/", file, "results/clustering_prob.csv"))
    ids <- read.csv(paste(sep="/", file, "results/celltype_assignment.txt"), header=F)
    rownames(itclust) <- itclust$X
    itclust$X <- NULL

    ids[c("x", 'Cluster', 'x', "confidence", "x", "x", "type")] <-  stringr::str_split_fixed(ids$V1, ' ', 7)
    ids$type <- unlist(lapply(ids$type, function(type) substr(type, 1, nchar(type)-5)))
    ids$Cluster <- paste0("cluster", ids$Cluster)

    colnames(itclust) <- ids$type[ids$Cluster ==colnames(itclust)]
    itclust[is.na(itclust)] <- 0
    return(itclust)

}
                              
filter_set<- function(set, cutoff=NULL){
    set[is.na(set)] <- 0
    if(!is.null(cutoff)) set[set < cutoff]<- 0
    return(set)
} 
                              
summarize_binary_prediction <- function(set){
   x <- which.max(table(set))
   return(set[x]) 
}

get_binary_prediction <- function(set){
    predictions <-  apply(set, 1, which.max)
    predictions <- colnames(set)[predictions]
    names(predictions) <- rownames(set)
    
    return(predictions)
}

filter_data <- function(data, cutoff){
    
    data[data < cutoff] <- 0
    return(data)
}
get_confidence_df <- function(subsets, nr_sets){
    combs <- Reduce('+', subsets)
    combs <- combs / nr_sets                    
    prediction <- apply(combs, 1, which.max)
    prediction <- colnames(combs)[prediction] 
    return(prediction)
}
get_predicton <- function(sets, combinations, method, set, cutoff=0.25){
    nr_sets <- length(combinations)
    subsets <- sets[names(sets) %in% combinations]
    if(method != "CellID") prediction <- sapply(subsets, function(set) get_binary_prediction(set)) 
    else { 
        prediction <- Reduce(function(x, y) merge(x, y, by="id", all=TRUE), subsets)
        rownames(prediction) <- prediction$id
        prediction <- prediction[colnames(prediction) != "id",]
                             }                       
    prediction_binary <- apply(prediction, 1, summarize_binary_prediction)

    df <- data.frame(id=rownames(prediction), method=method,
                     set=set, prediction_binary = prediction_binary)   
    return(df)
}
                             
readCellID <- function(id){
    file <- list.files("/fast/AG_Haghverdi/Carla_Moelbert/Celltype_annotation/Data/Predictions/CellID/",
                                                     pattern = paste0(id, ".txt"),
                                                     full.names = T)
    data <- read.csv(file, sep="\t")
    data <- data[, c("id", "predicted")]
    colnames(data)[2] <- id
    return(data)
}                             


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
print("Start....")
size=20
cutoff = 0.3
sequence <- seq(1,20,1)
                   
get_bootstrap_df <- function(pattern, path,  size=20, sequence = seq(1,20,1)){
    files <- list.files(paste(sep="/", path, "ItClust"), pattern = pattern)
    
    combinations <- lapply(sequence, function(seed) get_combinations(files, size, seed))
 
    print("Get Seurat...")
    files_seurat <- lapply(files, function(file) list.files(paste(sep="/", path, "Seurat"),
                                                            pattern = paste0(file, "_predictions.txt"),
                                                            full.names = T))
              
    sets_seurat  <- lapply(files_seurat, function(file) readSeurat(file))
    names(sets_seurat)<- files
    seurat_pred <- do.call(rbind,lapply(sequence, function(seq) get_predicton(sets_seurat,
                                                                       unlist(combinations[seq]),
                                                                       "Seurat", seq, cutoff)))
    sets_cellid <- lapply(files, function(file) readCellID(file))
    names(sets_cellid) <- files
    cellid_pred <- do.call(rbind,lapply(sequence, function(seq) get_predicton(sets_cellid,
                                                                       unlist(combinations[seq]),
                                                                       "CellID", seq, cutoff)))
                                   
    print("Get SCN...")                                    
    files_scn <- lapply(files, function(file) list.files(paste(sep="/", path, "SingleCellNet"),
                                                     pattern = paste0(file, ".txt"),
                                                     full.names = T))
    sets_scn  <- lapply(files_scn, function(file) readSCN(file))
    names(sets_scn)<- files
    scn_pred <- do.call(rbind,lapply(sequence, function(seq) get_predicton(sets_scn,
                                                                       unlist(combinations[seq]),"SingleCellNet", seq)))

      print("Get SingleR")
    files_singler <- lapply(files, function(file) list.files(paste(sep="/", path, "SingleR"),
                                                         pattern = paste0(file, ".txt"), 
                                                         full.names = T))
    sets_singler  <- lapply(files_singler, function(file) readSingleR(file))
    names(sets_singler )<- files
    singler_pred <- do.call(rbind,lapply(sequence, function(seq) get_predicton(sets_singler,
                                                                       unlist(combinations[seq]),
                                                                       "SingleR", seq)))
  print("Get ItClust...")
files_itclust <- paste(sep="/", path, "ItClust", files)
sets_itclust  <- lapply(files_itclust, function(file) readItclust(file))
names(sets_itclust )<- files
itclust_pred <- do.call(rbind,lapply(sequence, function(seq) get_predicton(sets_itclust,
                                                                       unlist(combinations[seq]),
                                                                       "ItClust", seq)))
itclust_pred$id <- stringr::str_replace(itclust_pred$id, "-target", "")
  predictions <- do.call(rbind, list(seurat_pred, scn_pred, singler_pred, itclust_pred, cellid_pred))
                            
 
}

[1] "Start...."


In [3]:
ref <- read.csv("/fast/AG_Haghverdi/Carla_Moelbert/Celltype_annotation/Data/Fulldata/PBMC_Query/meta.csv")
ref <- ref[,c("id", "class_")]
head(ref)
nrow(ref)

Unnamed: 0_level_0,id,class_
Unnamed: 0_level_1,<chr>,<chr>
1,pbmc2_SM2_Cell_1,CD14+ monocyte
2,pbmc2_SM2_Cell_102,CD14+ monocyte
3,pbmc2_SM2_Cell_118,CD14+ monocyte
4,pbmc2_SM2_Cell_127,CD14+ monocyte
5,pbmc2_SM2_Cell_131,CD14+ monocyte
6,pbmc2_SM2_Cell_133,CD14+ monocyte


In [4]:
data <- merge(ref, bootstrap, by=c("id"), all=T)
data$match_mosaic <- data$class_ == data$bootstrap_mosaic
data$match_mosaic[data$match_mosaic == TRUE] <- 1

data$match_mono <- data$class_ == data$bootstrap_mono
data$match_mono[data$match_mono == TRUE] <- 1

summary <- data %>% 
           dplyr::group_by(method, class_, id) %>% 
dplyr::summarize(bootstrap_mono= mean(match_mono),
                 bootstrap_mosaic= mean(match_mosaic)) 

print(head(summary))

ERROR: Error in as.data.frame(y): object 'bootstrap' not found


In [9]:
write.table(data,
            "/fast/AG_Haghverdi/Carla_Moelbert/Celltype_annotation/Results/Files/bootstrap_summary.csv",
           col.names=T, row.names=F, sep=",", quote=T)

In [10]:
data <- read.csv("/fast/AG_Haghverdi/Carla_Moelbert/Celltype_annotation/Results/Files/results_general.csv")

In [11]:
full <- data[, stringr::str_detect(colnames(data), "3090")]
full$class_ <- data$class_
full$id <- data$id
full$tech <- data$Method
full <- reshape2::melt(full,id=c("class_", "id", "tech"), value.name = "full")
full[c('reference', 'method', "size", "set")] <- stringr::str_split_fixed(full$variable, '_', 4)
full <- full[, c("id", "class_", "method", "full", "tech")]

In [12]:
individual <- data[, stringr::str_detect(colnames(data), "_100_")]
individual$class_ <- data$class_
individual$id <- data$id
individual <- reshape2::melt(individual,id=c("class_", "id"), value.name = "individual")
individual[c('reference', 'method', "size", "set")] <- stringr::str_split_fixed(individual$variable, '_', 4)
individual <- individual[, c("id", "class_", "method", "individual")]


individual$match <- individual$class_ == individual$individual
individual$match[individual$match == TRUE] <- 1

ind<- individual %>% 
           dplyr::group_by(method, class_, id) %>% 
dplyr::summarize(individual= mean(match)) 

[1m[22m`summarise()` has grouped output by 'method', 'class_'. You can override using the `.groups` argument.


method,class_,id,individual
<chr>,<chr>,<chr>,<dbl>
CellID,B cell,pbmc2_10X_V2_AAACCTGAGATGGGTC,1.0
CellID,B cell,pbmc2_10X_V2_AAACCTGAGCGTAATA,0.9333333
CellID,B cell,pbmc2_10X_V2_AAACCTGTCTTGTACT,0.1433333
CellID,B cell,pbmc2_10X_V2_AAACGGGAGTGCAAGC,0.0
CellID,B cell,pbmc2_10X_V2_AAACGGGCATTATCTC,0.7933333
CellID,B cell,pbmc2_10X_V2_AAACGGGGTCCAACTA,0.0


In [13]:
umap_data <- merge(full, ind, by=c("id", "class_", "method"))
umap_data <- merge(summary, umap_data, by=c("id", "class_", "method"))
head(umap_data)

Unnamed: 0_level_0,id,class_,method,bootstrap_mono,bootstrap_mosaic,full,tech,individual
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
1,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,ItClust,1,1.0,B cell,10x,1.0
2,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,SCN,1,1.0,B cell,10x,1.0
3,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,Seurat,1,1.0,B cell,10x,1.0
4,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,SingleR,1,1.0,B cell,10x,1.0
5,pbmc2_10X_V2_AAACCTGAGCGTAATA,B cell,CellID,0,0.15,B cell,10x,0.9333333
6,pbmc2_10X_V2_AAACCTGAGCGTAATA,B cell,ItClust,1,1.0,B cell,10x,1.0


In [14]:
umap_data$mono_dif <- umap_data$bootstrap_mono - umap_data$individual
umap_data$mosaic_dif <- umap_data$bootstrap_mosaic - umap_data$bootstrap_mono
head(umap_data)

Unnamed: 0_level_0,id,class_,method,bootstrap_mono,bootstrap_mosaic,full,tech,individual,mono_dif,mosaic_dif
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
1,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,ItClust,1,1.0,B cell,10x,1.0,0.0,0.0
2,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,SCN,1,1.0,B cell,10x,1.0,0.0,0.0
3,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,Seurat,1,1.0,B cell,10x,1.0,0.0,0.0
4,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,SingleR,1,1.0,B cell,10x,1.0,0.0,0.0
5,pbmc2_10X_V2_AAACCTGAGCGTAATA,B cell,CellID,0,0.15,B cell,10x,0.9333333,-0.9333333,0.15
6,pbmc2_10X_V2_AAACCTGAGCGTAATA,B cell,ItClust,1,1.0,B cell,10x,1.0,0.0,0.0


In [21]:
getVectors <- function(method, data){

    summary <- data[data$method == method,]  %>% 
           dplyr::group_by(id, class_, tech, full, curated) %>% 
           dplyr::summarize("mono" = mean(bootstrap_mono),
                            "mosaic" = mean(bootstrap_mosaic),
                            "individual" = mean(individual),
                            "mono_dif" = mean(mono_dif),
                            "mosaic_dif" = mean(mosaic_dif))
    colnames(summary) <- c("id", "class","tech", paste(sep="_", "full", method) ,
                           paste(sep="_", "curated", method),
                           paste(sep="_","mono", method),
                           paste(sep="_","mosaic", method),
                           paste(sep="_","individual", method),
                           paste(sep="_","mono_dif", method),
                           paste(sep="_","mosaic_dif", method))
    return(summary)
}

getVectors_set <- function(method, data){

    summary <- data[data$method == method,]  %>% 
           dplyr::group_by(set, class_, tech, full, curated) %>% 
           dplyr::summarize("mono" = mean(bootstrap_mono),
                            "mosaic" = mean(bootstrap_mosaic),
                            "individual" = mean(individual),
                            "mono_dif" = mean(mono_dif),
                            "mosaic_dif" = mean(mosaic_dif))
    colnames(summary) <- c("id", "class","tech", paste(sep="_", "full", method) ,
                           paste(sep="_", "curated", method),
                           paste(sep="_","mono", method),
                           paste(sep="_","mosaic", method),
                           paste(sep="_","individual", method),
                           paste(sep="_","mono_dif", method),
                           paste(sep="_","mosaic_dif", method))
    return(summary)
}

In [16]:
data <- read.csv("/fast/AG_Haghverdi/Carla_Moelbert/Celltype_annotation/Results/Files/results_curated_PBMC.csv")
full <- data[, stringr::str_detect(colnames(data), "3090")]
full$class_ <- data$class_
full$id <- data$id
full$tech <- data$Method
full <- reshape2::melt(full,id=c("class_", "id", "tech"), value.name = "full")
full[c('reference', 'method', "size", "set")] <- stringr::str_split_fixed(full$variable, '_', 4)
full <- full[, c("id", "class_", "method", "full", "tech")]

colnames(full)[4] <- "curated"
umap_data <- merge(umap_data, full, by=c("id", "class_","method", "tech"),all =TRUE)
head(umap_data)

Unnamed: 0_level_0,id,class_,method,tech,bootstrap_mono,bootstrap_mosaic,full,individual,mono_dif,mosaic_dif,curated
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<chr>
1,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,CellID,10x,,,,,,,B cell
2,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,ItClust,10x,1.0,1.0,B cell,1.0,0.0,0.0,
3,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,SCN,10x,1.0,1.0,B cell,1.0,0.0,0.0,B cell
4,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,Seurat,10x,1.0,1.0,B cell,1.0,0.0,0.0,B cell
5,pbmc2_10X_V2_AAACCTGAGATGGGTC,B cell,SingleR,10x,1.0,1.0,B cell,1.0,0.0,0.0,B cell
6,pbmc2_10X_V2_AAACCTGAGCGTAATA,B cell,CellID,10x,0.0,0.15,B cell,0.9333333,-0.9333333,0.15,B cell


In [18]:
methods <- c("Seurat",  "SingleR","CellID", "SCN", "ItClust")
df <- lapply(methods, function(method) getVectors(method, umap_data))
df <- Reduce(function(x, y) merge(x, y, by=c("id", "class", "tech")),df)
rownames(df)<- df$id

[1m[22m`summarise()` has grouped output by 'id', 'class_', 'tech', 'full'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'id', 'class_', 'tech', 'full'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'id', 'class_', 'tech', 'full'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'id', 'class_', 'tech', 'full'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'id', 'class_', 'tech', 'full'. You can override using the `.groups` argument.


In [19]:
write.tableble(df, "../Results/Files/umap_data.csv", col.names=T, row.names=T, quote=T, sep=",")

In [None]:
stringr::str_s