## 01. Load mgatk variants (and filter if needed)

In [1]:
name = "d1"
donor = 1
cells_f = "all" #"/data/isshamie/mito_lineage/output/pipeline/cd34norm/MTblacklist/data/merged/MT/cellr_True/numread_200/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/mgatk/vireoIn/clones/variants_mgatkdonor/vireo/donor1/mgatk_donor/cells_meta.tsv" 
outdir =  "/data/isshamie/mito_lineage/output/pipeline/cd34norm/MTblacklist/data/merged/MT/cellr_True/numread_200/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/mgatk/vireoIn/clones/variants_mgatkdonor/knn/kparam_3/donor1"
mgatk_in =  "/data/isshamie/mito_lineage/output/pipeline/cd34norm/MTblacklist/data/merged/MT/cellr_True/numread_200/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/mgatk/vireoIn//clones/variants_mgatkdonor/donor1/mgatk_donor/d1.af.tsv" 
kparam = 3
cells_col_val = 1
#mgatk_in_full = "/data/isshamie/mito_lineage/output/pipeline/cd34norm/MTblacklist/data/merged/MT/cellr_True/numread_200/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/mgatk/vireoIn//clones/variants_mgatkdonor/donor1/mgatk_donor/d1.variant.rds" 

# Cluster parameters
#k.param <- 10
# vars_f <- "filter" #filter, which uses additional filter parameters, or a tsv file where first column is variant names
#cells_f <- "all" # all, or a cell tsv file where first column are the cell names
cells_col <- "donor"
cell_id_col <- "ID"


# # Additional parameters if either vars or cells are do not have a file associated
# n_cells_conf_detected <- 5
# strand_correlation <- 0.65 
# log10_vmr <- -2 
# mean_coverage <- 20


In [2]:
# SEall <- readRDS(mgatk_in) #"/data2/mito_lineage/data/processed/mttrace/TcellDupi_may17_2021/MTblacklist/pre/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/filter_mgatk/pre.signac.rds")
# SEall

## Subset on cells if given

In [3]:
modify_SEall <- function(se, cells){
    ## Subset cells before calling variants
    se$counts = se$counts[,which(rownames(se$depth) %in% cells)]
    se$depth <- se$depth[which(rownames(se$depth) %in% cells), ]
    return(se)
}

rowVars <- function(x, ...) {
  Matrix::rowSums((x - Matrix::rowMeans(x, ...))^2, ...)/(dim(x)[2] - 1)
}

In [4]:
if(cells_f != "all"){
    SEall <- readRDS(stringr::str_replace(mgatk_in, ".variant.rds", ".signac.rds"))
    cells <- read.table(cells_f, row.names=cell_id_col, sep="\t", header=T)
    if(cells_col != ""){
        cells <- cells[, cells_col==cells_col_val]
    }
    SEall <- SEall[rownames(SEall$depth) %in% rownames(cells), ]

    cells <- rownames(SEall$depth)
    se <- modify_SEall(SEall, cells)

    se_mean <- (Matrix::rowMeans(se$counts))
    se_var <- rowVars(se$counts)
    se_vmr <- se_var/(se_mean + 0.00000000001)
}

### Load Libraries

In [5]:
library(data.table)
library(dplyr)
library(SummarizedExperiment)
library(Matrix)
library(BuenColors)

library(future)
plan()
#plan("multiprocess", workers = workers)
options(future.globals.maxSize = 8000 * 1024^2)

library(Seurat)
library(Signac)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘matrixStats’


The following object is masked _by_ ‘.GlobalEnv’:

    rowVars


The following object is masked from ‘package:dplyr’:

    count



Attaching package: ‘MatrixGenerics’


The following object is masked _by_ ‘.GlobalEnv’:

    rowVars


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRange

Attaching SeuratObject


Attaching package: ‘Seurat’


The following object is masked from ‘package:SummarizedExperiment’:

    Assays




## 02. Cluster using Seurat FindNeighbors (S-NN graph ) and FindClusters (louvain)

In [6]:
getNN <- function(mat_af, k.param = 10){
  set.seed(1)
  rownames(mat_af) <- make.unique(rownames(mat_af))
  obj <- FindNeighbors(mat_af, k.param = k.param, annoy.metric = "cosine")
  obj
}

# Function to use Seurat functions to call mitochondrial cluster clones
seuratSNN_cosineDistance <- function(obj, resolution){
  clusters <- FindClusters(object = obj$snn, resolution = resolution)
  return(as.character(clusters[,1]))
}


In [7]:
plot_af_clusters <- function(afin, cluster_name){
    mdf <- data.frame(cluster_name, t(afin), check.names=F) %>%
    group_by(cluster_name) %>% dplyr::filter(n() >= 5) %>% summarize_all(.funs = mean)

    melt_df <- reshape2::melt(mdf, id.vars = "cluster_name")

    # This series of arrangements is for aesthetics
    melt_df2 <- melt_df %>% mutate(g5 = value >= 0.05, g1 = value >= 0.01, sqrt = sqrt(value)) %>%  group_by(variable) %>%
    mutate(total_g1 = sum(g1)) %>% mutate(total_g5 = sum(g5)) %>% 
    arrange(desc((g5)), desc((g1))) %>% dplyr::filter(total_g1 > 0) 

    # More aesthetics
    melt_df2$cluster_name <- factor(as.character(melt_df2$cluster_name ), levels = rev(unique( as.character(melt_df2$cluster_name))))
    melt_df2$variable <- factor(as.character(melt_df2$variable ), levels = (unique( as.character(melt_df2$variable))))
    melt_df2$value <- ifelse(melt_df2$value > 0.05, 1, melt_df2$value)
    melt_df2$value <- ifelse(melt_df2$value < 0.005, 0, melt_df2$value)
    

    # Visualize heatmap of clusters x variants number cells > 0.01
    p.numvar1 <- ggplot(melt_df2 , aes(x = variable, y = cluster_name, fill = total_g5)) +
    geom_tile() +
    scale_fill_gradientn(colors = jdb_palette("solar_rojos")) +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank()) +
    theme(axis.title.y=element_blank(),
          axis.text.y=element_blank(),
          axis.ticks.y=element_blank()) +
    xlab("Variant") +
    ggtitle("Total number of cells greater than 0.05 in variant") +
    L_border() #+
    #theme(legend.position = "none")
    ggsave(file.path(outdir, paste0(name,".variants.g5.png")))
    #ggsave(file.path(outdir, "variants.png"))
    #cowplot::ggsave(p1, file = paste0("../plots/mutations_clones_",what,"_invivo_grid.pdf"), width = 3.2, height = 1.5)
    print(length(unique(melt_df2$variable)))
    print(length(unique(melt_df2$cluster_name)))

    # Visualize heatmap of clusters x variants
    p.bin <- ggplot(melt_df2 , aes(x = variable, y = cluster_name, fill = value)) +
    geom_tile() +
    scale_fill_gradientn(colors = jdb_palette("solar_rojos")) +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank()) +
    theme(axis.text.y=element_blank(),
          axis.ticks.y=element_blank()) +
    xlab("Variant") +
    ylab("Cluster") +
    ggtitle("Variant avg greater than 0.05 set to 1 and <0.005 to 0") +
    L_border() #+
           
    #theme(legend.position = "none")
    print("Number of variants")
    print(length(unique(melt_df2$variable)))
    print("Number of clusters")
    print(length(unique(melt_df2$cluster_name)))
    ggsave(file.path(outdir, paste0(name, ".variants.binary.png")))
    
    p.varmean <- ggplot(melt_df2 , aes(x = variable, y = cluster_name, fill = sqrt)) +
    geom_tile() +
    scale_fill_gradientn(colors = jdb_palette("solar_rojos")) +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank()) +
    theme(axis.title.y=element_text("Cluster"),
          axis.text.y=element_blank(),
          axis.ticks.y=element_blank()) +
    xlab("Variant") +
    ylab("Cluster") +
    ggtitle("Sqrt Mean AF in each cluster") +
    L_border() #+
    ggsave(file.path(outdir, paste0(name,".variants.labels.png")))
    #theme(legend.position = "none")

    return(c(melt_df2, p.numvar1, p.varmean))
}

In [8]:
run_clones <- function(SE, curr_out, k.param){
    afin <- data.matrix(assays(SE)[["allele_frequency"]])
    # Since the PBMCs alone undercalled the mutations, we will examine the mutations called in the PBMCs and CD34s together
    obj <- getNN(t(sqrt(afin)), k.param=k.param)
    clusters <- seuratSNN_cosineDistance(obj, resolution = 3.5) 
    cluster_name <- stringr::str_pad(as.character(clusters), 3, pad = "0")
    table(cluster_name)
    saveRDS(clusters, file.path(curr_out, "clusters.rds"))
    saveRDS(obj, file.path(curr_out, "af.dimRed.rds"))
    cell_clusters <- data.frame(ID=colnames(afin), lineage=cluster_name, donor=donor)
    write.table(cell_clusters, file.path(curr_out, "cells_meta.tsv"), row.names=F, sep="\t", quote=F)
    out <- plot_af_clusters(afin, cluster_name)
    melt_df2 <- out[[1]]
    write.table(melt_df2, file.path(curr_out, "var_clusters.tsv"), row.names=F, sep="\t", quote=F)
    return(out)
}

# Run workflow

In [9]:
mgatk <- read.table(mgatk_in, sep="\t", header=T, check.names=F) 
SE <- readRDS(stringr::str_replace(mgatk_in, ".af.tsv",".variant.rds"))
#SE <- readRDS(mgatk_in_full)
print("before filter")
print(dim(assays(SE)[["allele_frequency"]]))
print("after filter")
SE <- SE[which(rownames(rowData(SE)) %in% rownames(mgatk)), which(colnames(assay(SE, "allele_frequency")) %in% colnames(mgatk))]

[1] "before filter"
[1] 49704  3750
[1] "after filter"


In [10]:
#colnames(SE) <- lapply(colnames(SE), function(x) paste0(x, name))
print("clones w all cells")
#curr_SE <- SE #filt_vars(SE, cells_f, vars_f)
out <- run_clones(SE, outdir, k.param=kparam)

[1] "clones w all cells"


Computing nearest neighbor graph

Computing SNN



Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 3750
Number of edges: 409549

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.0776
Number of communities: 517
Elapsed time: 1 seconds


458 singletons identified. 59 final clusters.

Saving 6.67 x 6.67 in image



[1] 99
[1] 44
[1] "Number of variants"
[1] 99
[1] "Number of clusters"
[1] 44


Saving 6.67 x 6.67 in image

Saving 6.67 x 6.67 in image

