In [1]:
library(ggseqlogo)
library(ggplot2)
library(rtracklayer)
library(BSgenome.Hsapiens.UCSC.hg38)
library(patchwork) 
library(reticulate) 

Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Loading required package: IRanges

Loading required package: GenomeInfoDb

Loading required package: BSgenome

Loading required package: Biostrings

L

In [2]:
# Colour scheme for the four nucleotides, passed to ggseqlogo() as `col_scheme`.
cs1 <- make_col_scheme(
    chars = c("A", "C", "G", "T"),
    cols = c("#109648", "#255C99", "#F7B32B", "#D62839")
)

In [3]:
# Draw a sequence logo for a 4-row matrix using the `cs1` colour scheme.
#
# m      : matrix with four rows; the row names are overwritten with A, C, G, T.
# ymin   : lower y-axis limit passed to ylim().
# ymax   : upper y-axis limit passed to ylim().
# title  : optional plot title; when NULL no title or title theme is added.
# method : forwarded to ggseqlogo() as `method`.
#
# Returns the ggplot object.
plot_seq <- function(m, ymin=0, ymax=2, title=NULL, method='custom') {
    rownames(m) <- c("A", "C", "G", "T")

    logo <- ggseqlogo(m, method = method, ncol = 1, col_scheme = cs1)
    logo <- logo + ylim(c(ymin, ymax))

    # Nothing more to add when no title was requested.
    if (is.null(title)) {
        return(logo)
    }

    # Centred, enlarged title.
    logo +
        ggtitle(title) +
        theme(plot.title = element_text(hjust = 0.5, size = 18))
}

In [4]:
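# NOTE(review): legacy single-function version kept for reference; the same steps
# appear to be split between get_mat1() and the plotting loop further down.
# plot_npy_shaps <- function(shap_path, seq_width, shift, title, fig_save_path, plot_negatives=TRUE) {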

#     np<-import("numpy") #datareading 
#     mat<-np$load(shap_path) 
#     dim(mat)

#     len <- 1364

#     ind_start <- as.integer((len - seq_width)/2) + shift
#     ind_end <- ind_start + seq_width - 1
#     ind_start
#     ind_end

#     mat1 <- t(drop(mat))[,ind_start:ind_end]
#     dimnames(mat1) <- list(c('A', 'C', 'G', 'T'))
#     mat1
    
#     ymax <- max(mat1)
#     ymin <- 0
#     if (plot_negatives) {
#         ymin <- -ymax
#     }

#     p = plot_seq(mat1, ymin=ymin, ymax=ymax, title=title)
#     ggsave(p, file=fig_save_path, height=2, width=20, useDingbats=F)
#     p
# }

In [5]:
# Load an array from a .npy file and return a window of its columns.
#
# shap_path : path to the .npy file; read with numpy through reticulate.
# seq_width : number of columns to keep.
# shift     : offset added to the window start.
# title, fig_save_path, plot_negatives, ymax :
#             not used by this function; kept so existing calls stay valid.
# len       : sequence length the window is positioned within. Defaults to
#             1364, the value that used to be hard-coded.
#
# Returns a matrix with rows named A, C, G, T and `seq_width` columns.
# Stops with an error if the requested window is empty or falls outside the
# loaded data (previously this silently returned a wrongly sized matrix).
get_mat1 <- function(shap_path, seq_width, shift, title, fig_save_path, plot_negatives=TRUE, ymax=0.1, len=1364) {

    np <- import("numpy")
    mat <- np$load(shap_path)

    # assumes the array squeezes to (positions x 4) so that the transpose
    # is 4 x positions -- TODO confirm against the files being loaded
    scores <- t(drop(mat))

    # NOTE(review): R indexes from 1, so this start is one column left of the
    # equivalent 0-based slice start; confirm which centring is intended.
    ind_start <- as.integer((len - seq_width) / 2) + shift
    ind_end <- ind_start + seq_width - 1

    if (seq_width < 1 || ind_start < 1 || ind_end > ncol(scores)) {
        stop(
            sprintf(
                "window [%s, %s] is empty or outside the %s available columns",
                ind_start, ind_end, ncol(scores)
            ),
            call. = FALSE
        )
    }

    # drop = FALSE keeps a one-column window as a matrix so dimnames<- works.
    mat1 <- scores[, ind_start:ind_end, drop = FALSE]
    dimnames(mat1) <- list(c('A', 'C', 'G', 'T'))
    mat1
}

In [6]:
# Sample identifiers to process, listed in processing order.
all_samples <- c(
    "BRCA_35924", "BRCA_25566", "BRCA_34128",
    "COAD_34825", "COAD_38715", "COAD_18844", "COAD_27088", "COAD_10769",
    "BLCA_10605", "BLCA_1242",
    "LUAD_12047", "LUAD_3544", "LUAD_6551",
    "COAD_AI"
)
all_samples

In [8]:
# For every sample and flank size, load the reference and alternate score
# windows, then save one logo PDF per allele, once with a symmetric y-axis
# (plot_negatives = TRUE) and once with the y-axis starting at 0.
shift <- 0
for (sample_name in all_samples) {
    for (flank in c(25, 50, 100, 250)) {

        seq_width <- flank * 2

        # The matrices and ymax do not depend on plot_negatives, so they are
        # loaded once per (sample, flank) instead of once per plot variant.

        ### get ref
        shap_path_ref <- sprintf("/illumina/scratch/deep_learning/akumar22/TCGA/mutation_prioritization/mutation_vignette/shap_scores/%s_mutation_centered_ref.npy", sample_name)
        mat1 <- get_mat1(shap_path_ref, seq_width, shift)

        ### get alt
        shap_path_alt <- sprintf("/illumina/scratch/deep_learning/akumar22/TCGA/mutation_prioritization/mutation_vignette/shap_scores/%s_mutation_centered_alt.npy", sample_name)
        mat2 <- get_mat1(shap_path_alt, seq_width, shift)

        # Shared upper limit so the ref and alt logos are on the same scale.
        ymax <- max(mat1, mat2)

        for (plot_negatives in c(TRUE, FALSE)) {

            print(sample_name)
            print(flank)
            print(plot_negatives)

            ### get y limits
            ymin <- 0
            if (plot_negatives) {
                ymin <- -ymax
            }
            print(ymax)
            print(ymin)

            # filename/plot are named explicitly rather than relying on
            # partial matching of `file` to ggsave()'s `filename` argument.
            fig_save_path_ref <- sprintf("/illumina/scratch/deep_learning/akumar22/TCGA/mutation_prioritization/mutation_vignette/shap_scores/%s_ref_%s_%s_R.pdf", sample_name, seq_width, plot_negatives)
            p <- plot_seq(mat1, ymin = ymin, ymax = ymax, title = sprintf("%s_ref", sample_name))
            ggsave(filename = fig_save_path_ref, plot = p, height = 2, width = 20, useDingbats = FALSE)

            fig_save_path_alt <- sprintf("/illumina/scratch/deep_learning/akumar22/TCGA/mutation_prioritization/mutation_vignette/shap_scores/%s_alt_%s_%s_R.pdf", sample_name, seq_width, plot_negatives)
            p <- plot_seq(mat2, ymin = ymin, ymax = ymax, title = sprintf("%s_alt", sample_name))
            ggsave(filename = fig_save_path_alt, plot = p, height = 2, width = 20, useDingbats = FALSE)
        }
    }
}
    

[1] "BRCA_35924"
[1] 25
[1] TRUE
[1] 0.05112726
[1] -0.05112726
[1] "BRCA_35924"
[1] 25
[1] FALSE
[1] 0.05112726
[1] 0
[1] "BRCA_25566"
[1] 25
[1] TRUE
[1] 0.02898708
[1] -0.02898708
[1] "BRCA_25566"
[1] 25
[1] FALSE
[1] 0.02898708
[1] 0
[1] "BRCA_34128"
[1] 25
[1] TRUE
[1] 0.04279521
[1] -0.04279521
[1] "BRCA_34128"
[1] 25
[1] FALSE
[1] 0.04279521
[1] 0
[1] "COAD_34825"
[1] 25
[1] TRUE
[1] 0.04614326
[1] -0.04614326
[1] "COAD_34825"
[1] 25
[1] FALSE
[1] 0.04614326
[1] 0
[1] "COAD_38715"
[1] 25
[1] TRUE
[1] 0.05779655
[1] -0.05779655
[1] "COAD_38715"
[1] 25
[1] FALSE
[1] 0.05779655
[1] 0
[1] "COAD_18844"
[1] 25
[1] TRUE
[1] 0.04355817
[1] -0.04355817
[1] "COAD_18844"
[1] 25
[1] FALSE
[1] 0.04355817
[1] 0
[1] "COAD_27088"
[1] 25
[1] TRUE
[1] 0.05768721
[1] -0.05768721
[1] "COAD_27088"
[1] 25
[1] FALSE
[1] 0.05768721
[1] 0
[1] "COAD_10769"
[1] 25
[1] TRUE
[1] 0.03282704
[1] -0.03282704
[1] "COAD_10769"
[1] 25
[1] FALSE
[1] 0.03282704
[1] 0
[1] "BLCA_10605"
[1] 25
[1] TRUE
[1] 0.1050216
[