In [None]:
import sys
import pandas as pd
from seq2atac.stable import one_hot_encode, bed_to_numpy, read_pickle
from seq2atac.stable.models import model_name_to_fn
from seq2atac.analysis.enrichment_utils import get_alt_sequence
from seq2atac.analysis.mutation_utils import search_names_in_vierstra_group
from pybedtools import BedTool
from pyfaidx import Fasta
import numpy as np
from pyfaidx import Fasta
from tqdm import tqdm
from copy import deepcopy

import os
from seq2atac.analysis.shap_utils import plot_summit_centered, plot_mutation_centered, predict_classification_proba
import glob
from seq2atac.stable.models.convolutional import get_bpnet_model

In [None]:

### get model
# Build the network once; the per-fold weight files are globbed in later cells
# and handed to score_classification together with this model object.
# NOTE(review): 1364 is the same value later passed to get_refalt_sequence and
# appears in the weights directory name; the meaning of 8 is not visible in
# this notebook -- confirm against get_bpnet_model.
model = get_bpnet_model(1364,8)


In [None]:
# Table of mutations for all cancers (ISM results, per the path); the first CSV
# column is used as the row index. Rows are later filtered by their "sample" column.
all_mutations_csv = "/illumina/scratch/deep_learning/nravindra/tmp/ism/results/genomewide/all_cancers_all_mutations.csv"
all_mutations = pd.read_csv(all_mutations_csv, index_col=0)

def merge_ism_scores(somatic_df,ism_df):
    """Left-join ``ism_df`` onto ``somatic_df`` and return the merged frame.

    No join key is given, so pandas joins on every column name the two
    frames have in common.

    Raises:
        AssertionError: if the merged frame does not have the same number of
            rows as the input ``somatic_df``.
    """
    n_rows_in = len(somatic_df)

    merged = somatic_df.merge(ism_df,how="left")

    # a left join must neither drop nor duplicate rows of the somatic table
    assert merged.shape[0] == n_rows_in

    return merged

In [None]:
# Per-cancer ISM thresholds (per the file name), looked up by lower-case cancer code.
ism_thresh = read_pickle('/illumina/scratch/deep_learning/nravindra/results/reg_diffs/motifism_cancers_thresh_v22.pkl')

# Register additional cancer codes that reuse the entry of an existing code.
for alias, source in (("reca", "kirc"), ("mela", "skcm")):
    ism_thresh[alias] = ism_thresh[source]

ism_thresh

In [None]:
### get fasta file
# Reference genome (GRCh38, per the file name) opened with pyfaidx; fasta_seq is
# passed to get_refalt_sequence in later cells to extract sequences.
fasta_file = '/illumina/scratch/deep_learning/lsundaram/singlecelldatasets/TCGA/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta'
fasta_seq=Fasta(fasta_file)

In [None]:
# Annotated somatic-mutation table per cancer, keyed by upper-case cancer name.
somatic_df_dict = {}

for cancer_name in ["BRCA","BLCA","LUAD","COAD"]:
    print(cancer_name)

    somatic_df = read_pickle(f"/illumina/scratch/deep_learning/akumar22/TCGA/mutation_prioritization/matching_experiments/{cancer_name}_somatic_annotated.pkl")
    print(somatic_df.shape)

    # attach the ISM rows whose "sample" equals the lower-cased cancer name;
    # merge_ism_scores asserts that the row count is unchanged
    somatic_df = merge_ism_scores(somatic_df,all_mutations[all_mutations["sample"]==cancer_name.lower()])
    print(somatic_df.shape)
    # NOTE(review): ism_es is not used in the cells visible here; the lookup is
    # kept so a missing threshold entry still fails at this point.
    ism_es = ism_thresh[cancer_name.lower()]

    # keep rows with ctcf_uncleaned == 0; .copy() makes the filtered frame
    # independent so the column assignment below does not write into a slice
    # of the merged frame (avoids SettingWithCopyWarning)
    somatic_df = somatic_df[somatic_df["ctcf_uncleaned"]==0].copy()
    print(somatic_df.shape)

    # NOTE(review): the column is named "ap1_fox_indicator" but the names
    # searched for are "AP1/" and "ATF/" -- confirm which is intended.
    somatic_df["ap1_fox_indicator"] = somatic_df["vierstra_groups"].apply(lambda x: search_names_in_vierstra_group(x,["AP1/","ATF/"]))

    somatic_df_dict[cancer_name] = somatic_df

In [None]:
# Hand-picked mutations per cancer type. The values are row labels that are
# looked up with .loc in the matching per-cancer somatic table.
interesting_indices = dict(
    BRCA=[35924, 25566, 34128],
    COAD=[34825, 38715, 18844, 27088, 10769],
    BLCA=[10605, 1242],
    LUAD=[12047, 3544, 6551],
)

In [None]:
# Flatten the per-cancer index lists into "<CANCER>_<index>" labels, in the
# order the cancers and indices appear in interesting_indices.
# (The original first line was garbled: the list initialiser and the `for`
# keyword were missing, so the cell did not parse.)
all_sample_names = [
    f"{k}_{v}"
    for k, vv in interesting_indices.items()
    for v in vv
]

In [None]:
### For each selected index, generate mutation-centered and summit-centered ref and alt scores

In [None]:
# Create the output directory for the saved score arrays. Unlike `!mkdir`,
# this does not complain when the directory already exists, so the cell can be
# re-run, and it does not depend on a shell being available.
os.makedirs("shap_scores", exist_ok=True)

In [None]:
from seq2atac.analysis.enrichment_utils import get_alt_sequence, get_refalt_sequence
from seq2atac.stable import one_hot_encode
from seq2atac.analysis.shap_utils import score_classification, compute_shap_score

In [None]:
for cancer_name in ["BRCA","BLCA","LUAD","COAD"]:

    # one model.h5 per fold_* directory of this cancer's model
    weights_files = glob.glob(f"/illumina/scratch/deep_learning/akumar22/TCGA/models_250_1364_minibatch_prejitter/{cancer_name}/fold_*/model.h5")
    print(weights_files)

    # restrict the cancer's somatic table to the hand-picked row labels
    selected_rows = interesting_indices[cancer_name]
    somatic_df = somatic_df_dict[cancer_name].loc[selected_rows]
    print(somatic_df)

    for idx in somatic_df.index:
        print(idx)

        # Summit-centered variant (disabled): the same steps with
        # get_alt_sequence in place of get_refalt_sequence, saved as
        # ./shap_scores/{cancer_name}_{idx}_summit_centered_ref.npy and
        # ./shap_scores/{cancer_name}_{idx}_summit_centered_alt.npy.

        ### mutation centered
        # [[idx]] keeps a DataFrame (not a Series) for the sequence helper
        single_mutation = somatic_df.loc[[idx],:]
        ref_seqs, alt_seqs = get_refalt_sequence(single_mutation,1364,fasta_seq)
        X_ref = one_hot_encode(ref_seqs)
        X_alt = one_hot_encode(alt_seqs)

        # score the reference first, then the alternate allele; each score is
        # multiplied element-wise by its encoded input before being saved
        for allele, X in (("ref", X_ref), ("alt", X_alt)):
            allele_score = score_classification(X,model,weights_files,compute_shap_score) * X
            np.save(f"./shap_scores/{cancer_name}_{idx}_mutation_centered_{allele}.npy", allele_score)
    

In [None]:
cancer_name = "COAD"

# one model.h5 per fold_* directory of the COAD model
weights_files = glob.glob(f"/illumina/scratch/deep_learning/akumar22/TCGA/models_250_1364_minibatch_prejitter/{cancer_name}/fold_*/model.h5")
print(weights_files)

# A single hand-specified mutation (chr10, A>C) in the column layout that
# get_refalt_sequence is called with elsewhere in this notebook.
somatic_df = pd.DataFrame({
    "Chromosome": ["chr10"],
    "hg38_start": [74871769],
    "hg38_end": [74871770],
    "Reference_Allele": ["A"],
    "Tumor_Seq_Allele2": ["C"],
})

for idx in somatic_df.index:
    print(idx)

    ### mutation centered
    # [[idx]] keeps a DataFrame (not a Series) for the sequence helper
    single_mutation = somatic_df.loc[[idx],:]
    ref_seqs, alt_seqs = get_refalt_sequence(single_mutation,1364,fasta_seq)
    X_ref = one_hot_encode(ref_seqs)
    X_alt = one_hot_encode(alt_seqs)

    # reference first, then alternate; the output names are fixed to "COAD_AI"
    for allele, X in (("ref", X_ref), ("alt", X_alt)):
        allele_score = score_classification(X,model,weights_files,compute_shap_score) * X
        np.save(f"./shap_scores/COAD_AI_mutation_centered_{allele}.npy", allele_score)
    

In [None]:
print("Done")

In [None]:
## After the R plots are generated, copy the 200 versions into shap_scores_200/

In [None]:
# Create the destination directory for the copied plots. Unlike `!mkdir`, this
# does not complain when the directory already exists, so the cell can be re-run.
os.makedirs("shap_scores_200", exist_ok=True)
import shutil

In [None]:
# "<CANCER>_<index>" labels of the selected mutations, plus the COAD_AI example.
samples_to_copy = [
    'BRCA_35924',
    'BRCA_25566',
    'BRCA_34128',
    'COAD_34825',
    'COAD_38715',
    'COAD_18844',
    'COAD_27088',
    'COAD_10769',
    'BLCA_10605',
    'BLCA_1242',
    'LUAD_12047',
    'LUAD_3544',
    'LUAD_6551',
    'COAD_AI',
]

for sample in samples_to_copy:

    print(sample)

    # copy the ref plot first, then the alt plot, keeping the file names
    for allele in ("ref", "alt"):
        pdf_name = f"{sample}_{allele}_100_FALSE_R.pdf"
        shutil.copy(f"shap_scores/{pdf_name}",f"shap_scores_200/{pdf_name}")

In [None]:
!pwd

In [None]:
!ls /illumina/scratch/deep_learning/akumar22/TCGA/mutation_prioritization/mutation_vignette/shap_scores_200/