### Best sanity check: restart the kernel and run only the part of the notebook you need
- For instance, to get the 5x pli results, restart and run only the relevant cells
- For the 5x random results, restart and run only the relevant cells
- TODO: Check whether simply running all cells in order gives the same results

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, HTML
import scipy.stats
from pybedtools import BedTool
import pickle
from tqdm import tqdm
from pyfaidx import Fasta

from seq2atac.stable import compute_gc_bed, one_hot_encode, read_pickle, compute_signal, write_pickle
from seq2atac.analysis import get_promoterai_tss, get_cosmic_pancan_genes, fasta_file
from seq2atac.analysis.enrichment_utils import create_pancancer_distribution_plots, create_pancancer_correlations, create_pancancer_valuecounts, create_pancancer_distribution_plots_discrete
from seq2atac.analysis.sample_controls import matching_logic
from seq2atac.analysis.mutation_utils import compute_vierstra_groups, compute_trinuc, ingene_indicator
from seq2atac.analysis.mutation_processing_pipeline_utils import annotateNearestGene, annotateScore, filterannotatePeaks, annotatePhylopScore, annotatePhylopScorePeak, annotateBedfile

In [None]:
# Cancer cohorts iterated over by the cells of this notebook.
all_tcga_cancers = [
    "BLCA",
    "BRCA",
    "GBM",
    "COAD",
    "KIRC",
    "KIRP",
    "LUAD",
]

### Expt 1

In [None]:
## requires access to indexed peakfiles in akumar22

In [None]:
# BED file handed to annotateBedfile in the next cell to flag ("ctcf_uncleaned")
# and then drop mutations.
ctcf_vierstra_file = "../process_tcga_icgc/ctcf_vierstra.bed"
# Gene collection used for the "closest_pancan" annotation
# (presumably oncogenes + tumour suppressors, per the file name — confirm).
pancan_genes = read_pickle("../process_tcga_icgc/ogtsg.pkl")
len(pancan_genes)

In [None]:
#### Build one annotated somatic-mutation table per cancer and store it in
#### somatic_df_dict (keyed by cancer name).
somatic_df_dict = {}
for cancer_name in all_tcga_cancers:
    print(cancer_name)
    somatic_df = read_pickle(f"../somatic_filtered/{cancer_name}_filtered_annotated_somatic.pkl")
    print(somatic_df.shape)

    ### remove uncleaned ctcf
    somatic_df = annotateBedfile(somatic_df,
                                 ctcf_vierstra_file,
                                 "ctcf_uncleaned")
    # .copy() makes the filtered rows an independent frame, so the column
    # assignment below does not write into a slice (SettingWithCopyWarning).
    somatic_df = somatic_df[somatic_df["ctcf_uncleaned"] == 0].copy()
    print(somatic_df.shape)

    ### closest pancan annotation
    somatic_df["closest_pancan"] = somatic_df["gene"].apply(lambda x: ingene_indicator(x, pancan_genes))

    ### inside peaks or not
    somatic_df = annotateBedfile(somatic_df,
                                 f"/illumina/scratch/deep_learning/akumar22/TCGA/mutations_scoring/master_files/cancer_peaks_500/{cancer_name}_peaks_indexed.bed",
                                 "inside_peaks")
    print(somatic_df.shape)

    # Sanity check: each (chromosome, position, alternate allele) occurs once.
    n_unique = len(somatic_df.drop_duplicates(["Chromosome", "hg38_start", "Tumor_Seq_Allele2"]))
    assert n_unique == len(somatic_df), f"{cancer_name}: duplicated mutations after annotation"

    somatic_df_dict[cancer_name] = somatic_df
    
    

In [None]:
# 2x2 tables per cancer and pooled over cancers:
#   rows    -> closest_pancan == 1 / == 0
#   columns -> inside_peaks == 1 / == 0
total_fisher = np.zeros((2, 2))
for cancer_name in all_tcga_cancers:
    print(cancer_name)
    df = somatic_df_dict[cancer_name].copy()

    near_pancan = df["closest_pancan"] == 1
    not_near_pancan = df["closest_pancan"] == 0
    in_peak = df["inside_peaks"] == 1
    out_of_peak = df["inside_peaks"] == 0

    fisher_matrix = np.array([
        [int((near_pancan & in_peak).sum()), int((near_pancan & out_of_peak).sum())],
        [int((not_near_pancan & in_peak).sum()), int((not_near_pancan & out_of_peak).sum())],
    ])

    # every mutation must fall into exactly one of the four cells
    assert fisher_matrix.sum() == len(df)
    total_fisher += fisher_matrix
    fold_change, pval = scipy.stats.fisher_exact(fisher_matrix)
    print(fisher_matrix, fold_change, pval)

print("total")
print(total_fisher)
print(scipy.stats.fisher_exact(total_fisher))


### Expt2 

In [7]:
# Load the stored ISM thresholds and keep the entry under key 95
# (presumably the 95th-percentile thresholds, per the file name — confirm).
# Later cells index the result by lower-case cancer name.
ism_thresh = read_pickle("../process_tcga_icgc/ism_thresh_percentile.pkl")[95]
ism_thresh

{'brca': 0.17696281257796992,
 'blca': 0.2097989422114596,
 'coad': 0.19172578644761995,
 'luad': 0.27739862176416474,
 'kirp': 0.23343622358881733,
 'kirc': 0.2235218676975348,
 'gbm': 0.21235916285438658,
 'skcm': 0.19640249133953247}

In [None]:
# Per-cancer somatic mutation tables (LoF, per the file name); get_enrichments reads
# their "diff_summit_centered" column to derive the per-cancer score thresholds.
somatic_df_dict = read_pickle("somatic_df_dict_lof.pkl")

In [None]:
def _count_prioritized(df, threshold, ism_es=None):
    """Return (hits, misses) for one matched mutation table.

    A row is a hit when "diff_summit_centered" exceeds ``threshold`` and, if
    ``ism_es`` is given, additionally "ref" > ``ism_es`` and "ref-mut" > 0.
    Counting is done on the boolean mask, so it does not depend on the index
    labels of ``df`` being unique.
    """
    mask = df["diff_summit_centered"] > threshold
    if ism_es is not None:
        mask = mask & (df["ref"] > ism_es) & (df["ref-mut"] > 0)
    hits = int(mask.sum())
    return hits, len(df) - hits


def get_enrichments(somatic_df_dict, pancan_matched_dict, nonpancan_matched_dict, ism_thresh, quantiles=None):
    """Pooled Fisher enrichment of prioritized mutations across score quantiles.

    For every quantile ``q`` and every cancer in ``somatic_df_dict`` (except
    "SKCM", which is always skipped), the score threshold is the ``q``-quantile
    of that cancer's ``somatic_df["diff_summit_centered"]``.  Mutations in the
    pancan-matched and non-pancan-matched tables are counted as prioritized or
    not, the per-cancer 2x2 tables are summed, and Fisher's exact test is run
    on the pooled table.

    Args:
        somatic_df_dict: cancer name -> DataFrame with "diff_summit_centered".
        pancan_matched_dict: cancer name -> DataFrame with
            "diff_summit_centered" (and "ref", "ref-mut" when ISM is used).
        nonpancan_matched_dict: same layout as ``pancan_matched_dict``.
        ism_thresh: mapping lower-case cancer name -> ISM threshold; a falsy
            value (e.g. None) disables the ISM filter.
        quantiles: optional iterable of quantiles to evaluate; defaults to
            0.50, 0.51, ..., 0.99 plus 0.975.

    Returns:
        Tuple ``(q_to_fisher, q_to_or, q_to_pval)`` of dicts keyed by quantile:
        the pooled 2x2 table ([[pancan hits, pancan misses],
        [non-pancan hits, non-pancan misses]]), the odds ratio, and
        -log10 of the p value.
    """
    if quantiles is None:
        quantiles = [round(x, 2) for x in np.linspace(0.5, 0.99, 50)] + [0.975]

    use_ism = bool(ism_thresh)

    q_to_or = {}
    q_to_pval = {}
    q_to_fisher = {}

    for q in quantiles:
        print(q)

        total_fisher = np.zeros((2, 2))

        for cancer_name, somatic_df in somatic_df_dict.items():
            if cancer_name == "SKCM":
                continue

            threshold = somatic_df["diff_summit_centered"].quantile(q)
            ism_es = ism_thresh[cancer_name.lower()] if use_ism else None

            pancan_counts = _count_prioritized(pancan_matched_dict[cancer_name], threshold, ism_es)
            nonpancan_counts = _count_prioritized(nonpancan_matched_dict[cancer_name], threshold, ism_es)

            # Only the pooled table is tested; no per-cancer Fisher test is needed.
            total_fisher += np.array([pancan_counts, nonpancan_counts])

        print(total_fisher)
        fold_change, pval = scipy.stats.fisher_exact(total_fisher)
        print(fold_change, pval)
        q_to_or[q] = fold_change
        q_to_pval[q] = -np.log10(pval)
        q_to_fisher[q] = total_fisher

    return q_to_fisher, q_to_or, q_to_pval


def plot_percentiles(q_to_or, q_to_pval, ymin=0.9, ymax=1.7, title=None, outfile=None):
    """Scatter odds ratios and -log10 p values against the quantile threshold.

    Draws two side-by-side panels: the left one shows ``q_to_or`` with a
    reference line at 1.0 and y-limits (``ymin``, ``ymax``); the right one
    shows ``q_to_pval`` with a reference line at -log10(0.05).  If ``title``
    is truthy it becomes the figure title.  The figure is saved to
    ``outfile`` when given, otherwise it is shown.
    """
    orange = "#ff7f0e"
    fig, (ax_or, ax_pval) = plt.subplots(1, 2, figsize=(10, 5))

    panels = (
        (ax_or, q_to_or, "fold change", 1.0),
        (ax_pval, q_to_pval, "-log10 p value", -np.log10(0.05)),
    )
    for ax, series, ylabel, reference in panels:
        ax.scatter(series.keys(), series.values(), color=orange)
        ax.set_xlabel("threshold percentile of somatic")
        ax.set_ylabel(ylabel)
        ax.axhline(reference, color=orange)

    # only the odds-ratio panel gets fixed y-limits
    ax_or.set_ylim(ymin, ymax)

    if title:
        fig.suptitle(title)
    fig.tight_layout()

    if not outfile:
        plt.show()
        return
    fig.savefig(outfile, dpi=1200)

In [None]:
# Matched mutation tables keyed by cancer name, passed to get_enrichments below
# ("ogtsg" presumably means oncogene / tumour-suppressor gene set — confirm).
pancan_matched_dict = read_pickle("pancan_matched_dict_ogtsg.pkl")
nonpancan_matched_dict = read_pickle("nonpancan_matched_dict_ogtsg.pkl")

In [None]:
### figure
import matplotlib
# Default figure size; plot_percentiles passes its own figsize=(10,5) to plt.subplots.
plt.rcParams["figure.figsize"]=20,10
# pdf.fonttype 42 selects TrueType fonts in saved PDFs (see matplotlib rcParams docs).
matplotlib.rcParams['pdf.fonttype']=42
# Enrichment over score quantiles with the ISM filter enabled; the plot is written to a PDF.
fisher, odds, pvals = get_enrichments(somatic_df_dict, pancan_matched_dict, nonpancan_matched_dict, ism_thresh)
plot_percentiles(odds, pvals, 0.8,1.7,"Odds and p values as a function of threshold","./varied_thresholds_with_ism.pdf")


In [None]:
# Same enrichment sweep with the ISM filter disabled (ism_thresh=None); shown inline, not saved.
fisher, odds, pvals = get_enrichments(somatic_df_dict, pancan_matched_dict, nonpancan_matched_dict, ism_thresh=None)
plot_percentiles(odds, pvals, 0.8,1.7)

In [None]:
### near tsg

In [None]:
# cosmos_pancan_file = "/illumina/scratch/deep_learning/asalcedo/cancer_gene_census.csv"
# cosmos_pancan_df = pd.read_csv(cosmos_pancan_file)
# cosmos_pancan_df = cosmos_pancan_df[(~cosmos_pancan_df["Role in Cancer"].isna()) & 
#                                         (cosmos_pancan_df["Role in Cancer"]!="fusion")]

# tsg = cosmos_pancan_df[(~cosmos_pancan_df["Role in Cancer"].isin(['oncogene, fusion','oncogene']))]["Gene Symbol"].tolist()
# tsg = list(set(tsg))
# len(tsg)

In [None]:
# def get_enrichments_tsg(somatic_df_dict, pancan_matched_dict, nonpancan_matched_dict, ism_thresh):
#     q_to_or = {}
#     q_to_pval = {}
#     q_to_fisher = {}
    
#     all_tcga_cancers = list(somatic_df_dict.keys())

#     for q in [round(x,2) for x in np.linspace(0.5, 0.99, 50)] + [0.975]:
#         print(q)

#         total_fisher = np.zeros((2,2))

#         for cancer_name in all_tcga_cancers:

#             if cancer_name == "SKCM":
#                 continue

#             somatic_df = somatic_df_dict[cancer_name].copy()


#             threshold = somatic_df["diff_summit_centered"].quantile(q)
#             ism_es = None
#             if ism_thresh:
#                 ism_es = ism_thresh[cancer_name.lower()]

#             pancan_df = pancan_matched_dict[cancer_name].copy()
#             nonpancan_df = nonpancan_matched_dict[cancer_name].copy()
            
#             pancan_df["closest_tsg"] = pancan_df["gene"].apply(lambda x : ingene_indicator(x, tsg))
#             pancan_df = pancan_df[pancan_df["closest_tsg"]==1]
#             nonpancan_df = nonpancan_df[nonpancan_df.index.isin(pancan_df.index)]
            
            
#             print(pancan_df.shape)
#             print(nonpancan_df.shape)
            
#             assert (pancan_df["distance_to_summit_discrete"]==nonpancan_df["distance_to_summit_discrete"]).all()

#             l1 = None
#             if ism_thresh:
#                 l1 = pancan_df[(pancan_df["diff_summit_centered"]>threshold) & (pancan_df["ref"] > ism_es) & (pancan_df["ref-mut"] > 0)]
#             else:
#                 l1 = pancan_df[(pancan_df["diff_summit_centered"]>threshold)]
                
#             l2 = pancan_df[~pancan_df.index.isin(l1.index)]
            
#             l3 = None
#             if ism_thresh:
#                 l3 = nonpancan_df[(nonpancan_df["diff_summit_centered"]>threshold) & (nonpancan_df["ref"] > ism_es) & (nonpancan_df["ref-mut"] > 0)]
#             else:
#                 l3 = nonpancan_df[(nonpancan_df["diff_summit_centered"]>threshold)]
                
#             l4 = nonpancan_df[~nonpancan_df.index.isin(l3.index)]

#             fisher_matrix = np.array([[len(l1),len(l2)],[len(l3),len(l4)]])

#             assert fisher_matrix.sum() == len(pancan_df) + len(nonpancan_df)
#             total_fisher += fisher_matrix
#             fold_change,pval = scipy.stats.fisher_exact(fisher_matrix)

#         print(total_fisher)
#         fold_change,pval = scipy.stats.fisher_exact(total_fisher)
#         print(fold_change, pval)
#         q_to_or[q] = fold_change
#         q_to_pval[q] = -np.log10(pval)
#         q_to_fisher[q] = total_fisher
        
#     return q_to_fisher, q_to_or, q_to_pval


# def plot_percentiles(q_to_or, q_to_pval, ymin=0.9, ymax=1.7):
    
        
#     fig,axes = plt.subplots(1,2,figsize=(10,5))
#     ax1,ax2 = axes
#     ax1.scatter(q_to_or.keys(),q_to_or.values(), color="#ff7f0e")
#     ax1.set_xlabel("threshold percentile of somatic")
#     ax1.set_ylabel("fold change")
#     ax1.axhline(1.0,color="#ff7f0e")
#     ax1.set_ylim(ymin,ymax)
    
#     ax2.scatter(q_to_pval.keys(),q_to_pval.values(), color="#ff7f0e")
#     ax2.set_xlabel("threshold percentile of somatic")
#     ax2.set_ylabel("-log10 p value")
#     ax2.axhline(-np.log10(0.05),color="#ff7f0e")

#     fig.suptitle("Odds and -log10 p values as the threshold is increased")
#     fig.tight_layout()
#     plt.show()

In [None]:
# pancan_matched_dict = read_pickle("pancan_matched_dict_ogtsg.pkl")
# nonpancan_matched_dict = read_pickle("nonpancan_matched_dict_ogtsg.pkl")

# fisher, odds, pvals = get_enrichments_tsg(somatic_df_dict, pancan_matched_dict, nonpancan_matched_dict, ism_thresh)
# plot_percentiles(odds, pvals, 0.8,1.7)

### pli matched

In [None]:
### Run the enrichment sweep for each of the five pli gene sets and collect the
### results column-wise: one row per quantile, one column per gene set.
quantile_grid = [round(x, 2) for x in np.linspace(0.5, 0.99, 50)] + [0.975]
df_or = pd.DataFrame(index=quantile_grid)
df_pval = pd.DataFrame(index=quantile_grid)

for i in range(5):
    pancan_matched_dict = read_pickle(f"pancan_matched_dict_pli_x2{i}.pkl")
    nonpancan_matched_dict = read_pickle(f"nonpancan_matched_dict_pli_x2{i}.pkl")

    fisher, odds, pvals = get_enrichments(
        somatic_df_dict, pancan_matched_dict, nonpancan_matched_dict, ism_thresh
    )

    # NOTE: both frames use the column name "odds_{i}"; in df_pval the column
    # holds the -log10 p values returned by get_enrichments.
    for frame, per_quantile in ((df_or, odds), (df_pval, pvals)):
        frame[f"odds_{i}"] = frame.index.map(per_quantile.get)

In [None]:
### For each quantile, take the median odds ratio across the gene sets and report
### the p value of the gene set that produced that median.
odds_median = {}
pvals_median = {}
df_or_np = df_or.to_numpy()
df_pval_np = df_pval.to_numpy()
for q, ors_q, pvals_q in zip(df_or.index, df_or_np, df_pval_np):

    # argsort places NaN odds ratios last, so they cannot scramble the ordering
    # the way sorted() can; len // 2 is the middle position (the median for an
    # odd number of gene sets) instead of a hard-coded index.
    order = np.argsort(ors_q, kind="stable")
    middle = order[len(order) // 2]
    median_q = ors_q[middle]

    # First gene set whose odds ratio equals the median (first match on ties);
    # fall back to the middle position when the median itself is NaN.
    matches = np.flatnonzero(ors_q == median_q)
    median_index = matches[0] if len(matches) else middle

    odds_median[q] = median_q
    pvals_median[q] = pvals_q[median_index]

plot_percentiles(odds_median, pvals_median, 0.8,1.7)

In [None]:
# Median odds ratio and p value (10**-x undoes the -log10 transform) at the 0.975 and 0.99 quantiles.
odds_median[0.975],10**-pvals_median[0.975], odds_median[0.99],10**-pvals_median[0.99]

### pli matched near tsg

In [None]:
# ### for each random pli geneset - compute odds and p vals as a function of quantile and 
# ### create a dataframe with 5 columns (one for each genelist)
# df_or = pd.DataFrame()
# df_or.index = [round(x,2) for x in np.linspace(0.5, 0.99, 50)] + [0.975]

# df_pval = pd.DataFrame()
# df_pval.index = [round(x,2) for x in np.linspace(0.5, 0.99, 50)] + [0.975]
# for i in range(5):
    
#     pancan_matched_dict = read_pickle(f"pancan_matched_dict_pli_ts_x2{i}.pkl")
#     nonpancan_matched_dict = read_pickle(f"nonpancan_matched_dict_pli_ts_x2{i}.pkl")
    
#     fisher, odds, pvals = get_enrichments(somatic_df_dict, pancan_matched_dict, nonpancan_matched_dict, ism_thresh)

#     df_or[f"odds_{i}"] = df_or.index.map(odds.get)
#     df_pval[f"odds_{i}"] = df_pval.index.map(pvals.get)

In [None]:
# ### compute median for each quantile and take the corresponding geneset's p value at that quantile
# odds_median = {}
# pvals_median = {}
# df_or_np = df_or.to_numpy()
# df_pval_np = df_pval.to_numpy()
# for i in range(len(df_or_np)):
    
#     ors_q = df_or_np[i]
#     pvals_q = df_pval_np[i]
#     q = df_or.index[i]
    
#     median_q = sorted(ors_q)[2]
    
#     pval_median_q_index = np.where(ors_q == median_q)[0]
#     pval_median_q = pvals_q[pval_median_q_index[0]]
    
#     odds_median[q] = median_q
#     pvals_median[q] = pval_median_q
    
# plot_percentiles(odds_median, pvals_median, 0.8,1.7)

### random near pancan

In [None]:
### Run the enrichment sweep for each of the five random gene sets and collect the
### results column-wise: one row per quantile, one column per gene set.
quantile_grid = [round(x, 2) for x in np.linspace(0.5, 0.99, 50)] + [0.975]
df_or = pd.DataFrame(index=quantile_grid)
df_pval = pd.DataFrame(index=quantile_grid)

for i in range(5):
    pancan_matched_dict = read_pickle(f"pancan_matched_dict_random_{i}.pkl")
    nonpancan_matched_dict = read_pickle(f"nonpancan_matched_dict_random_{i}.pkl")

    fisher, odds, pvals = get_enrichments(
        somatic_df_dict, pancan_matched_dict, nonpancan_matched_dict, ism_thresh
    )

    # NOTE: both frames use the column name "odds_{i}"; in df_pval the column
    # holds the -log10 p values returned by get_enrichments.
    for frame, per_quantile in ((df_or, odds), (df_pval, pvals)):
        frame[f"odds_{i}"] = frame.index.map(per_quantile.get)

In [None]:
### For each quantile, take the median odds ratio across the gene sets and report
### the p value of the gene set that produced that median.
odds_median = {}
pvals_median = {}
df_or_np = df_or.to_numpy()
df_pval_np = df_pval.to_numpy()
for q, ors_q, pvals_q in zip(df_or.index, df_or_np, df_pval_np):

    # argsort places NaN odds ratios last, so they cannot scramble the ordering
    # the way sorted() can; len // 2 is the middle position (the median for an
    # odd number of gene sets) instead of a hard-coded index.
    order = np.argsort(ors_q, kind="stable")
    middle = order[len(order) // 2]
    median_q = ors_q[middle]

    # First gene set whose odds ratio equals the median (first match on ties);
    # fall back to the middle position when the median itself is NaN.
    matches = np.flatnonzero(ors_q == median_q)
    median_index = matches[0] if len(matches) else middle

    odds_median[q] = median_q
    pvals_median[q] = pvals_q[median_index]

plot_percentiles(odds_median, pvals_median, 0.8,1.7)

In [None]:
# Median odds ratio and p value (10**-x undoes the -log10 transform) at the 0.975 and 0.99 quantiles.
odds_median[0.975],10**-pvals_median[0.975], odds_median[0.99],10**-pvals_median[0.99]

### Supplementary plots

In [3]:
# Reload the inputs for the supplementary section; this rebinds pancan_matched_dict /
# nonpancan_matched_dict, which the gene-set loops above overwrite with other files.
somatic_df_dict = read_pickle("somatic_df_dict_lof.pkl")
pancan_matched_dict = read_pickle("pancan_matched_dict_ogtsg.pkl")
nonpancan_matched_dict = read_pickle("nonpancan_matched_dict_ogtsg.pkl")

In [5]:
import matplotlib
# Default figure size for figures created without an explicit figsize.
plt.rcParams["figure.figsize"]=20,10
# pdf.fonttype 42 selects TrueType fonts in saved PDFs (see matplotlib rcParams docs).
matplotlib.rcParams['pdf.fonttype']=42

### matching - no distribution shift plot

In [None]:
# Project helper comparing "diff_summit_centered" between the matched sets.
# The bar-plot cell below reads the "pvalue" column of the result and uses its index
# as tick labels; the meaning of the final False argument is not visible here.
df=create_pancancer_distribution_plots(pancan_matched_dict, nonpancan_matched_dict,"diff_summit_centered",False)
df

In [None]:
# fig,axes = plt.subplots(1,2,figsize=(20,10))
# ax1,ax2 = axes
# ax1.bar(np.arange(7),
#         df["ranksums"],
#         0.7,
#         label=df.index,
#         color='#1f77b4')
# ax1.set_xticks(np.arange(len(df.index)), df.index)
# ax1.set_xlabel("cancer name")
# ax1.set_ylabel("ranksum")
# ax1.set_ylim(-5,5)

# Bar plot of -log10(p value) per row of df, with a reference line at p = 0.05.
# Bar positions are derived from df itself so bars and tick labels always agree
# (previously hard-coded to 7 bars).
bar_positions = np.arange(len(df.index))

fig,ax2 = plt.subplots(figsize=(20,10))
ax2.bar(bar_positions,
        -np.log10(df["pvalue"]),
        0.7,
        label=df.index,
        color='#1f77b4')
ax2.set_xticks(bar_positions, df.index)
ax2.set_xlabel("cancer name")
ax2.set_ylabel("-log10 p value")
ax2.axhline(-np.log10(0.05),color="#1f77b4")
ax2.set_ylim(0,5)

fig.suptitle("Matching statistics - p value")

fig.savefig("./matching_bar_plot_inside_peaks.pdf",dpi=1200)


### table

In [13]:
# Build the supplementary table: for every cancer, take the matched pancan and
# non-pancan mutations, flag the prioritized ones (score above the 0.975 quantile of
# the somatic scores, "ref" above the ISM threshold, "ref-mut" positive) and
# concatenate everything into one frame.
table_columns = [
    "Chromosome", "hg38_start", "Reference_Allele", "Tumor_Seq_Allele2", "sample",
    "diff_summit_centered", "proba_ref_summit_centered", "proba_alt_summit_centered",
    "ref", "mut", "ref-mut", "closest_pancan",
]

all_mutations = []
df_ism_thresh = []
for cancer_name in all_tcga_cancers:
    somatic_df = somatic_df_dict[cancer_name].copy()
    threshold = somatic_df["diff_summit_centered"].quantile(0.975)
    ism_es = ism_thresh[cancer_name.lower()]

    print(cancer_name, threshold, ism_es)
    df_ism_thresh.append((cancer_name, threshold, ism_es))

    # pancan table first, then non-pancan, for each cancer
    for matched_dict in (pancan_matched_dict, nonpancan_matched_dict):
        matched_df = matched_dict[cancer_name][table_columns].copy()
        matched_df["sample"] = matched_df["sample"].apply(lambda s: s.upper())
        matched_df["prioritized"] = 0

        is_hit = (
            (matched_df["diff_summit_centered"] > threshold)
            & (matched_df["ref"] > ism_es)
            & (matched_df["ref-mut"] > 0)
        )
        hit_labels = matched_df[is_hit].index
        matched_df.loc[hit_labels, "prioritized"] = 1

        all_mutations.append(matched_df)

all_mutations = pd.concat(all_mutations, axis=0, ignore_index=True)
all_mutations.shape

BLCA 0.047121298313140714 0.2097989422114596
BRCA 0.046678137779235754 0.17696281257796992
GBM 0.0649726867675781 0.21235916285438658
COAD 0.04962486773729317 0.19172578644761995
KIRC 0.046412371098995216 0.2235218676975348
KIRP 0.05527774393558505 0.23343622358881733
LUAD 0.0665462560951709 0.27739862176416474


(15342, 13)

In [16]:
# Turn the (cancer, score threshold, ISM threshold) tuples collected in the table
# cell into a DataFrame and export it. The third header previously read
# "ISM threhsold"; the typo is fixed so the exported sheet is spelled correctly.
df_ism_thresh = pd.DataFrame(df_ism_thresh,columns=["Cancer Type","Mutation score threshold","ISM threshold"])
df_ism_thresh["Cancer Type"] = df_ism_thresh["Cancer Type"].str.upper()
df_ism_thresh.to_excel("./lof_thresholds.xlsx")
df_ism_thresh

In [14]:
# Rename columns positionally: the order must match the column list selected when
# all_mutations was built, followed by "prioritized" (13 names in total).
all_mutations.columns = ["chr","pos","ref","alt","sample","effect_size","ref_proba","alt_proba","ref_ism","alt_ism","ism_diff","og/tsg","prioritized"]
# Stable ordering for the exported table.
all_mutations = all_mutations.sort_values(["chr","pos","sample"]).reset_index(drop=True)
all_mutations

Unnamed: 0,chr,pos,ref,alt,sample,effect_size,ref_proba,alt_proba,ref_ism,alt_ism,ism_diff,og/tsg,prioritized
0,chr1,978707,G,C,LUAD,-0.024571,0.623894,0.648465,0.186169,0.172975,0.013194,0,0
1,chr1,1552046,T,G,BRCA,-0.001262,0.567517,0.568779,-0.008160,-0.064508,0.056349,0,0
2,chr1,2127530,G,A,GBM,-0.049524,0.817293,0.866817,0.356885,0.435889,-0.079004,0,0
3,chr1,2193610,G,C,BRCA,0.004750,0.795651,0.790902,0.062991,0.018371,0.044620,0,0
4,chr1,2224737,C,A,BLCA,0.017585,0.390333,0.372748,0.005234,-0.774536,0.779770,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15337,chr9,137396336,T,C,LUAD,0.312677,0.761473,0.448796,1.122884,-0.060694,1.183578,0,1
15338,chr9,137405668,C,A,COAD,0.005860,0.813176,0.807316,-0.185964,-0.265868,0.079904,0,0
15339,chr9,137436398,G,A,BLCA,-0.006511,0.936512,0.943023,-0.187652,-0.163534,-0.024118,0,0
15340,chr9,137555968,A,G,COAD,0.003467,0.937267,0.933800,0.451839,0.400671,0.051168,0,0


In [15]:
### sanity check - recompute the pooled 2x2 table from the exported columns;
### please check that the resulting odds ratio and p value are correct
is_prioritized = all_mutations["prioritized"] == 1
not_prioritized = all_mutations["prioritized"] == 0
is_ogtsg = all_mutations["og/tsg"] == 1
not_ogtsg = all_mutations["og/tsg"] == 0

# rows: og/tsg == 1, og/tsg == 0; columns: prioritized, not prioritized
fisher_matrix = [
    [int((is_prioritized & is_ogtsg).sum()), int((not_prioritized & is_ogtsg).sum())],
    [int((is_prioritized & not_ogtsg).sum()), int((not_prioritized & not_ogtsg).sum())],
]
scipy.stats.fisher_exact(fisher_matrix)

(1.5142418629267003, 0.006473624241152715)

In [None]:
# Export the full mutation table (including the "prioritized" flag) as a spreadsheet.
all_mutations.to_excel("./all_mutations_inside_peaks.xlsx")

### correlation plots

In [None]:
### compute phylop
# Annotate every cancer's table via the project helper annotatePhylopScore, passing
# "phylop_mutation" as the column name (the correlation cell below reads that column).
for cancer_name in all_tcga_cancers:
    # work on a copy; the dictionary entry is replaced with the annotated frame below
    somatic_df = somatic_df_dict[cancer_name].copy()
    print(somatic_df.shape)
    somatic_df = annotatePhylopScore(somatic_df,"phylop_mutation")
    # shape is printed before and after so any change in rows/columns is visible
    print(somatic_df.shape)
    somatic_df_dict[cancer_name] = somatic_df

In [None]:
# For each covariate, compute the Spearman correlation between the per-fold score
# ("fold_{k}_diff_summit_centered", k = 0..4) and the covariate for every cancer,
# display the fold x cancer table, and save a boxplot of the correlations per cancer.
for quantity in ["gc_peak","phylop_mutation","distance_to_tss"]:
    df_stats = pd.DataFrame()

    for cancer_name in all_tcga_cancers:

        print(cancer_name)

        somatic_df = somatic_df_dict[cancer_name].copy()

        for fold in range(5):
            # only the correlation coefficient is kept; the p value is unused
            spr, _ = scipy.stats.spearmanr(somatic_df[f"fold_{fold}_diff_summit_centered"],
                                           somatic_df[quantity])
            df_stats.loc[fold,cancer_name] = spr

    display(df_stats)

    fig,ax=plt.subplots()
    df_stats.boxplot(ax=ax)
    ax.set_title(quantity)
    # Rotate the tick labels with tick_params: set_xticks only accepts text keyword
    # arguments such as rotation when labels are passed as well.
    ax.tick_params(axis="x", labelrotation=90)
    ax.grid(False)
    ax.set_ylim(-0.3,0.3)
    fig.savefig(f'{quantity}_effect_size.pdf',dpi=1200)
    plt.show()

### Expt3 - combined LoF and GoF

In [None]:
# LoF inputs for the combined analysis: somatic tables plus matched pancan / non-pancan
# tables, each indexed by cancer name in the loop below.
somatic_df_dict_lof = read_pickle("somatic_df_dict_lof.pkl")
pancan_matched_dict_lof = read_pickle("pancan_matched_dict_ogtsg.pkl")
nonpancan_matched_dict_lof = read_pickle("nonpancan_matched_dict_ogtsg.pkl")

In [None]:
# GoF counterparts of the LoF inputs, read from the sibling ../gof directory.
somatic_df_dict_gof = read_pickle("../gof/somatic_df_dict_gof.pkl")
pancan_matched_dict_gof = read_pickle("../gof/pancan_matched_dict_ogtsg.pkl")
nonpancan_matched_dict_gof = read_pickle("../gof/nonpancan_matched_dict_ogtsg.pkl")

In [None]:
# ISM thresholds used for the LoF arm: entry under key 95 of the stored thresholds
# (presumably the 95th percentile, per the file name — confirm).
ism_thresh_lof = read_pickle("../process_tcga_icgc/ism_thresh_percentile.pkl")[95]
ism_thresh_lof

In [None]:
# ISM thresholds used for the GoF arm: entry under key 99 of the same file
# (presumably the 99th percentile, per the file name — confirm).
ism_thresh_gof = read_pickle("../process_tcga_icgc/ism_thresh_percentile.pkl")[99]
ism_thresh_gof

In [None]:
# Combined LoF + GoF enrichment. For every quantile q, each cancer contributes two
# 2x2 tables (rows: pancan-matched / non-pancan-matched, columns: prioritized / not),
# one for LoF and one for GoF. All tables are summed and Fisher's exact test is run
# on the pooled table. Results are stored per quantile in q_to_or, q_to_pval and
# q_to_fisher.
q_to_or = {}
q_to_pval = {}
q_to_fisher = {}

for q in [round(x,2) for x in np.linspace(0.8, 0.99, 20)] + [0.975]:
    print(q)

    total_fisher = np.zeros((2,2))
    
    for cancer_name in all_tcga_cancers:
        
        if cancer_name == "SKCM":
            continue

        # LoF: score above the q-quantile of the somatic LoF scores, "ref" above the
        # ISM threshold and a positive "ref-mut".
        somatic_df = somatic_df_dict_lof[cancer_name].copy()
        
        threshold = somatic_df["diff_summit_centered"].quantile(q)
        ism_es = ism_thresh_lof[cancer_name.lower()]
        
        pancan_df = pancan_matched_dict_lof[cancer_name].copy()
        nonpancan_df = nonpancan_matched_dict_lof[cancer_name].copy()
        
        l1 = pancan_df[(pancan_df["diff_summit_centered"]>threshold) & (pancan_df["ref"] > ism_es) & (pancan_df["ref-mut"] > 0)]
        l2 = pancan_df[~pancan_df.index.isin(l1.index)]
        l3 = nonpancan_df[(nonpancan_df["diff_summit_centered"]>threshold) & (nonpancan_df["ref"] > ism_es) & (nonpancan_df["ref-mut"] > 0)]
        l4 = nonpancan_df[~nonpancan_df.index.isin(l3.index)]
        
        fisher_matrix = np.array([[len(l1),len(l2)],[len(l3),len(l4)]])
        assert fisher_matrix.sum() == len(pancan_df) + len(nonpancan_df)
        total_fisher += fisher_matrix
        
        if q == 0.975:
            print(fisher_matrix)
        

        # GoF: mirrored criteria - score below the (1 - q)-quantile of the somatic GoF
        # scores, "mut" above the ISM threshold and a negative "ref-mut".
        somatic_df = somatic_df_dict_gof[cancer_name].copy()
        
        threshold = somatic_df["diff_mutation_centered"].quantile(1.0-q)
        ism_es = ism_thresh_gof[cancer_name.lower()]
        
        pancan_df = pancan_matched_dict_gof[cancer_name].copy()
        nonpancan_df = nonpancan_matched_dict_gof[cancer_name].copy()
        
        l1 = pancan_df[(pancan_df["diff_mutation_centered"]<threshold) & (pancan_df["mut"] > ism_es)& (pancan_df["ref-mut"] < 0)] 
        l2 = pancan_df[~pancan_df.index.isin(l1.index)]
        l3 = nonpancan_df[(nonpancan_df["diff_mutation_centered"]<threshold) & (nonpancan_df["mut"] > ism_es) & (nonpancan_df["ref-mut"] < 0)]
        l4 = nonpancan_df[~nonpancan_df.index.isin(l3.index)]
        
        fisher_matrix = np.array([[len(l1),len(l2)],[len(l3),len(l4)]])
        assert fisher_matrix.sum() == len(pancan_df) + len(nonpancan_df)
        total_fisher += fisher_matrix
        
        if q == 0.975:
            print(fisher_matrix)
    
    print(total_fisher)
    fold_change,pval = scipy.stats.fisher_exact(total_fisher)
    print(fold_change, pval)
    q_to_or[q] = fold_change
    q_to_pval[q] = -np.log10(pval)
    # Store the pooled table under its quantile (previously the dict itself was
    # overwritten with the last table).
    q_to_fisher[q] = total_fisher

    
# Plot odds ratio and -log10 p value against the quantile and save the figure.
import matplotlib
plt.rcParams["figure.figsize"]=20,10
matplotlib.rcParams['pdf.fonttype']=42
fig,axes = plt.subplots(1,2,figsize=(10,5))
ax1,ax2 = axes
ax1.scatter(q_to_or.keys(),q_to_or.values(), color="#ff7f0e")
ax1.set_xlabel("threshold percentile of somatic")
ax1.set_ylabel("fold change")
ax1.axhline(1.0,color="#ff7f0e")
ax1.set_ylim(0.8,1.7)

ax2.scatter(q_to_pval.keys(),q_to_pval.values(), color="#ff7f0e")
ax2.set_xlabel("threshold percentile of somatic")
ax2.set_ylabel("-log10 p value")
ax2.axhline(-np.log10(0.05),color="#ff7f0e")

fig.suptitle("Odds and -log10 p values as the threshold is increased")
fig.tight_layout()
fig.savefig("./lof_gof_combined_threshold_wise.pdf",dpi=1200)
plt.show()

In [None]:
# Marker printed when the notebook has run through to the last cell.
print("Done")