In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, HTML
import scipy.stats
from pybedtools import BedTool
import pickle
from tqdm import tqdm
from pyfaidx import Fasta

from seq2atac.stable import compute_gc_bed, one_hot_encode, read_pickle, compute_signal, write_pickle
from seq2atac.analysis import get_promoterai_tss, get_cosmic_pancan_genes, fasta_file
from seq2atac.analysis.enrichment_utils import create_pancancer_distribution_plots, create_pancancer_correlations, create_pancancer_valuecounts, create_pancancer_distribution_plots_discrete
from seq2atac.analysis.sample_controls import matching_logic
from seq2atac.analysis.mutation_utils import compute_vierstra_groups, compute_trinuc, ingene_indicator
from seq2atac.analysis.mutation_processing_pipeline_utils import annotateNearestGene, annotateScore, filterannotatePeaks, annotatePhylopScore, annotatePhylopScorePeak, annotateBedfile

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Reference gene list used for the "closest_ogtsg" annotation and as the first
# gene list passed to get_match (presumably oncogenes / tumour-suppressor genes,
# going by the file name -- TODO confirm).
pancan_genes = read_pickle("../process_tcga_icgc/ogtsg.pkl")
len(pancan_genes)

565

In [4]:
# Cancer-type codes iterated by the per-cancer annotation loop; each code is also
# used to build input file paths and to look up its threshold in peak_proba_thresh.
all_tcga_cancers = ["BLCA","BRCA","GBM","COAD","KIRC","KIRP","LUAD"]

In [5]:
# ISM score table covering all cancers (first CSV column is used as the index).
# Rows for one cancer are selected later via the "sample" column, which is
# compared against the lower-cased cancer code.
ism_scores = pd.read_csv("../score_backups/ism_scores_genomewide/all_cancers_all_mutations.csv",index_col=0)

def merge_ism_scores(somatic_df,ism_df):
    """
    Left-join ISM scores onto a somatic-mutation table.

    The join keys are whatever columns the two frames have in common (pandas'
    default when no ``on`` is given).

    Parameters
    ----------
    somatic_df : pandas.DataFrame
        Left table; every one of its rows is kept.
    ism_df : pandas.DataFrame
        Right table with the score columns to attach.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of both inputs; ``somatic_df`` itself is
        not modified.

    Raises
    ------
    AssertionError
        If the join changes the number of rows (duplicate keys in ``ism_df``),
        or if any row ends up with a missing "ref" value after the join.
    """
    n_rows_in = somatic_df.shape[0]
    merged = somatic_df.merge(ism_df, how="left")

    # A left join may only add columns, never rows.
    assert n_rows_in == merged.shape[0]
    # No row may be left with a missing "ref" after the join.
    assert not merged["ref"].isna().any()
    return merged

# ism_scores["sample"].unique()

In [6]:
# Per-cancer probability threshold, keyed by cancer code. The annotation loop keeps
# only mutations whose "proba_ref_mutation_centered" is strictly above this value.
peak_proba_thresh = read_pickle("../process_tcga_icgc/peak_proba_thresh.pkl")
peak_proba_thresh

{'BLCA': 0.30038172008,
 'BRCA': 0.40271141012000006,
 'GBM': 0.256060285,
 'COAD': 0.2942905675,
 'KIRC': 0.39631355571677007,
 'KIRP': 0.2664290534,
 'LUAD': 0.24252313195000003,
 'RECA': 0.39631355571677007}

In [7]:
### Settings for the per-cancer annotation / filtering loop
# Directory holding one "<cancer>_peaks_indexed.bed" file per cancer type.
# NOTE: absolute, cluster-specific path -- repoint here when running elsewhere.
PEAKS_BED_DIR = "/illumina/scratch/deep_learning/akumar22/TCGA/mutations_scoring/master_files/cancer_peaks_500"
N_FOLDS = 5                    # per-fold score columns read below: proba_ref_0..4 / proba_alt_0..4
MAX_DISTANCE_TO_SUMMIT = 1000  # inclusive upper bound applied to the "distance_to_summit" column
MUTATION_KEY = ["Chromosome","hg38_start","Tumor_Seq_Allele2"]  # must identify a row uniquely

# cancer code -> filtered, annotated somatic-mutation DataFrame
somatic_df_dict = {}
for cancer_name in all_tcga_cancers:
    print(cancer_name)

    ### read the processed somatic dataframes (this ensures within peaks)
    somatic_df = read_pickle(f"../somatic_filtered/{cancer_name}_filtered_annotated_somatic.pkl")
    assert not somatic_df.duplicated(MUTATION_KEY).any(), f"{cancer_name}: duplicate mutations in input"
    print(somatic_df.shape)    
    
    ### annotate model scores

    somatic_df = annotateScore(somatic_df,
                               f"../score_backups/mutation_centered_genomewide/{cancer_name}/fold_*_somatic_mutation_centered.pkl",
                              remove_foldwise=False)
    somatic_df = somatic_df.rename(columns={"proba_ref":"proba_ref_mutation_centered",
                                            "proba_alt":"proba_alt_mutation_centered"})
    # score difference is always reference minus alternate
    somatic_df["diff_mutation_centered"] = somatic_df["proba_ref_mutation_centered"] - somatic_df["proba_alt_mutation_centered"]
    for fold in range(N_FOLDS):
        somatic_df[f"fold_{fold}_diff_mutation_centered"] = somatic_df[f"proba_ref_{fold}"] - somatic_df[f"proba_alt_{fold}"]
    print(somatic_df.shape)
    
    ### annotate ism scores
    # the ISM table stores the cancer code lower-cased in its "sample" column
    somatic_df = merge_ism_scores(somatic_df,
                                  ism_scores[ism_scores["sample"]==cancer_name.lower()])
    print(somatic_df.shape)
    
    
    ### annotate pancan
    somatic_df["closest_ogtsg"] = somatic_df["gene"].apply(lambda x : ingene_indicator(x, pancan_genes))
    print(somatic_df.shape)
    
    ### outside peaks, but distance_to_summit <= MAX_DISTANCE_TO_SUMMIT (bound is inclusive)
    somatic_df = annotateBedfile(somatic_df, 
                                 f"{PEAKS_BED_DIR}/{cancer_name}_peaks_indexed.bed", 
                                 "inside_peaks")
    somatic_df = somatic_df[(somatic_df["inside_peaks"]==0) & (somatic_df["distance_to_summit"] <= MAX_DISTANCE_TO_SUMMIT)]
    print(somatic_df.shape)
    # how many of the remaining mutations are flagged by the pancan gene list
    print(len(somatic_df[somatic_df["closest_ogtsg"]==1]))
    
    ### keep only mutations whose reference score is strictly above the per-cancer threshold
    minpeak_proba = peak_proba_thresh[cancer_name]
    somatic_df = somatic_df[somatic_df["proba_ref_mutation_centered"] > minpeak_proba]
    print(somatic_df.shape)
    
    
    ### some annotations for matching
    # Discretised covariates used as matching levels in get_match. Python's round()
    # is kept on purpose so the bin boundaries stay exactly as before.
    somatic_df["gc_2m"] = somatic_df["gc_mutation"].apply(lambda x:round(x,2))
    somatic_df["gc_1m"] = somatic_df["gc_mutation"].apply(lambda x:round(x,1))
    somatic_df["distance_to_summit_discrete"] = somatic_df["distance_to_summit"].apply(lambda x:round(x,-1))   # nearest 10
    somatic_df["distance_to_tss_discrete"] = somatic_df["distance_to_tss"].apply(lambda x:round(x,-4))         # nearest 10,000
    
    # none of the steps above may have introduced duplicate mutations
    assert not somatic_df.duplicated(MUTATION_KEY).any(), f"{cancer_name}: duplicate mutations after annotation"
    somatic_df_dict[cancer_name] = somatic_df
 

BLCA
(155700, 31)
(155700, 49)
(155700, 53)
(155700, 54)
(18438, 55)
1129
(11393, 55)
BRCA
(376478, 31)
(376478, 49)
(376478, 53)
(376478, 54)
(55645, 55)
3528
(33464, 55)
GBM
(89159, 31)
(89159, 49)
(89159, 53)
(89159, 54)
(8370, 55)
486
(4989, 55)
COAD
(1165711, 31)
(1165711, 49)
(1165711, 53)
(1165711, 54)
(84645, 55)
4959
(43990, 55)
KIRC
(223811, 31)
(223811, 49)
(223811, 53)
(223811, 54)
(12289, 55)
729
(5492, 55)
KIRP
(56691, 31)
(56691, 49)
(56691, 53)
(56691, 54)
(5587, 55)
332
(3183, 55)
LUAD
(377986, 31)
(377986, 49)
(377986, 53)
(377986, 54)
(30787, 55)
1807
(15102, 55)


In [8]:
### Store these dataframes
# Saves the whole dict (cancer code -> filtered, annotated DataFrame) under a
# relative file name, i.e. in the notebook's working directory.
write_pickle(somatic_df_dict,"somatic_df_dict_gof.pkl")

### Generate Matching

In [9]:
def get_match(somatic_df_dict,genelist):
    """
    Build matched case/control mutation sets per cancer for a given gene list.

    For every cancer in ``somatic_df_dict`` the mutations are labelled with
    "closest_pancan" (``ingene_indicator`` applied to the "gene" column with
    ``genelist``). Rows with ``closest_pancan == 1`` are the cases. The control
    pool is the rows with ``closest_pancan == 0`` AND ``closest_ogtsg == 0``.
    Both are handed to ``matching_logic`` with progressively coarser matching
    levels. A per-cancer count table is displayed as a sanity check.

    Parameters
    ----------
    somatic_df_dict : dict
        cancer name -> DataFrame. Each frame needs the columns "gene" and
        "closest_ogtsg" plus the matching-level columns "trinuc", "gc_2m",
        "gc_1m", "distance_to_summit_discrete" and "distance_to_tss_discrete".
        Neither the dict nor its frames are modified.
    genelist
        Gene collection passed through to ``ingene_indicator``.

    Returns
    -------
    tuple of (dict, dict)
        ``(pancan_matched_dict, nonpancan_matched_dict)``: cancer name -> the
        first / second frame returned by ``matching_logic`` (presumably the
        matched cases and their matched controls -- TODO confirm against
        ``matching_logic``).
    """
    all_tcga_cancers = list(somatic_df_dict.keys())

    ### gene annotation
    # Work on private copies held in a local dict, so the function has no side
    # effects on the caller's dict or frames.
    annotated_dict = {}
    for cancer_name in all_tcga_cancers:
        somatic_df = somatic_df_dict[cancer_name].copy()
        print(somatic_df.shape)

        somatic_df["closest_pancan"] = somatic_df["gene"].apply(lambda x : ingene_indicator(x, genelist))

        print(somatic_df.shape)
        annotated_dict[cancer_name] = somatic_df

    ### matching
    # Matching levels, tried from the strictest combination to the loosest.
    match_levels = [["trinuc","gc_2m","distance_to_summit_discrete","distance_to_tss_discrete"],
                    ["gc_2m","distance_to_summit_discrete","distance_to_tss_discrete"],
                    ["gc_2m","distance_to_summit_discrete"],
                    ["gc_2m"],
                    ["gc_1m"]]
    pancan_matched_dict = {}
    nonpancan_matched_dict = {}

    for cancer_name in all_tcga_cancers:
        print(cancer_name)
        somatic_df = annotated_dict[cancer_name]
        is_case = somatic_df["closest_pancan"]==1
        # controls must be outside the supplied gene list AND outside the OG/TSG list
        is_control = (somatic_df["closest_pancan"]==0) & (somatic_df["closest_ogtsg"]==0)
        df1, df2 = matching_logic(somatic_df[is_case],
                                  somatic_df[is_control],
                                  levels=match_levels,
                                  verbose=False)

        pancan_matched_dict[cancer_name] = df1
        nonpancan_matched_dict[cancer_name] = df2

    ### sanity
    # Collect one record per cancer and build the table once.
    # NOTE: "nonpancan" counts every row with closest_pancan == 0, which is a
    # superset of the control pool used above (that pool also requires
    # closest_ogtsg == 0).
    # NOTE: matched counts can be smaller than the number of cases (some runs
    # leave a row unmatched), so equality is reported here rather than asserted.
    stats_columns = ["total somatic","pancan","nonpancan","matched pancan","matched not near pancan"]
    stats_rows = []
    for cancer_name in all_tcga_cancers:
        print(cancer_name)
        somatic_df = annotated_dict[cancer_name]
        stats_rows.append({
            "total somatic": len(somatic_df),
            "pancan": int((somatic_df["closest_pancan"]==1).sum()),
            "nonpancan": int((somatic_df["closest_pancan"]==0).sum()),
            "matched pancan": len(pancan_matched_dict[cancer_name]),
            "matched not near pancan": len(nonpancan_matched_dict[cancer_name]),
        })
    df_stats = pd.DataFrame(stats_rows, index=all_tcga_cancers, columns=stats_columns)

    display(df_stats.astype(int))

    return pancan_matched_dict, nonpancan_matched_dict

In [10]:
# Run the matching with the OG/TSG reference gene list and store both returned
# dicts (cancer code -> DataFrame) as pickles in the working directory.
print(len(pancan_genes))
pancan_matched_dict, nonpancan_matched_dict = get_match(somatic_df_dict,pancan_genes)
write_pickle(pancan_matched_dict,"pancan_matched_dict_ogtsg.pkl")
write_pickle(nonpancan_matched_dict,"nonpancan_matched_dict_ogtsg.pkl")

565
(11393, 59)
(11393, 59)
(33464, 59)
(33464, 59)
(4989, 59)
(4989, 59)
(43990, 59)
(43990, 59)
(5492, 59)
(5492, 59)
(3183, 59)
(3183, 59)
(15102, 59)
(15102, 59)
BLCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
BRCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
GBM
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
COAD
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_sum

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,11393,694,10699,694,694
BRCA,33464,2087,31377,2087,2087
GBM,4989,296,4693,296,296
COAD,43990,2584,41406,2584,2584
KIRC,5492,357,5135,357,357
KIRP,3183,198,2985,198,198
LUAD,15102,896,14206,896,896


### 5 random pLI-matched gene sets - also total mutations matched

In [11]:
# Repeat the matching for each of the five pre-computed pLI-matched gene sets
# (files pli_matched_genes_x20 .. x24) and store the results per set.
for i in range(5):
    pli_matched_genes = read_pickle(f"../process_tcga_icgc/pli_matched_genes_x2{i}.pkl")
    print(len(pli_matched_genes))

    # a control gene set must not share any gene with the OG/TSG reference list
    assert set(pli_matched_genes).isdisjoint(set(pancan_genes))

    pancan_matched_dict_pli, nonpancan_matched_dict_pli = get_match(somatic_df_dict,pli_matched_genes)

    # one pickle per returned dict, suffixed with the gene-set index
    write_pickle(pancan_matched_dict_pli,f"pancan_matched_dict_pli_x2{i}.pkl")
    write_pickle(nonpancan_matched_dict_pli,f"nonpancan_matched_dict_pli_x2{i}.pkl")
    

1114
(11393, 59)
(11393, 59)
(33464, 59)
(33464, 59)
(4989, 59)
(4989, 59)
(43990, 59)
(43990, 59)
(5492, 59)
(5492, 59)
(3183, 59)
(3183, 59)
(15102, 59)
(15102, 59)
BLCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
BRCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
GBM
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
COAD
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_su

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,11393,717,10676,717,717
BRCA,33464,2321,31143,2321,2321
GBM,4989,353,4636,353,353
COAD,43990,2923,41067,2923,2923
KIRC,5492,387,5105,387,387
KIRP,3183,209,2974,209,209
LUAD,15102,1025,14077,1025,1025


1114
(11393, 59)
(11393, 59)
(33464, 59)
(33464, 59)
(4989, 59)
(4989, 59)
(43990, 59)
(43990, 59)
(5492, 59)
(5492, 59)
(3183, 59)
(3183, 59)
(15102, 59)
(15102, 59)
BLCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
BRCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
Grouping by:  ['gc_1m']
GBM
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
COAD
Grouping by:  ['trinuc', 

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,11393,703,10690,703,703
BRCA,33464,2140,31324,2140,2140
GBM,4989,305,4684,305,305
COAD,43990,2690,41300,2690,2690
KIRC,5492,387,5105,387,387
KIRP,3183,223,2960,223,223
LUAD,15102,906,14196,906,906


1114
(11393, 59)
(11393, 59)
(33464, 59)
(33464, 59)
(4989, 59)
(4989, 59)
(43990, 59)
(43990, 59)
(5492, 59)
(5492, 59)
(3183, 59)
(3183, 59)
(15102, 59)
(15102, 59)
BLCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
BRCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
GBM
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
COAD
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_su

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,11393,843,10550,843,843
BRCA,33464,2458,31006,2458,2458
GBM,4989,430,4559,430,430
COAD,43990,3172,40818,3172,3172
KIRC,5492,364,5128,364,364
KIRP,3183,219,2964,219,219
LUAD,15102,1101,14001,1101,1101


1114
(11393, 59)
(11393, 59)
(33464, 59)
(33464, 59)
(4989, 59)
(4989, 59)
(43990, 59)
(43990, 59)
(5492, 59)
(5492, 59)
(3183, 59)
(3183, 59)
(15102, 59)
(15102, 59)
BLCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
Grouping by:  ['gc_1m']
BRCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
GBM
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
COAD
Grouping by:  ['trinuc', 

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,11393,776,10617,776,776
BRCA,33464,2404,31060,2404,2404
GBM,4989,375,4614,375,375
COAD,43990,3047,40943,3047,3047
KIRC,5492,365,5127,365,365
KIRP,3183,226,2957,226,226
LUAD,15102,1071,14031,1071,1071


1114
(11393, 59)
(11393, 59)
(33464, 59)
(33464, 59)
(4989, 59)
(4989, 59)
(43990, 59)
(43990, 59)
(5492, 59)
(5492, 59)
(3183, 59)
(3183, 59)
(15102, 59)
(15102, 59)
BLCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
BRCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
GBM
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
COAD
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_su

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,11393,701,10692,701,701
BRCA,33464,2182,31282,2182,2182
GBM,4989,338,4651,338,338
COAD,43990,2830,41160,2830,2830
KIRC,5492,409,5083,409,409
KIRP,3183,217,2966,217,217
LUAD,15102,983,14119,983,983


In [12]:
# for i in range(5):
#     pli_matched_genes = read_pickle(f"../process_tcga_icgc/pli_matched_oncogenes_x2{i}.pkl")
#     print(len(pli_matched_genes))
    
#     assert len(set(pli_matched_genes).intersection(set(pancan_genes))) == 0
    
#     pancan_matched_dict_pli, nonpancan_matched_dict_pli = get_match(somatic_df_dict,pli_matched_genes)
    
#     write_pickle(pancan_matched_dict_pli,f"pancan_matched_dict_pli_onco_x2{i}.pkl")
#     write_pickle(nonpancan_matched_dict_pli,f"nonpancan_matched_dict_pli_onco_x2{i}.pkl")
    

### 5 random gene sets - also total mutations matched

In [13]:
# Repeat the matching for each of the five pre-computed random gene sets
# (files random_genes_0 .. random_genes_4) and store the results per set.
for i in range(5):
    random_genes = read_pickle(f"../process_tcga_icgc/random_genes_{i}.pkl")
    print(len(random_genes))

    # a control gene set must not share any gene with the OG/TSG reference list
    assert set(random_genes).isdisjoint(set(pancan_genes))

    pancan_matched_dict_random, nonpancan_matched_dict_random = get_match(somatic_df_dict,random_genes)

    # one pickle per returned dict, suffixed with the gene-set index
    write_pickle(pancan_matched_dict_random,f"pancan_matched_dict_random_{i}.pkl")
    write_pickle(nonpancan_matched_dict_random,f"nonpancan_matched_dict_random_{i}.pkl")
    

1130
(11393, 59)
(11393, 59)
(33464, 59)
(33464, 59)
(4989, 59)
(4989, 59)
(43990, 59)
(43990, 59)
(5492, 59)
(5492, 59)
(3183, 59)
(3183, 59)
(15102, 59)
(15102, 59)
BLCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
Grouping by:  ['gc_1m']
BRCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
Grouping by:  ['gc_1m']
GBM
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
COAD
G

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,11393,749,10644,749,749
BRCA,33464,1965,31499,1965,1965
GBM,4989,283,4706,283,283
COAD,43990,2532,41458,2531,2531
KIRC,5492,328,5164,328,328
KIRP,3183,192,2991,192,192
LUAD,15102,903,14199,903,903


1130
(11393, 59)
(11393, 59)
(33464, 59)
(33464, 59)
(4989, 59)
(4989, 59)
(43990, 59)
(43990, 59)
(5492, 59)
(5492, 59)
(3183, 59)
(3183, 59)
(15102, 59)
(15102, 59)
BLCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
BRCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
Grouping by:  ['gc_1m']
GBM
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
COAD
Grouping by:  ['trinuc', 

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,11393,667,10726,667,667
BRCA,33464,2033,31431,2033,2033
GBM,4989,309,4680,309,309
COAD,43990,2687,41303,2687,2687
KIRC,5492,340,5152,340,340
KIRP,3183,165,3018,165,165
LUAD,15102,929,14173,929,929


1130
(11393, 59)
(11393, 59)
(33464, 59)
(33464, 59)
(4989, 59)
(4989, 59)
(43990, 59)
(43990, 59)
(5492, 59)
(5492, 59)
(3183, 59)
(3183, 59)
(15102, 59)
(15102, 59)
BLCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
BRCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
GBM
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
COAD
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_su

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,11393,654,10739,654,654
BRCA,33464,1882,31582,1882,1882
GBM,4989,275,4714,275,275
COAD,43990,2576,41414,2576,2576
KIRC,5492,313,5179,313,313
KIRP,3183,201,2982,201,201
LUAD,15102,843,14259,843,843


1130
(11393, 59)
(11393, 59)
(33464, 59)
(33464, 59)
(4989, 59)
(4989, 59)
(43990, 59)
(43990, 59)
(5492, 59)
(5492, 59)
(3183, 59)
(3183, 59)
(15102, 59)
(15102, 59)
BLCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
Grouping by:  ['gc_1m']
BRCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
GBM
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
COAD
Grouping by:  ['trinuc', 

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,11393,659,10734,659,659
BRCA,33464,2062,31402,2062,2062
GBM,4989,362,4627,362,362
COAD,43990,2798,41192,2798,2798
KIRC,5492,319,5173,319,319
KIRP,3183,178,3005,178,178
LUAD,15102,922,14180,922,922


1130
(11393, 59)
(11393, 59)
(33464, 59)
(33464, 59)
(4989, 59)
(4989, 59)
(43990, 59)
(43990, 59)
(5492, 59)
(5492, 59)
(3183, 59)
(3183, 59)
(15102, 59)
(15102, 59)
BLCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
BRCA
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
GBM
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete', 'distance_to_tss_discrete']
Grouping by:  ['gc_2m', 'distance_to_summit_discrete']
Grouping by:  ['gc_2m']
COAD
Grouping by:  ['trinuc', 'gc_2m', 'distance_to_su

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,11393,638,10755,638,638
BRCA,33464,2026,31438,2026,2026
GBM,4989,278,4711,278,278
COAD,43990,2871,41119,2871,2871
KIRC,5492,314,5178,314,314
KIRP,3183,182,3001,182,182
LUAD,15102,991,14111,990,990


In [14]:
# Completion marker: printed only once every cell above has run without error.
print("Done")

Done
