### Figure 5 master notebook

What does it do?
1. Inside/Outside peak enrichment
2. Model and ISM based prioritization
3. Supplement: using random pli-matched genes

Pre-requisite files
- Gene list - From CGC: Only Oncogenes / TSGs (PIC: Adriana)
- All genes and TSS - PromoterAI file (PIC: Nicole)
- All Vierstra archetype CTCFs (PIC: Arvind)
- Model scores (PIC: Arvind)
- ISM scores (PIC: Neal)
- Per cancer peakset (same as one used to train model) (PIC: Laksshman, copied over to Arvind's directory)
- 5 fold models (PIC: Arvind)
- Phylop file (PIC: Laksshman)
- Fasta file (PIC: Laksshman)
- PLI file: gnomad.v2.1.1.lof_metrics.by_gene.txt (PIC: Hong)

In [1]:
# Load IPython's autoreload extension; mode 2 re-imports modules before each cell runs,
# so edits to the project helper modules imported below are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, HTML
import scipy.stats
from pybedtools import BedTool
import pickle
from tqdm import tqdm
from pyfaidx import Fasta

from seq2atac.stable import compute_gc_bed, one_hot_encode, read_pickle, compute_signal, write_pickle
from seq2atac.analysis import get_promoterai_tss, get_cosmic_pancan_genes, fasta_file
from seq2atac.analysis.enrichment_utils import create_pancancer_distribution_plots, create_pancancer_correlations, create_pancancer_valuecounts, create_pancancer_distribution_plots_discrete
from seq2atac.analysis.sample_controls import matching_logic
from seq2atac.analysis.mutation_utils import compute_vierstra_groups, compute_trinuc, ingene_indicator
from seq2atac.analysis.mutation_processing_pipeline_utils import annotateNearestGene, annotateScore, filterannotatePeaks, annotatePhylopScore, annotatePhylopScorePeak, annotateBedfile

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# BED file of CTCF sites. It is passed to annotateBedfile in the per-cancer loop below,
# and rows flagged against it ("ctcf_uncleaned" != 0) are dropped there.
ctcf_vierstra_file = "../process_tcga_icgc/ctcf_vierstra.bed"


In [4]:
# Reference gene list (presumably the CGC oncogene/TSG list from the prerequisites above — confirm).
# Used below for the "closest_ogtsg" annotation and as the first gene list given to get_match.
pancan_genes = read_pickle("../process_tcga_icgc/ogtsg.pkl")
len(pancan_genes)

565

In [5]:
# Cancer-type codes iterated by every per-cancer loop in this notebook;
# they are also interpolated into the input file paths below.
all_tcga_cancers = ["BLCA","BRCA","GBM","COAD","KIRC","KIRP","LUAD"]

### Process all somatic mutations for LoF

In [6]:
# ISM scores for all cancers in a single table (first CSV column becomes the index).
# Rows for one cancer are selected later through the "sample" column (lower-cased cancer code).
ism_scores = pd.read_csv("../score_backups/ism_scores_genomewide/all_cancers_all_mutations.csv",index_col=0)

def merge_ism_scores(somatic_df,ism_df):
    """Left-join ISM scores onto a somatic-mutation table.

    No join keys are given, so pandas joins on every column the two frames
    have in common.

    Parameters
    ----------
    somatic_df : pandas.DataFrame
        Mutation table; every one of its rows is kept.
    ism_df : pandas.DataFrame
        ISM score table; must contribute a "ref" column.

    Returns
    -------
    pandas.DataFrame
        The merged table (the inputs are not modified).

    Raises
    ------
    AssertionError
        If the merge changes the number of rows, or if any row ends up with
        a missing "ref" value (i.e. a mutation found no ISM record).
    """
    n_rows_in = somatic_df.shape[0]
    merged = somatic_df.merge(ism_df, how="left")

    # a left join can only grow, which happens when ism_df has duplicate keys
    assert n_rows_in == len(merged)
    # a null "ref" marks a mutation that did not find a matching ISM row
    assert not merged["ref"].isna().any()

    return merged

# ism_scores["sample"].unique()

In [7]:
# Build one fully annotated somatic-mutation table per cancer type.
# Result: somatic_df_dict[cancer_name] -> DataFrame with model scores, ISM scores,
# gene-list indicator, CTCF-flagged rows removed, and the discretised covariates
# that the matching step further down groups on.
somatic_df_dict = {}
for cancer_name in all_tcga_cancers:
    print(cancer_name)

    ### read the processed somatic dataframes (this ensures within peaks)
    somatic_df = read_pickle(f"../somatic_filtered/{cancer_name}_filtered_annotated_somatic_regulatory.pkl")
    # each (chromosome, position, alt allele) must occur exactly once
    assert len(somatic_df.drop_duplicates(["Chromosome","hg38_start","Tumor_Seq_Allele2"])) == len(somatic_df)
    print(somatic_df.shape)


    ### annotate model scores

    ### mutation centered scores
    somatic_df = annotateScore(somatic_df,
                               f"../score_backups/summit_centered_peaks/{cancer_name}/fold_*_somatic_mutation_centered.pkl")
    # rename right away: the second annotateScore call below yields the same generic column names
    somatic_df = somatic_df.rename(columns={"proba_ref":"proba_ref_mutation_centered",
                                            "proba_alt":"proba_alt_mutation_centered"})
    # diff is reference minus alternate
    somatic_df["diff_mutation_centered"] = somatic_df["proba_ref_mutation_centered"] - somatic_df["proba_alt_mutation_centered"]

    ### summit centered scores
    # remove_foldwise=False: presumably keeps the per-fold proba_ref_{fold}/proba_alt_{fold}
    # columns, which the fold loop below reads — confirm against annotateScore
    somatic_df = annotateScore(somatic_df,
                               f"../score_backups/summit_centered_peaks/{cancer_name}/fold_*_somatic_summit_centered.pkl",
                              remove_foldwise=False)
    somatic_df = somatic_df.rename(columns={"proba_ref":"proba_ref_summit_centered",
                                            "proba_alt":"proba_alt_summit_centered"})
    somatic_df["diff_summit_centered"] = somatic_df["proba_ref_summit_centered"] - somatic_df["proba_alt_summit_centered"]
    for fold in range(5):
        somatic_df[f"fold_{fold}_diff_summit_centered"] = somatic_df[f"proba_ref_{fold}"] - somatic_df[f"proba_alt_{fold}"]


    ### annotate ism scores
    # the ISM table stores the cancer code in lower case in its "sample" column
    somatic_df = merge_ism_scores(somatic_df,
                                  ism_scores[ism_scores["sample"]==cancer_name.lower()])


    ### annotate pancan
    somatic_df["closest_ogtsg"] = somatic_df["gene"].apply(lambda x : ingene_indicator(x, pancan_genes))

    ### remove ctcfs
    somatic_df = annotateBedfile(somatic_df,
                                 ctcf_vierstra_file,
                                 "ctcf_uncleaned")
    print(somatic_df.shape)
    # .copy() makes the filtered frame independent of the unfiltered one, so the column
    # assignments below do not write into a slice (avoids pandas' SettingWithCopyWarning)
    somatic_df = somatic_df[somatic_df["ctcf_uncleaned"]==0].copy()
    print(somatic_df.shape)


    ### some annotations for matching
    # coarser versions of the covariates, used as progressively looser matching keys
    somatic_df["gc_2"] = somatic_df["gc_peak"].apply(lambda x:round(x,2))
    somatic_df["gc_1"] = somatic_df["gc_peak"].apply(lambda x:round(x,1))
    # round(x,-1) rounds to the nearest multiple of 10
    somatic_df["distance_to_summit_discrete"] = somatic_df["distance_to_summit"].apply(lambda x:round(x,-1))

    # none of the annotation steps above may have duplicated a mutation
    assert len(somatic_df.drop_duplicates(["Chromosome","hg38_start","Tumor_Seq_Allele2"])) == len(somatic_df)
    somatic_df_dict[cancer_name] = somatic_df
 

BLCA
(14219, 31)
(14219, 58)
(13593, 58)
BRCA
(38491, 31)
(38491, 58)
(36972, 58)
GBM
(5016, 31)
(5016, 58)
(4732, 58)
COAD
(40544, 31)
(40544, 58)
(39782, 58)
KIRC
(8361, 31)
(8361, 58)
(7984, 58)
KIRP
(3655, 31)
(3655, 58)
(3495, 58)
LUAD
(17010, 31)
(17010, 58)
(16028, 58)


In [8]:
### Store these dataframes
write_pickle(somatic_df_dict,"somatic_df_dict_lof.pkl")
{cancer_name:somatic_df_dict[cancer_name]["diff_summit_centered"].quantile(0.975) for cancer_name in all_tcga_cancers}


{'BLCA': 0.047121298313140714,
 'BRCA': 0.046678137779235754,
 'GBM': 0.0649726867675781,
 'COAD': 0.04962486773729317,
 'KIRC': 0.046412371098995216,
 'KIRP': 0.05527774393558505,
 'LUAD': 0.0665462560951709}

In [9]:
# Per-cancer quantile cut-offs of the ref-minus-alt score difference, computed once for
# each score kind and pickled to "threshold_<score kind>.pkl".
# Layout: thresholds[quantile][cancer_name] -> cut-off value.
# After the loop `thresholds` holds the mutation-centered values (the last kind processed).
for score_kind in ["summit_centered","mutation_centered"]:
    thresholds = {}
    for q in [0.01,0.025,0.05,0.1,0.9,0.95,0.975,0.99]:
        thresh_quant_dict = {cn:somatic_df_dict[cn][f"diff_{score_kind}"].quantile(q) for cn in all_tcga_cancers}
        thresholds[q] = thresh_quant_dict
    print(thresholds)

    write_pickle(thresholds,f"threshold_{score_kind}.pkl")

{0.01: {'BLCA': -0.057111942768096925, 'BRCA': -0.05661134839057922, 'GBM': -0.06666439831256867, 'COAD': -0.06200151786208153, 'KIRC': -0.048552260994911195, 'KIRP': -0.05301146030426025, 'LUAD': -0.07743662059307098}, 0.025: {'BLCA': -0.03862689733505249, 'BRCA': -0.036670146882534026, 'GBM': -0.04861255288124084, 'COAD': -0.04070858210325241, 'KIRC': -0.03324506804347038, 'KIRP': -0.03717219829559326, 'LUAD': -0.0480069562792778}, 0.05: {'BLCA': -0.027190136909484863, 'BRCA': -0.02477472126483917, 'GBM': -0.03535886406898499, 'COAD': -0.02590775489807129, 'KIRC': -0.021711236238479613, 'KIRP': -0.02503616511821747, 'LUAD': -0.029831188917160033}, 0.1: {'BLCA': -0.016697907447814943, 'BRCA': -0.01504697799682617, 'GBM': -0.0220944881439209, 'COAD': -0.01513502597808838, 'KIRC': -0.013126516342163086, 'KIRP': -0.013112854957580567, 'LUAD': -0.017016947269439697}, 0.9: {'BLCA': 0.016679382324218756, 'BRCA': 0.01682010293006897, 'GBM': 0.02704451084136965, 'COAD': 0.01715412735939026, '

### Generate Matching

In [10]:
def get_match(somatic_df_dict,genelist):
    """Match mutations near `genelist` genes to control mutations, per cancer type.

    For every cancer the mutation table is annotated with a "closest_pancan"
    indicator (ingene_indicator of the "gene" column against `genelist`).
    Rows with closest_pancan == 1 are the cases; the control pool is every row
    with closest_pancan == 0 AND closest_ogtsg == 0. Cases and pool are handed
    to matching_logic with four grouping levels, from strictest
    (trinuc, gc_2, distance_to_summit_discrete) to loosest
    (distance_to_summit_discrete only).

    The annotation is done on private copies: the frames inside the caller's
    `somatic_df_dict` are left untouched, so repeated calls with different gene
    lists do not leak a stale "closest_pancan" column into shared state.

    NOTE(review): whether matching_logic samples controls at random (and so
    needs a seed for reproducibility) cannot be told from here — confirm.

    Parameters
    ----------
    somatic_df_dict : dict
        cancer name -> DataFrame with at least the columns "gene",
        "closest_ogtsg" and the grouping columns listed above.
    genelist : collection of gene names
        Passed unchanged to ingene_indicator.

    Returns
    -------
    (dict, dict)
        cancer name -> first frame returned by matching_logic (cases), and
        cancer name -> second frame returned by matching_logic (matched controls).

    Raises
    ------
    AssertionError
        If, for any cancer, the matched case frame does not contain every case
        row, or the matched control frame has a different length.
    """

    all_tcga_cancers = list(somatic_df_dict.keys())

    ### gene annotation (on copies; the caller's frames are not modified)
    annotated_dict = {}
    for cancer_name in all_tcga_cancers:
        somatic_df = somatic_df_dict[cancer_name].copy()
        print(somatic_df.shape)

        somatic_df["closest_pancan"] = somatic_df["gene"].apply(lambda x : ingene_indicator(x, genelist))

        print(somatic_df.shape)
        annotated_dict[cancer_name] = somatic_df

    ### matching
    pancan_matched_dict = {}
    nonpancan_matched_dict = {}

    for cancer_name in all_tcga_cancers:
        print(cancer_name)
        somatic_df = annotated_dict[cancer_name]
        is_case = somatic_df["closest_pancan"]==1
        # controls must be away from both the supplied gene list and the OG/TSG list
        is_control_pool = (somatic_df["closest_pancan"]==0) & (somatic_df["closest_ogtsg"]==0)
        df1, df2 = matching_logic(somatic_df[is_case],
                                  somatic_df[is_control_pool],
                                  levels=[["trinuc","gc_2","distance_to_summit_discrete"],
                                         ["trinuc","gc_1","distance_to_summit_discrete"],
                                         ["trinuc","distance_to_summit_discrete"],
                                         ["distance_to_summit_discrete"]],
                                  verbose=False)

        pancan_matched_dict[cancer_name] = df1
        nonpancan_matched_dict[cancer_name] = df2

    ### sanity
    stat_columns = ["total somatic","pancan","nonpancan","matched pancan","matched not near pancan"]
    stat_rows = []
    for cancer_name in all_tcga_cancers:
        print(cancer_name)
        somatic_df = annotated_dict[cancer_name]
        n_pancan = int((somatic_df["closest_pancan"]==1).sum())
        # "nonpancan" counts every row with closest_pancan == 0, including rows near
        # OG/TSG genes that were excluded from the control pool above
        n_nonpancan = int((somatic_df["closest_pancan"]==0).sum())
        n_matched_pancan = len(pancan_matched_dict[cancer_name])
        n_matched_nonpancan = len(nonpancan_matched_dict[cancer_name])

        stat_rows.append({"total somatic": len(somatic_df),
                          "pancan": n_pancan,
                          "nonpancan": n_nonpancan,
                          "matched pancan": n_matched_pancan,
                          "matched not near pancan": n_matched_nonpancan})

        assert n_pancan == n_matched_pancan, f"{cancer_name}: not every case mutation was matched"
        assert n_matched_pancan == n_matched_nonpancan, f"{cancer_name}: case/control sizes differ"

    # one row per cancer, built in one shot instead of growing the frame cell by cell
    df_stats = pd.DataFrame(stat_rows, index=all_tcga_cancers, columns=stat_columns)
    display(df_stats.astype(int))


    return pancan_matched_dict, nonpancan_matched_dict

In [11]:
# Main analysis: match mutations near the OG/TSG gene list against controls and
# persist both sides (cases / matched controls) as separate pickles.
print(len(pancan_genes))
pancan_matched_dict, nonpancan_matched_dict = get_match(somatic_df_dict,pancan_genes)
write_pickle(pancan_matched_dict,"pancan_matched_dict_ogtsg.pkl")
write_pickle(nonpancan_matched_dict,"nonpancan_matched_dict_ogtsg.pkl")

565
(13593, 61)
(13593, 61)
(36972, 61)
(36972, 61)
(4732, 61)
(4732, 61)
(39782, 61)
(39782, 61)
(7984, 61)
(7984, 61)
(3495, 61)
(3495, 61)
(16028, 61)
(16028, 61)
BLCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
BRCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
GBM
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
COAD
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,13593,888,12705,888,888
BRCA,36972,2475,34497,2475,2475
GBM,4732,291,4441,291,291
COAD,39782,2351,37431,2351,2351
KIRC,7984,484,7500,484,484
KIRP,3495,230,3265,230,230
LUAD,16028,952,15076,952,952


### 5 random pli matched genesets - also total mutations matched

In [12]:
# Repeat the matching for each of the five pLI-matched control gene sets.
for i in range(5):
    pli_matched_genes = read_pickle(f"../process_tcga_icgc/pli_matched_genes_x2{i}.pkl")
    print(len(pli_matched_genes))

    # a control gene set must not share any gene with the OG/TSG list
    assert not (set(pli_matched_genes) & set(pancan_genes))

    pancan_matched_dict_pli, nonpancan_matched_dict_pli = get_match(somatic_df_dict,pli_matched_genes)

    # one pair of pickles (cases / matched controls) per gene set
    write_pickle(pancan_matched_dict_pli,f"pancan_matched_dict_pli_x2{i}.pkl")
    write_pickle(nonpancan_matched_dict_pli,f"nonpancan_matched_dict_pli_x2{i}.pkl")
    

1114
(13593, 61)
(13593, 61)
(36972, 61)
(36972, 61)
(4732, 61)
(4732, 61)
(39782, 61)
(39782, 61)
(7984, 61)
(7984, 61)
(3495, 61)
(3495, 61)
(16028, 61)
(16028, 61)
BLCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
BRCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
GBM
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
COAD
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discret

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,13593,928,12665,928,928
BRCA,36972,2432,34540,2432,2432
GBM,4732,332,4400,332,332
COAD,39782,2634,37148,2634,2634
KIRC,7984,572,7412,572,572
KIRP,3495,255,3240,255,255
LUAD,16028,1100,14928,1100,1100


1114
(13593, 61)
(13593, 61)
(36972, 61)
(36972, 61)
(4732, 61)
(4732, 61)
(39782, 61)
(39782, 61)
(7984, 61)
(7984, 61)
(3495, 61)
(3495, 61)
(16028, 61)
(16028, 61)
BLCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
BRCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
GBM
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
COAD
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discret

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,13593,865,12728,865,865
BRCA,36972,2320,34652,2320,2320
GBM,4732,305,4427,305,305
COAD,39782,2534,37248,2534,2534
KIRC,7984,564,7420,564,564
KIRP,3495,234,3261,234,234
LUAD,16028,990,15038,990,990


1114
(13593, 61)
(13593, 61)
(36972, 61)
(36972, 61)
(4732, 61)
(4732, 61)
(39782, 61)
(39782, 61)
(7984, 61)
(7984, 61)
(3495, 61)
(3495, 61)
(16028, 61)
(16028, 61)
BLCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
BRCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
GBM
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
COAD
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discret

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,13593,964,12629,964,964
BRCA,36972,2558,34414,2558,2558
GBM,4732,380,4352,380,380
COAD,39782,2924,36858,2924,2924
KIRC,7984,570,7414,570,570
KIRP,3495,282,3213,282,282
LUAD,16028,1089,14939,1089,1089


1114
(13593, 61)
(13593, 61)
(36972, 61)
(36972, 61)
(4732, 61)
(4732, 61)
(39782, 61)
(39782, 61)
(7984, 61)
(7984, 61)
(3495, 61)
(3495, 61)
(16028, 61)
(16028, 61)
BLCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
BRCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
GBM
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
COAD
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discret

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,13593,874,12719,874,874
BRCA,36972,2562,34410,2562,2562
GBM,4732,375,4357,375,375
COAD,39782,2723,37059,2723,2723
KIRC,7984,599,7385,599,599
KIRP,3495,273,3222,273,273
LUAD,16028,1142,14886,1142,1142


1114
(13593, 61)
(13593, 61)
(36972, 61)
(36972, 61)
(4732, 61)
(4732, 61)
(39782, 61)
(39782, 61)
(7984, 61)
(7984, 61)
(3495, 61)
(3495, 61)
(16028, 61)
(16028, 61)
BLCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
BRCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
GBM
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
COAD
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discret

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,13593,824,12769,824,824
BRCA,36972,2397,34575,2397,2397
GBM,4732,319,4413,319,319
COAD,39782,2636,37146,2636,2636
KIRC,7984,518,7466,518,518
KIRP,3495,236,3259,236,236
LUAD,16028,1080,14948,1080,1080


In [13]:
# for i in range(5):
#     pli_matched_genes = read_pickle(f"../process_tcga_icgc/pli_matched_tsgenes_x2{i}.pkl")
#     print(len(pli_matched_genes))
    
#     assert len(set(pli_matched_genes).intersection(set(pancan_genes))) == 0
    
#     pancan_matched_dict_pli, nonpancan_matched_dict_pli = get_match(somatic_df_dict,pli_matched_genes)
    
#     write_pickle(pancan_matched_dict_pli,f"pancan_matched_dict_pli_ts_x2{i}.pkl")
#     write_pickle(nonpancan_matched_dict_pli,f"nonpancan_matched_dict_pli_ts_x2{i}.pkl")
    

### 5 random genesets - also total mutations matched

In [14]:
# Repeat the matching for each of the five random control gene sets.
for i in range(5):
    random_genes = read_pickle(f"../process_tcga_icgc/random_genes_{i}.pkl")
    print(len(random_genes))

    # a control gene set must not share any gene with the OG/TSG list
    assert not (set(random_genes) & set(pancan_genes))

    pancan_matched_dict_random, nonpancan_matched_dict_random = get_match(somatic_df_dict,random_genes)

    # one pair of pickles (cases / matched controls) per gene set
    write_pickle(pancan_matched_dict_random,f"pancan_matched_dict_random_{i}.pkl")
    write_pickle(nonpancan_matched_dict_random,f"nonpancan_matched_dict_random_{i}.pkl")
    

1130
(13593, 61)
(13593, 61)
(36972, 61)
(36972, 61)
(4732, 61)
(4732, 61)
(39782, 61)
(39782, 61)
(7984, 61)
(7984, 61)
(3495, 61)
(3495, 61)
(16028, 61)
(16028, 61)
BLCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
BRCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
GBM
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
COAD
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discret

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,13593,820,12773,820,820
BRCA,36972,2241,34731,2241,2241
GBM,4732,266,4466,266,266
COAD,39782,2328,37454,2328,2328
KIRC,7984,452,7532,452,452
KIRP,3495,225,3270,225,225
LUAD,16028,966,15062,966,966


1130
(13593, 61)
(13593, 61)
(36972, 61)
(36972, 61)
(4732, 61)
(4732, 61)
(39782, 61)
(39782, 61)
(7984, 61)
(7984, 61)
(3495, 61)
(3495, 61)
(16028, 61)
(16028, 61)
BLCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
BRCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
GBM
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
COAD
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discret

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,13593,791,12802,791,791
BRCA,36972,2126,34846,2126,2126
GBM,4732,256,4476,256,256
COAD,39782,2408,37374,2408,2408
KIRC,7984,524,7460,524,524
KIRP,3495,181,3314,181,181
LUAD,16028,966,15062,966,966


1130
(13593, 61)
(13593, 61)
(36972, 61)
(36972, 61)
(4732, 61)
(4732, 61)
(39782, 61)
(39782, 61)
(7984, 61)
(7984, 61)
(3495, 61)
(3495, 61)
(16028, 61)
(16028, 61)
BLCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
BRCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
GBM
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
COAD
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discret

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,13593,770,12823,770,770
BRCA,36972,1999,34973,1999,1999
GBM,4732,277,4455,277,277
COAD,39782,2303,37479,2303,2303
KIRC,7984,448,7536,448,448
KIRP,3495,233,3262,233,233
LUAD,16028,906,15122,906,906


1130
(13593, 61)
(13593, 61)
(36972, 61)
(36972, 61)
(4732, 61)
(4732, 61)
(39782, 61)
(39782, 61)
(7984, 61)
(7984, 61)
(3495, 61)
(3495, 61)
(16028, 61)
(16028, 61)
BLCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
BRCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
GBM
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
COAD
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discret

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,13593,773,12820,773,773
BRCA,36972,2271,34701,2271,2271
GBM,4732,292,4440,292,292
COAD,39782,2403,37379,2403,2403
KIRC,7984,491,7493,491,491
KIRP,3495,192,3303,192,192
LUAD,16028,1023,15005,1023,1023


1130
(13593, 61)
(13593, 61)
(36972, 61)
(36972, 61)
(4732, 61)
(4732, 61)
(39782, 61)
(39782, 61)
(7984, 61)
(7984, 61)
(3495, 61)
(3495, 61)
(16028, 61)
(16028, 61)
BLCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
BRCA
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
GBM
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'distance_to_summit_discrete']
Grouping by:  ['distance_to_summit_discrete']
COAD
Grouping by:  ['trinuc', 'gc_2', 'distance_to_summit_discrete']
Grouping by:  ['trinuc', 'gc_1', 'distance_to_summit_discret

Unnamed: 0,total somatic,pancan,nonpancan,matched pancan,matched not near pancan
BLCA,13593,858,12735,858,858
BRCA,36972,2215,34757,2215,2215
GBM,4732,374,4358,374,374
COAD,39782,2517,37265,2517,2517
KIRC,7984,469,7515,469,469
KIRP,3495,200,3295,200,200
LUAD,16028,977,15051,977,977


In [1]:
# Completion marker: printed once execution reaches the end of the notebook.
print("Done")

Done
