In [1]:
import pandas as pd
import numpy as np
from seq2atac.stable import read_pickle, write_pickle

  from .autonotebook import tqdm as notebook_tqdm


### ISM thresholds

In [2]:
# Load the pickled object whose .items() are iterated below as (key, values) pairs
# and fed to np.percentile. Presumably per-cancer null motif-ISM scores, judging by
# the file name -- TODO confirm.
ism_thresh_distro = read_pickle("/illumina/scratch/deep_learning/nravindra/results/reg_diffs/motifism_cancers_null_v22.pkl")


In [3]:
# Percentile levels (0-100 scale, as expected by np.percentile) at which thresholds are tabulated.
all_percentiles = [0,5,25,50,75,90,95,97.5,99,99.5,99.9,100]
all_percentiles

In [4]:
# Build a nested lookup {percentile: {key: threshold}} where each threshold is the
# requested percentile of that key's value array in ism_thresh_distro.
ism_thresh_percentile = dict((pct, {}) for pct in all_percentiles)

for cancer, null_scores in ism_thresh_distro.items():
    print(cancer)
    for pct in all_percentiles:
        print(pct)
        threshold = np.percentile(null_scores, pct)
        ism_thresh_percentile[pct][cancer] = threshold

# Display the 95th-percentile thresholds.
ism_thresh_percentile[95]

brca
0
5
25
50
75
90
95
97.5
99
99.5
99.9
100
blca
0
5
25
50
75
90
95
97.5
99
99.5
99.9
100
coad
0
5
25
50
75
90
95
97.5
99
99.5
99.9
100
luad
0
5
25
50
75
90
95
97.5
99
99.5
99.9
100
kirp
0
5
25
50
75
90
95
97.5
99
99.5
99.9
100
kirc
0
5
25
50
75
90
95
97.5
99
99.5
99.9
100
gbm
0
5
25
50
75
90
95
97.5
99
99.5
99.9
100
skcm
0
5
25
50
75
90
95
97.5
99
99.5
99.9
100


{'brca': 0.17696281257796992,
 'blca': 0.2097989422114596,
 'coad': 0.19172578644761995,
 'luad': 0.27739862176416474,
 'kirp': 0.23343622358881733,
 'kirc': 0.2235218676975348,
 'gbm': 0.21235916285438658,
 'skcm': 0.19640249133953247}

In [5]:
# Persist the {percentile: {key: threshold}} mapping under a relative file name.
write_pickle(ism_thresh_percentile,"ism_thresh_percentile.pkl")

### Peak probability distribution for each model - 5th percentile

In [6]:
#### get peak probability distribution for each model
peak_proba_thresh = {}
for cancer_name in ["BLCA","BRCA","GBM","COAD","KIRC","KIRP","LUAD"]:

    print(cancer_name)

    # Fold the five per-fold prediction tables into one frame.
    # None marks "no fold loaded yet".
    master_df = None
    for fold_idx in range(5):
        print(fold_idx)
        preds_df = pd.read_csv(f"/illumina/scratch/deep_learning/akumar22/TCGA/models_250_1364_minibatch_prejitter/{cancer_name}/fold_{fold_idx}/peak_preds.csv")
        if master_df is None:
            master_df = preds_df
            continue
        # pd.merge without keys joins on every shared column; the join must
        # neither drop nor duplicate rows.
        master_len_before = len(master_df)
        master_df = pd.merge(master_df, preds_df)
        assert len(master_df) == master_len_before

    # Average the five fold predictions per row, then take the 5% quantile
    # of that average as this cancer's threshold.
    fold_total = (
        master_df["preds_0"]
        + master_df["preds_1"]
        + master_df["preds_2"]
        + master_df["preds_3"]
        + master_df["preds_4"]
    )
    master_df["model_avg"] = fold_total/5
    peak_proba_thresh[cancer_name] = master_df["model_avg"].quantile(0.05)

# RECA is given the same threshold as KIRC.
peak_proba_thresh['RECA'] = peak_proba_thresh['KIRC']
peak_proba_thresh

BLCA
0
1
2
3
4
BRCA
0
1
2
3
4
GBM
0
1
2
3
4
COAD
0
1
2
3
4
KIRC
0
1
2
3
4
KIRP
0
1
2
3
4
LUAD
0
1
2
3
4


{'BLCA': 0.30038172008,
 'BRCA': 0.40271141012000006,
 'GBM': 0.256060285,
 'COAD': 0.2942905675,
 'KIRC': 0.39631355571677007,
 'KIRP': 0.2664290534,
 'LUAD': 0.24252313195000003,
 'RECA': 0.39631355571677007}

In [7]:
# Persist the per-cancer 5%-quantile thresholds (including the RECA alias) under a relative file name.
write_pickle(peak_proba_thresh,"peak_proba_thresh.pkl")

### Pan-cancer, oncogene (OG), and tumor suppressor gene (TSG) lists

In [8]:
cosmos_pancan_file = "/illumina/scratch/deep_learning/asalcedo/cancer_gene_census.csv"
cosmos_pancan_df = pd.read_csv(cosmos_pancan_file)
display(cosmos_pancan_df)

# Keep only genes that have a "Role in Cancer" annotation and whose role is
# not exactly "fusion" (roles such as "oncogene, fusion" are retained).
has_role = cosmos_pancan_df["Role in Cancer"].notna()
not_fusion_only = cosmos_pancan_df["Role in Cancer"].ne("fusion")
cosmos_pancan_df = cosmos_pancan_df[has_role & not_fusion_only]

# Unique gene symbols that survive the role filter.
pancan_genes = list(cosmos_pancan_df["Gene Symbol"].unique())
# NOTE(review): ingene_indicator is only imported in a later cell; this lambda is
# defined here but not called in this cell, so nothing fails at this point.
ispancan = lambda x : ingene_indicator(x, pancan_genes)
len(pancan_genes)

Unnamed: 0,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,Germline,Tumour Types(Somatic),...,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,COSMIC ID,cosmic gene name,Synonyms
0,AR,Androgen Receptor,367.0,X:67544036-67730619,1,Yes,12.00,yes,yes,prostate,...,E,Dom,oncogene,Mis,,yes,"Androgen insensitivity, Hypospadias 1, X-linke...",COSG292497,AR,"367,AIS,AR,DHTR,ENSG00000169083.16,HUMARA,NR3C..."
1,FH,fumarate hydratase,2271.0,1:241497603-241519761,1,,43.00,,yes,,...,"E, M",Rec,TSG,"Mis, N, F",,,,COSG255037,FH,"2271,ENSG00000091483.6,FH,P07954"
2,ALK,anaplastic lymphoma kinase (Ki-1),238.0,2:29192774-29921566,1,Yes,23.20,yes,yes,"ALCL, NSCLC, neuroblastoma, inflammatory myofi...",...,"L, E, M",Dom,"oncogene, fusion","T, Mis, A","NPM1, TPM3, TFG, TPM4, ATIC, CLTC, MSN, RNF213...",,,COSG383409,ALK,"238,ALK,CD246,ENSG00000171094.17,Q9UM73"
3,APC,adenomatous polyposis of the colon gene,324.0,5:112737888-112846239,1,Yes,22.20,yes,yes,"colorectal, pancreatic, desmoid, hepatoblastom...",...,"E, M, O",Rec,TSG,"D, Mis, N, F, S",,,,COSG208824,APC,"324,APC,DP2,DP2.5,DP3,ENSG00000134982.16,P2505..."
4,ATM,ataxia telangiectasia mutated,472.0,11:108222832-108369099,1,Yes,22.30,yes,yes,T-PLL,...,"L, O",Rec,TSG,"D, Mis, N, F, S",,,,COSG358825,ATM,"472,ATA,ATC,ATD,ATDC,ATM,ENSG00000149311.18,Q1..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728,RAP1GDS1,"RAP1, GTP-GDP dissociation stimulator 1",5910.0,4:98261518-98442334,1,Yes,23.00,yes,,T-ALL,...,L,Dom,"oncogene, fusion",T,NUP98,,,COSG382853,RAP1GDS1,"5910,ENSG00000138698.14,P52306,RAP1GDS1,SmgGDS"
729,TNFRSF14,"tumor necrosis factor receptor superfamily, me...",8764.0,1:2556366-2563829,1,,36.32,yes,,follicular lymphoma,...,L,Rec,TSG,"Mis, N, F",,,,COSG247544,TNFRSF14,"8764,ATAR,CD270,ENSG00000157873.17,HVEA,HVEM,L..."
730,TNFRSF17,"tumor necrosis factor receptor superfamily, me...",608.0,16:11965107-11968068,1,,13.13,yes,,intestinal T-cell lymphoma,...,L,Dom,"oncogene, fusion",T,IL2,,,COSG208492,TNFRSF17,"608,BCM,BCMA,CD269,ENSG00000048462.10,Q02223,T..."
731,ARHGEF10L,Rho guanine nucleotide exchange factor 10 like,55160.0,1:17539835-17697869,2,,36.13,yes,,lymphoma,...,L,,TSG,D,,,,COSG269628,ARHGEF10L,"55160,ARHGEF10L,ENSG00000074964.16,FLJ10521,KI..."


565

In [9]:
# Save the combined gene list (every gene kept by the role filter) as "ogtsg.pkl".
write_pickle(pancan_genes,"ogtsg.pkl")

In [10]:
cosmos_pancan_file = "/illumina/scratch/deep_learning/asalcedo/cancer_gene_census.csv"
# Re-read the census, drop genes without a role annotation, then drop the
# genes whose role is exactly "fusion".
cosmos_pancan_df = pd.read_csv(cosmos_pancan_file).dropna(subset=["Role in Cancer"])
cosmos_pancan_df = cosmos_pancan_df[cosmos_pancan_df["Role in Cancer"].ne("fusion")]

# Oncogene list = every remaining gene whose role is not purely TSG
# ('TSG' or 'TSG, fusion'); de-duplicated through a set, so order is arbitrary.
is_tsg_only = cosmos_pancan_df["Role in Cancer"].isin(['TSG, fusion','TSG'])
oncogenes = list(set(cosmos_pancan_df.loc[~is_tsg_only, "Gene Symbol"].tolist()))
len(oncogenes)

318

In [11]:
# Save the oncogene subset as "og.pkl". This must be `oncogenes`, not the combined
# `pancan_genes` list (which is saved separately as "ogtsg.pkl"); otherwise og.pkl
# would just duplicate the combined list.
write_pickle(oncogenes,"og.pkl")

In [12]:
cosmos_pancan_file = "/illumina/scratch/deep_learning/asalcedo/cancer_gene_census.csv"
# Re-read the census, drop genes without a role annotation, then drop the
# genes whose role is exactly "fusion".
cosmos_pancan_df = pd.read_csv(cosmos_pancan_file).dropna(subset=["Role in Cancer"])
cosmos_pancan_df = cosmos_pancan_df[cosmos_pancan_df["Role in Cancer"].ne("fusion")]

# TSG list = every remaining gene whose role is not purely oncogene
# ('oncogene' or 'oncogene, fusion'); de-duplicated through a set, so order is arbitrary.
is_oncogene_only = cosmos_pancan_df["Role in Cancer"].isin(['oncogene, fusion','oncogene'])
tsg = list(set(cosmos_pancan_df.loc[~is_oncogene_only, "Gene Symbol"].tolist()))
len(tsg)

320

In [13]:
# Save the tumor-suppressor subset as "tsg.pkl". This must be `tsg`, not the combined
# `pancan_genes` list (which is saved separately as "ogtsg.pkl"); otherwise tsg.pkl
# would just duplicate the combined list.
write_pickle(tsg,"tsg.pkl")

### pLI matched genes

In [14]:
from seq2atac.analysis.mutation_utils import ingene_indicator
from seq2atac.analysis.sample_controls import matching_logic

In [15]:
# IPython shell escape: copy the gnomAD v2.1.1 per-gene LoF metrics table into the current directory.
!cp /illumina/scratch/deep_learning/hgao/fitness/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt .

In [16]:
# Read the tab-separated gnomAD per-gene LoF metrics table (from its original
# location, not the local copy) and list its columns.
all_gene_pli = pd.read_csv("/illumina/scratch/deep_learning/hgao/fitness/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt",sep="\t")
all_gene_pli.columns

Index(['gene', 'transcript', 'obs_mis', 'exp_mis', 'oe_mis', 'mu_mis',
       'possible_mis', 'obs_mis_pphen', 'exp_mis_pphen', 'oe_mis_pphen',
       'possible_mis_pphen', 'obs_syn', 'exp_syn', 'oe_syn', 'mu_syn',
       'possible_syn', 'obs_lof', 'mu_lof', 'possible_lof', 'exp_lof', 'pLI',
       'pNull', 'pRec', 'oe_lof', 'oe_syn_lower', 'oe_syn_upper',
       'oe_mis_lower', 'oe_mis_upper', 'oe_lof_lower', 'oe_lof_upper',
       'constraint_flag', 'syn_z', 'mis_z', 'lof_z', 'oe_lof_upper_rank',
       'oe_lof_upper_bin', 'oe_lof_upper_bin_6', 'n_sites', 'classic_caf',
       'max_af', 'no_lofs', 'obs_het_lof', 'obs_hom_lof', 'defined', 'p',
       'exp_hom_lof', 'classic_caf_afr', 'classic_caf_amr', 'classic_caf_asj',
       'classic_caf_eas', 'classic_caf_fin', 'classic_caf_nfe',
       'classic_caf_oth', 'classic_caf_sas', 'p_afr', 'p_amr', 'p_asj',
       'p_eas', 'p_fin', 'p_nfe', 'p_oth', 'p_sas', 'transcript_type',
       'gene_id', 'transcript_level', 'cds_length', 'num_codi

In [17]:
# Lookups keyed by gene symbol: gene -> pLI and gene -> oe_lof_upper.
# If a gene symbol occurs on several rows, the last row's value wins.
gene_to_pli_dict = {
    gene: pli
    for gene, pli in zip(all_gene_pli["gene"], all_gene_pli["pLI"])
}
gene_to_loeuf_dict = {
    gene: loeuf
    for gene, loeuf in zip(all_gene_pli["gene"], all_gene_pli["oe_lof_upper"])
}
gene_to_pli_dict, gene_to_loeuf_dict

({'MED13': 1.0,
  'NIPBL': 1.0,
  'SMC3': 1.0,
  'CNOT1': 1.0,
  'RLF': 1.0,
  'PCF11': 1.0,
  'FNDC3B': 1.0,
  'TAF1': 1.0,
  'RSF1': 1.0,
  'NCKAP1': 1.0,
  'KDM2A': 1.0,
  'BRD4': 1.0,
  'HELZ': 1.0,
  'FBN1': 1.0,
  'XPO1': 1.0,
  'PRR12': 1.0,
  'USP9X': 1.0,
  'POLA1': 1.0,
  'SYNGAP1': 1.0,
  'PRKDC': 1.0,
  'HDAC4': 1.0,
  'SMG1': 1.0,
  'ZC3H4': 1.0,
  'COL5A1': 1.0,
  'SMARCA4': 1.0,
  'TNPO1': 1.0,
  'AGO1': 1.0,
  'ARHGAP35': 1.0,
  'LRP1': 1.0,
  'TOP1': 1.0,
  'TRIP12': 1.0,
  'KMT2E': 1.0,
  'HCFC1': 1.0,
  'UBTF': 1.0,
  'HUWE1': 1.0,
  'KDM3B': 1.0,
  'TRRAP': 1.0,
  'GRIN2B': 1.0,
  'USP7': 1.0,
  'ATP1A3': 1.0,
  'ASH1L': 1.0,
  'ANKRD17': 1.0,
  'SMC1A': 1.0,
  'NPEPPS': 1.0,
  'CHERP': 1.0,
  'MED13L': 1.0,
  'RPRD2': 1.0,
  'KMT2A': 1.0,
  'CREBBP': 1.0,
  'SF3B1': 1.0,
  'EIF3B': 1.0,
  'PPFIA1': 1.0,
  'ZSWIM6': 1.0,
  'PRPF3': 1.0,
  'KAT6A': 1.0,
  'KMT2B': 1.0,
  'SIN3A': 1.0,
  'CHD2': 1.0,
  'ARID1A': 1.0,
  'GLTSCR1L': 1.0,
  'SCN1A': 1.0,
  'MED12': 1.0,


In [18]:
# Add one indicator column per gene set by calling ingene_indicator(gene, gene_list)
# on every gene symbol (the rows with value 1 are counted below).
for flag_col, gene_list in (
    ("closest_pancan", pancan_genes),
    ("closest_og", oncogenes),
    ("closest_tsg", tsg),
):
    all_gene_pli[flag_col] = all_gene_pli["gene"].apply(
        lambda gene, genes=gene_list: ingene_indicator(gene, genes)
    )

# pLI rounded to two decimals with Python's round().
all_gene_pli["pLI_2"] = all_gene_pli["pLI"].apply(lambda score: round(score, 2))

# Shapes of the flagged subsets, in the order (pancan, og, tsg).
tuple(
    all_gene_pli[all_gene_pli[flag_col] == 1].shape
    for flag_col in ("closest_pancan", "closest_og", "closest_tsg")
)


((557, 81), (312, 81), (316, 81))

### Random pLI-matched genes (x2)

In [19]:
# Build five control gene sets. Each set is the union of two matching_logic rounds
# against the closest_pancan==1 genes, keyed on the rounded pLI column "pLI_2".
# matching_logic is defined elsewhere; presumably it returns (cases, matched controls)
# frames -- TODO confirm against its implementation.
rnd_state = [94404, 94305, 92020,292299,423432432]
for i in range(5):
    # Round 1: candidates are all genes with closest_pancan == 0.
    pancan_genes_matched, nonpancan_genes_matched = matching_logic(all_gene_pli[all_gene_pli["closest_pancan"]==1],
                                                                   all_gene_pli[all_gene_pli["closest_pancan"]==0],
                                                                   levels=[["pLI_2"]], verbose=False)
    nonpancan_genes_matched = list(nonpancan_genes_matched["gene"].unique())


    # Round 2: same call, but candidates exclude the genes already picked in round 1.
    pancan_genes_matched, nonpancan_genes_matched2 = matching_logic(all_gene_pli[all_gene_pli["closest_pancan"]==1],
                                                                   all_gene_pli[(all_gene_pli["closest_pancan"]==0) & (~all_gene_pli["gene"].isin(nonpancan_genes_matched))],
                                                                   levels=[["pLI_2"]], verbose=False)

    # Concatenate the two rounds into one gene list for this replicate.
    nonpancan_genes_matched2 = list(nonpancan_genes_matched2["gene"].unique())
    nonpancan_genes_matched += nonpancan_genes_matched2

    print(len(nonpancan_genes_matched))
    
    write_pickle(nonpancan_genes_matched,f"pli_matched_genes_x2{i}.pkl")
    
    # Reshuffle the rows with a fixed seed so the next iteration sees a different row order.
    # NOTE(review): the shuffle runs after the write, so replicate 0 uses the original row
    # order and the last seed only affects the frame left behind after the loop. It also
    # rebinds all_gene_pli, so re-running this cell starts from a shuffled frame.
    all_gene_pli = all_gene_pli.sample(frac=1,random_state=rnd_state[i])
    

Grouping by:  ['pLI_2']
Grouping by:  ['pLI_2']
1114
Grouping by:  ['pLI_2']
Grouping by:  ['pLI_2']
1114
Grouping by:  ['pLI_2']
Grouping by:  ['pLI_2']
1114
Grouping by:  ['pLI_2']
Grouping by:  ['pLI_2']
1114
Grouping by:  ['pLI_2']
Grouping by:  ['pLI_2']
1114


In [13]:
# rnd_state = [94404, 94305, 92020,292299,423432432]
# for i in range(5):
#     pancan_genes_matched, nonpancan_genes_matched = matching_logic(all_gene_pli[all_gene_pli["closest_og"]==1],
#                                                                    all_gene_pli[(all_gene_pli["closest_og"]==0) & (all_gene_pli["closest_pancan"]==0)],
#                                                                    levels=[["pLI_2"]], verbose=False)
#     nonpancan_genes_matched = list(nonpancan_genes_matched["gene"].unique())


#     pancan_genes_matched, nonpancan_genes_matched2 = matching_logic(all_gene_pli[all_gene_pli["closest_og"]==1],
#                                                                    all_gene_pli[(all_gene_pli["closest_og"]==0)  & (all_gene_pli["closest_pancan"]==0) & (~all_gene_pli["gene"].isin(nonpancan_genes_matched))],
#                                                                    levels=[["pLI_2"]], verbose=False)

#     nonpancan_genes_matched2 = list(nonpancan_genes_matched2["gene"].unique())
#     nonpancan_genes_matched += nonpancan_genes_matched2

#     print(len(nonpancan_genes_matched))
    
#     write_pickle(nonpancan_genes_matched,f"pli_matched_oncogenes_x2{i}.pkl")
    
#     all_gene_pli = all_gene_pli.sample(frac=1,random_state=rnd_state[i])
    

Grouping by:  ['pLI_2']
Grouping by:  ['pLI_2']
624
Grouping by:  ['pLI_2']
Grouping by:  ['pLI_2']
624
Grouping by:  ['pLI_2']
Grouping by:  ['pLI_2']
624
Grouping by:  ['pLI_2']
Grouping by:  ['pLI_2']
624
Grouping by:  ['pLI_2']
Grouping by:  ['pLI_2']
624


In [None]:
# rnd_state = [94404, 94305, 92020,292299,423432432]
# for i in range(5):
#     pancan_genes_matched, nonpancan_genes_matched = matching_logic(all_gene_pli[all_gene_pli["closest_tsg"]==1],
#                                                                    all_gene_pli[(all_gene_pli["closest_tsg"]==0) & (all_gene_pli["closest_pancan"]==0)],
#                                                                    levels=[["pLI_2"]], verbose=False)
#     nonpancan_genes_matched = list(nonpancan_genes_matched["gene"].unique())


#     pancan_genes_matched, nonpancan_genes_matched2 = matching_logic(all_gene_pli[all_gene_pli["closest_tsg"]==1],
#                                                                    all_gene_pli[(all_gene_pli["closest_tsg"]==0) & (all_gene_pli["closest_pancan"]==0) & (~all_gene_pli["gene"].isin(nonpancan_genes_matched))],
#                                                                    levels=[["pLI_2"]], verbose=False)

#     nonpancan_genes_matched2 = list(nonpancan_genes_matched2["gene"].unique())
#     nonpancan_genes_matched += nonpancan_genes_matched2

#     print(len(nonpancan_genes_matched))
    
#     write_pickle(nonpancan_genes_matched,f"pli_matched_tsgenes_x2{i}.pkl")
    
#     all_gene_pli = all_gene_pli.sample(frac=1,random_state=rnd_state[i])
    

### Generate random genesets

In [20]:
# IPython shell escape: copy the TSS positions table into the current directory.
!cp /illumina/scratch/deep_learning/nersaro/promoterAI/data/ref_data/gencodev39_cage_ratio_to_sum_refined_tss_positions_transcripts_protein_coding_inclZeros_withTranscriptID.tsv .

In [21]:
# Read the TSS table and reduce it to one row per distinct gene symbol.
tss_df = (
    pd.read_csv("/illumina/scratch/deep_learning/nersaro/promoterAI/data/ref_data/gencodev39_cage_ratio_to_sum_refined_tss_positions_transcripts_protein_coding_inclZeros_withTranscriptID.tsv",sep="\t")
    [["gene"]]
    .drop_duplicates()
)
# 1 if the gene is in the pan-cancer list, else 0.
tss_df["pancan"] = tss_df["gene"].isin(pancan_genes).astype(int)
print(tss_df)
# Distinct gene symbols (set-derived, so order is arbitrary).
all_genes = list(set(tss_df["gene"].tolist()))
print(len(all_genes))

          gene  pancan
0        OR4F5       0
1       OR4F29       0
2       OR4F16       0
3       SAMD11       0
4        NOC2L       0
...        ...     ...
18763    OR1N2       0
18764    OR1L1       0
18765    OR1L6       0
18766  C9orf50       0
18767     QRFP       0

[18761 rows x 2 columns]
18761


In [22]:
# Genes from the TSS table that are not in the pan-cancer list.
nonpancan_df = tss_df.loc[tss_df["pancan"].eq(0)]
nonpancan_df

Unnamed: 0,gene,pancan
0,OR4F5,0
1,OR4F29,0
2,OR4F16,0
3,SAMD11,0
4,NOC2L,0
...,...,...
18763,OR1N2,0
18764,OR1L1,0
18765,OR1L6,0
18766,C9orf50,0


In [23]:
# Draw five disjoint random gene sets from the non-pancan genes, each twice the
# size of the pan-cancer list. The seed is the same every round; the draws differ
# because previously drawn genes are removed from the pool first.
random_nonpancan_genes = []
for i in range(5):

    already_drawn = nonpancan_df["gene"].isin(random_nonpancan_genes)
    unused_pool = nonpancan_df[~already_drawn]
    random_nonpancan_genes_new = unused_pool.sample(2*len(pancan_genes),random_state=94404)["gene"].tolist()

    random_nonpancan_genes.extend(random_nonpancan_genes_new)
    write_pickle(random_nonpancan_genes_new,f"random_genes_{i}.pkl")
    

In [24]:
# Completion marker: reaching this cell means every cell above ran without raising.
print("Done")

Done
