## Creating a custom panel for use in small sequencers for lung cancer mutations

### Strategy: Look for mutational hotspots for lung cancer in the COSMIC Database

In [1]:
# --- imports: standard library, third-party, then project-local -------------
import os
import sys

import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display

# the project helpers live in ../code, which must be on sys.path before importing them
sys.path.append('../code')
from script_utils import show_output

# --- display settings for more readable cell output --------------------------
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 200)

# --- paths --------------------------------------------------------------------
# both roots are read from environment variables; a missing variable raises KeyError here
local_path = os.path.join(os.environ['WORK'], "LO/Sequencing/LungCustomPanel")
static_path = os.path.join(os.environ['STATIC'], "annotation/clinical")

## putting it all together

In [2]:
from cosmic_panel import cosmic_panel_master, analyze_genes

# Exonic COSMIC annotations: the file carries a .csv name but is read as a
# tab-separated, gzip-compressed table, so both options are passed explicitly.
cosmic_exon_path = os.path.join(static_path, "cosmic_exon.csv")
cosmic_exon_df = pd.read_csv(
    cosmic_exon_path,
    compression="gzip",
    sep="\t",
)

In [8]:
# Filter settings handed to cosmic_panel_master as `filter_setting`.
custom_filter = {
    "exonic_list": ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    "mut_list": [
        'nonsynonymous SNV',
        'stopgain',
        'startloss',
        'stoploss',
        'frameshift deletion',
        'nonframeshift deletion',
    ],
    "gnomad_max": 1e-2,
    "cosmic_rolling_min": 500,
    "rolling_window_size": 5,
    "cosmic_min": 3500,
    "cosmic_density_min": 250,
    "padding": 75,
}

# weights file passed as `cosmic_weights_file` for this first run on the raw exon table
clinscore_file = "../configs/clinscoreLung_with_geneboost.yaml"

panel_muts, regions_df, cosmic_scored = cosmic_panel_master(
    cosmic_exon_df,
    cosmic_weights_file=clinscore_file,
    filter_setting=custom_filter,
    threads=10,
    verbose=1,
)

[1;30;1mCreating custom panel based on limits set in filter settings.
	[cosmic_rolling_min:	500]
	[rolling_window_size:	5]
	[cosmic_min:	3500]
	[cosmic_density_min:	250]
	[padding:	75][0m
[1;30;1mComputing cosmic score using 10 threads.[0m
[1;30;1mInflating gene-wise scores for the following genes:
ERBB4,PDGFRA,FLT1,KDR,RET,BRCA2,ROS1,MAP3K4,JAK3,ERBB3,MYC,NOTCH4,PIK3CD,CDK12,NOTCH1,PTEN,SMARCB1,PDGFRB,SMO,PIK3CB,MAP2K1,MAPK8,MAP2K2,CDKN1B,CDK3[0m
[1;30;1mCosmic score finished.[0m
[1;30;1mCondensing the mutations per position.[0m
[1;30;1mCondensing cosmic scores using 10 threads.[0m
[1;36;1mFinished condensing cosmic scores.[0m
[1;30;1mPerform rolling window computation[0m
[1;30;1mComputing mutation density[0m
[1;30;1mFiltering out background mutations[0m
[1;30;1mFiltered out 203521 mutations [209405 --> 5884][0m
[1;30;1mCollapsing the mutations to adjacency groups[0m
[1;30;1mCollapsing adjacent mutations and including bait padding[0m
[1;36;1mFinished! Librar

### reruns can be performed without re-computing the cosmic scores
+ pass the last output of the previous run (`cosmic_scored`) as the input and omit the `cosmic_weights_file` argument, so the tool knows that the clinscores should not be recomputed

In [9]:
# Filter settings for the rerun; edit the thresholds here to try other limits.
custom_filter = {
    "exonic_list": ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    "mut_list": [
        'nonsynonymous SNV',
        'stopgain',
        'startloss',
        'stoploss',
        'frameshift deletion',
        'nonframeshift deletion',
    ],
    "gnomad_max": 1e-2,
    "cosmic_rolling_min": 500,
    "rolling_window_size": 5,
    "cosmic_min": 3500,
    "cosmic_density_min": 250,
    "padding": 75,
}

# The previous run's `cosmic_scored` is fed back in and no cosmic_weights_file
# is given; note that this cell overwrites `cosmic_scored` with the new result.
panel_muts, regions_df, cosmic_scored = cosmic_panel_master(
    cosmic_scored,
    filter_setting=custom_filter,
    threads=10,
    verbose=1,
)

[1;30;1mCreating custom panel based on limits set in filter settings.
	[cosmic_rolling_min:	500]
	[rolling_window_size:	5]
	[cosmic_min:	3500]
	[cosmic_density_min:	250]
	[padding:	75][0m
[1;30;1mUsing precomputed cosmic scores! For recomputation, provide a cosmic weights file[0m
[1;30;1mPerform rolling window computation[0m
[1;30;1mComputing mutation density[0m
[1;30;1mFiltering out background mutations[0m
[1;30;1mFiltered out 203521 mutations [209405 --> 5884][0m
[1;30;1mCollapsing the mutations to adjacency groups[0m
[1;30;1mCollapsing adjacent mutations and including bait padding[0m
[1;36;1mFinished! Library size = 456kb - 5884 mutations included[0m


## analyse for the top genes and inclusion of panel genes

### look for the genes with the highest cumulative clinscore across all of COSMIC

In [10]:
# Excel gene list handed to analyze_genes as `panel_excel`
genes_excel_file = "../testdata/ExampleGeneList.xlsx"

# save_excel is passed as an empty string for this exploratory run
in_panel, cosmic_not_included, list_not_included = analyze_genes(
    panel_muts,
    cosmic_scored,
    save_excel="",
    panel_excel=genes_excel_file,
)

# show the first ten entries of the third result
list_not_included[:10]

  genes['notes'] = genes['notes'].str.strip("|").str.replace(r"^[|]+$", "", regex=True)


Unnamed: 0,Gene,cosmic_scoreDB,countOtherPanels,notes
280,NTRK1,99492,3,im Panel sind- 2 und -3
454,NOTCH4,71806,1,
483,CDK12,69072,1,
489,PIK3CD,68630,1,
525,RNF43,66475,1,
547,NOTCH1,65134,2,
816,MTOR,50429,1,
820,DDR2,50268,3,
871,MLH1,48190,1,
985,AXIN2,43780,1,


## try out different settings and then save

In [None]:
# NOTE(review): `save_excel` is assigned here but is not passed to either call
# below, so this cell does not hand the path to anything - confirm which
# function is meant to receive it (analyze_genes accepts a `save_excel` keyword).
save_excel = os.path.join(local_path, "output/500k_panel_design3.xlsx")

# Filter settings for this design (cosmic_min lowered to 3300).
custom_filter = {
    "exonic_list": ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    "mut_list": [
        'nonsynonymous SNV',
        'stopgain',
        'startloss',
        'stoploss',
        'frameshift deletion',
        'nonframeshift deletion',
    ],
    "gnomad_max": 1e-2,
    "cosmic_rolling_min": 500,
    "rolling_window_size": 5,
    "cosmic_min": 3300,
    "cosmic_density_min": 250,
    "padding": 75,
}

# weights file taken from the local project folder for this design
clinscore_file = os.path.join(local_path, "info/clinscoreLung_with_geneboost2.yaml")

# full run on the raw exon table, passing the weights file
panel_muts, regions_df, cosmic_scored = cosmic_panel_master(
    cosmic_exon_df,
    cosmic_weights_file=clinscore_file,
    filter_setting=custom_filter,
    threads=10,
)

# relies on `genes_excel_file` defined in an earlier cell
in_panel, cosmic_not_included, panel_not_included = analyze_genes(
    panel_muts,
    cosmic_scored,
    panel_excel=genes_excel_file,
)
panel_not_included

In [None]:
def check_genes_in_cosmic(gene_list, min_score=100, cosmic_df=None):
    """Return the scored mutations of the given genes above a score threshold.

    For every gene in ``gene_list`` the rows with a matching ``Gene`` value and
    ``cosmic_score > min_score`` are selected, sorted by ``cosmic_score`` in
    descending order and stripped of the ``AAChange`` column. The per-gene
    results are stacked in the order of ``gene_list``.

    Parameters
    ----------
    gene_list : iterable of str or str
        Gene names to look up. A single string is treated as one gene name
        instead of being iterated character by character.
    min_score : int or float, default 100
        Exclusive lower bound for ``cosmic_score``.
    cosmic_df : pandas.DataFrame, optional
        Table to search; needs the columns ``Gene``, ``cosmic_score`` and
        ``AAChange``. Defaults to the notebook-global ``cosmic_scored``.

    Returns
    -------
    pandas.DataFrame
        The matching rows without the ``AAChange`` column. An empty frame with
        the same columns is returned when ``gene_list`` is empty.
    """
    if cosmic_df is None:
        # fall back to the table produced by the most recent cosmic_panel_master run
        cosmic_df = cosmic_scored
    if isinstance(gene_list, str):
        gene_list = [gene_list]

    # A plain loop is used on purpose: `@gene` and `@min_score` are resolved by
    # DataFrame.query from this function's local variables.
    dfs = []
    for gene in gene_list:
        df = (
            cosmic_df.query("Gene == @gene and cosmic_score > @min_score")
            .sort_values('cosmic_score', ascending=False)
            .drop("AAChange", axis=1)
        )
        dfs.append(df)

    if not dfs:
        # pd.concat raises ValueError on an empty list; return an empty result instead
        return cosmic_df.iloc[0:0].drop("AAChange", axis=1)
    return pd.concat(dfs)

In [None]:
# inspect the scored entries of two genes, using a threshold of 500
check_genes_in_cosmic(["NTRK1", "AXIN2"], min_score=500)