## Creating a custom panel for use in small sequencers for lung cancer mutations

### Strategy: Look for mutational hotspots for lung cancer in the COSMIC Database

In [None]:
# notebook setup: imports, pandas/IPython display options, project code path
# and the base path of the static annotation data
import os
import sys

import pandas as pd
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell

# some sensible settings for better output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
# fully qualified option key, consistent with the two options above; pandas
# warns that abbreviated keys can break when similarly named options are added
pd.set_option('display.max_colwidth', 200)
# show the value of every top-level expression of a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"


# get the code
# the project modules live in ../code, so that folder has to be on sys.path
# before anything is imported from it
sys.path.append('../code')
from script_utils import show_output

# paths
# requires the STATIC environment variable (a missing variable raises KeyError)
static_path = os.path.join(os.environ['STATIC'], "annotation/clinical")

## putting it all together

In [None]:
from cosmic_panel import cosmic_panel_master, analyze_genes

# load exonic annotations: the file below static_path is read as a
# gzip-compressed, tab-separated table
cosmic_exon_path = os.path.join(static_path, "cosmic_exon.csv")
cosmic_exon_df = pd.read_csv(
    cosmic_exon_path,
    sep="\t",
    compression="gzip",
)

In [None]:
# filter settings handed to cosmic_panel_master as `filter_setting`
# (how each entry is interpreted is decided inside cosmic_panel, not shown here)
# NOTE(review): mut_list names the deletion classes but no insertion classes -
# confirm that this is intended
custom_filter = {
    'exonic_list': ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    'mut_list': [
        'nonsynonymous SNV',
        'stopgain',
        'startloss',
        'stoploss',
        'frameshift deletion',
        'nonframeshift deletion',
    ],
    'gnomad_max': 1e-2,
    'cosmic_rolling_min': 500,
    'rolling_window_size': 5,
    'cosmic_min': 3500,
    'cosmic_density_min': 250,
    'padding': 75,
}

# weights config for the clinscore computation
clinscore_file = "../configs/clinscoreLung.yaml"

# first run: start from the exonic annotations and pass the weights file;
# the third return value (cosmic_scored) is reused by later cells
panel_muts, regions_df, cosmic_scored = cosmic_panel_master(
    cosmic_exon_df,
    cosmic_weights_file=clinscore_file,
    filter_setting=custom_filter,
    threads=10,
    verbose=1,
)

### reruns can be performed without re-computing the cosmic scores
+ just use the last output from the previous run (`cosmic_scored`) as input and omit the `cosmic_weights_file` argument, so the tool knows you do not want to recompute the clinscores

In [None]:
# filter settings for the rerun (same values as in the first run; edit them
# here to try other thresholds)
custom_filter = {
    'exonic_list': ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    'mut_list': [
        'nonsynonymous SNV',
        'stopgain',
        'startloss',
        'stoploss',
        'frameshift deletion',
        'nonframeshift deletion',
    ],
    'gnomad_max': 1e-2,
    'cosmic_rolling_min': 500,
    'rolling_window_size': 5,
    'cosmic_min': 3500,
    'cosmic_density_min': 250,
    'padding': 75,
}

# rerun on the already scored table: cosmic_scored from the previous run is the
# input and no cosmic_weights_file is passed
panel_muts, regions_df, cosmic_scored = cosmic_panel_master(
    cosmic_scored,
    filter_setting=custom_filter,
    threads=10,
    verbose=1,
)

### if special genes are very important you can add a gene-specific booster
+ pass a clinscore config that contains the gene boost as `cosmic_weights_file` and start again from the exonic annotations (`cosmic_exon_df`), so that the clinscores are recomputed with the boost applied

In [None]:
# filter settings (same values as in the runs above)
custom_filter = {
    'exonic_list': ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    'mut_list': [
        'nonsynonymous SNV',
        'stopgain',
        'startloss',
        'stoploss',
        'frameshift deletion',
        'nonframeshift deletion',
    ],
    'gnomad_max': 1e-2,
    'cosmic_rolling_min': 500,
    'rolling_window_size': 5,
    'cosmic_min': 3500,
    'cosmic_density_min': 250,
    'padding': 75,
}

# weights config variant with gene boost
clinscore_file = "../configs/clinscoreLung_with_geneboost.yaml"

# start again from the exonic annotations with the new weights file; the scored
# table is kept under its own name (cosmic_boosted) so cosmic_scored is not
# overwritten
panel_muts, regions_df, cosmic_boosted = cosmic_panel_master(
    cosmic_exon_df,
    cosmic_weights_file=clinscore_file,
    filter_setting=custom_filter,
    threads=10,
    verbose=1,
)

## analyse for the top genes and inclusion of panel genes

### look for the genes with the highest cumulative clinscore across all of COSMIC
+ for this analysis, I would recommend using the non-boosted cosmic scores

In [None]:
# get the gene info: compare the current panel with an example gene list
genes_excel_file = "../testdata/ExampleGeneList.xlsx"

# cosmic_scored is passed here (not the boosted table); save_excel is an
# empty string in this call
in_panel, cosmic_not_included, list_not_included = analyze_genes(
    panel_muts,
    cosmic_scored,
    save_excel="",
    panel_excel=genes_excel_file,
)

# peek at the first ten entries of the third result
list_not_included[:10]

## try out different settings and then save

In [None]:
# project-specific locations below the WORK environment variable
# (a missing variable raises KeyError)
local_path = os.path.join(os.environ['WORK'], "LO/Sequencing/LungCustomPanel")
save_excel = os.path.join(local_path, "output/500k_panel_design3.xlsx")
clinscore_file = os.path.join(local_path, "info/clinscoreLung_with_geneboost2.yaml")
panel_file = os.path.join(local_path, "info/GeneList4Inclusion.xlsx")

# filter settings (cosmic_min is 3300 in this run)
custom_filter = {
    'exonic_list': ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    'mut_list': [
        'nonsynonymous SNV',
        'stopgain',
        'startloss',
        'stoploss',
        'frameshift deletion',
        'nonframeshift deletion',
    ],
    'gnomad_max': 1e-2,
    'cosmic_rolling_min': 500,
    'rolling_window_size': 5,
    'cosmic_min': 3300,
    'cosmic_density_min': 250,
    'padding': 75,
}

# run from the exonic annotations with the local weights file; the scored table
# (cosmic_scored_boost) is the input of the following rerun cell
panel_muts, regions_df, cosmic_scored_boost = cosmic_panel_master(
    cosmic_exon_df,
    cosmic_weights_file=clinscore_file,
    filter_setting=custom_filter,
    threads=10,
)

In [None]:
# filter settings (cosmic_min 3100 and cosmic_density_min 300 in this run)
custom_filter = {
    'exonic_list': ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    'mut_list': [
        'nonsynonymous SNV',
        'stopgain',
        'startloss',
        'stoploss',
        'frameshift deletion',
        'nonframeshift deletion',
    ],
    'gnomad_max': 1e-2,
    'cosmic_rolling_min': 500,
    'rolling_window_size': 5,
    'cosmic_min': 3100,
    'cosmic_density_min': 300,
    'padding': 75,
}

# rerun on the already scored boosted table, so no weights file is passed
panel_muts, regions_df, cosmic_scored_boost = cosmic_panel_master(
    cosmic_scored_boost,
    filter_setting=custom_filter,
    threads=10,
)

# gene analysis against the inclusion list; cosmic_scored (from the runs
# without gene boost) is passed here, not cosmic_scored_boost
in_panel, cosmic_not_included, panel_not_included = analyze_genes(
    panel_muts,
    cosmic_scored,
    panel_excel=panel_file,
    save_excel=save_excel,
)

In [None]:
# show the third analyze_genes result ordered by its "Gene" column
panel_not_included.sort_values("Gene")

In [None]:
def check_genes_in_cosmic(gene_list, min_score=500, cosmic_df=None):
    """Collect the high-scoring COSMIC rows for a list of genes.

    For every gene, in the order given, the rows with ``Gene == gene`` and
    ``cosmic_score > min_score`` are selected and sorted by descending
    ``cosmic_score``. The per-gene results are stacked into one frame and the
    ``AAChange`` column is dropped.

    Args:
        gene_list: iterable of gene names to look up.
        min_score: exclusive lower bound for ``cosmic_score``.
        cosmic_df: table to search; needs the columns ``Gene``,
            ``cosmic_score`` and ``AAChange``. Defaults to the notebook-level
            ``cosmic_scored``.

    Returns:
        A DataFrame with the matching rows (without ``AAChange``). An empty
        ``gene_list`` gives an empty frame with the same columns instead of
        the ``ValueError`` that ``pd.concat`` raises for an empty list.
    """
    if cosmic_df is None:
        cosmic_df = cosmic_scored

    per_gene = []
    # kept as an explicit loop: `query` resolves `@gene` / `@min_score` from the
    # calling frame, and on interpreters where a comprehension runs in its own
    # frame `min_score` would not be visible there
    for gene in gene_list:
        hits = cosmic_df.query("Gene == @gene and cosmic_score > @min_score")
        hits = hits.sort_values('cosmic_score', ascending=False)
        per_gene.append(hits.drop("AAChange", axis=1))

    if not per_gene:
        # nothing requested: return an empty frame with the usual columns
        return cosmic_df.iloc[:0].drop("AAChange", axis=1)
    return pd.concat(per_gene)

In [None]:
# list the cosmic_scored rows of these genes above the default min_score (500)
check_genes_in_cosmic(['GNAQ', 'GNA11', 'MTOR'])

In [None]:
# list the cosmic_scored rows of these genes above the default min_score (500)
check_genes_in_cosmic(['TGFBR2', 'ACVR2A'])

In [None]:
# list the cosmic_scored rows of these genes above the default min_score (500)
check_genes_in_cosmic(['SMAD1','SMAD3'])

In [None]:
# list the cosmic_scored rows of these genes above the default min_score (500)
check_genes_in_cosmic(['CDKN1B', 'CDKN2B','CDKN2C', 'CDK12'])

In [None]:
# list the cosmic_scored rows of these genes above the default min_score (500)
check_genes_in_cosmic(['CDK6', 'CCND2', 'NOTCH4', 'KIT'])