## Creating a custom panel of lung cancer mutations for use in small sequencers

### Strategy: Look for mutational hotspots for lung cancer in the COSMIC Database

In [None]:
# --- imports: standard library, then third-party ---------------------------
import os
import sys

import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display

# --- output settings --------------------------------------------------------
# echo every bare expression of a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"
# show all columns and allow cell contents of up to 200 characters
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 200)

# --- project code (made importable through ../code) -------------------------
sys.path.append('../code')
from script_utils import show_output

# --- data locations, rooted at the STATIC and WORK environment variables ----
static_path = os.path.join(os.environ['STATIC'], "annotation/clinical")
local_path = os.path.join(os.environ['WORK'], "LO/Sequencing/LungCustomPanel/output")

In [None]:
# NOTE(review): this cell held only a bare file path, which Python evaluates as
# a division/attribute expression and which fails with NameError when run.
# Kept as a comment; presumably a reminder of an output location — confirm.
# LO/Sequencing/LungCustomPanel/output/M162_muts.hg38.csv

### filter the cosmic database for exonic mutations
+ annotate the whole of cosmic with annovar for gene model and SNPs
* `$ conda activate annovar-env`
* `perl ../code/anno2019/table_annovar.pl --buildver hg38 --maxgenethread 10 --thread 10 --protocol refGene,cytoband,gnomad30 --operation g,r,f -nastring "." --otherinfo --remove --outfile ../output/cosmic ${STATIC}/annotation/annovar/humandb/hg38_cosmic95.txt $STATIC/annotation/annovar/humandb`

+ load the annovar output and edit

In [None]:
from pyseq_utils import load_anno

# multianno table; the file name matches the --outfile prefix of the
# table_annovar.pl command documented in this notebook
cosmic_all_path = os.path.join(
    local_path,
    "../output/cosmic.hg38_multianno.txt",
)
cosmic_all = load_anno(cosmic_all_path)
# display the loaded annotations
cosmic_all

+ save and load

In [None]:
# write the annotated table as gzip-compressed, tab-separated text ...
cosmic_all.to_csv(
    os.path.join(static_path, "cosmic_all.csv"),
    sep="\t",
    index=False,
    compression="gzip",
)
# ... and read it straight back from the same file
cosmic_all = pd.read_csv(
    os.path.join(static_path, "cosmic_all.csv"),
    sep="\t",
    compression="gzip",
)
# preview of the first three rows, then the row count
cosmic_all.head(3)
cosmic_all.shape[0]

### Filter out non-exonic mutations and known population SNPs

In [None]:
from pyseq_utils import filter_exonic

# settings handed to filter_exonic; the exact meaning of each key is defined
# in pyseq_utils (gnomad_max is presumably a population-frequency ceiling —
# confirm there)
exonic_filter = {
    "exonic_list": ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    "mut_list": [
        'nonsynonymous SNV',
        'stopgain',
        'startloss',
        'stoploss',
        'frameshift deletion',
        'nonframeshift deletion',
    ],
    "gnomad_max": 1e-2,
}

cosmic_exon = filter_exonic(
    cosmic_all,
    filter_settings=exonic_filter,
)
# display the filtered table
cosmic_exon

## save and re-import the exonic COSMIC database

In [None]:
# location of the gzip-compressed, tab-separated exonic table
cosmic_exon_path = os.path.join(static_path, "cosmic_exon.csv")
# uncomment to (re)write the file from the in-memory frame:
# cosmic_exon.to_csv(cosmic_exon_path, sep="\t", index=False, compression="gzip")
cosmic_exon = pd.read_csv(
    cosmic_exon_path,
    sep="\t",
    compression="gzip",
)
# preview of the first three rows, then the row count
cosmic_exon.head(3)
cosmic_exon.shape[0]

## calculate the clinscore for all exonic data based on clinscore weights
+ weights are applied via yaml file

In [None]:
from clinscore import get_cosmic_score

# yaml file with the clinscore weights
# NOTE(review): the combined run further down this notebook reads
# "../testdata/clinscoreLung.yaml" instead — confirm which file is intended
clinscore_file = "../configs/clinscoreLung.yaml"
cosmic_scored = get_cosmic_score(
    cosmic_exon,
    clinscore_file=clinscore_file,
    threads=10,
    verbose=1,
)
# display the scored table
cosmic_scored

### plot the distribution of scores

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sn
sn.set()

plt.style.use("seaborn-white")
_ = plt.hist(cosmic_scored.query('cosmic_score > 20000')['cosmic_score'], alpha=.5, bins=700)
plt.yscale("log")
plt.xscale("log")

In [None]:
# uncomment to (re)write the scored table as tab-separated text:
# cosmic_scored.to_csv(os.path.join(local_path, "cosmic_lung.csv"), sep="\t", index=False)
cosmic_scored = pd.read_csv(
    os.path.join(local_path, "cosmic_lung.csv"),
    sep="\t",
)
# first ten rows whose cosmic_score exceeds 100
cosmic_scored.query('cosmic_score > 100').head(10)

### perform rolling computation to get hotspots
+ cycle through the chromosomes
+ remove background mutations
+ roll for density
+ merge into chromosome df

In [None]:
from cosmic_panel import compute_cosmic_density

# settings for the rolling computation; key semantics are defined in
# cosmic_panel
custom_filter = {
    "cosmic_rolling_min": 500,
    "rolling_window_size": 5,
}

cosmic_roll = compute_cosmic_density(
    cosmic_scored,
    filter_setting=custom_filter,
    verbose=1,
)
# preview of the first three entries
cosmic_roll[:3]

### plot the density distribution

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sn
sn.set()

plt.style.use("seaborn-white")
_ = plt.hist(cosmic_roll.query('cosmic_score > 10000').query('cosmic_density > 100')['cosmic_density'], alpha=.5, bins=700)
plt.yscale("log")
plt.xscale("log")

### filter for very important mutations and high density regions
+ looks like the density cutoff should be around 250 (the `cosmic_density_min` value used in the combined run below)

### now calculate approximate panel size based on approximate padding size

In [None]:
from cosmic_panel import full_collapse

# NOTE(review): `cosmic_f2` is not assigned in any cell visible in this
# notebook, so this raises NameError on a fresh kernel. Presumably it is the
# frame filtered for important mutations and high-density regions — confirm
# and restore the cell that creates it.
# full_collapse returns two values, unpacked into cosmic_muts and
# cosmic_collapsed.
cosmic_muts, cosmic_collapsed = full_collapse(cosmic_f2)
# display the collapsed result
cosmic_collapsed

In [None]:
# EGFR rows without the AAChange column. drop() removes index labels by
# default, so axis=1 is required to drop a column (as the final cell of this
# notebook already does).
cosmic_muts.query('Gene == "EGFR"').drop(['AAChange'], axis=1)

## putting it all together

In [None]:
from cosmic_panel import cosmic_master

# read the gzip-compressed, tab-separated exonic annotation table
cosmic_exon_path = os.path.join(static_path, "cosmic_exon.csv")
cosmic_exon_df = pd.read_csv(
    cosmic_exon_path,
    sep="\t",
    compression="gzip",
)

In [None]:
# filter settings
# all filter settings for the combined run; key semantics are defined in
# cosmic_panel
custom_filter = {
    "exonic_list": ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    "mut_list": [
        'nonsynonymous SNV',
        'stopgain',
        'startloss',
        'stoploss',
        'frameshift deletion',
        'nonframeshift deletion',
    ],
    "gnomad_max": 1e-2,
    "cosmic_rolling_min": 2000,
    "rolling_window_size": 5,
    "cosmic_min": 3500,
    "cosmic_density_min": 250,
    "padding": 100,
}

# NOTE(review): the scoring cell earlier in this notebook reads
# "../configs/clinscoreLung.yaml" — confirm which weights file is intended
clinscore_file = "../testdata/clinscoreLung.yaml"

# one call that returns the mutation table, the collapsed table and the
# scored table
cosmic_muts, cosmic_collapsed, cosmic_scored = cosmic_master(
    cosmic_exon_df,
    cosmic_weights_file=clinscore_file,
    filter_setting=custom_filter,
    verbose=1,
)

### reruns can be performed without re-computing the cosmic scores
+ pass the `cosmic_scored` output of the previous run as input and set `cosmic_weights_file` to an empty string

In [None]:
# same filter settings as the combined run; key semantics are defined in
# cosmic_panel
custom_filter = {
    "exonic_list": ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    "mut_list": [
        'nonsynonymous SNV',
        'stopgain',
        'startloss',
        'stoploss',
        'frameshift deletion',
        'nonframeshift deletion',
    ],
    "gnomad_max": 1e-2,
    "cosmic_rolling_min": 2000,
    "rolling_window_size": 5,
    "cosmic_min": 3500,
    "cosmic_density_min": 250,
    "padding": 100,
}

# feed the already-scored table back in and pass an empty weights file name
cosmic_muts, cosmic_collapsed, cosmic_scored = cosmic_master(
    cosmic_scored,
    cosmic_weights_file="",
    filter_setting=custom_filter,
    verbose=1,
)

In [None]:
# display the collapsed result of the last cosmic_master call
cosmic_collapsed

In [None]:
# EGFR and BRAF rows of the mutation table, without the AAChange and type
# columns
(
    cosmic_muts
    .drop(columns=['AAChange', 'type'])
    .query('Gene == "EGFR" or Gene == "BRAF"')
)