## Creating a custom panel for use in small sequencers for LungCancerMutations

### Strategy: Look for mutational hotspots for lung cancer in the COSMIC Database

In [1]:
# some sensible settings for better output
import os
import pandas as pd
from IPython.display import display
pd.set_option('display.max_columns', None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('max_colwidth', 200)


# get the code
import sys
sys.path.append('../code')
from script_utils import show_output

# paths
static_path = os.path.join(os.environ['STATIC'], "annotation/clinical")
local_path = os.path.join(os.environ['WORK'], "LO/Sequencing/LungCustomPanel/output")

### filter the cosmic database for exonic mutations
+ annotate the whole of cosmic with annovar for gene model and SNPs
* `$ conda activate annovar-env`
* `perl ../code/anno2019/table_annovar.pl --buildver hg38 --maxgenethread 10 --thread 10 --protocol refGene,cytoband,gnomad30 --operation g,r,f -nastring "." --otherinfo --remove --outfile ../output/cosmic ${STATIC}/annotation/annovar/humandb/hg38_cosmic95.txt $STATIC/annotation/annovar/humandb`

+ load the annovar output and edit

In [None]:
from pyseq_utils import load_anno
cosmic_all_path = os.path.join(local_path, "../output/cosmic.hg38_multianno.txt")
cosmic_all = load_anno(cosmic_all_path)
cosmic_all

+ save and load

In [None]:
cosmic_all.to_csv(os.path.join(static_path, "cosmic_all.csv"), sep="\t", index=False, compression="gzip")
cosmic_all = pd.read_csv(os.path.join(static_path, "cosmic_all.csv"), sep="\t", compression="gzip")
cosmic_all[:3]
len(cosmic_all.index)

### Filter on non-exonic and SNP-rich mutations

In [None]:
from pyseq_utils import filter_exonic

exonic_filter = dict(
    exonic_list = ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    mut_list = ['nonsynonymous SNV', 'stopgain', 'startloss', 'stoploss', 'frameshift deletion', 'nonframeshift deletion'],
    gnomad_max=1e-2
)

cosmic_exon = filter_exonic(cosmic_all, filter_settings=exonic_filter)
cosmic_exon

## save and re-import the COSMIC exon databased load

In [None]:
cosmic_exon_path = os.path.join(static_path, "cosmic_exon.csv")
# cosmic_exon.to_csv(cosmic_exon_path, sep="\t", index=False, compression="gzip")
cosmic_exon = pd.read_csv(cosmic_exon_path, sep="\t", compression="gzip")
cosmic_exon[:3]
len(cosmic_exon.index)

## calculate the clinscore for all exonic data based on clinscore weights
+ weights are applied via yaml file

In [None]:
from clinscore import get_cosmic_score
clinscore_file = "../testdata/clinscoreLung.yaml"
cosmic_scored = get_cosmic_score(cosmic_exon, clinscore_file=clinscore_file, threads=10, verbose=1)
cosmic_scored

### plot the distribution of scores

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sn
sn.set()

plt.style.use("seaborn-white")
_ = plt.hist(cosmic_scored.query('cosmic_score > 20000')['cosmic_score'], alpha=.5, bins=700)
plt.yscale("log")
plt.xscale("log")

In [None]:
# cosmic_scored.to_csv(os.path.join(local_path, "cosmic_lung.csv"), sep="\t", index=False)
cosmic_scored = pd.read_csv(os.path.join(local_path, "cosmic_lung.csv"), sep="\t")
cosmic_scored.query('cosmic_score > 100')[:10]

### perform rolling computation to get hotspots
+ cycle through the chromosomes
+ remove background mutations
+ roll for density
+ merge into chromosome df

In [None]:
from cosmic_panel import compute_cosmic_density

custom_filter = dict(
    cosmic_rolling_min=500,
    rolling_window_size=5
)

cosmic_roll = compute_cosmic_density(cosmic_scored, filter_setting=custom_filter, verbose=1)
cosmic_roll[:3]

### plot the density distribution

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sn
sn.set()

plt.style.use("seaborn-white")
_ = plt.hist(cosmic_roll.query('cosmic_score > 10000').query('cosmic_density > 100')['cosmic_density'], alpha=.5, bins=700)
plt.yscale("log")
plt.xscale("log")

### filter for very important mutations and high density regions
+ looks like the density cutoff should be around 

In [None]:
data = cosmic_roll.query('cosmic_score > 5000')

plt.scatter(data['cosmic_score'], data['cosmic_density'], s=2)
plt.yscale("log")
plt.xscale("log")
plt.ylabel("density")
plt.xlabel("score")

In [None]:
from cosmic_panel import filter_cosmic

custom_filter = dict(
    cosmic_min = 20000,
    cosmic_density_min = 1000
)

cosmic_f2 = filter_cosmic(cosmic_roll, filter_settings=custom_filter, verbose=1)
cosmic_f2[:10]

In [None]:
cr = cosmic_f2.query('Chr == 7').loc[:,["Chr", "Start", "End"]]
for col in ["Start", "End"]:
    cr[col] = cr[col]-55100000
# cr.iloc[1048,:] = [7,77355, 79555]
newline = pd.DataFrame([{"Chr": 7, "Start": 67355, "End":69555}, {"Chr": 7, "Start": 77355, "End":79555}, {"Chr": 7, "Start": 177355, "End":179555}], index=[1000,1050,2000])
cr = cr.append(newline, ignore_index=False)
cr = cr.sort_index()
cr

In [None]:
pad = 100
cr['Start'] = cr['Start'] - pad
cr['End'] = cr['End'] + pad
# get the overlaps
cr['ov1'] = (cr['End'] > cr.shift(-1)['Start']).astype(int)
cr['ov2'] = (cr['Start'] < cr.shift(1)['End']).astype(int)
cr['gap'] = cr['ov1'] + cr['ov2']
cr

In [None]:
# assign overlap groups
cr['ovgroup'] = ((cr['ov1'] * (cr['ov2'] == 0).astype(int)).cumsum()) * (cr['ov1'] | (cr['ov2']))
cr

### now calculate approximate panel size based on approximate padding size

In [None]:
from cosmic_panel import full_collapse

cosmic_muts, cosmic_collapsed = full_collapse(cosmic_f2)
cosmic_collapsed

In [None]:
cosmic_muts.query('Gene == "EGFR"').drop(['AAChange'])

## putting it all together

In [26]:
from clinscore import get_cosmic_score
from cosmic_panel import compute_cosmic_density, filter_cosmic, full_collapse


def cosmic_master(df, cosmic_weights_file="", filter_settings={}, verbose=1):
    '''
    takes an annovar annotated mutation list and returns the collapsed mutation list based on filter list
    '''
    
    filter_info = "".join([f"\n\t[{col}:\t{filter_setting[col]}]" for col in ["cosmic_rolling_min", "rolling_window_size", "cosmic_min", "cosmic_density_min", "padding"]])
    show_output(f"Creating custom panel based on limits set in filter settings.{filter_info}")
    if cosmic_weights_file:
        df_scored = get_cosmic_score(df, cosmic_weights_file=cosmic_weights_file, threads=10, verbose=1)
    else:
        if 'cosmic_score' in df.columns:
            show_output(f"Using precomputed cosmic scores! For recomputation, provide a cosmic weights file", time=False)
            df_scored = df
        else:
            show_output("No clinscore in df and no weights file to compute clinscores. Sorry - stopping here!", color="warning")
            return
    
    # perform rolling window computation
    if verbose:
        show_output("Perform rolling window computation", time=False)
    df = compute_cosmic_density(df_scored, filter_setting=filter_settings, verbose=verbose)

    # filter based on cosmic scores
    if verbose:
        show_output("Filtering out background mutations", time=False)
    df = filter_cosmic(df, filter_settings=filter_settings, verbose=verbose)

    # collapse the df
    if verbose:
        show_output("Collapsing the mutations to adjacency groups", time=False)
    df, group_df = full_collapse(df, padding=filter_settings['padding'], verbose=verbose)

    # meaningfull output
    mutN = group_df['mutN'].sum()
    kb_size = int(group_df['stretch'].sum() / 1000)
    show_output(f"Library size = {kb_size}kb - {mutN} mutations included")
    return df, group_df, df_scored

In [18]:
# load exonic annotations
cosmic_exon_path = os.path.join(static_path, "cosmic_exon.csv")
cosmic_exon_df = pd.read_csv(cosmic_exon_path, sep="\t", compression="gzip")

In [41]:
# filter settings
custom_filter = dict(
    exonic_list = ['exonic', 'UTR3', 'UTR5', 'UTR5;UTR3', 'exonic;splicing'],
    mut_list = ['nonsynonymous SNV', 'stopgain', 'startloss', 'stoploss', 'frameshift deletion', 'nonframeshift deletion'],
    gnomad_max=1e-2,
    cosmic_rolling_min=2000,
    rolling_window_size=5,
    cosmic_min = 3500,
    cosmic_density_min = 250,
    padding=100
)

clinscore_file = "../testdata/clinscoreLung.yaml"

cosmic_muts, cosmic_collapsed, cosmic_scored = cosmic_master(cosmic_scored,
                                              cosmic_weights_file="", 
                                              filter_settings=custom_filter, 
                                              verbose=1
                                             )

[1;35;2m19:23:52[0m : [1;30;1mCreating custom panel based on limits set in filter settings.
	[cosmic_rolling_min:	2000]
	[rolling_window_size:	5]
	[cosmic_min:	20000]
	[cosmic_density_min:	5000]
	[padding:	100][0m
[1;30;1mUsing precomputed cosmic scores! For recomputation, provide a cosmic weights file[0m
[1;30;1mPerform rolling window computation[0m
[1;35;2m19:23:52[0m : [1;30;1mComputing mutation density[0m
[1;30;1mFiltering out background mutations[0m
[1;35;2m19:23:53[0m : [1;30;1mFiltered out 12961 mutations [17755 --> 4794][0m
[1;30;1mCollapsing the mutations to adjacency groups[0m
[1;35;2m19:23:53[0m : [1;30;1mCollapsing adjacent mutations and including bait padding[0m
[1;35;2m19:23:53[0m : [1;30;1mLibrary size = 489kb - 4794 mutations included[0m


In [29]:
cosmic_muts

Unnamed: 0,Chr,Start,End,Ref,Alt,Func,Gene,ExonicFunc,AAChange,cytoband,gnomAD,Mut_ID,type,cosmic_score,cosmic_density,ov1,ov2,ovgroup
4,1,33013230,33013430,G,C,exonic,AK2,nonsynonymous SNV,"AK2:NM_001319142:exon5:c.C445G:p.H149D,AK2:NM_001199199:exon6:c.C547G:p.H183D,AK2:NM_001319141:exon6:c.C571G:p.H191D,AK2:NM_001625:exon6:c.C571G:p.H191D,AK2:NM_013411:exon6:c.C571G:p.H191D,AK2:NM_...",1p35.1,0.0,COSV61466144,66x(adenocarcinoma@lung)+18x(carcinoma@thyroid)+6x(large_cell_carcinoma@lung)+6x(mesothelioma@pleura)+6x(neoplasm@liver)+30x(neoplasm@thyroid)+12x(non_small_cell_carcinoma@lung)+138x(squamous_cell...,21294,0.8,1,0,1
5,1,33013258,33013458,C,G,exonic,AK2,nonsynonymous SNV,"AK2:NM_001319142:exon5:c.G417C:p.K139N,AK2:NM_001199199:exon6:c.G519C:p.K173N,AK2:NM_001319141:exon6:c.G543C:p.K181N,AK2:NM_001625:exon6:c.G543C:p.K181N,AK2:NM_013411:exon6:c.G543C:p.K181N,AK2:NM_...",1p35.1,0.0,COSV61466156,90x(adenocarcinoma@lung)+24x(carcinoma@thyroid)+6x(large_cell_carcinoma@lung)+6x(mesothelioma@pleura)+6x(metaplastic_carcinoma@breast)+6x(mucoepidermoid_carcinoma@lung)+6x(neoplasm@liver)+18x(neop...,28836,0.1,0,1,1
6,1,158354351,158354551,T,A,exonic,CD1E,nonsynonymous SNV,"CD1E:NM_001042583:exon2:c.T133A:p.F45I,CD1E:NM_001042584:exon2:c.T133A:p.F45I,CD1E:NM_001042585:exon2:c.T133A:p.F45I,CD1E:NM_001185107:exon2:c.T133A:p.F45I,CD1E:NM_001185108:exon2:c.T133A:p.F45I,C...",1q23.1,0.0,COSV63771407,13x(adenocarcinoma@lung),3263,526.7,1,0,2
7,1,158354365,158354565,C,A,exonic,CD1E,nonsynonymous SNV,"CD1E:NM_001042583:exon2:c.C147A:p.S49R,CD1E:NM_001042584:exon2:c.C147A:p.S49R,CD1E:NM_001042585:exon2:c.C147A:p.S49R,CD1E:NM_001185107:exon2:c.C147A:p.S49R,CD1E:NM_001185108:exon2:c.C147A:p.S49R,C...",1q23.1,0.0,COSV63769950,13x(adenocarcinoma@right_lower_lobe)+13x(carcinoma@bladder),3276,628.0,1,1,2
8,1,158354519,158354719,T,G,exonic,CD1E,nonsynonymous SNV,"CD1E:NM_001042583:exon2:c.T301G:p.F101V,CD1E:NM_001042584:exon2:c.T301G:p.F101V,CD1E:NM_001042585:exon2:c.T301G:p.F101V,CD1E:NM_001185107:exon2:c.T301G:p.F101V,CD1E:NM_001185108:exon2:c.T301G:p.F1...",1q23.1,0.0,COSV63771967,13x(adenocarcinoma@stomach)+13x(carcinoma@bladder)+13x(large_cell_neuroendocrine_carcinoma@lung),3289,545.6,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,9,21971090,21971290,C,A,exonic,CDKN2A,nonsynonymous SNV,"CDKN2A:NM_000077:exon2:c.G169T:p.A57S,CDKN2A:NM_001195132:exon2:c.G169T:p.A57S,CDKN2A:NM_058195:exon2:c.G212T:p.R71L",9p21.3,0.0,COSV58685036,10x(adenocarcinoma@lung)+10x(malignant_melanoma@skin),2520,1080.7,1,1,5
74,9,21971095,21971295,C,A,exonic,CDKN2A,nonsynonymous SNV,"CDKN2A:NM_000077:exon2:c.G164T:p.G55V,CDKN2A:NM_001195132:exon2:c.G164T:p.G55V",9p21.3,0.0,COSV58683339,20x(adenocarcinoma@lung)+10x(undifferentiated_carcinoma@lung),5030,1512.0,0,1,5
75,9,36881957,36882157,G,C,exonic,PAX5,nonsynonymous SNV,"PAX5:NM_001280553:exon7:c.C830G:p.P277R,PAX5:NM_001280554:exon7:c.C830G:p.P277R,PAX5:NM_001280556:exon7:c.C635G:p.P212R,PAX5:NM_001280548:exon8:c.C959G:p.P320R,PAX5:NM_016734:exon8:c.C959G:p.P320R",9p13.2,0.0,COSV99050267,11x(adenocarcinoma@lung),2761,1061.9,1,0,6
76,9,36881963,36882163,T,A,exonic,PAX5,nonsynonymous SNV,"PAX5:NM_001280553:exon7:c.A824T:p.H275L,PAX5:NM_001280554:exon7:c.A824T:p.H275L,PAX5:NM_001280556:exon7:c.A629T:p.H210L,PAX5:NM_001280548:exon8:c.A953T:p.H318L,PAX5:NM_016734:exon8:c.A953T:p.H318L",9p13.2,0.0,COSV63911186,11x(adenocarcinoma@lung),2761,726.6,1,1,6


In [None]:
cosmic_collapsed

In [None]:
cosmic_muts.drop(['AAChange', 'type'], axis=1).query('Gene == "EGFR" or Gene == "BRAF"')