In [None]:
# get the code
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# use seaborn plotting defaults
import seaborn as sns; sns.set()

# make the project modules in ../code importable
sys.path.append('../code')

# import package functions
from combineCNV import filter_snp
from plot import plot_cov, plot_snp
from rollingCov import rolling_coverage
from script_utils_CNV import get_CNVconfig, show_output
from script_utils_CNV import show_output

# TODO: locate the chained assignment that triggers pandas' SettingWithCopyWarning;
# until it is found the warning is silenced globally
pd.set_option('mode.chained_assignment', None)

# HOME
# resolve the home directory of the current user instead of hard-coding one
# absolute path per workstation (the former second assignment silently
# overwrote the first one)
home = os.path.expanduser("~")


# standard paths (all located below the Dropbox work folder in HOME)
static = os.path.join(home, "Dropbox/Icke/Work/static")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")
testdata = os.path.join(home, "Dropbox/Icke/Work/somVar/testdata")
PON_path = os.path.join(static, "PON/HAEv7_hg38_NovaSeq")


# input/output folders of the CNV tool
cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")
plot_path = os.path.join(cnvdata, "plot")
fig_path = os.path.join(cnvdata, "figures")

### get the config
+ use the get_CNVconfig util function to update the general configs with the appropriate paths

In [None]:
# machine-specific paths that are handed to get_CNVconfig as local overrides
path_config = {
    "mawk_path": "../shell",
    "cov_path": os.path.join(output_path, "pile2CNV"),   # path containing rawcov.gz files for this sample
    "snp_path": os.path.join(output_path, "pile2CNV"),   # path containing snp files for this sample
    "bed_file": os.path.join(static, "bed_files/SureSelect/hg38/SS_HAEv7_hg38_Padded.bed"),
    "genome_split_path": os.path.join(static, "genome/gatk/hg38/split"),
    "gc_split_path": os.path.join(static, "genome/gatk/hg38/split"),
    "genmap_split_path": os.path.join(static, "annotation/genmap/hg38/split"),
    "PON_path": PON_path,
}
# load the general config from the yaml file together with the local paths
CNVconfig = get_CNVconfig("../config/config_CNV.yaml", local_config=path_config)
CNVconfig

### load snp data and visualize

In [None]:
sample = "03_A-B"
snp_df = pd.read_csv(
    os.path.join(output_path, f"snp/{sample}.snp.gz"),
    sep="\t",
    compression="gzip",
)

# figure layout shared by all SNP plots of this cell
fig_params = {
    "figsize": (32, 3),
    "colormap": 'coolwarm_r',
    "color_chroms": True,
    "ylim": (-0, 1),
    "label_size": 13,
}

# scatter track showing the VAF2 column
vaf = {
    "title": 'VAF',
    "plot_type": 'scatter',  # ['line', 'scatter']
    "data": 'VAF2',
    "plot_args": {
        "s": 2,
        "color": 'black',
        "cmap": 'viridis',
        "alpha": 1,
    },
}

# chromosomes and regions of interest (chroms and r1 are not used in this cell)
chroms = ['chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr9', 'chr12', 'chr17']
r1 = 'chr17:3Mb-9Mb'
r7 = 'chr7:100.8Mb-101Mb'
r17 = 'chr17:21.2Mb-22Mb'

# whole genome, one single chromosome, then the two zoomed regions
# plot_snp returns the tuple (fig, ax, df, chrom_df); only fig is kept
fig, _, _, _ = plot_snp(
    snp_df, plots=[vaf], chroms="all", region='', **fig_params
)
fig, _, _, _ = plot_snp(
    snp_df, plots=[vaf], chroms="chr20", region='', **fig_params
)
fig, _, _, _ = plot_snp(
    snp_df, plots=[vaf], chroms="all", region=r7, **fig_params
)
fig, _, _, _ = plot_snp(
    snp_df, plots=[vaf], chroms="all", region=r17, **fig_params
)

## merge coverage data into SNP
+ reduce to important columns

In [None]:
# read the gzipped, tab-separated rolling coverage table of the sample
roll_cov_df = pd.read_csv(
    os.path.join(output_path, f"cov/{sample}.roll.cov.gz"),
    sep="\t",
    compression="gzip",
)
# show which columns are available for merging
roll_cov_df.columns

In [None]:
from rollingCNV import interpolate

def mergeCov2SNP(snp_df, cov_df):
    '''
    merge rolling coverage columns into the SNP data

    snp_df: SNP dataframe; its first four columns are used as base columns
            (presumably the coordinate columns shared with cov_df) and it must
            contain PONVAF, PONDepth and FullExonPos
    cov_df: coverage dataframe containing the same base columns; only columns
            ending in "mean" or "sum" and the GCratio column are taken over

    returns a new dataframe holding the SNP rows (rows that stem from cov_df
    only are dropped again) with the coverage columns passed through
    interpolate(); snp_df itself is left untouched
    '''

    # get the columns
    base_cols = list(snp_df.columns[:4])
    # the coverage columns are taken from the cov_df argument
    # (not from a notebook-level global)
    cov_cols = [
        col for col in cov_df.columns
        if col.endswith(("mean", "sum")) or col == "GCratio"
    ]
    # reduce cols of cov_df
    cov_df = cov_df.loc[:, base_cols + cov_cols]

    # PONVAF and PONDepth have to be filled to zero
    # assign() works on a new frame, so the caller's dataframe is not modified
    snp_df = snp_df.assign(
        **{col: snp_df[col].fillna(0) for col in ('PONVAF', 'PONDepth')}
    )

    show_output('Merging')
    # outer merge on the shared columns keeps coverage-only positions as
    # support points for the interpolation
    snp_df = snp_df.merge(cov_df, how="outer").sort_values("FullExonPos").reset_index(drop=True)

    # interpolate the data
    for col in cov_cols:
        show_output(f"Interpolating {col}")
        snp_df = interpolate(snp_df, col, expand_limit=100)

    # reduce to VAF values: PONVAF was filled for every SNP row, so NaN
    # (NaN != NaN) marks the rows that came from cov_df only
    # copy() returns an independent frame that can be modified downstream
    return snp_df.query('PONVAF == PONVAF').copy()

In [None]:
# merge the rolling coverage into the SNP data
# NOTE(review): snp_df is overwritten with the merged result, so re-running
# this cell feeds the already merged frame into mergeCov2SNP again
snp_df = mergeCov2SNP(snp_df, roll_cov_df)
snp_df.columns

## heteroSNP rolling window
+ #### first, VAF is centered
+ #### next, fallSNP have to be removed
+ #### next, important local features have to be extracted:
    * offVAF: distance measure from the meanVAF
    * SNPdensity: measure of genomic spread of SNPs in order to find falling SNPs
    * absVAF: absolute dist measure from meanVAF
    * snpLLH: log-likelihood of belonging to the center Gaussian
    * hsnpLLH: log-likelihood to belong to VAF==1 (to identify high purity clones)

+ #### next, these local features have to be computed with a rolling window to convert local to regional data

### remove the fallSNP
+ compute genomic snp-density (stretch / window)
+ combine with rolling offVAFsum
+ remove the negative offVAF in that range

In [None]:
from rollingCNV import interpolate, one_col_rolling, llh, rolling_data
from script_utils_CNV import show_output

def center_vaf(snp_df):
    '''
    shifts every VAF column so that its mean is 0.5 and derives distance columns

    works on a copy of snp_df; for each column whose name starts with "VAF":
    * the column is shifted by (0.5 - column mean)
    * off<col> = (col - 0.5) * 2 is the signed distance from the center
    * abs<col> = |off<col>| is the absolute distance from the center
    both new columns are placed directly behind their VAF column
    (abs<col> first, then off<col>)
    '''

    centered = snp_df.copy()
    # collect the VAF columns up front; the derived columns never start with "VAF"
    vaf_cols = [name for name in centered.columns if name.startswith("VAF")]

    for vaf_col in vaf_cols:
        off_col, abs_col = f'off{vaf_col}', f'abs{vaf_col}'
        # remember the column layout before the derived columns are appended
        layout = list(centered.columns)
        split_at = centered.columns.get_loc(vaf_col) + 1

        # move the mean of the column to 0.5
        centered.loc[:, vaf_col] = centered[vaf_col] + 0.5 - centered[vaf_col].mean()
        # offVAF as measure of straying from the center
        centered.loc[:, off_col] = (centered[vaf_col] - 0.5) * 2
        # absVAF as absolute measure of straying from the center
        centered.loc[:, abs_col] = np.abs(centered[off_col])

        # slot the derived columns in right behind their source column
        centered = centered.loc[:, layout[:split_at] + [abs_col, off_col] + layout[split_at:]]
    return centered

### center VAF

In [None]:
# means of the VAF columns before and after centering
# NOTE(review): a notebook cell only renders its last expression, so the four
# .mean() values below are computed but never displayed
snp_df['VAF1'].mean()
snp_df['VAF2'].mean()
csnp_df = center_vaf(snp_df)
csnp_df['VAF1'].mean()
csnp_df['VAF2'].mean()
csnp_df.columns

In [None]:
# get the density computer for rolling
def make_get_density(window_size=20):
    '''
    factory for a density function bound to a given window_size

    the returned function SNPdensity(data) computes
    (data.max() - data.min()) / window_size
    '''

    def SNPdensity(data):
        # value range covered by the data, scaled by the window size
        spread = data.max() - data.min()
        return spread / window_size

    return SNPdensity


# density function for the window size used in the cells below
window = 20
get_SNPdensity = make_get_density(window)

In [None]:
# rolling computations for the fallSNP detection
# (presumably column -> {aggregation: window size}; confirm against rolling_data)
fallSNP_params = dict(
    FullExonPos={get_SNPdensity: window},
    offVAF={'sum': 20},
)
# expand
exp_df = rolling_data(
    csnp_df,
    data_params=fallSNP_params,
    roll_config=CNVconfig['rolling']['snp'],
)

In [None]:
def remove_fallSNP(snp_df, mean=0.5, std=0.2, params={}):
    '''
    removes the falling SNP probably caused by mismapping

    snp_df: SNP dataframe with the columns Chr, VAF, ExonPos, offVAF and FullExonPos
    mean:   center of the VAF distribution used for the lower VAF limit
    std:    spread of the VAF distribution; rows with VAF > mean - std / 2 are always kept
    params: dict providing 'offVAFwindow' (rolling window size) and
            'maxFallSNP' (fallSNP cutoff); both keys are required, so the
            default {} raises a KeyError

    returns the filtered rows of all chromosomes concatenated,
    sorted by FullExonPos and with a fresh index
    '''

    window = params['offVAFwindow']
    cutoff = params['maxFallSNP']

    # get the density computer for rolling
    get_SNPdensity = make_get_density(window)
    # cycle through chroms
    chrom_dfs = []
    for chrom in snp_df['Chr'].unique():
        df = snp_df.query('Chr == @chrom')

        # get the SNP density from ExonPos; only rows with VAF < 0.95 are
        # passed as the second frame
        # (presumably this call adds the SNPdensity column used below - confirm in one_col_rolling)
        df = one_col_rolling(df, df.query('VAF < 0.95'), 'ExonPos', get_SNPdensity, window_size=window, diff_exp=4)
        # scale the density by its chromosome-wide mean
        df.loc[:, 'SNPdensity'] = df['SNPdensity'] / df['SNPdensity'].mean()

        # get the offVAFsum
        # the VAF < 0.95 subset is queried again because df was replaced by the call above
        df = one_col_rolling(df, df.query('VAF < 0.95'), 'offVAF', 'sum', window_size=window, normalize=True, diff_exp=4)

        # combine both metrics
        df.loc[:, 'fallSNP'] = df['SNPdensity'] * df['offVAFsum']
        # keep rows whose VAF lies above mean - std / 2 or whose fallSNP exceeds the cutoff
        df = df.query('VAF > @mean - @std / 2 or fallSNP > @cutoff')
        chrom_dfs.append(df)

    return pd.concat(chrom_dfs).sort_values('FullExonPos').reset_index(drop=True)

### expand the LLH

In [None]:
def compute_snp_llh(df, mean=0.5, sigma=0.2):
    '''
    adds the two log-likelihood columns snpLLH and hsnpLLH to df

    snpLLH:  llh of every VAF for a gaussian with the given mean and sigma
    hsnpLLH: llh for a gaussian centered at 1 with the same sigma; it is only
             computed for the rows with VAF above mean, the remaining rows are
             left to interpolate()

    both columns are written into the passed df; the output of interpolate() is returned
    '''

    # center gaussian
    show_output(f"Computing log-likelihood of VAF belonging to center gaussian [mean:{round(mean, 3)}, sigma:{round(sigma,3)}]")
    center_llh = llh(df['VAF'], mean, sigma)
    df.loc[:, 'snpLLH'] = center_llh

    # purity100 gaussian (hsnp): only the VAFs above the mean are used
    show_output(f"Computing log-likelihood of VAF belonging to purity100  [mean:1, sigma:{round(sigma,3)}]")
    high_llh = llh(df.query('@mean < VAF')['VAF'], 1, sigma)
    # assignment aligns on the index, rows without a value are filled by the interpolation
    df.loc[:, 'hsnpLLH'] = high_llh

    return interpolate(df, 'hsnpLLH', expand_limit=50)
    

def expand_SNPdata(snp_df, config):
    '''
    retrieve a few data columns locally to use rolling windows on
    VAF limits are also applied here

    snp_df: SNP dataframe with a VAF column (it is not modified)
    config: dict whose 'snp' entry provides
            filter.VAF          (lower, upper) VAF limits
            LLH.center_range    (min, max) VAF of the center band
            LLH.sigma_factor    factor applied to the std of the center band
            fallSNP             params for remove_fallSNP (only used if fallSNP.run is set)

    returns a new dataframe with the added columns offVAF and absVAF and the
    output of compute_snp_llh, limited to lower < VAF < upper, with a fresh index
    '''

    # split the params dict for easier access
    params = config['snp']
    llh_params = params['LLH']

    # reduce the snp_df using lower config limit
    # upper limit has to be set later as we still need the homoSNP llh
    VAFmin, VAFmax = params['filter']['VAF']
    # explicit copy: the columns added below are written into an independent
    # frame instead of a query result that pandas flags as a possible view
    # (source of the chained-assignment warning)
    snp_df = snp_df.query('@VAFmin < VAF').copy()

    # get the sigma and mean of the center band VAF (extracted as pd.Series center_vafs)
    minVAF, maxVAF = llh_params['center_range']
    center_vafs = snp_df.query('@minVAF < VAF < @maxVAF')['VAF']
    VAFstd = center_vafs.std()
    VAFmean = center_vafs.mean()

    # get additional features from VAFs
    # signed distance from the center band mean
    snp_df.loc[:, 'offVAF'] = (snp_df['VAF'] - VAFmean) * 2
    # absolute values for cluster
    snp_df.loc[:, 'absVAF'] = np.abs(snp_df['offVAF'])

    ########## remove fallSNP ########
    fs_params = params['fallSNP']
    if fs_params['run']:
        show_output('Removing falling SNPs')
        snp_df = remove_fallSNP(snp_df, mean=VAFmean, std=VAFstd, params=fs_params)

    ######## LLH  #####################
    # get width of gaussian from std * sigma_factor
    sigma = VAFstd * llh_params['sigma_factor']
    # hsnpLLH is computed in order to rescue high absVAF that would have been filtered out
    # lower VAF is already removed because density of VAF ~0 is highly irregular and would confound
    snp_df = compute_snp_llh(snp_df, mean=VAFmean, sigma=sigma)

    # now the upper VAF limit can be applied
    return snp_df.query('VAF < @VAFmax').reset_index(drop=True)

## filter snp data

In [None]:
def filter_snp(snp_df, config={}):
    """
    takes the config and applies pre-filtering for rolling computation

    snp_df: SNP dataframe with the columns PONVAF and PONDepth and one column
            per map* key of the filter config (it is not modified)
    config: dict whose ["filter"]["snp"] entry provides maxPONVAF and any
            number of map* keys (column name -> minimum value)

    returns the rows with PONVAF < maxPONVAF that reach the minimum of every
    configured map* column

    note: this definition shadows the filter_snp imported from combineCNV
    """

    c = config["filter"]["snp"]

    # PONVAF and PONDepth have to be filled to zero
    # this is done on a copy so the caller's dataframe stays untouched
    snp_df = snp_df.copy()
    for col in ['PONVAF', 'PONDepth']:
        snp_df[col] = snp_df[col].fillna(0)

    # ### build the query
    # pon query
    conditions = [f"PONVAF < {c['maxPONVAF']}"]
    # map query: extract the map limits from the filter config, e.g.
    # 'map30_0 >= 0.1 and map50_0 >= 0.1 and map75_1 >= 0.1 and map100_2 >= 0.1'
    conditions += [f"{m} >= {c[m]}" for m in c if m.startswith("map")]
    # joining the list keeps the query valid if no map limits are configured
    snp_query = " and ".join(conditions)

    show_output(f'Filtering SNP data using {snp_query.replace("@", "")}')
    return snp_df.query(snp_query)

In [None]:
# re-read the config (same call as in the config cell further up)
CNVconfig = get_CNVconfig("../config/config_CNV.yaml", local_config=path_config)

# apply the PON and mappability filters
filter_snp_df = filter_snp(snp_df, config=CNVconfig)

# whole genome and chr20 overview of the filtered SNPs
fig, _, _, _ = plot_snp(
    filter_snp_df, plots=[vaf], chroms="all", region='', **fig_params
)
fig, _, _, _ = plot_snp(
    filter_snp_df, plots=[vaf], chroms="chr20", region='', **fig_params
)
# zoomed regions; the third return value of plot_snp is kept as df1 / df2
fig, _, df1, _ = plot_snp(
    filter_snp_df, plots=[vaf], chroms="chr7", region=r7, **fig_params
)
fig, _, df2, _ = plot_snp(
    filter_snp_df, plots=[vaf], chroms="chr17", region=r17, **fig_params
)

In [None]:
# load the 'combine' section of the config and expand the SNP data
# NOTE(review): get_config and config_file are not defined or imported in any
# cell visible here - on a fresh kernel this raises a NameError; confirm where
# they come from
config = get_config(config_file, 'combine')
snp2_df = expand_SNPdata(snp_df, config)

In [None]:
# figure layout for the combined SNP / coverage plots
fig_params = {
    "figsize": (20, 4),
    "colormap": 'coolwarm_r',
    "color_chroms": True,
    "ylim": (-0, 1),
    "cov_offset": .1,  # how much log2ratio=0 is shifted above SNP-data
    "cov_height": .5,
    "label_size": 13,
}


# one dict per plot track: title, plot_type ['line', 'scatter'], data column, plot_args
absvaf = {
    "title": 'absVAF',
    "plot_type": 'scatter',
    "data": 'absVAF',
    "plot_args": {
        "linewidth": 0.5,
        "color": 'blue',
        "s": 2,
        "alpha": 1,
    },
}
deltavaf = {
    "title": 'deltaVAF',
    "plot_type": 'scatter',
    "data": 'deltaVAF',
    "plot_args": {
        "linewidth": 0.5,
        "color": 'green',
        "s": 5,
        "alpha": 1,
    },
}

offvaf = {
    "title": 'offVAF',
    "plot_type": 'scatter',
    "data": 'offVAF',
    "plot_args": {
        "linewidth": 0.5,
        "color": 'blue',
        "s": 5,
        "alpha": 1,
    },
}

vaf = {
    "title": 'VAF',
    "plot_type": 'scatter',
    "data": 'VAF',
    "plot_args": {
        "s": 2,
        "color": 'black',
        "cmap": 'viridis',
        "alpha": .7,
    },
}
r1 = 'chr17:3Mb-9Mb'

# NOTE(review): plot_snp2, log2 and log2mean are not defined in any cell visible here
fig, _, _, _ = plot_snp2(
    snp2_df,
    snp_plots=[vaf],
    cov_plots=[log2, log2mean],
    chroms=chroms,
    region='',
    **fig_params
)

In [None]:
# save the figure of the previous cell as <sample>.snp.clean.jpg in plot_path
fig.savefig(f'{plot_path}/{sample}.snp.clean.jpg')

## Rolling SNPdata

In [None]:
def rolling_SNP(snp_df, config):
    '''
    runs rolling_data on the SNP data with the settings from the config

    snp_df: SNP dataframe with a Depth column
    config: dict with the keys 'snp' (containing filter.minDepth, rolling_data
            and expand), 'ddof' and 'debug'

    rolling_data receives the full snp_df together with the subset of rows
    reaching minDepth; its output is returned unchanged
    '''

    snp_params = config['snp']

    # only SNPs with sufficient depth go into the filtered frame
    minDepth = snp_params['filter']['minDepth']

    return rolling_data(
        snp_df,
        snp_df.query('Depth >= @minDepth'),
        expand=snp_params['expand'],
        ddof=config['ddof'],
        debug=config['debug'],
        data_params=snp_params['rolling_data'],
    )

### test rolling windows

In [None]:
# reload the 'combine' config and run the rolling computation on the expanded SNP data
# NOTE(review): get_config and config_file are not defined or imported in any
# cell visible here - confirm where they come from
config = get_config(config_file, 'combine')
snp3_df = rolling_SNP(snp2_df, config)

## optimizing rolling windows

### VAF
+ std  (20)
+ var

In [None]:
# plot tracks: title, plot_type ['line', 'scatter'], data column, plot_args
vaf = {
    "title": 'VAF',
    "plot_type": 'scatter',
    "data": 'VAF',
    "plot_args": {
        "linewidth": 0.5,
        "color": 'blue',
        "s": 5,
        "alpha": 1,
    },
}

vafstd = {
    "title": 'VAFstd',
    "plot_type": 'line',
    "data": 'VAFstd',
    "plot_args": {
        "linewidth": 1,
        "color": 'yellow',
        "alpha": .7,
    },
}

vafstddiff = {
    "title": 'VAFstdDiff',
    "plot_type": 'line',
    "data": 'VAFstdDiff',
    "plot_args": {
        "linewidth": 1,
        "color": 'blue',
        "alpha": .7,
    },
}


# reload the config and redo the rolling computation
# NOTE(review): get_config and config_file are not defined in any cell visible here
config = get_config(config_file, 'combine')
snp3_df = rolling_SNP(snp2_df, config)

r1 = 'chr17:3Mb-9Mb'

# std_plots is assembled here but the call below passes its own track list
std_plots = [vaf, vafstd, vafstddiff]

# NOTE(review): this call passes snp_plots=, earlier plot_snp calls pass plots= - confirm the signature
fig, ax, df, chrom_df = plot_snp(
    snp3_df,
    snp_plots=[vaf, vafstd],
    chroms=chroms,
    region='',
    **fig_params
)

### absVAF
+ mean (20)
+ std

In [None]:
# plot tracks: title, plot_type ['line', 'scatter'], data column, plot_args
absvaf = {
    "title": 'absVAF',
    "plot_type": 'scatter',
    "data": 'absVAF',
    "plot_args": {
        "linewidth": 0.5,
        "color": 'blue',
        "s": 5,
        "alpha": 1,
    },
}

absvafmean = {
    "title": 'absVAFmean',
    "plot_type": 'line',
    "data": 'absVAFmean',
    "plot_args": {
        "linewidth": 1,
        "color": 'yellow',
        "alpha": .7,
    },
}

absvafmeandiff = {
    "title": 'absVAFdiff',
    "plot_type": 'line',
    "data": 'absVAFmeanDiff',
    "plot_args": {
        "linewidth": 1,
        "color": 'blue',
        "alpha": .7,
    },
}


# reload the config and redo the rolling computation
# NOTE(review): get_config and config_file are not defined in any cell visible here
config = get_config(config_file, 'combine')
snp3_df = rolling_SNP(snp2_df, config)


r1 = 'chr17:3Mb-9Mb'
mean_plots = [absvaf, absvafmean, absvafmeandiff]

# NOTE(review): this call passes snp_plots=, earlier plot_snp calls pass plots= - confirm the signature
fig, ax, df, chrom_df = plot_snp(
    snp3_df,
    snp_plots=mean_plots,
    chroms=chroms,
    region='',
    **fig_params
)

### snpLLH

In [None]:
# plot tracks: title, plot_type ['line', 'scatter'], data column, plot_args
snpllh = {
    "title": 'snpLLH',
    "plot_type": 'scatter',
    "data": 'snpLLH',
    "plot_args": {
        "linewidth": 0.5,
        "color": 'blue',
        "s": 5,
        "alpha": 1,
    },
}

snpllhsum = {
    "title": 'snpLLHsum',
    "plot_type": 'line',
    "data": 'snpLLHsum',
    "plot_args": {
        "linewidth": 1,
        "color": 'blue',
        "alpha": .7,
    },
}

hsnpllhsum = {
    "title": 'hsnpLLHsum',
    "plot_type": 'line',
    "data": 'hsnpLLHsum',
    "plot_args": {
        "linewidth": 1,
        "color": 'red',
        "alpha": .7,
    },
}


snpllhsumdiff = {
    "title": 'snpLLHsumdiff',
    "plot_type": 'line',
    "data": 'snpLLHsumDiff',
    "plot_args": {
        "linewidth": 1,
        "color": 'yellow',
        "alpha": .7,
    },
}


# reload the config and redo the rolling computation
# NOTE(review): get_config and config_file are not defined in any cell visible here
config = get_config(config_file, 'combine')
snp3_df = rolling_SNP(snp2_df, config)


r1 = 'chr17:3Mb-9Mb'
# only the VAF scatter and the snpLLHsum line are plotted
snpllh_plots = [vaf, snpllhsum]

# NOTE(review): this call passes snp_plots=, earlier plot_snp calls pass plots= - confirm the signature
fig, ax, df, chrom_df = plot_snp(
    snp3_df,
    snp_plots=snpllh_plots,
    chroms=chroms,
    region='',
    **fig_params
)

In [None]:
# plot only the rows with snpLLHsum below 0.2, with region set to chr17
fig, ax, df, chrom_df = plot_snp(snp3_df.query('snpLLHsum < 0.2'), snp_plots=[vaf,snpllhsum], chroms=chroms, region='chr17', **fig_params)

## bring all together
+ center snp_df
+ get extra data
+ do the rolling

In [None]:
def apply_rolling_SNP(snp_df, config):
    '''
    runs expand_SNPdata, rolling_SNP and get_CNV_blocks (on 'snpLLH') in
    sequence and splits the result into two output dataframes

    returns (rolling_snp_df, cluster_df):
    rolling_snp_df: the first four columns plus all remaining columns whose
                    name contains neither 'log2', 'cov' nor 'off'
    cluster_df:     the first four columns plus log2ratio, log2ratiomean, VAF,
                    absVAF, absVAFmean and every column whose name contains
                    'Center' or 'CNV'
    '''

    # get extra data, do the rolling, then get the CNV and Center blocks
    expanded_df = expand_SNPdata(snp_df, config)
    rolled_df = rolling_SNP(expanded_df, config)
    block_df = get_CNV_blocks(rolled_df, 'snpLLH', config)

    # select columns for output
    all_cols = list(block_df.columns)
    base_cols, data_cols = all_cols[:4], all_cols[4:]

    # SNP output: drop everything related to coverage and the off* helper columns
    excluded_tags = ('log2', 'cov', 'off')
    snp_cols = [
        col for col in data_cols
        if not any(tag in col for tag in excluded_tags)
    ]
    rolling_snp_df = block_df[base_cols + snp_cols]

    # cluster output: fixed feature columns plus all block columns
    block_cols = [col for col in all_cols if 'Center' in col or 'CNV' in col]
    cluster_cols = ['log2ratio', 'log2ratiomean', 'VAF', 'absVAF', 'absVAFmean'] + block_cols
    cluster_df = block_df[base_cols + cluster_cols]

    return rolling_snp_df, cluster_df

In [None]:
# run the complete SNP pipeline; config is the variable set by an earlier get_config cell
rolling_snp_df, cluster_df = apply_rolling_SNP(snp_df, config)

## run the code and visualize

In [None]:
sample = "03_A-B"
# reload the raw SNP data of the sample
snp_df = pd.read_csv(
    os.path.join(output_path, f"snp/{sample}.snp.gz"),
    sep="\t",
    compression="gzip",
)

# plot tracks: title, plot_type ['line', 'scatter'], data column, plot_args
vaf2 = {
    "title": 'VAF',
    "plot_type": 'scatter',
    "data": 'VAF2',
    "plot_args": {
        "s": 2,
        "c": 'snpCNVcore',
        "cmap": 'viridis',
        "alpha": .7,
    },
}

absvaf = {
    "title": 'absVAF',
    "plot_type": 'scatter',
    "data": 'absVAF2',
    "plot_args": {
        "linewidth": 0.2,
        "c": 'snpCNVcore',
        "cmap": 'viridis',
        "s": 5,
        "alpha": 1,
    },
}


snpllhsum = {
    "title": 'snpLLHsum',
    "plot_type": 'line',
    "data": 'snpLLH_sum2',
    "plot_args": {
        "linewidth": .5,
        "color": 'red',
        "alpha": .7,
    },
}

absvafmean = {
    "title": 'absVAFmean',
    "plot_type": 'line',
    "data": 'absVAF2_mean',
    "plot_args": {
        "linewidth": 1,
        "color": 'yellow',
        "alpha": .7,
    },
}

######################################################

# figure layout for the combined SNP / coverage plot
fig_params = {
    "figsize": (24, 4),
    "colormap": 'coolwarm_r',
    "color_chroms": True,
    "ylim": (-0, 1),
    "cov_offset": .1,  # how much log2ratio=0 is shifted above SNP-data
    "cov_height": .5,
    "label_size": 13,
}

chroms = ['chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr9', 'chr12', 'chr17']
r1 = 'chr17:3Mb-9Mb'


# NOTE(review): log2 and log2mean are not defined in any cell visible here, and
# the call uses the vaf track of an earlier cell instead of the vaf2 defined above
# plot_snp returns the tuple (fig, ax, df, chrom_df); only fig is kept
fig, _, _, _ = plot_snp(
    cluster_df,
    snp_plots=[vaf],
    cov_plots=[log2, log2mean],
    chroms=chroms,
    region='',
    **fig_params
)