## make plots for all samples

In [1]:
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# use seaborn plotting defaults
import seaborn as sns; sns.set()
# TODO: the chained assignment that makes pandas complain has not been tracked down yet;
# until then the chained-assignment check is switched off for the whole notebook
pd.set_option('mode.chained_assignment', None)

# load the config
# (settings are edited directly in the yaml file, not in this notebook)
import yaml
config_file = '../config/config_devel.yaml'
def get_config(config_file, param):
    '''
    read the yaml file at config_file and return the entry param
    of its top-level 'CNV' section
    raises KeyError if 'CNV' or param is missing
    '''
    with open(config_file) as handle:
        # FullLoader converts the YAML document into plain Python objects
        # NOTE(review): only use this on trusted config files - yaml.safe_load is the stricter option
        parsed = yaml.load(handle, Loader=yaml.FullLoader)
    cnv_section = parsed['CNV']
    return cnv_section[param]
# settings of the 'combine' entry in the CNV section of the config file
config = get_config(config_file, 'combine')



# user name that selects the home folder for the data paths built below
user = 'martinscience'
# HOME
# NOTE(review): `home` is assigned twice (the second value wins) and is not read
# in any cell visible here - the paths below are built from `HOME` instead
home = '/Users/mahtin'
home = '/Users/martinscience'

# get the code
# ../scripts is added to the import path before the project imports
# (presumably that is where codeCNV and script_utils live)
import sys
sys.path.append('../scripts')
from codeCNV.plot import plot_snp, plot_2d, plot_3d, plot_snp2
# NOTE(review): apply_rolling_SNP is defined again in a later cell of this notebook,
# which replaces the version imported here
from codeCNV.rollingSNP import apply_rolling_SNP
from script_utils import show_output


# user = 'mahtin'
# root folder of the WES data, derived from the selected user
HOME = f"/Users/{user}"
wes_path = f"{HOME}/Dropbox/Icke/Work/somVar/AMLMono7/WESData"
path = f"{wes_path}"
# list the data folder (IPython shell escape) as a quick check that the path resolves
!ls {path}

AMLPresi.pptx         [34mP559[m[m                  SNPplots.ipynb
AML_relapse.csv       [34mP615[m[m                  [34mSampleCheck[m[m
AML_relapse.xlsx      [34mP625[m[m                  WES CNV Analysis.xlsx
AML_relapse1.xlsx     [34mP665[m[m                  filter1.csv
[34mCNVplots[m[m              [34mP685[m[m                  filter1GDC.csv
[34mCNVraw[m[m                [34mP778[m[m                  samples.csv
[34mP12[m[m                   [34mPGDC[m[m
[34mP483[m[m                  [34mPmerge[m[m


In [2]:
from codeCNV.rollingCNV import interpolate, one_col_rolling, llh, get_blocks, rolling_data, get_CNV_blocks
from script_utils import show_output

def make_get_density(window_size=20):
    '''
    factory for the density function handed to the rolling computation
    the returned function maps a window of values to
    (largest value - smallest value) / window_size
    '''

    # the inner function keeps the name SNPdensity: remove_fallSNP reads a
    # 'SNPdensity' column after rolling with it (presumably the column is named
    # after the function - confirm in one_col_rolling)
    def SNPdensity(data):
        span = data.max() - data.min()
        return span / window_size

    return SNPdensity

def remove_fallSNP(snp_df, mean=0.5, std=0.2, params={}):
    '''
    drop the "falling" SNPs (probably caused by mismapping), one chromosome at a time

    per chromosome two rolling metrics are computed from the SNPs with VAF < 0.95:
    the SNP density over 'ExonPos' (scaled by its chromosome mean) and the rolling
    sum of 'offVAF'; their product is stored as 'fallSNP'
    a SNP is kept if VAF > mean - std / 2 or if fallSNP > params['maxFallSNP']

    params needs the keys 'offVAFwindow' (rolling window size) and 'maxFallSNP'
    returns all chromosomes combined, sorted by 'FullExonPos', with a fresh index
    '''

    rolling_window = params['offVAFwindow']
    # keep the name: it is referenced as @cutoff in the query string below
    cutoff = params['maxFallSNP']

    density_func = make_get_density(rolling_window)

    # plain loop on purpose: the @names in the query strings are looked up in this function's scope
    kept_chroms = []
    for chrom in snp_df['Chr'].unique():
        chrom_df = snp_df.query('Chr == @chrom')

        # rolling SNP density, expressed relative to the chromosome average
        chrom_df = one_col_rolling(
            chrom_df,
            chrom_df.query('VAF < 0.95'),
            'ExonPos',
            density_func,
            window_size=rolling_window,
            diff_exp=4
        )
        chrom_df.loc[:, 'SNPdensity'] = chrom_df['SNPdensity'] / chrom_df['SNPdensity'].mean()

        # rolling sum of the offVAF values
        chrom_df = one_col_rolling(
            chrom_df,
            chrom_df.query('VAF < 0.95'),
            'offVAF',
            'sum',
            window_size=rolling_window,
            normalize=True,
            diff_exp=4
        )

        # both metrics combined into one score
        chrom_df.loc[:, 'fallSNP'] = chrom_df['SNPdensity'] * chrom_df['offVAFsum']
        # low-VAF SNPs only survive if their score exceeds the cutoff
        kept_chroms.append(chrom_df.query('VAF > @mean - @std / 2 or fallSNP > @cutoff'))

    combined = pd.concat(kept_chroms)
    return combined.sort_values('FullExonPos').reset_index(drop=True)


def compute_snp_llh(df, mean=0.5, sigma=0.2):
    '''
    add two log-likelihood columns to df (df itself is written to):
    snpLLH:  llh of every VAF for a gaussian around mean
    hsnpLLH: llh of the VAFs above mean for a gaussian around 1,
             the remaining rows are filled by interpolate
    returns the frame returned by interpolate
    '''

    center_msg = f"Computing log-likelihood of VAF belonging to center gaussian [mean:{round(mean, 3)}, sigma:{round(sigma,3)}]"
    show_output(center_msg)
    df.loc[:, 'snpLLH'] = llh(df['VAF'], mean, sigma)

    # the homoSNP (hsnp) likelihood only uses the VAFs above the center
    vafs_above_center = df.query('@mean < VAF')['VAF']

    hsnp_msg = f"Computing log-likelihood of VAF belonging to purity100  [mean:1, sigma:{round(sigma,3)}]"
    show_output(hsnp_msg)
    # only part of the rows get a value here, the gaps are closed by interpolation
    df.loc[:, 'hsnpLLH'] = llh(vafs_above_center, 1, sigma)
    return interpolate(df, 'hsnpLLH', expand_limit=50)
    

def expand_SNPdata(snp_df, config):
    '''
    add the VAF-derived columns needed for the rolling windows to the SNP data
    VAF limits are also applied here

    snp_df: DataFrame with (at least) a 'VAF' column; remove_fallSNP and
            compute_snp_llh need further columns
    config: dict, read via config['snp']:
            'filter'  -> 'VAF' (lower and upper VAF limit)
            'LLH'     -> 'center_range' (VAF band used for mean/std) and 'sigma_factor'
            'fallSNP' -> 'run' (switch) plus the params passed on to remove_fallSNP

    returns a new DataFrame restricted to VAFmin < VAF < VAFmax with a fresh index
    and the added columns offVAF, absVAF, snpLLH and hsnpLLH;
    the frame passed in is left untouched
    '''

    # split the params dict for easier access
    params = config['snp']
    filter_params = params['filter']

    # reduce the snp_df using lower config limit
    # upper limit has to be set later as we still need the homoSNP llh
    VAFmin, VAFmax = filter_params['VAF']
    # explicit copy: new columns are assigned below, and writing to the bare query
    # result is the chained assignment pandas warns about (SettingWithCopyWarning)
    snp_df = snp_df.query('@VAFmin < VAF').copy()

    # mean and std are taken from the center band of the VAFs only
    minVAF, maxVAF = params['LLH']['center_range']
    center_vafs = snp_df.query('@minVAF < VAF < @maxVAF')['VAF']
    VAFstd = center_vafs.std()
    VAFmean = center_vafs.mean()

    # get additional features from VAFs
    # offVAF: distance to the center mean, scaled by 2
    snp_df.loc[:, 'offVAF'] = (snp_df['VAF'] - VAFmean) * 2
    # absolute values for cluster
    snp_df.loc[:, 'absVAF'] = np.abs(snp_df['offVAF'])

    ########## remove fallSNP ########
    fs_params = params['fallSNP']
    if fs_params['run']:
        show_output('Removing falling SNPs')
        snp_df = remove_fallSNP(snp_df, mean=VAFmean, std=VAFstd, params=fs_params)

    ######## LLH  #####################
    # width of the gaussian is the center std scaled by sigma_factor
    sigma = VAFstd * params['LLH']['sigma_factor']
    # hsnpLLH is computed in order to rescue high absVAF that would have been filtered out
    # lower VAF is already removed because density of VAF ~0 is highly irregular and would confound
    snp_df = compute_snp_llh(snp_df, mean=VAFmean, sigma=sigma)

    # the upper VAF limit is only applied now, after the homoSNP llh has been computed
    return snp_df.query('VAF < @VAFmax').reset_index(drop=True)

def rolling_SNP(snp_df, config):
    '''
    run the rolling window computations configured for the snp data
    rolling_data receives the full snp_df plus the subset of SNPs with
    Depth >= config['snp']['filter']['minDepth']
    returns whatever rolling_data returns
    '''

    snp_config = config['snp']
    depth_filter = snp_config['filter']
    window_specs = snp_config['rolling_data']
    debug_mode = config['debug']

    # keep the name: it is referenced as @minDepth in the query string
    minDepth = depth_filter['minDepth']
    deep_enough_df = snp_df.query('Depth >= @minDepth')

    return rolling_data(
        snp_df,
        deep_enough_df,
        expand=snp_config['expand'],
        ddof=config['ddof'],
        debug=debug_mode,
        data_params=window_specs
    )

def apply_rolling_SNP(snp_df, config):
    '''
    full SNP pipeline: expand_SNPdata -> rolling_SNP -> get_CNV_blocks on 'snpLLH'
    returns two views of the result, both starting with the first four columns:
    rolling_snp_df: all further columns without 'log2', 'cov' or 'off' in their name
    cluster_df:     the log2ratio / VAF columns plus every 'Center' and 'CNV' column
    '''

    # get extra data, do the rolling, then get the CNV and Center blocks
    expanded_df = expand_SNPdata(snp_df, config)
    rolled_df = rolling_SNP(expanded_df, config)
    result_df = get_CNV_blocks(rolled_df, 'snpLLH', config)

    all_cols = result_df.columns
    base_cols = list(all_cols[:4])

    # output 1: the snp columns
    excluded_tags = ('log2', 'cov', 'off')
    snp_cols = [
        col for col in all_cols[4:]
        if all(tag not in col for tag in excluded_tags)
    ]
    rolling_snp_df = result_df[base_cols + snp_cols]

    # output 2: the columns used for clustering
    block_cols = [col for col in all_cols if 'Center' in col or 'CNV' in col]
    cluster_cols = ['log2ratio', 'log2ratiomean', 'VAF', 'absVAF', 'absVAFmean'] + block_cols
    cluster_df = result_df[base_cols + cluster_cols]

    return rolling_snp_df, cluster_df

In [None]:
# sample to inspect
project, sample = "P483", "01_A"

# read the raw SNP table of that sample and add the derived VAF / LLH columns
snp_df = expand_SNPdata(
    pd.read_csv(os.path.join(wes_path, f"CNVraw/{project}/{sample}.snp"), sep='\t'),
    config
)
snp_df

In [None]:
# every plot description has a title, a plot_type ('line' or 'scatter'),
# the data column to draw and the keyword arguments for the plot call

# rolling mean of the log2ratio as a line
log2mean = {
    'title': 'rollinglog2ratio',
    'plot_type': 'line',
    'data': 'log2ratiomean',
    'plot_args': {'linewidth': 1, 'color': 'yellow', 'alpha': .7},
}

# raw log2ratio as scatter
log2 = {
    'title': 'log2ratio',
    'plot_type': 'scatter',
    'data': 'log2ratio',
    'plot_args': {
        'linewidth': 0.3,
        'color': 'black',
        'cmap': 'binary',
        's': 2,
        'alpha': 1,
    },
}

# VAF of the SNPs as scatter
vaf = {
    'title': 'VAF',
    'plot_type': 'scatter',
    'data': 'VAF',
    'plot_args': {'s': 1, 'color': 'black', 'cmap': 'viridis', 'alpha': 1},
}

# chromosome selection and example region (not passed to the plot call in this cell)
chroms = ['chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr9', 'chr12', 'chr17']
r1 = 'chr17:3Mb-9Mb'

######################################################

fig_params = {
    'figsize': (24, 5),
    'colormap': 'coolwarm_r',
    'color_chroms': True,
    'ylim': (0, 1),
    'cov_offset': .1,  # how much log2ratio=0 is shifted above SNP-data
    'cov_height': .5,
    'label_size': 13,
}

# plot_snp2 returns four values (fig, ax, df, chrom_df); only the figure is kept
fig, _, _, _ = plot_snp2(
    snp_df.query('0.03 < VAF < 0.95'),
    snp_plots=[vaf],
    cov_plots=[log2, log2mean],
    chroms="all",
    region='',
    **fig_params
)
#fig, ax, df, chrom_df

In [None]:
# save the figure of the single sample as svg and jpg
project = "P483"
sample = "01_A"
svg_file = os.path.join(wes_path, f"CNVplots/svg/{project}/{sample}.svg")
jpg_file = os.path.join(wes_path, f"CNVplots/jpg/{project}/{sample}.jpg")
# savefig does not create missing folders
for plot_file in (svg_file, jpg_file):
    os.makedirs(os.path.dirname(plot_file), exist_ok=True)
fig.savefig(svg_file)
# the quality keyword of savefig was removed in newer Matplotlib,
# the JPEG quality is passed on to Pillow via pil_kwargs instead
fig.savefig(jpg_file, pil_kwargs={"quality": 90})

### Making sample_df for all clustered samples

In [13]:
def get_sample_name(file):
    '''
    short sample name for a file path:
    the file name up to its first "." is split at "_", the first two parts are
    joined and every "-B" is removed from the second part ("a/01_A-B.snp" -> "01A")
    raises IndexError if the name contains no "_"
    (the same helper is defined in several cells of this notebook)
    '''
    stem = os.path.basename(file).partition(".")[0]
    parts = stem.split('_')
    return parts[0] + parts[1].replace("-B", "")


# collect every .cluster file below <wes_path>/<project>/CNV into one table
# and store it as cluster_samples.csv
projects = [f"P{i}" for i in [483,559,615,625,665,685,778,"merge"]]

# rows are gathered in a list and turned into a DataFrame once:
# DataFrame.append copied the whole frame per row and no longer exists in pandas 2
cluster_records = []
for project in projects:
    cnv_folder = os.path.join(wes_path, f"{project}/CNV")
    print(f"Looking for cluster files in {cnv_folder}")
    for folder, _, files in os.walk(cnv_folder):
        for file in files:
            if file.endswith(".cluster"):
                # sample name is the file name without extension and underscores
                sample = file.replace(".cluster", "").replace("_", "")
                file = os.path.join(folder, file)
                cluster_records.append(dict(sample=sample, file=file, project=project))
# fixed column order (same as before); the columns also exist if nothing was found
sample_df = pd.DataFrame(cluster_records, columns=['sample', 'file', 'project'])
sample_df.to_csv(os.path.join(wes_path, "cluster_samples.csv"), sep='\t', index=False)

Looking for cluster files in /Users/martinscience/Dropbox/Icke/Work/somVar/AMLMono7/WESData/P483/CNV
Looking for cluster files in /Users/martinscience/Dropbox/Icke/Work/somVar/AMLMono7/WESData/P559/CNV
Looking for cluster files in /Users/martinscience/Dropbox/Icke/Work/somVar/AMLMono7/WESData/P615/CNV
Looking for cluster files in /Users/martinscience/Dropbox/Icke/Work/somVar/AMLMono7/WESData/P625/CNV
Looking for cluster files in /Users/martinscience/Dropbox/Icke/Work/somVar/AMLMono7/WESData/P665/CNV
Looking for cluster files in /Users/martinscience/Dropbox/Icke/Work/somVar/AMLMono7/WESData/P685/CNV
Looking for cluster files in /Users/martinscience/Dropbox/Icke/Work/somVar/AMLMono7/WESData/P778/CNV
Looking for cluster files in /Users/martinscience/Dropbox/Icke/Work/somVar/AMLMono7/WESData/Pmerge/CNV


### making plots for the merge check

In [None]:
# the merge check only covers project P12
projects = [f"P{project_id}" for project_id in ("12",)]

def get_sample_name(file):
    '''
    short sample name for a file path:
    the file name up to its first "." is split at "_", the first two parts are
    joined and every "-B" is removed from the second part ("a/01_A-B.snp" -> "01A")
    raises IndexError if the name contains no "_"
    (the same helper is defined in several cells of this notebook)
    '''
    stem = os.path.basename(file).partition(".")[0]
    parts = stem.split('_')
    return parts[0] + parts[1].replace("-B", "")


# plot every .snp file found below <wes_path>/CNVraw/<project>
# and save the figure as svg and jpg below <wes_path>/CNVplots
sample_list = []
filter_lists = []
for project in projects:
    raw_folder = os.path.join(wes_path, f"CNVraw/{project}")
    print(f"Looking for filter files in {raw_folder}")
    # savefig does not create missing folders
    svg_folder = os.path.join(wes_path, f"CNVplots/svg/{project}")
    jpg_folder = os.path.join(wes_path, f"CNVplots/jpg/{project}")
    os.makedirs(svg_folder, exist_ok=True)
    os.makedirs(jpg_folder, exist_ok=True)
    for folder, _, files in os.walk(raw_folder):
        for file in files:
            if not file.endswith(".snp"):
                continue
            # sample name is the file name without extension and underscores
            sample = file.replace(".snp", "").replace("_", "")
            print(sample)
            print(f"Found sample {sample} as file {file}")
            # load sample
            snp_df = pd.read_csv(os.path.join(folder, file), sep='\t')
            snp_df = expand_SNPdata(snp_df, config)
            fig, _, _, _ = plot_snp2(snp_df.query('0.03 < VAF < 0.95'), snp_plots=[vaf], cov_plots=[log2,log2mean], chroms="all", region='', **fig_params)
            fig.savefig(os.path.join(svg_folder, f"{sample}.svg"))
            # the quality keyword of savefig was removed in newer Matplotlib,
            # the JPEG quality is passed on to Pillow via pil_kwargs instead
            fig.savefig(os.path.join(jpg_folder, f"{sample}.jpg"), pil_kwargs={"quality": 90})
            # release the figure, otherwise one open figure per sample piles up in memory
            # (closed figures are no longer rendered below the cell, the files are the result)
            plt.close(fig)

### making plots for all samples

In [None]:
# all project folders to be plotted
projects = [f"P{project_id}" for project_id in (483, 559, 615, 625, 665, 685, 778, "merge")]

def get_sample_name(file):
    '''
    short sample name for a file path:
    the file name up to its first "." is split at "_", the first two parts are
    joined and every "-B" is removed from the second part ("a/01_A-B.snp" -> "01A")
    raises IndexError if the name contains no "_"
    (the same helper is defined in several cells of this notebook)
    '''
    stem = os.path.basename(file).partition(".")[0]
    parts = stem.split('_')
    return parts[0] + parts[1].replace("-B", "")


# plot every .snp file found below <wes_path>/CNVraw/<project>
# and save the figure as svg and jpg below <wes_path>/CNVplots
sample_list = []
filter_lists = []
for project in projects:
    raw_folder = os.path.join(wes_path, f"CNVraw/{project}")
    print(f"Looking for filter files in {raw_folder}")
    # savefig does not create missing folders
    svg_folder = os.path.join(wes_path, f"CNVplots/svg/{project}")
    jpg_folder = os.path.join(wes_path, f"CNVplots/jpg/{project}")
    os.makedirs(svg_folder, exist_ok=True)
    os.makedirs(jpg_folder, exist_ok=True)
    for folder, _, files in os.walk(raw_folder):
        for file in files:
            if not file.endswith(".snp"):
                continue
            # sample name is the file name without extension and underscores
            sample = file.replace(".snp", "").replace("_", "")
            print(f"Found sample {sample} as file {file}")
            # load sample
            snp_df = pd.read_csv(os.path.join(folder, file), sep='\t')
            snp_df = expand_SNPdata(snp_df, config)
            fig, _, _, _ = plot_snp2(snp_df.query('0.03 < VAF < 0.95'), snp_plots=[vaf], cov_plots=[log2,log2mean], chroms="all", region='', **fig_params)
            fig.savefig(os.path.join(svg_folder, f"{sample}.svg"))
            # the quality keyword of savefig was removed in newer Matplotlib,
            # the JPEG quality is passed on to Pillow via pil_kwargs instead
            fig.savefig(os.path.join(jpg_folder, f"{sample}.jpg"), pil_kwargs={"quality": 90})
            # release the figure, otherwise one open figure per sample piles up in memory
            # (closed figures are no longer rendered below the cell, the files are the result)
            plt.close(fig)
# NOTE(review): sample_list is never filled in this cell, so this always evaluates to []
[get_sample_name(s) for s in sample_list]

## running rollingSNP for all samples

In [None]:
# all project folders to run the rolling computation for
projects = [f"P{project_id}" for project_id in (483, 559, 615, 625, 665, 685, 778, "merge")]

def get_sample_name(file):
    '''
    short sample name for a file path:
    the file name up to its first "." is split at "_", the first two parts are
    joined and every "-B" is removed from the second part ("a/01_A-B.snp" -> "01A")
    raises IndexError if the name contains no "_"
    (the same helper is defined in several cells of this notebook)
    '''
    stem = os.path.basename(file).partition(".")[0]
    parts = stem.split('_')
    return parts[0] + parts[1].replace("-B", "")


# run apply_rolling_SNP on the raw .snp files and write the results to
# <wes_path>/<project>/CNV as <name>.roll.snp and <name>.cluster
sample_list = []
filter_lists = []
for project in projects:
    folder = os.path.join(wes_path, f"CNVraw/{project}")
    print(f"Looking for filter files in {folder}")
    for folder, _, files in os.walk(folder):
        for file in files:
            # only .snp files without "roll" in their name
            if "roll" in file or not file.endswith(".snp"):
                continue
            sample = file.replace(".snp", "").replace("_", "")
            # NOTE(review): only the samples 27B and 30R are processed here
            if sample not in ("27B", "30R"):
                continue
            print(f"Found sample {sample} as file {file}")
            # load sample and compute the rolling data
            in_file = os.path.join(folder, file)
            snp_df, cluster_df = apply_rolling_SNP(pd.read_csv(in_file, sep='\t'), config)
            # both output names are derived from the input file name
            out_file = os.path.join(wes_path, f"{project}/CNV/{file}")
            roll_file = out_file.replace(".snp", ".roll.snp")
            cluster_file = out_file.replace(".snp", ".cluster")
            snp_df.to_csv(roll_file, sep='\t', index=False)
            cluster_df.to_csv(cluster_file, sep='\t', index=False)

In [None]:
### Making sample_df for all clustered samples