## 2d-plot snpVAF vs log2ratio and assign CNVstatus for individual blocks

### SETUP

In [None]:
# HOME
# base directory of the user account; the second assignment overrides the
# first, so only '/Users/martinscience' is in effect for the paths below
home = '/Users/mahtin'
home = '/Users/martinscience'

import os
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# use seaborn plotting defaults
import seaborn as sns; sns.set()

# get the code
sys.path.append('../scripts')
from codeCNV.plot import plot_genomic, plot_snp2, plot_snp, plot_2d, plot_3d
from script_utils import show_output

# standard paths
# all of these are built below the home folder defined above
testdata = os.path.join(home,"Dropbox/Icke/Work/somVar/testdata")
static_path = os.path.join(home, "Dropbox/Icke/Work/static")
cluster_path = os.path.join(home, "mount")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")

# load the config
# edit config directly in yaml file
import yaml
# relative path to the development config file
config_file = '../config/config_devel.yaml'
def get_config(config_file, param):
    '''
    read the YAML config file and return the entry stored under CNV -> param
    raises KeyError if the 'CNV' section or the param is missing
    '''

    with open(config_file) as handle:
        # FullLoader handles the conversion from YAML values to Python objects
        cnv_section = yaml.load(handle, Loader=yaml.FullLoader)['CNV']
    return cnv_section[param]
# read the 'cluster' entry of the CNV section from the config file
config = get_config(config_file, 'cluster')

# CNV data folder below tooldata and its output subfolder
cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")
# folder for figures below the home folder
plot_path = f'{home}/Dropbox/Icke/Work/myLabmeeting/figures/matplotlib'

###  get the sample

In [None]:
# sample name used to build the file name of the cluster table
sample = "06_A"
# tab-separated table read from <output_path>/CNV/<sample>.cluster
cluster_df = pd.read_csv(os.path.join(output_path, f'CNV/{sample}.cluster'), sep='\t')
# display the table
cluster_df

### strategy
+ get the means and sigma from the center_df
+ adjust absVAF and log2ratio of CNV_df
+ plot
+ find most likely purity by llh2d_mask

In [None]:
def center_data(cluster_df):
    '''
    split cluster_df into CNV points and center points and shift both onto the center

    center points are the rows where covCentercore + snpCentercore == 2
    CNV points are the rows where snpCNVcore > 0
    the center means of log2ratio, log2ratiomean, absVAF and absVAFmean are
    subtracted from both frames; VAF is shifted so that the center mean lands on 0.5

    returns (cnv_df, center_df, center_params) with center_params holding the
    mean and std (as columns) of the unshifted center points per data column
    '''

    shifted = ['log2ratio', 'log2ratiomean', 'absVAF', 'absVAFmean']

    # points that belong to both Centercores
    centre = cluster_df.query('covCentercore + snpCentercore == 2').copy()
    # mean and std are taken before any shifting is applied
    stats = centre[shifted + ['VAF']].agg(['mean', 'std']).T
    # points flagged as snpCNVcore
    cnv = cluster_df.query('snpCNVcore > 0').copy()

    means = stats['mean']
    for frame in (centre, cnv):
        for name in shifted:
            frame.loc[:, name] = frame[name] - means[name]
        # VAF is re-centered around 0.5 instead of 0
        frame.loc[:, 'VAF'] = frame['VAF'] + 0.5 - means['VAF']
    return cnv, centre, stats

In [None]:
# split the cluster table into shifted CNV points and center points
cnv_df, center_df, center_params = center_data(cluster_df)
# display mean and std of the center points
center_params

### 2-dimensional llh

In [None]:
def llh2d(dx, dy, mx=0, my=0, sx=0.5, sy=0.1):
    '''
    density of an axis-aligned 2d gaussian evaluated at (dx, dy)
    mx, my are the means and sx, sy the standard deviations along x and y
    dx and dy can be scalars, pd.Series or np.array
    '''

    # standardized distances from the mean along each axis
    zx = (dx - mx) / sx
    zy = (dy - my) / sy
    # normalization constant of the uncorrelated 2d gaussian
    norm = 2 * np.pi * sx * sy
    return np.exp((zx ** 2 + zy ** 2) / -2) / norm

### get all the means depending on alpha
+ will be used for making gaussians


In [None]:
# start with a simple dataFrame
import math

def get_gauss_mask(alpha, Nmax):
    '''
    returns the gauss params for the gauss mask

    for every total copy number N in 1..Nmax and every B-allele count i in
    0..N//2 one row is created with
        type:      "A" * (N - i) + "B" * i   (the single-copy state "A" is named "LOH")
        absVAF:    alpha * |2i / N - 1|
        log2ratio: log2(2 + alpha * (N - 2)) - 1
    alpha is capped at 1
    rows with absVAF == 0 and log2ratio == 0 are removed
    returns a DataFrame with columns type, absVAF and log2ratio
    (empty if Nmax < 1)
    '''

    alpha = min(1, alpha)
    # collect all rows first and build the frame once
    # (DataFrame.append is gone in pandas >= 2.0 and copied the frame per row)
    records = []
    for N in range(1, int(Nmax) + 1):
        # the log2ratio only depends on the total copy number
        log2 = np.log2(2 + alpha * (N - 2)) - 1
        # N // 2 + 1 == ceil((N + 1) / 2): B-counts beyond N / 2 mirror the A-heavy states
        for i in range(N // 2 + 1):
            label = "A" * (N - i) + "B" * i
            if label == 'A':
                label = 'LOH'
            absVAF = alpha * np.abs((2 * i) / N - 1)
            records.append({'type': label, 'absVAF': absVAF, 'log2ratio': log2})
    # fixed columns keep the filter below valid for an empty mask
    df = pd.DataFrame(records, columns=['type', 'absVAF', 'log2ratio'])
    keep = (df['absVAF'] != 0) | (df['log2ratio'] != 0)
    return df[keep]


# example mask: copy numbers up to 6 with alpha = 1
Nmax = 6
alpha = 1
gauss_mask = get_gauss_mask(alpha, Nmax)
# display the mask
gauss_mask

#### mask2VAF for converting to VAF gaussians

In [None]:
def mask2VAF(mask_df):
    '''
    turns an absVAF gauss mask into a VAF mask
    every absVAF is mirrored around 0.5 into VAF- = 0.5 - absVAF / 2 and
    VAF+ = 0.5 + absVAF / 2
    returns a new frame with the columns VAF-, VAF+, log2ratio and type
    the input frame is left untouched
    '''

    half_width = mask_df['absVAF'] / 2
    # assign works on a copy of the incoming mask
    vaf_mask = mask_df.assign(**{
        'VAF-': 0.5 - half_width,
        'VAF+': 0.5 + half_width,
    })
    return vaf_mask.loc[:, ['VAF-', 'VAF+', 'log2ratio', 'type']]


def VAFmask(alpha, Nmax=6):
    '''
    builds the gauss mask for alpha and Nmax and converts it into a VAF mask
    '''

    absvaf_mask = get_gauss_mask(alpha, Nmax)
    return mask2VAF(absvaf_mask)

# display the VAF mask for the alpha and Nmax set above
VAFmask(alpha, Nmax)

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer accept the old names
# try the new name first and fall back to the old one for older installations
try:
    plt.style.use('seaborn-v0_8-white')
except OSError:
    plt.style.use('seaborn-white')

def plot_gaussian(df, xcol, ycol, 
                  df2=pd.DataFrame(), 
                  Nmax=0,   # maximal expected
                  alpha=1,
                  gauss_params=pd.DataFrame(), # the center_params containing the std
                  logmax=2.5, 
                  std_factor = 1,
                  rings=10, # number of rings for contour
                  figsize=(6, 6)):
    '''
    scatter plot of df[xcol] vs df[ycol] with an optional overlay of gaussian contours

    df:           points plotted with marker size .1
    df2:          optional second set of points plotted in red on top (skipped if empty)
    Nmax:         passed to get_gauss_mask; 0 (default) skips the gaussian overlay
    alpha:        passed to get_gauss_mask
    gauss_params: frame with a 'std' column indexed by column name
                  its rows xcol and ycol are read when Nmax is set
    logmax:       upper axis limit for columns containing 'log'
    std_factor:   factor applied to both standard deviations
    rings:        passed to ax.contour as the levels argument
    figsize:      passed to plt.subplots

    returns fig, ax
    '''
    fig, ax = plt.subplots(figsize=figsize)
    _ = ax.scatter(df[xcol], df[ycol], s=.1)
    # overlay the second data set only if it contains rows
    if len(df2.index):
        _ = ax.scatter(df2[xcol], df2[ycol], s=1, color='red')
    _ = ax.set_xlabel(xcol, fontsize=15)
    _ = ax.set_ylabel(ycol, fontsize=15)
    _ = sns.despine(ax=ax, offset=0)
    _ = ax.spines['left'].set_position('zero')

    
    def get_lims(col):
        # axis limits are chosen by column name; 'log' is checked before 'VAF'
        # a name containing neither returns None
        if 'log' in col:
            return (-1.5, logmax)
        if 'VAF' in col:
            return (-0.05, 1.05)
    _ = ax.set_xlim(get_lims(xcol))
    _ = ax.set_ylim(get_lims(ycol))
    
    # add gaussian mask
    if Nmax:
        # create the grid
        # 500 x 400 grid spanning the axis limits
        x = np.linspace(*get_lims(xcol), 500)
        y = np.linspace(*get_lims(ycol), 400)
        X, Y = np.meshgrid(x, y)
        # Z accumulates the sum of all gaussian densities on the grid
        Z = np.zeros_like(X)
        
        # get the std from the params
        sx, sy = gauss_params.loc[[xcol, ycol], 'std']
        # the unscaled standard deviations are printed
        print(sx, sy)
        sx *= std_factor
        sy *= std_factor
        gaussians = get_gauss_mask(alpha, Nmax)
        if xcol == 'VAF':
            # for VAF every gaussian is placed twice (VAF- and VAF+)
            gaussians = mask2VAF(gaussians)
            for _, row in gaussians.iterrows():
                mx1 = row['VAF-']
                mx2 = row['VAF+']
                my = row['log2ratio']
                Z += llh2d(X,Y, mx1, my, sx, sy) +   llh2d(X,Y, mx2, my, sx, sy)
                ax.text(mx1, my-0.2, row['type'], ha='center')
                # the label at VAF+ has A and B swapped (G is a temporary placeholder)
                ax.text(mx2, my-0.2, row['type'].replace('A', 'G').replace('B', 'A').replace('G', 'B'), ha='center')
        else:
            # any other xcol uses the absVAF means of the mask
            for _, row in gaussians.iterrows():
                mx = row['absVAF']
                my = row['log2ratio']
                Z += llh2d(X,Y, mx, my, sx, sy)
                ax.text(mx, my-0.2, row['type'], ha='center')
        _ = ax.contour(X,Y,Z, rings, colors='black', alpha=1,linewidths=.4)
    # set the y-spine
    # reference lines at y=0, x=0 and x=1
    _ = ax.axhline(y=0, color='k')
    _ = ax.axvline(x=0, color='k')
    _ = ax.axvline(x=1, color='k')
    if xcol == 'VAF':
        # dashed line at VAF = 0.5
        _ = ax.axvline(x=0.5, c='k',ls="--")

    return fig, ax

## test samples

In [None]:
# plot settings for the raw log2ratio track
log2 = {
    'title': 'log2ratio',
    'plot_type': 'scatter',   # ['line', 'scatter']
    'data': 'log2ratio',
    'plot_args': {
        'linewidth': 0.3,
        'color': 'black',
        's': 1,
        'alpha': .7,
    },
}

# plot settings for the log2ratiomean track
log2mean = {
    'title': 'rollinglog2ratio',
    'plot_type': 'line',   # ['line', 'scatter']
    'data': 'log2ratiomean',
    'plot_args': {
        'linewidth': 1,
        'color': 'yellow',
        'alpha': .7,
    },
}

# plot settings for the VAF track
vaf = {
    'title': 'VAF',
    'plot_type': 'scatter',   # ['line', 'scatter']
    'data': 'VAF',
    'plot_args': {
        'linewidth': 1,
        'color': 'black',
        's': 2,
        'alpha': .4,
    },
}

# figure-wide settings
fig_params = {
    'figsize': (22, 4),
    'colormap': 'coolwarm_r',
    'color_chroms': True,
    'ylim': (0, 1),
    'cov_offset': .1,  # how much log2ratio=0 is shifted above SNP-data
    'cov_height': .5,
    'label_size': 13,
}

In [None]:
# switch to sample 04_A and reload its cluster table
sample = "04_A"
cluster_df = pd.read_csv(os.path.join(output_path, f'CNV/{sample}.cluster'), sep='\t')
# NOTE(review): chroms is assigned here but the plot call below passes chroms='all'
chroms = ['chr5', 'chr7','chr8', 'chr11', 'chr17']
cnv_df, center_df, center_params = center_data(cluster_df)
# overview plot with the VAF track and the log2ratio / log2ratiomean tracks
_ = plot_snp2(cluster_df, snp_plots=[vaf], cov_plots=[log2, log2mean], chroms='all', region='', **fig_params)

In [None]:
# recompute the shifted frames for the current cluster_df
cnv_df, center_df, center_params = center_data(cluster_df)
# plot VAF against log2ratiomean
xcol = 'VAF'
ycol = 'log2ratiomean'
# center points as base scatter, CNV points in red, gaussian mask for alpha=.85 and Nmax=4
fig, ax = plot_gaussian(center_df, df2=cnv_df, xcol=xcol, ycol=ycol, 
                        Nmax=4, 
                        rings=8, 
                        logmax=1.5,
                        alpha=.85, 
                        std_factor=1,
                        gauss_params=center_params
                       )

In [None]:
# work on a copy of the CNV points so that cnv_df itself is not changed
df = cnv_df.copy()
# display the copy
df

## do the block assignment according to LLH
+ LLH2D computation has to be done for every possible gaussian
+ data can be computed for entire CNV_df
+ grouped sums can then be maximized
+ center params are needed for gaussians

In [None]:
# scratch check: a bare float is not a list, so this evaluates to False
a = 0.85
b = [0.85]
isinstance(a, list)

In [None]:
def call_blocks(df, alpha=0.9, Nmax=6, center_params=pd.DataFrame()):
    '''
    assign the most likely CNV type to every snpCNV block

    df:            frame with the columns VAF, log2ratiomean, snpCNV, Chr and Pos
                   it is modified in place: one column "<alpha>-<type>" is added
                   per alpha and mask type
    alpha:         single value or any iterable (list, tuple, array) of values
                   passed to VAFmask
    Nmax:          passed to VAFmask
    center_params: frame with a 'std' column containing the rows 'VAF' and 'log2ratiomean'

    for every row of every VAFmask the densities of the gaussians at VAF- and
    VAF+ are summed per point; the per-point values are summed per snpCNV block
    and the column with the highest sum becomes the CNVcall

    returns a frame indexed by snpCNV with the columns Chr, Start, End and CNVcall
    raises ValueError if no gaussian is available for the given alpha and Nmax
    '''

    # get the std from center_params
    vaf_std = center_params.loc['VAF', 'std']
    log2_std = center_params.loc['log2ratiomean', 'std']

    # force list for alpha: scalars are wrapped, any other iterable is expanded
    alphas = [alpha] if np.isscalar(alpha) else list(alpha)

    # add a column for every CNV type and every alpha
    # the column names are collected here so that the grouping below does not
    # depend on the mask of the last alpha
    llh_cols = []
    for a in alphas:
        # cycle through the VAFmask (containing the means for the respective gaussians)
        for _, row in VAFmask(a, Nmax).iterrows():
            col = f"{a}-{row['type']}"
            # the VAF- and VAF+ gaussians share the same log2ratio mean
            df[col] = (
                llh2d(df['VAF'], df['log2ratiomean'], mx=row['VAF-'], my=row['log2ratio'], sx=vaf_std, sy=log2_std)
                + llh2d(df['VAF'], df['log2ratiomean'], mx=row['VAF+'], my=row['log2ratio'], sx=vaf_std, sy=log2_std)
            )
            llh_cols.append(col)

    if not llh_cols:
        raise ValueError("no gaussians to evaluate for the given alpha and Nmax")

    # reduce df to the sums of LLH per CNVtype
    # reduce to required columns and group by snpCNV
    cnv_df = df.loc[:, ['snpCNV'] + llh_cols].groupby('snpCNV').sum()

    # pick the column with the highest summed value per block
    cnv_df['CNVcall'] = cnv_df.columns[np.argmax(cnv_df.values, axis=1)]

    # get the start and end coordinates for each group
    region_df = (
        df.loc[:, ['Chr', 'Pos', 'snpCNV']]
        .groupby(['snpCNV', 'Chr'])['Pos']
        .agg(["min", "max"])
        .rename({'min': 'Start', 'max': 'End'}, axis=1)
        .reset_index('Chr')
    )

    # merge the region into the cnv_df
    return cnv_df.merge(region_df, left_index=True, right_index=True).loc[:, ['Chr', 'Start', 'End', 'CNVcall']]

In [None]:
# call the blocks for two alpha values (0.85 and 0.4) with Nmax = 6
block_df = call_blocks(df, [0.85, 0.4], 6, center_params)
# display the calls
block_df

In [None]:
def get_means(row):
    '''
    looks up the gaussian means for one row
    row['a'] is converted to float and used as alpha for VAFmask
    row['type'] selects the row of that mask
    returns the selected mask row (VAF-, VAF+, log2ratio)
    '''

    alpha_value = float(row['a'])
    mask_by_type = VAFmask(alpha_value).set_index("type")
    return mask_by_type.loc[row['type']]

def get_all_means(df):
    '''
    adds the gaussian means (VAF-, VAF+, log2ratio) matching the CNVcall of every row
    the CNVcall is split into the alpha part ('a') and the CNV type ('type')
    df is modified in place (the columns a, type, VAF-, VAF+ and log2ratio are written)
    returns a copy of df without the helper columns a and type
    '''

    # "<alpha>-<type>" -> alpha as string and the type label
    parsed_call = df['CNVcall'].str.extract(r"([0-9.]+)-([ABLOH]+)")
    df[['a', 'type']] = parsed_call
    # one mask lookup per row
    means = df.apply(get_means, axis=1)
    df[['VAF-', 'VAF+', 'log2ratio']] = means
    return df.drop(columns=['a', 'type'])

In [None]:
# add the gaussian means of the called CNV type to every block and display the result
get_all_means(block_df)