# Combine SNP and COV data into one df for clustering
+ all local data points must have been converted to reliable rolling values and the correct filtering must have been applied:
    * minDepth + EBscore for heteroSNPs
    * minCoverage and maxStd for cov data



+ use a constant binning for sampling the average data
    * all local values can be removed

In [None]:
# os and pandas are used throughout the notebook but were never imported
import os

import pandas as pd
from IPython.display import display

# widen the notebook display limits so the merged frames can be inspected in full
pd.options.display.max_columns = 37
pd.options.display.max_rows = 100
pd.set_option('display.max_colwidth', None)

In [None]:
# HOME
# NOTE: a second machine uses '/Users/mahtin' as home; the value below is the one in effect
home = '/Users/martinscience'
# standard paths
testdata = os.path.join(home, "Dropbox/Icke/Work/somVar/testdata")
static_path = os.path.join(home, "Dropbox/Icke/Work/static")
# join only the relative part: an absolute second argument makes
# os.path.join silently discard home
cluster_path = os.path.join(home, "mount")
tooldata = os.path.join(home, "Dropbox/Icke/Work/somVar/tooldata")

# tool-specific paths
shell_path = "../shell"
cnvdata = os.path.join(tooldata, "myCNVdata")
output_path = os.path.join(cnvdata, "output")

# the path to the input data
# local copies (superseded by the cluster paths right below)
cnv_path = os.path.join(cnvdata, "cnv")
cnvPON_path = os.path.join(cnvdata, "chromCov")
# copies on the mounted cluster -- these are the values in effect
cnv_path = os.path.join(cluster_path, "scratch/develop/PONcoverage/cnv")
cnvPON_path = os.path.join(cluster_path, "scratch/develop/PONcoverage/chromCov")

## Load snp_df and cov_df with rolling metrics

In [None]:
# both inputs are tab-separated files below output_path
snp_df, cov_df = (
    pd.read_csv(os.path.join(output_path, rel_path), sep='\t')
    for rel_path in ('heteroSNP/01_A.snp', 'covDif/01_A.cov')
)

In [None]:
# first ten rows of the SNP data
snp_df.head(10)

In [None]:
# first ten rows of the coverage data
cov_df.head(10)

# use the code

In [None]:
# get the code
import sys
sys.path.append('../code')
from merge_cnv2snp import mergeSNP2Cov

In [None]:
# settings handed to mergeSNP2Cov, one section per data type
config = dict(
    heteroSNP=dict(
        VAFlimits=[0.05, 0.95],
        normalize=True,
        # data column -> {aggregation: value}; column and aggregation names are
        # concatenated into rolling column names such as 'absVAFsum'
        # (the numbers are presumably the rolling window sizes -- confirm in the tool code)
        windows=dict(
            absVAF=dict(sum=20),
            VAF=dict(std=20),
            deltaVAF=dict(std=20),
        ),
        minEBscore=0.5,
        minDepth=30,
    ),
    coverage=dict(
        VAFlimits=[0.05, 0.95],
        normalize=False,
        windows=dict(
            log2ratio=dict(mean=500),
        ),
    ),
)
merge_df = mergeSNP2Cov(snp_df, cov_df, config)
merge_df

In [None]:
# write the merged table as a tab-separated file without the index column
merge_df.to_csv(
    path_or_buf=os.path.join(output_path, 'cluster/01_A_merged.cnv'),
    sep='\t',
    index=False,
)

# Walk me through it..

### merge both dfs per chrom and combine the values

In [None]:
# columns of snp_df that go into the merge: the fixed coordinate/VAF columns
# followed by every column whose name starts with one of the rolling prefixes
snp_cols = ['Chr', 'Start', 'FullExonPos', 'ExonPos', 'VAF'] + [
    name
    for prefix in ('absVAFsum', 'deltaVAFstd', 'VAFstd')
    for name in snp_df.columns
    if name.startswith(prefix)
]

snp_chrom_df = snp_df.loc[:, snp_cols]
snp_chrom_df

In [None]:
# get the required cov_df columns for merge
cov_cols = ['Chr', 'Pos', 'FullExonPos', 'ExonPos', 'log2ratiomeanDiff', 'log2ratiomean']
cov_chrom_df = cov_df.loc[:, cov_cols]

# the outer merge keeps SNP-only and coverage-only positions; the clashing
# ExonPos columns come out as ExonPos_x / ExonPos_y and are renamed
merge_df = snp_chrom_df.merge(cov_chrom_df, on=['Chr', 'FullExonPos'], how='outer')
merge_df = merge_df.sort_values('FullExonPos')
merge_df = merge_df.rename(columns={'ExonPos_x': 'PosSNP', 'ExonPos_y': 'PosCov'})

# merge chromosomal start coords: rows without a Start take their Pos
start_missing = merge_df['Start'].isna()
merge_df.loc[start_missing, 'Start'] = merge_df['Pos']
merge_df['Start'] = merge_df['Start'].astype(int)
merge_df = merge_df.drop(columns='Pos').reset_index(drop=True).sort_values('FullExonPos')
merge_df.head(10)

In [None]:
# write the staggered (not yet interpolated) table as a tab-separated file
merge_df.to_csv(
    path_or_buf=os.path.join(output_path, 'cluster/01_A_staggered.cnv'),
    sep='\t',
    index=False,
)

In [None]:
# rows that carry both a VAF and a log2ratiomean need no approximation
has_both = merge_df['VAF'].notna() & merge_df['log2ratiomean'].notna()

# store the fitting values as merged_df
merged_df = merge_df.loc[has_both]

# go on with the rows where one of the two values is missing
merge_df = merge_df.loc[~has_both]
merge_df

### go on with chrom-separate merging

In [None]:
# restrict the walkthrough to a single chromosome
chrom = 'chr5'
merge = merge_df.loc[merge_df['Chr'] == chrom]
merge

In [None]:
# focus on VAFstd for simplicity

def approximate_data(merge, col='VAFstd', pos_col='PosSNP', trans_pos_col='PosCov'):
    '''
    takes the data values from col at positions in pos_col
    and linearly approximates data values into merged rows at positions in trans_pos_col

    rows are processed in their given order (forward/backward fill), so merge
    should already be sorted by position
    gaps that cannot be interpolated (no known value on one side) are closed
    with the next known value, or the previous one at the end of the frame

    merge itself is not modified; a copy holding the filled col is returned
    '''

    values = merge[col]
    positions = merge[pos_col]

    # positions of the nearest rows to the left/right that have a pos_col entry
    left_pos = positions.ffill()
    right_pos = positions.bfill()
    # nearest known data values to the left/right
    left_val = values.ffill()
    right_val = values.bfill()

    # linear interpolation between the two neighbours, evaluated at trans_pos_col
    slope = (right_val - left_val) / (right_pos - left_pos)
    approx = left_val + slope * (merge[trans_pos_col] - left_pos)

    # keep known values, use the approximation for the missing ones and
    # close whatever is still missing (edges of the frame)
    filled = values.where(values.notna(), approx).bfill().ffill()

    # work on a copy so the caller's frame keeps its columns and values
    result = merge.copy()
    result[col] = filled
    return result


In [None]:
# arguments in order: frame, data column, its position column, target position column
merge = approximate_data(merge, 'VAFstd', 'PosSNP', 'PosCov')
merge

In [None]:
def get_approx_col_list(config):
    '''
    builds the work list for the approximator from the config

    for every column/aggregation pair below config[<section>]['windows'] one
    dict is returned with the keys
        col:            column name + aggregation name (e.g. 'absVAFsum')
        pos_col:        position column the data belongs to
        trans_pos_col:  position column the data is approximated onto
    heteroSNP entries come first, followed by the coverage entries
    '''

    # (config section, own position column, position column to approximate onto)
    sections = (
        ('heteroSNP', 'PosSNP', 'PosCov'),
        ('coverage', 'PosCov', 'PosSNP'),
    )

    approx_cols = []
    for section, own_pos, other_pos in sections:
        for name, aggregations in config[section]['windows'].items():
            approx_cols.extend(
                {
                    'col': f"{name}{agg}",
                    'pos_col': own_pos,
                    'trans_pos_col': other_pos,
                }
                for agg in aggregations
            )
    return approx_cols

# show the work list derived from the config
approx_col_list = get_approx_col_list(config)
approx_col_list

In [None]:
# run the approximation for all the columns defined in the config

for spec in get_approx_col_list(config):
    print(spec['col'])
    # first the rolling column itself, then its companion "...Diff" column
    for target in (spec['col'], spec['col'] + "Diff"):
        merge = approximate_data(
            merge,
            col=target,
            pos_col=spec['pos_col'],
            trans_pos_col=spec['trans_pos_col'],
        )
merge

In [None]:
def mergeSNP2Cov(snp_df, cov_df, config):
    '''
    merges the SNP and the coverage data into one df for clustering

    * snp_df and cov_df are outer-merged on Chr and FullExonPos
    * rows that only come from cov_df take their Start from Pos
    * rows that have both a VAF and a log2ratiomean are kept as they are
    * for all other rows, the columns listed by get_approx_col_list(config)
      and their "...Diff" companions are approximated per chromosome
      with approximate_data
    returns the combined df sorted by FullExonPos with a single ExonPos column
    (PosSNP where available, else PosCov)
    '''
    # get the required snp_df columns for merge
    snp_cols = ['Chr', 'Start', 'FullExonPos', 'ExonPos', 'VAF']
    for metrix in ['absVAFsum', 'deltaVAFstd', 'VAFstd']:
        snp_cols += [col for col in snp_df.columns if col.startswith(metrix)]

    snp_chrom_df = snp_df.loc[:, snp_cols]

    # get the required cov_df columns for merge
    cov_chrom_df = cov_df.loc[:, ['Chr', 'Pos', 'FullExonPos', 'ExonPos', 'log2ratiomeanDiff', 'log2ratiomean']]

    # do the merge and rename the respective ExonPos
    merge_df = snp_chrom_df.merge(cov_chrom_df, on=['Chr', 'FullExonPos'], how='outer').sort_values('FullExonPos').rename(columns={
        'ExonPos_x': 'PosSNP',
        'ExonPos_y': 'PosCov'
    })

    # merge chromosomal start coords
    # plain column assignment replaces the column, so the int dtype sticks
    # (.loc[:, 'Start'] = ... writes into the existing float column instead)
    merge_df['Start'] = merge_df['Start'].fillna(merge_df['Pos']).astype(int)
    merge_df = merge_df.drop(columns='Pos').reset_index(drop=True).sort_values('FullExonPos')

    # rows with both values present need no approximation
    has_both = merge_df['VAF'].notna() & merge_df['log2ratiomean'].notna()
    # store the fitting values as merged_df
    merged_df = merge_df.loc[has_both]
    # go on with the rows where one of the two values is missing
    merge_df = merge_df.loc[~has_both]

    # get the data columns for the approximator
    data_col_list = get_approx_col_list(config)

    # go through the chromosomes and do the approximation
    merge_dfs = []
    for chrom in merge_df['Chr'].unique():
        # explicit copy: the approximator must never write into merge_df
        chrom_merge_df = merge_df.loc[merge_df['Chr'] == chrom].copy()
        for data in data_col_list:
            # the rolling column itself and its "...Diff" companion
            for col in (data['col'], data['col'] + "Diff"):
                chrom_merge_df = approximate_data(
                    chrom_merge_df,
                    col=col,
                    pos_col=data['pos_col'],
                    trans_pos_col=data['trans_pos_col']
                )
        merge_dfs.append(chrom_merge_df)
    # concat the chroms and add the already merged df
    merge_df = pd.concat(merge_dfs + [merged_df]).sort_values('FullExonPos').rename(columns={'PosSNP': 'ExonPos'})
    # transfer the missing positions from PosCov (rows that only exist in cov_df)
    merge_df['ExonPos'] = merge_df['ExonPos'].fillna(merge_df['PosCov'])
    return merge_df.drop(columns='PosCov')

In [None]:
# rerun the merge with the function defined in this notebook
merge_df = mergeSNP2Cov(snp_df=snp_df, cov_df=cov_df, config=config)

In [None]:
# rows that still have no VAF after the merge, ordered by FullExonPos
merge_df.sort_values('FullExonPos').loc[lambda df: df['VAF'].isna()]