In [1]:
# list the contents of the data folder next to the notebook folder
ls ../data

001_A-B-chr1.EB           002_chr7.bed              test.bed
001_A-B-chr1.mutmatrix    002_chr7.pon              test.cleanpileup
002.csv                   003_A-B-chr1.EB           test.cutpileup
002_A-B-chr1.EB           003_A-B-chr1.mutmatrix    test.filterCount
002_A-B-chr1.EBmutmatrix  chr7.pilecount.gz         test.pileup
002_A-B-chr1.mutmatrix    stop.file                 test.pon


In [5]:
# machine-specific base paths: every other path in this cell is derived from home
home = '/Users/mahtin'
testdata = f"{home}/Dropbox/Icke/Work/somVar/testdata"
# presumably the folder holding the panel-of-normals (PoN) test files; verify
pon_path = f"{testdata}/testpon"

### squeezing all data and shell paths into config

In [6]:
# tool paths (relative to the notebook folder), data paths and thresholds in one dict
EBconfig = {
    "cleanpileup": "../shell/cleanpileup.mawk",
    "makeponlist": "../shell/makeponlist.sh",
    "csv2bed":"../shell/csv2bed.mawk",
    "pon2cols": "../shell/pon2cols.mawk",
    "pile2count": "../shell/pile2count2.mawk",
    "filterVar": "../shell/filterVar.mawk",
    "pon2tumor": "../shell/pon2tumor.mawk",
    "pon_path": pon_path,
    # derived from home like pon_path, so only one line changes when moving machines
    "genome_split": f"{home}/Dropbox/Icke/Work/static/genome/gatk/hg38/split",
    # presumably minimum mapping quality and minimum base quality; verify against the shell tools
    "MAPQ": 20,
    "Q": 25,
    # weight of the penalty term used when fitting the beta-binomial parameters
    "fit_pen": 0.5
}

In [7]:
# inputs for the test run
bam_file = f"{testdata}/bam/002_A.bam"
mut_file = "../data/002.csv"
# NOTE(review): bare file name, presumably resolved against pon_path; verify
pon_list = "Pon_chr7short.txt"
chrom = "chr7"

### get the matrix file

In [10]:
# read the EB mutmatrix (tab-separated) and keep the first 30 variants for the walkthrough
eb_matrix = pd.read_csv('../data/002_A-B-chr1.EBmutmatrix', sep='\t').head(30)
# the second variant serves as the worked example below
row = eb_matrix.iloc[1]
row

Chr                                                    chr1
Start                                                 16957
End                                                   16957
Ref                                                       G
Alt                                                       T
depthP    12|45|63|45|31|5|23|20|34|33|32|23|18|29|14|13...
misP      0|0|0|0|1|0|0|0|0|0|0|0|1|0|0|1|0|0|0|0|0|0|0|...
depthN    42|78|169|110|79|29|80|69|118|18|77|35|115|67|...
misN      5|0|21|0|12|0|0|0|24|0|0|0|25|0|19|15|0|5|0|10...
Name: 1, dtype: object

### look at the computation

In [20]:
def get_count_df(row):
    '''
    Build the count table for one variant row.

    The fields depthP, misP, depthN and misN of row each hold '|'-separated
    integers. They are split and returned as the integer columns depth_p, mm_p,
    depth_n and mm_n of a DataFrame, one row per '|'-separated entry.
    '''

    # output column -> field of the input row
    field_of_column = {
        'depth_p': 'depthP',
        'mm_p': 'misP',
        'depth_n': 'depthN',
        'mm_n': 'misN',
    }
    matrix = pd.DataFrame()
    for column, field in field_of_column.items():
        matrix[column] = np.array(row[field].split('|')).astype(int)
    return matrix

In [17]:
import math
from functools import partial
from scipy.optimize import fmin_l_bfgs_b as minimize_func
from scipy.stats import chi2
from scipy.special import gammaln

In [101]:
def fisher_combination(p_values):
    '''
    Combine p-values with Fisher's method.

    p_values: dict of p-values (only the values are used)
    Returns 0 if any p-value is exactly 0. Otherwise returns the upper-tail
    probability of the statistic sum(-2 * log(p)) under a chi-square
    distribution with 2 * len(p_values) degrees of freedom.
    '''

    pvals = list(p_values.values())
    if 0 in pvals:
        return 0
    statistic = sum(-2 * math.log(p) for p in pvals)
    # use the survival function instead of 1 - cdf: the subtraction cancels to 0.0
    # once the combined p-value drops below ~1e-16, which hides all smaller values
    return chi2.sf(statistic, 2 * len(pvals))


def bb_pvalue(params, target_df):
    '''
    p-value of a single-strand observation under a fitted beta-binomial distribution

    params: [alpha, beta] of the fitted distribution
    target_df: Series holding [n, k] = (depth, mismatches), read by position
    returns the summed density (exp of loglikelihood) over [n, k] to [n, n], capped at 1
    '''
    # read n and k by position: plain [] with an integer key on a label-indexed
    # Series is deprecated positional access
    depth = int(target_df.iloc[0])
    mismatches = int(target_df.iloc[1])
    n_minus_k = depth - mismatches
    # get the list of observations [n, k] to [n, n]
    obs_list = [target_df + np.array([0, i]) for i in range(n_minus_k + 1)]
    # get the list of loglikelihoods per observation
    ll_list = [bb_loglikelihood(params, obs, True) for obs in obs_list]
    # get the sum of exponentials of loglikelihoods (densities) per observation
    p_value = sum(math.exp(ll) for ll in ll_list)
    # floating point rounding can push the sum marginally above 1
    return min(p_value, 1.0)

       
def bb_pvalues(params, target_df):
    '''
    p-values of the target observation under the fitted beta-binomial distributions
    (a high p-value speaks against a variant), computed separately per strand

    params: {'p': [alpha, beta], 'n': [alpha, beta]}
    target_df: Series with the labels depth_p, mm_p, depth_n and mm_n
    returns {'p': pvalue_p, 'n': pvalue_n}
    p_value: sum of the densities (exp of loglikelihood) of [n, k] to [n, n],
    i.e. of as many or more mismatches than observed
    '''

    strand_columns = (
        ('p', ['depth_p', 'mm_p']),
        ('n', ['depth_n', 'mm_n']),
    )
    return {
        strand: bb_pvalue(params[strand], target_df.loc[columns])
        for strand, columns in strand_columns
    }


# the matrices for beta-binomial calculation
# KS_matrix expands an [n, k] pair into [n, k, n-k, n, k, n-k, 0, 0, 0]
KS_matrix = np.array([
    [1, 0, 1, 1, 0, 1, 0, 0, 0],
    [0, 1, -1, 0, 1, -1, 0, 0, 0],
])
# sign with which each of the nine log-gamma terms enters the loglikelihood
gamma_reduce = np.array([1, -1, -1, -1, 1, 1, 1, -1, -1])


def bb_loglikelihood(params, count_df, is_1d):
    '''
    Loglikelihood of [n, k] observations under a beta-binomial with params [a, b]

    count_df: pandas object whose .values is a single [n, k] pair (is_1d=True)
              or a 2-dim array with one [n, k] pair per row (is_1d=False)
    returns the loglikelihood as a scalar, summed over all rows in the 2-dim case
    '''
    alpha, beta = params
    # constant offsets of the nine log-gamma arguments
    offsets = np.array([1, 1, 1, alpha + beta, alpha, beta, alpha + beta, alpha, beta])
    # [n, k] -> [n, k, n-k, n, k, n-k, 0, 0, 0], shifted by the offsets
    gamma_args = count_df.values @ KS_matrix + offsets
    log_gammas = gammaln(gamma_args)
    if not is_1d:
        # 2-dim input: collapse the observations (rows) before applying the signs
        log_gammas = np.sum(log_gammas, axis=0)
    # signed sum of the nine terms gives the loglikelihood
    return np.sum(log_gammas * gamma_reduce)


def fit_bb(count_df, pen):
    '''
    Maximum likelihood estimate of the beta-binomial parameters for both strands
    count_df holds the depth-mismatch (trials, success) pairs over the PoN list,
    in the columns depth_p/mm_p (positive strand) and depth_n/mm_n (negative strand)
    pen weights the penalty term that constrains alpha and beta during the fit
    returns {'p': [alpha, beta], 'n': [alpha, beta]}, each value rounded to 5 decimals
        Ref for L-BFGS-B algorithm:
        A Limited Memory Algorithm for Bound Constrained Optimization
        R. H. Byrd, P. Lu and J. Nocedal. , (1995),
        SIAM Journal on Scientific and Statistical Computing, 16, 5, pp. 1190-1208.
    '''

    def penalized_neg_loglikelihood(params, counts, penalty):
        '''
        Objective to minimize: penalty * log(alpha + beta) - loglikelihood
        '''
        # counts is 2-dim here, hence is_1d=False
        return penalty * math.log(sum(params)) - bb_loglikelihood(params, counts, False)

    def fit_strand(columns):
        '''
        Fit [alpha, beta] on the given depth/mismatch columns using L-BFGS-B
        '''
        strand_counts = count_df.loc[:, columns]
        fitted = minimize_func(
            penalized_neg_loglikelihood, [20, 20],
            args=(strand_counts, pen), approx_grad=True,
            bounds=[(0.1, 10000000), (1, 10000000)]
        )[0]
        return [round(param, 5) for param in fitted]

    # positive strand first, then negative strand
    return {
        'p': fit_strand(['depth_p', 'mm_p']),
        'n': fit_strand(['depth_n', 'mm_n']),
    }

In [102]:
def matrix2EBscore(pen, row):
    '''
    Compute the EB score for one row of the EB matrix

    pen: penalty weight handed to the beta-binomial fit
    row: row with the fields depthP, misP, depthN and misN
    returns 60 for p-values below 1e-60, 0 for p-values above 1 - 1e-10,
    otherwise -log10(p-value) rounded to 3 decimals
    '''

    count_df = get_count_df(row)
    # first entry is the target observation, the remaining entries are the controls
    target_counts = count_df.iloc[0]
    control_counts = count_df[1:]

    # estimate the beta-binomial parameters per strand from the controls
    bb_params = fit_bb(control_counts, pen)
    # p-values of the target mismatch counts per strand
    p_values = bb_pvalues(bb_params, target_counts)
    # Fisher's method merges the two strand p-values into one
    EB_pvalue = fisher_combination(p_values)

    if EB_pvalue < 1e-60:
        return 60
    if EB_pvalue > 1.0 - 1e-10:
        return 0
    return -round(math.log10(EB_pvalue), 3)

In [103]:
# count table of the example row: one line per '|'-separated entry
count_df = get_count_df(row)
count_df

Unnamed: 0,depth_p,mm_p,depth_n,mm_n
0,12,0,42,5
1,45,0,78,0
2,63,0,169,21
3,45,0,110,0
4,31,1,79,12
5,5,0,29,0
6,23,0,80,0
7,20,0,69,0
8,34,0,118,24
9,33,0,18,0


In [104]:
# fit on all entries but the first (same split as in matrix2EBscore), penalty weight 0.5
bb_params = fit_bb(count_df[1:], 0.5)
bb_params

{'p': [0.20509, 23.07746], 'n': [0.22661, 2.47235]}

### bb_pvalues

In [105]:
# replay bb_pvalues by hand: take the first entry and split it into per-strand pairs
target_df = count_df.iloc[0]
print(target_df)
target_p = target_df.loc[['depth_p', 'mm_p']]
target_p
# selecting labels from a Series yields a Series, not a DataFrame
isinstance(target_p, pd.DataFrame)
target_n = target_df.loc[['depth_n', 'mm_n']]
target_n
p_values = {}

depth_p    12
mm_p        0
depth_n    42
mm_n        5
Name: 0, dtype: int64


depth_p    12
mm_p        0
Name: 0, dtype: int64

False

depth_n    42
mm_n        5
Name: 0, dtype: int64

### bb_pvalue

In [112]:
# replay bb_pvalue by hand for the positive strand
target_df = target_p
# depth minus mismatches, read by position: plain [] with an integer key on a
# label-indexed Series is deprecated positional access
n_minus_k = target_df.iloc[0] - target_df.iloc[1]
n_minus_k

12

In [113]:
# one observation per mismatch count from the observed k up to the depth n
obs_list = [target_df + np.array([0, i])
                for i in range(0, n_minus_k + 1)]
obs_list

[depth_p    12
 mm_p        0
 Name: 0, dtype: int64,
 depth_p    12
 mm_p        1
 Name: 0, dtype: int64,
 depth_p    12
 mm_p        2
 Name: 0, dtype: int64,
 depth_p    12
 mm_p        3
 Name: 0, dtype: int64,
 depth_p    12
 mm_p        4
 Name: 0, dtype: int64,
 depth_p    12
 mm_p        5
 Name: 0, dtype: int64,
 depth_p    12
 mm_p        6
 Name: 0, dtype: int64,
 depth_p    12
 mm_p        7
 Name: 0, dtype: int64,
 depth_p    12
 mm_p        8
 Name: 0, dtype: int64,
 depth_p    12
 mm_p        9
 Name: 0, dtype: int64,
 depth_p    12
 mm_p       10
 Name: 0, dtype: int64,
 depth_p    12
 mm_p       11
 Name: 0, dtype: int64,
 depth_p    12
 mm_p       12
 Name: 0, dtype: int64]

In [114]:
# the first observation is the target itself
obs = obs_list[0]
obs

depth_p    12
mm_p        0
Name: 0, dtype: int64

In [116]:
# fitted [alpha, beta] of the positive strand
bb_params['p']

[0.20509, 23.07746]

In [117]:
# loglikelihood of every observation under the fitted positive-strand parameters
ll_list = [bb_loglikelihood(bb_params['p'], obs, True) for obs in obs_list]
ll_list

[-0.08708843497620222,
 -2.71512432565185,
 -4.322674064115681,
 -5.796086538203646,
 -7.256898948463835,
 -8.754375676957544,
 -10.320551307742598,
 -11.984299183812333,
 -13.778216739844545,
 -15.745463602335015,
 -17.951649249447676,
 -20.514786859406364,
 -23.72218175343587]

In [118]:
# density of each observation ...
densities = [math.exp(ll) for ll in ll_list]
densities

# ... and their sum, the p-value of the strand
p_value = sum(densities)
p_value

[0.9165960329004581,
 0.06619672254477434,
 0.013264366309962802,
 0.0030394261792586028,
 0.0007052917940841681,
 0.00015776946433032388,
 3.294894526343216e-05,
 6.241442803424289e-06,
 1.0379979586463286e-06,
 1.4515501416863553e-07,
 1.598445340206159e-08,
 1.2318030342587887e-09,
 4.984107097409842e-11]

1.0000000000000056

In [None]:

import os
from io import StringIO
from subprocess import Popen, PIPE, run
from HDR_run import HDR_master

home = '/Users/mahtin'
# derived from home so that every path of this cell points into the same user folder
path = f'{home}/Dropbox/Icke/Work/somVar/HDRtest'
# NOTE(review): the blank at the end of the bam path is kept as found; confirm whether it is intended
bam_file = f'{path}/data/002.bam '
chrom = 'chr1'
# chrom = ''
# NOTE(review): pileup of sample 001 next to bam/csv of sample 002; confirm this pairing
pileup_file = f'{path}/data/001.chr1.pileup'
threads = 8
mut_file = f'{path}/data/002.csv'
genome_split_path = os.path.join(home,'Dropbox/Icke/Work/static/genome/gatk/hg38/split')
HDR_config = {
    "minAltSum": 2,
    "minAltRatio": 0.1,
    "maxAltRatio": 0.85,
    "MINQ": 15,
    "MINSIM": .50,
    "PAD": 100,
    "MINq": 10,
    "MinAltSupport": 5,
    "MinHDRCount": 1,
    "pile2hotspot": '../shell/pile2hotspot.mawk',#
    "pile2hotspot_chrom": '../shell/pile2hotspot_chrom.mawk',
    "editbam": '../shell/editbam.mawk',
    "bam2csv": '../shell/bam2csv.mawk',
    "genome_split_path": genome_split_path # the path to the folder with chrom-split genomes (chr1.fa..)
}
HDR_master(mut_file, bam_file=bam_file, pileup_file=pileup_file, chrom=chrom, threads=threads, HDR_config=HDR_config)

In [None]:
# NOTE(review): the first cell lists ../data; confirm that a data folder exists next to this notebook
ls data

In [None]:

home = '/Users/mahtin'
path = '/Users/mahtin/Dropbox/Icke/Work/somVar/HDRtest'
# NOTE(review): the blank at the end of the bam path is kept as found; confirm whether it is intended
bam_file = f'{path}/data/002.bam '
chrom = 'chr1'
# chrom = ''
pileup_file = f'{path}/data/002.pileup'
threads = 8
mut_file = f'{path}/data/002.csv'
mut_df = pd.read_csv(mut_file, sep='\t').loc[:, [
        'Chr', 'Start', 'End', 'Ref', 'Alt', 'Gene']]
# make Chr column categorical for sorting .. and sort
# autosomes run from chr1 to chr22 (range(23) would also generate a chr0)
chrom_list = [f"chr{i}" for i in range(1, 23)] + ['chrX', 'chrY']
mut_df['Chr'] = pd.Categorical(mut_df['Chr'], chrom_list)
mut_df = mut_df.sort_values(['Chr', 'Start'])
mut_df = mut_df.query('Chr == @chrom')
mut_df
# split the row positions into 4 chunks and slice the frame with them;
# np.array_split applied to the DataFrame itself relies on deprecated pandas behaviour
mut_split = [mut_df.iloc[rows] for rows in np.array_split(np.arange(len(mut_df)), 4)]

In [None]:
# continue with the first chunk only
# NOTE(review): mut_df is rebound here, so from this cell on it no longer holds the full list
mut_df = mut_split[0]

In [None]:
# sorted mutation list reduced to its first five columns
anno_df = mut_df.sort_values(['Chr', 'Start']).iloc[:, :5]
anno_df

In [None]:
def reduce_regions(df, padding):
    '''
    takes a mutation list and returns a region list using padding
    overlapping regions are reduced to one using the gap strategy

    df: dataframe with the columns Chr, Start and End (left unchanged)
    padding: amount subtracted from Start and added to End
    returns a dataframe with the columns Chr, Start, End and a fresh 0..n-1 index
    Chr is taken from the first row of each merged region, so the input is
    expected to hold a single chromosome
    '''

    # sort_values returns a new frame, so the caller's dataframe is not modified
    regions = df.sort_values('Start')
    regions['Start'] = regions['Start'] - padding
    regions['End'] = regions['End'] + padding
    # find the break points
    # a gap exists where Start is greater than the furthest End seen so far;
    # comparing with the previous End alone splits off regions that are nested
    # in an earlier, longer region
    furthest_end = regions['End'].cummax().shift()
    is_gap = regions['Start'].gt(furthest_end).astype('int')
    # cumulative sum does not increase where there is no gap,
    # so consecutive overlapping rows share one group id
    regions['gap'] = is_gap.cumsum()
    # condense every group into one region
    merged = regions.groupby('gap').agg({'Chr': 'first', 'Start': 'min', 'End': 'max'})
    return merged.reset_index(drop=True)

In [None]:
# step-by-step replay of reduce_regions
anno_df
padding = 100
# work on a copy: padding the shared frame in place would pad anno_df again
# on every re-run and for every later cell that uses it
df = anno_df.copy()
df['Start'] = df['Start'] - padding
df['End'] = df['End'] + padding
# compare with the furthest End seen so far, so that a region nested in an
# earlier, longer one does not open a new group
df['gap'] = df['Start'].gt(df['End'].cummax().shift()).astype('int')
df['gap'] = df['gap'].cumsum()
df = df.groupby('gap').agg({'Chr':'first','Start': 'min', 'End':'max'}).reset_index().drop(columns='gap')
df

In [None]:
# collapse the mutation list into padded regions (padding 100)
bed_df = reduce_regions(anno_df, 100)
bed_df

In [None]:
# empty frame with the mutation columns followed by three HDR columns
pd.DataFrame(columns=[
    'Chr', 'Start', 'End', 'Ref', 'Alt', 'Gene',
    'HDRcand', 'HDRcount', 'HDRinfo',
])