In [13]:
import pandas as pd
import numpy as np
from functools import partial
import math
from scipy.optimize import fmin_l_bfgs_b as minimize_func
from scipy.stats import chi2
from scipy.special import gammaln

mut_df = pd.read_csv('output/test_rel_clean.csv', sep='\t')
# for snps
row = pd.Series(mut_df.iloc[0])
var = row['Alt']
matrix = pd.DataFrame()
matrix['depth_p'] = row.iloc[6::3].str.count(r'[ACTG]')
matrix['mm_p'] = row.iloc[6::3].str.count(var)
matrix['depth_n'] = row.iloc[6::3].str.count(r'[actg]')
matrix['mm_n'] = row.iloc[6::3].str.count(var.lower())
count_df = matrix
mut_df.iloc[:,[0,1,2,3,4,-1]]

Unnamed: 0,Chr,Start,End,Ref,Alt,Q10
0,1,3649562,3649562,G,A,DcAE
1,1,53099279,53099279,C,T,cCe@bB
2,2,29416366,29416366,G,C,Eh
3,2,85097574,85097574,G,A,cD
4,2,210685100,210685100,G,A,*
5,4,140651585,140651587,CTG,-,B
6,5,176930172,176930174,GAG,-,C
7,6,30996643,30996643,T,C,a?B
8,6,31084943,31084945,CTT,-,D
9,7,1040174,1040174,G,A,*


In [6]:
# the matrices for beta-binomial calculation
KS_matrix = np.array([[1,0,1,1,0,1,0,0,0],[0,1,-1,0,1,-1,0,0,0]])
gamma_reduce = np.array([1,-1,-1,-1,1,1,1,-1,-1])

def bb_loglikelihood(params, count_df):
    [a, b] = params
    ab_matrix = np.array([1,1,1,a+b,a,b,a+b,a,b])
    # convert df into matrix for np.array operations that change dims
    count_matrix = count_df.values
    # perform matrix multiplication to get inputs to log-gamma
    input_matrix = np.matmul(count_matrix,KS_matrix) + ab_matrix
    # get corresponding log-gamma values and reduce over pon-values
    gamma_matrix = np.sum(gammaln(input_matrix), axis=0)
    # add or subtract using gamma_reduce matrix and sum to loglikelihood (scalar)
    log_likelihood = np.sum(gamma_matrix * gamma_reduce)
    return log_likelihood

def fit_beta_binomial(count_df, pen):
    '''
    Obtaining maximum likelihood estimator of beta-binomial distribution
    count_df is the array of depth-mismatch (trials, success) pairs over the PoN list for either strand
    during minimization of fitting function (max for loglikelihood) penalty term is applied to constrain alpha and beta
        Ref for L-BFGS-B algorithm:
        A Limited Memory Algorithm for Bound Constrained Optimization
        R. H. Byrd, P. Lu and J. Nocedal. , (1995), 
        SIAM Journal on Scientific and Statistical Computing, 16, 5, pp. 1190-1208.
    '''

    def bb_loglikelihood_fitting(params, count_df, penalty):
        '''
        Fitting params [alpha, beta] to maximize loglikelihood
        '''

        # Here, we apply the penalty term of alpha and beta (default 0.5 is slightly arbitray...)
        run_value = 0.5 * math.log(sum(params)) - bb_loglikelihood(params, count_df)
        return run_value

    # get the respective control matrices (as dataframe) for positive and negative strands
    count_p = count_df.loc[:, ['depth_p', 'mm_p']]
    count_n = count_df.loc[:, ['depth_n', 'mm_n']]
    # minimize loglikelihood using L-BFGS-B algorithm
    ab_p = minimize_func(
                           bb_loglikelihood_fitting, [20, 20],
                           args = (count_p, pen), approx_grad = True,
                           bounds = [(0.1, 10000000), (1, 10000000)]
                          )[0]
    ab_n = minimize_func(
                           bb_loglikelihood_fitting, [20, 20],
                           args = (count_n, pen), approx_grad = True,
                           bounds = [(0.1, 10000000), (1, 10000000)]
                          )[0]
    return {'p':ab_p, 'n':ab_n}
control_df = count_df.loc['read1':]
bb_params = fit_beta_binomial(control_df, 0.5)
count_p = count_df.loc[:, ['depth_p', 'mm_p']]
target_df = count_df.loc['read0']
target_p = target_df.loc[['depth_p', 'mm_p']]
target_n = target_df.loc[['depth_n', 'mm_n']]
target_n

def beta_binom_pvalue(params, target_df):
    n_minus_k = target_df[0] - target_df[1]
    # get the list of observations [n, k] to [n, n]
    obs_list = [target_df + np.array([0,i]) for i in range(0, n_minus_k + 1)]
    # get the list of loglikelihoods per observation
    ll_list = [bb_loglikelihood(params, obs) for obs in obs_list]
    # get the sum of exponentials of loglikelihoods (densities) per observation

    p_value = sum([math.exp(ll) for ll in ll_list])

    return p_value
p_values = {'p': beta_binom_pvalue(bb_params['p'], target_p)}
p_values['n'] = beta_binom_pvalue(bb_params['n'], target_n)


In [7]:
import numpy as np
import math
from scipy.optimize import fmin_l_bfgs_b as minimize_func
from scipy.stats import chi2
from scipy.special import gammaln


def fisher_combination(p_values):

    if 0 in p_values.values():
        return 0
    else:
        return 1 - chi2.cdf(-2 * math.log(p_values['p'] * p_values['n']), 4)

p_values
# fisher_combination(p_values)

{'p': 9.317262282517519e-11, 'n': 2.576547253857686e-05}