In [None]:
# imports needed by the cells below; pandas and numpy are used well before
# the "old version" cell that originally imported them, so a fresh kernel
# needs them here
import os
import sys

import numpy as np
import pandas as pd

# get the code
# make ../code importable (presumably where the AB2EB module imported below lives)
sys.path.append('../code')

# set the paths
# resolve the current user's home directory instead of hard-coding one
# machine's account name, so the notebook runs unchanged on either machine
home = os.path.expanduser("~")


somvar_path = os.path.join(home, "Dropbox/Icke/Work/somVar")
testdata = os.path.join(somvar_path, "testdata")
ebdata = os.path.join(somvar_path, "tooldata/EBdata")
pon_path = os.path.join(testdata, "PON")

static = os.path.join(home, "Dropbox/Icke/Work/static")

## AB to EB

### load AB file


In [None]:
# read the tab-separated AB file from the EBdata folder
# (file name suggests sample 02_A, chr7; later cells read its "AB" and "Tumor" columns)
AB_df = pd.read_csv(os.path.join(ebdata, "AB/02_A.chr7.AB"), sep="\t")
AB_df

In [None]:
# AB2EB_multi comes from the project module AB2EB (not shown in this notebook;
# presumably found through the '../code' entry added to sys.path)
from AB2EB import AB2EB_multi
# configuration handed to AB2EB_multi; only the "threads" entry is set here
EB_config = {
    "threads": 8
}
# run the project implementation on the whole AB table
AB2EB_multi(AB_df, config=EB_config)

### testing core functionality

In [None]:
from scipy.stats import chi2
from scipy.special import gammaln
import math
# the matrices for beta-binomial calculation
# KS_matrix maps a count pair (n, k) -- n the first, k the second entry; also works
# row-wise on an (m, 2) array of pairs -- onto the nine values
#   (n, k, n-k, n, k, n-k, 0, 0, 0)
# to which bb_loglikelihood adds (1, 1, 1, a+b, a, b, a+b, a, b) before calling gammaln
KS_matrix = np.array([[1, 0, 1, 1, 0, 1, 0, 0, 0], [0, 1, -1, 0, 1, -1, 0, 0, 0]])
# signs with which the nine log-gamma terms are summed up:
#   +lnG(n+1) - lnG(k+1) - lnG(n-k+1) - lnG(n+a+b) + lnG(k+a) + lnG(n-k+b)
#   + lnG(a+b) - lnG(a) - lnG(b)
gamma_reduce = np.array([1, -1, -1, -1, 1, 1, 1, -1, -1])

def fisher_combination(p_values):
    """
    combine p-values with Fisher's method

    p_values: iterable of p-values (list, tuple or 1-d array)
    returns the upper tail of a chi-square distribution with 2 * len(p_values)
    degrees of freedom, evaluated at -2 * sum(log(p))
    returns 0 if any p-value is 0 and 1.0 for an empty input
    """

    p_values = list(p_values)

    # nothing to combine: keep the value the fixed-df version returned (1)
    if not p_values:
        return 1.0

    # log(0) is undefined; a zero p-value makes the combined p-value zero
    if 0 in p_values:
        return 0

    statistic = sum(-2 * math.log(p) for p in p_values)

    # degrees of freedom follow the number of p-values (4 for the two strands);
    # chi2.sf is used instead of 1 - chi2.cdf because the subtraction rounds to
    # exactly 0 once the tail drops below ~1e-16, which hides all small p-values
    return chi2.sf(statistic, 2 * len(p_values))

    
def bb_loglikelihood(params, count_matrix):
    """
    beta-binomial log-likelihood of count pairs for the parameter pair [a, b]

    count_matrix is either a single pair of shape (2,) or a 2-d array with one
    pair per row (the fitting case); for 2-d input the log-gamma terms are
    summed over the rows before the signed reduction
    for a single pair (n, k) the result is
        lnG(n+1) - lnG(k+1) - lnG(n-k+1) - lnG(n+a+b) + lnG(k+a) + lnG(n-k+b)
        + lnG(a+b) - lnG(a) - lnG(b)
    returns a scalar
    """
    a, b = params
    # constant part of the nine log-gamma arguments
    offsets = np.array([1, 1, 1, a + b, a, b, a + b, a, b])

    # count-dependent part comes from the KS_matrix projection
    log_gammas = gammaln(np.matmul(count_matrix, KS_matrix) + offsets)

    # collapse the row axis unless a single pair was passed in
    if count_matrix.shape != (2,):
        log_gammas = np.sum(log_gammas, axis=0)

    # apply the signs of gamma_reduce and add everything up
    return np.sum(log_gammas * gamma_reduce)


def bb_pvalue(obs_array, AB_params):
    """
    sum of the densities (exponentiated log-likelihoods) of all observations

    obs_array: iterable of count pairs, each handed to bb_loglikelihood
    AB_params: the [A, B] pair used for every observation
    """
    log_densities = []
    for observation in obs_array:
        log_densities.append(bb_loglikelihood(AB_params, observation))
    return np.exp(log_densities).sum()


def get_obs_array(t_pair):
    '''
    expands a pair [lower, upper] into one row [upper, s] per integer s
    from lower to upper (both included), e.g. [5, 9] becomes
       [[9, 5],
        [9, 6],
        [9, 7],
        [9, 8],
        [9, 9]]
    '''
    lower = t_pair[0]
    upper = t_pair[1]
    rows = []
    for second in range(lower, upper + 1):
        rows.append([upper, second])
    return np.array(rows)


def AB2EBscore(row):
    """
    computes the EB score for one row

    row must provide
      row["AB"]:    two "|"-separated parameter values per group, groups separated by "="
      row["Tumor"]: "-"-separated counts per group, groups separated by "=";
                    the groups are transposed so that the i-th counts of all
                    groups form the i-th pair
    returns 60 if the combined p-value is below 1e-60, 0 if it is above
    1 - 1e-10, otherwise -log10(p) rounded to 3 decimals
    """

    # AB string -> float array with one [a, b] row per group
    ab_groups = [group.split("|") for group in row["AB"].split("=")]
    AB_params = np.array(ab_groups).astype(float)

    # Tumor string -> int array, transposed into one count pair per row
    tumor_groups = [group.split("-") for group in row['Tumor'].split("=")]
    t_matrix = np.transpose(tumor_groups).astype(int)

    # one observation array per count pair
    obs_arrays = list(map(get_obs_array, t_matrix))

    # one p-value per (observation array, parameter pair)
    p_values = []
    for obs_array, AB in zip(obs_arrays, AB_params):
        p_values.append(bb_pvalue(obs_array, AB))

    # Fisher combination, then conversion to a capped -log10 score
    EB_p = fisher_combination(p_values)
    if EB_p < 1e-60:
        score = 60
    elif EB_p > 1.0 - 1e-10:
        score = 0
    else:
        score = -round(math.log10(EB_p), 3)
    return score

In [None]:
# pick a single row of the AB table (position 7012) as test case for the cells below
row = AB_df.iloc[7012]
row

In [None]:
# EB score of the selected test row
AB2EBscore(row)

In [None]:
# get AB params
AB_params = np.array([p.split("|") for p in row["AB"].split("=")]).astype(float)
AB_params

In [None]:
# get tumor matrix
t_matrix = np.transpose([p.split("-") for p in row['Tumor'].split("=")]).astype(int)
t_matrix

In [None]:
# expand every count pair of t_matrix into its observation array
obs_arrays = [get_obs_array(t_pair) for t_pair in t_matrix]
obs_arrays

In [None]:

# p-value per observation array, each paired with its own row of AB_params
[bb_pvalue(obs_array, AB) for (obs_array, AB) in zip(obs_arrays, AB_params)]

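### old version (pandas-based; redefines `fisher_combination`, `bb_pvalue` and `AB2EBscore` from the section above)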

In [None]:
import pandas as pd
import numpy as np
import math
from scipy.optimize import fmin_l_bfgs_b as minimize_func
from scipy.stats import chi2
from scipy.special import gammaln


def fisher_combination(p_values):
    """
    combine p-values with Fisher's method

    p_values: dict whose values are the p-values to combine
    returns the upper tail of a chi-square distribution with 2 * len(p_values)
    degrees of freedom, evaluated at -2 * sum(log(p))
    returns 0 if any p-value is 0
    """

    values = list(p_values.values())

    # log(0) is undefined; a zero p-value makes the combined p-value zero
    if 0 in values:
        return 0

    statistic = sum(-2 * math.log(x) for x in values)

    # chi2.sf is used instead of 1 - chi2.cdf because the subtraction rounds to
    # exactly 0 once the tail drops below ~1e-16, which hides all small p-values
    return chi2.sf(statistic, 2 * len(values))

    
def retrieveABdata(row):
    """
    parse the "Tumor" and "AB" strings of a row

    returns (target_s, AB_dict):
      target_s: integer pd.Series built from the Tumor string, e.g.
                "0-5=12-42" gives alt+ 0, alt- 5, depth+ 12, depth- 42
      AB_dict:  {"+": [first two values], "-": [remaining values]} of the
                AB string, which is split on "=" and "|" and read as floats
    """
    # Tumor string: label the counts by their position in the flattened string
    tumor_string = row["Tumor"]
    labels = {0: "alt+", 1: "alt-", 2: "depth+", 3: "depth-"}
    counts = {}
    position = 0
    for group in tumor_string.split("="):
        for token in group.split("-"):
            counts[labels[position]] = token
            position += 1
    target_s = pd.Series(counts).astype(int)

    # AB string: flatten to floats, then slice into the two parameter lists
    ab_string = row["AB"]
    ab_values = []
    for group in ab_string.split("="):
        ab_values.extend(float(value) for value in group.split("|"))
    AB_dict = {"+": ab_values[:2], "-": ab_values[2:]}

    return target_s, AB_dict

def bb_loglikelihood_1d(obs_row, params):
    """
    beta-binomial log-likelihood of a single observation row

    1-d counterpart of bb_loglikelihood used for the p-value of targets;
    the duplication saves the dimension check of the heavily used 2-d version
    obs_row: object whose .values holds one count pair
    params:  the [a, b] pair
    returns a scalar
    """
    a, b = params
    # constant part of the nine log-gamma arguments
    offsets = np.array([1, 1, 1, a + b, a, b, a + b, a, b])
    # count-dependent part via KS_matrix, taken from the raw values of the row
    gamma_args = np.matmul(obs_row.values, KS_matrix) + offsets
    # signed log-gamma terms, summed to the log-likelihood
    signed_terms = gammaln(gamma_args) * gamma_reduce
    return np.sum(signed_terms)


def get_obs_df(target_s, cols):
    """
    build the observation table for one column pair of target_s

    cols is either ['depth+', 'alt+'] or ['depth-', 'alt-']
    the result has one row per value from alt up to depth (both included):
    the first column stays constant, the second one counts up row by row
    """
    alt_col = cols[1]
    # number of rows: depth - alt + 1
    n_rows = target_s[cols[0]] - target_s[alt_col] + 1
    # broadcast the selected values of target_s over all rows
    base_df = pd.DataFrame(target_s[cols].to_dict(), index=range(n_rows))
    # shift the alt column by the row index
    return base_df.assign(**{alt_col: base_df[alt_col] + base_df.index})


def bb_pvalue(obs_df, params):
    """
    get the sum of exponentials of loglikelihoods (densities) per observation

    obs_df: DataFrame with one observation per row (as built by get_obs_df)
    params: strand-specific [A,B]
    returns the summed densities; 0.0 for an empty obs_df

    obs_df is left untouched: storing the log-likelihoods in a "p" column of
    the caller's frame would add a third value to every row, so a second call
    on the same frame could no longer be multiplied with KS_matrix
    """

    # nothing to sum up for an empty observation table
    if obs_df.empty:
        return 0.0

    # get the loglikelihood per observation (row-wise)
    log_likelihoods = obs_df.apply(bb_loglikelihood_1d, params=params, axis=1)

    # sum up the exponentials
    return np.exp(log_likelihoods).sum()


def AB2EBscore(row):
    """
    computes the EB score for one row (pandas-based version)

    row must provide the "Tumor" and "AB" strings parsed by retrieveABdata
    returns 60 if the combined p-value is below 1e-60, 0 if it is above
    1 - 1e-10, otherwise -log10(p) rounded to 3 decimals
    """
    # retrieve the data from the row
    # (no debug print here: the function is meant to be applied to many rows)
    target_s, AB_dict = retrieveABdata(row)

    # get the p_values for each strand
    p_values = {
        "+": bb_pvalue(get_obs_df(target_s, ["depth+", "alt+"]), AB_dict["+"]),
        "-": bb_pvalue(get_obs_df(target_s, ["depth-", "alt-"]), AB_dict["-"]),
    }

    # combine p_values with fisher combination
    EB_p = fisher_combination(p_values)

    # convert to a -log10 score, capped at 60 and floored at 0
    if EB_p < 1e-60:
        return 60
    if EB_p > 1.0 - 1e-10:
        return 0
    return -round(math.log10(EB_p), 3)

# score the test row (`row` comes from an earlier cell) with the version defined in this cell
AB2EBscore(row)

In [None]:
# redo the first steps of the old AB2EBscore by hand for inspection
target_s, AB_dict = retrieveABdata(row)
# NOTE: by default only the last expression of a cell is displayed, so this line shows nothing
target_s

# observation table for the column pair ("depth-", "alt-")
obs_df = get_obs_df(target_s, ["depth-", "alt-"])
# the same observations as a numpy array
obs_array = obs_df.values
obs_df