In [1]:
import sys
import argparse

import numpy as np
import pandas as pd
import bisect
import io
from sklearn.preprocessing import StandardScaler

### load data sets for CI computations
# CI_param: known parameter values (i.e. used for obtaining simulations in the training set)
CI_param = pd.read_csv('filt_for_train_with_header_and_rescaling_factor.csv', sep=",")
# CI_predicted: predicted parameter values obtained with the training set
CI_predicted = pd.read_csv('training_BiSSE_predictions.csv', sep=",", index_col=0)
# harmonize index numbers for correspondance
CI_predicted.index = CI_param.index 
CI_param['rescaling_factor'] = CI_param['1005'] 

# test_predicted: predicted parameter values obtained with the empirical set
test_predicted = pd.read_csv('empirical_BiSSE_predicted.csv', sep=",", index_col=0)


## help parameters for CI computation: rescale factor, sampling frequency, tree size
# rescale factor of the empirical set
test_rescale = test_predicted['rescaling_factor']
# set sampling frequency to the presumed value
test_sampling_fr = 0.6824147
# set tree size to the tree size of the empirical set
test_tree_size = 381 


In [2]:
test_predicted

Unnamed: 0_level_0,turnover,lambda1_rescaled,lambda2_rescaled,q01_rescaled,lambda1,lambda2,q01,rescaling_factor
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.233583,1.198741,0.376441,0.036333,0.294911,0.092611,0.008939,4.064755


In [3]:
# check
CI_param.describe()

Unnamed: 0,1005,lambda1,lambda2,turnover,sampling_frac,tree_size,mu1,mu2,net_rate1,net_rate2,q01,q10,lambda2_ratio,q01_ratio,rescaling_factor
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,8.090137,0.491152,0.291376,0.4954139,0.505139,350.008687,0.2432767,0.1443915,0.2478751,0.146984,0.027765,0.027765,0.549914,0.054989,8.090137
std,24.687037,0.282537,0.234709,0.2860754,0.285868,86.881676,0.2137587,0.1585532,0.2152647,0.1604099,0.021777,0.021777,0.259795,0.025976,24.687037
min,0.450472,0.002167,0.001036,1.046097e-08,0.010002,200.0,7.99469e-09,1.121413e-09,2.219205e-07,1.001284e-07,0.00011,0.00011,0.1,0.01,0.450472
25%,1.620826,0.247098,0.102296,0.2477154,0.25753,275.0,0.06723599,0.0303589,0.07085918,0.03183632,0.010053,0.010053,0.324952,0.032482,1.620826
50%,2.912276,0.485714,0.226591,0.4953189,0.505221,350.0,0.1814917,0.08719248,0.1866166,0.08885699,0.022073,0.022073,0.549849,0.054986,2.912276
75%,6.391111,0.731211,0.429986,0.743038,0.752787,425.0,0.3703248,0.2029328,0.3756361,0.2060351,0.041183,0.041183,0.774922,0.077483,6.391111
max,5576.290038,0.999999,0.999998,0.9999992,0.999999,500.0,0.9978102,0.9952059,0.9982215,0.9960592,0.099866,0.099866,0.999999,0.1,5576.290038


In [4]:
# rescaling all values so that they correspond to trees of average branch length of 1
CI_param['lambda1_rescaled'] = CI_param['lambda1']*CI_param['rescaling_factor']
CI_param['lambda2_rescaled'] = CI_param['lambda2']*CI_param['rescaling_factor']
CI_param['q01_rescaled'] = CI_param['q01']*CI_param['rescaling_factor']

In [5]:
def get_indexes_of_closest_single_factor(test_value, ci_values, n):
    """Returns indexes of knn for given set

    :param test_value: float, value of parameter (e.g. sampling proba or tree size) on which we select given observation
    :param ci_values: dataframe, values of these parameters in CI set
    :param n: int, number of KNNs to find
    :return: list, indexes of n KNNs
    """
    ref = ci_values.iloc[(ci_values-test_value).abs().argsort()].index
    return [ref[i] for i in range(n)]


def get_indexes_of_closest(test_s, ci_s, n):
    """Returns indexes of knn for given set
    :param test_s: dataframe, param set given observation
    :param ci_s: dataframe, param sets of CI set
    :param n: int, number of KNNs to find
    :return: list, indexes of n KNNs
    """
    ref = ci_s.iloc[(ci_s - test_s.values).pow(2).sum(axis=1).pow(0.5).argsort()].index
    return [ref[i] for i in range(n)]


def get_predicted_closest_single(indexes, pred_value_table, targ):
    """ returns the absolute errors for knn
    :param indexes: list, index of knn
    :param pred_value_table: dataframe, predicted parameter values of CI set
    :param targ: str, parameter name
    :return: list of predictions for each knn
    """
    # subset the real and predicted values of the closest neighbors
    closest_pred = pred_value_table.loc[indexes, :]

    # for single parameter, get the absolute difference between these
    pred_d = list(closest_pred[targ][:])
    return pred_d


def get_error_closest_single(indexes, real_value_table, pred_value_table, targ):
    """ returns the absolute errors for knn
    :param indexes: list, index of knn
    :param real_value_table: dataframe, real/target parameter values of CI set
    :param pred_value_table: dataframe, predicted parameter values of CI set
    :param targ: str, parameter name
    :return: list of absolute error in predictions for each knn
    """
    # subset the real and predicted values of the closest neighbors
    closest_pred = pred_value_table.loc[indexes, :]
    closest_real = real_value_table.loc[indexes, :]

    # for single parameter, get the absolute difference between these
    error_d = closest_pred[targ] - closest_real[targ]
    return error_d


def apply_filter(df1, df2, df3, df4, indexes):
    return df1.loc[indexes], df2.loc[indexes], df3.loc[indexes], df4.loc[indexes]


def load_files(arg_name, sep=""):
    """Loads given file

    :param arg_name: parser arg, pointer to the file
    :param sep: str, eventual separator
    :return: pd.Dataframe, loaded file
    """
    with open(arg_name, 'r') as des0:
        des_data0 = des0.read()
    des0.close()

    if sep == "":
        output = pd.read_csv(io.StringIO(des_data0), index_col=0, header=None)
    else:
        output = pd.read_csv(io.StringIO(des_data0), index_col=0, header=None, sep=sep)

    return output




In [None]:
### prepare col names of output table
# parameters for which we compute the CI
targets = ["turnover", "lambda1_rescaled", "lambda2_rescaled", "q01_rescaled"]
# number of neighboring simulation sets we consider to compute CI
n_neighbors = [1000]
# min max values for the computed CI values: set to biologically relevant boundaries (i.e. non negative values)
min_max = {targets[0]: [0, 1], targets[1]: [0, 1000], targets[2]: [0, 1000], targets[3]: [0, 1000]}
# prepare col names of output table: value of lower boundrary, upper boundary and the width of CI
add_ons_names = ['_CI_2_5', '_CI_97_5', '_CI_width']
col = [add_on + '_' + str(n_neigh) for n_neigh in n_neighbors for add_on in add_ons_names]
col_comp = []
col_comp = [target + co for target in targets for co in col]

In [6]:
### pre processing of datasets used for CI computation: extracting parameters of interest, standardizing them
# extract helper parameters of the CI set 
# subset sampling probability:
CI_sampling = CI_param["sampling_frac"]
# tree size:
CI_tree_size = CI_param["tree_size"]

# subselect columns/parameters of interest for each table + all in the same order
CI_param = CI_param[targets]
test_predicted = test_predicted[targets]
CI_predicted = CI_predicted[targets]

# before computation, standardize all columns so that each parameter is on the same scale:
scaler = StandardScaler()
CI_param_standardized = pd.DataFrame(scaler.fit_transform(CI_param)) # fit to CI set
test_predicted_standardized = pd.DataFrame(scaler.transform(test_predicted))

# restore column names and index values
CI_param_standardized.columns = CI_param.columns
CI_param_standardized.index = CI_param.index
test_predicted_standardized.columns = test_predicted.columns
test_predicted_standardized.index = test_predicted.index



In [7]:
targets

['turnover', 'lambda1_rescaled', 'lambda2_rescaled', 'q01_rescaled']

In [8]:
# initialize the output table
CI_df = pd.DataFrame(index=range(0, test_predicted.shape[0]), columns=col_comp)

# predicted parameter values from empirical set: here there is only one empirical set for which we want to compute CI values
current_obs = test_predicted.iloc[0, :]
current_obs_standardized = test_predicted_standardized.iloc[0, :]

## find the 4% of closest simulations with respect to tree size and sampling frequency
# first filter: keep only the closest 200k CI sets with respect to tree size
tree_size_indexes = get_indexes_of_closest_single_factor(test_tree_size, CI_tree_size, 200000)
filt_1_CI_predicted, filt_1_param_CI_standardized, filt_1_CI_param, filt_1_CI_sampling_proba = \
    apply_filter(CI_predicted, CI_param_standardized, CI_param, CI_sampling, tree_size_indexes)
# reset indexes
filt_1_CI_param.index = filt_1_param_CI_standardized.index = filt_1_CI_predicted.index = \
    filt_1_CI_sampling_proba.index = range(0, 200000)

# second filter: keep only the closest 40k CI sets with respect to sampling frequency
sampling_proba_indexes = get_indexes_of_closest_single_factor(test_sampling_fr, filt_1_CI_sampling_proba, 40000)
filt_2_CI_predicted, filt_2_param_CI_standardized, filt_2_CI_param, filt_2_CI_sampling_proba = \
    apply_filter(filt_1_CI_predicted, filt_1_param_CI_standardized, filt_1_CI_param,
                 filt_1_CI_sampling_proba, sampling_proba_indexes)

# reset indexes
filt_2_CI_predicted.index = filt_2_param_CI_standardized.index = filt_2_CI_param.index = range(0, 40000)

# vector to stock all measures of the current observation
all_real = []

for elt in targets:

    # find indexes of closest parameter sets within the predicted values of 40K simulation of CI set
    top_ind = get_indexes_of_closest_single_factor(current_obs_standardized[elt], filt_2_param_CI_standardized[
        elt], n_neighbors[-1])

    # measure errors on closest parameters sets (predicted - actual values)
    pred_closest = get_predicted_closest_single(top_ind, filt_2_CI_predicted, elt)
    error_closest = get_error_closest_single(top_ind, filt_2_CI_param, filt_2_CI_predicted, elt)

    for j in range(len(n_neighbors)):
        # refactor the measured error into a dict 'name_of_param': list of errors (top n neighbours)
        pred_closest_n_neigh = pred_closest[0:n_neighbors[j]]
        error_closest_n_neigh = error_closest[0:n_neighbors[j]]
        median_pred = np.median(pred_closest_n_neigh)
        median_error = np.median(error_closest_n_neigh)
        # center the values around the given prediction
        centered = [item - median_error + current_obs[elt] for item in error_closest_n_neigh]

        # rescale back to original time scale of empirical observation for time-related parameters:
        if 'resc' in elt:
            centered_resc = [float(item / test_rescale) for item in centered]
        else:
            centered_resc = centered
        
        # apply minimum and maximum values for each parameter (e.g. no negative values)
        # print(centered_resc)
        centered_resc = [max(min_max[elt][0], item) for item in centered_resc]
        centered_resc = [min(min_max[elt][1], item) for item in centered_resc]
        # compute statistics: 2.5%, 97.5% boundaries
        qtls = np.percentile(centered_resc, np.array(np.array([2.5, 97.5])))
        min_2_5 = qtls[0]
        max_97_5 = qtls[1]
        width_CI = qtls[1] - qtls[0]

        all_real.append(min_2_5)
        all_real.append(max_97_5)
        all_real.append(width_CI)
CI_df.loc[0, :] = all_real.copy()


In [9]:
# CI values for each paramater
CI_df

Unnamed: 0,turnover_CI_2_5_1000,turnover_CI_97_5_1000,turnover_CI_width_1000,lambda1_rescaled_CI_2_5_1000,lambda1_rescaled_CI_97_5_1000,lambda1_rescaled_CI_width_1000,lambda2_rescaled_CI_2_5_1000,lambda2_rescaled_CI_97_5_1000,lambda2_rescaled_CI_width_1000,q01_rescaled_CI_2_5_1000,q01_rescaled_CI_97_5_1000,q01_rescaled_CI_width_1000
0,0.0996187,0.446021,0.346402,0.23389,0.37468,0.14079,0.045853,0.144161,0.0983085,0.00485004,0.0137293,0.0088793
