In [2]:
import json
from cogent3 import get_app, open_data_store
from cogent3.evolve.models import register_model
from cogent3.evolve.ns_substitution_model import GeneralStationary
from cogent3 import make_tree, get_moltype
from cogent3.app.composable import LOADER, define_app, WRITER
from cogent3.app.typing import AlignedSeqsType, BootstrapResultType, SerialisableType, IdentifierType


from cogent3.app import evo
# Upper bound passed to get_param_rules_upper_limit for both the "GSN" and
# "GN" models, i.e. the "upper" value of every generated param rule.
RATE_PARAM_UPPER = 50

def get_id(result):
    """Return ``result.source.unique_id``.

    Supplied as the ``id_from_source`` callback of the ``write_json`` app.
    """
    origin = result.source
    return origin.unique_id

@register_model("nucleotide")
def GSN(**kwargs):
    """Build a General Stationary Nucleotide substitution model instance.

    All keyword arguments are forwarded to ``GeneralStationary``. When the
    caller does not provide them, ``optimise_motif_probs`` defaults to
    ``True`` and ``name`` defaults to ``"GSN"``.
    """
    kwargs.setdefault("optimise_motif_probs", True)
    kwargs.setdefault("name", "GSN")
    dna_alphabet = get_moltype("dna").alphabet
    return GeneralStationary(dna_alphabet, **kwargs)








In [3]:
def get_param_rules_upper_limit(model_name, upper):
    """Build param rules that cap every rate matrix term of a model.

    Parameters
    ----------
    model_name
        name handed to ``cogent3.get_model``
    upper
        value stored under the ``"upper"`` key of every rule

    Returns
    -------
    A list with one ``{"par_name": ..., "upper": upper}`` dict for each name
    returned by the model's ``get_param_list()``.
    """
    from cogent3 import get_model

    rules = []
    for par_name in get_model(model_name).get_param_list():
        rules.append({"par_name": par_name, "upper": upper})
    return rules


@define_app
def test_hypothesis_model_bootstrapper(aln: AlignedSeqsType, tree=None, opt_args=None) -> SerialisableType:
    """Bootstrap a hypothesis test with "GSN" as the null and "GN" as the alternate model.

    Parameters
    ----------
    aln
        alignment; ``aln.info['triads_species_name']['outgroup']`` must hold
        the name of the outgroup sequence
    tree
        tree used for both models. When ``None`` a tree is built with
        ``make_tree(tip_names=aln.names)``.
    opt_args
        forwarded as ``opt_args`` to both ``evo.model`` apps

    Returns
    -------
    Whatever the ``evo.bootstrap`` app (100 replicates, ``parallel=True``)
    returns when applied to ``aln``.
    """
    outgroup_name = aln.info['triads_species_name']['outgroup']
    print(outgroup_name)

    # Only fall back to a tree built from the alignment names when the caller
    # did not supply one; previously a supplied tree was silently discarded.
    if tree is None:
        tree = make_tree(tip_names=aln.names)

    outgroup_edge = [outgroup_name]

    # settings shared by the null and the alternate model
    model_kwargs = dict(
        tree=tree,
        opt_args=opt_args,
        # unique_trees=True,
        lf_args=dict(discrete_edges=[outgroup_edge]),
        optimise_motif_probs=True,
    )
    null = evo.model(
        "GSN",
        param_rules=get_param_rules_upper_limit("GSN", RATE_PARAM_UPPER),
        **model_kwargs,
    )
    alt = evo.model(
        "GN",
        param_rules=get_param_rules_upper_limit("GN", RATE_PARAM_UPPER),
        **model_kwargs,
    )

    hyp = evo.hypothesis(null, alt, sequential=True)
    bootstrapper = evo.bootstrap(hyp, num_reps=100, parallel=True)
    result = bootstrapper(aln)
    print('finish one')
    return result

# App returned by get_app("load_json"); called inside customised_load_json
# and used as the first stage of the composed pipeline.
load_json_app = get_app("load_json")


@define_app
def customised_load_json(DataMember: IdentifierType) -> AlignedSeqsType:
    """Load ``DataMember`` via ``load_json_app`` and tag the result with its id.

    The loaded object's ``source`` attribute is overwritten with
    ``DataMember.unique_id`` before it is returned.
    """
    loaded = load_json_app(DataMember)
    loaded.source = DataMember.unique_id
    return loaded

# Instance of the customised loader app.
# NOTE(review): no visible cell uses this instance; the pipeline is composed
# with load_json_app instead. Confirm which loader is intended.
load_json_customised = customised_load_json()


# Unused draft of a custom writer app, kept for reference only (disabled):
# @define_app(app_type=WRITER)
# def customised_write_json(bootstrap_result_serialised: SerialisableType, unique_id: IdentifierType):
#     path_to_dir = '/Users/gulugulu/Desktop/honours/data_local/'
#     out_dstore = open_data_store(path_to_dir, mode="w", suffix="json")
#     write_json_app = get_app("write_json", data_store=out_dstore, identifier = unique_id)
#     return write_json_app


def p_value(result):
    """Return the fraction of ``result.null_dist`` values that are >= ``result.observed.LR``.

    Raises ``ZeroDivisionError`` when ``result.null_dist`` is empty.
    """
    null_lrs = result.null_dist
    at_least_observed = 0
    for null_lr in null_lrs:
        if result.observed.LR <= null_lr:
            at_least_observed += 1
    return at_least_observed / len(null_lrs)


In [4]:
# Directory holding the input members (read with suffix "json").
aln_dir_new = "/Users/gulugulu/Desktop/honours/data_local/triples_aln_subset_info_added_to_run"

# Output data store, opened in write mode; members are named by get_id.
path_to_dir = "/Users/gulugulu/Desktop/honours/data_local/bootstrapping_test_non"
out_dstore = open_data_store(path_to_dir, mode="w", suffix="json")
write_json_app = get_app(
    "write_json",
    data_store=out_dstore,
    id_from_source=get_id,
)

input_data_store = open_data_store(aln_dir_new, suffix="json")

# Compose the pipeline: load -> bootstrap hypothesis test -> write.
bootstrapper = test_hypothesis_model_bootstrapper()
path_bootstrapper = load_json_app + bootstrapper + write_json_app



In [12]:
# Run the composed pipeline over every member of the input data store.
# NOTE(review): par_kw is supplied while parallel=False, so these worker
# settings presumably have no effect here; confirm.
path_bootstrapper.apply_to(
    input_data_store,
    parallel=False,
    par_kw={"max_workers": 2, "use_mpi": False},
)

Squirrel
finish one


DataStoreDirectory(source=/Users/gulugulu/Desktop/honours/data_local/bootstrapping_test_non, mode=Mode.w, suffix=json, limit=None, verbose=False)

In [11]:
# Bootstrap p-value for every result, computed with the `p_value` helper
# defined earlier in this notebook (the previous call to `pvalue` referred to
# a name that is not defined anywhere in the notebook).
# NOTE(review): `bootstrap_results` is not created in any visible cell; it
# must be loaded before this cell will run on a fresh kernel.
p_value_list = [p_value(bootstrap_result) for bootstrap_result in bootstrap_results]

p_value_list

[0.0,
 0.02,
 0.13,
 0.26,
 0.29,
 0.11,
 0.0,
 0.66,
 0.4,
 0.15,
 0.1,
 0.24,
 0.4791666666666667,
 0.51,
 0.0,
 0.6067415730337079,
 0.45,
 0.7244897959183674,
 0.77,
 0.1,
 0.26,
 0.13,
 0.04,
 0.0,
 0.6868686868686869,
 0.04,
 0.0,
 0.84375,
 0.030303030303030304,
 0.32,
 0.0,
 0.21,
 0.7157894736842105,
 0.6020408163265306,
 0.0,
 0.99,
 0.6060606060606061,
 0.01,
 0.0,
 0.6421052631578947,
 0.26,
 0.41,
 0.6666666666666666,
 0.21,
 0.0,
 0.0,
 0.3,
 0.84,
 0.8,
 0.41,
 0.0,
 0.4387755102040816,
 1.0,
 0.14,
 0.01,
 0.13131313131313133,
 0.2222222222222222,
 0.21,
 0.3,
 0.26,
 0.16494845360824742,
 0.01,
 0.0,
 0.625,
 0.30303030303030304,
 0.5959595959595959,
 0.43,
 0.37755102040816324,
 0.94,
 0.05,
 0.0,
 0.0,
 0.15,
 0.0,
 0.15,
 0.82,
 0.020833333333333332,
 0.6979166666666666,
 0.7171717171717171,
 0.15,
 0.59,
 0.01,
 0.47,
 0.69,
 0.04,
 0.0,
 0.0,
 0.6161616161616161,
 0.49,
 0.38,
 0.01,
 0.0,
 0.68,
 0.99,
 0.5833333333333334,
 0.06,
 0.55,
 0.72,
 0.5154639175257731

In [10]:
# p-value attached to the observed result of every bootstrap result.
observed_p_value_list = [
    bootstrap_result.observed.pvalue for bootstrap_result in bootstrap_results
]

observed_p_value_list

In [15]:
# number of observed p-values below 0.05
sum(1 for observed_p in observed_p_value_list if observed_p < 0.05)

86

In [16]:
# number of bootstrap p-values below 0.05
sum(1 for bootstrap_p in p_value_list if bootstrap_p < 0.05)

44

In [6]:
# number of bootstrap p-values below 0.25
sum(1 for bootstrap_p in p_value_list if bootstrap_p < 0.25)

79

In [15]:
# Display the observed hypothesis-test result of `a`.
# NOTE(review): `a` is not defined in any visible cell; presumably a single
# bootstrap result. Confirm where it is assigned.
a.observed

LR,df,pvalue
11.1094,2,0.0039

hypothesis,key,lnL,nfp,DLC,unique_Q
,'GSN',-859.244,15,True,True
alt,'GN',-853.6893,17,True,True


In [17]:
# Histogram of the values in `a.null_dist`.
# NOTE(review): this import sits mid-notebook rather than with the other
# imports in the first cell.
import plotly.express as px
px.histogram(a.null_dist)

In [16]:
# Bootstrap p-value for `a`, using the `p_value` helper defined earlier in
# this notebook (`pvalue` is not defined anywhere in the notebook).
p_value(a)

0.19114688128772636