In [3]:
import json
from cogent3 import get_app, open_data_store
from cogent3.evolve.models import register_model
from cogent3.evolve.ns_substitution_model import GeneralStationary
from cogent3 import make_tree, get_moltype
from cogent3.app.composable import define_app, NotCompleted
from cogent3.app.typing import AlignedSeqsType, HypothesisResultType, SerialisableType, IdentifierType


from cogent3.app import evo
# Upper bound placed on every parameter returned by a model's get_param_list();
# passed as `upper` to get_param_rules_upper_limit when building param_rules.
RATE_PARAM_UPPER = 50


@register_model("nucleotide")
def GSN(**kwargs):
    """Construct a General Stationary Nucleotide (GSN) substitution model.

    All keyword arguments are forwarded to ``GeneralStationary``. Unless the
    caller supplies them, ``optimise_motif_probs`` is set to True and ``name``
    to "GSN".
    """
    kwargs.setdefault("optimise_motif_probs", True)
    kwargs.setdefault("name", "GSN")
    dna_alphabet = get_moltype("dna").alphabet
    return GeneralStationary(dna_alphabet, **kwargs)

def get_id(result):
    """Return ``result.source.unique_id``.

    Passed as ``id_from_source`` to the ``write_json`` apps so each written
    record is named after the record it was computed from.
    """
    source = result.source
    return source.unique_id



def get_param_rules_upper_limit(model_name, upper):
    """Build param rules that cap every parameter of a named model at ``upper``.

    Parameters
    ----------
    model_name
        name handed to ``cogent3.get_model`` to obtain the substitution model
    upper
        value stored under the "upper" key of every rule

    Returns
    -------
    A list with one ``{"par_name": ..., "upper": upper}`` dict per entry of the
    model's ``get_param_list()``, in that order.
    """
    from cogent3 import get_model

    rules = []
    for par_name in get_model(model_name).get_param_list():
        rules.append({"par_name": par_name, "upper": upper})
    return rules

def p_value(result):
    """Return the fraction of null-distribution values that are >= the observed LR.

    ``result`` must expose ``observed.LR`` and a sized, iterable ``null_dist``.
    An empty ``null_dist`` raises ZeroDivisionError.
    """
    null_lrs = result.null_dist
    num_at_least_observed = sum(result.observed.LR <= null_lr for null_lr in null_lrs)
    return num_at_least_observed / len(null_lrs)

In [73]:
import os
# Scratch check: strip any trailing '/' from the path, then take its last component
# (the file name). The same expression is used inside the later plain-function
# version of test_hypothesis_clock_model_whole_gene.
a = '/Users/gulugulu/repos/PuningAnalysis/results/output_data/model_fitting_result_350_threshold/ENSG00000020129.json'
os.path.basename(a.rstrip('/'))

'ENSG00000020129.json'

## Test of non-stationary model vs. stationary model

In [9]:
@define_app
def ton_bootstrapper(aln: AlignedSeqsType, tree=None, opt_args=None) -> SerialisableType:
    """Bootstrap a hypothesis test of GSN (null) against GN (alternative) on one alignment.

    Parameters
    ----------
    aln
        alignment; ``aln.info['triads_species_name']['outgroup']`` must hold the
        name of the outgroup sequence
    tree
        tree used for both models. If None, one is built from the alignment's
        sequence names with ``make_tree(tip_names=aln.names)``
    opt_args
        forwarded as ``opt_args`` to both ``evo.model`` apps

    Returns
    -------
    Whatever ``evo.bootstrap(hyp, num_reps=100, parallel=True)`` returns when
    applied to ``aln``.
    """
    outgroup_name = aln.info['triads_species_name']['outgroup']
    print(outgroup_name)

    # Honour a caller-supplied tree; it used to be overwritten unconditionally.
    if tree is None:
        tree = make_tree(tip_names=aln.names)

    outgroup_edge = [outgroup_name]

    # Settings shared by both models; the outgroup is passed as a discrete edge.
    model_kwargs = dict(
        tree=tree,
        opt_args=opt_args,
        # unique_trees=True,
        lf_args=dict(discrete_edges=[outgroup_edge]),
        optimise_motif_probs=True,
    )
    null = evo.model(
        "GSN",
        param_rules=get_param_rules_upper_limit("GSN", RATE_PARAM_UPPER),
        **model_kwargs,
    )
    alt = evo.model(
        "GN",
        param_rules=get_param_rules_upper_limit("GN", RATE_PARAM_UPPER),
        **model_kwargs,
    )

    hyp = evo.hypothesis(null, alt, sequential=True)
    bootstrapper = evo.bootstrap(hyp, num_reps=100, parallel=True)
    result = bootstrapper(aln)
    print('finish one')
    return result

# cogent3 "load_json" app, shared by later cells: composed as the first step of the
# bootstrap pipelines and called directly to load stored results.
load_json_app = get_app("load_json")


# @define_app
# def customised_load_json(DataMember: IdentifierType) -> AlignedSeqsType:
#     aln = load_json_app(DataMember)
#     aln.source = DataMember.unique_id
#     return aln

# load_json_customised = customised_load_json()


# @define_app(app_type=WRITER)
# def customised_write_json(bootstrap_result_serilised: SerialisableType, unique_id: IdentifierType):
#     path_to_dir = '/Users/gulugulu/Desktop/honours/data_local/'
#     out_dstore = open_data_store(path_to_dir, mode="w", suffix="json")
#     write_json_app = get_app("write_json", data_store=out_dstore, identifier = unique_id)
#     return write_json_app





In [3]:
# Input directory of alignment JSON files for the non-stationarity test.
aln_dir_new = '/Users/gulugulu/Desktop/honours/data_local/triples_aln_subset_info_added'

# Output data store for the bootstrap results, opened with mode="w";
# written records are named via get_id (the source record's unique_id).
path_to_dir = '/Users/gulugulu/Desktop/honours/data_local/bootstrapping_test_non'
out_dstore = open_data_store(path_to_dir, mode="w", suffix="json")
write_json_app = get_app("write_json", data_store=out_dstore, id_from_source = get_id)

# NOTE(review): limit=1 presumably restricts the store to a single member for a
# trial run — confirm before a full run.
input_data_store = open_data_store(aln_dir_new, suffix= 'json', limit=1)

# Composed pipeline: load alignment -> bootstrap GSN vs GN -> write result.
bootstrapper_ton = ton_bootstrapper()
non_bootstrapper = load_json_app + bootstrapper_ton + write_json_app

# non_bootstrapper.apply_to(
#         input_data_store,
#         parallel=False, 
#         par_kw=dict(
#             max_workers=2, use_mpi=False
#         ),
# )

## Test of clock violation 

In [4]:
def all_hypothesis_model_collection(aln: AlignedSeqsType, tree=None, opt_args=None) -> HypothesisResultType:
    """Build a sequential model collection of GSN, GN and GN with ``time_het='max'``.

    Parameters
    ----------
    aln
        alignment; only ``aln.info['triads_species_name']['outgroup']`` and
        (when ``tree`` is None) ``aln.names`` are read
    tree
        tree used for all three models. If None, one is built from the
        alignment's sequence names with ``make_tree(tip_names=aln.names)``
    opt_args
        forwarded as ``opt_args`` to every ``evo.model`` app

    Returns
    -------
    The ``evo.model_collection`` app itself; it is NOT applied to ``aln`` here.
    NOTE(review): this does not match the HypothesisResultType annotation —
    confirm whether the collection was meant to be applied to ``aln``.
    """
    outgroup_name = aln.info['triads_species_name']['outgroup']

    # Honour a caller-supplied tree; it used to be overwritten unconditionally.
    if tree is None:
        tree = make_tree(tip_names=aln.names)

    outgroup_edge = [outgroup_name]

    # Settings shared by all models; the outgroup is passed as a discrete edge.
    model_kwargs = dict(
        tree=tree,
        opt_args=opt_args,
        # unique_trees=True,
        lf_args=dict(discrete_edges=[outgroup_edge]),
        optimise_motif_probs=True,
    )
    null = evo.model(
        "GSN",
        param_rules=get_param_rules_upper_limit("GSN", RATE_PARAM_UPPER),
        time_het=None,
        **model_kwargs,
    )
    alt = evo.model(
        "GN",
        param_rules=get_param_rules_upper_limit("GN", RATE_PARAM_UPPER),
        time_het=None,
        **model_kwargs,
    )
    alt2 = evo.model(
        "GN",
        name="GN-max-het",
        param_rules=get_param_rules_upper_limit("GN", RATE_PARAM_UPPER),
        time_het='max',
        **model_kwargs,
    )

    models = [null, alt, alt2]

    return evo.model_collection(*models, sequential=True)
    



In [5]:

@define_app
def test_hypothesis_clock_model_whole_gene(aln_path: IdentifierType, result_lf_path, opt_args=None) -> HypothesisResultType:
    """Test GN with ``time_het=None`` (null) against GN with ``time_het='max'`` (alternative).

    NOTE: a later cell in this notebook rebinds this name to a plain function
    with a different signature; whichever cell ran last wins.

    Parameters
    ----------
    aln_path
        path to a JSON file holding a serialised alignment
    result_lf_path
        path to a stored result loadable by ``load_json_app`` that provides
        ``get_ens_tree()``; that tree is used for both models
    opt_args
        forwarded as ``opt_args`` to both ``evo.model`` apps

    Returns
    -------
    The result of applying ``evo.hypothesis(null, alt, sequential=True)`` to the
    alignment.
    """
    # Imported here because the notebook otherwise only imports this name in a
    # much later cell, which breaks this function on a fresh kernel.
    from cogent3.util.deserialise import deserialise_object

    # Context manager so the file handle is closed after reading.
    with open(aln_path, 'r') as aln_file:
        aln = deserialise_object(json.load(aln_file))

    tree = load_json_app(result_lf_path).get_ens_tree()

    model_kwargs = dict(
        tree=tree,
        opt_args=opt_args,
        # unique_trees=True,
        optimise_motif_probs=True,
    )
    null = evo.model(
        "GN",
        param_rules=get_param_rules_upper_limit("GN", RATE_PARAM_UPPER),
        time_het=None,
        **model_kwargs,
    )
    alt = evo.model(
        "GN",
        name="GN-max-het",
        param_rules=get_param_rules_upper_limit("GN", RATE_PARAM_UPPER),
        time_het="max",
        **model_kwargs,
    )

    hyp = evo.hypothesis(null, alt, sequential=True)
    result = hyp(aln)

    return result

In [None]:
import glob
import os
# Hard-coded local input directories (machine-specific absolute paths).
aln_dir = '/Users/gulugulu/Desktop/honours/data_local/triples_aln_subset_info_added'
result_lf_dir = '/Users/gulugulu/repos/PuningAnalysis/results/output_data/model_fitting_result_350_threshold'
gene_aln_dir = '/Users/gulugulu/repos/PuningAnalysis/data/ensembl_ortholog_sequences/homologies_alignment_common_name_350_threshold'
# All *.json files directly inside each directory (glob order is not sorted).
result_lf_paths = glob.glob(os.path.join(result_lf_dir, '*.json'))
aln_paths = glob.glob(os.path.join(aln_dir, '*.json'))
gene_aln_paths = glob.glob(os.path.join(gene_aln_dir, '*.json'))

In [6]:
from cogent3.evolve import ns_substitution_model, substitution_model

@define_app
def toc_S(aln: AlignedSeqsType, tree=None, opt_args=None) -> HypothesisResultType:
    """Test GSN with ``time_het=None`` (null) against GSN with ``time_het='max'`` (alternative).

    Parameters
    ----------
    aln
        alignment; ``aln.info['triads_species_name']['outgroup']`` must hold the
        name of the outgroup sequence
    tree
        tree used for both models. If None, one is built from the alignment's
        sequence names with ``make_tree(tip_names=aln.names)``
    opt_args
        forwarded as ``opt_args`` to both ``evo.model`` apps

    Returns
    -------
    The result of applying ``evo.hypothesis(null, alt, sequential=True)`` to ``aln``.
    """
    outgroup_name = aln.info['triads_species_name']['outgroup']
    outgroup_edge = [outgroup_name]

    # Honour a caller-supplied tree; it used to be overwritten unconditionally.
    if tree is None:
        tree = make_tree(tip_names=aln.names)

    # Settings shared by both models; the outgroup is passed as a discrete edge.
    model_kwargs = dict(
        tree=tree,
        opt_args=opt_args,
        # unique_trees=True,
        lf_args=dict(discrete_edges=[outgroup_edge]),
        optimise_motif_probs=True,
    )
    null = evo.model(
        "GSN",
        param_rules=get_param_rules_upper_limit("GSN", RATE_PARAM_UPPER),
        time_het=None,
        **model_kwargs,
    )
    alt = evo.model(
        "GSN",
        name="GSN-max-het",
        param_rules=get_param_rules_upper_limit("GSN", RATE_PARAM_UPPER),
        time_het="max",
        **model_kwargs,
    )

    hyp = evo.hypothesis(null, alt, sequential=True)
    result = hyp(aln)

    return result

In [7]:
@define_app
def toc_N_bootstrapper(aln: AlignedSeqsType, tree=None, opt_args=None) -> SerialisableType:
    """Bootstrap a test of GN with ``time_het=None`` (null) against GN with ``time_het='max'``.

    Parameters
    ----------
    aln
        alignment; ``aln.info['triads_species_name']['outgroup']`` must hold the
        name of the outgroup sequence
    tree
        tree used for both models. If None, one is built from the alignment's
        sequence names with ``make_tree(tip_names=aln.names)``
    opt_args
        forwarded as ``opt_args`` to both ``evo.model`` apps

    Returns
    -------
    Whatever ``evo.bootstrap(hyp, num_reps=100, parallel=True)`` returns when
    applied to ``aln``.
    """
    outgroup_name = aln.info['triads_species_name']['outgroup']

    # Honour a caller-supplied tree; it used to be overwritten unconditionally.
    if tree is None:
        tree = make_tree(tip_names=aln.names)
    outgroup_edge = [outgroup_name]

    # Settings shared by both models; the outgroup is passed as a discrete edge.
    model_kwargs = dict(
        tree=tree,
        opt_args=opt_args,
        # unique_trees=True,
        lf_args=dict(discrete_edges=[outgroup_edge]),
        optimise_motif_probs=True,
    )
    null = evo.model(
        "GN",
        param_rules=get_param_rules_upper_limit("GN", RATE_PARAM_UPPER),
        time_het=None,
        **model_kwargs,
    )
    alt = evo.model(
        "GN",
        name="GN-max-het",
        param_rules=get_param_rules_upper_limit("GN", RATE_PARAM_UPPER),
        time_het="max",
        **model_kwargs,
    )

    hyp = evo.hypothesis(null, alt, sequential=True)
    bootstrapper = evo.bootstrap(hyp, num_reps=100, parallel=True)
    result = bootstrapper(aln)
    return result

In [9]:

# Output data store for the clock-test bootstrap results.
# NOTE: this rebinds path_to_dir, out_dstore and write_json_app, which the
# non-stationarity pipeline cell also assigns; run order determines which is live.
path_to_dir = '/Users/gulugulu/Desktop/honours/data_local/bootstrapping_test_clock'
out_dstore = open_data_store(path_to_dir, mode="w", suffix="json")
write_json_app = get_app("write_json", data_store=out_dstore, id_from_source = get_id)

# Composed pipeline: load alignment -> bootstrap the GN clock test -> write result.
bootstrapper_toc_N = toc_N_bootstrapper()
clock_bootstrapper = load_json_app + bootstrapper_toc_N + write_json_app

# clock_bootstrapper.apply_to(
#         input_data_store,
#         parallel=False, 
#         par_kw=dict(
#             max_workers=2, use_mpi=False
#         ),
# )




Non-stationarity proportion from bootstrap

In [4]:
# Directory the non-stationarity bootstrap pipeline wrote its JSON results to.
ton_result_dir = '/Users/gulugulu/Desktop/honours/data_local/bootstrapping_test_non'

# Opened without a mode argument, unlike the output stores which pass mode="w".
input_data_store_ton_result = open_data_store(ton_result_dir, suffix= 'json')


In [5]:
# Number of members in the non-stationarity result store.
len(input_data_store_ton_result)

2170

In [33]:
# For every stored bootstrap result, record the p-value of the observed test and the
# bootstrap-estimated p-value, keyed by the member id up to its first '.'.
p_value_observed_dict = {}
p_value_tested_dict = {}
i = 0  # number of members that could not be loaded
for data in input_data_store_ton_result:
    gene_name = data.unique_id.partition('.')[0]
    bootstrap_result = load_json_app(data)
    if not isinstance(bootstrap_result, NotCompleted):
        p_value_observed_dict[gene_name] = bootstrap_result.observed.pvalue
        p_value_tested_dict[gene_name] = p_value(bootstrap_result)
        continue
    # Loading failed: count the member and show which one it was.
    i += 1
    print(data)
        


In [104]:
# Number of records whose bootstrap p-value is below 0.25.
list1 = list(p_value_tested_dict.values())
count = sum(1 for p in list1 if p < 0.25)
count
#34%, #60%

1294

In [105]:
# p-values of the observed (non-bootstrap) tests.
list2 = list(p_value_observed_dict.values())

# Drop entries that have no p-value (None).
filtered_list = [p for p in list2 if p is not None]

# Count the remaining p-values below 0.25.
count = sum(1 for p in filtered_list if p < 0.25)

print(count)
#63%, 82%

1788


In [188]:
from collections import defaultdict
# Per-gene tallies over the bootstrap p-values. Keys of p_value_tested_dict are split
# at the first '_' and the leading part is used as the gene id.
# NOTE: despite the "_005" in the names (kept because later cells use them), the
# threshold applied in this cell is 0.1.
count_less_than_005 = defaultdict(int)  # records per gene with p-value < 0.1
total_count = defaultdict(int)  # all records per gene

for key, value in p_value_tested_dict.items():
    gene = key.partition('_')[0]
    total_count[gene] += 1
    if value < 0.1:
        count_less_than_005[gene] += 1

# Fraction of each gene's records that fall below the threshold.
proportion_less_than_005 = {
    gene: count_less_than_005[gene] / total_count[gene] for gene in total_count
}

# Number of genes whose fraction exceeds 0.29.
count_gene = sum(1 for fraction in proportion_less_than_005.values() if fraction > 0.29)

count_gene

78

In [193]:
# Genes whose below-threshold fraction exceeds 0.49.
gene_list = [gene for gene, fraction in proportion_less_than_005.items() if fraction > 0.49]

In [41]:
# Hard-coded per-gene values keyed by ENSG gene id; plotted below on the axis labelled
# 'Correlation Coefficient'.
# NOTE(review): what is being correlated, and how these numbers were produced, is not
# shown in this notebook — TODO record provenance or load them from a file.
cor_dict1 = {'ENSG00000138315': 0.41632410995929325,
 'ENSG00000143479': 0.5011257024334949,
 'ENSG00000117707': 0.5143501260471539,
 'ENSG00000155380': 0.5368650475105942,
 'ENSG00000183801': 0.022257428398586906,
 'ENSG00000149782': 0.46981502660235697,
 'ENSG00000107560': 0.7094300670960713,
 'ENSG00000110344': 0.5799762739532286,
 'ENSG00000111262': 0.260748820557938,
 'ENSG00000120457': 0.27542735232677007,
 'ENSG00000143105': 0.12463917065105679,
 'ENSG00000135372': 0.3182468308548365,
 'ENSG00000171492': 0.8148554461935932,
 'ENSG00000197147': 0.5936434803215991,
 'ENSG00000148948': -0.10945862806371154,
 'ENSG00000116991': 0.4722474156678038,
 'ENSG00000165494': 0.12817487855655793,
 'ENSG00000174576': 0.6317601951899616,
 'ENSG00000149295': 0.2471135001529111,
 'ENSG00000185085': 0.26788426444898955,
 'ENSG00000198198': 0.20102316023802388,
 'ENSG00000133816': 0.1365249215754618,
 'ENSG00000116194': 0.47908216941773996,
 'ENSG00000171862': 0.6723851954901209,
 'ENSG00000185875': 0.6843284044550202,
 'ENSG00000135775': 0.5740332966470127,
 'ENSG00000198162': 0.5380384618102918,
 'ENSG00000172409': 0.41005625390652123,
 'ENSG00000172269': 0.17080298494123955,
 'ENSG00000197106': 0.06856965336055383,
 'ENSG00000156097': 0.15370764902001904,
 'ENSG00000058085': 0.32431042110353203,
 'ENSG00000187554': 0.6482192574161928,
 'ENSG00000265203': 0.27455067326770816,
 'ENSG00000110075': 0.5010018361411549,
 'ENSG00000197136': 0.3827179591612949,
 'ENSG00000171488': 0.6086915446386395,
 'ENSG00000148700': 0.1299460854465024,
 'ENSG00000142661': 0.3219402867459939,
 'ENSG00000160703': 0.2673626610987112,
 'ENSG00000196878': 0.06509661835748794,
 'ENSG00000117000': 0.284674665886255,
 'ENSG00000109906': 0.5256171962126348,
 'ENSG00000129159': 0.5003572604283687,
 'ENSG00000010165': 0.19962875695276555,
 'ENSG00000116128': 0.16612319575257356,
 'ENSG00000171812': 0.5729671938510141,
 'ENSG00000048707': 0.7221005961789457,
 'ENSG00000138161': 0.7765584173198955,
 'ENSG00000152778': 0.22678941737661862,
 'ENSG00000166189': 0.3171592551844421,
 'ENSG00000196968': 0.3484175670965493,
 'ENSG00000174684': 0.5438725336557962,
 'ENSG00000166341': 0.48067213421312754,
 'ENSG00000117322': 0.5546669947506562,
 'ENSG00000152779': 0.5274260838019227,
 'ENSG00000065613': 0.7831141862832004,
 'ENSG00000143669': 0.6612184830332009,
 'ENSG00000129083': 0.6467608735074528,
 'ENSG00000166349': 0.15428046620454397,
 'ENSG00000215009': 0.21879266609169964,
 'ENSG00000187486': 0.3003897168100097,
 'ENSG00000143278': 0.7146413646813754,
 'ENSG00000181333': 0.432364963437026,
 'ENSG00000073756': 0.594792994054204,
 'ENSG00000254685': 0.7339048048725467,
 'ENSG00000166106': 0.5531644162296107,
 'ENSG00000160695': 0.20585172129646584,
 'ENSG00000116679': 0.31083423993709836,
 'ENSG00000137507': 0.2945378151260505,
 'ENSG00000166507': 0.35890842816052354,
 'ENSG00000133019': 0.19792968087276314,
 'ENSG00000116539': 0.6967705778880635,
 'ENSG00000020129': 0.6113947919506353,
 'ENSG00000174516': 0.39700866213597164,
 'ENSG00000283703': 0.11897731025048351,
 'ENSG00000150347': 0.605438855911118,
 'ENSG00000198561': 0.3551687903612491,
 'ENSG00000109927': 0.3590862182193451,
 'ENSG00000107815': 0.19151454249238822,
 'ENSG00000185278': 0.3098913443031674,
 'ENSG00000126705': 0.5423026825073819,
 'ENSG00000170382': 0.31578947368421045,
 'ENSG00000186603': 0.12759470716595456,
 'ENSG00000198739': 0.7801636755577861,
 'ENSG00000257315': 0.26964583080166915,
 'ENSG00000143368': 0.24492298998578815,
 'ENSG00000198730': 0.6139680006207014,
 'ENSG00000175097': 0.15015694923177295,
 'ENSG00000116199': 0.20275922630749957,
 'ENSG00000183340': 0.24352159468438533,
 'ENSG00000198797': 0.14507152489855624,
 'ENSG00000143107': 0.5195582472401861,
 'ENSG00000233436': 0.332013498312711,
 'ENSG00000170322': 0.3128581096323032,
 'ENSG00000154143': -0.030822510822510824,
 'ENSG00000007933': 0.07033395221866658,
 'ENSG00000214655': 0.301628877199347,
 'ENSG00000151079': 0.3270859364154192,
 'ENSG00000182022': 0.37453836486094544,
 'ENSG00000152092': 0.3451792551340063,
 'ENSG00000157184': 0.1318165402672445,
 'ENSG00000248746': 0.2085294281389793,
 'ENSG00000116183': 0.29728309481216464,
 'ENSG00000154309': 0.38810083689072183,
 'ENSG00000187151': 0.5592882628254435,
 'ENSG00000169641': 0.25070798765247365,
 'ENSG00000123836': 0.19525319811438982,
 'ENSG00000107862': 0.2268386896547902,
 'ENSG00000162630': 0.3620248495885296,
 'ENSG00000042781': 0.5121527079731414,
 'ENSG00000168539': 0.14263555505168665,
 'ENSG00000180720': 0.21597056798970987,
 'ENSG00000236287': 0.05755297334244703,
 'ENSG00000064309': 0.5049529006230988,
 'ENSG00000162711': 0.3042397151947045,
 'ENSG00000087365': 0.2651905804548907,
 'ENSG00000091664': 0.6711631191703112,
 'ENSG00000127124': 0.4025830327898054,
 'ENSG00000173456': 0.25307337238262184,
 'ENSG00000159164': 0.28635811495009494,
 'ENSG00000173898': 0.43853169174646606,
 'ENSG00000119285': 0.2625868612181497,
 'ENSG00000162775': 0.4166611218311266,
 'ENSG00000023839': 0.19305049410683214,
 'ENSG00000177853': 0.5504728079245905,
 'ENSG00000176986': 0.3901180134696867}

In [42]:
# Re-key the per-gene proportions in cor_dict1's key order so the two value lists
# line up for plotting (the result is aligned, not sorted by value). A gene missing
# from proportion_less_than_005 raises KeyError.
# NOTE(review): proportion_less_than_005 is assigned by two different cells
# (non-stationarity and clock); which one is live here depends on execution order.
sorted_proportion = {gene: proportion_less_than_005[gene] for gene in cor_dict1}

In [44]:
import plotly.express as px
# Scatter of the per-gene proportion (x) against the hard-coded per-gene correlation
# value (y) with an OLS trendline. The two value lists share cor_dict1's key order
# because sorted_proportion is built by iterating cor_dict1.
fig = px.scatter(
    x=list(sorted_proportion.values()),
    y=list(cor_dict1.values()),
    # axis label typo fixed: "propotion" -> "proportion"
    labels={'x': 'Non-stationary Evolution proportion', 'y': 'Correlation Coefficient'},
    trendline="ols",
    title=None,
)
# Layout: white template, tight margins, larger axis-title fonts.
fig.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},
    xaxis_title_font={'size': 20},
    width=None,
)
fig.show()

In [74]:
from cogent3.util.deserialise import deserialise_object

def test_hypothesis_clock_model_whole_gene(aln_path: IdentifierType, result_lf_dir, output_dir, opt_args=None) -> HypothesisResultType:
    """Run the GN ``time_het=None`` vs ``time_het='max'`` test for one alignment file and write the result.

    NOTE: this rebinds a name that an earlier cell defines as a ``@define_app``
    with a different signature; this version is a plain function.

    Parameters
    ----------
    aln_path
        path to a JSON file holding a serialised alignment
    result_lf_dir
        directory containing a stored result with the same file name as
        ``aln_path``; it must be loadable by ``load_json_app`` and provide
        ``get_ens_tree()``, whose tree is used for both models
    output_dir
        directory opened as a data store (mode="w") that the result is written
        to under the alignment's file name
    opt_args
        forwarded as ``opt_args`` to both ``evo.model`` apps

    Returns
    -------
    The result of applying ``evo.hypothesis(null, alt, sequential=True)`` to the
    alignment (previously nothing was returned despite the annotation).
    """
    # Context manager so the file handle is closed after reading.
    with open(aln_path, 'r') as aln_file:
        aln = deserialise_object(json.load(aln_file))
    print('start')
    file_name = os.path.basename(aln_path.rstrip('/'))
    print(file_name)
    # The stored model fit is looked up by file name inside result_lf_dir.
    result_lf_path = os.path.join(result_lf_dir, file_name)

    tree = load_json_app(result_lf_path).get_ens_tree()

    model_kwargs = dict(
        tree=tree,
        opt_args=opt_args,
        # unique_trees=True,
        optimise_motif_probs=True,
    )
    null = evo.model(
        "GN",
        param_rules=get_param_rules_upper_limit("GN", RATE_PARAM_UPPER),
        time_het=None,
        **model_kwargs,
    )
    alt = evo.model(
        "GN",
        name="GN-max-het",
        param_rules=get_param_rules_upper_limit("GN", RATE_PARAM_UPPER),
        time_het="max",
        **model_kwargs,
    )
    print('begin hypothesis test')
    hyp = evo.hypothesis(null, alt, sequential=True)
    result = hyp(aln)
    print('end hypothesis test')
    out_dstore = open_data_store(output_dir, mode="w", suffix="json")
    write_json_app = get_app("write_json", data_store=out_dstore)
    write_json_app(result, identifier=file_name)
    return result

In [75]:
# Example inputs matching the parameters of the plain-function
# test_hypothesis_clock_model_whole_gene(aln_path, result_lf_dir, output_dir).
# NOTE: result_lf_dir is also assigned, to a different path, in an earlier cell.
aln_path = '/Users/gulugulu/Desktop/honours/data_local/whole_genome_mammal87/triads_alignment_350_threshold/ENSG00000010165/1.json'
result_lf_dir = '/Users/gulugulu/Desktop/honours/data_local/whole_genome_mammal87/triads_model_fitting_350_threshold/ENSG00000010165/model_fitting_result'
output_dir = '/Users/gulugulu/Desktop/honours/data_local/'

In [195]:
import glob
# Load every stored whole-gene clock-test result and record its p-value and LR,
# keyed by the member id up to its first '.'.
clock_whole_gene_result = '/Users/gulugulu/Desktop/honours/data_local/clock_test_whole_gene'
input_data_store_clock_whole_gene_result = open_data_store(clock_whole_gene_result, suffix= 'json')
p_value_dict_clock_whole_gene = {}
LR_dict_clock_whole_gene = {}
for data in input_data_store_clock_whole_gene_result:
    gene_name = data.unique_id.split('.')[0]
    hyp_result = load_json_app(data)
    if isinstance(hyp_result, NotCompleted):
        # Same handling as the bootstrap-result loops: report the member that
        # failed to load and skip it instead of failing on a missing attribute.
        print(data)
        continue
    p_value_dict_clock_whole_gene[gene_name] = hyp_result.pvalue
    LR_dict_clock_whole_gene[gene_name] = hyp_result.LR

In [198]:
# Number of whole-gene clock tests with a p-value below 0.25.
# Entries whose p-value is None are skipped: comparing None with a float raises
# TypeError. (The sibling cells for observed p-values filter None the same way.)
count = sum(
    1 for p in p_value_dict_clock_whole_gene.values() if p is not None and p < 0.25
)
count

55

Proportion of clock violation from bootstrap

In [17]:
# Directory the clock-test bootstrap pipeline wrote its JSON results to.
toc_result_dir = '/Users/gulugulu/Desktop/honours/data_local/bootstrapping_test_clock'

input_data_store_toc_result = open_data_store(toc_result_dir, suffix= 'json')

# Number of members in the clock-test result store.
len(input_data_store_toc_result)

2170

In [18]:
# For every stored clock-test bootstrap result, record the p-value of the observed
# test and the bootstrap-estimated p-value, keyed by the member id up to its first '.'.
p_value_observed_dict_toc = {}
p_value_tested_dict_toc = {}
i = 0  # number of members that could not be loaded
for data in input_data_store_toc_result:
    gene_name = data.unique_id.partition('.')[0]
    bootstrap_result = load_json_app(data)
    if not isinstance(bootstrap_result, NotCompleted):
        p_value_observed_dict_toc[gene_name] = bootstrap_result.observed.pvalue
        p_value_tested_dict_toc[gene_name] = p_value(bootstrap_result)
        continue
    # Loading failed: count the member and show which one it was.
    i += 1
    print(data)

ENSG00000174684_210.json
ENSG00000154309_62.json
ENSG00000196878_192.json


In [31]:
# Number of records whose clock-test bootstrap p-value is below 0.25.
list1 = list(p_value_tested_dict_toc.values())
count = sum(1 for p in list1 if p < 0.25)
count

#55% clock violation with 0.05, 64% with 0.1, 77% with 0.25

1683

In [32]:
# p-values of the observed (non-bootstrap) clock tests.
list2 = list(p_value_observed_dict_toc.values())

# Drop entries that have no p-value (None).
filtered_list = [p for p in list2 if p is not None]

# Count the remaining p-values below 0.25.
count = sum(1 for p in filtered_list if p < 0.25)

print(count)
#52%, 60%, 72%

1572


In [39]:
from collections import defaultdict
# Per-gene tallies over the observed clock-test p-values
# (bootstrap_result.observed.pvalue). Keys are split at the first '_' and the leading
# part is used as the gene id.
# NOTE: this rebinds count_less_than_005, total_count and proportion_less_than_005,
# which the non-stationarity cell also assigns; run order determines which is live.
count_less_than_005 = defaultdict(int)  # records per gene with p-value < 0.05
total_count = defaultdict(int)  # all records per gene

for key, value in p_value_observed_dict_toc.items():
    gene = key.split('_')[0]
    total_count[gene] += 1
    # A None p-value counts towards the gene's total but never as significant;
    # comparing None with a float would raise TypeError.
    if value is not None and value < 0.05:
        count_less_than_005[gene] += 1

# Fraction of each gene's records with p-value < 0.05.
proportion_less_than_005 = {gene: count_less_than_005[gene] / total for gene, total in total_count.items()}

# Number of genes whose fraction exceeds 0.49.
count_gene = len([x for x in proportion_less_than_005.values() if x > 0.49])

count_gene

71