In [30]:
from cogent3.util.deserialise import deserialise_object
import json
import os
import glob
from cogent3 import get_app, open_data_store
from clock_project.simulation.wts import calculate_non_stationarity
import numpy as np

# Shared cogent3 `load_json` app; used below to load stored results from `.json` files.
load_json_app = get_app("load_json")


def set_model(triads_species_name, ens_tree):
    """Build the TN93-versus-GN hypothesis app for one triad.

    Parameters
    ----------
    triads_species_name
        Mapping whose values are the tip names of the triad. Only the values
        are used, to select the sub-tree of ``ens_tree``.
    ens_tree
        Tree object providing ``get_sub_tree``.

    Returns
    -------
    The ``hypothesis`` app created with the TN93 model as the null and the GN
    model as the alternative, both defined on the triad sub-tree.
    """
    triad_tree = ens_tree.get_sub_tree(triads_species_name.values())
    # Both models use identical settings; only the substitution model differs.
    shared_settings = dict(tree=triad_tree, time_het="max", optimise_motif_probs=True)
    tn93_null = get_app("model", "TN93", **shared_settings)
    gn_alt = get_app("model", "GN", **shared_settings)
    return get_app("hypothesis", tn93_null, gn_alt)

def get_model_fitting_result(triads_species_name, triads_aln, ens_tree):
    """Apply the hypothesis app built by ``set_model`` to one triad alignment.

    Parameters
    ----------
    triads_species_name
        Mapping of triad roles to species names, forwarded to ``set_model``.
    triads_aln
        The alignment the hypothesis app is called on.
    ens_tree
        Tree forwarded to ``set_model``.

    Returns
    -------
    Whatever the hypothesis app returns for ``triads_aln``.
    """
    hypothesis_app = set_model(triads_species_name, ens_tree)
    return hypothesis_app(triads_aln)


def process_path(path, result_lf_dir, triads_info_dir):
    """Fit the triad hypothesis to every alignment stored for one gene.

    Parameters
    ----------
    path
        Directory holding one ``<identifier>.json`` alignment per triad. Its
        base name (a trailing ``/`` is ignored) is used as the gene name.
    result_lf_dir
        Directory containing ``<gene>.json``; it is loaded with
        ``load_json_app`` and the loaded object must provide ``get_ens_tree()``.
    triads_info_dir
        Directory containing ``<gene>/triads_species_names_dict.json``, a JSON
        object keyed by triad identifier.

    Returns
    -------
    dict
        Maps each triad identifier (alignment file name without extension) to
        the value returned by ``get_model_fitting_result``. Empty when ``path``
        contains no ``.json`` files.

    Raises
    ------
    KeyError
        If an alignment's identifier is missing from the triads info file.
    """
    file_name = os.path.basename(path.rstrip('/'))
    print(file_name)
    triads_alignment_paths = glob.glob(os.path.join(path, "*.json"))
    triads_info_path = os.path.join(triads_info_dir, file_name, "triads_species_names_dict.json")
    # Context managers close the files promptly instead of leaking the handles.
    with open(triads_info_path, 'r') as info_file:
        triads_species_infos = json.load(info_file)
    result_lf_path = os.path.join(result_lf_dir, f"{file_name}.json")
    result_lf = load_json_app(result_lf_path)

    model_fitting_result_dict = {}
    if not triads_alignment_paths:
        # Nothing to fit; as before, the tree is never requested in this case.
        return model_fitting_result_dict

    # The tree depends only on the gene-level result, so build it once rather
    # than once per alignment.
    ens_tree = result_lf.get_ens_tree()

    for alignment_path in triads_alignment_paths:
        identifier_number = os.path.basename(alignment_path).rsplit('.', 1)[0]
        print(identifier_number)
        with open(alignment_path, 'r') as alignment_file:
            triads_aln = deserialise_object(json.load(alignment_file))
        triads_species_name = triads_species_infos[identifier_number]
        print(triads_species_name)
        model_fitting_result_dict[identifier_number] = get_model_fitting_result(
            triads_species_name, triads_aln, ens_tree
        )

    return model_fitting_result_dict

In [None]:
from clock_project.maths.evolutionary_rate import calculate_stationary_distribution

def get_STI(triads_info):
    """Draft of ``get_STI``: gathers the per-ingroup inputs but returns nothing.

    For every entry of ``triads_info`` this collects the ingroup nucleotide
    frequencies and matrices and calls ``calculate_stationary_distribution``
    on both ingroup matrices. ``STI_dict`` is never filled and the function
    returns ``None``.

    NOTE(review): a later cell defines ``get_STI`` again, with the STI
    calculation and a return value; once that cell runs it replaces this draft.
    """
    ingroup_roles = ('ingroup1', 'ingroup2')
    per_triad_matrices = {}
    per_triad_nuc_freqs = {}
    STI_dict = {}  # not populated in this draft
    for identifier, info in triads_info.items():
        small_tree_info = info['triads_info_small_tree']
        names = info['triads_species_names']
        matrices = small_tree_info['matrices']
        nuc_freqs = small_tree_info['nuc_freqs_dict']
        # Frequencies are keyed by role, matrices by species name.
        per_triad_nuc_freqs[identifier] = {
            names[role]: nuc_freqs[role] for role in ingroup_roles
        }
        per_triad_matrices[identifier] = {
            names[role]: matrices[names[role]] for role in ingroup_roles
        }
        stationary_distributions = {
            names[role]: calculate_stationary_distribution(matrices[names[role]])
            for role in ingroup_roles
        }
        







In [None]:
from clock_project.maths.evolutionary_rate import calculate_stationary_distribution

def get_STI(triads_info):
    """Compute three frequency-versus-stationary contrasts per ingroup species.

    For each triad, the nucleotide frequencies ``p`` of every ingroup are
    compared with the stationary distribution ``pi`` that
    ``calculate_stationary_distribution`` returns for that ingroup's matrix.
    With ``d = p - pi`` taken per position, and positions labelled
    T=0, A=1, C=2, G=3:

        STI1 = d[C] + d[G]
        STI2 = d[A] - d[T]
        STI3 = d[C] - d[G]

    Parameters
    ----------
    triads_info
        Mapping of triad identifier to a dict with the keys
        ``'triads_species_names'`` (roles ``'ingroup1'``/``'ingroup2'`` mapped
        to species names) and ``'triads_info_small_tree'``, which holds
        ``'matrices'`` (keyed by species name) and ``'nuc_freqs_dict'``
        (keyed by role).

    Returns
    -------
    dict
        ``{identifier: {species_name: (STI1, STI2, STI3)}}``.

    Raises
    ------
    KeyError
        If any of the keys listed above is missing.
    """
    # Positions of each nucleotide in the frequency vectors, as labelled by the
    # comments in the original implementation.
    # NOTE(review): this is the order T, A, C, G. cogent3 orders DNA states
    # T, C, A, G; confirm which order `nuc_freqs_dict` and the stationary
    # distribution actually use, otherwise the A and C terms are swapped.
    t_idx, a_idx, c_idx, g_idx = 0, 1, 2, 3

    STI_dict = {}
    for identifier, info in triads_info.items():
        small_tree_info = info['triads_info_small_tree']
        names = info['triads_species_names']
        matrices = small_tree_info['matrices']
        nuc_freqs = small_tree_info['nuc_freqs_dict']

        species_sti = {}
        for role in ('ingroup1', 'ingroup2'):
            species = names[role]
            observed = nuc_freqs[role]  # frequencies are keyed by role
            stationary = calculate_stationary_distribution(matrices[species])

            delta_t = observed[t_idx] - stationary[t_idx]
            delta_a = observed[a_idx] - stationary[a_idx]
            delta_c = observed[c_idx] - stationary[c_idx]
            delta_g = observed[g_idx] - stationary[g_idx]

            species_sti[species] = (
                delta_c + delta_g,  # STI1
                delta_a - delta_t,  # STI2
                delta_c - delta_g,  # STI3
            )
        STI_dict[identifier] = species_sti

    return STI_dict


In [43]:
# Inputs for a single-gene test run of `process_path` (gene ENSG00000152779).
# NOTE(review): these are absolute, machine-specific paths; the cells below
# only run on a machine with this exact directory layout.
path_test = '/Users/gulugulu/Desktop/honours/data_local/whole_genome_mammal87/triads_alignment_350_threshold/ENSG00000152779'
result_lf_dir_test = '/Users/gulugulu/repos/PuningAnalysis/results/output_data/model_fitting_result_350_threshold'
triads_info_dir_test = '/Users/gulugulu/Desktop/honours/data_local/whole_genome_mammal87/triads_350_threshold'

In [44]:
# Fit every triad alignment of the test gene; `process_path` prints the gene
# name, then each triad identifier and its species mapping as it goes.
model_fitting_result_dict = process_path(path_test, result_lf_dir_test, triads_info_dir_test)

ENSG00000152779
396
{'ingroup1': 'Ferret', 'ingroup2': 'Narwhal', 'outgroup': 'Megabat'}
115
{'ingroup1': 'Sooty_mangabey', 'ingroup2': 'Panamanian_white_faced_capuchin', 'outgroup': 'Tarsier'}
457
{'ingroup1': 'Red_fox', 'ingroup2': 'Cat', 'outgroup': 'Donkey'}
20
{'ingroup1': 'Macaque', 'ingroup2': 'Sumatran_orangutan', 'outgroup': 'Panamanian_white_faced_capuchin'}
98
{'ingroup1': 'Vervet_AGM', 'ingroup2': 'Gorilla', 'outgroup': "Ma's_night_monkey"}
61
{'ingroup1': 'Vervet_AGM', 'ingroup2': 'Arctic_ground_squirrel', 'outgroup': 'Tree_Shrew'}
302
{'ingroup1': 'Cow', 'ingroup2': 'Dog', 'outgroup': 'Megabat'}
181
{'ingroup1': 'Sooty_mangabey', 'ingroup2': 'Long_tailed_chinchilla', 'outgroup': 'Tree_Shrew'}
494
{'ingroup1': 'American_black_bear', 'ingroup2': 'Megabat', 'outgroup': 'Guinea_Pig'}
230
{'ingroup1': 'Squirrel', 'ingroup2': 'Tree_Shrew', 'outgroup': 'Greater_horseshoe_bat'}
16
{'ingroup1': 'Alpaca', 'ingroup2': 'Ferret', 'outgroup': 'Greater_horseshoe_bat'}
267
{'ingroup1': '

In [45]:
# Show the hypothesis-test result stored for each triad identifier.
model_fitting_result_dict

{'396': Statistics
      LR    df    pvalue
 -----------------------
 59.0506    36    0.0091
 -----------------------
 hypothesis    key              lnL    nfp    DLC     unique_Q
 -------------------------------------------------------------
 null          'TN93'    -1251.4650     15    True    True    
 alt           'GN'      -1221.9397     51    True    True    
 -------------------------------------------------------------,
 '115': Statistics
      LR    df    pvalue
 -----------------------
 35.9832    36    0.4694
 -----------------------
 hypothesis    key              lnL    nfp    DLC     unique_Q
 -------------------------------------------------------------
 null          'TN93'    -1025.9271     15    True    True    
 alt           'GN'      -1007.9355     51    True    True    
 -------------------------------------------------------------,
 '457': Statistics
      LR    df    pvalue
 -----------------------
 51.7731    36    0.0430
 -----------------------
 hypothesis

In [11]:
# Load one stored model-fitting result (triad 100 of gene ENSG00000160050).
# NOTE(review): absolute, machine-specific path.
model_fitting_dir = '/Users/gulugulu/Desktop/honours/data_local/whole_genome_mammal87/triads_model_fitting_info/ENSG00000160050'
model_fitting_result_dir = os.path.join(model_fitting_dir, 'model_fitting_result')
model_fitting_results_paths = glob.glob(os.path.join(model_fitting_result_dir, '*.json'))
# Derived from the directory above instead of repeating the full path, so the
# two cannot drift apart when `model_fitting_dir` is pointed at another gene.
model_fitting_results_path = os.path.join(model_fitting_result_dir, '100.json')
model_fitting_result = load_json_app(model_fitting_results_path)

In [49]:
# Map each triad identifier to the p-value of its hypothesis-test result.
p_value_dict = {
    identifier: result.pvalue
    for identifier, result in model_fitting_result_dict.items()
}
    

In [50]:
# Number of triads for which a p-value was collected.
len(p_value_dict)

100

In [51]:
# Histogram of the per-triad p-values collected in `p_value_dict`.
import plotly.express as px
px.histogram(p_value_dict.values())

In [41]:
# Repeat of the earlier count check on `p_value_dict`.
len(p_value_dict)

100

In [43]:
# Keep the identifiers of results in which no parameter (other than the
# excluded "length" and "mprobs" ones) has its `init` value within the
# tolerance of its `lower` or `upper` bound.
# NOTE(review): `model_fitting_result_alt_dict` is not defined in any visible
# cell; it must already exist in the kernel for this cell to run.
load_json_app = get_app("load_json")
exclude_params = ("length", "mprobs")
bound_tolerance = 1e-12
valid_triads_identifier = []

for identifier, model_fitting_results in model_fitting_result_alt_dict.items():
    # Parameter rules whose value sits on one of its bounds.
    vio = [
        param
        for param in model_fitting_results.get_param_rules()
        if param["par_name"] not in exclude_params
        and (
            abs(param["init"] - param["lower"]) <= bound_tolerance
            or abs(param["init"] - param["upper"]) <= bound_tolerance
        )
    ]
    if not vio:
        valid_triads_identifier.append(identifier)


In [41]:
# Identifiers that passed the bound check.
# NOTE(review): the recorded output repeats some identifiers, which the current
# filtering loop cannot produce for a dict with unique keys, and this cell's
# execution count is lower than the loop's; the output looks stale, re-run to confirm.
valid_triads_identifier

['134',
 '40',
 '99',
 '99',
 '99',
 '99',
 '76',
 '102',
 '107',
 '106',
 '13',
 '13',
 '188',
 '183',
 '63',
 '96',
 '23']

In [4]:
# NOTE(review): this name is not defined in any visible cell and the recorded
# run raised NameError; define it in an earlier cell or remove this cell.
model_fitting_result_nul_dict

NameError: name 'model_fitting_result_nul_dict' is not defined

In [15]:
# Table of the substitution models cogent3 reports, called with `model_types=None`.
from cogent3 import available_models
available_models(model_types=None)

Model Type,Abbreviation,Description
nucleotide,BH,Barry and Hartigan Discrete Time substitution model Barry and Hartigan 1987. Biometrics 43: 261–76.
nucleotide,DT,"Discrete Time substitution model (non-stationary, non-reversible). motif_length=2 makes this a dinucleotide model, motif_length=3 a trinucleotide model."
nucleotide,GN,"General Markov Nucleotide (non-stationary, non-reversible). Kaehler, Yap, Zhang, Huttley, 2015, Sys Biol 64 (2): 281–93"
nucleotide,ssGN,"strand-symmetric general Markov nucleotide (non-stationary, non-reversible). Kaehler, 2017, Journal of Theoretical Biology 420: 144–51"
nucleotide,K80,Kimura 1980
nucleotide,JC69,Jukes and Cantor's 1969 model
nucleotide,GTR,General Time Reversible nucleotide substitution model.
nucleotide,TN93,Tamura and Nei 1993 model
nucleotide,HKY85,"Hasegawa, Kishino and Yano 1985 model"
nucleotide,F81,Felsenstein's 1981 model
