In [7]:
from cogent3.maths.measure import jsd
import numpy as np
from cogent3 import get_app
import plotly.express as px
from scipy.stats import pearsonr
from clock_project.genome_analysis.homolog_analysis import gather_fasta_paths2, gather_fasta_paths, extract_info, pairwise_jsd_matrix, prepare_dataframes, jsd_genetic_distance_scatters
import statsmodels.api as sm
import json


In [8]:
loader = get_app("load_unaligned", format="fasta", moltype="dna")
codon_aligner = get_app("progressive_align", "codon", unique_guides = True)
cpos3 = get_app("take_codon_positions", 3)
def length_divisible_by_three(row):
    return len(row) % 3 == 0

In [9]:
base_dir = "/Users/gulugulu/Desktop/honours/data_local/whole_genome_mammal87/sampled_homologies"

homology_fa_paths = gather_fasta_paths2(base_dir)

In [5]:
motif_list = []
valid_homology_list = []
prob_homology_list = []
species_less_than_3 = []
small_sample_size = []
for path in homology_fa_paths:
    seqs = loader(path)
    seqs_divisible = seqs.take_seqs_if(length_divisible_by_three)
    seqs_no_stop_codon = seqs_divisible.trim_stop_codons(strict=True)
    try:
        seq_name_seqs_filtered = []
        for seq in seqs_no_stop_codon.seqs:
            seq_name = seq.get_name() 
            if (str(seq[0:3]) == 'ATG') & (seq.is_degenerate() == False): 
                seq_name_seqs_filtered.append(seq_name)
        seqs_filtered = seqs_no_stop_codon.take_seqs(seq_name_seqs_filtered)
        if seqs_filtered.num_seqs < 3:
            species_less_than_3.append(path)
        else:
            aligned = codon_aligner(seqs_filtered)
            aligned_no_degenerates = aligned.no_degenerates(motif_length=3)
            just3rd_aligned_no_degenerates = cpos3(aligned_no_degenerates)
            motif = list(just3rd_aligned_no_degenerates.get_motif_probs().values())
            motif_list.append(motif)
            pairwise_distance = just3rd_aligned_no_degenerates.distance_matrix(calc='tn93', show_progress=False, drop_invalid=True)
            pairwise_distance_array = pairwise_distance.array
            if len(np.unique(pairwise_distance_array)) < 20:
                small_sample_size.append(path)
            else:
                valid_homology_list.append(path)
    except AttributeError as e:
        prob_homology_list.append(path)

In [6]:
import json
with open ('/Users/gulugulu/repos/PuningAnalysis/results/output_data/valid_path.json', 'w') as outfile:
    json.dump(valid_homology_list, outfile)

In [26]:
len(valid_homology_list), len(prob_homology_list), len(small_sample_size), len(species_less_than_3)

(188, 542, 54, 5)

In [10]:
def get_pairwise_jsd_genetic_distance(path):
    seqs = loader(path)
    seq_name_seqs_filtered = []
    seqs_divisible = seqs.take_seqs_if(length_divisible_by_three)
    seqs_no_stop_codon = seqs_divisible.trim_stop_codons(strict=True)
    for seq in seqs_no_stop_codon.seqs:
        seq_name = seq.get_name() 
        if (str(seq[0:3]) == 'ATG') & (seq.is_degenerate() == False): 
            seq_name_seqs_filtered.append(seq_name)
    seqs_filtered = seqs_no_stop_codon.take_seqs(seq_name_seqs_filtered)
    aligned = codon_aligner(seqs_filtered)
    aligned_no_degenerates = aligned.no_degenerates()
    just3rd_alined_no_degenerates = cpos3(aligned_no_degenerates)
    pairwise_distance = just3rd_alined_no_degenerates.distance_matrix(calc='tn93', show_progress=False, drop_invalid=True)
    pairwise_distance_array = pairwise_distance.array
    nuc_freqs = just3rd_alined_no_degenerates.probs_per_seq()
    sub_nuc_freqs = {}
    for key in pairwise_distance.keys():
        sub_nuc_freqs[key] = nuc_freqs[key]

    pairwise_jsd = pairwise_jsd_matrix(sub_nuc_freqs)
    pairwise_distance_values = pairwise_distance_array[np.triu_indices(n=pairwise_distance_array.shape[0], k=1)]
    pairwise_jsd_values = pairwise_jsd[np.triu_indices(n=pairwise_jsd.shape[0], k=1)]

    return pairwise_distance_values, pairwise_jsd_values

In [17]:
# import os
# species_data = {}
# correlation_info = {}
# for path in valid_homology_list:
#     filename = os.path.basename(path)
#     gene_name = os.path.splitext(filename)[0]
#     pairwise_distance, pairwise_jsd = get_pairwise_jsd_genetic_distance(path)
#     col, p_value = pearsonr(pairwise_jsd, pairwise_distance)
#     correlation_info[gene_name] = {'correlation_factor': col, 'p_value': p_value}
#     species_data[gene_name]  = {'pairwise_distance': pairwise_distance, 'pairwise_jsd': pairwise_jsd, 'col': col, 'p_value': p_value}


# for key, value in species_data.items():
#     for subkey, subvalue in value.items():
#         if isinstance(subvalue, np.ndarray):
#             species_data[key][subkey] = subvalue.tolist()

# with open ('/Users/gulugulu/repos/PuningAnalysis/results/output_data/jsd_distance.json', 'w') as outfile2:
#     json.dump(species_data, outfile2)

In [4]:
with open ('/Users/gulugulu/repos/PuningAnalysis/results/output_data/jsd_distance.json', 'r') as infile:
    species_data = json.load(infile)

In [5]:
species_dfs = prepare_dataframes(species_data)
jsd_distance_sub_fig = jsd_genetic_distance_scatters(species_dfs)
#jsd_distance_sub_fig.show()
jsd_distance_sub_fig.write_image('jsd_genetic_distance_scatters.pdf')

In [13]:
path = '/Users/gulugulu/Desktop/honours/data_local/whole_genome_mammal87/sampled_homologies/ENSG00000142661.fa'
pairwise_distance_values, pairwise_jsd_values = get_pairwise_jsd_genetic_distance(path)
fig = px.scatter(x = np.log(pairwise_jsd_values), y = pairwise_distance_values, labels = {'x': 'Pairwise Jensen-Shannon Divergence', 'y': 'Pairwise distance (TN93)'})
fig.update_layout(
    yaxis_title_font={'size': 20},  
    xaxis_title_font={'size': 20},  # Bigger x-axis title font size
    template='plotly_white'
)
fig.show()

In [31]:
px.histogram(np.log(pairwise_jsd_values))

In [2]:
base_dir1 = "/Users/gulugulu/Desktop/honours/data_local/whole_genome_mammal87/sampled_homolog/sampled_homologies"
homology_fasta_paths = gather_fasta_paths(base_dir1)


In [12]:
motif_list = []
valid_homology_list = []
prob_homology_list = []
species_less_than_3 = []
small_sample_size = []
for path in homology_fasta_paths:
    seqs = loader(path)
    seqs_divisible = seqs.take_seqs_if(length_divisible_by_three)
    seqs_no_stop_codon = seqs_divisible.trim_stop_codons(strict=True)
    try:
        seq_name_seqs_filtered = []
        for seq in seqs_no_stop_codon.seqs:
            seq_name = seq.get_name() 
            if (str(seq[0:3]) == 'ATG') & (seq.is_degenerate() == False): 
                seq_name_seqs_filtered.append(seq_name)
        seqs_filtered = seqs_no_stop_codon.take_seqs(seq_name_seqs_filtered)
        if seqs_filtered.num_seqs < 3:
            species_less_than_3.append(path)
        else:
            aligned = codon_aligner(seqs_filtered)
            aligned_no_degenerates = aligned.no_degenerates(motif_length=3)
            just3rd_aligned_no_degenerates = cpos3(aligned_no_degenerates)
            motif = list(just3rd_aligned_no_degenerates.get_motif_probs().values())
            motif_list.append(motif)
            pairwise_distance = just3rd_aligned_no_degenerates.distance_matrix(calc='tn93', show_progress=False, drop_invalid=True)
            pairwise_distance_array = pairwise_distance.array
            if len(np.unique(pairwise_distance_array)) < 20:
                small_sample_size.append(path)
            else:
                valid_homology_list.append(path)
    except AttributeError as e:
        prob_homology_list.append(path)

In [19]:
species_data = {}
correlation_info = {}
for path in valid_homology_list:
    info = extract_info(path)
    pairwise_distance, pairwise_jsd = get_pairwise_jsd_genetic_distance(path)
    col, p_value = pearsonr(pairwise_jsd, pairwise_distance)
    correlation_info[info] = {'correlation_factor': col, 'p_value': p_value}
    species_data[info]  = {'pairwise_distance': pairwise_distance, 'pairwise_jsd': pairwise_jsd, 'col': col, 'p_value': p_value}

species_dfs = prepare_dataframes(species_data)
jsd_distance_sub_fig = jsd_genetic_distance_scatters(species_dfs)
jsd_distance_sub_fig.show()


In [94]:
path = '/Users/gulugulu/Desktop/honours/data_local/whole_genome_mammal87/sampled_homolog/sampled_homologies/seqcoll-351.fasta'
seqs = loader(path)
seqs_filtered = seqs.take_seqs_if(length_divisible_by_three)
seq_name_with_start_codon = []
for seq in seqs.seqs:
    seq_name = seq.get_name() 
    if (str(seq[0:3]) == 'ATG') & (seq.is_degenerate() == False): 
        seq_name_with_start_codon.append(seq.get_name())
seqs_with_start_codon = seqs.take_seqs(seq_name_with_start_codon)
seqs_with_start_no_end = seqs_with_start_codon.trim_stop_codons(strict=True)
aligned_with_start_codon = codon_aligner(seqs_with_start_no_end)

In [95]:
just3rd_alined_with_start_codon_no_degenerates = cpos3(aligned_with_start_codon.no_degenerates())
pairwise_distance = just3rd_alined_with_start_codon_no_degenerates.distance_matrix(calc='tn93', show_progress=False, drop_invalid=True)
pairwise_distance_array = pairwise_distance.array

nuc_freqs = just3rd_alined_with_start_codon_no_degenerates.probs_per_seq()
sub_nuc_freqs = {}
for key in pairwise_distance.keys():
    sub_nuc_freqs[key] = nuc_freqs[key]

pairwise_jsd = pairwise_jsd_matrix(sub_nuc_freqs)
pairwise_distance_values = pairwise_distance_array[np.triu_indices(n=pairwise_distance_array.shape[0], k=1)]
pairwise_jsd_values = pairwise_jsd[np.triu_indices(n=pairwise_jsd.shape[0], k=1)]

In [None]:
fig = px.scatter(x = pairwise_jsd_values, y = pairwise_distance_values, labels = {'x': 'Pairwise Jensen-Shannon Divergence', 'y': 'Pairwise distance (TN93)'})
fig.update_layout(
    yaxis_title_font={'size': 20},  
    xaxis_title_font={'size': 20},  # Bigger x-axis title font size
    template='plotly_white'
)
fig.show()
fig.write_image('../../results/figures/JSD_vs_tn93_distance.pdf')

In [14]:
from cogent3.evolve.models import GN
from cogent3 import get_app, make_tree

dist_cal = get_app("fast_slow_dist", fast_calc="tn93", moltype="dna")
est_tree = get_app("quick_tree", drop_invalid=False)
tree_func = dist_cal + est_tree
GN_model = get_app("model", "TN93", tree_func=tree_func, unique_trees = True,time_het="max", show_progress = False, opt_args = dict(max_restarts=5))
res = GN_model(aligned_with_start_codon)

NameError: name 'aligned_with_start_codon' is not defined

In [90]:
res.lf.tree.get_figure(show_support=True, threshold=0.8, height = 1200, width = 1200)

<cogent3.draw.dendrogram.Dendrogram at 0x170d63770>

In [96]:
# Add a constant to the input features (intercept term)
X = sm.add_constant(np.log10(pairwise_jsd_values + 1))

# Fit the linear regression model
model = sm.OLS(pairwise_distance_values, X).fit()

# Print the summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.488
Model:                            OLS   Adj. R-squared:                  0.488
Method:                 Least Squares   F-statistic:                     1576.
Date:                Fri, 14 Jun 2024   Prob (F-statistic):          1.67e-242
Time:                        16:30:36   Log-Likelihood:                 277.29
No. Observations:                1653   AIC:                            -550.6
Df Residuals:                    1651   BIC:                            -539.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3120      0.006     48.466      0.0