In [1]:
import os
import glob
import re
from cogent3.maths.measure import jsd
import numpy as np
from cogent3 import get_app, load_aligned_seqs


In [2]:
# !pip install --upgrade 'nbformat>=4.2.0'

In [44]:
# Main variables
base_dir1 = "/Users/gulugulu/repos/EnsemblLite_folk_puning/sampled_homology_test_100"
def gather_fasta_paths(base_dir):
    pattern = os.path.join(base_dir, '*.fasta')
    # Use glob.glob to find all files matching the pattern
    fasta_files = glob.glob(pattern)
    return fasta_files

def extract_info(path):
    # Adjust the regular expression to match 'seqcoll-35' or similar patterns before '.fasta'
    match = re.search(r'/([^/]+)\.fasta$', path)
    if match:
        return match.group(1)
    else:
        return "unknown"


In [46]:
def pairwise_jsd_matrix(species_data):
    species_keys = list(species_data.keys())
    num_species = len(species_keys)
    jsd_matrix = np.zeros((num_species, num_species))  # Initialize a square matrix

    for i, species_1 in enumerate(species_keys):
        for j, species_2 in enumerate(species_keys):
            if i < j:  # To avoid recomputation, calculate only for i < j
                jsd_value = jsd(species_data[species_1], species_data[species_2])
                jsd_matrix[i, j] = jsd_value
                jsd_matrix[j, i] = jsd_value  # JSD is symmetric

    return jsd_matrix

In [47]:
homology_fasta_paths = gather_fasta_paths(base_dir1)

In [48]:
def length_divisible_by_three(row):
    return len(row) % 3 == 0

In [63]:
loader = get_app("load_unaligned", format="fasta", moltype="dna")
codon_aligner = get_app("progressive_align", "codon", unique_guides = True)
cpos3 = get_app("take_codon_positions", 3)
# omit_degens = get_app("omit_degenerates", moltype="dna")


In [64]:
path = '/Users/gulugulu/repos/sampled_homology_test/seqcoll-1.fasta'
loader = get_app("load_unaligned", format="fasta", moltype="dna")
seqs = loader(path)


In [65]:
filtered_seqs = seqs.take_seqs_if(length_divisible_by_three)

In [66]:
seqs_no_stop_codon = filtered_seqs.trim_stop_codons(strict=True)
aligned = codon_aligner(seqs_no_stop_codon)

In [67]:
aligned.information_plot()

<cogent3.draw.drawable.Drawable at 0x1a3ad8110>

In [71]:
aligned_no_degenerates = aligned.no_degenerates(motif_length = 3)

just3rd_alined_no_degenerates = cpos3(aligned_no_degenerates)
pairwise_distance = just3rd_alined_no_degenerates.distance_matrix(calc='tn93', show_progress=False, drop_invalid=True)
pairwise_distance

In [None]:
pairwise_distance_array = pairwise_distance.array
nuc_freqs = just3rd_alined_no_degenerates.probs_per_seq()
sub_nuc_freqs = {}
for key in pairwise_distance.keys():
    sub_nuc_freqs[key] = nuc_freqs[key]

pairwise_jsd = pairwise_jsd_matrix(sub_nuc_freqs)
pairwise_distance_values = pairwise_distance_array[np.triu_indices(n=pairwise_distance_array.shape[0], k=1)]
pairwise_jsd_values = pairwise_jsd[np.triu_indices(n=pairwise_jsd.shape[0], k=1)]

In [None]:
pairwise_distance.shape

(20, 20)

In [None]:
import plotly.express as px
fig = px.scatter(x=pairwise_distance_values, y=pairwise_jsd_values, labels={
                 'x': 'Pairwise Distance',
                 'y': 'Pairwise Jensen-Shannon Divergence'
             })
fig.show()

In [None]:
def create_filter_by_name(exclude_name):
    def filter_by_name(seq):
        return seq.name != exclude_name
    return filter_by_name

name_to_exclude = "sequence_to_exclude"
custom_filter = create_filter_by_name(name_to_exclude)



In [93]:
path = '/Users/gulugulu/repos/sampled_homology_all/seqcoll-15.fasta'
homology_info = extract_info(path)
seqs = loader(path)

In [94]:
# Filter out the sequence by name
seqs_filtered = seqs.take_seqs_if(length_divisible_by_three)

In [95]:
seqs_no_stop_codon = seqs_filtered.trim_stop_codons(strict=True)
aligned = codon_aligner(seqs_no_stop_codon)
aligned_no_degenerates = aligned.no_degenerates()
just3rd_alined_no_degenerates = cpos3(aligned_no_degenerates)

In [96]:
# aligned

In [99]:
aligned.information_plot()

<cogent3.draw.drawable.Drawable at 0x1a1686690>

In [98]:
# aa = seqs_filtered.get_translation()
# aligned.set_repr_policy(num_seqs=65)
# aa
pairwise_distance = just3rd_alined_no_degenerates.distance_matrix(calc='paralinear', show_progress=False, drop_invalid=True)
pairwise_distance

AttributeError: 'NotCompleted' object has no attribute 'distance_matrix'

In [None]:
pairwise_distance_array = pairwise_distance.array
nuc_freqs = just3rd_alined_no_degenerates.probs_per_seq()
sub_nuc_freqs = {}
for key in pairwise_distance.keys():
    sub_nuc_freqs[key] = nuc_freqs[key]

pairwise_jsd = pairwise_jsd_matrix(sub_nuc_freqs)
pairwise_distance_values = pairwise_distance_array[np.triu_indices(n=pairwise_distance_array.shape[0], k=1)]
pairwise_jsd_values = pairwise_jsd[np.triu_indices(n=pairwise_jsd.shape[0], k=1)]

In [None]:
import plotly.express as px
fig = px.scatter(x=pairwise_distance_values, y=pairwise_jsd_values, labels={
                 'x': 'Pairwise Distance',
                 'y': 'Pairwise Jensen-Shannon Divergence'
             },
             title="Scatter Plot of Pairwise Distances vs. Jensen-Shannon Divergence")
fig.show()

In [None]:


def get_data(base_dir):
    fasta_files_paths = gather_fasta_paths(base_dir)
    results = {}
    # prob_fasta = []
    for path in fasta_files_paths:
        loader = get_app("load_unaligned", format="fasta", moltype="dna")
        codon_aligner = get_app("progressive_align", "GNC", distance="paralinear")
        cpos3 = get_app("take_codon_positions", 3)
        homology_info = extract_info(path)
   #     try:
        seqs = loader(path)
        seqs_no_stop_codon = seqs.trim_stop_codons(strict=False)
        aligned = codon_aligner(seqs_no_stop_codon)
        aligned_no_degenerates = aligned.no_degenerates()
        just3rd_no_degenerates = cpos3(aligned_no_degenerates)
        pairwise_distance = just3rd_no_degenerates.distance_matrix(calc='logdet', show_progress=False, drop_invalid=False)
        pairwise_distance_array = pairwise_distance.array
        nuc_freqs = just3rd_no_degenerates.probs_per_seq()
        sub_nuc_freqs = {}
        for key in pairwise_distance.keys():
            sub_nuc_freqs[key] = nuc_freqs[key]
        
        pairwise_jsd = pairwise_jsd_matrix(sub_nuc_freqs)
        pairwise_distance_values = pairwise_distance_array[np.triu_indices(n=pairwise_distance_array.shape[0], k=1)]
        pairwise_jsd_values = pairwise_jsd[np.triu_indices(n=pairwise_jsd.shape[0], k=1)]
        results[homology_info] = {}
        results[homology_info]['pairwise_distance'] = pairwise_distance_values
        results[homology_info]['pairwise_jsd'] = pairwise_jsd_values
        # except AttributeError:
        #     prob_fasta.append(homology_info)

    return results #, prob_fasta



        


In [None]:
info = get_data(base_dir1)
info


{}

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_scatters(info_dict):
    # Assuming info_dict is structured with keys and 'pairwise_jsd' and 'pairwise_distance' as sub-keys
    keys = list(info_dict.keys())
    rows = int(len(keys) ** 0.5) + 1  # Calculate the number of rows for subplots
    cols = (len(keys) + rows - 1) // rows  # Calculate the number of columns

    fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'{key}' for key in keys])
    
    # Populate subplots
    for index, key in enumerate(keys, start=1):
        value = info_dict[key]
        data = {'JSD Value': value['pairwise_jsd'],
                'Distance Value': value['pairwise_distance']}
        df = pd.DataFrame(data)
        
        row = (index - 1) // cols + 1
        col = (index - 1) % cols + 1
        
        fig.add_trace(
            go.Scatter(
                x=df['JSD Value'],
                y=df['Distance Value'],
                mode='markers'
            ),
            row=row,
            col=col
        )
    
    fig.update_layout(
        height=300 * rows,  # Set a reasonable height based on number of rows
        width=300 * cols,   # Set a reasonable width based on number of columns
        title_text="Scatter Plots of 3rd codon position JSD Vs Genetic Distance",
        showlegend=False,
        yaxis_title_text='Genetic Distance',
        xaxis_title_text='JSD'
    )
    
    return fig


In [None]:
fig = plot_scatters(info)
#fig.write_image('Genetic Distance vs JSD (3rd codon position).pdf')
fig.show()

ValueError: 
The 'cols' argument to make_subplots must be an int greater than 0.
    Received value of type <class 'int'>: 0

In [None]:
# path = homology_fasta_paths[7]
# print(path)
# loader = get_app("load_unaligned", format="fasta", moltype="dna")
# codon_aligner = get_app("progressive_align", "GNC")
# seqs = loader(path)
# seqs_no_stop_codon = seqs.trim_stop_codons(strict=False)
# aligned = codon_aligner(seqs_no_stop_codon)
# aligned_no_degenerates = aligned.no_degenerates()
# cpos3 = get_app("take_codon_positions", 3)
# just3rd_no_degenerates = cpos3(aligned_no_degenerates)
# pairwise_distance = just3rd_no_degenerates.distance_matrix(calc='logdet', show_progress=True, drop_invalid=True)
# pairwise_distance = just3rd_no_degenerates.distance_matrix(calc='logdet', show_progress=True, drop_invalid=True)
# pairwise_distance_array = pairwise_distance.array
# nuc_freqs = just3rd_no_degenerates.probs_per_seq()
# sub_nuc_freqs = {}
# for key in pairwise_distance.keys():
#     sub_nuc_freqs[key] = nuc_freqs[key]
# pairwise_jsd = pairwise_jsd_matrix(sub_nuc_freqs)
# pairwise_distance_values = pairwise_distance_array[np.triu_indices(n=pairwise_distance_array.shape[0], k=1)]
# pairwise_jsd_values = pairwise_jsd[np.triu_indices(n=pairwise_jsd.shape[0], k=1)]
