In [218]:
import os
import glob
import re
from cogent3.maths.measure import jsd
import numpy as np
from cogent3 import get_app


In [219]:
# Main variables
# Directory that is scanned for the input '*.fasta' files (see gather_fasta_paths).
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
base_dir1 = "/Users/gulugulu/sampled"
def gather_fasta_paths(base_dir):
    """Return the paths of all '*.fasta' files located directly in ``base_dir``.

    Parameters
    ----------
    base_dir : str
        Directory to search (not searched recursively).

    Returns
    -------
    list of str
        Matching paths (``base_dir`` joined with each file name), sorted
        alphabetically. Empty if nothing matches.
    """
    pattern = os.path.join(base_dir, '*.fasta')
    # glob.glob yields matches in an arbitrary, filesystem-dependent order;
    # sorting makes positional lookups into the result reproducible.
    return sorted(glob.glob(pattern))

def extract_info(path):
    """Return the file name of a '.fasta' path without directory or extension.

    For example ``'/some/dir/seqcoll-35.fasta'`` gives ``'seqcoll-35'``.

    Parameters
    ----------
    path : str
        Path to a file; it may be a bare file name without any directory part.

    Returns
    -------
    str
        The file name stem, or ``"unknown"`` when ``path`` does not end in
        '.fasta' or has nothing in front of that suffix.
    """
    suffix = '.fasta'
    # os.path.basename copes with bare file names and with the platform's
    # path separator, which a hard-coded '/' in a regex does not.
    name = os.path.basename(path)
    if name.endswith(suffix) and len(name) > len(suffix):
        return name[:-len(suffix)]
    return "unknown"


In [220]:
# Collect every '*.fasta' path found directly inside base_dir1.
homology_fasta_paths = gather_fasta_paths(base_dir1)

In [221]:
# Position of one specific file in the collected list; the path is built from
# base_dir1 so the lookup stays in sync with the configured directory.
homology_fasta_paths.index(os.path.join(base_dir1, 'seqcoll-75.fasta'))

32

In [222]:
def pairwise_jsd_matrix(species_data):
    """Build a square matrix of pairwise ``jsd`` values.

    Parameters
    ----------
    species_data : dict
        Maps a label to the data handed to ``jsd`` for that label.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, n), with rows and columns in the order of
        ``species_data.keys()``. Entry [i, j] is ``jsd`` of items i and j;
        the diagonal is left at zero.
    """
    labels = list(species_data.keys())
    size = len(labels)
    matrix = np.zeros((size, size))

    # Visit each unordered pair once (upper triangle) and mirror the value
    # into the lower triangle.
    for row in range(size):
        for col in range(row + 1, size):
            value = jsd(species_data[labels[row]], species_data[labels[col]])
            matrix[row, col] = value
            matrix[col, row] = value

    return matrix



In [566]:
def get_data(base_dir):
    """Compute pairwise distances and pairwise JSD values for every fasta file.

    For each '*.fasta' file found by ``gather_fasta_paths(base_dir)`` the
    sequences are loaded, trimmed of stop codons, aligned, stripped of
    degenerate characters and reduced to codon position 3. A 'logdet'
    distance matrix and the per-sequence nucleotide frequencies are computed
    from that alignment, and the upper triangles (excluding the diagonal) of
    the distance matrix and of the pairwise JSD matrix are stored.

    Parameters
    ----------
    base_dir : str
        Directory containing the fasta files.

    Returns
    -------
    tuple
        ``(results, prob_fasta)``. ``results`` maps the name returned by
        ``extract_info`` to a dict with keys ``'pairwise_distance'`` and
        ``'pairwise_jsd'``. ``prob_fasta`` lists the names of the files for
        which any processing step raised ``AttributeError``.
    """
    results = {}
    prob_fasta = []
    for path in gather_fasta_paths(base_dir):
        # NOTE(review): the apps are rebuilt for every file. Presumably this
        # avoids carrying state (e.g. a guide tree) from one alignment to the
        # next — confirm before hoisting them out of the loop.
        loader = get_app("load_unaligned", format="fasta", moltype="dna")
        codon_aligner = get_app("progressive_align", "GNC", distance="paralinear")
        cpos3 = get_app("take_codon_positions", 3)
        homology_info = extract_info(path)
        try:
            trimmed = loader(path).trim_stop_codons(strict=False)
            third_positions = cpos3(codon_aligner(trimmed).no_degenerates())
            distances = third_positions.distance_matrix(calc='logdet', show_progress=False, drop_invalid=True)
            distance_array = distances.array
            all_freqs = third_positions.probs_per_seq()
            # Keep frequencies only for the names present in the distance
            # matrix, so both matrices cover the same sequences.
            kept_freqs = {name: all_freqs[name] for name in distances.keys()}

            jsd_square = pairwise_jsd_matrix(kept_freqs)
            # k=1 selects the strict upper triangle: one value per pair.
            distance_upper = distance_array[np.triu_indices(n=distance_array.shape[0], k=1)]
            jsd_upper = jsd_square[np.triu_indices(n=jsd_square.shape[0], k=1)]
            results[homology_info] = {
                'pairwise_distance': distance_upper,
                'pairwise_jsd': jsd_upper,
            }
        except AttributeError:
            # NOTE(review): presumably a failed step returns an object lacking
            # the expected methods/attributes — confirm. The file is recorded
            # and skipped.
            prob_fasta.append(homology_info)

    return results, prob_fasta



        


In [567]:
# info is the (results, prob_fasta) tuple returned by get_data.
info = get_data(base_dir1)

numseqs=2 not equal to numtips=2
These were different: {'ursus_maritimus-ENSUMAG00000017028', 'ursus americanus-ENSUAMG00000019808', 'ursus maritimus-ENSUMAG00000017028', 'ursus_americanus-ENSUAMG00000019808'}
numseqs=2 not equal to numtips=2
These were different: {'macaca nemestrina-ENSMNEG00000028102', 'macaca fascicularis-ENSMFAG00000031225', 'macaca_fascicularis-ENSMFAG00000031225', 'macaca_nemestrina-ENSMNEG00000028102'}
numseqs=2 not equal to numtips=2
These were different: {'marmota_marmota_marmota-ENSMMMG00000019697', 'sciurus vulgaris-ENSSVLG00005007040', 'marmota marmota marmota-ENSMMMG00000019697', 'sciurus_vulgaris-ENSSVLG00005007040'}
numseqs=2 not equal to numtips=2
These were different: {'homo sapiens-ENSG00000281406', 'pongo_abelii-ENSPPYG00000038315', 'homo_sapiens-ENSG00000281406', 'pongo abelii-ENSPPYG00000038315'}
numseqs=2 not equal to numtips=2
These were different: {'delphinapterus leucas-ENSDLEG00000008359', 'delphinapterus_leucas-ENSDLEG00000008359', 'monodon m

In [578]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_scatters(info_dict):
    """Draw one JSD-vs-distance scatter plot per entry of ``info_dict``.

    Parameters
    ----------
    info_dict : dict
        Maps a name to a dict with the keys ``'pairwise_jsd'`` (x values) and
        ``'pairwise_distance'`` (y values).

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with a grid of subplots, one per key and titled with that key,
        filled row by row.
    """
    keys = list(info_dict.keys())
    rows = int(len(keys) ** 0.5) + 1  # Roughly square grid
    # Ceiling division; at least one column so an empty info_dict does not
    # request a grid with zero columns.
    cols = max(1, (len(keys) + rows - 1) // rows)

    fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'{key}' for key in keys])

    # Populate subplots, filling the grid row by row
    for index, key in enumerate(keys):
        value = info_dict[key]
        grid_row, grid_col = divmod(index, cols)

        fig.add_trace(
            go.Scatter(
                x=value['pairwise_jsd'],
                y=value['pairwise_distance'],
                mode='markers'
            ),
            row=grid_row + 1,  # plotly grid positions are 1-based
            col=grid_col + 1
        )

    fig.update_layout(
        height=300 * rows,  # 300 px per row
        width=300 * cols,   # 300 px per column
        title_text="Scatter Plots of 3rd codon position JSD Vs Genetic Distance",
        showlegend=False
    )
    # Axis titles set through update_layout only reach the first subplot;
    # update_xaxes / update_yaxes apply them to every subplot.
    fig.update_xaxes(title_text='JSD')
    fig.update_yaxes(title_text='Genetic Distance')

    return fig


In [579]:
# info[0] is the per-file results dict (first element of get_data's return value).
fig = plot_scatters(info[0])
# Save the figure as a PDF in the current working directory, then display it.
fig.write_image('Genetic Distance vs JSD (3rd codon position).pdf')
fig.show()

In [580]:
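# Debugging scratch (kept commented out): manual walk-through of the steps in
# get_data() for a single fasta file.
# path = homology_fasta_paths[7]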
# print(path)
# loader = get_app("load_unaligned", format="fasta", moltype="dna")
# codon_aligner = get_app("progressive_align", "GNC")
# seqs = loader(path)
# seqs_no_stop_codon = seqs.trim_stop_codons(strict=False)
# aligned = codon_aligner(seqs_no_stop_codon)
# aligned_no_degenerates = aligned.no_degenerates()
# cpos3 = get_app("take_codon_positions", 3)
# just3rd_no_degenerates = cpos3(aligned_no_degenerates)
# pairwise_distance = just3rd_no_degenerates.distance_matrix(calc='logdet', show_progress=True, drop_invalid=True)
# pairwise_distance = just3rd_no_degenerates.distance_matrix(calc='logdet', show_progress=True, drop_invalid=True)
# pairwise_distance_array = pairwise_distance.array
# nuc_freqs = just3rd_no_degenerates.probs_per_seq()
# sub_nuc_freqs = {}
# for key in pairwise_distance.keys():
#     sub_nuc_freqs[key] = nuc_freqs[key]
# pairwise_jsd = pairwise_jsd_matrix(sub_nuc_freqs)
# pairwise_distance_values = pairwise_distance_array[np.triu_indices(n=pairwise_distance_array.shape[0], k=1)]
# pairwise_jsd_values = pairwise_jsd[np.triu_indices(n=pairwise_jsd.shape[0], k=1)]
