In [1]:
import os
import glob
import re
from cogent3.maths.measure import jsd
import numpy as np
from cogent3 import get_app, load_aligned_seqs


In [2]:
# !pip install --upgrade 'nbformat>=4.2.0'

In [87]:
# Main variables
base_dir1 = "/Users/gulugulu/sampled_homology_200"
def gather_fasta_paths(base_dir):
    pattern = os.path.join(base_dir, '*.fasta')
    # Use glob.glob to find all files matching the pattern
    fasta_files = glob.glob(pattern)
    return fasta_files

def extract_info(path):
    # Adjust the regular expression to match 'seqcoll-35' or similar patterns before '.fasta'
    match = re.search(r'/([^/]+)\.fasta$', path)
    if match:
        return match.group(1)
    else:
        return "unknown"


In [88]:
def pairwise_jsd_matrix(species_data):
    species_keys = list(species_data.keys())
    num_species = len(species_keys)
    jsd_matrix = np.zeros((num_species, num_species))  # Initialize a square matrix

    for i, species_1 in enumerate(species_keys):
        for j, species_2 in enumerate(species_keys):
            if i < j:  # To avoid recomputation, calculate only for i < j
                jsd_value = jsd(species_data[species_1], species_data[species_2])
                jsd_matrix[i, j] = jsd_value
                jsd_matrix[j, i] = jsd_value  # JSD is symmetric

    return jsd_matrix

In [180]:
homology_fasta_paths = gather_fasta_paths(base_dir1)
homology_fasta_paths

['/Users/gulugulu/sampled_homology_200/seqcoll-35.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-8.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-187.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-69.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-102.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-56.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-145.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-100.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-185.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-178.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-96.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-72.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-37.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-124.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-158.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-165.fasta',
 '/Users/gulugulu/sampled_homology_200/seqcoll-120.fasta',
 '/Us

In [181]:
loader = get_app("load_unaligned", format="fasta", moltype="dna")
codon_aligner = get_app("progressive_align", "codon", unique_guides = True)
cpos3 = get_app("take_codon_positions", 3)
# omit_degens = get_app("omit_degenerates", moltype="dna")
def length_divisible_by_three(row):
    return len(row) % 3 == 0

In [238]:
motif_list = []

In [257]:
motif_list = []
prob_homology_list = []
species_less_than_3 = []
for path in homology_fasta_paths:
    seqs = loader(path)
    if seqs.num_seqs < 3:
        species_less_than_3.append(path)
    else:
        try:
            filtered_seqs = seqs.take_seqs_if(length_divisible_by_three)
            seqs_no_stop_codon = filtered_seqs.trim_stop_codons(strict=True)
            aligned = codon_aligner(seqs_no_stop_codon)
            aligned_no_degenerates = aligned.no_degenerates(motif_length=3)
            just3rd_aligned_no_degenerates = cpos3(aligned_no_degenerates)
            motif = list(just3rd_aligned_no_degenerates.get_motif_probs().values())
            motif_list.append(motif)
        except AttributeError as e:
            prob_homology_list.append(path)

numseqs=2 not equal to numtips=2
These were different: {'mus spretus-MGP SPRETEiJ G0023182', 'mus_spicilegus-ENSMSIG00000020523', 'mus_spretus-MGP_SPRETEiJ_G0023182', 'mus spicilegus-ENSMSIG00000020523'}


In [254]:
len(prob_homology_list)

41

In [244]:
path = homology_fasta_paths[1]
seqs = loader(path)

In [258]:
motif_list

[[0.23227969348659003,
  0.33524904214559387,
  0.18582375478927204,
  0.24664750957854406],
 [0.22364672364672364,
  0.2535612535612536,
  0.15289648622981955,
  0.36989553656220325],
 [0.1731958762886598, 0.30927835051546393, 0.2, 0.31752577319587627],
 [0.0456140350877193,
  0.7824561403508772,
  0.02280701754385965,
  0.14912280701754385],
 [0.2977293790546803,
  0.14874884151992585,
  0.346501390176089,
  0.2070203892493049],
 [0.2833333333333333,
  0.19166666666666668,
  0.14583333333333334,
  0.37916666666666665],
 [0.36289592760180994,
  0.2660633484162896,
  0.17647058823529413,
  0.19457013574660634],
 [0.0, 0.0, 0.0, 1.0],
 [0.21994535519125682,
  0.3592896174863388,
  0.16256830601092895,
  0.2581967213114754],
 [0.16266666666666665, 0.2613333333333333, 0.24, 0.336],
 [0.2804814233385662,
  0.2360020931449503,
  0.28309785452642594,
  0.20041862899005755],
 [0.2317862165963432,
  0.3457102672292546,
  0.19184247538677918,
  0.23066104078762306],
 [0.23704268292682926,
  0.2

In [225]:
pairwise_distance = just3rd_alined_no_degenerates.distance_matrix(calc='paralinear', show_progress=False, drop_invalid=True)

In [226]:
pairwise_distance

names,ailuropoda_melanoleuca-ENSAMEG00000001822,aotus_nancymaae-ENSANAG00000027904,balaenoptera_musculus-ENSBMSG00010002146,callithrix_jacchus-ENSCJAG00000038687,camelus_dromedarius-ENSCDRG00005002177,canis_lupus_dingo-ENSCAFG00020014499,canis_lupus_familiaris-ENSCAFG00845018684,carlito_syrichta-ENSTSYG00000008753,cebus_imitator-ENSCCAG00000029698,cercocebus_atys-ENSCATG00000039675,chlorocebus_sabaeus-ENSCSAG00000009793,delphinapterus_leucas-ENSDLEG00000020809,echinops_telfairi-ENSETEG00000013620,equus_asinus-ENSEASG00005004638,equus_caballus-ENSECAG00000021621,erinaceus_europaeus-ENSEEUG00000012673,gorilla_gorilla-ENSGGOG00000004602,homo_sapiens-ENSG00000120162,ictidomys_tridecemlineatus-ENSSTOG00000024167,macaca_fascicularis-ENSMFAG00000041346,macaca_mulatta-ENSMMUG00000062843,macaca_nemestrina-ENSMNEG00000032479,mandrillus_leucophaeus-ENSMLEG00000033020,marmota_marmota_marmota-ENSMMMG00000009010,mesocricetus_auratus-ENSMAUG00000008957,microcebus_murinus-ENSMICG00000004847,monodon_monoceros-ENSMMNG00015020353,mus_caroli-MGP_CAROLIEiJ_G0025887,mus_musculus-ENSMUSG00000073910,mus_spicilegus-ENSMSIG00000025868,mus_spretus-MGP_SPRETEiJ_G0026836,nomascus_leucogenys-ENSNLEG00000004706,octodon_degus-ENSODEG00000002400,otolemur_garnettii-ENSOGAG00000029287,pan_paniscus-ENSPPAG00000042835,pan_troglodytes-ENSPTRG00000020837,peromyscus_maniculatus_bairdii-ENSPEMG00000029804,phocoena_sinus-ENSPSNG00000001987,physeter_catodon-ENSPCTG00005018315,pongo_abelii-ENSPPYG00000019156,prolemur_simus-ENSPSMG00000011151,propithecus_coquereli-ENSPCOG00000020239,pteropus_vampyrus-ENSPVAG00000009661,rhinolophus_ferrumequinum-ENSRFEG00010019145,rhinopithecus_bieti-ENSRBIG00000037786,rhinopithecus_roxellana-ENSRROG00000040795,sorex_araneus-ENSSARG00000010163,sus_scrofa-ENSSSCG00000038811,tupaia_belangeri-ENSTBEG00000006444,tursiops_truncatus-ENSTTRG00000005481,urocitellus_parryii-ENSUPAG00010008004,ursus_americanus-ENSUAMG00000026746,ursus_maritimus-ENSUMAG00000005779,vicugna_pacos-ENSVPAG00000006406,vulpes_vulpes-ENSVVUG00000021206
ailuropoda_melanoleuca-ENSAMEG00000001822,0.0000,0.4112,0.2574,0.4112,0.1922,0.1415,0.1415,0.3006,0.4645,0.3743,0.2924,0.2370,0.1596,0.1827,0.1827,0.2625,0.4268,0.4224,0.2727,0.3791,0.3791,0.3743,0.3743,0.2504,0.4298,0.2614,0.2370,0.4191,0.4064,0.4064,0.4064,0.3743,0.4380,0.2526,0.4224,0.4224,0.3672,0.2443,0.2095,0.2732,0.2995,0.3395,0.1935,0.3976,0.3743,0.3743,0.3166,0.2451,0.1864,0.2095,0.2685,0.0000,0.0000,0.2177,0.1415
aotus_nancymaae-ENSANAG00000027904,0.4112,0.0000,0.5073,0.0000,0.2961,0.2400,0.2400,0.0829,0.0173,0.1139,0.1675,0.4922,0.4088,0.2127,0.2127,0.4072,0.1130,0.0880,0.3750,0.1389,0.1389,0.1139,0.1139,0.2731,0.4083,0.2151,0.4922,0.4433,0.5239,0.5239,0.5239,0.0660,0.4904,0.1921,0.0880,0.0880,0.5395,0.5016,0.4485,0.1139,0.2495,0.2179,0.2881,0.3867,0.0660,0.0660,0.3818,0.4165,0.2852,0.4485,0.3587,0.4112,0.4112,0.2999,0.2400
balaenoptera_musculus-ENSBMSG00010002146,0.2574,0.5073,0.0000,0.5073,0.2667,0.2567,0.2567,0.3224,0.5493,0.3602,0.3385,0.0485,0.2302,0.2624,0.2624,0.4097,0.4327,0.4950,0.3152,0.2979,0.2979,0.3602,0.3602,0.2868,0.4432,0.2955,0.0485,0.3649,0.3548,0.3548,0.3548,0.4503,0.2743,0.3547,0.4950,0.4950,0.3921,0.0466,0.0254,0.3602,0.3433,0.3790,0.2539,0.4698,0.4503,0.4503,0.2117,0.2344,0.2227,0.0254,0.3200,0.2574,0.2574,0.2969,0.2567
callithrix_jacchus-ENSCJAG00000038687,0.4112,0.0000,0.5073,0.0000,0.2961,0.2400,0.2400,0.0829,0.0173,0.1139,0.1675,0.4922,0.4088,0.2127,0.2127,0.4072,0.1130,0.0880,0.3750,0.1389,0.1389,0.1139,0.1139,0.2731,0.4083,0.2151,0.4922,0.4433,0.5239,0.5239,0.5239,0.0660,0.4904,0.1921,0.0880,0.0880,0.5395,0.5016,0.4485,0.1139,0.2495,0.2179,0.2881,0.3867,0.0660,0.0660,0.3818,0.4165,0.2852,0.4485,0.3587,0.4112,0.4112,0.2999,0.2400
camelus_dromedarius-ENSCDRG00005002177,0.1922,0.2961,0.2667,0.2961,0.0000,0.1400,0.1400,0.2151,0.3269,0.2543,0.2814,0.2501,0.1410,0.1633,0.1633,0.3497,0.2906,0.2839,0.2773,0.2610,0.2610,0.2543,0.2543,0.2489,0.3632,0.1856,0.2501,0.3807,0.4599,0.4599,0.4599,0.2543,0.3785,0.2337,0.2839,0.2839,0.5142,0.2546,0.2191,0.3444,0.2238,0.2570,0.1942,0.2964,0.2543,0.2543,0.3110,0.1925,0.2493,0.2191,0.2017,0.1922,0.1922,0.0200,0.1400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
urocitellus_parryii-ENSUPAG00010008004,0.2685,0.3587,0.3200,0.3587,0.2017,0.2299,0.2299,0.2696,0.3842,0.3110,0.2415,0.3137,0.2396,0.2024,0.2024,0.2974,0.3387,0.3731,0.0427,0.2766,0.2766,0.3110,0.3110,0.0438,0.3537,0.1769,0.3137,0.3347,0.3948,0.3948,0.3948,0.3110,0.2386,0.1984,0.3731,0.3731,0.5503,0.3159,0.2838,0.3110,0.2395,0.2213,0.3169,0.2909,0.3110,0.3110,0.2882,0.2593,0.2416,0.2838,0.0000,0.2685,0.2685,0.1818,0.2299
ursus_americanus-ENSUAMG00000026746,0.0000,0.4112,0.2574,0.4112,0.1922,0.1415,0.1415,0.3006,0.4645,0.3743,0.2924,0.2370,0.1596,0.1827,0.1827,0.2625,0.4268,0.4224,0.2727,0.3791,0.3791,0.3743,0.3743,0.2504,0.4298,0.2614,0.2370,0.4191,0.4064,0.4064,0.4064,0.3743,0.4380,0.2526,0.4224,0.4224,0.3672,0.2443,0.2095,0.2732,0.2995,0.3395,0.1935,0.3976,0.3743,0.3743,0.3166,0.2451,0.1864,0.2095,0.2685,0.0000,0.0000,0.2177,0.1415
ursus_maritimus-ENSUMAG00000005779,0.0000,0.4112,0.2574,0.4112,0.1922,0.1415,0.1415,0.3006,0.4645,0.3743,0.2924,0.2370,0.1596,0.1827,0.1827,0.2625,0.4268,0.4224,0.2727,0.3791,0.3791,0.3743,0.3743,0.2504,0.4298,0.2614,0.2370,0.4191,0.4064,0.4064,0.4064,0.3743,0.4380,0.2526,0.4224,0.4224,0.3672,0.2443,0.2095,0.2732,0.2995,0.3395,0.1935,0.3976,0.3743,0.3743,0.3166,0.2451,0.1864,0.2095,0.2685,0.0000,0.0000,0.2177,0.1415
vicugna_pacos-ENSVPAG00000006406,0.2177,0.2999,0.2969,0.2999,0.0200,0.1702,0.1702,0.2541,0.3307,0.2581,0.2852,0.2803,0.1672,0.1880,0.1880,0.3238,0.2945,0.2877,0.2499,0.2649,0.2649,0.2581,0.2581,0.2289,0.4073,0.2285,0.2803,0.4392,0.5283,0.5283,0.5283,0.2581,0.3834,0.2761,0.2877,0.2877,0.5590,0.2865,0.2493,0.3482,0.2749,0.2994,0.2220,0.3328,0.2581,0.2581,0.3628,0.2221,0.2981,0.2493,0.1818,0.2177,0.2177,0.0000,0.1702


In [227]:
pairwise_distance_array = pairwise_distance.array
nuc_freqs = just3rd_alined_no_degenerates.probs_per_seq()
sub_nuc_freqs = {}
for key in pairwise_distance.keys():
    sub_nuc_freqs[key] = nuc_freqs[key]

pairwise_jsd = pairwise_jsd_matrix(sub_nuc_freqs)
pairwise_distance_values = pairwise_distance_array[np.triu_indices(n=pairwise_distance_array.shape[0], k=1)]
pairwise_jsd_values = pairwise_jsd[np.triu_indices(n=pairwise_jsd.shape[0], k=1)]

In [228]:
import plotly.express as px
fig = px.scatter(x=pairwise_distance_values, y=pairwise_jsd_values, labels={
                 'x': 'Pairwise Distance',
                 'y': 'Pairwise Jensen-Shannon Divergence'
             })
fig.show()

In [229]:
# def create_filter_by_name(exclude_name):
#     def filter_by_name(seq):
#         return seq.name != exclude_name
#     return filter_by_name

# name_to_exclude = "sequence_to_exclude"
# custom_filter = create_filter_by_name(name_to_exclude)



In [230]:
# path = '/Users/gulugulu/repos/sampled_homology_all/seqcoll-24.fasta'
# homology_info = extract_info(path)
# seqs = loader(path)

In [231]:
# # Filter out the sequence by name
# seqs_filtered = seqs.take_seqs_if(length_divisible_by_three)
# seqs_no_stop_codon = seqs_filtered.trim_stop_codons(strict=True)
# aligned = codon_aligner(seqs_no_stop_codon)
# aligned_no_degenerates = aligned.no_degenerates()
# just3rd_alined_no_degenerates = cpos3(aligned_no_degenerates)
# pairwise_distance = just3rd_alined_no_degenerates.distance_matrix(calc='paralinear', show_progress=False, drop_invalid=True)
# pairwise_distance
# pairwise_distance_array = pairwise_distance.array
# nuc_freqs = just3rd_alined_no_degenerates.probs_per_seq()
# sub_nuc_freqs = {}
# for key in pairwise_distance.keys():
#     sub_nuc_freqs[key] = nuc_freqs[key]

# pairwise_jsd = pairwise_jsd_matrix(sub_nuc_freqs)
# pairwise_distance_values = pairwise_distance_array[np.triu_indices(n=pairwise_distance_array.shape[0], k=1)]
# pairwise_jsd_values = pairwise_jsd[np.triu_indices(n=pairwise_jsd.shape[0], k=1)]

In [232]:
# import plotly.express as px
# fig = px.scatter(x=pairwise_distance_values, y=pairwise_jsd_values, labels={
#                  'x': 'Pairwise Distance',
#                  'y': 'Pairwise Jensen-Shannon Divergence'
#              },
#              title="Scatter Plot of Pairwise Distances vs. Jensen-Shannon Divergence")
# fig.show()

In [233]:


# def get_data(base_dir):
#     fasta_files_paths = gather_fasta_paths(base_dir)
#     results = {}
#     # prob_fasta = []
#     for path in fasta_files_paths:
#         loader = get_app("load_unaligned", format="fasta", moltype="dna")
#         codon_aligner = get_app("progressive_align", "GNC", distance="paralinear")
#         cpos3 = get_app("take_codon_positions", 3)
#         homology_info = extract_info(path)
#    #     try:
#         seqs = loader(path)
#         seqs_no_stop_codon = seqs.trim_stop_codons(strict=False)
#         aligned = codon_aligner(seqs_no_stop_codon)
#         aligned_no_degenerates = aligned.no_degenerates()
#         just3rd_no_degenerates = cpos3(aligned_no_degenerates)
#         pairwise_distance = just3rd_no_degenerates.distance_matrix(calc='logdet', show_progress=False, drop_invalid=False)
#         pairwise_distance_array = pairwise_distance.array
#         nuc_freqs = just3rd_no_degenerates.probs_per_seq()
#         sub_nuc_freqs = {}
#         for key in pairwise_distance.keys():
#             sub_nuc_freqs[key] = nuc_freqs[key]
        
#         pairwise_jsd = pairwise_jsd_matrix(sub_nuc_freqs)
#         pairwise_distance_values = pairwise_distance_array[np.triu_indices(n=pairwise_distance_array.shape[0], k=1)]
#         pairwise_jsd_values = pairwise_jsd[np.triu_indices(n=pairwise_jsd.shape[0], k=1)]
#         results[homology_info] = {}
#         results[homology_info]['pairwise_distance'] = pairwise_distance_values
#         results[homology_info]['pairwise_jsd'] = pairwise_jsd_values
#         # except AttributeError:
#         #     prob_fasta.append(homology_info)

#     return results #, prob_fasta



        


In [234]:
# import pandas as pd
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# def plot_scatters(info_dict):
#     # Assuming info_dict is structured with keys and 'pairwise_jsd' and 'pairwise_distance' as sub-keys
#     keys = list(info_dict.keys())
#     rows = int(len(keys) ** 0.5) + 1  # Calculate the number of rows for subplots
#     cols = (len(keys) + rows - 1) // rows  # Calculate the number of columns

#     fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'{key}' for key in keys])
    
#     # Populate subplots
#     for index, key in enumerate(keys, start=1):
#         value = info_dict[key]
#         data = {'JSD Value': value['pairwise_jsd'],
#                 'Distance Value': value['pairwise_distance']}
#         df = pd.DataFrame(data)
        
#         row = (index - 1) // cols + 1
#         col = (index - 1) % cols + 1
        
#         fig.add_trace(
#             go.Scatter(
#                 x=df['JSD Value'],
#                 y=df['Distance Value'],
#                 mode='markers'
#             ),
#             row=row,
#             col=col
#         )
    
#     fig.update_layout(
#         height=300 * rows,  # Set a reasonable height based on number of rows
#         width=300 * cols,   # Set a reasonable width based on number of columns
#         title_text="Scatter Plots of 3rd codon position JSD Vs Genetic Distance",
#         showlegend=False,
#         yaxis_title_text='Genetic Distance',
#         xaxis_title_text='JSD'
#     )
    
#     return fig


In [235]:
# fig = plot_scatters(info)
# #fig.write_image('Genetic Distance vs JSD (3rd codon position).pdf')
# fig.show()

In [236]:
# path = homology_fasta_paths[7]
# print(path)
# loader = get_app("load_unaligned", format="fasta", moltype="dna")
# codon_aligner = get_app("progressive_align", "GNC")
# seqs = loader(path)
# seqs_no_stop_codon = seqs.trim_stop_codons(strict=False)
# aligned = codon_aligner(seqs_no_stop_codon)
# aligned_no_degenerates = aligned.no_degenerates()
# cpos3 = get_app("take_codon_positions", 3)
# just3rd_no_degenerates = cpos3(aligned_no_degenerates)
# pairwise_distance = just3rd_no_degenerates.distance_matrix(calc='logdet', show_progress=True, drop_invalid=True)
# pairwise_distance = just3rd_no_degenerates.distance_matrix(calc='logdet', show_progress=True, drop_invalid=True)
# pairwise_distance_array = pairwise_distance.array
# nuc_freqs = just3rd_no_degenerates.probs_per_seq()
# sub_nuc_freqs = {}
# for key in pairwise_distance.keys():
#     sub_nuc_freqs[key] = nuc_freqs[key]
# pairwise_jsd = pairwise_jsd_matrix(sub_nuc_freqs)
# pairwise_distance_values = pairwise_distance_array[np.triu_indices(n=pairwise_distance_array.shape[0], k=1)]
# pairwise_jsd_values = pairwise_jsd[np.triu_indices(n=pairwise_jsd.shape[0], k=1)]
