In [1]:
import contextlib
import pathlib
import tempfile
from collections.abc import Iterator

from cogent3 import get_app

@contextlib.contextmanager
def tempdir(working_dir: pathlib.Path | str | None = None) -> pathlib.Path:
    """context manager returns a temporary directory in working_dir"""
    with tempfile.TemporaryDirectory(dir=working_dir) as temp_dir:
        yield pathlib.Path(temp_dir)

import subprocess
import sys

def exec_command(
    cmnd: str,
    stdout: int = subprocess.PIPE,
    stderr: int = subprocess.PIPE,
) -> str | None:
    """Executes shell command and returns stdout if completed with exit code 0."""
    proc = subprocess.Popen(cmnd, shell=True, stdout=stdout, stderr=stderr)
    out, err = proc.communicate()
    if proc.returncode != 0:
        msg = err
        sys.stderr.writelines(f"FAILED: {cmnd}\n{msg}")
        sys.exit(proc.returncode)
    return out.decode("utf8") if out is not None else None


# Apps that load a cogent3 sequence collection from a FASTA file and translate it to amino acids.
loader = get_app("load_unaligned", format="fasta", moltype="dna")
translater = get_app("translate_seqs")
# Composition of the two apps above (loader first, then translater).
process = loader + translater



In [2]:
from cogent3 import get_app, open_data_store

# NOTE(review): machine-specific absolute path; point this at your own copy of the data.
seq_dir = "/Users/gulugulu/Desktop/sampled_homologies_mammal87_113"
input_dstore_seqs = open_data_store(seq_dir, suffix="fa", mode="r")
fasta_dir = pathlib.Path(seq_dir)  # same directory as the data store, as a Path
# List every *.fa file directly inside the directory (non-recursive glob)
fasta_files = list(fasta_dir.glob("*.fa"))

In [19]:
len(fasta_files)

1924

In [4]:
def get_mafft_aligned_seq(seqs):
    """Align DNA sequences by way of a MAFFT alignment of their translations.

    The sequences are translated with the module-level ``translater`` app,
    the amino acid sequences are written to a temporary FASTA file and
    aligned by the external ``mafft`` program, and the DNA in ``seqs`` is
    then mapped back onto that amino acid alignment with ``replace_seqs``.

    Parameters
    ----------
    seqs
        Unaligned DNA sequence collection (for example the output of
        ``loader``). This is a sequence collection object, not a path.

    Returns
    -------
    The alignment returned by ``replace_seqs``: the MAFFT amino acid
    alignment with its sequences replaced by the DNA sequences in ``seqs``.

    Raises
    ------
    ValueError
        If translating ``seqs`` or loading the MAFFT output produces a
        cogent3 ``NotCompleted`` instead of sequence data.
    SystemExit
        Raised by ``exec_command`` when ``mafft`` exits with a non-zero
        status.
    """
    import shlex

    from cogent3.app.composable import NotCompleted

    aa_seqs = translater(seqs)
    # cogent3 apps report failure by returning NotCompleted rather than
    # raising; surface that here instead of failing later on `.write`.
    if isinstance(aa_seqs, NotCompleted):
        raise ValueError(f"could not translate sequences: {aa_seqs.message}")

    # All files live in a temporary directory that is removed on exit, so
    # the alignment must be fully loaded before leaving the `with` block.
    with tempdir() as temp_dir:
        aa_fasta_path = temp_dir / "aa_sequences.fasta"
        aligned_aa_path = temp_dir / "aligned_aa.fasta"

        # Write translated amino acid sequences to the temporary FASTA file
        aa_seqs.write(aa_fasta_path, format="fasta")

        # The command goes through the shell (for the `>` redirect), so the
        # paths are quoted in case the temp directory contains spaces etc.
        mafft_command = (
            f"mafft --amino {shlex.quote(str(aa_fasta_path))}"
            f" > {shlex.quote(str(aligned_aa_path))}"
        )
        print(f"Running MAFFT: {mafft_command}")
        exec_command(mafft_command)

        # Load the aligned amino acid sequences produced by MAFFT
        loader_aligned = get_app("load_aligned", format="fasta")
        aligned_aa = loader_aligned(str(aligned_aa_path))
        if isinstance(aligned_aa, NotCompleted):
            raise ValueError(f"could not load MAFFT alignment: {aligned_aa.message}")

        # Put the original DNA sequences back onto the amino acid alignment
        aligned_seqs = aligned_aa.to_type(array_align=True).replace_seqs(seqs)

    return aligned_seqs

In [5]:
from clock_project.genome_analysis.sequence_alignment_filtering import (cpos3, aligner, filter)

In [6]:
# Accumulators for sorting homology groups into categories. The roles noted
# here are read off the filtering loop that is currently commented out in
# this notebook; nothing fills them while that loop stays disabled.
valid_homology_list = []  # paths that passed every check
prob_homology_list = []  # paths whose processing raised an exception
species_less_than_3 = []  # paths with fewer than 3 sequences after filtering
small_sample_size = []  # paths whose processed alignment has fewer than 30 sequences
short_alignment = []  # paths whose processed alignment length is below 550
valid_homology_index_dict = {}  # file name -> loop index, for valid paths only
alignment_lengths = {}  # file name -> alignment length
sample_size_dict = {}  # file name -> number of sequences in the alignment

In [7]:

# for i in range(len(input_dstore_seqs.completed[0:1])):
#     path = input_dstore_seqs.completed[i]
#     file_name = path.unique_id.split('.')[0]
#     seqs = loader(path)
#     seqs_filtered = filter(seqs)
#     try:
#         if seqs_filtered.num_seqs < 3:
#             species_less_than_3.append(path)
#         else: 
#             aln = get_mafft_aligned_seq(seqs_filtered)
#             print(aln)
#             aln_3rd = cpos3(aln)
#             aln_3rd_no_degenerates = aln_3rd.no_degenerates()
#             alignment_length = aln_3rd_no_degenerates.get_lengths()[0]
#             sample_size = aln_3rd_no_degenerates.num_seqs
#             alignment_lengths[file_name] = alignment_length
#             sample_size_dict[file_name] = sample_size
#             if aln_3rd_no_degenerates.num_seqs < 30:
#                 small_sample_size.append(path)
#                 print(f"Path added to small size: {path} with length {alignment_length}, sample size {sample_size}")
            
#             elif alignment_length < 550:
#                 short_alignment.append(path)
#                 print(f"Path added to short alignment: {path} with length {alignment_length}, sample size {sample_size}")
            
#             else:
#                 valid_homology_list.append(path)
#                 valid_homology_index_dict[file_name] = i
#                 print(f"Path added to valid_homology_list: {path} with length {alignment_length}, sample size {sample_size}")
    
#     except Exception as e:
#         print(e)
#         prob_homology_list.append(path)
            
                





In [20]:

# Inspect a single data store member: load it and display its sequence count.
path = input_dstore_seqs[5]
file_name = path.unique_id.split('.')[0]  # text before the first "." in the id
seqs = loader(path)
seqs.num_seqs
# if seqs_filtered.num_seqs < 3:
#     species_less_than_3.append(path)
# else: 
#     aln = get_mafft_aligned_seq(seqs_filtered)

81

In [22]:
input_dstore_seqs[5]

DataMember(data_store=/Users/gulugulu/Desktop/sampled_homologies_mammal87_113, unique_id=ENSG00000127054.fa)

In [10]:
path

DataMember(data_store=/Users/gulugulu/Desktop/sampled_homologies_mammal87_113, unique_id=ENSG00000131791.fa)

In [11]:
# get_mafft_aligned_seq(seqs)

In [12]:
from cogent3.app.composable import define_app
from cogent3.app.typing import UnalignedSeqsType, SeqsCollectionType

translater = get_app("translate_seqs")

def try_get_translation(seq):
    """Report whether ``seq.get_translation()`` completes without raising.

    Returns ``True`` when the call succeeds and ``False`` when it raises any
    ``Exception``. The translation itself is discarded; only success or
    failure is reported.
    """
    try:
        seq.get_translation()
    except Exception:
        # Any translation error marks the sequence as one to skip.
        return False
    else:
        return True

@define_app
def remove_unresolvable_codon_sequence(seqs: UnalignedSeqsType) -> UnalignedSeqsType:
    """Return ``seqs`` restricted to sequences that ``try_get_translation`` accepts."""
    # Sequences whose translation raises an error are left out of the result.
    return seqs.take_seqs_if(try_get_translation)

remove_unresolvable_seqeunce = remove_unresolvable_codon_sequence()


In [21]:
remove_unresolvable_seqeunce(seqs)

0,1
,0
dasypus_novemcinctus-ENSDNOG00000025863,CTGATTGGCTGCGGGCCGCCCAATCAAGAGACGCGCCTGTACGCGTTTCCGGTGGGCGGA
echinops_telfairi-ENSETEG00000002094,CAGCTGGTGCTCGGCCAGTCCCAGCTCCTTCAGGGCTTGCTCGGAGGACACCAGCCGGAA
loxodonta_africana-ENSLAFG00000013687,GCGGCCACCCTCACCCCCACTCCCCGAGCAGGGGCTGGCCAGGACGTGGGCCGCAGCTGT
capra_hircus-ENSCHIG00000023117,ATGCCCGAGATCAGGGTCACGCCCCTGGGGGCTGGCCAGGATGTGGGCCGAAGCTGCATT
mus_spicilegus-ENSMSIG00000015231,ATGGGTTCCGATGGTGGGGGAGCGGTGGTCTCTACTCCCGCGGAGGGCAAAGGGTGCAGC
cavia_porcellus-ENSCPOG00000005152,ATGGAGGTCCCGCTGCTGTGGCTCTGTGAAGGGGTCCCTAGCACCAGAATTTGGCCTACA
chinchilla_lanigera-ENSCLAG00000004586,TGGCCAGAGTTATTGGTTACTGCCAGACAGCCAGGGCCACGGGCTGGCCAGGATGTGGGC
bison_bison_bison-ENSBBBG00000016616,ATGCCCGAGATCAGGGTCACGCCCTTGGGGGCTGGCCAGGATGTGGGCCGAAGCTGCATT
ictidomys_tridecemlineatus-ENSSTOG00000020019,CCCCCAACTTTGAGAATTGAGGGGAAGCTTGGGGCTGGGCAGGATGTGGGTCGCAGCTGC


In [14]:
seqs_dir2 = "/Users/gulugulu/Desktop/honours/data_local_2/sampled_homologies"
input_dstore_seqs2 = open_data_store(seqs_dir2, suffix="fa", mode="r")

In [15]:
seqs2 = loader(input_dstore_seqs2[1])
seqs2.num_seqs

83

In [16]:
remove_unresolvable_seqeunce(seqs2)

0,1
,0
echinops_telfairi-ENSETEG00000019111,ATGGGAAACACCACCAGCGACCGGGTGTCCGGGGAGCGCCACGGCACCAAGGCAGCGCGC
dasypus_novemcinctus-ENSDNOG00000045622,ATGGGAAACACCACCAGCGACCGGGTGTCCGGGGAGCGCCACGGCACCAAAGCTGCGCGC
loxodonta_africana-ENSLAFG00000029299,ATGGGAAACACCACCAGCGACCGGGTGTCCGGGGAGCGCCACGGCACCAAGGCTGCGCGT
tursiops_truncatus-ENSTTRG00000010361,ATGGGAAACACCACTAGCGACCGGGTGGCCGGGGAGCGCCACGGCGCCAAGGCTGCGCGC
tupaia_belangeri-ENSTBEG00000005363,CTCCCTGGGGACAAAGAGTTTGTATCATGGCAGCAGGATTTGGAGGACTCCGTAAAGCCC
ochotona_princeps-ENSOPRG00000001630,ATGGGAAATACCACCAGCGACCGGGTGTCCGGGGAACGCCACGGCTCCAAGGCTGCGCGC
mustela_putorius_furo-ENSMPUG00000000612,ATGGGAAACACCACCAGCGACCGGGTGGCTGGCGAGCGCCACGGCTCCAAGTCTTCACGC
chlorocebus_sabaeus-ENSCSAG00000018388,ATGGGAAACACCACCAGCGACCGGGTGTCCGGGGAGCGCCACGGTGCCAAGGCTGCACGC
otolemur_garnettii-ENSOGAG00000012807,ATGGGAAACACCACCAGCGACCGGGTGTCCGGGGAGCGCCACGGCGCCAAGGCTGCGCGC


In [17]:
filtered_seqs = filter(seqs)
translater(filtered_seqs)

NotCompleted(type=ERROR, origin=translate_seqs, source="ENSG00000131791.fa", message="Traceback (most recent call last):
  File "/Users/gulugulu/opt/anaconda3/envs/c312/lib/python3.12/site-packages/cogent3/core/sequence.py", line 1872, in get_translation
    resolved = moltype.resolve_ambiguity(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gulugulu/opt/anaconda3/envs/c312/lib/python3.12/site-packages/cogent3/core/moltype.py", line 1346, in resolve_ambiguity
    raise AlphabetError(ambig_motif)
cogent3.core.alphabet.AlphabetError: TGA

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/gulugulu/opt/anaconda3/envs/c312/lib/python3.12/site-packages/cogent3/app/composable.py", line 401, in _call
    result = self.main(val, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gulugulu/opt/anaconda3/envs/c312/lib/python3.12/site-packages/cogent3/app/translate.py", line 377, in main
    ret