In [2]:
import pathlib
import contextlib
import tempfile
from cogent3 import get_app

@contextlib.contextmanager
def tempdir(working_dir: pathlib.Path | str | None = None) -> pathlib.Path:
    """context manager returns a temporary directory in working_dir"""
    with tempfile.TemporaryDirectory(dir=working_dir) as temp_dir:
        yield pathlib.Path(temp_dir)

import subprocess
import sys

def exec_command(
    cmnd: str,
    stdout: int = subprocess.PIPE,
    stderr: int = subprocess.PIPE,
) -> str | None:
    """Executes shell command and returns stdout if completed with exit code 0."""
    proc = subprocess.Popen(cmnd, shell=True, stdout=stdout, stderr=stderr)
    out, err = proc.communicate()
    if proc.returncode != 0:
        msg = err
        sys.stderr.writelines(f"FAILED: {cmnd}\n{msg}")
        sys.exit(proc.returncode)
    return out.decode("utf8") if out is not None else None


In [3]:
# Apps to load cogent3 sequence collections from a FASTA file and translate them to amino acids.
loader = get_app("load_unaligned", format="fasta")
translater = get_app("translate_seqs")
# Composed app: the loader followed by the translator.
process = loader + translater

# def align_via_aa(seqs: typing.SeqsCollectionType, gc=1) -> typing.AlignedSeqsType:
#     """Translates a nucleotide align amino acid sequences back to DNA"""
#     translater = get_app("translate_seqs", gc=gc)
#     prot_aln = get_app("progressive_align", "protein", unique_guides=True)
#     app = translater + prot_aln
#     aligned_aa = app(seqs).to_type(array_align=True)
#     return aligned_aa.replace_seqs(seqs)

# in_dstore = open_data_store("/Users/gavin/repos/Cogent3/working/symposia-talk/sampled_homologies_3000_renamed", suffix="fa", mode="r")

# loader = get_app("load_unaligned", moltype="dna")

# m = in_dstore.completed[0]
# trim_stop = get_app("trim_stop_codons", gc=1)
# aligner = align_via_aa()
# app = loader + trim_stop + aligner
# g = app(m)


In [4]:

from cogent3 import get_app, open_data_store
from clock_project.genome_analysis.sequence_alignment_filtering import (aligner,drop_invalid_length, trim_stop)
from cogent3 import get_app, open_data_store


# Machine-specific absolute paths: this cell only runs where these directories exist.
seq_dir = "/Users/gulugulu/Desktop/honours/data_local_2/sampled_homologies"
# NOTE(review): output_dir is not used by any cell visible here — confirm it is still needed.
output_dir = '/Users/gulugulu/Desktop/honours/data_local/mammal_genome_codon_aln'
# Read-only data store over the ".fa" members of seq_dir.
input_dstore_seqs = open_data_store(seq_dir, suffix="fa", mode="r")
# Rebinds the notebook-level `loader` (created earlier without a moltype) to a DNA loader;
# later calls to `loader` use whichever binding was executed last.
loader = get_app("load_unaligned", format="fasta", moltype="dna")
# NOTE(review): seq_filtered_dir is not used by any cell visible here — confirm it is still needed.
seq_filtered_dir = "/Users/gulugulu/Desktop/honours/data_local_2/sampled_homologies_filtered"
# Load one member of the data store; index 51 is hard-coded — TODO document why this member.
seqs = loader(input_dstore_seqs[51])
# Project pipeline steps imported from clock_project: length filter, stop-codon trim, then align.
seq1 = drop_invalid_length(seqs)
valid_seq = trim_stop(seq1)
a = aligner(valid_seq)
# Last expression of the cell, so the dotplot is the cell's displayed output.
a.dotplot()

<cogent3.draw.dotplot.Dotplot at 0x167afac30>

In [5]:
# Display the sequence collection currently bound to `seqs`.
seqs

0,1
,0
procavia_capensis-ENSPCAG00000011052,ATGGGGGACGAGGACGACGACGAGGGCGGTGCGGTGGAGCTGAGGATCACGGAAGCTAAC
mustela_putorius_furo-ENSMPUG00000012899,ATGGGGGACGAGGACGAGGACGAGGGCTGCGCCGTGGAGCTGCGGATCACCGAAGCCAAC
tursiops_truncatus-ENSTTRG00000010308,ATGGGGGACGAGGACGAGGACGAGGGCTGTGCGGTGGAGCTGCAGATCACCGAAGCCAAC
loxodonta_africana-ENSLAFG00000008921,ATGGGGGACGAGGACGAGGATGAGAGCTGTGCGGTGGAGCTGCGGATCACGGAAGCCAAC
otolemur_garnettii-ENSOGAG00000007168,ATGGGAGACGAGGACGAGGATGAGGGCTGTGCCCTGGAGCTGTGGATCACGGAAGCCAAC
chlorocebus_sabaeus-ENSCSAG00000007216,ATGGGGGACGAAGACGAGGATGAGAGCTGCGCCGTGGAGCTGCGGATCACTGAAGCCAAC
tupaia_belangeri-ENSTBEG00000014083,ATGGGGGACGAGGACGAGGATGAGGGCTGCGCGGTGGAGCTGCGGATCACGGAAGCCAAC
ochotona_princeps-ENSOPRG00000015562,ATGGGGGACGAGGACGAGGAGGAGGGCTGCGCCGTGGAGATGCAAATCACGGAAGCCAAC
pteropus_vampyrus-ENSPVAG00000017141,ATGGGGGACGAGGACGAGGACGAGGGCTGTGCTGTGGAGCTGAGGATCACGGAAGCCAAC


In [5]:
from cogent3 import get_app, open_data_store

# Machine-specific absolute path to the sampled homolog FASTA files.
seq_dir = "/Users/gulugulu/Desktop/honours/data_local_2/sampled_homologies"
input_dstore_seqs = open_data_store(seq_dir, suffix="fa", mode="r")
# NOTE(review): bare mid-cell expression; its value is neither stored nor the cell's last
# expression, so it is not displayed — confirm whether it can be dropped.
input_dstore_seqs[0]

fasta_dir = pathlib.Path(seq_dir)  # Replace with your directory path

# Get all .fa files in the directory (the pattern matches the "fa" suffix used by the data store)
fasta_files = list(fasta_dir.glob("*.fa"))


In [23]:

def get_mafft_aligned_seq(seqs_dir: str):
    """
    Aligns the sequences in a FASTA file via a MAFFT amino acid alignment.

    The sequences are loaded with the notebook-level ``loader`` app and
    translated with the notebook-level ``translater`` app. The translations
    are written to a temporary FASTA file, aligned with the external
    ``mafft`` program, loaded back, and the original (untranslated)
    sequences are then substituted into that alignment via ``replace_seqs``.

    Parameters
    ----------
    seqs_dir: str
        Path handed to ``loader``. NOTE(review): despite the name, the
        notebook calls this function with the path of a single FASTA file.

    Returns
    -------
    The alignment object returned by ``replace_seqs`` (not a path); the
    notebook calls ``dotplot``, ``take_seqs`` and ``write`` on it.

    Notes
    -----
    Requires ``mafft`` to be runnable from the shell. The command is run
    through ``exec_command``, which exits via ``SystemExit`` when the
    command returns a non-zero exit code. The temporary files are removed
    when the function returns.
    """
    # All intermediate files live in a temporary directory that is removed on exit.
    with tempdir() as temp_dir:
        aa_fasta_path = temp_dir / "aa_sequences.fasta"
        aligned_aa_path = temp_dir / "aligned_aa.fasta"

        # Load the sequences, then translate the loaded collection
        # (the translator must be given the sequences, not the file path).
        seqs = loader(seqs_dir)
        aa_seqs = translater(seqs)

        # Write translated amino acid sequences to temporary FASTA file
        aa_seqs.write(aa_fasta_path, format="fasta")

        # Build the MAFFT command; the shell redirection captures the alignment in a file
        mafft_command = f"mafft --amino {aa_fasta_path} > {aligned_aa_path}"
        print(f"Running MAFFT: {mafft_command}")

        # Execute the MAFFT command
        exec_command(mafft_command)

        # Load the aligned amino acid sequences while the temporary file still exists
        loader_aligned = get_app("load_aligned", format="fasta")
        aligned_seq_collection = loader_aligned(str(aligned_aa_path)).to_type(array_align=True)

        # Substitute the original sequences into the amino acid alignment
        aligned_seqs = aligned_seq_collection.replace_seqs(seqs)

    return aligned_seqs

In [1]:
# Machine-specific absolute path to the FASTA file used for the aligner comparison.
path = '/Users/gulugulu/Desktop/honours/data_local_2/sampled_homologies/valid_seqs.fa'
# Depends on `loader` and `translater` being defined by an earlier cell; the NameError
# recorded for this cell comes from running it before `loader` existed in the kernel.
seqs = loader(path)
aa = translater(seqs)

NameError: name 'loader' is not defined

In [25]:
# Build the MAFFT-based alignment for the file at `path`.
mafft_seqs_aligned = get_mafft_aligned_seq(path)
# Dotplot of the cebus_imitator / camelus_dromedarius pair from the MAFFT-based alignment.
mafft_seqs_aligned.dotplot(name1 = 'cebus_imitator-ENSCCAG00000029039', name2 = 'camelus_dromedarius-ENSCDRG00005017479')

Running MAFFT: mafft --amino /var/folders/d8/pdrt51hx2jb17vf6k28_x6mh0000gn/T/tmp5x054oru/aa_sequences.fasta > /var/folders/d8/pdrt51hx2jb17vf6k28_x6mh0000gn/T/tmp5x054oru/aligned_aa.fasta


<cogent3.draw.dotplot.Dotplot at 0x16fb9bd40>

In [9]:
# Save the MAFFT-based alignment to a machine-specific absolute path.
mafft_seqs_aligned.write('/Users/gulugulu/Desktop/honours/data_local_2/sampled_homologies/problem.fa')

In [10]:
# Extract the cebus_imitator / camelus_dromedarius pair from the MAFFT-based alignment.
pair = mafft_seqs_aligned.take_seqs(['cebus_imitator-ENSCCAG00000029039', 'camelus_dromedarius-ENSCDRG00005017479'])
# Rebinds `pair` to the result of omit_gap_pos()
# (presumably drops gap-only columns — TODO confirm against the method's defaults).
pair = pair.omit_gap_pos()
# presumably sets how many alignment positions the rich repr shows — TODO confirm
pair.set_repr_policy(num_pos = 600)
pair



0,1
,0
camelus_dromedarius-ENSCDRG00005017479,------------------------------------------------------------
cebus_imitator-ENSCCAG00000029039,ATGGGGGACGAGGACGAGGATGAGGGCTGCGCCGTGGAGCTGCGGATCACAGAAGCCAAC
,60
camelus_dromedarius-ENSCDRG00005017479,------------------------------------------------------------
cebus_imitator-ENSCCAG00000029039,CTGACCGGGCACGAGGAGAAGGTGAGCGTGGAGAACTTCGAGCTGCTCAAGGTGCTGGGC
,120
camelus_dromedarius-ENSCDRG00005017479,------------------------------------------------------------
cebus_imitator-ENSCCAG00000029039,ACGGGAGCCTACGGCAAGGTGTTCCTGGTGCGGAAGGCGGGCGGGCACGACGCGGGGAAG
,180


In [11]:
# Dotplot of the cebus_imitator / camelus_dromedarius pair from the MAFFT-based alignment.
mafft_seqs_aligned.dotplot(name1 = 'cebus_imitator-ENSCCAG00000029039', name2 = 'camelus_dromedarius-ENSCDRG00005017479')

<cogent3.draw.dotplot.Dotplot at 0x16fdd28d0>

In [12]:
# Dotplot of the two-sequence alignment currently bound to `pair`.
pair.dotplot()

<cogent3.draw.dotplot.Dotplot at 0x170da30e0>

In [13]:
from clock_project.genome_analysis.sequence_alignment_filtering import aligner
# Align the sequences currently bound to `seqs` with the project's `aligner`,
# for comparison with the MAFFT-based alignment.
cogent3_aligner_seqs = aligner(seqs)

In [14]:
# Dotplot of the microtus_ochrogaster / saimiri_boliviensis pair from the cogent3-aligner alignment.
cogent3_aligner_seqs.dotplot(name1 = 'microtus_ochrogaster-ENSMOCG00000017820', name2 = 'saimiri_boliviensis_boliviensis-ENSSBOG00000031882')

<cogent3.draw.dotplot.Dotplot at 0x170e91400>

In [15]:
# Dotplot of the microtus_ochrogaster / saimiri_boliviensis pair from the MAFFT-based alignment.
mafft_seqs_aligned.dotplot(name1 = 'microtus_ochrogaster-ENSMOCG00000017820', name2 = 'saimiri_boliviensis_boliviensis-ENSSBOG00000031882')

<cogent3.draw.dotplot.Dotplot at 0x16fd179b0>

In [16]:
# Show slice [1322:1421] of the microtus / saimiri pair from the cogent3-aligner alignment.
# The slice bounds are hard-coded — TODO note why this region was chosen.
cogent3_aligner_seqs.take_seqs(['microtus_ochrogaster-ENSMOCG00000017820','saimiri_boliviensis_boliviensis-ENSSBOG00000031882'])[1322:1421]

0,1
,0
microtus_ochrogaster-ENSMOCG00000017820,GAACACTCAGGCCGAGGTGTCTCGACGGATCTTGAAGTGCTCCCCTCCCTTCCCCCTCCG
saimiri_boliviensis_boliviensis-ENSSBOG00000031882,......A.....T..................C........................CT..


In [17]:
# Show slice [1322:1421] of the microtus / saimiri pair from the MAFFT-based alignment.
# The slice bounds are hard-coded — TODO note why this region was chosen.
mafft_seqs_aligned.take_seqs(['microtus_ochrogaster-ENSMOCG00000017820','saimiri_boliviensis_boliviensis-ENSSBOG00000031882'])[1322:1421]

0,1
,0
microtus_ochrogaster-ENSMOCG00000017820,TCCCTTCCCCCTCCGGATCGGGCCTGTGGCACAGGACCTGCTACAGAGGCTGCTGTGTAA
saimiri_boliviensis_boliviensis-ENSSBOG00000031882,...........CT.................G...........G...C....A..T..C..


In [18]:
# Dotplot of the cebus_imitator / camelus_dromedarius pair from the cogent3-aligner alignment.
cogent3_aligner_seqs.dotplot(name1 = 'cebus_imitator-ENSCCAG00000029039', name2 = 'camelus_dromedarius-ENSCDRG00005017479')

<cogent3.draw.dotplot.Dotplot at 0x16fa42de0>

In [19]:
# Dotplot of the moschus_moschiferus / saimiri_boliviensis pair from the cogent3-aligner alignment.
cogent3_aligner_seqs.dotplot(name1 = 'moschus_moschiferus-ENSMMSG00000022543', name2 = 'saimiri_boliviensis_boliviensis-ENSSBOG00000031882')


<cogent3.draw.dotplot.Dotplot at 0x16fa74ce0>

In [20]:
# Dotplot of the moschus_moschiferus / saimiri_boliviensis pair from the MAFFT-based alignment.
mafft_seqs_aligned.dotplot(name1 = 'moschus_moschiferus-ENSMMSG00000022543', name2 = 'saimiri_boliviensis_boliviensis-ENSSBOG00000031882')


<cogent3.draw.dotplot.Dotplot at 0x16fa39220>

In [21]:
# Dotplot of the moschus_moschiferus / dipodomys_ordii pair from the cogent3-aligner alignment.
cogent3_aligner_seqs.dotplot(name1 = 'moschus_moschiferus-ENSMMSG00000022543', name2 = 'dipodomys_ordii-ENSDORG00000007519')


<cogent3.draw.dotplot.Dotplot at 0x16fdab3b0>

In [22]:
# Dotplot of the moschus_moschiferus / dipodomys_ordii pair from the MAFFT-based alignment.
mafft_seqs_aligned.dotplot(name1 = 'moschus_moschiferus-ENSMMSG00000022543', name2 = 'dipodomys_ordii-ENSDORG00000007519')

<cogent3.draw.dotplot.Dotplot at 0x16fa03dd0>