# Align sequences using MUSCLE

Let's import the SeqIO module to parse FASTA files, together with SeqRecord (used for type annotations) and os (used for file system checks).

In [1]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import os

Let's check that the download directory contains input data, and then read the list of organisms.

In [2]:
out_amount = os.listdir("ncbi_downloads")

# organism_list.txt is read from this same folder below, so one entry or fewer
# presumably means that no organism subdirectory has been downloaded.
if len(out_amount) <= 1:
    # An exception instance is required here: raising a plain string fails with
    # "TypeError: exceptions must derive from BaseException" and hides the message.
    raise FileNotFoundError("No fasta files.")

# Get the list of organisms / taxa (one name per line).
# The context manager closes the file handle as soon as the names are read.
with open("ncbi_downloads/organism_list.txt") as org_file:
    list_orgs = org_file.read().splitlines()

In [3]:
def parse_fasta_files(list_orgs: list[str]):
  """Parse the nucleotide FASTA file of every organism into one list of records.

  Args:
    list_orgs: Names of subdirectories of ``ncbi_downloads``; each one must
      contain a ``nucleotide.fasta`` file.

  Returns:
    A list with the SeqRecords of all files, in the order given by ``list_orgs``.

  Raises:
    FileNotFoundError: If the ``nucleotide.fasta`` file of an organism is missing.
  """

  fasta_records: list[SeqRecord] = []
  for org in list_orgs:
    path = f"ncbi_downloads/{org}/nucleotide.fasta"
    if not os.path.exists(path):
      # A real exception type is needed: raising a plain string only produces
      # "TypeError: exceptions must derive from BaseException".
      raise FileNotFoundError("Missing fasta file for " + org)

    # The parser is consumed inside the `with` block, so the handle is closed
    # right after the last record of this file has been read.
    with open(path) as handle:
      fasta_records.extend(SeqIO.parse(handle, "fasta"))
  return fasta_records



In [4]:
fasta_records = parse_fasta_files(list_orgs)

# Every accession ID in the order it was read (repeats included), and the
# positions of the records whose ID had already been seen earlier.
accession_ids: list[str] = []
duplicates: list[int] = []

# A set gives constant-time membership checks; the previous list lookup plus
# deletion by index made the de-duplication quadratic in the number of records.
seen_ids: set[str] = set()
unique_records = []

for idx, fasta in enumerate(fasta_records):
  accession_ids.append(fasta.id)
  if fasta.id in seen_ids:
    duplicates.append(idx)
    continue
  seen_ids.add(fasta.id)
  unique_records.append(fasta)

# Only the first record of each accession ID is kept.
fasta_records = unique_records

# Last expression of the cell: the return value of SeqIO.write is displayed.
SeqIO.write(fasta_records, "all_sequences.fasta", "fasta")

117

Run the MUSCLE command on the combined FASTA file.
```ps1
muscle -align ./src/all_sequences.fasta -output ./src/all_sequences_aligned.phy
```

Personally, I submitted the alignment job to the online server provided by {cite:ts}`Madeira2022` instead of running MUSCLE locally.

> Status of job: [here](https://wwwdev.ebi.ac.uk/Tools/jdispatcher/msa/muscle/summary?jobId=muscle-E20231103-041923-0439-15952161-p1m)