In [1]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import UndefinedSequenceError

import os
import logging

from pandas import notna, read_csv

# Configure the root logger so DEBUG-level messages (and above) are shown in the notebook output.
logging.basicConfig(level=logging.DEBUG)

In [8]:
from Bio import Entrez
# Contact email that Biopython attaches to the Entrez (NCBI E-utilities) queries.
Entrez.email = "kaedeito@student.ubc.ca"
# Name of the tool making the queries; identifies this notebook to NCBI.
Entrez.tool = "download_other_taxa.ipynb"

# NOTE(review): SEARCH_NEW and GRANULARITY are not referenced by any cell
# visible in this notebook section — confirm they are still needed.
SEARCH_NEW = False
GRANULARITY = 5
logger = logging.getLogger("download_16s_other_taxa")

# Root folder of the analysis datasets, two levels above the working directory.
# Built from components so the platform's own separator is used: a literal
# "..\\.." only works on Windows and is treated as a single file name elsewhere.
dir_path = os.path.realpath(
  os.path.join(os.pardir, os.pardir, "datasets", "full_analysis")
)


In [18]:
def list_files():
  """
  List the per-taxon sequence files in the `other_taxa_16S` folder of `dir_path`.

  Only regular files whose name starts with "txid" are returned (per the
  original note, these are the files downloaded from an assembled genome).
  Other files in the folder, and any sub-directories, are skipped.

  Returns the matching file names (not full paths), sorted by name.
  """
  fna_path = os.path.join(dir_path, "other_taxa_16S")
  logger.debug(fna_path)
  # os.listdir returns entries in arbitrary order; sorting keeps the order of
  # the loaded records (and of the combined output file) the same on every run.
  return sorted(
    name
    for name in os.listdir(fna_path)
    if name.startswith("txid") and os.path.isfile(os.path.join(fna_path, name))
  )

In [27]:
def load_files():
  """
  Parse every file reported by `list_files()` as FASTA.

  Returns one flat list holding the records of all files, in the order the
  files were listed. Each file name and its record count are logged at DEBUG.
  """
  source_dir = os.path.join(dir_path, "other_taxa_16S")
  collected: list[SeqRecord] = []
  for file_name in list_files():
    logger.debug(file_name)
    parsed = list(SeqIO.parse(os.path.join(source_dir, file_name), "fasta"))
    logger.debug(len(parsed))
    collected += parsed
  return collected


In [28]:
# Records parsed from the local "txid*" files; combined with the downloaded records in a later cell.
gene_records = load_files()

DEBUG:download_16s_other_taxa:txid1541440.fna
DEBUG:download_16s_other_taxa:1
DEBUG:download_16s_other_taxa:txid1960125.fna
DEBUG:download_16s_other_taxa:1
DEBUG:download_16s_other_taxa:txid2172824.fna
DEBUG:download_16s_other_taxa:1
DEBUG:download_16s_other_taxa:txid2304602.fna
DEBUG:download_16s_other_taxa:1
DEBUG:download_16s_other_taxa:txid2592383.fna
DEBUG:download_16s_other_taxa:1
DEBUG:download_16s_other_taxa:txid517724.fna
DEBUG:download_16s_other_taxa:1


E:\Kaede\Documents\GitHub\BIOL417_CORAL\datasets\full_analysis\other_taxa_16S


In [3]:
def save_file(file_name, records, rec_type):
  """
  Save the records to a file under `dir_path`.

  `file_name`: The name of the file to save to, relative to `dir_path`.

  `records`: The records to save (passed unchanged to `SeqIO.write`).

  `rec_type`: The type of record to save, i.e. the format name handed to
  `SeqIO.write` (for example "fasta").

  Returns the number of records written, or `None` if writing failed.

  A failure while writing is logged with its traceback rather than raised,
  so the notebook keeps running. The output file has already been opened
  (and truncated) by then, so it may be empty or incomplete. A failure to
  open the file itself is not caught and propagates to the caller.
  """
  file_path = os.path.join(dir_path, file_name)
  with open(file_path, "w") as out_handle:
    try:
      count = SeqIO.write(records, out_handle, rec_type)
    except Exception:
      # One handler covers every write error (undefined-sequence errors
      # included); logger.exception records the error message and traceback.
      logger.exception(f"Failed to write {rec_type} to file {file_name}")
      return None
  logger.debug(f"{rec_type} {count} Saved")
  return count

In [4]:
def find_records(list_accession: list[str]):
  """
  Fetch GenBank records for the given accession IDs through Entrez.

  `list_accession`: Accession IDs to request from the "nucleotide" database.

  Returns a `(count, records)` tuple: the number of records parsed and the
  list of parsed records. An empty `list_accession` returns `(0, [])`
  without contacting Entrez.
  """
  # Nothing to look up: avoid sending a request with an empty id list.
  if len(list_accession) == 0:
    return 0, []
  handle = Entrez.efetch(db="nucleotide", id=list_accession, rettype="gb")
  try:
    # Materialise the parser's results before the handle is closed.
    records: list[SeqRecord] = list(SeqIO.parse(handle, "gb"))
  finally:
    # Always release the network handle, even if parsing fails.
    handle.close()
  return len(records), records

In [29]:
# load other taxa data (.csv)
other_taxa = read_csv(os.path.join(dir_path, "other_taxa.csv"))

# load accession numbers, skipping rows that have none
other_taxa_accession = other_taxa.loc[notna(other_taxa["accession_id"]), "accession_id"].to_list()

rec_length, records = find_records(other_taxa_accession)
# Make an incomplete download visible instead of silently saving fewer records.
if rec_length != len(other_taxa_accession):
  logger.warning(
    f"Requested {len(other_taxa_accession)} accessions but received {rec_length} records"
  )

# Output paths are joined from components so the platform's separator is used;
# a literal "\\" in the name only yields a sub-folder path on Windows.
save_file(os.path.join("other_taxa_16S", "16S_from_accession.fasta"), records, "fasta")

# save everything together (records and gene_records)
all_records = records + gene_records
save_file(os.path.join("other_taxa_16S", "16S_other_taxa_all.fasta"), all_records, "fasta")

DEBUG:download_16s_other_taxa:fasta 7 Saved
DEBUG:download_16s_other_taxa:fasta 13 Saved
