In [2]:
pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/3.3 MB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/3.3 MB[0m [31m19.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.3/3.3 MB[0m [31m35.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [15]:
from Bio import Entrez


# Contact address for the Entrez calls below; Biopython passes it along to NCBI.
Entrez.email = "hsundar@ucdavis.edu"

def search_complete_genomes(query, retmax = 0):
    """
    Run an Entrez ESearch against the NCBI nucleotide database.

    The query is sent unchanged: no "complete genome" filter is added here,
    so any such restriction has to be part of the query string itself.

    Input:
        Query(Str): Entrez search term
        Retmax(int): maximum number of record ids to return. With the
            default of 0 only the total count is reported and the id list
            comes back empty.

    Returns:
        (Count, IdList) tuple:
            Count : total number of matches for the query, exactly as parsed
                    by Entrez.read (Biopython yields it as a string)
            IdList: list of record ids, at most `retmax` long

        If the request or parsing fails, the error is printed and
        ("0", []) is returned, so callers can always unpack two values.
    """
    try:
        with Entrez.esearch(db="nucleotide", term=query, retmax=retmax) as handle:
            record = Entrez.read(handle)
        return record["Count"], record["IdList"]
    except Exception as e:
        print(f"Error searching for complete genomes: {e}")
        # Keep the two-value shape of the success path: a bare [] here would
        # make `count, ids = search_complete_genomes(...)` raise ValueError.
        return "0", []


# Entrez search term: match records whose Organism field is Coronaviridae.
query = "Coronaviridae[Organism]"


# Test run, only grabbing ids for the first 10 records
total_count, record_ids = search_complete_genomes(query, retmax=10)
print(f"For query : {query}, found {total_count}")


For query : Coronaviridae[Organism], found 9086324


In [16]:
from Bio import SeqIO
import time

def fetch_genome_details(accession_id):
    """
    Download one GenBank record and flatten it into a plain dictionary.

    Input:
        Accession_id: identifier handed to Entrez.efetch as `id`

    Returns:
        dict with the keys Accession, Description, Organism, Length, Source,
        Taxonomy, Collection Date, Isolation Source and Sequence, or None
        when fetching or parsing fails (the error is printed).
    """
    def first_or_unknown(qualifiers, key):
        # Qualifier values are indexed with [0]; a missing key yields "Unknown".
        return qualifiers.get(key, ["Unknown"])[0]

    try:
        with Entrez.efetch(db="nucleotide", id=accession_id, rettype="gb", retmode="text") as handle:
            gb_record = SeqIO.read(handle, "genbank")

        sampled_on = "Unknown"
        sampled_from = "Unknown"

        # Every "source" feature overwrites both values, so the last one wins.
        for feat in gb_record.features:
            if feat.type != "source":
                continue
            quals = feat.qualifiers
            sampled_on = first_or_unknown(quals, "collection_date")
            sampled_from = first_or_unknown(quals, "isolation_source")

        annotations = gb_record.annotations
        sequence = gb_record.seq
        lineage = annotations.get("taxonomy", [])

        details = {
            "Accession": gb_record.id,
            "Description": gb_record.description,
            "Organism": annotations.get("organism", "Unknown"),
            "Length": len(sequence),
            "Source": annotations.get("source", "Unknown"),
            "Taxonomy": "; ".join(lineage),
            "Collection Date": sampled_on,
            "Isolation Source": sampled_from,
            "Sequence": str(sequence),
        }
        return details
    except Exception as e:
        print(f"Error fetching details for {accession_id}: {e}")
        return None


# Collect one details dict per record id; ids whose fetch returned None are skipped.
genome_data = []
for idx, record_id in enumerate(record_ids):
    print(f"Fetching record {idx + 1}/{len(record_ids)}: {record_id}")
    details = fetch_genome_details(record_id)
    if details:
        genome_data.append(details)

    # Pause one second between requests to stay under NCBI's E-utilities rate limit.
    # NOTE(review): NCBI documents 3 requests/second without an API key — confirm.
    time.sleep(1)


Fetching record 1/10: 2026632707
Fetching record 2/10: 2026632694
Fetching record 3/10: 2026632681
Fetching record 4/10: 2026632668
Fetching record 5/10: 2026632655
Fetching record 6/10: 2026632642
Fetching record 7/10: 2026632629
Fetching record 8/10: 2026632616
Fetching record 9/10: 2026632603
Fetching record 10/10: 2026632590


In [17]:
import csv

def save_to_csv(data, filename):
    """
    Write genome records to `filename` as CSV and print a confirmation.

    Input:
        data: iterable of dicts keyed by the column names listed below;
              keys missing from a row are written as empty cells
        filename(Str): path of the file to create or overwrite
    """
    columns = [
        "Accession",
        "Description",
        "Organism",
        "Length",
        "Source",
        "Taxonomy",
        "Collection Date",
        "Isolation Source",
        "Sequence",
    ]

    # newline="" leaves line-ending handling to the csv module.
    with open(filename, "w", newline="", encoding="utf-8") as out_file:
        table = csv.DictWriter(out_file, fieldnames=columns)
        table.writeheader()
        for row in data:
            table.writerow(row)

    print(f"Data saved to {filename}.")

# Write the records collected in genome_data to a CSV file in the working directory
save_to_csv(genome_data, filename="genbank_genome_coronaviridae_test_run.csv")


Data saved to genbank_genome_coronaviridae_test_run.csv.
