# In [None]:

# %% [code]
from Bio import Entrez, SeqIO
import csv

# Always set your email for NCBI Entrez.
# The address below is a placeholder; replace it with a real contact email
# before running the notebook.
Entrez.email = "your.email@example.com"

# Define search term: all viral complete genomes.
# Entrez query syntax: restrict to the Viruses organism and to records whose
# title contains "complete genome".
search_term = "Viruses[Organism] AND complete genome[Title]"

# How many records to fetch; used as the upper bound on the number of IDs
# requested from the search.
target_count = 3000

# %% [code]
# Perform the search: ask the nucleotide database for up to target_count IDs
# matching search_term.
handle = Entrez.esearch(db="nucleotide", term=search_term, retmax=target_count)
try:
    search_results = Entrez.read(handle)
finally:
    # Release the connection even if the response cannot be parsed.
    handle.close()
id_list = search_results["IdList"]
print(f"Found {len(id_list)} record IDs; retrieving up to {target_count} entries...")

# %% [code]
# Helper: flatten a single parsed GenBank record into one CSV row
def _record_to_row(gb_record):
    """Return the CSV fields for one parsed GenBank record as a list.

    Column order: accession, description, organism, length, source,
    taxonomy, collection date, isolation source, sequence.
    """
    annotations = gb_record.annotations

    # The GenBank SOURCE line is exposed as the "source" annotation of the
    # record; it is not a qualifier of the first feature, so looking it up
    # there always produced an empty string (or IndexError with no features).
    source = annotations.get("source", "")
    taxonomy = ";".join(annotations.get("taxonomy", []))

    # Collection date and isolation source live on the first 'source' feature,
    # when the record has one; otherwise both stay empty.
    source_feature = next(
        (feat for feat in gb_record.features if feat.type == "source"),
        None,
    )
    qualifiers = source_feature.qualifiers if source_feature is not None else {}
    collection_date = qualifiers.get("collection_date", [""])[0]
    isolation_source = qualifiers.get("isolation_source", [""])[0]

    return [
        gb_record.id,
        gb_record.description,
        annotations.get("organism", ""),
        len(gb_record.seq),
        source,
        taxonomy,
        collection_date,
        isolation_source,
        str(gb_record.seq),
    ]


# Function to fetch and parse GenBank records in batches
def fetch_records(id_list, batch_size=200):
    """Download GenBank records for ``id_list`` and flatten each into a row.

    IDs are requested from the ``nucleotide`` database in slices of
    ``batch_size`` so that no single efetch call carries the whole list.

    Args:
        id_list: Sequence of nucleotide record IDs to retrieve.
        batch_size: Maximum number of IDs sent per efetch request.

    Returns:
        A list with one row (list) per parsed record, in the column order
        accession, description, organism, length, source, taxonomy,
        collection date, isolation source, sequence. Empty when ``id_list``
        is empty (no request is made in that case).
    """
    records = []
    for start in range(0, len(id_list), batch_size):
        # Slicing clamps at the end of the list, so the final batch may be
        # shorter than batch_size without any explicit bounds arithmetic.
        fetch_handle = Entrez.efetch(
            db="nucleotide",
            id=id_list[start:start + batch_size],
            rettype="gb",
            retmode="text",
        )
        try:
            for gb_record in SeqIO.parse(fetch_handle, "genbank"):
                records.append(_record_to_row(gb_record))
        finally:
            # Close the handle even if parsing a batch fails midway.
            fetch_handle.close()
    return records

# %% [code]
# Fetch the records: download and parse every ID found by the search, using
# fetch_records' default batch size, then report how many rows were built.
records = fetch_records(id_list)
print(f"Parsed {len(records)} GenBank records.")

# %% [code]
# Write to CSV: one header row followed by one row per parsed record.
output_file = "viral_genomes.csv"
# newline="" lets the csv module control line endings; the explicit UTF-8
# encoding keeps the output independent of the platform's default codec, which
# could otherwise fail on non-ASCII text in record metadata.
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow([
        "Accession",
        "Description",
        "Organism",
        "Length",
        "Source",
        "Taxonomy",
        "Collection Date",
        "Isolation Source",
        "Genomic Sequence",
    ])
    writer.writerows(records)

print(f"Successfully wrote {len(records)} records to {output_file}.")
