In [1]:
!pip install biopython

Collecting biopython
  Using cached biopython-1.85-cp312-cp312-win_amd64.whl.metadata (13 kB)
Using cached biopython-1.85-cp312-cp312-win_amd64.whl (2.8 MB)
Installing collected packages: biopython
Successfully installed biopython-1.85


In [1]:
import pandas as pd
from Bio import SeqIO
from Bio import Entrez
from http.client import IncompleteRead
import time
import os
from tqdm import tqdm  # Progress bar

# NCBI E-utilities identification.
# Credentials are read from the environment so that secrets are never stored
# in the notebook itself:
#   NCBI_EMAIL   - contact address sent with every request; falls back to the
#                  address this notebook has always used
#   NCBI_API_KEY - optional API key; left as None when the variable is unset
# NOTE(review): the key that used to be hardcoded here should be treated as
# exposed and regenerated in the NCBI account settings.
Entrez.email = os.environ.get("NCBI_EMAIL", "milindshri28@gmail.com")
Entrez.api_key = os.environ.get("NCBI_API_KEY")

def safe_efetch(db, id, rettype, retmode, max_retries=3):
    """Fetch and parse an Entrez record set, retrying on truncated responses.

    Calls ``Entrez.efetch`` and parses the response with ``Entrez.read``.
    An ``IncompleteRead`` triggers another attempt after a 5 second pause;
    any other exception is printed and aborts immediately.

    :param db: Entrez database name, passed through to ``Entrez.efetch``
    :param id: Record identifier(s), passed through to ``Entrez.efetch``
    :param rettype: Entrez return type, passed through to ``Entrez.efetch``
    :param retmode: Entrez return mode, passed through to ``Entrez.efetch``
    :param max_retries: Maximum number of attempts before giving up
    :return: The value returned by ``Entrez.read``, or ``None`` on failure
    """
    for attempt in range(max_retries):
        try:
            handle = Entrez.efetch(db=db, id=id, rettype=rettype, retmode=retmode)
            try:
                return Entrez.read(handle)
            finally:
                # Release the connection on every path, including when
                # parsing fails part-way through the response.
                handle.close()
        except IncompleteRead as e:
            print(f"IncompleteRead error: {e}. Retrying {attempt + 1}/{max_retries}...")
            # Only pause when another attempt will actually follow.
            if attempt + 1 < max_retries:
                time.sleep(5)
        except Exception as e:
            print(f"Error: {e}")
            return None
    return None

def _extract_source_qualifiers(rec):
    """Return (host, geolocation, collection date) from a record's "source" features."""
    host, geolocation, date_of_collection = "N/A", "N/A", "N/A"

    for feature in rec.get("GBSeq_feature-table", []):
        if feature.get("GBFeature_key") != "source":
            continue
        for qualifier in feature.get("GBFeature_quals", []):
            qual_name = qualifier.get("GBQualifier_name")
            qual_value = qualifier.get("GBQualifier_value")
            if qual_value is None:
                # A qualifier without a value carries nothing to record;
                # keep whatever has been found so far.
                continue
            if qual_name == "host":
                host = qual_value
            elif qual_name == "geo_loc_name":
                geolocation = qual_value
            elif qual_name == "country" and geolocation == "N/A":
                # "country" is the older name of "geo_loc_name"; use it only
                # as a fallback so the current qualifier always wins.
                geolocation = qual_value
            elif qual_name == "collection_date":
                date_of_collection = qual_value

    return host, geolocation, date_of_collection


def fetch_protein_data(query, max_records=10):
    """
    Search the NCBI protein database and collect metadata for each hit.

    Runs an ``Entrez.esearch`` for ``query``, then fetches every returned ID
    one at a time through ``safe_efetch`` (pausing one second before each
    request). IDs whose fetch fails are skipped.

    Each resulting dictionary has these keys:
    ``Accession`` (GBSeq_accession-version), ``DOM`` (GBSeq_update-date),
    ``Organism`` (GBSeq_source), ``Host``, ``Geolocation``, ``DOC``
    (collection date), ``Name`` (GBSeq_definition) and ``Size``
    (GBSeq_length). Missing fields are reported as ``"N/A"``.

    :param query: Search term for NCBI database
    :param max_records: Maximum number of records to retrieve
    :return: List of dictionaries with metadata; an empty list if the search
             or processing raises an exception
    """
    try:
        print(f"Searching for '{query}' in NCBI Protein Database...")
        handle = Entrez.esearch(db="protein", term=query, retmax=max_records)
        try:
            record = Entrez.read(handle)
        finally:
            # Close the search handle even when the response cannot be parsed.
            handle.close()

        ids = record["IdList"]
        data = []

        print(f"Fetching {len(ids)} protein records...")

        # Progress bar
        for protein_id in tqdm(ids, desc="Processing", unit="record", dynamic_ncols=True):
            time.sleep(1)  # Avoid NCBI rate limits
            records = safe_efetch("protein", protein_id, "gb", "xml")
            if not records:
                continue  # Skip if failed

            for rec in records:
                host, geolocation, date_of_collection = _extract_source_qualifiers(rec)
                data.append({
                    "Accession": rec.get("GBSeq_accession-version", "N/A"),
                    "DOM": rec.get("GBSeq_update-date", "N/A"),
                    "Organism": rec.get("GBSeq_source", "N/A"),
                    "Host": host,
                    "Geolocation": geolocation,
                    "DOC": date_of_collection,
                    "Name": rec.get("GBSeq_definition", "N/A"),
                    "Size": rec.get("GBSeq_length", "N/A"),
                })

        print("✅ Fetching complete!")
        return data
    except Exception as e:
        print(f"Error: {e}")
        return []


In [2]:
# Driver cell: run the search and keep the results for the cells below.
if __name__ == "__main__":
    # Entrez search term handed to fetch_protein_data.
    search_query = 'Severe fever with thrombocytopenia virus AND "nucleocapsid" '  # Replace with your query
    # Upper bound on the number of search hits to retrieve.
    max_records_to_fetch = 5    # Adjust as needed
    
    print("Fetching data from NCBI...")
    # Holds the list returned by fetch_protein_data (protein records, despite
    # the variable's name); it is read again by the export cell further down.
    nucleotide_data = fetch_protein_data(search_query, max_records = max_records_to_fetch)

# Sits outside the guard, so it prints whether or not any records came back.
print("Done")

Fetching data from NCBI...
Searching for 'Severe fever with thrombocytopenia AND "nucleocapsid" ' in NCBI Protein Database...
Fetching 5 protein records...


Processing: 100%|████████████████████████████████████████████████████████████████████| 5/5 [00:12<00:00,  2.46s/record]

✅ Fetching complete!
Done





In [4]:
# Export step: write whatever the fetch cell collected to an Excel workbook.
# Assumes `nucleotide_data` is a list of dictionaries (one per record).
data = nucleotide_data

if not data:
    # Nothing was collected, so no file is written.
    print("No data available to save.")
else:
    # Build the table from the collected records.
    df = pd.DataFrame(data)

    # Write the workbook without the DataFrame index column.
    output_file = "Sup_Data.xlsx"
    df.to_excel(output_file, index=False, engine="openpyxl")

    print(f"Excel file saved as '{output_file}'")


Excel file saved as 'Sup_Data.xlsx'
