In [1]:
import requests
import csv
import sys

# ==========================================
# CONFIGURATION
# ==========================================

# 1. User Identification (REQUIRED by UniProt to avoid 403 Errors)
# Put your own address here so UniProt can reach you if the script misbehaves.
CONTACT_EMAIL = "uusshas12@gmail.com"

# 2. Define Datasets & Queries
# Each entry pairs a UniProt query with a row cap; a cap of None means
# "download every matching entry".
#
# Currently disabled dataset (add it to the dict below to re-enable, ~222k rows):
#   "thermoproteota" -> query "(taxonomy_id:28889) AND (fragment:false) AND (length:[100 TO 800])",
#                       limit None
datasets = {
    "halobacteria": dict(
        # The individual clauses are joined with " AND " into one query string.
        query=" AND ".join(
            [
                "(taxonomy_id:183963)",
                "(fragment:false)",
                "(length:[100 TO 800])",
                "NOT (keyword:methanogen)",
            ]
        ),
        limit=None,  # Download all (~1M)
    ),
}

# 3. Define Columns (Mapped to UniProt API Fields)
# Each pair is (API field name, short human-readable description). Only the
# field names are kept; their order here is the order sent to the API.
fields = [
    field_name
    for field_name, _description in (
        ("id", "Entry Name"),
        ("gene_names", "Gene Names"),
        ("organism_name", "Organism"),
        ("organism_id", "Organism (ID)"),
        ("protein_name", "Protein names"),
        ("xref_proteomes", "Proteomes (Group by this later)"),
        ("fragment", "Fragment"),
        ("length", "Length"),
        ("sequence", "Sequence"),
        ("absorption", "Absorption"),
        ("ft_act_site", "Active site"),
        ("ft_binding", "Binding site"),
        ("cc_catalytic_activity", "Catalytic activity"),
        ("cc_cofactor", "Cofactor"),
        ("ft_dna_bind", "DNA binding"),
        ("cc_activity_regulation", "Activity regulation"),
        ("cc_function", "Function [CC]"),
        ("kinetics", "Kinetics"),
        ("cc_pathway", "Pathway"),
        ("ph_dependence", "pH dependence"),
        ("ft_site", "Site"),
        ("temp_dependence", "Temperature dependence"),
        ("reviewed", "Reviewed"),
        ("go_p", "GO (biological process)"),
        ("go_c", "GO (cellular component)"),
        ("go", "Gene Ontology (GO)"),
        ("go_f", "GO (molecular function)"),
        ("go_id", "Gene Ontology IDs"),
        ("ft_mutagen", "Mutagenesis (Lab mutations)"),
        ("ft_variant", "Natural variant (Crucial for disease/evolution)"),
        ("cc_subcellular_location", "Subcellular location [CC]"),
        ("structure_3d", "3D"),
        ("protein_families", "Protein families"),
        ("cc_similarity", "Sequence similarities"),
        ("xref_alphafolddb", "AlphaFoldDB"),
    )
]

# Streaming endpoint used for every download in this cell.
API_URL = "https://rest.uniprot.org/uniprotkb/stream"

# ==========================================
# DOWNLOAD FUNCTION
# ==========================================
def download_data(name, config):
    """
    Stream one UniProt query result and save it as ``<name>.csv``.

    The endpoint is asked for TSV; every non-empty line is split on tabs and
    re-written through ``csv.writer`` so that embedded commas/quotes are escaped.

    :param name: Dataset label; also used as the stem of the output file name.
    :param config: Mapping with a ``"query"`` string and an optional ``"limit"``
                   (maximum number of data rows; ``None``, ``0`` or a missing key
                   means no cap).
    :return: None. Progress and errors are reported on stdout.
    """
    filename = f"{name}.csv"
    print(f"\n[START] Collecting {name}...")
    print(f"Query: {config['query']}")

    # UniProt requires a User-Agent header with an email
    headers = {
        "User-Agent": f"PythonScript/1.0 ({CONTACT_EMAIL})"
    }

    params = {
        "query": config["query"],
        "format": "tsv",     # TSV is safer to stream
        "fields": ",".join(fields)
    }

    # A missing "limit" key is treated like None instead of aborting mid-download.
    limit = config.get("limit")
    count = 0

    try:
        # stream=True is essential for large datasets
        with requests.get(API_URL, params=params, headers=headers, stream=True, timeout=60) as response:
            response.raise_for_status()

            # Iterate over raw bytes and decode each line as UTF-8 ourselves.
            # With decode_unicode=True requests would use its guessed encoding
            # (ISO-8859-1 for text/* responses without a charset) and would also
            # split on every str.splitlines() boundary; the UTF-8 byte 0x85 then
            # turns into NEL and cuts a row in two. Byte-wise iteration only
            # splits on \n / \r.
            lines = response.iter_lines()

            with open(filename, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)

                # Write Header (the first line of the stream)
                header_line = next(lines, None)
                if header_line is None:
                    print(f"[ERROR] No data found for {name}")
                    return
                writer.writerow(header_line.decode("utf-8", errors="replace").split("\t"))

                # Write Data
                for raw_line in lines:
                    if not raw_line:
                        continue  # skip keep-alive / blank lines

                    writer.writerow(raw_line.decode("utf-8", errors="replace").split("\t"))
                    count += 1

                    # Progress indicator (every 50k rows for speed)
                    if count % 50000 == 0:
                        sys.stdout.write(f"\rRows saved: {count}")
                        sys.stdout.flush()

                    if limit and count >= limit:
                        print(f"\n[STOP] Reached limit of {limit}")
                        break

        print(f"\n[DONE] Saved {count} rows to {filename}")

    except requests.exceptions.HTTPError as err:
        print(f"\n[ERROR] HTTP Error: {err}")
        # Read the status from the exception itself rather than from the
        # `response` local, which is only bound if requests.get() returned.
        status_code = getattr(getattr(err, "response", None), "status_code", None)
        if status_code == 403:
            print("Hint: 403 usually means UniProt blocked the script. Check your CONTACT_EMAIL.")
    except Exception as e:
        print(f"\n[ERROR] Failed to download {name}: {e}")
        if count:
            # Make it explicit that the file on disk is truncated.
            print(f"[WARN] {filename} is incomplete: only {count} rows were written before the failure.")

# ==========================================
# RUN
# ==========================================
if __name__ == "__main__":
    # Fetch every configured dataset in turn; each one is written to its own CSV file.
    for dataset_name in datasets:
        download_data(dataset_name, datasets[dataset_name])


[START] Collecting halobacteria...
Query: (taxonomy_id:183963) AND (fragment:false) AND (length:[100 TO 800]) AND NOT (keyword:methanogen)
Rows saved: 1000000
[DONE] Saved 1013662 rows to halobacteria.csv


In [None]:
import csv
import os
import shutil
import sys
import time

import requests

# ==========================================
# CONFIGURATION
# ==========================================

# 1. User Identification (REQUIRED by UniProt to avoid 403 Errors)
# Put your own address here so UniProt can reach you if the script misbehaves.
CONTACT_EMAIL = "uusshas12@gmail.com"

# 2. Define Columns (Mapped to UniProt API Fields)
# Whitespace-separated field names; their order is the order sent to the API.
fields = """
    id gene_names organism_name organism_id protein_name
    xref_proteomes fragment length sequence absorption
    ft_act_site ft_binding cc_catalytic_activity cc_cofactor
    ft_dna_bind cc_activity_regulation cc_function kinetics
    cc_pathway ph_dependence ft_site temp_dependence
    reviewed go_p go_c go go_f go_id ft_mutagen
    ft_variant cc_subcellular_location structure_3d
    protein_families cc_similarity xref_alphafolddb
""".split()

# Streaming endpoint used for every chunk download in this cell.
API_URL = "https://rest.uniprot.org/uniprotkb/stream"

# ==========================================
# DOWNLOAD FUNCTION (Supports Write and Append)
# ==========================================
def _commit_staged_rows(staging_path, filename, write_mode):
    """Move the rows of a fully downloaded chunk from the staging file into `filename`."""
    if write_mode == "w":
        # Overwrite: swapping the staged file in replaces the destination in one step.
        os.replace(staging_path, filename)
        return

    # Any other mode (normally 'a'): copy the staged text verbatim. newline=""
    # on both sides keeps the csv module's line terminators untouched.
    with open(staging_path, "r", newline="", encoding="utf-8") as staged, \
            open(filename, write_mode, newline="", encoding="utf-8") as destination:
        shutil.copyfileobj(staged, destination)
    os.remove(staging_path)


def download_data(name, query, filename, write_mode, write_header, limit=None):
    """
    Downloads data from UniProt using the streaming API and writes to a file.

    Each attempt is streamed into a staging file (``<filename>.part``) and only
    copied into ``filename`` once the whole chunk has arrived. A retry after a
    dropped connection therefore starts from a clean slate: the destination never
    receives partial or duplicated rows, and the returned count covers the
    successful attempt only.

    :param name: A descriptive name for the current operation (e.g., 'Gammaproteobacteria 100-200').
    :param query: The specific UniProt API query string.
    :param filename: The file to write to (will be opened in write_mode).
    :param write_mode: File mode ('w' for write/overwrite, 'a' for append).
    :param write_header: Boolean flag to determine if the header row should be written.
    :param limit: Optional row limit for testing.
    :return: The number of rows saved in this run, or 0 on failure.
    """
    print(f"\n[START] Collecting {name}...")
    print(f"Query: {query}")

    # UniProt requires a User-Agent header with an email
    headers = {
        "User-Agent": f"PythonScript/1.0 ({CONTACT_EMAIL})"
    }

    params = {
        "query": query,
        "format": "tsv",     # TSV is safer to stream
        "fields": ",".join(fields)
    }

    max_retries = 5
    initial_delay = 1
    staging_path = f"{filename}.part"

    try:
        for attempt in range(max_retries):
            # Rows from an earlier, failed attempt must not be counted.
            count = 0

            try:
                # stream=True is essential for large datasets
                with requests.get(API_URL, params=params, headers=headers, stream=True, timeout=120) as response:
                    response.raise_for_status()

                    # Iterate over raw bytes and decode every line as UTF-8 ourselves.
                    # With decode_unicode=True requests would use its guessed encoding
                    # (ISO-8859-1 for text/* responses without a charset) and split on
                    # every str.splitlines() boundary, which can cut a row in two.
                    lines = response.iter_lines()

                    # "w" truncates whatever a previous attempt left in the staging file.
                    with open(staging_path, "w", newline="", encoding="utf-8") as staged:
                        writer = csv.writer(staged)

                        # --- Header Handling ---
                        # The first line of the stream is the header row; it is
                        # always consumed but only written when requested.
                        header_line = next(lines, None)

                        if header_line is not None:
                            if write_header and header_line:
                                writer.writerow(header_line.decode("utf-8", errors="replace").split("\t"))

                            # --- Write Data ---
                            for raw_line in lines:
                                if not raw_line:
                                    continue  # skip keep-alive / blank lines

                                writer.writerow(raw_line.decode("utf-8", errors="replace").split("\t"))
                                count += 1

                                # Progress indicator (every 50k rows for speed)
                                if count % 50000 == 0:
                                    sys.stdout.write(f"\rRows saved for chunk: {count}")
                                    sys.stdout.flush()

                                if limit and count >= limit:
                                    print(f"\n[STOP] Reached limit of {limit} for this chunk.")
                                    break

                # The chunk arrived completely: publish it to the destination.
                # (For an empty stream this still creates/truncates the file
                # according to write_mode.)
                _commit_staged_rows(staging_path, filename, write_mode)

                if header_line is None:
                    print(f"[ERROR] No data found for {name} in this range.")
                    return 0  # No data

                print(f"\n[CHUNK DONE] Saved {count} rows for {name}.")
                return count  # Success

            except requests.exceptions.HTTPError as err:
                print(f"\n[ERROR] HTTP Error on attempt {attempt + 1}/{max_retries}: {err}")
                # Read the status from the exception itself; the `response` local may be
                # unbound or left over from an earlier attempt.
                status_code = getattr(getattr(err, "response", None), "status_code", None)
                if status_code == 403:
                    print("Hint: 403 usually means UniProt blocked the script. Check your CONTACT_EMAIL.")
                    # Non-retriable error for 403, as backoff won't help
                    return 0
                if status_code is not None and 400 <= status_code < 500 and status_code not in (408, 429):
                    # Other client errors (e.g. 400 for a malformed query) are permanent too;
                    # only timeouts (408) and rate limiting (429) are worth retrying.
                    print(f"[ERROR] HTTP {status_code} is a client-side error; not retrying.")
                    return 0
            except requests.exceptions.RequestException as e:
                print(f"\n[ERROR] Request failed on attempt {attempt + 1}/{max_retries}: {e}")
            except Exception as e:
                print(f"\n[FATAL ERROR] Failed to download {name}: {e}")
                return 0  # Fatal error

            # Exponential Backoff (for retriable errors)
            if attempt < max_retries - 1:
                delay = initial_delay * (2 ** attempt)
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)

        print(f"\n[FAILED] Failed to complete download for {name} after {max_retries} attempts.")
        return 0
    finally:
        # Never leave a half-written staging file behind.
        if os.path.exists(staging_path):
            os.remove(staging_path)

# ==========================================
# RUN
# ==========================================
if __name__ == "__main__":

    gamma_name = "gammaproteobacteria"
    gamma_filename = f"{gamma_name}.csv"

    # Define the required modular length ranges
    length_ranges = [
        (100, 200), (201, 300), (301, 400), (401, 500),
        (501, 600), (601, 700), (701, 800)
    ]

    # Base query components for Gammaproteobacteria (Taxonomy ID 1236)
    base_query_part = "(taxonomy_id:1236) AND (fragment:false)"

    print(f"--- Starting Modular Download for {gamma_name} (Total {len(length_ranges)} blocks) ---")

    # True until some block has actually put rows into the output file. While it
    # is True the next block (re)creates the file and writes the header.
    is_first_chunk = True
    total_count = 0
    # Length ranges for which download_data() reported 0 rows.
    empty_ranges = []

    for start_len, end_len in length_ranges:
        # Construct the full query for the specific length range
        range_query_part = f"(length:[{start_len} TO {end_len}])"
        full_query = f"{base_query_part} AND {range_query_part}"

        # Determine file write mode and header flag
        # 1. Until a block has saved rows, use 'w' (write/overwrite) and write the header.
        # 2. Afterwards use 'a' (append) and skip the header.
        write_mode = "w" if is_first_chunk else "a"
        write_header = is_first_chunk

        chunk_count = download_data(
            name=f"{gamma_name} (Length {start_len}-{end_len})",
            query=full_query,
            filename=gamma_filename,
            write_mode=write_mode,
            write_header=write_header
        )

        total_count += chunk_count

        if chunk_count > 0:
            # The file now exists with its header; later blocks append to it.
            is_first_chunk = False
        else:
            # download_data() returns 0 both when a block fails and when it
            # matches nothing. Either way the file may lack its header, so keep
            # the overwrite+header mode for the next block and record the range.
            empty_ranges.append((start_len, end_len))

    print(f"\n--- Modular Download Complete for {gamma_name}. ---")
    print(f"Total Rows Saved to {gamma_filename}: {total_count}")
    if empty_ranges:
        print(f"[WARN] No rows were saved for length ranges {empty_ranges} (download failed or no matches).")

    # NOTE: Other datasets defined in the original script have been omitted from the
    # main execution block, as only the gammaproteobacteria modular download was requested.