# Halobacteria

In [6]:
import requests
import csv
import sys

# ==========================================
# CONFIGURATION
# ==========================================

# 1. User Identification (REQUIRED by UniProt to avoid 403 Errors)
# Please replace with your actual email so UniProt can contact you if the script causes issues.
CONTACT_EMAIL = "uusshas12@gmail.com"

# 2. Define Datasets & Queries
# Limits removed to collect ALL available data as requested.
datasets = {
    # "thermoproteota": {
    #     "query": "(taxonomy_id:28889) AND (fragment:false) AND (length:[100 TO 800])",
    #     "limit": None    # Download all (~222k)
    # },
    "salt_extreme_halobacteria": {
        "query": "(taxonomy_id:183963) AND (fragment:false) AND (length:[100 TO 800]) AND NOT (keyword:methanogen) AND (reviewed:true)",
        "limit": None    # Download all (~1M)
    }
}

# 3. Define Columns (Mapped to UniProt API Fields)
fields = [
    "id",                       # Entry Name
    "gene_names",               # Gene Names
    "organism_name",            # Organism
    "organism_id",              # Organism (ID)
    "protein_name",             # Protein names
    "xref_proteomes",           # Proteomes (Group by this later)
    "fragment",                 # Fragment
    "length",                   # Length
    "sequence",                 # Sequence
    "absorption",               # Absorption
    "ft_act_site",              # Active site
    "ft_binding",               # Binding site
    "cc_catalytic_activity",    # Catalytic activity
    "cc_cofactor",              # Cofactor
    "ft_dna_bind",              # DNA binding
    "cc_activity_regulation",   # Activity regulation
    "cc_function",              # Function [CC]
    "kinetics",                 # Kinetics
    "cc_pathway",               # Pathway
    "ph_dependence",            # pH dependence
    "ft_site",                  # Site
    "temp_dependence",          # Temperature dependence
    "reviewed",                 # Reviewed
    "go_p",                     # GO (biological process)
    "go_c",                     # GO (cellular component)
    "go",                       # Gene Ontology (GO)
    "go_f",                     # GO (molecular function)
    "go_id",                    # Gene Ontology IDs
    "ft_mutagen",               # Mutagenesis (Lab mutations)
    "ft_variant",               # Natural variant (Crucial for disease/evolution)
    "cc_subcellular_location",  # Subcellular location [CC]
    "structure_3d",             # 3D
    "protein_families",         # Protein families
    "cc_similarity",            # Sequence similarities
    "xref_alphafolddb"          # AlphaFoldDB
]

API_URL = "https://rest.uniprot.org/uniprotkb/stream"

# ==========================================
# DOWNLOAD FUNCTION
# ==========================================
def download_data(name, config):
    filename = f"{name}.csv"
    print(f"\n[START] Collecting {name}...")
    print(f"Query: {config['query']}")
    
    # UniProt requires a User-Agent header with an email
    headers = {
        "User-Agent": f"PythonScript/1.0 ({CONTACT_EMAIL})"
    }

    params = {
        "query": config["query"],
        "format": "tsv",     # TSV is safer to stream
        "fields": ",".join(fields)
    }

    count = 0
    
    try:
        # stream=True is essential for large datasets
        with requests.get(API_URL, params=params, headers=headers, stream=True, timeout=60) as response:
            response.raise_for_status()
            
            # iter_lines yields byte strings, we decode them
            lines = response.iter_lines(decode_unicode=True)
            
            with open(filename, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                
                # Write Header
                try:
                    header = next(lines).split("\t")
                    writer.writerow(header)
                except StopIteration:
                    print(f"[ERROR] No data found for {name}")
                    return

                # Write Data
                for line in lines:
                    if line:
                        writer.writerow(line.split("\t"))
                        count += 1
                        
                        # Progress indicator (every 50k rows for speed)
                        if count % 50000 == 0:
                            sys.stdout.write(f"\rRows saved: {count}")
                            sys.stdout.flush()
                            break
                        
                        if config["limit"] and count >= config["limit"]:
                            print(f"\n[STOP] Reached limit of {config['limit']}")
                            break
        
        print(f"\n[DONE] Saved {count} rows to {filename}")
        
    except requests.exceptions.HTTPError as err:
        print(f"\n[ERROR] HTTP Error: {err}")
        if response.status_code == 403:
            print("Hint: 403 usually means UniProt blocked the script. Check your CONTACT_EMAIL.")
    except Exception as e:
        print(f"\n[ERROR] Failed to download {name}: {e}")

# ==========================================
# RUN
# ==========================================
if __name__ == "__main__":
    for name, config in datasets.items():
        download_data(name, config)


[START] Collecting salt_extreme_halobacteria...
Query: (taxonomy_id:183963) AND (fragment:false) AND (length:[100 TO 800]) AND NOT (keyword:methanogen) AND (reviewed:true)

[DONE] Saved 1872 rows to salt_extreme_halobacteria.csv


In [7]:
import requests
import csv
import sys

# ==========================================
# CONFIGURATION
# ==========================================

# 1. User Identification (REQUIRED by UniProt to avoid 403 Errors)
# Please replace with your actual email so UniProt can contact you if the script causes issues.
CONTACT_EMAIL = "uusshas12@gmail.com"

# 2. Define Datasets & Queries
# Limits removed to collect ALL available data as requested.
datasets = {
    "heat_extremethermoproteota": {
        "query": "(taxonomy_id:2281 OR taxonomy_id:2269 OR taxonomy_id:189538) AND (fragment:false) AND (length:[100 TO 800]) AND (reviewed:true)",
        "limit": None    # Download all (~222k)
    }
#     "salt_extreme_halobacteria": {
#         "query": "(taxonomy_id:183963) AND (fragment:false) AND (length:[100 TO 800]) AND NOT (keyword:methanogen) AND (reviewed:false)",
#         "limit": None    # Download all (~1M)
#     }
}

# 3. Define Columns (Mapped to UniProt API Fields)
fields = [
    "id",                       # Entry Name
    "gene_names",               # Gene Names
    "organism_name",            # Organism
    "organism_id",              # Organism (ID)
    "protein_name",             # Protein names
    "xref_proteomes",           # Proteomes (Group by this later)
    "fragment",                 # Fragment
    "length",                   # Length
    "sequence",                 # Sequence
    "absorption",               # Absorption
    "ft_act_site",              # Active site
    "ft_binding",               # Binding site
    "cc_catalytic_activity",    # Catalytic activity
    "cc_cofactor",              # Cofactor
    "ft_dna_bind",              # DNA binding
    "cc_activity_regulation",   # Activity regulation
    "cc_function",              # Function [CC]
    "kinetics",                 # Kinetics
    "cc_pathway",               # Pathway
    "ph_dependence",            # pH dependence
    "ft_site",                  # Site
    "temp_dependence",          # Temperature dependence
    "reviewed",                 # Reviewed
    "go_p",                     # GO (biological process)
    "go_c",                     # GO (cellular component)
    "go",                       # Gene Ontology (GO)
    "go_f",                     # GO (molecular function)
    "go_id",                    # Gene Ontology IDs
    "ft_mutagen",               # Mutagenesis (Lab mutations)
    "ft_variant",               # Natural variant (Crucial for disease/evolution)
    "cc_subcellular_location",  # Subcellular location [CC]
    "structure_3d",             # 3D
    "protein_families",         # Protein families
    "cc_similarity",            # Sequence similarities
    "xref_alphafolddb"          # AlphaFoldDB
]

API_URL = "https://rest.uniprot.org/uniprotkb/stream"

# ==========================================
# DOWNLOAD FUNCTION
# ==========================================
def download_data(name, config):
    filename = f"{name}.csv"
    print(f"\n[START] Collecting {name}...")
    print(f"Query: {config['query']}")
    
    # UniProt requires a User-Agent header with an email
    headers = {
        "User-Agent": f"PythonScript/1.0 ({CONTACT_EMAIL})"
    }

    params = {
        "query": config["query"],
        "format": "tsv",     # TSV is safer to stream
        "fields": ",".join(fields)
    }

    count = 0
    
    try:
        # stream=True is essential for large datasets
        with requests.get(API_URL, params=params, headers=headers, stream=True, timeout=60) as response:
            response.raise_for_status()
            
            # iter_lines yields byte strings, we decode them
            lines = response.iter_lines(decode_unicode=True)
            
            with open(filename, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                
                # Write Header
                try:
                    header = next(lines).split("\t")
                    writer.writerow(header)
                except StopIteration:
                    print(f"[ERROR] No data found for {name}")
                    return

                # Write Data
                for line in lines:
                    if line:
                        writer.writerow(line.split("\t"))
                        count += 1
                        
                        # Progress indicator (every 50k rows for speed)
                        if count % 50000 == 0:
                            sys.stdout.write(f"\rRows saved: {count}")
                            sys.stdout.flush()
                            break
                        
                        if config["limit"] and count >= config["limit"]:
                            print(f"\n[STOP] Reached limit of {config['limit']}")
                            break
        
        print(f"\n[DONE] Saved {count} rows to {filename}")
        
    except requests.exceptions.HTTPError as err:
        print(f"\n[ERROR] HTTP Error: {err}")
        if response.status_code == 403:
            print("Hint: 403 usually means UniProt blocked the script. Check your CONTACT_EMAIL.")
    except Exception as e:
        print(f"\n[ERROR] Failed to download {name}: {e}")

# ==========================================
# RUN
# ==========================================
if __name__ == "__main__":
    for name, config in datasets.items():
        download_data(name, config)


[START] Collecting heat_extremethermoproteota...
Query: (taxonomy_id:2281 OR taxonomy_id:2269 OR taxonomy_id:189538) AND (fragment:false) AND (length:[100 TO 800]) AND (reviewed:true)

[DONE] Saved 2284 rows to heat_extremethermoproteota.csv


In [8]:
import requests
import csv
import sys

# ==========================================
# CONFIGURATION
# ==========================================

# 1. User Identification (REQUIRED by UniProt to avoid 403 Errors)
# Please replace with your actual email so UniProt can contact you if the script causes issues.
CONTACT_EMAIL = "uusshas12@gmail.com"

# 2. Define Datasets & Queries
# Limits removed to collect ALL available data as requested.
datasets = {
    "control_mesophilic": {
        "query": "(taxonomy_id:1236) AND (fragment:false) AND (length:[100 TO 800]) AND (reviewed:true)",
        "limit": None    # Download all (~222k)
    }
}

# 3. Define Columns (Mapped to UniProt API Fields)
fields = [
    "id",                       # Entry Name
    "gene_names",               # Gene Names
    "organism_name",            # Organism
    "organism_id",              # Organism (ID)
    "protein_name",             # Protein names
    "xref_proteomes",           # Proteomes (Group by this later)
    "fragment",                 # Fragment
    "length",                   # Length
    "sequence",                 # Sequence
    "absorption",               # Absorption
    "ft_act_site",              # Active site
    "ft_binding",               # Binding site
    "cc_catalytic_activity",    # Catalytic activity
    "cc_cofactor",              # Cofactor
    "ft_dna_bind",              # DNA binding
    "cc_activity_regulation",   # Activity regulation
    "cc_function",              # Function [CC]
    "kinetics",                 # Kinetics
    "cc_pathway",               # Pathway
    "ph_dependence",            # pH dependence
    "ft_site",                  # Site
    "temp_dependence",          # Temperature dependence
    "reviewed",                 # Reviewed
    "go_p",                     # GO (biological process)
    "go_c",                     # GO (cellular component)
    "go",                       # Gene Ontology (GO)
    "go_f",                     # GO (molecular function)
    "go_id",                    # Gene Ontology IDs
    "ft_mutagen",               # Mutagenesis (Lab mutations)
    "ft_variant",               # Natural variant (Crucial for disease/evolution)
    "cc_subcellular_location",  # Subcellular location [CC]
    "structure_3d",             # 3D
    "protein_families",         # Protein families
    "cc_similarity",            # Sequence similarities
    "xref_alphafolddb"          # AlphaFoldDB
]

API_URL = "https://rest.uniprot.org/uniprotkb/stream"

# ==========================================
# DOWNLOAD FUNCTION
# ==========================================
def download_data(name, config):
    filename = f"{name}.csv"
    print(f"\n[START] Collecting {name}...")
    print(f"Query: {config['query']}")
    
    # UniProt requires a User-Agent header with an email
    headers = {
        "User-Agent": f"PythonScript/1.0 ({CONTACT_EMAIL})"
    }

    params = {
        "query": config["query"],
        "format": "tsv",     # TSV is safer to stream
        "fields": ",".join(fields)
    }

    count = 0
    
    try:
        # stream=True is essential for large datasets
        with requests.get(API_URL, params=params, headers=headers, stream=True, timeout=60) as response:
            response.raise_for_status()
            
            # iter_lines yields byte strings, we decode them
            lines = response.iter_lines(decode_unicode=True)
            
            with open(filename, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                
                # Write Header
                try:
                    header = next(lines).split("\t")
                    writer.writerow(header)
                except StopIteration:
                    print(f"[ERROR] No data found for {name}")
                    return

                # Write Data
                for line in lines:
                    if line:
                        writer.writerow(line.split("\t"))
                        count += 1
                        
                        # Progress indicator (every 50k rows for speed)
                        if count % 50000 == 0:
                            sys.stdout.write(f"\rRows saved: {count}")
                            sys.stdout.flush()
                            break
                        
                        if config["limit"] and count >= config["limit"]:
                            print(f"\n[STOP] Reached limit of {config['limit']}")
                            break
        
        print(f"\n[DONE] Saved {count} rows to {filename}")
        
    except requests.exceptions.HTTPError as err:
        print(f"\n[ERROR] HTTP Error: {err}")
        if response.status_code == 403:
            print("Hint: 403 usually means UniProt blocked the script. Check your CONTACT_EMAIL.")
    except Exception as e:
        print(f"\n[ERROR] Failed to download {name}: {e}")

# ==========================================
# RUN
# ==========================================
if __name__ == "__main__":
    for name, config in datasets.items():
        download_data(name, config)


[START] Collecting control_mesophilic...
Query: (taxonomy_id:1236) AND (fragment:false) AND (length:[100 TO 800]) AND (reviewed:true)
Rows saved: 50000
[DONE] Saved 50000 rows to control_mesophilic.csv


In [9]:
import requests
import csv
import sys

# ==========================================
# CONFIGURATION
# ==========================================

# 1. User Identification (REQUIRED by UniProt to avoid 403 Errors)
# Please replace with your actual email so UniProt can contact you if the script causes issues.
CONTACT_EMAIL = "uusshas12@gmail.com"

# 2. Define Datasets & Queries
# Limits removed to collect ALL available data as requested.
datasets = {
    "control_mesophilic_archaea": {
        "query": "(taxonomy_id:183925) AND (fragment:false) AND (length:[100 TO 800]) AND (reviewed:true)",
        "limit": None    # Download all (~222k)
    }
}

# 3. Define Columns (Mapped to UniProt API Fields)
fields = [
    "id",                       # Entry Name
    "gene_names",               # Gene Names
    "organism_name",            # Organism
    "organism_id",              # Organism (ID)
    "protein_name",             # Protein names
    "xref_proteomes",           # Proteomes (Group by this later)
    "fragment",                 # Fragment
    "length",                   # Length
    "sequence",                 # Sequence
    "absorption",               # Absorption
    "ft_act_site",              # Active site
    "ft_binding",               # Binding site
    "cc_catalytic_activity",    # Catalytic activity
    "cc_cofactor",              # Cofactor
    "ft_dna_bind",              # DNA binding
    "cc_activity_regulation",   # Activity regulation
    "cc_function",              # Function [CC]
    "kinetics",                 # Kinetics
    "cc_pathway",               # Pathway
    "ph_dependence",            # pH dependence
    "ft_site",                  # Site
    "temp_dependence",          # Temperature dependence
    "reviewed",                 # Reviewed
    "go_p",                     # GO (biological process)
    "go_c",                     # GO (cellular component)
    "go",                       # Gene Ontology (GO)
    "go_f",                     # GO (molecular function)
    "go_id",                    # Gene Ontology IDs
    "ft_mutagen",               # Mutagenesis (Lab mutations)
    "ft_variant",               # Natural variant (Crucial for disease/evolution)
    "cc_subcellular_location",  # Subcellular location [CC]
    "structure_3d",             # 3D
    "protein_families",         # Protein families
    "cc_similarity",            # Sequence similarities
    "xref_alphafolddb"          # AlphaFoldDB
]

API_URL = "https://rest.uniprot.org/uniprotkb/stream"

# ==========================================
# DOWNLOAD FUNCTION
# ==========================================
def download_data(name, config):
    filename = f"{name}.csv"
    print(f"\n[START] Collecting {name}...")
    print(f"Query: {config['query']}")
    
    # UniProt requires a User-Agent header with an email
    headers = {
        "User-Agent": f"PythonScript/1.0 ({CONTACT_EMAIL})"
    }

    params = {
        "query": config["query"],
        "format": "tsv",     # TSV is safer to stream
        "fields": ",".join(fields)
    }

    count = 0
    
    try:
        # stream=True is essential for large datasets
        with requests.get(API_URL, params=params, headers=headers, stream=True, timeout=60) as response:
            response.raise_for_status()
            
            # iter_lines yields byte strings, we decode them
            lines = response.iter_lines(decode_unicode=True)
            
            with open(filename, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                
                # Write Header
                try:
                    header = next(lines).split("\t")
                    writer.writerow(header)
                except StopIteration:
                    print(f"[ERROR] No data found for {name}")
                    return

                # Write Data
                for line in lines:
                    if line:
                        writer.writerow(line.split("\t"))
                        count += 1
                        
                        # Progress indicator (every 50k rows for speed)
                        if count % 50000 == 0:
                            sys.stdout.write(f"\rRows saved: {count}")
                            sys.stdout.flush()
                            break
                        
                        if config["limit"] and count >= config["limit"]:
                            print(f"\n[STOP] Reached limit of {config['limit']}")
                            break
        
        print(f"\n[DONE] Saved {count} rows to {filename}")
        
    except requests.exceptions.HTTPError as err:
        print(f"\n[ERROR] HTTP Error: {err}")
        if response.status_code == 403:
            print("Hint: 403 usually means UniProt blocked the script. Check your CONTACT_EMAIL.")
    except Exception as e:
        print(f"\n[ERROR] Failed to download {name}: {e}")

# ==========================================
# RUN
# ==========================================
if __name__ == "__main__":
    for name, config in datasets.items():
        download_data(name, config)


[START] Collecting control_mesophilic_archaea...
Query: (taxonomy_id:183925) AND (fragment:false) AND (length:[100 TO 800]) AND (reviewed:true)

[DONE] Saved 1114 rows to control_mesophilic_archaea.csv
