In [11]:
import wget
import os
import json

def download_pdb_files(json_path, download_folder, min_db_len=200):
    # Ensure the download folder exists
    os.makedirs(download_folder, exist_ok=True)
    
    # Load JSON data
    with open(json_path, 'r') as file:
        data = json.load(file)
    
    # Base URL for downloading structures
    base_url = "https://api.esmatlas.com/fetchPredictedStructure/"
    
    # Iterate over each item to extract .pdb filenames and form URLs
    for item in data:
        for alignment in item.get("alignments", []):
            target = alignment.get("target", "")
            db_len = alignment.get("dbLen", 0)  # Get dbLen, default to 0 if missing
            
            # Print statement to verify dbLen values (for debugging)
            print(f"Checking {target}: dbLen = {db_len}")

            # Filter by dbLen threshold
            if db_len >= min_db_len and target.endswith(".pdb.gz"):
                pdb_id = target.rstrip(".gz")  # Strip .gz to get the correct pdb_id
                download_url = f"{base_url}{pdb_id}"
                output_path = os.path.join(download_folder, pdb_id)
                
                print(f"Downloading {pdb_id} with dbLen = {db_len}...")
                wget.download(download_url, out=output_path)  # Download using Python wget
                print(f"\nDownloaded {pdb_id} with dbLen = {db_len}")
            else:
                print(f"Skipping {target} due to dbLen = {db_len} (below threshold)")

# Specify paths (edit these as needed)
json_path = r"C:\Users\henry\OneDrive\Documents\foldseek\input.json"
download_folder = r"C:\Users\henry\OneDrive\Documents\foldseek\output_structures2"

# Specify minimum dbLen threshold for downloading
min_db_len = 300

# Run the download function
download_pdb_files(json_path, download_folder, min_db_len)
print("\nDownload completed!")


Checking MGYP000683091819.pdb.gz: dbLen = 219
Skipping MGYP000683091819.pdb.gz due to dbLen = 219 (below threshold)
Checking MGYP001023032885.pdb.gz: dbLen = 136
Skipping MGYP001023032885.pdb.gz due to dbLen = 136 (below threshold)
Checking MGYP002507125006.pdb.gz: dbLen = 281
Skipping MGYP002507125006.pdb.gz due to dbLen = 281 (below threshold)
Checking MGYP000306886148.pdb.gz: dbLen = 285
Skipping MGYP000306886148.pdb.gz due to dbLen = 285 (below threshold)
Checking MGYP002682116782.pdb.gz: dbLen = 159
Skipping MGYP002682116782.pdb.gz due to dbLen = 159 (below threshold)
Checking MGYP000904335190.pdb.gz: dbLen = 197
Skipping MGYP000904335190.pdb.gz due to dbLen = 197 (below threshold)
Checking MGYP001850178658.pdb.gz: dbLen = 153
Skipping MGYP001850178658.pdb.gz due to dbLen = 153 (below threshold)
Checking MGYP000985620284.pdb.gz: dbLen = 227
Skipping MGYP000985620284.pdb.gz due to dbLen = 227 (below threshold)
Checking MGYP003595924315.pdb.gz: dbLen = 167
Skipping MGYP003595924315.

In [13]:
import wget
import os
import json

def download_pdb_files(json_path, download_folder, min_db_len=200, min_eval=1e-10):
    # Ensure the download folder exists
    os.makedirs(download_folder, exist_ok=True)
    
    # Load JSON data
    with open(json_path, 'r') as file:
        data = json.load(file)
    
    # Base URL for downloading structures
    base_url = "https://api.esmatlas.com/fetchPredictedStructure/"
    
    # Iterate over each item to extract .pdb filenames and form URLs
    for item in data:
        for alignment in item.get("alignments", []):
            target = alignment.get("target", "")
            db_len = alignment.get("dbLen", 0)  # Get dbLen, default to 0 if missing
            eval_value = alignment.get("eval", 1)  # Get E-value, default to 1 if missing
            
            # Print statement to verify dbLen and eval values (for debugging)
            print(f"Checking {target}: dbLen = {db_len}, E-value = {eval_value}")

            # Filter by dbLen and E-value thresholds
            if db_len >= min_db_len and eval_value <= min_eval and target.endswith(".pdb.gz"):
                pdb_id = target.rstrip(".gz")  # Strip .gz to get the correct pdb_id
                download_url = f"{base_url}{pdb_id}"
                output_path = os.path.join(download_folder, pdb_id)
                
                print(f"Downloading {pdb_id} with dbLen = {db_len} and E-value = {eval_value}...")
                wget.download(download_url, out=output_path)  # Download using Python wget
                print(f"\nDownloaded {pdb_id} with dbLen = {db_len} and E-value = {eval_value}")
            else:
                print(f"Skipping {target} due to dbLen = {db_len} or E-value = {eval_value} (outside thresholds)")

# Specify paths (edit these as needed)
json_path = r"C:\Users\henry\OneDrive\Documents\foldseek\input.json"
download_folder = r"C:\Users\henry\OneDrive\Documents\foldseek\output_structures4"

# Specify thresholds for dbLen and E-value
min_db_len = 200
min_eval = 1e-20

# Run the download function
download_pdb_files(json_path, download_folder, min_db_len, min_eval)
print("\nDownload completed!")


Checking MGYP000683091819.pdb.gz: dbLen = 219, E-value = 1.084e-21
Downloading MGYP000683091819.pdb with dbLen = 219 and E-value = 1.084e-21...

Downloaded MGYP000683091819.pdb with dbLen = 219 and E-value = 1.084e-21
Checking MGYP001023032885.pdb.gz: dbLen = 136, E-value = 2.076e-19
Skipping MGYP001023032885.pdb.gz due to dbLen = 136 or E-value = 2.076e-19 (outside thresholds)
Checking MGYP002507125006.pdb.gz: dbLen = 281, E-value = 6.48e-22
Downloading MGYP002507125006.pdb with dbLen = 281 and E-value = 6.48e-22...

Downloaded MGYP002507125006.pdb with dbLen = 281 and E-value = 6.48e-22
Checking MGYP000306886148.pdb.gz: dbLen = 285, E-value = 4.44e-20
Skipping MGYP000306886148.pdb.gz due to dbLen = 285 or E-value = 4.44e-20 (outside thresholds)
Checking MGYP002682116782.pdb.gz: dbLen = 159, E-value = 1.787e-17
Skipping MGYP002682116782.pdb.gz due to dbLen = 159 or E-value = 1.787e-17 (outside thresholds)
Checking MGYP000904335190.pdb.gz: dbLen = 197, E-value = 3.755e-17
Skipping MGYP

In [6]:
%pip install subprocess


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement subprocess (from versions: none)
ERROR: No matching distribution found for subprocess


In [16]:
import wget
import os
import json

def download_pdb_files(json_path, download_folder, min_db_len=200, max_db_len=500, min_eval=1e-10):
    # Ensure the download folder exists
    os.makedirs(download_folder, exist_ok=True)
    
    # Load JSON data
    with open(json_path, 'r') as file:
        data = json.load(file)
    
    # Base URL for downloading structures
    base_url = "https://api.esmatlas.com/fetchPredictedStructure/"
    
    # Iterate over each item to extract .pdb filenames and form URLs
    for item in data:
        for alignment in item.get("alignments", []):
            target = alignment.get("target", "")
            db_len = alignment.get("dbLen", 0)  # Get dbLen, default to 0 if missing
            eval_value = alignment.get("eval", 1)  # Get E-value, default to 1 if missing
            
            # Print statement to verify dbLen and eval values (for debugging)
            print(f"Checking {target}: dbLen = {db_len}, E-value = {eval_value}")

            # Filter by dbLen range and E-value threshold
            if min_db_len <= db_len <= max_db_len and eval_value <= min_eval and target.endswith(".pdb.gz"):
                pdb_id = target.rstrip(".gz")  # Strip .gz to get the correct pdb_id
                download_url = f"{base_url}{pdb_id}"
                output_path = os.path.join(download_folder, pdb_id)
                
                print(f"Downloading {pdb_id} with dbLen = {db_len} and E-value = {eval_value}...")
                wget.download(download_url, out=output_path)  # Download using Python wget
                print(f"\nDownloaded {pdb_id} with dbLen = {db_len} and E-value = {eval_value}")
            else:
                print(f"Skipping {target} due to dbLen = {db_len} or E-value = {eval_value} (outside thresholds)")

# Specify paths (edit these as needed)
json_path = r"C:\Users\henry\OneDrive\Documents\foldseek\input.json"
download_folder = r"C:\Users\henry\OneDrive\Documents\foldseek\output_structures_all"

# Specify thresholds for dbLen and E-value
min_db_len = 0
max_db_len = 1000000
min_eval = 1

# Run the download function
download_pdb_files(json_path, download_folder, min_db_len, max_db_len, min_eval)
print("\nDownload completed!")


Checking MGYP000683091819.pdb.gz: dbLen = 219, E-value = 1.084e-21
Downloading MGYP000683091819.pdb with dbLen = 219 and E-value = 1.084e-21...

Downloaded MGYP000683091819.pdb with dbLen = 219 and E-value = 1.084e-21
Checking MGYP001023032885.pdb.gz: dbLen = 136, E-value = 2.076e-19
Downloading MGYP001023032885.pdb with dbLen = 136 and E-value = 2.076e-19...

Downloaded MGYP001023032885.pdb with dbLen = 136 and E-value = 2.076e-19
Checking MGYP002507125006.pdb.gz: dbLen = 281, E-value = 6.48e-22
Downloading MGYP002507125006.pdb with dbLen = 281 and E-value = 6.48e-22...

Downloaded MGYP002507125006.pdb with dbLen = 281 and E-value = 6.48e-22
Checking MGYP000306886148.pdb.gz: dbLen = 285, E-value = 4.44e-20
Downloading MGYP000306886148.pdb with dbLen = 285 and E-value = 4.44e-20...

Downloaded MGYP000306886148.pdb with dbLen = 285 and E-value = 4.44e-20
Checking MGYP002682116782.pdb.gz: dbLen = 159, E-value = 1.787e-17
Downloading MGYP002682116782.pdb with dbLen = 159 and E-value = 1.7