In [20]:
import requests
import os
import json
from datetime import datetime

In [21]:
# Path to the JSON file that holds the search query loaded by the next cell.
json_file_path = 'pdb_json/search.json'  # Put new query files in the pdb_json folder and change this filename as needed

In [22]:
# Load the query from the JSON file.
# JSON text is UTF-8, so decode it explicitly rather than relying on the
# platform's locale encoding (which is not UTF-8 on every system).
with open(json_file_path, 'r', encoding='utf-8') as file:
    query = json.load(file)

# Extract the filename without the extension; it becomes the folder-name prefix
json_filename = os.path.splitext(os.path.basename(json_file_path))[0]

# The download folder is created next to the JSON file
base_path = os.path.dirname(json_file_path)

# Timestamp the folder name (second resolution) so separate runs get separate folders
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
folder_name = f"{json_filename}_downloads_{timestamp}"
folder_path = os.path.join(base_path, folder_name)

# Create the new folder; exist_ok avoids an error if it already exists
os.makedirs(folder_path, exist_ok=True)

In [23]:
# Endpoint that receives the search query
search_url = 'https://search.rcsb.org/rcsbsearch/v2/query'

# Pagination state used by the download loop:
#   rows_per_request - page size requested from the API
#   start            - offset of the next page to request
#   total_downloaded - running count of files handled so far
rows_per_request = 100  # Adjust based on what the API allows and what you find practical
start, total_downloaded = 0, 0

In [24]:
# Page through the search results and save `<id>.pdb1.gz` for every returned
# identifier into `folder_path`. Only successful downloads are written and
# counted; identifiers that could not be fetched are collected and reported.
request_timeout = 60  # seconds; prevents a stalled connection from hanging the cell
failed_ids = []       # identifiers whose file could not be downloaded

while True:
    # Update the pagination parameters in the query. setdefault tolerates a
    # query file that leaves out the request_options / paginate sections.
    paginate = query.setdefault('request_options', {}).setdefault('paginate', {})
    paginate['start'] = start
    paginate['rows'] = rows_per_request

    # Perform the search
    response = requests.post(search_url, json=query, timeout=request_timeout)
    if response.status_code == 204:
        # 204 carries no body: the request succeeded but there are no (more) hits.
        print("No more results to download.")
        break
    if response.status_code != 200:
        print(f"Error in API request: {response.status_code}")
        break

    search_results = response.json()
    pdb_ids = [result['identifier'] for result in search_results.get('result_set', [])]

    if not pdb_ids:
        print("No more results to download.")
        break

    # Download PDB files for each ID and save them in the new folder
    batch_downloaded = 0
    for pdb_id in pdb_ids:
        pdb_download_url = f'https://files.rcsb.org/download/{pdb_id}.pdb1.gz'
        try:
            pdb_response = requests.get(pdb_download_url, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"Skipping {pdb_id}: download failed ({exc})")
            failed_ids.append(pdb_id)
            continue

        # Without this check an HTTP error page would be saved as if it were
        # the gzip archive and counted as a successful download.
        if pdb_response.status_code != 200:
            print(f"Skipping {pdb_id}: HTTP {pdb_response.status_code} from {pdb_download_url}")
            failed_ids.append(pdb_id)
            continue

        # Define the path for saving this PDB file
        pdb_file_path = os.path.join(folder_path, f"{pdb_id}.pdb1.gz")

        # Save the PDB file to the specified folder
        with open(pdb_file_path, "wb") as pdb_file:
            pdb_file.write(pdb_response.content)
        batch_downloaded += 1

    total_downloaded += batch_downloaded
    print(f"Downloaded {batch_downloaded} PDB files, total downloaded: {total_downloaded}. Saving to {folder_path}")

    # Prepare for the next batch
    start += rows_per_request

print(f"Finished downloading {total_downloaded} PDB files to {folder_path}.")
if failed_ids:
    print(f"Could not download {len(failed_ids)} entries: {', '.join(failed_ids)}")

Downloaded 31 PDB files, total downloaded: 31. Saving to pdb_json\search_downloads_20240210_102432
No more results to download.
Finished downloading 31 PDB files to pdb_json\search_downloads_20240210_102432.
