In [42]:
from tqdm import tqdm

In [33]:
from pypdb.clients.search.search_client import perform_search
from pypdb.clients.search.operators import sequence_operators
from pypdb.clients.search.search_client import ReturnType

# Protein sequence for querying
sequence = "MIKRSKKNSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHERLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLHAPTS"

# Define the sequence similarity search operator
search_operator = sequence_operators.SequenceOperator(
    sequence=sequence,
    sequence_type=sequence_operators.SequenceType.PROTEIN,
    evalue_cutoff=0.01,
    identity_cutoff=0.85,
)

# Define the return type as ENTRY
return_type = ReturnType.ENTRY

# Perform the search and retrieve the results
pdb_results = perform_search(search_operator=search_operator, return_type=return_type)
data = [{"pdb_id": pdb_result} for pdb_result in pdb_results]
data[:10]

Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "sequence", "parameters": {"evalue_cutoff": 0.01, "identity_cutoff": 0.85, "target": "pdb_protein_sequence", "value": "MIKRSKKNSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHERLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLHAPTS"}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 



[{'pdb_id': '1A52'},
 {'pdb_id': '1L2I'},
 {'pdb_id': '1R5K'},
 {'pdb_id': '3ERD'},
 {'pdb_id': '3ERT'},
 {'pdb_id': '6SBO'},
 {'pdb_id': '2QXS'},
 {'pdb_id': '5DX3'},
 {'pdb_id': '5DXB'},
 {'pdb_id': '5DXE'}]

In [34]:
from pypdb.clients.pdb import pdb_client
from gemmi import cif

for el in tqdm(data):
    _id = el["pdb_id"]

    print(f"Downloading {_id}...")

    pdb_file = pdb_client.get_pdb_file(pdb_id=_id, filetype=pdb_client.PDBFileType.CIF)
    doc = cif.read_string(pdb_file)
    el["doc"] = doc

Downloading 1A52...
Sending GET request to https://files.rcsb.org/download/1A52.cif to fetch 1A52's cif file as a string.




Downloading 1L2I...
Sending GET request to https://files.rcsb.org/download/1L2I.cif to fetch 1L2I's cif file as a string.
Downloading 1R5K...
Sending GET request to https://files.rcsb.org/download/1R5K.cif to fetch 1R5K's cif file as a string.
Downloading 3ERD...
Sending GET request to https://files.rcsb.org/download/3ERD.cif to fetch 3ERD's cif file as a string.
Downloading 3ERT...
Sending GET request to https://files.rcsb.org/download/3ERT.cif to fetch 3ERT's cif file as a string.
Downloading 6SBO...
Sending GET request to https://files.rcsb.org/download/6SBO.cif to fetch 6SBO's cif file as a string.
Downloading 2QXS...
Sending GET request to https://files.rcsb.org/download/2QXS.cif to fetch 2QXS's cif file as a string.
Downloading 5DX3...
Sending GET request to https://files.rcsb.org/download/5DX3.cif to fetch 5DX3's cif file as a string.
Downloading 5DXB...
Sending GET request to https://files.rcsb.org/download/5DXB.cif to fetch 5DXB's cif file as a string.
Downloading 5DXE...
Send

In [31]:
from typing import List
from pandas import DataFrame
import pubchempy as pcp


def get_inchi_key(compound_id):  # -> Series[Any] | Any | str | None:
    try:
        if not (compounds := pcp.get_compounds(compound_id, "name")):
            return "No compounds found for this identifier"
        if isinstance(compounds, DataFrame):
            return compounds.first(offset=0).inchikey
        elif isinstance(compounds, List) and len(compounds) > 0:
            return compounds[0].inchikey
    except Exception as e:
        return str(e)

In [47]:
# Clear inchi_keys and cid so we can re-run the code without duplicating
for el in data:
    el["inchi_keys"] = []
    el["cids"] = []

for el in tqdm(data, desc="Getting InChI Keys"):
    for block in doc:
        compound_table = block.find_loop("_pdbx_entity_nonpoly.comp_id")
        for row in compound_table:
            compound_identifier = row[0]
            cids.append(compound_identifier)
            inchi_key = get_inchi_key(compound_id=compound_identifier)
            if inchi_key == "No compounds found for this identifier":
                continue
            inchi_keys = el.get("inchi_keys", [])
            inchi_keys.append(inchi_key)

Getting InChI Keys: 100%|██████████| 329/329 [08:58<00:00,  1.64s/it]


In [54]:
data[:10]

# Count all unique InChI Keys in data
unique = set(
    [item for sublist in [el["inchi_keys"] for el in data] for item in sublist]
)
print(f"Found {len(unique)} unique InChI Keys")

Found 2 unique InChI Keys
