In [1]:
from tqdm import tqdm

In [15]:
# !pip install tqdm pypdb
# !pip install gemmi
# !pip install pubchempy

In [2]:
from pypdb.clients.search.search_client import perform_search
from pypdb.clients.search.operators import sequence_operators
from pypdb.clients.search.search_client import ReturnType

# Protein sequence used as the query for the similarity search
sequence = "MIKRSKKNSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHERLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLHAPTS"

# Sequence-similarity operator: protein search with the identity and
# e-value cutoffs given below
search_operator = sequence_operators.SequenceOperator(
    sequence_type=sequence_operators.SequenceType.PROTEIN,
    sequence=sequence,
    identity_cutoff=0.85,
    evalue_cutoff=0.01,
)

# Request entry-level results from the search
return_type = ReturnType.ENTRY

# Run the query, then wrap every returned hit in its own record dict
# stored under the "pdb_id" key
pdb_results = perform_search(
    search_operator=search_operator,
    return_type=return_type,
)
data = [dict(pdb_id=hit) for hit in pdb_results]

Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "sequence", "parameters": {"evalue_cutoff": 0.01, "identity_cutoff": 0.85, "target": "pdb_protein_sequence", "value": "MIKRSKKNSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHERLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLHAPTS"}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 



In [3]:
# Inspect the first record built from the search results
data[0]

{'pdb_id': '1A52'}

In [4]:
%%capture
from pypdb.clients.pdb import pdb_client
from gemmi import cif
import re

# For testing: only process the first N_TEST_ENTRIES search hits.
N_TEST_ENTRIES = 10
data = data[:N_TEST_ENTRIES]

# Matches "<id> non-polymer . " in the raw CIF text and captures <id>.
# Compiled once up front instead of on every loop iteration.
pattern = re.compile(r'\b(\w+)\s+non-polymer\s+\.\s+')

for el in tqdm(data):
    _id = el["pdb_id"]

    print(f"Downloading {_id}...")

    # Fetch the entry as CIF text and keep the parsed gemmi document on the record.
    pdb_file = pdb_client.get_pdb_file(pdb_id=_id, filetype=pdb_client.PDBFileType.CIF)
    doc = cif.read_string(pdb_file)
    el["doc"] = doc

    # All identifiers captured by the non-polymer pattern, in order of appearance.
    matches = pattern.findall(pdb_file)
    el['chem_id'] = matches

In [5]:
# Inspect the first record after the download loop added 'doc' and 'chem_id'
data[0]

{'pdb_id': '1A52',
 'doc': <gemmi.cif.Document with 1 blocks (1A52)>,
 'chem_id': ['AU', 'EST', 'HOH']}

In [13]:
from pypdb import get_info, to_dict

def get_inchi_pdb(chem_id):
    """Look up the InChI and InChIKey of a chemical component via the RCSB data API.

    Args:
        chem_id: Chemical component identifier, e.g. 'EST' or 'HOH'.

    Returns:
        A tuple ``(in_ch_i, in_ch_ikey)``. An element is ``None`` when the
        response does not carry that field; both are ``None`` when the lookup
        returns no data at all.
    """
    out = get_info(chem_id, url_root='https://data.rcsb.org/rest/v1/core/chemcomp/')
    # pypdb's get_info is expected to return None when retrieval fails, and a
    # record may lack the descriptor section. Fall back to an empty mapping so
    # the caller receives None placeholders instead of a TypeError/KeyError and
    # its per-entry lists stay aligned with 'chem_id'.
    descriptor = (out or {}).get('rcsb_chem_comp_descriptor') or {}
    return descriptor.get('in_ch_i'), descriptor.get('in_ch_ikey')

# Start every record with empty result lists so re-running the cell does not
# append to the output of a previous run.
for el in data:
    el["in_ch_i"] = []
    el["in_ch_ikey"] = []

# The same component id can appear in several entries, and each call to
# get_inchi_pdb goes out to the RCSB API, so remember results per id and
# only fetch ids that have not been seen yet.
inchi_cache = {}

for el in tqdm(data):
    for chem_id in el['chem_id']:
        if chem_id not in inchi_cache:
            inchi_cache[chem_id] = get_inchi_pdb(chem_id)
        in_ch_i, in_ch_ikey = inchi_cache[chem_id]
        el["in_ch_i"].append(in_ch_i)
        el["in_ch_ikey"].append(in_ch_ikey)

100%|██████████| 10/10 [00:04<00:00,  2.09it/s]


In [15]:
# Inspect the first record after the lookup loop added 'in_ch_i' and 'in_ch_ikey'
data[0]

{'pdb_id': '1A52',
 'doc': <gemmi.cif.Document with 1 blocks (1A52)>,
 'chem_id': ['AU', 'EST', 'HOH'],
 'in_ch_i': ['InChI=1S/Au/q+1',
  'InChI=1S/C18H24O2/c1-18-9-8-14-13-5-3-12(19)10-11(13)2-4-15(14)16(18)6-7-17(18)20/h3,5,10,14-17,19-20H,2,4,6-9H2,1H3/t14-,15-,16+,17+,18+/m1/s1',
  'InChI=1S/H2O/h1H2'],
 'in_ch_ikey': ['ZBKIUFWVEIBQRT-UHFFFAOYSA-N',
  'VOXZDWNPVJITMN-ZBRFXRBCSA-N',
  'XLYOFNOQVPJJNP-UHFFFAOYSA-N']}

In [16]:
from typing import List
from pandas import DataFrame
import pubchempy as pcp

