In [29]:
!pip install requests
!pip install rcsbsearchapi


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [3]:
import re
import requests

from rcsbsearchapi.search import SequenceQuery

### Phase 1: Solicit User for UniProt ID and Query/Save the Protein's Similar Sequences

In [42]:
import requests

def is_uniprot(pid):
    """
    Check if a UniProt ID exists in the UniProt database.

    The ID is looked up by requesting its flat-text entry from UniProt; the ID
    is considered to exist when the server answers with HTTP 200.

    Parameters:
    pid (str): The UniProt ID to verify. Surrounding whitespace is ignored.

    Returns:
    bool: True if the UniProt ID exists, False otherwise (including when the
          ID is empty or not a string, in which case no request is made).

    Raises:
    requests.exceptions.RequestException: If the request fails or times out.
    """
    accession = pid.strip() if isinstance(pid, str) else ""
    if not accession:
        # Nothing to look up; avoid a pointless network round trip.
        return False

    uniprot_api = f"https://www.uniprot.org/uniprot/{accession}.txt"
    # Without a timeout, requests may block forever on an unresponsive server.
    response = requests.get(uniprot_api, timeout=30)
    return response.status_code == 200

def get_fasta(pid):
    """
    Retrieve the FASTA formatted sequence for a given UniProt ID.

    Parameters:
    pid (str): The UniProt ID for which to fetch the FASTA sequence.
               Surrounding whitespace is ignored.

    Returns:
    str: The FASTA formatted sequence if the UniProt ID exists, otherwise an
         error message beginning with "Error:". Because the failure value is a
         non-empty string, callers must not rely on truthiness to detect
         success; a FASTA record starts with ">".

    Raises:
    requests.exceptions.RequestException: If the request fails or times out.
    """
    failure_message = f"Error: Unable to retrieve FASTA for UniProt ID {pid}"

    accession = pid.strip() if isinstance(pid, str) else ""
    if not accession:
        # Nothing to look up; report failure without touching the network.
        return failure_message

    uniprot_fasta_url = f"https://www.uniprot.org/uniprot/{accession}.fasta"
    # Without a timeout, requests may block forever on an unresponsive server.
    response = requests.get(uniprot_fasta_url, timeout=30)
    if response.status_code == 200:
        return response.text
    return failure_message


In [84]:
def fetch_similarpid(fasta_sequence, pid, evalue_cutoff=0.0001, identity_cutoff=0.6):
    """
    Performs a sequence similarity search against the RCSB Protein Data Bank (PDB).

    Parameters:
    fasta_sequence (str): The protein sequence to search for. Either bare
        residues or a FASTA record may be given; header lines (starting with
        ">") and line breaks are removed before the query is sent.
    pid (str): The UniProt ID the sequence belongs to. Accepted for caller
        compatibility; it is not part of the search request.
    evalue_cutoff (float): The e-value cutoff for the search. Default is 0.0001.
    identity_cutoff (float): The minimum sequence identity (as a fraction) for
        the search. Default is 0.6.

    Returns:
    list: The polymer-entity identifiers of the hits (e.g. "1YY4_1"), in the
          order returned by the service (sorted by score, descending). Empty
          when the search has no hits.

    Raises:
    ValueError: If fasta_sequence contains no residues.
    requests.exceptions.RequestException: If the request fails, times out, or
        the service answers with an HTTP error status.
    """
    # Accept both a bare sequence and a full FASTA record.
    residues = "".join(
        line.strip()
        for line in fasta_sequence.splitlines()
        if not line.startswith(">")
    )
    if not residues:
        raise ValueError("fasta_sequence does not contain any residues.")

    search_request = {
        "query": {
            "type": "terminal",
            "service": "sequence",
            "parameters": {
                "evalue_cutoff": evalue_cutoff,
                "identity_cutoff": identity_cutoff,
                "sequence_type": "protein",
                # Search with the caller's sequence, not a fixed one.
                "value": residues,
            },
        },
        "return_type": "polymer_entity",
        "request_options": {
            "paginate": {
                "start": 0,
                "rows": 1000,
            },
            "results_content_type": [
                "experimental",
            ],
            "sort": [
                {
                    "sort_by": "score",
                    "direction": "desc",
                }
            ],
            "scoring_strategy": "combined",
        },
    }

    # The json parameter in requests.post automatically converts the Python dictionary to a JSON payload.
    response = requests.post(
        "https://search.rcsb.org/rcsbsearch/v2/query",
        json=search_request,
        timeout=60,
    )

    if response.status_code == 204:
        # The search service answers 204 (empty body) when nothing matched,
        # so there is no JSON to decode.
        identifiers = []
    else:
        response.raise_for_status()
        identifiers_json = response.json()
        identifiers = [
            result["identifier"]
            for result in identifiers_json.get("result_set", [])
        ]

    print(identifiers)
    return identifiers


In [None]:
# TODO: Parse the results from the previous code block and store all related UniProt IDs in a list.

In [112]:
def fetch_inchi_keys(pdb_id):
    """
    Query the RCSB data API for the ligands and UniProt IDs of a PDB entry.

    Parameters:
    pdb_id (str): A polymer-entity identifier of the form "<entry>_<entity>",
        e.g. "1YY4_1". Only the entry part is used for the query.

    Returns:
    dict: The decoded GraphQL response. For the entry it holds the UniProt IDs
          and reference-sequence accessions of its polymer entities and the
          InChIKeys of its non-polymer entities (small molecules).

    Raises:
    ValueError: If pdb_id is not of the form "<entry>_<entity>".
    RuntimeError: If the service answers with a status code other than 200.
    requests.exceptions.RequestException: If the request fails or times out.
    """
    # fullmatch, so trailing junk after the entity number is rejected instead
    # of being silently ignored.
    match = re.fullmatch(r'(\w+)_([0-9]+)', pdb_id) if isinstance(pdb_id, str) else None
    if match is None:
        raise ValueError(f"Invalid PDB ID: {pdb_id}")

    # The query is per entry, so the entity number is not needed.
    entry_id = match.group(1)

    url = 'https://data.rcsb.org/graphql'
    headers = {'Content-Type': 'application/json'}
    query = """
    query Structure($id: String!) {
      entry(entry_id: $id) {
        rcsb_id
        polymer_entities {
          rcsb_polymer_entity_container_identifiers {
            uniprot_ids
            reference_sequence_identifiers {
              database_accession
            }
          }
        }
        nonpolymer_entities {
          nonpolymer_comp {
            rcsb_chem_comp_descriptor {
              InChIKey
            }
          }
        }
      }
    }
    """
    variables = {'id': entry_id}
    response = requests.post(
        url,
        headers=headers,
        json={'query': query, 'variables': variables},
        timeout=30,
    )

    if response.status_code != 200:
        raise RuntimeError(
            f"Query failed to run by returning code of {response.status_code}. {response.text}"
        )
    return response.json()

In [113]:
#### PHASE 1 EXECUTION ####

def main():
    """
    Run Phase 1 interactively.

    Prompts for a UniProt ID, downloads its FASTA record, searches the PDB for
    similar sequences and prints the ligand/UniProt data of every hit.

    Raises:
    Exception: If no usable FASTA record could be retrieved for the ID.
    """
    pid = input("Enter a Uniprot ID: ").strip()
    if not is_uniprot(pid):
        print(f"Error: {pid} does not exist in UniProt.")
        return

    fasta = get_fasta(pid)
    # get_fasta signals failure with a non-empty error string, so truthiness
    # alone cannot detect it; a real FASTA record starts with a ">" header.
    if not fasta or not fasta.startswith(">"):
        raise Exception("No FASTA sequence was retrieved.")

    # Extract the sequence from the FASTA format: skip the header line and
    # join the residue lines of the first record.
    residue_lines = []
    for line in fasta.splitlines()[1:]:
        if line.startswith(">"):
            break
        residue_lines.append(line.strip())
    fasta_sequence = "".join(residue_lines)
    if not fasta_sequence:
        raise Exception("No FASTA sequence was retrieved.")

    # Fetch similar protein IDs based on the FASTA sequence. Array is returned.
    pdb_results = fetch_similarpid(fasta_sequence, pid)

    # Extract inChI keys from PDB IDs from the fetch_similarpid results
    for pdb_id in pdb_results:
        print(fetch_inchi_keys(pdb_id))

    # Data Cleaning and Preparation for Phase II

# Run the Phase 1 workflow only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

['1YY4_1', '1YYE_1', '1QKM_1', '1L2J_1', '2I0G_1', '2JJ3_1', '2QTU_1', '2Z4B_1', '4ZI1_1', '5TOA_1', '1NDE_1', '1U3R_1', '1U9E_1', '2GIU_1', '1U3Q_1', '1U3S_1', '1X76_1', '1X78_1', '1X7B_1', '1X7J_1', '2YJD_1', '3OLL_1', '3OLS_1', '3OMO_1', '3OMP_1', '3OMQ_1', '4J24_1', '4J26_1', '2FSZ_1', '2NV7_1', '2YLY_1', '1ZAF_1', '2J7X_1', '2J7Y_1', '1HJ1_1', '1QKN_1', '7XVY_1', '7XVZ_1', '7XWP_1', '7XWQ_1', '7XWR_1', '7NDO_1', '7NEL_1', '7NFB_1', '1HCQ_3', '1HCP_1', '4AA6_1', '4OLN_1', '8IFO_1', '1LO1_3', '8CEF_3', '4OND_1', '1CIT_3', '6LC1_2']
{'data': {'entry': {'rcsb_id': '1YY4', 'polymer_entities': [{'rcsb_polymer_entity_container_identifiers': {'uniprot_ids': ['Q92731'], 'reference_sequence_identifiers': [{'database_accession': 'Q92731'}]}}, {'rcsb_polymer_entity_container_identifiers': {'uniprot_ids': ['Q15788'], 'reference_sequence_identifiers': [{'database_accession': 'Q15788'}]}}], 'nonpolymer_entities': [{'nonpolymer_comp': {'rcsb_chem_comp_descriptor': {'InChIKey': 'YHEHVRSGKUYDON-UHF

### Phase 2: Query All Assays Related to All Identified Proteins from PubChem BioAssay

### Phase 3: Find Substances That Were Tested From Assays and Store in Dictionary

### Phase 4: Refer to Stored Protein Data Bank Data for Matching InChI and UniProt Pairs

### Phase 5: Output Remaining Data as a Table