# Loading proteins

In [1]:
from IPython.utils import io

with io.capture_output() as captured:
    !conda install -y requests
    !conda install -y biopython
    !conda install -y pandas

print(captured.stdout if ("Error" in captured.stdout) or ("ERROR" in captured.stdout) else "Installation successful")

Installation successful


Prepare filesystem:

In [2]:
ORIGINAL_PDB_FILES_DIRECTORY = "proteins/original_pdb_files"
!mkdir {ORIGINAL_PDB_FILES_DIRECTORY}

ORIGINAL_FASTA_FILES_DIRECTORY = "proteins/original_fasta_files"
!mkdir {ORIGINAL_FASTA_FILES_DIRECTORY}

INFERRED_FASTA_FILES_DIRECTORY = "proteins/inferred_fasta_files"
!mkdir {INFERRED_FASTA_FILES_DIRECTORY}

## Obtaining PDB IDs of selected protein structures
Useful functions:

In [3]:
import requests, json
from random import sample, seed


def display_query_results(results):
    print(f"Query ID: {results['query_id']}")
    print(f"Result type: {results['result_type']}")
    print(f"Total count of matches: {results['total_count']}")
    print(f"Total count of returned entities: {len(results['result_set'])}")
    to_display = min(len(results['result_set']), 10)
    print(f"First {to_display} items in result set:")
    for i in range(to_display):
        print(results['result_set'][i])


def retrieve_query_from_pdb(request):
    query = json.dumps(request)
    data = requests.get(f"https://search.rcsb.org/rcsbsearch/v2/query?json={query}")
    if data.status_code == 200:
        print("The query was processed successfully!")
        results = data.json()
        display_query_results(results)
        return results
    print(f"The request failed with status code {data.status_code}")
    return None


def select_random_pdb_ids(ids_list, sample_size, entity_ids_list = False):
    seed(0)
    if entity_ids_list:
        ids_list = [entity_id.split("_")[0] for entity_id in ids_list]
    return sample(ids_list, sample_size)

### Monomers
Firstly, we will create a dataset containing 500 monomer proteins.

This query returns polymer entity IDs from entries that:
- were deposited into PDB between 22. 7. 2022 and 22. 2. 2024
- contain only protein entities (no nucleic acids or oligosaccharides)
- have available pdb file
- are not synthetic
- the whole structure contains only one chain, which is not longer than 400 amino acids
- the sequence identity between the chains is at most 70% (to remove redundancy from the data and obtain a diverse dataset)

Unfortunately, it is not possible to perform the sequence identity filtering on whole entries. Therefore, the PDB IDs will be obtained from polymer entity IDs. 

In [38]:
monomers_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-02-22",
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entity_source_organism.ncbi_scientific_name",
                 "operator": "exact_match",
                 "negation": True,
                 "value": "synthetic construct"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.polymer_entity_count_protein",
                 "operator": "equals",
                 "value": 1
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "entity_poly.rcsb_sample_sequence_length",
                 "operator": "less_or_equal",
                 "value": 400
                }
            },
            ] 
        },
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact",
        "group_by": {
            "aggregation_method": "sequence_identity",
            "similarity_cutoff": 70
            },
        "group_by_return_type": "representatives"
        },
    "return_type": "polymer_entity"
}


monomers_result = retrieve_query_from_pdb(monomers_query)

The query was processed successfully!
Query ID: ff4d7094-ad42-4bb0-a6ca-5f74fdb8a86b
Result type: polymer_entity
Total count of matches: 5355
Total count of returned entities: 1736
First 10 items in result set:
5SSZ_1
7FY1_1
7G1Q_1
7G1X_1
7YKB_1
7YKC_1
7YKM_1
7YL8_1
7YLE_1
7YLG_1


The 500 monomers used for the experiment will be selected at random.

In [39]:
monomers_pdb_ids = select_random_pdb_ids(monomers_result["result_set"], 500, entity_ids_list = True)
monomers_pdb_ids[:10]

['8XPV',
 '8GQ4',
 '8TIF',
 '8H3Z',
 '8ALL',
 '8EIL',
 '8I86',
 '8HW1',
 '8GY0',
 '8ULM']

### Small protein complexes
Secondly, we will add 200 protein complexes to the dataset.

This query returns PDB IDs of entries that:
- were deposited into PDB between 22. 7. 2022 and 22. 2. 2024
- contain only protein entities (no nucleic acids or oligosaccharides)
- have available pdb file
- are not synthetic
- the whole structure contains two, three or four unique chains, which are 20-400 amino acids
- the PDB deposit group ID is empty

As mentioned above, it is not possible to remove redundancy from the data when searching for whole entries. To avoid choosing from a set containing multiple almost identical structures, only structures not belonging to any of the PDB deposit groups were chosen. There were only two such groups matching the criteria, containing aroud 300 structures altogether. I think that by doing this, I lose the least data while retaining as much diversity as possible.

Moreover, I tightened up the restriction on the chain length, because there are some complexes where the second chain is just a few amino acids short and the sequence is unknown (e.g. 8HEV, 8HF7 or 8EM8). However, this is easier said than done. The attribute `entity_poly.rcsb_sample_sequence_length` selects structures with at least one protein matching the values, while I need structures with all chain lengths in the range. I hadn't found a better way to do this, so I decided to create the set using set operations - create a dataset of all complexes matching the query without any restrictions on length, and then remove the structures containing a chain longer than 400 aa or shorter than 20 aa. 

In [40]:
all_complexes_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-02-22",
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entity_source_organism.ncbi_scientific_name",
                 "operator": "exact_match",
                 "negation": True,
                 "value": "synthetic construct"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.polymer_entity_count_protein",
                 "operator": "range",
                 "value": {
                     "from": 2,
                     "include_lower": True,
                     "to": 4,
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_deposit_group.group_id",
                 "operator": "exists",
                 "negation": True
                }
            }
            ] 
        },
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact"
        },
    "return_type": "entry"
}

all_complexes_result = retrieve_query_from_pdb(all_complexes_query)

The query was processed successfully!
Query ID: 5bf81fbf-7ada-4474-baa1-a74bbe729e15
Result type: entry
Total count of matches: 2171
Total count of returned entities: 2171
First 10 items in result set:
7YKA
7YKF
7YKG
7YKH
7YKI
7YKJ
7YLB
7YLM
7YLS
7YMD


In [41]:
long_complexes_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-02-22",
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entity_source_organism.ncbi_scientific_name",
                 "operator": "exact_match",
                 "negation": True,
                 "value": "synthetic construct"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.polymer_entity_count_protein",
                 "operator": "range",
                 "value": {
                     "from": 2,
                     "include_lower": True,
                     "to": 4,
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_deposit_group.group_id",
                 "operator": "exists",
                 "negation": True
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "entity_poly.rcsb_sample_sequence_length",
                 "operator": "greater",
                 "value": 400
                }
            }
            ] 
        },
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact"
        },
    "return_type": "entry"
}

long_complexes_result = retrieve_query_from_pdb(long_complexes_query)

The query was processed successfully!
Query ID: 579f2925-ee65-40c6-8631-b43ac72f5d10
Result type: entry
Total count of matches: 1007
Total count of returned entities: 1007
First 10 items in result set:
7YKJ
7YLM
7YLS
7YMD
7YNI
7YNJ
7YNK
7YO3
7YOT
7YOU


In [42]:
short_complexes_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-02-22",
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entity_source_organism.ncbi_scientific_name",
                 "operator": "exact_match",
                 "negation": True,
                 "value": "synthetic construct"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.polymer_entity_count_protein",
                 "operator": "range",
                 "value": {
                     "from": 2,
                     "include_lower": True,
                     "to": 4,
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_deposit_group.group_id",
                 "operator": "exists",
                 "negation": True
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "entity_poly.rcsb_sample_sequence_length",
                 "operator": "less",
                 "value": 20
                }
            }
            ] 
        },
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact"
        },
    "return_type": "entry"
}

short_complexes_result = retrieve_query_from_pdb(short_complexes_query)

The query was processed successfully!
Query ID: a8870cba-d2fb-4349-a4cc-3d84aae204d1
Result type: entry
Total count of matches: 397
Total count of returned entities: 397
First 10 items in result set:
7YKF
7YKG
7YKH
7YKI
7YMK
7YOD
7YOH
7YOI
7YOL
7YOM


In [43]:
complexes_result = list(set(all_complexes_result["result_set"]).difference(long_complexes_result["result_set"],
                                                                           short_complexes_result["result_set"]))
len(complexes_result)

827

The 200 heterooligomers used for the experiment will be selected at random.

In [44]:
complexes_pdb_ids = select_random_pdb_ids(complexes_result, 200)
complexes_pdb_ids[:10]

['8HI2',
 '8JYC',
 '8HCI',
 '8HTD',
 '8AXX',
 '8BBP',
 '8CQL',
 '8FA2',
 '8FAS',
 '8BDL']

### Synthetic constructs
Lastly, 50 synthetic proteins will be added to the dataset.

This query returns PDB IDs of entries that:
- were deposited into PDB between 22. 7. 2022 and 22. 2. 2024
- contain only protein entities (no nucleic acids or oligosaccharides)
- have available pdb file
- are purely synthetic
- the whole structure contains one unique chain (for convenience), whose length is between 30 and 400 amino acids (including the borders)
- the PDB deposit group ID is empty

The requirement for the chains to be longer than 30 amino acids was added because the database contains a significant amount of very short synthetic peptides (almost a half of the matching synthetic proteins was shorter than 30 amino acids).

In [45]:
synthetic_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-02-22",
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.polymer_entity_count_protein",
                 "operator": "equals",
                 "value": 1
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entity_source_organism.ncbi_scientific_name",
                 "operator": "exact_match",
                 "value": "synthetic construct"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entity_source_organism.ncbi_parent_scientific_name",
                 "operator": "contains_words",
                 "negation": True,
                 "value": "Eukaryota Archaea Bacteria Riboviria Duplodnaviria"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "entity_poly.rcsb_sample_sequence_length",
                 "operator": "range",
                 "value": {
                     "from": 30,
                     "include_lower": True,
                     "to": 400,
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_deposit_group.group_id",
                 "operator": "exists",
                 "negation": True
                }
            }
            ] 
        },
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact"
    },
    "return_type": "entry"
}


synthetic_result = retrieve_query_from_pdb(synthetic_query)

The query was processed successfully!
Query ID: 30a2115c-dffe-4e2b-96bc-72a2111e7bdb
Result type: entry
Total count of matches: 98
Total count of returned entities: 98
First 10 items in result set:
7YRW
7YRX
8AO0
8AO1
8B15
8B16
8B30
8B45
8BCS
8BFE


We will select 50 synthetic proteins at random:

In [46]:
synthetic_pdb_ids = select_random_pdb_ids(synthetic_result["result_set"], 50)
synthetic_pdb_ids[:10]

['8FIH',
 '8FJE',
 '8B16',
 '8ERW',
 '8H7D',
 '8GAQ',
 '8FIQ',
 '8EU0',
 '8GAA',
 '8FBJ']

## Pandas dataframe
Create pandas dataframe to store information about the proteins and files and save the dataframe as a csv file:

In [47]:
import pandas as pd
import numpy as np

proteins = pd.DataFrame({'pdb_id': np.concatenate([monomers_pdb_ids, complexes_pdb_ids, synthetic_pdb_ids]),
                         'label': np.repeat(['monomer', 'complex', 'synthetic'],
                                            [len(monomers_pdb_ids), len(complexes_pdb_ids), len(synthetic_pdb_ids)])})

proteins

Unnamed: 0,pdb_id,label
0,8XPV,monomer
1,8GQ4,monomer
2,8TIF,monomer
3,8H3Z,monomer
4,8ALL,monomer
...,...,...
745,8HNE,synthetic
746,8FIN,synthetic
747,8J0A,synthetic
748,8HDU,synthetic


In [48]:
proteins.to_csv("proteins/original_protein_dataset.csv", sep = ",", index = False)