# Creating dataset

In [1]:
from IPython.utils import io

with io.capture_output() as captured:
    !conda install -y requests
    !conda install -y biopython
    !conda install -y pandas

print(captured.stdout if ("Error" in captured.stdout) or ("ERROR" in captured.stdout) else "Installation successful")

Installation successful


## Obtaining PDB IDs of selected protein structures
Useful functions:

In [2]:
import requests, json
from random import sample, seed


def display_query_results(results):
    print(f"Query ID: {results['query_id']}")
    print(f"Result type: {results['result_type']}")
    print(f"Total count of matches: {results['total_count']}")
    print(f"Total count of returned entities: {len(results['result_set'])}")
    to_display = min(len(results['result_set']), 10)
    print(f"First {to_display} items in result set:")
    for i in range(to_display):
        print(results['result_set'][i])


def retrieve_query_from_pdb(request):
    query = json.dumps(request)
    data = requests.get(f"https://search.rcsb.org/rcsbsearch/v2/query?json={query}")
    if data.status_code == 200:
        print("The query was processed successfully!")
        results = data.json()
        display_query_results(results)
        return results
    print(f"The request failed with status code {data.status_code}")
    return None


def select_random_pdb_ids(ids_list, sample_size, entity_ids_list = False):
    seed(0)
    if entity_ids_list:
        ids_list = [entity_id.split("_")[0] for entity_id in ids_list]
    return sample(ids_list, sample_size)

### Monomers
Firstly, we will create a dataset containing 1000 monomer proteins.

This query returns polymer entity IDs from entries that:
- were deposited into PDB between 22. 7. 2022 and 22. 7. 2024
- contain only protein entities (no nucleic acids or oligosaccharides)
- the whole structure contains one unique chain with length between 20 and 400
- the oligomeric state of the entry is labelled as Monomer
- have available pdb file
- are not labelled as DE NOVO PROTEIN (https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Items/_struct_keywords.pdbx_keywords.html)
- the pairwise sequence identity between the chains in the returned dataset is at most 70% (to remove redundancy from the data and obtain a diverse dataset)

I decided to skip filtering by resolusion as structures determined by NMR miss the attribute entirely and other matching structures have better resolution than 9 A. 
Unfortunately, it is not possible to perform the sequence identity filtering on whole entries. Therefore, the PDB IDs will be obtained from polymer entity IDs. 

In [3]:
monomers_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-07-22",
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.polymer_entity_count_protein",
                 "operator": "equals",
                 "value": 1
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "entity_poly.rcsb_sample_sequence_length",
                 "operator": "range",
                 "value": {
                     "from": 20,
                     "include_lower": True,
                     "to": 400,
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_struct_symmetry.oligomeric_state",
                 "operator": "exact_match",
                 "value": "Monomer"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "struct_keywords.pdbx_keywords",
                 "operator": "contains_phrase",
                 "negation": True,
                 "value": "DE NOVO PROTEIN"
                }
            }
            ] 
        },
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact",
        "group_by": {
            "aggregation_method": "sequence_identity",
            "similarity_cutoff": 70
            },
        "group_by_return_type": "representatives"
        },
    "return_type": "polymer_entity"
}


monomers_result = retrieve_query_from_pdb(monomers_query)

The query was processed successfully!
Query ID: 17f36f4d-0d50-4984-8f81-f8a2df463640
Result type: polymer_entity
Total count of matches: 3830
Total count of returned entities: 1323
First 10 items in result set:
5SSZ_1
7FY1_1
7G1Q_1
7G1X_1
7H9J_1
7YKB_1
7YKM_1
7YL8_1
7YLE_1
7YLL_1


The 1000 monomers used for the experiment will be selected at random.

In [4]:
monomers_pdb_ids = select_random_pdb_ids(monomers_result["result_set"], 1000, entity_ids_list = True)
monomers_pdb_ids[:10]

['8P0E',
 '8PX8',
 '8B2E',
 '8HOE',
 '8TCE',
 '8SSX',
 '8PJS',
 '8J3N',
 '8SHY',
 '8OE5']

### Small protein complexes
Secondly, we will add 250 protein complexes to the dataset.

This query returns PDB IDs of entries that:
- were deposited into PDB between 22. 7. 2022 and 22. 7. 2024
- contain only protein entities (no nucleic acids or oligosaccharides)
- the assembly contains 2 to 6 chains, which are 20-400 amino acids long
- the oligomeric state of the entry is NOT labelled as Monomer
- have available pdb file
- are not labelled as  DE NOVO PROTEIN
- the PDB deposit group ID is empty

As mentioned above, it is not possible to remove redundancy from the data when searching for whole entries. To avoid choosing from a set containing multiple almost identical structures, only structures not belonging to any of the PDB deposit groups were chosen. There were only three such groups matching the criteria, containing 337 structures altogether. I think that by doing this, I lose the least data while retaining as much diversity as possible.

Moreover, I wanted to retain the restriction on the chain length because there are some complexes where the second chain is just a few amino acids short, and the sequence is unknown (e.g. 8HEV, 8HF7 or 8EM8). However, this is easier said than done. The attribute `entity_poly.rcsb_sample_sequence_length` selects structures with at least one protein matching the values, while I need structures with all chain lengths in the range. I hadn't found a better way to do this, so I decided to create the set using set operations - create a dataset of all complexes matching the query without any restrictions on length and then remove the structures containing a chain longer than 400 aa or shorter than 20 aa. 

In [5]:
all_complexes_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-07-22",
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_assembly_info.polymer_entity_instance_count_protein",
                 "operator": "range",
                 "value": {
                     "from": 2,
                     "include_lower": True,
                     "to": 6,
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_struct_symmetry.oligomeric_state",
                 "operator": "exact_match",
                 "negation": True,
                 "value": "Monomer"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "struct_keywords.pdbx_keywords",
                 "operator": "contains_phrase",
                 "negation": True,
                 "value": "DE NOVO PROTEIN"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_deposit_group.group_id",
                 "operator": "exists",
                 "negation": True
                }
            }
            ] 
        },
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact"
        },
    "return_type": "entry"
}

all_complexes_result = retrieve_query_from_pdb(all_complexes_query)

The query was processed successfully!
Query ID: f68b8353-d763-4785-b801-4fad8a49ede1
Result type: entry
Total count of matches: 7803
Total count of returned entities: 7803
First 10 items in result set:
7YKA
7YKC
7YKD
7YKF
7YKG
7YKH
7YKI
7YKJ
7YKK
7YKL


In [6]:
long_complexes_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-07-22",
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_assembly_info.polymer_entity_instance_count_protein",
                 "operator": "range",
                 "value": {
                     "from": 2,
                     "include_lower": True,
                     "to": 6,
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_struct_symmetry.oligomeric_state",
                 "operator": "exact_match",
                 "negation": True,
                 "value": "Monomer"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "struct_keywords.pdbx_keywords",
                 "operator": "contains_phrase",
                 "negation": True,
                 "value": "DE NOVO PROTEIN"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_deposit_group.group_id",
                 "operator": "exists",
                 "negation": True
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "entity_poly.rcsb_sample_sequence_length",
                 "operator": "greater",
                 "value": 400
                }
            }
            ] 
        },
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact"
        },
    "return_type": "entry"
}

long_complexes_result = retrieve_query_from_pdb(long_complexes_query)

The query was processed successfully!
Query ID: da5c289a-30d7-47a3-8e16-6533db67906d
Result type: entry
Total count of matches: 3283
Total count of returned entities: 3283
First 10 items in result set:
7YKJ
7YKK
7YKL
7YKR
7YKS
7YKT
7YKV
7YKZ
7YLM
7YLQ


In [7]:
short_complexes_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-07-22",
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_assembly_info.polymer_entity_instance_count_protein",
                 "operator": "range",
                 "value": {
                     "from": 2,
                     "include_lower": True,
                     "to": 6,
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_struct_symmetry.oligomeric_state",
                 "operator": "exact_match",
                 "negation": True,
                 "value": "Monomer"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "struct_keywords.pdbx_keywords",
                 "operator": "contains_phrase",
                 "negation": True,
                 "value": "DE NOVO PROTEIN"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_deposit_group.group_id",
                 "operator": "exists",
                 "negation": True
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "entity_poly.rcsb_sample_sequence_length",
                 "operator": "less",
                 "value": 20
                }
            }
            ] 
        },
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact"
        },
    "return_type": "entry"
}

short_complexes_result = retrieve_query_from_pdb(short_complexes_query)

The query was processed successfully!
Query ID: eb0e27a1-ba92-4a35-83e3-9876054e00bd
Result type: entry
Total count of matches: 688
Total count of returned entities: 688
First 10 items in result set:
7YKD
7YKF
7YKG
7YKH
7YKI
7YMK
7YOD
7YOH
7YOI
7YOL


In [8]:
complexes_result = list(set(all_complexes_result["result_set"]).difference(long_complexes_result["result_set"],
                                                                           short_complexes_result["result_set"]))
len(complexes_result)

3988

The 250 oligomers used for the experiment will be selected at random.

In [9]:
complexes_pdb_ids = select_random_pdb_ids(complexes_result, 250)
complexes_pdb_ids[:10]

['8G2X',
 '8T35',
 '8K1V',
 '8CNK',
 '8UD2',
 '8I5L',
 '8EY0',
 '8PEC',
 '8QV7',
 '8QH1']

### Synthetic constructs
Lastly, synthetic proteins will be added to the dataset.

This query returns PDB IDs of entries that:
- were deposited into PDB between 22. 7. 2022 and 22. 7. 2024
- contain only protein entities (no nucleic acids or oligosaccharides)
- the whole structure contains one unique chain (for convenience), whose length is between 20 and 400 amino acids (including the borders)
- have available pdb file
- are labelled as DE NOVO PROTEIN
- the sequence identity between the chains is at most 70% (to remove redundancy from the data and obtain a diverse dataset)

In [10]:
synthetic_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-07-22",
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.polymer_entity_count_protein",
                 "operator": "equals",
                 "value": 1
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "entity_poly.rcsb_sample_sequence_length",
                 "operator": "range",
                 "value": {
                     "from": 20,
                     "include_lower": True,
                     "to": 400,
                     "include_upper": True
                    }
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "struct_keywords.pdbx_keywords",
                 "operator": "contains_phrase",
                 "value": "DE NOVO PROTEIN"
                }
            }
            ] 
        },
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact",
        "group_by": {
            "aggregation_method": "sequence_identity",
            "similarity_cutoff": 70
            },
        "group_by_return_type": "representatives"
        },
    "return_type": "polymer_entity"
}

synthetic_result = retrieve_query_from_pdb(synthetic_query)

The query was processed successfully!
Query ID: 25e6aa4d-592c-44f4-a99b-fad548634c90
Result type: polymer_entity
Total count of matches: 139
Total count of returned entities: 103
First 10 items in result set:
8B16_1
8B1X_1
8B45_1
8BCS_1
8BFE_1
8BL6_1
8BL9_1
8CH0_1
8DT0_1
8DZ8_1


We will use all the synthetic proteins:

In [11]:
synthetic_pdb_ids = select_random_pdb_ids(synthetic_result["result_set"], len(synthetic_result["result_set"]), entity_ids_list = True)
synthetic_pdb_ids[:10]

['8H7E',
 '8VE7',
 '8JYT',
 '8BL6',
 '8FIQ',
 '8OYX',
 '8OYS',
 '8JPA',
 '8FLX',
 '8OT7']

## Pandas dataframe
Create pandas dataframe to store information about the proteins and files and save the dataframe as a csv file:

In [12]:
import pandas as pd
import numpy as np

proteins = pd.DataFrame({'pdb_id': np.concatenate([monomers_pdb_ids, complexes_pdb_ids, synthetic_pdb_ids]),
                         'label': np.repeat(['monomer', 'complex', 'synthetic'],
                                            [len(monomers_pdb_ids), len(complexes_pdb_ids), len(synthetic_pdb_ids)])})

proteins

Unnamed: 0,pdb_id,label
0,8P0E,monomer
1,8PX8,monomer
2,8B2E,monomer
3,8HOE,monomer
4,8TCE,monomer
...,...,...
1348,8G9J,synthetic
1349,8OYV,synthetic
1350,8TNO,synthetic
1351,8FJE,synthetic


In [13]:
import os

DATA_DIRECTORY = "data"
if not os.path.exists(DATA_DIRECTORY):
    os.makedirs(DATA_DIRECTORY)

proteins.to_csv(f"{DATA_DIRECTORY}/proteins_dataset.csv", sep = ",", index = False)