<a href="https://colab.research.google.com/github/MicheleBonus/cpclab_notebooks/blob/main/solr_pattern_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Run the following code cell first
import datetime
import requests
import pandas as pd
!pip install solrq
from solrq import Q
from copy import deepcopy

class PDBESearch:
    """Query the PDBe search service for entries whose sequence matches a regex.

    The query combines a fixed ``status:REL`` restriction with a regular
    expression applied to the ``molecule_sequence`` field, and returns the
    selected fields of the matching documents as a pandas DataFrame.

    Attributes:
        search_pattern: Regular expression (without the surrounding slashes)
            matched against ``molecule_sequence``.
        search_terms_status: solrq ``Q`` object restricting the status to
            ``REL``.
        search_terms_regex: The regex clause of the query string.
        full_search_terms: Complete query string sent as the ``q`` parameter.
        filter_terms: Field names requested from the service (``fl``
            parameter); these become the DataFrame columns.
    """

    BASE_URL = "https://www.ebi.ac.uk/pdbe/search/pdb/select?"

    def __init__(self, search_pattern):
        """Store the pattern and prepare the query string and field list.

        Args:
            search_pattern: Regular expression to match against the
                ``molecule_sequence`` field. It is inserted verbatim between
                ``/`` delimiters, so it must not contain an unescaped ``/``.
        """
        self.search_pattern = search_pattern
        self.search_terms_status = Q(status='REL')
        self._set_search_terms()

        self.filter_terms = [
            'pdb_id', 'number_of_polymer_entities', 'number_of_polymers', 'number_of_bound_entities',
            'number_of_bound_molecules', 'experimental_method', 'release_year', 'resolution', 'ec_number',
            'has_bound_molecule', 'bound_compound_id', 'em_resolution', 'data_quality', 'assembly_mol_wt',
            'journal', 'refinement_software', 'synchrotron_site', 'em_electron_detection',
            'cath_homologous_superfamily', 'assembly_composition', 'processing_site'
        ]

    def _set_search_terms(self):
        """Rebuild the regex clause and the full query from ``search_pattern``."""
        self.search_terms_regex = f"molecule_sequence:/{self.search_pattern}/"
        self.full_search_terms = f"{self.search_terms_status} AND {self.search_terms_regex}"

    @staticmethod
    def _change_lists_to_strings(results):
        """Collapse every list-valued field into one comma-separated string.

        Each list is converted element-wise to ``str``, de-duplicated, sorted
        (as strings, so ``'10'`` sorts before ``'2'``) and joined with ``,``.
        Non-list values are left untouched.

        Args:
            results: List of dicts (one per document). Modified in place.

        Returns:
            The same ``results`` list, for convenient chaining.
        """
        for row in results:
            # Only values are reassigned, so iterating items() is safe here.
            for key, value in row.items():
                if isinstance(value, list):
                    row[key] = ','.join(sorted({str(item) for item in value}))
        return results

    def _build_search_params(self, rows=100):
        """Return the form parameters for the search request.

        Args:
            rows: Value of the ``rows`` parameter, i.e. the maximum number of
                results requested from the service.
        """
        return {
            'q': self.full_search_terms,
            'fl': ','.join(self.filter_terms),
            'rows': rows,
            'wt': 'json',
            # Group by entry; documents are later read from each group's doclist.
            'group': 'true',
            'group.field': 'pdb_id',
            'group.ngroups': 'true',
        }

    def fetch_data(self, rows=100, timeout=120):
        """Run the search and return the matching documents as a DataFrame.

        Args:
            rows: Maximum number of results to request. Defaults to 100,
                the value that was previously hard-coded.
            timeout: Seconds to wait for the server before giving up, passed
                to ``requests.post``. Use ``None`` to wait indefinitely.

        Returns:
            A ``pandas.DataFrame`` with one row per returned document and
            list-valued fields flattened to comma-separated strings. Empty
            when nothing matched.

        Raises:
            Exception: If the service answers with a status code other
                than 200. The message carries the status code and body.
            requests.exceptions.RequestException: On connection problems or
                when ``timeout`` is exceeded.
        """
        response = requests.post(
            self.BASE_URL,
            data=self._build_search_params(rows),
            timeout=timeout,
        )

        if response.status_code != 200:
            raise Exception(f"[No data retrieved - {response.status_code}] {response.text}")

        groups = response.json().get('grouped', {}).get('pdb_id', {}).get('groups', [])

        # Flatten the per-group document lists into one list of documents.
        # The parsed JSON is owned by this call, so it can be cleaned in
        # place without a defensive copy.
        docs = []
        for group in groups:
            docs.extend(group.get('doclist', {}).get('docs', []))

        return pd.DataFrame(self._change_lists_to_strings(docs))

# Set parameters and run code cell
Either run the example below as is, or replace the regular expression with your own. Examples of valid regular expressions are:
- `'.*?G.{4}GK[TS].*'` (*anything* -- **G** -- *any 4 amino acids* -- **GK** -- *either* **T** *or* **S** -- *anything*)
- `'.*?[RK].{2,3}[DE].{2,3}Y.*'` (*anything* -- *either* **R** *or* **K** -- *any 2 or 3 amino acids* -- *either* **D** *or* **E** -- *any 2 or 3 amino acids* -- **Y** -- *anything*)

In [None]:
# Regular expression the sequence has to match; edit this line to search for a different motif.
pattern = '.*?G.{4}GK[TS].*'
# Build the query for this pattern.
searcher = PDBESearch(search_pattern=pattern)
# Send the request and collect the returned documents in a DataFrame.
df = searcher.fetch_data()
# A bare name as the last expression of the cell makes the notebook display the DataFrame.
df

# Remarks
## 1. Limited results
For demonstration purposes, the search returns at most 100 results. This limit is set by the `rows` entry in the code at:

```python
...
    def fetch_data(self):
        search_params = {
            'q': self.full_search_terms,
            'fl': ','.join(self.filter_terms),
            'rows': 100,
            'wt': 'json',
            'group': 'true',
            'group.field': 'pdb_id',
            'group.ngroups': 'true'
        }
...
```

If you want to return all data, you might want to change this to `220000` (a number that, at the time of writing, is slightly above the total number of structures in the PDB).

## 2. Custom output
The columns of the final dataframe are fully customizable. They are defined by the list of requested fields in the code at:

```python
...
        self.filter_terms = [
            'pdb_id', 'number_of_polymer_entities', 'number_of_polymers', 'number_of_bound_entities',
            'number_of_bound_molecules', 'experimental_method', 'release_year', 'resolution', 'ec_number',
            'has_bound_molecule', 'bound_compound_id', 'em_resolution', 'data_quality', 'assembly_mol_wt',
            'journal', 'refinement_software', 'synchrotron_site', 'em_electron_detection',
            'cath_homologous_superfamily', 'assembly_composition', 'processing_site'
        ]
...
```