In [None]:
import math
from pathlib import Path
from zipfile import ZipFile
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools
from chembl_webresource_client.new_client import new_client
from tqdm.auto import tqdm

In [None]:
# Notebook directory (latest entry of IPython's directory history `_dh`)
# and the data folder next to it.
HERE = Path(_dh[-1])
DATA = HERE.joinpath("data")

In [4]:
# Resource objects for API access: one per ChEMBL endpoint used in this notebook
# (targets, molecules, activities).
targets_api, compounds_api, bioactivities_api = (
    new_client.target,
    new_client.molecule,
    new_client.activity,
)

In [5]:
# Inspect the class of the target resource object.
type(targets_api)

chembl_webresource_client.query_set.QuerySet

# Get target data  
- Get UniProt ID of the target of interest from UniProt website  
- Use UniProt ID to get target information

Aldose reductase (AKR1B1): `P15121`  
Sodium/glucose cotransporter 2 (SLC5A2, SGLT2): `P31639`

## AKR1B1

In [6]:
# UniProt accession of the target of interest: aldose reductase (AKR1B1).
uniprot_id = "P15121"

Fetch target data from ChEMBL

In [7]:
# Query ChEMBL for every target that has the UniProt accession as a component,
# requesting only the fields needed below.
target_fields = ("target_chembl_id", "organism", "pref_name", "target_type")
targets = targets_api.get(target_components__accession=uniprot_id).only(*target_fields)
print(f'The type of the targets is "{type(targets)}"')

The type of the targets is "<class 'chembl_webresource_client.query_set.QuerySet'>"


Download target data from ChEMBL

In [8]:
# Convert the target QuerySet into a DataFrame.
# Note: `targets` is rebound here from a QuerySet to a DataFrame.
targets = pd.DataFrame.from_records(targets)
targets

Unnamed: 0,organism,pref_name,target_chembl_id,target_type
0,Homo sapiens,Aldose reductase,CHEMBL1900,SINGLE PROTEIN
1,Homo sapiens,Aldose reductase,CHEMBL1900,SINGLE PROTEIN
2,Homo sapiens,Baculoviral IAP repeat-containing protein 2/Al...,CHEMBL4802032,PROTEIN-PROTEIN INTERACTION


Select target (target ChEMBL ID)

In [9]:
# Select the first hit by position; in the table above this is the
# SINGLE PROTEIN entry for aldose reductase.
# NOTE(review): this relies on the order in which the API returns the hits.
target = targets.iloc[0]
target

organism                Homo sapiens
pref_name           Aldose reductase
target_chembl_id          CHEMBL1900
target_type           SINGLE PROTEIN
Name: 0, dtype: object

Save selected ChEMBL ID

In [10]:
# Remember the ChEMBL ID of the chosen target for the bioactivity query.
chembl_id = target["target_chembl_id"]
print(f"The target ChEMBL ID is {chembl_id}")
# NBVAL_CHECK_OUTPUT

The target ChEMBL ID is CHEMBL1900


Fetch bioactivity data for the target from ChEMBL

In [11]:
# Fields to request for every bioactivity record.
bioactivity_fields = (
    "activity_id",
    "assay_chembl_id",
    "assay_description",
    "assay_type",
    "molecule_chembl_id",
    "type",
    "standard_units",
    "relation",
    "standard_value",
    "target_chembl_id",
    "target_organism",
)

# Restrict to exact ("=") IC50 measurements from binding ("B") assays on the target.
bioactivities = bioactivities_api.filter(
    target_chembl_id=chembl_id, type="IC50", relation="=", assay_type="B"
).only(*bioactivity_fields)

print(f"Length and type of bioactivities object: {len(bioactivities)}, {type(bioactivities)}")

Length and type of bioactivities object: 981, <class 'chembl_webresource_client.query_set.QuerySet'>


In [12]:
# Peek at a single record: each element of the QuerySet is a plain dict.
print(f"Length and type of first element: {len(bioactivities[0])}, {type(bioactivities[0])}")
bioactivities[0]

Length and type of first element: 13, <class 'dict'>


{'activity_id': 72831,
 'assay_chembl_id': 'CHEMBL764859',
 'assay_description': 'Inhibition of human placental aldose reductase (HPAR) activity with glyceraldehyde as substrate',
 'assay_type': 'B',
 'molecule_chembl_id': 'CHEMBL18854',
 'relation': '=',
 'standard_units': 'nM',
 'standard_value': '230.0',
 'target_chembl_id': 'CHEMBL1900',
 'target_organism': 'Homo sapiens',
 'type': 'IC50',
 'units': 'uM',
 'value': '0.23'}

Download bioactivity data from ChEMBL (QuerySet) in the form of a pandas DataFrame.

In [13]:
# Build a DataFrame from the bioactivity QuerySet (one row per record)
# and report its size.
bioactivities_df = pd.DataFrame.from_dict(bioactivities)
print(f"DataFrame shape: {bioactivities_df.shape}")
bioactivities_df.head()

DataFrame shape: (981, 13)


Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,72831,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL18854,=,nM,230.0,CHEMBL1900,Homo sapiens,IC50,uM,0.23
1,73884,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL19744,=,nM,130.0,CHEMBL1900,Homo sapiens,IC50,uM,0.13
2,73885,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL19711,=,nM,100.0,CHEMBL1900,Homo sapiens,IC50,uM,0.1
3,77187,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL19392,=,nM,5630.0,CHEMBL1900,Homo sapiens,IC50,uM,5.63
4,80619,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL19746,=,nM,7470.0,CHEMBL1900,Homo sapiens,IC50,uM,7.47


Drop the "units" and "value" columns so that only the standardized values and units (`standard_value`, `standard_units`) are used.

In [15]:
# Keep only the standardized columns. The frame is reassigned instead of being
# mutated in place, and errors="ignore" tolerates the columns already being
# absent, so this cell can be re-run without raising a KeyError.
bioactivities_df = bioactivities_df.drop(columns=["units", "value"], errors="ignore")
bioactivities_df.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type
0,72831,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL18854,=,nM,230.0,CHEMBL1900,Homo sapiens,IC50
1,73884,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL19744,=,nM,130.0,CHEMBL1900,Homo sapiens,IC50
2,73885,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL19711,=,nM,100.0,CHEMBL1900,Homo sapiens,IC50
3,77187,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL19392,=,nM,5630.0,CHEMBL1900,Homo sapiens,IC50
4,80619,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL19746,=,nM,7470.0,CHEMBL1900,Homo sapiens,IC50


Preprocessing

In [16]:
# Check column dtypes; `standard_value` is delivered as object (strings)
# and has to be converted before any numeric work.
bioactivities_df.dtypes

activity_id            int64
assay_chembl_id       object
assay_description     object
assay_type            object
molecule_chembl_id    object
relation              object
standard_units        object
standard_value        object
target_chembl_id      object
target_organism       object
type                  object
dtype: object

In [17]:
# Cast the standardized activity values to float64 so they can be used numerically.
bioactivities_df = bioactivities_df.astype({"standard_value": "float64"})

In [18]:
# Delete entries with missing values.
# Reassign rather than mutate in place, and report the resulting size so the
# effect of the filter is visible.
bioactivities_df = bioactivities_df.dropna(axis=0, how="any")
print(f"DataFrame shape: {bioactivities_df.shape}")

DataFrame shape: (981, 11)


In [19]:
# Keep entries with unit nM only
is_nm = bioactivities_df["standard_units"] == "nM"
print(f"Units in downloaded data: {bioactivities_df['standard_units'].unique()}")
# Count computed outside the f-string: a backslash continuation inside the
# literal would embed the next line's indentation in the printed message.
print(f"Number of non-nM entries: {(~is_nm).sum()}")
# .copy() makes the filtered frame independent of the original, so later
# modifications do not act on a selection of another frame.
bioactivities_df = bioactivities_df[is_nm].copy()
print(f"Units after filtering: {bioactivities_df['standard_units'].unique()}")

Units in downloaded data: ['nM' 'ug.mL-1']
Number of non-nM entries:    17


In [21]:
# Delete duplicate molecules, keeping the first record for each molecule.
# Reassigning (instead of inplace=True) avoids modifying in place a frame that
# may itself be a filtered selection, which can raise SettingWithCopyWarning.
bioactivities_df = bioactivities_df.drop_duplicates("molecule_chembl_id", keep="first")
print(f"DataFrame shape: {bioactivities_df.shape}")

DataFrame shape: (784, 11)


In [22]:
# Reset dataframe index so rows are numbered 0..n-1 again after filtering.
# Reassigned rather than mutated in place.
bioactivities_df = bioactivities_df.reset_index(drop=True)

In [23]:
# Rename columns: the standardized value is the IC50, the standardized unit its unit.
# Reassigned rather than mutated in place.
bioactivities_df = bioactivities_df.rename(
    columns={"standard_value": "IC50", "standard_units": "units"}
)

In [24]:
# Final look at the cleaned bioactivity table and its size.
print(f"DataFrame shape: {bioactivities_df.shape}")
bioactivities_df

DataFrame shape: (784, 11)


Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,units,IC50,target_chembl_id,target_organism,type
0,72831,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL18854,=,nM,230.0,CHEMBL1900,Homo sapiens,IC50
1,73884,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL19744,=,nM,130.0,CHEMBL1900,Homo sapiens,IC50
2,73885,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL19711,=,nM,100.0,CHEMBL1900,Homo sapiens,IC50
3,77187,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL19392,=,nM,5630.0,CHEMBL1900,Homo sapiens,IC50
4,80619,CHEMBL764859,Inhibition of human placental aldose reductase...,B,CHEMBL19746,=,nM,7470.0,CHEMBL1900,Homo sapiens,IC50
...,...,...,...,...,...,...,...,...,...,...,...
779,22836862,CHEMBL4718956,"Inhibition of recombinant human ALR2 using D,L...",B,CHEMBL4761144,=,nM,61700.0,CHEMBL1900,Homo sapiens,IC50
780,22836863,CHEMBL4718956,"Inhibition of recombinant human ALR2 using D,L...",B,CHEMBL4757375,=,nM,83400.0,CHEMBL1900,Homo sapiens,IC50
781,22836864,CHEMBL4718956,"Inhibition of recombinant human ALR2 using D,L...",B,CHEMBL4744462,=,nM,12400.0,CHEMBL1900,Homo sapiens,IC50
782,22836865,CHEMBL4718956,"Inhibition of recombinant human ALR2 using D,L...",B,CHEMBL4777280,=,nM,1970.0,CHEMBL1900,Homo sapiens,IC50


# Get compound data  
Get the molecular structures of the molecules referenced by the filtered bioactivities, looked up by their molecule ChEMBL IDs.

Fetch compound data from ChEMBL

In [25]:
# Ask for the structures of exactly those molecules that survived the
# bioactivity filtering.
molecule_ids = bioactivities_df["molecule_chembl_id"].tolist()
compounds_provider = compounds_api.filter(molecule_chembl_id__in=molecule_ids).only(
    "molecule_chembl_id", "molecule_structures"
)

Download compound data from ChEMBL: export the QuerySet object into a pandas DataFrame

In [26]:
# Collect all compound records (tqdm shows progress) and tabulate them.
compounds = [record for record in tqdm(compounds_provider)]
compounds_df = pd.DataFrame.from_records(compounds)
print(f"DataFrame shape: {compounds_df.shape}")
compounds_df.head()

  0%|          | 0/784 [00:00<?, ?it/s]

DataFrame shape: (784, 2)


Unnamed: 0,molecule_chembl_id,molecule_structures
0,CHEMBL6246,{'canonical_smiles': 'O=c1oc2c(O)c(O)cc3c(=O)o...
1,CHEMBL6,{'canonical_smiles': 'COc1ccc2c(c1)c(CC(=O)O)c...
2,CHEMBL28,{'canonical_smiles': 'O=c1cc(-c2ccc(O)cc2)oc2c...
3,CHEMBL269277,{'canonical_smiles': 'C=C(C)[C@@H]1CC[C@]2(C(=...
4,CHEMBL7461,{'canonical_smiles': 'CCOC(=O)C1=C(O)C(=O)N(CC...


Preprocessing

In [27]:
# Remove missing entries (compounds without structure information).
# Reassigned rather than mutated in place.
compounds_df = compounds_df.dropna(axis=0, how="any")

In [28]:
# Delete duplicates, keeping the first record per molecule.
# Reassigned rather than mutated in place.
compounds_df = compounds_df.drop_duplicates("molecule_chembl_id", keep="first")

So far, we have multiple different molecular structure representations. We only want to keep the canonical SMILES.

In [29]:
# List the structure representations available for the first compound.
compounds_df.iloc[0].molecule_structures.keys()

dict_keys(['canonical_smiles', 'molfile', 'standard_inchi', 'standard_inchi_key'])

In [30]:
# Extract the canonical SMILES from each structure dict.
# dict.get yields None when a record has no "canonical_smiles" entry, whereas a
# missing "molecule_structures" column raises immediately (before anything is
# modified) instead of being swallowed and turning every SMILES into None.
canonical_smiles = compounds_df["molecule_structures"].map(
    lambda structures: structures.get("canonical_smiles")
)

# Replace the structure dicts by the SMILES column and drop compounds without SMILES.
compounds_df = (
    compounds_df.assign(smiles=canonical_smiles)
    .drop(columns="molecule_structures")
    .dropna(axis=0, how="any")
)

# Output: bioactivity-compound data

In [31]:
# Row count and column names of the cleaned bioactivity table going into the merge.
print(f"Bioactivities filtered: {bioactivities_df.shape[0]}")
bioactivities_df.columns

Bioactivities filtered: 784


Index(['activity_id', 'assay_chembl_id', 'assay_description', 'assay_type',
       'molecule_chembl_id', 'relation', 'units', 'IC50', 'target_chembl_id',
       'target_organism', 'type'],
      dtype='object')

In [32]:
# Row count and column names of the cleaned compound table going into the merge.
print(f"Compounds filtered: {compounds_df.shape[0]}")
compounds_df.columns

Compounds filtered: 784


Index(['molecule_chembl_id', 'smiles'], dtype='object')

Merge values of interest from bioactivities_df and compounds_df in an output_df based on the compounds’ ChEMBL IDs (molecule_chembl_id), keeping the following columns:

- ChEMBL IDs: molecule_chembl_id  
- SMILES: smiles  
- units: units  
- IC50: IC50  

In [33]:
# Join activities with structures on the molecule ChEMBL ID and renumber the rows.
output_df = (
    bioactivities_df[["molecule_chembl_id", "IC50", "units"]]
    .merge(compounds_df, on="molecule_chembl_id")
    .reset_index(drop=True)
)

print(f"Dataset with {output_df.shape[0]} entries.")

Dataset with 784 entries.


In [34]:
# Check the dtypes of the merged table; IC50 has to be numeric for the pIC50 conversion.
output_df.dtypes

molecule_chembl_id     object
IC50                  float64
units                  object
smiles                 object
dtype: object

Add pIC50 values

In [35]:
def convert_ic50_to_pic50(IC50_value):
    """
    Convert an IC50 value given in nM into a pIC50 value.

    pIC50 is the negative decadic logarithm of the IC50 in molar units:
    pIC50 = -log10(IC50 [nM] * 1e-9) = 9 - log10(IC50 [nM]).

    Parameters
    ----------
    IC50_value : float
        IC50 in nanomolar (nM); must be greater than zero.

    Returns
    -------
    float
        The corresponding pIC50 value.

    Raises
    ------
    ValueError
        If `IC50_value` is zero or negative (the logarithm is undefined).
    """
    if IC50_value <= 0:
        # Same exception type math.log10 would raise, but with an actionable message.
        raise ValueError(f"IC50 must be positive to compute pIC50, got {IC50_value!r}")
    pIC50_value = 9 - math.log10(IC50_value)
    return pIC50_value

In [39]:
# Apply the conversion to every IC50 value of the merged DataFrame.
# A column-wise apply is enough; no row-wise lambda is needed.
output_df["pIC50"] = output_df["IC50"].apply(convert_ic50_to_pic50)
# IC50 and units are superseded by pIC50; reassign rather than mutate in place.
output_df = output_df.drop(columns=["IC50", "units"])
output_df.head()

Unnamed: 0,molecule_chembl_id,smiles,pIC50
0,CHEMBL18854,CCCCCCS(=O)(=O)c1ccc(Cl)cc1C1NC(=O)NC1=O,6.638272
1,CHEMBL19744,CNS(=O)(=O)c1ccc(Cl)cc1C1NC(=O)NC1=O,6.886057
2,CHEMBL19711,O=C1NC(=O)C(c2cc(Cl)ccc2S(=O)(=O)NCCCc2ccccc2)N1,7.0
3,CHEMBL19392,O=C1NC(=O)C(c2cc(Cl)ccc2S(=O)(=O)NCCCCc2ccccc2)N1,5.249492
4,CHEMBL19746,COc1ccc(F)cc1C1NC(=O)NC1=O,5.126679


# Save file

In [40]:
# Write the final table to "<UniProt ID>.csv" in the current working directory.
output_df.to_csv(f"{uniprot_id}.csv")