#CafChem tools for accessing Chembl via the API.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/MauricioCafiero/CafChem/blob/main/notebooks/Chembl_CafChem.ipynb)

## This notebook allows you to:
- Search Uniprot for IDs for proteins
- use uniprot IDs to probe Chembl for targets, molecules and bioactivities

## Requirements:
- This notebook will install chembl_webresource_client.
- Runs on CPU or your local runtime.

## Set-up

In [1]:
!pip install -q chembl_webresource_client

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.7/70.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
bigframes 2.12.0 requires google-cloud-bigquery[bqstorage,pandas]>=3.31.0, but you have google-cloud-bigquery 3.25.0 which is incompatible.
bigframes 2.12.0 requires rich<14,>=12.4.4, but you have rich 14.1.0 which is incompatible.
gcsfs 2025.3.0

### Import libraries

In [2]:
import numpy as np
import pandas as pd
from chembl_webresource_client.new_client import new_client
from tqdm.auto import tqdm
import requests 

import os

## Query Uniprot for IDs for a protein

In [3]:
protein_name = 'MAOB'
url = f'https://rest.uniprot.org/uniprotkb/search?query={protein_name}&format=tsv'
response = requests.get(url).text
#print(response.status_code)

In [4]:
f = open(f"{protein_name}_uniprot_ids.tsv", "w")
f.write(response)
f.close()

## Define clients and load Uniprot ID files
- set to select human proteins but can be adjusted to anything

In [5]:
targets = new_client.target
compounds = new_client.molecule
bioact = new_client.activity

### If not working on Kaggle rename the path to your path

In [6]:
prot_df = pd.read_csv(f'/kaggle/working/{protein_name}_uniprot_ids.tsv', sep='\t')
prot_human_df = prot_df[prot_df['Organism'] == "Homo sapiens (Human)"]
print(f"Found {len(prot_human_df)} Human proteins out of {len(prot_df)} total proteins")

Found 1 Human proteins out of 25 total proteins


In [7]:
prot_human_df.head()

Unnamed: 0,Entry,Entry Name,Reviewed,Protein names,Gene Names,Organism,Length
0,P27338,AOFB_HUMAN,reviewed,Amine oxidase [flavin-containing] B (EC 1.4.3....,MAOB,Homo sapiens (Human),520


In [8]:
prot_up_ids = prot_human_df["Entry"].to_list()

## Search Chembl for Uniprot ID, return targets

In [9]:
prot_dfs = []
for ui in prot_up_ids:
    target_info = targets.get(target_components__accession=ui).only("target_chembl_id","orgamism", "pref_name", "target_type")
    target_info = pd.DataFrame.from_records(target_info)
    if len(target_info) > 0:
        prot_dfs.append(target_info)
        print(f"Found info for Uniprot ID: {ui}")

print(f"Collected {len(prot_dfs)} dataframes")

Found info for Uniprot ID: P27338
Collected 1 dataframes


### scroll through dataframe to find frames of interest

In [10]:
prot_dfs[0].head()

Unnamed: 0,pref_name,target_chembl_id,target_type
0,Amine oxidase [flavin-containing] B,CHEMBL2039,SINGLE PROTEIN
1,Amine oxidase [flavin-containing] B,CHEMBL2039,SINGLE PROTEIN
2,Monoamine oxidase,CHEMBL2095205,PROTEIN FAMILY


## Choose Chembl ID and get bioactivities
- first [] is the desired dataframe
- second [] is the record within the dataframe 

In [11]:
specific_target = prot_dfs[0].iloc[0]
chembl_id = specific_target.target_chembl_id
print(f"Using chembl id: {chembl_id}")

Using chembl id: CHEMBL2039


In [12]:
bioact_chosen = bioact.filter(target_chembl_id=chembl_id, type="IC50", relation="=").only(
    "molecule_chembl_id",
    "type",
    "standard_units",
    "relation",
    "standard_value",
)

print(f"Lenth of Bioactivities: {len(bioact_chosen)}")

Lenth of Bioactivities: 5518


In [13]:
bioact_chosen[0] # look at example

{'molecule_chembl_id': 'CHEMBL350093',
 'relation': '=',
 'standard_units': 'nM',
 'standard_value': '18.0',
 'type': 'IC50',
 'units': 'nM',
 'value': '18.0'}

In [14]:
chembl_ids = []
ic50s = []
for record in bioact_chosen:
    if record["standard_units"] == 'nM':
        chembl_ids.append(record["molecule_chembl_id"])
        ic50s.append(float(record["standard_value"]))

bioact_dict = {'chembl_ids' : chembl_ids, 'IC50s': ic50s}
bioact_df = pd.DataFrame.from_dict(bioact_dict)
bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
bioact_df.head()

Unnamed: 0,chembl_ids,ic50s
0,CHEMBL350093,18.0
1,CHEMBL161907,9.0
2,CHEMBL17079,4.4
3,CHEMBL157182,395000.0
4,CHEMBL160347,23400.0


In [15]:
print(f"Number of records: {len(bioact_df)}")
print(bioact_df.shape)

Number of records: 5515
(5515, 2)


## Get compounds

In [16]:
compounds_provider = compounds.filter(molecule_chembl_id__in=bioact_df["chembl_ids"].to_list()).only(
    "molecule_chembl_id",
    "molecule_structures"
)

In [17]:
cids_list = []
smiles_list = []

for record in compounds_provider:
    cid = record['molecule_chembl_id']
    cids_list.append(cid)
    
    if record['molecule_structures']:
        if record['molecule_structures']['canonical_smiles']:
            smile = record['molecule_structures']['canonical_smiles']
        else:
            print("No canonical smiles")
            smile = None
    else:
        print('no structures')
        smile = None
    smiles_list.append(smile)

new_dict = {'SMILES': smiles_list, 'chembl_ids_2': cids_list}
new_df = pd.DataFrame.from_dict(new_dict)

total_bioact_df = pd.merge(bioact_df, new_df, left_on='chembl_ids', right_on='chembl_ids_2')
print(f"number of records: {len(total_bioact_df)}")

total_bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
print(f"number of records after removing duplicates: {len(total_bioact_df)}")

total_bioact_df.dropna(axis=0, how='any', inplace=True)
total_bioact_df.drop(["chembl_ids_2"],axis=1,inplace=True)
print(f"number of records after dropping Null values: {len(total_bioact_df)}")

total_bioact_df.head()

no structures
no structures
no structures
no structures
no structures
number of records: 5515
number of records after removing duplicates: 5515
number of records after dropping Null values: 5510


Unnamed: 0,chembl_ids,ic50s,SMILES
0,CHEMBL350093,18.0,N#CCCN1CC(=O)OC(c2ccc(OCc3ccccc3)cc2)=N1
1,CHEMBL161907,9.0,O=c1c(=O)c2ccc(OCCCC(F)(F)F)cc2c1=O
2,CHEMBL17079,4.4,N#CCCn1nc(-c2ccc(OCc3ccccc3)cc2)oc1=S
3,CHEMBL157182,395000.0,C/N=C1/CCc2c1n(C)c1cc(Cl)c(OC(=O)NC)cc21
4,CHEMBL160347,23400.0,COc1cc(Br)c2oc(C3CCNCC3)cc2c1


## Save to a CSV file

In [18]:
total_bioact_df.to_csv(f'{protein_name}_dataset.csv', index=False)