## **Installing Libraries**

Install the ChEMBL web service package so that we can retrieve bioactivity data from the ChEMBL Database.

In [None]:
! pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-24.1.3-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cattrs-24.1.3-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━

In [None]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target Protein (mTOR)**

## **Target search for serine/threonine-protein kinase mTOR**

In [None]:
# Free-text target search for "Serine/threonine-protein kinase mTOR".
# The client's search result is converted to a DataFrame and displayed.
# NOTE(review): the search is not an exact-name match, so the frame presumably
# also holds loosely related targets — check the row you pick downstream.
target = new_client.target
target_query = target.search('Serine/threonine-protein kinase mTOR')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,Serine/threonine-protein kinase mTOR,30.0,False,CHEMBL2842,"[{'accession': 'P42345', 'component_descriptio...",SINGLE PROTEIN,9606.0
1,[],Rattus norvegicus,Serine/threonine-protein kinase mTOR,30.0,False,CHEMBL1075134,"[{'accession': 'P42346', 'component_descriptio...",SINGLE PROTEIN,10116.0
2,[],Mus musculus,Serine/threonine-protein kinase mTOR,30.0,False,CHEMBL1255165,"[{'accession': 'Q9JLN9', 'component_descriptio...",SINGLE PROTEIN,10090.0
3,[],Mus musculus,mTORC1,30.0,False,CHEMBL5465384,"[{'accession': 'Q9JLN9', 'component_descriptio...",PROTEIN COMPLEX,10090.0
4,[],Homo sapiens,DEPTOR/mTOR,26.0,False,CHEMBL4523674,"[{'accession': 'P42345', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606.0
...,...,...,...,...,...,...,...,...,...
9995,[],Homo sapiens,Integrin alpha4/beta7 complex,0.0,False,CHEMBL5465395,"[{'accession': 'P48169', 'component_descriptio...",PROTEIN COMPLEX,9606.0
9996,[],Mus musculus,Tryptophan 5-hydroxylase 1,0.0,False,CHEMBL5465551,"[{'accession': 'P17532', 'component_descriptio...",SINGLE PROTEIN,10090.0
9997,[],Rattus norvegicus,Soluble guanylate cyclase alpha1/beta1 complex,0.0,False,CHEMBL5482988,"[{'accession': 'P19686', 'component_descriptio...",PROTEIN COMPLEX,10116.0
9998,[],Mus musculus,VHL-Nicotinamide phosphoribosyltransferase,0.0,False,CHEMBL5482989,"[{'accession': 'Q99KQ4', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,10090.0


## **Select and retrieve bioactivity data for serine/threonine-protein kinase mTOR (first entry)**

In [5]:
# Pick the ChEMBL ID stored in the row labelled 0 of the search results
# (the first hit) and echo it as the cell output.
selected_target = targets.loc[0, 'target_chembl_id']
selected_target

'CHEMBL2842'

Here, we will retrieve only bioactivity data for serine/threonine-protein kinase mTOR (CHEMBL2842) that are reported as IC50 values.

In [6]:
# Fetch every activity record for the selected target whose standard_type is
# IC50 and load the records into a DataFrame.
activity = new_client.activity
res = (
    activity
    .filter(target_chembl_id=selected_target)
    .filter(standard_type='IC50')
)
df = pd.DataFrame.from_dict(res)

# Keep only rows reported as IC50 with units of nM or uM.
# NOTE(review): this filter tests the `type`/`units` columns, while the query
# above uses `standard_type` and later cells read `standard_value` — confirm
# that `type`/`units` (rather than the standard_* columns) are the intended ones.
is_ic50 = df['type'] == 'IC50'
has_molar_units = df['units'].isin(['nM', 'uM'])
df_filtered = df[is_ic50 & has_molar_units]

df_filtered

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
12,,,1410291,[],CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,4.8
13,,,1412283,[],CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,2.5
14,,,1412288,[],CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,6.4
15,,,1412303,[],CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,5.3
16,,,1459738,[],CHEMBL830270,Inhibition of Mammalian target of Rapamycin mTOR,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,1.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5264,,,25755796,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5446092,Affinity Biochemical interaction: (Dissociatio...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,nM,UO_0000065,,0.19
5265,,,25777959,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5477195,Selectivity interaction (SelectScreen Kinase P...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,nM,UO_0000065,,4.32
5266,,,25782764,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5468047,Selectivity interaction (Enzyme panel (AstraZe...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,8.0
5267,,,25786790,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5473708,Selectivity interaction (Kinase panel (KinaseG...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,9.0


In [7]:
# Write the filtered IC50 records to 'bioactivity_data.csv' without the row
# index; the data-processing stage reloads the table from this file.
df_filtered.to_csv('bioactivity_data.csv', index=False)

Finally, we save the resulting bioactivity data to a CSV file.

## **Data processing**

Drop any compound that has a missing value in either the `standard_value` or the `canonical_smiles` column.

In [8]:
import pandas as pd

# Reload the saved bioactivity table from disk, then discard every row that
# lacks a standard_value or a canonical_smiles entry.
df_filtered = pd.read_csv('bioactivity_data.csv')
df2 = df_filtered.dropna(subset=['standard_value', 'canonical_smiles'])
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1410291,[],CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,4.80
1,,,1412283,[],CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,2.50
2,,,1412288,[],CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,6.40
3,,,1412303,[],CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,5.30
4,,,1459738,[],CHEMBL830270,Inhibition of Mammalian target of Rapamycin mTOR,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,1.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5131,,,25755796,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5446092,Affinity Biochemical interaction: (Dissociatio...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,nM,UO_0000065,,0.19
5132,,,25777959,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5477195,Selectivity interaction (SelectScreen Kinase P...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,nM,UO_0000065,,4.32
5133,,,25782764,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5468047,Selectivity interaction (Enzyme panel (AstraZe...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,8.00
5134,,,25786790,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5473708,Selectivity interaction (Kinase panel (KinaseG...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,9.00


Drop duplicate rows that share the same `canonical_smiles` value, keeping the first occurrence of each. The number of rows in the resulting DataFrame is the number of unique canonical SMILES.

In [9]:
# One row per distinct canonical_smiles: the first occurrence is kept and any
# later rows with the same SMILES string are dropped.
df2_nr = df2.drop_duplicates(subset='canonical_smiles', keep='first')
df2_nr

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1410291,[],CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,4.80
1,,,1412283,[],CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,2.50
2,,,1412288,[],CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,6.40
3,,,1412303,[],CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,5.30
4,,,1459738,[],CHEMBL830270,Inhibition of Mammalian target of Rapamycin mTOR,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,1.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5107,"{'action_type': 'INHIBITOR', 'description': 'N...",,25668044,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5375563,Inhibition of recombinant human GST-tagged mTO...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,nM,UO_0000065,,7.10
5108,"{'action_type': 'INHIBITOR', 'description': 'N...",,25683584,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5380301,Inhibition of mTOR (unknown origin) incubated ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,0.83
5109,"{'action_type': 'INHIBITOR', 'description': 'N...",,25683585,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5380301,Inhibition of mTOR (unknown origin) incubated ...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,2.85
5133,,,25782764,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5468047,Selectivity interaction (Enzyme panel (AstraZe...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase mTOR,9606,,,IC50,uM,UO_0000065,,8.00


Select the three columns (`molecule_chembl_id`, `canonical_smiles`, `standard_value`) into a new DataFrame.

In [10]:
# Narrow the de-duplicated table to the compound ID, its SMILES string and
# the measured standard_value.
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2_nr.loc[:, selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL435507,CC1CN(c2cc(=O)c3ccc4ccccc4c3o2)CCO1,4800.0
1,CHEMBL98350,O=c1cc(N2CCOCC2)oc2c(-c3ccccc3)cccc12,2500.0
2,CHEMBL104468,O=c1cc(N2CCOCC2)oc2c1ccc1ccccc12,6400.0
3,CHEMBL179242,O=c1cc(N2CCOCC2)nc2c3ccccc3ccn12,5300.0
4,CHEMBL188678,O=c1cc(N2CCOCC2)oc2c(-c3cccc4c3sc3ccccc34)cccc12,1700.0
...,...,...,...
5107,CHEMBL5433283,Cc1cccc2c(-c3ncc4c(n3)N3CCOC[C@@]3(C)C(=O)N4C3...,7.1
5108,CHEMBL5437417,N#Cc1c(-c2ccc(Cl)cc2)nc(N/N=C/c2cccc(Cl)c2)nc1...,830.0
5109,CHEMBL5432408,N#Cc1c(-c2ccc(Cl)cc2)nc(N/N=C/c2cccc(O)c2)nc1N...,2850.0
5133,CHEMBL2165191,Cc1cc([C@@H](C)Nc2ccccc2C(=O)O)c2nc(N3CCOCC3)c...,8000.0


## **Labeling compounds as either being active, inactive or intermediate**

In [11]:
def _activity_label(raw_value):
    """Return the class label for one ``standard_value`` entry.

    Values of 1000 or more are "inactive", values of 100 or less are
    "active", and everything else is "intermediate".
    (Presumably the values are IC50 in nM — confirm against the data source.)
    """
    numeric = float(raw_value)
    if numeric >= 1000:
        return "inactive"
    return "active" if numeric <= 100 else "intermediate"


# Label every compound in row order.
bioactivity_threshold = [_activity_label(v) for v in df3.standard_value]

# Renumber the rows 0..n-1 so the label Series (which gets a fresh default
# index) lines up row-for-row when concatenated column-wise.
df3 = df3.reset_index(drop=True)

bioactivity_class = pd.Series(bioactivity_threshold, name='class')
df4 = pd.concat([df3, bioactivity_class], axis=1)
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL435507,CC1CN(c2cc(=O)c3ccc4ccccc4c3o2)CCO1,4800.0,inactive
1,CHEMBL98350,O=c1cc(N2CCOCC2)oc2c(-c3ccccc3)cccc12,2500.0,inactive
2,CHEMBL104468,O=c1cc(N2CCOCC2)oc2c1ccc1ccccc12,6400.0,inactive
3,CHEMBL179242,O=c1cc(N2CCOCC2)nc2c3ccccc3ccn12,5300.0,inactive
4,CHEMBL188678,O=c1cc(N2CCOCC2)oc2c(-c3cccc4c3sc3ccccc34)cccc12,1700.0,inactive
...,...,...,...,...
4567,CHEMBL5433283,Cc1cccc2c(-c3ncc4c(n3)N3CCOC[C@@]3(C)C(=O)N4C3...,7.1,active
4568,CHEMBL5437417,N#Cc1c(-c2ccc(Cl)cc2)nc(N/N=C/c2cccc(Cl)c2)nc1...,830.0,intermediate
4569,CHEMBL5432408,N#Cc1c(-c2ccc(Cl)cc2)nc(N/N=C/c2cccc(O)c2)nc1N...,2850.0,inactive
4570,CHEMBL2165191,Cc1cc([C@@H](C)Nc2ccccc2C(=O)O)c2nc(N3CCOCC3)c...,8000.0,inactive


Remove the rows labelled `intermediate`, then sort the remaining compounds by `standard_value`.

In [12]:
# Keep only the compounds labelled active or inactive and order them by
# ascending standard_value.
not_intermediate = df4['class'].ne('intermediate')
df5 = df4.loc[not_intermediate].sort_values(by='standard_value')
df5

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
2160,CHEMBL3645910,CCc1ncnc(-c2cc(F)c(C(=O)N3CCN4CCC[C@H]4C3)c(Cl...,0.07,active
89,CHEMBL558955,COC(=O)N1CCC(n2ncc3c(N4CCOCC4)nc(-c4ccc(NC(=O)...,0.08,active
167,CHEMBL585480,CCn1ncc2c(N3CC4CCC(C3)O4)nc(-c3ccc(NC(=O)Nc4cc...,0.10,active
148,CHEMBL566004,CCn1ncc2c(N3CC4CCC(C3)O4)nc(-c3ccc(NC(=O)Nc4cc...,0.10,active
497,CHEMBL1098245,COC(=O)N1CCC(n2ncc3c(N4CCOCC4)nc(-c4ccc(NC(=O)...,0.10,active
...,...,...,...,...
674,CHEMBL1241864,Nc1ncnc2c1c(-c1ccc3[nH]c(=O)ccc3c1)nn2C1CCCC1,100000.00,inactive
568,CHEMBL1241482,Nc1ncnc2c1c(-c1ccc(F)c(O)c1)nn2Cc1ocnc1-c1ccccc1,100000.00,inactive
781,CHEMBL1631890,Fc1cc(F)cc(C#Cc2n[nH]c3ccccc23)c1,200000.00,inactive
827,CHEMBL113,Cn1c(=O)c2c(ncn2C)n(C)c1=O,400000.00,inactive


In [13]:
# Tally how many compounds fall into each remaining class.
counts = df5['class'].value_counts()

# value_counts() omits labels that never occur, so look the labels up with a
# default of 0 instead of indexing directly (which would raise KeyError when
# a class is absent from the data).
active_count = counts.get('active', 0)
inactive_count = counts.get('inactive', 0)

print("Active count:", active_count)
print("Inactive count:", inactive_count)

Active count: 2468
Inactive count: 1181


Save the final DataFrame to a CSV file and download it.

In [14]:
# Write the labelled active/inactive dataset to disk without the row index.
output_csv = 'Serine_threonine-protein_Kinase_mTOR.csv'
df5.to_csv(output_csv, index=False)

# The browser download helper only exists inside Google Colab. Import it
# defensively so the cell still completes (with the file saved) when the
# notebook is run in any other Jupyter environment.
try:
    from google.colab import files
except ImportError:
    print(f"Saved {output_csv}; browser download skipped (not running in Google Colab).")
else:
    files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>