# Computational Drug Discovery Project - Ligand Based Drug Design [Part1] Data Collection and Pre-processing

Perform Data Collection and Pre-Processing from the ChEMBL Database

ChEMBL database is an open large-scale bioactivity database containing binding, functional and ADME information for a large number of drug-like bioactive compounds.

-------------------------------------------------------------------------------------------------------------------

In [None]:
# installing libraries
!pip install pandas

In [None]:
# install the ChEMBL web service package
!pip install chembl_webresource_client

In [1]:
# importing libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

## Search for the target protein

In [2]:
# target search for PPAR alpha
target = new_client.target
target_query = target.search('Peroxisome proliferator-activated receptor alpha')
target_s = pd.DataFrame.from_dict(target_query)
target_s

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,Peroxisome proliferator-activated receptor,42.0,False,CHEMBL3559683,"[{'accession': 'P37231', 'component_descriptio...",PROTEIN FAMILY,9606.0
1,[],Homo sapiens,PPAR alpha/gamma,40.0,False,CHEMBL2111325,"[{'accession': 'P37231', 'component_descriptio...",SELECTIVITY GROUP,9606.0
2,"[{'xref_id': 'Q95N78', 'xref_name': None, 'xre...",Canis lupus familiaris,Peroxisome proliferator-activated receptor alpha,39.0,False,CHEMBL2847,"[{'accession': 'Q95N78', 'component_descriptio...",SINGLE PROTEIN,9615.0
3,[],Canis familiaris,Peroxisome proliferator-activated receptor delta,39.0,False,CHEMBL1932904,"[{'accession': 'Q0ZAQ8', 'component_descriptio...",SINGLE PROTEIN,9615.0
4,"[{'xref_id': 'Q07869', 'xref_name': None, 'xre...",Homo sapiens,Peroxisome proliferator-activated receptor alpha,38.0,False,CHEMBL239,"[{'accession': 'Q07869', 'component_descriptio...",SINGLE PROTEIN,9606.0
...,...,...,...,...,...,...,...,...,...
8260,[],Homo sapiens,Histone-lysine N-methyltransferase PRDM7,0.0,False,CHEMBL5214861,"[{'accession': 'Q9NQW5', 'component_descriptio...",SINGLE PROTEIN,9606.0
8261,[],Homo sapiens,PR domain zinc finger protein 2,0.0,False,CHEMBL5214862,"[{'accession': 'Q13029', 'component_descriptio...",SINGLE PROTEIN,9606.0
8262,[],Homo sapiens,PR domain zinc finger protein 10,0.0,False,CHEMBL5214863,"[{'accession': 'Q9NQV6', 'component_descriptio...",SINGLE PROTEIN,9606.0
8263,[],Homo sapiens,PR domain zinc finger protein 8,0.0,False,CHEMBL5214864,"[{'accession': 'Q9NQV8', 'component_descriptio...",SINGLE PROTEIN,9606.0


In [3]:
# select and retrieve bioactivity data for Human PPAR alpha
selected_target = target_s.target_chembl_id[4]
selected_target

'CHEMBL239'

In [4]:
all_activity = new_client.activity
res = all_activity.filter(target_chembl_id=selected_target)

In [5]:
df = pd.DataFrame.from_dict(res)
df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,105366,[],CHEMBL764508,In vitro binding affinity against human Peroxi...,B,,,BAO_0000190,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,IC50,uM,UO_0000065,,10.0
1,,Not Determined,105369,[],CHEMBL763851,In vitro transcriptional activation in COS cel...,F,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,
2,,,105857,[],CHEMBL763845,In vitro effective concentration for agonist a...,F,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,2.7
3,,,106513,[],CHEMBL763565,Transcriptional activation activity on human P...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,0.000615
4,,,106514,[],CHEMBL882312,Transcriptional activation activity on human I...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,0.667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8981,"{'action_type': 'PARTIAL AGONIST', 'descriptio...",,24971622,[],CHEMBL5218425,Partial agonist activity at PPARalpha (unknown...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,58.2
8982,,ND,24971623,[],CHEMBL5218425,Partial agonist activity at PPARalpha (unknown...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,,,,
8983,"{'action_type': 'PARTIAL AGONIST', 'descriptio...",,24971624,[],CHEMBL5218425,Partial agonist activity at PPARalpha (unknown...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,0.02
8984,,,24971633,[],CHEMBL5218430,Partial agonist activity at PPARalpha (unknown...,B,,,BAO_0000375,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,Activity,,,,


In [6]:
df.to_csv('ppara_initial_bioactivity_data.csv', index=False)

Retrieve only bioactivity data for CHEMBL239 that are reported as values with nM (nanomolar) standard unit, binding measurements of compounds, and a direct single protein target.

In [7]:
activity = new_client.activity
res_in = activity.filter(target_chembl_id=selected_target).filter(standard_relation='=').filter(assay_type='B').filter(confidence_scores=9)

In [8]:
df2 = pd.DataFrame.from_dict(res_in)
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,106513,[],CHEMBL763565,Transcriptional activation activity on human P...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,0.000615
1,,,106514,[],CHEMBL882312,Transcriptional activation activity on human I...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,0.667
2,,,106515,[],CHEMBL763566,Transcriptional activation activity on human T...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,0.014
3,,,106516,[],CHEMBL763895,Ratio of transcriptional activation of I272F m...,B,,,BAO_0000193,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,Ratio,,,,108.0
4,,,106517,[],CHEMBL763898,Ratio of transcriptional activation of T279M m...,B,,,BAO_0000193,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,Ratio,,,,2.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3984,,,24971600,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5218415,Displacement of fluormone pan-PPARgreen from P...,B,,,BAO_0001103,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,Activity,%,UO_0000187,,37.6
3985,"{'action_type': 'PARTIAL AGONIST', 'descriptio...",,24971608,[],CHEMBL5218421,Binding affinity to PPARalpha (unknown origin)...,B,,,BAO_0000192,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,Ki,uM,UO_0000065,,9.4
3986,"{'action_type': 'AGONIST', 'description': 'Bin...",,24971611,[],CHEMBL5218421,Binding affinity to PPARalpha (unknown origin)...,B,,,BAO_0000192,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,Ki,uM,UO_0000065,,0.006999999999999999
3987,"{'action_type': 'PARTIAL AGONIST', 'descriptio...",,24971622,[],CHEMBL5218425,Partial agonist activity at PPARalpha (unknown...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,58.2


Select important types that are related to potency of drugs

In [9]:
# select types of bioactivity 
filter_list = ["IC50", "EC50", "Ki","Kd"]
df3=df2[df2.standard_type.isin(filter_list)]
df3

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,106513,[],CHEMBL763565,Transcriptional activation activity on human P...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,0.000615
1,,,106514,[],CHEMBL882312,Transcriptional activation activity on human I...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,0.667
2,,,106515,[],CHEMBL763566,Transcriptional activation activity on human T...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,0.014
5,,,108758,[],CHEMBL764508,In vitro binding affinity against human Peroxi...,B,,,BAO_0000190,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,IC50,uM,UO_0000065,,2.1
6,,,108768,[],CHEMBL764508,In vitro binding affinity against human Peroxi...,B,,,BAO_0000190,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,IC50,uM,UO_0000065,,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3961,"{'action_type': 'AGONIST', 'description': 'Bin...",,24907997,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5162396,Agonist activity at yeast Gal4-fused human PPA...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,12.1
3985,"{'action_type': 'PARTIAL AGONIST', 'descriptio...",,24971608,[],CHEMBL5218421,Binding affinity to PPARalpha (unknown origin)...,B,,,BAO_0000192,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,Ki,uM,UO_0000065,,9.4
3986,"{'action_type': 'AGONIST', 'description': 'Bin...",,24971611,[],CHEMBL5218421,Binding affinity to PPARalpha (unknown origin)...,B,,,BAO_0000192,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,Ki,uM,UO_0000065,,0.006999999999999999
3987,"{'action_type': 'PARTIAL AGONIST', 'descriptio...",,24971622,[],CHEMBL5218425,Partial agonist activity at PPARalpha (unknown...,B,,,BAO_0000188,...,Homo sapiens,Peroxisome proliferator-activated receptor alpha,9606,,,EC50,uM,UO_0000065,,58.2


In [10]:
df3['type'].value_counts()

EC50      1355
IC50       696
pEC50      154
Ki         134
pIC50       69
Kd          28
Log Ki      13
Name: type, dtype: int64

In [11]:
# select columns including id, structures, bioactivity values and types
selection = ['molecule_chembl_id','canonical_smiles','standard_value','type']
df4 = df3[selection]
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,type
0,CHEMBL318351,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3...,0.615,EC50
1,CHEMBL318351,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3...,667.0,EC50
2,CHEMBL318351,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3...,14.0,EC50
5,CHEMBL94496,CCCc1cc(Oc2ccc(C(C)C)cc2)ccc1OCCCOc1cccc(C2SC(...,2100.0,IC50
6,CHEMBL420441,CCCc1cc(Oc2ccc(Cl)cc2)ccc1OCCCOc1cccc(C2SC(=O)...,100.0,IC50
...,...,...,...,...
3961,CHEMBL5199782,CC(C)(C)OC(=O)NC(COCc1ccc(C(=O)c2ccccc2)cc1)C(...,12100.0,EC50
3985,CHEMBL5219536,COc1cccc2ccc(-c3cc(-c4ccc5cccc(OC)c5c4O)c4cccc...,9400.0,Ki
3986,CHEMBL21241,CC(C)(Sc1ccc(CCN(CCCCC2CCCCC2)C(=O)NC2CCCCC2)c...,7.0,Ki
3987,CHEMBL5219536,COc1cccc2ccc(-c3cc(-c4ccc5cccc(OC)c5c4O)c4cccc...,58200.0,EC50


## Data pre-processing

### Handling missing data

In [12]:
# drop missing values in standard_value columns
df5 = df4[df4.standard_value.notna()]
df5 = df5[df4.canonical_smiles.notna()]
df5

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,type
0,CHEMBL318351,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3...,0.615,EC50
1,CHEMBL318351,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3...,667.0,EC50
2,CHEMBL318351,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3...,14.0,EC50
5,CHEMBL94496,CCCc1cc(Oc2ccc(C(C)C)cc2)ccc1OCCCOc1cccc(C2SC(...,2100.0,IC50
6,CHEMBL420441,CCCc1cc(Oc2ccc(Cl)cc2)ccc1OCCCOc1cccc(C2SC(=O)...,100.0,IC50
...,...,...,...,...
3961,CHEMBL5199782,CC(C)(C)OC(=O)NC(COCc1ccc(C(=O)c2ccccc2)cc1)C(...,12100.0,EC50
3985,CHEMBL5219536,COc1cccc2ccc(-c3cc(-c4ccc5cccc(OC)c5c4O)c4cccc...,9400.0,Ki
3986,CHEMBL21241,CC(C)(Sc1ccc(CCN(CCCCC2CCCCC2)C(=O)NC2CCCCC2)c...,7.0,Ki
3987,CHEMBL5219536,COc1cccc2ccc(-c3cc(-c4ccc5cccc(OC)c5c4O)c4cccc...,58200.0,EC50


In [13]:
# save the resulting data to a CSV file
df5.to_csv('ppara_standardv_bioactivity.csv', index=False)

### Merge duplicate data

In [14]:
df6 = pd.read_csv('ppara_standardv_bioactivity.csv')
df6

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,type
0,CHEMBL318351,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3...,0.615,EC50
1,CHEMBL318351,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3...,667.000,EC50
2,CHEMBL318351,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3...,14.000,EC50
3,CHEMBL94496,CCCc1cc(Oc2ccc(C(C)C)cc2)ccc1OCCCOc1cccc(C2SC(...,2100.000,IC50
4,CHEMBL420441,CCCc1cc(Oc2ccc(Cl)cc2)ccc1OCCCOc1cccc(C2SC(=O)...,100.000,IC50
...,...,...,...,...
2444,CHEMBL5199782,CC(C)(C)OC(=O)NC(COCc1ccc(C(=O)c2ccccc2)cc1)C(...,12100.000,EC50
2445,CHEMBL5219536,COc1cccc2ccc(-c3cc(-c4ccc5cccc(OC)c5c4O)c4cccc...,9400.000,Ki
2446,CHEMBL21241,CC(C)(Sc1ccc(CCN(CCCCC2CCCCC2)C(=O)NC2CCCCC2)c...,7.000,Ki
2447,CHEMBL5219536,COc1cccc2ccc(-c3cc(-c4ccc5cccc(OC)c5c4O)c4cccc...,58200.000,EC50


In [15]:
# select duplicate data
df_du=df6[df6.duplicated(['molecule_chembl_id'], keep=False)]
df_du

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,type
0,CHEMBL318351,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3...,0.615,EC50
1,CHEMBL318351,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3...,667.000,EC50
2,CHEMBL318351,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3...,14.000,EC50
6,CHEMBL322086,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccccc2)c1)C(=O)O,3380.000,EC50
7,CHEMBL322086,CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccccc2)c1)C(=O)O,11400.000,EC50
...,...,...,...,...
2435,CHEMBL295416,Cc1cccc(Nc2cc(Cl)nc(SCC(=O)O)n2)c1C,1560.000,EC50
2445,CHEMBL5219536,COc1cccc2ccc(-c3cc(-c4ccc5cccc(OC)c5c4O)c4cccc...,9400.000,Ki
2446,CHEMBL21241,CC(C)(Sc1ccc(CCN(CCCCC2CCCCC2)C(=O)NC2CCCCC2)c...,7.000,Ki
2447,CHEMBL5219536,COc1cccc2ccc(-c3cc(-c4ccc5cccc(OC)c5c4O)c4cccc...,58200.000,EC50


In [16]:
# average values
df_mean = df_du.groupby(['molecule_chembl_id','canonical_smiles']).agg({'standard_value': 'mean','type': lambda x: pd.unique(x)}).reset_index()
df_mean

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,type
0,CHEMBL104850,Cc1oc(-c2ccccc2)nc1CCOc1ccc(C[C@@H](C(=O)O)n2c...,1609.556000,"[Ki, IC50, pEC50]"
1,CHEMBL106256,CCCc1c(O)c(C(=O)CC)cc(Cl)c1OC(C(=O)O)c1ccccc1,548.500000,[Ki]
2,CHEMBL106666,COC(=O)c1ccccc1N[C@@H](Cc1ccc(OCCc2nc(-c3ccccc...,2283.685000,"[Ki, pEC50]"
3,CHEMBL107367,Cc1oc(-c2ccccc2)nc1CCOc1ccc(C[C@H](Nc2ccccc2C(...,855.838571,"[Ki, Log Ki, IC50, EC50, pEC50]"
4,CHEMBL107498,CCCCCCCCCC(=O)O,30500.000000,"[IC50, EC50]"
...,...,...,...,...
276,CHEMBL87380,CCO[C@@H](Cc1ccc(OCC=C(c2ccc(C)cc2)c2ccc(C)cc2...,695.915000,"[EC50, pEC50]"
277,CHEMBL8739,CC/C=C\C/C=C\C/C=C\CCCCCCCC(=O)O,802.633333,"[IC50, Kd]"
278,CHEMBL89152,CCO[C@@H](Cc1ccc(OC/C=C(\c2ccccc2)c2ccc(Br)cc2...,1371.060000,"[EC50, pEC50]"
279,CHEMBL94306,CCCc1cc(Oc2ccccc2)ccc1OCCCOc1cccc(C2SC(=O)NC2=...,29.000000,"[IC50, Ki, EC50]"


In [17]:
# calulate standard deviation
df_std = df_du.groupby('molecule_chembl_id').agg({'canonical_smiles': lambda x: pd.unique(x),'standard_value': 'std','type': lambda x: pd.unique(x)}).reset_index()

In [18]:
# rename standard_value column to std
df_std = df_std.rename(columns={"standard_value": "std"})
df_std

Unnamed: 0,molecule_chembl_id,canonical_smiles,std,type
0,CHEMBL104850,[Cc1oc(-c2ccccc2)nc1CCOc1ccc(C[C@@H](C(=O)O)n2...,1423.219925,"[Ki, IC50, pEC50]"
1,CHEMBL106256,[CCCc1c(O)c(C(=O)CC)cc(Cl)c1OC(C(=O)O)c1ccccc1],361.331565,[Ki]
2,CHEMBL106666,[COC(=O)c1ccccc1N[C@@H](Cc1ccc(OCCc2nc(-c3cccc...,1673.983381,"[Ki, pEC50]"
3,CHEMBL107367,[Cc1oc(-c2ccccc2)nc1CCOc1ccc(C[C@H](Nc2ccccc2C...,949.876842,"[Ki, Log Ki, IC50, EC50, pEC50]"
4,CHEMBL107498,[CCCCCCCCCC(=O)O],707.106781,"[IC50, EC50]"
...,...,...,...,...
276,CHEMBL87380,[CCO[C@@H](Cc1ccc(OCC=C(c2ccc(C)cc2)c2ccc(C)cc...,4.716952,"[EC50, pEC50]"
277,CHEMBL8739,[CC/C=C\C/C=C\C/C=C\CCCCCCCC(=O)O],688.259256,"[IC50, Kd]"
278,CHEMBL89152,[CCO[C@@H](Cc1ccc(OC/C=C(\c2ccccc2)c2ccc(Br)cc...,544.469195,"[EC50, pEC50]"
279,CHEMBL94306,[CCCc1cc(Oc2ccccc2)ccc1OCCCOc1cccc(C2SC(=O)NC2...,1.154701,"[IC50, Ki, EC50]"


In [19]:
# add std data to mean data
column = df_std["std"]
df7 = pd.concat([df_mean,column], axis = 1)
df7

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,type,std
0,CHEMBL104850,Cc1oc(-c2ccccc2)nc1CCOc1ccc(C[C@@H](C(=O)O)n2c...,1609.556000,"[Ki, IC50, pEC50]",1423.219925
1,CHEMBL106256,CCCc1c(O)c(C(=O)CC)cc(Cl)c1OC(C(=O)O)c1ccccc1,548.500000,[Ki],361.331565
2,CHEMBL106666,COC(=O)c1ccccc1N[C@@H](Cc1ccc(OCCc2nc(-c3ccccc...,2283.685000,"[Ki, pEC50]",1673.983381
3,CHEMBL107367,Cc1oc(-c2ccccc2)nc1CCOc1ccc(C[C@H](Nc2ccccc2C(...,855.838571,"[Ki, Log Ki, IC50, EC50, pEC50]",949.876842
4,CHEMBL107498,CCCCCCCCCC(=O)O,30500.000000,"[IC50, EC50]",707.106781
...,...,...,...,...,...
276,CHEMBL87380,CCO[C@@H](Cc1ccc(OCC=C(c2ccc(C)cc2)c2ccc(C)cc2...,695.915000,"[EC50, pEC50]",4.716952
277,CHEMBL8739,CC/C=C\C/C=C\C/C=C\CCCCCCCC(=O)O,802.633333,"[IC50, Kd]",688.259256
278,CHEMBL89152,CCO[C@@H](Cc1ccc(OC/C=C(\c2ccccc2)c2ccc(Br)cc2...,1371.060000,"[EC50, pEC50]",544.469195
279,CHEMBL94306,CCCc1cc(Oc2ccccc2)ccc1OCCCOc1cccc(C2SC(=O)NC2=...,29.000000,"[IC50, Ki, EC50]",1.154701


In [20]:
# drop standard deviation >= 2
indexNames = df7[df7['std'] >= 2].index
df8 = df7.drop(indexNames , inplace=False)
df8

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,type,std
22,CHEMBL15594,CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)O,1200.0,[IC50],0.0
24,CHEMBL167430,CCO[C@@H](Cc1ccc(OCCn2c3ccccc3c3cc(Br)ccc32)cc...,2399.22,"[pEC50, EC50]",0.6755
44,CHEMBL192518,O=C(O)[C@H](Cc1ccccc1)Oc1ccc(C(F)(F)F)cc1,500.396667,"[EC50, pEC50]",0.687047
46,CHEMBL1934484,Cc1ccc(-c2nc(C)c(C(=O)N[C@H]3CCCN(c4cccc(C(=O)...,6.0,[EC50],0.0
69,CHEMBL212911,CCc1ccc(O[C@H](C)CCOc2ccc(CCC(=O)O)c(C)c2)c(C(...,4257.0,[IC50],0.0
81,CHEMBL2237300,Cc1oc(C2CCCCC2)nc1CCOc1ccc(C[C@H](Oc2ccccc2)C(...,1.0,"[pEC50, EC50]",0.0
82,CHEMBL2282517,Cc1sc(-c2ccccc2)nc1CCOc1ccc(C[C@@](O)(C(=O)O)c...,1047.065,"[pEC50, EC50]",0.091924
83,CHEMBL2282518,Cc1sc(N2CCN(C)CC2)nc1CCOc1ccc(C[C@@](O)(C(=O)O...,6456.27,"[pEC50, EC50]",0.381838
88,CHEMBL2282523,Cc1oc(-c2ccccc2)nc1CCOc1ccc(C[C@@](O)(C(=O)O)c...,457.045,"[pEC50, EC50]",0.06364
101,CHEMBL272137,CCCOc1ccc(C[C@@H](CC)C(=O)O)cc1CNC(=O)c1ccc(C2...,500.0,[EC50],0.0


In [21]:
# drop std columns
df9= df8.drop(columns=['std'])
df9

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,type
22,CHEMBL15594,CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)O,1200.0,[IC50]
24,CHEMBL167430,CCO[C@@H](Cc1ccc(OCCn2c3ccccc3c3cc(Br)ccc32)cc...,2399.22,"[pEC50, EC50]"
44,CHEMBL192518,O=C(O)[C@H](Cc1ccccc1)Oc1ccc(C(F)(F)F)cc1,500.396667,"[EC50, pEC50]"
46,CHEMBL1934484,Cc1ccc(-c2nc(C)c(C(=O)N[C@H]3CCCN(c4cccc(C(=O)...,6.0,[EC50]
69,CHEMBL212911,CCc1ccc(O[C@H](C)CCOc2ccc(CCC(=O)O)c(C)c2)c(C(...,4257.0,[IC50]
81,CHEMBL2237300,Cc1oc(C2CCCCC2)nc1CCOc1ccc(C[C@H](Oc2ccccc2)C(...,1.0,"[pEC50, EC50]"
82,CHEMBL2282517,Cc1sc(-c2ccccc2)nc1CCOc1ccc(C[C@@](O)(C(=O)O)c...,1047.065,"[pEC50, EC50]"
83,CHEMBL2282518,Cc1sc(N2CCN(C)CC2)nc1CCOc1ccc(C[C@@](O)(C(=O)O...,6456.27,"[pEC50, EC50]"
88,CHEMBL2282523,Cc1oc(-c2ccccc2)nc1CCOc1ccc(C[C@@](O)(C(=O)O)c...,457.045,"[pEC50, EC50]"
101,CHEMBL272137,CCCOc1ccc(C[C@@H](CC)C(=O)O)cc1CNC(=O)c1ccc(C2...,500.0,[EC50]


Select non-duplicate data

In [22]:
# drop duplicate data
df10 = df6.drop_duplicates(['canonical_smiles'],keep=False)
df10

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,type
3,CHEMBL94496,CCCc1cc(Oc2ccc(C(C)C)cc2)ccc1OCCCOc1cccc(C2SC(...,2100.0,IC50
4,CHEMBL420441,CCCc1cc(Oc2ccc(Cl)cc2)ccc1OCCCOc1cccc(C2SC(=O)...,100.0,IC50
5,CHEMBL328615,CCCc1cc(Oc2ccc(Cl)c(C)c2)ccc1OCCCOc1cccc(C2SC(...,162.0,IC50
8,CHEMBL96107,O=C1NC(=O)C(c2cccc(OCCCOc3ccc(Oc4ccccc4)cc3)c2)S1,47.0,IC50
9,CHEMBL94397,CCCc1cc(Oc2ccc(O)cc2)ccc1OCCCOc1cccc(C2SC(=O)N...,950.0,IC50
...,...,...,...,...
2440,CHEMBL5194126,CC(C)(C)OC(=O)NC(CSCc1ccc(C(=O)c2ccc([N+](=O)[...,4800.0,EC50
2441,CHEMBL5194570,CC(C)(C)OC(=O)NC(COCc1ccc(-c2ccccc2)cc1)C(=O)O,7200.0,EC50
2442,CHEMBL5207420,CC(SCc1ccccc1)C(NC(=O)c1ccccc1)C(=O)O,9100.0,EC50
2443,CHEMBL5186824,CC(C)(C)OC(=O)NC(COCc1ccc(Cc2ccccc2)cc1)C(=O)O,10000.0,EC50


Combine two data frames

In [23]:
df_f = pd.concat([df9,df10], axis=0)
df_f

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,type
22,CHEMBL15594,CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)O,1200.000000,[IC50]
24,CHEMBL167430,CCO[C@@H](Cc1ccc(OCCn2c3ccccc3c3cc(Br)ccc32)cc...,2399.220000,"[pEC50, EC50]"
44,CHEMBL192518,O=C(O)[C@H](Cc1ccccc1)Oc1ccc(C(F)(F)F)cc1,500.396667,"[EC50, pEC50]"
46,CHEMBL1934484,Cc1ccc(-c2nc(C)c(C(=O)N[C@H]3CCCN(c4cccc(C(=O)...,6.000000,[EC50]
69,CHEMBL212911,CCc1ccc(O[C@H](C)CCOc2ccc(CCC(=O)O)c(C)c2)c(C(...,4257.000000,[IC50]
...,...,...,...,...
2440,CHEMBL5194126,CC(C)(C)OC(=O)NC(CSCc1ccc(C(=O)c2ccc([N+](=O)[...,4800.000000,EC50
2441,CHEMBL5194570,CC(C)(C)OC(=O)NC(COCc1ccc(-c2ccccc2)cc1)C(=O)O,7200.000000,EC50
2442,CHEMBL5207420,CC(SCc1ccccc1)C(NC(=O)c1ccccc1)C(=O)O,9100.000000,EC50
2443,CHEMBL5186824,CC(C)(C)OC(=O)NC(COCc1ccc(Cc2ccccc2)cc1)C(=O)O,10000.000000,EC50


In [24]:
df_f.to_csv('ppara_bioactivity_data_preprocessed.csv', index=False)

In [25]:
df_f['type'].value_counts()

EC50                 869
IC50                 513
pEC50                 91
Ki                    86
pIC50                 67
[EC50]                13
Log Ki                 9
[IC50]                 7
Kd                     5
[EC50, Kd]             1
[pEC50, EC50]          1
[Ki, EC50]             1
[pEC50, EC50]          1
[IC50, EC50]           1
[EC50, pEC50]          1
[EC50, Kd, pEC50]      1
[IC50, Ki, EC50]       1
[pEC50, EC50]          1
[pEC50, EC50]          1
[pEC50, EC50]          1
[pEC50, EC50]          1
[EC50, pEC50]          1
[EC50, pEC50]          1
Name: type, dtype: int64