In [38]:
# import packages

# global imports
import pandas as pd
import numpy as np
from chembl_webresource_client.new_client import new_client

# RDKit
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

In [40]:
# extract hERG data from ChEMBL

# preparing the ChEMBL client
# (new_client is already imported in the import cell at the top of the notebook)
target = new_client.target
activity = new_client.activity

# getting the hERG IC50 data from ChEMBL compounds
# NOTE(review): this takes the first hit of a free-text search for 'herg';
# confirm that the first hit is the intended target before relying on the data.
herg = target.search('herg')[0]

# all IC50 records of the selected target, whatever their relation
# ("=", "<", ">", "~", ...); unwanted relations are filtered out further down
herg_activities = (
    activity
    .filter(target_chembl_id = herg['target_chembl_id'])
    .filter(standard_type = "IC50")
)

print(len(herg_activities))

11886


In [41]:
# list the names of all data fields that ChEMBL returns for one activity record
# (the first record of the query is used as the example)

first_activity = herg_activities[0]
for field_name in first_activity:
    print(field_name)

activity_comment
activity_id
activity_properties
assay_chembl_id
assay_description
assay_type
bao_endpoint
bao_format
bao_label
canonical_smiles
data_validity_comment
data_validity_description
document_chembl_id
document_journal
document_year
ligand_efficiency
molecule_chembl_id
molecule_pref_name
parent_molecule_chembl_id
pchembl_value
potential_duplicate
qudt_units
record_id
relation
src_id
standard_flag
standard_relation
standard_text_value
standard_type
standard_units
standard_upper_value
standard_value
target_chembl_id
target_organism
target_pref_name
target_tax_id
text_value
toid
type
units
uo_units
upper_value
value


In [42]:
# write extracted data from herg_activities into a pandas dataframe

# dataframe column -> name of the ChEMBL activity field that fills it
# ('Class' has no source field, so it stays empty in this cell)
column_to_field = {
    'Smiles': 'canonical_smiles',
    'Value_type': 'standard_type',
    'Relation': 'standard_relation',
    'Activity': 'standard_value',
    'Unit': 'standard_units',
    'Assay_type': 'assay_type',
    'Assay_id': 'assay_chembl_id',
}

# define empty dataframe and parameters
herg_dataframe = pd.DataFrame(columns = ['Smiles', 'Class', 'Value_type', 'Relation', 'Activity', 'Unit', 'Assay_type', 'Assay_id'])
# number of activity records to extract; use a smaller number (e.g. 1000) for a quick test run
limit = len(herg_activities)

# look up every activity record exactly once; the columns below are then
# built from this in-memory list instead of querying the result set per column
herg_records = [herg_activities[index] for index in range(0, limit)]

# extracting the hERG data from the query
for column, field in column_to_field.items():
    herg_dataframe[column] = [record[field] for record in herg_records]

# change datatype of activity values to float
herg_dataframe = herg_dataframe.astype({"Activity": float})

print(len(herg_dataframe))
display(herg_dataframe.head())

11886


Unnamed: 0,Smiles,Class,Value_type,Relation,Activity,Unit,Assay_type,Assay_id
0,O=C1NCCN1CCN1CCC(c2cn(-c3ccc(F)cc3)c3ccc(Cl)cc...,,IC50,=,14.0,nM,T,CHEMBL841079
1,O=C1NCCN1CCN1CCC(c2cn(-c3ccc(F)cc3)c3ccc(Cl)cc...,,IC50,~,3.0,nM,F,CHEMBL841078
2,O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F...,,IC50,=,32.2,nM,F,CHEMBL691014
3,O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c...,,IC50,=,5950.0,nM,F,CHEMBL691013
4,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,,IC50,=,143.0,nM,F,CHEMBL691014


In [43]:
# keep only rows that are reported in nM, have one of the relations "=", ">" or "<",
# and come from a binding assay (B) or a functional assay (F)

has_nM_unit = herg_dataframe['Unit'] == 'nM'
has_usable_relation = herg_dataframe['Relation'].isin(['=', '>', '<'])
from_binding_or_functional_assay = herg_dataframe['Assay_type'].isin(['B', 'F'])

herg_dataframe = herg_dataframe[has_nM_unit & has_usable_relation & from_binding_or_functional_assay]

print(len(herg_dataframe))
display(herg_dataframe.head())

10161


Unnamed: 0,Smiles,Class,Value_type,Relation,Activity,Unit,Assay_type,Assay_id
2,O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F...,,IC50,=,32.2,nM,F,CHEMBL691014
3,O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c...,,IC50,=,5950.0,nM,F,CHEMBL691013
4,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,,IC50,=,143.0,nM,F,CHEMBL691014
5,CCCCN(CCCC)CCC(O)c1cc2c(Cl)cc(Cl)cc2c2cc(C(F)(...,,IC50,=,196.0,nM,F,CHEMBL877203
6,CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc3cccnc32)CC1,,IC50,=,173.0,nM,F,CHEMBL691014


In [44]:
# add class labels to dataframe according to IC50 threshold in nM and remove rows with inconclusive relation-activity pairs
# order of pattern: IC50 "Relation" "Activity"
#
# The rules are applied to whole columns at once instead of row by row, so the
# dataframe is not copied once per dropped row and the ChEMBL client handle
# named `activity` is not overwritten by a loop variable.

IC50_threshold = 10000

relation_column = herg_dataframe["Relation"]
ic50_column = herg_dataframe["Activity"]

# delete inconclusive rows:
#   IC50 > x with x below the threshold -> the true value may lie on either side
#   IC50 < x with x above the threshold -> the true value may lie on either side
is_inconclusive = (
    ((relation_column == ">") & (ic50_column < IC50_threshold))
    | ((relation_column == "<") & (ic50_column > IC50_threshold))
)
herg_dataframe = herg_dataframe[~is_inconclusive].copy()

# re-read the columns so that the masks below line up with the remaining rows
relation_column = herg_dataframe["Relation"]
ic50_column = herg_dataframe["Activity"]

# add class labels for conclusive rows
#   Class 1: IC50 known to be below the threshold ("<" up to and including it, "=" strictly below)
#   Class 0: IC50 known to be at or above the threshold (">" or "=" with a value >= threshold)
# rows matching neither rule (e.g. a missing activity value) keep an empty Class
is_class_1 = (
    ((relation_column == "<") & (ic50_column <= IC50_threshold))
    | ((relation_column == "=") & (ic50_column < IC50_threshold))
)
is_class_0 = (
    ((relation_column == ">") | (relation_column == "="))
    & (ic50_column >= IC50_threshold)
)

herg_dataframe.loc[is_class_1, "Class"] = 1
herg_dataframe.loc[is_class_0, "Class"] = 0

print(len(herg_dataframe[herg_dataframe["Class"] == 0]))
print(len(herg_dataframe[herg_dataframe["Class"] == 1]))

4874
5013


In [46]:
# check for missing values (i.e. NA values):
# for every column, show the rows in which that column is empty

columns_to_check = ['Smiles', 'Class', 'Value_type', 'Relation', 'Activity', 'Unit', 'Assay_type', 'Assay_id']

for column_name in columns_to_check:
    rows_with_missing_value = herg_dataframe[herg_dataframe[column_name].isnull()]
    display(rows_with_missing_value)

Unnamed: 0,Smiles,Class,Value_type,Relation,Activity,Unit,Assay_type,Assay_id


Unnamed: 0,Smiles,Class,Value_type,Relation,Activity,Unit,Assay_type,Assay_id


Unnamed: 0,Smiles,Class,Value_type,Relation,Activity,Unit,Assay_type,Assay_id


Unnamed: 0,Smiles,Class,Value_type,Relation,Activity,Unit,Assay_type,Assay_id


Unnamed: 0,Smiles,Class,Value_type,Relation,Activity,Unit,Assay_type,Assay_id


Unnamed: 0,Smiles,Class,Value_type,Relation,Activity,Unit,Assay_type,Assay_id


Unnamed: 0,Smiles,Class,Value_type,Relation,Activity,Unit,Assay_type,Assay_id


Unnamed: 0,Smiles,Class,Value_type,Relation,Activity,Unit,Assay_type,Assay_id


In [47]:
# save the pandas dataframe as a csv file (the row index is not written)

# NOTE(review): absolute, machine-specific path; the target directory must already exist
output_csv_path = "/home/user/Desktop/Lhasa_Mini_Project/Code/Code_for_Lhasa_Mini_Project_from_Markus/data/ChEMBL_hERG/ChEMBL_hERG_IC50_data.csv"

herg_dataframe.to_csv(output_csv_path, index=False)