## DATA COLLECTION
##### ChEMBL is a large-scale, open bioactivity database.
##### chembl_webresource_client is the Python library developed and supported by the ChEMBL group. It provides programmatic access to ChEMBL data.


In [2]:
import pandas as pd #for data processing 
from chembl_webresource_client.new_client import new_client

###### Let's download the biological activity data from the ChEMBL database. The dataset comprises compounds that have been biologically tested for their activity towards the target. ######

In [3]:
# Target resource of the ChEMBL web client.
target = new_client.target  
# Search the targets for 'FLT3', the gene symbol of the tyrosine-protein
# kinase receptor FLT3 (the search returns entries for several organisms).
target_query = target.search('FLT3')
# Load the search hits into a DataFrame so they can be inspected and indexed.
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Mus musculus,Receptor-type tyrosine-protein kinase FLT3,18.0,False,CHEMBL2034796,"[{'accession': 'Q00342', 'component_descriptio...",SINGLE PROTEIN,10090
1,"[{'xref_id': 'P36888', 'xref_name': None, 'xre...",Homo sapiens,Tyrosine-protein kinase receptor FLT3,16.0,False,CHEMBL1974,"[{'accession': 'P36888', 'component_descriptio...",SINGLE PROTEIN,9606


In [4]:
# Select our target: in the search result shown above, label 1 is the
# Homo sapiens entry (CHEMBL1974) and label 0 is the Mus musculus entry.
# NOTE(review): the hard-coded label depends on the ordering of the search
# results; confirm it still points at the human target when re-running.
selected_target = targets.target_chembl_id[1]  
selected_target

'CHEMBL1974'

##### Let's retrieve the biological activity data for the tyrosine-protein kinase receptor FLT3 that are reported as IC50 values in nM

In [5]:
# Activity resource of the ChEMBL web client.
activity = new_client.activity  
# Keep only the activity records measured against the selected target whose
# standard_type is IC50.
# NOTE(review): there is no filter on standard_units here; the later
# thresholds assume standard_value is expressed in nM, so confirm.
data = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [8]:
# Convert the query result into a pandas DataFrame and preview the first rows.
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,bao_endpoint,bao_format,bao_label,canonical_smiles,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,866063,[],CHEMBL766072,Inhibition of chimeric PDGF receptor with FLT-...,B,BAO_0000190,BAO_0000219,cell-based format,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,...,Homo sapiens,Tyrosine-protein kinase receptor FLT3,9606,,,IC50,uM,UO_0000065,,0.128
1,,872532,[],CHEMBL766072,Inhibition of chimeric PDGF receptor with FLT-...,B,BAO_0000190,BAO_0000219,cell-based format,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,...,Homo sapiens,Tyrosine-protein kinase receptor FLT3,9606,,,IC50,uM,UO_0000065,,0.22
2,,872564,[],CHEMBL766072,Inhibition of chimeric PDGF receptor with FLT-...,B,BAO_0000190,BAO_0000219,cell-based format,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,...,Homo sapiens,Tyrosine-protein kinase receptor FLT3,9606,,,IC50,uM,UO_0000065,,8.79
3,,879718,[],CHEMBL766072,Inhibition of chimeric PDGF receptor with FLT-...,B,BAO_0000190,BAO_0000219,cell-based format,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,...,Homo sapiens,Tyrosine-protein kinase receptor FLT3,9606,,,IC50,uM,UO_0000065,,1.91
4,,884645,[],CHEMBL766072,Inhibition of chimeric PDGF receptor with FLT-...,B,BAO_0000190,BAO_0000219,cell-based format,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,...,Homo sapiens,Tyrosine-protein kinase receptor FLT3,9606,,,IC50,uM,UO_0000065,,30.0


In [10]:
# Dimensions of the raw activity dataframe as (rows, columns)
df.shape 

(3464, 43)

In [13]:
# Drop every record whose standard_value is missing (NaN/None); the other
# columns are left untouched and the original row labels are preserved.
df2 = df.dropna(subset=['standard_value'])
df2.head()

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,bao_endpoint,bao_format,bao_label,canonical_smiles,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,866063,[],CHEMBL766072,Inhibition of chimeric PDGF receptor with FLT-...,B,BAO_0000190,BAO_0000219,cell-based format,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,...,Homo sapiens,Tyrosine-protein kinase receptor FLT3,9606,,,IC50,uM,UO_0000065,,0.128
1,,872532,[],CHEMBL766072,Inhibition of chimeric PDGF receptor with FLT-...,B,BAO_0000190,BAO_0000219,cell-based format,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,...,Homo sapiens,Tyrosine-protein kinase receptor FLT3,9606,,,IC50,uM,UO_0000065,,0.22
2,,872564,[],CHEMBL766072,Inhibition of chimeric PDGF receptor with FLT-...,B,BAO_0000190,BAO_0000219,cell-based format,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,...,Homo sapiens,Tyrosine-protein kinase receptor FLT3,9606,,,IC50,uM,UO_0000065,,8.79
3,,879718,[],CHEMBL766072,Inhibition of chimeric PDGF receptor with FLT-...,B,BAO_0000190,BAO_0000219,cell-based format,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,...,Homo sapiens,Tyrosine-protein kinase receptor FLT3,9606,,,IC50,uM,UO_0000065,,1.91
4,,884645,[],CHEMBL766072,Inhibition of chimeric PDGF receptor with FLT-...,B,BAO_0000190,BAO_0000219,cell-based format,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,...,Homo sapiens,Tyrosine-protein kinase receptor FLT3,9606,,,IC50,uM,UO_0000065,,30.0


In [16]:
# Dimensions after removing the rows without a standard_value
df2.shape 

(3383, 43)

##### 81 entries with a NaN standard value were removed (3464 → 3383 rows)

#### Let's keep only the features/columns that are important for model training

In [17]:
# Columns retained for modelling: compound identifier, structure (SMILES)
# and the potency reported as standard_value.
columns = [
    'molecule_chembl_id',
    'canonical_smiles',
    'standard_value',
]
df3 = df2.loc[:, columns]
df3.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL330863,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,128.0
1,CHEMBL124660,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,220.0
2,CHEMBL126699,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,8790.0
3,CHEMBL445636,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,1910.0
4,CHEMBL941,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,30000.0


#### Labeling the compounds as Active/Inactive/Intermediate

Compounds are labeled (Active/Inactive/Intermediate) based on their potency value (standard_value, in nM).
Compounds with values ≤ 1000 nM are considered active, those with values ≥ 10000 nM are considered inactive, and those in between are labeled intermediate.

In [18]:
# Assign one label per compound from its standard_value:
#   >= 10000 -> "inactive", <= 1000 -> "active", anything else -> "intermediate".
# The values are passed through float() before comparing, and the labels are
# produced in the same order as the rows of df3.
bioactivity_threshold = [
    "inactive" if float(potency) >= 10000
    else "active" if float(potency) <= 1000
    else "intermediate"
    for potency in df3.standard_value
]

#### Let's Add Bioactivity data to our dataset

In [19]:
# Attach the labels to the dataset as a new 'bioactivity' column.
# The labels were generated row by row from df3, so they are in df3's row
# order. df3 still carries the gapped index left over from dropping the NaN
# rows, and pd.concat(axis=1) aligns on index labels. The Series therefore has
# to reuse df3's index; with a default 0..n-1 index the labels would be
# attached to the wrong compounds and extra all-NaN rows would be created.
bioactivity_class = pd.Series(bioactivity_threshold, index=df3.index, name='bioactivity')
df5 = pd.concat([df3, bioactivity_class], axis=1)
df5.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity
0,CHEMBL330863,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,128.0,active
1,CHEMBL124660,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,220.0,active
2,CHEMBL126699,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,8790.0,intermediate
3,CHEMBL445636,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,1910.0,intermediate
4,CHEMBL941,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,30000.0,inactive


In [20]:
# Number of compounds in each bioactivity class
df5.bioactivity.value_counts()

active          1645
inactive        1283
intermediate     455
Name: bioactivity, dtype: int64

###### There are 1645 active, 1283 inactive and 455 intermediate compounds/molecules present in our dataset

##### Let's save this dataset

In [None]:
# Write the labelled dataset to a CSV file, omitting the DataFrame index
df5.to_csv('TPKR_data2.csv',index=False)