## **Importing libraries**

In [27]:
# Import necessary libraries
import numpy as np
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target protein**

### **Target search for shp2**

In [2]:
# Target search for SHP2: query ChEMBL with the keyword 'shp2' and tabulate the hits
target = new_client.target
target_query = target.search('shp2')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P35235', 'xref_name': None, 'xre...",Mus musculus,Protein-tyrosine phosphatase 2C,17.0,False,CHEMBL2620,"[{'accession': 'P35235', 'component_descriptio...",SINGLE PROTEIN,10090
1,"[{'xref_id': 'Q06124', 'xref_name': None, 'xre...",Homo sapiens,Protein-tyrosine phosphatase 2C,15.0,False,CHEMBL3864,"[{'accession': 'Q06124', 'component_descriptio...",SINGLE PROTEIN,9606
2,[],Homo sapiens,VHL/Protein-tyrosine phosphatase 2C,13.0,False,CHEMBL4630742,"[{'accession': 'Q06124', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606


In [3]:
# Select the human single-protein entry by its attributes instead of by row
# position: row order comes from the search response and is not guaranteed
# to stay the same between runs.
human_single_protein = targets[
    (targets.organism == 'Homo sapiens') & (targets.target_type == 'SINGLE PROTEIN')
]
# .iloc[0] takes the first matching row regardless of its index label.
selected_target = human_single_protein.target_chembl_id.iloc[0]
selected_target

'CHEMBL3864'

Here, we will retrieve only the bioactivity data for *Protein-tyrosine phosphatase 2C* (SHP2, CHEMBL3864) that are reported as IC$_{50}$ values in nM (nanomolar) units.

In [4]:
# Query ChEMBL activity records for the selected target, keeping only
# those whose standard_type is IC50.
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [5]:
# Load the filtered activity records into a DataFrame for inspection and export.
df = pd.DataFrame.from_dict(res)

In [7]:
# Display the retrieved activity records.
df

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,185463,[],CHEMBL764641,Inhibition of Syp(N) SH2 domain block binding ...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,UO_0000065,,100.0
1,,213040,[],CHEMBL764641,Inhibition of Syp(N) SH2 domain block binding ...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,UO_0000065,,100.0
2,,558682,[],CHEMBL805163,Inhibitory activity against SH2 domain of SH-P...,F,,,BAO_0000190,BAO_0000019,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,UO_0000065,,1.54
3,,576834,[],CHEMBL805163,Inhibitory activity against SH2 domain of SH-P...,F,,,BAO_0000190,BAO_0000019,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,UO_0000065,,22.0
4,,596575,[],CHEMBL812686,In vitro inhibitory concentration required aga...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,UO_0000065,,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1234,,22982901,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4773301,Inhibition of recombinant SHP2 (unknown origin...,B,,,BAO_0000179,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,,,3.32
1235,,22982902,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4773301,Inhibition of recombinant SHP2 (unknown origin...,B,,,BAO_0000179,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,,,1.24
1236,,22982903,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4773301,Inhibition of recombinant SHP2 (unknown origin...,B,,,BAO_0000179,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,,,3.97
1237,Not Determined,22982904,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4773301,Inhibition of recombinant SHP2 (unknown origin...,B,,,BAO_0000179,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,,,,


In [8]:
# Sanity check: list the distinct standard_type values present in the result.
df.standard_type.unique()

array(['IC50'], dtype=object)

Next, we will save the resulting bioactivity data to a CSV file, **shp_bioactivity_data.csv**.

In [9]:
# Save the unfiltered query result to disk, leaving out the DataFrame index.
df.to_csv('shp_bioactivity_data.csv', index=False)

## **Handling missing data**
If any compound has a missing value in the **standard_value** column, drop it.

In [10]:
# Keep only the records that report a standard_value; the classification
# step further down converts this column to float for every row.
df2 = df.dropna(subset=['standard_value'])
df2

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,185463,[],CHEMBL764641,Inhibition of Syp(N) SH2 domain block binding ...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,UO_0000065,,100.0
1,,213040,[],CHEMBL764641,Inhibition of Syp(N) SH2 domain block binding ...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,UO_0000065,,100.0
2,,558682,[],CHEMBL805163,Inhibitory activity against SH2 domain of SH-P...,F,,,BAO_0000190,BAO_0000019,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,UO_0000065,,1.54
3,,576834,[],CHEMBL805163,Inhibitory activity against SH2 domain of SH-P...,F,,,BAO_0000190,BAO_0000019,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,UO_0000065,,22.0
4,,596575,[],CHEMBL812686,In vitro inhibitory concentration required aga...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,UO_0000065,,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1233,,22982900,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4773301,Inhibition of recombinant SHP2 (unknown origin...,B,,,BAO_0000179,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,,,100.0
1234,,22982901,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4773301,Inhibition of recombinant SHP2 (unknown origin...,B,,,BAO_0000179,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,,,3.32
1235,,22982902,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4773301,Inhibition of recombinant SHP2 (unknown origin...,B,,,BAO_0000179,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,,,1.24
1236,,22982903,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4773301,Inhibition of recombinant SHP2 (unknown origin...,B,,,BAO_0000179,BAO_0000357,...,Homo sapiens,Protein-tyrosine phosphatase 2C,9606,,,IC50,uM,,,3.97


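For this dataset, a number of records have no **standard_value** (for example, the last row above is annotated "Not Determined") and are removed by this step. The same code cell can be reused for bioactivity data of other target proteins.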

## **Data pre-processing of the bioactivity data**

In [11]:
# Label each compound from its IC50 (standard_value): values above the
# threshold are "inactive", everything else is "active".
# The notebook's narration describes these values as nM, so 1000 is 1 uM.
IC50_ACTIVITY_THRESHOLD = 1000

# A single if/else (rather than two independent comparisons) guarantees
# exactly one label per row, so this list always has the same length as the
# other per-compound lists it is later zipped with.
bioactivity_class = [
    "inactive" if float(value) > IC50_ACTIVITY_THRESHOLD else "active"
    for value in df2.standard_value
]

### **Collect the *molecule_chembl_id* column into a list**

In [12]:
# Pull the compound identifiers out of the filtered frame as a plain list.
mol_cid = list(df2.molecule_chembl_id)

### **Collect the *canonical_smiles* column into a list**

In [13]:
# Pull the SMILES strings out of the filtered frame as a plain list.
canonical_smiles = list(df2.canonical_smiles)

### **Collect the *standard_value* column into a list**

In [14]:
# Pull the IC50 values out of the filtered frame as a plain list.
standard_value = list(df2.standard_value)

### **Combine the 4 lists into a dataframe**

In [15]:
# Stitch the four per-compound lists together row by row, then build the
# curated frame with one named column per list.
output_columns = [
    'molecule_chembl_id',
    'canonical_smiles',
    'bioactivity_class',
    'standard_value',
]
data_tuples = list(
    zip(mol_cid, canonical_smiles, bioactivity_class, standard_value)
)
df3 = pd.DataFrame(data_tuples, columns=output_columns)

In [16]:
# Display the assembled dataset.
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,standard_value
0,CHEMBL328907,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...,inactive,100000.0
1,CHEMBL2092743,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...,inactive,100000.0
2,CHEMBL438997,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(OP(=O)(O)O)...,inactive,1540.0
3,CHEMBL263010,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(OC(C(=O)O)C...,inactive,22000.0
4,CHEMBL420456,Nc1nc(N)c2nc(CN3CCN(Cc4ccc(-c5ccccc5)cc4)CC3)n...,inactive,5000.0
...,...,...,...,...
1140,CHEMBL4778849,CC(C)(C)c1ccc(Oc2ccc(NC(=O)C3=CO[C@@H](O[C@@H]...,inactive,100000.0
1141,CHEMBL4786441,O=C(O)C1=CO[C@@H](O[C@@H]2O[C@H](COCc3ccccc3)[...,inactive,3320.0
1142,CHEMBL4788294,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,inactive,1240.0
1143,CHEMBL4800140,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,inactive,3970.0


In [17]:
# Count how many compounds fall into each bioactivity class.
df3['bioactivity_class'].value_counts()

inactive    920
active      225
Name: bioactivity_class, dtype: int64

In [None]:
# Write the curated dataset to disk, leaving out the DataFrame index.
df3.to_csv('shp2_chembl.csv', index=False)