In [2]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

### Search for Target Protein

#### Target search for coronavirus in the ChEMBL database

In [3]:
# Run a free-text search for 'coronavirus' against the ChEMBL target endpoint.
target = new_client.target
target_query = target.search('coronavirus')

# Tabulate the search hits and display them as the cell output.
targets = pd.DataFrame.from_dict(data=target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],SARS coronavirus,SARS coronavirus,15.0,False,CHEMBL612575,[],ORGANISM,227859
2,[],Feline coronavirus,Feline coronavirus,15.0,False,CHEMBL612744,[],ORGANISM,12663
3,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137
4,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859
5,[],Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,False,CHEMBL4296578,[],ORGANISM,1335626
6,"[{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...",SARS coronavirus,Replicase polyprotein 1ab,4.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,227859
7,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


### Select and retrieve bioactivity data for SARS coronavirus 3C-like proteinase

In [4]:
# Select the target by its preferred name instead of a hard-coded row label:
# a hit's position in the search results depends on the ranking returned by
# the search, so a fixed index can silently pick a different target.
target_name = 'SARS coronavirus 3C-like proteinase'
matching_ids = targets.loc[targets.pref_name == target_name, 'target_chembl_id']
if matching_ids.empty:
    # Fail loudly rather than continuing with an undefined or wrong target.
    raise LookupError("No target named %r in the search results" % target_name)
selected_target = matching_ids.iloc[0]
selected_target

'CHEMBL3927'

The next step is to retrieve only the bioactivity data for the SARS coronavirus 3C-like proteinase (CHEMBL3927) that are reported as $IC_{50}$ values in nM (nanomolar) units.

In [5]:
# Query the activity endpoint, keeping only records for the selected target
# whose standard_type is IC50.
activity = new_client.activity
res = (
    activity
    .filter(target_chembl_id=selected_target)
    .filter(standard_type="IC50")
)

In [6]:
# Load the filtered activity records into a DataFrame.
df = pd.DataFrame.from_dict(data=res)

In [7]:
# Preview the first ten activity records.
df.head(n=10)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2
1,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4
2,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5
3,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.11
4,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,2.0
5,,1481068,[],CHEMBL828143,In vitro inhibitory concentration SARS coronav...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,0.98
6,,1481088,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,4.82
7,,1481089,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,0.95
8,,1481093,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,11.2
9,,1481209,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,23.5


Confirm that the **standard_type** column contains only $IC_{50}$ entries.

In [8]:
# Report how many distinct standard_type values exist and what they are.
standard_types = df.standard_type
print("Number of unique values: ", standard_types.nunique())
print("Unique values: ", standard_types.unique())

Number of unique values:  1
Unique values:  ['IC50']


In [9]:
# Show the first rows of the standard_value column as a one-column frame.
df.loc[:, ["standard_value"]].head()

Unnamed: 0,standard_value
0,7200.0
1,9400.0
2,13500.0
3,13110.0
4,2000.0


**standard_value** indicates the potency of the drug: the higher the value, the worse the potency.

>The value is the concentration of the drug required to reach the measured effect (for $IC_{50}$, 50% inhibition of the target). A lower number is therefore a good potency indicator, as only a small concentration is needed for the drug to serve as an efficient agent.

#### Saving the resulting bioactivity data to a CSV file

In [10]:
# Persist the raw bioactivity table without writing the row index.
df.to_csv(path_or_buf='bioactivity_data.csv', index=False)

### If using Google Drive and Colab, follow these steps to create a directory in Drive and copy your CSV files into it

`from google.colab import drive`

`drive.mount('/content/gdrive', force_remount=True)`

Create the directory to copy your data to

`! mkdir -p "/content/gdrive/My Drive/data"`

Copy the csv file to the directory created

`! cp bioactivity_data.csv "/content/gdrive/My Drive/data"`

Check that the data was copied to the desired directory. You should be able to see the files in the directory once you run the command

`! ls "/content/gdrive/My Drive/data"`

To check when the data was copied to the directory, add the `-l` flag after `ls` in the command above.

If you want to check the content of the CSV file before loading it into your program, you can do so like this:

`! head bioactivity_data.csv`

### Handling Missing Data

If any value is missing in the **standard_value** column, we drop the corresponding row (record).

In [17]:
# Keep only the records that have a standard_value.
has_standard_value = df["standard_value"].notna()
df2 = df.loc[has_standard_value]

In [20]:
# Preview the first five rows of the filtered frame.
df2.head(n=5)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2
1,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4
2,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5
3,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.11
4,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,2.0


In [30]:
# Count the missing values in each column of the unfiltered frame.
df.isna().sum()

activity_comment             133
activity_id                    0
activity_properties            0
assay_chembl_id                0
assay_description              0
assay_type                     0
assay_variant_accession      133
assay_variant_mutation       133
bao_endpoint                   0
bao_format                     0
bao_label                      0
canonical_smiles               0
data_validity_comment         91
data_validity_description     91
document_chembl_id             0
document_journal               0
document_year                  0
ligand_efficiency             42
molecule_chembl_id             0
molecule_pref_name           107
parent_molecule_chembl_id      0
pchembl_value                 42
potential_duplicate            0
qudt_units                     0
record_id                      0
relation                       0
src_id                         0
standard_flag                  0
standard_relation              0
standard_text_value          133
standard_t

In [27]:
# Inspect the activity_comment column of the filtered frame.
df2["activity_comment"]

0      None
1      None
2      None
3      None
4      None
       ... 
128    None
129    None
130    None
131    None
132    None
Name: activity_comment, Length: 133, dtype: object