## This project curates a bioactivity dataset for drug discovery against the Influenza HA (haemagglutinin) protein

# Installing Libraries

In [1]:
# chembl web_resource client
!pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting easydict (from chembl_webresource_client)
  Downloading easydict-1.13-py3-none-any.whl.metadata (4.2 kB)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-24.1.2-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
Downloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
Downloading easydict-1.13-py3-none-any.whl (6.8 kB)
Downloading cattrs-24.1.2-py3-none-any.whl (66 kB)
Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: easydict,

In [2]:
# import libraries
import pandas as pd
# instantiate a new client from chembl
from chembl_webresource_client.new_client import new_client

# **Search for Target Protein**

### *Target Search: Influenza Haemagglutinin*

In [3]:
# target resource of the ChEMBL web client
target = new_client.target
# search keyword
target_query = target.search('influenza')
# convert results to a pandas dataframe
targets = pd.DataFrame.from_dict(target_query)
# display dataframe contents
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],unidentified influenza virus,unidentified influenza virus,12.0,False,CHEMBL613128,[],ORGANISM,11309
1,[],Influenza B virus,Influenza B virus,12.0,False,CHEMBL613129,[],ORGANISM,11520
2,[],Influenza A virus,Influenza A virus,12.0,False,CHEMBL613740,[],ORGANISM,11320
3,[],Influenza C virus,Influenza C virus,12.0,False,CHEMBL612783,[],ORGANISM,11552
4,"[{'xref_id': 'P03438', 'xref_name': None, 'xre...",Influenza A virus (strain A/X-31 H3N2),Influenza A virus Hemagglutinin,11.0,False,CHEMBL4918,"[{'accession': 'P03438', 'component_descriptio...",SINGLE PROTEIN,132504
5,[],Influenza A virus H3N2,Influenza A virus H3N2,11.0,False,CHEMBL2366902,[],ORGANISM,41857
6,[],Influenza A virus (H1N1),Influenza A virus (H1N1),11.0,False,CHEMBL2367089,[],ORGANISM,1323429
7,"[{'xref_id': 'P63231', 'xref_name': None, 'xre...",Influenza A virus (A/Udorn/307/1972(H3N2)),Influenza virus A matrix protein M2,9.0,False,CHEMBL2052,"[{'accession': 'P0DOF8', 'component_descriptio...",SINGLE PROTEIN,381517
8,[],Influenza A virus (A/Puerto Rico/8/1934(H1N1)),Influenza A virus (A/Puerto Rico/8/1934(H1N1)),8.0,False,CHEMBL612610,[],ORGANISM,211044
9,[],Haemophilus influenzae,Haemophilus influenzae,7.0,False,CHEMBL355,[],ORGANISM,727


## **Select haemagglutinin protein and retrieve bioactivity data using the target id**

In [4]:
# Select the haemagglutinin target by its preferred name instead of by row
# position: a hard-coded index silently picks a different target if the
# search results come back in another order.
ha_matches = targets.loc[
    targets.pref_name == "Influenza A virus Hemagglutinin", "target_chembl_id"
]
# fail loudly here rather than querying activities for the wrong target
assert not ha_matches.empty, "Influenza A virus Hemagglutinin not found in search results"
influenza_target = ha_matches.iloc[0]
influenza_target

'CHEMBL4918'

In [5]:
# query the activity resource for IC50 records measured against the selected target
activity = new_client.activity
res = (
    activity
    .filter(target_chembl_id=influenza_target)
    .filter(standard_type="IC50")
)
# load the returned records into a pandas dataframe
res_df = pd.DataFrame.from_dict(res)

In [6]:
# show the IC50 activity records retrieved for the selected target
res_df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,12201437,[],CHEMBL2209966,Inhibition of hemagglutininin in Influenza A v...,B,,,BAO_0000190,...,Influenza A virus (strain A/X-31 H3N2),Influenza A virus Hemagglutinin,132504,,,IC50,uM,UO_0000065,,2.6
1,,,12201438,[],CHEMBL2209966,Inhibition of hemagglutininin in Influenza A v...,B,,,BAO_0000190,...,Influenza A virus (strain A/X-31 H3N2),Influenza A virus Hemagglutinin,132504,,,IC50,uM,UO_0000065,,3.7


In [7]:
# sanity check: list the distinct measurement types present in the results
res_df["standard_type"].unique()

array(['IC50'], dtype=object)

# **Save Results to CSV**

In [8]:
# write the activity table to disk; index=False keeps the row index out of the file
res_df.to_csv("influenza_ha_bioactivity.csv", index=False)

## **Data Preparation**

In [3]:
# load the bioactivity table from the CSV file and display it
inf_data = pd.read_csv('influenza_ha_bioactivity.csv')
inf_data

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,12201437,[],CHEMBL2209966,Inhibition of hemagglutininin in Influenza A v...,B,,,BAO_0000190,...,Influenza A virus (strain A/X-31 H3N2),Influenza A virus Hemagglutinin,132504,,,IC50,uM,UO_0000065,,2.6
1,,,12201438,[],CHEMBL2209966,Inhibition of hemagglutininin in Influenza A v...,B,,,BAO_0000190,...,Influenza A virus (strain A/X-31 H3N2),Influenza A virus Hemagglutinin,132504,,,IC50,uM,UO_0000065,,3.7


In [8]:
# keep only the rows that have a standard_value (rows where it is null/NaN are dropped)
inf_data_1 = inf_data.dropna(subset=["standard_value"])
inf_data_1

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,12201437,[],CHEMBL2209966,Inhibition of hemagglutininin in Influenza A v...,B,,,BAO_0000190,...,Influenza A virus (strain A/X-31 H3N2),Influenza A virus Hemagglutinin,132504,,,IC50,uM,UO_0000065,,2.6
1,,,12201438,[],CHEMBL2209966,Inhibition of hemagglutininin in Influenza A v...,B,,,BAO_0000190,...,Influenza A virus (strain A/X-31 H3N2),Influenza A virus Hemagglutinin,132504,,,IC50,uM,UO_0000065,,3.7


## *Label compounds as active, inactive or intermediate*
### Standard values (in nM) of 1000 nM (1 µM) or less are active | those of 10000 nM (10 µM) or more are inactive | those in between are intermediate

In [9]:
def classify_bioactivity(value):
    """Return the bioactivity label for a single standard value.

    Values of 10000 or more are "inactive", values of 1000 or less are
    "active", and anything in between is "intermediate".
    """
    numeric = float(value)
    if numeric >= 10000:
        return "inactive"
    if numeric <= 1000:
        return "active"
    return "intermediate"


# one label per row of inf_data_1, in row order
bioactivity_class = [classify_bioactivity(v) for v in inf_data_1.standard_value]

In [10]:
# Pull the three columns of interest out of the filtered table as plain lists.
# Series.tolist() replaces the element-by-element append loops, one of which
# used `id` as its loop variable and so overwrote the built-in id() for the
# rest of the notebook session.
mol_chembl_id = inf_data_1.molecule_chembl_id.tolist()
canon_smiles = inf_data_1.canonical_smiles.tolist()
stand_value = inf_data_1.standard_value.tolist()

# Combine the three columns with the bioactivity labels into one dataframe.
# Plain lists are used so that rows are matched by position, not by the
# index of inf_data_1.
bioactivity_data = pd.DataFrame(
    data={
        "molecule_chembl_id": mol_chembl_id,
        "canonical_smiles": canon_smiles,
        "standard_value": stand_value,
        "bioactivity_class": bioactivity_class,
    }
)

In [11]:
# display the curated table: molecule id, SMILES, standard value and bioactivity class
bioactivity_data

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL2207987,Cc1nc2sccn2c1C(=O)NN1C(=O)C(C)SC12CCC(C)CC2,2600.0,intermediate
1,CHEMBL2207978,CC1CCC2(CC1)SC(C)C(=O)N2NC(=O)C12CC3CC(CC(C3)C...,3700.0,intermediate
