<a href="https://colab.research.google.com/github/IrinAnnPaul/Projects_ML/blob/main/Predictive_Modeling_Acetylcholinesterase_Inhibitors/Data_collection_and_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predictive Modeling for Acetylcholinesterase Inhibitors: Data Collection and Preprocessing

### Installing and Importing Libraries

In [1]:
! pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-24.1.2-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cattrs-24.1.2-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━

In [2]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

## Search for Target protein

### Acetylcholinesterase target search

In [3]:
# Target search for acetylcholinesterase
target = new_client.target
target_query = target.search('acetylcholinesterase')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P22303', 'xref_name': None, 'xre...",Homo sapiens,Acetylcholinesterase,28.0,False,CHEMBL220,"[{'accession': 'P22303', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Homo sapiens,Cholinesterases; ACHE & BCHE,28.0,False,CHEMBL2095233,"[{'accession': 'P06276', 'component_descriptio...",SELECTIVITY GROUP,9606
2,[],Drosophila melanogaster,Acetylcholinesterase,18.0,False,CHEMBL2242744,"[{'accession': 'P07140', 'component_descriptio...",SINGLE PROTEIN,7227
3,[],Bemisia tabaci,AChE2,16.0,False,CHEMBL2366409,"[{'accession': 'B3SST5', 'component_descriptio...",SINGLE PROTEIN,7038
4,[],Leptinotarsa decemlineata,Acetylcholinesterase,16.0,False,CHEMBL2366490,"[{'accession': 'Q27677', 'component_descriptio...",SINGLE PROTEIN,7539
5,"[{'xref_id': 'P04058', 'xref_name': None, 'xre...",Torpedo californica,Acetylcholinesterase,15.0,False,CHEMBL4780,"[{'accession': 'P04058', 'component_descriptio...",SINGLE PROTEIN,7787
6,"[{'xref_id': 'P21836', 'xref_name': None, 'xre...",Mus musculus,Acetylcholinesterase,15.0,False,CHEMBL3198,"[{'accession': 'P21836', 'component_descriptio...",SINGLE PROTEIN,10090
7,"[{'xref_id': 'P37136', 'xref_name': None, 'xre...",Rattus norvegicus,Acetylcholinesterase,15.0,False,CHEMBL3199,"[{'accession': 'P37136', 'component_descriptio...",SINGLE PROTEIN,10116
8,"[{'xref_id': 'O42275', 'xref_name': None, 'xre...",Electrophorus electricus,Acetylcholinesterase,15.0,False,CHEMBL4078,"[{'accession': 'O42275', 'component_descriptio...",SINGLE PROTEIN,8005
9,"[{'xref_id': 'P23795', 'xref_name': None, 'xre...",Bos taurus,Acetylcholinesterase,15.0,False,CHEMBL4768,"[{'accession': 'P23795', 'component_descriptio...",SINGLE PROTEIN,9913


### Select and retrieve bioactivity data for Human Acetylcholinesterase

The first entry corresponds to the target protein Human Acetylcholinesterase, hence it is assigned to the target Chembl ID

In [4]:
selected_target = targets.target_chembl_id[0]
selected_target


'CHEMBL220'

In [7]:
# only retrieve data reported using IC50 values.
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
print(f"Number of records retrieved: {len(res)}")

df = pd.DataFrame.from_dict(res)
df

# save as a csv file
df.to_csv('acetylcholinesterase_01_bioactivity_data_raw.csv', index=False)

Number of records retrieved: 9091


## Handling missing data

Rows with missing standard_value and canonical_smiles are dropped

In [9]:
# remove rows with missing std value and canonical smiles
df2 = df[df.standard_value.notna()]
df2 = df2[df.canonical_smiles.notna()]

# drop duplicate rows
df2_nr = df2.drop_duplicates(['canonical_smiles'])
df2_nr


  df2 = df2[df.canonical_smiles.notna()]


Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,33969,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.75
1,,,37563,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.1
2,,,37565,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,50.0
3,,,38902,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.3
4,,,41170,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9086,"{'action_type': 'INHIBITOR', 'description': 'N...",,25111481,[],CHEMBL5265203,Inhibition of AChE (unknown origin),B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.16
9087,,,25402914,[],CHEMBL5303778,Cross screening panel,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,pIC50,,UO_0000065,,5.1
9088,,,25402962,[],CHEMBL5303826,Cross screening panel,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,pIC50,,UO_0000065,,4.0
9089,,,25403899,[],CHEMBL5303876,Cross screening panel,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,pIC50,,UO_0000065,,4.2


## Data pre-processing of the bioactivity data

In [10]:
# combining required columns to a data frame
selection = ['molecule_chembl_id','canonical_smiles','standard_value']
df3 = df2_nr[selection]
df3

# save as a csv file
df3.to_csv('acetylcholinesterase_02_bioactivity_data_preprocessed.csv', index=False)

The bioactivity data is measured in IC50 units. Compounds with values below 1,000 nM are considered active, those above 10,000 nM are inactive, and compounds with values between 1,000 and 10,000 nM are classified as intermediate.

In [11]:
bioactivity_threshold = []
for i in df3.standard_value:
  if float(i) >= 10000:
    bioactivity_threshold.append("inactive")
  elif float(i) <= 1000:
    bioactivity_threshold.append("active")
  else:
    bioactivity_threshold.append("intermediate")

# labelling the compounds based on their bioactivity
bioactivity_class = pd.Series(bioactivity_threshold, name='class')
df4 = pd.concat([df3, bioactivity_class], axis=1)
df4

# save as csv file
df4.to_csv('acetylcholinesterase_03_bioactivity_data_curated.csv', index=False)

In [12]:
# zipping the files for future use
! zip acetylcholinesterase.zip *.csv

  adding: acetylcholinesterase_01_bioactivity_data_raw.csv (deflated 91%)
  adding: acetylcholinesterase_02_bioactivity_data_preprocessed.csv (deflated 80%)
  adding: acetylcholinesterase_03_bioactivity_data_curated.csv (deflated 82%)
