# Data retrieval: PubChem and ChEMBL REST programmatic access



---
Authors: *Natalia García Sánchez, Ana Solbas*



# INDEX
1. PubChem programmatic access -> [PubChem section](#Pubchem) 
2. ChEMBL programmatic access -> [ChEMBL section](#ChEMBL) 
3. Conclusion -> [Conclusion](#Conclusion) 

### Importing libraries

In [None]:
# libraries
import json  # lets us work with the json format
from urllib.parse import quote  # percent-encodes text placed inside a URL path

import numpy as np # numerical library
import pandas as pd # analysis of tabular data
import requests  # allows Python to make web requests
from IPython.display import Image # Public API for display tools in IPython.
from pandas import json_normalize


In [None]:
# Mount Google Drive first: the spreadsheet read below lives under the mount
# point, so reading it before mounting fails on a fresh runtime.
from google.colab import drive
drive.mount('/content/drive')

# Compound table; later cells read its PubChem_label column.
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/biostruct/pubchem.xlsx')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Compound names to look up, taken from the PubChem_label column in row order.
p_chemlist = df['PubChem_label'].tolist()


<a id='Pubchem'></a>

## 1. PubChem programmatic access

* Searching for Canonical SMILES
* Doing a 2D similarity analysis with ibuprofen SMILES

In [None]:
# Look up the canonical SMILES of every compound name through PubChem PUG REST.
# Each name with a usable answer adds one entry to `smileslist`; names without
# one are reported and skipped, so `smileslist` can be shorter than `p_chemlist`.
BASE_URL1 = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
smileslist = []
for p_chemname in p_chemlist:
    url1 = BASE_URL1 + f"/compound/name/{p_chemname}/property/CanonicalSMILES/JSON"

    try:
        # The timeout keeps one stalled connection from hanging the notebook.
        r1 = requests.get(url1, timeout=30)
        response = r1.json()
    except (requests.exceptions.RequestException, ValueError) as err:
        # Network errors and non-JSON bodies are reported per name instead of
        # aborting the remaining lookups.
        print(f"Request failed for PubChem name: {p_chemname} ({err})")
        continue

    if "PropertyTable" in response:
        properties = response["PropertyTable"]["Properties"][0]
        # NOTE(review): newer PubChem responses appear to return this value
        # under "ConnectivitySMILES" -- confirm against the PUG REST docs.
        # `SMILES` stays a notebook-level name because later cells read it.
        SMILES = properties.get("CanonicalSMILES") or properties["ConnectivitySMILES"]
        smileslist.append(SMILES)
    else:
        print(f"Could not find matches for PubChem name: {p_chemname}")

Could not find matches for PubChem name: cephem
Could not find matches for PubChem name: CAS 1405-87-4


In [None]:
# Remove the two labels the PubChem lookup reported as not found, so the table
# has one row per retrieved SMILES.
unresolved_labels = ['cephem', 'CAS 1405-87-4']
df = df[~df['PubChem_label'].isin(unresolved_labels)]

In [None]:
# Attach the retrieved SMILES as a new column. `.assign` returns a new frame,
# which avoids setting a column on the filtered slice that `df` currently is
# (the source of pandas' SettingWithCopyWarning).
df = df.assign(SMILES=smileslist)

In [None]:
# Fetch ibuprofen's SMILES explicitly: the `SMILES` name left behind by the
# lookup loop holds whichever compound was processed last, not ibuprofen.
ibuprofen_url = BASE_URL1 + "/compound/name/ibuprofen/property/CanonicalSMILES/JSON"
ibuprofen_props = requests.get(ibuprofen_url, timeout=30).json()["PropertyTable"]["Properties"][0]
# NOTE(review): newer PubChem responses appear to return this value under
# "ConnectivitySMILES" -- confirm against the PUG REST docs.
SMILES = ibuprofen_props.get("CanonicalSMILES") or ibuprofen_props["ConnectivitySMILES"]
print("Canonical SMILES for ibuprofen is : ", SMILES)

Canonical SMILES for ibuprofen is :  CC(C)CC1=CC=C(C=C1)C(C)C(=O)O


In [None]:
# 2D similarity search on PubChem (threshold 80) for the structure in `SMILES`.
# The SMILES is sent in the POST body rather than in the URL path.
url2 = BASE_URL1 + "/compound/fastsimilarity_2d/smiles/cids/JSON?Threshold=80"
r2 = requests.post(url2, data={'smiles': SMILES}, timeout=60)
r2.raise_for_status()  # fail with the HTTP error rather than a KeyError below
cids_simmolPub = r2.json()

similar_cids = sorted(cids_simmolPub['IdentifierList']['CID'])
print("# Number of CIDs:", len(similar_cids))
# Preview only: the full list can hold tens of thousands of identifiers and
# would swamp the cell output. The complete result stays in `cids_simmolPub`.
print(similar_cids[:50])

# Number of CIDs: 108825
[107, 997, 999, 1303, 1542, 2449, 3332, 3335, 3394, 3395, 3672, 3825, 3848, 3858, 3965, 4127, 4702, 4775, 5127, 5258, 6463, 6730, 6862, 7012, 7403, 7418, 7458, 7470, 7489, 7559, 7590, 7600, 7601, 7643, 7655, 7658, 7709, 8333, 8373, 8767, 8792, 9024, 9837, 9978, 10194, 10296, 10356, 10714, 10726, 10820, 11393, 11470, 11782, 11892, 11897, 11915, 12031, 12073, 12086, 12121, 12439, 13084, 13222, 13234, 13737, 13890, 14098, 14103, 14144, 14583, 14695, 15071, 15093, 15250, 15618, 15717, 15858, 15879, 15880, 15890, 16046, 16197, 16237, 16757, 17099, 17124, 17314, 17452, 18663, 18744, 18960, 19024, 19147, 19570, 19986, 20202, 20233, 20288, 20447, 20724, 21101, 21106, 21235, 21717, 21880, 21881, 21882, 22220, 22642, 22810, 23042, 23269, 23739, 24123, 24371, 24915, 25239, 25685, 25932, 26403, 26436, 27266, 27877, 29039, 29759, 29788, 30314, 30725, 30801, 30931, 30938, 31102, 31210, 31722, 32153, 32282, 32601, 33479, 33654, 34170, 34656, 34658, 34872, 34882, 37589, 37791,

<a id='ChEMBL'></a>

## 2. ChEMBL programmatic access

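* Searching ChEMBL with each canonical SMILES retrieved from PubChem (similarity threshold of 100)
* Collecting the ChEMBL ID, ChEBI ID, molecular formula, molecular weight and ATC classification of the first hit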

In [None]:
# Root of the ChEMBL data web service; endpoint paths are appended to it.
BASE_URL2 = 'https://www.ebi.ac.uk/chembl/api/data/'

In [None]:
# Display the compound table, which now includes the SMILES column.
df

Unnamed: 0,PubChem_ID,PubChem_label,SMILES
0,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,spectinomycin,CC1CC(=O)C2(C(O1)OC3C(C(C(C(C3O2)NC)O)NC)O)O
1,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,amoxicillin,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C...
2,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,monomycin,C1C(C(C(C(C1N)OC2C(C(C(C(O2)CO)O)O)N)OC3C(C(C(...
3,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,azithromycin,CCC1C(C(C(N(CC(CC(C(C(C(C(C(=O)O1)C)OC2CC(C(C(...
4,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,clindamycin,CCCC1CC(N(C1)C)C(=O)NC(C2C(C(C(C(O2)SC)O)O)O)C...
...,...,...,...
84,https://pubchem.ncbi.nlm.nih.gov/compound/1064...,delamanid,CC1(CN2C=C(N=C2O1)[N+](=O)[O-])COC3=CC=C(C=C3)...
85,https://pubchem.ncbi.nlm.nih.gov/compound/1258...,tetracycline,CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4...
86,https://pubchem.ncbi.nlm.nih.gov/compound/1441...,oxytetracycline,CC1(C2C(C3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C...
87,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,pentobarbital,CCCC(C)C1(C(=O)NC(=O)NC1=O)CC


In [None]:
# For every SMILES in the table, ask ChEMBL for molecules at similarity 100 and
# keep the identifiers and properties of the first hit. SMILES without a usable
# hit are collected in `nosmileslist`. The five result lists are appended to
# together, so they always have equal length.
specsmileslist = df.SMILES.tolist()
nosmileslist = []
chemblidlist = []
formulalist = []
weightlist = []
chebiidlist = []
atclist = []
required_keys = ('molecule_chembl_id', 'molecule_properties',
                 'atc_classifications', 'chebi_par_id')

for specsmiles in specsmileslist:
    # Percent-encode the SMILES before placing it in the URL path: characters
    # such as '#', '%' and '/' would otherwise be read as URL syntax and the
    # request would not carry the intended structure or threshold.
    encoded_smiles = quote(specsmiles, safe='')
    tanimoto_url = BASE_URL2 + f"similarity/{encoded_smiles}/100"

    try:
        # The timeout keeps one stalled connection from hanging the notebook.
        r3 = requests.get(tanimoto_url, headers={"Accept": "application/json"}, timeout=30).json()
        molecules = r3.get('molecules', [])
    except (requests.exceptions.RequestException, ValueError) as err:
        # Network errors and non-JSON bodies count as "no hit" for this SMILES.
        print(f"ChEMBL request failed for SMILES: {specsmiles} ({err})")
        molecules = []

    first_hit = molecules[0] if molecules else {}
    properties = first_hit.get('molecule_properties')
    if not properties or any(key not in first_hit for key in required_keys):
        print(f"Could not find ChEMBL matches for SMILES: {specsmiles}")
        nosmileslist.append(specsmiles)
        continue

    chemblidlist.append(first_hit['molecule_chembl_id'])
    chebiidlist.append(first_hit['chebi_par_id'])
    # NOTE(review): the classification list is wrapped in another list, which
    # yields values like [[...]] in the table -- kept as is; confirm intended.
    atclist.append([first_hit['atc_classifications']])
    formulalist.append(properties['full_molformula'])
    weightlist.append(properties['full_mwt'])
    print("Number of ChEMBL IDs found : ", len(molecules))


Number of ChEMBL IDs found :  10
Number of ChEMBL IDs found :  4
Number of ChEMBL IDs found :  11
Number of ChEMBL IDs found :  6
Number of ChEMBL IDs found :  8
Number of ChEMBL IDs found :  7
Number of ChEMBL IDs found :  15
Number of ChEMBL IDs found :  3
Number of ChEMBL IDs found :  2
Number of ChEMBL IDs found :  12
Number of ChEMBL IDs found :  8
Number of ChEMBL IDs found :  6
Number of ChEMBL IDs found :  15
Number of ChEMBL IDs found :  5
Number of ChEMBL IDs found :  5
Number of ChEMBL IDs found :  2
Number of ChEMBL IDs found :  5
Number of ChEMBL IDs found :  2
Number of ChEMBL IDs found :  1
Number of ChEMBL IDs found :  5
Number of ChEMBL IDs found :  7
Number of ChEMBL IDs found :  1
Number of ChEMBL IDs found :  1
Number of ChEMBL IDs found :  6
Number of ChEMBL IDs found :  5
Number of ChEMBL IDs found :  2
Number of ChEMBL IDs found :  10
Number of ChEMBL IDs found :  8
Number of ChEMBL IDs found :  4
Number of ChEMBL IDs found :  5
Number of ChEMBL IDs found :  10
N

In [None]:
# Drop the rows whose SMILES had no usable ChEMBL hit, then attach the values
# collected for the remaining rows. Doing both in one chained expression with
# `.assign` avoids setting columns on a filtered slice (SettingWithCopyWarning).
df = df[~df['SMILES'].isin(nosmileslist)].assign(
    ChEMBL_ID=chemblidlist,
    CHEBI_ID=chebiidlist,
    formula=formulalist,
    mass=weightlist,
    atcClassification=atclist,
)


In [None]:
# Display the compound table with the ChEMBL-derived columns attached.
df

Unnamed: 0,PubChem_ID,PubChem_label,SMILES,ChEMBL_ID,CHEBI_ID,formula,mass,atcClassification
0,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,spectinomycin,CC1CC(=O)C2(C(O1)OC3C(C(C(C(C3O2)NC)O)NC)O)O,CHEMBL4543601,,C14H33Cl3N2O10,495.78,[[]]
1,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,amoxicillin,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C...,CHEMBL2105950,51255.0,C16H18N3NaO5S,387.39,[[]]
2,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,monomycin,C1C(C(C(C(C1N)OC2C(C(C(C(O2)CO)O)O)N)OC3C(C(C(...,CHEMBL4303788,,C23H45N5O14,615.63,[[]]
3,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,azithromycin,CCC1C(C(C(N(CC(CC(C(C(C(C(C(=O)O1)C)OC2CC(C(C(...,CHEMBL2361091,,C38H72N2O12,749.00,[[]]
4,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,clindamycin,CCCC1CC(N(C1)C)C(=O)NC(C2C(C(C(C(O2)SC)O)O)O)C...,CHEMBL1490142,,C18H36Cl2N2O6S,479.47,[[]]
...,...,...,...,...,...,...,...,...
83,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,APAs,C1=CC(=C(C=C1N)O)C(=O)O,CHEMBL1169,27565.0,C7H7NO3,153.14,[[J04AA01]]
84,https://pubchem.ncbi.nlm.nih.gov/compound/1064...,delamanid,CC1(CN2C=C(N=C2O1)[N+](=O)[O-])COC3=CC=C(C=C3)...,CHEMBL3188499,,C25H25F3N4O6,534.49,[[]]
86,https://pubchem.ncbi.nlm.nih.gov/compound/1441...,oxytetracycline,CC1(C2C(C3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C...,CHEMBL4591469,,C22H25ClN2O9,496.90,[[]]
87,https://pubchem.ncbi.nlm.nih.gov/compound/1000...,pentobarbital,CCCC(C)C1(C(=O)NC(=O)NC1=O)CC,CHEMBL971,7984.0,C11H17N2NaO3,248.26,[[]]


In [None]:
# Write the combined table to a CSV file, leaving out the row index.
output_csv = 'pubchem_chembl_chebi.csv'
df.to_csv(output_csv, index=False)

<a id='Conclusion'></a>