In [1]:
import os
import json
import requests as rq
import re
import numpy as np
import pandas as pd
import pubchempy as pcp
import sdf

In [2]:
# Read the PubChem compound export (path is relative to the working directory).
pubchem_csv_path = 'PubChem_compound_list.csv'
pubchem = pd.read_csv(pubchem_csv_path)

In [3]:
# Load the shared dataframe properties and keep the 'column_order_dbs' entry.
# The file is opened in binary mode so that json.load detects the JSON
# encoding (UTF-8/16/32) itself instead of depending on the platform's
# default text encoding, which differs between operating systems.
with open(os.path.join('..', 'dataframes_resources', 'dataframes_props.json'), 'rb') as file:
    res = json.load(file)
column_order = res['column_order_dbs']

In [4]:
# Display the column order read from the 'column_order_dbs' entry.
column_order

['CID',
 'HBondAcceptorCount',
 'HBondDonorCount',
 'MolecularWeight',
 'LogP',
 'RuleFive',
 'IsomericSMILES',
 'ATC_Code_Short',
 'ATC_Code_Explanation']

In [5]:
# List the column names available in the raw PubChem export.
pubchem.columns

Index(['cid', 'cmpdname', 'cmpdsynonym', 'mw', 'mf', 'polararea', 'complexity',
       'xlogp', 'heavycnt', 'hbonddonor', 'hbondacc', 'rotbonds', 'inchi',
       'isosmiles', 'canonicalsmiles', 'inchikey', 'iupacname', 'exactmass',
       'monoisotopicmass', 'charge', 'covalentunitcnt', 'isotopeatomcnt',
       'totalatomstereocnt', 'definedatomstereocnt', 'undefinedatomstereocnt',
       'totalbondstereocnt', 'definedbondstereocnt', 'undefinedbondstereocnt',
       'pclidcnt', 'gpidcnt', 'gpfamilycnt', 'neighbortype', 'meshheadings',
       'annothits', 'annothitcnt', 'aids', 'cidcdate', 'sidsrcname', 'depcatg',
       'annotation'],
      dtype='object')

In [6]:
# Keep only the compound id, the H-bond counts, weight, logP and SMILES columns.
selected_columns = ['cid', 'hbondacc', 'hbonddonor', 'mw', 'xlogp', 'isosmiles']
pubchem_select = pubchem.loc[:, selected_columns]

In [7]:
# Preview the selected subset of columns.
pubchem_select

Unnamed: 0,cid,hbondacc,hbonddonor,mw,xlogp,isosmiles
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN
3,174,2,2,62.07,-1.4,C(CO)O
4,176,2,1,60.05,-0.2,CC(=O)O
...,...,...,...,...,...,...
4479,168009818,70,58,4813.00,-6.8,CCC(C)C(C(=O)NC(C)C(=O)NC(CCC(=O)N)C(=O)NC(CCC...
4480,168012238,13,3,687.90,2.6,C[C@@H]1C[C@@H]([C@H]([C@@H](O1)OC2[C@H](C[C@@...
4481,168012331,16,4,1030.30,5.6,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C\[C@H](C[...
4482,168013289,56,24,2002.20,-10.5,C(CSC[C@@H]1[C@@H]2[C@@H]([C@H]([C@H](O1)O[C@@...


In [8]:
# Map the PubChem export column names onto the names listed in column_order.
pubchem_to_target_names = {
    'cid': 'CID',
    'hbondacc': 'HBondAcceptorCount',
    'hbonddonor': 'HBondDonorCount',
    'mw': 'MolecularWeight',
    'xlogp': 'LogP',
    'isosmiles': 'IsomericSMILES',
}
pubchem_select = pubchem_select.rename(columns=pubchem_to_target_names)

In [9]:
# Preview the frame after renaming the columns.
pubchem_select

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN
3,174,2,2,62.07,-1.4,C(CO)O
4,176,2,1,60.05,-0.2,CC(=O)O
...,...,...,...,...,...,...
4479,168009818,70,58,4813.00,-6.8,CCC(C)C(C(=O)NC(C)C(=O)NC(CCC(=O)N)C(=O)NC(CCC...
4480,168012238,13,3,687.90,2.6,C[C@@H]1C[C@@H]([C@H]([C@@H](O1)OC2[C@H](C[C@@...
4481,168012331,16,4,1030.30,5.6,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C\[C@H](C[...
4482,168013289,56,24,2002.20,-10.5,C(CSC[C@@H]1[C@@H]2[C@@H]([C@H]([C@H](O1)O[C@@...


In [10]:
# Set Lipinski rules.
# Each mask is True where the compound satisfies that criterion. Lipinski
# counts a violation only when a value is *more than* the limit, so every
# bound is inclusive:
#   - at most 5 hydrogen-bond donors
#   - at most 10 hydrogen-bond acceptors
#   - molecular weight of at most 500
#   - logP of at most 5
# NOTE: comparisons against NaN evaluate to False, so a compound with a
# missing LogP is treated as failing the logP criterion.
hdonor = pubchem_select['HBondDonorCount'] <= 5
haccept = pubchem_select['HBondAcceptorCount'] <= 10
mw = pubchem_select['MolecularWeight'] <= 500
clogP = pubchem_select['LogP'] <= 5

In [11]:
# Apply rules to dataframe.
# A compound is flagged 1 when at least three of the four criteria hold
# (i.e. at most one violation), otherwise 0. The four "three-out-of-four"
# combinations are factored into two groups that share a common pair.
donor_and_acceptor_ok = hdonor & haccept
weight_and_logp_ok = mw & clogP
passes_three_of_four = (
    (donor_and_acceptor_ok & (mw | clogP))
    | (weight_and_logp_ok & (hdonor | haccept))
)
pubchem_select['RuleFive'] = np.where(passes_three_of_four, 1, 0)

In [12]:
# Preview the frame with the new RuleFive column.
pubchem_select

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,RuleFive
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,1
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN,1
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN,1
3,174,2,2,62.07,-1.4,C(CO)O,1
4,176,2,1,60.05,-0.2,CC(=O)O,1
...,...,...,...,...,...,...,...
4479,168009818,70,58,4813.00,-6.8,CCC(C)C(C(=O)NC(C)C(=O)NC(CCC(=O)N)C(=O)NC(CCC...,0
4480,168012238,13,3,687.90,2.6,C[C@@H]1C[C@@H]([C@H]([C@@H](O1)OC2[C@H](C[C@@...,0
4481,168012331,16,4,1030.30,5.6,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C\[C@H](C[...,0
4482,168013289,56,24,2002.20,-10.5,C(CSC[C@@H]1[C@@H]2[C@@H]([C@H]([C@H](O1)O[C@@...,0


In [13]:
def get_atc_code(cid, timeout=30):
    """Return the first ATC-code-shaped token in a compound's PUG View record.

    Downloads the PubChem PUG View JSON document for ``cid`` and scans its raw
    text for the first double-quoted token made of one uppercase letter, two
    digits, two uppercase letters and two digits (for example ``"N06BX12"``).

    Parameters
    ----------
    cid : int or str
        PubChem compound identifier; interpolated into the request URL.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so that a stalled connection raises instead
        of blocking forever. Defaults to 30 seconds.

    Returns
    -------
    str or None
        The matched code without its surrounding quotes, or ``None`` when the
        request did not succeed or no such token is present.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` on connection errors or timeouts.
    """
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
    response = rq.get(url, timeout=timeout)
    # Error responses (unknown CID, server busy, ...) carry no compound record,
    # so do not scan their bodies for codes.
    if not response.ok:
        return None
    pat = r"\"[A-Z]\d{2}[A-Z]{2}\d{2}\""
    atc_code_found = re.search(pat, response.text)
    if atc_code_found is None:
        return None
    return atc_code_found.group(0).strip('"')

In [None]:
def get_atc_code_json(cid):
    """Return the first ATC-code-shaped token found for ``cid``, or ``None``.

    This function previously carried a verbatim copy of ``get_atc_code``'s
    body; it now delegates to that function so the lookup logic lives in one
    place.

    NOTE(review): despite the name, no JSON parsing is performed here. The
    lookup is the same raw-text scan that ``get_atc_code`` does. Confirm
    whether a structured JSON traversal was intended.

    Parameters
    ----------
    cid : int or str
        PubChem compound identifier.

    Returns
    -------
    str or None
        Whatever ``get_atc_code(cid)`` returns.
    """
    return get_atc_code(cid)

In [14]:
# Look up one ATC code per compound; get_atc_code issues one HTTP request per
# CID, so this cell is slow for a large frame.
compound_ids = pubchem_select['CID']
atc_codes = compound_ids.map(get_atc_code)
pubchem_select['ATC_Code'] = atc_codes

In [None]:
# Preview the frame with the new ATC_Code column.
pubchem_select

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,RuleFive,ATC_Code
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,1,N06BX12
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN,1,N03AG03
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN,1,L01XD04
3,174,2,2,62.07,-1.4,C(CO)O,1,
4,176,2,1,60.05,-0.2,CC(=O)O,1,G01AD02
...,...,...,...,...,...,...,...,...
4479,168009818,70,58,4813.00,-6.8,CCC(C)C(C(=O)NC(C)C(=O)NC(CCC(=O)N)C(=O)NC(CCC...,0,A10BX16
4480,168012238,13,3,687.90,2.6,C[C@@H]1C[C@@H]([C@H]([C@@H](O1)OC2[C@H](C[C@@...,0,J01FA05
4481,168012331,16,4,1030.30,5.6,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C\[C@H](C[...,0,L01XE09
4482,168013289,56,24,2002.20,-10.5,C(CSC[C@@H]1[C@@H]2[C@@H]([C@H]([C@H](O1)O[C@@...,0,V03AB35


In [None]:
# Summarise column dtypes and non-null counts to spot missing values.
pubchem_select.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4484 entries, 0 to 4483
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CID                 4484 non-null   int64  
 1   HBondAcceptorCount  4484 non-null   int64  
 2   HBondDonorCount     4484 non-null   int64  
 3   MolecularWeight     4484 non-null   float64
 4   LogP                3776 non-null   float64
 5   IsomericSMILES      4484 non-null   object 
 6   RuleFive            4484 non-null   int32  
 7   ATC_Code            4412 non-null   object 
dtypes: float64(2), int32(1), int64(3), object(2)
memory usage: 262.9+ KB
