In [1]:
import os
import json
import requests as rq
import re
import numpy as np
import pandas as pd
import pubchempy as pcp
import sdf

In [2]:
# Read PubChem_compound_list.csv (path relative to the working directory) into a DataFrame.
pubchem = pd.read_csv('PubChem_compound_list.csv')

In [3]:
# Load the shared properties file and keep its 'column_order_dbs' entry.
# JSON is UTF-8 by specification, so the encoding is pinned instead of being
# left to the platform's locale default.
with open(os.path.join('..', 'dataframes_resources', 'dataframes_props.json'), encoding='utf-8') as file:
    res = json.load(file)
column_order = res['column_order_dbs']

In [4]:
# Show the column order read from the properties file.
column_order

['CID',
 'HBondAcceptorCount',
 'HBondDonorCount',
 'IsomericSMILES',
 'MolecularWeight',
 'LogP',
 'RuleFive',
 'MATC_Code_Short',
 'MATC_Code_Explanation']

In [5]:
# List the columns available in the loaded PubChem table.
pubchem.columns

Index(['cid', 'cmpdname', 'cmpdsynonym', 'mw', 'mf', 'polararea', 'complexity',
       'xlogp', 'heavycnt', 'hbonddonor', 'hbondacc', 'rotbonds', 'inchi',
       'isosmiles', 'canonicalsmiles', 'inchikey', 'iupacname', 'exactmass',
       'monoisotopicmass', 'charge', 'covalentunitcnt', 'isotopeatomcnt',
       'totalatomstereocnt', 'definedatomstereocnt', 'undefinedatomstereocnt',
       'totalbondstereocnt', 'definedbondstereocnt', 'undefinedbondstereocnt',
       'pclidcnt', 'gpidcnt', 'gpfamilycnt', 'neighbortype', 'meshheadings',
       'annothits', 'annothitcnt', 'aids', 'cidcdate', 'sidsrcname', 'depcatg',
       'annotation'],
      dtype='object')

In [6]:
# Keep only the identifier, the four descriptors used by the rule-of-five
# check further down, and the isomeric SMILES string.
pubchem_select = pubchem[
    [
        'cid',
        'hbondacc',
        'hbonddonor',
        'mw',
        'xlogp',
        'isosmiles',
    ]
]

In [7]:
# Inspect the selected column subset.
pubchem_select

Unnamed: 0,cid,hbondacc,hbonddonor,mw,xlogp,isosmiles
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN
3,174,2,2,62.07,-1.4,C(CO)O
4,176,2,1,60.05,-0.2,CC(=O)O
...,...,...,...,...,...,...
4479,168009818,70,58,4813.00,-6.8,CCC(C)C(C(=O)NC(C)C(=O)NC(CCC(=O)N)C(=O)NC(CCC...
4480,168012238,13,3,687.90,2.6,C[C@@H]1C[C@@H]([C@H]([C@@H](O1)OC2[C@H](C[C@@...
4481,168012331,16,4,1030.30,5.6,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C\[C@H](C[...
4482,168013289,56,24,2002.20,-10.5,C(CSC[C@@H]1[C@@H]2[C@@H]([C@H]([C@H](O1)O[C@@...


In [8]:
# Rename the PubChem headers to the column names listed in column_order.
pubchem_select = pubchem_select.rename(
    columns={
        'cid': 'CID',
        'hbondacc': 'HBondAcceptorCount',
        'hbonddonor': 'HBondDonorCount',
        'mw': 'MolecularWeight',
        'xlogp': 'LogP',
        'isosmiles': 'IsomericSMILES',
    }
)

In [9]:
# Inspect the frame after renaming.
pubchem_select

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN
3,174,2,2,62.07,-1.4,C(CO)O
4,176,2,1,60.05,-0.2,CC(=O)O
...,...,...,...,...,...,...
4479,168009818,70,58,4813.00,-6.8,CCC(C)C(C(=O)NC(C)C(=O)NC(CCC(=O)N)C(=O)NC(CCC...
4480,168012238,13,3,687.90,2.6,C[C@@H]1C[C@@H]([C@H]([C@@H](O1)OC2[C@H](C[C@@...
4481,168012331,16,4,1030.30,5.6,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C\[C@H](C[...
4482,168013289,56,24,2002.20,-10.5,C(CSC[C@@H]1[C@@H]2[C@@H]([C@H]([C@H](O1)O[C@@...


In [10]:
# Lipinski rule-of-five criteria, one boolean Series per criterion.
# A criterion is violated only when the value EXCEEDS the limit (more than
# 5 donors, more than 10 acceptors, weight above 500, logP above 5), so each
# comparison is inclusive of the limit itself.
# Note: a missing value (NaN) compares as False and therefore counts as a
# violation of that criterion.
hdonor = pubchem_select['HBondDonorCount'] <= 5
haccept = pubchem_select['HBondAcceptorCount'] <= 10
mw = pubchem_select['MolecularWeight'] <= 500
clogP = pubchem_select['LogP'] <= 5

In [11]:
# Flag each compound: 1 when at least three of the four criteria hold
# (i.e. at most one violation), 0 otherwise.
pubchem_select['RuleFive'] = np.where(
    (hdonor.astype(int) + haccept.astype(int) + mw.astype(int) + clogP.astype(int)) >= 3,
    1,
    0,
)

In [12]:
# Inspect the frame with the RuleFive flag added.
pubchem_select

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,RuleFive
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,1
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN,1
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN,1
3,174,2,2,62.07,-1.4,C(CO)O,1
4,176,2,1,60.05,-0.2,CC(=O)O,1
...,...,...,...,...,...,...,...
4479,168009818,70,58,4813.00,-6.8,CCC(C)C(C(=O)NC(C)C(=O)NC(CCC(=O)N)C(=O)NC(CCC...,0
4480,168012238,13,3,687.90,2.6,C[C@@H]1C[C@@H]([C@H]([C@@H](O1)OC2[C@H](C[C@@...,0
4481,168012331,16,4,1030.30,5.6,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C\[C@H](C[...,0
4482,168013289,56,24,2002.20,-10.5,C(CSC[C@@H]1[C@@H]2[C@@H]([C@H]([C@H](O1)O[C@@...,0


In [15]:
# Collects the CIDs for which get_atc_code finds no ATC code.
not_found = []

In [16]:
def get_atc_code(cid, timeout=30):
    """Return the first ATC-shaped code found in a compound's PUG View record.

    Downloads the PubChem PUG View JSON for ``cid`` and scans the raw response
    text for the first double-quoted token made of one capital letter, two
    digits, two capital letters and two digits (e.g. ``"N06BX12"``).

    Args:
        cid: PubChem compound identifier, interpolated into the request URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The matched code without its surrounding quotes, or ``None`` when the
        response contains no such token. In the ``None`` case ``cid`` is also
        appended to the module-level ``not_found`` list.

    Raises:
        requests.RequestException: On timeout, connection failure, or any
            HTTP error status other than 404.
    """
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
    response = rq.get(url, timeout=timeout)
    # A 404 is treated as "no record for this CID" and falls through to the
    # not-found path. Any other error status (e.g. the server throttling a
    # burst of requests) is raised so it is not silently recorded as
    # "compound has no ATC code".
    if response.status_code != 404:
        response.raise_for_status()
    atc_code_found = re.search(r'"([A-Z]\d{2}[A-Z]{2}\d{2})"', response.text)
    if atc_code_found:
        return atc_code_found.group(1)
    not_found.append(cid)
    return None


In [23]:
def get_atc_code_json(cid, timeout=30):
    """Return the first ATC-shaped code found in a compound's PUG View record.

    Downloads the PubChem PUG View JSON for ``cid`` and scans the raw response
    text for the first double-quoted token made of one capital letter, two
    digits, two capital letters and two digits. Unlike ``get_atc_code`` this
    variant has no side effects: misses are not recorded anywhere.

    Args:
        cid: PubChem compound identifier, interpolated into the request URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The matched code without its surrounding quotes, or ``None`` when the
        response contains no such token.

    Raises:
        requests.RequestException: On timeout, connection failure, or any
            HTTP error status other than 404.
    """
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
    response = rq.get(url, timeout=timeout)
    # A 404 is treated as "no record for this CID"; any other error status is
    # raised so it is not mistaken for a compound without an ATC code.
    if response.status_code != 404:
        response.raise_for_status()
    atc_code_found = re.search(r'"([A-Z]\d{2}[A-Z]{2}\d{2})"', response.text)
    if atc_code_found:
        return atc_code_found.group(1)
    return None

In [17]:
# Look up an ATC code for every CID. This issues one HTTP request per row,
# so the cell is slow. The tracker is emptied first so that re-running the
# cell does not append the same CIDs to not_found a second time.
not_found.clear()
pubchem_select['ATC_Code'] = pubchem_select['CID'].map(get_atc_code)

In [18]:
# Show the CIDs for which no ATC code was found.
not_found

[174,
 1972,
 5581,
 5826,
 14956,
 14960,
 56494,
 64655,
 65430,
 66274,
 66376,
 66380,
 69438,
 69439,
 71180,
 71184,
 71368,
 122051,
 123829,
 135326,
 156375,
 157443,
 166973,
 170848,
 443997,
 450503,
 450510,
 452949,
 656666,
 3086672,
 5386092,
 5388879,
 5462076,
 5702198,
 5748852,
 5771695,
 6093445,
 6328682,
 6452753,
 6455416,
 6918077,
 9834479,
 9919339,
 10103319,
 10405586,
 10869981,
 11133125,
 11306073,
 11430828,
 11431716,
 11789360,
 11954386,
 16220016,
 16667679,
 22617237,
 23663553,
 23667636,
 23690531,
 23724931,
 23863720,
 44399170,
 44421047,
 46783367,
 46783793,
 49799974,
 50986201,
 52950901,
 53486284,
 56971651,
 70957463,
 71306412,
 71311649,
 71313388,
 71661158,
 72941734,
 73438937,
 76957057,
 76962197,
 76962714,
 76963828,
 76967078,
 90478549,
 91827561,
 117587538,
 117587627,
 118705500,
 121494118,
 122197546,
 122361636,
 122362248,
 131668620,
 131668621,
 131673845,
 131674879,
 131704315,
 131704316,
 131704321,
 131704327,
 

In [19]:
# Count the CIDs recorded in not_found.
len(not_found)

115

In [22]:
# Inspect the frame with the ATC_Code column added.
pubchem_select

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,RuleFive,ATC_Code
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,1,N06BX12
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN,1,N03AG03
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN,1,L01XD04
3,174,2,2,62.07,-1.4,C(CO)O,1,
4,176,2,1,60.05,-0.2,CC(=O)O,1,G01AD02
...,...,...,...,...,...,...,...,...
4479,168009818,70,58,4813.00,-6.8,CCC(C)C(C(=O)NC(C)C(=O)NC(CCC(=O)N)C(=O)NC(CCC...,0,A10BX16
4480,168012238,13,3,687.90,2.6,C[C@@H]1C[C@@H]([C@H]([C@@H](O1)OC2[C@H](C[C@@...,0,J01FA05
4481,168012331,16,4,1030.30,5.6,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C\[C@H](C[...,0,L01XE09
4482,168013289,56,24,2002.20,-10.5,C(CSC[C@@H]1[C@@H]2[C@@H]([C@H]([C@H](O1)O[C@@...,0,V03AB35


In [25]:
# The short code is the first character of the ATC code; rows without an
# ATC code stay missing.
pubchem_select['MATC_Code_Short'] = pubchem_select['ATC_Code'].str.get(0)

In [26]:
# Load the shared properties file and keep its 'matc_codes_explanation' entry.
# JSON is UTF-8 by specification, so the encoding is pinned instead of being
# left to the platform's locale default.
with open(os.path.join('..', 'dataframes_resources', 'dataframes_props.json'), encoding='utf-8') as file:
    res = json.load(file)
matc_codes_explanation = res['matc_codes_explanation']

In [27]:
# Look up the explanation for each short code in matc_codes_explanation.
pubchem_select['MATC_Code_Explanation'] = pubchem_select['MATC_Code_Short'].map(matc_codes_explanation)

In [31]:
# The full ATC code is no longer needed once the short code and its
# explanation have been derived from it.
pubchem_select = pubchem_select.drop(columns='ATC_Code')

In [33]:
# Keep only the rows that have a short code, i.e. compounds for which an
# ATC code was found. The original index labels are preserved.
pubchem_select = pubchem_select[pubchem_select['MATC_Code_Short'].notna()]

In [34]:
# Preview the first rows of the filtered frame.
pubchem_select.head()

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,RuleFive,MATC_Code_Short,MATC_Code_Explanation
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,1,N,NERVOUS SYSTEM
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN,1,N,NERVOUS SYSTEM
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN,1,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
4,176,2,1,60.05,-0.2,CC(=O)O,1,G,GENITO URINARY SYSTEM AND SEX HORMONES
5,187,2,0,146.21,0.2,CC(=O)OCC[N+](C)(C)C,1,S,SENSORY ORGANS
