In [3]:
import os
import json
import requests as rq
import re
import numpy as np
import pandas as pd
import pubchempy as pcp
import sdf

In [4]:
# Read the mock PubChem export. The default path is the original author's local
# checkout; set the MOCK_PUBCHEM_CSV environment variable to load the file from
# another location without editing the notebook.
pubchem = pd.read_csv(
    os.environ.get(
        'MOCK_PUBCHEM_CSV',
        'C:/Users/josin/GitRepositories/TFM_project/MVP/dataframes/mock_dataframes/mock_pubchem.csv',
    )
)

In [5]:
# Load the shared dataframe properties. The file is decoded as UTF-8 explicitly so
# the result does not depend on the platform's default text encoding.
with open(os.path.join('..', 'dataframes_resources', 'dataframes_props.json'), encoding='utf-8') as file:
    res = json.load(file)
# Column order stored under the 'column_order_dbs' key.
column_order = res['column_order_dbs']

In [6]:
# Show the column order loaded from dataframes_props.json.
column_order

['CID',
 'HBondAcceptorCount',
 'HBondDonorCount',
 'IsomericSMILES',
 'MolecularWeight',
 'LogP',
 'RuleFive',
 'MATC_Code_Short',
 'MATC_Code_Explanation']

In [7]:
# List the columns available in the raw PubChem frame.
pubchem.columns

Index(['Unnamed: 0', 'cid', 'cmpdname', 'cmpdsynonym', 'mw', 'mf', 'polararea',
       'complexity', 'xlogp', 'heavycnt', 'hbonddonor', 'hbondacc', 'rotbonds',
       'inchi', 'isosmiles', 'canonicalsmiles', 'inchikey', 'iupacname',
       'exactmass', 'monoisotopicmass', 'charge', 'covalentunitcnt',
       'isotopeatomcnt', 'totalatomstereocnt', 'definedatomstereocnt',
       'undefinedatomstereocnt', 'totalbondstereocnt', 'definedbondstereocnt',
       'undefinedbondstereocnt', 'pclidcnt', 'gpidcnt', 'gpfamilycnt',
       'neighbortype', 'meshheadings', 'annothits', 'annothitcnt', 'aids',
       'cidcdate', 'sidsrcname', 'depcatg', 'annotation'],
      dtype='object')

In [8]:
# Keep only the identifier, H-bond counts, weight, logP and SMILES columns.
pubchem_select = pubchem.loc[
    :, ['cid', 'hbondacc', 'hbonddonor', 'mw', 'xlogp', 'isosmiles']
]

In [9]:
# Inspect the selected subset of columns.
pubchem_select

Unnamed: 0,cid,hbondacc,hbonddonor,mw,xlogp,isosmiles
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN
3,174,2,2,62.07,-1.4,C(CO)O
4,176,2,1,60.05,-0.2,CC(=O)O
...,...,...,...,...,...,...
95,2161,6,2,298.29,3.1,CC(C)C1=CC2=C(C=C1)OC3=NC(=C(C=C3C2=O)C(=O)O)N
96,2162,7,2,408.90,3.0,CCOC(=O)C1=C(NC(=C(C1C2=CC=CC=C2Cl)C(=O)OC)C)C...
97,2164,3,2,226.27,2.1,CCC1(C(=O)NC(=O)NC1=O)CCC(C)C
98,2165,4,2,355.90,2.6,CCN(CC)CC1=C(C=CC(=C1)NC2=C3C=CC(=CC3=NC=C2)Cl)O


In [10]:
# Rename the PubChem export columns to the names listed in column_order.
pubchem_select = pubchem_select.rename(
    {
        'cid': 'CID',
        'hbondacc': 'HBondAcceptorCount',
        'hbonddonor': 'HBondDonorCount',
        'mw': 'MolecularWeight',
        'xlogp': 'LogP',
        'isosmiles': 'IsomericSMILES',
    },
    axis='columns',
)

In [11]:
# Check the frame after renaming the columns.
pubchem_select

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN
3,174,2,2,62.07,-1.4,C(CO)O
4,176,2,1,60.05,-0.2,CC(=O)O
...,...,...,...,...,...,...
95,2161,6,2,298.29,3.1,CC(C)C1=CC2=C(C=C1)OC3=NC(=C(C=C3C2=O)C(=O)O)N
96,2162,7,2,408.90,3.0,CCOC(=O)C1=C(NC(=C(C1C2=CC=CC=C2Cl)C(=O)OC)C)C...
97,2164,3,2,226.27,2.1,CCC1(C(=O)NC(=O)NC1=O)CCC(C)C
98,2165,4,2,355.90,2.6,CCN(CC)CC1=C(C=CC(=C1)NC2=C3C=CC(=CC3=NC=C2)Cl)O


In [12]:
# Lipinski "rule of five" criteria: one boolean Series per rule, True = rule satisfied.
# Lipinski flags a compound when it has MORE than 5 H-bond donors, MORE than 10
# H-bond acceptors, or a logP GREATER than 5, so the boundary values (10 acceptors,
# logP of exactly 5) must pass; the previous `< 10` / `< 5` tests rejected them.
# The molecular-weight cut-off is kept as "under 500".
# NOTE: comparisons against NaN evaluate to False, so a compound with a missing
# value is treated as violating that rule.
hdonor = pubchem_select['HBondDonorCount'] <= 5
haccept = pubchem_select['HBondAcceptorCount'] <= 10
mw = pubchem_select['MolecularWeight'] < 500
clogP = pubchem_select['LogP'] <= 5

In [13]:
# Flag compounds that satisfy at least three of the four criteria (at most one
# violation): 1 = passes, 0 = fails. Counting the satisfied rules is equivalent
# to OR-ing every three-rule combination.
pubchem_select['RuleFive'] = (
    (hdonor.astype(int) + haccept.astype(int) + mw.astype(int) + clogP.astype(int)) >= 3
).astype(int)

In [14]:
# Inspect the frame with the new RuleFive column.
pubchem_select

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,RuleFive
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,1
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN,1
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN,1
3,174,2,2,62.07,-1.4,C(CO)O,1
4,176,2,1,60.05,-0.2,CC(=O)O,1
...,...,...,...,...,...,...,...
95,2161,6,2,298.29,3.1,CC(C)C1=CC2=C(C=C1)OC3=NC(=C(C=C3C2=O)C(=O)O)N,1
96,2162,7,2,408.90,3.0,CCOC(=O)C1=C(NC(=C(C1C2=CC=CC=C2Cl)C(=O)OC)C)C...,1
97,2164,3,2,226.27,2.1,CCC1(C(=O)NC(=O)NC1=O)CCC(C)C,1
98,2165,4,2,355.90,2.6,CCN(CC)CC1=C(C=CC(=C1)NC2=C3C=CC(=CC3=NC=C2)Cl)O,1


In [15]:
# Collects the CIDs for which get_atc_code finds no ATC code.
not_found = list()

In [16]:
def get_atc_code(cid, timeout=30):
    """Return the first ATC-shaped code found in a compound's PubChem PUG View record.

    Downloads the PUG View JSON for ``cid`` and scans the raw response text for
    the first double-quoted token shaped like an ATC code (one letter, two
    digits, two letters, two digits, e.g. ``"N06BX12"``).

    Parameters
    ----------
    cid : int or str
        PubChem compound identifier; interpolated into the request URL.
    timeout : float, optional
        Seconds to wait for PubChem before ``requests`` raises a timeout error.
        Without a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    str or None
        The matched code without its surrounding quotes, or ``None`` when the
        response text contains no such token.

    Notes
    -----
    Appends ``cid`` to the module-level ``not_found`` list when nothing matches.
    Exceptions raised by ``requests`` (connection errors, timeouts) propagate
    to the caller.
    """
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
    response = rq.get(url, timeout=timeout)
    # The quotes anchor the match to a complete JSON string value; the capture
    # group returns the code itself without them.
    atc_code_found = re.search(r'"([A-Z]\d{2}[A-Z]{2}\d{2})"', response.text)
    if atc_code_found is None:
        not_found.append(cid)
        return None
    return atc_code_found.group(1)


In [17]:
def get_atc_code_json(cid, timeout=30):
    """Return the first ATC-shaped code found in a compound's PubChem PUG View record.

    Performs the same lookup as ``get_atc_code`` but does not record misses in
    ``not_found``: it only returns the result.

    Parameters
    ----------
    cid : int or str
        PubChem compound identifier; interpolated into the request URL.
    timeout : float, optional
        Seconds to wait for PubChem before ``requests`` raises a timeout error.
        Without a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    str or None
        The first double-quoted token shaped like an ATC code (letter, two
        digits, two letters, two digits) with the quotes removed, or ``None``
        when the response text contains no such token.
    """
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
    response = rq.get(url, timeout=timeout)
    # The quotes anchor the match to a complete JSON string value; the capture
    # group returns the code itself without them.
    atc_code_found = re.search(r'"([A-Z]\d{2}[A-Z]{2}\d{2})"', response.text)
    if atc_code_found is None:
        return None
    return atc_code_found.group(1)

In [18]:
# Start from an empty list so that re-running this cell does not record the
# same CIDs twice in not_found.
not_found.clear()
# One HTTP request per CID; get_atc_code appends the CID to not_found on a miss.
pubchem_select['ATC_Code'] = pubchem_select['CID'].map(get_atc_code)

In [19]:
# CIDs for which get_atc_code found no ATC code.
not_found

[119, 174, 460, 1972]

In [20]:
# Number of CIDs recorded in not_found.
len(not_found)

4

In [21]:
# Inspect the frame with the new ATC_Code column.
pubchem_select

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,RuleFive,ATC_Code
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,1,N06BX12
1,119,3,2,103.12,-3.2,C(CC(=O)O)CN,1,
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN,1,L01XD04
3,174,2,2,62.07,-1.4,C(CO)O,1,
4,176,2,1,60.05,-0.2,CC(=O)O,1,G01AD02
...,...,...,...,...,...,...,...,...
95,2161,6,2,298.29,3.1,CC(C)C1=CC2=C(C=C1)OC3=NC(=C(C=C3C2=O)C(=O)O)N,1,R03DX01
96,2162,7,2,408.90,3.0,CCOC(=O)C1=C(NC(=C(C1C2=CC=CC=C2Cl)C(=O)OC)C)C...,1,C08CA01
97,2164,3,2,226.27,2.1,CCC1(C(=O)NC(=O)NC1=O)CCC(C)C,1,N05CA02
98,2165,4,2,355.90,2.6,CCN(CC)CC1=C(C=CC(=C1)NC2=C3C=CC(=CC3=NC=C2)Cl)O,1,P01BA06


In [22]:
# First character of the ATC code (the key used by matc_codes_explanation);
# rows without an ATC code stay missing.
pubchem_select['MATC_Code_Short'] = pubchem_select['ATC_Code'].str.get(0)

In [23]:
# Load the shared dataframe properties. The file is decoded as UTF-8 explicitly so
# the result does not depend on the platform's default text encoding.
with open(os.path.join('..', 'dataframes_resources', 'dataframes_props.json'), encoding='utf-8') as file:
    res = json.load(file)
# Mapping stored under the 'matc_codes_explanation' key.
matc_codes_explanation = res['matc_codes_explanation']

In [24]:
# Translate each one-letter code into its description via matc_codes_explanation;
# codes absent from the mapping become missing values.
pubchem_select.loc[:, 'MATC_Code_Explanation'] = (
    pubchem_select.loc[:, 'MATC_Code_Short'].map(matc_codes_explanation)
)

In [25]:
# The full ATC code is no longer needed once the one-letter code has been derived.
pubchem_select = pubchem_select.drop(columns=['ATC_Code'])

In [26]:
# Keep only the compounds that have an ATC code. dropna(subset=...) selects the
# same rows as the former `.isna() == False` mask and leaves the index labels
# unchanged.
pubchem_select = pubchem_select.dropna(subset=['MATC_Code_Short'])

In [27]:
# Preview the first rows after dropping the compounds without an ATC code.
pubchem_select.head()

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,RuleFive,MATC_Code_Short,MATC_Code_Explanation
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,1,N,NERVOUS SYSTEM
2,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN,1,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
4,176,2,1,60.05,-0.2,CC(=O)O,1,G,GENITO URINARY SYSTEM AND SEX HORMONES
5,187,2,0,146.21,0.2,CC(=O)OCC[N+](C)(C)C,1,S,SENSORY ORGANS
6,237,4,1,400.0,6.0,CCN(CC)CCCC(C)NC1=C2C=C(C=CC2=NC3=C1C=CC(=C3)C...,1,P,"ANTIPARASITIC PRODUCTS, INSECTICIDES AND REPEL..."
