In [1]:
from IPython.display import clear_output
import os
import requests as rq
import re
import pandas as pd
import pubchempy as pcp


In [2]:
def get_name(response):
    """Return the record title of a parsed PUG View compound record.

    `response` is the decoded JSON dict; the title is read from
    response['Record']['RecordTitle'].
    """
    record = response['Record']
    return record['RecordTitle']

In [3]:
def get_atc_code(response):
    """Return the first ATC-shaped code found in the raw response text.

    Searches `response.text` for a double-quoted token of the form
    letter, two digits, two letters, two digits and returns it without
    the surrounding quotes. Returns None when nothing matches.
    """
    quoted_code = r"\"[A-Z]\d{2}[A-Z]{2}\d{2}\""
    hit = re.search(quoted_code, response.text)
    if not hit:
        return None
    return hit.group(0).strip('"')

In [4]:
def get_h_bond_accept(response):
    """Return the hydrogen bond acceptor count from a PUG View record.

    Looks under 'Chemical and Physical Properties' -> 'Computed Properties'
    -> 'Hydrogen Bond Acceptor Count' and returns the first number of the
    first information entry. Returns None when no such section exists.
    """
    candidates = (
        prop['Information'][0]['Value']['Number'][0]
        for chapter in response['Record']['Section']
        if chapter['TOCHeading'] == 'Chemical and Physical Properties'
        for group in chapter['Section']
        if group['TOCHeading'] == 'Computed Properties'
        for prop in group['Section']
        if prop['TOCHeading'] == 'Hydrogen Bond Acceptor Count'
    )
    # The generator is lazy, so only the first matching section is evaluated.
    return next(candidates, None)

In [5]:
def get_h_bond_donor(response):
    """Return the hydrogen bond donor count from a PUG View record.

    Looks under 'Chemical and Physical Properties' -> 'Computed Properties'
    -> 'Hydrogen Bond Donor Count' and returns the first number of the
    first information entry. Returns None when no such section exists.
    """
    for chapter in response['Record']['Section']:
        if chapter['TOCHeading'] != 'Chemical and Physical Properties':
            continue
        for group in chapter['Section']:
            if group['TOCHeading'] != 'Computed Properties':
                continue
            for prop in group['Section']:
                if prop['TOCHeading'] == 'Hydrogen Bond Donor Count':
                    first_entry = prop['Information'][0]
                    return first_entry['Value']['Number'][0]
    return None

In [6]:
def get_mol_weight(response):
    """Return the molecular weight of a PUG View record as a float.

    Looks under 'Chemical and Physical Properties' -> 'Computed Properties'
    -> 'Molecular Weight' and converts the first string value to float.

    Parameters
    ----------
    response : dict
        Decoded JSON of a PUG View compound record.

    Returns
    -------
    float or None
        The molecular weight, or None when the section is absent.

    Raises
    ------
    ValueError
        If the stored string cannot be converted to float.
    """
    # Read from the `response` argument; the previous version iterated over the
    # notebook-global `response_json` and silently ignored what was passed in.
    for att0 in response['Record']['Section']:
        if att0['TOCHeading'] == 'Chemical and Physical Properties':
            for att1 in att0['Section']:
                if att1['TOCHeading'] == 'Computed Properties':
                    for att2 in att1['Section']:
                        if att2['TOCHeading'] == 'Molecular Weight':
                            return float(att2['Information'][0]['Value']['StringWithMarkup'][0]['String'])
    return None

In [7]:
def get_logp(response):
    """Return the experimental LogP of a PUG View record.

    Looks under 'Chemical and Physical Properties' -> 'Experimental
    Properties' -> 'LogP'. A numeric value is returned as stored; otherwise
    the first signed decimal number in the free-text value is parsed.

    Parameters
    ----------
    response : dict
        Decoded JSON of a PUG View compound record.

    Returns
    -------
    number or None
        The LogP value, or None when the section is absent or its text
        contains no number.
    """
    # Read from the `response` argument; the previous version iterated over the
    # notebook-global `response_json` and silently ignored what was passed in.
    for att0 in response['Record']['Section']:
        if att0['TOCHeading'] == 'Chemical and Physical Properties':
            for att1 in att0['Section']:
                if att1['TOCHeading'] == 'Experimental Properties':
                    for att2 in att1['Section']:
                        if att2['TOCHeading'] == 'LogP':
                            value = att2['Information'][0]['Value']
                            try:
                                return value['Number'][0]
                            except (KeyError, IndexError, TypeError):
                                # No numeric field: fall back to the free-text value.
                                logP_string = value['StringWithMarkup'][0]['String']
                                # Take only the first number. Joining every digit in the
                                # text turned e.g. "0.46 at 25" into 0.4625.
                                number = re.search(r'[-+]?(?:\d+\.?\d*|\.\d+)', logP_string)
                                return float(number.group(0)) if number else None
    return None

In [8]:
def get_xlogp3(response):
    """Return the computed XLogP3 value of a PUG View record.

    Looks under 'Chemical and Physical Properties' -> 'Computed Properties'
    -> 'XLogP3'. A numeric value is returned as stored; otherwise the first
    signed decimal number in the free-text value is parsed.

    Parameters
    ----------
    response : dict
        Decoded JSON of a PUG View compound record.

    Returns
    -------
    number or None
        The XLogP3 value, or None when the section is absent or its text
        contains no number.
    """
    for att0 in response['Record']['Section']:
        if att0['TOCHeading'] == 'Chemical and Physical Properties':
            for att1 in att0['Section']:
                if att1['TOCHeading'] == 'Computed Properties':
                    for att2 in att1['Section']:
                        if att2['TOCHeading'] == 'XLogP3':
                            value = att2['Information'][0]['Value']
                            try:
                                return value['Number'][0]
                            except (KeyError, IndexError, TypeError):
                                # No numeric field: fall back to the free-text value.
                                logP_string = value['StringWithMarkup'][0]['String']
                                # Take only the first number. Joining every digit in the
                                # text turned e.g. "2.5 at 25" into 2.525.
                                number = re.search(r'[-+]?(?:\d+\.?\d*|\.\d+)', logP_string)
                                return float(number.group(0)) if number else None
    return None

In [9]:
def get_isomeric_smiles(response):
    """Return the isomeric SMILES string of a PUG View record.

    Looks under 'Names and Identifiers' -> 'Computed Descriptors' ->
    'Isomeric SMILES' and returns the first string value. Returns None
    when no such section exists.
    """
    smiles_found = (
        descriptor['Information'][0]['Value']['StringWithMarkup'][0]['String']
        for chapter in response['Record']['Section']
        if chapter['TOCHeading'] == 'Names and Identifiers'
        for group in chapter['Section']
        if group['TOCHeading'] == 'Computed Descriptors'
        for descriptor in group['Section']
        if descriptor['TOCHeading'] == 'Isomeric SMILES'
    )
    # The generator is lazy, so only the first matching section is evaluated.
    return next(smiles_found, None)

In [10]:
def get_canonical_smiles(response):
    """Return the canonical SMILES string of a PUG View record.

    Looks under 'Names and Identifiers' -> 'Computed Descriptors' ->
    'Canonical SMILES' and returns the first string value. Returns None
    when no such section exists.
    """
    for chapter in response['Record']['Section']:
        if chapter['TOCHeading'] != 'Names and Identifiers':
            continue
        for group in chapter['Section']:
            if group['TOCHeading'] != 'Computed Descriptors':
                continue
            for descriptor in group['Section']:
                if descriptor['TOCHeading'] == 'Canonical SMILES':
                    markup = descriptor['Information'][0]['Value']['StringWithMarkup']
                    return markup[0]['String']
    return None

In [11]:
# Fetch one example record to exercise the extractor functions.
cid = 1 #1983 #10917
url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = rq.get(url, timeout=30)
# Decode the response already received instead of requesting the URL a second time.
response_json = response.json()

In [12]:
# Record title of the example compound fetched above.
get_name(response_json)

'Acetylcarnitine'

In [13]:
# ATC code matched in the raw response text (takes the Response object, not the JSON dict).
get_atc_code(response)

'N06BX12'

In [14]:
# Molecular weight of the example compound, converted to float.
get_mol_weight(response_json)

203.24

In [15]:
# Hydrogen bond donor count of the example compound.
get_h_bond_donor(response_json)

0

In [16]:
# Hydrogen bond acceptor count of the example compound.
get_h_bond_accept(response_json)

4

In [17]:
# Experimental LogP; the saved run shows no output here, i.e. the call returned None.
get_logp(response_json)

In [18]:
# Computed XLogP3 value of the example compound.
get_xlogp3(response_json)

0.4

In [19]:
# Isomeric SMILES; the saved run shows no output here, i.e. the call returned None.
get_isomeric_smiles(response_json)

In [20]:
# Canonical SMILES string of the example compound.
get_canonical_smiles(response_json)

'CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C'

In [21]:
# Isomeric SMILES when one is found, otherwise the canonical SMILES.
get_isomeric_smiles(response_json) or get_canonical_smiles(response_json)

'CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C'

In [22]:
# Prefer the experimental LogP and fall back to XLogP3 only when it is missing.
# The explicit None check keeps a legitimate LogP of 0 from being discarded,
# and the extractor is called once instead of twice.
logp = get_logp(response_json)
logp if logp is not None else get_xlogp3(response_json)

0.4

In [23]:
# Column names for the result tables, in output order.
df_keys = [
    "CID",
    "HBondAcceptorCount",
    "HBondDonorCount",
    "MolecularWeight",
    "LogP",
    "IsomericSMILES",
    "ATC_Code",
]
# The string literal below is inert: it evaluates to a plain string (echoed as
# the cell output) and creates no DataFrames.
""" df_atc = pd.DataFrame(columns = df_keys)
df_no_atc = pd.DataFrame(columns = df_keys) """

' df_atc = pd.DataFrame(columns = df_keys)\ndf_no_atc = pd.DataFrame(columns = df_keys) '

In [25]:
# Crawl a range of PubChem CIDs, extract the descriptors of each compound and
# append the row to the ATC / no-ATC table depending on whether an ATC code
# was found. Both tables are resumed from, and written back to, their CSVs.
CHECKPOINT_EVERY = 100  # CIDs processed between two CSV checkpoints
REQUEST_TIMEOUT = 30    # seconds before a stalled request is abandoned

cids = list(range(260364, 300000))
df_atc = pd.read_csv('pubchem_fin_atc.csv')
df_no_atc = pd.read_csv('pubchem_fin_no_atc.csv')
df_dic = {}
failed_cids = []  # CIDs whose download or parsing raised


def save_checkpoint():
    """Write both result tables back to their CSV files."""
    df_atc.to_csv('pubchem_fin_atc.csv', index=False)
    df_no_atc.to_csv('pubchem_fin_no_atc.csv', index=False)


try:
    for processed, cid in enumerate(cids, start=1):
        url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'

        # Start from a fresh row so a failure never shows values of the previous CID.
        df_dic = {'CID': cid}

        try:
            # One request per CID; the same response provides the raw text and the JSON.
            # Both calls are inside the try so a network or decoding error only
            # skips this CID instead of aborting the whole crawl.
            response_text = rq.get(url, timeout=REQUEST_TIMEOUT)
            response_json = response_text.json()

            df_dic['HBondAcceptorCount'] = get_h_bond_accept(response_json)
            df_dic['HBondDonorCount'] = get_h_bond_donor(response_json)
            df_dic['MolecularWeight'] = get_mol_weight(response_json)

            # Explicit None check: an experimental LogP of 0 is a valid value.
            logp = get_logp(response_json)
            df_dic['LogP'] = logp if logp is not None else get_xlogp3(response_json)

            df_dic['IsomericSMILES'] = get_isomeric_smiles(response_json) or get_canonical_smiles(response_json)

            atc_code = get_atc_code(response_text)
            df_dic['ATC_Code'] = atc_code

            # A requests.Response is truthy only for a successful status code.
            if response_text:
                if atc_code:
                    print(atc_code)
                    df_atc.loc[len(df_atc)] = df_dic
                else:
                    print('no atc')
                    df_no_atc.loc[len(df_no_atc)] = df_dic
        except Exception as err:
            # `except Exception` (not a bare except) lets KeyboardInterrupt stop the run.
            failed_cids.append(cid)
            print('except', cid, repr(err))

        # Rewriting both CSVs for every CID is slow once they hold many rows,
        # so checkpoint periodically; the `finally` below saves the remainder.
        if processed % CHECKPOINT_EVERY == 0:
            save_checkpoint()

        clear_output()
        print(df_dic)
        print(len(df_atc), len(df_no_atc))
finally:
    # Persist progress even when the loop is interrupted or fails unexpectedly.
    save_checkpoint()

if failed_cids:
    # Per-CID error messages are wiped by clear_output, so summarise them here.
    print('failed CIDs:', failed_cids)

display(df_atc.head())
display(df_no_atc.head())

{'CID': 260364, 'HBondAcceptorCount': 4, 'HBondDonorCount': 3, 'MolecularWeight': 215.63, 'LogP': -1.3, 'IsomericSMILES': 'C1=CC(=CC=C1C(CC(=O)O)NO)Cl', 'ATC_Code': None}
34 53448


In [None]:
# Add a description column for the ATC codes.

atc_codes = pd.read_csv('atc_codes.csv', sep=';')
atc_dict = atc_codes.to_dict()
# NOTE(review): assumes the first column of atc_codes.csv holds the one-letter
# ATC group and the second column its description — TODO confirm against the file.
atc_desc_by_code = dict(zip(atc_codes.iloc[:, 0], atc_codes.iloc[:, 1]))

pc_df = pd.read_csv('pubchem_1_2000_atc.csv')
# Keep only the first character of each ATC code.
pc_df['atc_code'] = pc_df['atc_code'].str[0]
# The original `apply(lambda x)` was a syntax error; map each code to its
# description instead (codes without an entry become NaN).
pc_df['atc_desc'] = pc_df['atc_code'].map(atc_desc_by_code)
pc_df

'Acetaminophen'

In [24]:
# Reload both result tables from disk and show their last rows.
df_atc, df_no_atc = (
    pd.read_csv(csv_name)
    for csv_name in ('pubchem_fin_atc.csv', 'pubchem_fin_no_atc.csv')
)
display(df_atc.tail(), df_no_atc.tail())

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,ATC_Code
27,235905,1,1,300.5,5.3,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@]2(CC=C)O)CCC...,G03DC01
28,241903,12,3,811.0,3.7,CC[C@@]1(C[C@@H]2C[C@@](C3=C(CCN(C2)C1)C4=CC=C...,L01CA01
29,244809,3,2,302.4,2.8,C[C@]12CC[C@H]3[C@H]([C@@H]1C[C@H]([C@H]2O)O)C...,G03GB03
30,244879,6,3,516.7,5.5,C[C@H]1[C@@H]2CC[C@]3([C@H]([C@]2(CC[C@H]1O)C)...,D06AX01
31,247839,3,1,344.5,3.2,C[C@H]1C[C@H]2[C@@H]3CC[C@@H]([C@]3(C[C@@H]([C...,S01BA08


Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,ATC_Code
45702,252616,2,0,398.09,4.7,COC1=CC=C(C=C1)C(=O)C(C(C2=CC=CC=C2)Br)Br,
45703,252617,4,1,241.24,3.4,C1=CC=C(C=C1)NN=CC2=CC=C(C=C2)[N+](=O)[O-],
45704,252618,4,1,241.24,3.4,C1=CC=C(C=C1)NN=CC2=CC(=CC=C2)[N+](=O)[O-],
45705,252619,1,1,238.32,3.6,C1C(CC(C1C2=CC=CC=C2)C3=CC=CC=C3)O,
45706,252620,1,2,303.4,2.2,C1C(C(C2=C1NC(=NC2=O)N)C3=CC=CC=C3)C4=CC=CC=C4,
