In [156]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
import json

In [157]:
# Read chembl ids from txt
src_file_path='./data/CHEMBL_ID2.txt'
chembl_ids=[]
with open(src_file_path) as ids_txt:
    for id in ids_txt.read().split(','):
        chembl_ids.append(id.replace(' ',''))
chembl_ids

['CHEMBL70',
 'CHEMBL485',
 'CHEMBL364713',
 'CHEMBL19224',
 'CHEMBL459324',
 'CHEMBL656',
 'CHEMBL1712170',
 'CHEMBL651',
 'CHEMBL1595',
 'CHEMBL1237044',
 'CHEMBL100116',
 'CHEMBL33986',
 'CHEMBL895',
 'CHEMBL592',
 'CHEMBL1201776',
 'CHEMBL607',
 'CHEMBL596',
 'CHEMBL658',
 'CHEMBL634',
 'CHEMBL1005',
 'CHEMBL841',
 'CHEMBL651',
 'CHEMBL511142',
 'CHEMBL656',
 'CHEMBL963',
 'CHEMBL398707',
 'CHEMBL19019',
 'CHEMBL80',
 'CHEMBL1213351',
 'CHEMBL1201294',
 'CHEMBL140050',
 'CHEMBL742',
 'CHEMBL299031',
 'CHEMBL5276190',
 'CHEMBL403893',
 'CHEMBL53',
 'CHEMBL133836',
 'CHEMBL112',
 'CHEMBL521',
 'CHEMBL25',
 'CHEMBL24',
 'CHEMBL12',
 'CHEMBL1064',
 'CHEMBL633',
 'CHEMBL529',
 'CHEMBL1464',
 'CHEMBL1431',
 'CHEMBL998',
 'CHEMBL1487',
 'CHEMBL1790041',
 'CHEMBL35',
 'CHEMBL384467',
 'CHEMBL1503',
 'CHEMBL1491',
 'CHEMBL1624',
 'CHEMBL8',
 'CHEMBL841',
 'CHEMBL472',
 'CHEMBL13',
 'CHEMBL532',
 'CHEMBL139',
 'CHEMBL1502',
 'CHEMBL253376',
 'CHEMBL157101',
 'CHEMBL1741',
 'CHEMBL1622',
 'CH

In [158]:
molecule = new_client.molecule
def get_molecules(chembl_ids):
    molecule_dict={}
    for chembl_id in chembl_ids:
        try: 
            response = molecule.get(chembl_id)
            molecule_dict[chembl_id]=response  
        except Exception as e:
            return str(e) 
        
    return molecule_dict


def check_natural(molecule):
    return molecule['natural_product']
    
def check_analgesic(molecule):
    return 1 if molecule['indication_class'] and 'Analgesic' in molecule['indication_class'] else 0

def get_pka_acid(molecule):
    # structures=molecule['molecule_structures']
    # print(structures)
    # print(structures.get('cx_most_apka'))
    return molecule['molecule_properties']['cx_most_apka'] if molecule['molecule_properties'].get('cx_most_apka') else None

    
molecule_props=get_molecules(chembl_ids)    


results = [(key, check_natural(value), check_analgesic(value), get_pka_acid(value), value['molecule_structures']['canonical_smiles']) for key,value in molecule_props.items()]

results_df = pd.DataFrame(results, columns=['CHEMBL_ID', 'Is_Natural_Product', 'Is_Anaglesic','pka_acid', 'Smiles'])

results_df['pka_acid'] = pd.to_numeric(results_df['pka_acid'], errors='coerce')
pka_acid_values=results_df['pka_acid'].copy().dropna()
results_df['pka_acid'].fillna(pka_acid_values.mean(), inplace=True)
results_df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  results_df['pka_acid'].fillna(pka_acid_values.mean(), inplace=True)


Unnamed: 0,CHEMBL_ID,Is_Natural_Product,Is_Anaglesic,pka_acid,Smiles
0,CHEMBL70,1,1,10.260000,CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)C=C[C@H...
1,CHEMBL485,1,1,13.780000,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...
2,CHEMBL364713,1,0,13.070000,COc1ccc2c(c1OC)C(=O)O[C@@H]2[C@H]1c2c(cc3c(c2O...
3,CHEMBL19224,1,0,8.917755,COc1ccc(Cc2nccc3cc(OC)c(OC)cc23)cc1OC
4,CHEMBL459324,1,0,8.917755,CC(=O)Oc1ccc2c3c1O[C@H]1[C@@H](OC(C)=O)C=C[C@H...
...,...,...,...,...,...
82,CHEMBL809,1,0,8.917755,CN[C@H]1CC[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
83,CHEMBL549,1,0,8.917755,CN(C)CCCC1(c2ccc(F)cc2)OCc2cc(C#N)ccc21
84,CHEMBL1508,1,0,8.917755,CN(C)CCC[C@@]1(c2ccc(F)cc2)OCc2cc(C#N)ccc21
85,CHEMBL637,1,0,8.917755,COc1ccc(C(CN(C)C)C2(O)CCCCC2)cc1


In [159]:
results_df.to_csv('./data/natural_products3.csv', index=False)